GADGET-4
pm_periodic.cc
Go to the documentation of this file.
1 /*******************************************************************************
2  * \copyright This file is part of the GADGET4 N-body/SPH code developed
3  * \copyright by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
4  * \copyright (vspringel@mpa-garching.mpg.de) and all contributing authors.
5  *******************************************************************************/
6 
12 #include "gadgetconfig.h"
13 
14 #if defined(PMGRID) && defined(PERIODIC)
15 
16 #include <fftw3.h>
17 #include <math.h>
18 #include <mpi.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <algorithm>
24 
25 #include "../data/allvars.h"
26 #include "../data/dtypes.h"
27 #include "../data/intposconvert.h"
28 #include "../data/mymalloc.h"
29 #include "../domain/domain.h"
30 #include "../logs/timer.h"
31 #include "../main/simulation.h"
32 #include "../mpi_utils/mpi_utils.h"
33 #include "../pm/pm.h"
34 #include "../pm/pm_periodic.h"
35 #include "../sort/cxxsort.h"
36 #include "../system/system.h"
37 #include "../time_integration/timestep.h"
38 
69 #define GRIDz (GRIDZ / 2 + 1)
70 #define GRID2 (2 * GRIDz)
71 
72 /* short-cut macros for accessing different 3D arrays */
73 #define FI(x, y, z) (((large_array_offset)GRID2) * (GRIDY * (x) + (y)) + (z))
74 #define FCxy(c, z) (((large_array_offset)GRID2) * ((c)-myplan.firstcol_XY) + (z))
75 #define FCxz(c, y) (((large_array_offset)GRIDY) * ((c)-myplan.firstcol_XZ) + (y))
76 #define FCzy(c, x) (((large_array_offset)GRIDX) * ((c)-myplan.firstcol_ZY) + (x))
77 
78 #ifndef FFT_COLUMN_BASED
79 #define NI(x, y, z) (((large_array_offset)GRIDZ) * ((y) + (x)*myplan.nslab_y) + (z))
80 #endif
81 
86 void pm_periodic::pm_init_periodic(simparticles *Sp_ptr)
87 {
88  Sp = Sp_ptr;
89 
90  Sp->Asmth[0] = ASMTH * All.BoxSize / PMGRID;
91  Sp->Rcut[0] = RCUT * Sp->Asmth[0];
92 
93  /* Set up the FFTW-3 plan files. */
94  int ndimx[1] = {GRIDX}; /* dimension of the 1D transforms */
95  int ndimy[1] = {GRIDY}; /* dimension of the 1D transforms */
96  int ndimz[1] = {GRIDZ}; /* dimension of the 1D transforms */
97 
98  int max_GRID2 = 2 * (std::max<int>(std::max<int>(GRIDX, GRIDY), GRIDZ) / 2 + 1);
99 
100  /* temporarily allocate some arrays to make sure that out-of-place plans are created */
101  rhogrid = (fft_real *)Mem.mymalloc("rhogrid", max_GRID2 * sizeof(fft_real));
102  forcegrid = (fft_real *)Mem.mymalloc("forcegrid", max_GRID2 * sizeof(fft_real));
103 
104 #ifdef DOUBLEPRECISION_FFTW
105  int alignflag = 0;
106 #else
107  /* for single precision, the start of our FFT columns is presently only guaranteed to be 8-byte aligned */
108  int alignflag = FFTW_UNALIGNED;
109 #endif
110 
111  myplan.forward_plan_zdir = FFTW(plan_many_dft_r2c)(1, ndimz, 1, rhogrid, 0, 1, GRID2, (fft_complex *)forcegrid, 0, 1, GRIDz,
112  FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
113 
114 #ifndef FFT_COLUMN_BASED
115  int stride = GRIDz;
116 #else
117  int stride = 1;
118 #endif
119 
120  myplan.forward_plan_ydir =
121  FFTW(plan_many_dft)(1, ndimy, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDY, (fft_complex *)forcegrid, 0, stride,
122  GRIDz * GRIDY, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
123 
124  myplan.forward_plan_xdir =
125  FFTW(plan_many_dft)(1, ndimx, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDX, (fft_complex *)forcegrid, 0, stride,
126  GRIDz * GRIDX, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
127 
128  myplan.backward_plan_xdir =
129  FFTW(plan_many_dft)(1, ndimx, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDX, (fft_complex *)forcegrid, 0, stride,
130  GRIDz * GRIDX, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
131 
132  myplan.backward_plan_ydir =
133  FFTW(plan_many_dft)(1, ndimy, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDY, (fft_complex *)forcegrid, 0, stride,
134  GRIDz * GRIDY, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
135 
136  myplan.backward_plan_zdir = FFTW(plan_many_dft_c2r)(1, ndimz, 1, (fft_complex *)rhogrid, 0, 1, GRIDz, forcegrid, 0, 1, GRID2,
137  FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
138 
139  Mem.myfree(forcegrid);
140  Mem.myfree(rhogrid);
141 
142 #ifndef FFT_COLUMN_BASED
143 
144  my_slab_based_fft_init(&myplan, GRIDX, GRIDY, GRIDZ);
145 
146  maxfftsize = std::max<int>(myplan.largest_x_slab * GRIDY, myplan.largest_y_slab * GRIDX) * ((size_t)GRID2);
147 
148 #else
149 
150  my_column_based_fft_init(&myplan, GRIDX, GRIDY, GRIDZ);
151 
152  maxfftsize = myplan.max_datasize;
153 
154 #endif
155 
156 #if defined(GRAVITY_TALLBOX)
157  kernel = (fft_real *)Mem.mymalloc("kernel", maxfftsize * sizeof(fft_real));
158  fft_of_kernel = (fft_complex *)kernel;
159 
160  pmforce_setup_tallbox_kernel();
161 #endif
162 }
163 
164 /* Below, the two functions
165  *
166  * pmforce_ ...... _prepare_density()
167  * and
168  * pmforce_ ...... _readout_forces_or_potential(int dim)
169  *
170  * are defined in two different versions, one that works better for uniform
171  * simulations, the other for zoom-in runs. Only one of the two sets is used,
172  * depending on the setting of PM_ZOOM_OPTIMIZED.
173  */
174 
175 #ifdef PM_ZOOM_OPTIMIZED
176 
177 void pm_periodic::pmforce_zoom_optimized_prepare_density(int mode, int *typelist)
178 {
179  int level, recvTask;
180  MPI_Status status;
181 
182  particle_data *P = Sp->P;
183 
184  part = (part_slab_data *)Mem.mymalloc("part", 8 * (NSource * sizeof(part_slab_data)));
185  large_numpart_type *part_sortindex =
186  (large_numpart_type *)Mem.mymalloc("part_sortindex", 8 * (NSource * sizeof(large_numpart_type)));
187 
188 #ifdef FFT_COLUMN_BASED
189  int columns = GRIDX * GRIDY;
190  int avg = (columns - 1) / NTask + 1;
191  int exc = NTask * avg - columns;
192  int tasklastsection = NTask - exc;
193  int pivotcol = tasklastsection * avg;
194 #endif
195 
196  /* determine the cells each particle accesses */
197  for(int idx = 0; idx < NSource; idx++)
198  {
199  int i = Sp->get_active_index(idx);
200 
201  if(P[i].Ti_Current != All.Ti_Current)
202  Sp->drift_particle(&P[i], &Sp->SphP[i], All.Ti_Current);
203 
204  int slab_x, slab_y, slab_z;
205  if(mode == 2)
206  {
207  slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
208  slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
209  slab_z = (P[i].IntPos[2] * POWERSPEC_FOLDFAC) / INTCELL;
210  }
211  else if(mode == 3)
212  {
213  slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
214  slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
215  slab_z = (P[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
216  }
217  else
218  {
219  slab_x = P[i].IntPos[0] / INTCELL;
220  slab_y = P[i].IntPos[1] / INTCELL;
221  slab_z = P[i].IntPos[2] / INTCELL;
222  }
223 
224  large_numpart_type index_on_grid = ((large_numpart_type)idx) << 3;
225 
226  for(int xx = 0; xx < 2; xx++)
227  for(int yy = 0; yy < 2; yy++)
228  for(int zz = 0; zz < 2; zz++)
229  {
230  int slab_xx = slab_x + xx;
231  int slab_yy = slab_y + yy;
232  int slab_zz = slab_z + zz;
233 
234  if(slab_xx >= GRIDX)
235  slab_xx = 0;
236  if(slab_yy >= GRIDY)
237  slab_yy = 0;
238  if(slab_zz >= GRIDZ)
239  slab_zz = 0;
240 
241  large_array_offset offset = FI(slab_xx, slab_yy, slab_zz);
242 
243  part[index_on_grid].partindex = (i << 3) + (xx << 2) + (yy << 1) + zz;
244  part[index_on_grid].globalindex = offset;
245  part_sortindex[index_on_grid] = index_on_grid;
246  index_on_grid++;
247  }
248  }
249 
250  /* note: num_on_grid will be 8 times larger than the particle number, but num_field_points will generally be much smaller */
251 
252  large_array_offset num_field_points;
253  large_numpart_type num_on_grid = ((large_numpart_type)NSource) << 3;
254 
255  /* bring the part-field into the order of the accessed cells. This allows the removal of duplicates */
256 
257  mycxxsort(part_sortindex, part_sortindex + num_on_grid, pm_periodic_sortindex_comparator(part));
258 
259  if(num_on_grid > 0)
260  num_field_points = 1;
261  else
262  num_field_points = 0;
263 
264  /* determine the number of unique field points */
265  for(large_numpart_type i = 1; i < num_on_grid; i++)
266  {
267  if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
268  num_field_points++;
269  }
270 
271  /* allocate the local field */
272  localfield_globalindex = (large_array_offset *)Mem.mymalloc_movable(&localfield_globalindex, "localfield_globalindex",
273  num_field_points * sizeof(large_array_offset));
274  localfield_data = (fft_real *)Mem.mymalloc_movable(&localfield_data, "localfield_data", num_field_points * sizeof(fft_real));
275  localfield_first = (size_t *)Mem.mymalloc_movable(&localfield_first, "localfield_first", NTask * sizeof(size_t));
276  localfield_sendcount = (size_t *)Mem.mymalloc_movable(&localfield_sendcount, "localfield_sendcount", NTask * sizeof(size_t));
277  localfield_offset = (size_t *)Mem.mymalloc_movable(&localfield_offset, "localfield_offset", NTask * sizeof(size_t));
278  localfield_recvcount = (size_t *)Mem.mymalloc_movable(&localfield_recvcount, "localfield_recvcount", NTask * sizeof(size_t));
279 
280  for(int i = 0; i < NTask; i++)
281  {
282  localfield_first[i] = 0;
283  localfield_sendcount[i] = 0;
284  }
285 
286  /* establish the cross link between the part[ ]-array and the local list of
287  * mesh points. Also, count on which CPU the needed field points are stored.
288  */
289  num_field_points = 0;
290  for(large_numpart_type i = 0; i < num_on_grid; i++)
291  {
292  if(i > 0)
293  if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
294  num_field_points++;
295 
296  part[part_sortindex[i]].localindex = num_field_points;
297 
298  if(i > 0)
299  if(part[part_sortindex[i]].globalindex == part[part_sortindex[i - 1]].globalindex)
300  continue;
301 
302  localfield_globalindex[num_field_points] = part[part_sortindex[i]].globalindex;
303 
304 #ifndef FFT_COLUMN_BASED
305  int slab = part[part_sortindex[i]].globalindex / (GRIDY * GRID2);
306  int task = myplan.slab_to_task[slab];
307 #else
308  int task, column = part[part_sortindex[i]].globalindex / (GRID2);
309 
310  if(column < pivotcol)
311  task = column / avg;
312  else
313  task = (column - pivotcol) / (avg - 1) + tasklastsection;
314 #endif
315 
316  if(localfield_sendcount[task] == 0)
317  localfield_first[task] = num_field_points;
318 
319  localfield_sendcount[task]++;
320  }
321  num_field_points++;
322 
323  localfield_offset[0] = 0;
324  for(int i = 1; i < NTask; i++)
325  localfield_offset[i] = localfield_offset[i - 1] + localfield_sendcount[i - 1];
326 
327  Mem.myfree_movable(part_sortindex);
328  part_sortindex = NULL;
329 
330  /* now bin the local particle data onto the mesh list */
331  for(large_numpart_type i = 0; i < num_field_points; i++)
332  localfield_data[i] = 0;
333 
334  for(large_numpart_type i = 0; i < num_on_grid; i += 8)
335  {
336  int pindex = (part[i].partindex >> 3);
337  MyIntPosType rmd_x, rmd_y, rmd_z;
338 
339  if(mode == 2)
340  {
341  rmd_x = (P[pindex].IntPos[0] * POWERSPEC_FOLDFAC) % INTCELL;
342  rmd_y = (P[pindex].IntPos[1] * POWERSPEC_FOLDFAC) % INTCELL;
343  rmd_z = (P[pindex].IntPos[2] * POWERSPEC_FOLDFAC) % INTCELL;
344  }
345  else if(mode == 3)
346  {
347  rmd_x = (P[pindex].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
348  rmd_y = (P[pindex].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
349  rmd_z = (P[pindex].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
350  }
351  else
352  {
353  rmd_x = P[pindex].IntPos[0] % INTCELL;
354  rmd_y = P[pindex].IntPos[1] % INTCELL;
355  rmd_z = P[pindex].IntPos[2] % INTCELL;
356  }
357 
358  double dx = rmd_x * (1.0 / INTCELL);
359  double dy = rmd_y * (1.0 / INTCELL);
360  double dz = rmd_z * (1.0 / INTCELL);
361 
362  double weight = P[pindex].getMass();
363 
364  if(mode) /* only for power spectrum calculation */
365  if(typelist[P[pindex].getType()] == 0)
366  continue;
367 
368  localfield_data[part[i + 0].localindex] += weight * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
369  localfield_data[part[i + 1].localindex] += weight * (1.0 - dx) * (1.0 - dy) * dz;
370  localfield_data[part[i + 2].localindex] += weight * (1.0 - dx) * dy * (1.0 - dz);
371  localfield_data[part[i + 3].localindex] += weight * (1.0 - dx) * dy * dz;
372  localfield_data[part[i + 4].localindex] += weight * (dx) * (1.0 - dy) * (1.0 - dz);
373  localfield_data[part[i + 5].localindex] += weight * (dx) * (1.0 - dy) * dz;
374  localfield_data[part[i + 6].localindex] += weight * (dx)*dy * (1.0 - dz);
375  localfield_data[part[i + 7].localindex] += weight * (dx)*dy * dz;
376  }
377 
378  rhogrid = (fft_real *)Mem.mymalloc_clear("rhogrid", maxfftsize * sizeof(fft_real));
379 
380  /* exchange data and add contributions to the local mesh-path */
381  MPI_Alltoall(localfield_sendcount, sizeof(size_t), MPI_BYTE, localfield_recvcount, sizeof(size_t), MPI_BYTE, Communicator);
382 
383  for(level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
384  {
385  recvTask = ThisTask ^ level;
386 
387  if(recvTask < NTask)
388  {
389  if(level > 0)
390  {
391  import_data = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
392  import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
393  localfield_recvcount[recvTask] * sizeof(large_array_offset));
394 
395  if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
396  {
397  myMPI_Sendrecv(localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
398  MPI_BYTE, recvTask, TAG_NONPERIOD_A, import_data, localfield_recvcount[recvTask] * sizeof(fft_real),
399  MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);
400 
401  myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
402  localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_B,
403  import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
404  TAG_NONPERIOD_B, Communicator, &status);
405  }
406  }
407  else
408  {
409  import_data = localfield_data + localfield_offset[ThisTask];
410  import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
411  }
412 
413  /* note: here every element in rhogrid is only accessed once, so there should be no race condition */
414  for(size_t i = 0; i < localfield_recvcount[recvTask]; i++)
415  {
416  /* determine offset in local FFT slab */
417 #ifndef FFT_COLUMN_BASED
418  large_array_offset offset =
419  import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRIDY * ((large_array_offset)GRID2);
420 #else
421  large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
422 #endif
423  rhogrid[offset] += import_data[i];
424  }
425 
426  if(level > 0)
427  {
428  Mem.myfree(import_globalindex);
429  Mem.myfree(import_data);
430  }
431  }
432  }
433 }
434 
435 /* Function to read out the force component corresponding to spatial dimension 'dim'.
436  * If dim is negative, potential values are read out and assigned to particles.
437  */
438 void pm_periodic::pmforce_zoom_optimized_readout_forces_or_potential(fft_real *grid, int dim)
439 {
440  particle_data *P = Sp->P;
441 
442 #ifdef EVALPOTENTIAL
443 #ifdef GRAVITY_TALLBOX
444  double fac = 1.0 / (((double)GRIDX) * GRIDY * GRIDZ); /* to get potential */
445 #else
446  double fac = 4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / pow(All.BoxSize, 3); /* to get potential */
447 #endif
448 #endif
449 
450  for(int level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
451  {
452  int recvTask = ThisTask ^ level;
453 
454  if(recvTask < NTask)
455  {
456  if(level > 0)
457  {
458  import_data = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
459  import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
460  localfield_recvcount[recvTask] * sizeof(large_array_offset));
461 
462  if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
463  {
464  MPI_Status status;
465  myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
466  localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_C,
467  import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
468  TAG_NONPERIOD_C, Communicator, &status);
469  }
470  }
471  else
472  {
473  import_data = localfield_data + localfield_offset[ThisTask];
474  import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
475  }
476 
477  for(size_t i = 0; i < localfield_recvcount[recvTask]; i++)
478  {
479 #ifndef FFT_COLUMN_BASED
480  large_array_offset offset =
481  import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRIDY * ((large_array_offset)GRID2);
482 #else
483  large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
484 #endif
485  import_data[i] = grid[offset];
486  }
487 
488  if(level > 0)
489  {
490  MPI_Status status;
491  myMPI_Sendrecv(import_data, localfield_recvcount[recvTask] * sizeof(fft_real), MPI_BYTE, recvTask, TAG_NONPERIOD_A,
492  localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
493  MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);
494 
495  Mem.myfree(import_globalindex);
496  Mem.myfree(import_data);
497  }
498  }
499  }
500 
501  /* read out the force/potential values, which all have been assembled in localfield_data */
502  for(int idx = 0; idx < NSource; idx++)
503  {
504  int i = Sp->get_active_index(idx);
505 
506 #if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
507  if(!Sp->TimeBinSynchronized[P[i].TimeBinGrav])
508  continue;
509 #endif
510 
511  large_numpart_type j = (idx << 3);
512 
513  MyIntPosType rmd_x = P[i].IntPos[0] % INTCELL;
514  MyIntPosType rmd_y = P[i].IntPos[1] % INTCELL;
515  MyIntPosType rmd_z = P[i].IntPos[2] % INTCELL;
516 
517  double dx = rmd_x * (1.0 / INTCELL);
518  double dy = rmd_y * (1.0 / INTCELL);
519  double dz = rmd_z * (1.0 / INTCELL);
520 
521  double value = localfield_data[part[j + 0].localindex] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
522  localfield_data[part[j + 1].localindex] * (1.0 - dx) * (1.0 - dy) * dz +
523  localfield_data[part[j + 2].localindex] * (1.0 - dx) * dy * (1.0 - dz) +
524  localfield_data[part[j + 3].localindex] * (1.0 - dx) * dy * dz +
525  localfield_data[part[j + 4].localindex] * (dx) * (1.0 - dy) * (1.0 - dz) +
526  localfield_data[part[j + 5].localindex] * (dx) * (1.0 - dy) * dz +
527  localfield_data[part[j + 6].localindex] * (dx)*dy * (1.0 - dz) +
528  localfield_data[part[j + 7].localindex] * (dx)*dy * dz;
529 
530  if(dim < 0)
531  {
532 #ifdef EVALPOTENTIAL
533 #if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
534  P[i].PM_Potential += value * fac;
535 #else
536  P[i].Potential += value * fac;
537 #endif
538 #endif
539  }
540  else
541  {
542 #if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
543  Sp->P[i].GravPM[dim] += value;
544 #else
545  Sp->P[i].GravAccel[dim] += value;
546 #endif
547  }
548  }
549 }
550 
551 #else
552 
553 /*
554  * Here come the routines for a different communication algorithm that is better suited for a homogeneously loaded boxes.
555  */
556 
557 void pm_periodic::pmforce_uniform_optimized_prepare_density(int mode, int *typelist)
558 {
559  Sndpm_count = (size_t *)Mem.mymalloc("Sndpm_count", NTask * sizeof(size_t));
560  Sndpm_offset = (size_t *)Mem.mymalloc("Sndpm_offset", NTask * sizeof(size_t));
561  Rcvpm_count = (size_t *)Mem.mymalloc("Rcvpm_count", NTask * sizeof(size_t));
562  Rcvpm_offset = (size_t *)Mem.mymalloc("Rcvpm_offset", NTask * sizeof(size_t));
563 
564  particle_data *P = Sp->P;
565 
566  /* determine the slabs/columns each particles accesses */
567 
568 #ifdef FFT_COLUMN_BASED
569  int columns = GRIDX * GRIDY;
570  int avg = (columns - 1) / NTask + 1;
571  int exc = NTask * avg - columns;
572  int tasklastsection = NTask - exc;
573  int pivotcol = tasklastsection * avg;
574 #endif
575 
576  for(int rep = 0; rep < 2; rep++)
577  {
578  /* each threads needs to do the loop to clear its send_count[] array */
579  for(int j = 0; j < NTask; j++)
580  Sndpm_count[j] = 0;
581 
582  for(int idx = 0; idx < NSource; idx++)
583  {
584  int i = Sp->get_active_index(idx);
585 
586  if(P[i].Ti_Current != All.Ti_Current)
587  Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);
588 
589  if(mode) /* only for power spectrum calculation */
590  if(typelist[P[i].getType()] == 0)
591  continue;
592 
593  int slab_x;
594  if(mode == 2)
595  slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
596  else if(mode == 3)
597  slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
598  else
599  slab_x = P[i].IntPos[0] / INTCELL;
600 
601  int slab_xx = slab_x + 1;
602 
603  if(slab_xx >= GRIDX)
604  slab_xx = 0;
605 
606 #ifndef FFT_COLUMN_BASED
607  if(rep == 0)
608  {
609  int task0 = myplan.slab_to_task[slab_x];
610  int task1 = myplan.slab_to_task[slab_xx];
611 
612  Sndpm_count[task0]++;
613 
614  if(task0 != task1)
615  Sndpm_count[task1]++;
616  }
617  else
618  {
619  int task0 = myplan.slab_to_task[slab_x];
620  int task1 = myplan.slab_to_task[slab_xx];
621 
622  size_t ind0 = Sndpm_offset[task0] + Sndpm_count[task0]++;
623 #ifndef LEAN
624  partout[ind0].Mass = P[i].getMass();
625 #endif
626  for(int j = 0; j < 3; j++)
627  partout[ind0].IntPos[j] = P[i].IntPos[j];
628 
629  if(task0 != task1)
630  {
631  size_t ind1 = Sndpm_offset[task1] + Sndpm_count[task1]++;
632 #ifndef LEAN
633  partout[ind1].Mass = P[i].getMass();
634 #endif
635  for(int j = 0; j < 3; j++)
636  partout[ind1].IntPos[j] = P[i].IntPos[j];
637  }
638  }
639 
640 #else
641  int slab_y;
642  if(mode == 2)
643  slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
644  else if(mode == 3)
645  slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
646  else
647  slab_y = P[i].IntPos[1] / INTCELL;
648 
649  int slab_yy = slab_y + 1;
650 
651  if(slab_yy >= GRIDY)
652  slab_yy = 0;
653 
654  int column0 = slab_x * GRIDY + slab_y;
655  int column1 = slab_x * GRIDY + slab_yy;
656  int column2 = slab_xx * GRIDY + slab_y;
657  int column3 = slab_xx * GRIDY + slab_yy;
658 
659  int task0, task1, task2, task3;
660 
661  if(column0 < pivotcol)
662  task0 = column0 / avg;
663  else
664  task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
665 
666  if(column1 < pivotcol)
667  task1 = column1 / avg;
668  else
669  task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
670 
671  if(column2 < pivotcol)
672  task2 = column2 / avg;
673  else
674  task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
675 
676  if(column3 < pivotcol)
677  task3 = column3 / avg;
678  else
679  task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
680 
681  if(rep == 0)
682  {
683  Sndpm_count[task0]++;
684  if(task1 != task0)
685  Sndpm_count[task1]++;
686  if(task2 != task1 && task2 != task0)
687  Sndpm_count[task2]++;
688  if(task3 != task0 && task3 != task1 && task3 != task2)
689  Sndpm_count[task3]++;
690  }
691  else
692  {
693  size_t ind0 = Sndpm_offset[task0] + Sndpm_count[task0]++;
694 #ifndef LEAN
695  partout[ind0].Mass = P[i].getMass();
696 #endif
697  for(int j = 0; j < 3; j++)
698  partout[ind0].IntPos[j] = P[i].IntPos[j];
699 
700  if(task1 != task0)
701  {
702  size_t ind1 = Sndpm_offset[task1] + Sndpm_count[task1]++;
703 #ifndef LEAN
704  partout[ind1].Mass = P[i].getMass();
705 #endif
706  for(int j = 0; j < 3; j++)
707  partout[ind1].IntPos[j] = P[i].IntPos[j];
708  }
709  if(task2 != task1 && task2 != task0)
710  {
711  size_t ind2 = Sndpm_offset[task2] + Sndpm_count[task2]++;
712 #ifndef LEAN
713  partout[ind2].Mass = P[i].getMass();
714 #endif
715  for(int j = 0; j < 3; j++)
716  partout[ind2].IntPos[j] = P[i].IntPos[j];
717  }
718  if(task3 != task0 && task3 != task1 && task3 != task2)
719  {
720  size_t ind3 = Sndpm_offset[task3] + Sndpm_count[task3]++;
721 #ifndef LEAN
722  partout[ind3].Mass = P[i].getMass();
723 #endif
724  for(int j = 0; j < 3; j++)
725  partout[ind3].IntPos[j] = P[i].IntPos[j];
726  }
727  }
728 #endif
729  }
730 
731  if(rep == 0)
732  {
733  MPI_Alltoall(Sndpm_count, sizeof(size_t), MPI_BYTE, Rcvpm_count, sizeof(size_t), MPI_BYTE, Communicator);
734 
735  nimport = 0, nexport = 0, Rcvpm_offset[0] = 0, Sndpm_offset[0] = 0;
736  for(int j = 0; j < NTask; j++)
737  {
738  nexport += Sndpm_count[j];
739  nimport += Rcvpm_count[j];
740 
741  if(j > 0)
742  {
743  Sndpm_offset[j] = Sndpm_offset[j - 1] + Sndpm_count[j - 1];
744  Rcvpm_offset[j] = Rcvpm_offset[j - 1] + Rcvpm_count[j - 1];
745  }
746  }
747 
748  /* allocate import and export buffer */
749  partin = (partbuf *)Mem.mymalloc_movable(&partin, "partin", nimport * sizeof(partbuf));
750  partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
751  }
752  }
753 
754  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
755  * transfer the data in chunks.
756  */
757  int flag_big = 0, flag_big_all;
758  for(int i = 0; i < NTask; i++)
759  if(Sndpm_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
760  flag_big = 1;
761 
762  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
763 
764  /* exchange particle data */
765  myMPI_Alltoallv(partout, Sndpm_count, Sndpm_offset, partin, Rcvpm_count, Rcvpm_offset, sizeof(partbuf), flag_big_all, Communicator);
766 
767  Mem.myfree(partout);
768 
769  /* allocate cleared density field */
770  rhogrid = (fft_real *)Mem.mymalloc_movable_clear(&rhogrid, "rhogrid", maxfftsize * sizeof(fft_real));
771 
772 #ifndef FFT_COLUMN_BASED
773  /* bin particle data onto mesh, in multi-threaded fashion */
774 
775  for(size_t i = 0; i < nimport; i++)
776  {
777  int slab_x, slab_y, slab_z;
778  MyIntPosType rmd_x, rmd_y, rmd_z;
779 
780  if(mode == 2)
781  {
782  slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
783  rmd_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) % INTCELL;
784  slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
785  rmd_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) % INTCELL;
786  slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) / INTCELL;
787  rmd_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) % INTCELL;
788  }
789  else if(mode == 3)
790  {
791  slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
792  rmd_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
793  slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
794  rmd_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
795  slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
796  rmd_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
797  }
798  else
799  {
800  slab_x = partin[i].IntPos[0] / INTCELL;
801  rmd_x = partin[i].IntPos[0] % INTCELL;
802  slab_y = partin[i].IntPos[1] / INTCELL;
803  rmd_y = partin[i].IntPos[1] % INTCELL;
804  slab_z = partin[i].IntPos[2] / INTCELL;
805  rmd_z = partin[i].IntPos[2] % INTCELL;
806  }
807 
808  double dx = rmd_x * (1.0 / INTCELL);
809  double dy = rmd_y * (1.0 / INTCELL);
810  double dz = rmd_z * (1.0 / INTCELL);
811 
812  int slab_xx = slab_x + 1;
813  int slab_yy = slab_y + 1;
814  int slab_zz = slab_z + 1;
815 
816  if(slab_xx >= GRIDX)
817  slab_xx = 0;
818  if(slab_yy >= GRIDY)
819  slab_yy = 0;
820  if(slab_zz >= GRIDZ)
821  slab_zz = 0;
822 
823 #ifdef LEAN
824  double mass = All.PartMass;
825 #else
826  double mass = partin[i].Mass;
827 #endif
828 
829  if(myplan.slab_to_task[slab_x] == ThisTask)
830  {
831  slab_x -= myplan.first_slab_x_of_task[ThisTask];
832 
833  rhogrid[FI(slab_x, slab_y, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
834  rhogrid[FI(slab_x, slab_y, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
835 
836  rhogrid[FI(slab_x, slab_yy, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
837  rhogrid[FI(slab_x, slab_yy, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
838  }
839 
840  if(myplan.slab_to_task[slab_xx] == ThisTask)
841  {
842  slab_xx -= myplan.first_slab_x_of_task[ThisTask];
843 
844  rhogrid[FI(slab_xx, slab_y, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
845  rhogrid[FI(slab_xx, slab_y, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
846 
847  rhogrid[FI(slab_xx, slab_yy, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
848  rhogrid[FI(slab_xx, slab_yy, slab_zz)] += (mass * (dx) * (dy) * (dz));
849  }
850  }
851 
852 #else
853 
854  int first_col = myplan.firstcol_XY;
855  int last_col = myplan.firstcol_XY + myplan.ncol_XY - 1;
856 
857  for(size_t i = 0; i < nimport; i++)
858  {
859  int slab_x, slab_y;
860  MyIntPosType rmd_x, rmd_y;
861  if(mode == 2)
862  {
863  slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
864  rmd_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) % INTCELL;
865  slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
866  rmd_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) % INTCELL;
867  }
868  else if(mode == 3)
869  {
870  slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
871  rmd_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
872  slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
873  rmd_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
874  }
875  else
876  {
877  slab_x = partin[i].IntPos[0] / INTCELL;
878  rmd_x = partin[i].IntPos[0] % INTCELL;
879  slab_y = partin[i].IntPos[1] / INTCELL;
880  rmd_y = partin[i].IntPos[1] % INTCELL;
881  }
882 
883  double dx = rmd_x * (1.0 / INTCELL);
884  double dy = rmd_y * (1.0 / INTCELL);
885 
886  int slab_xx = slab_x + 1;
887  int slab_yy = slab_y + 1;
888 
889  if(slab_xx >= GRIDX)
890  slab_xx = 0;
891 
892  if(slab_yy >= GRIDY)
893  slab_yy = 0;
894 
895  int col0 = slab_x * GRIDY + slab_y;
896  int col1 = slab_x * GRIDY + slab_yy;
897  int col2 = slab_xx * GRIDY + slab_y;
898  int col3 = slab_xx * GRIDY + slab_yy;
899 
900 #ifdef LEAN
901  double mass = All.PartMass;
902 #else
903  double mass = partin[i].Mass;
904 #endif
905 
906  int slab_z;
907  MyIntPosType rmd_z;
908  if(mode == 2)
909  {
910  slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) / INTCELL;
911  rmd_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) % INTCELL;
912  }
913  else if(mode == 3)
914  {
915  slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
916  rmd_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
917  }
918  else
919  {
920  slab_z = partin[i].IntPos[2] / INTCELL;
921  rmd_z = partin[i].IntPos[2] % INTCELL;
922  }
923 
924  double dz = rmd_z * (1.0 / INTCELL);
925 
926  int slab_zz = slab_z + 1;
927 
928  if(slab_zz >= GRIDZ)
929  slab_zz = 0;
930 
931  if(col0 >= first_col && col0 <= last_col)
932  {
933  rhogrid[FCxy(col0, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
934  rhogrid[FCxy(col0, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
935  }
936 
937  if(col1 >= first_col && col1 <= last_col)
938  {
939  rhogrid[FCxy(col1, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
940  rhogrid[FCxy(col1, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
941  }
942 
943  if(col2 >= first_col && col2 <= last_col)
944  {
945  rhogrid[FCxy(col2, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
946  rhogrid[FCxy(col2, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
947  }
948 
949  if(col3 >= first_col && col3 <= last_col)
950  {
951  rhogrid[FCxy(col3, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
952  rhogrid[FCxy(col3, slab_zz)] += (mass * (dx) * (dy) * (dz));
953  }
954  }
955 
956 #endif
957 }
958 
959 /* If dim<0, this function reads out the potential, otherwise Cartesian force components.
960  */
961 void pm_periodic::pmforce_uniform_optimized_readout_forces_or_potential_xy(fft_real *grid, int dim)
962 {
963  particle_data *P = Sp->P;
964 
965 #ifdef EVALPOTENTIAL
966 #ifdef GRAVITY_TALLBOX
967  double fac = 1.0 / (((double)GRIDX) * GRIDY * GRIDZ); /* to get potential */
968 #else
969  double fac = 4 * M_PI * (LONG_X * LONG_Y * LONG_Z) / pow(All.BoxSize, 3); /* to get potential */
970 #endif
971 #endif
972 
973  MyFloat *flistin = (MyFloat *)Mem.mymalloc("flistin", nimport * sizeof(MyFloat));
974  MyFloat *flistout = (MyFloat *)Mem.mymalloc("flistout", nexport * sizeof(MyFloat));
975 
976 #ifdef FFT_COLUMN_BASED
977  int columns = GRIDX * GRIDY;
978  int avg = (columns - 1) / NTask + 1;
979  int exc = NTask * avg - columns;
980  int tasklastsection = NTask - exc;
981  int pivotcol = tasklastsection * avg;
982 #endif
983 
984  for(size_t i = 0; i < nimport; i++)
985  {
986  flistin[i] = 0;
987 
988  int slab_x = partin[i].IntPos[0] / INTCELL;
989  int slab_y = partin[i].IntPos[1] / INTCELL;
990  int slab_z = partin[i].IntPos[2] / INTCELL;
991 
992  MyIntPosType rmd_x = partin[i].IntPos[0] % INTCELL;
993  MyIntPosType rmd_y = partin[i].IntPos[1] % INTCELL;
994  MyIntPosType rmd_z = partin[i].IntPos[2] % INTCELL;
995 
996  double dx = rmd_x * (1.0 / INTCELL);
997  double dy = rmd_y * (1.0 / INTCELL);
998  double dz = rmd_z * (1.0 / INTCELL);
999 
1000  int slab_xx = slab_x + 1;
1001  int slab_yy = slab_y + 1;
1002  int slab_zz = slab_z + 1;
1003 
1004  if(slab_xx >= GRIDX)
1005  slab_xx = 0;
1006  if(slab_yy >= GRIDY)
1007  slab_yy = 0;
1008  if(slab_zz >= GRIDZ)
1009  slab_zz = 0;
1010 
1011 #ifndef FFT_COLUMN_BASED
1012  if(myplan.slab_to_task[slab_x] == ThisTask)
1013  {
1014  slab_x -= myplan.first_slab_x_of_task[ThisTask];
1015 
1016  flistin[i] += grid[FI(slab_x, slab_y, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
1017  grid[FI(slab_x, slab_y, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz) +
1018  grid[FI(slab_x, slab_yy, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) +
1019  grid[FI(slab_x, slab_yy, slab_zz)] * (1.0 - dx) * (dy) * (dz);
1020  }
1021 
1022  if(myplan.slab_to_task[slab_xx] == ThisTask)
1023  {
1024  slab_xx -= myplan.first_slab_x_of_task[ThisTask];
1025 
1026  flistin[i] += grid[FI(slab_xx, slab_y, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) +
1027  grid[FI(slab_xx, slab_y, slab_zz)] * (dx) * (1.0 - dy) * (dz) +
1028  grid[FI(slab_xx, slab_yy, slab_z)] * (dx) * (dy) * (1.0 - dz) +
1029  grid[FI(slab_xx, slab_yy, slab_zz)] * (dx) * (dy) * (dz);
1030  }
1031 #else
1032  int column0 = slab_x * GRIDY + slab_y;
1033  int column1 = slab_x * GRIDY + slab_yy;
1034  int column2 = slab_xx * GRIDY + slab_y;
1035  int column3 = slab_xx * GRIDY + slab_yy;
1036 
1037  if(column0 >= myplan.firstcol_XY && column0 <= myplan.lastcol_XY)
1038  {
1039  flistin[i] += grid[FCxy(column0, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
1040  grid[FCxy(column0, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz);
1041  }
1042  if(column1 >= myplan.firstcol_XY && column1 <= myplan.lastcol_XY)
1043  {
1044  flistin[i] +=
1045  grid[FCxy(column1, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) + grid[FCxy(column1, slab_zz)] * (1.0 - dx) * (dy) * (dz);
1046  }
1047 
1048  if(column2 >= myplan.firstcol_XY && column2 <= myplan.lastcol_XY)
1049  {
1050  flistin[i] +=
1051  grid[FCxy(column2, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) + grid[FCxy(column2, slab_zz)] * (dx) * (1.0 - dy) * (dz);
1052  }
1053 
1054  if(column3 >= myplan.firstcol_XY && column3 <= myplan.lastcol_XY)
1055  {
1056  flistin[i] += grid[FCxy(column3, slab_z)] * (dx) * (dy) * (1.0 - dz) + grid[FCxy(column3, slab_zz)] * (dx) * (dy) * (dz);
1057  }
1058 #endif
1059  }
1060 
1061  /* exchange the potential component data */
1062  int flag_big = 0, flag_big_all;
1063  for(int i = 0; i < NTask; i++)
1064  if(Sndpm_count[i] * sizeof(MyFloat) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
1065  flag_big = 1;
1066 
1067  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
1068  * transfer the data in chunks.
1069  */
1070  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
1071 
1072  /* exchange data */
1073  myMPI_Alltoallv(flistin, Rcvpm_count, Rcvpm_offset, flistout, Sndpm_count, Sndpm_offset, sizeof(MyFloat), flag_big_all,
1074  Communicator);
1075 
1076  /* each threads needs to do the loop to clear its send_count[] array */
1077  for(int j = 0; j < NTask; j++)
1078  Sndpm_count[j] = 0;
1079 
1080  for(int idx = 0; idx < NSource; idx++)
1081  {
1082  int i = Sp->get_active_index(idx);
1083 
1084  int slab_x = P[i].IntPos[0] / INTCELL;
1085  int slab_xx = slab_x + 1;
1086 
1087  if(slab_xx >= GRIDX)
1088  slab_xx = 0;
1089 
1090 #ifndef FFT_COLUMN_BASED
1091  int task0 = myplan.slab_to_task[slab_x];
1092  int task1 = myplan.slab_to_task[slab_xx];
1093 
1094  double value = flistout[Sndpm_offset[task0] + Sndpm_count[task0]++];
1095 
1096  if(task0 != task1)
1097  value += flistout[Sndpm_offset[task1] + Sndpm_count[task1]++];
1098 #else
1099  int slab_y = P[i].IntPos[1] / INTCELL;
1100  int slab_yy = slab_y + 1;
1101 
1102  if(slab_yy >= GRIDY)
1103  slab_yy = 0;
1104 
1105  int column0 = slab_x * GRIDY + slab_y;
1106  int column1 = slab_x * GRIDY + slab_yy;
1107  int column2 = slab_xx * GRIDY + slab_y;
1108  int column3 = slab_xx * GRIDY + slab_yy;
1109 
1110  int task0, task1, task2, task3;
1111 
1112  if(column0 < pivotcol)
1113  task0 = column0 / avg;
1114  else
1115  task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
1116 
1117  if(column1 < pivotcol)
1118  task1 = column1 / avg;
1119  else
1120  task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
1121 
1122  if(column2 < pivotcol)
1123  task2 = column2 / avg;
1124  else
1125  task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
1126 
1127  if(column3 < pivotcol)
1128  task3 = column3 / avg;
1129  else
1130  task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
1131 
1132  double value = flistout[Sndpm_offset[task0] + Sndpm_count[task0]++];
1133 
1134  if(task1 != task0)
1135  value += flistout[Sndpm_offset[task1] + Sndpm_count[task1]++];
1136 
1137  if(task2 != task1 && task2 != task0)
1138  value += flistout[Sndpm_offset[task2] + Sndpm_count[task2]++];
1139 
1140  if(task3 != task0 && task3 != task1 && task3 != task2)
1141  value += flistout[Sndpm_offset[task3] + Sndpm_count[task3]++];
1142 #endif
1143 
1144 #if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
1145  if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
1146  continue;
1147 #endif
1148 
1149  if(dim < 0)
1150  {
1151 #ifdef EVALPOTENTIAL
1152 #if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
1153  Sp->P[i].PM_Potential += value * fac;
1154 #else
1155  Sp->P[i].Potential += value * fac;
1156 #endif
1157 #endif
1158  }
1159  else
1160  {
1161 #if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
1162  Sp->P[i].GravPM[dim] += value;
1163 #else
1164  Sp->P[i].GravAccel[dim] += value;
1165 #endif
1166  }
1167  }
1168 
1169  Mem.myfree(flistout);
1170  Mem.myfree(flistin);
1171 }
1172 
1173 #ifdef FFT_COLUMN_BASED
1174 void pm_periodic::pmforce_uniform_optimized_readout_forces_or_potential_xz(fft_real *grid, int dim)
1175 {
1176  if(dim != 1)
1177  Terminate("bummer");
1178 
1179  size_t *send_count = (size_t *)Mem.mymalloc("send_count", NTask * sizeof(size_t));
1180  size_t *send_offset = (size_t *)Mem.mymalloc("send_offset", NTask * sizeof(size_t));
1181  size_t *recv_count = (size_t *)Mem.mymalloc("recv_count", NTask * sizeof(size_t));
1182  size_t *recv_offset = (size_t *)Mem.mymalloc("recv_offset", NTask * sizeof(size_t));
1183 
1184  struct partbuf
1185  {
1186  MyIntPosType IntPos[3];
1187  };
1188 
1189  partbuf *partin, *partout;
1190  size_t nimport = 0, nexport = 0;
1191 
1192  particle_data *P = Sp->P;
1193 
1194  int columns = GRIDX * GRID2;
1195  int avg = (columns - 1) / NTask + 1;
1196  int exc = NTask * avg - columns;
1197  int tasklastsection = NTask - exc;
1198  int pivotcol = tasklastsection * avg;
1199 
1200  /* determine the slabs/columns each particles accesses */
1201  for(int rep = 0; rep < 2; rep++)
1202  {
1203  for(int j = 0; j < NTask; j++)
1204  send_count[j] = 0;
1205 
1206  for(int idx = 0; idx < NSource; idx++)
1207  {
1208  int i = Sp->get_active_index(idx);
1209 
1210  if(P[i].Ti_Current != All.Ti_Current)
1211  Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);
1212 
1213  int slab_x = P[i].IntPos[0] / INTCELL;
1214  int slab_xx = slab_x + 1;
1215 
1216  if(slab_xx >= GRIDX)
1217  slab_xx = 0;
1218 
1219  int slab_z = P[i].IntPos[2] / INTCELL;
1220  int slab_zz = slab_z + 1;
1221 
1222  if(slab_zz >= GRIDZ)
1223  slab_zz = 0;
1224 
1225  int column0 = slab_x * GRID2 + slab_z;
1226  int column1 = slab_x * GRID2 + slab_zz;
1227  int column2 = slab_xx * GRID2 + slab_z;
1228  int column3 = slab_xx * GRID2 + slab_zz;
1229 
1230  int task0, task1, task2, task3;
1231 
1232  if(column0 < pivotcol)
1233  task0 = column0 / avg;
1234  else
1235  task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
1236 
1237  if(column1 < pivotcol)
1238  task1 = column1 / avg;
1239  else
1240  task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
1241 
1242  if(column2 < pivotcol)
1243  task2 = column2 / avg;
1244  else
1245  task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
1246 
1247  if(column3 < pivotcol)
1248  task3 = column3 / avg;
1249  else
1250  task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
1251 
1252  if(rep == 0)
1253  {
1254  send_count[task0]++;
1255  if(task1 != task0)
1256  send_count[task1]++;
1257  if(task2 != task1 && task2 != task0)
1258  send_count[task2]++;
1259  if(task3 != task0 && task3 != task1 && task3 != task2)
1260  send_count[task3]++;
1261  }
1262  else
1263  {
1264  size_t ind0 = send_offset[task0] + send_count[task0]++;
1265  for(int j = 0; j < 3; j++)
1266  partout[ind0].IntPos[j] = P[i].IntPos[j];
1267 
1268  if(task1 != task0)
1269  {
1270  size_t ind1 = send_offset[task1] + send_count[task1]++;
1271  for(int j = 0; j < 3; j++)
1272  partout[ind1].IntPos[j] = P[i].IntPos[j];
1273  }
1274  if(task2 != task1 && task2 != task0)
1275  {
1276  size_t ind2 = send_offset[task2] + send_count[task2]++;
1277 
1278  for(int j = 0; j < 3; j++)
1279  partout[ind2].IntPos[j] = P[i].IntPos[j];
1280  }
1281  if(task3 != task0 && task3 != task1 && task3 != task2)
1282  {
1283  size_t ind3 = send_offset[task3] + send_count[task3]++;
1284 
1285  for(int j = 0; j < 3; j++)
1286  partout[ind3].IntPos[j] = P[i].IntPos[j];
1287  }
1288  }
1289  }
1290 
1291  if(rep == 0)
1292  {
1293  MPI_Alltoall(send_count, sizeof(size_t), MPI_BYTE, recv_count, sizeof(size_t), MPI_BYTE, Communicator);
1294 
1295  nimport = 0, nexport = 0;
1296  recv_offset[0] = send_offset[0] = 0;
1297 
1298  for(int j = 0; j < NTask; j++)
1299  {
1300  nexport += send_count[j];
1301  nimport += recv_count[j];
1302 
1303  if(j > 0)
1304  {
1305  send_offset[j] = send_offset[j - 1] + send_count[j - 1];
1306  recv_offset[j] = recv_offset[j - 1] + recv_count[j - 1];
1307  }
1308  }
1309 
1310  /* allocate import and export buffer */
1311  partin = (partbuf *)Mem.mymalloc("partin", nimport * sizeof(partbuf));
1312  partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
1313  }
1314  }
1315 
1316  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
1317  * transfer the data in chunks.
1318  */
1319  int flag_big = 0, flag_big_all;
1320  for(int i = 0; i < NTask; i++)
1321  if(send_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
1322  flag_big = 1;
1323 
1324  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
1325 
1326  /* exchange particle data */
1327  myMPI_Alltoallv(partout, send_count, send_offset, partin, recv_count, recv_offset, sizeof(partbuf), flag_big_all, Communicator);
1328 
1329  Mem.myfree(partout);
1330 
1331  MyFloat *flistin = (MyFloat *)Mem.mymalloc("flistin", nimport * sizeof(MyFloat));
1332  MyFloat *flistout = (MyFloat *)Mem.mymalloc("flistout", nexport * sizeof(MyFloat));
1333 
1334  for(size_t i = 0; i < nimport; i++)
1335  {
1336  flistin[i] = 0;
1337 
1338  int slab_x = partin[i].IntPos[0] / INTCELL;
1339  int slab_y = partin[i].IntPos[1] / INTCELL;
1340  int slab_z = partin[i].IntPos[2] / INTCELL;
1341 
1342  MyIntPosType rmd_x = partin[i].IntPos[0] % INTCELL;
1343  MyIntPosType rmd_y = partin[i].IntPos[1] % INTCELL;
1344  MyIntPosType rmd_z = partin[i].IntPos[2] % INTCELL;
1345 
1346  double dx = rmd_x * (1.0 / INTCELL);
1347  double dy = rmd_y * (1.0 / INTCELL);
1348  double dz = rmd_z * (1.0 / INTCELL);
1349 
1350  int slab_xx = slab_x + 1;
1351  int slab_yy = slab_y + 1;
1352  int slab_zz = slab_z + 1;
1353 
1354  if(slab_xx >= GRIDX)
1355  slab_xx = 0;
1356  if(slab_yy >= GRIDY)
1357  slab_yy = 0;
1358  if(slab_zz >= GRIDZ)
1359  slab_zz = 0;
1360 
1361  int column0 = slab_x * GRID2 + slab_z;
1362  int column1 = slab_x * GRID2 + slab_zz;
1363  int column2 = slab_xx * GRID2 + slab_z;
1364  int column3 = slab_xx * GRID2 + slab_zz;
1365 
1366  if(column0 >= myplan.firstcol_XZ && column0 <= myplan.lastcol_XZ)
1367  {
1368  flistin[i] += grid[FCxz(column0, slab_y)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
1369  grid[FCxz(column0, slab_yy)] * (1.0 - dx) * (dy) * (1.0 - dz);
1370  }
1371  if(column1 >= myplan.firstcol_XZ && column1 <= myplan.lastcol_XZ)
1372  {
1373  flistin[i] +=
1374  grid[FCxz(column1, slab_y)] * (1.0 - dx) * (1.0 - dy) * (dz) + grid[FCxz(column1, slab_yy)] * (1.0 - dx) * (dy) * (dz);
1375  }
1376 
1377  if(column2 >= myplan.firstcol_XZ && column2 <= myplan.lastcol_XZ)
1378  {
1379  flistin[i] +=
1380  grid[FCxz(column2, slab_y)] * (dx) * (1.0 - dy) * (1.0 - dz) + grid[FCxz(column2, slab_yy)] * (dx) * (dy) * (1.0 - dz);
1381  }
1382 
1383  if(column3 >= myplan.firstcol_XZ && column3 <= myplan.lastcol_XZ)
1384  {
1385  flistin[i] += grid[FCxz(column3, slab_y)] * (dx) * (1.0 - dy) * (dz) + grid[FCxz(column3, slab_yy)] * (dx) * (dy) * (dz);
1386  }
1387  }
1388 
1389  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
1390  * transfer the data in chunks.
1391  */
1392  flag_big = 0;
1393  for(int i = 0; i < NTask; i++)
1394  if(send_count[i] * sizeof(MyFloat) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
1395  flag_big = 1;
1396 
1397  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
1398 
1399  /* exchange data */
1400  myMPI_Alltoallv(flistin, recv_count, recv_offset, flistout, send_count, send_offset, sizeof(MyFloat), flag_big_all, Communicator);
1401 
1402  for(int j = 0; j < NTask; j++)
1403  send_count[j] = 0;
1404 
1405  /* now assign to original points */
1406  for(int idx = 0; idx < NSource; idx++)
1407  {
1408  int i = Sp->get_active_index(idx);
1409 
1410  int slab_x = P[i].IntPos[0] / INTCELL;
1411  int slab_xx = slab_x + 1;
1412 
1413  if(slab_xx >= GRIDX)
1414  slab_xx = 0;
1415 
1416  int slab_z = P[i].IntPos[2] / INTCELL;
1417  int slab_zz = slab_z + 1;
1418 
1419  if(slab_zz >= GRIDZ)
1420  slab_zz = 0;
1421 
1422  int column0 = slab_x * GRID2 + slab_z;
1423  int column1 = slab_x * GRID2 + slab_zz;
1424  int column2 = slab_xx * GRID2 + slab_z;
1425  int column3 = slab_xx * GRID2 + slab_zz;
1426 
1427  int task0, task1, task2, task3;
1428 
1429  if(column0 < pivotcol)
1430  task0 = column0 / avg;
1431  else
1432  task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
1433 
1434  if(column1 < pivotcol)
1435  task1 = column1 / avg;
1436  else
1437  task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
1438 
1439  if(column2 < pivotcol)
1440  task2 = column2 / avg;
1441  else
1442  task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
1443 
1444  if(column3 < pivotcol)
1445  task3 = column3 / avg;
1446  else
1447  task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
1448 
1449  double value = flistout[send_offset[task0] + send_count[task0]++];
1450 
1451  if(task1 != task0)
1452  value += flistout[send_offset[task1] + send_count[task1]++];
1453 
1454  if(task2 != task1 && task2 != task0)
1455  value += flistout[send_offset[task2] + send_count[task2]++];
1456 
1457  if(task3 != task0 && task3 != task1 && task3 != task2)
1458  value += flistout[send_offset[task3] + send_count[task3]++];
1459 
1460 #if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
1461  if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
1462  continue;
1463 #endif
1464 
1465 #if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
1466  Sp->P[i].GravPM[dim] += value;
1467 #else
1468  Sp->P[i].GravAccel[dim] += value;
1469 #endif
1470  }
1471 
1472  Mem.myfree(flistout);
1473  Mem.myfree(flistin);
1474  Mem.myfree(partin);
1475  Mem.myfree(recv_offset);
1476  Mem.myfree(recv_count);
1477  Mem.myfree(send_offset);
1478  Mem.myfree(send_count);
1479 }
1480 
1481 void pm_periodic::pmforce_uniform_optimized_readout_forces_or_potential_zy(fft_real *grid, int dim)
1482 {
1483  if(dim != 0)
1484  Terminate("bummer");
1485 
1486  size_t *send_count = (size_t *)Mem.mymalloc("send_count", NTask * sizeof(size_t));
1487  size_t *send_offset = (size_t *)Mem.mymalloc("send_offset", NTask * sizeof(size_t));
1488  size_t *recv_count = (size_t *)Mem.mymalloc("recv_count", NTask * sizeof(size_t));
1489  size_t *recv_offset = (size_t *)Mem.mymalloc("recv_offset", NTask * sizeof(size_t));
1490 
1491  struct partbuf
1492  {
1493  MyIntPosType IntPos[3];
1494  };
1495 
1496  partbuf *partin, *partout;
1497  size_t nimport = 0, nexport = 0;
1498 
1499  particle_data *P = Sp->P;
1500 
1501  int columns = GRID2 * GRIDY;
1502  int avg = (columns - 1) / NTask + 1;
1503  int exc = NTask * avg - columns;
1504  int tasklastsection = NTask - exc;
1505  int pivotcol = tasklastsection * avg;
1506 
1507  /* determine the slabs/columns each particles accesses */
1508  for(int rep = 0; rep < 2; rep++)
1509  {
1510  for(int j = 0; j < NTask; j++)
1511  send_count[j] = 0;
1512 
1513  for(int idx = 0; idx < NSource; idx++)
1514  {
1515  int i = Sp->get_active_index(idx);
1516 
1517  if(P[i].Ti_Current != All.Ti_Current)
1518  Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);
1519 
1520  int slab_z = P[i].IntPos[2] / INTCELL;
1521  int slab_zz = slab_z + 1;
1522 
1523  if(slab_zz >= GRIDZ)
1524  slab_zz = 0;
1525 
1526  int slab_y = P[i].IntPos[1] / INTCELL;
1527  int slab_yy = slab_y + 1;
1528 
1529  if(slab_yy >= GRIDY)
1530  slab_yy = 0;
1531 
1532  int column0 = slab_z * GRIDY + slab_y;
1533  int column1 = slab_z * GRIDY + slab_yy;
1534  int column2 = slab_zz * GRIDY + slab_y;
1535  int column3 = slab_zz * GRIDY + slab_yy;
1536 
1537  int task0, task1, task2, task3;
1538 
1539  if(column0 < pivotcol)
1540  task0 = column0 / avg;
1541  else
1542  task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
1543 
1544  if(column1 < pivotcol)
1545  task1 = column1 / avg;
1546  else
1547  task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
1548 
1549  if(column2 < pivotcol)
1550  task2 = column2 / avg;
1551  else
1552  task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
1553 
1554  if(column3 < pivotcol)
1555  task3 = column3 / avg;
1556  else
1557  task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
1558 
1559  if(rep == 0)
1560  {
1561  send_count[task0]++;
1562  if(task1 != task0)
1563  send_count[task1]++;
1564  if(task2 != task1 && task2 != task0)
1565  send_count[task2]++;
1566  if(task3 != task0 && task3 != task1 && task3 != task2)
1567  send_count[task3]++;
1568  }
1569  else
1570  {
1571  size_t ind0 = send_offset[task0] + send_count[task0]++;
1572  for(int j = 0; j < 3; j++)
1573  partout[ind0].IntPos[j] = P[i].IntPos[j];
1574 
1575  if(task1 != task0)
1576  {
1577  size_t ind1 = send_offset[task1] + send_count[task1]++;
1578  for(int j = 0; j < 3; j++)
1579  partout[ind1].IntPos[j] = P[i].IntPos[j];
1580  }
1581  if(task2 != task1 && task2 != task0)
1582  {
1583  size_t ind2 = send_offset[task2] + send_count[task2]++;
1584 
1585  for(int j = 0; j < 3; j++)
1586  partout[ind2].IntPos[j] = P[i].IntPos[j];
1587  }
1588  if(task3 != task0 && task3 != task1 && task3 != task2)
1589  {
1590  size_t ind3 = send_offset[task3] + send_count[task3]++;
1591 
1592  for(int j = 0; j < 3; j++)
1593  partout[ind3].IntPos[j] = P[i].IntPos[j];
1594  }
1595  }
1596  }
1597 
1598  if(rep == 0)
1599  {
1600  MPI_Alltoall(send_count, sizeof(size_t), MPI_BYTE, recv_count, sizeof(size_t), MPI_BYTE, Communicator);
1601 
1602  nimport = 0, nexport = 0;
1603  recv_offset[0] = send_offset[0] = 0;
1604 
1605  for(int j = 0; j < NTask; j++)
1606  {
1607  nexport += send_count[j];
1608  nimport += recv_count[j];
1609 
1610  if(j > 0)
1611  {
1612  send_offset[j] = send_offset[j - 1] + send_count[j - 1];
1613  recv_offset[j] = recv_offset[j - 1] + recv_count[j - 1];
1614  }
1615  }
1616 
1617  /* allocate import and export buffer */
1618  partin = (partbuf *)Mem.mymalloc("partin", nimport * sizeof(partbuf));
1619  partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
1620  }
1621  }
1622 
1623  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
1624  * transfer the data in chunks.
1625  */
1626  int flag_big = 0, flag_big_all;
1627  for(int i = 0; i < NTask; i++)
1628  if(send_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
1629  flag_big = 1;
1630 
1631  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
1632 
1633  /* exchange particle data */
1634  myMPI_Alltoallv(partout, send_count, send_offset, partin, recv_count, recv_offset, sizeof(partbuf), flag_big_all, Communicator);
1635 
1636  Mem.myfree(partout);
1637 
1638  MyFloat *flistin = (MyFloat *)Mem.mymalloc("flistin", nimport * sizeof(MyFloat));
1639  MyFloat *flistout = (MyFloat *)Mem.mymalloc("flistout", nexport * sizeof(MyFloat));
1640 
1641  for(size_t i = 0; i < nimport; i++)
1642  {
1643  flistin[i] = 0;
1644 
1645  int slab_x = partin[i].IntPos[0] / INTCELL;
1646  int slab_y = partin[i].IntPos[1] / INTCELL;
1647  int slab_z = partin[i].IntPos[2] / INTCELL;
1648 
1649  MyIntPosType rmd_x = partin[i].IntPos[0] % INTCELL;
1650  MyIntPosType rmd_y = partin[i].IntPos[1] % INTCELL;
1651  MyIntPosType rmd_z = partin[i].IntPos[2] % INTCELL;
1652 
1653  double dx = rmd_x * (1.0 / INTCELL);
1654  double dy = rmd_y * (1.0 / INTCELL);
1655  double dz = rmd_z * (1.0 / INTCELL);
1656 
1657  int slab_xx = slab_x + 1;
1658  int slab_yy = slab_y + 1;
1659  int slab_zz = slab_z + 1;
1660 
1661  if(slab_xx >= GRIDX)
1662  slab_xx = 0;
1663  if(slab_yy >= GRIDY)
1664  slab_yy = 0;
1665  if(slab_zz >= GRIDZ)
1666  slab_zz = 0;
1667 
1668  int column0 = slab_z * GRIDY + slab_y;
1669  int column1 = slab_z * GRIDY + slab_yy;
1670  int column2 = slab_zz * GRIDY + slab_y;
1671  int column3 = slab_zz * GRIDY + slab_yy;
1672 
1673  if(column0 >= myplan.firstcol_ZY && column0 <= myplan.lastcol_ZY)
1674  {
1675  flistin[i] += grid[FCzy(column0, slab_x)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
1676  grid[FCzy(column0, slab_xx)] * (dx) * (1.0 - dy) * (1.0 - dz);
1677  }
1678  if(column1 >= myplan.firstcol_ZY && column1 <= myplan.lastcol_ZY)
1679  {
1680  flistin[i] +=
1681  grid[FCzy(column1, slab_x)] * (1.0 - dx) * (dy) * (1.0 - dz) + grid[FCzy(column1, slab_xx)] * (dx) * (dy) * (1.0 - dz);
1682  }
1683 
1684  if(column2 >= myplan.firstcol_ZY && column2 <= myplan.lastcol_ZY)
1685  {
1686  flistin[i] +=
1687  grid[FCzy(column2, slab_x)] * (1.0 - dx) * (1.0 - dy) * (dz) + grid[FCzy(column2, slab_xx)] * (dx) * (1.0 - dy) * (dz);
1688  }
1689 
1690  if(column3 >= myplan.firstcol_ZY && column3 <= myplan.lastcol_ZY)
1691  {
1692  flistin[i] += grid[FCzy(column3, slab_x)] * (1.0 - dx) * (dy) * (dz) + grid[FCzy(column3, slab_xx)] * (dx) * (dy) * (dz);
1693  }
1694  }
1695 
1696  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
1697  * transfer the data in chunks.
1698  */
1699  flag_big = 0;
1700  for(int i = 0; i < NTask; i++)
1701  if(send_count[i] * sizeof(MyFloat) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
1702  flag_big = 1;
1703 
1704  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
1705 
1706  /* exchange data */
1707  myMPI_Alltoallv(flistin, recv_count, recv_offset, flistout, send_count, send_offset, sizeof(MyFloat), flag_big_all, Communicator);
1708 
1709  for(int j = 0; j < NTask; j++)
1710  send_count[j] = 0;
1711 
1712  /* now assign to original points */
1713  for(int idx = 0; idx < NSource; idx++)
1714  {
1715  int i = Sp->get_active_index(idx);
1716 
1717  int slab_z = P[i].IntPos[2] / INTCELL;
1718  int slab_zz = slab_z + 1;
1719 
1720  if(slab_zz >= GRIDZ)
1721  slab_zz = 0;
1722 
1723  int slab_y = P[i].IntPos[1] / INTCELL;
1724  int slab_yy = slab_y + 1;
1725 
1726  if(slab_yy >= GRIDY)
1727  slab_yy = 0;
1728 
1729  int column0 = slab_z * GRIDY + slab_y;
1730  int column1 = slab_z * GRIDY + slab_yy;
1731  int column2 = slab_zz * GRIDY + slab_y;
1732  int column3 = slab_zz * GRIDY + slab_yy;
1733 
1734  int task0, task1, task2, task3;
1735 
1736  if(column0 < pivotcol)
1737  task0 = column0 / avg;
1738  else
1739  task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
1740 
1741  if(column1 < pivotcol)
1742  task1 = column1 / avg;
1743  else
1744  task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
1745 
1746  if(column2 < pivotcol)
1747  task2 = column2 / avg;
1748  else
1749  task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
1750 
1751  if(column3 < pivotcol)
1752  task3 = column3 / avg;
1753  else
1754  task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
1755 
1756  double value = flistout[send_offset[task0] + send_count[task0]++];
1757 
1758  if(task1 != task0)
1759  value += flistout[send_offset[task1] + send_count[task1]++];
1760 
1761  if(task2 != task1 && task2 != task0)
1762  value += flistout[send_offset[task2] + send_count[task2]++];
1763 
1764  if(task3 != task0 && task3 != task1 && task3 != task2)
1765  value += flistout[send_offset[task3] + send_count[task3]++];
1766 
1767 #if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
1768  if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
1769  continue;
1770 #endif
1771 
1772 #if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
1773  Sp->P[i].GravPM[dim] += value;
1774 #else
1775  Sp->P[i].GravAccel[dim] += value;
1776 #endif
1777  }
1778 
1779  Mem.myfree(flistout);
1780  Mem.myfree(flistin);
1781  Mem.myfree(partin);
1782  Mem.myfree(recv_offset);
1783  Mem.myfree(recv_count);
1784  Mem.myfree(send_offset);
1785  Mem.myfree(send_count);
1786 }
1787 #endif
1788 
1789 #endif
1790 
1803 void pm_periodic::pmforce_periodic(int mode, int *typelist)
1804 {
1805  int x, y, z;
1806 
1807  double tstart = Logs.second();
1808 
1809  if(mode == 0)
1810  mpi_printf("PM-PERIODIC: Starting periodic PM calculation. (Rcut=%g) presently allocated=%g MB\n", Sp->Rcut[0],
1812 
1813 #ifdef HIERARCHICAL_GRAVITY
1814  NSource = Sp->TimeBinsGravity.NActiveParticles;
1815 #else
1816  NSource = Sp->NumPart;
1817 #endif
1818 
1819 #ifndef TREEPM_NOTIMESPLIT
1820  if(NSource != Sp->NumPart)
1821  Terminate("unexpected NSource != Sp->NumPart");
1822 #endif
1823 
1824 #ifndef NUMPART_PER_TASK_LARGE
1825  if((((long long)Sp->NumPart) << 3) >= (((long long)1) << 31))
1826  Terminate("We are dealing with a too large particle number per MPI rank - enabling NUMPART_PER_TASK_LARGE might help.");
1827 #endif
1828 
1829  double asmth2 = Sp->Asmth[0] * Sp->Asmth[0];
1830  double d = All.BoxSize / PMGRID;
1831  double dhalf = 0.5 * d;
1832 
1833 #ifdef GRAVITY_TALLBOX
1834  double fac = 1.0 / (((double)GRIDX) * GRIDY * GRIDZ); /* to get potential */
1835 #else
1836  double fac = 4 * M_PI * (LONG_X * LONG_Y * LONG_Z) / pow(All.BoxSize, 3); /* to get potential */
1837 #endif
1838 
1839  fac *= 1 / (2 * d); /* for finite differencing */
1840 
1841 #ifdef PM_ZOOM_OPTIMIZED
1842  pmforce_zoom_optimized_prepare_density(mode, typelist);
1843 #else
1844  pmforce_uniform_optimized_prepare_density(mode, typelist);
1845 #endif
1846 
1847  /* note: after density, we still keep the field 'partin' from the density assignment,
1848  * as we can use this later on to return potential and z-force
1849  */
1850 
1851  /* allocate the memory to hold the FFT fields */
1852 
1853  forcegrid = (fft_real *)Mem.mymalloc_movable(&forcegrid, "forcegrid", maxfftsize * sizeof(fft_real));
1854 
1855  workspace = forcegrid;
1856 
1857 #ifndef FFT_COLUMN_BASED
1858  fft_of_rhogrid = (fft_complex *)&rhogrid[0];
1859 #else
1860  fft_of_rhogrid = (fft_complex *)&workspace[0];
1861 #endif
1862 
1863  /* Do the FFT of the density field */
1864 #ifndef FFT_COLUMN_BASED
1865  my_slab_based_fft(&myplan, &rhogrid[0], &workspace[0], 1);
1866 #else
1867  my_column_based_fft(&myplan, rhogrid, workspace, 1); /* result is in workspace, not in rhogrid ! */
1868 #endif
1869 
1870  if(mode != 0)
1871  {
1872  pmforce_measure_powerspec(mode - 1, typelist);
1873 
1874 #if defined(FFT_COLUMN_BASED) && !defined(PM_ZOOM_OPTIMIZED)
1875  Mem.myfree_movable(partin);
1876  partin = NULL;
1877 #endif
1878  }
1879  else
1880  {
1881  /* multiply with Green's function in order to obtain the potential (or forces for spectral diffencing) */
1882 
1883  double kfacx = 2.0 * M_PI / (GRIDX * d);
1884  double kfacy = 2.0 * M_PI / (GRIDY * d);
1885  double kfacz = 2.0 * M_PI / (GRIDZ * d);
1886 
1887 #ifdef FFT_COLUMN_BASED
1888  for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
1889  {
1890  large_array_offset ipcell = ip + ((large_array_offset)myplan.second_transposed_firstcol) * GRIDX;
1891  y = ipcell / (GRIDX * GRIDz);
1892  int yr = ipcell % (GRIDX * GRIDz);
1893  z = yr / GRIDX;
1894  x = yr % GRIDX;
1895 #else
1896  for(x = 0; x < GRIDX; x++)
1897  for(y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
1898  for(z = 0; z < GRIDz; z++)
1899  {
1900 #endif
1901  int xx, yy, zz;
1902 
1903  if(x >= (GRIDX / 2))
1904  xx = x - GRIDX;
1905  else
1906  xx = x;
1907  if(y >= (GRIDY / 2))
1908  yy = y - GRIDY;
1909  else
1910  yy = y;
1911  if(z >= (GRIDZ / 2))
1912  zz = z - GRIDZ;
1913  else
1914  zz = z;
1915 
1916  double kx = kfacx * xx;
1917  double ky = kfacy * yy;
1918  double kz = kfacz * zz;
1919 
1920  double k2 = kx * kx + ky * ky + kz * kz;
1921 
1922  double smth = 1.0, deconv = 1.0;
1923 
1924  if(k2 > 0)
1925  {
1926  smth = -exp(-k2 * asmth2) / k2;
1927 
1928  /* do deconvolution */
1929 
1930  double fx = 1, fy = 1, fz = 1;
1931 
1932  if(xx != 0)
1933  {
1934  fx = kx * dhalf;
1935  fx = sin(fx) / fx;
1936  }
1937  if(yy != 0)
1938  {
1939  fy = ky * dhalf;
1940  fy = sin(fy) / fy;
1941  }
1942  if(zz != 0)
1943  {
1944  fz = kz * dhalf;
1945  fz = sin(fz) / fz;
1946  }
1947 
1948  double ff = 1 / (fx * fy * fz);
1949  deconv = ff * ff * ff * ff;
1950 
1951  smth *= deconv; /* deconvolution */
1952  }
1953 
1954 #ifndef FFT_COLUMN_BASED
1955  large_array_offset ip = ((large_array_offset)GRIDz) * (GRIDX * (y - myplan.slabstart_y) + x) + z;
1956 #endif
1957 
1958 #ifdef GRAVITY_TALLBOX
1959  double re = fft_of_rhogrid[ip][0] * fft_of_kernel[ip][0] - fft_of_rhogrid[ip][1] * fft_of_kernel[ip][1];
1960  double im = fft_of_rhogrid[ip][0] * fft_of_kernel[ip][1] + fft_of_rhogrid[ip][1] * fft_of_kernel[ip][0];
1961 
1962  fft_of_rhogrid[ip][0] = re * deconv * exp(-k2 * asmth2);
1963  fft_of_rhogrid[ip][1] = im * deconv * exp(-k2 * asmth2);
1964 #else
1965  fft_of_rhogrid[ip][0] *= smth;
1966  fft_of_rhogrid[ip][1] *= smth;
1967 #endif
1968  }
1969 
1970 #ifndef GRAVITY_TALLBOX
1971 #ifdef FFT_COLUMN_BASED
1972  if(myplan.second_transposed_firstcol == 0)
1973  fft_of_rhogrid[0][0] = fft_of_rhogrid[0][1] = 0.0;
1974 #else
1975  if(myplan.slabstart_y == 0)
1976  fft_of_rhogrid[0][0] = fft_of_rhogrid[0][1] = 0.0;
1977 #endif
1978 #endif
1979 
1980  /* Do the inverse FFT to get the potential/forces */
1981 
1982 #ifndef FFT_COLUMN_BASED
1983  my_slab_based_fft(&myplan, &rhogrid[0], &workspace[0], -1);
1984 #else
1985  my_column_based_fft(&myplan, workspace, rhogrid, -1);
1986 #endif
1987 
1988  /* Now rhogrid holds the potential/forces */
1989 
1990 #ifdef EVALPOTENTIAL
1991 #ifdef PM_ZOOM_OPTIMIZED
1992  pmforce_zoom_optimized_readout_forces_or_potential(rhogrid, -1);
1993 #else
1994  pmforce_uniform_optimized_readout_forces_or_potential_xy(rhogrid, -1);
1995 #endif
1996 #endif
1997 
1998  /* get the force components by finite differencing of the potential for each dimension,
1999  * and send the results back to the right CPUs
2000  */
2001 
2002  /* we do the x component last, because for differencing the potential in the x-direction, we need to construct the
2003  * transpose
2004  */
2005 
2006 #ifndef FFT_COLUMN_BASED
2007 
2008  /* z-direction */
2009  for(y = 0; y < GRIDY; y++)
2010  for(x = 0; x < myplan.nslab_x; x++)
2011  for(z = 0; z < GRIDZ; z++)
2012  {
2013  int zr = z + 1, zl = z - 1, zrr = z + 2, zll = z - 2;
2014  if(zr >= GRIDZ)
2015  zr -= GRIDZ;
2016  if(zrr >= GRIDZ)
2017  zrr -= GRIDZ;
2018  if(zl < 0)
2019  zl += GRIDZ;
2020  if(zll < 0)
2021  zll += GRIDZ;
2022 
2023  forcegrid[FI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[FI(x, y, zl)] - rhogrid[FI(x, y, zr)]) -
2024  (1.0 / 6) * (rhogrid[FI(x, y, zll)] - rhogrid[FI(x, y, zrr)]));
2025  }
2026 
2027 #ifdef PM_ZOOM_OPTIMIZED
2028  pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 2);
2029 #else
2030  pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 2);
2031 #endif
2032 
2033  /* y-direction */
2034  for(y = 0; y < GRIDY; y++)
2035  for(x = 0; x < myplan.nslab_x; x++)
2036  for(z = 0; z < GRIDZ; z++)
2037  {
2038  int yr = y + 1, yl = y - 1, yrr = y + 2, yll = y - 2;
2039  if(yr >= GRIDY)
2040  yr -= GRIDY;
2041  if(yrr >= GRIDY)
2042  yrr -= GRIDY;
2043  if(yl < 0)
2044  yl += GRIDY;
2045  if(yll < 0)
2046  yll += GRIDY;
2047 
2048  forcegrid[FI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[FI(x, yl, z)] - rhogrid[FI(x, yr, z)]) -
2049  (1.0 / 6) * (rhogrid[FI(x, yll, z)] - rhogrid[FI(x, yrr, z)]));
2050  }
2051 
2052 #ifdef PM_ZOOM_OPTIMIZED
2053  pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 1);
2054 #else
2055  pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 1);
2056 #endif
2057 
2058  /* x-direction */
2059  my_slab_transposeA(&myplan, rhogrid, forcegrid); /* compute the transpose of the potential field for finite differencing */
2060  /* note: for the x-direction, we difference the transposed field */
2061 
2062  for(x = 0; x < GRIDX; x++)
2063  for(y = 0; y < myplan.nslab_y; y++)
2064  for(z = 0; z < GRIDZ; z++)
2065  {
2066  int xrr = x + 2, xll = x - 2, xr = x + 1, xl = x - 1;
2067  if(xr >= GRIDX)
2068  xr -= GRIDX;
2069  if(xrr >= GRIDX)
2070  xrr -= GRIDX;
2071  if(xl < 0)
2072  xl += GRIDX;
2073  if(xll < 0)
2074  xll += GRIDX;
2075 
2076  forcegrid[NI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[NI(xl, y, z)] - rhogrid[NI(xr, y, z)]) -
2077  (1.0 / 6) * (rhogrid[NI(xll, y, z)] - rhogrid[NI(xrr, y, z)]));
2078  }
2079 
2080  my_slab_transposeB(&myplan, forcegrid, rhogrid); /* reverse the transpose from above */
2081 
2082 #ifdef PM_ZOOM_OPTIMIZED
2083  pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 0);
2084 #else
2085  pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 0);
2086 #endif
2087 
2088 #else
2089 
2090  /* z-direction */
2091  for(large_array_offset i = 0; i < myplan.ncol_XY; i++)
2092  {
2093  fft_real *forcep = &forcegrid[GRID2 * i];
2094  fft_real *potp = &rhogrid[GRID2 * i];
2095 
2096  for(int z = 0; z < GRIDZ; z++)
2097  {
2098  int zr = z + 1;
2099  int zl = z - 1;
2100  int zrr = z + 2;
2101  int zll = z - 2;
2102 
2103  if(zr >= GRIDZ)
2104  zr -= GRIDZ;
2105  if(zrr >= GRIDZ)
2106  zrr -= GRIDZ;
2107  if(zl < 0)
2108  zl += GRIDZ;
2109  if(zll < 0)
2110  zll += GRIDZ;
2111 
2112  forcep[z] = fac * ((4.0 / 3) * (potp[zl] - potp[zr]) - (1.0 / 6) * (potp[zll] - potp[zrr]));
2113  }
2114  }
2115 
2116 #ifdef PM_ZOOM_OPTIMIZED
2117  pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 2);
2118 #else
2119  pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 2);
2120 
2121  /* at this point we can free partin */
2122  Mem.myfree_movable(partin);
2123  partin = NULL;
2124 #endif
2125 
2126  /* y-direction */
2127  my_fft_swap23(&myplan, rhogrid, forcegrid); // rhogrid contains potential field, forcegrid the transposed field
2128 
2129  /* make an in-place computation */
2130  fft_real *column = (fft_real *)Mem.mymalloc("column", GRIDY * sizeof(fft_real));
2131 
2132  for(large_array_offset i = 0; i < myplan.ncol_XZ; i++)
2133  {
2134  memcpy(column, &forcegrid[GRIDY * i], GRIDY * sizeof(fft_real));
2135 
2136  fft_real *potp = column;
2137  fft_real *forcep = &forcegrid[GRIDY * i];
2138 
2139  for(int y = 0; y < GRIDY; y++)
2140  {
2141  int yr = y + 1;
2142  int yl = y - 1;
2143  int yrr = y + 2;
2144  int yll = y - 2;
2145 
2146  if(yr >= GRIDY)
2147  yr -= GRIDY;
2148  if(yrr >= GRIDY)
2149  yrr -= GRIDY;
2150  if(yl < 0)
2151  yl += GRIDY;
2152  if(yll < 0)
2153  yll += GRIDY;
2154 
2155  forcep[y] = fac * ((4.0 / 3) * (potp[yl] - potp[yr]) - (1.0 / 6) * (potp[yll] - potp[yrr]));
2156  }
2157  }
2158 
2159  Mem.myfree(column);
2160 
2161  /* now need to read out from forcegrid in a non-standard way */
2162 
2163 #ifdef PM_ZOOM_OPTIMIZED
2164  /* need a third field as scratch space */
2165  fft_real *scratch = (fft_real *)Mem.mymalloc("scratch", myplan.fftsize * sizeof(fft_real));
2166 
2167  my_fft_swap23back(&myplan, forcegrid, scratch);
2168  pmforce_zoom_optimized_readout_forces_or_potential(scratch, 1);
2169 
2170  Mem.myfree(scratch);
2171 #else
2172  pmforce_uniform_optimized_readout_forces_or_potential_xz(forcegrid, 1);
2173 #endif
2174 
2175  /* x-direction */
2176  my_fft_swap13(&myplan, rhogrid, forcegrid); // rhogrid contains potential field
2177 
2178  for(large_array_offset i = 0; i < myplan.ncol_ZY; i++)
2179  {
2180  fft_real *forcep = &rhogrid[GRIDX * i];
2181  fft_real *potp = &forcegrid[GRIDX * i];
2182 
2183  for(int x = 0; x < GRIDX; x++)
2184  {
2185  int xr = x + 1;
2186  int xl = x - 1;
2187  int xrr = x + 2;
2188  int xll = x - 2;
2189 
2190  if(xr >= GRIDX)
2191  xr -= GRIDX;
2192  if(xrr >= GRIDX)
2193  xrr -= GRIDX;
2194  if(xl < 0)
2195  xl += GRIDX;
2196  if(xll < 0)
2197  xll += GRIDX;
2198 
2199  forcep[x] = fac * ((4.0 / 3) * (potp[xl] - potp[xr]) - (1.0 / 6) * (potp[xll] - potp[xrr]));
2200  }
2201  }
2202 
2203  /* now need to read out from forcegrid in a non-standard way */
2204 #ifdef PM_ZOOM_OPTIMIZED
2205  my_fft_swap13back(&myplan, rhogrid, forcegrid);
2206  pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 0);
2207 #else
2208  pmforce_uniform_optimized_readout_forces_or_potential_zy(rhogrid, 0);
2209 #endif
2210 
2211 #endif
2212  }
2213 
2214  /* free stuff */
2215 
2216  Mem.myfree(forcegrid);
2217  Mem.myfree(rhogrid);
2218 
2219 #ifdef PM_ZOOM_OPTIMIZED
2220  Mem.myfree(localfield_recvcount);
2221  Mem.myfree(localfield_offset);
2222  Mem.myfree(localfield_sendcount);
2223  Mem.myfree(localfield_first);
2224  Mem.myfree(localfield_data);
2225  Mem.myfree(localfield_globalindex);
2226  Mem.myfree(part);
2227 #else
2228 #ifndef FFT_COLUMN_BASED
2229  Mem.myfree(partin);
2230 #endif
2231  Mem.myfree(Rcvpm_offset);
2232  Mem.myfree(Rcvpm_count);
2233  Mem.myfree(Sndpm_offset);
2234  Mem.myfree(Sndpm_count);
2235 #endif
2236 
2237  double tend = Logs.second();
2238 
2239  if(mode == 0)
2240  mpi_printf("PM-PERIODIC: done. (took %g seconds)\n", Logs.timediff(tstart, tend));
2241 }
2242 
2243 #ifdef GRAVITY_TALLBOX
2244 
2248 void pm_periodic::pmforce_setup_tallbox_kernel(void)
2249 {
2250  double d = All.BoxSize / PMGRID;
2251 
2252  mpi_printf("PM-PERIODIC: Setting up tallbox kernel (GRIDX=%d, GRIDY=%d, GRIDZ=%d)\n", GRIDX, GRIDY, GRIDZ);
2253 
2254  /* now set up kernel and its Fourier transform */
2255 
2256  for(int i = 0; i < maxfftsize; i++) /* clear local field */
2257  kernel[i] = 0;
2258 
2259 #ifndef FFT_COLUMN_BASED
2260  for(int i = myplan.slabstart_x; i < (myplan.slabstart_x + myplan.nslab_x); i++)
2261  for(int j = 0; j < GRIDY; j++)
2262  {
2263 #else
2264  for(int c = myplan.firstcol_XY; c < (myplan.firstcol_XY + myplan.ncol_XY); c++)
2265  {
2266  int i = c / GRIDY;
2267  int j = c % GRIDY;
2268 #endif
2269 
2270  for(int k = 0; k < GRIDZ; k++)
2271  {
2272  int ii, jj, kk;
2273 
2274  if(i >= (GRIDX / 2))
2275  ii = i - GRIDX;
2276  else
2277  ii = i;
2278  if(j >= (GRIDY / 2))
2279  jj = j - GRIDY;
2280  else
2281  jj = j;
2282  if(k >= (GRIDZ / 2))
2283  kk = k - GRIDZ;
2284  else
2285  kk = k;
2286 
2287  double xx = ii * d;
2288  double yy = jj * d;
2289  double zz = kk * d;
2290 
2291  double pot = pmperiodic_tallbox_long_range_potential(xx, yy, zz);
2292 
2293 #ifndef FFT_COLUMN_BASED
2294  size_t ip = FI(i - myplan.slabstart_x, j, k);
2295 #else
2296  size_t ip = FCxy(c, k);
2297 #endif
2298  kernel[ip] = pot / All.BoxSize;
2299  }
2300 
2301 #ifndef FFT_COLUMN_BASED
2302  }
2303 #else
2304  }
2305 #endif
2306 
2307  /* Do the FFT of the kernel */
2308 
2309  fft_real *workspc = (fft_real *)Mem.mymalloc("workspc", maxfftsize * sizeof(fft_real));
2310 
2311 #ifndef FFT_COLUMN_BASED
2312  my_slab_based_fft(&myplan, kernel, workspc, 1);
2313 #else
2314  my_column_based_fft(&myplan, kernel, workspc, 1); /* result is in workspace, not in kernel */
2315  memcpy(kernel, workspc, maxfftsize * sizeof(fft_real));
2316 #endif
2317 
2318  Mem.myfree(workspc);
2319 
2320  mpi_printf("PM-PERIODIC: Done setting up tallbox kernel\n");
2321 }
2322 
2323 double pm_periodic::pmperiodic_tallbox_long_range_potential(double x, double y, double z)
2324 {
2325  x /= All.BoxSize;
2326  y /= All.BoxSize;
2327  z /= All.BoxSize;
2328 
2329  double r = sqrt(x * x + y * y + z * z);
2330 
2331  if(r == 0)
2332  return 0;
2333 
2334  double xx, yy, zz;
2335  switch(GRAVITY_TALLBOX)
2336  {
2337  case 0:
2338  xx = y;
2339  yy = z;
2340  zz = x;
2341  break;
2342  case 1:
2343  xx = x;
2344  yy = z;
2345  zz = y;
2346  break;
2347  case 2:
2348  xx = x;
2349  yy = y;
2350  zz = z;
2351  break;
2352  }
2353  x = xx;
2354  y = yy;
2355  z = zz;
2356 
2357  /* the third dimension, z, is now the non-periodic one */
2358 
2359  double leff = sqrt(BOXX * BOXY);
2360  double alpha = 2.0 / leff;
2361 
2362  double sum1 = 0.0;
2363 
2364  int qxmax = (int)(10.0 / (BOXX * alpha) + 0.5);
2365  int qymax = (int)(10.0 / (BOXY * alpha) + 0.5);
2366 
2367  int nxmax = (int)(4.0 * alpha * BOXX + 0.5);
2368  int nymax = (int)(4.0 * alpha * BOXY + 0.5);
2369 
2370  for(int nx = -qxmax; nx <= qxmax; nx++)
2371  for(int ny = -qymax; ny <= qymax; ny++)
2372  {
2373  double dx = x - nx * BOXX;
2374  double dy = y - ny * BOXY;
2375  double r = sqrt(dx * dx + dy * dy + z * z);
2376  if(r > 0)
2377  sum1 += erfc(alpha * r) / r;
2378  }
2379 
2380  double alpha2 = alpha * alpha;
2381 
2382  double sum2 = 0.0;
2383 
2384  for(int nx = -nxmax; nx <= nxmax; nx++)
2385  for(int ny = -nymax; ny <= nymax; ny++)
2386  {
2387  if(nx != 0 || ny != 0)
2388  {
2389  double kx = (2.0 * M_PI / BOXX) * nx;
2390  double ky = (2.0 * M_PI / BOXY) * ny;
2391  double k2 = kx * kx + ky * ky;
2392  double k = sqrt(k2);
2393 
2394  if(k * z > 0)
2395  {
2396  double ex = exp(-k * z);
2397  if(ex > 0)
2398  sum2 += cos(kx * x + ky * y) * (erfc(k / (2 * alpha) + alpha * z) / ex + ex * erfc(k / (2 * alpha) - alpha * z)) / k;
2399  }
2400  else
2401  {
2402  double ex = exp(k * z);
2403  if(ex > 0)
2404  sum2 += cos(kx * x + ky * y) * (ex * erfc(k / (2 * alpha) + alpha * z) + erfc(k / (2 * alpha) - alpha * z) / ex) / k;
2405  }
2406  }
2407  }
2408 
2409  sum2 *= M_PI / (BOXX * BOXY);
2410 
2411  double psi = 2.0 * alpha / sqrt(M_PI) +
2412  (2 * sqrt(M_PI) / (BOXX * BOXY) * (exp(-alpha2 * z * z) / alpha + sqrt(M_PI) * z * erf(alpha * z))) - (sum1 + sum2);
2413 
2414  return psi;
2415 }
2416 #endif
2417 
2418 /*----------------------------------------------------------------------------------------------------*/
2419 /* Here comes code for the power-spectrum computation */
2420 /*----------------------------------------------------------------------------------------------------*/
2421 
2422 void pm_periodic::calculate_power_spectra(int num)
2423 {
2424  int n_type[NTYPES];
2425  long long ntot_type_all[NTYPES];
2426  /* determine global and local particle numbers */
2427  for(int n = 0; n < NTYPES; n++)
2428  n_type[n] = 0;
2429  for(int n = 0; n < Sp->NumPart; n++)
2430  n_type[Sp->P[n].getType()]++;
2431 
2432  sumup_large_ints(NTYPES, n_type, ntot_type_all, Communicator);
2433 
2434  int typeflag[NTYPES];
2435 
2436  for(int i = 0; i < NTYPES; i++)
2437  typeflag[i] = 1;
2438 
2439 #ifdef HIERARCHICAL_GRAVITY
2440  int flag_extra_allocate = 0;
2441  if(Sp->TimeBinsGravity.ActiveParticleList == NULL)
2442  {
2443  flag_extra_allocate = 1;
2444  Sp->TimeBinsGravity.timebins_allocate();
2445  }
2446 
2447  Sp->TimeBinsGravity.NActiveParticles = 0;
2448  for(int i = 0; i < Sp->NumPart; i++)
2449  Sp->TimeBinsGravity.ActiveParticleList[Sp->TimeBinsGravity.NActiveParticles++] = i;
2450 #endif
2451 
2452  if(ThisTask == 0)
2453  {
2454  char buf[MAXLEN_PATH_EXTRA];
2455  sprintf(buf, "%s/powerspecs", All.OutputDir);
2456  mkdir(buf, 02755);
2457  }
2458 
2459  sprintf(power_spec_fname, "%s/powerspecs/powerspec_%03d.txt", All.OutputDir, num);
2460 
2461  pmforce_do_powerspec(typeflag); /* calculate power spectrum for all particle types */
2462 
2463  /* check whether whether more than one type is in use */
2464  int count_types = 0;
2465  for(int i = 0; i < NTYPES; i++)
2466  if(ntot_type_all[i] > 0)
2467  count_types++;
2468 
2469  if(count_types > 1)
2470  for(int i = 0; i < NTYPES; i++)
2471  {
2472  if(ntot_type_all[i] > 0)
2473  {
2474  for(int j = 0; j < NTYPES; j++)
2475  typeflag[j] = 0;
2476 
2477  typeflag[i] = 1;
2478 
2479  sprintf(power_spec_fname, "%s/powerspecs/powerspec_type%d_%03d.txt", All.OutputDir, i, num);
2480 
2481  pmforce_do_powerspec(typeflag); /* calculate power spectrum for type i */
2482  }
2483  }
2484 
2485 #ifdef HIERARCHICAL_GRAVITY
2486  if(flag_extra_allocate)
2487  Sp->TimeBinsGravity.timebins_free();
2488 #endif
2489 }
2490 
2491 void pm_periodic::pmforce_do_powerspec(int *typeflag)
2492 {
2493  mpi_printf("POWERSPEC: Begin power spectrum. (typeflag=[");
2494  for(int i = 0; i < NTYPES; i++)
2495  mpi_printf(" %d ", typeflag[i]);
2496  mpi_printf("])\n");
2497 
2498  double tstart = Logs.second();
2499 
2500  pmforce_periodic(1, typeflag); /* calculate regular power spectrum for selected particle types */
2501 
2502  pmforce_periodic(2, typeflag); /* calculate folded power spectrum for selected particle types */
2503 
2504  pmforce_periodic(3, typeflag); /* calculate twice folded power spectrum for selected particle types */
2505 
2506  double tend = Logs.second();
2507 
2508  mpi_printf("POWERSPEC: End power spectrum. took %g seconds\n", Logs.timediff(tstart, tend));
2509 }
2510 
2511 void pm_periodic::pmforce_measure_powerspec(int flag, int *typeflag)
2512 {
2513  particle_data *P = Sp->P;
2514 
2515  long long CountModes[BINS_PS];
2516  double SumPowerUncorrected[BINS_PS]; /* without binning correction (as for shot noise) */
2517  double PowerUncorrected[BINS_PS]; /* without binning correction */
2518  double DeltaUncorrected[BINS_PS]; /* without binning correction */
2519  double ShotLimit[BINS_PS];
2520  double KWeightSum[BINS_PS];
2521  double Kbin[BINS_PS];
2522 
2523  double mass = 0, mass2 = 0, count = 0;
2524  for(int i = 0; i < Sp->NumPart; i++)
2525  if(typeflag[P[i].getType()])
2526  {
2527  mass += Sp->P[i].getMass();
2528  mass2 += Sp->P[i].getMass() * Sp->P[i].getMass();
2529  count += 1.0;
2530  }
2531 
2532  MPI_Allreduce(MPI_IN_PLACE, &mass, 1, MPI_DOUBLE, MPI_SUM, Communicator);
2533  MPI_Allreduce(MPI_IN_PLACE, &mass2, 1, MPI_DOUBLE, MPI_SUM, Communicator);
2534  MPI_Allreduce(MPI_IN_PLACE, &count, 1, MPI_DOUBLE, MPI_SUM, Communicator);
2535 
2536  double d = All.BoxSize / PMGRID;
2537  double dhalf = 0.5 * d;
2538 
2539  double fac = 1.0 / mass;
2540 
2541  double K0 = 2 * M_PI / All.BoxSize; /* minimum k */
2542  double K1 = 2 * M_PI / All.BoxSize * (POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC * PMGRID / 2); /* maximum k that can be measured */
2543  double binfac = BINS_PS / (log(K1) - log(K0));
2544 
2545  double kfacx = 2.0 * M_PI * LONG_X / All.BoxSize;
2546  double kfacy = 2.0 * M_PI * LONG_Y / All.BoxSize;
2547  double kfacz = 2.0 * M_PI * LONG_Z / All.BoxSize;
2548 
2549  for(int i = 0; i < BINS_PS; i++)
2550  {
2551  SumPowerUncorrected[i] = 0;
2552  CountModes[i] = 0;
2553  KWeightSum[i] = 0;
2554  }
2555 
2556 #ifdef FFT_COLUMN_BASED
2557  for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
2558  {
2559  large_array_offset ipcell = ip + ((large_array_offset)myplan.second_transposed_firstcol) * GRIDX;
2560  int y = ipcell / (GRIDX * GRIDz);
2561  int yr = ipcell % (GRIDX * GRIDz);
2562  int z = yr / GRIDX;
2563  int x = yr % GRIDX;
2564 #else
2565  for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
2566  for(int x = 0; x < GRIDX; x++)
2567  for(int z = 0; z < GRIDz; z++)
2568  {
2569 #endif
2570  int count_double;
2571 
2572  if(z >= 1 &&
2573  z < (GRIDZ + 1) / 2) /* these modes need to be counted twice due to the storage scheme for the FFT of a real field */
2574  count_double = 1;
2575  else
2576  count_double = 0;
2577 
2578  int xx, yy, zz;
2579 
2580  if(x >= (GRIDX / 2))
2581  xx = x - GRIDX;
2582  else
2583  xx = x;
2584 
2585  if(y >= (GRIDY / 2))
2586  yy = y - GRIDY;
2587  else
2588  yy = y;
2589 
2590  if(z >= (GRIDZ / 2))
2591  zz = z - GRIDZ;
2592  else
2593  zz = z;
2594 
2595  double kx = kfacx * xx;
2596  double ky = kfacy * yy;
2597  double kz = kfacz * zz;
2598 
2599  double k2 = kx * kx + ky * ky + kz * kz;
2600 
2601  if(k2 > 0)
2602  {
2603  /* do deconvolution */
2604  double fx = 1, fy = 1, fz = 1;
2605 
2606  if(xx != 0)
2607  {
2608  fx = kx * dhalf;
2609  fx = sin(fx) / fx;
2610  }
2611  if(yy != 0)
2612  {
2613  fy = ky * dhalf;
2614  fy = sin(fy) / fy;
2615  }
2616  if(zz != 0)
2617  {
2618  fz = kz * dhalf;
2619  fz = sin(fz) / fz;
2620  }
2621  double ff = 1 / (fx * fy * fz);
2622  double smth = ff * ff * ff * ff;
2623  /*
2624  * Note: The Fourier-transform of the density field (rho_hat) must be multiplied with ff^2
2625  * in order to do the de-convolution. Thats why po = rho_hat^2 gains a factor of ff^4.
2626  */
2627  /* end deconvolution */
2628 
2629 #ifndef FFT_COLUMN_BASED
2630  large_array_offset ip = ((large_array_offset)GRIDz) * (GRIDX * (y - myplan.slabstart_y) + x) + z;
2631 #endif
2632 
2633  double po = (fft_of_rhogrid[ip][0] * fft_of_rhogrid[ip][0] + fft_of_rhogrid[ip][1] * fft_of_rhogrid[ip][1]);
2634 
2635  po *= fac * fac * smth;
2636 
2637  double k = sqrt(k2);
2638 
2639  if(flag == 1)
2640  k *= POWERSPEC_FOLDFAC;
2641  else if(flag == 2)
2642  k *= POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC;
2643 
2644  if(k >= K0 && k < K1)
2645  {
2646  int bin = log(k / K0) * binfac;
2647 
2648  SumPowerUncorrected[bin] += po;
2649  CountModes[bin] += 1;
2650  KWeightSum[bin] += log(k);
2651 
2652  if(count_double)
2653  {
2654  SumPowerUncorrected[bin] += po;
2655  CountModes[bin] += 1;
2656  KWeightSum[bin] += log(k);
2657  }
2658  }
2659  }
2660  }
2661 
2662  MPI_Allreduce(MPI_IN_PLACE, SumPowerUncorrected, BINS_PS, MPI_DOUBLE, MPI_SUM, Communicator);
2663  MPI_Allreduce(MPI_IN_PLACE, CountModes, BINS_PS, MPI_LONG_LONG, MPI_SUM, Communicator);
2664  MPI_Allreduce(MPI_IN_PLACE, KWeightSum, BINS_PS, MPI_DOUBLE, MPI_SUM, Communicator);
2665 
2666  int count_non_zero_bins = 0;
2667  for(int i = 0; i < BINS_PS; i++)
2668  {
2669  if(CountModes[i] > 0)
2670  {
2671  Kbin[i] = exp(KWeightSum[i] / CountModes[i]);
2672  count_non_zero_bins++;
2673  }
2674  else
2675  Kbin[i] = exp((i + 0.5) / binfac + log(K0));
2676 
2677  if(CountModes[i] > 0)
2678  PowerUncorrected[i] = SumPowerUncorrected[i] / CountModes[i];
2679  else
2680  PowerUncorrected[i] = 0;
2681 
2682  DeltaUncorrected[i] = 4 * M_PI * pow(Kbin[i], 3) / pow(2 * M_PI / All.BoxSize, 3) * PowerUncorrected[i];
2683 
2684  ShotLimit[i] = 4 * M_PI * pow(Kbin[i], 3) / pow(2 * M_PI / All.BoxSize, 3) * (mass2 / (mass * mass));
2685  }
2686 
2687  /* store the result */
2688  if(ThisTask == 0)
2689  {
2690  FILE *fd;
2691 
2692  if(flag == 0)
2693  {
2694  if(!(fd = fopen(power_spec_fname, "w"))) /* store the unfolded spectrum */
2695  Terminate("can't open file `%s`\n", power_spec_fname);
2696  }
2697  else if(flag == 1 || flag == 2)
2698  {
2699  if(!(fd = fopen(power_spec_fname, "a"))) /* append the file, store the folded spectrum */
2700  Terminate("can't open file `%s`\n", power_spec_fname);
2701  }
2702  else
2703  Terminate("Something wrong.\n");
2704 
2705  fprintf(fd, "%g\n", All.Time);
2706  fprintf(fd, "%d\n", count_non_zero_bins);
2707  fprintf(fd, "%g\n", All.BoxSize);
2708  fprintf(fd, "%d\n", (int)(PMGRID));
2710  fprintf(fd, "%g\n", All.ComovingIntegrationOn > 0 ? linear_growth_factor(All.Time, 1.0) : 1.0);
2711 
2712  for(int i = 0; i < BINS_PS; i++)
2713  if(CountModes[i] > 0)
2714  fprintf(fd, "%g %g %g %g %g\n", Kbin[i], DeltaUncorrected[i], PowerUncorrected[i], (double)CountModes[i], ShotLimit[i]);
2715 
2716  if(flag == 2)
2717  {
2718  fprintf(fd, "%g\n", mass);
2719  fprintf(fd, "%g\n", count);
2720  fprintf(fd, "%g\n", mass * mass / mass2);
2721  }
2722 
2723  fclose(fd);
2724  }
2725 }
2726 
2727 #endif
global_data_all_processes All
Definition: main.cc:40
double timediff(double t0, double t1)
Definition: logs.cc:488
double second(void)
Definition: logs.cc:471
double getAllocatedBytesInMB(void)
Definition: mymalloc.h:68
#define RCUT
Definition: constants.h:293
#define MPI_MESSAGE_SIZELIMIT_IN_BYTES
Definition: constants.h:53
#define MAXLEN_PATH_EXTRA
Definition: constants.h:301
#define NTYPES
Definition: constants.h:308
#define M_PI
Definition: constants.h:56
#define ASMTH
Definition: constants.h:287
double mycxxsort(T *begin, T *end, Tcomp comp)
Definition: cxxsort.h:39
float MyFloat
Definition: dtypes.h:86
#define LONG_Y
Definition: dtypes.h:369
uint32_t MyIntPosType
Definition: dtypes.h:35
#define LONG_X
Definition: dtypes.h:361
#define LONG_Z
Definition: dtypes.h:377
logs Logs
Definition: main.cc:43
#define Terminate(...)
Definition: macros.h:19
void sumup_large_ints(int n, int *src, long long *res, MPI_Comm comm)
int myMPI_Sendrecv(void *sendbuf, size_t sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvbuf, size_t recvcount, MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status *status)
#define TAG_NONPERIOD_B
Definition: mpi_utils.h:45
#define TAG_NONPERIOD_C
Definition: mpi_utils.h:46
void myMPI_Alltoallv(void *sendbuf, size_t *sendcounts, size_t *sdispls, void *recvbuf, size_t *recvcounts, size_t *rdispls, int len, int big_flag, MPI_Comm comm)
Definition: myalltoall.cc:177
#define TAG_NONPERIOD_A
Definition: mpi_utils.h:44
memory Mem
Definition: main.cc:44
expr exp(half arg)
Definition: half.hpp:2724
expr cos(half arg)
Definition: half.hpp:2823
expr sin(half arg)
Definition: half.hpp:2816
expr erf(half arg)
Definition: half.hpp:2918
expr log(half arg)
Definition: half.hpp:2745
expr pow(half base, half exp)
Definition: half.hpp:2803
expr erfc(half arg)
Definition: half.hpp:2925
expr sqrt(half arg)
Definition: half.hpp:2777
#define FFTW(x)
Definition: pm_mpi_fft.h:22
integertime Ti_Current
Definition: allvars.h:188
char OutputDir[MAXLEN_PATH]
Definition: allvars.h:272
MyDouble getMass(void)
vector< MyFloat > GravAccel
Definition: particle_data.h:55
MyIntPosType IntPos[3]
Definition: particle_data.h:53