GADGET-4
pm_nonperiodic.cc
Go to the documentation of this file.
1 /*******************************************************************************
2  * \copyright This file is part of the GADGET4 N-body/SPH code developed
3  * \copyright by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
4  * \copyright (vspringel@mpa-garching.mpg.de) and all contributing authors.
5  *******************************************************************************/
6 
12 #include "gadgetconfig.h"
13 
14 #if defined(PMGRID) && (!defined(PERIODIC) || defined(PLACEHIGHRESREGION))
15 
16 #include <fftw3.h>
17 #include <math.h>
18 #include <mpi.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 
23 #include "../data/allvars.h"
24 #include "../data/dtypes.h"
25 #include "../data/intposconvert.h"
26 #include "../data/mymalloc.h"
27 #include "../logs/timer.h"
28 #include "../main/simulation.h"
29 #include "../mpi_utils/mpi_utils.h"
30 #include "../pm/pm.h"
31 #include "../pm/pm_mpi_fft.h"
32 #include "../pm/pm_nonperiodic.h"
33 #include "../sort/cxxsort.h"
34 #include "../src/gravtree/gravtree.h"
35 #include "../src/time_integration/timestep.h"
36 #include "../system/system.h"
37 
38 #define GRID (HRPMGRID)
39 #define GRIDz (GRID / 2 + 1)
40 #define GRID2 (2 * GRIDz)
41 
42 #define FI(x, y, z) (((large_array_offset)GRID2) * (GRID * (x) + (y)) + (z))
43 #define FC(c, z) (((large_array_offset)GRID2) * ((c)-myplan.firstcol_XY) + (z))
44 #define TI(x, y, z) (((large_array_offset)GRID) * ((x) + (y)*myplan.nslab_x) + (z))
45 
55 void pm_nonperiodic::pm_init_regionsize(void)
56 {
57  /* first, find a reference coordinate by selecting an arbitrary particle in the respective regions. For definiteness, we choose the
58  * first particle */
59 
60  particle_data *P = Sp->P;
61  int have_low_mesh = NTask, have_high_mesh = NTask; /* default is we don't have a particle */
62 
63  if(Sp->NumPart > 0)
64  {
65  for(int j = 0; j < 3; j++)
66  Sp->ReferenceIntPos[LOW_MESH][j] = P[0].IntPos[j];
67 
68  have_low_mesh = ThisTask;
69  }
70 
71  for(int i = 0; i < Sp->NumPart; i++)
72  {
73 #ifdef PLACEHIGHRESREGION
74  if(((1 << P[i].getType()) & (PLACEHIGHRESREGION)))
75  {
76  for(int j = 0; j < 3; j++)
77  Sp->ReferenceIntPos[HIGH_MESH][j] = P[i].IntPos[j];
78 
79  have_high_mesh = ThisTask;
80  break;
81  }
82 #endif
83  }
84 
85  int have_global[4] = {have_low_mesh, ThisTask, have_high_mesh, ThisTask};
86 
87  MPI_Allreduce(MPI_IN_PLACE, have_global, 2, MPI_2INT, MPI_MINLOC, Communicator);
88 
89  if(have_global[0] >= NTask)
90  Terminate("have_global[0] >= NTask: Don't we have any particle?");
91 
92  MPI_Bcast(&Sp->ReferenceIntPos[LOW_MESH][0], 3 * sizeof(MyIntPosType), MPI_BYTE, have_global[1], Communicator);
93 
94 #ifdef PLACEHIGHRESREGION
95  if(have_global[2] >= NTask)
96  Terminate("have_global[2] >= NTask: Apparently there are no particles in high res region");
97 
98  MPI_Bcast(&Sp->ReferenceIntPos[HIGH_MESH][0], 3 * sizeof(MyIntPosType), MPI_BYTE, have_global[3], Communicator);
99 #endif
100 
101  /* find enclosing rectangle */
102 
103  MySignedIntPosType xmin[2][3], xmax[2][3];
104 
105  for(int j = 0; j < 3; j++)
106  {
107  xmin[LOW_MESH][j] = xmin[HIGH_MESH][j] = 0;
108  xmax[LOW_MESH][j] = xmax[HIGH_MESH][j] = 0;
109  }
110 
111  for(int i = 0; i < Sp->NumPart; i++)
112  {
113  MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[LOW_MESH][0], P[i].IntPos[1] - Sp->ReferenceIntPos[LOW_MESH][1],
114  P[i].IntPos[2] - Sp->ReferenceIntPos[LOW_MESH][2]};
115 
116  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
117 
118  for(int j = 0; j < 3; j++)
119  {
120  if(delta[j] > xmax[LOW_MESH][j])
121  xmax[LOW_MESH][j] = delta[j];
122  if(delta[j] < xmin[LOW_MESH][j])
123  xmin[LOW_MESH][j] = delta[j];
124  }
125 
126 #ifdef PLACEHIGHRESREGION
127  if(((1 << P[i].getType()) & (PLACEHIGHRESREGION)))
128  {
129  MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[HIGH_MESH][0],
130  P[i].IntPos[1] - Sp->ReferenceIntPos[HIGH_MESH][1],
131  P[i].IntPos[2] - Sp->ReferenceIntPos[HIGH_MESH][2]};
132 
133  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
134 
135  for(int j = 0; j < 3; j++)
136  {
137  if(delta[j] > xmax[HIGH_MESH][j])
138  xmax[HIGH_MESH][j] = delta[j];
139  if(delta[j] < xmin[HIGH_MESH][j])
140  xmin[HIGH_MESH][j] = delta[j];
141  }
142  }
143 #endif
144  }
145 
146  MPI_Allreduce(xmin, Sp->Xmintot, 6, MPI_MyIntPosType, MPI_MIN_MySignedIntPosType, Communicator);
147  MPI_Allreduce(xmax, Sp->Xmaxtot, 6, MPI_MyIntPosType, MPI_MAX_MySignedIntPosType, Communicator);
148 
149  for(int i = 0; i < 3; i++)
150  {
151  Sp->Xmaxtot[LOW_MESH][i] += 1; /* so that all particles fulfill xmin <= pos < xmax instead of xmin <= pos <= xmax*/
152  Sp->Xmaxtot[HIGH_MESH][i] += 1;
153  }
154 
155  MyIntPosType inner_meshsize[2], enclosing_meshsize[2];
156  int flag_recompute_kernel = 0;
157 
158  for(int mesh = 0; mesh < 2; mesh++)
159  {
160  inner_meshsize[mesh] = (MyIntPosType)(Sp->Xmaxtot[mesh][0] - Sp->Xmintot[mesh][0]);
161 
162  if((MyIntPosType)(Sp->Xmaxtot[mesh][1] - Sp->Xmintot[mesh][1]) > inner_meshsize[mesh])
163  inner_meshsize[mesh] = Sp->Xmaxtot[mesh][1] - Sp->Xmintot[mesh][1];
164 
165  if((MyIntPosType)(Sp->Xmaxtot[mesh][2] - Sp->Xmintot[mesh][2]) > inner_meshsize[mesh])
166  inner_meshsize[mesh] = Sp->Xmaxtot[mesh][2] - Sp->Xmintot[mesh][2];
167  }
168 
169  for(int mesh = 0; mesh < 2; mesh++)
170  {
171 #ifdef PERIODIC
172  if(mesh == LOW_MESH)
173  continue;
174 #endif
175 #ifndef PLACEHIGHRESREGION
176  if(mesh == HIGH_MESH)
177  continue;
178 #endif
179 
180  MyIntPosType blocksize = 1;
181  MyIntPosType mask = ~((MyIntPosType)0);
182 
183  if(mesh == LOW_MESH)
184  {
185  MyIntPosType refsize = inner_meshsize[mesh] >> 4; /* pick 1/8 as reference size */
186 
187  for(int i = 0; i < BITS_FOR_POSITIONS; i++)
188  {
189  if(blocksize >= refsize)
190  break;
191 
192  blocksize <<= 1;
193  mask <<= 1;
194  }
195  }
196  else
197  {
198  blocksize = Sp->PlacingBlocksize;
199  }
200 
201  mpi_printf(
202  "PM-NONPERIODIC: Allowed region for isolated PM mesh (%s): BEFORE (%g|%g|%g) -> (%g|%g|%g) inner meshsize=%g "
203  "blocksize=%g\n",
204  mesh == LOW_MESH ? "coarse" : "fine", Sp->FacIntToCoord * Sp->Xmintot[mesh][0], Sp->FacIntToCoord * Sp->Xmintot[mesh][1],
205  Sp->FacIntToCoord * Sp->Xmintot[mesh][2], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][0], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][1],
206  Sp->FacIntToCoord * Sp->Xmaxtot[mesh][2], Sp->FacIntToCoord * inner_meshsize[mesh], Sp->FacIntToCoord * blocksize);
207 
208  enclosing_meshsize[mesh] = 0;
209 
210  /* expand the box so that it aligns with blocksize */
211  for(int i = 0; i < 3; i++)
212  {
213  MyIntPosType left, right;
214  if(mesh == LOW_MESH)
215  {
216  /* now round it down */
217  left = ((Sp->Xmintot[mesh][i] + Sp->Xmaxtot[mesh][i]) / 2 - inner_meshsize[mesh] / 2) + Sp->ReferenceIntPos[mesh][i];
218  left &= mask;
219 
220  /* now round it up */
221  right = ((Sp->Xmintot[mesh][i] + Sp->Xmaxtot[mesh][i]) / 2 + inner_meshsize[mesh] / 2) + Sp->ReferenceIntPos[mesh][i];
222  right &= mask;
223  right += blocksize;
224  }
225  else
226  {
227  left = (Sp->ReferenceIntPos[HIGH_MESH][i] + Sp->Xmintot[HIGH_MESH][i]) & Sp->PlacingMask;
228  right = left + Sp->PlacingBlocksize;
229  }
230 
231  Sp->Xmintot[mesh][i] = left - Sp->ReferenceIntPos[mesh][i];
232  Sp->Xmaxtot[mesh][i] = right - Sp->ReferenceIntPos[mesh][i];
233 
234  Sp->Left[mesh][i] = left;
235 
236  Sp->MeshSize[mesh][i] = Sp->Xmaxtot[mesh][i] - Sp->Xmintot[mesh][i];
237 
238  if(Sp->MeshSize[mesh][i] > enclosing_meshsize[mesh])
239  enclosing_meshsize[mesh] = Sp->MeshSize[mesh][i];
240  }
241 
242  mpi_printf(
243  "PM-NONPERIODIC: Allowed region for isolated PM mesh (%s): AFTER (%g|%g|%g) -> (%g|%g|%g) enclosing_meshsize=%g "
244  "absolute space (%g|%g|%g) -> (%g|%g|%g)\n",
245  mesh == LOW_MESH ? "coarse" : "fine", Sp->FacIntToCoord * Sp->Xmintot[mesh][0], Sp->FacIntToCoord * Sp->Xmintot[mesh][1],
246  Sp->FacIntToCoord * Sp->Xmintot[mesh][2], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][0], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][1],
247  Sp->FacIntToCoord * Sp->Xmaxtot[mesh][2], Sp->FacIntToCoord * enclosing_meshsize[mesh],
248  Sp->FacIntToCoord * (Sp->Xmintot[mesh][0] + Sp->ReferenceIntPos[mesh][0]),
249  Sp->FacIntToCoord * (Sp->Xmintot[mesh][1] + Sp->ReferenceIntPos[mesh][1]),
250  Sp->FacIntToCoord * (Sp->Xmintot[mesh][2] + Sp->ReferenceIntPos[mesh][2]),
251  Sp->FacIntToCoord * (Sp->Xmaxtot[mesh][0] + Sp->ReferenceIntPos[mesh][0]),
252  Sp->FacIntToCoord * (Sp->Xmaxtot[mesh][1] + Sp->ReferenceIntPos[mesh][1]),
253  Sp->FacIntToCoord * (Sp->Xmaxtot[mesh][2] + Sp->ReferenceIntPos[mesh][2]));
254 
255  if(enclosing_meshsize[mesh] != Sp->OldMeshSize[mesh])
256  {
257  flag_recompute_kernel = 1;
258  Sp->OldMeshSize[mesh] = enclosing_meshsize[mesh];
259  }
260 
261  /* this will produce enough room for zero-padding and buffer region to
262  allow finite differencing of the potential */
263  Sp->TotalMeshSize[mesh] = 2.0 * enclosing_meshsize[mesh] * (GRID / ((double)(GRID - 10)));
264 
265  /* move lower left corner by two cells to allow finite differencing of the potential by a 4-point function */
266  for(int i = 0; i < 3; i++)
267  Sp->Corner[mesh][i] = Sp->Xmintot[mesh][i] - (2.0 * Sp->TotalMeshSize[mesh] / GRID);
268 
269  Sp->Asmth[mesh] = ASMTH * (Sp->FacIntToCoord * Sp->TotalMeshSize[mesh]) / GRID;
270  Sp->Rcut[mesh] = RCUT * Sp->Asmth[mesh];
271  }
272 
273  static int first_init_done = 0; /* for detecting restart from restartfiles */
274  if(flag_recompute_kernel || first_init_done == 0)
275  {
276 #ifndef PERIODIC
277  mpi_printf("PM-NONPERIODIC: Recompute kernel course mesh: Asmth=%g Rcut=%g mesh cell size=%g\n", Sp->Asmth[LOW_MESH],
278  Sp->Rcut[LOW_MESH], Sp->FacIntToCoord * Sp->TotalMeshSize[LOW_MESH] / GRID);
279 #endif
280 #ifdef PLACEHIGHRESREGION
281  mpi_printf("PM-NONPERIODIC: Recompute kernel fine mesh: Asmth=%g Rcut=%g mesh cell size=%g\n", Sp->Asmth[HIGH_MESH],
282  Sp->Rcut[HIGH_MESH], Sp->FacIntToCoord * Sp->TotalMeshSize[HIGH_MESH] / GRID);
283 #endif
284 
285  pm_setup_nonperiodic_kernel();
286  first_init_done = 1;
287  }
288 }
289 
294 void pm_nonperiodic::pm_init_nonperiodic(simparticles *Sp_ptr)
295 {
296  Sp = Sp_ptr;
297 
298  /* Set up the FFTW-3 plan files. */
299  int ndim[1] = {GRID}; /* dimension of the 1D transforms */
300 
301  /* temporarily allocate some arrays to make sure that out-of-place plans are created */
302  rhogrid = (fft_real *)Mem.mymalloc("rhogrid", GRID2 * sizeof(fft_real));
303  forcegrid = (fft_real *)Mem.mymalloc("forcegrid", GRID2 * sizeof(fft_real));
304 
305 #ifdef DOUBLEPRECISION_FFTW
306  int alignflag = 0;
307 #else
308  /* for single precision, the start of our FFT columns is presently only guaranteed to be 8-byte aligned */
309  int alignflag = FFTW_UNALIGNED;
310 #endif
311 #ifndef FFT_COLUMN_BASED
312  int stride = GRIDz;
313 #else
314  int stride = 1;
315 #endif
316 
317  myplan.forward_plan_zdir = FFTW(plan_many_dft_r2c)(1, ndim, 1, rhogrid, 0, 1, GRID2, (fft_complex *)forcegrid, 0, 1, GRIDz,
318  FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
319 
320  myplan.forward_plan_xdir =
321  FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
322  GRIDz * GRID, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
323 
324  myplan.forward_plan_ydir =
325  FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
326  GRIDz * GRID, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
327 
328  myplan.backward_plan_zdir = FFTW(plan_many_dft_c2r)(1, ndim, 1, (fft_complex *)rhogrid, 0, 1, GRIDz, forcegrid, 0, 1, GRID2,
329  FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
330 
331  myplan.backward_plan_xdir =
332  FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
333  GRIDz * GRID, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
334 
335  myplan.backward_plan_ydir =
336  FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
337  GRIDz * GRID, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
338 
339  Mem.myfree(forcegrid);
340  Mem.myfree(rhogrid);
341 
342 #ifndef FFT_COLUMN_BASED
343 
344  my_slab_based_fft_init(&myplan, GRID, GRID, GRID);
345 
346  maxfftsize = myplan.largest_x_slab * GRID * ((size_t)GRID2);
347 
348 #else
349 
350  my_column_based_fft_init(&myplan, GRID, GRID, GRID);
351 
352  maxfftsize = myplan.max_datasize;
353 
354 #endif
355 
356  /* now allocate memory to hold the FFT fields */
357 
358  size_t bytes, bytes_tot = 0;
359 
360 #if !defined(PERIODIC)
361  kernel[0] = (fft_real *)Mem.mymalloc("kernel[0]", bytes = maxfftsize * sizeof(fft_real));
362  bytes_tot += bytes;
363  fft_of_kernel[0] = (fft_complex *)kernel[0];
364 #endif
365 
366 #if defined(PLACEHIGHRESREGION)
367  kernel[1] = (fft_real *)Mem.mymalloc("kernel[1]", bytes = maxfftsize * sizeof(fft_real));
368  bytes_tot += bytes;
369  fft_of_kernel[1] = (fft_complex *)kernel[1];
370 #endif
371 
372  mpi_printf("\nPM-NONPERIODIC: Allocated %g MByte for FFT kernel(s).\n\n", bytes_tot / (1024.0 * 1024.0));
373 }
374 
375 #ifdef PM_ZOOM_OPTIMIZED
376 
377 void pm_nonperiodic::pmforce_nonperiodic_zoom_optimized_prepare_density(int grnr)
378 {
379  MPI_Status status;
380 
381  particle_data *P = Sp->P;
382 
383  double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
384 
385  part = (part_slab_data *)Mem.mymalloc("part", 8 * (NSource * sizeof(part_slab_data)));
386  large_numpart_type *part_sortindex =
387  (large_numpart_type *)Mem.mymalloc("part_sortindex", 8 * (NSource * sizeof(large_numpart_type)));
388 
389  int ngrid = 0;
390 
391 #ifdef FFT_COLUMN_BASED
392  int columns = GRIDX * GRIDY;
393  int avg = (columns - 1) / NTask + 1;
394  int exc = NTask * avg - columns;
395  int tasklastsection = NTask - exc;
396  int pivotcol = tasklastsection * avg;
397 #endif
398 
399  /* determine the cells each particle accesses */
400  for(int idx = 0; idx < NSource; idx++)
401  {
402  int i = Sp->get_active_index(idx);
403 
404  if(P[i].Ti_Current != All.Ti_Current)
405  Sp->drift_particle(&P[i], &Sp->SphP[i], All.Ti_Current);
406 
407  MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
408  P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
409 
410  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
411 
412  if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
413  continue;
414 
415  if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
416  continue;
417 
418  if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
419  continue;
420 
421  int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
422  int slab_y = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
423  int slab_z = (int)(to_slab_fac * (delta[2] - Sp->Corner[grnr][2]));
424  int myngrid;
425 
426  myngrid = ngrid;
427  ngrid += 1;
428 
429  large_numpart_type index_on_grid = ((large_numpart_type)myngrid) * 8;
430 
431  for(int xx = 0; xx < 2; xx++)
432  for(int yy = 0; yy < 2; yy++)
433  for(int zz = 0; zz < 2; zz++)
434  {
435  int slab_xx = slab_x + xx;
436  int slab_yy = slab_y + yy;
437  int slab_zz = slab_z + zz;
438 
439  if(slab_xx >= GRID)
440  slab_xx -= GRID;
441  if(slab_yy >= GRID)
442  slab_yy -= GRID;
443  if(slab_zz >= GRID)
444  slab_zz -= GRID;
445 
446  large_array_offset offset = FI(slab_xx, slab_yy, slab_zz);
447 
448  part[index_on_grid].partindex = (i << 3) + (xx << 2) + (yy << 1) + zz;
449  part[index_on_grid].globalindex = offset;
450  part_sortindex[index_on_grid] = index_on_grid;
451  index_on_grid++;
452  }
453  }
454 
455  /* note: num_on_grid will be 8 times larger than the particle number, but num_field_points will generally be much smaller */
456  num_on_grid = ((large_numpart_type)ngrid) * 8;
457 
458  /* bring the part-field into the order of the accessed cells. This allows the removal of duplicates */
459  mycxxsort(part_sortindex, part_sortindex + num_on_grid, pm_nonperiodic_sortindex_comparator(part));
460 
461  large_array_offset num_field_points;
462 
463  if(num_on_grid > 0)
464  num_field_points = 1;
465  else
466  num_field_points = 0;
467 
468  /* determine the number of unique field points */
469  for(large_numpart_type i = 1; i < num_on_grid; i++)
470  {
471  if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
472  num_field_points++;
473  }
474 
475  /* allocate the local field */
476  localfield_globalindex = (large_array_offset *)Mem.mymalloc_movable(&localfield_globalindex, "localfield_globalindex",
477  num_field_points * sizeof(large_array_offset));
478  localfield_data = (fft_real *)Mem.mymalloc_movable(&localfield_data, "localfield_data", num_field_points * sizeof(fft_real));
479  localfield_first = (size_t *)Mem.mymalloc_movable(&localfield_first, "localfield_first", NTask * sizeof(size_t));
480  localfield_sendcount = (size_t *)Mem.mymalloc_movable(&localfield_sendcount, "localfield_sendcount", NTask * sizeof(size_t));
481  localfield_offset = (size_t *)Mem.mymalloc_movable(&localfield_offset, "localfield_offset", NTask * sizeof(size_t));
482  localfield_recvcount = (size_t *)Mem.mymalloc_movable(&localfield_recvcount, "localfield_recvcount", NTask * sizeof(size_t));
483 
484  for(int i = 0; i < NTask; i++)
485  {
486  localfield_first[i] = 0;
487  localfield_sendcount[i] = 0;
488  }
489 
490  /* establish the cross link between the part[ ]-array and the local list of
491  * mesh points. Also, count on which CPU the needed field points are stored.
492  */
493  num_field_points = 0;
494  for(large_numpart_type i = 0; i < num_on_grid; i++)
495  {
496  if(i > 0)
497  if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
498  num_field_points++;
499 
500  part[part_sortindex[i]].localindex = num_field_points;
501 
502  if(i > 0)
503  if(part[part_sortindex[i]].globalindex == part[part_sortindex[i - 1]].globalindex)
504  continue;
505 
506  localfield_globalindex[num_field_points] = part[part_sortindex[i]].globalindex;
507 
508 #ifndef FFT_COLUMN_BASED
509  int slab = part[part_sortindex[i]].globalindex / (GRID * GRID2);
510  int task = myplan.slab_to_task[slab];
511 #else
512  int task, column = part[part_sortindex[i]].globalindex / (GRID2);
513 
514  if(column < pivotcol)
515  task = column / avg;
516  else
517  task = (column - pivotcol) / (avg - 1) + tasklastsection;
518 #endif
519 
520  if(localfield_sendcount[task] == 0)
521  localfield_first[task] = num_field_points;
522 
523  localfield_sendcount[task]++;
524  }
525  num_field_points++;
526 
527  localfield_offset[0] = 0;
528  for(int i = 1; i < NTask; i++)
529  localfield_offset[i] = localfield_offset[i - 1] + localfield_sendcount[i - 1];
530 
531  Mem.myfree_movable(part_sortindex);
532  part_sortindex = NULL;
533 
534  /* now bin the local particle data onto the mesh list */
535  for(large_numpart_type i = 0; i < num_field_points; i++)
536  localfield_data[i] = 0;
537 
538  for(large_numpart_type i = 0; i < num_on_grid; i += 8)
539  {
540  int pindex = (part[i].partindex >> 3);
541 
542  MyIntPosType diff[3] = {P[pindex].IntPos[0] - Sp->ReferenceIntPos[grnr][0], P[pindex].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
543  P[pindex].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
544 
545  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
546 
547  double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
548  double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
549  double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
550 
551  int slab_x = (int)(dx);
552  int slab_y = (int)(dy);
553  int slab_z = (int)(dz);
554 
555  dx -= slab_x;
556  dy -= slab_y;
557  dz -= slab_z;
558 
559  double weight = P[pindex].getMass();
560 
561  localfield_data[part[i + 0].localindex] += weight * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
562  localfield_data[part[i + 1].localindex] += weight * (1.0 - dx) * (1.0 - dy) * dz;
563  localfield_data[part[i + 2].localindex] += weight * (1.0 - dx) * dy * (1.0 - dz);
564  localfield_data[part[i + 3].localindex] += weight * (1.0 - dx) * dy * dz;
565  localfield_data[part[i + 4].localindex] += weight * (dx) * (1.0 - dy) * (1.0 - dz);
566  localfield_data[part[i + 5].localindex] += weight * (dx) * (1.0 - dy) * dz;
567  localfield_data[part[i + 6].localindex] += weight * (dx)*dy * (1.0 - dz);
568  localfield_data[part[i + 7].localindex] += weight * (dx)*dy * dz;
569  }
570 
571  rhogrid = (fft_real *)Mem.mymalloc("rhogrid", maxfftsize * sizeof(fft_real));
572 
573  /* clear local FFT-mesh density field */
574  for(large_array_offset ii = 0; ii < maxfftsize; ii++)
575  rhogrid[ii] = 0;
576 
577  /* exchange data and add contributions to the local mesh-path */
578  MPI_Alltoall(localfield_sendcount, sizeof(size_t), MPI_BYTE, localfield_recvcount, sizeof(size_t), MPI_BYTE, Communicator);
579 
580  for(int level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
581  {
582  int recvTask = ThisTask ^ level;
583 
584  if(recvTask < NTask)
585  {
586  if(level > 0)
587  {
588  import_data = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
589  import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
590  localfield_recvcount[recvTask] * sizeof(large_array_offset));
591 
592  if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
593  {
594  myMPI_Sendrecv(localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
595  MPI_BYTE, recvTask, TAG_NONPERIOD_A, import_data, localfield_recvcount[recvTask] * sizeof(fft_real),
596  MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);
597 
598  myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
599  localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_B,
600  import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
601  TAG_NONPERIOD_B, Communicator, &status);
602  }
603  }
604  else
605  {
606  import_data = localfield_data + localfield_offset[ThisTask];
607  import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
608  }
609 
610  /* note: here every element in rhogrid is only accessed once, so there should be no race condition */
611  for(large_numpart_type i = 0; i < localfield_recvcount[recvTask]; i++)
612  {
613  /* determine offset in local FFT slab */
614 #ifndef FFT_COLUMN_BASED
615  large_array_offset offset =
616  import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRID * ((large_array_offset)GRID2);
617 #else
618  large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
619 #endif
620  rhogrid[offset] += import_data[i];
621  }
622 
623  if(level > 0)
624  {
625  Mem.myfree(import_globalindex);
626  Mem.myfree(import_data);
627  }
628  }
629  }
630 }
631 
632 /* Function to read out the force component corresponding to spatial dimension 'dim'.
633  * If dim is negative, potential values are read out and assigned to particles.
634  */
635 void pm_nonperiodic::pmforce_nonperiodic_zoom_optimized_readout_forces_or_potential(int grnr, int dim)
636 {
637 #ifdef EVALPOTENTIAL
638  double fac = 1.0 / (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / pow(GRID, 3); /* to get potential */
639 #endif
640 
641  particle_data *P = Sp->P;
642 
643  MPI_Status status;
644 
645  fft_real *grid;
646 
647  if(dim < 0)
648  grid = rhogrid;
649  else
650  grid = forcegrid;
651 
652  double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
653 
654  for(int level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
655  {
656  int recvTask = ThisTask ^ level;
657 
658  if(recvTask < NTask)
659  {
660  if(level > 0)
661  {
662  import_data = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
663  import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
664  localfield_recvcount[recvTask] * sizeof(large_array_offset));
665 
666  if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
667  {
668  myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
669  localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_C,
670  import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
671  TAG_NONPERIOD_C, Communicator, &status);
672  }
673  }
674  else
675  {
676  import_data = localfield_data + localfield_offset[ThisTask];
677  import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
678  }
679 
680  for(large_numpart_type i = 0; i < localfield_recvcount[recvTask]; i++)
681  {
682 #ifndef FFT_COLUMN_BASED
683  large_array_offset offset =
684  import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRID * ((large_array_offset)GRID2);
685 #else
686  large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
687 #endif
688  import_data[i] = grid[offset];
689  }
690 
691  if(level > 0)
692  {
693  myMPI_Sendrecv(import_data, localfield_recvcount[recvTask] * sizeof(fft_real), MPI_BYTE, recvTask, TAG_NONPERIOD_A,
694  localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
695  MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);
696 
697  Mem.myfree(import_globalindex);
698  Mem.myfree(import_data);
699  }
700  }
701  }
702 
703  /* read out the force/potential values, which all have been assembled in localfield_data */
704 
705  int ngrid = (num_on_grid >> 3);
706 
707  for(int k = 0; k < ngrid; k++)
708  {
709  large_numpart_type j = (((large_numpart_type)k) << 3);
710 
711  int i = (part[j].partindex >> 3);
712 
713 #if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
714  if(!Sp->TimeBinSynchronized[P[i].TimeBinGrav])
715  continue;
716 #endif
717 
718  MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
719  P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
720 
721  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
722 
723  double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
724  double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
725  double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
726 
727  int slab_x = (int)(dx);
728  int slab_y = (int)(dy);
729  int slab_z = (int)(dz);
730 
731  dx -= slab_x;
732  dy -= slab_y;
733  dz -= slab_z;
734 
735  double value = +localfield_data[part[j + 0].localindex] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
736  localfield_data[part[j + 1].localindex] * (1.0 - dx) * (1.0 - dy) * dz +
737  localfield_data[part[j + 2].localindex] * (1.0 - dx) * dy * (1.0 - dz) +
738  localfield_data[part[j + 3].localindex] * (1.0 - dx) * dy * dz +
739  localfield_data[part[j + 4].localindex] * (dx) * (1.0 - dy) * (1.0 - dz) +
740  localfield_data[part[j + 5].localindex] * (dx) * (1.0 - dy) * dz +
741  localfield_data[part[j + 6].localindex] * (dx)*dy * (1.0 - dz) +
742  localfield_data[part[j + 7].localindex] * (dx)*dy * dz;
743 
744  if(dim < 0)
745  {
746 #ifdef EVALPOTENTIAL
747  P[i].Potential += value * fac;
748 #endif
749  }
750  else
751  {
752  Sp->P[i].GravAccel[dim] += value;
753  }
754  }
755 }
756 
757 #else
758 
759 void pm_nonperiodic::pmforce_nonperiodic_uniform_optimized_prepare_density(int grnr)
760 {
761  double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
762 
763  Sndpm_count = (size_t *)Mem.mymalloc("Sndpm_count", NTask * sizeof(size_t));
764  Sndpm_offset = (size_t *)Mem.mymalloc("Sndpm_offset", NTask * sizeof(size_t));
765  Rcvpm_count = (size_t *)Mem.mymalloc("Rcvpm_count", NTask * sizeof(size_t));
766  Rcvpm_offset = (size_t *)Mem.mymalloc("Rcvpm_offset", NTask * sizeof(size_t));
767 
768 #ifdef FFT_COLUMN_BASED
769  int columns = GRIDX * GRIDY;
770  int avg = (columns - 1) / NTask + 1;
771  int exc = NTask * avg - columns;
772  int tasklastsection = NTask - exc;
773  int pivotcol = tasklastsection * avg;
774 #endif
775 
776  /* determine the slabs/columns each particles accesses */
777  {
778  size_t *send_count = Sndpm_count;
779 
780  for(int j = 0; j < NTask; j++)
781  send_count[j] = 0;
782 
783  for(int idx = 0; idx < NSource; idx++)
784  {
785  int i = Sp->get_active_index(idx);
786 
787  if(Sp->P[i].Ti_Current != All.Ti_Current)
788  Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);
789 
790  MyIntPosType diff[3] = {Sp->P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], Sp->P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
791  Sp->P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
792 
793  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
794 
795  if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
796  continue;
797 
798  if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
799  continue;
800 
801  if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
802  continue;
803 
804  int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
805  int slab_xx = slab_x + 1;
806 
807 #ifndef FFT_COLUMN_BASED
808  int task0 = myplan.slab_to_task[slab_x];
809  int task1 = myplan.slab_to_task[slab_xx];
810 
811  send_count[task0]++;
812  if(task0 != task1)
813  send_count[task1]++;
814 #else
815  int slab_y = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
816  int slab_yy = slab_y + 1;
817 
818  int column0 = slab_x * GRID + slab_y;
819  int column1 = slab_x * GRID + slab_yy;
820  int column2 = slab_xx * GRID + slab_y;
821  int column3 = slab_xx * GRID + slab_yy;
822 
823  int task0, task1, task2, task3;
824 
825  if(column0 < pivotcol)
826  task0 = column0 / avg;
827  else
828  task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
829 
830  if(column1 < pivotcol)
831  task1 = column1 / avg;
832  else
833  task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
834 
835  if(column2 < pivotcol)
836  task2 = column2 / avg;
837  else
838  task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
839 
840  if(column3 < pivotcol)
841  task3 = column3 / avg;
842  else
843  task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
844 
845  send_count[task0]++;
846  if(task1 != task0)
847  send_count[task1]++;
848  if(task2 != task1 && task2 != task0)
849  send_count[task2]++;
850  if(task3 != task0 && task3 != task1 && task3 != task2)
851  send_count[task3]++;
852 #endif
853  }
854  }
855 
856  /* collect thread-specific offset table and collect the results from the other threads */
857  Sndpm_offset[0] = 0;
858  for(int i = 1; i < NTask; i++)
859  {
860  int ind = i;
861  int ind_prev = i - 1;
862 
863  Sndpm_offset[ind] = Sndpm_offset[ind_prev] + Sndpm_count[ind_prev];
864  }
865 
866  MPI_Alltoall(Sndpm_count, sizeof(size_t), MPI_BYTE, Rcvpm_count, sizeof(size_t), MPI_BYTE, Communicator);
867 
868  nimport = 0, nexport = 0, Rcvpm_offset[0] = 0, Sndpm_offset[0] = 0;
869  for(int j = 0; j < NTask; j++)
870  {
871  nexport += Sndpm_count[j];
872  nimport += Rcvpm_count[j];
873 
874  if(j > 0)
875  {
876  Sndpm_offset[j] = Sndpm_offset[j - 1] + Sndpm_count[j - 1];
877  Rcvpm_offset[j] = Rcvpm_offset[j - 1] + Rcvpm_count[j - 1];
878  }
879  }
880 
881  /* allocate import and export buffer */
882  partin = (partbuf *)Mem.mymalloc("partin", nimport * sizeof(partbuf));
883  partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
884 
885  {
886  size_t *send_count = Sndpm_count;
887  size_t *send_offset = Sndpm_offset;
888 
889  for(int j = 0; j < NTask; j++)
890  send_count[j] = 0;
891 
892  /* fill export buffer */
893  for(int idx = 0; idx < NSource; idx++)
894  {
895  int i = Sp->get_active_index(idx);
896 
897  MyIntPosType diff[3] = {Sp->P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], Sp->P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
898  Sp->P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
899 
900  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
901 
902  if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
903  continue;
904 
905  if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
906  continue;
907 
908  if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
909  continue;
910 
911  int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
912  int slab_xx = slab_x + 1;
913 
914 #ifndef FFT_COLUMN_BASED
915  int task0 = myplan.slab_to_task[slab_x];
916  int task1 = myplan.slab_to_task[slab_xx];
917 
918  size_t ind0 = send_offset[task0] + send_count[task0]++;
919  partout[ind0].Mass = Sp->P[i].getMass();
920  for(int j = 0; j < 3; j++)
921  partout[ind0].IntPos[j] = Sp->P[i].IntPos[j];
922 
923  if(task0 != task1)
924  {
925  size_t ind1 = send_offset[task1] + send_count[task1]++;
926  partout[ind1].Mass = Sp->P[i].getMass();
927  for(int j = 0; j < 3; j++)
928  partout[ind1].IntPos[j] = Sp->P[i].IntPos[j];
929  }
930 #else
931  int slab_y = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
932  int slab_yy = slab_y + 1;
933 
934  int column0 = slab_x * GRID + slab_y;
935  int column1 = slab_x * GRID + slab_yy;
936  int column2 = slab_xx * GRID + slab_y;
937  int column3 = slab_xx * GRID + slab_yy;
938 
939  int task0, task1, task2, task3;
940 
941  if(column0 < pivotcol)
942  task0 = column0 / avg;
943  else
944  task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
945 
946  if(column1 < pivotcol)
947  task1 = column1 / avg;
948  else
949  task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
950 
951  if(column2 < pivotcol)
952  task2 = column2 / avg;
953  else
954  task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
955 
956  if(column3 < pivotcol)
957  task3 = column3 / avg;
958  else
959  task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
960 
961  size_t ind0 = send_offset[task0] + send_count[task0]++;
962  partout[ind0].Mass = Sp->P[i].getMass();
963  for(int j = 0; j < 3; j++)
964  partout[ind0].IntPos[j] = Sp->P[i].IntPos[j];
965 
966  if(task1 != task0)
967  {
968  size_t ind1 = send_offset[task1] + send_count[task1]++;
969  partout[ind1].Mass = Sp->P[i].getMass();
970  for(int j = 0; j < 3; j++)
971  partout[ind1].IntPos[j] = Sp->P[i].IntPos[j];
972  }
973  if(task2 != task1 && task2 != task0)
974  {
975  size_t ind2 = send_offset[task2] + send_count[task2]++;
976  partout[ind2].Mass = Sp->P[i].getMass();
977  for(int j = 0; j < 3; j++)
978  partout[ind2].IntPos[j] = Sp->P[i].IntPos[j];
979  }
980  if(task3 != task0 && task3 != task1 && task3 != task2)
981  {
982  size_t ind3 = send_offset[task3] + send_count[task3]++;
983  partout[ind3].Mass = Sp->P[i].getMass();
984  for(int j = 0; j < 3; j++)
985  partout[ind3].IntPos[j] = Sp->P[i].IntPos[j];
986  }
987 #endif
988  }
989  }
990 
991  int flag_big = 0, flag_big_all;
992  for(int i = 0; i < NTask; i++)
993  if(Sndpm_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
994  flag_big = 1;
995 
996  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
997  * transfer the data in chunks.
998  */
999  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
1000 
1001  /* exchange particle data */
1002  myMPI_Alltoallv(partout, Sndpm_count, Sndpm_offset, partin, Rcvpm_count, Rcvpm_offset, sizeof(partbuf), flag_big_all, Communicator);
1003 
1004  Mem.myfree(partout);
1005 
1006  /* allocate density field */
1007  rhogrid = (fft_real *)Mem.mymalloc("rhogrid", maxfftsize * sizeof(fft_real));
1008 
1009  /* clear local FFT-mesh density field */
1010  for(size_t ii = 0; ii < maxfftsize; ii++)
1011  rhogrid[ii] = 0;
1012 
1013 #ifndef FFT_COLUMN_BASED
1014  /* bin particle data onto mesh, in multi-threaded fashion */
1015  {
1016  int first_y, count_y;
1017  subdivide_evenly(GRID, 1, 0, &first_y, &count_y);
1018  int last_y = first_y + count_y - 1;
1019 
1020  for(size_t i = 0; i < nimport; i++)
1021  {
1022  MyIntPosType diff[3] = {partin[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], partin[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
1023  partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
1024 
1025  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
1026 
1027  double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
1028  int slab_y = (int)(dy);
1029  dy -= slab_y;
1030 
1031  int slab_yy = slab_y + 1;
1032  int flag_slab_y, flag_slab_yy;
1033 
1034  if(slab_y >= first_y && slab_y <= last_y)
1035  flag_slab_y = 1;
1036  else
1037  flag_slab_y = 0;
1038 
1039  if(slab_yy >= first_y && slab_yy <= last_y)
1040  flag_slab_yy = 1;
1041  else
1042  flag_slab_yy = 0;
1043 
1044  if(flag_slab_y || flag_slab_yy)
1045  {
1046  double mass = partin[i].Mass;
1047 
1048  double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
1049  double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
1050  int slab_x = (int)(dx);
1051  int slab_z = (int)(dz);
1052  dx -= slab_x;
1053  dz -= slab_z;
1054 
1055  int slab_xx = slab_x + 1;
1056  int slab_zz = slab_z + 1;
1057 
1058  int flag_slab_x, flag_slab_xx;
1059 
1060  if(myplan.slab_to_task[slab_x] == ThisTask)
1061  {
1062  slab_x -= myplan.first_slab_x_of_task[ThisTask];
1063  flag_slab_x = 1;
1064  }
1065  else
1066  flag_slab_x = 0;
1067 
1068  if(myplan.slab_to_task[slab_xx] == ThisTask)
1069  {
1070  slab_xx -= myplan.first_slab_x_of_task[ThisTask];
1071  flag_slab_xx = 1;
1072  }
1073  else
1074  flag_slab_xx = 0;
1075 
1076  if(flag_slab_x)
1077  {
1078  if(flag_slab_y)
1079  {
1080  rhogrid[FI(slab_x, slab_y, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
1081  rhogrid[FI(slab_x, slab_y, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
1082  }
1083 
1084  if(flag_slab_yy)
1085  {
1086  rhogrid[FI(slab_x, slab_yy, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
1087  rhogrid[FI(slab_x, slab_yy, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
1088  }
1089  }
1090 
1091  if(flag_slab_xx)
1092  {
1093  if(flag_slab_y)
1094  {
1095  rhogrid[FI(slab_xx, slab_y, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
1096  rhogrid[FI(slab_xx, slab_y, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
1097  }
1098 
1099  if(flag_slab_yy)
1100  {
1101  rhogrid[FI(slab_xx, slab_yy, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
1102  rhogrid[FI(slab_xx, slab_yy, slab_zz)] += (mass * (dx) * (dy) * (dz));
1103  }
1104  }
1105  }
1106  }
1107  }
1108 
1109 #else
1110 
1111  struct data_cols
1112  {
1113  int col0, col1, col2, col3;
1114  double dx, dy;
1115  };
1116 
1117  data_cols *aux = (data_cols *)Mem.mymalloc("aux", nimport * sizeof(data_cols));
1118 
1119  for(int i = 0; i < nimport; i++)
1120  {
1121  MyIntPosType diff[3] = {partin[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], partin[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
1122  partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
1123 
1124  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
1125 
1126  aux[i].dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
1127  aux[i].dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
1128 
1129  int slab_x = (int)(aux[i].dx);
1130  int slab_y = (int)(aux[i].dy);
1131 
1132  aux[i].dx -= slab_x;
1133  aux[i].dy -= slab_y;
1134 
1135  int slab_xx = slab_x + 1;
1136  int slab_yy = slab_y + 1;
1137 
1138  aux[i].col0 = slab_x * GRID + slab_y;
1139  aux[i].col1 = slab_x * GRID + slab_yy;
1140  aux[i].col2 = slab_xx * GRID + slab_y;
1141  aux[i].col3 = slab_xx * GRID + slab_yy;
1142  }
1143 
1144  {
1145  int first_col, last_col, count_col;
1146  subdivide_evenly(myplan.ncol_XY, 1, 0, &first_col, &count_col);
1147  last_col = first_col + count_col - 1;
1148  first_col += myplan.firstcol_XY;
1149  last_col += myplan.firstcol_XY;
1150 
1151  for(int i = 0; i < nimport; i++)
1152  {
1153  int flag0, flag1, flag2, flag3;
1154  int col0 = aux[i].col0;
1155  int col1 = aux[i].col1;
1156  int col2 = aux[i].col2;
1157  int col3 = aux[i].col3;
1158 
1159  if(col0 >= first_col && col0 <= last_col)
1160  flag0 = 1;
1161  else
1162  flag0 = 0;
1163 
1164  if(col1 >= first_col && col1 <= last_col)
1165  flag1 = 1;
1166  else
1167  flag1 = 0;
1168 
1169  if(col2 >= first_col && col2 <= last_col)
1170  flag2 = 1;
1171  else
1172  flag2 = 0;
1173 
1174  if(col3 >= first_col && col3 <= last_col)
1175  flag3 = 1;
1176  else
1177  flag3 = 0;
1178 
1179  if(flag0 || flag1 || flag2 || flag3)
1180  {
1181  double mass = partin[i].Mass;
1182 
1183  double dx = aux[i].dx;
1184  double dy = aux[i].dy;
1185 
1186  MySignedIntPosType deltaz = (MySignedIntPosType)(partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]);
1187 
1188  double dz = to_slab_fac * (deltaz - Sp->Corner[grnr][2]);
1189  int slab_z = (int)(dz);
1190  dz -= slab_z;
1191  int slab_zz = slab_z + 1;
1192 
1193  if(flag0)
1194  {
1195  rhogrid[FC(col0, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
1196  rhogrid[FC(col0, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
1197  }
1198 
1199  if(flag1)
1200  {
1201  rhogrid[FC(col1, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
1202  rhogrid[FC(col1, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
1203  }
1204 
1205  if(flag2)
1206  {
1207  rhogrid[FC(col2, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
1208  rhogrid[FC(col2, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
1209  }
1210 
1211  if(flag3)
1212  {
1213  rhogrid[FC(col3, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
1214  rhogrid[FC(col3, slab_zz)] += (mass * (dx) * (dy) * (dz));
1215  }
1216  }
1217  }
1218  }
1219 
1220  Mem.myfree(aux);
1221 
1222 #endif
1223 }
1224 
1225 /* If dim<0, this function reads out the potential, otherwise Cartesian force components.
1226  */
1227 void pm_nonperiodic::pmforce_nonperiodic_uniform_optimized_readout_forces_or_potential(int grnr, int dim)
1228 {
1229 #ifdef EVALPOTENTIAL
1230  double fac = 1.0 / (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / pow(GRID, 3); /* to get potential */
1231 #endif
1232 
1233  double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
1234 
1235  double *flistin = (double *)Mem.mymalloc("flistin", nimport * sizeof(double));
1236  double *flistout = (double *)Mem.mymalloc("flistout", nexport * sizeof(double));
1237 
1238  fft_real *grid;
1239 
1240  if(dim < 0)
1241  grid = rhogrid;
1242  else
1243  grid = forcegrid;
1244 
1245 #ifdef FFT_COLUMN_BASED
1246  int columns = GRIDX * GRIDY;
1247  int avg = (columns - 1) / NTask + 1;
1248  int exc = NTask * avg - columns;
1249  int tasklastsection = NTask - exc;
1250  int pivotcol = tasklastsection * avg;
1251 #endif
1252 
1253  for(size_t i = 0; i < nimport; i++)
1254  {
1255  flistin[i] = 0;
1256 
1257  MyIntPosType diff[3] = {partin[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], partin[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
1258  partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
1259 
1260  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
1261 
1262  double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
1263  double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
1264  double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
1265 
1266  int slab_x = (int)(dx);
1267  int slab_y = (int)(dy);
1268  int slab_z = (int)(dz);
1269 
1270  dx -= slab_x;
1271  dy -= slab_y;
1272  dz -= slab_z;
1273 
1274  int slab_xx = slab_x + 1;
1275  int slab_yy = slab_y + 1;
1276  int slab_zz = slab_z + 1;
1277 
1278 #ifndef FFT_COLUMN_BASED
1279  if(myplan.slab_to_task[slab_x] == ThisTask)
1280  {
1281  slab_x -= myplan.first_slab_x_of_task[ThisTask];
1282 
1283  flistin[i] += +grid[FI(slab_x, slab_y, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
1284  grid[FI(slab_x, slab_y, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz) +
1285  grid[FI(slab_x, slab_yy, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) +
1286  grid[FI(slab_x, slab_yy, slab_zz)] * (1.0 - dx) * (dy) * (dz);
1287  }
1288 
1289  if(myplan.slab_to_task[slab_xx] == ThisTask)
1290  {
1291  slab_xx -= myplan.first_slab_x_of_task[ThisTask];
1292 
1293  flistin[i] += +grid[FI(slab_xx, slab_y, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) +
1294  grid[FI(slab_xx, slab_y, slab_zz)] * (dx) * (1.0 - dy) * (dz) +
1295  grid[FI(slab_xx, slab_yy, slab_z)] * (dx) * (dy) * (1.0 - dz) +
1296  grid[FI(slab_xx, slab_yy, slab_zz)] * (dx) * (dy) * (dz);
1297  }
1298 #else
1299  int column0 = slab_x * GRID + slab_y;
1300  int column1 = slab_x * GRID + slab_yy;
1301  int column2 = slab_xx * GRID + slab_y;
1302  int column3 = slab_xx * GRID + slab_yy;
1303 
1304  if(column0 >= myplan.firstcol_XY && column0 <= myplan.lastcol_XY)
1305  {
1306  flistin[i] += +grid[FC(column0, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
1307  grid[FC(column0, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz);
1308  }
1309  if(column1 >= myplan.firstcol_XY && column1 <= myplan.lastcol_XY)
1310  {
1311  flistin[i] +=
1312  +grid[FC(column1, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) + grid[FC(column1, slab_zz)] * (1.0 - dx) * (dy) * (dz);
1313  }
1314 
1315  if(column2 >= myplan.firstcol_XY && column2 <= myplan.lastcol_XY)
1316  {
1317  flistin[i] +=
1318  +grid[FC(column2, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) + grid[FC(column2, slab_zz)] * (dx) * (1.0 - dy) * (dz);
1319  }
1320 
1321  if(column3 >= myplan.firstcol_XY && column3 <= myplan.lastcol_XY)
1322  {
1323  flistin[i] += +grid[FC(column3, slab_z)] * (dx) * (dy) * (1.0 - dz) + grid[FC(column3, slab_zz)] * (dx) * (dy) * (dz);
1324  }
1325 #endif
1326  }
1327 
1328  /* exchange the potential component data */
1329  int flag_big = 0, flag_big_all;
1330  for(int i = 0; i < NTask; i++)
1331  if(Sndpm_count[i] * sizeof(double) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
1332  flag_big = 1;
1333 
1334  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
1335  * transfer the data in chunks.
1336  */
1337  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
1338 
1339  /* exchange data */
1340  myMPI_Alltoallv(flistin, Rcvpm_count, Rcvpm_offset, flistout, Sndpm_count, Sndpm_offset, sizeof(double), flag_big_all, Communicator);
1341 
1342  /* now assign them to the correct particles */
1343 
1344  size_t *send_count = Sndpm_count;
1345  size_t *send_offset = Sndpm_offset;
1346 
1347  for(int j = 0; j < NTask; j++)
1348  send_count[j] = 0;
1349 
1350  for(int idx = 0; idx < NSource; idx++)
1351  {
1352  int i = Sp->get_active_index(idx);
1353 
1354  MyIntPosType diff[3] = {Sp->P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], Sp->P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
1355  Sp->P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
1356 
1357  MySignedIntPosType *delta = (MySignedIntPosType *)diff;
1358 
1359  if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
1360  continue;
1361 
1362  if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
1363  continue;
1364 
1365  if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
1366  continue;
1367 
1368  int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
1369  int slab_xx = slab_x + 1;
1370 
1371 #ifndef FFT_COLUMN_BASED
1372  int task0 = myplan.slab_to_task[slab_x];
1373  int task1 = myplan.slab_to_task[slab_xx];
1374 
1375  double value = flistout[send_offset[task0] + send_count[task0]++];
1376 
1377  if(task0 != task1)
1378  value += flistout[send_offset[task1] + send_count[task1]++];
1379 #else
1380  int slab_y = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
1381  int slab_yy = slab_y + 1;
1382 
1383  int column0 = slab_x * GRID + slab_y;
1384  int column1 = slab_x * GRID + slab_yy;
1385  int column2 = slab_xx * GRID + slab_y;
1386  int column3 = slab_xx * GRID + slab_yy;
1387 
1388  int task0, task1, task2, task3;
1389 
1390  if(column0 < pivotcol)
1391  task0 = column0 / avg;
1392  else
1393  task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
1394 
1395  if(column1 < pivotcol)
1396  task1 = column1 / avg;
1397  else
1398  task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
1399 
1400  if(column2 < pivotcol)
1401  task2 = column2 / avg;
1402  else
1403  task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
1404 
1405  if(column3 < pivotcol)
1406  task3 = column3 / avg;
1407  else
1408  task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
1409 
1410  double value = flistout[send_offset[task0] + send_count[task0]++];
1411 
1412  if(task1 != task0)
1413  value += flistout[send_offset[task1] + send_count[task1]++];
1414 
1415  if(task2 != task1 && task2 != task0)
1416  value += flistout[send_offset[task2] + send_count[task2]++];
1417 
1418  if(task3 != task0 && task3 != task1 && task3 != task2)
1419  value += flistout[send_offset[task3] + send_count[task3]++];
1420 #endif
1421 
1422 #if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
1423  if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
1424  continue;
1425 #endif
1426 
1427  if(dim < 0)
1428  {
1429 #ifdef EVALPOTENTIAL
1430  Sp->P[i].Potential += value * fac;
1431 #endif
1432  }
1433  else
1434  {
1435  Sp->P[i].GravAccel[dim] += value;
1436  }
1437  }
1438 
1439  Mem.myfree(flistout);
1440  Mem.myfree(flistin);
1441 }
1442 #endif
1443 
1451 int pm_nonperiodic::pmforce_nonperiodic(int grnr)
1452 {
1453  double tstart = Logs.second();
1454 
1455  mpi_printf("PM-NONPERIODIC: Starting non-periodic PM calculation (Rcut=%g, grid=%d) presently allocated=%g MB.\n", Sp->Rcut[grnr],
1456  grnr, Mem.getAllocatedBytesInMB());
1457 
1458  if(grnr == 1 && Sp->Rcut[1] > Sp->Rcut[0])
1459  Terminate(
1460  "We have Sp->Rcut[1]=%g > Sp->Rcut[0]=%g, which means that the high-res cut-off is larger than the normal one... this is "
1461  "not good.",
1462  Sp->Rcut[1], Sp->Rcut[0]);
1463 
1464 #ifdef HIERARCHICAL_GRAVITY
1465  NSource = Sp->TimeBinsGravity.NActiveParticles;
1466 #else
1467  NSource = Sp->NumPart;
1468 #endif
1469 
1470 #ifndef TREEPM_NOTIMESPLIT
1471  if(NSource != Sp->NumPart)
1472  Terminate("unexpected NSource != Sp->NumPart");
1473 #endif
1474 
1475 #ifndef NUMPART_PER_TASK_LARGE
1476  if((((long long)Sp->NumPart) << 3) >= (((long long)1) << 31))
1477  Terminate("We are dealing with a too large particle number per MPI rank - enabling NUMPART_PER_TASK_LARGE might help.");
1478 #endif
1479 
1480  double fac = 1.0 / (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / pow(GRID, 3); /* to get potential */
1481  fac *= 1 / (2 * (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / GRID); /* for finite differencing */
1482 
1483 #ifdef PM_ZOOM_OPTIMIZED
1484  pmforce_nonperiodic_zoom_optimized_prepare_density(grnr);
1485 #else
1486  pmforce_nonperiodic_uniform_optimized_prepare_density(grnr);
1487 #endif
1488 
1489  /* allocate the memory to hold the FFT fields */
1490  forcegrid = (fft_real *)Mem.mymalloc("forcegrid", maxfftsize * sizeof(fft_real));
1491 
1492  workspace = forcegrid;
1493 
1494 #ifndef FFT_COLUMN_BASED
1495  fft_of_rhogrid = (fft_complex *)&rhogrid[0];
1496 #else
1497  fft_of_rhogrid = (fft_complex *)&workspace[0];
1498 #endif
1499 
1500  /* Do the FFT of the density field */
1501 #ifndef FFT_COLUMN_BASED
1502  my_slab_based_fft(&myplan, &rhogrid[0], &workspace[0], 1);
1503 #else
1504  my_column_based_fft(&myplan, rhogrid, workspace, 1); /* result is in workspace, not in rhogrid ! */
1505 #endif
1506 
1507  /* multiply with kernel in Fourier space */
1508  /* multiply with the Fourier transform of the Green's function (kernel) */
1509 
1510  /* multiply with Green's function in order to obtain the potential */
1511 
1512 #ifdef FFT_COLUMN_BASED
1513  for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
1514  {
1515 #else
1516  for(int x = 0; x < GRID; x++)
1517  for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
1518  for(int z = 0; z < GRIDz; z++)
1519  {
1520 #endif
1521 
1522 #ifndef FFT_COLUMN_BASED
1523  large_array_offset ip = ((large_array_offset)GRIDz) * (GRID * (y - myplan.slabstart_y) + x) + z;
1524 #endif
1525 
1526  double re = fft_of_rhogrid[ip][0] * fft_of_kernel[grnr][ip][0] - fft_of_rhogrid[ip][1] * fft_of_kernel[grnr][ip][1];
1527  double im = fft_of_rhogrid[ip][0] * fft_of_kernel[grnr][ip][1] + fft_of_rhogrid[ip][1] * fft_of_kernel[grnr][ip][0];
1528 
1529  fft_of_rhogrid[ip][0] = re;
1530  fft_of_rhogrid[ip][1] = im;
1531  }
1532 
1533  /* Do the inverse FFT to get the potential */
1534 
1535 #ifndef FFT_COLUMN_BASED
1536  my_slab_based_fft(&myplan, rhogrid, workspace, -1);
1537 #else
1538  my_column_based_fft(&myplan, workspace, rhogrid, -1);
1539 #endif
1540 
1541  /* Now rhogrid holds the potential */
1542 
1543 #ifdef EVALPOTENTIAL
1544 #ifdef PM_ZOOM_OPTIMIZED
1545  pmforce_nonperiodic_zoom_optimized_readout_forces_or_potential(grnr, -1);
1546 #else
1547  pmforce_nonperiodic_uniform_optimized_readout_forces_or_potential(grnr, -1);
1548 #endif
1549 #endif
1550 
1551  /* get the force components by finite differencing of the potential for each dimension,
1552  * and send the results back to the right CPUs
1553  */
1554  for(int dim = 2; dim >= 0; dim--) /* Calculate each component of the force. */
1555  {
1556  /* we do the x component last, because for differencing the potential in the x-direction, we need to construct the transpose */
1557 #ifndef FFT_COLUMN_BASED
1558  if(dim == 0)
1559  my_slab_transposeA(&myplan, rhogrid, forcegrid); /* compute the transpose of the potential field for finite differencing */
1560 
1561  for(int y = 2; y < GRID / 2 - 2; y++)
1562  for(int x = 0; x < myplan.nslab_x; x++)
1563  if(x + myplan.slabstart_x >= 2 && x + myplan.slabstart_x < GRID / 2 - 2)
1564  for(int z = 2; z < GRID / 2 - 2; z++)
1565  {
1566  int yrr = y, yll = y, yr = y, yl = y;
1567  int zrr = z, zll = z, zr = z, zl = z;
1568 
1569  switch(dim)
1570  {
1571  case 0: /* note: for the x-direction, we difference the transposed direction (y) */
1572  case 1:
1573  yr = y + 1;
1574  yl = y - 1;
1575  yrr = y + 2;
1576  yll = y - 2;
1577 
1578  break;
1579  case 2:
1580  zr = z + 1;
1581  zl = z - 1;
1582  zrr = z + 2;
1583  zll = z - 2;
1584 
1585  break;
1586  }
1587 
1588  if(dim == 0)
1589  forcegrid[TI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[TI(x, yl, zl)] - rhogrid[TI(x, yr, zr)]) -
1590  (1.0 / 6) * (rhogrid[TI(x, yll, zll)] - rhogrid[TI(x, yrr, zrr)]));
1591  else
1592  forcegrid[FI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[FI(x, yl, zl)] - rhogrid[FI(x, yr, zr)]) -
1593  (1.0 / 6) * (rhogrid[FI(x, yll, zll)] - rhogrid[FI(x, yrr, zrr)]));
1594  }
1595 
1596  if(dim == 0)
1597  my_slab_transposeB(&myplan, forcegrid, rhogrid); /* reverse the transpose from above */
1598 #else
1599  fft_real *scratch;
1600 
1601  if(dim != 2)
1602  {
1603  scratch = (fft_real *)Mem.mymalloc("scratch", myplan.fftsize * sizeof(fft_real)); /* need a third field as scratch space */
1604  memcpy(scratch, rhogrid, myplan.fftsize * sizeof(fft_real));
1605 
1606  if(dim == 1)
1607  my_fft_swap23(&myplan, scratch, forcegrid);
1608  else
1609  my_fft_swap13(&myplan, scratch, forcegrid);
1610  }
1611 
1612  int ncols;
1613  if(dim == 2)
1614  ncols = myplan.ncol_XY;
1615  else if(dim == 1)
1616  ncols = myplan.ncol_XZ;
1617  else
1618  ncols = myplan.ncol_ZY;
1619 
1620  for(int i = 0; i < ncols; i++)
1621  {
1622  fft_real *forcep, *potp;
1623 
1624  if(dim != 2)
1625  {
1626  forcep = &scratch[GRID * i];
1627  potp = &forcegrid[GRID * i];
1628  }
1629  else
1630  {
1631  forcep = &forcegrid[GRID2 * i];
1632  potp = &rhogrid[GRID2 * i];
1633  }
1634 
1635  for(int z = 2; z < GRID / 2 - 2; z++)
1636  {
1637  int zr = z + 1;
1638  int zl = z - 1;
1639  int zrr = z + 2;
1640  int zll = z - 2;
1641 
1642  forcep[z] = fac * ((4.0 / 3) * (potp[zl] - potp[zr]) - (1.0 / 6) * (potp[zll] - potp[zrr]));
1643  }
1644  }
1645 
1646  if(dim != 2)
1647  {
1648  if(dim == 1)
1649  my_fft_swap23back(&myplan, scratch, forcegrid);
1650  else
1651  my_fft_swap13back(&myplan, scratch, forcegrid);
1652 
1653  Mem.myfree(scratch);
1654  }
1655 #endif
1656 
1657 #ifdef PM_ZOOM_OPTIMIZED
1658  pmforce_nonperiodic_zoom_optimized_readout_forces_or_potential(grnr, dim);
1659 #else
1660  pmforce_nonperiodic_uniform_optimized_readout_forces_or_potential(grnr, dim);
1661 #endif
1662  }
1663 
1664  /* free stuff */
1665  Mem.myfree(forcegrid);
1666  Mem.myfree(rhogrid);
1667 
1668 #ifdef PM_ZOOM_OPTIMIZED
1669  Mem.myfree(localfield_recvcount);
1670  Mem.myfree(localfield_offset);
1671  Mem.myfree(localfield_sendcount);
1672  Mem.myfree(localfield_first);
1673  Mem.myfree(localfield_data);
1674  Mem.myfree(localfield_globalindex);
1675  Mem.myfree(part);
1676 #else
1677  Mem.myfree(partin);
1678  Mem.myfree(Rcvpm_offset);
1679  Mem.myfree(Rcvpm_count);
1680  Mem.myfree(Sndpm_offset);
1681  Mem.myfree(Sndpm_count);
1682 #endif
1683 
1684  double tend = Logs.second();
1685 
1686  mpi_printf("PM-NONPERIODIC: done. (took %g seconds)\n", Logs.timediff(tstart, tend));
1687 
1688  return 0;
1689 }
1690 
1694 void pm_nonperiodic::pm_setup_nonperiodic_kernel(void)
1695 {
1696  mpi_printf("PM-NONPERIODIC: Setting up non-periodic PM kernel(s) (GRID=%d) presently allocated=%g MB).\n", (int)GRID,
1698 
1699  /* now set up kernel and its Fourier transform */
1700 
1701 #if !defined(PERIODIC)
1702  for(size_t i = 0; i < maxfftsize; i++) /* clear local field */
1703  kernel[0][i] = 0;
1704 
1705 #ifndef FFT_COLUMN_BASED
1706  for(int i = myplan.slabstart_x; i < (myplan.slabstart_x + myplan.nslab_x); i++)
1707  for(int j = 0; j < GRID; j++)
1708  {
1709 #else
1710  for(int c = myplan.firstcol_XY; c < (myplan.firstcol_XY + myplan.ncol_XY); c++)
1711  {
1712  int i = c / GRID;
1713  int j = c % GRID;
1714 #endif
1715  for(int k = 0; k < GRID; k++)
1716  {
1717  double xx = ((double)i) / GRID;
1718  double yy = ((double)j) / GRID;
1719  double zz = ((double)k) / GRID;
1720 
1721  if(xx >= 0.5)
1722  xx -= 1.0;
1723  if(yy >= 0.5)
1724  yy -= 1.0;
1725  if(zz >= 0.5)
1726  zz -= 1.0;
1727 
1728  double r = sqrt(xx * xx + yy * yy + zz * zz);
1729 
1730  double u = 0.5 * r / (((double)ASMTH) / GRID);
1731 
1732  double fac = 1 - erfc(u);
1733 
1734 #ifndef FFT_COLUMN_BASED
1735  size_t ip = FI(i - myplan.slabstart_x, j, k);
1736 #else
1737  size_t ip = FC(c, k);
1738 #endif
1739  if(r > 0)
1740  kernel[0][ip] = -fac / r;
1741  else
1742  kernel[0][ip] = -1 / (sqrt(M_PI) * (((double)ASMTH) / GRID));
1743  }
1744  }
1745 
1746  {
1747  fft_real *workspc = (fft_real *)Mem.mymalloc("workspc", maxfftsize * sizeof(fft_real));
1748  /* Do the FFT of the kernel */
1749 #ifndef FFT_COLUMN_BASED
1750  my_slab_based_fft(&myplan, kernel[0], workspc, 1);
1751 #else
1752  my_column_based_fft(&myplan, kernel[0], workspc, 1); /* result is in workspace, not in kernel */
1753  memcpy(kernel[0], workspc, maxfftsize * sizeof(fft_real));
1754 #endif
1755  Mem.myfree(workspc);
1756  }
1757 
1758 #endif
1759 
1760 #if defined(PLACEHIGHRESREGION)
1761 
1762  for(int i = 0; i < maxfftsize; i++) /* clear local field */
1763  kernel[1][i] = 0;
1764 
1765 #ifndef FFT_COLUMN_BASED
1766  for(int i = myplan.slabstart_x; i < (myplan.slabstart_x + myplan.nslab_x); i++)
1767  for(int j = 0; j < GRID; j++)
1768  {
1769 #else
1770  for(int c = myplan.firstcol_XY; c < (myplan.firstcol_XY + myplan.ncol_XY); c++)
1771  {
1772  int i = c / GRID;
1773  int j = c % GRID;
1774 #endif
1775  for(int k = 0; k < GRID; k++)
1776  {
1777  double xx = ((double)i) / GRID;
1778  double yy = ((double)j) / GRID;
1779  double zz = ((double)k) / GRID;
1780 
1781  if(xx >= 0.5)
1782  xx -= 1.0;
1783  if(yy >= 0.5)
1784  yy -= 1.0;
1785  if(zz >= 0.5)
1786  zz -= 1.0;
1787 
1788  double r = sqrt(xx * xx + yy * yy + zz * zz);
1789 
1790  double u = 0.5 * r / (((double)ASMTH) / GRID);
1791 
1792  double fac = erfc(u * Sp->Asmth[1] / Sp->Asmth[0]) - erfc(u);
1793 
1794 #ifndef FFT_COLUMN_BASED
1795  size_t ip = FI(i - myplan.slabstart_x, j, k);
1796 #else
1797  size_t ip = FC(c, k);
1798 #endif
1799 
1800  if(r > 0)
1801  kernel[1][ip] = -fac / r;
1802  else
1803  {
1804  fac = 1 - Sp->Asmth[1] / Sp->Asmth[0];
1805  kernel[1][ip] = -fac / (sqrt(M_PI) * (((double)ASMTH) / GRID));
1806  }
1807  }
1808 #ifndef FFT_COLUMN_BASED
1809  }
1810 #else
1811  }
1812 #endif
1813 
1814  {
1815  fft_real *workspc = (fft_real *)Mem.mymalloc("workspc", maxfftsize * sizeof(fft_real));
1816  /* Do the FFT of the kernel */
1817 #ifndef FFT_COLUMN_BASED
1818  my_slab_based_fft(&myplan, kernel[1], workspc, 1);
1819 #else
1820  my_column_based_fft(&myplan, kernel[1], workspc, 1); /* result is in workspace, not in kernel */
1821  memcpy(kernel[1], workspc, maxfftsize * sizeof(fft_real));
1822 #endif
1823  Mem.myfree(workspc);
1824  }
1825 
1826 #endif
1827 
1828  /* deconvolve the Greens function twice with the CIC kernel */
1829 #ifdef FFT_COLUMN_BASED
1830 
1831  for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
1832  {
1833  large_array_offset ipcell = ip + myplan.transposed_firstcol * GRID;
1834  int y = ipcell / (GRID * GRIDz);
1835  int yr = ipcell % (GRID * GRIDz);
1836  int z = yr / GRID;
1837  int x = yr % GRID;
1838 #else
1839  for(int x = 0; x < GRID; x++)
1840  for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
1841  for(int z = 0; z < GRIDz; z++)
1842  {
1843 #endif
1844  double kx, ky, kz;
1845 
1846  if(x > GRID / 2)
1847  kx = x - GRID;
1848  else
1849  kx = x;
1850  if(y > GRID / 2)
1851  ky = y - GRID;
1852  else
1853  ky = y;
1854  if(z > GRID / 2)
1855  kz = z - GRID;
1856  else
1857  kz = z;
1858 
1859  double k2 = kx * kx + ky * ky + kz * kz;
1860 
1861  if(k2 > 0)
1862  {
1863  double fx = 1, fy = 1, fz = 1;
1864 
1865  if(kx != 0)
1866  {
1867  fx = (M_PI * kx) / GRID;
1868  fx = sin(fx) / fx;
1869  }
1870  if(ky != 0)
1871  {
1872  fy = (M_PI * ky) / GRID;
1873  fy = sin(fy) / fy;
1874  }
1875  if(kz != 0)
1876  {
1877  fz = (M_PI * kz) / GRID;
1878  fz = sin(fz) / fz;
1879  }
1880 
1881  double ff = 1 / (fx * fy * fz);
1882  ff = ff * ff * ff * ff;
1883 
1884 #ifndef FFT_COLUMN_BASED
1885  large_array_offset ip = ((large_array_offset)GRIDz) * (GRID * (y - myplan.slabstart_y) + x) + z;
1886 #endif
1887 #if !defined(PERIODIC)
1888  fft_of_kernel[0][ip][0] *= ff;
1889  fft_of_kernel[0][ip][1] *= ff;
1890 #endif
1891 #if defined(PLACEHIGHRESREGION)
1892  fft_of_kernel[1][ip][0] *= ff;
1893  fft_of_kernel[1][ip][1] *= ff;
1894 #endif
1895  }
1896  }
1897 
1898  /* end deconvolution */
1899 }
1900 
1901 #endif
global_data_all_processes All
Definition: main.cc:40
double timediff(double t0, double t1)
Definition: logs.cc:488
double second(void)
Definition: logs.cc:471
double getAllocatedBytesInMB(void)
Definition: mymalloc.h:68
void my_column_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
void my_column_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
void my_fft_swap13back(fft_plan *plan, fft_real *data, fft_real *out)
void my_fft_swap13(fft_plan *plan, fft_real *data, fft_real *out)
void my_slab_transposeB(fft_plan *plan, fft_real *field, fft_real *scratch)
void my_slab_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
void my_fft_swap23(fft_plan *plan, fft_real *data, fft_real *out)
void my_fft_swap23back(fft_plan *plan, fft_real *data, fft_real *out)
void my_slab_transposeA(fft_plan *plan, fft_real *field, fft_real *scratch)
void my_slab_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
void mpi_printf(const char *fmt,...)
Definition: setcomm.h:55
int ThisTask
Definition: setcomm.h:33
int NTask
Definition: setcomm.h:32
int PTask
Definition: setcomm.h:34
MPI_Comm Communicator
Definition: setcomm.h:31
#define RCUT
Definition: constants.h:293
#define LOW_MESH
Definition: constants.h:33
#define HIGH_MESH
Definition: constants.h:34
#define MPI_MESSAGE_SIZELIMIT_IN_BYTES
Definition: constants.h:53
#define M_PI
Definition: constants.h:56
#define ASMTH
Definition: constants.h:287
double mycxxsort(T *begin, T *end, Tcomp comp)
Definition: cxxsort.h:39
uint32_t MyIntPosType
Definition: dtypes.h:35
int32_t MySignedIntPosType
Definition: dtypes.h:36
#define BITS_FOR_POSITIONS
Definition: dtypes.h:37
logs Logs
Definition: main.cc:43
#define Terminate(...)
Definition: macros.h:19
MPI_Datatype MPI_MyIntPosType
Definition: mpi_vars.cc:24
MPI_Op MPI_MAX_MySignedIntPosType
Definition: mpi_vars.cc:29
MPI_Op MPI_MIN_MySignedIntPosType
Definition: mpi_vars.cc:28
int myMPI_Sendrecv(void *sendbuf, size_t sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvbuf, size_t recvcount, MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status *status)
#define TAG_NONPERIOD_B
Definition: mpi_utils.h:45
#define TAG_NONPERIOD_C
Definition: mpi_utils.h:46
void myMPI_Alltoallv(void *sendbuf, size_t *sendcounts, size_t *sdispls, void *recvbuf, size_t *recvcounts, size_t *rdispls, int len, int big_flag, MPI_Comm comm)
Definition: myalltoall.cc:177
#define TAG_NONPERIOD_A
Definition: mpi_utils.h:44
memory Mem
Definition: main.cc:44
expr sin(half arg)
Definition: half.hpp:2816
expr pow(half base, half exp)
Definition: half.hpp:2803
expr erfc(half arg)
Definition: half.hpp:2925
expr sqrt(half arg)
Definition: half.hpp:2777
#define FFTW(x)
Definition: pm_mpi_fft.h:22
integertime Ti_Current
Definition: allvars.h:188
MyDouble getMass(void)
vector< MyFloat > GravAccel
Definition: particle_data.h:55
MyIntPosType IntPos[3]
Definition: particle_data.h:53
void subdivide_evenly(long long N, int pieces, int index_bin, long long *first, int *count)
Definition: system.cc:51