GADGET-4
pm_mpi_fft.cc
Go to the documentation of this file.
1 /*******************************************************************************
2  * \copyright This file is part of the GADGET4 N-body/SPH code developed
3  * \copyright by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
4  * \copyright (vspringel@mpa-garching.mpg.de) and all contributing authors.
5  *******************************************************************************/
6 
12 #include "gadgetconfig.h"
13 
14 #if defined(PMGRID) || defined(NGENIC)
15 
16 #include <fftw3.h>
17 #include <math.h>
18 #include <mpi.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <algorithm>
23 
24 #include "../data/allvars.h"
25 #include "../data/dtypes.h"
26 #include "../data/mymalloc.h"
27 #include "../main/simulation.h"
28 #include "../mpi_utils/mpi_utils.h"
29 #include "../pm/pm.h"
30 #include "../system/system.h"
31 
32 /* We only use the one-dimensional FFTW3 routines, because the MPI versions of FFTW3
33  * allocated memory for themselves during the transforms (which we want to strictly avoid),
34  * and because we want to allow transforms that are so big that more than 2GB may be
35  * transferred betweeen processors.
36  */
37 
38 #ifndef FFT_COLUMN_BASED
39 
40 void pm_mpi_fft::my_slab_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
41 {
42  subdivide_evenly(NgridX, NTask, ThisTask, &plan->slabstart_x, &plan->nslab_x);
43  subdivide_evenly(NgridY, NTask, ThisTask, &plan->slabstart_y, &plan->nslab_y);
44 
45  plan->slab_to_task = (int *)Mem.mymalloc_movable(&plan->slab_to_task, "slab_to_task", NgridX * sizeof(int));
46 
47  for(int task = 0; task < NTask; task++)
48  {
49  int start, n;
50 
51  subdivide_evenly(NgridX, NTask, task, &start, &n);
52 
53  for(int i = start; i < start + n; i++)
54  plan->slab_to_task[i] = task;
55  }
56 
57  MPI_Allreduce(&plan->nslab_x, &plan->largest_x_slab, 1, MPI_INT, MPI_MAX, Communicator);
58  MPI_Allreduce(&plan->nslab_y, &plan->largest_y_slab, 1, MPI_INT, MPI_MAX, Communicator);
59 
60  plan->slabs_x_per_task = (int *)Mem.mymalloc_movable(&plan->slabs_x_per_task, "slabs_x_per_task", NTask * sizeof(int));
61  MPI_Allgather(&plan->nslab_x, 1, MPI_INT, plan->slabs_x_per_task, 1, MPI_INT, Communicator);
62 
63  plan->first_slab_x_of_task = (int *)Mem.mymalloc_movable(&plan->first_slab_x_of_task, "first_slab_x_of_task", NTask * sizeof(int));
64  MPI_Allgather(&plan->slabstart_x, 1, MPI_INT, plan->first_slab_x_of_task, 1, MPI_INT, Communicator);
65 
66  plan->slabs_y_per_task = (int *)Mem.mymalloc_movable(&plan->slabs_y_per_task, "slabs_y_per_task", NTask * sizeof(int));
67  MPI_Allgather(&plan->nslab_y, 1, MPI_INT, plan->slabs_y_per_task, 1, MPI_INT, Communicator);
68 
69  plan->first_slab_y_of_task = (int *)Mem.mymalloc_movable(&plan->first_slab_y_of_task, "first_slab_y_of_task", NTask * sizeof(int));
70  MPI_Allgather(&plan->slabstart_y, 1, MPI_INT, plan->first_slab_y_of_task, 1, MPI_INT, Communicator);
71 
72  plan->NgridX = NgridX;
73  plan->NgridY = NgridY;
74  plan->NgridZ = NgridZ;
75 
76  int Ngridz = NgridZ / 2 + 1; /* dimension needed in complex space */
77 
78  plan->Ngridz = Ngridz;
79  plan->Ngrid2 = 2 * Ngridz;
80 }
81 
82 void pm_mpi_fft::my_slab_based_fft_free(fft_plan *plan)
83 {
84  Mem.myfree(plan->first_slab_y_of_task);
85  Mem.myfree(plan->slabs_y_per_task);
86  Mem.myfree(plan->first_slab_x_of_task);
87  Mem.myfree(plan->slabs_x_per_task);
88  Mem.myfree(plan->slab_to_task);
89 }
90 
101 void pm_mpi_fft::my_slab_transposeA(fft_plan *plan, fft_real *field, fft_real *scratch)
102 {
103  int n, prod, task, flag_big = 0, flag_big_all = 0;
104 
105  prod = NTask * plan->nslab_x;
106 
107  for(n = 0; n < prod; n++)
108  {
109  int x = n / NTask;
110  int task = n % NTask;
111 
112  int y;
113 
114  for(y = plan->first_slab_y_of_task[task]; y < plan->first_slab_y_of_task[task] + plan->slabs_y_per_task[task]; y++)
115  memcpy(scratch + ((size_t)plan->NgridZ) * (plan->first_slab_y_of_task[task] * plan->nslab_x +
116  x * plan->slabs_y_per_task[task] + (y - plan->first_slab_y_of_task[task])),
117  field + ((size_t)plan->Ngrid2) * (plan->NgridY * x + y), plan->NgridZ * sizeof(fft_real));
118  }
119 
120  size_t *scount = (size_t *)Mem.mymalloc("scount", NTask * sizeof(size_t));
121  size_t *rcount = (size_t *)Mem.mymalloc("rcount", NTask * sizeof(size_t));
122  size_t *soff = (size_t *)Mem.mymalloc("soff", NTask * sizeof(size_t));
123  size_t *roff = (size_t *)Mem.mymalloc("roff", NTask * sizeof(size_t));
124 
125  for(task = 0; task < NTask; task++)
126  {
127  scount[task] = plan->nslab_x * plan->slabs_y_per_task[task] * (plan->NgridZ * sizeof(fft_real));
128  rcount[task] = plan->nslab_y * plan->slabs_x_per_task[task] * (plan->NgridZ * sizeof(fft_real));
129 
130  soff[task] = plan->first_slab_y_of_task[task] * plan->nslab_x * (plan->NgridZ * sizeof(fft_real));
131  roff[task] = plan->first_slab_x_of_task[task] * plan->nslab_y * (plan->NgridZ * sizeof(fft_real));
132 
133  if(scount[task] > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
134  flag_big = 1;
135  }
136 
137  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
138 
139  myMPI_Alltoallv(scratch, scount, soff, field, rcount, roff, 1, flag_big_all, Communicator);
140 
141  Mem.myfree(roff);
142  Mem.myfree(soff);
143  Mem.myfree(rcount);
144  Mem.myfree(scount);
145 }
146 
156 void pm_mpi_fft::my_slab_transposeB(fft_plan *plan, fft_real *field, fft_real *scratch)
157 {
158  int n, prod, task, flag_big = 0, flag_big_all = 0;
159 
160  size_t *scount = (size_t *)Mem.mymalloc("scount", NTask * sizeof(size_t));
161  size_t *rcount = (size_t *)Mem.mymalloc("rcount", NTask * sizeof(size_t));
162  size_t *soff = (size_t *)Mem.mymalloc("soff", NTask * sizeof(size_t));
163  size_t *roff = (size_t *)Mem.mymalloc("roff", NTask * sizeof(size_t));
164 
165  for(task = 0; task < NTask; task++)
166  {
167  rcount[task] = plan->nslab_x * plan->slabs_y_per_task[task] * (plan->NgridZ * sizeof(fft_real));
168  scount[task] = plan->nslab_y * plan->slabs_x_per_task[task] * (plan->NgridZ * sizeof(fft_real));
169 
170  roff[task] = plan->first_slab_y_of_task[task] * plan->nslab_x * (plan->NgridZ * sizeof(fft_real));
171  soff[task] = plan->first_slab_x_of_task[task] * plan->nslab_y * (plan->NgridZ * sizeof(fft_real));
172 
173  if(scount[task] > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
174  flag_big = 1;
175  }
176 
177  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
178 
179  myMPI_Alltoallv(field, scount, soff, scratch, rcount, roff, 1, flag_big_all, Communicator);
180 
181  Mem.myfree(roff);
182  Mem.myfree(soff);
183  Mem.myfree(rcount);
184  Mem.myfree(scount);
185 
186  prod = NTask * plan->nslab_x;
187 
188  for(n = 0; n < prod; n++)
189  {
190  int x = n / NTask;
191  int task = n % NTask;
192 
193  int y;
194  for(y = plan->first_slab_y_of_task[task]; y < plan->first_slab_y_of_task[task] + plan->slabs_y_per_task[task]; y++)
195  memcpy(field + ((size_t)plan->Ngrid2) * (plan->NgridY * x + y),
196  scratch + ((size_t)plan->NgridZ) * (plan->first_slab_y_of_task[task] * plan->nslab_x +
197  x * plan->slabs_y_per_task[task] + (y - plan->first_slab_y_of_task[task])),
198  plan->NgridZ * sizeof(fft_real));
199  }
200 }
201 
202 /* Given a slab-decomposed 3D field a[...] with total dimension [nx x ny x nz], whose first dimension is
203  * split across the processors, this routine outputs in b[] the transpose where then the second dimension is split
204  * across the processors. sx[] gives for each MPI task how many slabs it has, and firstx[] is the first
205  * slab for a given task. Likewise, sy[]/firsty[] gives the same thing for the transposed order. Note, the
206  * contents of the array a[] will be destroyed by the routine.
207  *
208  * An element (x,y,z) is accessed in a[] with index [([x - firstx] * ny + y) * nz + z]
209  * and in b[] as [((y - firsty) * nx + x) * nz + z]
210  *
211  * if mode = 1, the reverse operation is carried out.
212  */
213 void pm_mpi_fft::my_slab_transpose(void *av, void *bv, int *sx, int *firstx, int *sy, int *firsty, int nx, int ny, int nz, int mode)
214 {
215  char *a = (char *)av;
216  char *b = (char *)bv;
217 
218  size_t *scount = (size_t *)Mem.mymalloc("scount", NTask * sizeof(size_t));
219  size_t *rcount = (size_t *)Mem.mymalloc("rcount", NTask * sizeof(size_t));
220  size_t *soff = (size_t *)Mem.mymalloc("soff", NTask * sizeof(size_t));
221  size_t *roff = (size_t *)Mem.mymalloc("roff", NTask * sizeof(size_t));
222  int i, n, prod, flag_big = 0, flag_big_all = 0;
223 
224  for(i = 0; i < NTask; i++)
225  {
226  scount[i] = sy[i] * sx[ThisTask] * ((size_t)nz);
227  rcount[i] = sy[ThisTask] * sx[i] * ((size_t)nz);
228  soff[i] = firsty[i] * sx[ThisTask] * ((size_t)nz);
229  roff[i] = sy[ThisTask] * firstx[i] * ((size_t)nz);
230 
231  if(scount[i] * sizeof(fft_complex) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
232  flag_big = 1;
233  }
234 
235  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
236  * transfer the data in chunks.
237  */
238  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
239 
240  if(mode == 0)
241  {
242  /* first pack the data into contiguous blocks */
243  prod = NTask * sx[ThisTask];
244  for(n = 0; n < prod; n++)
245  {
246  int k = n / NTask;
247  int i = n % NTask;
248  int j;
249 
250  for(j = 0; j < sy[i]; j++)
251  memcpy(b + (k * sy[i] + j + firsty[i] * sx[ThisTask]) * (nz * sizeof(fft_complex)),
252  a + (k * ny + (firsty[i] + j)) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
253  }
254 
255  /* tranfer the data */
256  myMPI_Alltoallv(b, scount, soff, a, rcount, roff, sizeof(fft_complex), flag_big_all, Communicator);
257 
258  /* unpack the data into the right order */
259  prod = NTask * sy[ThisTask];
260  for(n = 0; n < prod; n++)
261  {
262  int j = n / NTask;
263  int i = n % NTask;
264  int k;
265 
266  for(k = 0; k < sx[i]; k++)
267  memcpy(b + (j * nx + k + firstx[i]) * (nz * sizeof(fft_complex)),
268  a + ((k + firstx[i]) * sy[ThisTask] + j) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
269  }
270  }
271  else
272  {
273  /* first pack the data into contiguous blocks */
274  prod = NTask * sy[ThisTask];
275  for(n = 0; n < prod; n++)
276  {
277  int j = n / NTask;
278  int i = n % NTask;
279  int k;
280 
281  for(k = 0; k < sx[i]; k++)
282  memcpy(b + ((k + firstx[i]) * sy[ThisTask] + j) * (nz * sizeof(fft_complex)),
283  a + (j * nx + k + firstx[i]) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
284  }
285 
286  /* tranfer the data */
287  myMPI_Alltoallv(b, rcount, roff, a, scount, soff, sizeof(fft_complex), flag_big_all, Communicator);
288 
289  /* unpack the data into the right order */
290  prod = NTask * sx[ThisTask];
291  for(n = 0; n < prod; n++)
292  {
293  int k = n / NTask;
294  int i = n % NTask;
295  int j;
296 
297  for(j = 0; j < sy[i]; j++)
298  memcpy(b + (k * ny + (firsty[i] + j)) * (nz * sizeof(fft_complex)),
299  a + (k * sy[i] + j + firsty[i] * sx[ThisTask]) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
300  }
301  }
302 
303  /* now the result is in b[] */
304 
305  Mem.myfree(roff);
306  Mem.myfree(soff);
307  Mem.myfree(rcount);
308  Mem.myfree(scount);
309 }
310 
311 void pm_mpi_fft::my_slab_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
312 {
313  int n, prod;
314  int slabsx = plan->slabs_x_per_task[ThisTask];
315  int slabsy = plan->slabs_y_per_task[ThisTask];
316 
317  int ngridx = plan->NgridX;
318  int ngridy = plan->NgridY;
319  int ngridz = plan->Ngridz;
320  int ngridz2 = 2 * ngridz;
321 
322  size_t ngridx_long = ngridx;
323  size_t ngridy_long = ngridy;
324  size_t ngridz_long = ngridz;
325  size_t ngridz2_long = ngridz2;
326 
327  fft_real *data_real = (fft_real *)data;
328  fft_complex *data_complex = (fft_complex *)data, *workspace_complex = (fft_complex *)workspace;
329 
330  if(forward == 1)
331  {
332  /* do the z-direction FFT, real to complex */
333  prod = slabsx * ngridy;
334  for(n = 0; n < prod; n++)
335  {
336  FFTW(execute_dft_r2c)(plan->forward_plan_zdir, data_real + n * ngridz2_long, workspace_complex + n * ngridz_long);
337  }
338 
339  /* do the y-direction FFT, complex to complex */
340  prod = slabsx * ngridz;
341  for(n = 0; n < prod; n++)
342  {
343  int i = n / ngridz;
344  int j = n % ngridz;
345 
346  FFTW(execute_dft)
347  (plan->forward_plan_ydir, workspace_complex + i * ngridz * ngridy_long + j, data_complex + i * ngridz * ngridy_long + j);
348  }
349 
350  /* now our data resides in data_complex[] */
351 
352  /* do the transpose */
353  my_slab_transpose(data_complex, workspace_complex, plan->slabs_x_per_task, plan->first_slab_x_of_task, plan->slabs_y_per_task,
354  plan->first_slab_y_of_task, ngridx, ngridy, ngridz, 0);
355 
356  /* now the data is in workspace_complex[] */
357 
358  /* finally, do the transform along the x-direction (we are in transposed order, x and y have interchanged */
359  prod = slabsy * ngridz;
360  for(n = 0; n < prod; n++)
361  {
362  int i = n / ngridz;
363  int j = n % ngridz;
364 
365  FFTW(execute_dft)
366  (plan->forward_plan_xdir, workspace_complex + i * ngridz * ngridx_long + j, data_complex + i * ngridz * ngridx_long + j);
367  }
368 
369  /* now the result is in data_complex[] */
370  }
371  else
372  {
373  prod = slabsy * ngridz;
374 
375  for(n = 0; n < prod; n++)
376  {
377  int i = n / ngridz;
378  int j = n % ngridz;
379 
380  FFTW(execute_dft)
381  (plan->backward_plan_xdir, data_complex + i * ngridz * ngridx_long + j, workspace_complex + i * ngridz * ngridx_long + j);
382  }
383 
384  my_slab_transpose(workspace_complex, data_complex, plan->slabs_x_per_task, plan->first_slab_x_of_task, plan->slabs_y_per_task,
385  plan->first_slab_y_of_task, ngridx, ngridy, ngridz, 1);
386 
387  prod = slabsx * ngridz;
388 
389  for(n = 0; n < prod; n++)
390  {
391  int i = n / ngridz;
392  int j = n % ngridz;
393 
394  FFTW(execute_dft)
395  (plan->backward_plan_ydir, data_complex + i * ngridz * ngridy_long + j, workspace_complex + i * ngridz * ngridy_long + j);
396  }
397 
398  prod = slabsx * ngridy;
399 
400  for(n = 0; n < prod; n++)
401  {
402  FFTW(execute_dft_c2r)(plan->backward_plan_zdir, workspace_complex + n * ngridz_long, data_real + n * ngridz2_long);
403  }
404 
405  /* now the result is in data[] */
406  }
407 }
408 
409 #else
410 
411 void pm_mpi_fft::my_column_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
412 {
413  plan->NgridX = NgridX;
414  plan->NgridY = NgridY;
415  plan->NgridZ = NgridZ;
416 
417  int Ngridz = NgridZ / 2 + 1;
418 
419  plan->Ngridz = Ngridz;
420  plan->Ngrid2 = 2 * Ngridz;
421 
422  subdivide_evenly(plan->NgridX * plan->NgridY, NTask, ThisTask, &plan->firstcol_XY, &plan->ncol_XY);
423  subdivide_evenly(plan->NgridX * plan->Ngrid2, NTask, ThisTask, &plan->firstcol_XZ, &plan->ncol_XZ);
424  subdivide_evenly(plan->Ngrid2 * plan->NgridY, NTask, ThisTask, &plan->firstcol_ZY, &plan->ncol_ZY);
425 
426  plan->lastcol_XY = plan->firstcol_XY + plan->ncol_XY - 1;
427  plan->lastcol_XZ = plan->firstcol_XZ + plan->ncol_XZ - 1;
428  plan->lastcol_ZY = plan->firstcol_ZY + plan->ncol_ZY - 1;
429 
430  subdivide_evenly(NgridX * Ngridz, NTask, ThisTask, &plan->transposed_firstcol, &plan->transposed_ncol);
431  subdivide_evenly(NgridY * Ngridz, NTask, ThisTask, &plan->second_transposed_firstcol, &plan->second_transposed_ncol);
432 
433  plan->second_transposed_ncells = ((size_t)plan->NgridX) * plan->second_transposed_ncol;
434 
435  plan->max_datasize = ((size_t)plan->Ngrid2) * plan->ncol_XY;
436  plan->max_datasize = std::max<size_t>(plan->max_datasize, 2 * ((size_t)plan->NgridY) * plan->transposed_ncol);
437  plan->max_datasize = std::max<size_t>(plan->max_datasize, 2 * ((size_t)plan->NgridX) * plan->second_transposed_ncol);
438  plan->max_datasize = std::max<size_t>(plan->max_datasize, ((size_t)plan->ncol_XZ) * plan->NgridY);
439  plan->max_datasize = std::max<size_t>(plan->max_datasize, ((size_t)plan->ncol_ZY) * plan->NgridX);
440 
441  plan->fftsize = plan->max_datasize;
442 
443  plan->offsets_send_A = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_A, "offsets_send_A", NTask * sizeof(size_t));
444  plan->offsets_recv_A = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_A, "offsets_recv_A", NTask * sizeof(size_t));
445  plan->offsets_send_B = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_B, "offsets_send_B", NTask * sizeof(size_t));
446  plan->offsets_recv_B = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_B, "offsets_recv_B", NTask * sizeof(size_t));
447  plan->offsets_send_C = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_C, "offsets_send_C", NTask * sizeof(size_t));
448  plan->offsets_recv_C = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_C, "offsets_recv_C", NTask * sizeof(size_t));
449  plan->offsets_send_D = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_D, "offsets_send_D", NTask * sizeof(size_t));
450  plan->offsets_recv_D = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_D, "offsets_recv_D", NTask * sizeof(size_t));
451 
452  plan->count_send_A = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_A, "count_send_A", NTask * sizeof(size_t));
453  plan->count_recv_A = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_A, "count_recv_A", NTask * sizeof(size_t));
454  plan->count_send_B = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_B, "count_send_B", NTask * sizeof(size_t));
455  plan->count_recv_B = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_B, "count_recv_B", NTask * sizeof(size_t));
456  plan->count_send_C = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_C, "count_send_C", NTask * sizeof(size_t));
457  plan->count_recv_C = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_C, "count_recv_C", NTask * sizeof(size_t));
458  plan->count_send_D = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_D, "count_send_D", NTask * sizeof(size_t));
459  plan->count_recv_D = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_D, "count_recv_D", NTask * sizeof(size_t));
460  plan->count_send_13 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_13, "count_send_13", NTask * sizeof(size_t));
461  plan->count_recv_13 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_13, "count_recv_13", NTask * sizeof(size_t));
462  plan->count_send_23 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_23, "count_send_23", NTask * sizeof(size_t));
463  plan->count_recv_23 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_23, "count_recv_23", NTask * sizeof(size_t));
464  plan->count_send_13back =
465  (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_13back, "count_send_13back", NTask * sizeof(size_t));
466  plan->count_recv_13back =
467  (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_13back, "count_recv_13back", NTask * sizeof(size_t));
468  plan->count_send_23back =
469  (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_23back, "count_send_23back", NTask * sizeof(size_t));
470  plan->count_recv_23back =
471  (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_23back, "count_recv_23back", NTask * sizeof(size_t));
472 
473  int dimA[3] = {plan->NgridX, plan->NgridY, plan->Ngridz};
474  int permA[3] = {0, 2, 1};
475 
476  my_fft_column_remap(NULL, dimA, plan->firstcol_XY, plan->ncol_XY, NULL, permA, plan->transposed_firstcol, plan->transposed_ncol,
477  plan->offsets_send_A, plan->offsets_recv_A, plan->count_send_A, plan->count_recv_A, 1);
478 
479  int dimB[3] = {plan->NgridX, plan->Ngridz, plan->NgridY};
480  int permB[3] = {2, 1, 0};
481 
482  my_fft_column_remap(NULL, dimB, plan->transposed_firstcol, plan->transposed_ncol, NULL, permB, plan->second_transposed_firstcol,
483  plan->second_transposed_ncol, plan->offsets_send_B, plan->offsets_recv_B, plan->count_send_B, plan->count_recv_B,
484  1);
485 
486  int dimC[3] = {plan->NgridY, plan->Ngridz, plan->NgridX};
487  int permC[3] = {2, 1, 0};
488 
489  my_fft_column_remap(NULL, dimC, plan->second_transposed_firstcol, plan->second_transposed_ncol, NULL, permC,
490  plan->transposed_firstcol, plan->transposed_ncol, plan->offsets_send_C, plan->offsets_recv_C, plan->count_send_C,
491  plan->count_recv_C, 1);
492 
493  int dimD[3] = {plan->NgridX, plan->Ngridz, plan->NgridY};
494  int permD[3] = {0, 2, 1};
495 
496  my_fft_column_remap(NULL, dimD, plan->transposed_firstcol, plan->transposed_ncol, NULL, permD, plan->firstcol_XY, plan->ncol_XY,
497  plan->offsets_send_D, plan->offsets_recv_D, plan->count_send_D, plan->count_recv_D, 1);
498 
499  int dim23[3] = {plan->NgridX, plan->NgridY, plan->Ngrid2};
500  int perm23[3] = {0, 2, 1};
501 
502  my_fft_column_transpose(NULL, dim23, plan->firstcol_XY, plan->ncol_XY, NULL, perm23, plan->firstcol_XZ, plan->ncol_XZ,
503  plan->count_send_23, plan->count_recv_23, 1);
504 
505  int dim23back[3] = {plan->NgridX, plan->Ngrid2, plan->NgridY};
506  int perm23back[3] = {0, 2, 1};
507 
508  my_fft_column_transpose(NULL, dim23back, plan->firstcol_XZ, plan->ncol_XZ, NULL, perm23back, plan->firstcol_XY, plan->ncol_XY,
509  plan->count_send_23back, plan->count_recv_23back, 1);
510 
511  int dim13[3] = {plan->NgridX, plan->NgridY, plan->Ngrid2};
512  int perm13[3] = {2, 1, 0};
513 
514  my_fft_column_transpose(NULL, dim13, plan->firstcol_XY, plan->ncol_XY, NULL, perm13, plan->firstcol_ZY, plan->ncol_ZY,
515  plan->count_send_13, plan->count_recv_13, 1);
516 
517  int dim13back[3] = {plan->Ngrid2, plan->NgridY, plan->NgridX};
518  int perm13back[3] = {2, 1, 0};
519 
520  my_fft_column_transpose(NULL, dim13back, plan->firstcol_ZY, plan->ncol_ZY, NULL, perm13back, plan->firstcol_XY, plan->ncol_XY,
521  plan->count_send_13back, plan->count_recv_13back, 1);
522 }
523 
524 void pm_mpi_fft::my_column_based_fft_free(fft_plan *plan)
525 {
526  Mem.myfree(plan->count_recv_23back);
527  Mem.myfree(plan->count_send_23back);
528  Mem.myfree(plan->count_recv_13back);
529  Mem.myfree(plan->count_send_13back);
530  Mem.myfree(plan->count_recv_23);
531  Mem.myfree(plan->count_send_23);
532  Mem.myfree(plan->count_recv_13);
533  Mem.myfree(plan->count_send_13);
534  Mem.myfree(plan->count_recv_D);
535  Mem.myfree(plan->count_send_D);
536  Mem.myfree(plan->count_recv_C);
537  Mem.myfree(plan->count_send_C);
538  Mem.myfree(plan->count_recv_B);
539  Mem.myfree(plan->count_send_B);
540  Mem.myfree(plan->count_recv_A);
541  Mem.myfree(plan->count_send_A);
542 
543  Mem.myfree(plan->offsets_recv_D);
544  Mem.myfree(plan->offsets_send_D);
545  Mem.myfree(plan->offsets_recv_C);
546  Mem.myfree(plan->offsets_send_C);
547  Mem.myfree(plan->offsets_recv_B);
548  Mem.myfree(plan->offsets_send_B);
549  Mem.myfree(plan->offsets_recv_A);
550  Mem.myfree(plan->offsets_send_A);
551 }
552 
553 void pm_mpi_fft::my_fft_swap23(fft_plan *plan, fft_real *data, fft_real *out)
554 {
555  int dim23[3] = {plan->NgridX, plan->NgridY, plan->Ngrid2};
556  int perm23[3] = {0, 2, 1};
557 
558  my_fft_column_transpose(data, dim23, plan->firstcol_XY, plan->ncol_XY, out, perm23, plan->firstcol_XZ, plan->ncol_XZ,
559  plan->count_send_23, plan->count_recv_23, 0);
560 }
561 
562 void pm_mpi_fft::my_fft_swap23back(fft_plan *plan, fft_real *data, fft_real *out)
563 {
564  int dim23back[3] = {plan->NgridX, plan->Ngrid2, plan->NgridY};
565  int perm23back[3] = {0, 2, 1};
566 
567  my_fft_column_transpose(data, dim23back, plan->firstcol_XZ, plan->ncol_XZ, out, perm23back, plan->firstcol_XY, plan->ncol_XY,
568  plan->count_send_23back, plan->count_recv_23back, 0);
569 }
570 
571 void pm_mpi_fft::my_fft_swap13(fft_plan *plan, fft_real *data, fft_real *out)
572 {
573  int dim13[3] = {plan->NgridX, plan->NgridY, plan->Ngrid2};
574  int perm13[3] = {2, 1, 0};
575 
576  my_fft_column_transpose(data, dim13, plan->firstcol_XY, plan->ncol_XY, out, perm13, plan->firstcol_ZY, plan->ncol_ZY,
577  plan->count_send_13, plan->count_recv_13, 0);
578 }
579 
580 void pm_mpi_fft::my_fft_swap13back(fft_plan *plan, fft_real *data, fft_real *out)
581 {
582  int dim13back[3] = {plan->Ngrid2, plan->NgridY, plan->NgridX};
583  int perm13back[3] = {2, 1, 0};
584 
585  my_fft_column_transpose(data, dim13back, plan->firstcol_ZY, plan->ncol_ZY, out, perm13back, plan->firstcol_XY, plan->ncol_XY,
586  plan->count_send_13back, plan->count_recv_13back, 0);
587 }
588 
589 void pm_mpi_fft::my_column_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
590 {
591  size_t n;
592  fft_real *data_real = (fft_real *)data, *workspace_real = (fft_real *)workspace;
593  fft_complex *data_complex = (fft_complex *)data, *workspace_complex = (fft_complex *)workspace;
594 
595  if(forward == 1)
596  {
597  /* do the z-direction FFT, real to complex */
598  for(n = 0; n < plan->ncol_XY; n++)
599  FFTW(execute_dft_r2c)(plan->forward_plan_zdir, data_real + n * plan->Ngrid2, workspace_complex + n * plan->Ngridz);
600 
601  int dimA[3] = {plan->NgridX, plan->NgridY, plan->Ngridz};
602  int permA[3] = {0, 2, 1};
603 
604  my_fft_column_remap(workspace_complex, dimA, plan->firstcol_XY, plan->ncol_XY, data_complex, permA, plan->transposed_firstcol,
605  plan->transposed_ncol, plan->offsets_send_A, plan->offsets_recv_A, plan->count_send_A, plan->count_recv_A,
606  0);
607 
608  /* do the y-direction FFT in 'data', complex to complex */
609  for(n = 0; n < plan->transposed_ncol; n++)
610  FFTW(execute_dft)(plan->forward_plan_ydir, data_complex + n * plan->NgridY, workspace_complex + n * plan->NgridY);
611 
612  int dimB[3] = {plan->NgridX, plan->Ngridz, plan->NgridY};
613  int permB[3] = {2, 1, 0};
614 
615  my_fft_column_remap(workspace_complex, dimB, plan->transposed_firstcol, plan->transposed_ncol, data_complex, permB,
616  plan->second_transposed_firstcol, plan->second_transposed_ncol, plan->offsets_send_B, plan->offsets_recv_B,
617  plan->count_send_B, plan->count_recv_B, 0);
618 
619  /* do the x-direction FFT in 'data', complex to complex */
620  for(n = 0; n < plan->second_transposed_ncol; n++)
621  FFTW(execute_dft)(plan->forward_plan_xdir, data_complex + n * plan->NgridX, workspace_complex + n * plan->NgridX);
622 
623  /* result is now in workspace */
624  }
625  else
626  {
627  /* do inverse FFT in 'data' */
628  for(n = 0; n < plan->second_transposed_ncol; n++)
629  FFTW(execute_dft)(plan->backward_plan_xdir, data_complex + n * plan->NgridX, workspace_complex + n * plan->NgridX);
630 
631  int dimC[3] = {plan->NgridY, plan->Ngridz, plan->NgridX};
632  int permC[3] = {2, 1, 0};
633 
634  my_fft_column_remap(workspace_complex, dimC, plan->second_transposed_firstcol, plan->second_transposed_ncol, data_complex, permC,
635  plan->transposed_firstcol, plan->transposed_ncol, plan->offsets_send_C, plan->offsets_recv_C,
636  plan->count_send_C, plan->count_recv_C, 0);
637 
638  /* do inverse FFT in 'data' */
639  for(n = 0; n < plan->transposed_ncol; n++)
640  FFTW(execute_dft)(plan->backward_plan_ydir, data_complex + n * plan->NgridY, workspace_complex + n * plan->NgridY);
641 
642  int dimD[3] = {plan->NgridX, plan->Ngridz, plan->NgridY};
643  int permD[3] = {0, 2, 1};
644 
645  my_fft_column_remap(workspace_complex, dimD, plan->transposed_firstcol, plan->transposed_ncol, data_complex, permD,
646  plan->firstcol_XY, plan->ncol_XY, plan->offsets_send_D, plan->offsets_recv_D, plan->count_send_D,
647  plan->count_recv_D, 0);
648 
649  /* do complex-to-real inverse transform on z-coordinates */
650  for(n = 0; n < plan->ncol_XY; n++)
651  FFTW(execute_dft_c2r)(plan->backward_plan_zdir, data_complex + n * plan->Ngridz, workspace_real + n * plan->Ngrid2);
652  }
653 }
654 
655 void pm_mpi_fft::my_fft_column_remap(fft_complex *data, int Ndims[3], /* global dimensions of data cube */
656  int in_firstcol, int in_ncol, /* first column and number of columns */
657  fft_complex *out, int perm[3], int out_firstcol, int out_ncol, size_t *offset_send,
658  size_t *offset_recv, size_t *count_send, size_t *count_recv, size_t just_count_flag)
659 {
660  int j, target, origin, ngrp, recvTask, perm_rev[3], xyz[3], uvw[3];
661  size_t nimport, nexport;
662 
663  /* determine the inverse permutation */
664  for(j = 0; j < 3; j++)
665  perm_rev[j] = perm[j];
666 
667  if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2)) /* not yet the inverse */
668  {
669  for(j = 0; j < 3; j++)
670  perm_rev[j] = perm[perm[j]];
671 
672  if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2))
673  Terminate("bummer");
674  }
675 
676  int in_colums = Ndims[0] * Ndims[1];
677  int in_avg = (in_colums - 1) / NTask + 1;
678  int in_exc = NTask * in_avg - in_colums;
679  int in_tasklastsection = NTask - in_exc;
680  int in_pivotcol = in_tasklastsection * in_avg;
681 
682  int out_colums = Ndims[perm[0]] * Ndims[perm[1]];
683  int out_avg = (out_colums - 1) / NTask + 1;
684  int out_exc = NTask * out_avg - out_colums;
685  int out_tasklastsection = NTask - out_exc;
686  int out_pivotcol = out_tasklastsection * out_avg;
687 
688  size_t i, ncells = ((size_t)in_ncol) * Ndims[2];
689 
690  xyz[0] = in_firstcol / Ndims[1];
691  xyz[1] = in_firstcol % Ndims[1];
692  xyz[2] = 0;
693 
694  memset(count_send, 0, NTask * sizeof(size_t));
695 
696  /* loop over all cells in input array and determine target processor */
697  for(i = 0; i < ncells; i++)
698  {
699  /* determine target task */
700  uvw[0] = xyz[perm[0]];
701  uvw[1] = xyz[perm[1]];
702  uvw[2] = xyz[perm[2]];
703 
704  int newcol = Ndims[perm[1]] * uvw[0] + uvw[1];
705  if(newcol < out_pivotcol)
706  target = newcol / out_avg;
707  else
708  target = (newcol - out_pivotcol) / (out_avg - 1) + out_tasklastsection;
709 
710  /* move data element to targettask */
711 
712  if(just_count_flag)
713  count_send[target]++;
714  else
715  {
716  size_t off = offset_send[target] + count_send[target]++;
717  out[off][0] = data[i][0];
718  out[off][1] = data[i][1];
719  }
720  xyz[2]++;
721  if(xyz[2] == Ndims[2])
722  {
723  xyz[2] = 0;
724  xyz[1]++;
725  if(xyz[1] == Ndims[1])
726  {
727  xyz[1] = 0;
728  xyz[0]++;
729  }
730  }
731  }
732 
733  if(just_count_flag)
734  {
735  MPI_Alltoall(count_send, sizeof(size_t), MPI_BYTE, count_recv, sizeof(size_t), MPI_BYTE, Communicator);
736 
737  for(j = 0, nimport = 0, nexport = 0, offset_send[0] = 0, offset_recv[0] = 0; j < NTask; j++)
738  {
739  nexport += count_send[j];
740  nimport += count_recv[j];
741 
742  if(j > 0)
743  {
744  offset_send[j] = offset_send[j - 1] + count_send[j - 1];
745  offset_recv[j] = offset_recv[j - 1] + count_recv[j - 1];
746  }
747  }
748 
749  if(nexport != ncells)
750  Terminate("nexport=%lld != ncells=%lld", (long long)nexport, (long long)ncells);
751  }
752  else
753  {
754  nimport = 0;
755 
756  /* exchange all the data */
757  for(ngrp = 0; ngrp < (1 << PTask); ngrp++)
758  {
759  recvTask = ThisTask ^ ngrp;
760 
761  if(recvTask < NTask)
762  {
763  if(count_send[recvTask] > 0 || count_recv[recvTask] > 0)
764  myMPI_Sendrecv(&out[offset_send[recvTask]], count_send[recvTask] * sizeof(fft_complex), MPI_BYTE, recvTask, TAG_DENS_A,
765  &data[offset_recv[recvTask]], count_recv[recvTask] * sizeof(fft_complex), MPI_BYTE, recvTask,
766  TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
767 
768  nimport += count_recv[recvTask];
769  }
770  }
771 
772  /* now loop over the new cell layout */
773  /* find enclosing rectangle around columns in new plane */
774 
775  int first[3], last[3];
776 
777  first[0] = out_firstcol / Ndims[perm[1]];
778  first[1] = out_firstcol % Ndims[perm[1]];
779  first[2] = 0;
780 
781  last[0] = (out_firstcol + out_ncol - 1) / Ndims[perm[1]];
782  last[1] = (out_firstcol + out_ncol - 1) % Ndims[perm[1]];
783  last[2] = Ndims[perm[2]] - 1;
784 
785  if(first[1] + out_ncol >= Ndims[perm[1]])
786  {
787  first[1] = 0;
788  last[1] = Ndims[perm[1]] - 1;
789  }
790 
791  /* now need to map this back to the old coordinates */
792 
793  int xyz_first[3], xyz_last[3];
794 
795  for(j = 0; j < 3; j++)
796  {
797  xyz_first[j] = first[perm_rev[j]];
798  xyz_last[j] = last[perm_rev[j]];
799  }
800 
801  memset(count_recv, 0, NTask * sizeof(size_t));
802 
803  size_t count = 0;
804 
805  /* traverse an enclosing box around the new cell layout in the old order */
806  for(xyz[0] = xyz_first[0]; xyz[0] <= xyz_last[0]; xyz[0]++)
807  for(xyz[1] = xyz_first[1]; xyz[1] <= xyz_last[1]; xyz[1]++)
808  for(xyz[2] = xyz_first[2]; xyz[2] <= xyz_last[2]; xyz[2]++)
809  {
810  /* check that the point is actually part of a column */
811  uvw[0] = xyz[perm[0]];
812  uvw[1] = xyz[perm[1]];
813  uvw[2] = xyz[perm[2]];
814 
815  int col = uvw[0] * Ndims[perm[1]] + uvw[1];
816 
817  if(col >= out_firstcol && col < out_firstcol + out_ncol)
818  {
819  /* determine origin task */
820  int newcol = Ndims[1] * xyz[0] + xyz[1];
821  if(newcol < in_pivotcol)
822  origin = newcol / in_avg;
823  else
824  origin = (newcol - in_pivotcol) / (in_avg - 1) + in_tasklastsection;
825 
826  size_t index = ((size_t)Ndims[perm[2]]) * (col - out_firstcol) + uvw[2];
827 
828  /* move data element from origin task */
829  size_t off = offset_recv[origin] + count_recv[origin]++;
830  out[index][0] = data[off][0];
831  out[index][1] = data[off][1];
832 
833  count++;
834  }
835  }
836 
837  if(count != nimport)
838  {
839  int fi = out_firstcol % Ndims[perm[1]];
840  int la = (out_firstcol + out_ncol - 1) % Ndims[perm[1]];
841 
842  Terminate("count=%lld nimport=%lld ncol=%d fi=%d la=%d first=%d last=%d\n", (long long)count, (long long)nimport, out_ncol,
843  fi, la, first[1], last[1]);
844  }
845  }
846 }
847 
848 void pm_mpi_fft::my_fft_column_transpose(fft_real *data, int Ndims[3], /* global dimensions of data cube */
849  int in_firstcol, int in_ncol, /* first column and number of columns */
850  fft_real *out, int perm[3], int out_firstcol, int out_ncol, size_t *count_send,
851  size_t *count_recv, size_t just_count_flag)
852 {
853  /* determine the inverse permutation */
854  int perm_rev[3];
855  for(int j = 0; j < 3; j++)
856  perm_rev[j] = perm[j];
857 
858  if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2)) /* not yet the inverse */
859  {
860  for(int j = 0; j < 3; j++)
861  perm_rev[j] = perm[perm[j]];
862 
863  if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2))
864  Terminate("bummer");
865  }
866 
867  int in_colums = Ndims[0] * Ndims[1];
868  int in_avg = (in_colums - 1) / NTask + 1;
869  int in_exc = NTask * in_avg - in_colums;
870  int in_tasklastsection = NTask - in_exc;
871  int in_pivotcol = in_tasklastsection * in_avg;
872 
873  int out_colums = Ndims[perm[0]] * Ndims[perm[1]];
874  int out_avg = (out_colums - 1) / NTask + 1;
875  int out_exc = NTask * out_avg - out_colums;
876  int out_tasklastsection = NTask - out_exc;
877  int out_pivotcol = out_tasklastsection * out_avg;
878 
879  if(just_count_flag)
880  memset(count_send, 0, NTask * sizeof(size_t));
881 
882  /* exchange all the data */
883  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
884  {
885  int target = ThisTask ^ ngrp;
886 
887  if(target < NTask)
888  {
889  // check whether we have anything to do
890  if(count_send[target] == 0 && count_recv[target] == 0 && just_count_flag == 0)
891  continue;
892 
893  /* determine enclosing rectangle of current region */
894  int source_first[3];
895  source_first[0] = in_firstcol / Ndims[1];
896  source_first[1] = in_firstcol % Ndims[1];
897  source_first[2] = 0;
898 
899  int source_last[3];
900  source_last[0] = (in_firstcol + in_ncol - 1) / Ndims[1];
901  source_last[1] = (in_firstcol + in_ncol - 1) % Ndims[1];
902  source_last[2] = Ndims[2] - 1;
903 
904  if(source_first[1] + in_ncol >= Ndims[1])
905  {
906  source_first[1] = 0;
907  source_last[1] = Ndims[1] - 1;
908  }
909 
910  /* determine target columns */
911 
912  int target_first_col = 0;
913  int long target_num_col = 0;
914 
915  if(target < out_tasklastsection)
916  {
917  target_first_col = target * out_avg;
918  target_num_col = out_avg;
919  }
920  else
921  {
922  target_first_col = (target - out_tasklastsection) * (out_avg - 1) + out_pivotcol;
923  target_num_col = (out_avg - 1);
924  }
925 
926  /* find enclosing rectangle around columns in new plane */
927  int first[3], last[3];
928 
929  first[0] = target_first_col / Ndims[perm[1]];
930  first[1] = target_first_col % Ndims[perm[1]];
931  first[2] = 0;
932 
933  last[0] = (target_first_col + target_num_col - 1) / Ndims[perm[1]];
934  last[1] = (target_first_col + target_num_col - 1) % Ndims[perm[1]];
935  last[2] = Ndims[perm[2]] - 1;
936 
937  if(first[1] + target_num_col >= Ndims[perm[1]])
938  {
939  first[1] = 0;
940  last[1] = Ndims[perm[1]] - 1;
941  }
942 
943  /* now we map this back to the old coordinates */
944  int xyz_first[3], xyz_last[3];
945 
946  for(int j = 0; j < 3; j++)
947  {
948  xyz_first[j] = first[perm_rev[j]];
949  xyz_last[j] = last[perm_rev[j]];
950  }
951 
952  /* determine common box */
953  int xyz_start[3], xyz_end[3];
954  for(int j = 0; j < 3; j++)
955  {
956  xyz_start[j] = std::max<int>(xyz_first[j], source_first[j]);
957  xyz_end[j] = std::min<int>(xyz_last[j], source_last[j]);
958  }
959 
960  int xyz[3];
961  for(int j = 0; j < 3; j++)
962  xyz[j] = xyz_start[j];
963 
964  /* now do the same determination for the flipped situation on the target side */
965 
966  int flip_in_firstcol = 0;
967  int flip_in_ncol = 0;
968 
969  if(target < in_tasklastsection)
970  {
971  flip_in_firstcol = target * in_avg;
972  flip_in_ncol = in_avg;
973  }
974  else
975  {
976  flip_in_firstcol = (target - in_tasklastsection) * (in_avg - 1) + in_pivotcol;
977  flip_in_ncol = (in_avg - 1);
978  }
979 
980  /* determine enclosing rectangle of current region */
981  int flip_source_first[3];
982  flip_source_first[0] = flip_in_firstcol / Ndims[1];
983  flip_source_first[1] = flip_in_firstcol % Ndims[1];
984  flip_source_first[2] = 0;
985 
986  int flip_source_last[3];
987  flip_source_last[0] = (flip_in_firstcol + flip_in_ncol - 1) / Ndims[1];
988  flip_source_last[1] = (flip_in_firstcol + flip_in_ncol - 1) % Ndims[1];
989  flip_source_last[2] = Ndims[2] - 1;
990 
991  if(flip_source_first[1] + flip_in_ncol >= Ndims[1])
992  {
993  flip_source_first[1] = 0;
994  flip_source_last[1] = Ndims[1] - 1;
995  }
996 
997  /* determine target columns */
998 
999  int flip_first_col = 0;
1000  int flip_num_col = 0;
1001 
1002  if(ThisTask < out_tasklastsection)
1003  {
1004  flip_first_col = ThisTask * out_avg;
1005  flip_num_col = out_avg;
1006  }
1007  else
1008  {
1009  flip_first_col = (ThisTask - out_tasklastsection) * (out_avg - 1) + out_pivotcol;
1010  flip_num_col = (out_avg - 1);
1011  }
1012 
1013  /* find enclosing rectangle around columns in new plane */
1014  int flip_first[3], flip_last[3];
1015 
1016  flip_first[0] = flip_first_col / Ndims[perm[1]];
1017  flip_first[1] = flip_first_col % Ndims[perm[1]];
1018  flip_first[2] = 0;
1019 
1020  flip_last[0] = (flip_first_col + flip_num_col - 1) / Ndims[perm[1]];
1021  flip_last[1] = (flip_first_col + flip_num_col - 1) % Ndims[perm[1]];
1022  flip_last[2] = Ndims[perm[2]] - 1;
1023 
1024  if(flip_first[1] + flip_num_col >= Ndims[perm[1]])
1025  {
1026  flip_first[1] = 0;
1027  flip_last[1] = Ndims[perm[1]] - 1;
1028  }
1029 
1030  /* now we map this back to the old coordinates */
1031  int abc_first[3], abc_last[3];
1032 
1033  for(int j = 0; j < 3; j++)
1034  {
1035  abc_first[j] = flip_first[perm_rev[j]];
1036  abc_last[j] = flip_last[perm_rev[j]];
1037  }
1038 
1039  /* determine common box */
1040  int abc_start[3], abc_end[3];
1041  for(int j = 0; j < 3; j++)
1042  {
1043  abc_start[j] = std::max<int>(abc_first[j], flip_source_first[j]);
1044  abc_end[j] = std::min<int>(abc_last[j], flip_source_last[j]);
1045  }
1046 
1047  int abc[3];
1048 
1049  for(int j = 0; j < 3; j++)
1050  abc[j] = abc_start[j];
1051 
1052  size_t tot_count_send = 0;
1053  size_t tot_count_recv = 0;
1054 
1055  /* now check how much free memory there is on the two partners, use at most half of it */
1056  size_t parnter_freebytes;
1057  myMPI_Sendrecv(&Mem.FreeBytes, sizeof(size_t), MPI_BYTE, target, TAG_DENS_B, &parnter_freebytes, sizeof(size_t), MPI_BYTE,
1058  target, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
1059 
1060  size_t freeb = std::min<size_t>(parnter_freebytes, Mem.FreeBytes);
1061 
1062  size_t limit = 0.5 * freeb / (sizeof(fft_real) + sizeof(fft_real));
1063 
1064  if(just_count_flag)
1065  limit = SIZE_MAX;
1066 
1067  int iter = 0;
1068  do
1069  {
1070  size_t limit_send = count_send[target] - tot_count_send;
1071  size_t limit_recv = count_recv[target] - tot_count_recv;
1072 
1073  if(just_count_flag)
1074  {
1075  limit_send = SIZE_MAX;
1076  limit_recv = SIZE_MAX;
1077  }
1078  else
1079  {
1080  if(limit_send > limit)
1081  limit_send = limit;
1082 
1083  if(limit_recv > limit)
1084  limit_recv = limit;
1085  }
1086 
1087  fft_real *buffer_send = NULL;
1088  fft_real *buffer_recv = NULL;
1089 
1090  if(just_count_flag == 0)
1091  {
1092  buffer_send = (fft_real *)Mem.mymalloc("buffer_send", limit_send * sizeof(fft_real));
1093  buffer_recv = (fft_real *)Mem.mymalloc("buffer_recv", limit_recv * sizeof(fft_real));
1094  }
1095 
1096  /* traverse the common box between the new and old layout */
1097  size_t count = 0;
1098 
1099  while(count < limit_send && xyz[0] <= xyz_end[0] && xyz[1] <= xyz_end[1] && xyz[2] <= xyz_end[2])
1100  {
1101  /* check that the point is actually part of a column in the old layout */
1102  int col_old = xyz[0] * Ndims[1] + xyz[1];
1103 
1104  if(col_old >= in_firstcol && col_old < in_firstcol + in_ncol)
1105  {
1106  /* check that the point is actually part of a column in the new layout */
1107  int uvw[3];
1108  uvw[0] = xyz[perm[0]];
1109  uvw[1] = xyz[perm[1]];
1110  uvw[2] = xyz[perm[2]];
1111 
1112  int col_new = uvw[0] * Ndims[perm[1]] + uvw[1];
1113 
1114  if(col_new >= target_first_col && col_new < target_first_col + target_num_col)
1115  {
1116  // ok, we found a match
1117 
1118  if(just_count_flag)
1119  count_send[target]++;
1120  else
1121  {
1122  long long source_cell = (Ndims[1] * xyz[0] + xyz[1] - in_firstcol) * Ndims[2] + xyz[2];
1123 
1124  buffer_send[count++] = data[source_cell];
1125  tot_count_send++;
1126  }
1127  }
1128  }
1129 
1130  xyz[2]++;
1131  if(xyz[2] > xyz_end[2])
1132  {
1133  xyz[2] = xyz_start[2];
1134  xyz[1]++;
1135  if(xyz[1] > xyz_end[1])
1136  {
1137  xyz[1] = xyz_start[1];
1138  xyz[0]++;
1139  }
1140  }
1141  }
1142 
1143  if(just_count_flag == 0)
1144  {
1145  myMPI_Sendrecv(buffer_send, limit_send * sizeof(fft_real), MPI_BYTE, target, TAG_DENS_A, buffer_recv,
1146  limit_recv * sizeof(fft_real), MPI_BYTE, target, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
1147 
1148  size_t count = 0;
1149  while(count < limit_recv && abc[0] <= abc_end[0] && abc[1] <= abc_end[1] && abc[2] <= abc_end[2])
1150  {
1151  /* check that the point is actually part of a column in the old layout */
1152  int col_old = abc[0] * Ndims[1] + abc[1];
1153 
1154  if(col_old >= flip_in_firstcol && col_old < flip_in_firstcol + flip_in_ncol)
1155  {
1156  /* check that the point is actually part of a column in the new layout */
1157  int uvw[3];
1158  uvw[0] = abc[perm[0]];
1159  uvw[1] = abc[perm[1]];
1160  uvw[2] = abc[perm[2]];
1161 
1162  int col_new = uvw[0] * Ndims[perm[1]] + uvw[1];
1163 
1164  if(col_new >= flip_first_col && col_new < flip_first_col + flip_num_col)
1165  {
1166  // ok, we found a match
1167 
1168  long long target_cell = (Ndims[perm[1]] * uvw[0] + uvw[1] - flip_first_col) * Ndims[perm[2]] + uvw[2];
1169 
1170  out[target_cell] = buffer_recv[count++];
1171  tot_count_recv++;
1172  }
1173  }
1174 
1175  abc[2]++;
1176  if(abc[2] > abc_end[2])
1177  {
1178  abc[2] = abc_start[2];
1179  abc[1]++;
1180  if(abc[1] > abc_end[1])
1181  {
1182  abc[1] = abc_start[1];
1183  abc[0]++;
1184  }
1185  }
1186  }
1187 
1188  Mem.myfree(buffer_recv);
1189  Mem.myfree(buffer_send);
1190  }
1191  else
1192  break;
1193 
1194  iter++;
1195 
1196  if(iter > 20)
1197  Terminate("high number of iterations: limit=%lld", (long long)limit);
1198  }
1199  while(tot_count_send < count_send[target] || tot_count_recv < count_recv[target]);
1200  }
1201  }
1202  if(just_count_flag)
1203  MPI_Alltoall(count_send, sizeof(size_t), MPI_BYTE, count_recv, sizeof(size_t), MPI_BYTE, Communicator);
1204 }
1205 
1206 #endif
1207 
1208 #endif
size_t FreeBytes
Definition: mymalloc.h:48
void my_slab_based_fft_free(fft_plan *plan)
void my_column_based_fft_free(fft_plan *plan)
void my_column_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
void my_column_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
void my_fft_swap13back(fft_plan *plan, fft_real *data, fft_real *out)
void my_fft_swap13(fft_plan *plan, fft_real *data, fft_real *out)
void my_slab_transposeB(fft_plan *plan, fft_real *field, fft_real *scratch)
void my_slab_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
void my_fft_swap23(fft_plan *plan, fft_real *data, fft_real *out)
void my_fft_swap23back(fft_plan *plan, fft_real *data, fft_real *out)
void my_slab_transposeA(fft_plan *plan, fft_real *field, fft_real *scratch)
void my_slab_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
int ThisTask
Definition: setcomm.h:33
int NTask
Definition: setcomm.h:32
int PTask
Definition: setcomm.h:34
MPI_Comm Communicator
Definition: setcomm.h:31
#define MPI_MESSAGE_SIZELIMIT_IN_BYTES
Definition: constants.h:53
#define Terminate(...)
Definition: macros.h:19
#define TAG_DENS_A
Definition: mpi_utils.h:50
int myMPI_Sendrecv(void *sendbuf, size_t sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvbuf, size_t recvcount, MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status *status)
void myMPI_Alltoallv(void *sendbuf, size_t *sendcounts, size_t *sdispls, void *recvbuf, size_t *recvcounts, size_t *rdispls, int len, int big_flag, MPI_Comm comm)
Definition: myalltoall.cc:177
#define TAG_DENS_B
Definition: mpi_utils.h:51
memory Mem
Definition: main.cc:44
#define FFTW(x)
Definition: pm_mpi_fft.h:22
void subdivide_evenly(long long N, int pieces, int index_bin, long long *first, int *count)
Definition: system.cc:51