GADGET-4
subfind.cc
1 /*******************************************************************************
2  * \copyright This file is part of the GADGET4 N-body/SPH code developed
3  * \copyright by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
4  * \copyright (vspringel@mpa-garching.mpg.de) and all contributing authors.
5  *******************************************************************************/
6 
12 #include "gadgetconfig.h"
13 
14 #ifdef SUBFIND
15 
16 #include <mpi.h>
17 #include <unistd.h>
18 #include <algorithm>
19 #include <climits>
20 #include <cmath>
21 #include <cstdio>
22 #include <cstdlib>
23 #include <cstring>
24 
25 #include "../data/allvars.h"
26 #include "../data/dtypes.h"
27 #include "../data/mymalloc.h"
28 #include "../domain/domain.h"
29 #include "../fof/fof.h"
30 #include "../fof/fof_io.h"
31 #include "../gravtree/gravtree.h"
32 #include "../logs/timer.h"
33 #include "../main/simulation.h"
34 #include "../mergertree/mergertree.h"
35 #include "../mpi_utils/mpi_utils.h"
36 #include "../sort/cxxsort.h"
37 #include "../sort/parallel_sort.h"
38 #include "../subfind/subfind.h"
39 #include "../system/system.h"
40 #include "../time_integration/timestep.h"
41 
42 /* main routine of SUBFIND algorithm, for output number 'num'.
43  */
44 template <typename partset>
45 void fof<partset>::subfind_find_subhalos(int num, const char *basename, const char *grpcat_dirbasename)
46 {
47  TIMER_START(CPU_SUBFIND);
48 
49  double tstart = Logs.second();
50 
51  mpi_printf("\nSUBFIND: We now execute a parallel version of SUBFIND.\n");
52 
53 #ifdef MERGERTREE
54  double lensum = 0, partcount = 0;
55  for(int i = 0; i < Tp->NumPart; i++)
56  {
57  if(Tp->P[i].PrevSizeOfSubhalo.get() > 0)
58  partcount += 1;
59  lensum += Tp->P[i].PrevSizeOfSubhalo.get();
60  Tp->PS[i].SizeOfSubhalo.set(0); // initialize
61  }
62  MPI_Allreduce(MPI_IN_PLACE, &partcount, 1, MPI_DOUBLE, MPI_SUM, Communicator);
63  MPI_Allreduce(MPI_IN_PLACE, &lensum, 1, MPI_DOUBLE, MPI_SUM, Communicator);
64  mpi_printf("SUBFIND: Previous subhalo catalogue contained approximately %g particles in subhalos, and the summed squared subhalo size was %g\n",
65  partcount, lensum);
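  /* note: each particle stores the size of the subhalo it previously belonged to, so summing
   * PrevSizeOfSubhalo over all particles yields the sum of the squared subhalo sizes, while
   * partcount simply counts the particles that were bound in some subhalo before.
   */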
66 #endif
67 
68 #ifndef SUBFIND_HBT
69 
70  /* let's determine the total matter density around each particle/cell. For each point,
71  * we determine a smoothing radius based on a certain number of dark matter particles
72  * from the primary link type.
73  */
74  FoFGravTree.treeallocate(Tp->NumPart, Tp, FoFDomain);
75 
76  TIMER_STOP(CPU_SUBFIND);
77  FoFGravTree.treebuild(Tp->NumPart, NULL);
78  TIMER_START(CPU_SUBFIND);
79 
80  subfind_density_hsml_guess();
81 
82  /* find smoothing lengths for primary and secondary points in groups (counting primary particles only for setting the smoothing
83  * length) and then estimate total matter density around them
84  */
85  double cputime = subfind_density();
86 
87  mpi_printf("SUBFIND: iteration to correct primary neighbor count and density estimate took %g sec\n", cputime);
88 
89  FoFGravTree.treefree();
90 #endif
91 
93 
94  /* fill internal energy into auxiliary PS[] array */
95 #ifndef LEAN
96  for(int i = 0; i < Tp->NumPart; i++)
97  if(Tp->P[i].getType() == 0)
98  Tp->PS[i].Utherm = Tp->get_utherm_from_entropy(i);
99  else
100  Tp->PS[i].Utherm = 0;
101 #endif
102 
103  double GroupSizeThresholdFactor = 0.55;
104  int ncount = 0, nprocs = 0;
105  long long seriallen = 0;
106  long long sum_seriallen;
107 
108  /* Set a fiducial limit on the maximum group size above which we select the collective subfind algorithm */
109  do
110  {
111  GroupSizeThresholdFactor += 0.05;
112  ncount = nprocs = seriallen = 0;
113 
114  MaxSerialGroupLen = (int)(GroupSizeThresholdFactor * Tp->TotNumPart / NTask);
115 
116  /* Count how many groups are above this limit, and how many processors we need for them */
117  for(int i = 0; i < Ngroups; i++)
118  if(Group[i].Len > MaxSerialGroupLen)
119  {
120  ncount++;
121  nprocs += ((Group[i].Len - 1) / MaxSerialGroupLen) + 1;
122  }
123  else
124  seriallen += Group[i].Len;
125 
126  MPI_Allreduce(&ncount, &Ncollective, 1, MPI_INT, MPI_SUM, Communicator);
127  MPI_Allreduce(&nprocs, &NprocsCollective, 1, MPI_INT, MPI_SUM, Communicator);
128  sumup_longs(1, &seriallen, &sum_seriallen, Communicator);
129  }
130  while(NprocsCollective + ((sum_seriallen > 0) ? 1 : 0) > NTask);
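  /* the threshold factor has now been raised just far enough that the processors needed for all
   * collectively treated groups, plus at least one rank for the remaining serially treated groups,
   * fit within the NTask ranks available
   */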
131 
132  mpi_printf("SUBFIND: Number of FOF halos treated with collective SubFind algorithm = %d\n", Ncollective);
133  mpi_printf("SUBFIND: Number of processors used in different partitions for the collective SubFind code = %d\n", NprocsCollective);
134  mpi_printf("SUBFIND: (The adopted size-limit for the collective algorithm was %d particles, for threshold size factor %g)\n",
135  MaxSerialGroupLen, GroupSizeThresholdFactor);
136  mpi_printf("SUBFIND: The other %lld FOF halos are treated in parallel with serial code\n", TotNgroups - Ncollective);
137 
138  /* set up a global table that informs about the processor assignment of the groups that are treated collectively */
139  ProcAssign = (proc_assign_data *)Mem.mymalloc_movable(&ProcAssign, "ProcAssign", Ncollective * sizeof(proc_assign_data));
140  proc_assign_data *locProcAssign = (proc_assign_data *)Mem.mymalloc("locProcAssign", ncount * sizeof(proc_assign_data));
141 
142  ncount = 0;
143  for(int i = 0; i < Ngroups; i++)
144  if(Group[i].Len > MaxSerialGroupLen)
145  {
146  locProcAssign[ncount].GroupNr = Group[i].GroupNr;
147  locProcAssign[ncount].Len = Group[i].Len;
148  ncount++;
149  }
150 
151  /* gather the information on the collective groups across all CPUs */
152  int *recvcounts = (int *)Mem.mymalloc("recvcounts", sizeof(int) * NTask);
153  int *bytecounts = (int *)Mem.mymalloc("bytecounts", sizeof(int) * NTask);
154  int *byteoffset = (int *)Mem.mymalloc("byteoffset", sizeof(int) * NTask);
155 
156  MPI_Allgather(&ncount, 1, MPI_INT, recvcounts, 1, MPI_INT, Communicator);
157 
158  for(int task = 0; task < NTask; task++)
159  bytecounts[task] = recvcounts[task] * sizeof(proc_assign_data);
160 
161  byteoffset[0] = 0;
162  for(int task = 1; task < NTask; task++)
163  byteoffset[task] = byteoffset[task - 1] + bytecounts[task - 1];
164 
165  MPI_Allgatherv(locProcAssign, bytecounts[ThisTask], MPI_BYTE, ProcAssign, bytecounts, byteoffset, MPI_BYTE, Communicator);
166 
167  Mem.myfree(byteoffset);
168  Mem.myfree(bytecounts);
169  Mem.myfree(recvcounts);
170  Mem.myfree(locProcAssign);
171 
172  /* make sure the table is sorted in ascending group-number order */
173  mycxxsort(ProcAssign, ProcAssign + Ncollective, subfind_compare_procassign_GroupNr);
174 
175  /* assign the processor sets for the collective groups and set a disjoint color flag that is later used to split the processors
176  * into different communicators */
177  nprocs = 0;
178  CommSplitColor = ThisTask; /* by default, this places every processor in its own processor group */
179 
180  /* now we assign the same color for groups of CPUs that each do one halo collectively */
181  for(int i = 0; i < Ncollective; i++)
182  {
183  ProcAssign[i].FirstTask = nprocs;
184  ProcAssign[i].NTask = ((ProcAssign[i].Len - 1) / MaxSerialGroupLen) + 1;
185  nprocs += ProcAssign[i].NTask;
186 
187  if(ThisTask >= ProcAssign[i].FirstTask && ThisTask < (ProcAssign[i].FirstTask + ProcAssign[i].NTask))
188  CommSplitColor = i;
189  }
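  /* ranks [0, NprocsCollective) are now partitioned into consecutive blocks, one block per collective
   * group, and each of these ranks stores the index of its group in CommSplitColor; all other ranks
   * keep their own rank number as color and will therefore end up alone in their sub-communicator
   */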
190 
191  /* Now assign a target task for each local group. For collective groups, the target task is the master in the CPU set, whereas
192  * the serial ones are distributed in a round-robin fashion to the remaining CPUs.
193  */
194  for(int i = 0; i < Ngroups; i++)
195  {
196  if(Group[i].Len > MaxSerialGroupLen) /* we have a collective group */
197  {
198  if(Group[i].GroupNr >= Ncollective || Group[i].GroupNr < 0)
199  Terminate("odd");
200  Group[i].TargetTask = ProcAssign[Group[i].GroupNr].FirstTask;
201  }
202  else
203  Group[i].TargetTask = ((Group[i].GroupNr - Ncollective) % (NTask - NprocsCollective)) + NprocsCollective;
204  }
205 
206  /* distribute the groups */
207  subfind_distribute_groups();
208 
209  /* sort the local groups by group number */
210  mycxxsort(Group, Group + Ngroups, fof_compare_Group_GroupNr);
211 
212  /* assign target CPUs for the particles in groups */
213  /* the particles not in groups will be distributed such that a close to uniform particle load results */
214  double t0 = Logs.second();
215  int *count_loc_task = (int *)Mem.mymalloc_clear("count_loc_task", NTask * sizeof(int));
216  int *count_task = (int *)Mem.mymalloc("count_task", NTask * sizeof(int));
217  int *count_free = (int *)Mem.mymalloc("count_free", NTask * sizeof(int));
218  int count_loc_free = 0;
219 
220  for(int i = 0; i < Tp->NumPart; i++)
221  {
222  if(Tp->PS[i].GroupNr.get() < HALONR_MAX) /* particle is in a group */
223  {
224  if(Tp->PS[i].GroupNr.get() < (MyIDType)Ncollective) /* we are in a collectively treated group */
225  Tp->PS[i].TargetTask = ProcAssign[Tp->PS[i].GroupNr.get()].FirstTask + (i % ProcAssign[Tp->PS[i].GroupNr.get()].NTask);
226  else /* otherwise the whole group will be treated by a single CPU */
227  Tp->PS[i].TargetTask = ((Tp->PS[i].GroupNr.get() - Ncollective) % (NTask - NprocsCollective)) + NprocsCollective;
228 
229  count_loc_task[Tp->PS[i].TargetTask]++;
230  }
231  else
232  {
233  Tp->PS[i].TargetTask = ThisTask; // default is that we stay
234  count_loc_free++;
235  }
236 
237  Tp->PS[i].TargetIndex = 0; /* unimportant here */
238  }
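  /* group particles are thus spread over the ranks of their group's processor set (collective case)
   * or sent in full to a single rank (serial case), while particles outside any group provisionally
   * stay on their current task
   */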
239 
240  /* get a list of how many unassigned and thus in principle movable particles every processor has */
241  MPI_Allgather(&count_loc_free, 1, MPI_INT, count_free, 1, MPI_INT, Communicator);
242 
243  /* determine how many particles from groups will end up on each processor */
244  MPI_Allreduce(count_loc_task, count_task, NTask, MPI_INT, MPI_SUM, Communicator);
245 
246  /* get the total particle count */
247  long long sum = 0;
248  for(int i = 0; i < NTask; i++)
249  sum += count_task[i] + count_free[i];
250 
251  /* determine the target maximum load per task, and how much local head-room to it remains */
252  int maxload = (sum + NTask - 1) / NTask;
253  for(int i = 0; i < NTask; i++)
254  {
255  count_task[i] = maxload - count_task[i]; /* this is the amount that can fit on this task */
256  if(count_task[i] < 0)
257  count_task[i] = 0;
258  }
259 
260  /* let's see how many particles can stay without degrading the load balance */
261  for(int i = 0; i < NTask; i++)
262  {
263  if(count_free[i] <= count_task[i])
264  {
265  count_task[i] -= count_free[i];
266  count_free[i] = 0;
267  }
268  else
269  {
270  count_free[i] -= count_task[i];
271  count_task[i] = 0;
272  }
273  }
274 
275  /* now we determine the optimum target task for the subset of local particles that should be moved to another rank for better load
276  * balance */
277  int current_task = 0;
278  for(int i = 0; i < ThisTask; i++)
279  {
280  while(count_free[i] > 0 && current_task < NTask)
281  {
282  if(count_free[i] < count_task[current_task])
283  {
284  count_task[current_task] -= count_free[i];
285  count_free[i] = 0;
286  }
287  else
288  {
289  count_free[i] -= count_task[current_task];
290  count_task[current_task] = 0;
291  current_task++;
292  }
293  }
294  }
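  /* the surplus particles of all lower-ranked tasks have now been (virtually) drained into the
   * available head-room, so current_task points to the first task that still has capacity left
   * for this rank's movable particles
   */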
295 
296  /* move a subset of particles such that close to uniform load is achieved */
297  for(int i = 0; i < Tp->NumPart && count_free[ThisTask] > 0; i++)
298  {
299  if(Tp->PS[i].GroupNr.get() == HALONR_MAX)
300  {
301  /* particle not in a group. Could in principle stay but we move it such that a good load balance is obtained. */
302  while(count_task[current_task] == 0 && current_task < NTask - 1)
303  current_task++;
304 
305  Tp->PS[i].TargetTask = current_task;
306  count_task[current_task]--;
307  count_free[ThisTask]--;
308  }
309  }
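  /* only particles outside any group are reassigned here, and only as many of them as needed to use
   * up this task's surplus (count_free[ThisTask]); group members keep the targets computed above
   */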
310 
311  Mem.myfree(count_free);
312  Mem.myfree(count_task);
313  Mem.myfree(count_loc_task);
314 
315  /* report current balance */
316  double balance = subfind_get_particle_balance();
317  mpi_printf("SUBFIND: particle balance=%g\n", balance);
318 
319  /* distribute particles such that groups are completely on the CPU(s) that do the corresponding group(s) */
320  FoFDomain->particle_exchange_based_on_PS(Communicator);
321  double t1 = Logs.second();
322  mpi_printf("SUBFIND: subfind_exchange() took %g sec\n", Logs.timediff(t0, t1));
323 
324  /* report balance that has been achieved */
325  balance = subfind_get_particle_balance();
326  mpi_printf("SUBFIND: particle balance for processing=%g\n", balance);
327 
328  /* let's estimate the maximum number of substructures we may find and need to store on the local CPU */
329  if(ThisTask < NprocsCollective)
330  {
331  MaxNsubhalos = (ProcAssign[CommSplitColor].Len / ProcAssign[CommSplitColor].NTask) / All.DesLinkNgb;
332  }
333  else
334  {
335  long long nlocid = 0;
336  for(int i = 0; i < Ngroups; i++)
337  nlocid += Group[i].Len;
338 
339  MaxNsubhalos = nlocid / All.DesLinkNgb; /* should be a quite conservative upper limit */
340  }
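  /* both estimates use the fact that a subhalo must contain at least All.DesLinkNgb particles,
   * so the relevant particle count divided by DesLinkNgb bounds the number of subhalos from above
   */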
341 
342  Mem.myfree(ProcAssign);
343 
344  // some log-variables
345  count_decisions = 0;
346  count_different_decisions = 0;
347 
348  /* allocate storage space for locally found subhalos */
349  Nsubhalos = 0;
350  Subhalo = (subhalo_properties *)Mem.mymalloc_movable(&Subhalo, "Subhalo", MaxNsubhalos * sizeof(subhalo_properties));
351 
352  /* we can now split the communicator to give each collectively treated group its own processor set */
353  MPI_Comm_split(Communicator, CommSplitColor, ThisTask, &SubComm);
354  MPI_Comm_size(SubComm, &SubNTask);
355  MPI_Comm_rank(SubComm, &SubThisTask);
356 
357  double ti0 = Logs.second();
358  {
359  domain<partset> SubDomain(SubComm, Tp);
360 
361  if(SubDomain.NumNodes != 0)
362  Terminate("SubDomain.NumNodes=%d\n", SubDomain.NumNodes);
363 
364  if(CommSplitColor < Ncollective)
365  {
366  /* for the collective algorithm, we now do a further reshuffling to distribute the particles
367  * in density order to the processors. This is purely a performance optimization for our
368  * distributed link-lists later on, reducing the number of MPI messages that need to be sent.
369  */
370 
371  /* allocate storage for an auxiliary array needed for sorting */
372  sort_as_data *as = (sort_as_data *)Mem.mymalloc_movable(&as, "as", Tp->NumPart * sizeof(sort_as_data));
373 
374  for(int i = 0; i < Tp->NumPart; i++)
375  {
376  as[i].density = Tp->PS[i].u.s.u.DM_Density;
377  as[i].origin = (((long long)SubThisTask) << 32) + i;
378  }
379 
380  /* sort according to density */
381  mycxxsort_parallel(as, as + Tp->NumPart, subfind_compare_as_density, SubComm);
382 
383  for(int i = 0; i < Tp->NumPart; i++)
384  as[i].targettask = SubThisTask;
385 
386  /* revert to original order */
387  mycxxsort_parallel(as, as + Tp->NumPart, subfind_compare_as_origin, SubComm);
388 
389  for(int i = 0; i < Tp->NumPart; i++)
390  Tp->PS[i].TargetTask = as[i].targettask;
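  /* net effect: each particle now knows on which rank of the sub-communicator it would reside if all
   * particles were ordered by density, and that rank becomes its target task for the exchange below
   */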
391 
392  Mem.myfree(as);
393 
394  SubDomain.particle_exchange_based_on_PS(SubComm);
395 
396  if(SubDomain.NumNodes != 0)
397  Terminate("SubDomain.NumNodes=%d\n", SubDomain.NumNodes);
398 
399  /* now call the routine that does the actual processing of the groups */
400  subfind_processing(&SubDomain, COLL_SUBFIND); /* we are one of the CPUs that does a collective group */
401  }
402  else
403  subfind_processing(&SubDomain, SERIAL_SUBFIND); /* we have several groups in full to be done by the local CPU */
404  }
405  double ti1 = Logs.second();
406  mpi_printf("SUBFIND: Processing overall took %g sec\n", Logs.timediff(ti0, ti1));
407 
408  /* free the communicator again */
409  MPI_Comm_free(&SubComm);
410 
411 #ifndef SUBFIND_HBT
412  /* report some statistics */
413  MPI_Allreduce(MPI_IN_PLACE, &count_decisions, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
414  MPI_Allreduce(MPI_IN_PLACE, &count_different_decisions, 1, MPI_LONG_LONG, MPI_SUM, Communicator);
415  mpi_printf("SUBFIND: %lld out of %lld decisions, fraction %g, were influenced by previous subhalo size\n",
416  count_different_decisions, count_decisions, ((double)count_different_decisions) / count_decisions);
417 
418 #endif
419 
420  /* reestablish consistent global values for Sp.MaxPart/MaxPartSph in case they have diverged
421  * in the subcommunicators
422  */
423  int max_load, max_loadsph;
424  MPI_Allreduce(&Tp->MaxPart, &max_load, 1, MPI_INT, MPI_MAX, Communicator);
425  MPI_Allreduce(&Tp->MaxPartSph, &max_loadsph, 1, MPI_INT, MPI_MAX, Communicator);
426 
427  /* do resize */
428  Tp->reallocate_memory_maxpart(max_load);
429  Tp->PS = (subfind_data *)Mem.myrealloc_movable(Tp->PS, Tp->MaxPart * sizeof(subfind_data));
430 
431  Tp->reallocate_memory_maxpartsph(max_loadsph);
432 
433  /* distribute particles back to original CPU */
434  t0 = Logs.second();
435  for(int i = 0; i < Tp->NumPart; i++)
436  {
437  Tp->PS[i].TargetTask = Tp->PS[i].OriginTask;
438  Tp->PS[i].TargetIndex = Tp->PS[i].OriginIndex;
439  }
440 
441  FoFDomain->particle_exchange_based_on_PS(Communicator);
442  t1 = Logs.second();
443  if(ThisTask == 0)
444  printf("SUBFIND: subfind_exchange() (for return to original CPU) took %g sec\n", Logs.timediff(t0, t1));
445 
446  /* Now do the spherical overdensity calculations around group centers.
447  * For this, we need a search tree with all particles again.
448  */
449 
450  FoFGravTree.treeallocate(Tp->NumPart, Tp, FoFDomain);
451 
452  TIMER_STOP(CPU_SUBFIND);
453  FoFGravTree.treebuild(Tp->NumPart, NULL);
454  TIMER_START(CPU_SUBFIND);
455 
456  /* compute spherical overdensities for FOF groups */
457  double cputimeso = subfind_overdensity();
458  mpi_printf("SUBFIND: determining spherical overdensity masses took %g sec\n", cputimeso);
459 
460  FoFGravTree.treefree();
461 
462  /* sort the groups according to group/subhalo-number */
463  t0 = Logs.second();
464  mycxxsort_parallel(Group, Group + Ngroups, fof_compare_Group_GroupNr, Communicator);
465  mycxxsort_parallel(Subhalo, Subhalo + Nsubhalos, subfind_compare_Subhalo_GroupNr_SubRankInGr, Communicator);
466  t1 = Logs.second();
467  mpi_printf("SUBFIND: assembled and ordered groups and subhalos (took %g sec)\n", Logs.timediff(t0, t1));
468 
469  sumup_large_ints(1, &Nsubhalos, &TotNsubhalos, Communicator);
470 
471  /* determine largest subhalo and total particle/cell count in substructures */
472  long long lenmax = 0, glob_lenmax;
473  long long totlen = 0;
474  long long totsublength;
475  for(int i = 0; i < Nsubhalos; i++)
476  {
477  totlen += Subhalo[i].Len;
478 
479  if(Subhalo[i].Len > lenmax)
480  lenmax = Subhalo[i].Len;
481  }
482  sumup_longs(1, &totlen, &totsublength, Communicator);
483  MPI_Reduce(&lenmax, &glob_lenmax, 1, MPI_LONG_LONG, MPI_MAX, 0, Communicator);
484 
485  /* set binding energy of unbound particles to zero (was overwritten with Hsml before) */
486  for(int i = 0; i < Tp->NumPart; i++)
487  if(Tp->PS[i].SubRankInGr == INT_MAX)
488  Tp->PS[i].v.DM_BindingEnergy = 0;
489 
490  TIMER_START(CPU_SNAPSHOT);
491 
492  /* now do the output of the subhalo catalogue */
493  subfind_save_final(num, basename, grpcat_dirbasename);
494 
495  TIMER_STOP(CPU_SNAPSHOT);
496 
497  double tend = Logs.second();
498 
499  if(ThisTask == 0)
500  {
501  printf("SUBFIND: Finished with SUBFIND. (total time=%g sec)\n", Logs.timediff(tstart, tend));
502  printf("SUBFIND: Total number of subhalos with at least %d particles: %lld\n", All.DesLinkNgb, TotNsubhalos);
503  if(TotNsubhalos > 0)
504  {
505  printf("SUBFIND: Largest subhalo has %lld particles/cells.\n", glob_lenmax);
506  printf("SUBFIND: Total number of particles/cells in subhalos: %lld\n", totsublength);
507  }
508  }
509 
510  Mem.myfree_movable(Subhalo);
511 
512  TIMER_STOP(CPU_SUBFIND);
513 }
514 
515 template <typename partset>
516 void fof<partset>::subfind_save_final(int num, const char *basename, const char *grpcat_dirbasename)
517 {
518  double t0 = Logs.second();
519 
520  long long totsubs = 0;
521 
522  /* fill in the FirstSub-values */
523  for(int i = 0; i < Ngroups; i++)
524  {
525  if(i > 0)
526  Group[i].FirstSub = Group[i - 1].FirstSub + Group[i - 1].Nsubs;
527  else
528  Group[i].FirstSub = 0;
529  totsubs += Group[i].Nsubs;
530  }
531 
532  long long *Send_count = (long long *)Mem.mymalloc("Send_count", sizeof(long long) * this->NTask);
533  long long *Send_offset = (long long *)Mem.mymalloc("Send_offset", sizeof(long long) * this->NTask);
534 
535  MPI_Allgather(&totsubs, 1, MPI_LONG_LONG, Send_count, 1, MPI_LONG_LONG, Communicator);
536 
537  Send_offset[0] = 0;
538 
539  for(int i = 1; i < NTask; i++)
540  Send_offset[i] = Send_offset[i - 1] + Send_count[i - 1];
541 
542  for(int i = 0; i < Ngroups; i++)
543  {
544  if(Group[i].Nsubs > 0)
545  Group[i].FirstSub += Send_offset[ThisTask];
546  else
547  Group[i].FirstSub = -1;
548  }
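  /* FirstSub is now a global subhalo index (cumulative over all tasks); groups without any subhalo
   * are marked with FirstSub = -1
   */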
549 
550  Mem.myfree(Send_offset);
551  Mem.myfree(Send_count);
552 
553  subfind_assign_subhalo_offsettype();
554 
555  fof_io<partset> FoF_IO{this, this->Communicator, All.SnapFormat};
556  FoF_IO.fof_subfind_save_groups(num, basename, grpcat_dirbasename);
557 
558  double t1 = Logs.second();
559 
560  mpi_printf("SUBFIND: Subgroup catalogues saved. (took %g sec)\n", Logs.timediff(t0, t1));
561 }
562 
563 template <typename partset>
564 void fof<partset>::subfind_assign_subhalo_offsettype(void)
565 {
566  int *Send_count = (int *)Mem.mymalloc("Send_count", sizeof(int) * this->NTask);
567  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * this->NTask);
568  int *Recv_count = (int *)Mem.mymalloc("Recv_count", sizeof(int) * this->NTask);
569  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * this->NTask);
570 
571  if(Nsubhalos > 0)
572  {
573  Subhalo[0].SubRankInGr = 0;
574  for(int j = 0; j < NTYPES; j++)
575  Subhalo[0].OffsetType[j] = 0;
576  }
577 
578  for(int i = 1; i < Nsubhalos; i++)
579  if(Subhalo[i].GroupNr != Subhalo[i - 1].GroupNr)
580  {
581  Subhalo[i].SubRankInGr = 0;
582  for(int j = 0; j < NTYPES; j++)
583  Subhalo[i].OffsetType[j] = 0;
584  }
585  else
586  {
587  Subhalo[i].SubRankInGr = Subhalo[i - 1].SubRankInGr + 1;
588  for(int j = 0; j < NTYPES; j++)
589  Subhalo[i].OffsetType[j] = Subhalo[i - 1].OffsetType[j] + Subhalo[i - 1].LenType[j];
590  }
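  /* at this point SubRankInGr and OffsetType are only correct relative to the subhalos stored locally;
   * the corrections for subhalos of the same group held on lower-ranked tasks are applied below
   */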
591 
592  struct subnr_info
593  {
594  int grnr;
595  int cnt;
596  long long stype_cum[NTYPES];
597  };
598  subnr_info subnr_data;
599 
600  if(Nsubhalos > 0)
601  {
602  subnr_data.grnr = Subhalo[Nsubhalos - 1].GroupNr;
603  subnr_data.cnt = Subhalo[Nsubhalos - 1].SubRankInGr + 1;
604  for(int j = 0; j < NTYPES; j++)
605  subnr_data.stype_cum[j] = Subhalo[Nsubhalos - 1].OffsetType[j] + Subhalo[Nsubhalos - 1].LenType[j];
606  }
607  else
608  subnr_data.grnr = INT_MAX;
609 
610  subnr_info *info_all = (subnr_info *)Mem.mymalloc("info_all", NTask * sizeof(subnr_info));
611  MPI_Allgather(&subnr_data, sizeof(subnr_info), MPI_BYTE, info_all, sizeof(subnr_info), MPI_BYTE, Communicator);
612 
613  if(Nsubhalos > 0)
614  {
615  int cnt = 0;
616  long long stype_cum[NTYPES];
617  for(int j = 0; j < NTYPES; j++)
618  stype_cum[j] = 0;
619 
620  for(int i = ThisTask - 1; i >= 0; i--)
621  if(info_all[i].grnr == Subhalo[0].GroupNr)
622  {
623  cnt += info_all[i].cnt;
624  for(int j = 0; j < NTYPES; j++)
625  stype_cum[j] += info_all[i].stype_cum[j];
626  }
627 
628  for(int i = 0; i < Nsubhalos; i++)
629  if(Subhalo[i].GroupNr == Subhalo[0].GroupNr)
630  {
631  Subhalo[i].SubRankInGr += cnt;
632  for(int j = 0; j < NTYPES; j++)
633  Subhalo[i].OffsetType[j] += stype_cum[j];
634  }
635  else
636  break;
637  }
638 
639  Mem.myfree(info_all);
640 
641  /* we now need to send the subhalos to the processor that holds the parent group in order to look up
642  * the group offset, which is then added in.
643  */
644 
645  /* tell everybody how many groups each processor holds */
646  int *gcount = (int *)Mem.mymalloc("gcount", NTask * sizeof(int));
647  MPI_Allgather(&Ngroups, 1, MPI_INT, gcount, 1, MPI_INT, Communicator);
648 
649  int nexport = 0, nimport = 0;
650 
651  struct group_info
652  {
653  int grindex;
654  int subindex;
655  long long OffsetType[NTYPES];
656  };
657  group_info *export_group_data = NULL, *import_group_data = NULL;
658 
659  for(int mode = 0; mode < 2; mode++)
660  {
661  for(int i = 0; i < NTask; i++)
662  Send_count[i] = 0;
663 
664  int target = 0;
665  long long first_gr_in_target = 0; /* this is the index of the first group held on the current target processor */
666 
667  for(int i = 0; i < Nsubhalos; i++)
668  {
669  while(Subhalo[i].GroupNr >= first_gr_in_target + gcount[target])
670  {
671  if(target >= NTask)
672  Terminate("target=%d i=%d Nsubhalos=%d Subhalo[i].GroupNr=%lld\n", target, i, Nsubhalos, Subhalo[i].GroupNr);
673 
674  first_gr_in_target += gcount[target];
675  target++;
676  }
677 
678  if(mode == 0)
679  Send_count[target]++;
680  else
681  {
682  int off = Send_offset[target] + Send_count[target]++;
683  export_group_data[off].grindex = Subhalo[i].GroupNr - first_gr_in_target;
684  export_group_data[off].subindex = i;
685  }
686  }
687 
688  if(mode == 0)
689  {
690  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
691  Recv_offset[0] = Send_offset[0] = 0;
692  for(int j = 0; j < NTask; j++)
693  {
694  nimport += Recv_count[j];
695  nexport += Send_count[j];
696  if(j > 0)
697  {
698  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
699  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
700  }
701  }
702 
703  export_group_data = (group_info *)Mem.mymalloc("export_group_data", nexport * sizeof(group_info));
704  import_group_data = (group_info *)Mem.mymalloc("import_group_data", nimport * sizeof(group_info));
705  }
706  }
707 
708  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
709  {
710  int recvTask = ThisTask ^ ngrp;
711  if(recvTask < NTask)
712  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
713  MPI_Sendrecv(&export_group_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_info), MPI_BYTE, recvTask,
714  TAG_DENS_B, &import_group_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_info), MPI_BYTE,
715  recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
716  }
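  /* the loop above exchanges the export buffers in a hypercube pattern: in round ngrp, task ThisTask
   * communicates with partner ThisTask ^ ngrp, so every pair of tasks is serviced exactly once
   */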
717 
718  /* now let's go through the imported groups and assign the offsets */
719  for(int i = 0; i < nimport; i++)
720  for(int j = 0; j < NTYPES; j++)
721  import_group_data[i].OffsetType[j] = Group[import_group_data[i].grindex].OffsetType[j];
722 
723  /* send stuff back */
724  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
725  {
726  int recvTask = ThisTask ^ ngrp;
727  if(recvTask < NTask)
728  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
729  MPI_Sendrecv(&import_group_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_info), MPI_BYTE, recvTask,
730  TAG_DENS_B, &export_group_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_info), MPI_BYTE,
731  recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
732  }
733 
734  /* add these group offsets to the subhalo offsets */
735  for(int i = 0; i < nexport; i++)
736  for(int j = 0; j < NTYPES; j++)
737  Subhalo[export_group_data[i].subindex].OffsetType[j] += export_group_data[i].OffsetType[j];
738 
739  Mem.myfree(import_group_data);
740  Mem.myfree(export_group_data);
741  Mem.myfree(gcount);
742 
743  Mem.myfree(Recv_offset);
744  Mem.myfree(Recv_count);
745  Mem.myfree(Send_offset);
746  Mem.myfree(Send_count);
747 }
748 
749 template <typename partset>
750 void fof<partset>::subfind_redetermine_groupnr(void)
751 {
752  /* tell everybody how many groups we have locally */
753  int *ngroups_all = (int *)Mem.mymalloc("ngroups_all", NTask * sizeof(int));
754  MPI_Allgather(&Ngroups, 1, MPI_INT, ngroups_all, 1, MPI_INT, Communicator);
755 
756  int nsubs_local = 0;
757  for(int i = 0; i < Ngroups; i++)
758  nsubs_local += Group[i].Nsubs;
759 
760  /* accumulate the corresponding subhalo numbers */
761  int *nsubs_all = (int *)Mem.mymalloc("nsubs_all", NTask * sizeof(int));
762  MPI_Allgather(&nsubs_local, 1, MPI_INT, nsubs_all, 1, MPI_INT, Communicator);
763 
764  /* Finally, also tell everybody how many subhalos we have locally */
765  int *nsubhalos_all = (int *)Mem.mymalloc("nsubhalos_all", NTask * sizeof(int));
766  MPI_Allgather(&Nsubhalos, 1, MPI_INT, nsubhalos_all, 1, MPI_INT, Communicator);
767 
768  long long subhalo_previous = 0;
769  for(int i = 0; i < ThisTask; i++)
770  subhalo_previous += nsubhalos_all[i];
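  /* subhalo_previous is the global index of the first subhalo stored on this task, obtained by
   * summing the subhalo counts of all lower-ranked tasks
   */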
771 
772  int nexport = 0, nimport = 0;
773 
774  struct group_info
775  {
776  long long subhalonr;
777  long long groupnr;
778  int subindex;
779  };
780  group_info *export_group_data = NULL, *import_group_data = NULL;
781 
782  int *Send_count = (int *)Mem.mymalloc("Send_count", sizeof(int) * this->NTask);
783  int *Send_offset = (int *)Mem.mymalloc("Send_offset", sizeof(int) * this->NTask);
784  int *Recv_count = (int *)Mem.mymalloc("Recv_count", sizeof(int) * this->NTask);
785  int *Recv_offset = (int *)Mem.mymalloc("Recv_offset", sizeof(int) * this->NTask);
786 
787  for(int mode = 0; mode < 2; mode++)
788  {
789  for(int i = 0; i < NTask; i++)
790  Send_count[i] = 0;
791 
792  int target = 0;
793  long long nsubs_previous = 0;
794 
795  for(int i = 0; i < Nsubhalos; i++)
796  {
797  long long subhalonr = subhalo_previous + i;
798 
799  while(subhalonr >= nsubs_previous + nsubs_all[target])
800  {
801  if(target >= NTask)
802  Terminate("target=%d\n", target);
803 
804  nsubs_previous += nsubs_all[target];
805  target++;
806  }
807 
808  if(mode == 0)
809  Send_count[target]++;
810  else
811  {
812  int off = Send_offset[target] + Send_count[target]++;
813  export_group_data[off].subhalonr = subhalonr;
814  export_group_data[off].subindex = i;
815  }
816  }
817 
818  if(mode == 0)
819  {
820  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, Communicator);
821  Recv_offset[0] = Send_offset[0] = 0;
822  for(int j = 0; j < NTask; j++)
823  {
824  nimport += Recv_count[j];
825  nexport += Send_count[j];
826  if(j > 0)
827  {
828  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
829  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
830  }
831  }
832 
833  export_group_data = (group_info *)Mem.mymalloc("export_group_data", nexport * sizeof(group_info));
834  import_group_data = (group_info *)Mem.mymalloc("import_group_data", nimport * sizeof(group_info));
835  }
836  }
837 
838  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
839  {
840  int recvTask = ThisTask ^ ngrp;
841  if(recvTask < NTask)
842  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
843  MPI_Sendrecv(&export_group_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_info), MPI_BYTE, recvTask,
844  TAG_DENS_B, &import_group_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_info), MPI_BYTE,
845  recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
846  }
847 
848  /* now let's go through the imported subhalos and assign the group numbers */
849 
850  long long nsubs = 0;
851  for(int i = 0; i < ThisTask; i++)
852  nsubs += nsubs_all[i];
853 
854  long long group_previous = 0;
855  for(int i = 0; i < ThisTask; i++)
856  group_previous += ngroups_all[i];
857 
858  int gr = 0;
859  for(int i = 0; i < nimport; i++)
860  {
861  while(import_group_data[i].subhalonr >= nsubs + Group[gr].Nsubs)
862  {
863  nsubs += Group[gr].Nsubs;
864  gr++;
865 
866  if(gr >= Ngroups)
867  Terminate("i=%d|%d gr=%d >= Ngroups=%d\n", i, nimport, gr, Ngroups);
868  }
869 
870  import_group_data[i].groupnr = group_previous + gr;
871  }
872 
873  /* send stuff back */
874  for(int ngrp = 0; ngrp < (1 << PTask); ngrp++) /* note: here we also have a transfer from each task to itself (for ngrp=0) */
875  {
876  int recvTask = ThisTask ^ ngrp;
877  if(recvTask < NTask)
878  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
879  MPI_Sendrecv(&import_group_data[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(group_info), MPI_BYTE, recvTask,
880  TAG_DENS_B, &export_group_data[Send_offset[recvTask]], Send_count[recvTask] * sizeof(group_info), MPI_BYTE,
881  recvTask, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
882  }
883 
884  /* assign the groupnr */
885  for(int i = 0; i < nexport; i++)
886  {
887  /*
888  if(Subhalo[export_group_data[i].subindex].GroupNr != export_group_data[i].groupnr)
889  Terminate("Subhalo[export_group_data[i].subindex].GroupNr=%lld export_group_data[i].groupnr=%lld\n",
890  Subhalo[export_group_data[i].subindex].GroupNr, export_group_data[i].groupnr);
891  */
892  Subhalo[export_group_data[i].subindex].GroupNr = export_group_data[i].groupnr;
893  }
894 
895  Mem.myfree(import_group_data);
896  Mem.myfree(export_group_data);
897  Mem.myfree(Recv_offset);
898  Mem.myfree(Recv_count);
899  Mem.myfree(Send_offset);
900  Mem.myfree(Send_count);
901 
902  Mem.myfree(nsubhalos_all);
903  Mem.myfree(nsubs_all);
904  Mem.myfree(ngroups_all);
905 }
906 
907 template <typename partset>
908 double fof<partset>::subfind_get_particle_balance(void)
909 {
910  int maxpart;
911  long long sum;
912  MPI_Allreduce(&Tp->NumPart, &maxpart, 1, MPI_INT, MPI_MAX, Communicator);
913  sumup_large_ints(1, &Tp->NumPart, &sum, Communicator);
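  /* balance = maximum local particle count divided by the average per task; 1.0 means perfectly even */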
914  return maxpart / (((double)sum) / NTask);
915 }
916 
917 /* now make sure that the following classes are really instantiated, otherwise we may get a linking problem */
918 #include "../data/simparticles.h"
919 template class fof<simparticles>;
920 
921 #if defined(LIGHTCONE) && defined(LIGHTCONE_PARTICLES_GROUPS)
922 #include "../data/lcparticles.h"
923 template class fof<lcparticles>;
924 #endif
925 
926 #endif