Actual source code: vpscat.c
1: #define PETSCVEC_DLL
3: /*
4: Defines parallel vector scatters.
5: */
7: #include "src/vec/is/isimpl.h"
8: #include "private/vecimpl.h"
9: #include "src/vec/vec/impls/dvecimpl.h"
10: #include "src/vec/vec/impls/mpi/pvecimpl.h"
11: #include "petscsys.h"
15: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
16: {
17: VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
18: VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
19: PetscErrorCode ierr;
20: PetscInt i;
21: PetscMPIInt rank;
22: PetscViewerFormat format;
23: PetscTruth iascii;
26: PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);
27: if (iascii) {
28: MPI_Comm_rank(ctx->comm,&rank);
29: PetscViewerGetFormat(viewer,&format);
30: if (format == PETSC_VIEWER_ASCII_INFO) {
31: PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;
33: MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
34: MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
35: itmp = to->starts[to->n+1];
36: MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
37: itmp = from->starts[from->n+1];
38: MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
39: MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,ctx->comm);
41: PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
42: PetscViewerASCIIPrintf(viewer," Maximum number sends %D\n",nsend_max);
43: PetscViewerASCIIPrintf(viewer," Maximum number receives %D\n",nrecv_max);
44: PetscViewerASCIIPrintf(viewer," Maximum data sent %D\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
45: PetscViewerASCIIPrintf(viewer," Maximum data received %D\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
46: PetscViewerASCIIPrintf(viewer," Total data sent %D\n",(int)(alldata*to->bs*sizeof(PetscScalar)));
48: } else {
49: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
50: if (to->n) {
51: for (i=0; i<to->n; i++){
52: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length = %D to whom %D\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
53: }
54: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
55: for (i=0; i<to->starts[to->n]; i++){
56: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,to->indices[i]);
57: }
58: }
60: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
61: if (from->n) {
62: for (i=0; i<from->n; i++){
63: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %D\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
64: }
66: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
67: for (i=0; i<from->starts[from->n]; i++){
68: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,from->indices[i]);
69: }
70: }
71: if (to->local.n) {
72: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
73: for (i=0; i<to->local.n; i++){
74: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
75: }
76: }
78: PetscViewerFlush(viewer);
79: }
80: } else {
81: SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
82: }
83: return(0);
84: }
86: /* -----------------------------------------------------------------------------------*/
87: /*
88: The next routine determines which part of the local scatter is an
89: exact copy of values into their current locations. We check this here and
90: then know that we need not perform that portion of the scatter.
91: */
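/* Worked example (editorial annotation, not part of the original source): if
   to->vslots = {3,4,9} and from->vslots = {3,4,7}, the first two entries already
   sit in their destination slots, so only the pair to_slot 9 / from_slot 7 is kept
   in slots_nonmatching and the local scatter shrinks from 3 moves to 1. */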
94: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter_Seq_General *to,VecScatter_Seq_General *from)
95: {
96: PetscInt n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
98: PetscInt *nto_slots,*nfrom_slots,j = 0;
99:
101: for (i=0; i<n; i++) {
102: if (to_slots[i] != from_slots[i]) n_nonmatching++;
103: }
105: if (!n_nonmatching) {
106: to->nonmatching_computed = PETSC_TRUE;
107: to->n_nonmatching = from->n_nonmatching = 0;
108: PetscInfo1(0,"Reduced %D to 0\n", n);
109: } else if (n_nonmatching == n) {
110: to->nonmatching_computed = PETSC_FALSE;
111: PetscInfo(0,"All values non-matching\n");
112: } else {
113: to->nonmatching_computed= PETSC_TRUE;
114: to->n_nonmatching = from->n_nonmatching = n_nonmatching;
115: PetscMalloc(n_nonmatching*sizeof(PetscInt),&nto_slots);
116: PetscMalloc(n_nonmatching*sizeof(PetscInt),&nfrom_slots);
117: to->slots_nonmatching = nto_slots;
118: from->slots_nonmatching = nfrom_slots;
119: for (i=0; i<n; i++) {
120: if (to_slots[i] != from_slots[i]) {
121: nto_slots[j] = to_slots[i];
122: nfrom_slots[j] = from_slots[i];
123: j++;
124: }
125: }
126: PetscInfo2(0,"Reduced %D to %D\n",n,n_nonmatching);
127: }
128: return(0);
129: }
131: /* --------------------------------------------------------------------------------------*/
133: /* -------------------------------------------------------------------------------------*/
136: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
137: {
138: VecScatter_MPI_General *to = (VecScatter_MPI_General*)ctx->todata;
139: VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
140: PetscErrorCode ierr;
143: PetscFree(to->local.vslots);
144: PetscFree(from->local.vslots);
145: PetscFree2(to->counts,to->displs);
146: PetscFree2(from->counts,from->displs);
147: PetscFree(to->local.slots_nonmatching);
148: PetscFree(from->local.slots_nonmatching);
149: PetscFree(to->rev_requests);
150: PetscFree(from->rev_requests);
151: PetscFree(to->requests);
152: PetscFree(from->requests);
153: PetscFree4(to->values,to->indices,to->starts,to->procs);
154: PetscFree2(to->sstatus,to->rstatus);
155: PetscFree4(from->values,from->indices,from->starts,from->procs);
156: PetscFree(from);
157: PetscFree(to);
158: PetscHeaderDestroy(ctx);
159: return(0);
160: }
164: /* --------------------------------------------------------------------------------------*/
165: /*
166: Special optimization to detect whether the local part of the scatter is actually a
167: copy of one contiguous block of memory to another; if so, the scatter routines
168: call PetscMemcpy() instead of indexing through the slots.
169: */
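/* Worked example (editorial annotation, not part of the original source): with
   bs = 2, to->vslots = {10,12,14} and from->vslots = {0,2,4} both advance by bs at
   every entry, so the whole local scatter is a single
   PetscMemcpy(y+10, x+0, 3*2*sizeof(PetscScalar)), with copy_start and copy_length
   recorded accordingly. */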
172: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
173: {
174: PetscInt n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
175: PetscInt to_start,from_start;
179: to_start = to_slots[0];
180: from_start = from_slots[0];
182: for (i=1; i<n; i++) {
183: to_start += bs;
184: from_start += bs;
185: if (to_slots[i] != to_start) return(0);
186: if (from_slots[i] != from_start) return(0);
187: }
188: to->is_copy = PETSC_TRUE;
189: to->copy_start = to_slots[0];
190: to->copy_length = bs*sizeof(PetscScalar)*n;
191: from->is_copy = PETSC_TRUE;
192: from->copy_start = from_slots[0];
193: from->copy_length = bs*sizeof(PetscScalar)*n;
194: PetscInfo(0,"Local scatter is a copy, optimizing for it\n");
195: return(0);
196: }
198: /* --------------------------------------------------------------------------------------*/
202: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
203: {
204: VecScatter_MPI_General *in_to = (VecScatter_MPI_General*)in->todata;
205: VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
206: PetscErrorCode ierr;
207: PetscInt ny,bs = in_from->bs;
210: out->begin = in->begin;
211: out->end = in->end;
212: out->copy = in->copy;
213: out->destroy = in->destroy;
214: out->view = in->view;
216: /* allocate entire send scatter context */
217: PetscNew(VecScatter_MPI_General,&out_to);
218: PetscNew(VecScatter_MPI_General,&out_from);
220: ny = in_to->starts[in_to->n];
221: out_to->n = in_to->n;
222: out_to->type = in_to->type;
223: out_to->sendfirst = in_to->sendfirst;
225: PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
226: PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,
227: out_to->n,PetscMPIInt,&out_to->procs);
228: PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,
229: &out_to->rstatus);
230: PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
231: PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
232: PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
233:
234: out->todata = (void*)out_to;
235: out_to->local.n = in_to->local.n;
236: out_to->local.nonmatching_computed = PETSC_FALSE;
237: out_to->local.n_nonmatching = 0;
238: out_to->local.slots_nonmatching = 0;
239: if (in_to->local.n) {
240: PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
241: PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
242: PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
243: PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
244: } else {
245: out_to->local.vslots = 0;
246: out_from->local.vslots = 0;
247: }
249: /* allocate entire receive context */
250: out_from->type = in_from->type;
251: ny = in_from->starts[in_from->n];
252: out_from->n = in_from->n;
253: out_from->sendfirst = in_from->sendfirst;
255: PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
256: PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,
257: out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
258: PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
259: PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
260: PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
261: out->fromdata = (void*)out_from;
262: out_from->local.n = in_from->local.n;
263: out_from->local.nonmatching_computed = PETSC_FALSE;
264: out_from->local.n_nonmatching = 0;
265: out_from->local.slots_nonmatching = 0;
267: /*
268: set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
269: */
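/* Editorial note: these are MPI persistent requests. They are created once here and
   later, in the VecScatterBegin/End routines, started with MPI_Startall() and
   completed with MPI_Waitall(); a sketch of that usage pattern (not code from this
   file):
       MPI_Startall(from->n,from->requests);              start the preposted receives
       MPI_Startall(to->n,to->requests);                  start the sends
       MPI_Waitall(from->n,from->requests,to->rstatus);   both status arrays live in 'to'
       MPI_Waitall(to->n,to->requests,to->sstatus);
*/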
270: {
271: PetscMPIInt tag;
272: MPI_Comm comm;
273: PetscInt *sstarts = out_to->starts, *rstarts = out_from->starts;
274: PetscMPIInt *sprocs = out_to->procs, *rprocs = out_from->procs;
275: PetscInt i;
276: PetscTruth flg;
277: MPI_Request *swaits = out_to->requests,*rwaits = out_from->requests;
278: MPI_Request *rev_swaits,*rev_rwaits;
279: PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;
281: PetscMalloc(in_to->n*sizeof(MPI_Request),&out_to->rev_requests);
282: PetscMalloc(in_from->n*sizeof(MPI_Request),&out_from->rev_requests);
284: rev_rwaits = out_to->rev_requests;
285: rev_swaits = out_from->rev_requests;
287: out_from->bs = out_to->bs = bs;
288: tag = out->tag;
289: comm = out->comm;
291: /* Register the receives that you will use later (sends for scatter reverse) */
292: for (i=0; i<out_from->n; i++) {
293: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
294: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
295: }
297: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&flg);
298: if (flg) {
299: out_to->use_readyreceiver = PETSC_TRUE;
300: out_from->use_readyreceiver = PETSC_TRUE;
301: for (i=0; i<out_to->n; i++) {
302: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
303: }
304: if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
305: MPI_Barrier(comm);
306: PetscInfo(0,"Using VecScatter ready receiver mode\n");
307: } else {
308: out_to->use_readyreceiver = PETSC_FALSE;
309: out_from->use_readyreceiver = PETSC_FALSE;
310: flg = PETSC_FALSE;
311: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
312: if (flg) {
313: PetscInfo(0,"Using VecScatter Ssend mode\n");
314: }
315: for (i=0; i<out_to->n; i++) {
316: if (!flg) {
317: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
318: } else {
319: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
320: }
321: }
322: }
323: /* Register receives for scatter reverse */
324: for (i=0; i<out_to->n; i++) {
325: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
326: }
327: }
329: return(0);
330: }
331: /* --------------------------------------------------------------------------------------------------
332: Pack the message data into the send buffers and unpack it from the receive buffers.
334: These kernels could be generated automatically.
336: Fortran kernels etc. could also be used.
337: */
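/* Editorial sketch (not part of the original source): the fixed-blocksize kernels
   below (Pack_1 ... Pack_12, UnPack_*, Scatter_*) all specialize this generic pattern,
   hand-unrolled so the inner loop over the block disappears for the common block
   sizes: */
PETSC_STATIC_INLINE void Pack_bs_sketch(PetscInt n,PetscInt bs,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i,j,idx;
  for (i=0; i<n; i++) {
    idx = *indicesx++;                    /* start of the i-th block in x */
    for (j=0; j<bs; j++) y[j] = x[idx+j];
    y += bs;                              /* the buffer y is filled contiguously */
  }
}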
338: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
339: {
340: PetscInt i;
341: for (i=0; i<n; i++) {
342: y[i] = x[indicesx[i]];
343: }
344: }
345: PETSC_STATIC_INLINE void UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
346: {
347: PetscInt i;
348: switch (addv) {
349: case INSERT_VALUES:
350: for (i=0; i<n; i++) {
351: y[indicesy[i]] = x[i];
352: }
353: break;
354: case ADD_VALUES:
355: for (i=0; i<n; i++) {
356: y[indicesy[i]] += x[i];
357: }
358: break;
359: #if !defined(PETSC_USE_COMPLEX)
360: case MAX_VALUES:
361: for (i=0; i<n; i++) {
362: y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
363: }
364: #else
365: case MAX_VALUES:
366: #endif
367: case NOT_SET_VALUES:
368: break;
369: }
370: }
372: PETSC_STATIC_INLINE void Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
373: {
374: PetscInt i;
375: switch (addv) {
376: case INSERT_VALUES:
377: for (i=0; i<n; i++) {
378: y[indicesy[i]] = x[indicesx[i]];
379: }
380: break;
381: case ADD_VALUES:
382: for (i=0; i<n; i++) {
383: y[indicesy[i]] += x[indicesx[i]];
384: }
385: break;
386: #if !defined(PETSC_USE_COMPLEX)
387: case MAX_VALUES:
388: for (i=0; i<n; i++) {
389: y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
390: }
391: #else
392: case MAX_VALUES:
393: #endif
394: case NOT_SET_VALUES:
395: break;
396: }
397: }
399: /* ----------------------------------------------------------------------------------------------- */
400: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
401: {
402: PetscInt i,idx;
404: for (i=0; i<n; i++) {
405: idx = *indicesx++;
406: y[0] = x[idx];
407: y[1] = x[idx+1];
408: y += 2;
409: }
410: }
411: PETSC_STATIC_INLINE void UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
412: {
413: PetscInt i,idy;
415: switch (addv) {
416: case INSERT_VALUES:
417: for (i=0; i<n; i++) {
418: idy = *indicesy++;
419: y[idy] = x[0];
420: y[idy+1] = x[1];
421: x += 2;
422: }
423: break;
424: case ADD_VALUES:
425: for (i=0; i<n; i++) {
426: idy = *indicesy++;
427: y[idy] += x[0];
428: y[idy+1] += x[1];
429: x += 2;
430: }
431: break;
432: #if !defined(PETSC_USE_COMPLEX)
433: case MAX_VALUES:
434: for (i=0; i<n; i++) {
435: idy = *indicesy++;
436: y[idy] = PetscMax(y[idy],x[0]);
437: y[idy+1] = PetscMax(y[idy+1],x[1]);
438: x += 2;
439: }
440: #else
441: case MAX_VALUES:
442: #endif
443: case NOT_SET_VALUES:
444: break;
445: }
446: }
448: PETSC_STATIC_INLINE void Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
449: {
450: PetscInt i,idx,idy;
452: switch (addv) {
453: case INSERT_VALUES:
454: for (i=0; i<n; i++) {
455: idx = *indicesx++;
456: idy = *indicesy++;
457: y[idy] = x[idx];
458: y[idy+1] = x[idx+1];
459: }
460: break;
461: case ADD_VALUES:
462: for (i=0; i<n; i++) {
463: idx = *indicesx++;
464: idy = *indicesy++;
465: y[idy] += x[idx];
466: y[idy+1] += x[idx+1];
467: }
468: break;
469: #if !defined(PETSC_USE_COMPLEX)
470: case MAX_VALUES:
471: for (i=0; i<n; i++) {
472: idx = *indicesx++;
473: idy = *indicesy++;
474: y[idy] = PetscMax(y[idy],x[idx]);
475: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
476: }
477: #else
478: case MAX_VALUES:
479: #endif
480: case NOT_SET_VALUES:
481: break;
482: }
483: }
484: /* ----------------------------------------------------------------------------------------------- */
485: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
486: {
487: PetscInt i,idx;
489: for (i=0; i<n; i++) {
490: idx = *indicesx++;
491: y[0] = x[idx];
492: y[1] = x[idx+1];
493: y[2] = x[idx+2];
494: y += 3;
495: }
496: }
497: PETSC_STATIC_INLINE void UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
498: {
499: PetscInt i,idy;
501: switch (addv) {
502: case INSERT_VALUES:
503: for (i=0; i<n; i++) {
504: idy = *indicesy++;
505: y[idy] = x[0];
506: y[idy+1] = x[1];
507: y[idy+2] = x[2];
508: x += 3;
509: }
510: break;
511: case ADD_VALUES:
512: for (i=0; i<n; i++) {
513: idy = *indicesy++;
514: y[idy] += x[0];
515: y[idy+1] += x[1];
516: y[idy+2] += x[2];
517: x += 3;
518: }
519: break;
520: #if !defined(PETSC_USE_COMPLEX)
521: case MAX_VALUES:
522: for (i=0; i<n; i++) {
523: idy = *indicesy++;
524: y[idy] = PetscMax(y[idy],x[0]);
525: y[idy+1] = PetscMax(y[idy+1],x[1]);
526: y[idy+2] = PetscMax(y[idy+2],x[2]);
527: x += 3;
528: }
529: #else
530: case MAX_VALUES:
531: #endif
532: case NOT_SET_VALUES:
533: break;
534: }
535: }
537: PETSC_STATIC_INLINE void Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
538: {
539: PetscInt i,idx,idy;
541: switch (addv) {
542: case INSERT_VALUES:
543: for (i=0; i<n; i++) {
544: idx = *indicesx++;
545: idy = *indicesy++;
546: y[idy] = x[idx];
547: y[idy+1] = x[idx+1];
548: y[idy+2] = x[idx+2];
549: }
550: break;
551: case ADD_VALUES:
552: for (i=0; i<n; i++) {
553: idx = *indicesx++;
554: idy = *indicesy++;
555: y[idy] += x[idx];
556: y[idy+1] += x[idx+1];
557: y[idy+2] += x[idx+2];
558: }
559: break;
560: #if !defined(PETSC_USE_COMPLEX)
561: case MAX_VALUES:
562: for (i=0; i<n; i++) {
563: idx = *indicesx++;
564: idy = *indicesy++;
565: y[idy] = PetscMax(y[idy],x[idx]);
566: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
567: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
568: }
569: #else
570: case MAX_VALUES:
571: #endif
572: case NOT_SET_VALUES:
573: break;
574: }
575: }
576: /* ----------------------------------------------------------------------------------------------- */
577: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
578: {
579: PetscInt i,idx;
581: for (i=0; i<n; i++) {
582: idx = *indicesx++;
583: y[0] = x[idx];
584: y[1] = x[idx+1];
585: y[2] = x[idx+2];
586: y[3] = x[idx+3];
587: y += 4;
588: }
589: }
590: PETSC_STATIC_INLINE void UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
591: {
592: PetscInt i,idy;
594: switch (addv) {
595: case INSERT_VALUES:
596: for (i=0; i<n; i++) {
597: idy = *indicesy++;
598: y[idy] = x[0];
599: y[idy+1] = x[1];
600: y[idy+2] = x[2];
601: y[idy+3] = x[3];
602: x += 4;
603: }
604: break;
605: case ADD_VALUES:
606: for (i=0; i<n; i++) {
607: idy = *indicesy++;
608: y[idy] += x[0];
609: y[idy+1] += x[1];
610: y[idy+2] += x[2];
611: y[idy+3] += x[3];
612: x += 4;
613: }
614: break;
615: #if !defined(PETSC_USE_COMPLEX)
616: case MAX_VALUES:
617: for (i=0; i<n; i++) {
618: idy = *indicesy++;
619: y[idy] = PetscMax(y[idy],x[0]);
620: y[idy+1] = PetscMax(y[idy+1],x[1]);
621: y[idy+2] = PetscMax(y[idy+2],x[2]);
622: y[idy+3] = PetscMax(y[idy+3],x[3]);
623: x += 4;
624: }
625: #else
626: case MAX_VALUES:
627: #endif
628: case NOT_SET_VALUES:
629: break;
630: }
631: }
633: PETSC_STATIC_INLINE void Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
634: {
635: PetscInt i,idx,idy;
637: switch (addv) {
638: case INSERT_VALUES:
639: for (i=0; i<n; i++) {
640: idx = *indicesx++;
641: idy = *indicesy++;
642: y[idy] = x[idx];
643: y[idy+1] = x[idx+1];
644: y[idy+2] = x[idx+2];
645: y[idy+3] = x[idx+3];
646: }
647: break;
648: case ADD_VALUES:
649: for (i=0; i<n; i++) {
650: idx = *indicesx++;
651: idy = *indicesy++;
652: y[idy] += x[idx];
653: y[idy+1] += x[idx+1];
654: y[idy+2] += x[idx+2];
655: y[idy+3] += x[idx+3];
656: }
657: break;
658: #if !defined(PETSC_USE_COMPLEX)
659: case MAX_VALUES:
660: for (i=0; i<n; i++) {
661: idx = *indicesx++;
662: idy = *indicesy++;
663: y[idy] = PetscMax(y[idy],x[idx]);
664: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
665: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
666: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
667: }
668: #else
669: case MAX_VALUES:
670: #endif
671: case NOT_SET_VALUES:
672: break;
673: }
674: }
675: /* ----------------------------------------------------------------------------------------------- */
676: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
677: {
678: PetscInt i,idx;
680: for (i=0; i<n; i++) {
681: idx = *indicesx++;
682: y[0] = x[idx];
683: y[1] = x[idx+1];
684: y[2] = x[idx+2];
685: y[3] = x[idx+3];
686: y[4] = x[idx+4];
687: y += 5;
688: }
689: }
690: PETSC_STATIC_INLINE void UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
691: {
692: PetscInt i,idy;
694: switch (addv) {
695: case INSERT_VALUES:
696: for (i=0; i<n; i++) {
697: idy = *indicesy++;
698: y[idy] = x[0];
699: y[idy+1] = x[1];
700: y[idy+2] = x[2];
701: y[idy+3] = x[3];
702: y[idy+4] = x[4];
703: x += 5;
704: }
705: break;
706: case ADD_VALUES:
707: for (i=0; i<n; i++) {
708: idy = *indicesy++;
709: y[idy] += x[0];
710: y[idy+1] += x[1];
711: y[idy+2] += x[2];
712: y[idy+3] += x[3];
713: y[idy+4] += x[4];
714: x += 5;
715: }
716: break;
717: #if !defined(PETSC_USE_COMPLEX)
718: case MAX_VALUES:
719: for (i=0; i<n; i++) {
720: idy = *indicesy++;
721: y[idy] = PetscMax(y[idy],x[0]);
722: y[idy+1] = PetscMax(y[idy+1],x[1]);
723: y[idy+2] = PetscMax(y[idy+2],x[2]);
724: y[idy+3] = PetscMax(y[idy+3],x[3]);
725: y[idy+4] = PetscMax(y[idy+4],x[4]);
726: x += 5;
727: }
728: #else
729: case MAX_VALUES:
730: #endif
731: case NOT_SET_VALUES:
732: break;
733: }
734: }
736: PETSC_STATIC_INLINE void Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
737: {
738: PetscInt i,idx,idy;
740: switch (addv) {
741: case INSERT_VALUES:
742: for (i=0; i<n; i++) {
743: idx = *indicesx++;
744: idy = *indicesy++;
745: y[idy] = x[idx];
746: y[idy+1] = x[idx+1];
747: y[idy+2] = x[idx+2];
748: y[idy+3] = x[idx+3];
749: y[idy+4] = x[idx+4];
750: }
751: break;
752: case ADD_VALUES:
753: for (i=0; i<n; i++) {
754: idx = *indicesx++;
755: idy = *indicesy++;
756: y[idy] += x[idx];
757: y[idy+1] += x[idx+1];
758: y[idy+2] += x[idx+2];
759: y[idy+3] += x[idx+3];
760: y[idy+4] += x[idx+4];
761: }
762: break;
763: #if !defined(PETSC_USE_COMPLEX)
764: case MAX_VALUES:
765: for (i=0; i<n; i++) {
766: idx = *indicesx++;
767: idy = *indicesy++;
768: y[idy] = PetscMax(y[idy],x[idx]);
769: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
770: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
771: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
772: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
773: }
774: #else
775: case MAX_VALUES:
776: #endif
777: case NOT_SET_VALUES:
778: break;
779: }
780: }
781: /* ----------------------------------------------------------------------------------------------- */
782: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
783: {
784: PetscInt i,idx;
786: for (i=0; i<n; i++) {
787: idx = *indicesx++;
788: y[0] = x[idx];
789: y[1] = x[idx+1];
790: y[2] = x[idx+2];
791: y[3] = x[idx+3];
792: y[4] = x[idx+4];
793: y[5] = x[idx+5];
794: y += 6;
795: }
796: }
797: PETSC_STATIC_INLINE void UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
798: {
799: PetscInt i,idy;
801: switch (addv) {
802: case INSERT_VALUES:
803: for (i=0; i<n; i++) {
804: idy = *indicesy++;
805: y[idy] = x[0];
806: y[idy+1] = x[1];
807: y[idy+2] = x[2];
808: y[idy+3] = x[3];
809: y[idy+4] = x[4];
810: y[idy+5] = x[5];
811: x += 6;
812: }
813: break;
814: case ADD_VALUES:
815: for (i=0; i<n; i++) {
816: idy = *indicesy++;
817: y[idy] += x[0];
818: y[idy+1] += x[1];
819: y[idy+2] += x[2];
820: y[idy+3] += x[3];
821: y[idy+4] += x[4];
822: y[idy+5] += x[5];
823: x += 6;
824: }
825: break;
826: #if !defined(PETSC_USE_COMPLEX)
827: case MAX_VALUES:
828: for (i=0; i<n; i++) {
829: idy = *indicesy++;
830: y[idy] = PetscMax(y[idy],x[0]);
831: y[idy+1] = PetscMax(y[idy+1],x[1]);
832: y[idy+2] = PetscMax(y[idy+2],x[2]);
833: y[idy+3] = PetscMax(y[idy+3],x[3]);
834: y[idy+4] = PetscMax(y[idy+4],x[4]);
835: y[idy+5] = PetscMax(y[idy+5],x[5]);
836: x += 6;
837: }
838: #else
839: case MAX_VALUES:
840: #endif
841: case NOT_SET_VALUES:
842: break;
843: }
844: }
846: PETSC_STATIC_INLINE void Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
847: {
848: PetscInt i,idx,idy;
850: switch (addv) {
851: case INSERT_VALUES:
852: for (i=0; i<n; i++) {
853: idx = *indicesx++;
854: idy = *indicesy++;
855: y[idy] = x[idx];
856: y[idy+1] = x[idx+1];
857: y[idy+2] = x[idx+2];
858: y[idy+3] = x[idx+3];
859: y[idy+4] = x[idx+4];
860: y[idy+5] = x[idx+5];
861: }
862: break;
863: case ADD_VALUES:
864: for (i=0; i<n; i++) {
865: idx = *indicesx++;
866: idy = *indicesy++;
867: y[idy] += x[idx];
868: y[idy+1] += x[idx+1];
869: y[idy+2] += x[idx+2];
870: y[idy+3] += x[idx+3];
871: y[idy+4] += x[idx+4];
872: y[idy+5] += x[idx+5];
873: }
874: break;
875: #if !defined(PETSC_USE_COMPLEX)
876: case MAX_VALUES:
877: for (i=0; i<n; i++) {
878: idx = *indicesx++;
879: idy = *indicesy++;
880: y[idy] = PetscMax(y[idy],x[idx]);
881: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
882: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
883: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
884: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
885: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
886: }
887: #else
888: case MAX_VALUES:
889: #endif
890: case NOT_SET_VALUES:
891: break;
892: }
893: }
894: /* ----------------------------------------------------------------------------------------------- */
895: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
896: {
897: PetscInt i,idx;
899: for (i=0; i<n; i++) {
900: idx = *indicesx++;
901: y[0] = x[idx];
902: y[1] = x[idx+1];
903: y[2] = x[idx+2];
904: y[3] = x[idx+3];
905: y[4] = x[idx+4];
906: y[5] = x[idx+5];
907: y[6] = x[idx+6];
908: y += 7;
909: }
910: }
911: PETSC_STATIC_INLINE void UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
912: {
913: PetscInt i,idy;
915: switch (addv) {
916: case INSERT_VALUES:
917: for (i=0; i<n; i++) {
918: idy = *indicesy++;
919: y[idy] = x[0];
920: y[idy+1] = x[1];
921: y[idy+2] = x[2];
922: y[idy+3] = x[3];
923: y[idy+4] = x[4];
924: y[idy+5] = x[5];
925: y[idy+6] = x[6];
926: x += 7;
927: }
928: break;
929: case ADD_VALUES:
930: for (i=0; i<n; i++) {
931: idy = *indicesy++;
932: y[idy] += x[0];
933: y[idy+1] += x[1];
934: y[idy+2] += x[2];
935: y[idy+3] += x[3];
936: y[idy+4] += x[4];
937: y[idy+5] += x[5];
938: y[idy+6] += x[6];
939: x += 7;
940: }
941: break;
942: #if !defined(PETSC_USE_COMPLEX)
943: case MAX_VALUES:
944: for (i=0; i<n; i++) {
945: idy = *indicesy++;
946: y[idy] = PetscMax(y[idy],x[0]);
947: y[idy+1] = PetscMax(y[idy+1],x[1]);
948: y[idy+2] = PetscMax(y[idy+2],x[2]);
949: y[idy+3] = PetscMax(y[idy+3],x[3]);
950: y[idy+4] = PetscMax(y[idy+4],x[4]);
951: y[idy+5] = PetscMax(y[idy+5],x[5]);
952: y[idy+6] = PetscMax(y[idy+6],x[6]);
953: x += 7;
954: }
955: #else
956: case MAX_VALUES:
957: #endif
958: case NOT_SET_VALUES:
959: break;
960: }
961: }
963: PETSC_STATIC_INLINE void Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
964: {
965: PetscInt i,idx,idy;
967: switch (addv) {
968: case INSERT_VALUES:
969: for (i=0; i<n; i++) {
970: idx = *indicesx++;
971: idy = *indicesy++;
972: y[idy] = x[idx];
973: y[idy+1] = x[idx+1];
974: y[idy+2] = x[idx+2];
975: y[idy+3] = x[idx+3];
976: y[idy+4] = x[idx+4];
977: y[idy+5] = x[idx+5];
978: y[idy+6] = x[idx+6];
979: }
980: break;
981: case ADD_VALUES:
982: for (i=0; i<n; i++) {
983: idx = *indicesx++;
984: idy = *indicesy++;
985: y[idy] += x[idx];
986: y[idy+1] += x[idx+1];
987: y[idy+2] += x[idx+2];
988: y[idy+3] += x[idx+3];
989: y[idy+4] += x[idx+4];
990: y[idy+5] += x[idx+5];
991: y[idy+6] += x[idx+6];
992: }
993: break;
994: #if !defined(PETSC_USE_COMPLEX)
995: case MAX_VALUES:
996: for (i=0; i<n; i++) {
997: idx = *indicesx++;
998: idy = *indicesy++;
999: y[idy] = PetscMax(y[idy],x[idx]);
1000: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1001: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1002: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1003: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1004: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1005: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1006: }
1007: #else
1008: case MAX_VALUES:
1009: #endif
1010: case NOT_SET_VALUES:
1011: break;
1012: }
1013: }
1014: /* ----------------------------------------------------------------------------------------------- */
1015: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1016: {
1017: PetscInt i,idx;
1019: for (i=0; i<n; i++) {
1020: idx = *indicesx++;
1021: y[0] = x[idx];
1022: y[1] = x[idx+1];
1023: y[2] = x[idx+2];
1024: y[3] = x[idx+3];
1025: y[4] = x[idx+4];
1026: y[5] = x[idx+5];
1027: y[6] = x[idx+6];
1028: y[7] = x[idx+7];
1029: y += 8;
1030: }
1031: }
1032: PETSC_STATIC_INLINE void UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1033: {
1034: PetscInt i,idy;
1036: switch (addv) {
1037: case INSERT_VALUES:
1038: for (i=0; i<n; i++) {
1039: idy = *indicesy++;
1040: y[idy] = x[0];
1041: y[idy+1] = x[1];
1042: y[idy+2] = x[2];
1043: y[idy+3] = x[3];
1044: y[idy+4] = x[4];
1045: y[idy+5] = x[5];
1046: y[idy+6] = x[6];
1047: y[idy+7] = x[7];
1048: x += 8;
1049: }
1050: break;
1051: case ADD_VALUES:
1052: for (i=0; i<n; i++) {
1053: idy = *indicesy++;
1054: y[idy] += x[0];
1055: y[idy+1] += x[1];
1056: y[idy+2] += x[2];
1057: y[idy+3] += x[3];
1058: y[idy+4] += x[4];
1059: y[idy+5] += x[5];
1060: y[idy+6] += x[6];
1061: y[idy+7] += x[7];
1062: x += 8;
1063: }
1064: break;
1065: #if !defined(PETSC_USE_COMPLEX)
1066: case MAX_VALUES:
1067: for (i=0; i<n; i++) {
1068: idy = *indicesy++;
1069: y[idy] = PetscMax(y[idy],x[0]);
1070: y[idy+1] = PetscMax(y[idy+1],x[1]);
1071: y[idy+2] = PetscMax(y[idy+2],x[2]);
1072: y[idy+3] = PetscMax(y[idy+3],x[3]);
1073: y[idy+4] = PetscMax(y[idy+4],x[4]);
1074: y[idy+5] = PetscMax(y[idy+5],x[5]);
1075: y[idy+6] = PetscMax(y[idy+6],x[6]);
1076: y[idy+7] = PetscMax(y[idy+7],x[7]);
1077: x += 8;
1078: }
1079: #else
1080: case MAX_VALUES:
1081: #endif
1082: case NOT_SET_VALUES:
1083: break;
1084: }
1085: }
1087: PETSC_STATIC_INLINE void Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1088: {
1089: PetscInt i,idx,idy;
1091: switch (addv) {
1092: case INSERT_VALUES:
1093: for (i=0; i<n; i++) {
1094: idx = *indicesx++;
1095: idy = *indicesy++;
1096: y[idy] = x[idx];
1097: y[idy+1] = x[idx+1];
1098: y[idy+2] = x[idx+2];
1099: y[idy+3] = x[idx+3];
1100: y[idy+4] = x[idx+4];
1101: y[idy+5] = x[idx+5];
1102: y[idy+6] = x[idx+6];
1103: y[idy+7] = x[idx+7];
1104: }
1105: break;
1106: case ADD_VALUES:
1107: for (i=0; i<n; i++) {
1108: idx = *indicesx++;
1109: idy = *indicesy++;
1110: y[idy] += x[idx];
1111: y[idy+1] += x[idx+1];
1112: y[idy+2] += x[idx+2];
1113: y[idy+3] += x[idx+3];
1114: y[idy+4] += x[idx+4];
1115: y[idy+5] += x[idx+5];
1116: y[idy+6] += x[idx+6];
1117: y[idy+7] += x[idx+7];
1118: }
1119: break;
1120: #if !defined(PETSC_USE_COMPLEX)
1121: case MAX_VALUES:
1122: for (i=0; i<n; i++) {
1123: idx = *indicesx++;
1124: idy = *indicesy++;
1125: y[idy] = PetscMax(y[idy],x[idx]);
1126: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1127: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1128: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1129: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1130: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1131: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1132: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1133: }
1134: #else
1135: case MAX_VALUES:
1136: #endif
1137: case NOT_SET_VALUES:
1138: break;
1139: }
1140: }
1142: /* ----------------------------------------------------------------------------------------------- */
1143: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1144: {
1145: PetscInt i,idx;
1147: for (i=0; i<n; i++) {
1148: idx = *indicesx++;
1149: y[0] = x[idx];
1150: y[1] = x[idx+1];
1151: y[2] = x[idx+2];
1152: y[3] = x[idx+3];
1153: y[4] = x[idx+4];
1154: y[5] = x[idx+5];
1155: y[6] = x[idx+6];
1156: y[7] = x[idx+7];
1157: y[8] = x[idx+8];
1158: y[9] = x[idx+9];
1159: y[10] = x[idx+10];
1160: y[11] = x[idx+11];
1161: y += 12;
1162: }
1163: }
1164: PETSC_STATIC_INLINE void UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1165: {
1166: PetscInt i,idy;
1168: switch (addv) {
1169: case INSERT_VALUES:
1170: for (i=0; i<n; i++) {
1171: idy = *indicesy++;
1172: y[idy] = x[0];
1173: y[idy+1] = x[1];
1174: y[idy+2] = x[2];
1175: y[idy+3] = x[3];
1176: y[idy+4] = x[4];
1177: y[idy+5] = x[5];
1178: y[idy+6] = x[6];
1179: y[idy+7] = x[7];
1180: y[idy+8] = x[8];
1181: y[idy+9] = x[9];
1182: y[idy+10] = x[10];
1183: y[idy+11] = x[11];
1184: x += 12;
1185: }
1186: break;
1187: case ADD_VALUES:
1188: for (i=0; i<n; i++) {
1189: idy = *indicesy++;
1190: y[idy] += x[0];
1191: y[idy+1] += x[1];
1192: y[idy+2] += x[2];
1193: y[idy+3] += x[3];
1194: y[idy+4] += x[4];
1195: y[idy+5] += x[5];
1196: y[idy+6] += x[6];
1197: y[idy+7] += x[7];
1198: y[idy+8] += x[8];
1199: y[idy+9] += x[9];
1200: y[idy+10] += x[10];
1201: y[idy+11] += x[11];
1202: x += 12;
1203: }
1204: break;
1205: #if !defined(PETSC_USE_COMPLEX)
1206: case MAX_VALUES:
1207: for (i=0; i<n; i++) {
1208: idy = *indicesy++;
1209: y[idy] = PetscMax(y[idy],x[0]);
1210: y[idy+1] = PetscMax(y[idy+1],x[1]);
1211: y[idy+2] = PetscMax(y[idy+2],x[2]);
1212: y[idy+3] = PetscMax(y[idy+3],x[3]);
1213: y[idy+4] = PetscMax(y[idy+4],x[4]);
1214: y[idy+5] = PetscMax(y[idy+5],x[5]);
1215: y[idy+6] = PetscMax(y[idy+6],x[6]);
1216: y[idy+7] = PetscMax(y[idy+7],x[7]);
1217: y[idy+8] = PetscMax(y[idy+8],x[8]);
1218: y[idy+9] = PetscMax(y[idy+9],x[9]);
1219: y[idy+10] = PetscMax(y[idy+10],x[10]);
1220: y[idy+11] = PetscMax(y[idy+11],x[11]);
1221: x += 12;
1222: }
1223: #else
1224: case MAX_VALUES:
1225: #endif
1226: case NOT_SET_VALUES:
1227: break;
1228: }
1229: }
1231: PETSC_STATIC_INLINE void Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1232: {
1233: PetscInt i,idx,idy;
1235: switch (addv) {
1236: case INSERT_VALUES:
1237: for (i=0; i<n; i++) {
1238: idx = *indicesx++;
1239: idy = *indicesy++;
1240: y[idy] = x[idx];
1241: y[idy+1] = x[idx+1];
1242: y[idy+2] = x[idx+2];
1243: y[idy+3] = x[idx+3];
1244: y[idy+4] = x[idx+4];
1245: y[idy+5] = x[idx+5];
1246: y[idy+6] = x[idx+6];
1247: y[idy+7] = x[idx+7];
1248: y[idy+8] = x[idx+8];
1249: y[idy+9] = x[idx+9];
1250: y[idy+10] = x[idx+10];
1251: y[idy+11] = x[idx+11];
1252: }
1253: break;
1254: case ADD_VALUES:
1255: for (i=0; i<n; i++) {
1256: idx = *indicesx++;
1257: idy = *indicesy++;
1258: y[idy] += x[idx];
1259: y[idy+1] += x[idx+1];
1260: y[idy+2] += x[idx+2];
1261: y[idy+3] += x[idx+3];
1262: y[idy+4] += x[idx+4];
1263: y[idy+5] += x[idx+5];
1264: y[idy+6] += x[idx+6];
1265: y[idy+7] += x[idx+7];
1266: y[idy+8] += x[idx+8];
1267: y[idy+9] += x[idx+9];
1268: y[idy+10] += x[idx+10];
1269: y[idy+11] += x[idx+11];
1270: }
1271: break;
1272: #if !defined(PETSC_USE_COMPLEX)
1273: case MAX_VALUES:
1274: for (i=0; i<n; i++) {
1275: idx = *indicesx++;
1276: idy = *indicesy++;
1277: y[idy] = PetscMax(y[idy],x[idx]);
1278: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1279: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1280: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1281: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1282: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1283: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1284: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1285: y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
1286: y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
1287: y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1288: y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1289: }
1290: #else
1291: case MAX_VALUES:
1292: #endif
1293: case NOT_SET_VALUES:
1294: break;
1295: }
1296: }
1298: /* Create the VecScatterBegin_<bs>/VecScatterEnd_<bs> routines for our chosen block sizes */
1299: #define BS 1
1300: #include "src/vec/vec/utils/vpscat.h"
1301: #define BS 2
1302: #include "src/vec/vec/utils/vpscat.h"
1303: #define BS 3
1304: #include "src/vec/vec/utils/vpscat.h"
1305: #define BS 4
1306: #include "src/vec/vec/utils/vpscat.h"
1307: #define BS 5
1308: #include "src/vec/vec/utils/vpscat.h"
1309: #define BS 6
1310: #include "src/vec/vec/utils/vpscat.h"
1311: #define BS 7
1312: #include "src/vec/vec/utils/vpscat.h"
1313: #define BS 8
1314: #include "src/vec/vec/utils/vpscat.h"
1315: #define BS 12
1316: #include "src/vec/vec/utils/vpscat.h"
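/* Editorial note: the repeated #define BS / #include above is an include-as-template
   idiom. Each inclusion of vpscat.h uses the current BS to emit a specialization such
   as VecScatterBegin_2/VecScatterEnd_2 (and presumably #undefs BS at its end so it can
   be redefined). A minimal sketch of the idiom, assuming token-pasting macros of the
   kind commonly used for this:
       #define CONCAT_(a,b) a ## _ ## b
       #define CONCAT(a,b)  CONCAT_(a,b)
       PetscErrorCode CONCAT(VecScatterBegin,BS)(...) { ... calls CONCAT(Pack,BS)(...) ... }
       #undef BS
   Only the block sizes listed here (1 through 8 and 12) are instantiated, which is why
   VecScatterCreateCommon_PtoS() below fails with "Blocksize not supported" for any
   other bs. */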
1318: /* ---------------------------------------------------------------------------------*/
1322: PetscErrorCode VecScatterDestroy_PtoP_X(VecScatter ctx)
1323: {
1324: VecScatter_MPI_General *to = (VecScatter_MPI_General*)ctx->todata;
1325: VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
1326: PetscErrorCode ierr;
1327: PetscInt i;
1330: if (to->use_readyreceiver) {
1331: /*
1332: Since we have already posted sends we must cancel them before freeing
1333: the requests
1334: */
1335: for (i=0; i<from->n; i++) {
1336: MPI_Cancel(from->requests+i);
1337: }
1338: }
1340: if (to->use_alltoallw) {
1341: PetscFree3(to->wcounts,to->wdispls,to->types);
1342: PetscFree3(from->wcounts,from->wdispls,from->types);
1343: }
1345: if (to->use_alltoallv) {
1346: PetscFree2(to->counts,to->displs);
1347: PetscFree2(from->counts,from->displs);
1348: }
1350: /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
1351: /*
1352: IBM's PE version of MPI has a bug where freeing these requests will corrupt
1353: later message passing.
1354: */
1355: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
1356: for (i=0; i<to->n; i++) {
1357: MPI_Request_free(to->requests + i);
1358: MPI_Request_free(to->rev_requests + i);
1359: }
1361: /*
1362: MPICH could not properly cancel requests, so with ready receiver mode we
1363: cannot free the requests. That may be fixed by now; if not, put the following
1364: code inside an if (!to->use_readyreceiver) { } block.
1365: */
1366: for (i=0; i<from->n; i++) {
1367: MPI_Request_free(from->requests + i);
1368: MPI_Request_free(from->rev_requests + i);
1369: }
1370: #endif
1371:
1372: VecScatterDestroy_PtoP(ctx);
1373: return(0);
1374: }
1376: /* ==========================================================================================*/
1378: /* create parallel to sequential scatter context */
1380: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *,VecScatter_MPI_General *,VecScatter);
1384: PetscErrorCode VecScatterCreateLocal_PtoS(int nsends,int sendSizes[],int sendProcs[],int sendIdx[],int nrecvs,int recvSizes[],int recvProcs[],int recvIdx[],PetscInt bs,VecScatter ctx)
1385: {
1386: VecScatter_MPI_General *from, *to;
1387: PetscInt sendSize, recvSize;
1388: PetscInt n, i;
1391: /* allocate entire send scatter context */
1392: PetscNew(VecScatter_MPI_General,&to);
1393: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1394: to->n = nsends;
1395: for(n = 0, sendSize = 0; n < to->n; n++) {sendSize += sendSizes[n];}
1396: PetscMalloc(to->n*sizeof(MPI_Request),&to->requests);
1397: PetscMalloc4(bs*sendSize,PetscScalar,&to->values,
1398: sendSize,PetscInt,&to->indices,
1399: to->n+1,PetscInt,&to->starts,
1400: to->n,PetscMPIInt,&to->procs);
1401: PetscMalloc2(PetscMax(to->n,nrecvs),MPI_Status,&to->sstatus,
1402: PetscMax(to->n,nrecvs),MPI_Status,&to->rstatus);
1403: to->starts[0] = 0;
1404: for(n = 0; n < to->n; n++) {
1405: to->starts[n+1] = to->starts[n] + sendSizes[n];
1406: to->procs[n] = sendProcs[n];
1407: for(i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) {
1408: to->indices[i] = sendIdx[i];
1409: }
1410: }
1411: ctx->todata = (void *) to;
1413: /* allocate entire receive scatter context */
1414: PetscNew(VecScatter_MPI_General,&from);
1415: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst);
1416: from->n = nrecvs;
1417: for(n = 0, recvSize = 0; n < from->n; n++) {recvSize += recvSizes[n];}
1418: PetscMalloc(from->n*sizeof(MPI_Request),&from->requests);
1419: PetscMalloc4(bs*recvSize,PetscScalar,&from->values,
1420: recvSize,PetscInt,&from->indices,
1421: from->n+1,PetscInt,&from->starts,
1422: from->n,PetscMPIInt,&from->procs);
1423: from->starts[0] = 0;
1424: for(n = 0; n < from->n; n++) {
1425: from->starts[n+1] = from->starts[n] + recvSizes[n];
1426: from->procs[n] = recvProcs[n];
1427: for(i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) {
1428: from->indices[i] = recvIdx[i];
1429: }
1430: }
1431: ctx->fromdata = (void *)from;
1433: /* No local scatter optimization */
1434: from->local.n = 0;
1435: from->local.vslots = 0;
1436: to->local.n = 0;
1437: to->local.vslots = 0;
1438: from->local.nonmatching_computed = PETSC_FALSE;
1439: from->local.n_nonmatching = 0;
1440: from->local.slots_nonmatching = 0;
1441: to->local.nonmatching_computed = PETSC_FALSE;
1442: to->local.n_nonmatching = 0;
1443: to->local.slots_nonmatching = 0;
1445: from->type = VEC_SCATTER_MPI_GENERAL;
1446: to->type = VEC_SCATTER_MPI_GENERAL;
1447: from->bs = bs;
1448: to->bs = bs;
1450: VecScatterCreateCommon_PtoS(from, to, ctx);
1451: return(0);
1452: }
1454: /*
1455: bs indicates how many elements there are in each block. Normally this would be 1.
1456: */
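/* Editorial example (not part of the original source): a vector with 3 interlaced
   components per grid point would typically use bs = 3; each index then refers to the
   first of 3 consecutive scalar entries, and the bs = 3 Pack/UnPack/Scatter kernels
   above move a whole block per index. */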
1459: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,PetscInt *inidx,PetscInt ny,PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1460: {
1461: VecScatter_MPI_General *from,*to;
1462: PetscMPIInt size,rank,imdex,tag,n;
1463: PetscInt *source = PETSC_NULL,*lens = PETSC_NULL,*owners = PETSC_NULL;
1464: PetscInt *lowner = PETSC_NULL,*start = PETSC_NULL,lengthy,lengthx;
1465: PetscInt *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs;
1466: PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1467: PetscInt *rvalues,*svalues,base,nmax,*values,*indx,nprocslocal;
1468: MPI_Comm comm;
1469: MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1470: MPI_Status recv_status,*send_status;
1471: PetscErrorCode ierr;
1474: PetscObjectGetNewTag((PetscObject)ctx,&tag);
1475: PetscObjectGetComm((PetscObject)xin,&comm);
1476: MPI_Comm_rank(comm,&rank);
1477: MPI_Comm_size(comm,&size);
1478: owners = xin->map.range;
1479: VecGetSize(yin,&lengthy);
1480: VecGetSize(xin,&lengthx);
1482: /* first count number of contributors to each processor */
1483: PetscMalloc2(2*size,PetscInt,&nprocs,nx,PetscInt,&owner);
1484: PetscMemzero(nprocs,2*size*sizeof(PetscInt));
1485: PetscSortIntWithArray(nx,inidx,inidy);
1486: if (nx > 0 && inidx[0] < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Parallel index %D is negative",inidx[0]);
1487: if (nx > 0 && inidx[nx-1] > lengthx) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"Parallel index %D is longer than vector length",inidx[nx-1],lengthx);
1488: j = 0;
1489: nsends = 0;
1490: for (i=0; i<nx; i++) {
1491: idx = inidx[i];
1492: for (; j<size; j++) {
1493: if (idx < owners[j+1]) {
1494: if (!nprocs[2*j]++) nsends++;
1495: nprocs[2*j+1] = 1;
1496: owner[i] = j;
1497: break;
1498: }
1499: }
1500: }
1501: nprocslocal = nprocs[2*rank];
1502: nprocs[2*rank] = nprocs[2*rank+1] = 0;
1503: if (nprocslocal) nsends--;
1505: /* inform other processors of number of messages and max length*/
1506: PetscMaxSum(comm,nprocs,&nmax,&nrecvs);
1508: /* post receives: */
1509: PetscMalloc4(nrecvs*nmax,PetscInt,&rvalues,nrecvs,PetscInt,&lens,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);
1510: for (i=0; i<nrecvs; i++) {
1511: MPI_Irecv((rvalues+nmax*i),nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1512: }
1514: /* do sends:
1515: 1) starts[i] gives the starting index in svalues for the entries going to
1516: the ith processor
1517: */
1518: PetscMalloc3(nx,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size+1,PetscInt,&starts);
1519: starts[0] = 0;
1520: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1521: for (i=0; i<nx; i++) {
1522: if (owner[i] != rank) {
1523: svalues[starts[owner[i]]++] = inidx[i];
1524: }
1525: }
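/* Editorial note: the loop above used starts[] as a moving cursor while filling
   svalues, so the offsets must be rebuilt below before the sends are posted; e.g.
   with per-process counts nprocs[2*i] = {2,0,3} the rebuilt starts[] is {0,2,2,5}
   and process 2's indices occupy svalues[2..4]. */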
1527: starts[0] = 0;
1528: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1529: count = 0;
1530: for (i=0; i<size; i++) {
1531: if (nprocs[2*i+1]) {
1532: MPI_Isend(svalues+starts[i],nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count++);
1533: }
1534: }
1536: /* wait on receives */
1537: count = nrecvs;
1538: slen = 0;
1539: while (count) {
1540: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1541: /* unpack receives into our local space */
1542: MPI_Get_count(&recv_status,MPIU_INT,&n);
1543: source[imdex] = recv_status.MPI_SOURCE;
1544: lens[imdex] = n;
1545: slen += n;
1546: count--;
1547: }
1548:
1549: /* allocate entire send scatter context */
1550: PetscNew(VecScatter_MPI_General,&to);
1551: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1552: to->n = nrecvs;
1553: PetscMalloc(nrecvs*sizeof(MPI_Request),&to->requests);
1554: PetscMalloc4(bs*slen,PetscScalar,&to->values,slen,PetscInt,&to->indices,nrecvs+1,PetscInt,&to->starts,
1555: nrecvs,PetscMPIInt,&to->procs);
1556: PetscMalloc2(PetscMax(to->n,nsends),MPI_Status,&to->sstatus,PetscMax(to->n,nsends),MPI_Status,
1557: &to->rstatus);
1558: ctx->todata = (void*)to;
1559: to->starts[0] = 0;
1561: if (nrecvs) {
1562: PetscMalloc(nrecvs*sizeof(PetscInt),&indx);
1563: for (i=0; i<nrecvs; i++) indx[i] = i;
1564: PetscSortIntWithPermutation(nrecvs,source,indx);
1566: /* move the data into the send scatter */
1567: base = owners[rank];
1568: for (i=0; i<nrecvs; i++) {
1569: to->starts[i+1] = to->starts[i] + lens[indx[i]];
1570: to->procs[i] = source[indx[i]];
1571: values = rvalues + indx[i]*nmax;
1572: for (j=0; j<lens[indx[i]]; j++) {
1573: to->indices[to->starts[i] + j] = values[j] - base;
1574: }
1575: }
1576: PetscFree(indx);
1577: }
1578: PetscFree4(rvalues,lens,source,recv_waits);
1579:
1580: /* allocate entire receive scatter context */
1581: PetscNew(VecScatter_MPI_General,&from);
1582: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst);
1583: from->n = nsends;
1585: PetscMalloc(nsends*sizeof(MPI_Request),&from->requests);
1586: PetscMalloc4(ny*bs,PetscScalar,&from->values,ny,PetscInt,&from->indices,
1587: nsends+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1588: ctx->fromdata = (void*)from;
1590: /* move data into receive scatter */
1591: PetscMalloc2(size,PetscInt,&lowner,nsends+1,PetscInt,&start);
1592: count = 0; from->starts[0] = start[0] = 0;
1593: for (i=0; i<size; i++) {
1594: if (nprocs[2*i+1]) {
1595: lowner[i] = count;
1596: from->procs[count++] = i;
1597: from->starts[count] = start[count] = start[count-1] + nprocs[2*i];
1598: }
1599: }
1600: for (i=0; i<nx; i++) {
1601: if (owner[i] != rank) {
1602: from->indices[start[lowner[owner[i]]]++] = inidy[i];
1603: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1604: }
1605: }
1606: PetscFree2(lowner,start);
1607: PetscFree2(nprocs,owner);
1608:
1609: /* wait on sends */
1610: if (nsends) {
1611: PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
1612: MPI_Waitall(nsends,send_waits,send_status);
1613: PetscFree(send_status);
1614: }
1615: PetscFree3(svalues,send_waits,starts);
1617: if (nprocslocal) {
1618: PetscInt nt = from->local.n = to->local.n = nprocslocal;
1619: /* we have a scatter to ourselves */
1620: PetscMalloc(nt*sizeof(PetscInt),&to->local.vslots);
1621: PetscMalloc(nt*sizeof(PetscInt),&from->local.vslots);
1622: nt = 0;
1623: for (i=0; i<nx; i++) {
1624: idx = inidx[i];
1625: if (idx >= owners[rank] && idx < owners[rank+1]) {
1626: to->local.vslots[nt] = idx - owners[rank];
1627: from->local.vslots[nt++] = inidy[i];
1628: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1629: }
1630: }
1631: } else {
1632: from->local.n = 0;
1633: from->local.vslots = 0;
1634: to->local.n = 0;
1635: to->local.vslots = 0;
1636: }
1637: from->local.nonmatching_computed = PETSC_FALSE;
1638: from->local.n_nonmatching = 0;
1639: from->local.slots_nonmatching = 0;
1640: to->local.nonmatching_computed = PETSC_FALSE;
1641: to->local.n_nonmatching = 0;
1642: to->local.slots_nonmatching = 0;
1644: from->type = VEC_SCATTER_MPI_GENERAL;
1645: to->type = VEC_SCATTER_MPI_GENERAL;
1646: from->bs = bs;
1647: to->bs = bs;
1649: VecScatterCreateCommon_PtoS(from,to,ctx);
1650: return(0);
1651: }
1653: /*
1654: bs indicates how many elements there are in each block. Normally this would be 1.
1655: */
1658: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1659: {
1660: MPI_Comm comm = ctx->comm;
1661: PetscMPIInt tag = ctx->tag;
1662: PetscInt bs = to->bs;
1663: PetscMPIInt size;
1664: PetscInt i, n;
1666:
1668: MPI_Comm_size(comm,&size);
1669: /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
1670: to->contiq = PETSC_FALSE;
1671: n = from->starts[from->n];
1672: from->contiq = PETSC_TRUE;
1673: for (i=1; i<n; i++) {
1674: if (from->indices[i] != from->indices[i-1] + bs) {
1675: from->contiq = PETSC_FALSE;
1676: break;
1677: }
1678: }
1680: PetscOptionsHasName(PETSC_NULL,"-vecscatter_alltoallw",&to->use_alltoallw);
1681: from->use_alltoallw = to->use_alltoallw;
1682: if (to->use_alltoallw) {
1683: to->use_alltoallv = PETSC_TRUE;
1684: } else {
1685: PetscOptionsHasName(PETSC_NULL,"-vecscatter_alltoallv",&to->use_alltoallv);
1686: }
1687: from->use_alltoallv = to->use_alltoallv;
1689: if (to->use_alltoallv) {
1690: ctx->destroy = VecScatterDestroy_PtoP;
1692: PetscMalloc2(size,PetscMPIInt,&to->counts,size,PetscMPIInt,&to->displs);
1693: PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1694: for (i=0; i<to->n; i++) {
1695: to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
1696: }
1697: to->displs[0] = 0;
1698: for (i=1; i<size; i++) {
1699: to->displs[i] = to->displs[i-1] + to->counts[i-1];
1700: }
1702: PetscMalloc2(size,PetscMPIInt,&from->counts,size,PetscMPIInt,&from->displs);
1703: PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1704: for (i=0; i<from->n; i++) {
1705: from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1706: }
1707: from->displs[0] = 0;
1708: for (i=1; i<size; i++) {
1709: from->displs[i] = from->displs[i-1] + from->counts[i-1];
1710: }
1711: #if defined(PETSC_HAVE_MPI_ALLTOALLW)
1712: if (to->use_alltoallw) {
1713: ctx->packtogether = PETSC_FALSE;
1714: PetscMalloc3(size,PetscMPIInt,&to->wcounts,size,PetscMPIInt,&to->wdispls,size,MPI_Datatype,&to->types);
1715: PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1716: PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1717: for (i=0; i<size; i++) {
1718: to->types[i] = MPIU_SCALAR;
1719: }
1721: for (i=0; i<to->n; i++) {
1722: to->wcounts[to->procs[i]] = ((to->starts[i+1] - to->starts[i]) > 0) ? 1 : 0;
1723: MPI_Type_create_indexed_block(to->starts[i+1]-to->starts[i],bs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1724: MPI_Type_commit(to->types+to->procs[i]);
1725: }
1726: PetscMalloc3(size,PetscMPIInt,&from->wcounts,size,PetscMPIInt,&from->wdispls,size,MPI_Datatype,&from->types);
1727: PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
1728: PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
1729: for (i=0; i<size; i++) {
1730: from->types[i] = MPIU_SCALAR;
1731: }
1732: if (from->contiq) {
1733: PetscInfo(0,"Scattered vector entries are stored contiguously, taking advantage of this with -vecscatter_alltoallw\n");
1734: for (i=0; i<from->n; i++) {
1735: from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1736: }
1737: if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
1738: for (i=1; i<from->n; i++) {
1739: from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
1740: }
1741: } else {
1742: for (i=0; i<from->n; i++) {
1743: from->wcounts[from->procs[i]] = ((from->starts[i+1] - from->starts[i]) > 0) ? 1 : 0;
1744: MPI_Type_create_indexed_block(from->starts[i+1]-from->starts[i],bs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
1745: MPI_Type_commit(from->types+from->procs[i]);
1746: }
1747: }
1748: }
1749: #else
1750: to->use_alltoallw = PETSC_FALSE;
1751: from->use_alltoallw = PETSC_FALSE;
1752: #endif
1753: } else {
1754: PetscTruth use_rr = PETSC_FALSE, use_ssend = PETSC_FALSE;
1755: PetscInt *sstarts = to->starts, *rstarts = from->starts;
1756: PetscMPIInt *sprocs = to->procs, *rprocs = from->procs;
1757: MPI_Request *swaits = to->requests,*rwaits = from->requests;
1758: MPI_Request *rev_swaits,*rev_rwaits;
1759: PetscScalar *Ssvalues = to->values, *Srvalues = from->values;
1761: /* allocate additional wait variables for the "reverse" scatter */
1762: PetscMalloc(to->n*sizeof(MPI_Request),&rev_rwaits);
1763: PetscMalloc(from->n*sizeof(MPI_Request),&rev_swaits);
1764: to->rev_requests = rev_rwaits;
1765: from->rev_requests = rev_swaits;
1767: /* Register the receives that you will use later (sends for scatter reverse) */
1768: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&use_rr);
1769: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&use_ssend);
1770: if (use_rr) {
1771: PetscInfo(0,"Using VecScatter ready receiver mode\n");
1772: }
1773: if (use_ssend) {
1774: PetscInfo(0,"Using VecScatter Ssend mode\n");
1775: }
1777: for (i=0; i<from->n; i++) {
1778: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
1779: if (!use_ssend) {
1780: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
1781: } else {
1782: MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
1783: }
1784: }
1786: if (use_rr) {
1787: to->use_readyreceiver = PETSC_TRUE;
1788: from->use_readyreceiver = PETSC_TRUE;
1789: for (i=0; i<to->n; i++) {
1790: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1791: }
1792: if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
1793: MPI_Barrier(comm);
1794: } else {
1795: to->use_readyreceiver = PETSC_FALSE;
1796: from->use_readyreceiver = PETSC_FALSE;
1797: for (i=0; i<to->n; i++) {
1798: if (!use_ssend) {
1799: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1800: } else {
1801: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1802: }
1803: }
1804: }
1805: /* Register receives for scatter reverse */
1806: for (i=0; i<to->n; i++) {
1807: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
1808: }
1810: ctx->destroy = VecScatterDestroy_PtoP_X;
1811: ctx->copy = VecScatterCopy_PtoP_X;
1812: }
1813: PetscInfo1(0,"Using blocksize %D scatter\n",bs);
1814: switch (bs) {
1815: case 12:
1816: ctx->begin = VecScatterBegin_12;
1817: ctx->end = VecScatterEnd_12;
1818: break;
1819: case 8:
1820: ctx->begin = VecScatterBegin_8;
1821: ctx->end = VecScatterEnd_8;
1822: break;
1823: case 7:
1824: ctx->begin = VecScatterBegin_7;
1825: ctx->end = VecScatterEnd_7;
1826: break;
1827: case 6:
1828: ctx->begin = VecScatterBegin_6;
1829: ctx->end = VecScatterEnd_6;
1830: break;
1831: case 5:
1832: ctx->begin = VecScatterBegin_5;
1833: ctx->end = VecScatterEnd_5;
1834: break;
1835: case 4:
1836: ctx->begin = VecScatterBegin_4;
1837: ctx->end = VecScatterEnd_4;
1838: break;
1839: case 3:
1840: ctx->begin = VecScatterBegin_3;
1841: ctx->end = VecScatterEnd_3;
1842: break;
1843: case 2:
1844: ctx->begin = VecScatterBegin_2;
1845: ctx->end = VecScatterEnd_2;
1846: break;
1847: case 1:
1848: ctx->begin = VecScatterBegin_1;
1849: ctx->end = VecScatterEnd_1;
1850: break;
1851: default:
1852: SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
1853: }
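/* Each VecScatterBegin_<bs>/VecScatterEnd_<bs> pair is a specialization that packs and
   unpacks bs consecutive scalars per listed index, so the message buffers set up above
   always carry contiguous blocks of length bs. */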
1854: ctx->view = VecScatterView_MPI;
1856: /* Check if the local scatter is actually a copy; important special case */
1857: if (to->local.n) {
1858: VecScatterLocalOptimizeCopy_Private(&to->local,&from->local,bs);
1859: }
1860: return(0);
1861: }
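/* A user-level path that typically ends up in the block-sized code above (sketch only;
   variable names are illustrative and the exact argument lists of the IS/VecScatter
   creation routines differ slightly between PETSc releases):
       ISCreateBlock(comm,4,nidx,idx,&isfrom);        block size 4 index set
       VecScatterCreate(x,isfrom,y,isto,&scatter);    selects the bs = 4 kernels
   followed by the usual VecScatterBegin()/VecScatterEnd() calls on 'scatter'. */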
1865: /* ------------------------------------------------------------------------------------*/
1866: /*
1867: Scatter from local Seq vectors to a parallel vector.
1868: Reverses the order of the arguments, calls VecScatterCreate_PtoS(), and then
1869: reverses the result.
1870: */
1873: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,PetscInt *inidx,PetscInt ny,PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1874: {
1875: PetscErrorCode ierr;
1876: MPI_Request *waits;
1877: VecScatter_MPI_General *to,*from;
1880: VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
1881: to = (VecScatter_MPI_General*)ctx->fromdata;
1882: from = (VecScatter_MPI_General*)ctx->todata;
1883: ctx->todata = (void*)to;
1884: ctx->fromdata = (void*)from;
1885: /* these two are special: they are ALWAYS stored in the 'to' struct */
1886: to->sstatus = from->sstatus;
1887: to->rstatus = from->rstatus;
1889: from->sstatus = 0;
1890: from->rstatus = 0;
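/* Swap the forward and reverse persistent request arrays: what VecScatterCreate_PtoS()
   built as the reverse-scatter requests become the forward requests of this StoP
   context, and vice versa. */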
1892: waits = from->rev_requests;
1893: from->rev_requests = from->requests;
1894: from->requests = waits;
1895: waits = to->rev_requests;
1896: to->rev_requests = to->requests;
1897: to->requests = waits;
1898: return(0);
1899: }
1901: /* ---------------------------------------------------------------------------------*/
1904: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,PetscInt *inidx,PetscInt ny,PetscInt *inidy,Vec xin,Vec yin,VecScatter ctx)
1905: {
1907: PetscMPIInt size,rank,tag,imdex,n;
1908: PetscInt *lens = PETSC_NULL,*owners = xin->map.range;
1909: PetscInt *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs,*local_inidx = PETSC_NULL,*local_inidy = PETSC_NULL;
1910: PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1911: PetscInt *rvalues = PETSC_NULL,*svalues = PETSC_NULL,base,nmax,*values = PETSC_NULL,lastidx;
1912: MPI_Comm comm;
1913: MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1914: MPI_Status recv_status,*send_status = PETSC_NULL;
1915: PetscTruth duplicate = PETSC_FALSE;
1916: #if defined(PETSC_USE_DEBUG)
1917: PetscTruth found = PETSC_FALSE;
1918: #endif
1921: PetscObjectGetNewTag((PetscObject)ctx,&tag);
1922: PetscObjectGetComm((PetscObject)xin,&comm);
1923: MPI_Comm_size(comm,&size);
1924: MPI_Comm_rank(comm,&rank);
1925: if (size == 1) {
1926: VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,1,ctx);
1927: return(0);
1928: }
1930: /*
1931: Each processor ships off its inidx[j] and inidy[j] to the appropriate processor
1932: They then call VecScatterCreate_StoP()
1933: */
1934: /* first count number of contributors to each processor */
1935: PetscMalloc3(2*size,PetscInt,&nprocs,nx,PetscInt,&owner,(size+1),PetscInt,&starts);
1936: PetscMemzero(nprocs,2*size*sizeof(PetscInt));
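/* nprocs[] holds two entries per process: nprocs[2*j] counts the index pairs owned by
   process j and nprocs[2*j+1] is a 0/1 flag marking whether anything is sent to j at
   all; PetscMaxSum() below turns this layout into the number of messages this process
   will receive (nrecvs) and the maximum message length (nmax). */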
1937: lastidx = -1;
1938: j = 0;
1939: for (i=0; i<nx; i++) {
1940: /* if indices are NOT locally sorted, need to start search at the beginning */
1941: if (lastidx > (idx = inidx[i])) j = 0;
1942: lastidx = idx;
1943: for (; j<size; j++) {
1944: if (idx >= owners[j] && idx < owners[j+1]) {
1945: nprocs[2*j]++;
1946: nprocs[2*j+1] = 1;
1947: owner[i] = j;
1948: #if defined(PETSC_USE_DEBUG)
1949: found = PETSC_TRUE;
1950: #endif
1951: break;
1952: }
1953: }
1954: #if defined(PETSC_USE_DEBUG)
1955: if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
1956: found = PETSC_FALSE;
1957: #endif
1958: }
1959: nsends = 0; for (i=0; i<size; i++) { nsends += nprocs[2*i+1];}
1961: /* inform other processors of the number of messages and the maximum length */
1962: PetscMaxSum(comm,nprocs,&nmax,&nrecvs);
1964: /* post receives: */
1965: PetscMalloc6(2*nrecvs*nmax,PetscInt,&rvalues,2*nx,PetscInt,&svalues,2*nrecvs,PetscInt,&lens,nrecvs,MPI_Request,&recv_waits,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);
1967: for (i=0; i<nrecvs; i++) {
1968: MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1969: }
1971: /* do sends:
1972: 1) starts[i] gives the starting index in svalues of the index pairs going to
1973: the ith processor
1974: */
1975: starts[0]= 0;
1976: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1977: for (i=0; i<nx; i++) {
1978: svalues[2*starts[owner[i]]] = inidx[i];
1979: svalues[1 + 2*starts[owner[i]]++] = inidy[i];
1980: }
1982: starts[0] = 0;
1983: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1984: count = 0;
1985: for (i=0; i<size; i++) {
1986: if (nprocs[2*i+1]) {
1987: MPI_Isend(svalues+2*starts[i],2*nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count);
1988: count++;
1989: }
1990: }
1991: PetscFree3(nprocs,owner,starts);
1993: /* wait on receives */
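/* The receives were posted with MPI_ANY_SOURCE, so MPI_Waitany() handles them in
   whatever order they complete; MPI_Get_count() recovers the true length, since only
   the upper bound 2*nmax was posted, and each (from,to) pair occupies two ints, hence
   the division by 2. */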
1994: count = nrecvs;
1995: slen = 0;
1996: while (count) {
1997: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1998: /* unpack receives into our local space */
1999: MPI_Get_count(&recv_status,MPIU_INT,&n);
2000: lens[imdex] = n/2;
2001: slen += n/2;
2002: count--;
2003: }
2004:
2005: PetscMalloc2(slen,PetscInt,&local_inidx,slen,PetscInt,&local_inidy);
2006: base = owners[rank];
2007: count = 0;
2008: for (i=0; i<nrecvs; i++) {
2009: values = rvalues + 2*i*nmax;
2010: for (j=0; j<lens[i]; j++) {
2011: local_inidx[count] = values[2*j] - base;
2012: local_inidy[count++] = values[2*j+1];
2013: }
2014: }
2016: /* wait on sends */
2017: if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2018: PetscFree6(rvalues,svalues,lens,recv_waits,send_waits,send_status);
2020: /*
2021: should sort and remove duplicates from local_inidx, local_inidy
2022: */
2024: #if defined(do_it_slow)
2025: /* sort on the from index */
2026: PetscSortIntWithArray(slen,local_inidx,local_inidy);
2027: start = 0;
2028: while (start < slen) {
2029: count = start+1;
2030: last = local_inidx[start];
2031: while (count < slen && last == local_inidx[count]) count++;
2032: if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2033: /* sort on to index */
2034: PetscSortInt(count-start,local_inidy+start);
2035: }
2036: /* remove duplicates; not the most efficient way, but probably good enough */
2037: i = start;
2038: while (i < count-1) {
2039: if (local_inidy[i] != local_inidy[i+1]) {
2040: i++;
2041: } else { /* found a duplicate */
2042: duplicate = PETSC_TRUE;
2043: for (j=i; j<slen-1; j++) {
2044: local_inidx[j] = local_inidx[j+1];
2045: local_inidy[j] = local_inidy[j+1];
2046: }
2047: slen--;
2048: count--;
2049: }
2050: }
2051: start = count;
2052: }
2053: #endif
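/* The duplicate-removal pass above is compiled only when do_it_slow is defined, so in
   the default build 'duplicate' stays PETSC_FALSE and the message below is never
   printed. */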
2054: if (duplicate) {
2055: PetscInfo(0,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");
2056: }
2057: VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,1,ctx);
2058: PetscFree2(local_inidx,local_inidy);
2059: return(0);
2060: }