Actual source code: vpscat.c

  1: #define PETSCVEC_DLL

  3: /*
  4:     Defines parallel vector scatters.
  5: */

 7:  #include "src/vec/is/isimpl.h"
 8:  #include "private/vecimpl.h"
 9:  #include "src/vec/vec/impls/dvecimpl.h"
 10:  #include "src/vec/vec/impls/mpi/pvecimpl.h"
 11:  #include "petscsys.h"

 15: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
 16: {
 17:   VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
 18:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 19:   PetscErrorCode         ierr;
 20:   PetscInt               i;
 21:   PetscMPIInt            rank;
 22:   PetscViewerFormat      format;
 23:   PetscTruth             iascii;

 26:   PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);
 27:   if (iascii) {
 28:     MPI_Comm_rank(ctx->comm,&rank);
 29:     PetscViewerGetFormat(viewer,&format);
 30:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 31:       PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 33:       MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
 34:       MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
 35:       itmp = to->starts[to->n];
 36:       MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
 37:       itmp = from->starts[from->n];
 38:       MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
 39:       MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,ctx->comm);

 41:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 42:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %D\n",nsend_max);
 43:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %D\n",nrecv_max);
 44:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %D\n",(PetscInt)(lensend_max*to->bs*sizeof(PetscScalar)));
 45:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %D\n",(PetscInt)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 46:       PetscViewerASCIIPrintf(viewer,"  Total data sent %D\n",(PetscInt)(alldata*to->bs*sizeof(PetscScalar)));

 48:     } else {
 49:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
 50:       if (to->n) {
 51:         for (i=0; i<to->n; i++){
 52:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %D length = %D to whom %D\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 53:         }
 54:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
 55:         for (i=0; i<to->starts[to->n]; i++){
 56:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,to->indices[i]);
 57:         }
 58:       }

 60:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
 61:       if (from->n) {
 62:         for (i=0; i<from->n; i++){
 63:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %D\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 64:         }

 66:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
 67:         for (i=0; i<from->starts[from->n]; i++){
 68:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,from->indices[i]);
 69:         }
 70:       }
 71:       if (to->local.n) {
 72:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
 73:         for (i=0; i<to->local.n; i++){
 74:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
 75:         }
 76:       }

 78:       PetscViewerFlush(viewer);
 79:     }
 80:   } else {
 81:     SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
 82:   }
 83:   return(0);
 84: }

 86: /* -----------------------------------------------------------------------------------*/
 87: /*
 88:       The next routine determines which part of the local scatter is an
 89:   exact copy of values into their current locations. We check this here so
 90:   that we know we need not perform that portion of the scatter.
 91: */
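/*
    Editor's illustrative example (not part of the original source): with
        to->vslots   = {0, 3, 5}
        from->vslots = {0, 3, 7}
    only the last pair differs, so n_nonmatching = 1, the slots_nonmatching
    arrays hold the single pair (to 5, from 7), and the first two entries need
    not be moved at all.
*/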
 94: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter_Seq_General *to,VecScatter_Seq_General *from)
 95: {
 96:   PetscInt       n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
 98:   PetscInt       *nto_slots,*nfrom_slots,j = 0;
 99: 
101:   for (i=0; i<n; i++) {
102:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
103:   }

105:   if (!n_nonmatching) {
106:     to->nonmatching_computed = PETSC_TRUE;
107:     to->n_nonmatching        = from->n_nonmatching = 0;
108:     PetscInfo1(0,"Reduced %D to 0\n", n);
109:   } else if (n_nonmatching == n) {
110:     to->nonmatching_computed = PETSC_FALSE;
111:     PetscInfo(0,"All values non-matching\n");
112:   } else {
113:     to->nonmatching_computed= PETSC_TRUE;
114:     to->n_nonmatching       = from->n_nonmatching = n_nonmatching;
115:     PetscMalloc(n_nonmatching*sizeof(PetscInt),&nto_slots);
116:     PetscMalloc(n_nonmatching*sizeof(PetscInt),&nfrom_slots);
117:     to->slots_nonmatching   = nto_slots;
118:     from->slots_nonmatching = nfrom_slots;
119:     for (i=0; i<n; i++) {
120:       if (to_slots[i] != from_slots[i]) {
121:         nto_slots[j]   = to_slots[i];
122:         nfrom_slots[j] = from_slots[i];
123:         j++;
124:       }
125:     }
126:     PetscInfo2(0,"Reduced %D to %D\n",n,n_nonmatching);
127:   }
128:   return(0);
129: }

131: /* --------------------------------------------------------------------------------------*/

133: /* -------------------------------------------------------------------------------------*/
136: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
137: {
138:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
139:   VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
140:   PetscErrorCode         ierr;

143:   PetscFree(to->local.vslots);
144:   PetscFree(from->local.vslots);
145:   PetscFree2(to->counts,to->displs);
146:   PetscFree2(from->counts,from->displs);
147:   PetscFree(to->local.slots_nonmatching);
148:   PetscFree(from->local.slots_nonmatching);
149:   PetscFree(to->rev_requests);
150:   PetscFree(from->rev_requests);
151:   PetscFree(to->requests);
152:   PetscFree(from->requests);
153:   PetscFree4(to->values,to->indices,to->starts,to->procs);
154:   PetscFree2(to->sstatus,to->rstatus);
155:   PetscFree4(from->values,from->indices,from->starts,from->procs);
156:   PetscFree(from);
157:   PetscFree(to);
158:   PetscHeaderDestroy(ctx);
159:   return(0);
160: }



164: /* --------------------------------------------------------------------------------------*/
165: /*
 166:     Special optimization to detect whether the local part of the scatter is
 167:     actually a contiguous copy; if so, the scatter routines call PetscMemcpy() instead.
168:  
169: */
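/*
    Editor's illustrative example (not part of the original source): with bs = 2 and
        to->vslots   = {10, 12, 14}
        from->vslots = { 4,  6,  8}
    both slot lists advance by exactly bs at every step, so the local scatter
    reduces to a single PetscMemcpy() of 3*bs*sizeof(PetscScalar) bytes from
    offset 4 (source) to offset 10 (destination).
*/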
172: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
173: {
174:   PetscInt       n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
175:   PetscInt       to_start,from_start;

179:   to_start   = to_slots[0];
180:   from_start = from_slots[0];

182:   for (i=1; i<n; i++) {
183:     to_start   += bs;
184:     from_start += bs;
185:     if (to_slots[i]   != to_start)   return(0);
186:     if (from_slots[i] != from_start) return(0);
187:   }
188:   to->is_copy       = PETSC_TRUE;
189:   to->copy_start    = to_slots[0];
190:   to->copy_length   = bs*sizeof(PetscScalar)*n;
191:   from->is_copy     = PETSC_TRUE;
192:   from->copy_start  = from_slots[0];
193:   from->copy_length = bs*sizeof(PetscScalar)*n;
194:   PetscInfo(0,"Local scatter is a copy, optimizing for it\n");
195:   return(0);
196: }

198: /* --------------------------------------------------------------------------------------*/

202: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
203: {
204:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
205:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
206:   PetscErrorCode         ierr;
207:   PetscInt               ny,bs = in_from->bs;

210:   out->begin     = in->begin;
211:   out->end       = in->end;
212:   out->copy      = in->copy;
213:   out->destroy   = in->destroy;
214:   out->view      = in->view;

216:   /* allocate entire send scatter context */
217:   PetscNew(VecScatter_MPI_General,&out_to);
218:   PetscNew(VecScatter_MPI_General,&out_from);

220:   ny                = in_to->starts[in_to->n];
221:   out_to->n         = in_to->n;
222:   out_to->type      = in_to->type;
223:   out_to->sendfirst = in_to->sendfirst;

225:   PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
226:   PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,
227:                       out_to->n,PetscMPIInt,&out_to->procs);
228:   PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,
229:                       &out_to->rstatus);
230:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
231:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
232:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
233: 
234:   out->todata       = (void*)out_to;
235:   out_to->local.n   = in_to->local.n;
236:   out_to->local.nonmatching_computed = PETSC_FALSE;
237:   out_to->local.n_nonmatching        = 0;
238:   out_to->local.slots_nonmatching    = 0;
239:   if (in_to->local.n) {
240:     PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
241:     PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
242:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
243:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
244:   } else {
245:     out_to->local.vslots   = 0;
246:     out_from->local.vslots = 0;
247:   }

249:   /* allocate entire receive context */
250:   out_from->type      = in_from->type;
251:   ny                  = in_from->starts[in_from->n];
252:   out_from->n         = in_from->n;
253:   out_from->sendfirst = in_from->sendfirst;

255:   PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
256:   PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,
257:                       out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
258:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
259:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
260:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
261:   out->fromdata       = (void*)out_from;
262:   out_from->local.n   = in_from->local.n;
263:   out_from->local.nonmatching_computed = PETSC_FALSE;
264:   out_from->local.n_nonmatching        = 0;
265:   out_from->local.slots_nonmatching    = 0;

267:   /* 
 268:       set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
269:   */
270:   {
271:     PetscMPIInt tag;
272:     MPI_Comm    comm;
273:     PetscInt    *sstarts = out_to->starts,  *rstarts = out_from->starts;
274:     PetscMPIInt *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
275:     PetscInt    i;
276:     PetscTruth  flg;
277:     MPI_Request *swaits  = out_to->requests,*rwaits  = out_from->requests;
278:     MPI_Request *rev_swaits,*rev_rwaits;
279:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

281:     PetscMalloc(in_to->n*sizeof(MPI_Request),&out_to->rev_requests);
282:     PetscMalloc(in_from->n*sizeof(MPI_Request),&out_from->rev_requests);

284:     rev_rwaits = out_to->rev_requests;
285:     rev_swaits = out_from->rev_requests;

287:     out_from->bs = out_to->bs = bs;
288:     tag     = out->tag;
289:     comm    = out->comm;

291:     /* Register the receives that you will use later (sends for scatter reverse) */
292:     for (i=0; i<out_from->n; i++) {
293:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
294:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
295:     }
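    /*
       Editor's note (illustrative, not part of the original source):
       MPI_Recv_init()/MPI_Send_init() only create persistent requests; no
       communication occurs until they are started and completed, e.g.

           MPI_Startall(out_from->n,out_from->requests);
           ...
           MPI_Waitall(out_from->n,out_from->requests,statuses);

       which is why the same buffer Srvalues can be registered here for both
       the forward receive and the reverse send.
    */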

297:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&flg);
298:     if (flg) {
299:       out_to->use_readyreceiver    = PETSC_TRUE;
300:       out_from->use_readyreceiver  = PETSC_TRUE;
301:       for (i=0; i<out_to->n; i++) {
302:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
303:       }
304:       if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
305:       MPI_Barrier(comm);
306:       PetscInfo(0,"Using VecScatter ready receiver mode\n");
307:     } else {
308:       out_to->use_readyreceiver    = PETSC_FALSE;
309:       out_from->use_readyreceiver  = PETSC_FALSE;
310:       flg                          = PETSC_FALSE;
311:       PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
312:       if (flg) {
313:         PetscInfo(0,"Using VecScatter Ssend mode\n");
314:       }
315:       for (i=0; i<out_to->n; i++) {
316:         if (!flg) {
317:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
318:         } else {
319:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
320:         }
321:       }
322:     }
323:     /* Register receives for scatter reverse */
324:     for (i=0; i<out_to->n; i++) {
325:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
326:     }
327:   }

329:   return(0);
330: }
331: /* --------------------------------------------------------------------------------------------------
 332:     Packs the message data into send buffers and unpacks it from receive buffers.

334:     These could be generated automatically. 

336:     Fortran kernels etc. could be used.
337: */
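/*
   Editor's sketch (not part of the original source): the block-size
   specializations below unroll the following generic pattern; Pack_bs is a
   hypothetical name used only to show the structure of the Pack_<bs> routines.
*/
PETSC_STATIC_INLINE void Pack_bs(PetscInt bs,PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i,k,idx;
  for (i=0; i<n; i++) {
    idx = indicesx[i];                     /* scalar offset of the start of block i */
    for (k=0; k<bs; k++) y[k] = x[idx+k];  /* copy one block of bs scalars */
    y += bs;                               /* send buffer entries are contiguous */
  }
}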
338: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
339: {
340:   PetscInt i;
341:   for (i=0; i<n; i++) {
342:     y[i]  = x[indicesx[i]];
343:   }
344: }
345: PETSC_STATIC_INLINE void UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
346: {
347:   PetscInt i;
348:   switch (addv) {
349:   case INSERT_VALUES:
350:     for (i=0; i<n; i++) {
351:       y[indicesy[i]] = x[i];
352:     }
353:     break;
354:   case ADD_VALUES:
355:     for (i=0; i<n; i++) {
356:       y[indicesy[i]] += x[i];
357:     }
358:     break;
359: #if !defined(PETSC_USE_COMPLEX)
360:   case MAX_VALUES:
361:     for (i=0; i<n; i++) {
362:       y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
363:     }
364: #else
365:   case MAX_VALUES:
366: #endif
367:   case NOT_SET_VALUES:
368:     break;
369:   }
370: }

372: PETSC_STATIC_INLINE void Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
373: {
374:   PetscInt i;
375:   switch (addv) {
376:   case INSERT_VALUES:
377:     for (i=0; i<n; i++) {
378:       y[indicesy[i]] = x[indicesx[i]];
379:     }
380:     break;
381:   case ADD_VALUES:
382:     for (i=0; i<n; i++) {
383:       y[indicesy[i]] += x[indicesx[i]];
384:     }
385:     break;
386: #if !defined(PETSC_USE_COMPLEX)
387:   case MAX_VALUES:
388:     for (i=0; i<n; i++) {
389:       y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
390:     }
391: #else
392:   case MAX_VALUES:
393: #endif
394:   case NOT_SET_VALUES:
395:     break;
396:   }
397: }

399:   /* ----------------------------------------------------------------------------------------------- */
400: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
401: {
402:   PetscInt i,idx;

404:   for (i=0; i<n; i++) {
405:     idx   = *indicesx++;
406:     y[0]  = x[idx];
407:     y[1]  = x[idx+1];
408:     y    += 2;
409:   }
410: }
411: PETSC_STATIC_INLINE void UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
412: {
413:   PetscInt i,idy;

415:   switch (addv) {
416:   case INSERT_VALUES:
417:     for (i=0; i<n; i++) {
418:       idy       = *indicesy++;
419:       y[idy]    = x[0];
420:       y[idy+1]  = x[1];
421:       x    += 2;
422:     }
423:     break;
424:   case ADD_VALUES:
425:     for (i=0; i<n; i++) {
426:       idy       = *indicesy++;
427:       y[idy]    += x[0];
428:       y[idy+1]  += x[1];
429:       x    += 2;
430:     }
431:     break;
432: #if !defined(PETSC_USE_COMPLEX)
433:   case MAX_VALUES:
434:     for (i=0; i<n; i++) {
435:       idy       = *indicesy++;
436:       y[idy]    = PetscMax(y[idy],x[0]);
437:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
438:       x    += 2;
439:     }
440: #else
441:   case MAX_VALUES:
442: #endif
443:   case NOT_SET_VALUES:
444:     break;
445:   }
446: }

448: PETSC_STATIC_INLINE void Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
449: {
450:   PetscInt i,idx,idy;

452:   switch (addv) {
453:   case INSERT_VALUES:
454:     for (i=0; i<n; i++) {
455:       idx       = *indicesx++;
456:       idy       = *indicesy++;
457:       y[idy]    = x[idx];
458:       y[idy+1]  = x[idx+1];
459:     }
460:     break;
461:   case ADD_VALUES:
462:     for (i=0; i<n; i++) {
463:       idx       = *indicesx++;
464:       idy       = *indicesy++;
465:       y[idy]    += x[idx];
466:       y[idy+1]  += x[idx+1];
467:     }
468:     break;
469: #if !defined(PETSC_USE_COMPLEX)
470:   case MAX_VALUES:
471:     for (i=0; i<n; i++) {
472:       idx       = *indicesx++;
473:       idy       = *indicesy++;
474:       y[idy]    = PetscMax(y[idy],x[idx]);
475:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
476:     }
477: #else
478:   case MAX_VALUES:
479: #endif
480:   case NOT_SET_VALUES:
481:     break;
482:   }
483: }
484:   /* ----------------------------------------------------------------------------------------------- */
485: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
486: {
487:   PetscInt i,idx;

489:   for (i=0; i<n; i++) {
490:     idx   = *indicesx++;
491:     y[0]  = x[idx];
492:     y[1]  = x[idx+1];
493:     y[2]  = x[idx+2];
494:     y    += 3;
495:   }
496: }
497: PETSC_STATIC_INLINE void UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
498: {
499:   PetscInt i,idy;

501:   switch (addv) {
502:   case INSERT_VALUES:
503:     for (i=0; i<n; i++) {
504:       idy       = *indicesy++;
505:       y[idy]    = x[0];
506:       y[idy+1]  = x[1];
507:       y[idy+2]  = x[2];
508:       x    += 3;
509:     }
510:     break;
511:   case ADD_VALUES:
512:     for (i=0; i<n; i++) {
513:       idy       = *indicesy++;
514:       y[idy]    += x[0];
515:       y[idy+1]  += x[1];
516:       y[idy+2]  += x[2];
517:       x    += 3;
518:     }
519:     break;
520: #if !defined(PETSC_USE_COMPLEX)
521:   case MAX_VALUES:
522:     for (i=0; i<n; i++) {
523:       idy       = *indicesy++;
524:       y[idy]    = PetscMax(y[idy],x[0]);
525:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
526:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
527:       x    += 3;
528:     }
529: #else
530:   case MAX_VALUES:
531: #endif
532:   case NOT_SET_VALUES:
533:     break;
534:   }
535: }

537: PETSC_STATIC_INLINE void Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
538: {
539:   PetscInt i,idx,idy;

541:   switch (addv) {
542:   case INSERT_VALUES:
543:     for (i=0; i<n; i++) {
544:       idx       = *indicesx++;
545:       idy       = *indicesy++;
546:       y[idy]    = x[idx];
547:       y[idy+1]  = x[idx+1];
548:       y[idy+2]  = x[idx+2];
549:     }
550:     break;
551:   case ADD_VALUES:
552:     for (i=0; i<n; i++) {
553:       idx       = *indicesx++;
554:       idy       = *indicesy++;
555:       y[idy]    += x[idx];
556:       y[idy+1]  += x[idx+1];
557:       y[idy+2]  += x[idx+2];
558:     }
559:     break;
560: #if !defined(PETSC_USE_COMPLEX)
561:   case MAX_VALUES:
562:     for (i=0; i<n; i++) {
563:       idx       = *indicesx++;
564:       idy       = *indicesy++;
565:       y[idy]    = PetscMax(y[idy],x[idx]);
566:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
567:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
568:     }
569: #else
570:   case MAX_VALUES:
571: #endif
572:   case NOT_SET_VALUES:
573:     break;
574:   }
575: }
576:   /* ----------------------------------------------------------------------------------------------- */
577: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
578: {
579:   PetscInt i,idx;

581:   for (i=0; i<n; i++) {
582:     idx   = *indicesx++;
583:     y[0]  = x[idx];
584:     y[1]  = x[idx+1];
585:     y[2]  = x[idx+2];
586:     y[3]  = x[idx+3];
587:     y    += 4;
588:   }
589: }
590: PETSC_STATIC_INLINE void UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
591: {
592:   PetscInt i,idy;

594:   switch (addv) {
595:   case INSERT_VALUES:
596:     for (i=0; i<n; i++) {
597:       idy       = *indicesy++;
598:       y[idy]    = x[0];
599:       y[idy+1]  = x[1];
600:       y[idy+2]  = x[2];
601:       y[idy+3]  = x[3];
602:       x    += 4;
603:     }
604:     break;
605:   case ADD_VALUES:
606:     for (i=0; i<n; i++) {
607:       idy       = *indicesy++;
608:       y[idy]    += x[0];
609:       y[idy+1]  += x[1];
610:       y[idy+2]  += x[2];
611:       y[idy+3]  += x[3];
612:       x    += 4;
613:     }
614:     break;
615: #if !defined(PETSC_USE_COMPLEX)
616:   case MAX_VALUES:
617:     for (i=0; i<n; i++) {
618:       idy       = *indicesy++;
619:       y[idy]    = PetscMax(y[idy],x[0]);
620:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
621:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
622:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
623:       x    += 4;
624:     }
625: #else
626:   case MAX_VALUES:
627: #endif
628:   case NOT_SET_VALUES:
629:     break;
630:   }
631: }

633: PETSC_STATIC_INLINE void Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
634: {
635:   PetscInt i,idx,idy;

637:   switch (addv) {
638:   case INSERT_VALUES:
639:     for (i=0; i<n; i++) {
640:       idx       = *indicesx++;
641:       idy       = *indicesy++;
642:       y[idy]    = x[idx];
643:       y[idy+1]  = x[idx+1];
644:       y[idy+2]  = x[idx+2];
645:       y[idy+3]  = x[idx+3];
646:     }
647:     break;
648:   case ADD_VALUES:
649:     for (i=0; i<n; i++) {
650:       idx       = *indicesx++;
651:       idy       = *indicesy++;
652:       y[idy]    += x[idx];
653:       y[idy+1]  += x[idx+1];
654:       y[idy+2]  += x[idx+2];
655:       y[idy+3]  += x[idx+3];
656:     }
657:     break;
658: #if !defined(PETSC_USE_COMPLEX)
659:   case MAX_VALUES:
660:     for (i=0; i<n; i++) {
661:       idx       = *indicesx++;
662:       idy       = *indicesy++;
663:       y[idy]    = PetscMax(y[idy],x[idx]);
664:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
665:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
666:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
667:     }
668: #else
669:   case MAX_VALUES:
670: #endif
671:   case NOT_SET_VALUES:
672:     break;
673:   }
674: }
675:   /* ----------------------------------------------------------------------------------------------- */
676: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
677: {
678:   PetscInt i,idx;

680:   for (i=0; i<n; i++) {
681:     idx   = *indicesx++;
682:     y[0]  = x[idx];
683:     y[1]  = x[idx+1];
684:     y[2]  = x[idx+2];
685:     y[3]  = x[idx+3];
686:     y[4]  = x[idx+4];
687:     y    += 5;
688:   }
689: }
690: PETSC_STATIC_INLINE void UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
691: {
692:   PetscInt i,idy;

694:   switch (addv) {
695:   case INSERT_VALUES:
696:     for (i=0; i<n; i++) {
697:       idy       = *indicesy++;
698:       y[idy]    = x[0];
699:       y[idy+1]  = x[1];
700:       y[idy+2]  = x[2];
701:       y[idy+3]  = x[3];
702:       y[idy+4]  = x[4];
703:       x    += 5;
704:     }
705:     break;
706:   case ADD_VALUES:
707:     for (i=0; i<n; i++) {
708:       idy       = *indicesy++;
709:       y[idy]    += x[0];
710:       y[idy+1]  += x[1];
711:       y[idy+2]  += x[2];
712:       y[idy+3]  += x[3];
713:       y[idy+4]  += x[4];
714:       x    += 5;
715:     }
716:     break;
717: #if !defined(PETSC_USE_COMPLEX)
718:   case MAX_VALUES:
719:     for (i=0; i<n; i++) {
720:       idy       = *indicesy++;
721:       y[idy]    = PetscMax(y[idy],x[0]);
722:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
723:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
724:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
725:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
726:       x    += 5;
727:     }
728: #else
729:   case MAX_VALUES:
730: #endif
731:   case NOT_SET_VALUES:
732:     break;
733:   }
734: }

736: PETSC_STATIC_INLINE void Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
737: {
738:   PetscInt i,idx,idy;

740:   switch (addv) {
741:   case INSERT_VALUES:
742:     for (i=0; i<n; i++) {
743:       idx       = *indicesx++;
744:       idy       = *indicesy++;
745:       y[idy]    = x[idx];
746:       y[idy+1]  = x[idx+1];
747:       y[idy+2]  = x[idx+2];
748:       y[idy+3]  = x[idx+3];
749:       y[idy+4]  = x[idx+4];
750:     }
751:     break;
752:   case ADD_VALUES:
753:     for (i=0; i<n; i++) {
754:       idx       = *indicesx++;
755:       idy       = *indicesy++;
756:       y[idy]    += x[idx];
757:       y[idy+1]  += x[idx+1];
758:       y[idy+2]  += x[idx+2];
759:       y[idy+3]  += x[idx+3];
760:       y[idy+4]  += x[idx+4];
761:     }
762:     break;
763: #if !defined(PETSC_USE_COMPLEX)
764:   case MAX_VALUES:
765:     for (i=0; i<n; i++) {
766:       idx       = *indicesx++;
767:       idy       = *indicesy++;
768:       y[idy]    = PetscMax(y[idy],x[idx]);
769:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
770:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
771:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
772:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
773:     }
774: #else
775:   case MAX_VALUES:
776: #endif
777:   case NOT_SET_VALUES:
778:     break;
779:   }
780: }
781:   /* ----------------------------------------------------------------------------------------------- */
782: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
783: {
784:   PetscInt i,idx;

786:   for (i=0; i<n; i++) {
787:     idx   = *indicesx++;
788:     y[0]  = x[idx];
789:     y[1]  = x[idx+1];
790:     y[2]  = x[idx+2];
791:     y[3]  = x[idx+3];
792:     y[4]  = x[idx+4];
793:     y[5]  = x[idx+5];
794:     y    += 6;
795:   }
796: }
797: PETSC_STATIC_INLINE void UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
798: {
799:   PetscInt i,idy;

801:   switch (addv) {
802:   case INSERT_VALUES:
803:     for (i=0; i<n; i++) {
804:       idy       = *indicesy++;
805:       y[idy]    = x[0];
806:       y[idy+1]  = x[1];
807:       y[idy+2]  = x[2];
808:       y[idy+3]  = x[3];
809:       y[idy+4]  = x[4];
810:       y[idy+5]  = x[5];
811:       x    += 6;
812:     }
813:     break;
814:   case ADD_VALUES:
815:     for (i=0; i<n; i++) {
816:       idy       = *indicesy++;
817:       y[idy]    += x[0];
818:       y[idy+1]  += x[1];
819:       y[idy+2]  += x[2];
820:       y[idy+3]  += x[3];
821:       y[idy+4]  += x[4];
822:       y[idy+5]  += x[5];
823:       x    += 6;
824:     }
825:     break;
826: #if !defined(PETSC_USE_COMPLEX)
827:   case MAX_VALUES:
828:     for (i=0; i<n; i++) {
829:       idy       = *indicesy++;
830:       y[idy]    = PetscMax(y[idy],x[0]);
831:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
832:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
833:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
834:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
835:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
836:       x    += 6;
837:     }
838: #else
839:   case MAX_VALUES:
840: #endif
841:   case NOT_SET_VALUES:
842:     break;
843:   }
844: }

846: PETSC_STATIC_INLINE void Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
847: {
848:   PetscInt i,idx,idy;

850:   switch (addv) {
851:   case INSERT_VALUES:
852:     for (i=0; i<n; i++) {
853:       idx       = *indicesx++;
854:       idy       = *indicesy++;
855:       y[idy]    = x[idx];
856:       y[idy+1]  = x[idx+1];
857:       y[idy+2]  = x[idx+2];
858:       y[idy+3]  = x[idx+3];
859:       y[idy+4]  = x[idx+4];
860:       y[idy+5]  = x[idx+5];
861:     }
862:     break;
863:   case ADD_VALUES:
864:     for (i=0; i<n; i++) {
865:       idx       = *indicesx++;
866:       idy       = *indicesy++;
867:       y[idy]    += x[idx];
868:       y[idy+1]  += x[idx+1];
869:       y[idy+2]  += x[idx+2];
870:       y[idy+3]  += x[idx+3];
871:       y[idy+4]  += x[idx+4];
872:       y[idy+5]  += x[idx+5];
873:     }
874:     break;
875: #if !defined(PETSC_USE_COMPLEX)
876:   case MAX_VALUES:
877:     for (i=0; i<n; i++) {
878:       idx       = *indicesx++;
879:       idy       = *indicesy++;
880:       y[idy]    = PetscMax(y[idy],x[idx]);
881:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
882:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
883:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
884:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
885:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
886:     }
887: #else
888:   case MAX_VALUES:
889: #endif
890:   case NOT_SET_VALUES:
891:     break;
892:   }
893: }
894:   /* ----------------------------------------------------------------------------------------------- */
895: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
896: {
897:   PetscInt i,idx;

899:   for (i=0; i<n; i++) {
900:     idx   = *indicesx++;
901:     y[0]  = x[idx];
902:     y[1]  = x[idx+1];
903:     y[2]  = x[idx+2];
904:     y[3]  = x[idx+3];
905:     y[4]  = x[idx+4];
906:     y[5]  = x[idx+5];
907:     y[6]  = x[idx+6];
908:     y    += 7;
909:   }
910: }
911: PETSC_STATIC_INLINE void UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
912: {
913:   PetscInt i,idy;

915:   switch (addv) {
916:   case INSERT_VALUES:
917:     for (i=0; i<n; i++) {
918:       idy       = *indicesy++;
919:       y[idy]    = x[0];
920:       y[idy+1]  = x[1];
921:       y[idy+2]  = x[2];
922:       y[idy+3]  = x[3];
923:       y[idy+4]  = x[4];
924:       y[idy+5]  = x[5];
925:       y[idy+6]  = x[6];
926:       x    += 7;
927:     }
928:     break;
929:   case ADD_VALUES:
930:     for (i=0; i<n; i++) {
931:       idy       = *indicesy++;
932:       y[idy]    += x[0];
933:       y[idy+1]  += x[1];
934:       y[idy+2]  += x[2];
935:       y[idy+3]  += x[3];
936:       y[idy+4]  += x[4];
937:       y[idy+5]  += x[5];
938:       y[idy+6]  += x[6];
939:       x    += 7;
940:     }
941:     break;
942: #if !defined(PETSC_USE_COMPLEX)
943:   case MAX_VALUES:
944:     for (i=0; i<n; i++) {
945:       idy       = *indicesy++;
946:       y[idy]    = PetscMax(y[idy],x[0]);
947:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
948:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
949:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
950:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
951:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
952:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
953:       x    += 7;
954:     }
955: #else
956:   case MAX_VALUES:
957: #endif
958:   case NOT_SET_VALUES:
959:     break;
960:   }
961: }

963: PETSC_STATIC_INLINE void Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
964: {
965:   PetscInt i,idx,idy;

967:   switch (addv) {
968:   case INSERT_VALUES:
969:     for (i=0; i<n; i++) {
970:       idx       = *indicesx++;
971:       idy       = *indicesy++;
972:       y[idy]    = x[idx];
973:       y[idy+1]  = x[idx+1];
974:       y[idy+2]  = x[idx+2];
975:       y[idy+3]  = x[idx+3];
976:       y[idy+4]  = x[idx+4];
977:       y[idy+5]  = x[idx+5];
978:       y[idy+6]  = x[idx+6];
979:     }
980:     break;
981:   case ADD_VALUES:
982:     for (i=0; i<n; i++) {
983:       idx       = *indicesx++;
984:       idy       = *indicesy++;
985:       y[idy]    += x[idx];
986:       y[idy+1]  += x[idx+1];
987:       y[idy+2]  += x[idx+2];
988:       y[idy+3]  += x[idx+3];
989:       y[idy+4]  += x[idx+4];
990:       y[idy+5]  += x[idx+5];
991:       y[idy+6]  += x[idx+6];
992:     }
993:     break;
994: #if !defined(PETSC_USE_COMPLEX)
995:   case MAX_VALUES:
996:     for (i=0; i<n; i++) {
997:       idx       = *indicesx++;
998:       idy       = *indicesy++;
999:       y[idy]    = PetscMax(y[idy],x[idx]);
1000:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1001:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1002:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1003:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1004:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1005:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1006:     }
1007: #else
1008:   case MAX_VALUES:
1009: #endif
1010:   case NOT_SET_VALUES:
1011:     break;
1012:   }
1013: }
1014:   /* ----------------------------------------------------------------------------------------------- */
1015: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1016: {
1017:   PetscInt i,idx;

1019:   for (i=0; i<n; i++) {
1020:     idx   = *indicesx++;
1021:     y[0]  = x[idx];
1022:     y[1]  = x[idx+1];
1023:     y[2]  = x[idx+2];
1024:     y[3]  = x[idx+3];
1025:     y[4]  = x[idx+4];
1026:     y[5]  = x[idx+5];
1027:     y[6]  = x[idx+6];
1028:     y[7]  = x[idx+7];
1029:     y    += 8;
1030:   }
1031: }
1032: PETSC_STATIC_INLINE void UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1033: {
1034:   PetscInt i,idy;

1036:   switch (addv) {
1037:   case INSERT_VALUES:
1038:     for (i=0; i<n; i++) {
1039:       idy       = *indicesy++;
1040:       y[idy]    = x[0];
1041:       y[idy+1]  = x[1];
1042:       y[idy+2]  = x[2];
1043:       y[idy+3]  = x[3];
1044:       y[idy+4]  = x[4];
1045:       y[idy+5]  = x[5];
1046:       y[idy+6]  = x[6];
1047:       y[idy+7]  = x[7];
1048:       x    += 8;
1049:     }
1050:     break;
1051:   case ADD_VALUES:
1052:     for (i=0; i<n; i++) {
1053:       idy       = *indicesy++;
1054:       y[idy]    += x[0];
1055:       y[idy+1]  += x[1];
1056:       y[idy+2]  += x[2];
1057:       y[idy+3]  += x[3];
1058:       y[idy+4]  += x[4];
1059:       y[idy+5]  += x[5];
1060:       y[idy+6]  += x[6];
1061:       y[idy+7]  += x[7];
1062:       x    += 8;
1063:     }
1064:     break;
1065: #if !defined(PETSC_USE_COMPLEX)
1066:   case MAX_VALUES:
1067:     for (i=0; i<n; i++) {
1068:       idy       = *indicesy++;
1069:       y[idy]    = PetscMax(y[idy],x[0]);
1070:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1071:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1072:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1073:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1074:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1075:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1076:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1077:       x    += 8;
1078:     }
1079: #else
1080:   case MAX_VALUES:
1081: #endif
1082:   case NOT_SET_VALUES:
1083:     break;
1084:   }
1085: }

1087: PETSC_STATIC_INLINE void Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1088: {
1089:   PetscInt i,idx,idy;

1091:   switch (addv) {
1092:   case INSERT_VALUES:
1093:     for (i=0; i<n; i++) {
1094:       idx       = *indicesx++;
1095:       idy       = *indicesy++;
1096:       y[idy]    = x[idx];
1097:       y[idy+1]  = x[idx+1];
1098:       y[idy+2]  = x[idx+2];
1099:       y[idy+3]  = x[idx+3];
1100:       y[idy+4]  = x[idx+4];
1101:       y[idy+5]  = x[idx+5];
1102:       y[idy+6]  = x[idx+6];
1103:       y[idy+7]  = x[idx+7];
1104:     }
1105:     break;
1106:   case ADD_VALUES:
1107:     for (i=0; i<n; i++) {
1108:       idx       = *indicesx++;
1109:       idy       = *indicesy++;
1110:       y[idy]    += x[idx];
1111:       y[idy+1]  += x[idx+1];
1112:       y[idy+2]  += x[idx+2];
1113:       y[idy+3]  += x[idx+3];
1114:       y[idy+4]  += x[idx+4];
1115:       y[idy+5]  += x[idx+5];
1116:       y[idy+6]  += x[idx+6];
1117:       y[idy+7]  += x[idx+7];
1118:     }
1119:     break;
1120: #if !defined(PETSC_USE_COMPLEX)
1121:   case MAX_VALUES:
1122:     for (i=0; i<n; i++) {
1123:       idx       = *indicesx++;
1124:       idy       = *indicesy++;
1125:       y[idy]    = PetscMax(y[idy],x[idx]);
1126:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1127:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1128:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1129:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1130:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1131:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1132:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1133:     }
1134: #else
1135:   case MAX_VALUES:
1136: #endif
1137:   case NOT_SET_VALUES:
1138:     break;
1139:   }
1140: }

1142:   /* ----------------------------------------------------------------------------------------------- */
1143: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1144: {
1145:   PetscInt i,idx;

1147:   for (i=0; i<n; i++) {
1148:     idx   = *indicesx++;
1149:     y[0]  = x[idx];
1150:     y[1]  = x[idx+1];
1151:     y[2]  = x[idx+2];
1152:     y[3]  = x[idx+3];
1153:     y[4]  = x[idx+4];
1154:     y[5]  = x[idx+5];
1155:     y[6]  = x[idx+6];
1156:     y[7]  = x[idx+7];
1157:     y[8]  = x[idx+8];
1158:     y[9]  = x[idx+9];
1159:     y[10] = x[idx+10];
1160:     y[11] = x[idx+11];
1161:     y    += 12;
1162:   }
1163: }
1164: PETSC_STATIC_INLINE void UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1165: {
1166:   PetscInt i,idy;

1168:   switch (addv) {
1169:   case INSERT_VALUES:
1170:     for (i=0; i<n; i++) {
1171:       idy       = *indicesy++;
1172:       y[idy]    = x[0];
1173:       y[idy+1]  = x[1];
1174:       y[idy+2]  = x[2];
1175:       y[idy+3]  = x[3];
1176:       y[idy+4]  = x[4];
1177:       y[idy+5]  = x[5];
1178:       y[idy+6]  = x[6];
1179:       y[idy+7]  = x[7];
1180:       y[idy+8]  = x[8];
1181:       y[idy+9]  = x[9];
1182:       y[idy+10] = x[10];
1183:       y[idy+11] = x[11];
1184:       x    += 12;
1185:     }
1186:     break;
1187:   case ADD_VALUES:
1188:     for (i=0; i<n; i++) {
1189:       idy       = *indicesy++;
1190:       y[idy]    += x[0];
1191:       y[idy+1]  += x[1];
1192:       y[idy+2]  += x[2];
1193:       y[idy+3]  += x[3];
1194:       y[idy+4]  += x[4];
1195:       y[idy+5]  += x[5];
1196:       y[idy+6]  += x[6];
1197:       y[idy+7]  += x[7];
1198:       y[idy+8]  += x[8];
1199:       y[idy+9]  += x[9];
1200:       y[idy+10] += x[10];
1201:       y[idy+11] += x[11];
1202:       x    += 12;
1203:     }
1204:     break;
1205: #if !defined(PETSC_USE_COMPLEX)
1206:   case MAX_VALUES:
1207:     for (i=0; i<n; i++) {
1208:       idy       = *indicesy++;
1209:       y[idy]    = PetscMax(y[idy],x[0]);
1210:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1211:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1212:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1213:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1214:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1215:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1216:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1217:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1218:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1219:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1220:       y[idy+11] = PetscMax(y[idy+11],x[11]);
1221:       x    += 12;
1222:     }
1223: #else
1224:   case MAX_VALUES:
1225: #endif
1226:   case NOT_SET_VALUES:
1227:     break;
1228:   }
1229: }

1231: PETSC_STATIC_INLINE void Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1232: {
1233:   PetscInt i,idx,idy;

1235:   switch (addv) {
1236:   case INSERT_VALUES:
1237:     for (i=0; i<n; i++) {
1238:       idx       = *indicesx++;
1239:       idy       = *indicesy++;
1240:       y[idy]    = x[idx];
1241:       y[idy+1]  = x[idx+1];
1242:       y[idy+2]  = x[idx+2];
1243:       y[idy+3]  = x[idx+3];
1244:       y[idy+4]  = x[idx+4];
1245:       y[idy+5]  = x[idx+5];
1246:       y[idy+6]  = x[idx+6];
1247:       y[idy+7]  = x[idx+7];
1248:       y[idy+8]  = x[idx+8];
1249:       y[idy+9]  = x[idx+9];
1250:       y[idy+10] = x[idx+10];
1251:       y[idy+11] = x[idx+11];
1252:     }
1253:     break;
1254:   case ADD_VALUES:
1255:     for (i=0; i<n; i++) {
1256:       idx       = *indicesx++;
1257:       idy       = *indicesy++;
1258:       y[idy]    += x[idx];
1259:       y[idy+1]  += x[idx+1];
1260:       y[idy+2]  += x[idx+2];
1261:       y[idy+3]  += x[idx+3];
1262:       y[idy+4]  += x[idx+4];
1263:       y[idy+5]  += x[idx+5];
1264:       y[idy+6]  += x[idx+6];
1265:       y[idy+7]  += x[idx+7];
1266:       y[idy+8]  += x[idx+8];
1267:       y[idy+9]  += x[idx+9];
1268:       y[idy+10] += x[idx+10];
1269:       y[idy+11] += x[idx+11];
1270:     }
1271:     break;
1272: #if !defined(PETSC_USE_COMPLEX)
1273:   case MAX_VALUES:
1274:     for (i=0; i<n; i++) {
1275:       idx       = *indicesx++;
1276:       idy       = *indicesy++;
1277:       y[idy]    = PetscMax(y[idy],x[idx]);
1278:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1279:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1280:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1281:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1282:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1283:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1284:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1285:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1286:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1287:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1288:       y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1289:     }
1290: #else
1291:   case MAX_VALUES:
1292: #endif
1293:   case NOT_SET_VALUES:
1294:     break;
1295:   }
1296: }

1298: /* Create the VecScatterBegin/End_P for our chosen block sizes */
 1299: #define BS 1
 1300:  #include "src/vec/vec/utils/vpscat.h"
 1301: #define BS 2
 1302:  #include "src/vec/vec/utils/vpscat.h"
 1303: #define BS 3
 1304:  #include "src/vec/vec/utils/vpscat.h"
 1305: #define BS 4
 1306:  #include "src/vec/vec/utils/vpscat.h"
 1307: #define BS 5
 1308:  #include "src/vec/vec/utils/vpscat.h"
 1309: #define BS 6
 1310:  #include "src/vec/vec/utils/vpscat.h"
 1311: #define BS 7
 1312:  #include "src/vec/vec/utils/vpscat.h"
 1313: #define BS 8
 1314:  #include "src/vec/vec/utils/vpscat.h"
 1315: #define BS 12
 1316:  #include "src/vec/vec/utils/vpscat.h"
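/*
   Editor's note (assumption; vpscat.h itself is not shown here): each inclusion
   above presumably uses the BS macro to instantiate block-size-specific
   VecScatterBegin/End routines that call the matching Pack_<BS>/UnPack_<BS>/
   Scatter_<BS> kernels defined in this file, then undefines BS so the next
   block size can be selected.
*/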

1318: /* ---------------------------------------------------------------------------------*/

1322: PetscErrorCode VecScatterDestroy_PtoP_X(VecScatter ctx)
1323: {
1324:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
1325:   VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
1326:   PetscErrorCode         ierr;
1327:   PetscInt               i;

1330:   if (to->use_readyreceiver) {
1331:     /*
 1332:        Since we have already posted the receives we must cancel them before freeing
1333:        the requests
1334:     */
1335:     for (i=0; i<from->n; i++) {
1336:       MPI_Cancel(from->requests+i);
1337:     }
1338:   }

1340:   if (to->use_alltoallw) {
1341:     PetscFree3(to->wcounts,to->wdispls,to->types);
1342:     PetscFree3(from->wcounts,from->wdispls,from->types);
1343:   }

1345:   if (to->use_alltoallv) {
1346:     PetscFree2(to->counts,to->displs);
1347:     PetscFree2(from->counts,from->displs);
1348:   }

1350:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
1351:   /* 
1352:      IBM's PE version of MPI has a bug where freeing these guys will screw up later
1353:      message passing.
1354:   */
1355: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
1356:   for (i=0; i<to->n; i++) {
1357:     MPI_Request_free(to->requests + i);
1358:     MPI_Request_free(to->rev_requests + i);
1359:   }

1361:   /*
 1362:       MPICH could not properly cancel requests, so with ready-receiver mode we
 1363:     cannot free the requests. This may be fixed now; if not, put the following
 1364:     code inside an if (!to->use_readyreceiver) { } block.
1365:   */
1366:   for (i=0; i<from->n; i++) {
1367:     MPI_Request_free(from->requests + i);
1368:     MPI_Request_free(from->rev_requests + i);
1369:   }
1370: #endif
1371: 
1372:   VecScatterDestroy_PtoP(ctx);
1373:   return(0);
1374: }

1376: /* ==========================================================================================*/

1378: /*              create parallel to sequential scatter context                           */

1380: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *,VecScatter_MPI_General *,VecScatter);

1384: PetscErrorCode VecScatterCreateLocal_PtoS(int nsends,int sendSizes[],int sendProcs[],int sendIdx[],int nrecvs,int recvSizes[],int recvProcs[],int recvIdx[],PetscInt bs,VecScatter ctx)
1385: {
1386:   VecScatter_MPI_General *from, *to;
1387:   PetscInt       sendSize, recvSize;
1388:   PetscInt       n, i;

1391:   /* allocate entire send scatter context */
1392:   PetscNew(VecScatter_MPI_General,&to);
1393:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1394:   to->n = nsends;
1395:   for(n = 0, sendSize = 0; n < to->n; n++) {sendSize += sendSizes[n];}
1396:   PetscMalloc(to->n*sizeof(MPI_Request),&to->requests);
1397:   PetscMalloc4(bs*sendSize,PetscScalar,&to->values,
1398:                       sendSize,PetscInt,&to->indices,
1399:                       to->n+1,PetscInt,&to->starts,
1400:                       to->n,PetscMPIInt,&to->procs);
1401:   PetscMalloc2(PetscMax(to->n,nrecvs),MPI_Status,&to->sstatus,
1402:                       PetscMax(to->n,nrecvs),MPI_Status,&to->rstatus);
1403:   to->starts[0] = 0;
1404:   for(n = 0; n < to->n; n++) {
1405:     to->starts[n+1] = to->starts[n] + sendSizes[n];
1406:     to->procs[n] = sendProcs[n];
1407:     for(i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) {
1408:       to->indices[i] = sendIdx[i];
1409:     }
1410:   }
1411:   ctx->todata = (void *) to;

1413:   /* allocate entire receive scatter context */
1414:   PetscNew(VecScatter_MPI_General,&from);
1415:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst);
1416:   from->n = nrecvs;
1417:   for(n = 0, recvSize = 0; n < from->n; n++) {recvSize += recvSizes[n];}
1418:   PetscMalloc(from->n*sizeof(MPI_Request),&from->requests);
1419:   PetscMalloc4(bs*recvSize,PetscScalar,&from->values,
1420:                       recvSize,PetscInt,&from->indices,
1421:                       from->n+1,PetscInt,&from->starts,
1422:                       from->n,PetscMPIInt,&from->procs);
1423:   from->starts[0] = 0;
1424:   for(n = 0; n < from->n; n++) {
1425:     from->starts[n+1] = from->starts[n] + recvSizes[n];
1426:     from->procs[n] = recvProcs[n];
1427:     for(i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) {
1428:       from->indices[i] = recvIdx[i];
1429:     }
1430:   }
1431:   ctx->fromdata = (void *)from;

1433:   /* No local scatter optimization */
1434:   from->local.n      = 0;
1435:   from->local.vslots = 0;
1436:   to->local.n        = 0;
1437:   to->local.vslots   = 0;
1438:   from->local.nonmatching_computed = PETSC_FALSE;
1439:   from->local.n_nonmatching        = 0;
1440:   from->local.slots_nonmatching    = 0;
1441:   to->local.nonmatching_computed   = PETSC_FALSE;
1442:   to->local.n_nonmatching          = 0;
1443:   to->local.slots_nonmatching      = 0;

1445:   from->type = VEC_SCATTER_MPI_GENERAL;
1446:   to->type   = VEC_SCATTER_MPI_GENERAL;
1447:   from->bs = bs;
1448:   to->bs   = bs;

1450:   VecScatterCreateCommon_PtoS(from, to, ctx);
1451:   return(0);
1452: }

1454: /*
1455:    bs indicates how many elements there are in each block. Normally this would be 1.
1456: */
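/*
   Editor's note: for example, with bs = 3 each stored index marks the first of
   three consecutive scalars that are packed, sent, and unpacked together as one block.
*/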
1459: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,PetscInt *inidx,PetscInt ny,PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1460: {
1461:   VecScatter_MPI_General *from,*to;
1462:   PetscMPIInt            size,rank,imdex,tag,n;
1463:   PetscInt               *source = PETSC_NULL,*lens = PETSC_NULL,*owners = PETSC_NULL;
1464:   PetscInt               *lowner = PETSC_NULL,*start = PETSC_NULL,lengthy,lengthx;
1465:   PetscInt               *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs;
1466:   PetscInt               *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1467:   PetscInt               *rvalues,*svalues,base,nmax,*values,*indx,nprocslocal;
1468:   MPI_Comm               comm;
1469:   MPI_Request            *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1470:   MPI_Status             recv_status,*send_status;
1471:   PetscErrorCode         ierr;

1474:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1475:   PetscObjectGetComm((PetscObject)xin,&comm);
1476:   MPI_Comm_rank(comm,&rank);
1477:   MPI_Comm_size(comm,&size);
1478:   owners = xin->map.range;
1479:   VecGetSize(yin,&lengthy);
1480:   VecGetSize(xin,&lengthx);

1482:   /*  first count number of contributors to each processor */
1483:   PetscMalloc2(2*size,PetscInt,&nprocs,nx,PetscInt,&owner);
1484:   PetscMemzero(nprocs,2*size*sizeof(PetscInt));
1485:   PetscSortIntWithArray(nx,inidx,inidy);
1486:   if (nx > 0 && inidx[0] < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Parallel index %D is negative",inidx[0]);
 1487:   if (nx > 0 && inidx[nx-1] > lengthx) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"Parallel index %D is larger than vector length %D",inidx[nx-1],lengthx);
1488:   j      = 0;
1489:   nsends = 0;
1490:   for (i=0; i<nx; i++) {
1491:     idx = inidx[i];
1492:     for (; j<size; j++) {
1493:       if (idx < owners[j+1]) {
1494:         if (!nprocs[2*j]++) nsends++;
1495:         nprocs[2*j+1] = 1;
1496:         owner[i] = j;
1497:         break;
1498:       }
1499:     }
1500:   }
1501:   nprocslocal    = nprocs[2*rank];
1502:   nprocs[2*rank] = nprocs[2*rank+1] = 0;
1503:   if (nprocslocal) nsends--;

1505:   /* inform other processors of number of messages and max length*/
1506:   PetscMaxSum(comm,nprocs,&nmax,&nrecvs);

1508:   /* post receives:   */
1509:   PetscMalloc4(nrecvs*nmax,PetscInt,&rvalues,nrecvs,PetscInt,&lens,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);
1510:   for (i=0; i<nrecvs; i++) {
1511:     MPI_Irecv((rvalues+nmax*i),nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1512:   }

1514:   /* do sends:
 1515:      1) starts[i] gives the starting index in svalues for the entries going to
 1516:      the ith processor
1517:   */
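  /*
     Editor's illustrative example (not part of the original source): on 4
     processes with nprocs[2*j] = {0, 3, 0, 2} entries destined for each rank,
     starts becomes {0, 0, 3, 3}, so svalues[0..2] are sent to rank 1 and
     svalues[3..4] to rank 3.
  */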
1518:   PetscMalloc3(nx,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size+1,PetscInt,&starts);
1519:   starts[0]  = 0;
1520:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1521:   for (i=0; i<nx; i++) {
1522:     if (owner[i] != rank) {
1523:       svalues[starts[owner[i]]++] = inidx[i];
1524:     }
1525:   }

1527:   starts[0] = 0;
1528:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1529:   count = 0;
1530:   for (i=0; i<size; i++) {
1531:     if (nprocs[2*i+1]) {
1532:       MPI_Isend(svalues+starts[i],nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count++);
1533:     }
1534:   }

1536:   /*  wait on receives */
1537:   count  = nrecvs;
1538:   slen   = 0;
1539:   while (count) {
1540:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1541:     /* unpack receives into our local space */
1542:     MPI_Get_count(&recv_status,MPIU_INT,&n);
1543:     source[imdex]  = recv_status.MPI_SOURCE;
1544:     lens[imdex]    = n;
1545:     slen          += n;
1546:     count--;
1547:   }
1548: 
1549:   /* allocate entire send scatter context */
1550:   PetscNew(VecScatter_MPI_General,&to);
1551:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1552:   to->n = nrecvs;
1553:   PetscMalloc(nrecvs*sizeof(MPI_Request),&to->requests);
1554:   PetscMalloc4(bs*slen,PetscScalar,&to->values,slen,PetscInt,&to->indices,nrecvs+1,PetscInt,&to->starts,
1555:                        nrecvs,PetscMPIInt,&to->procs);
1556:   PetscMalloc2(PetscMax(to->n,nsends),MPI_Status,&to->sstatus,PetscMax(to->n,nsends),MPI_Status,
1557:                        &to->rstatus);
1558:   ctx->todata   = (void*)to;
1559:   to->starts[0] = 0;

1561:   if (nrecvs) {
1562:     PetscMalloc(nrecvs*sizeof(PetscInt),&indx);
1563:     for (i=0; i<nrecvs; i++) indx[i] = i;
1564:     PetscSortIntWithPermutation(nrecvs,source,indx);

1566:     /* move the data into the send scatter */
1567:     base = owners[rank];
1568:     for (i=0; i<nrecvs; i++) {
1569:       to->starts[i+1] = to->starts[i] + lens[indx[i]];
1570:       to->procs[i]    = source[indx[i]];
1571:       values = rvalues + indx[i]*nmax;
1572:       for (j=0; j<lens[indx[i]]; j++) {
1573:         to->indices[to->starts[i] + j] = values[j] - base;
1574:       }
1575:     }
1576:     PetscFree(indx);
1577:   }
1578:   PetscFree4(rvalues,lens,source,recv_waits);
1579: 
1580:   /* allocate entire receive scatter context */
1581:   PetscNew(VecScatter_MPI_General,&from);
1582:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst);
1583:   from->n        = nsends;

1585:   PetscMalloc(nsends*sizeof(MPI_Request),&from->requests);
1586:   PetscMalloc4(ny*bs,PetscScalar,&from->values,ny,PetscInt,&from->indices,
1587:                       nsends+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1588:   ctx->fromdata  = (void*)from;

1590:   /* move data into receive scatter */
1591:   PetscMalloc2(size,PetscInt,&lowner,nsends+1,PetscInt,&start);
1592:   count = 0; from->starts[0] = start[0] = 0;
1593:   for (i=0; i<size; i++) {
1594:     if (nprocs[2*i+1]) {
1595:       lowner[i]            = count;
1596:       from->procs[count++] = i;
1597:       from->starts[count]  = start[count] = start[count-1] + nprocs[2*i];
1598:     }
1599:   }
1600:   for (i=0; i<nx; i++) {
1601:     if (owner[i] != rank) {
1602:       from->indices[start[lowner[owner[i]]]++] = inidy[i];
1603:       if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1604:     }
1605:   }
1606:   PetscFree2(lowner,start);
1607:   PetscFree2(nprocs,owner);
1608: 
1609:   /* wait on sends */
1610:   if (nsends) {
1611:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
1612:     MPI_Waitall(nsends,send_waits,send_status);
1613:     PetscFree(send_status);
1614:   }
1615:   PetscFree3(svalues,send_waits,starts);

1617:   if (nprocslocal) {
1618:     PetscInt nt = from->local.n = to->local.n = nprocslocal;
1619:     /* we have a scatter to ourselves */
1620:     PetscMalloc(nt*sizeof(PetscInt),&to->local.vslots);
1621:     PetscMalloc(nt*sizeof(PetscInt),&from->local.vslots);
1622:     nt   = 0;
1623:     for (i=0; i<nx; i++) {
1624:       idx = inidx[i];
1625:       if (idx >= owners[rank] && idx < owners[rank+1]) {
1626:         to->local.vslots[nt]     = idx - owners[rank];
1627:         from->local.vslots[nt++] = inidy[i];
1628:         if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1629:       }
1630:     }
1631:   } else {
1632:     from->local.n      = 0;
1633:     from->local.vslots = 0;
1634:     to->local.n        = 0;
1635:     to->local.vslots   = 0;
1636:   }
1637:   from->local.nonmatching_computed = PETSC_FALSE;
1638:   from->local.n_nonmatching        = 0;
1639:   from->local.slots_nonmatching    = 0;
1640:   to->local.nonmatching_computed   = PETSC_FALSE;
1641:   to->local.n_nonmatching          = 0;
1642:   to->local.slots_nonmatching      = 0;

1644:   from->type = VEC_SCATTER_MPI_GENERAL;
1645:   to->type   = VEC_SCATTER_MPI_GENERAL;
1646:   from->bs = bs;
1647:   to->bs   = bs;

1649:   VecScatterCreateCommon_PtoS(from,to,ctx);
1650:   return(0);
1651: }

1653: /*
1654:    bs indicates how many elements there are in each block. Normally this would be 1.
1655: */
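/*
   For example, with bs = 4 the entries of to->indices/from->indices name the first
   scalar of each 4-component block, and each persistent send/receive set up below
   moves bs*(starts[i+1]-starts[i]) PetscScalar values to or from a neighboring process.
*/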
1658: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1659: {
1660:   MPI_Comm       comm = ctx->comm;
1661:   PetscMPIInt    tag  = ctx->tag;
1662:   PetscInt       bs   = to->bs;
1663:   PetscMPIInt    size;
1664:   PetscInt       i, n;
1666: 
1668:   MPI_Comm_size(comm,&size);
1669:   /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
1670:   to->contiq = PETSC_FALSE;
1671:   n = from->starts[from->n];
1672:   from->contiq = PETSC_TRUE;
1673:   for (i=1; i<n; i++) {
1674:     if (from->indices[i] != from->indices[i-1] + bs) {
1675:       from->contiq = PETSC_FALSE;
1676:       break;
1677:     }
1678:   }
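  /* Runtime selection of collective communication: -vecscatter_alltoallw implies -vecscatter_alltoallv; otherwise the else branch below sets up persistent point-to-point requests */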

1680:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_alltoallw",&to->use_alltoallw);
1681:   from->use_alltoallw = to->use_alltoallw;
1682:   if (to->use_alltoallw) {
1683:     to->use_alltoallv = PETSC_TRUE;
1684:   } else {
1685:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_alltoallv",&to->use_alltoallv);
1686:   }
1687:   from->use_alltoallv = to->use_alltoallv;

1689:   if (to->use_alltoallv) {
1690:     ctx->destroy = VecScatterDestroy_PtoP;

1692:     PetscMalloc2(size,PetscMPIInt,&to->counts,size,PetscMPIInt,&to->displs);
1693:     PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1694:     for (i=0; i<to->n; i++) {
1695:       to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
1696:     }
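    /* the displacements handed to MPI_Alltoallv are prefix sums of the per-process counts (in scalars) */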
1697:     to->displs[0] = 0;
1698:     for (i=1; i<size; i++) {
1699:       to->displs[i] = to->displs[i-1] + to->counts[i-1];
1700:     }

1702:     PetscMalloc2(size,PetscMPIInt,&from->counts,size,PetscMPIInt,&from->displs);
1703:     PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1704:     for (i=0; i<from->n; i++) {
1705:       from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1706:     }
1707:     from->displs[0] = 0;
1708:     for (i=1; i<size; i++) {
1709:       from->displs[i] = from->displs[i-1] + from->counts[i-1];
1710:     }
1711: #if defined(PETSC_HAVE_MPI_ALLTOALLW) 
1712:     if (to->use_alltoallw) {
1713:       ctx->packtogether = PETSC_FALSE;
1714:       PetscMalloc3(size,PetscMPIInt,&to->wcounts,size,PetscMPIInt,&to->wdispls,size,MPI_Datatype,&to->types);
1715:       PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1716:       PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1717:       for (i=0; i<size; i++) {
1718:         to->types[i] = MPIU_SCALAR;
1719:       }

1721:       for (i=0; i<to->n; i++) {
1722:         to->wcounts[to->procs[i]] = ((to->starts[i+1] - to->starts[i]) > 0) ? 1 : 0;
1723:         MPI_Type_create_indexed_block(to->starts[i+1]-to->starts[i],bs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1724:         MPI_Type_commit(to->types+to->procs[i]);
1725:       }
1726:       PetscMalloc3(size,PetscMPIInt,&from->wcounts,size,PetscMPIInt,&from->wdispls,size,MPI_Datatype,&from->types);
1727:       PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
1728:       PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
1729:       for (i=0; i<size; i++) {
1730:         from->types[i] = MPIU_SCALAR;
1731:       }
1732:       if (from->contiq) {
1733:         PetscInfo(0,"Scattered vector entries are stored contiguously, taking advantage of this with -vecscatter_alltoallw\n");
1734:         for (i=0; i<from->n; i++) {
1735:           from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1736:         }
1737:         if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
1738:         for (i=1; i<from->n; i++) {
1739:           from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
1740:         }
1741:       } else {
1742:         for (i=0; i<from->n; i++) {
1743:           from->wcounts[from->procs[i]] = ((from->starts[i+1] - from->starts[i]) > 0) ? 1 : 0;
1744:           MPI_Type_create_indexed_block(from->starts[i+1]-from->starts[i],bs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
1745:           MPI_Type_commit(from->types+from->procs[i]);
1746:         }
1747:       }
1748:     }
1749: #else 
1750:     to->use_alltoallw   = PETSC_FALSE;
1751:     from->use_alltoallw = PETSC_FALSE;
1752: #endif
1753:   } else {
1754:     PetscTruth  use_rr = PETSC_FALSE, use_ssend = PETSC_FALSE;
1755:     PetscInt    *sstarts = to->starts,  *rstarts = from->starts;
1756:     PetscMPIInt *sprocs  = to->procs,   *rprocs  = from->procs;
1757:     MPI_Request *swaits  = to->requests,*rwaits  = from->requests;
1758:     MPI_Request *rev_swaits,*rev_rwaits;
1759:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;

1761:     /* allocate additional wait variables for the "reverse" scatter */
1762:     PetscMalloc(to->n*sizeof(MPI_Request),&rev_rwaits);
1763:     PetscMalloc(from->n*sizeof(MPI_Request),&rev_swaits);
1764:     to->rev_requests   = rev_rwaits;
1765:     from->rev_requests = rev_swaits;

1767:     /* Register the receives that you will use later (sends for scatter reverse) */
1768:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&use_rr);
1769:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&use_ssend);
1770:     if (use_rr) {
1771:       PetscInfo(0,"Using VecScatter ready receiver mode\n");
1772:     }
1773:     if (use_ssend) {
1774:       PetscInfo(0,"Using VecScatter Ssend mode\n");
1775:     }

1777:     for (i=0; i<from->n; i++) {
1778:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
1779:       if (!use_ssend) {
1780:         MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
1781:       } else {
1782:         MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
1783:       }
1784:     }

1786:     if (use_rr) {
1787:       to->use_readyreceiver    = PETSC_TRUE;
1788:       from->use_readyreceiver  = PETSC_TRUE;
1789:       for (i=0; i<to->n; i++) {
1790:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1791:       }
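      /* start the persistent receives and barrier so that every process has posted its receives before any ready-mode send (MPI_Rsend) can begin */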
1792:       if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
1793:       MPI_Barrier(comm);
1794:     } else {
1795:       to->use_readyreceiver    = PETSC_FALSE;
1796:       from->use_readyreceiver  = PETSC_FALSE;
1797:       for (i=0; i<to->n; i++) {
1798:         if (!use_ssend) {
1799:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1800:         } else {
1801:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1802:         }
1803:       }
1804:     }
1805:     /* Register receives for scatter reverse */
1806:     for (i=0; i<to->n; i++) {
1807:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
1808:     }

1810:     ctx->destroy   = VecScatterDestroy_PtoP_X;
1811:     ctx->copy      = VecScatterCopy_PtoP_X;
1812:   }
1813:   PetscInfo1(0,"Using blocksize %D scatter\n",bs);
1814:   switch (bs) {
1815:   case 12:
1816:     ctx->begin     = VecScatterBegin_12;
1817:     ctx->end       = VecScatterEnd_12;
1818:     break;
1819:   case 8:
1820:     ctx->begin     = VecScatterBegin_8;
1821:     ctx->end       = VecScatterEnd_8;
1822:     break;
1823:   case 7:
1824:     ctx->begin     = VecScatterBegin_7;
1825:     ctx->end       = VecScatterEnd_7;
1826:     break;
1827:   case 6:
1828:     ctx->begin     = VecScatterBegin_6;
1829:     ctx->end       = VecScatterEnd_6;
1830:     break;
1831:   case 5:
1832:     ctx->begin     = VecScatterBegin_5;
1833:     ctx->end       = VecScatterEnd_5;
1834:     break;
1835:   case 4:
1836:     ctx->begin     = VecScatterBegin_4;
1837:     ctx->end       = VecScatterEnd_4;
1838:     break;
1839:   case 3:
1840:     ctx->begin     = VecScatterBegin_3;
1841:     ctx->end       = VecScatterEnd_3;
1842:     break;
1843:   case 2:
1844:     ctx->begin     = VecScatterBegin_2;
1845:     ctx->end       = VecScatterEnd_2;
1846:     break;
1847:   case 1:
1848:     ctx->begin     = VecScatterBegin_1;
1849:     ctx->end       = VecScatterEnd_1;
1850:     break;
1851:   default:
1852:     SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
1853:   }
1854:   ctx->view      = VecScatterView_MPI;

1856:   /* Check if the local scatter is actually a copy; important special case */
1857:   if (to->local.n) {
1858:     VecScatterLocalOptimizeCopy_Private(&to->local,&from->local,bs);
1859:   }
1860:   return(0);
1861: }



1865: /* ------------------------------------------------------------------------------------*/
1866: /*
1867:          Scatter from local Seq vectors to a parallel vector. 
1868:          Reverses the order of the arguments, calls VecScatterCreate_PtoS() then
1869:          reverses the result.
1870: */
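/*
   After the PtoS creation below, the two context pointers are exchanged and each
   side's forward and reverse persistent requests are swapped, so running this
   scatter forward executes what the PtoS scatter would have run in reverse.
*/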
1873: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,PetscInt *inidx,PetscInt ny,PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1874: {
1875:   PetscErrorCode         ierr;
1876:   MPI_Request            *waits;
1877:   VecScatter_MPI_General *to,*from;

1880:   VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
1881:   to            = (VecScatter_MPI_General*)ctx->fromdata;
1882:   from          = (VecScatter_MPI_General*)ctx->todata;
1883:   ctx->todata   = (void*)to;
1884:   ctx->fromdata = (void*)from;
1885:   /* these two are special; they are ALWAYS stored in the to struct */
1886:   to->sstatus   = from->sstatus;
1887:   to->rstatus   = from->rstatus;

1889:   from->sstatus = 0;
1890:   from->rstatus = 0;

1892:   waits              = from->rev_requests;
1893:   from->rev_requests = from->requests;
1894:   from->requests     = waits;
1895:   waits              = to->rev_requests;
1896:   to->rev_requests   = to->requests;
1897:   to->requests       = waits;
1898:   return(0);
1899: }

1901: /* ---------------------------------------------------------------------------------*/
1904: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,PetscInt *inidx,PetscInt ny,PetscInt *inidy,Vec xin,Vec yin,VecScatter ctx)
1905: {
1907:   PetscMPIInt    size,rank,tag,imdex,n;
1908:   PetscInt       *lens = PETSC_NULL,*owners = xin->map.range;
1909:   PetscInt       *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs,*local_inidx = PETSC_NULL,*local_inidy = PETSC_NULL;
1910:   PetscInt       *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1911:   PetscInt       *rvalues = PETSC_NULL,*svalues = PETSC_NULL,base,nmax,*values = PETSC_NULL,lastidx;
1912:   MPI_Comm       comm;
1913:   MPI_Request    *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1914:   MPI_Status     recv_status,*send_status = PETSC_NULL;
1915:   PetscTruth     duplicate = PETSC_FALSE;
1916: #if defined(PETSC_USE_DEBUG)
1917:   PetscTruth     found = PETSC_FALSE;
1918: #endif

1921:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1922:   PetscObjectGetComm((PetscObject)xin,&comm);
1923:   MPI_Comm_size(comm,&size);
1924:   MPI_Comm_rank(comm,&rank);
1925:   if (size == 1) {
1926:     VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,1,ctx);
1927:     return(0);
1928:   }

1930:   /*
1931:      Each processor ships off its inidx[j] and inidy[j] to the appropriate processor
1932:      They then call VecScatterCreate_StoP()
1933:   */
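  /* each message carries (global index into x, index into y) pairs, hence the factor of 2 in the buffer sizes and message counts below */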
1934:   /*  first count number of contributors to each processor */
1935:   PetscMalloc3(2*size,PetscInt,&nprocs,nx,PetscInt,&owner,(size+1),PetscInt,&starts);
1936:   PetscMemzero(nprocs,2*size*sizeof(PetscInt));
1937:   lastidx = -1;
1938:   j       = 0;
1939:   for (i=0; i<nx; i++) {
1940:     /* if indices are NOT locally sorted, need to start search at the beginning */
1941:     if (lastidx > (idx = inidx[i])) j = 0;
1942:     lastidx = idx;
1943:     for (; j<size; j++) {
1944:       if (idx >= owners[j] && idx < owners[j+1]) {
1945:         nprocs[2*j]++;
1946:         nprocs[2*j+1] = 1;
1947:         owner[i] = j;
1948: #if defined(PETSC_USE_DEBUG)
1949:         found = PETSC_TRUE;
1950: #endif
1951:         break;
1952:       }
1953:     }
1954: #if defined(PETSC_USE_DEBUG)
1955:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
1956:     found = PETSC_FALSE;
1957: #endif
1958:   }
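  /* nprocs[2*i] counts the entries owned by process i; nprocs[2*i+1] is 1 iff a message to process i is needed */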
1959:   nsends = 0;  for (i=0; i<size; i++) { nsends += nprocs[2*i+1];}

1961:   /* inform other processors of the number of messages and the max length */
1962:   PetscMaxSum(comm,nprocs,&nmax,&nrecvs);

1964:   /* post receives:   */
1965:   PetscMalloc6(2*nrecvs*nmax,PetscInt,&rvalues,2*nx,PetscInt,&svalues,2*nrecvs,PetscInt,&lens,nrecvs,MPI_Request,&recv_waits,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);

1967:   for (i=0; i<nrecvs; i++) {
1968:     MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1969:   }

1971:   /* do sends:
1972:       1) starts[i] gives the starting index in svalues for stuff going to 
1973:          the ith processor
1974:   */
1975:   starts[0]= 0;
1976:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1977:   for (i=0; i<nx; i++) {
1978:     svalues[2*starts[owner[i]]]       = inidx[i];
1979:     svalues[1 + 2*starts[owner[i]]++] = inidy[i];
1980:   }

1982:   starts[0] = 0;
1983:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1984:   count = 0;
1985:   for (i=0; i<size; i++) {
1986:     if (nprocs[2*i+1]) {
1987:       MPI_Isend(svalues+2*starts[i],2*nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count);
1988:       count++;
1989:     }
1990:   }
1991:   PetscFree3(nprocs,owner,starts);

1993:   /*  wait on receives */
1994:   count = nrecvs;
1995:   slen  = 0;
1996:   while (count) {
1997:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1998:     /* unpack receives into our local space */
1999:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2000:     lens[imdex]  =  n/2;
2001:     slen         += n/2;
2002:     count--;
2003:   }
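  /* lens[i] is the number of (from,to) index pairs received into slot i; slen is their total */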
2004: 
2005:   PetscMalloc2(slen,PetscInt,&local_inidx,slen,PetscInt,&local_inidy);
2006:   base  = owners[rank];
2007:   count = 0;
2008:   for (i=0; i<nrecvs; i++) {
2009:     values = rvalues + 2*i*nmax;
2010:     for (j=0; j<lens[i]; j++) {
2011:       local_inidx[count]   = values[2*j] - base;
2012:       local_inidy[count++] = values[2*j+1];
2013:     }
2014:   }

2016:   /* wait on sends */
2017:   if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2018:   PetscFree6(rvalues,svalues,lens,recv_waits,send_waits,send_status);

2020:   /*
2021:      should sort and remove duplicates from local_inidx,local_inidy 
2022:   */

2024: #if defined(do_it_slow)
2025:   /* sort on the from index */
2026:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2027:   start = 0;
2028:   while (start < slen) {
2029:     count = start+1;
2030:     last  = local_inidx[start];
2031:     while (count < slen && last == local_inidx[count]) count++;
2032:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2033:       /* sort on to index */
2034:       PetscSortInt(count-start,local_inidy+start);
2035:     }
2036:     /* remove duplicates; not the most efficient way, but probably good enough */
2037:     i = start;
2038:     while (i < count-1) {
2039:       if (local_inidy[i] != local_inidy[i+1]) {
2040:         i++;
2041:       } else { /* found a duplicate */
2042:         duplicate = PETSC_TRUE;
2043:         for (j=i; j<slen-1; j++) {
2044:           local_inidx[j] = local_inidx[j+1];
2045:           local_inidy[j] = local_inidy[j+1];
2046:         }
2047:         slen--;
2048:         count--;
2049:       }
2050:     }
2051:     start = count;
2052:   }
2053: #endif
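  /* the duplicate-removal pass above is compiled only when do_it_slow is defined; otherwise duplicate stays PETSC_FALSE and the message below is never printed */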
2054:   if (duplicate) {
2055:     PetscInfo(0,"Duplicate from/to indices passed to VecScatterCreate(); they are ignored\n");
2056:   }
2057:   VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,1,ctx);
2058:   PetscFree2(local_inidx,local_inidy);
2059:   return(0);
2060: }