Actual source code: vpscat.c
1: #define PETSCVEC_DLL
3: /*
4: Defines parallel vector scatters.
5: */
7: #include "private/isimpl.h"
8: #include "private/vecimpl.h"
9: #include "../src/vec/vec/impls/dvecimpl.h"
10: #include "../src/vec/vec/impls/mpi/pvecimpl.h"
11: #include "petscsys.h"
15: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
16: {
17: VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
18: VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
19: PetscErrorCode ierr;
20: PetscInt i;
21: PetscMPIInt rank;
22: PetscViewerFormat format;
23: PetscTruth iascii;
26: PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);
27: if (iascii) {
28: MPI_Comm_rank(((PetscObject)ctx)->comm,&rank);
29: PetscViewerGetFormat(viewer,&format);
30: if (format == PETSC_VIEWER_ASCII_INFO) {
31: PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;
33: MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
34: MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
35: itmp = to->starts[to->n];
36: MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
37: itmp = from->starts[from->n];
38: MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
39: MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,((PetscObject)ctx)->comm);
41: PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
42: PetscViewerASCIIPrintf(viewer," Maximum number sends %D\n",nsend_max);
43: PetscViewerASCIIPrintf(viewer," Maximum number receives %D\n",nrecv_max);
44: PetscViewerASCIIPrintf(viewer," Maximum data sent %D\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
45: PetscViewerASCIIPrintf(viewer," Maximum data received %D\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
46: PetscViewerASCIIPrintf(viewer," Total data sent %D\n",(int)(alldata*to->bs*sizeof(PetscScalar)));
48: } else {
49: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
50: if (to->n) {
51: for (i=0; i<to->n; i++){
52: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length = %D to whom %D\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
53: }
54: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
55: for (i=0; i<to->starts[to->n]; i++){
56: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,to->indices[i]);
57: }
58: }
60: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
61: if (from->n) {
62: for (i=0; i<from->n; i++){
63: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %D\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
64: }
66: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
67: for (i=0; i<from->starts[from->n]; i++){
68: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,from->indices[i]);
69: }
70: }
71: if (to->local.n) {
72: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
73: for (i=0; i<to->local.n; i++){
74: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
75: }
76: }
78: PetscViewerFlush(viewer);
79: }
80: } else {
81: SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
82: }
83: return(0);
84: }
86: /* -----------------------------------------------------------------------------------*/
87: /*
88: The next routine determines which part of the local scatter is an
89: exact copy of values into their current locations. We check this here and
90: then know that we need not perform that portion of the scatter when the vector is
91: scattering to itself with INSERT_VALUES.
93: This is currently not used, but it would speed up, for example, DALocalToLocalBegin/End() (see the usage sketch after this routine).
95: */
98: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
99: {
100: PetscInt n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
102: PetscInt *nto_slots,*nfrom_slots,j = 0;
103:
105: for (i=0; i<n; i++) {
106: if (to_slots[i] != from_slots[i]) n_nonmatching++;
107: }
109: if (!n_nonmatching) {
110: to->nonmatching_computed = PETSC_TRUE;
111: to->n_nonmatching = from->n_nonmatching = 0;
112: PetscInfo1(scatter,"Reduced %D to 0\n", n);
113: } else if (n_nonmatching == n) {
114: to->nonmatching_computed = PETSC_FALSE;
115: PetscInfo(scatter,"All values non-matching\n");
116: } else {
117: to->nonmatching_computed= PETSC_TRUE;
118: to->n_nonmatching = from->n_nonmatching = n_nonmatching;
119: PetscMalloc(n_nonmatching*sizeof(PetscInt),&nto_slots);
120: PetscMalloc(n_nonmatching*sizeof(PetscInt),&nfrom_slots);
121: to->slots_nonmatching = nto_slots;
122: from->slots_nonmatching = nfrom_slots;
123: for (i=0; i<n; i++) {
124: if (to_slots[i] != from_slots[i]) {
125: nto_slots[j] = to_slots[i];
126: nfrom_slots[j] = from_slots[i];
127: j++;
128: }
129: }
130: PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
131: }
132: return(0);
133: }
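/*
   Editor's sketch (not part of the original source): one way the nonmatching lists
   computed above could be consumed by a scatter-to-itself with INSERT_VALUES, skipping
   entries that are already in place. The function name and the bs == 1 assumption are
   illustrative only; the from -> to direction follows the convention printed by
   VecScatterView_MPI() above.
*/
#if 0
PETSC_STATIC_INLINE void VecScatterLocalInsert_Sketch(VecScatter_Seq_General *to,VecScatter_Seq_General *from,const PetscScalar *x,PetscScalar *y)
{
  PetscInt       n       = to->n,i;
  const PetscInt *tslots = to->vslots,*fslots = from->vslots;

  if (to->nonmatching_computed) {      /* only move the entries whose source and destination differ */
    n      = to->n_nonmatching;
    tslots = to->slots_nonmatching;
    fslots = from->slots_nonmatching;
  }
  for (i=0; i<n; i++) y[tslots[i]] = x[fslots[i]];
}
#endif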
135: /* --------------------------------------------------------------------------------------*/
137: /* -------------------------------------------------------------------------------------*/
140: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
141: {
142: VecScatter_MPI_General *to = (VecScatter_MPI_General*)ctx->todata;
143: VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
144: PetscErrorCode ierr;
145: PetscInt i;
148: if (to->use_readyreceiver) {
149: /*
150: Since we have already posted sends we must cancel them before freeing
151: the requests
152: */
153: for (i=0; i<from->n; i++) {
154: MPI_Cancel(from->requests+i);
155: }
156: for (i=0; i<to->n; i++) {
157: MPI_Cancel(to->rev_requests+i);
158: }
159: }
161: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
162: if (to->use_alltoallw) {
163: PetscFree3(to->wcounts,to->wdispls,to->types);
164: PetscFree3(from->wcounts,from->wdispls,from->types);
165: }
166: #endif
168: #if defined(PETSC_HAVE_MPI_WINDOW)
169: if (to->use_window) {
170: MPI_Win_free(&from->window);
171: MPI_Win_free(&to->window);
172: }
173: #endif
175: if (to->use_alltoallv) {
176: PetscFree2(to->counts,to->displs);
177: PetscFree2(from->counts,from->displs);
178: }
180: /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
181: /*
182: IBM's PE version of MPI has a bug in which freeing these persistent requests corrupts
183: later message passing.
184: */
185: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
186: if (!to->use_alltoallv) { /* currently the to->requests etc are ALWAYS allocated even if not used */
187: if (to->requests) {
188: for (i=0; i<to->n; i++) {
189: MPI_Request_free(to->requests + i);
190: }
191: }
192: if (to->rev_requests) {
193: for (i=0; i<to->n; i++) {
194: MPI_Request_free(to->rev_requests + i);
195: }
196: }
197: }
198: /*
199: MPICH could not properly cancel requests, so in ready-receiver mode we
200: cannot free the requests. This may be fixed now; if not, wrap the following
201: code in if (!to->use_readyreceiver) { }
202: */
203: if (!to->use_alltoallv) { /* currently the from->requests etc are ALWAYS allocated even if not used */
204: if (from->requests) {
205: for (i=0; i<from->n; i++) {
206: MPI_Request_free(from->requests + i);
207: }
208: }
210: if (from->rev_requests) {
211: for (i=0; i<from->n; i++) {
212: MPI_Request_free(from->rev_requests + i);
213: }
214: }
215: }
216: #endif
218: PetscFree(to->local.vslots);
219: PetscFree(from->local.vslots);
220: PetscFree2(to->counts,to->displs);
221: PetscFree2(from->counts,from->displs);
222: PetscFree(to->local.slots_nonmatching);
223: PetscFree(from->local.slots_nonmatching);
224: PetscFree(to->rev_requests);
225: PetscFree(from->rev_requests);
226: PetscFree(to->requests);
227: PetscFree(from->requests);
228: PetscFree4(to->values,to->indices,to->starts,to->procs);
229: PetscFree2(to->sstatus,to->rstatus);
230: PetscFree4(from->values,from->indices,from->starts,from->procs);
231: PetscFree(from);
232: PetscFree(to);
233: PetscHeaderDestroy(ctx);
234: return(0);
235: }
239: /* --------------------------------------------------------------------------------------*/
240: /*
241: Special optimization to check whether the local part of the scatter is actually
242: a contiguous copy. If so, the scatter routines call PetscMemcpy() instead of indexing.
243:
244: */
247: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
248: {
249: PetscInt n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
250: PetscInt to_start,from_start;
254: to_start = to_slots[0];
255: from_start = from_slots[0];
257: for (i=1; i<n; i++) {
258: to_start += bs;
259: from_start += bs;
260: if (to_slots[i] != to_start) return(0);
261: if (from_slots[i] != from_start) return(0);
262: }
263: to->is_copy = PETSC_TRUE;
264: to->copy_start = to_slots[0];
265: to->copy_length = bs*sizeof(PetscScalar)*n;
266: from->is_copy = PETSC_TRUE;
267: from->copy_start = from_slots[0];
268: from->copy_length = bs*sizeof(PetscScalar)*n;
269: PetscInfo(scatter,"Local scatter is a copy, optimizing for it\n");
270: return(0);
271: }
273: /* --------------------------------------------------------------------------------------*/
277: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
278: {
279: VecScatter_MPI_General *in_to = (VecScatter_MPI_General*)in->todata;
280: VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
281: PetscErrorCode ierr;
282: PetscInt ny,bs = in_from->bs;
285: out->begin = in->begin;
286: out->end = in->end;
287: out->copy = in->copy;
288: out->destroy = in->destroy;
289: out->view = in->view;
291: /* allocate entire send scatter context */
292: PetscNewLog(out,VecScatter_MPI_General,&out_to);
293: PetscNewLog(out,VecScatter_MPI_General,&out_from);
295: ny = in_to->starts[in_to->n];
296: out_to->n = in_to->n;
297: out_to->type = in_to->type;
298: out_to->sendfirst = in_to->sendfirst;
300: PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
301: PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,out_to->n,PetscMPIInt,&out_to->procs);
302: PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->rstatus);
303: PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
304: PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
305: PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
306:
307: out->todata = (void*)out_to;
308: out_to->local.n = in_to->local.n;
309: out_to->local.nonmatching_computed = PETSC_FALSE;
310: out_to->local.n_nonmatching = 0;
311: out_to->local.slots_nonmatching = 0;
312: if (in_to->local.n) {
313: PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
314: PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
315: PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
316: PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
317: } else {
318: out_to->local.vslots = 0;
319: out_from->local.vslots = 0;
320: }
322: /* allocate entire receive context */
323: out_from->type = in_from->type;
324: ny = in_from->starts[in_from->n];
325: out_from->n = in_from->n;
326: out_from->sendfirst = in_from->sendfirst;
328: PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
329: PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
330: PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
331: PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
332: PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
333: out->fromdata = (void*)out_from;
334: out_from->local.n = in_from->local.n;
335: out_from->local.nonmatching_computed = PETSC_FALSE;
336: out_from->local.n_nonmatching = 0;
337: out_from->local.slots_nonmatching = 0;
339: /*
340: set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
341: */
342: {
343: PetscMPIInt tag;
344: MPI_Comm comm;
345: PetscInt *sstarts = out_to->starts, *rstarts = out_from->starts;
346: PetscMPIInt *sprocs = out_to->procs, *rprocs = out_from->procs;
347: PetscInt i;
348: PetscTruth flg;
349: MPI_Request *swaits = out_to->requests,*rwaits = out_from->requests;
350: MPI_Request *rev_swaits,*rev_rwaits;
351: PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;
353: PetscMalloc(in_to->n*sizeof(MPI_Request),&out_to->rev_requests);
354: PetscMalloc(in_from->n*sizeof(MPI_Request),&out_from->rev_requests);
356: rev_rwaits = out_to->rev_requests;
357: rev_swaits = out_from->rev_requests;
359: out_from->bs = out_to->bs = bs;
360: tag = ((PetscObject)out)->tag;
361: comm = ((PetscObject)out)->comm;
363: /* Register the receives that you will use later (sends for scatter reverse) */
364: for (i=0; i<out_from->n; i++) {
365: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
366: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
367: }
369: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&flg);
370: if (flg) {
371: out_to->use_readyreceiver = PETSC_TRUE;
372: out_from->use_readyreceiver = PETSC_TRUE;
373: for (i=0; i<out_to->n; i++) {
374: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
375: }
376: if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
377: MPI_Barrier(comm);
378: PetscInfo(in,"Using VecScatter ready receiver mode\n");
379: } else {
380: out_to->use_readyreceiver = PETSC_FALSE;
381: out_from->use_readyreceiver = PETSC_FALSE;
382: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
383: if (flg) {
384: PetscInfo(in,"Using VecScatter Ssend mode\n");
385: }
386: for (i=0; i<out_to->n; i++) {
387: if (!flg) {
388: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
389: } else {
390: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
391: }
392: }
393: }
394: /* Register receives for scatter reverse */
395: for (i=0; i<out_to->n; i++) {
396: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
397: }
398: }
400: return(0);
401: }
402: /* --------------------------------------------------------------------------------------------------
403: Packs the message data into send buffers and unpacks it from receive buffers.
405: These kernels could be generated automatically (a generic sketch of the pattern follows this comment).
407: Fortran kernels etc. could also be used.
408: */
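/*
   Editor's sketch (not part of the original source): every fixed-block-size kernel
   below specializes this single pattern by hand for bs = 1,...,12. The hypothetical
   Pack_bs()/UnPack_bs() shown here take the block size as a runtime argument; only
   INSERT_VALUES and ADD_VALUES are spelled out.
*/
#if 0
PETSC_STATIC_INLINE void Pack_bs(PetscInt n,PetscInt bs,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i,k,idx;
  for (i=0; i<n; i++) {
    idx = *indicesx++;                       /* first entry of the i-th block in x */
    for (k=0; k<bs; k++) y[k] = x[idx+k];
    y += bs;                                 /* the send buffer is filled contiguously */
  }
}
PETSC_STATIC_INLINE void UnPack_bs(PetscInt n,PetscInt bs,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
{
  PetscInt i,k,idy;
  for (i=0; i<n; i++) {
    idy = *indicesy++;                       /* first entry of the i-th block in y */
    if (addv == INSERT_VALUES)   for (k=0; k<bs; k++) y[idy+k]  = x[k];
    else if (addv == ADD_VALUES) for (k=0; k<bs; k++) y[idy+k] += x[k];
    x += bs;                                 /* the receive buffer is read contiguously */
  }
}
#endif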
409: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
410: {
411: PetscInt i;
412: for (i=0; i<n; i++) {
413: y[i] = x[indicesx[i]];
414: }
415: }
416: PETSC_STATIC_INLINE void UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
417: {
418: PetscInt i;
419: switch (addv) {
420: case INSERT_VALUES:
421: for (i=0; i<n; i++) {
422: y[indicesy[i]] = x[i];
423: }
424: break;
425: case ADD_VALUES:
426: for (i=0; i<n; i++) {
427: y[indicesy[i]] += x[i];
428: }
429: break;
430: #if !defined(PETSC_USE_COMPLEX)
431: case MAX_VALUES:
432: for (i=0; i<n; i++) {
433: y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
434: }
435: #else
436: case MAX_VALUES:
437: #endif
438: case NOT_SET_VALUES:
439: break;
440: }
441: }
443: PETSC_STATIC_INLINE void Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
444: {
445: PetscInt i;
446: switch (addv) {
447: case INSERT_VALUES:
448: for (i=0; i<n; i++) {
449: y[indicesy[i]] = x[indicesx[i]];
450: }
451: break;
452: case ADD_VALUES:
453: for (i=0; i<n; i++) {
454: y[indicesy[i]] += x[indicesx[i]];
455: }
456: break;
457: #if !defined(PETSC_USE_COMPLEX)
458: case MAX_VALUES:
459: for (i=0; i<n; i++) {
460: y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
461: }
462: #else
463: case MAX_VALUES:
464: #endif
465: case NOT_SET_VALUES:
466: break;
467: }
468: }
470: /* ----------------------------------------------------------------------------------------------- */
471: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
472: {
473: PetscInt i,idx;
475: for (i=0; i<n; i++) {
476: idx = *indicesx++;
477: y[0] = x[idx];
478: y[1] = x[idx+1];
479: y += 2;
480: }
481: }
482: PETSC_STATIC_INLINE void UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
483: {
484: PetscInt i,idy;
486: switch (addv) {
487: case INSERT_VALUES:
488: for (i=0; i<n; i++) {
489: idy = *indicesy++;
490: y[idy] = x[0];
491: y[idy+1] = x[1];
492: x += 2;
493: }
494: break;
495: case ADD_VALUES:
496: for (i=0; i<n; i++) {
497: idy = *indicesy++;
498: y[idy] += x[0];
499: y[idy+1] += x[1];
500: x += 2;
501: }
502: break;
503: #if !defined(PETSC_USE_COMPLEX)
504: case MAX_VALUES:
505: for (i=0; i<n; i++) {
506: idy = *indicesy++;
507: y[idy] = PetscMax(y[idy],x[0]);
508: y[idy+1] = PetscMax(y[idy+1],x[1]);
509: x += 2;
510: }
511: #else
512: case MAX_VALUES:
513: #endif
514: case NOT_SET_VALUES:
515: break;
516: }
517: }
519: PETSC_STATIC_INLINE void Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
520: {
521: PetscInt i,idx,idy;
523: switch (addv) {
524: case INSERT_VALUES:
525: for (i=0; i<n; i++) {
526: idx = *indicesx++;
527: idy = *indicesy++;
528: y[idy] = x[idx];
529: y[idy+1] = x[idx+1];
530: }
531: break;
532: case ADD_VALUES:
533: for (i=0; i<n; i++) {
534: idx = *indicesx++;
535: idy = *indicesy++;
536: y[idy] += x[idx];
537: y[idy+1] += x[idx+1];
538: }
539: break;
540: #if !defined(PETSC_USE_COMPLEX)
541: case MAX_VALUES:
542: for (i=0; i<n; i++) {
543: idx = *indicesx++;
544: idy = *indicesy++;
545: y[idy] = PetscMax(y[idy],x[idx]);
546: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
547: }
548: #else
549: case MAX_VALUES:
550: #endif
551: case NOT_SET_VALUES:
552: break;
553: }
554: }
555: /* ----------------------------------------------------------------------------------------------- */
556: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
557: {
558: PetscInt i,idx;
560: for (i=0; i<n; i++) {
561: idx = *indicesx++;
562: y[0] = x[idx];
563: y[1] = x[idx+1];
564: y[2] = x[idx+2];
565: y += 3;
566: }
567: }
568: PETSC_STATIC_INLINE void UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
569: {
570: PetscInt i,idy;
572: switch (addv) {
573: case INSERT_VALUES:
574: for (i=0; i<n; i++) {
575: idy = *indicesy++;
576: y[idy] = x[0];
577: y[idy+1] = x[1];
578: y[idy+2] = x[2];
579: x += 3;
580: }
581: break;
582: case ADD_VALUES:
583: for (i=0; i<n; i++) {
584: idy = *indicesy++;
585: y[idy] += x[0];
586: y[idy+1] += x[1];
587: y[idy+2] += x[2];
588: x += 3;
589: }
590: break;
591: #if !defined(PETSC_USE_COMPLEX)
592: case MAX_VALUES:
593: for (i=0; i<n; i++) {
594: idy = *indicesy++;
595: y[idy] = PetscMax(y[idy],x[0]);
596: y[idy+1] = PetscMax(y[idy+1],x[1]);
597: y[idy+2] = PetscMax(y[idy+2],x[2]);
598: x += 3;
599: }
600: #else
601: case MAX_VALUES:
602: #endif
603: case NOT_SET_VALUES:
604: break;
605: }
606: }
608: PETSC_STATIC_INLINE void Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
609: {
610: PetscInt i,idx,idy;
612: switch (addv) {
613: case INSERT_VALUES:
614: for (i=0; i<n; i++) {
615: idx = *indicesx++;
616: idy = *indicesy++;
617: y[idy] = x[idx];
618: y[idy+1] = x[idx+1];
619: y[idy+2] = x[idx+2];
620: }
621: break;
622: case ADD_VALUES:
623: for (i=0; i<n; i++) {
624: idx = *indicesx++;
625: idy = *indicesy++;
626: y[idy] += x[idx];
627: y[idy+1] += x[idx+1];
628: y[idy+2] += x[idx+2];
629: }
630: break;
631: #if !defined(PETSC_USE_COMPLEX)
632: case MAX_VALUES:
633: for (i=0; i<n; i++) {
634: idx = *indicesx++;
635: idy = *indicesy++;
636: y[idy] = PetscMax(y[idy],x[idx]);
637: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
638: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
639: }
640: #else
641: case MAX_VALUES:
642: #endif
643: case NOT_SET_VALUES:
644: break;
645: }
646: }
647: /* ----------------------------------------------------------------------------------------------- */
648: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
649: {
650: PetscInt i,idx;
652: for (i=0; i<n; i++) {
653: idx = *indicesx++;
654: y[0] = x[idx];
655: y[1] = x[idx+1];
656: y[2] = x[idx+2];
657: y[3] = x[idx+3];
658: y += 4;
659: }
660: }
661: PETSC_STATIC_INLINE void UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
662: {
663: PetscInt i,idy;
665: switch (addv) {
666: case INSERT_VALUES:
667: for (i=0; i<n; i++) {
668: idy = *indicesy++;
669: y[idy] = x[0];
670: y[idy+1] = x[1];
671: y[idy+2] = x[2];
672: y[idy+3] = x[3];
673: x += 4;
674: }
675: break;
676: case ADD_VALUES:
677: for (i=0; i<n; i++) {
678: idy = *indicesy++;
679: y[idy] += x[0];
680: y[idy+1] += x[1];
681: y[idy+2] += x[2];
682: y[idy+3] += x[3];
683: x += 4;
684: }
685: break;
686: #if !defined(PETSC_USE_COMPLEX)
687: case MAX_VALUES:
688: for (i=0; i<n; i++) {
689: idy = *indicesy++;
690: y[idy] = PetscMax(y[idy],x[0]);
691: y[idy+1] = PetscMax(y[idy+1],x[1]);
692: y[idy+2] = PetscMax(y[idy+2],x[2]);
693: y[idy+3] = PetscMax(y[idy+3],x[3]);
694: x += 4;
695: }
696: #else
697: case MAX_VALUES:
698: #endif
699: case NOT_SET_VALUES:
700: break;
701: }
702: }
704: PETSC_STATIC_INLINE void Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
705: {
706: PetscInt i,idx,idy;
708: switch (addv) {
709: case INSERT_VALUES:
710: for (i=0; i<n; i++) {
711: idx = *indicesx++;
712: idy = *indicesy++;
713: y[idy] = x[idx];
714: y[idy+1] = x[idx+1];
715: y[idy+2] = x[idx+2];
716: y[idy+3] = x[idx+3];
717: }
718: break;
719: case ADD_VALUES:
720: for (i=0; i<n; i++) {
721: idx = *indicesx++;
722: idy = *indicesy++;
723: y[idy] += x[idx];
724: y[idy+1] += x[idx+1];
725: y[idy+2] += x[idx+2];
726: y[idy+3] += x[idx+3];
727: }
728: break;
729: #if !defined(PETSC_USE_COMPLEX)
730: case MAX_VALUES:
731: for (i=0; i<n; i++) {
732: idx = *indicesx++;
733: idy = *indicesy++;
734: y[idy] = PetscMax(y[idy],x[idx]);
735: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
736: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
737: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
738: }
739: #else
740: case MAX_VALUES:
741: #endif
742: case NOT_SET_VALUES:
743: break;
744: }
745: }
746: /* ----------------------------------------------------------------------------------------------- */
747: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
748: {
749: PetscInt i,idx;
751: for (i=0; i<n; i++) {
752: idx = *indicesx++;
753: y[0] = x[idx];
754: y[1] = x[idx+1];
755: y[2] = x[idx+2];
756: y[3] = x[idx+3];
757: y[4] = x[idx+4];
758: y += 5;
759: }
760: }
761: PETSC_STATIC_INLINE void UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
762: {
763: PetscInt i,idy;
765: switch (addv) {
766: case INSERT_VALUES:
767: for (i=0; i<n; i++) {
768: idy = *indicesy++;
769: y[idy] = x[0];
770: y[idy+1] = x[1];
771: y[idy+2] = x[2];
772: y[idy+3] = x[3];
773: y[idy+4] = x[4];
774: x += 5;
775: }
776: break;
777: case ADD_VALUES:
778: for (i=0; i<n; i++) {
779: idy = *indicesy++;
780: y[idy] += x[0];
781: y[idy+1] += x[1];
782: y[idy+2] += x[2];
783: y[idy+3] += x[3];
784: y[idy+4] += x[4];
785: x += 5;
786: }
787: break;
788: #if !defined(PETSC_USE_COMPLEX)
789: case MAX_VALUES:
790: for (i=0; i<n; i++) {
791: idy = *indicesy++;
792: y[idy] = PetscMax(y[idy],x[0]);
793: y[idy+1] = PetscMax(y[idy+1],x[1]);
794: y[idy+2] = PetscMax(y[idy+2],x[2]);
795: y[idy+3] = PetscMax(y[idy+3],x[3]);
796: y[idy+4] = PetscMax(y[idy+4],x[4]);
797: x += 5;
798: }
799: #else
800: case MAX_VALUES:
801: #endif
802: case NOT_SET_VALUES:
803: break;
804: }
805: }
807: PETSC_STATIC_INLINE void Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
808: {
809: PetscInt i,idx,idy;
811: switch (addv) {
812: case INSERT_VALUES:
813: for (i=0; i<n; i++) {
814: idx = *indicesx++;
815: idy = *indicesy++;
816: y[idy] = x[idx];
817: y[idy+1] = x[idx+1];
818: y[idy+2] = x[idx+2];
819: y[idy+3] = x[idx+3];
820: y[idy+4] = x[idx+4];
821: }
822: break;
823: case ADD_VALUES:
824: for (i=0; i<n; i++) {
825: idx = *indicesx++;
826: idy = *indicesy++;
827: y[idy] += x[idx];
828: y[idy+1] += x[idx+1];
829: y[idy+2] += x[idx+2];
830: y[idy+3] += x[idx+3];
831: y[idy+4] += x[idx+4];
832: }
833: break;
834: #if !defined(PETSC_USE_COMPLEX)
835: case MAX_VALUES:
836: for (i=0; i<n; i++) {
837: idx = *indicesx++;
838: idy = *indicesy++;
839: y[idy] = PetscMax(y[idy],x[idx]);
840: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
841: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
842: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
843: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
844: }
845: #else
846: case MAX_VALUES:
847: #endif
848: case NOT_SET_VALUES:
849: break;
850: }
851: }
852: /* ----------------------------------------------------------------------------------------------- */
853: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
854: {
855: PetscInt i,idx;
857: for (i=0; i<n; i++) {
858: idx = *indicesx++;
859: y[0] = x[idx];
860: y[1] = x[idx+1];
861: y[2] = x[idx+2];
862: y[3] = x[idx+3];
863: y[4] = x[idx+4];
864: y[5] = x[idx+5];
865: y += 6;
866: }
867: }
868: PETSC_STATIC_INLINE void UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
869: {
870: PetscInt i,idy;
872: switch (addv) {
873: case INSERT_VALUES:
874: for (i=0; i<n; i++) {
875: idy = *indicesy++;
876: y[idy] = x[0];
877: y[idy+1] = x[1];
878: y[idy+2] = x[2];
879: y[idy+3] = x[3];
880: y[idy+4] = x[4];
881: y[idy+5] = x[5];
882: x += 6;
883: }
884: break;
885: case ADD_VALUES:
886: for (i=0; i<n; i++) {
887: idy = *indicesy++;
888: y[idy] += x[0];
889: y[idy+1] += x[1];
890: y[idy+2] += x[2];
891: y[idy+3] += x[3];
892: y[idy+4] += x[4];
893: y[idy+5] += x[5];
894: x += 6;
895: }
896: break;
897: #if !defined(PETSC_USE_COMPLEX)
898: case MAX_VALUES:
899: for (i=0; i<n; i++) {
900: idy = *indicesy++;
901: y[idy] = PetscMax(y[idy],x[0]);
902: y[idy+1] = PetscMax(y[idy+1],x[1]);
903: y[idy+2] = PetscMax(y[idy+2],x[2]);
904: y[idy+3] = PetscMax(y[idy+3],x[3]);
905: y[idy+4] = PetscMax(y[idy+4],x[4]);
906: y[idy+5] = PetscMax(y[idy+5],x[5]);
907: x += 6;
908: }
909: #else
910: case MAX_VALUES:
911: #endif
912: case NOT_SET_VALUES:
913: break;
914: }
915: }
917: PETSC_STATIC_INLINE void Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
918: {
919: PetscInt i,idx,idy;
921: switch (addv) {
922: case INSERT_VALUES:
923: for (i=0; i<n; i++) {
924: idx = *indicesx++;
925: idy = *indicesy++;
926: y[idy] = x[idx];
927: y[idy+1] = x[idx+1];
928: y[idy+2] = x[idx+2];
929: y[idy+3] = x[idx+3];
930: y[idy+4] = x[idx+4];
931: y[idy+5] = x[idx+5];
932: }
933: break;
934: case ADD_VALUES:
935: for (i=0; i<n; i++) {
936: idx = *indicesx++;
937: idy = *indicesy++;
938: y[idy] += x[idx];
939: y[idy+1] += x[idx+1];
940: y[idy+2] += x[idx+2];
941: y[idy+3] += x[idx+3];
942: y[idy+4] += x[idx+4];
943: y[idy+5] += x[idx+5];
944: }
945: break;
946: #if !defined(PETSC_USE_COMPLEX)
947: case MAX_VALUES:
948: for (i=0; i<n; i++) {
949: idx = *indicesx++;
950: idy = *indicesy++;
951: y[idy] = PetscMax(y[idy],x[idx]);
952: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
953: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
954: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
955: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
956: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
957: }
958: #else
959: case MAX_VALUES:
960: #endif
961: case NOT_SET_VALUES:
962: break;
963: }
964: }
965: /* ----------------------------------------------------------------------------------------------- */
966: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
967: {
968: PetscInt i,idx;
970: for (i=0; i<n; i++) {
971: idx = *indicesx++;
972: y[0] = x[idx];
973: y[1] = x[idx+1];
974: y[2] = x[idx+2];
975: y[3] = x[idx+3];
976: y[4] = x[idx+4];
977: y[5] = x[idx+5];
978: y[6] = x[idx+6];
979: y += 7;
980: }
981: }
982: PETSC_STATIC_INLINE void UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
983: {
984: PetscInt i,idy;
986: switch (addv) {
987: case INSERT_VALUES:
988: for (i=0; i<n; i++) {
989: idy = *indicesy++;
990: y[idy] = x[0];
991: y[idy+1] = x[1];
992: y[idy+2] = x[2];
993: y[idy+3] = x[3];
994: y[idy+4] = x[4];
995: y[idy+5] = x[5];
996: y[idy+6] = x[6];
997: x += 7;
998: }
999: break;
1000: case ADD_VALUES:
1001: for (i=0; i<n; i++) {
1002: idy = *indicesy++;
1003: y[idy] += x[0];
1004: y[idy+1] += x[1];
1005: y[idy+2] += x[2];
1006: y[idy+3] += x[3];
1007: y[idy+4] += x[4];
1008: y[idy+5] += x[5];
1009: y[idy+6] += x[6];
1010: x += 7;
1011: }
1012: break;
1013: #if !defined(PETSC_USE_COMPLEX)
1014: case MAX_VALUES:
1015: for (i=0; i<n; i++) {
1016: idy = *indicesy++;
1017: y[idy] = PetscMax(y[idy],x[0]);
1018: y[idy+1] = PetscMax(y[idy+1],x[1]);
1019: y[idy+2] = PetscMax(y[idy+2],x[2]);
1020: y[idy+3] = PetscMax(y[idy+3],x[3]);
1021: y[idy+4] = PetscMax(y[idy+4],x[4]);
1022: y[idy+5] = PetscMax(y[idy+5],x[5]);
1023: y[idy+6] = PetscMax(y[idy+6],x[6]);
1024: x += 7;
1025: }
1026: #else
1027: case MAX_VALUES:
1028: #endif
1029: case NOT_SET_VALUES:
1030: break;
1031: }
1032: }
1034: PETSC_STATIC_INLINE void Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1035: {
1036: PetscInt i,idx,idy;
1038: switch (addv) {
1039: case INSERT_VALUES:
1040: for (i=0; i<n; i++) {
1041: idx = *indicesx++;
1042: idy = *indicesy++;
1043: y[idy] = x[idx];
1044: y[idy+1] = x[idx+1];
1045: y[idy+2] = x[idx+2];
1046: y[idy+3] = x[idx+3];
1047: y[idy+4] = x[idx+4];
1048: y[idy+5] = x[idx+5];
1049: y[idy+6] = x[idx+6];
1050: }
1051: break;
1052: case ADD_VALUES:
1053: for (i=0; i<n; i++) {
1054: idx = *indicesx++;
1055: idy = *indicesy++;
1056: y[idy] += x[idx];
1057: y[idy+1] += x[idx+1];
1058: y[idy+2] += x[idx+2];
1059: y[idy+3] += x[idx+3];
1060: y[idy+4] += x[idx+4];
1061: y[idy+5] += x[idx+5];
1062: y[idy+6] += x[idx+6];
1063: }
1064: break;
1065: #if !defined(PETSC_USE_COMPLEX)
1066: case MAX_VALUES:
1067: for (i=0; i<n; i++) {
1068: idx = *indicesx++;
1069: idy = *indicesy++;
1070: y[idy] = PetscMax(y[idy],x[idx]);
1071: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1072: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1073: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1074: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1075: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1076: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1077: }
1078: #else
1079: case MAX_VALUES:
1080: #endif
1081: case NOT_SET_VALUES:
1082: break;
1083: }
1084: }
1085: /* ----------------------------------------------------------------------------------------------- */
1086: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1087: {
1088: PetscInt i,idx;
1090: for (i=0; i<n; i++) {
1091: idx = *indicesx++;
1092: y[0] = x[idx];
1093: y[1] = x[idx+1];
1094: y[2] = x[idx+2];
1095: y[3] = x[idx+3];
1096: y[4] = x[idx+4];
1097: y[5] = x[idx+5];
1098: y[6] = x[idx+6];
1099: y[7] = x[idx+7];
1100: y += 8;
1101: }
1102: }
1103: PETSC_STATIC_INLINE void UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1104: {
1105: PetscInt i,idy;
1107: switch (addv) {
1108: case INSERT_VALUES:
1109: for (i=0; i<n; i++) {
1110: idy = *indicesy++;
1111: y[idy] = x[0];
1112: y[idy+1] = x[1];
1113: y[idy+2] = x[2];
1114: y[idy+3] = x[3];
1115: y[idy+4] = x[4];
1116: y[idy+5] = x[5];
1117: y[idy+6] = x[6];
1118: y[idy+7] = x[7];
1119: x += 8;
1120: }
1121: break;
1122: case ADD_VALUES:
1123: for (i=0; i<n; i++) {
1124: idy = *indicesy++;
1125: y[idy] += x[0];
1126: y[idy+1] += x[1];
1127: y[idy+2] += x[2];
1128: y[idy+3] += x[3];
1129: y[idy+4] += x[4];
1130: y[idy+5] += x[5];
1131: y[idy+6] += x[6];
1132: y[idy+7] += x[7];
1133: x += 8;
1134: }
1135: break;
1136: #if !defined(PETSC_USE_COMPLEX)
1137: case MAX_VALUES:
1138: for (i=0; i<n; i++) {
1139: idy = *indicesy++;
1140: y[idy] = PetscMax(y[idy],x[0]);
1141: y[idy+1] = PetscMax(y[idy+1],x[1]);
1142: y[idy+2] = PetscMax(y[idy+2],x[2]);
1143: y[idy+3] = PetscMax(y[idy+3],x[3]);
1144: y[idy+4] = PetscMax(y[idy+4],x[4]);
1145: y[idy+5] = PetscMax(y[idy+5],x[5]);
1146: y[idy+6] = PetscMax(y[idy+6],x[6]);
1147: y[idy+7] = PetscMax(y[idy+7],x[7]);
1148: x += 8;
1149: }
1150: #else
1151: case MAX_VALUES:
1152: #endif
1153: case NOT_SET_VALUES:
1154: break;
1155: }
1156: }
1158: PETSC_STATIC_INLINE void Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1159: {
1160: PetscInt i,idx,idy;
1162: switch (addv) {
1163: case INSERT_VALUES:
1164: for (i=0; i<n; i++) {
1165: idx = *indicesx++;
1166: idy = *indicesy++;
1167: y[idy] = x[idx];
1168: y[idy+1] = x[idx+1];
1169: y[idy+2] = x[idx+2];
1170: y[idy+3] = x[idx+3];
1171: y[idy+4] = x[idx+4];
1172: y[idy+5] = x[idx+5];
1173: y[idy+6] = x[idx+6];
1174: y[idy+7] = x[idx+7];
1175: }
1176: break;
1177: case ADD_VALUES:
1178: for (i=0; i<n; i++) {
1179: idx = *indicesx++;
1180: idy = *indicesy++;
1181: y[idy] += x[idx];
1182: y[idy+1] += x[idx+1];
1183: y[idy+2] += x[idx+2];
1184: y[idy+3] += x[idx+3];
1185: y[idy+4] += x[idx+4];
1186: y[idy+5] += x[idx+5];
1187: y[idy+6] += x[idx+6];
1188: y[idy+7] += x[idx+7];
1189: }
1190: break;
1191: #if !defined(PETSC_USE_COMPLEX)
1192: case MAX_VALUES:
1193: for (i=0; i<n; i++) {
1194: idx = *indicesx++;
1195: idy = *indicesy++;
1196: y[idy] = PetscMax(y[idy],x[idx]);
1197: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1198: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1199: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1200: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1201: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1202: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1203: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1204: }
1205: #else
1206: case MAX_VALUES:
1207: #endif
1208: case NOT_SET_VALUES:
1209: break;
1210: }
1211: }
1213: /* ----------------------------------------------------------------------------------------------- */
1214: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1215: {
1216: PetscInt i,idx;
1218: for (i=0; i<n; i++) {
1219: idx = *indicesx++;
1220: y[0] = x[idx];
1221: y[1] = x[idx+1];
1222: y[2] = x[idx+2];
1223: y[3] = x[idx+3];
1224: y[4] = x[idx+4];
1225: y[5] = x[idx+5];
1226: y[6] = x[idx+6];
1227: y[7] = x[idx+7];
1228: y[8] = x[idx+8];
1229: y[9] = x[idx+9];
1230: y[10] = x[idx+10];
1231: y[11] = x[idx+11];
1232: y += 12;
1233: }
1234: }
1235: PETSC_STATIC_INLINE void UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1236: {
1237: PetscInt i,idy;
1239: switch (addv) {
1240: case INSERT_VALUES:
1241: for (i=0; i<n; i++) {
1242: idy = *indicesy++;
1243: y[idy] = x[0];
1244: y[idy+1] = x[1];
1245: y[idy+2] = x[2];
1246: y[idy+3] = x[3];
1247: y[idy+4] = x[4];
1248: y[idy+5] = x[5];
1249: y[idy+6] = x[6];
1250: y[idy+7] = x[7];
1251: y[idy+8] = x[8];
1252: y[idy+9] = x[9];
1253: y[idy+10] = x[10];
1254: y[idy+11] = x[11];
1255: x += 12;
1256: }
1257: break;
1258: case ADD_VALUES:
1259: for (i=0; i<n; i++) {
1260: idy = *indicesy++;
1261: y[idy] += x[0];
1262: y[idy+1] += x[1];
1263: y[idy+2] += x[2];
1264: y[idy+3] += x[3];
1265: y[idy+4] += x[4];
1266: y[idy+5] += x[5];
1267: y[idy+6] += x[6];
1268: y[idy+7] += x[7];
1269: y[idy+8] += x[8];
1270: y[idy+9] += x[9];
1271: y[idy+10] += x[10];
1272: y[idy+11] += x[11];
1273: x += 12;
1274: }
1275: break;
1276: #if !defined(PETSC_USE_COMPLEX)
1277: case MAX_VALUES:
1278: for (i=0; i<n; i++) {
1279: idy = *indicesy++;
1280: y[idy] = PetscMax(y[idy],x[0]);
1281: y[idy+1] = PetscMax(y[idy+1],x[1]);
1282: y[idy+2] = PetscMax(y[idy+2],x[2]);
1283: y[idy+3] = PetscMax(y[idy+3],x[3]);
1284: y[idy+4] = PetscMax(y[idy+4],x[4]);
1285: y[idy+5] = PetscMax(y[idy+5],x[5]);
1286: y[idy+6] = PetscMax(y[idy+6],x[6]);
1287: y[idy+7] = PetscMax(y[idy+7],x[7]);
1288: y[idy+8] = PetscMax(y[idy+8],x[8]);
1289: y[idy+9] = PetscMax(y[idy+9],x[9]);
1290: y[idy+10] = PetscMax(y[idy+10],x[10]);
1291: y[idy+11] = PetscMax(y[idy+11],x[11]);
1292: x += 12;
1293: }
1294: #else
1295: case MAX_VALUES:
1296: #endif
1297: case NOT_SET_VALUES:
1298: break;
1299: }
1300: }
1302: PETSC_STATIC_INLINE void Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1303: {
1304: PetscInt i,idx,idy;
1306: switch (addv) {
1307: case INSERT_VALUES:
1308: for (i=0; i<n; i++) {
1309: idx = *indicesx++;
1310: idy = *indicesy++;
1311: y[idy] = x[idx];
1312: y[idy+1] = x[idx+1];
1313: y[idy+2] = x[idx+2];
1314: y[idy+3] = x[idx+3];
1315: y[idy+4] = x[idx+4];
1316: y[idy+5] = x[idx+5];
1317: y[idy+6] = x[idx+6];
1318: y[idy+7] = x[idx+7];
1319: y[idy+8] = x[idx+8];
1320: y[idy+9] = x[idx+9];
1321: y[idy+10] = x[idx+10];
1322: y[idy+11] = x[idx+11];
1323: }
1324: break;
1325: case ADD_VALUES:
1326: for (i=0; i<n; i++) {
1327: idx = *indicesx++;
1328: idy = *indicesy++;
1329: y[idy] += x[idx];
1330: y[idy+1] += x[idx+1];
1331: y[idy+2] += x[idx+2];
1332: y[idy+3] += x[idx+3];
1333: y[idy+4] += x[idx+4];
1334: y[idy+5] += x[idx+5];
1335: y[idy+6] += x[idx+6];
1336: y[idy+7] += x[idx+7];
1337: y[idy+8] += x[idx+8];
1338: y[idy+9] += x[idx+9];
1339: y[idy+10] += x[idx+10];
1340: y[idy+11] += x[idx+11];
1341: }
1342: break;
1343: #if !defined(PETSC_USE_COMPLEX)
1344: case MAX_VALUES:
1345: for (i=0; i<n; i++) {
1346: idx = *indicesx++;
1347: idy = *indicesy++;
1348: y[idy] = PetscMax(y[idy],x[idx]);
1349: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1350: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1351: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1352: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1353: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1354: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1355: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1356: y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
1357: y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
1358: y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1359: y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1360: }
1361: #else
1362: case MAX_VALUES:
1363: #endif
1364: case NOT_SET_VALUES:
1365: break;
1366: }
1367: }
1369: /* Create the VecScatterBegin/End_P routines for our chosen block sizes (see the sketch after the includes below) */
1370: #define BS 1
1371: #include "../src/vec/vec/utils/vpscat.h"
1372: #define BS 2
1373: #include "../src/vec/vec/utils/vpscat.h"
1374: #define BS 3
1375: #include "../src/vec/vec/utils/vpscat.h"
1376: #define BS 4
1377: #include "../src/vec/vec/utils/vpscat.h"
1378: #define BS 5
1379: #include "../src/vec/vec/utils/vpscat.h"
1380: #define BS 6
1381: #include "../src/vec/vec/utils/vpscat.h"
1382: #define BS 7
1383: #include "../src/vec/vec/utils/vpscat.h"
1384: #define BS 8
1385: #include "../src/vec/vec/utils/vpscat.h"
1386: #define BS 12
1387: #include "../src/vec/vec/utils/vpscat.h"
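/*
   Editor's sketch (not part of the original source): the repeated #define BS/#include
   pairs above act as a poor man's template. The macro names below are illustrative
   only -- see vpscat.h for the real ones -- but the idea is that the header pastes BS
   into the routine names and dispatches to the matching Pack_/UnPack_/Scatter_ kernels.
*/
#if 0
#define VPSCAT_PASTE2(a,b) a ## _ ## b
#define VPSCAT_PASTE(a,b)  VPSCAT_PASTE2(a,b)
PetscErrorCode VPSCAT_PASTE(VecScatterBegin,BS)(VecScatter ctx,Vec x,Vec y,InsertMode addv,ScatterMode mode)
{
  /* ... pack with VPSCAT_PASTE(Pack,BS)(), start the persistent MPI requests, ... */
  return 0;
}
#undef BS   /* the header is expected to undefine BS so the next block size can be defined */
#endif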
1389: /* ==========================================================================================*/
1391: /* create parallel to sequential scatter context */
1393: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *,VecScatter_MPI_General *,VecScatter);
1397: /*@
1398: VecScatterCreateLocal - Creates a VecScatter from a list of messages it must send and receive.
1400: Collective on VecScatter
1402: Input Parameters:
1403: + ctx - VecScatter context obtained with VecScatterCreateEmpty()
1404: . nsends - number of messages this process will send
1405: . sendSizes - number of entries in each message to be sent
1406: . sendProcs - ranks of the processes each message is sent to
1407: . sendIdx - indices where the sent entries are obtained from (in local, on process numbering), this is one long array of size \sum_{i=0,i<nsends} sendSizes[i]
1408: . nrecvs - number of receives to expect
1409: . recvSizes - number of entries in each message to be received
1410: . recvProcs - ranks of the processes that are sending to me
1411: . recvIdx - indices of where received entries are to be put (in local, on process numbering), this is one long array of size \sum_{i=0,i<nrecvs} recvSizes[i]
1412: - bs - size of block
1414: Notes: sendSizes[] and recvSizes[] cannot have any 0 entries. If you want to support 0 entries you need
1415: to change the code below to "compress out" the sendProcs[] and recvProcs[] entries that have 0 entries.
1417: This probably does not handle sends to self properly; those should be removed from the counts that are used
1418: in allocating space inside the from struct. A usage sketch follows this routine.
1420: Level: intermediate
1422: @*/
1423: PetscErrorCode VecScatterCreateLocal(VecScatter ctx,PetscInt nsends,const PetscInt sendSizes[],const PetscInt sendProcs[],const PetscInt sendIdx[],PetscInt nrecvs,const PetscInt recvSizes[],const PetscInt recvProcs[],const PetscInt recvIdx[],PetscInt bs)
1424: {
1425: VecScatter_MPI_General *from, *to;
1426: PetscInt sendSize, recvSize;
1427: PetscInt n, i;
1428: PetscErrorCode ierr;
1430: /* allocate entire send scatter context */
1431: PetscNewLog(ctx,VecScatter_MPI_General,&to);
1432: to->n = nsends;
1433: for(n = 0, sendSize = 0; n < to->n; n++) {sendSize += sendSizes[n];}
1434: PetscMalloc(to->n*sizeof(MPI_Request),&to->requests);
1435: PetscMalloc4(bs*sendSize,PetscScalar,&to->values,sendSize,PetscInt,&to->indices,to->n+1,PetscInt,&to->starts,to->n,PetscMPIInt,&to->procs);
1436: PetscMalloc2(PetscMax(to->n,nrecvs),MPI_Status,&to->sstatus,PetscMax(to->n,nrecvs),MPI_Status,&to->rstatus);
1437: to->starts[0] = 0;
1438: for(n = 0; n < to->n; n++) {
1439: if (sendSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"sendSizes[n=%D] = %D cannot be less than 1",n,sendSizes[n]);
1440: to->starts[n+1] = to->starts[n] + sendSizes[n];
1441: to->procs[n] = sendProcs[n];
1442: for(i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) {
1443: to->indices[i] = sendIdx[i];
1444: }
1445: }
1446: ctx->todata = (void *) to;
1448: /* allocate entire receive scatter context */
1449: PetscNewLog(ctx,VecScatter_MPI_General,&from);
1450: from->n = nrecvs;
1451: for(n = 0, recvSize = 0; n < from->n; n++) {recvSize += recvSizes[n];}
1452: PetscMalloc(from->n*sizeof(MPI_Request),&from->requests);
1453: PetscMalloc4(bs*recvSize,PetscScalar,&from->values,recvSize,PetscInt,&from->indices,from->n+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1454: from->starts[0] = 0;
1455: for(n = 0; n < from->n; n++) {
1456: if (recvSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"recvSizes[n=%D] = %D cannot be less than 1",n,recvSizes[n]);
1457: from->starts[n+1] = from->starts[n] + recvSizes[n];
1458: from->procs[n] = recvProcs[n];
1459: for(i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) {
1460: from->indices[i] = recvIdx[i];
1461: }
1462: }
1463: ctx->fromdata = (void *)from;
1465: /* No local scatter optimization */
1466: from->local.n = 0;
1467: from->local.vslots = 0;
1468: to->local.n = 0;
1469: to->local.vslots = 0;
1470: from->local.nonmatching_computed = PETSC_FALSE;
1471: from->local.n_nonmatching = 0;
1472: from->local.slots_nonmatching = 0;
1473: to->local.nonmatching_computed = PETSC_FALSE;
1474: to->local.n_nonmatching = 0;
1475: to->local.slots_nonmatching = 0;
1477: from->type = VEC_SCATTER_MPI_GENERAL;
1478: to->type = VEC_SCATTER_MPI_GENERAL;
1479: from->bs = bs;
1480: to->bs = bs;
1481: VecScatterCreateCommon_PtoS(from, to, ctx);
1482: return(0);
1483: }
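/*
   Editor's usage sketch (not part of the original source): each rank sends a block of
   three entries to the next rank and receives three entries from the previous one, with
   bs = 1. It assumes at least two MPI processes; the VecScatterCreateEmpty() call shape
   and the index values are illustrative assumptions only.
*/
#if 0
{
  VecScatter     ctx;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size;
  PetscInt       sendSizes[1] = {3},sendProcs[1],sendIdx[3] = {0,1,2};
  PetscInt       recvSizes[1] = {3},recvProcs[1],recvIdx[3] = {0,1,2};

  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  MPI_Comm_size(PETSC_COMM_WORLD,&size);
  sendProcs[0] = (rank+1)%size;         /* send to the next rank ...                        */
  recvProcs[0] = (rank+size-1)%size;    /* ... so the matching receive is from the previous */
  ierr = VecScatterCreateEmpty(PETSC_COMM_WORLD,&ctx);CHKERRQ(ierr);
  ierr = VecScatterCreateLocal(ctx,1,sendSizes,sendProcs,sendIdx,1,recvSizes,recvProcs,recvIdx,1);CHKERRQ(ierr);
}
#endif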
1485: /*
1486: bs indicates how many elements there are in each block. Normally this would be 1.
1488: Contains a check that PetscMPIInt can handle the sizes needed.
1489: */
1492: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1493: {
1494: VecScatter_MPI_General *from,*to;
1495: PetscMPIInt size,rank,imdex,tag,n;
1496: PetscInt *source = PETSC_NULL,*owners = PETSC_NULL;
1497: PetscInt *lowner = PETSC_NULL,*start = PETSC_NULL,lengthy,lengthx;
1498: PetscMPIInt *nprocs = PETSC_NULL,nrecvs;
1499: PetscInt i,j,idx,nsends;
1500: PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1501: PetscInt *rvalues,*svalues,base,*values,nprocslocal,recvtotal,*rsvalues;
1502: PetscMPIInt *onodes1,*olengths1;
1503: MPI_Comm comm;
1504: MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1505: MPI_Status recv_status,*send_status;
1506: PetscErrorCode ierr;
1509: PetscObjectGetNewTag((PetscObject)ctx,&tag);
1510: PetscObjectGetComm((PetscObject)xin,&comm);
1511: MPI_Comm_rank(comm,&rank);
1512: MPI_Comm_size(comm,&size);
1513: owners = xin->map->range;
1514: VecGetSize(yin,&lengthy);
1515: VecGetSize(xin,&lengthx);
1517: /* first count number of contributors to each processor */
1518: PetscMalloc2(size,PetscMPIInt,&nprocs,nx,PetscInt,&owner);
1519: PetscMemzero(nprocs,size*sizeof(PetscMPIInt));
1520: j = 0;
1521: nsends = 0;
1522: for (i=0; i<nx; i++) {
1523: idx = inidx[i];
1524: if (idx < owners[j]) j = 0;
1525: for (; j<size; j++) {
1526: if (idx < owners[j+1]) {
1527: if (!nprocs[j]++) nsends++;
1528: owner[i] = j;
1529: break;
1530: }
1531: }
1532: }
1533: nprocslocal = nprocs[rank];
1534: nprocs[rank] = 0;
1535: if (nprocslocal) nsends--;
1536: /* inform other processors of number of messages and max length*/
1537: PetscGatherNumberOfMessages(comm,PETSC_NULL,nprocs,&nrecvs);
1538: PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
1539: PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
1540: recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i];
1542: /* post receives: */
1543: PetscMalloc3(recvtotal,PetscInt,&rvalues,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);
1544: count = 0;
1545: for (i=0; i<nrecvs; i++) {
1546: MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
1547: count += olengths1[i];
1548: }
1550: /* do sends:
1551: 1) starts[i] gives the starting index in svalues for stuff going to
1552: the ith processor
1553: */
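/*
   Editor's example (not in the original source): with size = 3 on rank 1 and
   nprocs = {2,0,1} after zeroing the self count, the exclusive prefix sum gives
   starts = {0,2,2}; the two indices owned by rank 0 land in svalues[0..1] and the
   single index owned by rank 2 lands in svalues[2]. Indices owned by this rank are
   skipped here and handled later as the local (self) scatter.
*/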
1554: PetscMalloc3(nx,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size+1,PetscInt,&starts);
1555: starts[0] = 0;
1556: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1557: for (i=0; i<nx; i++) {
1558: if (owner[i] != rank) {
1559: svalues[starts[owner[i]]++] = inidx[i];
1560: }
1561: }
1563: starts[0] = 0;
1564: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1565: count = 0;
1566: for (i=0; i<size; i++) {
1567: if (nprocs[i]) {
1568: MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);
1569: }
1570: }
1572: /* wait on receives */
1573: count = nrecvs;
1574: slen = 0;
1575: while (count) {
1576: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1577: /* unpack receives into our local space */
1578: MPI_Get_count(&recv_status,MPIU_INT,&n);
1579: slen += n;
1580: count--;
1581: }
1583: if (slen != recvtotal) SETERRQ2(PETSC_ERR_PLIB,"Total message lengths %D not expected %D",slen,recvtotal);
1584:
1585: /* allocate entire send scatter context */
1586: PetscNewLog(ctx,VecScatter_MPI_General,&to);
1587: to->n = nrecvs;
1588: PetscMalloc(nrecvs*sizeof(MPI_Request),&to->requests);
1589: PetscMalloc4(bs*slen,PetscScalar,&to->values,slen,PetscInt,&to->indices,nrecvs+1,PetscInt,&to->starts,nrecvs,PetscMPIInt,&to->procs);
1590: PetscMalloc2(PetscMax(to->n,nsends),MPI_Status,&to->sstatus,PetscMax(to->n,nsends),MPI_Status,&to->rstatus);
1591: ctx->todata = (void*)to;
1592: to->starts[0] = 0;
1594: if (nrecvs) {
1596: /* move the data into the send scatter */
1597: base = owners[rank];
1598: rsvalues = rvalues;
1599: for (i=0; i<nrecvs; i++) {
1600: to->starts[i+1] = to->starts[i] + olengths1[i];
1601: to->procs[i] = onodes1[i];
1602: values = rsvalues;
1603: rsvalues += olengths1[i];
1604: for (j=0; j<olengths1[i]; j++) {
1605: to->indices[to->starts[i] + j] = values[j] - base;
1606: }
1607: }
1608: }
1609: PetscFree(olengths1);
1610: PetscFree(onodes1);
1611: PetscFree3(rvalues,source,recv_waits);
1613: /* allocate entire receive scatter context */
1614: PetscNewLog(ctx,VecScatter_MPI_General,&from);
1615: from->n = nsends;
1617: PetscMalloc(nsends*sizeof(MPI_Request),&from->requests);
1618: PetscMalloc4((ny-nprocslocal)*bs,PetscScalar,&from->values,ny-nprocslocal,PetscInt,&from->indices,nsends+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1619: ctx->fromdata = (void*)from;
1621: /* move data into receive scatter */
1622: PetscMalloc2(size,PetscInt,&lowner,nsends+1,PetscInt,&start);
1623: count = 0; from->starts[0] = start[0] = 0;
1624: for (i=0; i<size; i++) {
1625: if (nprocs[i]) {
1626: lowner[i] = count;
1627: from->procs[count++] = i;
1628: from->starts[count] = start[count] = start[count-1] + nprocs[i];
1629: }
1630: }
1632: for (i=0; i<nx; i++) {
1633: if (owner[i] != rank) {
1634: from->indices[start[lowner[owner[i]]]++] = inidy[i];
1635: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1636: }
1637: }
1638: PetscFree2(lowner,start);
1639: PetscFree2(nprocs,owner);
1640:
1641: /* wait on sends */
1642: if (nsends) {
1643: PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
1644: MPI_Waitall(nsends,send_waits,send_status);
1645: PetscFree(send_status);
1646: }
1647: PetscFree3(svalues,send_waits,starts);
1649: if (nprocslocal) {
1650: PetscInt nt = from->local.n = to->local.n = nprocslocal;
1651: /* we have a scatter to ourselves */
1652: PetscMalloc(nt*sizeof(PetscInt),&to->local.vslots);
1653: PetscMalloc(nt*sizeof(PetscInt),&from->local.vslots);
1654: nt = 0;
1655: for (i=0; i<nx; i++) {
1656: idx = inidx[i];
1657: if (idx >= owners[rank] && idx < owners[rank+1]) {
1658: to->local.vslots[nt] = idx - owners[rank];
1659: from->local.vslots[nt++] = inidy[i];
1660: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1661: }
1662: }
1663: } else {
1664: from->local.n = 0;
1665: from->local.vslots = 0;
1666: to->local.n = 0;
1667: to->local.vslots = 0;
1668: }
1670: from->local.nonmatching_computed = PETSC_FALSE;
1671: from->local.n_nonmatching = 0;
1672: from->local.slots_nonmatching = 0;
1673: to->local.nonmatching_computed = PETSC_FALSE;
1674: to->local.n_nonmatching = 0;
1675: to->local.slots_nonmatching = 0;
1677: from->type = VEC_SCATTER_MPI_GENERAL;
1678: to->type = VEC_SCATTER_MPI_GENERAL;
1679: from->bs = bs;
1680: to->bs = bs;
1681: VecScatterCreateCommon_PtoS(from,to,ctx);
1682: return(0);
1683: }
1685: /*
1686: bs indicates how many elements there are in each block. Normally this would be 1.
1687: */
1690: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1691: {
1692: MPI_Comm comm = ((PetscObject)ctx)->comm;
1693: PetscMPIInt tag = ((PetscObject)ctx)->tag, tagr;
1694: PetscInt bs = to->bs;
1695: PetscMPIInt size;
1696: PetscInt i, n;
1698:
1700: PetscObjectGetNewTag((PetscObject)ctx,&tagr);
1701: ctx->destroy = VecScatterDestroy_PtoP;
1703: PetscOptionsHasName(PETSC_NULL,"-vecscatter_reproduce",&ctx->reproduce);
1704: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1705: from->sendfirst = to->sendfirst;
1707: MPI_Comm_size(comm,&size);
1708: /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
1709: to->contiq = PETSC_FALSE;
1710: n = from->starts[from->n];
1711: from->contiq = PETSC_TRUE;
1712: for (i=1; i<n; i++) {
1713: if (from->indices[i] != from->indices[i-1] + bs) {
1714: from->contiq = PETSC_FALSE;
1715: break;
1716: }
1717: }
1719: PetscOptionsHasName(PETSC_NULL,"-vecscatter_alltoall",&to->use_alltoallv);
1720: from->use_alltoallv = to->use_alltoallv;
1721: if (from->use_alltoallv) PetscInfo(ctx,"Using MPI_Alltoallv() for scatter\n");
1722: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
1723: if (to->use_alltoallv) {
1724: PetscOptionsHasName(PETSC_NULL,"-vecscatter_nopack",&to->use_alltoallw);
1725: }
1726: from->use_alltoallw = to->use_alltoallw;
1727: if (from->use_alltoallw) PetscInfo(ctx,"Using MPI_Alltoallw() for scatter\n");
1728: #endif
1730: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1731: PetscOptionsHasName(PETSC_NULL,"-vecscatter_window",&to->use_window);
1732: from->use_window = to->use_window;
1733: #endif
1735: if (to->use_alltoallv) {
1737: PetscMalloc2(size,PetscMPIInt,&to->counts,size,PetscMPIInt,&to->displs);
1738: PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1739: for (i=0; i<to->n; i++) {
1740: to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
1741: }
1742: to->displs[0] = 0;
1743: for (i=1; i<size; i++) {
1744: to->displs[i] = to->displs[i-1] + to->counts[i-1];
1745: }
1747: PetscMalloc2(size,PetscMPIInt,&from->counts,size,PetscMPIInt,&from->displs);
1748: PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1749: for (i=0; i<from->n; i++) {
1750: from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1751: }
1752: from->displs[0] = 0;
1753: for (i=1; i<size; i++) {
1754: from->displs[i] = from->displs[i-1] + from->counts[i-1];
1755: }
1756: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
1757: if (to->use_alltoallw) {
1758: PetscMPIInt mpibs = PetscMPIIntCast(bs), mpilen;
1759: ctx->packtogether = PETSC_FALSE;
1760: PetscMalloc3(size,PetscMPIInt,&to->wcounts,size,PetscMPIInt,&to->wdispls,size,MPI_Datatype,&to->types);
1761: PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1762: PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1763: for (i=0; i<size; i++) {
1764: to->types[i] = MPIU_SCALAR;
1765: }
1767: for (i=0; i<to->n; i++) {
1768: to->wcounts[to->procs[i]] = 1;
1769: mpilen = PetscMPIIntCast(to->starts[i+1]-to->starts[i]);
1770: MPI_Type_create_indexed_block(mpilen,mpibs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1771: MPI_Type_commit(to->types+to->procs[i]);
1772: }
1773: PetscMalloc3(size,PetscMPIInt,&from->wcounts,size,PetscMPIInt,&from->wdispls,size,MPI_Datatype,&from->types);
1774: PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
1775: PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
1776: for (i=0; i<size; i++) {
1777: from->types[i] = MPIU_SCALAR;
1778: }
1779: if (from->contiq) {
1780: PetscInfo(ctx,"Scattered vector entries are stored contiquously, taking advantage of this with -vecscatter_alltoall\n");
1781: for (i=0; i<from->n; i++) {
1782: from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1783: }
1784: if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
1785: for (i=1; i<from->n; i++) {
1786: from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
1787: }
1788: } else {
1789: for (i=0; i<from->n; i++) {
1790: from->wcounts[from->procs[i]] = 1;
1791: mpilen = PetscMPIIntCast(from->starts[i+1]-from->starts[i]);
1792: MPI_Type_create_indexed_block(mpilen,mpibs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
1793: MPI_Type_commit(from->types+from->procs[i]);
1794: }
1795: }
1796: }
1797: #else
1798: to->use_alltoallw = PETSC_FALSE;
1799: from->use_alltoallw = PETSC_FALSE;
1800: #endif
1801: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1802: } else if (to->use_window) {
1803: PetscMPIInt temptag,winsize;
1804: MPI_Request *request;
1805: MPI_Status *status;
1806:
1807: PetscObjectGetNewTag((PetscObject)ctx,&temptag);
1808: winsize = (to->n ? to->starts[to->n] : 0)*bs*sizeof(PetscScalar);
1809: MPI_Win_create(to->values ? to->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&to->window);
1810: PetscMalloc(to->n*sizeof(PetscInt),&to->winstarts);
1811: PetscMalloc2(to->n,MPI_Request,&request,to->n,MPI_Status,&status);
1812: for (i=0; i<to->n; i++) {
1813: MPI_Irecv(to->winstarts+i,1,MPIU_INT,to->procs[i],temptag,comm,request+i);
1814: }
1815: for (i=0; i<from->n; i++) {
1816: MPI_Send(from->starts+i,1,MPIU_INT,from->procs[i],temptag,comm);
1817: }
1818: MPI_Waitall(to->n,request,status);
1819: PetscFree2(request,status);
1821: winsize = (from->n ? from->starts[from->n] : 0)*sizeof(PetscScalar);
1822: MPI_Win_create(from->values ? from->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&from->window);
1823: PetscMalloc(from->n*sizeof(PetscMPIInt),&from->winstarts);
1824: PetscMalloc2(from->n,MPI_Request,&request,from->n,MPI_Status,&status);
1825: for (i=0; i<from->n; i++) {
1826: MPI_Irecv(from->winstarts+i,1,MPIU_INT,from->procs[i],temptag,comm,request+i);
1827: }
1828: for (i=0; i<to->n; i++) {
1829: MPI_Send(to->starts+i,1,MPIU_INT,to->procs[i],temptag,comm);
1830: }
1831: MPI_Waitall(from->n,request,status);
1832: PetscFree2(request,status);
1833: #endif
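/*
   [Illustrative sketch; not part of the original vpscat.c and compiled out below.]
   The window branch above exposes the send/receive buffers with MPI_Win_create and
   exchanges the offsets (winstarts) at which each neighbor should deposit its data.
   The sketch shows only the generic expose/put pattern inside a fence epoch, with
   placeholder names; the actual scatter kernels elsewhere in this file manage the
   epochs and offsets themselves.
*/
#if 0
static int window_put_sketch(MPI_Comm comm,double *recvbuf,int nrecv,
                             double *sendbuf,int nsend,int target,int target_offset)
{
  MPI_Win win;
  /* expose my receive buffer; displacement unit is one double */
  MPI_Win_create(recvbuf,(MPI_Aint)(nrecv*sizeof(double)),(int)sizeof(double),MPI_INFO_NULL,comm,&win);
  MPI_Win_fence(0,win);
  /* write my contribution directly into the target's buffer at the agreed offset */
  MPI_Put(sendbuf,nsend,MPI_DOUBLE,target,(MPI_Aint)target_offset,nsend,MPI_DOUBLE,win);
  MPI_Win_fence(0,win);
  return MPI_Win_free(&win);
}
#endif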
1834: } else {
1835: PetscTruth use_rsend = PETSC_FALSE, use_ssend = PETSC_FALSE;
1836: PetscInt *sstarts = to->starts, *rstarts = from->starts;
1837: PetscMPIInt *sprocs = to->procs, *rprocs = from->procs;
1838: MPI_Request *swaits = to->requests,*rwaits = from->requests;
1839: MPI_Request *rev_swaits,*rev_rwaits;
1840: PetscScalar *Ssvalues = to->values, *Srvalues = from->values;
1842: /* allocate additional wait variables for the "reverse" scatter */
1843: PetscMalloc(to->n*sizeof(MPI_Request),&rev_rwaits);
1844: PetscMalloc(from->n*sizeof(MPI_Request),&rev_swaits);
1845: to->rev_requests = rev_rwaits;
1846: from->rev_requests = rev_swaits;
1848: /* Register the persistent requests used later: sends for the reverse scatter, then forward sends, then the matching receives */
1849: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&use_rsend);
1850: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&use_ssend);
1851: if (use_rsend) {
1852: PetscInfo(ctx,"Using VecScatter ready receiver mode\n");
1853: to->use_readyreceiver = PETSC_TRUE;
1854: from->use_readyreceiver = PETSC_TRUE;
1855: } else {
1856: to->use_readyreceiver = PETSC_FALSE;
1857: from->use_readyreceiver = PETSC_FALSE;
1858: }
1859: if (use_ssend) {
1860: PetscInfo(ctx,"Using VecScatter Ssend mode\n");
1861: }
1863: for (i=0; i<from->n; i++) {
1864: if (use_rsend) {
1865: MPI_Rsend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1866: } else if (use_ssend) {
1867: MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1868: } else {
1869: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1870: }
1871: }
1873: for (i=0; i<to->n; i++) {
1874: if (use_rsend) {
1875: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1876: } else if (use_ssend) {
1877: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1878: } else {
1879: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1880: }
1881: }
1882: /* Register receives for scatter and reverse */
1883: for (i=0; i<from->n; i++) {
1884: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
1885: }
1886: for (i=0; i<to->n; i++) {
1887: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
1888: }
1889: if (use_rsend) {
1890: if (to->n) {MPI_Startall_irecv(to->starts[to->n]*to->bs,to->n,to->rev_requests);}
1891: if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
1892: MPI_Barrier(comm);
1893: }
1895: ctx->copy = VecScatterCopy_PtoP_X;
1896: }
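/*
   [Illustrative sketch; not part of the original vpscat.c and compiled out below.]
   The branch above creates persistent requests with MPI_Send_init/MPI_Rsend_init/
   MPI_Ssend_init and MPI_Recv_init; each scatter then only starts and waits on them.
   The sketch shows that lifecycle for a single send with placeholder arguments.
*/
#if 0
static int persistent_request_sketch(MPI_Comm comm,double *buf,int count,int peer,int tag)
{
  MPI_Request req;
  MPI_Status  status;
  MPI_Send_init(buf,count,MPI_DOUBLE,peer,tag,comm,&req); /* build the request once      */
  MPI_Start(&req);                                        /* begin one communication     */
  MPI_Wait(&req,&status);                                 /* complete it; req is reusable */
  return MPI_Request_free(&req);                          /* release when done for good  */
}
#endif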
1897: PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);
1898: switch (bs) {
1899: case 12:
1900: ctx->begin = VecScatterBegin_12;
1901: ctx->end = VecScatterEnd_12;
1902: break;
1903: case 8:
1904: ctx->begin = VecScatterBegin_8;
1905: ctx->end = VecScatterEnd_8;
1906: break;
1907: case 7:
1908: ctx->begin = VecScatterBegin_7;
1909: ctx->end = VecScatterEnd_7;
1910: break;
1911: case 6:
1912: ctx->begin = VecScatterBegin_6;
1913: ctx->end = VecScatterEnd_6;
1914: break;
1915: case 5:
1916: ctx->begin = VecScatterBegin_5;
1917: ctx->end = VecScatterEnd_5;
1918: break;
1919: case 4:
1920: ctx->begin = VecScatterBegin_4;
1921: ctx->end = VecScatterEnd_4;
1922: break;
1923: case 3:
1924: ctx->begin = VecScatterBegin_3;
1925: ctx->end = VecScatterEnd_3;
1926: break;
1927: case 2:
1928: ctx->begin = VecScatterBegin_2;
1929: ctx->end = VecScatterEnd_2;
1930: break;
1931: case 1:
1932: ctx->begin = VecScatterBegin_1;
1933: ctx->end = VecScatterEnd_1;
1934: break;
1935: default:
1936: SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
1937: }
1938: ctx->view = VecScatterView_MPI;
1939: /* Check if the local scatter is actually a copy; important special case */
1940: if (to->local.n) {
1941: VecScatterLocalOptimizeCopy_Private(ctx,&to->local,&from->local,bs);
1942: }
1943: return(0);
1944: }
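/*
   [Illustrative sketch; not part of the original vpscat.c and compiled out below.]
   How an application reaches the code paths configured above: the communication
   strategy is selected at runtime with options read in this file, e.g.
   -vecscatter_alltoall, -vecscatter_rsend, -vecscatter_ssend (other paths have
   their own options whose exact names may differ by release).  The calls below
   assume the same PETSc generation as this source; x, y, ix, iy are placeholders.
*/
#if 0
static PetscErrorCode scatter_usage_sketch(Vec x,IS ix,Vec y,IS iy)
{
  PetscErrorCode ierr;
  VecScatter     scat;

  ierr = VecScatterCreate(x,ix,y,iy,&scat);CHKERRQ(ierr);  /* builds the to/from data above */
  ierr = VecScatterBegin(scat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(scat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterDestroy(scat);CHKERRQ(ierr);
  return(0);
}
#endif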
1948: /* ------------------------------------------------------------------------------------*/
1949: /*
1950: Scatter from local Seq vectors to a parallel vector.
1951: Reverses the order of the arguments, calls VecScatterCreate_PtoS() then
1952: reverses the result.
1953: */
1956: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1957: {
1958: PetscErrorCode ierr;
1959: MPI_Request *waits;
1960: VecScatter_MPI_General *to,*from;
1963: VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
1964: to = (VecScatter_MPI_General*)ctx->fromdata;
1965: from = (VecScatter_MPI_General*)ctx->todata;
1966: ctx->todata = (void*)to;
1967: ctx->fromdata = (void*)from;
1968: /* these two are special; they are ALWAYS stored in the to struct */
1969: to->sstatus = from->sstatus;
1970: to->rstatus = from->rstatus;
1972: from->sstatus = 0;
1973: from->rstatus = 0;
1975: waits = from->rev_requests;
1976: from->rev_requests = from->requests;
1977: from->requests = waits;
1978: waits = to->rev_requests;
1979: to->rev_requests = to->requests;
1980: to->requests = waits;
1981: return(0);
1982: }
1984: /* ---------------------------------------------------------------------------------*/
1987: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,VecScatter ctx)
1988: {
1990: PetscMPIInt size,rank,tag,imdex,n;
1991: PetscInt *owners = xin->map->range;
1992: PetscMPIInt *nprocs = PETSC_NULL;
1993: PetscInt i,j,idx,nsends,*local_inidx = PETSC_NULL,*local_inidy = PETSC_NULL;
1994: PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1995: PetscInt *rvalues = PETSC_NULL,*svalues = PETSC_NULL,base,*values = PETSC_NULL,*rsvalues,recvtotal,lastidx;
1996: PetscMPIInt *onodes1,*olengths1,nrecvs;
1997: MPI_Comm comm;
1998: MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1999: MPI_Status recv_status,*send_status = PETSC_NULL;
2000: PetscTruth duplicate = PETSC_FALSE;
2001: #if defined(PETSC_USE_DEBUG)
2002: PetscTruth found = PETSC_FALSE;
2003: #endif
2006: PetscObjectGetNewTag((PetscObject)ctx,&tag);
2007: PetscObjectGetComm((PetscObject)xin,&comm);
2008: MPI_Comm_size(comm,&size);
2009: MPI_Comm_rank(comm,&rank);
2010: if (size == 1) {
2011: VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,1,ctx);
2012: return(0);
2013: }
2015: /*
2016: Each processor ships its (inidx[j],inidy[j]) pairs off to the processor that owns inidx[j];
2017: the redistributed indices are then passed to VecScatterCreate_StoP()
2018: */
2019: /* first count number of contributors to each processor */
2020: PetscMalloc3(size,PetscMPIInt,&nprocs,nx,PetscInt,&owner,(size+1),PetscInt,&starts);
2021: PetscMemzero(nprocs,size*sizeof(PetscMPIInt));
2022: lastidx = -1;
2023: j = 0;
2024: for (i=0; i<nx; i++) {
2025: /* if indices are NOT locally sorted, need to start search at the beginning */
2026: if (lastidx > (idx = inidx[i])) j = 0;
2027: lastidx = idx;
2028: for (; j<size; j++) {
2029: if (idx >= owners[j] && idx < owners[j+1]) {
2030: nprocs[j]++;
2031: owner[i] = j;
2032: #if defined(PETSC_USE_DEBUG)
2033: found = PETSC_TRUE;
2034: #endif
2035: break;
2036: }
2037: }
2038: #if defined(PETSC_USE_DEBUG)
2039: if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2040: found = PETSC_FALSE;
2041: #endif
2042: }
2043: nsends = 0; for (i=0; i<size; i++) { nsends += (nprocs[i] > 0);}
2045: /* inform other processors of number of messages and max length */
2046: PetscGatherNumberOfMessages(comm,PETSC_NULL,nprocs,&nrecvs);
2047: PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2048: PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2049: recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i];
2051: /* post receives: */
2052: PetscMalloc5(2*recvtotal,PetscInt,&rvalues,2*nx,PetscInt,&svalues,nrecvs,MPI_Request,&recv_waits,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);
2054: count = 0;
2055: for (i=0; i<nrecvs; i++) {
2056: MPI_Irecv((rvalues+2*count),2*olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2057: count += olengths1[i];
2058: }
2059: PetscFree(onodes1);
2061: /* do sends:
2062: starts[i] gives the starting index in svalues for the (inidx,inidy) pairs
2063: going to the ith processor
2064: */
2065: starts[0]= 0;
2066: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2067: for (i=0; i<nx; i++) {
2068: svalues[2*starts[owner[i]]] = inidx[i];
2069: svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2070: }
2072: starts[0] = 0;
2073: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2074: count = 0;
2075: for (i=0; i<size; i++) {
2076: if (nprocs[i]) {
2077: MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPIU_INT,i,tag,comm,send_waits+count);
2078: count++;
2079: }
2080: }
2081: PetscFree3(nprocs,owner,starts);
2083: /* wait on receives */
2084: count = nrecvs;
2085: slen = 0;
2086: while (count) {
2087: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2088: /* unpack receives into our local space */
2089: MPI_Get_count(&recv_status,MPIU_INT,&n);
2090: slen += n/2;
2091: count--;
2092: }
2093: if (slen != recvtotal) SETERRQ2(PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);
2094:
2095: PetscMalloc2(slen,PetscInt,&local_inidx,slen,PetscInt,&local_inidy);
2096: base = owners[rank];
2097: count = 0;
2098: rsvalues = rvalues;
2099: for (i=0; i<nrecvs; i++) {
2100: values = rsvalues;
2101: rsvalues += 2*olengths1[i];
2102: for (j=0; j<olengths1[i]; j++) {
2103: local_inidx[count] = values[2*j] - base;
2104: local_inidy[count++] = values[2*j+1];
2105: }
2106: }
2107: PetscFree(olengths1);
2109: /* wait on sends */
2110: if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2111: PetscFree5(rvalues,svalues,recv_waits,send_waits,send_status);
2113: /*
2114: should sort and remove duplicates from local_inidx[],local_inidy[]; the code that would do so is currently compiled out (do_it_slow)
2115: */
2117: #if defined(do_it_slow) /* if enabled, PetscInt start,last must also be declared with the other locals above */
2118: /* sort on the from index */
2119: PetscSortIntWithArray(slen,local_inidx,local_inidy);
2120: start = 0;
2121: while (start < slen) {
2122: count = start+1;
2123: last = local_inidx[start];
2124: while (count < slen && last == local_inidx[count]) count++;
2125: if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2126: /* sort on to index */
2127: PetscSortInt(count-start,local_inidy+start);
2128: }
2129: /* remove duplicates; not most efficient way, but probably good enough */
2130: i = start;
2131: while (i < count-1) {
2132: if (local_inidy[i] != local_inidy[i+1]) {
2133: i++;
2134: } else { /* found a duplicate */
2135: duplicate = PETSC_TRUE;
2136: for (j=i; j<slen-1; j++) {
2137: local_inidx[j] = local_inidx[j+1];
2138: local_inidy[j] = local_inidy[j+1];
2139: }
2140: slen--;
2141: count--;
2142: }
2143: }
2144: start = count;
2145: }
2146: #endif
2147: if (duplicate) {
2148: PetscInfo(ctx,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");
2149: }
2150: VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,1,ctx);
2151: PetscFree2(local_inidx,local_inidy);
2152: return(0);
2153: }
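/*
   [Illustrative sketch; not part of the original vpscat.c and compiled out below.]
   The loop at the top of VecScatterCreate_PtoP() locates the owner of each global
   index by scanning the ownership ranges owners[0..size]; the sketch isolates that
   search (names are placeholders, and a binary search would also work since the
   ranges are monotone).
*/
#if 0
static int find_owner_sketch(int idx,int size,const int *owners)
{
  int j;
  for (j=0; j<size; j++) {
    if (idx >= owners[j] && idx < owners[j+1]) return j;   /* owners[j] <= idx < owners[j+1] */
  }
  return -1;                                               /* index out of range */
}
#endif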