Actual source code: mpiov.c
2: #define PETSCMAT_DLL
4: /*
5: Routines to compute overlapping regions of a parallel MPI matrix
6: and to find submatrices that are shared across processors.
7: */
8: #include "../src/mat/impls/aij/mpi/mpiaij.h"
9: #include "petscbt.h"
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14: EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15: EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
19: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20: {
22: PetscInt i;
25: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: for (i=0; i<ov; ++i) {
27: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
28: }
29: return(0);
30: }
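/*
    Usage sketch (illustrative only, not part of this file): a caller such as the
    additive Schwarz preconditioner (PCASM) typically grows each local index set by
    the requested overlap and then extracts the corresponding submatrices.  A minimal
    sequence, assuming is[] already holds one index set per local subdomain, might be

        MatIncreaseOverlap(A,n_local,is,overlap);
        MatGetSubMatrices(A,n_local,is,is,MAT_INITIAL_MATRIX,&submats);

    Only MatIncreaseOverlap() and MatGetSubMatrices() are the public entry points
    involved; the driver code around them is an assumption.
*/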
32: /*
33: Sample message format:
34: If processor A wants processor B to process some elements corresponding
35: to index sets is[1] and is[5], the message has the following layout:
36: mesg [0] = 2 (no of index sets in the mesg)
37: -----------
38: mesg [1] = 1 => is[1]
39: mesg [2] = sizeof(is[1]);
40: -----------
41: mesg [3] = 5 => is[5]
42: mesg [4] = sizeof(is[5]);
43: -----------
44: mesg [5]
45: mesg [n]    data(is[1])
46: -----------
47: mesg [n+1]
48: mesg [m]    data(is[5])
49: -----------
50:
51: Notes:
52: nrqs - no of requests sent (or to be sent out)
53: nrqr - no of requests received (which have to be or which have been processed)
54: */
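/*
    Worked example of the layout above (assumed sizes): if is[1] contributes rows
    {10,11,12} and is[5] contributes rows {40,41}, the message is

        mesg = { 2,  1,3,  5,2,  10,11,12,  40,41 }

    i.e. one count, two (index-set, length) header pairs, then the row data for
    is[1] followed by the row data for is[5].
*/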
57: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
58: {
59: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
60: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
61: const PetscInt **idx,*idx_i;
62: PetscInt *n,*rtable,**data,len;
64: PetscMPIInt size,rank,tag1,tag2;
65: PetscInt m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
66: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
67: PetscBT *table;
68: MPI_Comm comm;
69: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
70: MPI_Status *s_status,*recv_status;
73: comm = ((PetscObject)C)->comm;
74: size = c->size;
75: rank = c->rank;
76: m = C->rmap->N;
78: PetscObjectGetNewTag((PetscObject)C,&tag1);
79: PetscObjectGetNewTag((PetscObject)C,&tag2);
80:
81: len = (imax+1)*sizeof(PetscInt*)+ (imax + m)*sizeof(PetscInt);
82: PetscMalloc(len,&idx);
83: n = (PetscInt*)(idx + imax);
84: rtable = n + imax;
85:
86: for (i=0; i<imax; i++) {
87: ISGetIndices(is[i],&idx[i]);
88: ISGetLocalSize(is[i],&n[i]);
89: }
90:
91: /* Create hash table for the mapping: row -> proc */
92: for (i=0,j=0; i<size; i++) {
93: len = C->rmap->range[i+1];
94: for (; j<len; j++) {
95: rtable[j] = i;
96: }
97: }
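  /* Illustration (assumed layout): with 3 processes owning rows {0..3}, {4..8}
     and {9..11}, range = {0,4,9,12} and the table becomes
     rtable = {0,0,0,0, 1,1,1,1,1, 2,2,2}, so rtable[row] gives the owning proc. */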
99: /* evaluate communication - mesg to whom, length of mesg, and buffer space
100: required. Based on this, buffers are allocated, and data copied into them */
101: PetscMalloc(size*4*sizeof(PetscMPIInt),&w1);/* mesg size */
102: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
103: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
104: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
105: PetscMemzero(w1,size*3*sizeof(PetscMPIInt)); /* initialise work vector*/
106: for (i=0; i<imax; i++) {
107: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
108: idx_i = idx[i];
109: len = n[i];
110: for (j=0; j<len; j++) {
111: row = idx_i[j];
112: if (row < 0) {
113: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
114: }
115: proc = rtable[row];
116: w4[proc]++;
117: }
118: for (j=0; j<size; j++){
119: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
120: }
121: }
123: nrqs = 0; /* no of outgoing messages */
124: msz = 0; /* total mesg length (for all procs) */
125: w1[rank] = 0; /* no mesg sent to itself */
126: w3[rank] = 0;
127: for (i=0; i<size; i++) {
128: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
129: }
130: /* pa - is list of processors to communicate with */
131: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
132: for (i=0,j=0; i<size; i++) {
133: if (w1[i]) {pa[j] = i; j++;}
134: }
136: /* Each message would have a header = 1 + 2*(no of IS) + data */
137: for (i=0; i<nrqs; i++) {
138: j = pa[i];
139: w1[j] += w2[j] + 2*w3[j];
140: msz += w1[j];
141: }
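  /* Worked example (assumed counts): if two index sets send 4 and 3 rows to proc j,
     then w3[j] = 2 and the data part is w1[j] = 7 entries; adding the header
     w2[j] + 2*w3[j] = 1 + 4 gives a total message length of 12. */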
143: /* Determine the number of messages to expect, their lengths, and the from-ids */
144: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
145: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
147: /* Now post the Irecvs corresponding to these messages */
148: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
150: /* Allocate Memory for outgoing messages */
151: len = 2*size*sizeof(PetscInt*) + (size+msz)*sizeof(PetscInt);
152: PetscMalloc(len,&outdat);
153: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
154: PetscMemzero(outdat,2*size*sizeof(PetscInt*));
155: tmp = (PetscInt*)(outdat + 2*size);
156: ctr = tmp + msz;
158: {
159: PetscInt *iptr = tmp,ict = 0;
160: for (i=0; i<nrqs; i++) {
161: j = pa[i];
162: iptr += ict;
163: outdat[j] = iptr;
164: ict = w1[j];
165: }
166: }
168: /* Form the outgoing messages */
169: /*plug in the headers*/
170: for (i=0; i<nrqs; i++) {
171: j = pa[i];
172: outdat[j][0] = 0;
173: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
174: ptr[j] = outdat[j] + 2*w3[j] + 1;
175: }
176:
177: /* Memory for doing local proc's work*/
178: {
179: PetscInt *d_p;
180: char *t_p;
182: len = (imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
183: (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
184: PetscMalloc(len,&table);
185: PetscMemzero(table,len);
186: data = (PetscInt **)(table + imax);
187: isz = (PetscInt *)(data + imax);
188: d_p = (PetscInt *)(isz + imax);
189: t_p = (char *)(d_p + m*imax);
190: for (i=0; i<imax; i++) {
191: table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
192: data[i] = d_p + (m)*i;
193: }
194: }
196: /* Parse the IS and update local tables and the outgoing buf with the data*/
197: {
198: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
199: PetscBT table_i;
201: for (i=0; i<imax; i++) {
202: PetscMemzero(ctr,size*sizeof(PetscInt));
203: n_i = n[i];
204: table_i = table[i];
205: idx_i = idx[i];
206: data_i = data[i];
207: isz_i = isz[i];
208: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
209: row = idx_i[j];
210: proc = rtable[row];
211: if (proc != rank) { /* copy to the outgoing buffer */
212: ctr[proc]++;
213: *ptr[proc] = row;
214: ptr[proc]++;
215: } else { /* Update the local table */
216: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
217: }
218: }
219: /* Update the headers for the current IS */
220: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
221: if ((ctr_j = ctr[j])) {
222: outdat_j = outdat[j];
223: k = ++outdat_j[0];
224: outdat_j[2*k] = ctr_j;
225: outdat_j[2*k-1] = i;
226: }
227: }
228: isz[i] = isz_i;
229: }
230: }
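  /* Illustration (assumed numbers): if 5 rows of the current index set is[3] belong
     to proc j, the header of outdat[j] gains one (index-set, count) pair -- the
     running count outdat[j][0] is incremented to k and the pair (3,5) is stored as
     outdat[j][2*k-1] = 3, outdat[j][2*k] = 5. */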
231:
234: /* Now post the sends */
235: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
236: for (i=0; i<nrqs; ++i) {
237: j = pa[i];
238: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
239: }
240:
241: /* No longer need the original indices*/
242: for (i=0; i<imax; ++i) {
243: ISRestoreIndices(is[i],idx+i);
244: }
245: PetscFree(idx);
247: for (i=0; i<imax; ++i) {
248: ISDestroy(is[i]);
249: }
250:
251: /* Do Local work*/
252: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);
254: /* Receive messages*/
255: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
256: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
257:
258: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
259: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
261: /* Phase 1 sends are complete - deallocate buffers */
262: PetscFree(outdat);
263: PetscFree(w1);
265: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
266: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
267: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
268: PetscFree(rbuf);
270:
271: /* Send the data back*/
272: /* Do a global reduction to know the buffer space req for incoming messages*/
273: {
274: PetscMPIInt *rw1;
275:
276: PetscMalloc(size*sizeof(PetscMPIInt),&rw1);
277: PetscMemzero(rw1,size*sizeof(PetscMPIInt));
279: for (i=0; i<nrqr; ++i) {
280: proc = recv_status[i].MPI_SOURCE;
281: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
282: rw1[proc] = isz1[i];
283: }
284: PetscFree(onodes1);
285: PetscFree(olengths1);
287: /* Determine the number of messages to expect, their lengths, and the from-ids */
288: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
289: PetscFree(rw1);
290: }
291: /* Now post the Irecvs corresponding to these messages */
292: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
294: /* Now post the sends */
295: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
296: for (i=0; i<nrqr; ++i) {
297: j = recv_status[i].MPI_SOURCE;
298: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
299: }
301: /* receive work done on other processors*/
302: {
303: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
304: PetscMPIInt idex;
305: PetscBT table_i;
306: MPI_Status *status2;
307:
308: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
309: for (i=0; i<nrqs; ++i) {
310: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
311: /* Process the message*/
312: rbuf2_i = rbuf2[idex];
313: ct1 = 2*rbuf2_i[0]+1;
314: jmax = rbuf2[idex][0];
315: for (j=1; j<=jmax; j++) {
316: max = rbuf2_i[2*j];
317: is_no = rbuf2_i[2*j-1];
318: isz_i = isz[is_no];
319: data_i = data[is_no];
320: table_i = table[is_no];
321: for (k=0; k<max; k++,ct1++) {
322: row = rbuf2_i[ct1];
323: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
324: }
325: isz[is_no] = isz_i;
326: }
327: }
329: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
330: PetscFree(status2);
331: }
332:
333: for (i=0; i<imax; ++i) {
334: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
335: }
336:
337: PetscFree(onodes2);
338: PetscFree(olengths2);
340: PetscFree(pa);
341: PetscFree(rbuf2);
342: PetscFree(s_waits1);
343: PetscFree(r_waits1);
344: PetscFree(s_waits2);
345: PetscFree(r_waits2);
346: PetscFree(table);
347: PetscFree(s_status);
348: PetscFree(recv_status);
349: PetscFree(xdata[0]);
350: PetscFree(xdata);
351: PetscFree(isz1);
352: return(0);
353: }
357: /*
358: MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
359: the work on the local processor.
361: Inputs:
362: C - MAT_MPIAIJ;
363: imax - total no of index sets processed at a time;
364: table - an array of bit tables (PetscBT), each of size m bits.
365:
366: Output:
367: isz - array containing the count of the solution elements corresponding
368: to each index set;
369: data - pointer to the solutions
370: */
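/*
    Illustration of the column mapping used below (assumed numbers): with
    cstart = 100 and garray = {3,57,212}, a row of the diagonal block A with local
    columns aj = {0,4} contributes global columns 100 and 104, while a row of the
    off-diagonal block B with bj = {1,2} contributes global columns 57 and 212.
    Each column seen for the first time is appended to data[i] and its bit is set
    in table[i].
*/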
371: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
372: {
373: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
374: Mat A = c->A,B = c->B;
375: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
376: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
377: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
378: PetscBT table_i;
381: rstart = C->rmap->rstart;
382: cstart = C->cmap->rstart;
383: ai = a->i;
384: aj = a->j;
385: bi = b->i;
386: bj = b->j;
387: garray = c->garray;
389:
390: for (i=0; i<imax; i++) {
391: data_i = data[i];
392: table_i = table[i];
393: isz_i = isz[i];
394: for (j=0,max=isz[i]; j<max; j++) {
395: row = data_i[j] - rstart;
396: start = ai[row];
397: end = ai[row+1];
398: for (k=start; k<end; k++) { /* Amat */
399: val = aj[k] + cstart;
400: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
401: }
402: start = bi[row];
403: end = bi[row+1];
404: for (k=start; k<end; k++) { /* Bmat */
405: val = garray[bj[k]];
406: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
407: }
408: }
409: isz[i] = isz_i;
410: }
411: return(0);
412: }
416: /*
417: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
418: and return the output
420: Input:
421: C - the matrix
422: nrqr - no of messages being processed.
423: rbuf - an array of pointers to the received requests
424:
425: Output:
426: xdata - array of messages to be sent back
427: isz1 - size of each message
429: For better efficiency perhaps we should malloc separately each xdata[i],
430: then if a realloc is required we need only copy the data for that one row
431: rather than all previous rows as it is now, where a single large chunk of
432: memory is used.
434: */
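/*
    Illustration of the reallocation strategy below (assumed numbers): if the initial
    mem_estimate is 30 entries and the reply outgrows it, a new buffer of
    (PetscInt)(1.5*30)+1 = 46 entries is malloced, the old contents are copied over,
    xdata[0] is replaced, and the earlier xdata[1..i] pointers are rebased into the
    new buffer before filling continues.
*/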
435: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
436: {
437: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
438: Mat A = c->A,B = c->B;
439: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
441: PetscMPIInt rank;
442: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
443: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
444: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
445: PetscInt *rbuf_i,kmax,rbuf_0;
446: PetscBT xtable;
449: rank = c->rank;
450: m = C->rmap->N;
451: rstart = C->rmap->rstart;
452: cstart = C->cmap->rstart;
453: ai = a->i;
454: aj = a->j;
455: bi = b->i;
456: bj = b->j;
457: garray = c->garray;
458:
459:
460: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
461: rbuf_i = rbuf[i];
462: rbuf_0 = rbuf_i[0];
463: ct += rbuf_0;
464: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
465: }
466:
467: if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
468: else max1 = 1;
469: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
470: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
471: ++no_malloc;
472: PetscBTCreate(m,xtable);
473: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
474:
475: ct3 = 0;
476: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
477: rbuf_i = rbuf[i];
478: rbuf_0 = rbuf_i[0];
479: ct1 = 2*rbuf_0+1;
480: ct2 = ct1;
481: ct3 += ct1;
482: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
483: PetscBTMemzero(m,xtable);
484: oct2 = ct2;
485: kmax = rbuf_i[2*j];
486: for (k=0; k<kmax; k++,ct1++) {
487: row = rbuf_i[ct1];
488: if (!PetscBTLookupSet(xtable,row)) {
489: if (!(ct3 < mem_estimate)) {
490: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
491: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
492: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
493: PetscFree(xdata[0]);
494: xdata[0] = tmp;
495: mem_estimate = new_estimate; ++no_malloc;
496: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
497: }
498: xdata[i][ct2++] = row;
499: ct3++;
500: }
501: }
502: for (k=oct2,max2=ct2; k<max2; k++) {
503: row = xdata[i][k] - rstart;
504: start = ai[row];
505: end = ai[row+1];
506: for (l=start; l<end; l++) {
507: val = aj[l] + cstart;
508: if (!PetscBTLookupSet(xtable,val)) {
509: if (!(ct3 < mem_estimate)) {
510: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
511: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
512: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
513: PetscFree(xdata[0]);
514: xdata[0] = tmp;
515: mem_estimate = new_estimate; ++no_malloc;
516: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
517: }
518: xdata[i][ct2++] = val;
519: ct3++;
520: }
521: }
522: start = bi[row];
523: end = bi[row+1];
524: for (l=start; l<end; l++) {
525: val = garray[bj[l]];
526: if (!PetscBTLookupSet(xtable,val)) {
527: if (!(ct3 < mem_estimate)) {
528: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
529: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
530: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
531: PetscFree(xdata[0]);
532: xdata[0] = tmp;
533: mem_estimate = new_estimate; ++no_malloc;
534: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
535: }
536: xdata[i][ct2++] = val;
537: ct3++;
538: }
539: }
540: }
541: /* Update the header*/
542: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
543: xdata[i][2*j-1] = rbuf_i[2*j-1];
544: }
545: xdata[i][0] = rbuf_0;
546: xdata[i+1] = xdata[i] + ct2;
547: isz1[i] = ct2; /* size of each message */
548: }
549: PetscBTDestroy(xtable);
550: PetscInfo4(C,"[%d] Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);
551: return(0);
552: }
553: /* -------------------------------------------------------------------------*/
554: EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
555: EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
556: /*
557: Every processor gets the entire matrix
558: */
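/*
    Illustration of the gathers below (assumed sizes): with two processes owning 3
    and 2 rows, the row-length gather uses recvcounts = {3,2} and displs = {0,3}, so
    after MPI_Allgatherv every process holds the complete lens[] array of length 5;
    the same pattern is then repeated for the column indices and (optionally) the
    numerical values.
*/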
561: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
562: {
563: Mat B;
564: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
565: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
567: PetscMPIInt size,rank,*recvcounts = 0,*displs = 0;
568: PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
569: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
570: MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
573: MPI_Comm_size(((PetscObject)A)->comm,&size);
574: MPI_Comm_rank(((PetscObject)A)->comm,&rank);
576: if (scall == MAT_INITIAL_MATRIX) {
577: /* ----------------------------------------------------------------
578: Tell every processor the number of nonzeros per row
579: */
580: PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);
581: for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
582: lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
583: }
584: sendcount = A->rmap->rend - A->rmap->rstart;
585: PetscMalloc(2*size*sizeof(PetscMPIInt),&recvcounts);
586: displs = recvcounts + size;
587: for (i=0; i<size; i++) {
588: recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
589: displs[i] = A->rmap->range[i];
590: }
591: #if defined(PETSC_HAVE_MPI_IN_PLACE)
592: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
593: #else
594: MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
595: #endif
596: /* ---------------------------------------------------------------
597: Create the sequential matrix of the same type as the local block diagonal
598: */
599: MatCreate(PETSC_COMM_SELF,&B);
600: MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
601: MatSetType(B,((PetscObject)a->A)->type_name);
602: MatSeqAIJSetPreallocation(B,0,lens);
603: PetscMalloc(sizeof(Mat),Bin);
604: **Bin = B;
605: b = (Mat_SeqAIJ *)B->data;
607: /*--------------------------------------------------------------------
608: Copy my part of matrix column indices over
609: */
610: sendcount = ad->nz + bd->nz;
611: jsendbuf = b->j + b->i[rstarts[rank]];
612: a_jsendbuf = ad->j;
613: b_jsendbuf = bd->j;
614: n = A->rmap->rend - A->rmap->rstart;
615: cnt = 0;
616: for (i=0; i<n; i++) {
618: /* put in lower diagonal portion */
619: m = bd->i[i+1] - bd->i[i];
620: while (m > 0) {
621: /* is it above diagonal (in bd (compressed) numbering) */
622: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
623: jsendbuf[cnt++] = garray[*b_jsendbuf++];
624: m--;
625: }
627: /* put in diagonal portion */
628: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
629: jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
630: }
632: /* put in upper diagonal portion */
633: while (m-- > 0) {
634: jsendbuf[cnt++] = garray[*b_jsendbuf++];
635: }
636: }
637: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
639: /*--------------------------------------------------------------------
640: Gather all column indices to all processors
641: */
642: for (i=0; i<size; i++) {
643: recvcounts[i] = 0;
644: for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
645: recvcounts[i] += lens[j];
646: }
647: }
648: displs[0] = 0;
649: for (i=1; i<size; i++) {
650: displs[i] = displs[i-1] + recvcounts[i-1];
651: }
652: #if defined(PETSC_HAVE_MPI_IN_PLACE)
653: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
654: #else
655: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
656: #endif
657: /*--------------------------------------------------------------------
658: Assemble the matrix into usable form (note numerical values not yet set)
659: */
660: /* set the b->ilen (length of each row) values */
661: PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
662: /* set the b->i indices */
663: b->i[0] = 0;
664: for (i=1; i<=A->rmap->N; i++) {
665: b->i[i] = b->i[i-1] + lens[i-1];
666: }
667: PetscFree(lens);
668: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
669: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
671: } else {
672: B = **Bin;
673: b = (Mat_SeqAIJ *)B->data;
674: }
676: /*--------------------------------------------------------------------
677: Copy my part of matrix numerical values into the values location
678: */
679: if (flag == MAT_GET_VALUES){
680: sendcount = ad->nz + bd->nz;
681: sendbuf = b->a + b->i[rstarts[rank]];
682: a_sendbuf = ad->a;
683: b_sendbuf = bd->a;
684: b_sendj = bd->j;
685: n = A->rmap->rend - A->rmap->rstart;
686: cnt = 0;
687: for (i=0; i<n; i++) {
689: /* put in lower diagonal portion */
690: m = bd->i[i+1] - bd->i[i];
691: while (m > 0) {
692: /* is it above diagonal (in bd (compressed) numbering) */
693: if (garray[*b_sendj] > A->rmap->rstart + i) break;
694: sendbuf[cnt++] = *b_sendbuf++;
695: m--;
696: b_sendj++;
697: }
699: /* put in diagonal portion */
700: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
701: sendbuf[cnt++] = *a_sendbuf++;
702: }
704: /* put in upper diagonal portion */
705: while (m-- > 0) {
706: sendbuf[cnt++] = *b_sendbuf++;
707: b_sendj++;
708: }
709: }
710: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
711:
712: /* -----------------------------------------------------------------
713: Gather all numerical values to all processors
714: */
715: if (!recvcounts) {
716: PetscMalloc(2*size*sizeof(PetscInt),&recvcounts);
717: displs = recvcounts + size;
718: }
719: for (i=0; i<size; i++) {
720: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
721: }
722: displs[0] = 0;
723: for (i=1; i<size; i++) {
724: displs[i] = displs[i-1] + recvcounts[i-1];
725: }
726: recvbuf = b->a;
727: #if defined(PETSC_HAVE_MPI_IN_PLACE)
728: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);
729: #else
730: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);
731: #endif
732: } /* end of (flag == MAT_GET_VALUES) */
733: PetscFree(recvcounts);
735: if (A->symmetric){
736: MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
737: } else if (A->hermitian) {
738: MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
739: } else if (A->structurally_symmetric) {
740: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
741: }
742: return(0);
743: }
747: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
748: {
750: PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
751: PetscTruth rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
754: /*
755: Check for special case: each processor gets the entire matrix
756: */
757: if (ismax == 1 && C->rmap->N == C->cmap->N) {
758: ISIdentity(*isrow,&rowflag);
759: ISIdentity(*iscol,&colflag);
760: ISGetLocalSize(*isrow,&nrow);
761: ISGetLocalSize(*iscol,&ncol);
762: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
763: wantallmatrix = PETSC_TRUE;
764: PetscOptionsGetTruth(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);
765: }
766: }
767: MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);
768: if (twantallmatrix) {
769: MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
770: return(0);
771: }
773: /* Allocate memory to hold all the submatrices */
774: if (scall != MAT_REUSE_MATRIX) {
775: PetscMalloc((ismax+1)*sizeof(Mat),submat);
776: }
777: /* Determine the number of stages through which submatrices are done */
778: nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
779: if (!nmax) nmax = 1;
780: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
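  /* Worked example (assumed sizes): with C->cmap->N = 1000000 columns and a 4-byte
     PetscInt, nmax = 20000000/4000000 = 5 index sets per stage, so ismax = 12 gives
     nstages_local = 12/5 + 1 = 3 stages. */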
782: /* Make sure every processor loops through the nstages */
783: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);
785: for (i=0,pos=0; i<nstages; i++) {
786: if (pos+nmax <= ismax) max_no = nmax;
787: else if (pos == ismax) max_no = 0;
788: else max_no = ismax-pos;
789: MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
790: pos += max_no;
791: }
792: return(0);
793: }
794: /* -------------------------------------------------------------------------*/
797: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
798: {
799: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
800: Mat A = c->A;
801: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
802: const PetscInt **icol,**irow;
803: PetscInt *nrow,*ncol,start;
805: PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
806: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
807: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
808: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
809: PetscInt **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
810: const PetscInt *irow_i;
811: PetscInt len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
812: PetscInt *rmap_i;
813: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
814: MPI_Request *r_waits4,*s_waits3,*s_waits4;
815: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
816: MPI_Status *r_status3,*r_status4,*s_status4;
817: MPI_Comm comm;
818: PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
819: PetscTruth sorted;
820: PetscMPIInt *onodes1,*olengths1;
821: PetscMPIInt idex,idex2,end;
824: comm = ((PetscObject)C)->comm;
825: tag0 = ((PetscObject)C)->tag;
826: size = c->size;
827: rank = c->rank;
828:
829: /* Get some new tags to keep the communication clean */
830: PetscObjectGetNewTag((PetscObject)C,&tag1);
831: PetscObjectGetNewTag((PetscObject)C,&tag2);
832: PetscObjectGetNewTag((PetscObject)C,&tag3);
834: /* Check if the row and col indices are sorted */
835: for (i=0; i<ismax; i++) {
836: ISSorted(isrow[i],&sorted);
837: /*if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
838: ISSorted(iscol[i],&sorted);
839: /* if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
840: }
842: len = (2*ismax+1)*(sizeof(PetscInt*)+ sizeof(PetscInt));
843: PetscMalloc(len,&irow);
844: icol = irow + ismax;
845: nrow = (PetscInt*)(icol + ismax);
846: ncol = nrow + ismax;
848: for (i=0; i<ismax; i++) {
849: ISGetIndices(isrow[i],&irow[i]);
850: ISGetIndices(iscol[i],&icol[i]);
851: ISGetLocalSize(isrow[i],&nrow[i]);
852: ISGetLocalSize(iscol[i],&ncol[i]);
853: }
855: /* evaluate communication - mesg to whom, length of mesg, and buffer space
856: required. Based on this, buffers are allocated, and data copied into them*/
857: PetscMalloc(size*4*sizeof(PetscMPIInt),&w1); /* mesg size */
858: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
859: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
860: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
861: PetscMemzero(w1,size*3*sizeof(PetscMPIInt)); /* initialize work vector*/
862: for (i=0; i<ismax; i++) {
863: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
864: jmax = nrow[i];
865: irow_i = irow[i];
866: for (j=0; j<jmax; j++) {
867: l = 0;
868: row = irow_i[j];
869: while (row >= C->rmap->range[l+1]) l++;
870: proc = l;
871: w4[proc]++;
872: }
873: for (j=0; j<size; j++) {
874: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
875: }
876: }
877:
878: nrqs = 0; /* no of outgoing messages */
879: msz = 0; /* total mesg length (for all procs) */
880: w1[rank] = 0; /* no mesg sent to self */
881: w3[rank] = 0;
882: for (i=0; i<size; i++) {
883: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
884: }
885: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
886: for (i=0,j=0; i<size; i++) {
887: if (w1[i]) { pa[j] = i; j++; }
888: }
890: /* Each message would have a header = 1 + 2*(no of IS) + data */
891: for (i=0; i<nrqs; i++) {
892: j = pa[i];
893: w1[j] += w2[j] + 2* w3[j];
894: msz += w1[j];
895: }
897: /* Determine the number of messages to expect, their lengths, and the from-ids */
898: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
899: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
901: /* Now post the Irecvs corresponding to these messages */
902: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
903:
904: PetscFree(onodes1);
905: PetscFree(olengths1);
906:
907: /* Allocate Memory for outgoing messages */
908: len = 2*size*sizeof(PetscInt*) + 2*msz*sizeof(PetscInt) + size*sizeof(PetscInt);
909: PetscMalloc(len,&sbuf1);
910: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
911: PetscMemzero(sbuf1,2*size*sizeof(PetscInt*));
912: /* allocate memory for outgoing data + buf to receive the first reply */
913: tmp = (PetscInt*)(ptr + size);
914: ctr = tmp + 2*msz;
916: {
917: PetscInt *iptr = tmp,ict = 0;
918: for (i=0; i<nrqs; i++) {
919: j = pa[i];
920: iptr += ict;
921: sbuf1[j] = iptr;
922: ict = w1[j];
923: }
924: }
926: /* Form the outgoing messages */
927: /* Initialize the header space */
928: for (i=0; i<nrqs; i++) {
929: j = pa[i];
930: sbuf1[j][0] = 0;
931: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
932: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
933: }
934:
935: /* Parse the isrow and copy data into outbuf */
936: for (i=0; i<ismax; i++) {
937: PetscMemzero(ctr,size*sizeof(PetscInt));
938: irow_i = irow[i];
939: jmax = nrow[i];
940: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
941: l = 0;
942: row = irow_i[j];
943: while (row >= C->rmap->range[l+1]) l++;
944: proc = l;
945: if (proc != rank) { /* copy to the outgoing buf*/
946: ctr[proc]++;
947: *ptr[proc] = row;
948: ptr[proc]++;
949: }
950: }
951: /* Update the headers for the current IS */
952: for (j=0; j<size; j++) { /* Can Optimise this loop too */
953: if ((ctr_j = ctr[j])) {
954: sbuf1_j = sbuf1[j];
955: k = ++sbuf1_j[0];
956: sbuf1_j[2*k] = ctr_j;
957: sbuf1_j[2*k-1] = i;
958: }
959: }
960: }
962: /* Now post the sends */
963: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
964: for (i=0; i<nrqs; ++i) {
965: j = pa[i];
966: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
967: }
969: /* Post Receives to capture the buffer size */
970: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
971: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
972: rbuf2[0] = tmp + msz;
973: for (i=1; i<nrqs; ++i) {
974: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
975: }
976: for (i=0; i<nrqs; ++i) {
977: j = pa[i];
978: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
979: }
981: /* Send to other procs the buf size they should allocate */
982:
984: /* Receive messages*/
985: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
986: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
987: len = 2*nrqr*sizeof(PetscInt) + (nrqr+1)*sizeof(PetscInt*);
988: PetscMalloc(len,&sbuf2);
989: req_size = (PetscInt*)(sbuf2 + nrqr);
990: req_source = req_size + nrqr;
991:
992: {
993: Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
994: PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
995: PetscInt *sbuf2_i;
997: for (i=0; i<nrqr; ++i) {
998: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
999: req_size[idex] = 0;
1000: rbuf1_i = rbuf1[idex];
1001: start = 2*rbuf1_i[0] + 1;
1002: MPI_Get_count(r_status1+i,MPIU_INT,&end);
1003: PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);
1004: sbuf2_i = sbuf2[idex];
1005: for (j=start; j<end; j++) {
1006: id = rbuf1_i[j] - rstart;
1007: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1008: sbuf2_i[j] = ncols;
1009: req_size[idex] += ncols;
1010: }
1011: req_source[idex] = r_status1[i].MPI_SOURCE;
1012: /* form the header */
1013: sbuf2_i[0] = req_size[idex];
1014: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
1015: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
1016: }
1017: }
1018: PetscFree(r_status1);
1019: PetscFree(r_waits1);
1021: /* recv buffer sizes */
1022: /* Receive messages*/
1023:
1024: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
1025: PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);
1026: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
1027: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
1028: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
1030: for (i=0; i<nrqs; ++i) {
1031: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1032: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);
1033: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);
1034: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1035: MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1036: }
1037: PetscFree(r_status2);
1038: PetscFree(r_waits2);
1039:
1040: /* Wait on sends1 and sends2 */
1041: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
1042: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
1044: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
1045: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
1046: PetscFree(s_status1);
1047: PetscFree(s_status2);
1048: PetscFree(s_waits1);
1049: PetscFree(s_waits2);
1051: /* Now allocate buffers for a->j, and send them off */
1052: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
1053: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1054: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
1055: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1056:
1057: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1058: {
1059: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1060: PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1061: PetscInt cend = C->cmap->rend;
1062: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1064: for (i=0; i<nrqr; i++) {
1065: rbuf1_i = rbuf1[i];
1066: sbuf_aj_i = sbuf_aj[i];
1067: ct1 = 2*rbuf1_i[0] + 1;
1068: ct2 = 0;
1069: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1070: kmax = rbuf1[i][2*j];
1071: for (k=0; k<kmax; k++,ct1++) {
1072: row = rbuf1_i[ct1] - rstart;
1073: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1074: ncols = nzA + nzB;
1075: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1077: /* load the column indices for this row into cols*/
1078: cols = sbuf_aj_i + ct2;
1079:
1080: lwrite = 0;
1081: for (l=0; l<nzB; l++) {
1082: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1083: }
1084: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1085: for (l=0; l<nzB; l++) {
1086: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1087: }
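              /* Illustration (assumed data): with cstart = 10, cend = 20 and
                 bmap = {2,25}, the off-diagonal columns 2 and 25 are written in the
                 first and third passes, with the diagonal-block columns (shifted by
                 cstart into 10..19) in between, so for sorted SeqAIJ rows the packed
                 row ends up in increasing global column order. */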
1089: ct2 += ncols;
1090: }
1091: }
1092: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1093: }
1094: }
1095: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1096: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
1098: /* Allocate buffers for a->a, and send them off */
1099: PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);
1100: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1101: PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);
1102: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1103:
1104: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1105: {
1106: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1107: PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1108: PetscInt cend = C->cmap->rend;
1109: PetscInt *b_j = b->j;
1110: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1111:
1112: for (i=0; i<nrqr; i++) {
1113: rbuf1_i = rbuf1[i];
1114: sbuf_aa_i = sbuf_aa[i];
1115: ct1 = 2*rbuf1_i[0]+1;
1116: ct2 = 0;
1117: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1118: kmax = rbuf1_i[2*j];
1119: for (k=0; k<kmax; k++,ct1++) {
1120: row = rbuf1_i[ct1] - rstart;
1121: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1122: ncols = nzA + nzB;
1123: cworkB = b_j + b_i[row];
1124: vworkA = a_a + a_i[row];
1125: vworkB = b_a + b_i[row];
1127: /* load the column values for this row into vals*/
1128: vals = sbuf_aa_i+ct2;
1129:
1130: lwrite = 0;
1131: for (l=0; l<nzB; l++) {
1132: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1133: }
1134: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1135: for (l=0; l<nzB; l++) {
1136: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1137: }
1138:
1139: ct2 += ncols;
1140: }
1141: }
1142: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1143: }
1144: }
1145: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1146: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1147: PetscFree(rbuf1);
1149: /* Form the matrix */
1150: /* create col map */
1151: {
1152: const PetscInt *icol_i;
1153:
1154: len = (1+ismax)*sizeof(PetscInt*)+ (1+ismax*C->cmap->N)*sizeof(PetscInt);
1155: PetscMalloc(len,&cmap);
1156: cmap[0] = (PetscInt*)(cmap + ismax);
1157: PetscMemzero(cmap[0],(1+ismax*C->cmap->N)*sizeof(PetscInt));
1158: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
1159: for (i=0; i<ismax; i++) {
1160: jmax = ncol[i];
1161: icol_i = icol[i];
1162: cmap_i = cmap[i];
1163: for (j=0; j<jmax; j++) {
1164: cmap_i[icol_i[j]] = j+1;
1165: }
1166: }
1167: }
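  /* Illustration (assumed data): if iscol[i] = {7,3,9} then cmap_i[7] = 1,
     cmap_i[3] = 2, cmap_i[9] = 3 and every other entry stays 0, so a zero lookup
     later means "column not wanted" and tcol-1 recovers the column's position in
     the new submatrix. */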
1169: /* Create lens which is required for MatCreate... */
1170: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1171: len = (1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt);
1172: PetscMalloc(len,&lens);
1173: lens[0] = (PetscInt*)(lens + ismax);
1174: PetscMemzero(lens[0],j*sizeof(PetscInt));
1175: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1176:
1177: /* Update lens from local data */
1178: for (i=0; i<ismax; i++) {
1179: jmax = nrow[i];
1180: cmap_i = cmap[i];
1181: irow_i = irow[i];
1182: lens_i = lens[i];
1183: for (j=0; j<jmax; j++) {
1184: l = 0;
1185: row = irow_i[j];
1186: while (row >= C->rmap->range[l+1]) l++;
1187: proc = l;
1188: if (proc == rank) {
1189: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1190: for (k=0; k<ncols; k++) {
1191: if (cmap_i[cols[k]]) { lens_i[j]++;}
1192: }
1193: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1194: }
1195: }
1196: }
1197:
1198: /* Create row map*/
1199: len = (1+ismax)*sizeof(PetscInt*)+ ismax*C->rmap->N*sizeof(PetscInt);
1200: PetscMalloc(len,&rmap);
1201: rmap[0] = (PetscInt*)(rmap + ismax);
1202: PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));
1203: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
1204: for (i=0; i<ismax; i++) {
1205: rmap_i = rmap[i];
1206: irow_i = irow[i];
1207: jmax = nrow[i];
1208: for (j=0; j<jmax; j++) {
1209: rmap_i[irow_i[j]] = j;
1210: }
1211: }
1212:
1213: /* Update lens from offproc data */
1214: {
1215: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1217: for (tmp2=0; tmp2<nrqs; tmp2++) {
1218: MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1219: idex = pa[idex2];
1220: sbuf1_i = sbuf1[idex];
1221: jmax = sbuf1_i[0];
1222: ct1 = 2*jmax+1;
1223: ct2 = 0;
1224: rbuf2_i = rbuf2[idex2];
1225: rbuf3_i = rbuf3[idex2];
1226: for (j=1; j<=jmax; j++) {
1227: is_no = sbuf1_i[2*j-1];
1228: max1 = sbuf1_i[2*j];
1229: lens_i = lens[is_no];
1230: cmap_i = cmap[is_no];
1231: rmap_i = rmap[is_no];
1232: for (k=0; k<max1; k++,ct1++) {
1233: row = rmap_i[sbuf1_i[ct1]]; /* the row in the new submatrix to be filled */
1234: max2 = rbuf2_i[ct1];
1235: for (l=0; l<max2; l++,ct2++) {
1236: if (cmap_i[rbuf3_i[ct2]]) {
1237: lens_i[row]++;
1238: }
1239: }
1240: }
1241: }
1242: }
1243: }
1244: PetscFree(r_status3);
1245: PetscFree(r_waits3);
1246: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1247: PetscFree(s_status3);
1248: PetscFree(s_waits3);
1250: /* Create the submatrices */
1251: if (scall == MAT_REUSE_MATRIX) {
1252: PetscTruth flag;
1254: /*
1255: Assumes new rows are same length as the old rows, hence bug!
1256: */
1257: for (i=0; i<ismax; i++) {
1258: mat = (Mat_SeqAIJ *)(submats[i]->data);
1259: if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) {
1260: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1261: }
1262: PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);
1263: if (!flag) {
1264: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1265: }
1266: /* Initialize matrix as if empty */
1267: PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));
1268: submats[i]->factor = C->factor;
1269: }
1270: } else {
1271: for (i=0; i<ismax; i++) {
1272: MatCreate(PETSC_COMM_SELF,submats+i);
1273: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
1274: MatSetType(submats[i],((PetscObject)A)->type_name);
1275: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1276: }
1277: }
1279: /* Assemble the matrices */
1280: /* First assemble the local rows */
1281: {
1282: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1283: PetscScalar *imat_a;
1284:
1285: for (i=0; i<ismax; i++) {
1286: mat = (Mat_SeqAIJ*)submats[i]->data;
1287: imat_ilen = mat->ilen;
1288: imat_j = mat->j;
1289: imat_i = mat->i;
1290: imat_a = mat->a;
1291: cmap_i = cmap[i];
1292: rmap_i = rmap[i];
1293: irow_i = irow[i];
1294: jmax = nrow[i];
1295: for (j=0; j<jmax; j++) {
1296: l = 0;
1297: row = irow_i[j];
1298: while (row >= C->rmap->range[l+1]) l++;
1299: proc = l;
1300: if (proc == rank) {
1301: old_row = row;
1302: row = rmap_i[row];
1303: ilen_row = imat_ilen[row];
1304: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1305: mat_i = imat_i[row] ;
1306: mat_a = imat_a + mat_i;
1307: mat_j = imat_j + mat_i;
1308: for (k=0; k<ncols; k++) {
1309: if ((tcol = cmap_i[cols[k]])) {
1310: *mat_j++ = tcol - 1;
1311: *mat_a++ = vals[k];
1312: ilen_row++;
1313: }
1314: }
1315: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1316: imat_ilen[row] = ilen_row;
1317: }
1318: }
1319: }
1320: }
1322: /* Now assemble the off proc rows*/
1323: {
1324: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1325: PetscInt *imat_j,*imat_i;
1326: PetscScalar *imat_a,*rbuf4_i;
1328: for (tmp2=0; tmp2<nrqs; tmp2++) {
1329: MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1330: idex = pa[idex2];
1331: sbuf1_i = sbuf1[idex];
1332: jmax = sbuf1_i[0];
1333: ct1 = 2*jmax + 1;
1334: ct2 = 0;
1335: rbuf2_i = rbuf2[idex2];
1336: rbuf3_i = rbuf3[idex2];
1337: rbuf4_i = rbuf4[idex2];
1338: for (j=1; j<=jmax; j++) {
1339: is_no = sbuf1_i[2*j-1];
1340: rmap_i = rmap[is_no];
1341: cmap_i = cmap[is_no];
1342: mat = (Mat_SeqAIJ*)submats[is_no]->data;
1343: imat_ilen = mat->ilen;
1344: imat_j = mat->j;
1345: imat_i = mat->i;
1346: imat_a = mat->a;
1347: max1 = sbuf1_i[2*j];
1348: for (k=0; k<max1; k++,ct1++) {
1349: row = sbuf1_i[ct1];
1350: row = rmap_i[row];
1351: ilen = imat_ilen[row];
1352: mat_i = imat_i[row] ;
1353: mat_a = imat_a + mat_i;
1354: mat_j = imat_j + mat_i;
1355: max2 = rbuf2_i[ct1];
1356: for (l=0; l<max2; l++,ct2++) {
1357: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1358: *mat_j++ = tcol - 1;
1359: *mat_a++ = rbuf4_i[ct2];
1360: ilen++;
1361: }
1362: }
1363: imat_ilen[row] = ilen;
1364: }
1365: }
1366: }
1367: }
1368: PetscFree(r_status4);
1369: PetscFree(r_waits4);
1370: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1371: PetscFree(s_waits4);
1372: PetscFree(s_status4);
1374: /* Restore the indices */
1375: for (i=0; i<ismax; i++) {
1376: ISRestoreIndices(isrow[i],irow+i);
1377: ISRestoreIndices(iscol[i],icol+i);
1378: }
1380: /* Destroy allocated memory */
1381: PetscFree(irow);
1382: PetscFree(w1);
1383: PetscFree(pa);
1385: PetscFree(sbuf1);
1386: PetscFree(rbuf2);
1387: for (i=0; i<nrqr; ++i) {
1388: PetscFree(sbuf2[i]);
1389: }
1390: for (i=0; i<nrqs; ++i) {
1391: PetscFree(rbuf3[i]);
1392: PetscFree(rbuf4[i]);
1393: }
1395: PetscFree(sbuf2);
1396: PetscFree(rbuf3);
1397: PetscFree(rbuf4);
1398: PetscFree(sbuf_aj[0]);
1399: PetscFree(sbuf_aj);
1400: PetscFree(sbuf_aa[0]);
1401: PetscFree(sbuf_aa);
1402:
1403: PetscFree(cmap);
1404: PetscFree(rmap);
1405: PetscFree(lens);
1407: for (i=0; i<ismax; i++) {
1408: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1409: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1410: }
1411: return(0);
1412: }