Actual source code: baijov.c
1: #define PETSCMAT_DLL
3: /*
4: Routines to compute overlapping regions of a parallel MPI matrix
5: and to find submatrices that are shared across processors.
6: */
7: #include "../src/mat/impls/baij/mpi/mpibaij.h"
8: #include "petscbt.h"
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat,PetscInt,IS *);
11: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
12: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
13: EXTERN PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: EXTERN PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
18: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
19: {
21: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
22: IS *is_new;
25: PetscMalloc(imax*sizeof(IS),&is_new);
26: /* Convert the indices into block format */
27: ISCompressIndicesGeneral(N,bs,imax,is,is_new);
28: if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
29: for (i=0; i<ov; ++i) {
30: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
31: }
32: for (i=0; i<imax; i++) {ISDestroy(is[i]);}
33: ISExpandIndicesGeneral(N,bs,imax,is_new,is);
34: for (i=0; i<imax; i++) {ISDestroy(is_new[i]);}
35: PetscFree(is_new);
36: return(0);
37: }
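/*
   Added illustration (not part of the original source): this routine is normally
   reached through the type-independent MatIncreaseOverlap() interface, e.g. when
   building overlapping subdomains for an additive Schwarz preconditioner.  A
   minimal sketch, where A is an assembled MPIBAIJ matrix and nlocal/rstart are
   placeholders for the number of locally owned rows and the first of them:

      IS             is0;
      PetscErrorCode ierr;
      ierr = ISCreateStride(PETSC_COMM_SELF,nlocal,rstart,1,&is0);CHKERRQ(ierr);
      ierr = MatIncreaseOverlap(A,1,&is0,2);CHKERRQ(ierr);
      ierr = ISDestroy(is0);CHKERRQ(ierr);

   The second argument is the number of index sets and the last argument is the
   number of overlap levels, i.e. how many times MatIncreaseOverlap_MPIBAIJ_Once()
   is applied in the loop above.
*/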
39: /*
40: Sample message format:
41: If a processor A wants processor B to process some elements corresponding
42: to index sets is[1], is[5]
43: mesg [0] = 2 (no of index sets in the mesg)
44: -----------
45: mesg [1] = 1 => is[1]
46: mesg [2] = sizeof(is[1]);
47: -----------
48: mesg [3] = 5 => is[5]
49: mesg [4] = sizeof(is[5]);
50: -----------
51: mesg [5]
52: mesg [n] data(is[1])
53: -----------
54: mesg[n+1]
55: mesg[m] data(is[5])
56: -----------
57:
58: Notes:
59: nrqs - no of requests sent (or to be sent out)
60: nrqr - no of requests received (which have to be or which have been processed)
61: */
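/*
   Added worked example: if processor A asks processor B to process 3 rows belonging
   to is[1] and 2 rows belonging to is[5], the outgoing buffer holds
   1 + 2*2 + (3+2) = 10 integers:

      mesg = { 2,  1, 3,  5, 2,  r10, r11, r12,  r50, r51 }

   i.e. the number of index sets, one (index-set id, length) pair per set, and then
   the concatenated row lists (r1k are the rows of is[1], r5k the rows of is[5]).
*/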
64: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
65: {
66: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
67: const PetscInt **idx,*idx_i;
68: PetscInt *n,*w3,*w4,*rtable,**data,len;
70: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
71: PetscInt Mbs,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
72: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
73: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2;
74: PetscBT *table;
75: MPI_Comm comm;
76: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
77: MPI_Status *s_status,*recv_status;
80: comm = ((PetscObject)C)->comm;
81: size = c->size;
82: rank = c->rank;
83: Mbs = c->Mbs;
85: PetscObjectGetNewTag((PetscObject)C,&tag1);
86: PetscObjectGetNewTag((PetscObject)C,&tag2);
88: len = (imax+1)*sizeof(PetscInt*)+ (imax + Mbs)*sizeof(PetscInt);
89: PetscMalloc(len,&idx);
90: n = (PetscInt*)(idx + imax);
91: rtable = n + imax;
92:
93: for (i=0; i<imax; i++) {
94: ISGetIndices(is[i],&idx[i]);
95: ISGetLocalSize(is[i],&n[i]);
96: }
97:
98: /* Create hash table for the mapping: row -> proc */
99: for (i=0,j=0; i<size; i++) {
100: len = c->rangebs[i+1];
101: for (; j<len; j++) {
102: rtable[j] = i;
103: }
104: }
106: /* evaluate communication - mesg to whom, length of mesg, and buffer space
107: required. Based on this, buffers are allocated, and data copied into them*/
108: PetscMalloc(size*2*sizeof(PetscInt)+size*2*sizeof(PetscMPIInt),&w1);/* mesg size */
109: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
110: w3 = (PetscInt*) (w2 + size); /* no of IS that need to be sent to proc i */
111: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
112: PetscMemzero(w1,size*sizeof(PetscInt)+2*size*sizeof(PetscMPIInt)); /* initialise work vector*/
113: for (i=0; i<imax; i++) {
114: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
115: idx_i = idx[i];
116: len = n[i];
117: for (j=0; j<len; j++) {
118: row = idx_i[j];
119: if (row < 0) {
120: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
121: }
122: proc = rtable[row];
123: w4[proc]++;
124: }
125: for (j=0; j<size; j++){
126: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
127: }
128: }
130: nrqs = 0; /* no of outgoing messages */
131: msz = 0; /* total mesg length (for all procs) */
132: w1[rank] = 0; /* no mesg sent to itself */
133: w3[rank] = 0;
134: for (i=0; i<size; i++) {
135: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
136: }
137: /* pa - is list of processors to communicate with */
138: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
139: for (i=0,j=0; i<size; i++) {
140: if (w1[i]) {pa[j] = i; j++;}
141: }
143: /* Each message would have a header = 1 + 2*(no of IS) + data */
144: for (i=0; i<nrqs; i++) {
145: j = pa[i];
146: w1[j] += w2[j] + 2*w3[j];
147: msz += w1[j];
148: }
149:
150: /* Determine the number of messages to expect, their lengths, and the from-ids */
151: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
152: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
154: /* Now post the Irecvs corresponding to these messages */
155: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
156:
157: /* Allocate Memory for outgoing messages */
158: len = 2*size*sizeof(PetscInt*) + (size+msz)*sizeof(PetscInt);
159: PetscMalloc(len,&outdat);
160: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
161: PetscMemzero(outdat,2*size*sizeof(PetscInt*));
162: tmp = (PetscInt*)(outdat + 2*size);
163: ctr = tmp + msz;
165: {
166: PetscInt *iptr = tmp,ict = 0;
167: for (i=0; i<nrqs; i++) {
168: j = pa[i];
169: iptr += ict;
170: outdat[j] = iptr;
171: ict = w1[j];
172: }
173: }
175: /* Form the outgoing messages */
176: /*plug in the headers*/
177: for (i=0; i<nrqs; i++) {
178: j = pa[i];
179: outdat[j][0] = 0;
180: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
181: ptr[j] = outdat[j] + 2*w3[j] + 1;
182: }
183:
184: /* Memory for doing local proc's work*/
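  /* Added note: the single allocation below is carved into imax PetscBT handles
     (table), imax PetscInt* pointers (data), imax counters (isz), Mbs integers of
     row storage per index set (d_p), and one Mbs-bit lookup table per index set (t_p). */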
185: {
186: PetscInt *d_p;
187: char *t_p;
189: len = (imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
190: (Mbs)*imax*sizeof(PetscInt) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
191: PetscMalloc(len,&table);
192: PetscMemzero(table,len);
193: data = (PetscInt **)(table + imax);
194: isz = (PetscInt *)(data + imax);
195: d_p = (PetscInt *)(isz + imax);
196: t_p = (char *)(d_p + Mbs*imax);
197: for (i=0; i<imax; i++) {
198: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
199: data[i] = d_p + (Mbs)*i;
200: }
201: }
203: /* Parse the IS and update local tables and the outgoing buf with the data*/
204: {
205: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
206: PetscBT table_i;
208: for (i=0; i<imax; i++) {
209: PetscMemzero(ctr,size*sizeof(PetscInt));
210: n_i = n[i];
211: table_i = table[i];
212: idx_i = idx[i];
213: data_i = data[i];
214: isz_i = isz[i];
215: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
216: row = idx_i[j];
217: proc = rtable[row];
218: if (proc != rank) { /* copy to the outgoing buffer */
219: ctr[proc]++;
220: *ptr[proc] = row;
221: ptr[proc]++;
222: }
223: else { /* Update the local table */
224: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
225: }
226: }
227: /* Update the headers for the current IS */
228: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
229: if ((ctr_j = ctr[j])) {
230: outdat_j = outdat[j];
231: k = ++outdat_j[0];
232: outdat_j[2*k] = ctr_j;
233: outdat_j[2*k-1] = i;
234: }
235: }
236: isz[i] = isz_i;
237: }
238: }
239:
240: /* Now post the sends */
241: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
242: for (i=0; i<nrqs; ++i) {
243: j = pa[i];
244: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
245: }
246:
247: /* No longer need the original indices*/
248: for (i=0; i<imax; ++i) {
249: ISRestoreIndices(is[i],idx+i);
250: }
251: PetscFree(idx);
253: for (i=0; i<imax; ++i) {
254: ISDestroy(is[i]);
255: }
256:
257: /* Do Local work*/
258: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
260: /* Receive messages*/
261: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
262: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
263:
264: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
265: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
267: /* Phase 1 sends are complete - deallocate buffers */
268: PetscFree(outdat);
269: PetscFree(w1);
271: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
272: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
273: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
274: PetscFree(rbuf);
276: /* Send the data back*/
277: /* Do a global reduction to know the buffer space req for incoming messages*/
278: {
279: PetscMPIInt *rw1;
280:
281: PetscMalloc(size*sizeof(PetscInt),&rw1);
282: PetscMemzero(rw1,size*sizeof(PetscInt));
284: for (i=0; i<nrqr; ++i) {
285: proc = recv_status[i].MPI_SOURCE;
286: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
287: rw1[proc] = isz1[i];
288: }
289:
290: PetscFree(onodes1);
291: PetscFree(olengths1);
293: /* Determine the number of messages to expect, their lengths, and the from-ids */
294: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
295: PetscFree(rw1);
296: }
297: /* Now post the Irecvs corresponding to these messages */
298: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
299:
300: /* Now post the sends */
301: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
302: for (i=0; i<nrqr; ++i) {
303: j = recv_status[i].MPI_SOURCE;
304: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
305: }
307: /* receive work done on other processors*/
308: {
309: PetscMPIInt idex;
310: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
311: PetscBT table_i;
312: MPI_Status *status2;
313:
314: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
316: for (i=0; i<nrqs; ++i) {
317: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
318: /* Process the message*/
319: rbuf2_i = rbuf2[idex];
320: ct1 = 2*rbuf2_i[0]+1;
321: jmax = rbuf2[idex][0];
322: for (j=1; j<=jmax; j++) {
323: max = rbuf2_i[2*j];
324: is_no = rbuf2_i[2*j-1];
325: isz_i = isz[is_no];
326: data_i = data[is_no];
327: table_i = table[is_no];
328: for (k=0; k<max; k++,ct1++) {
329: row = rbuf2_i[ct1];
330: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
331: }
332: isz[is_no] = isz_i;
333: }
334: }
335: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
336: PetscFree(status2);
337: }
338:
339: for (i=0; i<imax; ++i) {
340: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
341: }
342:
343:
344: PetscFree(onodes2);
345: PetscFree(olengths2);
347: PetscFree(pa);
348: PetscFree(rbuf2);
349: PetscFree(s_waits1);
350: PetscFree(r_waits1);
351: PetscFree(s_waits2);
352: PetscFree(r_waits2);
353: PetscFree(table);
354: PetscFree(s_status);
355: PetscFree(recv_status);
356: PetscFree(xdata[0]);
357: PetscFree(xdata);
358: PetscFree(isz1);
359: return(0);
360: }
364: /*
365: MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap_MPIBAIJ_Once to do
366: the work on the local processor.
368: Inputs:
369: C - MAT_MPIBAIJ;
370: imax - total no of index sets processed at a time;
371: table - an array of PetscBT bit tables, each of size Mbs bits.
372:
373: Output:
374: isz - array containing the count of the solution elements corresponding
375: to each index set;
376: data - pointer to the solutions
377: */
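/*
   Added illustration: suppose rstart = cstart = 4 and data[i] contains global block
   row 6 (local row 2); if row 2 of the diagonal part A has nonzero block columns
   {0,2} and row 2 of the off-diagonal part B maps through garray to global block
   columns {9,17}, then 4, 9 and 17 are appended to data[i], while 6 is skipped
   because the row itself is already marked in table[i].
*/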
378: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
379: {
380: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
381: Mat A = c->A,B = c->B;
382: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
383: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
384: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
385: PetscBT table_i;
388: rstart = c->rstartbs;
389: cstart = c->cstartbs;
390: ai = a->i;
391: aj = a->j;
392: bi = b->i;
393: bj = b->j;
394: garray = c->garray;
396:
397: for (i=0; i<imax; i++) {
398: data_i = data[i];
399: table_i = table[i];
400: isz_i = isz[i];
401: for (j=0,max=isz[i]; j<max; j++) {
402: row = data_i[j] - rstart;
403: start = ai[row];
404: end = ai[row+1];
405: for (k=start; k<end; k++) { /* Amat */
406: val = aj[k] + cstart;
407: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
408: }
409: start = bi[row];
410: end = bi[row+1];
411: for (k=start; k<end; k++) { /* Bmat */
412: val = garray[bj[k]];
413: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
414: }
415: }
416: isz[i] = isz_i;
417: }
418: return(0);
419: }
422: /*
423: MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
424: and return the output
426: Input:
427: C - the matrix
428: nrqr - no of messages being processed.
429: rbuf - an array of pointers to the received requests
430:
431: Output:
432: xdata - array of messages to be sent back
433: isz1 - size of each message
435: For better efficiency perhaps we should malloc each xdata[i] separately;
436: then if a realloc is required we need only copy the data for that one row
437: rather than for all previous rows, as happens now where a single large chunk of
438: memory is used.
440: */
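/*
   Added note: each reply xdata[i] mirrors the layout of the incoming request:
   xdata[i][0] is the number of index sets, followed by one (index-set id, count)
   pair per set and then the corresponding row lists.  Each row list contains the
   requested rows plus the block column indices of those rows in both the diagonal
   and off-diagonal parts, with duplicates removed through the xtable bit array;
   isz1[i] is the total length of reply i, header included.
*/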
441: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
442: {
443: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
444: Mat A = c->A,B = c->B;
445: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
447: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
448: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
449: PetscInt val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
450: PetscInt *rbuf_i,kmax,rbuf_0;
451: PetscBT xtable;
454: rank = c->rank;
455: Mbs = c->Mbs;
456: rstart = c->rstartbs;
457: cstart = c->cstartbs;
458: ai = a->i;
459: aj = a->j;
460: bi = b->i;
461: bj = b->j;
462: garray = c->garray;
463:
464:
465: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
466: rbuf_i = rbuf[i];
467: rbuf_0 = rbuf_i[0];
468: ct += rbuf_0;
469: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
470: }
471:
472: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
473: else max1 = 1;
474: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
475: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
476: ++no_malloc;
477: PetscBTCreate(Mbs,xtable);
478: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
479:
480: ct3 = 0;
481: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
482: rbuf_i = rbuf[i];
483: rbuf_0 = rbuf_i[0];
484: ct1 = 2*rbuf_0+1;
485: ct2 = ct1;
486: ct3 += ct1;
487: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
488: PetscBTMemzero(Mbs,xtable);
489: oct2 = ct2;
490: kmax = rbuf_i[2*j];
491: for (k=0; k<kmax; k++,ct1++) {
492: row = rbuf_i[ct1];
493: if (!PetscBTLookupSet(xtable,row)) {
494: if (!(ct3 < mem_estimate)) {
495: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
496: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
497: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
498: PetscFree(xdata[0]);
499: xdata[0] = tmp;
500: mem_estimate = new_estimate; ++no_malloc;
501: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
502: }
503: xdata[i][ct2++] = row;
504: ct3++;
505: }
506: }
507: for (k=oct2,max2=ct2; k<max2; k++) {
508: row = xdata[i][k] - rstart;
509: start = ai[row];
510: end = ai[row+1];
511: for (l=start; l<end; l++) {
512: val = aj[l] + cstart;
513: if (!PetscBTLookupSet(xtable,val)) {
514: if (!(ct3 < mem_estimate)) {
515: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
516: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
517: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
518: PetscFree(xdata[0]);
519: xdata[0] = tmp;
520: mem_estimate = new_estimate; ++no_malloc;
521: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
522: }
523: xdata[i][ct2++] = val;
524: ct3++;
525: }
526: }
527: start = bi[row];
528: end = bi[row+1];
529: for (l=start; l<end; l++) {
530: val = garray[bj[l]];
531: if (!PetscBTLookupSet(xtable,val)) {
532: if (!(ct3 < mem_estimate)) {
533: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
534: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
535: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
536: PetscFree(xdata[0]);
537: xdata[0] = tmp;
538: mem_estimate = new_estimate; ++no_malloc;
539: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
540: }
541: xdata[i][ct2++] = val;
542: ct3++;
543: }
544: }
545: }
546: /* Update the header*/
547: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
548: xdata[i][2*j-1] = rbuf_i[2*j-1];
549: }
550: xdata[i][0] = rbuf_0;
551: xdata[i+1] = xdata[i] + ct2;
552: isz1[i] = ct2; /* size of each message */
553: }
554: PetscBTDestroy(xtable);
555: PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
556: return(0);
557: }
559: static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat *);
563: PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
564: {
565: IS *isrow_new,*iscol_new;
566: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
568: PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
571: /* The compression and expansion should be avoided. It does not point out
572: errors and might change the indices, hence this is potentially buggy */
574: PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);
575: iscol_new = isrow_new + ismax;
576: ISCompressIndicesSorted(N,bs,ismax,isrow,isrow_new);
577: ISCompressIndicesSorted(N,bs,ismax,iscol,iscol_new);
579: /* Allocate memory to hold all the submatrices */
580: if (scall != MAT_REUSE_MATRIX) {
581: PetscMalloc((ismax+1)*sizeof(Mat),submat);
582: }
583: /* Determine the number of stages through which submatrices are done */
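  /* Added note: nmax bounds the number of index sets handled per stage, chosen so
     that the workspace of roughly Nbs integers per index set stays near 20 MB;
     nstages_local = ceil(ismax/nmax). */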
584: nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
585: if (!nmax) nmax = 1;
586: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
587:
588: /* Make sure every processor loops through the nstages */
589: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);
590: for (i=0,pos=0; i<nstages; i++) {
591: if (pos+nmax <= ismax) max_no = nmax;
592: else if (pos == ismax) max_no = 0;
593: else max_no = ismax-pos;
594: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);
595: pos += max_no;
596: }
597:
598: for (i=0; i<ismax; i++) {
599: ISDestroy(isrow_new[i]);
600: ISDestroy(iscol_new[i]);
601: }
602: PetscFree(isrow_new);
603: return(0);
604: }
606: #if defined (PETSC_USE_CTABLE)
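/*
   Added note: PetscGetProc maps a global block row to the rank that owns it when the
   dense rtable lookup is not built (PETSC_USE_CTABLE).  A first guess is obtained by
   linear interpolation, row*size/nGlobalNd, and is then corrected by walking up or
   down the ownership ranges in proc_gnode[] until
   proc_gnode[rank] <= row < proc_gnode[rank+1].
*/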
609: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
610: {
611: PetscInt nGlobalNd = proc_gnode[size];
612: PetscMPIInt fproc = PetscMPIIntCast( (PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)));
613:
615: if (fproc > size) fproc = size;
616: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
617: if (row < proc_gnode[fproc]) fproc--;
618: else fproc++;
619: }
620: *rank = fproc;
621: return(0);
622: }
623: #endif
625: /* -------------------------------------------------------------------------*/
626: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
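/*
   Added overview of the communication in MatGetSubMatrices_MPIBAIJ_local:
     phase 1 (tag0) - each process sends the list of off-process rows it needs;
     phase 2 (tag1) - the owners reply with the number of nonzero blocks in each
                      requested row, so the requesters can size their buffers;
     phase 3 (tag2) - the owners send the block column indices;
     phase 4 (tag3) - the owners send the numerical values (bs2 entries per block).
   The submatrices are then assembled from the locally owned rows and these replies.
*/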
629: static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
630: {
631: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
632: Mat A = c->A;
633: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
634: const PetscInt **irow,**icol,*irow_i;
635: PetscInt *nrow,*ncol,*w3,*w4,start;
637: PetscMPIInt size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
638: PetscInt **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
639: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
640: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
641: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
642: PetscInt len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
643: PetscInt bs=C->rmap->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
644: PetscInt cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
645: PetscInt *bmap = c->garray,ctmp,rstart=c->rstartbs;
646: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
647: MPI_Request *r_waits4,*s_waits3,*s_waits4;
648: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
649: MPI_Status *r_status3,*r_status4,*s_status4;
650: MPI_Comm comm;
651: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
652: MatScalar *a_a=a->a,*b_a=b->a;
653: PetscTruth flag,sorted;
654: PetscMPIInt *onodes1,*olengths1;
656: #if defined (PETSC_USE_CTABLE)
657: PetscInt tt;
658: PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
659: #else
660: PetscInt **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
661: #endif
664: comm = ((PetscObject)C)->comm;
665: tag0 = ((PetscObject)C)->tag;
666: size = c->size;
667: rank = c->rank;
668:
669: /* Get some new tags to keep the communication clean */
670: PetscObjectGetNewTag((PetscObject)C,&tag1);
671: PetscObjectGetNewTag((PetscObject)C,&tag2);
672: PetscObjectGetNewTag((PetscObject)C,&tag3);
674: /* Check if the col indices are sorted */
675: for (i=0; i<ismax; i++) {
676: ISSorted(iscol[i],&sorted);
677: if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
678: }
680: len = (2*ismax+1)*(sizeof(PetscInt*)+ sizeof(PetscInt));
681: #if !defined (PETSC_USE_CTABLE)
682: len += (Mbs+1)*sizeof(PetscInt);
683: #endif
684: PetscMalloc(len,&irow);
685: icol = irow + ismax;
686: nrow = (PetscInt*)(icol + ismax);
687: ncol = nrow + ismax;
688: #if !defined (PETSC_USE_CTABLE)
689: rtable = ncol + ismax;
690: /* Create hash table for the mapping: row -> proc */
691: for (i=0,j=0; i<size; i++) {
692: jmax = c->rowners[i+1];
693: for (; j<jmax; j++) {
694: rtable[j] = i;
695: }
696: }
697: #endif
698:
699: for (i=0; i<ismax; i++) {
700: ISGetIndices(isrow[i],&irow[i]);
701: ISGetIndices(iscol[i],&icol[i]);
702: ISGetLocalSize(isrow[i],&nrow[i]);
703: ISGetLocalSize(iscol[i],&ncol[i]);
704: }
706: /* evaluate communication - mesg to whom, length of mesg, and buffer space
707: required. Based on this, buffers are allocated, and data copied into them*/
708: PetscMalloc(size*2*sizeof(PetscInt) + size*2*sizeof(PetscMPIInt),&w1); /* mesg size */
709: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
710: w3 = (PetscInt*)(w2 + size); /* no of IS that need to be sent to proc i */
711: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
712: PetscMemzero(w1,size*sizeof(PetscInt)+2*size*sizeof(PetscMPIInt)); /* initialise work vector*/
713: for (i=0; i<ismax; i++) {
714: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
715: jmax = nrow[i];
716: irow_i = irow[i];
717: for (j=0; j<jmax; j++) {
718: row = irow_i[j];
719: #if defined (PETSC_USE_CTABLE)
720: PetscGetProc(row,size,c->rangebs,&proc);
721: #else
722: proc = rtable[row];
723: #endif
724: w4[proc]++;
725: }
726: for (j=0; j<size; j++) {
727: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
728: }
729: }
731: nrqs = 0; /* no of outgoing messages */
732: msz = 0; /* total mesg length for all procs */
733: w1[rank] = 0; /* no mesg sent to itself */
734: w3[rank] = 0;
735: for (i=0; i<size; i++) {
736: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
737: }
738: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
739: for (i=0,j=0; i<size; i++) {
740: if (w1[i]) { pa[j] = i; j++; }
741: }
743: /* Each message would have a header = 1 + 2*(no of IS) + data */
744: for (i=0; i<nrqs; i++) {
745: j = pa[i];
746: w1[j] += w2[j] + 2* w3[j];
747: msz += w1[j];
748: }
750: /* Determine the number of messages to expect, their lengths, and the from-ids */
751: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
752: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
754: /* Now post the Irecvs corresponding to these messages */
755: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
756:
757: PetscFree(onodes1);
758: PetscFree(olengths1);
760: /* Allocate Memory for outgoing messages */
761: len = 2*size*sizeof(PetscInt*) + 2*msz*sizeof(PetscInt) + size*sizeof(PetscInt);
762: PetscMalloc(len,&sbuf1);
763: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
764: PetscMemzero(sbuf1,2*size*sizeof(PetscInt*));
765: /* allocate memory for outgoing data + buf to receive the first reply */
766: tmp = (PetscInt*)(ptr + size);
767: ctr = tmp + 2*msz;
769: {
770: PetscInt *iptr = tmp,ict = 0;
771: for (i=0; i<nrqs; i++) {
772: j = pa[i];
773: iptr += ict;
774: sbuf1[j] = iptr;
775: ict = w1[j];
776: }
777: }
779: /* Form the outgoing messages */
780: /* Initialise the header space */
781: for (i=0; i<nrqs; i++) {
782: j = pa[i];
783: sbuf1[j][0] = 0;
784: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
785: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
786: }
787:
788: /* Parse the isrow and copy data into outbuf */
789: for (i=0; i<ismax; i++) {
790: PetscMemzero(ctr,size*sizeof(PetscInt));
791: irow_i = irow[i];
792: jmax = nrow[i];
793: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
794: row = irow_i[j];
795: #if defined (PETSC_USE_CTABLE)
796: PetscGetProc(row,size,c->rangebs,&proc);
797: #else
798: proc = rtable[row];
799: #endif
800: if (proc != rank) { /* copy to the outgoing buf*/
801: ctr[proc]++;
802: *ptr[proc] = row;
803: ptr[proc]++;
804: }
805: }
806: /* Update the headers for the current IS */
807: for (j=0; j<size; j++) { /* Can Optimise this loop too */
808: if ((ctr_j = ctr[j])) {
809: sbuf1_j = sbuf1[j];
810: k = ++sbuf1_j[0];
811: sbuf1_j[2*k] = ctr_j;
812: sbuf1_j[2*k-1] = i;
813: }
814: }
815: }
817: /* Now post the sends */
818: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
819: for (i=0; i<nrqs; ++i) {
820: j = pa[i];
821: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
822: }
824: /* Post Receives to capture the buffer size */
825: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
826: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
827: rbuf2[0] = tmp + msz;
828: for (i=1; i<nrqs; ++i) {
829: j = pa[i];
830: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
831: }
832: for (i=0; i<nrqs; ++i) {
833: j = pa[i];
834: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
835: }
837: /* Send to other procs the buf size they should allocate */
839: /* Receive messages*/
840: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
841: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
842: len = 2*nrqr*sizeof(PetscInt) + (nrqr+1)*sizeof(PetscInt*);
843: PetscMalloc(len,&sbuf2);
844: req_size = (PetscInt*)(sbuf2 + nrqr);
845: req_source = req_size + nrqr;
846:
847: {
848: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
849: PetscInt *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
851: for (i=0; i<nrqr; ++i) {
852: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
853: req_size[idex] = 0;
854: rbuf1_i = rbuf1[idex];
855: start = 2*rbuf1_i[0] + 1;
856: MPI_Get_count(r_status1+i,MPIU_INT,&end);
857: PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);
858: sbuf2_i = sbuf2[idex];
859: for (j=start; j<end; j++) {
860: id = rbuf1_i[j] - rstart;
861: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
862: sbuf2_i[j] = ncols;
863: req_size[idex] += ncols;
864: }
865: req_source[idex] = r_status1[i].MPI_SOURCE;
866: /* form the header */
867: sbuf2_i[0] = req_size[idex];
868: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
869: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
870: }
871: }
872: PetscFree(r_status1);
873: PetscFree(r_waits1);
875: /* recv buffer sizes */
876: /* Receive messages*/
878: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
879: PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
880: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
881: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
882: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
884: for (i=0; i<nrqs; ++i) {
885: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
886: PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);
887: PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);
888: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
889: MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
890: }
891: PetscFree(r_status2);
892: PetscFree(r_waits2);
893:
894: /* Wait on sends1 and sends2 */
895: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
896: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
898: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
899: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
900: PetscFree(s_status1);
901: PetscFree(s_status2);
902: PetscFree(s_waits1);
903: PetscFree(s_waits2);
905: /* Now allocate buffers for a->j, and send them off */
906: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
907: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
908: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
909: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
910:
911: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
912: {
913: for (i=0; i<nrqr; i++) {
914: rbuf1_i = rbuf1[i];
915: sbuf_aj_i = sbuf_aj[i];
916: ct1 = 2*rbuf1_i[0] + 1;
917: ct2 = 0;
918: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
919: kmax = rbuf1[i][2*j];
920: for (k=0; k<kmax; k++,ct1++) {
921: row = rbuf1_i[ct1] - rstart;
922: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
923: ncols = nzA + nzB;
924: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
926: /* load the column indices for this row into cols*/
927: cols = sbuf_aj_i + ct2;
928: for (l=0; l<nzB; l++) {
929: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
930: else break;
931: }
932: imark = l;
933: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
934: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
935: ct2 += ncols;
936: }
937: }
938: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
939: }
940: }
941: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
942: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
944: /* Allocate buffers for a->a, and send them off */
945: PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);
946: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
947: PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
948: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
949:
950: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
951: {
952: for (i=0; i<nrqr; i++) {
953: rbuf1_i = rbuf1[i];
954: sbuf_aa_i = sbuf_aa[i];
955: ct1 = 2*rbuf1_i[0]+1;
956: ct2 = 0;
957: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
958: kmax = rbuf1_i[2*j];
959: for (k=0; k<kmax; k++,ct1++) {
960: row = rbuf1_i[ct1] - rstart;
961: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
962: ncols = nzA + nzB;
963: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
964: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
966: /* load the column values for this row into vals*/
967: vals = sbuf_aa_i+ct2*bs2;
968: for (l=0; l<nzB; l++) {
969: if ((bmap[cworkB[l]]) < cstart) {
970: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
971: }
972: else break;
973: }
974: imark = l;
975: for (l=0; l<nzA; l++) {
976: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
977: }
978: for (l=imark; l<nzB; l++) {
979: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
980: }
981: ct2 += ncols;
982: }
983: }
984: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
985: }
986: }
987: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
988: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
989: PetscFree(rbuf1);
991: /* Form the matrix */
992: /* create col map */
993: {
994: const PetscInt *icol_i;
995: #if defined (PETSC_USE_CTABLE)
996: /* Create column map */
997: PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);
998: for (i=0; i<ismax; i++) {
999: PetscTableCreate(ncol[i]+1,&colmaps[i]);
1000: }
1001: #else
1002: len = (1+ismax)*sizeof(PetscInt*)+ ismax*c->Nbs*sizeof(PetscInt);
1003: PetscMalloc(len,&cmap);
1004: cmap[0] = (PetscInt*)(cmap + ismax);
1005: PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(PetscInt));
1006: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1007: #endif
1008: for (i=0; i<ismax; i++) {
1009: jmax = ncol[i];
1010: icol_i = icol[i];
1011: #if defined (PETSC_USE_CTABLE)
1012: lcol1_gcol1 = colmaps[i];
1013: for (j=0; j<jmax; j++) {
1014: PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);
1015: }
1016: #else
1017: cmap_i = cmap[i];
1018: for (j=0; j<jmax; j++) {
1019: cmap_i[icol_i[j]] = j+1;
1020: }
1021: #endif
1022: }
1023: }
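  /* Added note: the column maps store (position+1) so that a lookup result of 0 can
     mean "this global block column is not in the index set"; the code that uses them
     either tests the value for zero or subtracts 1 to recover the local column.  The
     PetscTable variants shift both keys and values by +1 for the same reason, since
     PetscTableFind reports 0 for an absent key. */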
1025: /* Create lens which is required for MatCreate... */
1026: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1027: len = (1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt);
1028: PetscMalloc(len,&lens);
1029: lens[0] = (PetscInt*)(lens + ismax);
1030: PetscMemzero(lens[0],j*sizeof(PetscInt));
1031: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1032:
1033: /* Update lens from local data */
1034: for (i=0; i<ismax; i++) {
1035: jmax = nrow[i];
1036: #if defined (PETSC_USE_CTABLE)
1037: lcol1_gcol1 = colmaps[i];
1038: #else
1039: cmap_i = cmap[i];
1040: #endif
1041: irow_i = irow[i];
1042: lens_i = lens[i];
1043: for (j=0; j<jmax; j++) {
1044: row = irow_i[j];
1045: #if defined (PETSC_USE_CTABLE)
1046: PetscGetProc(row,size,c->rangebs,&proc);
1047: #else
1048: proc = rtable[row];
1049: #endif
1050: if (proc == rank) {
1051: /* Get indices from matA and then from matB */
1052: row = row - rstart;
1053: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1054: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1055: #if defined (PETSC_USE_CTABLE)
1056: for (k=0; k<nzA; k++) {
1057: PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);
1058: if (tt) { lens_i[j]++; }
1059: }
1060: for (k=0; k<nzB; k++) {
1061: PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);
1062: if (tt) { lens_i[j]++; }
1063: }
1064: #else
1065: for (k=0; k<nzA; k++) {
1066: if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1067: }
1068: for (k=0; k<nzB; k++) {
1069: if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1070: }
1071: #endif
1072: }
1073: }
1074: }
1075: #if defined (PETSC_USE_CTABLE)
1076: /* Create row map*/
1077: PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);
1078: for (i=0; i<ismax; i++){
1079: PetscTableCreate(nrow[i]+1,&rowmaps[i]);
1080: }
1081: #else
1082: /* Create row map*/
1083: len = (1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt);
1084: PetscMalloc(len,&rmap);
1085: rmap[0] = (PetscInt*)(rmap + ismax);
1086: PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));
1087: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1088: #endif
1089: for (i=0; i<ismax; i++) {
1090: irow_i = irow[i];
1091: jmax = nrow[i];
1092: #if defined (PETSC_USE_CTABLE)
1093: lrow1_grow1 = rowmaps[i];
1094: for (j=0; j<jmax; j++) {
1095: PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);
1096: }
1097: #else
1098: rmap_i = rmap[i];
1099: for (j=0; j<jmax; j++) {
1100: rmap_i[irow_i[j]] = j;
1101: }
1102: #endif
1103: }
1105: /* Update lens from offproc data */
1106: {
1107: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1108: PetscMPIInt ii;
1110: for (tmp2=0; tmp2<nrqs; tmp2++) {
1111: MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);
1112: idex = pa[ii];
1113: sbuf1_i = sbuf1[idex];
1114: jmax = sbuf1_i[0];
1115: ct1 = 2*jmax+1;
1116: ct2 = 0;
1117: rbuf2_i = rbuf2[ii];
1118: rbuf3_i = rbuf3[ii];
1119: for (j=1; j<=jmax; j++) {
1120: is_no = sbuf1_i[2*j-1];
1121: max1 = sbuf1_i[2*j];
1122: lens_i = lens[is_no];
1123: #if defined (PETSC_USE_CTABLE)
1124: lcol1_gcol1 = colmaps[is_no];
1125: lrow1_grow1 = rowmaps[is_no];
1126: #else
1127: cmap_i = cmap[is_no];
1128: rmap_i = rmap[is_no];
1129: #endif
1130: for (k=0; k<max1; k++,ct1++) {
1131: #if defined (PETSC_USE_CTABLE)
1132: PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);
1133: row--;
1134: if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1135: #else
1136: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1137: #endif
1138: max2 = rbuf2_i[ct1];
1139: for (l=0; l<max2; l++,ct2++) {
1140: #if defined (PETSC_USE_CTABLE)
1141: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);
1142: if (tt) {
1143: lens_i[row]++;
1144: }
1145: #else
1146: if (cmap_i[rbuf3_i[ct2]]) {
1147: lens_i[row]++;
1148: }
1149: #endif
1150: }
1151: }
1152: }
1153: }
1154: }
1155: PetscFree(r_status3);
1156: PetscFree(r_waits3);
1157: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1158: PetscFree(s_status3);
1159: PetscFree(s_waits3);
1161: /* Create the submatrices */
1162: if (scall == MAT_REUSE_MATRIX) {
1163: /*
1164: Assumes new rows are same length as the old rows, hence bug!
1165: */
1166: for (i=0; i<ismax; i++) {
1167: mat = (Mat_SeqBAIJ *)(submats[i]->data);
1168: if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) {
1169: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1170: }
1171: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);
1172: if (!flag) {
1173: SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1174: }
1175: /* Initialize the matrix as if empty */
1176: PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));
1177: submats[i]->factor = C->factor;
1178: }
1179: } else {
1180: for (i=0; i<ismax; i++) {
1181: MatCreate(PETSC_COMM_SELF,submats+i);
1182: MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);
1183: MatSetType(submats[i],((PetscObject)A)->type_name);
1184: MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);
1185: MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);
1186: }
1187: }
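  /* Added note: the submatrix type is inherited from A (SEQBAIJ or SEQSBAIJ), and both
     preallocation calls above are made because each one only takes effect on a matrix
     of the matching type; the non-matching call is expected to be a no-op. */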
1189: /* Assemble the matrices */
1190: /* First assemble the local rows */
1191: {
1192: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i;
1193: MatScalar *imat_a;
1194:
1195: for (i=0; i<ismax; i++) {
1196: mat = (Mat_SeqBAIJ*)submats[i]->data;
1197: imat_ilen = mat->ilen;
1198: imat_j = mat->j;
1199: imat_i = mat->i;
1200: imat_a = mat->a;
1202: #if defined (PETSC_USE_CTABLE)
1203: lcol1_gcol1 = colmaps[i];
1204: lrow1_grow1 = rowmaps[i];
1205: #else
1206: cmap_i = cmap[i];
1207: rmap_i = rmap[i];
1208: #endif
1209: irow_i = irow[i];
1210: jmax = nrow[i];
1211: for (j=0; j<jmax; j++) {
1212: row = irow_i[j];
1213: #if defined (PETSC_USE_CTABLE)
1214: PetscGetProc(row,size,c->rangebs,&proc);
1215: #else
1216: proc = rtable[row];
1217: #endif
1218: if (proc == rank) {
1219: row = row - rstart;
1220: nzA = a_i[row+1] - a_i[row];
1221: nzB = b_i[row+1] - b_i[row];
1222: cworkA = a_j + a_i[row];
1223: cworkB = b_j + b_i[row];
1224: vworkA = a_a + a_i[row]*bs2;
1225: vworkB = b_a + b_i[row]*bs2;
1226: #if defined (PETSC_USE_CTABLE)
1227: PetscTableFind(lrow1_grow1,row+rstart+1,&row);
1228: row--;
1229: if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1230: #else
1231: row = rmap_i[row + rstart];
1232: #endif
1233: mat_i = imat_i[row];
1234: mat_a = imat_a + mat_i*bs2;
1235: mat_j = imat_j + mat_i;
1236: ilen_row = imat_ilen[row];
1238: /* load the column indices and values for this row into the submatrix */
1239: for (l=0; l<nzB; l++) {
1240: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1241: #if defined (PETSC_USE_CTABLE)
1242: PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);
1243: if (tcol) {
1244: #else
1245: if ((tcol = cmap_i[ctmp])) {
1246: #endif
1247: *mat_j++ = tcol - 1;
1248: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1249: mat_a += bs2;
1250: ilen_row++;
1251: }
1252: } else break;
1253: }
1254: imark = l;
1255: for (l=0; l<nzA; l++) {
1256: #if defined (PETSC_USE_CTABLE)
1257: PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);
1258: if (tcol) {
1259: #else
1260: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1261: #endif
1262: *mat_j++ = tcol - 1;
1263: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1264: mat_a += bs2;
1265: ilen_row++;
1266: }
1267: }
1268: for (l=imark; l<nzB; l++) {
1269: #if defined (PETSC_USE_CTABLE)
1270: PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);
1271: if (tcol) {
1272: #else
1273: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1274: #endif
1275: *mat_j++ = tcol - 1;
1276: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1277: mat_a += bs2;
1278: ilen_row++;
1279: }
1280: }
1281: imat_ilen[row] = ilen_row;
1282: }
1283: }
1284:
1285: }
1286: }
1288: /* Now assemble the off proc rows*/
1289: {
1290: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1291: PetscInt *imat_j,*imat_i;
1292: MatScalar *imat_a,*rbuf4_i;
1293: PetscMPIInt ii;
1295: for (tmp2=0; tmp2<nrqs; tmp2++) {
1296: MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);
1297: idex = pa[ii];
1298: sbuf1_i = sbuf1[idex];
1299: jmax = sbuf1_i[0];
1300: ct1 = 2*jmax + 1;
1301: ct2 = 0;
1302: rbuf2_i = rbuf2[ii];
1303: rbuf3_i = rbuf3[ii];
1304: rbuf4_i = rbuf4[ii];
1305: for (j=1; j<=jmax; j++) {
1306: is_no = sbuf1_i[2*j-1];
1307: #if defined (PETSC_USE_CTABLE)
1308: lrow1_grow1 = rowmaps[is_no];
1309: lcol1_gcol1 = colmaps[is_no];
1310: #else
1311: rmap_i = rmap[is_no];
1312: cmap_i = cmap[is_no];
1313: #endif
1314: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1315: imat_ilen = mat->ilen;
1316: imat_j = mat->j;
1317: imat_i = mat->i;
1318: imat_a = mat->a;
1319: max1 = sbuf1_i[2*j];
1320: for (k=0; k<max1; k++,ct1++) {
1321: row = sbuf1_i[ct1];
1322: #if defined (PETSC_USE_CTABLE)
1323: PetscTableFind(lrow1_grow1,row+1,&row);
1324: row--;
1325: if(row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1326: #else
1327: row = rmap_i[row];
1328: #endif
1329: ilen = imat_ilen[row];
1330: mat_i = imat_i[row];
1331: mat_a = imat_a + mat_i*bs2;
1332: mat_j = imat_j + mat_i;
1333: max2 = rbuf2_i[ct1];
1334: for (l=0; l<max2; l++,ct2++) {
1335: #if defined (PETSC_USE_CTABLE)
1336: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);
1337: if (tcol) {
1338: #else
1339: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1340: #endif
1341: *mat_j++ = tcol - 1;
1342: /* *mat_a++= rbuf4_i[ct2]; */
1343: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1344: mat_a += bs2;
1345: ilen++;
1346: }
1347: }
1348: imat_ilen[row] = ilen;
1349: }
1350: }
1351: }
1352: }
1353: PetscFree(r_status4);
1354: PetscFree(r_waits4);
1355: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1356: PetscFree(s_waits4);
1357: PetscFree(s_status4);
1359: /* Restore the indices */
1360: for (i=0; i<ismax; i++) {
1361: ISRestoreIndices(isrow[i],irow+i);
1362: ISRestoreIndices(iscol[i],icol+i);
1363: }
1365: /* Destroy allocated memory */
1366: PetscFree(irow);
1367: PetscFree(w1);
1368: PetscFree(pa);
1370: PetscFree(sbuf1);
1371: PetscFree(rbuf2);
1372: for (i=0; i<nrqr; ++i) {
1373: PetscFree(sbuf2[i]);
1374: }
1375: for (i=0; i<nrqs; ++i) {
1376: PetscFree(rbuf3[i]);
1377: PetscFree(rbuf4[i]);
1378: }
1380: PetscFree(sbuf2);
1381: PetscFree(rbuf3);
1382: PetscFree(rbuf4);
1383: PetscFree(sbuf_aj[0]);
1384: PetscFree(sbuf_aj);
1385: PetscFree(sbuf_aa[0]);
1386: PetscFree(sbuf_aa);
1388: #if defined (PETSC_USE_CTABLE)
1389: for (i=0; i<ismax; i++){
1390: PetscTableDestroy(rowmaps[i]);
1391: PetscTableDestroy(colmaps[i]);
1392: }
1393: PetscFree(colmaps);
1394: PetscFree(rowmaps);
1395: #else
1396: PetscFree(rmap);
1397: PetscFree(cmap);
1398: #endif
1399: PetscFree(lens);
1401: for (i=0; i<ismax; i++) {
1402: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1403: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1404: }
1405: return(0);
1406: }