Actual source code: mpiptap.c

  1: #define PETSCMAT_DLL

  3: /*
  4:   Defines projective product routines where A is a MPIAIJ matrix
  5:           C = P^T * A * P
  6: */
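
The routines in this file back the public MatPtAP() interface when A is an MPIAIJ matrix. As a rough usage sketch (not part of this file; the matrix names and the fill estimate of 1.0 are illustrative), a caller builds the product once with MAT_INITIAL_MATRIX and can then refresh the numerical values with MAT_REUSE_MATRIX:

    Mat A,P,C;
    /* ... create and assemble MPIAIJ matrices A (n x n) and P (n x m) ... */
    MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);  /* symbolic + numeric product C = P^T*A*P */
    /* ... change the numerical values of A, keeping its nonzero pattern ... */
    MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);    /* reuse the cached symbolic data */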

  8:  #include "../src/mat/impls/aij/seq/aij.h"
  9:  #include "../src/mat/utils/freespace.h"
 10:  #include "../src/mat/impls/aij/mpi/mpiaij.h"
 11:  #include "petscbt.h"

 13: EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
 16: PetscErrorCode  MatDestroy_MPIAIJ_MatPtAP(Mat A)
 17: {
 18:   PetscErrorCode       ierr;
 19:   Mat_Merge_SeqsToMPI  *merge;
 20:   PetscContainer       container;

 23:   PetscObjectQuery((PetscObject)A,"MatMergeSeqsToMPI",(PetscObject *)&container);
 24:   if (container) {
 25:     PetscContainerGetPointer(container,(void **)&merge);
 26:     PetscFree(merge->id_r);
 27:     PetscFree(merge->len_s);
 28:     PetscFree(merge->len_r);
 29:     PetscFree(merge->bi);
 30:     PetscFree(merge->bj);
 31:     PetscFree(merge->buf_ri);
 32:     PetscFree(merge->buf_rj);
 33:     PetscFree(merge->coi);
 34:     PetscFree(merge->coj);
 35:     PetscFree(merge->owners_co);
 36:     PetscFree(merge->rowmap.range);
 37: 
 38:     PetscContainerDestroy(container);
 39:     PetscObjectCompose((PetscObject)A,"MatMergeSeqsToMPI",0);
 40:   }
 41:   merge->MatDestroy(A);
 42:   PetscFree(merge);
 43:   return(0);
 44: }

 48: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
 49: {
 50:   PetscErrorCode       ierr;
 51:   Mat_Merge_SeqsToMPI  *merge;
 52:   PetscContainer       container;

 55:   PetscObjectQuery((PetscObject)A,"MatMergeSeqsToMPI",(PetscObject *)&container);
 56:   if (container) {
 57:     PetscContainerGetPointer(container,(void **)&merge);
 58:   } else {
 59:     SETERRQ(PETSC_ERR_PLIB,"Container does not exist");
 60:   }
 61:   (*merge->MatDuplicate)(A,op,M);
 62:   (*M)->ops->destroy   = merge->MatDestroy;   /* =MatDestroy_MPIAIJ, *M doesn't duplicate A's container! */
 63:   (*M)->ops->duplicate = merge->MatDuplicate; /* =MatDuplicate_MPIAIJ */
 64:   return(0);
 65: }
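
Both shims above rely on the PetscContainer that MatPtAPSymbolic_MPIAIJ_MPIAIJ() composes onto the product matrix under the key "MatMergeSeqsToMPI". A minimal sketch of that attach/query pattern, using only calls that already appear in this file (error checking omitted):

    PetscContainer       container;
    Mat_Merge_SeqsToMPI  *merge;   /* filled in by the symbolic phase */

    /* attach the work structure to the product matrix C */
    PetscContainerCreate(PETSC_COMM_SELF,&container);
    PetscContainerSetPointer(container,merge);
    PetscObjectCompose((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject)container);

    /* later: recover it from C (container comes back null if nothing was attached) */
    PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&container);
    if (container) { PetscContainerGetPointer(container,(void **)&merge); }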

 69: PetscErrorCode MatPtAPSymbolic_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
 70: {

 74:   if (!P->ops->ptapsymbolic_mpiaij) {
 75:     SETERRQ2(PETSC_ERR_SUP,"Not implemented for A=%s and P=%s",((PetscObject)A)->type_name,((PetscObject)P)->type_name);
 76:   }
 77:   (*P->ops->ptapsymbolic_mpiaij)(A,P,fill,C);
 78:   return(0);
 79: }

 83: PetscErrorCode MatPtAPNumeric_MPIAIJ(Mat A,Mat P,Mat C)
 84: {

 88:   if (!P->ops->ptapnumeric_mpiaij) {
 89:     SETERRQ2(PETSC_ERR_SUP,"Not implemented for A=%s and P=%s",((PetscObject)A)->type_name,((PetscObject)P)->type_name);
 90:   }
 91:   (*P->ops->ptapnumeric_mpiaij)(A,P,C);
 92:   return(0);
 93: }
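
The two dispatchers above realize the symbolic/numeric split behind MatPtAP(); they simply forward to the method table of P. A hedged sketch of driving the two phases explicitly (the fill estimate is illustrative):

    Mat C;
    MatPtAPSymbolic(A,P,1.0,&C);   /* allocate and compute the nonzero structure of C = P^T*A*P */
    MatPtAPNumeric(A,P,C);         /* fill in the values; may be repeated as A's values change */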

 97: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
 98: {
 99:   PetscErrorCode       ierr;
100:   Mat                  B_mpi;
101:   Mat_MatMatMultMPI    *ap;
102:   PetscContainer       container;
103:   PetscFreeSpaceList   free_space=PETSC_NULL,current_space=PETSC_NULL;
104:   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
105:   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
106:   Mat_SeqAIJ           *p_loc,*p_oth;
107:   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
108:   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz;
109:   PetscInt             nlnk,*lnk,*owners_co,*coi,*coj,i,k,pnz,row;
110:   PetscInt             am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
111:   PetscBT              lnkbt;
112:   MPI_Comm             comm=((PetscObject)A)->comm;
113:   PetscMPIInt          size,rank,tag,*len_si,*len_s,*len_ri;
114:   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
115:   PetscInt             len,proc,*dnz,*onz,*owners;
116:   PetscInt             nzi,*bi,*bj;
117:   PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
118:   MPI_Request          *swaits,*rwaits;
119:   MPI_Status           *sstatus,rstatus;
120:   Mat_Merge_SeqsToMPI  *merge;
121:   PetscInt             *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0;
122:   PetscMPIInt          j;

125:   MPI_Comm_size(comm,&size);
126:   MPI_Comm_rank(comm,&rank);

128:   /* destroy the container 'Mat_MatMatMultMPI' in case it is already attached to P */
129:   PetscObjectQuery((PetscObject)P,"Mat_MatMatMultMPI",(PetscObject *)&container);
130:   if (container) {
131:     /* reset functions */
132:     PetscContainerGetPointer(container,(void **)&ap);
133:     P->ops->destroy = ap->MatDestroy;
134:     P->ops->duplicate = ap->MatDuplicate;
135:     /* destroy container and contents */
136:     PetscContainerDestroy(container);
137:     PetscObjectCompose((PetscObject)P,"Mat_MatMatMultMPI",0);
138:   }

140:   /* create the container 'Mat_MatMatMultMPI' and attach it to P */
141:   PetscNew(Mat_MatMatMultMPI,&ap);
142:   ap->abi=PETSC_NULL; ap->abj=PETSC_NULL;
143:   ap->abnz_max = 0;

145:   PetscContainerCreate(PETSC_COMM_SELF,&container);
146:   PetscContainerSetPointer(container,ap);
147:   PetscObjectCompose((PetscObject)P,"Mat_MatMatMultMPI",(PetscObject)container);
148:   ap->MatDestroy  = P->ops->destroy;
149:   P->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
150:   ap->reuse       = MAT_INITIAL_MATRIX;
151:   PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatMultMPI);

153:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
154:   MatGetBrowsOfAoCols(A,P,MAT_INITIAL_MATRIX,&ap->startsj,&ap->bufa,&ap->B_oth);
155:   /* get P_loc by taking all local rows of P */
156:   MatGetLocalMat(P,MAT_INITIAL_MATRIX,&ap->B_loc);

158:   p_loc = (Mat_SeqAIJ*)(ap->B_loc)->data;
159:   p_oth = (Mat_SeqAIJ*)(ap->B_oth)->data;
160:   pi_loc = p_loc->i; pj_loc = p_loc->j;
161:   pi_oth = p_oth->i; pj_oth = p_oth->j;

163:   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
164:   /*-------------------------------------------------------------------*/
165:   PetscMalloc((am+2)*sizeof(PetscInt),&api);
166:   ap->abi = api;
167:   api[0] = 0;

169:   /* create and initialize a linked list */
170:   nlnk = pN+1;
171:   PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);

173:   /* Initial FreeSpace size is fill*nnz(A) */
174:   PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am])),&free_space);
175:   current_space = free_space;

177:   for (i=0;i<am;i++) {
178:     apnz = 0;
179:     /* diagonal portion of A */
180:     nzi = adi[i+1] - adi[i];
181:     for (j=0; j<nzi; j++){
182:       row = *adj++;
183:       pnz = pi_loc[row+1] - pi_loc[row];
184:       Jptr  = pj_loc + pi_loc[row];
185:       /* add non-zero cols of P into the sorted linked list lnk */
186:       PetscLLAdd(pnz,Jptr,pN,nlnk,lnk,lnkbt);
187:       apnz += nlnk;
188:     }
189:     /* off-diagonal portion of A */
190:     nzi = aoi[i+1] - aoi[i];
191:     for (j=0; j<nzi; j++){
192:       row = *aoj++;
193:       pnz = pi_oth[row+1] - pi_oth[row];
194:       Jptr  = pj_oth + pi_oth[row];
195:       PetscLLAdd(pnz,Jptr,pN,nlnk,lnk,lnkbt);
196:       apnz += nlnk;
197:     }

199:     api[i+1] = api[i] + apnz;
200:     if (ap->abnz_max < apnz) ap->abnz_max = apnz;

202:     /* if free space is not available, double the total space in the list */
203:     if (current_space->local_remaining<apnz) {
204:       PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
205:       nspacedouble++;
206:     }

208:     /* Copy data into free space, then initialize lnk */
209:     PetscLLClean(pN,pN,apnz,lnk,current_space->array,lnkbt);
210:     current_space->array           += apnz;
211:     current_space->local_used      += apnz;
212:     current_space->local_remaining -= apnz;
213:   }
214:   /* Allocate space for apj, initialize apj, and */
215:   /* destroy list of free space and other temporary array(s) */
216:   PetscMalloc((api[am]+1)*sizeof(PetscInt),&ap->abj);
217:   apj = ap->abj;
218:   PetscFreeSpaceContiguous(&free_space,ap->abj);

220:   /* determine symbolic Co=(p->B)^T*AP - send to others */
221:   /*----------------------------------------------------*/
222:   MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);

224:   /* then, compute symbolic Co = (p->B)^T*AP */
225:   pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors 
226:                          >= (num of nonzero rows of C_seq) - pn */
227:   PetscMalloc((pon+1)*sizeof(PetscInt),&coi);
228:   coi[0] = 0;

230:   /* set initial free space to be 3*pon*max( nnz(AP) per row) */
231:   nnz           = 3*pon*ap->abnz_max + 1;
232:   PetscFreeSpaceGet(nnz,&free_space);
233:   current_space = free_space;

235:   for (i=0; i<pon; i++) {
236:     nnz  = 0;
237:     pnz = poti[i+1] - poti[i];
238:     j     = pnz;
239:     ptJ   = potj + poti[i+1];
240:     while (j){/* assume cols are almost in increasing order; traversing from the end saves computation */
241:       j--; ptJ--;
242:       row  = *ptJ; /* row of AP == col of Pot */
243:       apnz = api[row+1] - api[row];
244:       Jptr   = apj + api[row];
245:       /* add non-zero cols of AP into the sorted linked list lnk */
246:       PetscLLAdd(apnz,Jptr,pN,nlnk,lnk,lnkbt);
247:       nnz += nlnk;
248:     }

250:     /* If free space is not available, double the total space in the list */
251:     if (current_space->local_remaining<nnz) {
252:       PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
253:     }

255:     /* Copy data into free space, then initialize lnk */
256:     PetscLLClean(pN,pN,nnz,lnk,current_space->array,lnkbt);
257:     current_space->array           += nnz;
258:     current_space->local_used      += nnz;
259:     current_space->local_remaining -= nnz;
260:     coi[i+1] = coi[i] + nnz;
261:   }
262:   PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);
263:   PetscFreeSpaceContiguous(&free_space,coj);
264:   MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);

266:   /* send j-array (coj) of Co to other processors */
267:   /*----------------------------------------------*/
268:   /* determine row ownership */
269:   PetscNew(Mat_Merge_SeqsToMPI,&merge);
270:   PetscMapInitialize(comm,&merge->rowmap);
271:   merge->rowmap.n = pn;
272:   merge->rowmap.N = PETSC_DECIDE;
273:   merge->rowmap.bs = 1;
274:   PetscMapSetUp(&merge->rowmap);
275:   owners = merge->rowmap.range;

277:   /* determine the number of messages to send, their lengths */
278:   PetscMalloc(size*sizeof(PetscMPIInt),&len_si);
279:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));
280:   PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);
281:   len_s = merge->len_s;
282:   merge->nsend = 0;
283: 
284:   PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);
285:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));

287:   proc = 0;
288:   for (i=0; i<pon; i++){
289:     while (prmap[i] >= owners[proc+1]) proc++;
290:     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
291:     len_s[proc] += coi[i+1] - coi[i];
292:   }

294:   len   = 0;  /* max length of buf_si[] */
295:   owners_co[0] = 0;
296:   for (proc=0; proc<size; proc++){
297:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
298:     if (len_si[proc]){
299:       merge->nsend++;
300:       len_si[proc] = 2*(len_si[proc] + 1);
301:       len += len_si[proc];
302:     }
303:   }

305:   /* determine the number and length of messages to receive for coi and coj  */
306:   PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);
307:   PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);

309:   /* post the Irecv and Isend of coj */
310:   PetscCommGetNewTag(comm,&tag);
311:   PetscPostIrecvInt(comm,tag,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);

313:   PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);

315:   for (proc=0, k=0; proc<size; proc++){
316:     if (!len_s[proc]) continue;
317:     i = owners_co[proc];
318:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tag,comm,swaits+k);
319:     k++;
320:   }

322:   /* receives and sends of coj are complete */
323:   PetscMalloc(size*sizeof(MPI_Status),&sstatus);
324:   i = merge->nrecv;
325:   while (i--) {
326:     MPI_Waitany(merge->nrecv,rwaits,&j,&rstatus);
327:   }
328:   PetscFree(rwaits);
329:   if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
330: 
331:   /* send and recv coi */
332:   /*-------------------*/
333:   PetscPostIrecvInt(comm,tag,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
334: 
335:   PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);
336:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
337:   for (proc=0,k=0; proc<size; proc++){
338:     if (!len_s[proc]) continue;
339:     /* form outgoing message for i-structure: 
340:          buf_si[0]:                 nrows to be sent
341:                [1:nrows]:           row index (global)
342:                [nrows+1:2*nrows+1]: i-structure index
343:     */
344:     /*-------------------------------------------*/
345:     nrows = len_si[proc]/2 - 1;
346:     buf_si_i    = buf_si + nrows+1;
347:     buf_si[0]   = nrows;
348:     buf_si_i[0] = 0;
349:     nrows = 0;
350:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
351:       nzi = coi[i+1] - coi[i];
352:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
353:       buf_si[nrows+1] = prmap[i] - owners[proc]; /* local row index */
354:       nrows++;
355:     }
356:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tag,comm,swaits+k);
357:     k++;
358:     buf_si += len_si[proc];
359:   }
360:   i = merge->nrecv;
361:   while (i--) {
362:     MPI_Waitany(merge->nrecv,rwaits,&j,&rstatus);
363:   }
364:   PetscFree(rwaits);
365:   if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
366:   /*
367:   PetscInfo2(A,"nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);
368:   for (i=0; i<merge->nrecv; i++){
369:     PetscInfo3(A,"recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);
370:   }
371:   */
372:   PetscFree(len_si);
373:   PetscFree(len_ri);
374:   PetscFree(swaits);
375:   PetscFree(sstatus);
376:   PetscFree(buf_s);

378:   /* compute the local portion of C (mpi mat) */
379:   /*------------------------------------------*/
380:   MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);

382:   /* allocate bi array and free space for accumulating nonzero column info */
383:   PetscMalloc((pn+1)*sizeof(PetscInt),&bi);
384:   bi[0] = 0;
385: 
386:   /* set initial free space to be 3*pn*max( nnz(AP) per row) */
387:   nnz           = 3*pn*ap->abnz_max + 1;
388:   PetscFreeSpaceGet(nnz,&free_space);
389:   current_space = free_space;

391:   PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);
392:   nextrow = buf_ri_k + merge->nrecv;
393:   nextci  = nextrow + merge->nrecv;
394:   for (k=0; k<merge->nrecv; k++){
395:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
396:     nrows       = *buf_ri_k[k];
397:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
398:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
399:   }
400:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
401:   for (i=0; i<pn; i++) {
402:     /* add pdt[i,:]*AP into lnk */
403:     nnz = 0;
404:     pnz  = pdti[i+1] - pdti[i];
405:     j    = pnz;
406:     ptJ  = pdtj + pdti[i+1];
407:     while (j){/* assume cols are almost in increasing order; traversing from the end saves computation */
408:       j--; ptJ--;
409:       row  = *ptJ; /* row of AP == col of Pt */
410:       apnz = api[row+1] - api[row];
411:       Jptr   = apj + api[row];
412:       /* add non-zero cols of AP into the sorted linked list lnk */
413:       PetscLLAdd(apnz,Jptr,pN,nlnk,lnk,lnkbt);
414:       nnz += nlnk;
415:     }
416:     /* add received col data into lnk */
417:     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
418:       if (i == *nextrow[k]) { /* i-th row */
419:         nzi = *(nextci[k]+1) - *nextci[k];
420:         Jptr  = buf_rj[k] + *nextci[k];
421:         PetscLLAdd(nzi,Jptr,pN,nlnk,lnk,lnkbt);
422:         nnz += nlnk;
423:         nextrow[k]++; nextci[k]++;
424:       }
425:     }

427:     /* if free space is not available, make more free space */
428:     if (current_space->local_remaining<nnz) {
429:       PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
430:     }
431:     /* copy data into free space, then initialize lnk */
432:     PetscLLClean(pN,pN,nnz,lnk,current_space->array,lnkbt);
433:     MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
434:     current_space->array           += nnz;
435:     current_space->local_used      += nnz;
436:     current_space->local_remaining -= nnz;
437:     bi[i+1] = bi[i] + nnz;
438:   }
439:   MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);
440:   PetscFree(buf_ri_k);

442:   PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);
443:   PetscFreeSpaceContiguous(&free_space,bj);
444:   PetscLLDestroy(lnk,lnkbt);

446:   /* create symbolic parallel matrix B_mpi */
447:   /*---------------------------------------*/
448:   MatCreate(comm,&B_mpi);
449:   MatSetSizes(B_mpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
450:   MatSetType(B_mpi,MATMPIAIJ);
451:   MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);
452:   MatPreallocateFinalize(dnz,onz);

454:   merge->bi            = bi;
455:   merge->bj            = bj;
456:   merge->coi           = coi;
457:   merge->coj           = coj;
458:   merge->buf_ri        = buf_ri;
459:   merge->buf_rj        = buf_rj;
460:   merge->owners_co     = owners_co;
461:   merge->MatDestroy    = B_mpi->ops->destroy;
462:   merge->MatDuplicate  = B_mpi->ops->duplicate;

464:   /* B_mpi is not ready for use - assembly will be done by MatPtAPNumeric() */
465:   B_mpi->assembled     = PETSC_FALSE;
466:   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_MatPtAP;
467:   B_mpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;

469:   /* attach the supporting struct to B_mpi for reuse */
470:   PetscContainerCreate(PETSC_COMM_SELF,&container);
471:   PetscContainerSetPointer(container,merge);
472:   PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);
473:   *C = B_mpi;
474: #if defined(PETSC_USE_INFO)
475:   if (bi[pn] != 0) {
476:     PetscReal afill = ((PetscReal)bi[pn])/(adi[am]+aoi[am]);
477:     if (afill < 1.0) afill = 1.0;
478:     PetscInfo3(B_mpi,"Reallocs %D; Fill ratio: given %G needed %G when computing A*P.\n",nspacedouble,fill,afill);
479:     PetscInfo1(B_mpi,"Use MatPtAP(A,P,MatReuse,%G,&C) for best performance.\n",afill);
480:   } else {
481:     PetscInfo(B_mpi,"Empty matrix product\n");
482:   }
483: #endif
484:   return(0);
485: }
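
The symbolic routine above builds each row pattern with PETSc's sorted linked-list macros: column indices from several sparse rows are merged into one sorted, duplicate-free list and then drained into the free-space buffer. A stripped-down sketch of that pattern, using the same call shapes as above (the index arrays cols1/cols2 and the output array rowcols are illustrative):

    PetscInt nlnk,*lnk,cnz=0;
    PetscBT  lnkbt;

    nlnk = pN+1;
    PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);       /* sorted list over the column range [0,pN) */
    PetscLLAdd(nz1,cols1,pN,nlnk,lnk,lnkbt);   /* merge one set of column indices */
    cnz += nlnk;                               /* nlnk returns the number of newly inserted cols */
    PetscLLAdd(nz2,cols2,pN,nlnk,lnk,lnkbt);   /* merge another set */
    cnz += nlnk;
    PetscLLClean(pN,pN,cnz,lnk,rowcols,lnkbt); /* drain sorted indices into rowcols and reset the list */
    PetscLLDestroy(lnk,lnkbt);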

489: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
490: {
491:   PetscErrorCode       ierr;
492:   Mat_Merge_SeqsToMPI  *merge;
493:   Mat_MatMatMultMPI    *ap;
494:   PetscContainer       cont_merge,cont_ptap;
495:   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
496:   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
497:   Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
498:   Mat_SeqAIJ           *p_loc,*p_oth;
499:   PetscInt             *adi=ad->i,*aoi=ao->i,*adj=ad->j,*aoj=ao->j,*apJ,nextp;
500:   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
501:   PetscInt             i,j,k,anz,pnz,apnz,nextap,row,*cj;
502:   MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*pa,*ca,*pa_loc,*pa_oth;
503:   PetscInt             am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
504:   MPI_Comm             comm=((PetscObject)C)->comm;
505:   PetscMPIInt          size,rank,taga,*len_s;
506:   PetscInt             *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
507:   PetscInt             **buf_ri,**buf_rj;
508:   PetscInt             cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
509:   MPI_Request          *s_waits,*r_waits;
510:   MPI_Status           *status;
511:   MatScalar            **abuf_r,*ba_i,*pA,*coa,*ba;
512:   PetscInt             *api,*apj,*coi,*coj;
513:   PetscInt             *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;

516:   MPI_Comm_size(comm,&size);
517:   MPI_Comm_rank(comm,&rank);

519:   PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);
520:   if (cont_merge) {
521:     PetscContainerGetPointer(cont_merge,(void **)&merge);
522:   } else {
523:     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
524:   }

526:   PetscObjectQuery((PetscObject)P,"Mat_MatMatMultMPI",(PetscObject *)&cont_ptap);
527:   if (cont_ptap) {
528:     PetscContainerGetPointer(cont_ptap,(void **)&ap);
529:     if (ap->reuse == MAT_INITIAL_MATRIX){
530:       ap->reuse = MAT_REUSE_MATRIX;
531:     } else { /* update numerical values of P_oth and P_loc */
532:       MatGetBrowsOfAoCols(A,P,MAT_REUSE_MATRIX,&ap->startsj,&ap->bufa,&ap->B_oth);
533:       MatGetLocalMat(P,MAT_REUSE_MATRIX,&ap->B_loc);
534:     }
535:   } else {
536:     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
537:   }

539:   /* get data from symbolic products */
540:   p_loc = (Mat_SeqAIJ*)(ap->B_loc)->data;
541:   p_oth = (Mat_SeqAIJ*)(ap->B_oth)->data;
542:   pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
543:   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
544: 
545:   coi = merge->coi; coj = merge->coj;
546:   PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);
547:   PetscMemzero(coa,coi[pon]*sizeof(MatScalar));

549:   bi     = merge->bi; bj = merge->bj;
550:   owners = merge->rowmap.range;
551:   PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);
552:   PetscMemzero(ba,bi[cm]*sizeof(MatScalar));

554:   /* get data from symbolic A*P */
555:   PetscMalloc((ap->abnz_max+1)*sizeof(MatScalar),&apa);
556:   PetscMemzero(apa,ap->abnz_max*sizeof(MatScalar));

558:   /* compute numeric C_seq=P_loc^T*A_loc*P */
559:   api = ap->abi; apj = ap->abj;
560:   for (i=0;i<am;i++) {
561:     /* form i-th sparse row of A*P */
562:     apnz = api[i+1] - api[i];
563:     apJ  = apj + api[i];
564:     /* diagonal portion of A */
565:     anz  = adi[i+1] - adi[i];
566:     for (j=0;j<anz;j++) {
567:       row = *adj++;
568:       pnz = pi_loc[row+1] - pi_loc[row];
569:       pj  = pj_loc + pi_loc[row];
570:       pa  = pa_loc + pi_loc[row];
571:       nextp = 0;
572:       for (k=0; nextp<pnz; k++) {
573:         if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
574:           apa[k] += (*ada)*pa[nextp++];
575:         }
576:       }
577:       PetscLogFlops(2*pnz);
578:       ada++;
579:     }
580:     /* off-diagonal portion of A */
581:     anz  = aoi[i+1] - aoi[i];
582:     for (j=0; j<anz; j++) {
583:       row = *aoj++;
584:       pnz = pi_oth[row+1] - pi_oth[row];
585:       pj  = pj_oth + pi_oth[row];
586:       pa  = pa_oth + pi_oth[row];
587:       nextp = 0;
588:       for (k=0; nextp<pnz; k++) {
589:         if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
590:           apa[k] += (*aoa)*pa[nextp++];
591:         }
592:       }
593:       PetscLogFlops(2*pnz);
594:       aoa++;
595:     }

597:     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
598:     pnz = pi_loc[i+1] - pi_loc[i];
599:     for (j=0; j<pnz; j++) {
600:       nextap = 0;
601:       row    = *pJ++; /* global index */
602:       if (row < pcstart || row >=pcend) { /* put the value into Co */
603:         cj  = coj + coi[*poJ];
604:         ca  = coa + coi[*poJ++];
605:       } else {                            /* put the value into Cd */
606:         cj   = bj + bi[*pdJ];
607:         ca   = ba + bi[*pdJ++];
608:       }
609:       for (k=0; nextap<apnz; k++) {
610:         if (cj[k]==apJ[nextap]) ca[k] += (*pA)*apa[nextap++];
611:       }
612:       PetscLogFlops(2*apnz);
613:       pA++;
614:     }

616:     /* zero the current row info for A*P */
617:     PetscMemzero(apa,apnz*sizeof(MatScalar));
618:   }
619:   PetscFree(apa);
620: 
621:   /* send and recv matrix values */
622:   /*-----------------------------*/
623:   buf_ri = merge->buf_ri;
624:   buf_rj = merge->buf_rj;
625:   len_s  = merge->len_s;
626:   PetscCommGetNewTag(comm,&taga);
627:   PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);

629:   PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);
630:   for (proc=0,k=0; proc<size; proc++){
631:     if (!len_s[proc]) continue;
632:     i = merge->owners_co[proc];
633:     MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
634:     k++;
635:   }
636:   PetscMalloc(size*sizeof(MPI_Status),&status);
637:   if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
638:   if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
639:   PetscFree(status);

641:   PetscFree(s_waits);
642:   PetscFree(r_waits);
643:   PetscFree(coa);

645:   /* insert local and received values into C */
646:   /*-----------------------------------------*/
647:   PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);
648:   nextrow   = buf_ri_k + merge->nrecv;
649:   nextci = nextrow + merge->nrecv;

651:   for (k=0; k<merge->nrecv; k++){
652:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
653:     nrows       = *(buf_ri_k[k]);
654:     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
655:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
656:   }

658:   for (i=0; i<cm; i++) {
659:     row = owners[rank] + i; /* global row index of C_seq */
660:     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
661:     ba_i = ba + bi[i];
662:     bnz  = bi[i+1] - bi[i];
663:     /* add received vals into ba */
664:     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
665:       /* i-th row */
666:       if (i == *nextrow[k]) {
667:         cnz = *(nextci[k]+1) - *nextci[k];
668:         cj  = buf_rj[k] + *(nextci[k]);
669:         ca  = abuf_r[k] + *(nextci[k]);
670:         nextcj = 0;
671:         for (j=0; nextcj<cnz; j++){
672:           if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
673:             ba_i[j] += ca[nextcj++];
674:           }
675:         }
676:         nextrow[k]++; nextci[k]++;
677:       }
678:     }
679:     MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
680:     PetscLogFlops(2*cnz);
681:   }
682:   MatSetBlockSize(C,1);
683:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
684:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

686:   PetscFree(ba);
687:   PetscFree(abuf_r);
688:   PetscFree(buf_ri_k);
689:   return(0);
690: }
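
Both the symbolic and the numeric phase ship the locally computed off-process rows (Co) with the same nonblocking scheme: post all receives with PetscPostIrecvInt()/PetscPostIrecvScalar(), issue one MPI_Isend per destination process, then wait on both sides. A condensed sketch of that exchange, with buffer and count names modeled on the routines above (the send-side data sendbuf/send_start and the counts nsend/nrecv/len_s/len_r/id_r are assumed to be set up as in the symbolic routine):

    PetscMPIInt  tag,idx;
    MPI_Request  *rwaits,*swaits;
    MPI_Status   *sstatus,rstatus;
    PetscInt     **buf_rj,i,k,proc;

    PetscCommGetNewTag(comm,&tag);
    PetscPostIrecvInt(comm,tag,nrecv,id_r,len_r,&buf_rj,&rwaits);   /* one Irecv per sending process */
    PetscMalloc((nsend+1)*sizeof(MPI_Request),&swaits);
    PetscMalloc(size*sizeof(MPI_Status),&sstatus);
    for (proc=0,k=0; proc<size; proc++) {
      if (!len_s[proc]) continue;                                   /* nothing to send to this process */
      MPI_Isend(sendbuf+send_start[proc],len_s[proc],MPIU_INT,proc,tag,comm,swaits+k++);
    }
    for (i=0; i<nrecv; i++) { MPI_Waitany(nrecv,rwaits,&idx,&rstatus); }  /* drain the receives */
    if (nsend) { MPI_Waitall(nsend,swaits,sstatus); }
    PetscFree(rwaits); PetscFree(swaits); PetscFree(sstatus);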