Actual source code: inode.c

  1: #define PETSCMAT_DLL

  3: /*
  4:   This file provides high performance routines for the Inode format (compressed sparse row)
  5:   by taking advantage of rows with identical nonzero structure (I-nodes).
  6: */
 7:  #include ../src/mat/impls/aij/seq/aij.h

 11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
 12: {
 13:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
 15:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 18:   n      = A->cmap->n;
 19:   m      = A->rmap->n;
 20:   ns_row = a->inode.size;
 21: 
 22:   min_mn = (m < n) ? m : n;
 23:   if (!ns) {
 24:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 25:     for(; count+1 < n; count++,i++);
 26:     if (count < n)  {
 27:       i++;
 28:     }
 29:     *size = i;
 30:     return(0);
 31:   }
 32:   PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
 33: 
 34:   /* Use the same row structure wherever feasible. */
 35:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 36:     ns_col[i] = ns_row[i];
 37:   }

 39:   /* if m < n; pad up the remainder with inode_limit */
 40:   for(; count+1 < n; count++,i++) {
 41:     ns_col[i] = 1;
 42:   }
 43:   /* The last node is the odd ball. padd it up with the remaining rows; */
 44:   if (count < n)  {
 45:     ns_col[i] = n - count;
 46:     i++;
 47:   } else if (count > n) {
 48:     /* Adjust for the over estimation */
 49:     ns_col[i-1] += n - count;
 50:   }
 51:   *size = i;
 52:   *ns   = ns_col;
 53:   return(0);
 54: }


 57: /*
 58:       This builds the symmetric version of the nonzero structure, compressed
           to one row and one column per inode.

           Input:
             A      - matrix whose data is a Mat_SeqAIJ with inode information
                      (a->inode.size, a->inode.node_count); it must have as many
                      rows as columns, otherwise PETSC_ERR_SUP is raised
             ishift - offset applied while reading a->j (added to the start
                      position and to every column value); the callers in this
                      file pass 0
             oshift - offset added to every index written to the output

           Output:
             *iia   - newly allocated array of node_count+1 row offsets
             *jja   - newly allocated array of node indices

           Neither output array is freed here; MatRestoreRowIJ_Inode() and
           MatRestoreColumnIJ_Inode() release them with PetscFree().

           Only the first row of every node is inspected. Each entry found in a
           column node i2 < i1 is recorded twice, once for node i1 and once for
           node i2; an entry in column node i1 itself is recorded once.

           NOTE(review): the return values of PetscMalloc(), PetscMemzero() and
           PetscFree() are not checked in this listing.
 59: */
 62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 63: {
 64:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
 66:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
 67:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 70:   nslim_row = a->inode.node_count;
 71:   m         = A->rmap->n;
 72:   n         = A->cmap->n;
 73:   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
 74: 
 75:   /* Use the row_inode as column_inode */
 76:   nslim_col = nslim_row;
 77:   ns_col    = ns_row;

 79:   /* allocate space for the reformatted inode structure */
 80:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
 81:   PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
       /* tns[k] = running sum of the node sizes = first column of node k */
 82:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

       /* tvc[c] = index of the node that contains column c */
 84:   for (i1=0,col=0; i1<nslim_col; ++i1){
 85:     nsz = ns_col[i1];
 86:     for (i2=0; i2<nsz; ++i2,++col)
 87:       tvc[col] = i1;
 88:   }
 89:   /* allocate space for row pointers */
 90:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
 91:   *iia = ia;
 92:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
       /* work[k] becomes the next free slot in ja for node k during the fill pass */
 93:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

 95:   /* determine the number of columns in each row */
 96:   ia[0] = oshift;
 97:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

         /* [j,jmax) are the column indices of the first row of node i1 */
 99:     j    = aj + ai[row] + ishift;
100:     jmax = aj + ai[row+1] + ishift;
101:     i2   = 0;
         /* NOTE(review): the first entry is read without testing j<jmax, i.e.
            without checking that the row has any entries - confirm rows are
            never empty here */
102:     col  = *j++ + ishift;
103:     i2   = tvc[col];
104:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
105:       ia[i1+1]++;
106:       ia[i2+1]++;
107:       i2++;                     /* Start col of next node */
           /* NOTE(review): *j is dereferenced before j<jmax is tested, so the
              entry at jmax is read once when the row is exhausted */
108:       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109:       i2 = tvc[col];
110:     }
111:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112:   }

114:   /* shift ia[i] to point to next row */
115:   for (i1=1; i1<nslim_row+1; i1++) {
116:     row        = ia[i1-1];
117:     ia[i1]    += row;
118:     work[i1-1] = row - oshift;
119:   }

121:   /* allocate space for column pointers */
122:   nz   = ia[nslim_row] + (!ishift);
123:   PetscMalloc(nz*sizeof(PetscInt),&ja);
124:   *jja = ja;

126:  /* loop over lower triangular part putting into ja; mirrors the counting loop above */
127:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128:     j    = aj + ai[row] + ishift;
129:     jmax = aj + ai[row+1] + ishift;
130:     i2   = 0;                     /* Col inode index */
131:     col  = *j++ + ishift;
132:     i2   = tvc[col];
133:     while (i2<i1 && j<jmax) {
           /* record the pair in both directions: i1 in row i2 and i2 in row i1 */
134:       ja[work[i2]++] = i1 + oshift;
135:       ja[work[i1]++] = i2 + oshift;
136:       ++i2;
137:       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138:       i2 = tvc[col];
139:     }
140:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

142:   }
       /* only the scratch arrays are released; ia and ja stay with the caller */
143:   PetscFree(work);
144:   PetscFree(tns);
145:   PetscFree(tvc);
146:   return(0);
147: }

149: /*
150:       This builds the nonsymmetric version of the nonzero structure,
           compressed to one row per row node and one column per column node.

           Input:
             A      - matrix whose data is a Mat_SeqAIJ with inode information
             ishift - offset applied while reading a->j (added to the start
                      position and to every column value); the caller in this
                      file passes 0
             oshift - offset added to every index written to the output

           Output:
             *iia   - newly allocated array of node_count+1 row offsets
             *jja   - newly allocated array of column-node indices

           The column nodes come from Mat_CreateColInode(). Only the first row
           of every row node is inspected. The output arrays are not freed
           here; MatRestoreRowIJ_Inode() releases them with PetscFree().

           NOTE(review): the return values of Mat_CreateColInode(),
           PetscMalloc(), PetscMemzero() and PetscFree() are not checked in this
           listing.
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
158:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

162:   nslim_row = a->inode.node_count;
163:   n         = A->cmap->n;

165:   /* Create the column_inode for this matrix */
166:   Mat_CreateColInode(A,&nslim_col,&ns_col);
167: 
168:   /* allocate space for the reformatted column_inode structure */
169:   PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170:   PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
       /* tns[k] = running sum of the column node sizes = first column of node k */
171:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

       /* tvc[c] = index of the column node that contains column c */
173:   for (i1=0,col=0; i1<nslim_col; ++i1){
174:     nsz = ns_col[i1];
175:     for (i2=0; i2<nsz; ++i2,++col)
176:       tvc[col] = i1;
177:   }
178:   /* allocate space for row pointers */
179:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180:   *iia = ia;
181:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
       /* work[k] becomes the next free slot in ja for row node k during the fill pass */
182:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

184:   /* determine the number of column nodes touched by each row node */
185:   ia[0] = oshift;
186:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187:     j   = aj + ai[row] + ishift;
         /* NOTE(review): the first entry is read before nz is known, i.e. also
            when the row has no entries - confirm rows are never empty here */
188:     col = *j++ + ishift;
189:     i2  = tvc[col];
190:     nz  = ai[row+1] - ai[row];
191:     while (nz-- > 0) {           /* one pass per column node touched */
192:       ia[i1+1]++;
193:       i2++;                     /* Start col of next node */
           /* skip the remaining entries that fall in the node just counted;
              NOTE(review): *j++ is evaluated before nz > 0 is tested */
194:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195:       if (nz > 0) i2 = tvc[col];
196:     }
197:   }

199:   /* shift ia[i] to point to next row */
200:   for (i1=1; i1<nslim_row+1; i1++) {
201:     row        = ia[i1-1];
202:     ia[i1]    += row;
203:     work[i1-1] = row - oshift;
204:   }

206:   /* allocate space for column pointers */
207:   nz   = ia[nslim_row] + (!ishift);
208:   PetscMalloc(nz*sizeof(PetscInt),&ja);
209:   *jja = ja;

211:  /* loop over matrix putting into ja; mirrors the counting loop above */
212:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213:     j   = aj + ai[row] + ishift;
214:     i2  = 0;                     /* Col inode index */
215:     col = *j++ + ishift;
216:     i2  = tvc[col];
217:     nz  = ai[row+1] - ai[row];
218:     while (nz-- > 0) {
219:       ja[work[i1]++] = i2 + oshift;
220:       ++i2;
221:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222:       if (nz > 0) i2 = tvc[col];
223:     }
224:   }
       /* only the scratch arrays are released; ia and ja stay with the caller */
225:   PetscFree(ns_col);
226:   PetscFree(work);
227:   PetscFree(tns);
228:   PetscFree(tvc);
229:   return(0);
230: }

234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;

240:   *n     = a->inode.node_count;
241:   if (!ia) return(0);
242:   if (!blockcompressed) {
243:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244:   } else if (symmetric) {
245:     MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246:   } else {
247:     MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248:   }
249:   return(0);
250: }

254: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255: {

259:   if (!ia) return(0);

261:   if (!blockcompressed) {
262:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263:   } else {
264:     PetscFree(*ia);
265:     PetscFree(*ja);
266:   }

268:   return(0);
269: }

271: /* ----------------------------------------------------------- */

/*
      This builds the nonsymmetric, column-oriented version of the nonzero
      structure, compressed to one column per column node and one row per row
      node.

      Input:
        A      - matrix whose data is a Mat_SeqAIJ with inode information
        ishift - offset applied while reading a->j (added to the start position
                 and to every column value); the caller in this file passes 0
        oshift - offset added to every index written to the output

      Output:
        *iia   - newly allocated array of (number of column nodes)+1 offsets
        *jja   - newly allocated array of row-node indices

      The column nodes come from Mat_CreateColInode(). Only the first row of
      every row node is inspected. The output arrays are not freed here;
      MatRestoreColumnIJ_Inode() releases them with PetscFree().

      NOTE(review): the return values of Mat_CreateColInode(), PetscMalloc(),
      PetscMemzero() and PetscFree() are not checked in this listing.
*/
275: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276: {
277:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
279:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

283:   nslim_row = a->inode.node_count;
284:   n         = A->cmap->n;

286:   /* Create the column_inode for this matrix */
287:   Mat_CreateColInode(A,&nslim_col,&ns_col);
288: 
289:   /* allocate space for the reformatted column_inode structure */
290:   PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
291:   PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
       /* tns[k] = running sum of the column node sizes = first column of node k */
292:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

       /* tvc[c] = index of the column node that contains column c */
294:   for (i1=0,col=0; i1<nslim_col; ++i1){
295:     nsz = ns_col[i1];
296:     for (i2=0; i2<nsz; ++i2,++col)
297:       tvc[col] = i1;
298:   }
299:   /* allocate space for column pointers */
300:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
301:   *iia = ia;
302:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
       /* work[k] becomes the next free slot in ja for column node k during the fill pass */
303:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);

305:   /* determine the number of row nodes that touch each column node */
306:   ia[0] = oshift;
307:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308:     j   = aj + ai[row] + ishift;
         /* NOTE(review): the first entry is read before nz is known, i.e. also
            when the row has no entries - confirm rows are never empty here */
309:     col = *j++ + ishift;
310:     i2  = tvc[col];
311:     nz  = ai[row+1] - ai[row];
312:     while (nz-- > 0) {           /* one pass per column node touched */
313:       /* ia[i1+1]++; */
314:       ia[i2+1]++;
315:       i2++;
           /* skip the remaining entries that fall in the node just counted;
              NOTE(review): *j++ is evaluated before nz > 0 is tested */
316:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317:       if (nz > 0) i2 = tvc[col];
318:     }
319:   }

321:   /* shift ia[i] to point to next col */
322:   for (i1=1; i1<nslim_col+1; i1++) {
323:     col        = ia[i1-1];
324:     ia[i1]    += col;
325:     work[i1-1] = col - oshift;
326:   }

328:   /* allocate space for the row-node indices */
329:   nz   = ia[nslim_col] + (!ishift);
330:   PetscMalloc(nz*sizeof(PetscInt),&ja);
331:   *jja = ja;

333:  /* loop over matrix putting into ja; mirrors the counting loop above */
334:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335:     j   = aj + ai[row] + ishift;
336:     i2  = 0;                     /* Col inode index */
337:     col = *j++ + ishift;
338:     i2  = tvc[col];
339:     nz  = ai[row+1] - ai[row];
340:     while (nz-- > 0) {
341:       /* ja[work[i1]++] = i2 + oshift; */
           /* row node i1 is appended to the list of column node i2 */
342:       ja[work[i2]++] = i1 + oshift;
343:       i2++;
344:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345:       if (nz > 0) i2 = tvc[col];
346:     }
347:   }
       /* only the scratch arrays are released; ia and ja stay with the caller */
348:   PetscFree(ns_col);
349:   PetscFree(work);
350:   PetscFree(tns);
351:   PetscFree(tvc);
352:   return(0);
353: }

357: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358: {

362:   Mat_CreateColInode(A,n,PETSC_NULL);
363:   if (!ia) return(0);

365:   if (!blockcompressed) {
366:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367:   } else if (symmetric) {
368:     /* Since the indices are symmetric it does'nt matter */
369:     MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370:   } else {
371:     MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372:   }
373:   return(0);
374: }

378: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379: {

383:   if (!ia) return(0);
384:   if (!blockcompressed) {
385:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
386:   } else {
387:     PetscFree(*ia);
388:     PetscFree(*ja);
389:   }
390:   return(0);
391: }

393: /* ----------------------------------------------------------- */

/*
   MatMult_Inode - Computes yy = A*xx one inode at a time.

   A node groups nsz consecutive rows (1 <= nsz <= 5 is handled) whose column
   indices are taken from the first row of the node for all of its rows. For
   each node the column indices and the matching x entries are therefore
   loaded once and reused for every row of the node.

   Input Parameters:
+  A  - the matrix; A->data is read as a Mat_SeqAIJ and a->inode.size must be set
-  xx - the vector to multiply with

   Output Parameter:
.  yy - receives the product

   Raises PETSC_ERR_COR when the inode structure is missing or a node has a
   size outside 1..5.

   NOTE(review): ierr is declared but never assigned in this listing; the
   return values of VecGetArray(), VecRestoreArray() and PetscLogFlops() are
   not checked.
*/
397: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398: {
399:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401:   PetscScalar       *y;
402:   const PetscScalar *x;
403:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404:   PetscErrorCode    ierr;
405:   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406: 
407: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409: #endif

412:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413:   node_max = a->inode.node_count;
414:   ns       = a->inode.size;     /* Node Size array */
       /* x is only read below; the cast drops the const for the VecGetArray() call */
415:   VecGetArray(xx,(PetscScalar**)&x);
416:   VecGetArray(yy,&y);
       /* idx, v1 and ii walk through the column indices, values and row offsets */
417:   idx  = a->j;
418:   v1   = a->a;
419:   ii   = a->i;

421:   for (i = 0,row = 0; i< node_max; ++i){
422:     nsz  = ns[i];
         /* number of nonzeros in the first row of this node */
423:     n    = ii[1] - ii[0];
         /* counts the rows of nodes whose first row has at least one nonzero (used for the flop count) */
424:     nonzerorow += (n>0)*nsz;
425:     ii  += nsz;
426:     sz   = n;                   /* No of non zeros in this row */
427:                                 /* Switch on the size of Node */
428:     switch (nsz){               /* Each loop in 'case' is unrolled */
429:     case 1 :
430:       sum1  = 0;
431: 
           /* two nonzeros per iteration */
432:       for(n = 0; n< sz-1; n+=2) {
433:         i1   = idx[0];          /* The instructions are ordered to */
434:         i2   = idx[1];          /* make the compiler's job easy */
435:         idx += 2;
436:         tmp0 = x[i1];
437:         tmp1 = x[i2];
438:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439:        }
440: 
441:       if (n == sz-1){          /* Take care of the last nonzero  */
442:         tmp0  = x[*idx++];
443:         sum1 += *v1++ * tmp0;
444:       }
445:       y[row++]=sum1;
446:       break;
447:     case 2:
448:       sum1  = 0;
449:       sum2  = 0;
           /* the values of the second row start n entries after those of the first */
450:       v2    = v1 + n;
451: 
452:       for (n = 0; n< sz-1; n+=2) {
453:         i1   = idx[0];
454:         i2   = idx[1];
455:         idx += 2;
456:         tmp0 = x[i1];
457:         tmp1 = x[i2];
458:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460:       }
461:       if (n == sz-1){
462:         tmp0  = x[*idx++];
463:         sum1 += *v1++ * tmp0;
464:         sum2 += *v2++ * tmp0;
465:       }
466:       y[row++]=sum1;
467:       y[row++]=sum2;
468:       v1      =v2;              /* Since the next block to be processed starts there*/
           /* step over the column indices stored for the second row */
469:       idx    +=sz;
470:       break;
471:     case 3:
472:       sum1  = 0;
473:       sum2  = 0;
474:       sum3  = 0;
475:       v2    = v1 + n;
476:       v3    = v2 + n;
477: 
478:       for (n = 0; n< sz-1; n+=2) {
479:         i1   = idx[0];
480:         i2   = idx[1];
481:         idx += 2;
482:         tmp0 = x[i1];
483:         tmp1 = x[i2];
484:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487:       }
488:       if (n == sz-1){
489:         tmp0  = x[*idx++];
490:         sum1 += *v1++ * tmp0;
491:         sum2 += *v2++ * tmp0;
492:         sum3 += *v3++ * tmp0;
493:       }
494:       y[row++]=sum1;
495:       y[row++]=sum2;
496:       y[row++]=sum3;
497:       v1       =v3;             /* Since the next block to be processed starts there*/
           /* step over the column indices stored for rows two and three */
498:       idx     +=2*sz;
499:       break;
500:     case 4:
501:       sum1  = 0;
502:       sum2  = 0;
503:       sum3  = 0;
504:       sum4  = 0;
505:       v2    = v1 + n;
506:       v3    = v2 + n;
507:       v4    = v3 + n;
508: 
509:       for (n = 0; n< sz-1; n+=2) {
510:         i1   = idx[0];
511:         i2   = idx[1];
512:         idx += 2;
513:         tmp0 = x[i1];
514:         tmp1 = x[i2];
515:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519:       }
520:       if (n == sz-1){
521:         tmp0  = x[*idx++];
522:         sum1 += *v1++ * tmp0;
523:         sum2 += *v2++ * tmp0;
524:         sum3 += *v3++ * tmp0;
525:         sum4 += *v4++ * tmp0;
526:       }
527:       y[row++]=sum1;
528:       y[row++]=sum2;
529:       y[row++]=sum3;
530:       y[row++]=sum4;
531:       v1      =v4;              /* Since the next block to be processed starts there*/
           /* step over the column indices stored for rows two to four */
532:       idx    +=3*sz;
533:       break;
534:     case 5:
535:       sum1  = 0;
536:       sum2  = 0;
537:       sum3  = 0;
538:       sum4  = 0;
539:       sum5  = 0;
540:       v2    = v1 + n;
541:       v3    = v2 + n;
542:       v4    = v3 + n;
543:       v5    = v4 + n;
544: 
545:       for (n = 0; n<sz-1; n+=2) {
546:         i1   = idx[0];
547:         i2   = idx[1];
548:         idx += 2;
549:         tmp0 = x[i1];
550:         tmp1 = x[i2];
551:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556:       }
557:       if (n == sz-1){
558:         tmp0  = x[*idx++];
559:         sum1 += *v1++ * tmp0;
560:         sum2 += *v2++ * tmp0;
561:         sum3 += *v3++ * tmp0;
562:         sum4 += *v4++ * tmp0;
563:         sum5 += *v5++ * tmp0;
564:       }
565:       y[row++]=sum1;
566:       y[row++]=sum2;
567:       y[row++]=sum3;
568:       y[row++]=sum4;
569:       y[row++]=sum5;
570:       v1      =v5;       /* Since the next block to be processed starts there */
           /* step over the column indices stored for rows two to five */
571:       idx    +=4*sz;
572:       break;
573:     default :
           /* NOTE(review): this error exit comes before the VecRestoreArray()
              calls below - confirm that leaving the arrays checked out is acceptable */
574:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
575:     }
576:   }
577:   VecRestoreArray(xx,(PetscScalar**)&x);
578:   VecRestoreArray(yy,&y);
579:   PetscLogFlops(2*a->nz - nonzerorow);
580:   return(0);
581: }
582: /* ----------------------------------------------------------- */
583: /* Almost same code as the MatMult_Inode()
     
        MatMultAdd_Inode - Computes yy = zz + A*xx one inode at a time.

        Every row sum starts from the matching entry of zz instead of zero;
        apart from that the node-by-node traversal (node sizes 1..5, two
        nonzeros per loop iteration plus one trailing nonzero) is the one used
        by MatMult_Inode().

        Input Parameters:
          A  - the matrix; A->data is read as a Mat_SeqAIJ and a->inode.size must be set
          xx - the vector to multiply with
          zz - the vector to add; may be the same object as yy, in which case
               the array obtained for yy is used for both

        Output Parameter:
          yy - receives the result

        Raises PETSC_ERR_COR when the inode structure is missing or a node has
        a size outside 1..5.

        NOTE(review): the return values of VecGetArray(), VecRestoreArray() and
        PetscLogFlops() are not checked in this listing.
     */
586: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587: {
588:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
589:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590:   MatScalar      *v1,*v2,*v3,*v4,*v5;
591:   PetscScalar    *x,*y,*z,*zt;
593:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
594: 
596:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
597:   node_max = a->inode.node_count;
598:   ns       = a->inode.size;     /* Node Size array */
599:   VecGetArray(xx,&x);
600:   VecGetArray(yy,&y);
601:   if (zz != yy) {
602:     VecGetArray(zz,&z);
603:   } else {
604:     z = y;
605:   }
       /* zt walks through z; z itself is kept for the restore call at the end */
606:   zt = z;

       /* idx, v1 and ii walk through the column indices, values and row offsets */
608:   idx  = a->j;
609:   v1   = a->a;
610:   ii   = a->i;

612:   for (i = 0,row = 0; i< node_max; ++i){
613:     nsz  = ns[i];
         /* number of nonzeros in the first row of this node */
614:     n    = ii[1] - ii[0];
615:     ii  += nsz;
616:     sz   = n;                   /* No of non zeros in this row */
617:                                 /* Switch on the size of Node */
618:     switch (nsz){               /* Each loop in 'case' is unrolled */
619:     case 1 :
620:       sum1  = *zt++;
621: 
           /* two nonzeros per iteration */
622:       for(n = 0; n< sz-1; n+=2) {
623:         i1   = idx[0];          /* The instructions are ordered to */
624:         i2   = idx[1];          /* make the compiler's job easy */
625:         idx += 2;
626:         tmp0 = x[i1];
627:         tmp1 = x[i2];
628:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
629:        }
630: 
631:       if(n   == sz-1){          /* Take care of the last nonzero  */
632:         tmp0  = x[*idx++];
633:         sum1 += *v1++ * tmp0;
634:       }
635:       y[row++]=sum1;
636:       break;
637:     case 2:
638:       sum1  = *zt++;
639:       sum2  = *zt++;
           /* the values of the second row start n entries after those of the first */
640:       v2    = v1 + n;
641: 
642:       for(n = 0; n< sz-1; n+=2) {
643:         i1   = idx[0];
644:         i2   = idx[1];
645:         idx += 2;
646:         tmp0 = x[i1];
647:         tmp1 = x[i2];
648:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
649:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
650:       }
651:       if(n   == sz-1){
652:         tmp0  = x[*idx++];
653:         sum1 += *v1++ * tmp0;
654:         sum2 += *v2++ * tmp0;
655:       }
656:       y[row++]=sum1;
657:       y[row++]=sum2;
658:       v1      =v2;              /* Since the next block to be processed starts there*/
           /* step over the column indices stored for the second row */
659:       idx    +=sz;
660:       break;
661:     case 3:
662:       sum1  = *zt++;
663:       sum2  = *zt++;
664:       sum3  = *zt++;
665:       v2    = v1 + n;
666:       v3    = v2 + n;
667: 
668:       for (n = 0; n< sz-1; n+=2) {
669:         i1   = idx[0];
670:         i2   = idx[1];
671:         idx += 2;
672:         tmp0 = x[i1];
673:         tmp1 = x[i2];
674:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
675:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
676:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
677:       }
678:       if (n == sz-1){
679:         tmp0  = x[*idx++];
680:         sum1 += *v1++ * tmp0;
681:         sum2 += *v2++ * tmp0;
682:         sum3 += *v3++ * tmp0;
683:       }
684:       y[row++]=sum1;
685:       y[row++]=sum2;
686:       y[row++]=sum3;
687:       v1       =v3;             /* Since the next block to be processed starts there*/
           /* step over the column indices stored for rows two and three */
688:       idx     +=2*sz;
689:       break;
690:     case 4:
691:       sum1  = *zt++;
692:       sum2  = *zt++;
693:       sum3  = *zt++;
694:       sum4  = *zt++;
695:       v2    = v1 + n;
696:       v3    = v2 + n;
697:       v4    = v3 + n;
698: 
699:       for (n = 0; n< sz-1; n+=2) {
700:         i1   = idx[0];
701:         i2   = idx[1];
702:         idx += 2;
703:         tmp0 = x[i1];
704:         tmp1 = x[i2];
705:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
706:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
707:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
708:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
709:       }
710:       if (n == sz-1){
711:         tmp0  = x[*idx++];
712:         sum1 += *v1++ * tmp0;
713:         sum2 += *v2++ * tmp0;
714:         sum3 += *v3++ * tmp0;
715:         sum4 += *v4++ * tmp0;
716:       }
717:       y[row++]=sum1;
718:       y[row++]=sum2;
719:       y[row++]=sum3;
720:       y[row++]=sum4;
721:       v1      =v4;              /* Since the next block to be processed starts there*/
           /* step over the column indices stored for rows two to four */
722:       idx    +=3*sz;
723:       break;
724:     case 5:
725:       sum1  = *zt++;
726:       sum2  = *zt++;
727:       sum3  = *zt++;
728:       sum4  = *zt++;
729:       sum5  = *zt++;
730:       v2    = v1 + n;
731:       v3    = v2 + n;
732:       v4    = v3 + n;
733:       v5    = v4 + n;
734: 
735:       for (n = 0; n<sz-1; n+=2) {
736:         i1   = idx[0];
737:         i2   = idx[1];
738:         idx += 2;
739:         tmp0 = x[i1];
740:         tmp1 = x[i2];
741:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
742:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
743:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
744:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
745:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
746:       }
747:       if(n   == sz-1){
748:         tmp0  = x[*idx++];
749:         sum1 += *v1++ * tmp0;
750:         sum2 += *v2++ * tmp0;
751:         sum3 += *v3++ * tmp0;
752:         sum4 += *v4++ * tmp0;
753:         sum5 += *v5++ * tmp0;
754:       }
755:       y[row++]=sum1;
756:       y[row++]=sum2;
757:       y[row++]=sum3;
758:       y[row++]=sum4;
759:       y[row++]=sum5;
760:       v1      =v5;       /* Since the next block to be processed starts there */
           /* step over the column indices stored for rows two to five */
761:       idx    +=4*sz;
762:       break;
763:     default :
           /* NOTE(review): this error exit comes before the VecRestoreArray()
              calls below - confirm that leaving the arrays checked out is acceptable */
764:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
765:     }
766:   }
767:   VecRestoreArray(xx,&x);
768:   VecRestoreArray(yy,&y);
769:   if (zz != yy) {
770:     VecRestoreArray(zz,&z);
771:   }
772:   PetscLogFlops(2*a->nz);
773:   return(0);
774: }

776: /* ----------------------------------------------------------- */
779: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
780: {
781:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
782:   IS                iscol = a->col,isrow = a->row;
783:   PetscErrorCode    ierr;
784:   const PetscInt    *r,*c,*rout,*cout;
785:   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
786:   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
787:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790:   const PetscScalar *b;

793:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
794:   node_max = a->inode.node_count;
795:   ns       = a->inode.size;     /* Node Size array */

797:   VecGetArray(bb,(PetscScalar**)&b);
798:   VecGetArray(xx,&x);
799:   tmp  = a->solve_work;
800: 
801:   ISGetIndices(isrow,&rout); r = rout;
802:   ISGetIndices(iscol,&cout); c = cout + (n-1);
803: 
804:   /* forward solve the lower triangular */
805:   tmps = tmp ;
806:   aa   = a_a ;
807:   aj   = a_j ;
808:   ad   = a->diag;

810:   for (i = 0,row = 0; i< node_max; ++i){
811:     nsz = ns[i];
812:     aii = ai[row];
813:     v1  = aa + aii;
814:     vi  = aj + aii;
815:     nz  = ad[row]- aii;
816: 
817:     switch (nsz){               /* Each loop in 'case' is unrolled */
818:     case 1 :
819:       sum1 = b[*r++];
820:       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
821:       for(j=0; j<nz-1; j+=2){
822:         i0   = vi[0];
823:         i1   = vi[1];
824:         vi  +=2;
825:         tmp0 = tmps[i0];
826:         tmp1 = tmps[i1];
827:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828:       }
829:       if(j == nz-1){
830:         tmp0 = tmps[*vi++];
831:         sum1 -= *v1++ *tmp0;
832:       }
833:       tmp[row ++]=sum1;
834:       break;
835:     case 2:
836:       sum1 = b[*r++];
837:       sum2 = b[*r++];
838:       v2   = aa + ai[row+1];

840:       for(j=0; j<nz-1; j+=2){
841:         i0   = vi[0];
842:         i1   = vi[1];
843:         vi  +=2;
844:         tmp0 = tmps[i0];
845:         tmp1 = tmps[i1];
846:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
847:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
848:       }
849:       if(j == nz-1){
850:         tmp0 = tmps[*vi++];
851:         sum1 -= *v1++ *tmp0;
852:         sum2 -= *v2++ *tmp0;
853:       }
854:       sum2 -= *v2++ * sum1;
855:       tmp[row ++]=sum1;
856:       tmp[row ++]=sum2;
857:       break;
858:     case 3:
859:       sum1 = b[*r++];
860:       sum2 = b[*r++];
861:       sum3 = b[*r++];
862:       v2   = aa + ai[row+1];
863:       v3   = aa + ai[row+2];
864: 
865:       for (j=0; j<nz-1; j+=2){
866:         i0   = vi[0];
867:         i1   = vi[1];
868:         vi  +=2;
869:         tmp0 = tmps[i0];
870:         tmp1 = tmps[i1];
871:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
872:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
873:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
874:       }
875:       if (j == nz-1){
876:         tmp0 = tmps[*vi++];
877:         sum1 -= *v1++ *tmp0;
878:         sum2 -= *v2++ *tmp0;
879:         sum3 -= *v3++ *tmp0;
880:       }
881:       sum2 -= *v2++ * sum1;
882:       sum3 -= *v3++ * sum1;
883:       sum3 -= *v3++ * sum2;
884:       tmp[row ++]=sum1;
885:       tmp[row ++]=sum2;
886:       tmp[row ++]=sum3;
887:       break;
888: 
889:     case 4:
890:       sum1 = b[*r++];
891:       sum2 = b[*r++];
892:       sum3 = b[*r++];
893:       sum4 = b[*r++];
894:       v2   = aa + ai[row+1];
895:       v3   = aa + ai[row+2];
896:       v4   = aa + ai[row+3];
897: 
898:       for (j=0; j<nz-1; j+=2){
899:         i0   = vi[0];
900:         i1   = vi[1];
901:         vi  +=2;
902:         tmp0 = tmps[i0];
903:         tmp1 = tmps[i1];
904:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908:       }
909:       if (j == nz-1){
910:         tmp0 = tmps[*vi++];
911:         sum1 -= *v1++ *tmp0;
912:         sum2 -= *v2++ *tmp0;
913:         sum3 -= *v3++ *tmp0;
914:         sum4 -= *v4++ *tmp0;
915:       }
916:       sum2 -= *v2++ * sum1;
917:       sum3 -= *v3++ * sum1;
918:       sum4 -= *v4++ * sum1;
919:       sum3 -= *v3++ * sum2;
920:       sum4 -= *v4++ * sum2;
921:       sum4 -= *v4++ * sum3;
922: 
923:       tmp[row ++]=sum1;
924:       tmp[row ++]=sum2;
925:       tmp[row ++]=sum3;
926:       tmp[row ++]=sum4;
927:       break;
928:     case 5:
929:       sum1 = b[*r++];
930:       sum2 = b[*r++];
931:       sum3 = b[*r++];
932:       sum4 = b[*r++];
933:       sum5 = b[*r++];
934:       v2   = aa + ai[row+1];
935:       v3   = aa + ai[row+2];
936:       v4   = aa + ai[row+3];
937:       v5   = aa + ai[row+4];
938: 
939:       for (j=0; j<nz-1; j+=2){
940:         i0   = vi[0];
941:         i1   = vi[1];
942:         vi  +=2;
943:         tmp0 = tmps[i0];
944:         tmp1 = tmps[i1];
945:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950:       }
951:       if (j == nz-1){
952:         tmp0 = tmps[*vi++];
953:         sum1 -= *v1++ *tmp0;
954:         sum2 -= *v2++ *tmp0;
955:         sum3 -= *v3++ *tmp0;
956:         sum4 -= *v4++ *tmp0;
957:         sum5 -= *v5++ *tmp0;
958:       }

960:       sum2 -= *v2++ * sum1;
961:       sum3 -= *v3++ * sum1;
962:       sum4 -= *v4++ * sum1;
963:       sum5 -= *v5++ * sum1;
964:       sum3 -= *v3++ * sum2;
965:       sum4 -= *v4++ * sum2;
966:       sum5 -= *v5++ * sum2;
967:       sum4 -= *v4++ * sum3;
968:       sum5 -= *v5++ * sum3;
969:       sum5 -= *v5++ * sum4;
970: 
971:       tmp[row ++]=sum1;
972:       tmp[row ++]=sum2;
973:       tmp[row ++]=sum3;
974:       tmp[row ++]=sum4;
975:       tmp[row ++]=sum5;
976:       break;
977:     default:
978:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
979:     }
980:   }
981:   /* backward solve the upper triangular */
982:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
983:     nsz = ns[i];
984:     aii = ai[row+1] -1;
985:     v1  = aa + aii;
986:     vi  = aj + aii;
987:     nz  = aii- ad[row];
988:     switch (nsz){               /* Each loop in 'case' is unrolled */
989:     case 1 :
990:       sum1 = tmp[row];

992:       for(j=nz ; j>1; j-=2){
993:         vi  -=2;
994:         i0   = vi[2];
995:         i1   = vi[1];
996:         tmp0 = tmps[i0];
997:         tmp1 = tmps[i1];
998:         v1   -= 2;
999:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000:       }
1001:       if (j==1){
1002:         tmp0  = tmps[*vi--];
1003:         sum1 -= *v1-- * tmp0;
1004:       }
1005:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006:       break;
1007:     case 2 :
1008:       sum1 = tmp[row];
1009:       sum2 = tmp[row -1];
1010:       v2   = aa + ai[row]-1;
1011:       for (j=nz ; j>1; j-=2){
1012:         vi  -=2;
1013:         i0   = vi[2];
1014:         i1   = vi[1];
1015:         tmp0 = tmps[i0];
1016:         tmp1 = tmps[i1];
1017:         v1   -= 2;
1018:         v2   -= 2;
1019:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021:       }
1022:       if (j==1){
1023:         tmp0  = tmps[*vi--];
1024:         sum1 -= *v1-- * tmp0;
1025:         sum2 -= *v2-- * tmp0;
1026:       }
1027: 
1028:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029:       sum2   -= *v2-- * tmp0;
1030:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031:       break;
1032:     case 3 :
1033:       sum1 = tmp[row];
1034:       sum2 = tmp[row -1];
1035:       sum3 = tmp[row -2];
1036:       v2   = aa + ai[row]-1;
1037:       v3   = aa + ai[row -1]-1;
1038:       for (j=nz ; j>1; j-=2){
1039:         vi  -=2;
1040:         i0   = vi[2];
1041:         i1   = vi[1];
1042:         tmp0 = tmps[i0];
1043:         tmp1 = tmps[i1];
1044:         v1   -= 2;
1045:         v2   -= 2;
1046:         v3   -= 2;
1047:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050:       }
1051:       if (j==1){
1052:         tmp0  = tmps[*vi--];
1053:         sum1 -= *v1-- * tmp0;
1054:         sum2 -= *v2-- * tmp0;
1055:         sum3 -= *v3-- * tmp0;
1056:       }
1057:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058:       sum2   -= *v2-- * tmp0;
1059:       sum3   -= *v3-- * tmp0;
1060:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061:       sum3   -= *v3-- * tmp0;
1062:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1063: 
1064:       break;
1065:     case 4 :
1066:       sum1 = tmp[row];
1067:       sum2 = tmp[row -1];
1068:       sum3 = tmp[row -2];
1069:       sum4 = tmp[row -3];
1070:       v2   = aa + ai[row]-1;
1071:       v3   = aa + ai[row -1]-1;
1072:       v4   = aa + ai[row -2]-1;

1074:       for (j=nz ; j>1; j-=2){
1075:         vi  -=2;
1076:         i0   = vi[2];
1077:         i1   = vi[1];
1078:         tmp0 = tmps[i0];
1079:         tmp1 = tmps[i1];
1080:         v1  -= 2;
1081:         v2  -= 2;
1082:         v3  -= 2;
1083:         v4  -= 2;
1084:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088:       }
1089:       if (j==1){
1090:         tmp0  = tmps[*vi--];
1091:         sum1 -= *v1-- * tmp0;
1092:         sum2 -= *v2-- * tmp0;
1093:         sum3 -= *v3-- * tmp0;
1094:         sum4 -= *v4-- * tmp0;
1095:       }

1097:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098:       sum2   -= *v2-- * tmp0;
1099:       sum3   -= *v3-- * tmp0;
1100:       sum4   -= *v4-- * tmp0;
1101:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102:       sum3   -= *v3-- * tmp0;
1103:       sum4   -= *v4-- * tmp0;
1104:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105:       sum4   -= *v4-- * tmp0;
1106:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107:       break;
1108:     case 5 :
1109:       sum1 = tmp[row];
1110:       sum2 = tmp[row -1];
1111:       sum3 = tmp[row -2];
1112:       sum4 = tmp[row -3];
1113:       sum5 = tmp[row -4];
1114:       v2   = aa + ai[row]-1;
1115:       v3   = aa + ai[row -1]-1;
1116:       v4   = aa + ai[row -2]-1;
1117:       v5   = aa + ai[row -3]-1;
1118:       for (j=nz ; j>1; j-=2){
1119:         vi  -= 2;
1120:         i0   = vi[2];
1121:         i1   = vi[1];
1122:         tmp0 = tmps[i0];
1123:         tmp1 = tmps[i1];
1124:         v1   -= 2;
1125:         v2   -= 2;
1126:         v3   -= 2;
1127:         v4   -= 2;
1128:         v5   -= 2;
1129:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134:       }
1135:       if (j==1){
1136:         tmp0  = tmps[*vi--];
1137:         sum1 -= *v1-- * tmp0;
1138:         sum2 -= *v2-- * tmp0;
1139:         sum3 -= *v3-- * tmp0;
1140:         sum4 -= *v4-- * tmp0;
1141:         sum5 -= *v5-- * tmp0;
1142:       }

1144:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145:       sum2   -= *v2-- * tmp0;
1146:       sum3   -= *v3-- * tmp0;
1147:       sum4   -= *v4-- * tmp0;
1148:       sum5   -= *v5-- * tmp0;
1149:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150:       sum3   -= *v3-- * tmp0;
1151:       sum4   -= *v4-- * tmp0;
1152:       sum5   -= *v5-- * tmp0;
1153:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154:       sum4   -= *v4-- * tmp0;
1155:       sum5   -= *v5-- * tmp0;
1156:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157:       sum5   -= *v5-- * tmp0;
1158:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159:       break;
1160:     default:
1161:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1162:     }
1163:   }
1164:   ISRestoreIndices(isrow,&rout);
1165:   ISRestoreIndices(iscol,&cout);
1166:   VecRestoreArray(bb,(PetscScalar**)&b);
1167:   VecRestoreArray(xx,&x);
1168:   PetscLogFlops(2*a->nz - A->cmap->n);
1169:   return(0);
1170: }

1174: PetscErrorCode MatLUFactorNumeric_Inode(Mat B,Mat A,const MatFactorInfo *info)
1175: {
1176:   Mat               C = B;
1177:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1178:   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1179:   PetscErrorCode    ierr;
1180:   const PetscInt    *r,*ic,*c,*ics;
1181:   PetscInt          n = A->rmap->n,*bi = b->i;
1182:   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1183:   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1184:   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1185:   PetscScalar       mul1,mul2,mul3,tmp;
1186:   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1187:   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1188:   PetscReal         rs=0.0;
1189:   LUShift_Ctx       sctx;
1190:   PetscInt          newshift;

1193:   sctx.shift_top      = 0;
1194:   sctx.nshift_max     = 0;
1195:   sctx.shift_lo       = 0;
1196:   sctx.shift_hi       = 0;
1197:   sctx.shift_fraction = 0;

1199:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1200:   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1201:     sctx.shift_top = 0;
1202:     for (i=0; i<n; i++) {
1203:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1204:       rs    = 0.0;
1205:       ajtmp = aj + ai[i];
1206:       rtmp1 = aa + ai[i];
1207:       nz = ai[i+1] - ai[i];
1208:       for (j=0; j<nz; j++){
1209:         if (*ajtmp != i){
1210:           rs += PetscAbsScalar(*rtmp1++);
1211:         } else {
1212:           rs -= PetscRealPart(*rtmp1++);
1213:         }
1214:         ajtmp++;
1215:       }
1216:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1217:     }
1218:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1219:     sctx.shift_top *= 1.1;
1220:     sctx.nshift_max = 5;
1221:     sctx.shift_lo   = 0.;
1222:     sctx.shift_hi   = 1.;
1223:   }
1224:   sctx.shift_amount = 0;
1225:   sctx.nshift       = 0;

1227:   ISGetIndices(isrow,&r);
1228:   ISGetIndices(iscol,&c);
1229:   ISGetIndices(isicol,&ic);
1230:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);
1231:   PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));
1232:   ics   = ic ;
1233:   rtmp22 = rtmp11 + n;
1234:   rtmp33 = rtmp22 + n;
1235: 
1236:   node_max = a->inode.node_count;
1237:   ns       = a->inode.size;
1238:   if (!ns){
1239:     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1240:   }

1242:   /* If max inode size > 3, split it into two inodes.*/
1243:   /* also map the inode sizes according to the ordering */
1244:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1245:   for (i=0,j=0; i<node_max; ++i,++j){
1246:     if (ns[i]>3) {
1247:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1248:       ++j;
1249:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1250:     } else {
1251:       tmp_vec1[j] = ns[i];
1252:     }
1253:   }
1254:   /* Use the correct node_max */
1255:   node_max = j;

1257:   /* Now reorder the inode info based on mat re-ordering info */
1258:   /* First create a row -> inode_size_array_index map */
1259:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1260:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1261:   for (i=0,row=0; i<node_max; i++) {
1262:     nodesz = tmp_vec1[i];
1263:     for (j=0; j<nodesz; j++,row++) {
1264:       nsmap[row] = i;
1265:     }
1266:   }
1267:   /* Using nsmap, create a reordered ns structure */
1268:   for (i=0,j=0; i< node_max; i++) {
1269:     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1270:     tmp_vec2[i]  = nodesz;
1271:     j           += nodesz;
1272:   }
1273:   PetscFree(nsmap);
1274:   PetscFree(tmp_vec1);
1275:   /* Now use the correct ns */
1276:   ns = tmp_vec2;

1278:   do {
1279:     sctx.lushift = PETSC_FALSE;
1280:     /* Now loop over each block-row, and do the factorization */
1281:     for (i=0,row=0; i<node_max; i++) {
1282:       nodesz = ns[i];
1283:       nz     = bi[row+1] - bi[row];
1284:       bjtmp  = bj + bi[row];

1286:       switch (nodesz){
1287:       case 1:
1288:         for  (j=0; j<nz; j++){
1289:           idx        = bjtmp[j];
1290:           rtmp11[idx] = 0.0;
1291:         }
1292: 
1293:         /* load in initial (unfactored row) */
1294:         idx    = r[row];
1295:         nz_tmp = ai[idx+1] - ai[idx];
1296:         ajtmp  = aj + ai[idx];
1297:         v1     = aa + ai[idx];

1299:         for (j=0; j<nz_tmp; j++) {
1300:           idx        = ics[ajtmp[j]];
1301:           rtmp11[idx] = v1[j];
1302:         }
1303:         rtmp11[ics[r[row]]] += sctx.shift_amount;

1305:         prow = *bjtmp++ ;
1306:         while (prow < row) {
1307:           pc1 = rtmp11 + prow;
1308:           if (*pc1 != 0.0){
1309:             pv   = ba + bd[prow];
1310:             pj   = nbj + bd[prow];
1311:             mul1 = *pc1 * *pv++;
1312:             *pc1 = mul1;
1313:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1314:             PetscLogFlops(2*nz_tmp);
1315:             for (j=0; j<nz_tmp; j++) {
1316:               tmp = pv[j];
1317:               idx = pj[j];
1318:               rtmp11[idx] -= mul1 * tmp;
1319:             }
1320:           }
1321:           prow = *bjtmp++ ;
1322:         }
1323:         pj  = bj + bi[row];
1324:         pc1 = ba + bi[row];

1326:         sctx.pv    = rtmp11[row];
1327:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1328:         rs         = 0.0;
1329:         for (j=0; j<nz; j++) {
1330:           idx    = pj[j];
1331:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1332:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1333:         }
1334:         sctx.rs  = rs;
1335:         MatLUCheckShift_inline(info,sctx,row,newshift);
1336:         if (newshift == 1) goto endofwhile;
1337:         break;
1338: 
1339:       case 2:
1340:         for (j=0; j<nz; j++) {
1341:           idx        = bjtmp[j];
1342:           rtmp11[idx] = 0.0;
1343:           rtmp22[idx] = 0.0;
1344:         }
1345: 
1346:         /* load in initial (unfactored row) */
1347:         idx    = r[row];
1348:         nz_tmp = ai[idx+1] - ai[idx];
1349:         ajtmp  = aj + ai[idx];
1350:         v1     = aa + ai[idx];
1351:         v2     = aa + ai[idx+1];
1352:         for (j=0; j<nz_tmp; j++) {
1353:           idx        = ics[ajtmp[j]];
1354:           rtmp11[idx] = v1[j];
1355:           rtmp22[idx] = v2[j];
1356:         }
1357:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1358:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

1360:         prow = *bjtmp++ ;
1361:         while (prow < row) {
1362:           pc1 = rtmp11 + prow;
1363:           pc2 = rtmp22 + prow;
1364:           if (*pc1 != 0.0 || *pc2 != 0.0){
1365:             pv   = ba + bd[prow];
1366:             pj   = nbj + bd[prow];
1367:             mul1 = *pc1 * *pv;
1368:             mul2 = *pc2 * *pv;
1369:             ++pv;
1370:             *pc1 = mul1;
1371:             *pc2 = mul2;
1372: 
1373:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1374:             for (j=0; j<nz_tmp; j++) {
1375:               tmp = pv[j];
1376:               idx = pj[j];
1377:               rtmp11[idx] -= mul1 * tmp;
1378:               rtmp22[idx] -= mul2 * tmp;
1379:             }
1380:             PetscLogFlops(4*nz_tmp);
1381:           }
1382:           prow = *bjtmp++ ;
1383:         }

1385:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1386:         pc1 = rtmp11 + prow;
1387:         pc2 = rtmp22 + prow;

1389:         sctx.pv = *pc1;
1390:         pj      = bj + bi[prow];
1391:         rs      = 0.0;
1392:         for (j=0; j<nz; j++){
1393:           idx = pj[j];
1394:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1395:         }
1396:         sctx.rs = rs;
1397:         MatLUCheckShift_inline(info,sctx,row,newshift);
1398:         if (newshift == 1) goto endofwhile;

1400:         if (*pc2 != 0.0){
1401:           pj     = nbj + bd[prow];
1402:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1403:           *pc2   = mul2;
1404:           nz_tmp = bi[prow+1] - bd[prow] - 1;
1405:           for (j=0; j<nz_tmp; j++) {
1406:             idx = pj[j] ;
1407:             tmp = rtmp11[idx];
1408:             rtmp22[idx] -= mul2 * tmp;
1409:           }
1410:           PetscLogFlops(2*nz_tmp);
1411:         }
1412: 
1413:         pj  = bj + bi[row];
1414:         pc1 = ba + bi[row];
1415:         pc2 = ba + bi[row+1];

1417:         sctx.pv = rtmp22[row+1];
1418:         rs = 0.0;
1419:         rtmp11[row]   = 1.0/rtmp11[row];
1420:         rtmp22[row+1] = 1.0/rtmp22[row+1];
1421:         /* copy row entries from dense representation to sparse */
1422:         for (j=0; j<nz; j++) {
1423:           idx    = pj[j];
1424:           pc1[j] = rtmp11[idx];
1425:           pc2[j] = rtmp22[idx];
1426:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1427:         }
1428:         sctx.rs = rs;
1429:         MatLUCheckShift_inline(info,sctx,row+1,newshift);
1430:         if (newshift == 1) goto endofwhile;
1431:         break;

1433:       case 3:
1434:         for  (j=0; j<nz; j++) {
1435:           idx        = bjtmp[j];
1436:           rtmp11[idx] = 0.0;
1437:           rtmp22[idx] = 0.0;
1438:           rtmp33[idx] = 0.0;
1439:         }
1440:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1441:         idx    = r[row];
1442:         nz_tmp = ai[idx+1] - ai[idx];
1443:         ajtmp = aj + ai[idx];
1444:         v1    = aa + ai[idx];
1445:         v2    = aa + ai[idx+1];
1446:         v3    = aa + ai[idx+2];
1447:         for (j=0; j<nz_tmp; j++) {
1448:           idx        = ics[ajtmp[j]];
1449:           rtmp11[idx] = v1[j];
1450:           rtmp22[idx] = v2[j];
1451:           rtmp33[idx] = v3[j];
1452:         }
1453:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1454:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1455:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

1457:         /* loop over all pivot row blocks above this row block */
1458:         prow = *bjtmp++ ;
1459:         while (prow < row) {
1460:           pc1 = rtmp11 + prow;
1461:           pc2 = rtmp22 + prow;
1462:           pc3 = rtmp33 + prow;
1463:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1464:             pv   = ba  + bd[prow];
1465:             pj   = nbj + bd[prow];
1466:             mul1 = *pc1 * *pv;
1467:             mul2 = *pc2 * *pv;
1468:             mul3 = *pc3 * *pv;
1469:             ++pv;
1470:             *pc1 = mul1;
1471:             *pc2 = mul2;
1472:             *pc3 = mul3;
1473: 
1474:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1475:             /* update this row based on pivot row */
1476:             for (j=0; j<nz_tmp; j++) {
1477:               tmp = pv[j];
1478:               idx = pj[j];
1479:               rtmp11[idx] -= mul1 * tmp;
1480:               rtmp22[idx] -= mul2 * tmp;
1481:               rtmp33[idx] -= mul3 * tmp;
1482:             }
1483:             PetscLogFlops(6*nz_tmp);
1484:           }
1485:           prow = *bjtmp++ ;
1486:         }

1488:         /* Now take care of diagonal 3x3 block in this set of rows */
1489:         /* note: prow = row here */
1490:         pc1 = rtmp11 + prow;
1491:         pc2 = rtmp22 + prow;
1492:         pc3 = rtmp33 + prow;

1494:         sctx.pv = *pc1;
1495:         pj      = bj + bi[prow];
1496:         rs      = 0.0;
1497:         for (j=0; j<nz; j++){
1498:           idx = pj[j];
1499:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1500:         }
1501:         sctx.rs = rs;
1502:         MatLUCheckShift_inline(info,sctx,row,newshift);
1503:         if (newshift == 1) goto endofwhile;

1505:         if (*pc2 != 0.0 || *pc3 != 0.0){
1506:           mul2 = (*pc2)/(*pc1);
1507:           mul3 = (*pc3)/(*pc1);
1508:           *pc2 = mul2;
1509:           *pc3 = mul3;
1510:           nz_tmp = bi[prow+1] - bd[prow] - 1;
1511:           pj     = nbj + bd[prow];
1512:           for (j=0; j<nz_tmp; j++) {
1513:             idx = pj[j] ;
1514:             tmp = rtmp11[idx];
1515:             rtmp22[idx] -= mul2 * tmp;
1516:             rtmp33[idx] -= mul3 * tmp;
1517:           }
1518:           PetscLogFlops(4*nz_tmp);
1519:         }
1520:         ++prow;

1522:         pc2 = rtmp22 + prow;
1523:         pc3 = rtmp33 + prow;
1524:         sctx.pv = *pc2;
1525:         pj      = bj + bi[prow];
1526:         rs      = 0.0;
1527:         for (j=0; j<nz; j++){
1528:           idx = pj[j];
1529:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1530:         }
1531:         sctx.rs = rs;
1532:         MatLUCheckShift_inline(info,sctx,row+1,newshift);
1533:         if (newshift == 1) goto endofwhile;

1535:         if (*pc3 != 0.0){
1536:           mul3   = (*pc3)/(*pc2);
1537:           *pc3   = mul3;
1538:           pj     = nbj + bd[prow];
1539:           nz_tmp = bi[prow+1] - bd[prow] - 1;
1540:           for (j=0; j<nz_tmp; j++) {
1541:             idx = pj[j] ;
1542:             tmp = rtmp22[idx];
1543:             rtmp33[idx] -= mul3 * tmp;
1544:           }
1545:           PetscLogFlops(4*nz_tmp);
1546:         }

1548:         pj  = bj + bi[row];
1549:         pc1 = ba + bi[row];
1550:         pc2 = ba + bi[row+1];
1551:         pc3 = ba + bi[row+2];

1553:         sctx.pv = rtmp33[row+2];
1554:         rs = 0.0;
1555:         rtmp11[row]   = 1.0/rtmp11[row];
1556:         rtmp22[row+1] = 1.0/rtmp22[row+1];
1557:         rtmp33[row+2] = 1.0/rtmp33[row+2];
1558:         /* copy row entries from dense representation to sparse */
1559:         for (j=0; j<nz; j++) {
1560:           idx    = pj[j];
1561:           pc1[j] = rtmp11[idx];
1562:           pc2[j] = rtmp22[idx];
1563:           pc3[j] = rtmp33[idx];
1564:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1565:         }

1567:         sctx.rs = rs;
1568:         MatLUCheckShift_inline(info,sctx,row+2,newshift);
1569:         if (newshift == 1) goto endofwhile;
1570:         break;

1572:       default:
1573:         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1574:       }
1575:       row += nodesz;                 /* Update the row */
1576:     }
1577:     endofwhile:;
1578:   } while (sctx.lushift);
1579:   PetscFree(rtmp11);
1580:   PetscFree(tmp_vec2);
1581:   ISRestoreIndices(isicol,&ic);
1582:   ISRestoreIndices(isrow,&r);
1583:   ISRestoreIndices(iscol,&c);
1584:   (B)->ops->solve           = MatSolve_Inode;
1585:   /* do not set solve add, since MatSolve_Inode + Add is faster */
1586:   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1587:   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1588:   C->assembled   = PETSC_TRUE;
1589:   C->preallocated = PETSC_TRUE;
1590:   if (sctx.nshift) {
1591:     if (info->shiftpd) {
1592:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
1593:     } else if (info->shiftnz) {
1594:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1595:     }
1596:   }
1597:   PetscLogFlops(C->cmap->n);
1598:   return(0);
1599: }

1601: /*
1602:      Makes a longer coloring[] array and calls the usual code with that
1603: */
1606: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1607: {
1608:   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1609:   PetscErrorCode  ierr;
1610:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1611:   PetscInt        *colorused,i;
1612:   ISColoringValue *newcolor;

1615:   PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1616:   /* loop over inodes, marking a color for each column*/
1617:   row = 0;
1618:   for (i=0; i<m; i++){
1619:     for (j=0; j<ns[i]; j++) {
1620:       newcolor[row++] = coloring[i] + j*ncolors;
1621:     }
1622:   }

1624:   /* eliminate unneeded colors */
1625:   PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1626:   PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1627:   for (i=0; i<n; i++) {
1628:     colorused[newcolor[i]] = 1;
1629:   }

1631:   for (i=1; i<5*ncolors; i++) {
1632:     colorused[i] += colorused[i-1];
1633:   }
1634:   ncolors = colorused[5*ncolors-1];
1635:   for (i=0; i<n; i++) {
1636:     newcolor[i] = colorused[newcolor[i]]-1;
1637:   }
1638:   PetscFree(colorused);
1639:   ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);
1640:   PetscFree(coloring);
1641:   return(0);
1642: }

1644:  #include ../src/inline/ilu.h

1648: PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1649: {
1650:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1651:   PetscScalar        *x,*xs,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1652:   MatScalar          *ibdiag,*bdiag;
1653:   PetscScalar        *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1654:   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1655:   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1656:   PetscErrorCode     ierr;
1657:   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1658:   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;

1661:   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1662:   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1663:   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");
1664:   if (its > 1) {
1665:     /* switch to non-inode version */
1666:     MatRelax_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);
1667:     return(0);
1668:   }

1670:   if (!a->inode.ibdiagvalid) {
1671:     if (!a->inode.ibdiag) {
1672:       /* calculate space needed for diagonal blocks */
1673:       for (i=0; i<m; i++) {
1674:         cnt += sizes[i]*sizes[i];
1675:       }
1676:       a->inode.bdiagsize = cnt;
1677:       PetscMalloc2(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag);
1678:     }

1680:     /* copy over the diagonal blocks and invert them */
1681:     ibdiag = a->inode.ibdiag;
1682:     bdiag  = a->inode.bdiag;
1683:     cnt = 0;
1684:     for (i=0, row = 0; i<m; i++) {
1685:       for (j=0; j<sizes[i]; j++) {
1686:         for (k=0; k<sizes[i]; k++) {
1687:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1688:         }
1689:       }
1690:       PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
1691: 
1692:       switch(sizes[i]) {
1693:         case 1:
1694:           /* Create matrix data structure */
1695:           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1696:           ibdiag[cnt] = 1.0/ibdiag[cnt];
1697:           break;
1698:         case 2:
1699:           Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
1700:           break;
1701:         case 3:
1702:           Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
1703:           break;
1704:         case 4:
1705:           Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
1706:           break;
1707:         case 5:
1708:           Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);
1709:           break;
1710:        default:
1711:          SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1712:       }
1713:       cnt += sizes[i]*sizes[i];
1714:       row += sizes[i];
1715:     }
1716:     a->inode.ibdiagvalid = PETSC_TRUE;
1717:   }
1718:   ibdiag = a->inode.ibdiag;
1719:   bdiag  = a->inode.bdiag;

1721:   VecGetArray(xx,&x);
1722:   if (xx != bb) {
1723:     VecGetArray(bb,(PetscScalar**)&b);
1724:   } else {
1725:     b = x;
1726:   }

1728:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1729:   xs   = x;
1730:   if (flag & SOR_ZERO_INITIAL_GUESS) {
1731:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){

1733:       for (i=0, row=0; i<m; i++) {
1734:         sz  = diag[row] - ii[row];
1735:         v1  = a->a + ii[row];
1736:         idx = a->j + ii[row];

1738:         /* see comments for MatMult_Inode() for how this is coded */
1739:         switch (sizes[i]){
1740:           case 1:
1741: 
1742:             sum1  = b[row];
1743:             for(n = 0; n<sz-1; n+=2) {
1744:               i1   = idx[0];
1745:               i2   = idx[1];
1746:               idx += 2;
1747:               tmp0 = x[i1];
1748:               tmp1 = x[i2];
1749:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1750:             }
1751: 
1752:             if (n == sz-1){
1753:               tmp0  = x[*idx];
1754:               sum1 -= *v1 * tmp0;
1755:             }
1756:             x[row++] = sum1*(*ibdiag++);
1757:             break;
1758:           case 2:
1759:             v2    = a->a + ii[row+1];
1760:             sum1  = b[row];
1761:             sum2  = b[row+1];
1762:             for(n = 0; n<sz-1; n+=2) {
1763:               i1   = idx[0];
1764:               i2   = idx[1];
1765:               idx += 2;
1766:               tmp0 = x[i1];
1767:               tmp1 = x[i2];
1768:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1769:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1770:             }
1771: 
1772:             if (n == sz-1){
1773:               tmp0  = x[*idx];
1774:               sum1 -= v1[0] * tmp0;
1775:               sum2 -= v2[0] * tmp0;
1776:             }
1777:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1778:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1779:             ibdiag  += 4;
1780:             break;
1781:           case 3:
1782:             v2    = a->a + ii[row+1];
1783:             v3    = a->a + ii[row+2];
1784:             sum1  = b[row];
1785:             sum2  = b[row+1];
1786:             sum3  = b[row+2];
1787:             for(n = 0; n<sz-1; n+=2) {
1788:               i1   = idx[0];
1789:               i2   = idx[1];
1790:               idx += 2;
1791:               tmp0 = x[i1];
1792:               tmp1 = x[i2];
1793:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1794:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1795:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1796:             }
1797: 
1798:             if (n == sz-1){
1799:               tmp0  = x[*idx];
1800:               sum1 -= v1[0] * tmp0;
1801:               sum2 -= v2[0] * tmp0;
1802:               sum3 -= v3[0] * tmp0;
1803:             }
1804:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1805:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1806:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1807:             ibdiag  += 9;
1808:             break;
1809:           case 4:
1810:             v2    = a->a + ii[row+1];
1811:             v3    = a->a + ii[row+2];
1812:             v4    = a->a + ii[row+3];
1813:             sum1  = b[row];
1814:             sum2  = b[row+1];
1815:             sum3  = b[row+2];
1816:             sum4  = b[row+3];
1817:             for(n = 0; n<sz-1; n+=2) {
1818:               i1   = idx[0];
1819:               i2   = idx[1];
1820:               idx += 2;
1821:               tmp0 = x[i1];
1822:               tmp1 = x[i2];
1823:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1824:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1825:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1826:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1827:             }
1828: 
1829:             if (n == sz-1){
1830:               tmp0  = x[*idx];
1831:               sum1 -= v1[0] * tmp0;
1832:               sum2 -= v2[0] * tmp0;
1833:               sum3 -= v3[0] * tmp0;
1834:               sum4 -= v4[0] * tmp0;
1835:             }
1836:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1837:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1838:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1839:             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1840:             ibdiag  += 16;
1841:             break;
1842:           case 5:
1843:             v2    = a->a + ii[row+1];
1844:             v3    = a->a + ii[row+2];
1845:             v4    = a->a + ii[row+3];
1846:             v5    = a->a + ii[row+4];
1847:             sum1  = b[row];
1848:             sum2  = b[row+1];
1849:             sum3  = b[row+2];
1850:             sum4  = b[row+3];
1851:             sum5  = b[row+4];
1852:             for(n = 0; n<sz-1; n+=2) {
1853:               i1   = idx[0];
1854:               i2   = idx[1];
1855:               idx += 2;
1856:               tmp0 = x[i1];
1857:               tmp1 = x[i2];
1858:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1859:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1860:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1861:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1862:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1863:             }
1864: 
1865:             if (n == sz-1){
1866:               tmp0  = x[*idx];
1867:               sum1 -= v1[0] * tmp0;
1868:               sum2 -= v2[0] * tmp0;
1869:               sum3 -= v3[0] * tmp0;
1870:               sum4 -= v4[0] * tmp0;
1871:               sum5 -= v5[0] * tmp0;
1872:             }
1873:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1874:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1875:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1876:             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1877:             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1878:             ibdiag  += 25;
1879:             break;
1880:           default:
1881:                SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1882:         }
1883:       }

1885:       xb = x;
1886:       PetscLogFlops(a->nz);
1887:     } else xb = b;
1888:     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1889:         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1890:       cnt = 0;
1891:       for (i=0, row=0; i<m; i++) {

1893:         switch (sizes[i]){
1894:           case 1:
1895:             x[row++] *= bdiag[cnt++];
1896:             break;
1897:           case 2:
1898:             x1   = x[row]; x2 = x[row+1];
1899:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1900:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1901:             x[row++] = tmp1;
1902:             x[row++] = tmp2;
1903:             cnt += 4;
1904:             break;
1905:           case 3:
1906:             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1907:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1908:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1909:             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1910:             x[row++] = tmp1;
1911:             x[row++] = tmp2;
1912:             x[row++] = tmp3;
1913:             cnt += 9;
1914:             break;
1915:           case 4:
1916:             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1917:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1918:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1919:             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1920:             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1921:             x[row++] = tmp1;
1922:             x[row++] = tmp2;
1923:             x[row++] = tmp3;
1924:             x[row++] = tmp4;
1925:             cnt += 16;
1926:             break;
1927:           case 5:
1928:             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1929:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1930:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1931:             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1932:             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1933:             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1934:             x[row++] = tmp1;
1935:             x[row++] = tmp2;
1936:             x[row++] = tmp3;
1937:             x[row++] = tmp4;
1938:             x[row++] = tmp5;
1939:             cnt += 25;
1940:             break;
1941:           default:
1942:                SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1943:         }
1944:       }
1945:       PetscLogFlops(m);
1946:     }
1947:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){

1949:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1950:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1951:         ibdiag -= sizes[i]*sizes[i];
1952:         sz      = ii[row+1] - diag[row] - 1;
1953:         v1      = a->a + diag[row] + 1;
1954:         idx     = a->j + diag[row] + 1;

1956:         /* see comments for MatMult_Inode() for how this is coded */
1957:         switch (sizes[i]){
1958:           case 1:
1959: 
1960:             sum1  = xb[row];
1961:             for(n = 0; n<sz-1; n+=2) {
1962:               i1   = idx[0];
1963:               i2   = idx[1];
1964:               idx += 2;
1965:               tmp0 = x[i1];
1966:               tmp1 = x[i2];
1967:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1968:             }
1969: 
1970:             if (n == sz-1){
1971:               tmp0  = x[*idx];
1972:               sum1 -= *v1*tmp0;
1973:             }
1974:             x[row--] = sum1*(*ibdiag);
1975:             break;

1977:           case 2:
1978: 
1979:             sum1  = xb[row];
1980:             sum2  = xb[row-1];
1981:             /* note that sum1 is associated with the second of the two rows */
1982:             v2    = a->a + diag[row-1] + 2;
1983:             for(n = 0; n<sz-1; n+=2) {
1984:               i1   = idx[0];
1985:               i2   = idx[1];
1986:               idx += 2;
1987:               tmp0 = x[i1];
1988:               tmp1 = x[i2];
1989:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1990:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1991:             }
1992: 
1993:             if (n == sz-1){
1994:               tmp0  = x[*idx];
1995:               sum1 -= *v1*tmp0;
1996:               sum2 -= *v2*tmp0;
1997:             }
1998:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1999:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2000:             break;
2001:           case 3:
2002: 
2003:             sum1  = xb[row];
2004:             sum2  = xb[row-1];
2005:             sum3  = xb[row-2];
2006:             v2    = a->a + diag[row-1] + 2;
2007:             v3    = a->a + diag[row-2] + 3;
2008:             for(n = 0; n<sz-1; n+=2) {
2009:               i1   = idx[0];
2010:               i2   = idx[1];
2011:               idx += 2;
2012:               tmp0 = x[i1];
2013:               tmp1 = x[i2];
2014:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2015:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2016:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2017:             }
2018: 
2019:             if (n == sz-1){
2020:               tmp0  = x[*idx];
2021:               sum1 -= *v1*tmp0;
2022:               sum2 -= *v2*tmp0;
2023:               sum3 -= *v3*tmp0;
2024:             }
2025:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2026:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2027:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2028:             break;
2029:           case 4:
2030: 
2031:             sum1  = xb[row];
2032:             sum2  = xb[row-1];
2033:             sum3  = xb[row-2];
2034:             sum4  = xb[row-3];
2035:             v2    = a->a + diag[row-1] + 2;
2036:             v3    = a->a + diag[row-2] + 3;
2037:             v4    = a->a + diag[row-3] + 4;
2038:             for(n = 0; n<sz-1; n+=2) {
2039:               i1   = idx[0];
2040:               i2   = idx[1];
2041:               idx += 2;
2042:               tmp0 = x[i1];
2043:               tmp1 = x[i2];
2044:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2045:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2046:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2047:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2048:             }
2049: 
2050:             if (n == sz-1){
2051:               tmp0  = x[*idx];
2052:               sum1 -= *v1*tmp0;
2053:               sum2 -= *v2*tmp0;
2054:               sum3 -= *v3*tmp0;
2055:               sum4 -= *v4*tmp0;
2056:             }
2057:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2058:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2059:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2060:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2061:             break;
2062:           case 5:
2063: 
2064:             sum1  = xb[row];
2065:             sum2  = xb[row-1];
2066:             sum3  = xb[row-2];
2067:             sum4  = xb[row-3];
2068:             sum5  = xb[row-4];
2069:             v2    = a->a + diag[row-1] + 2;
2070:             v3    = a->a + diag[row-2] + 3;
2071:             v4    = a->a + diag[row-3] + 4;
2072:             v5    = a->a + diag[row-4] + 5;
2073:             for(n = 0; n<sz-1; n+=2) {
2074:               i1   = idx[0];
2075:               i2   = idx[1];
2076:               idx += 2;
2077:               tmp0 = x[i1];
2078:               tmp1 = x[i2];
2079:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2080:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2081:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2082:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2083:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2084:             }
2085: 
2086:             if (n == sz-1){
2087:               tmp0  = x[*idx];
2088:               sum1 -= *v1*tmp0;
2089:               sum2 -= *v2*tmp0;
2090:               sum3 -= *v3*tmp0;
2091:               sum4 -= *v4*tmp0;
2092:               sum5 -= *v5*tmp0;
2093:             }
2094:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2095:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2096:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2097:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2098:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2099:             break;
2100:           default:
2101:                SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2102:         }
2103:       }

2105:       PetscLogFlops(a->nz);
2106:     }
2107:     its--;
2108:   }
2109:   VecRestoreArray(xx,&x);
2110:   if (bb != xx) {VecRestoreArray(bb,(PetscScalar**)&b);}
2111:   return(0);
2112: }


2115: /*
2116:     samestructure indicates that the matrix has not changed its nonzero structure so we 
2117:     do not need to recompute the inodes 
2118: */
2121: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2122: {
2123:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2125:   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2126:   PetscTruth     flag;

2129:   if (!a->inode.use)                     return(0);
2130:   if (a->inode.checked && samestructure) return(0);


2133:   m = A->rmap->n;
2134:   if (a->inode.size) {ns = a->inode.size;}
2135:   else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}

2137:   i          = 0;
2138:   node_count = 0;
2139:   idx        = a->j;
2140:   ii         = a->i;
2141:   while (i < m){                /* For each row */
2142:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2143:     /* Limits the number of elements in a node to 'a->inode.limit' */
2144:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2145:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2146:       if (nzy != nzx) break;
2147:       idy  += nzx;             /* Same nonzero pattern */
2148:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
2149:       if (!flag) break;
2150:     }
2151:     ns[node_count++] = blk_size;
2152:     idx += blk_size*nzx;
2153:     i    = j;
2154:   }
2155:   /* If not enough inodes found,, do not use inode version of the routines */
2156:   if (!a->inode.size && m && node_count > .9*m) {
2157:     PetscFree(ns);
2158:     a->inode.node_count     = 0;
2159:     a->inode.size           = PETSC_NULL;
2160:     a->inode.use            = PETSC_FALSE;
2161:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
2162:   } else {
2163:     A->ops->mult            = MatMult_Inode;
2164:     A->ops->relax           = MatRelax_Inode;
2165:     A->ops->multadd         = MatMultAdd_Inode;
2166:     A->ops->getrowij        = MatGetRowIJ_Inode;
2167:     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
2168:     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
2169:     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2170:     A->ops->coloringpatch   = MatColoringPatch_Inode;
2171:     a->inode.node_count     = node_count;
2172:     a->inode.size           = ns;
2173:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
2174:   }
2175:   return(0);
2176: }

2178: /*
2179:      This is really ugly. if inodes are used this replaces the 
2180:   permutations with ones that correspond to rows/cols of the matrix
2181:   rather then inode blocks
2182: */
2185: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2186: {
2187:   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);

2190:   PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
2191:   if (f) {
2192:     (*f)(A,rperm,cperm);
2193:   }
2194:   return(0);
2195: }

2200: PetscErrorCode  MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2201: {
2202:   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2204:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
2205:   const PetscInt *ridx,*cidx;
2206:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2207:   PetscInt       nslim_col,*ns_col;
2208:   IS             ris = *rperm,cis = *cperm;

2211:   if (!a->inode.size) return(0); /* no inodes so return */
2212:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

2214:   Mat_CreateColInode(A,&nslim_col,&ns_col);
2215:   PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
2216:   PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
2217:   permc = permr + m;

2219:   ISGetIndices(ris,&ridx);
2220:   ISGetIndices(cis,&cidx);

2222:   /* Form the inode structure for the rows of permuted matric using inv perm*/
2223:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

2225:   /* Construct the permutations for rows*/
2226:   for (i=0,row = 0; i<nslim_row; ++i){
2227:     indx      = ridx[i];
2228:     start_val = tns[indx];
2229:     end_val   = tns[indx + 1];
2230:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2231:   }

2233:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2234:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

2236:  /* Construct permutations for columns */
2237:   for (i=0,col=0; i<nslim_col; ++i){
2238:     indx      = cidx[i];
2239:     start_val = tns[indx];
2240:     end_val   = tns[indx + 1];
2241:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2242:   }

2244:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
2245:   ISSetPermutation(*rperm);
2246:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
2247:   ISSetPermutation(*cperm);
2248: 
2249:   ISRestoreIndices(ris,&ridx);
2250:   ISRestoreIndices(cis,&cidx);

2252:   PetscFree(ns_col);
2253:   PetscFree(permr);
2254:   ISDestroy(cis);
2255:   ISDestroy(ris);
2256:   PetscFree(tns);
2257:   return(0);
2258: }

2263: /*@C
2264:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

2266:    Collective on Mat

2268:    Input Parameter:
2269: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

2271:    Output Parameter:
2272: +  node_count - no of inodes present in the matrix.
2273: .  sizes      - an array of size node_count,with sizes of each inode.
2274: -  limit      - the max size used to generate the inodes.

2276:    Level: advanced

2278:    Notes: This routine returns some internal storage information
2279:    of the matrix, it is intended to be used by advanced users.
2280:    It should be called after the matrix is assembled.
2281:    The contents of the sizes[] array should not be changed.
2282:    PETSC_NULL may be passed for information not requested.

2284: .keywords: matrix, seqaij, get, inode

2286: .seealso: MatGetInfo()
2287: @*/
2288: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2289: {
2290:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

2293:   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2294:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
2295:   if (f) {
2296:     (*f)(A,node_count,sizes,limit);
2297:   }
2298:   return(0);
2299: }

2304: PetscErrorCode  MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2305: {
2306:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

2309:   if (node_count) *node_count = a->inode.node_count;
2310:   if (sizes)      *sizes      = a->inode.size;
2311:   if (limit)      *limit      = a->inode.limit;
2312:   return(0);
2313: }