Actual source code: baij2.c

  1: #define PETSCMAT_DLL

 3:  #include ../src/mat/impls/baij/seq/baij.h
 4:  #include ../src/inline/spops.h
 5:  #include ../src/inline/ilu.h
 6:  #include petscbt.h

 10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 11: {
 12:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 14:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 15:   const PetscInt *idx;
 16:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 17:   PetscBT        table;

 20:   m     = a->mbs;
 21:   ai    = a->i;
 22:   aj    = a->j;
 23:   bs    = A->rmap->bs;

 25:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 27:   PetscBTCreate(m,table);
 28:   PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
 29:   PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);

 31:   for (i=0; i<is_max; i++) {
 32:     /* Initialise the two local arrays */
 33:     isz  = 0;
 34:     PetscBTMemzero(m,table);
 35: 
 36:     /* Extract the indices, assume there can be duplicate entries */
 37:     ISGetIndices(is[i],&idx);
 38:     ISGetLocalSize(is[i],&n);

 40:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 41:     for (j=0; j<n ; ++j){
 42:       ival = idx[j]/bs; /* convert the indices into block indices */
 43:       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 44:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 45:     }
 46:     ISRestoreIndices(is[i],&idx);
 47:     ISDestroy(is[i]);
 48: 
 49:     k = 0;
 50:     for (j=0; j<ov; j++){ /* for each overlap*/
 51:       n = isz;
 52:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 53:         row   = nidx[k];
 54:         start = ai[row];
 55:         end   = ai[row+1];
 56:         for (l = start; l<end ; l++){
 57:           val = aj[l];
 58:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 59:         }
 60:       }
 61:     }
 62:     /* expand the Index Set */
 63:     for (j=0; j<isz; j++) {
 64:       for (k=0; k<bs; k++)
 65:         nidx2[j*bs+k] = nidx[j]*bs+k;
 66:     }
 67:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 68:   }
 69:   PetscBTDestroy(table);
 70:   PetscFree(nidx);
 71:   PetscFree(nidx2);
 72:   return(0);
 73: }

 77: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
 78: {
 79:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 81:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 82:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 83:   const PetscInt *irow,*icol;
 84:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 85:   PetscInt       *aj = a->j,*ai = a->i;
 86:   MatScalar      *mat_a;
 87:   Mat            C;
 88:   PetscTruth     flag,sorted;

 91:   ISSorted(iscol,&sorted);
 92:   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 94:   ISGetIndices(isrow,&irow);
 95:   ISGetIndices(iscol,&icol);
 96:   ISGetLocalSize(isrow,&nrows);
 97:   ISGetLocalSize(iscol,&ncols);

 99:   PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
100:   ssmap = smap;
101:   PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
102:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
103:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104:   /* determine lens of each row */
105:   for (i=0; i<nrows; i++) {
106:     kstart  = ai[irow[i]];
107:     kend    = kstart + a->ilen[irow[i]];
108:     lens[i] = 0;
109:       for (k=kstart; k<kend; k++) {
110:         if (ssmap[aj[k]]) {
111:           lens[i]++;
112:         }
113:       }
114:     }
115:   /* Create and fill new matrix */
116:   if (scall == MAT_REUSE_MATRIX) {
117:     c = (Mat_SeqBAIJ *)((*B)->data);

119:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
120:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
121:     if (!flag) {
122:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
123:     }
124:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
125:     C = *B;
126:   } else {
127:     MatCreate(((PetscObject)A)->comm,&C);
128:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
129:     MatSetType(C,((PetscObject)A)->type_name);
130:     MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
131:   }
132:   c = (Mat_SeqBAIJ *)(C->data);
133:   for (i=0; i<nrows; i++) {
134:     row    = irow[i];
135:     kstart = ai[row];
136:     kend   = kstart + a->ilen[row];
137:     mat_i  = c->i[i];
138:     mat_j  = c->j + mat_i;
139:     mat_a  = c->a + mat_i*bs2;
140:     mat_ilen = c->ilen + i;
141:     for (k=kstart; k<kend; k++) {
142:       if ((tcol=ssmap[a->j[k]])) {
143:         *mat_j++ = tcol - 1;
144:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
145:         mat_a   += bs2;
146:         (*mat_ilen)++;
147:       }
148:     }
149:   }
150: 
151:   /* Free work space */
152:   ISRestoreIndices(iscol,&icol);
153:   PetscFree(smap);
154:   PetscFree(lens);
155:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
156:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
157: 
158:   ISRestoreIndices(isrow,&irow);
159:   *B = C;
160:   return(0);
161: }

165: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
166: {
167:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
168:   IS             is1,is2;
170:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
171:   const PetscInt *irow,*icol;

174:   ISGetIndices(isrow,&irow);
175:   ISGetIndices(iscol,&icol);
176:   ISGetLocalSize(isrow,&nrows);
177:   ISGetLocalSize(iscol,&ncols);
178: 
179:   /* Verify if the indices corespond to each element in a block 
180:    and form the IS with compressed IS */
181:   PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
182:   iary = vary + a->mbs;
183:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
184:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
185:   count = 0;
186:   for (i=0; i<a->mbs; i++) {
187:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
188:     if (vary[i]==bs) iary[count++] = i;
189:   }
190:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
191: 
192:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
193:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
194:   count = 0;
195:   for (i=0; i<a->mbs; i++) {
196:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
197:     if (vary[i]==bs) iary[count++] = i;
198:   }
199:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
200:   ISRestoreIndices(isrow,&irow);
201:   ISRestoreIndices(iscol,&icol);
202:   PetscFree(vary);

204:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
205:   ISDestroy(is1);
206:   ISDestroy(is2);
207:   return(0);
208: }

212: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
213: {
215:   PetscInt       i;

218:   if (scall == MAT_INITIAL_MATRIX) {
219:     PetscMalloc((n+1)*sizeof(Mat),B);
220:   }

222:   for (i=0; i<n; i++) {
223:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
224:   }
225:   return(0);
226: }


229: /* -------------------------------------------------------*/
230: /* Should check that shapes of vectors and matrices match */
231: /* -------------------------------------------------------*/
232:  #include petscblaslapack.h

236: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
237: {
238:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
239:   PetscScalar       *z,sum;
240:   const PetscScalar *x;
241:   const MatScalar   *v;
242:   PetscErrorCode    ierr;
243:   PetscInt          mbs,i,*idx,*ii,n,*ridx=PETSC_NULL,nonzerorow=0;
244:   PetscTruth        usecprow=a->compressedrow.use;

247:   VecGetArray(xx,(PetscScalar**)&x);
248:   VecGetArray(zz,&z);

250:   idx = a->j;
251:   v   = a->a;
252:   if (usecprow){
253:     mbs  = a->compressedrow.nrows;
254:     ii   = a->compressedrow.i;
255:     ridx = a->compressedrow.rindex;
256:   } else {
257:     mbs = a->mbs;
258:     ii  = a->i;
259:   }

261:   for (i=0; i<mbs; i++) {
262:     n    = ii[1] - ii[0]; ii++;
263:     sum  = 0.0;
264:     nonzerorow += (n>0);
265:     while (n--) sum += *v++ * x[*idx++];
266:     if (usecprow){
267:       z[ridx[i]] = sum;
268:     } else {
269:       z[i] = sum;
270:     }
271:   }
272:   VecRestoreArray(xx,(PetscScalar**)&x);
273:   VecRestoreArray(zz,&z);
274:   PetscLogFlops(2*a->nz - nonzerorow);
275:   return(0);
276: }

280: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
281: {
282:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
283:   PetscScalar       *z = 0,sum1,sum2,*zarray;
284:   const PetscScalar *x,*xb;
285:   PetscScalar       x1,x2;
286:   const MatScalar   *v;
287:   PetscErrorCode    ierr;
288:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
289:   PetscTruth        usecprow=a->compressedrow.use;

292:   VecGetArray(xx,(PetscScalar**)&x);
293:   VecGetArray(zz,&zarray);

295:   idx = a->j;
296:   v   = a->a;
297:   if (usecprow){
298:     mbs  = a->compressedrow.nrows;
299:     ii   = a->compressedrow.i;
300:     ridx = a->compressedrow.rindex;
301:   } else {
302:     mbs = a->mbs;
303:     ii  = a->i;
304:     z   = zarray;
305:   }

307:   for (i=0; i<mbs; i++) {
308:     n  = ii[1] - ii[0]; ii++;
309:     sum1 = 0.0; sum2 = 0.0;
310:     nonzerorow += (n>0);
311:     for (j=0; j<n; j++) {
312:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
313:       sum1 += v[0]*x1 + v[2]*x2;
314:       sum2 += v[1]*x1 + v[3]*x2;
315:       v += 4;
316:     }
317:     if (usecprow) z = zarray + 2*ridx[i];
318:     z[0] = sum1; z[1] = sum2;
319:     if (!usecprow) z += 2;
320:   }
321:   VecRestoreArray(xx,(PetscScalar**)&x);
322:   VecRestoreArray(zz,&zarray);
323:   PetscLogFlops(8*a->nz - 2*nonzerorow);
324:   return(0);
325: }

329: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
330: {
331:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
332:   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
333:   const PetscScalar *x,*xb;
334:   const MatScalar   *v;
335:   PetscErrorCode    ierr;
336:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
337:   PetscTruth        usecprow=a->compressedrow.use;
338: 

340: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
341: #pragma disjoint(*v,*z,*xb)
342: #endif

345:   VecGetArray(xx,(PetscScalar**)&x);
346:   VecGetArray(zz,&zarray);

348:   idx = a->j;
349:   v   = a->a;
350:   if (usecprow){
351:     mbs  = a->compressedrow.nrows;
352:     ii   = a->compressedrow.i;
353:     ridx = a->compressedrow.rindex;
354:   } else {
355:     mbs = a->mbs;
356:     ii  = a->i;
357:     z   = zarray;
358:   }

360:   for (i=0; i<mbs; i++) {
361:     n  = ii[1] - ii[0]; ii++;
362:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
363:     nonzerorow += (n>0);
364:     for (j=0; j<n; j++) {
365:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
366:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
367:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
368:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
369:       v += 9;
370:     }
371:     if (usecprow) z = zarray + 3*ridx[i];
372:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
373:     if (!usecprow) z += 3;
374:   }
375:   VecRestoreArray(xx,(PetscScalar**)&x);
376:   VecRestoreArray(zz,&zarray);
377:   PetscLogFlops(18*a->nz - 3*nonzerorow);
378:   return(0);
379: }

383: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
384: {
385:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
386:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
387:   const PetscScalar *x,*xb;
388:   const MatScalar   *v;
389:   PetscErrorCode    ierr;
390:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
391:   PetscTruth        usecprow=a->compressedrow.use;

394:   VecGetArray(xx,(PetscScalar**)&x);
395:   VecGetArray(zz,&zarray);

397:   idx = a->j;
398:   v   = a->a;
399:   if (usecprow){
400:     mbs  = a->compressedrow.nrows;
401:     ii   = a->compressedrow.i;
402:     ridx = a->compressedrow.rindex;
403:   } else {
404:     mbs = a->mbs;
405:     ii  = a->i;
406:     z   = zarray;
407:   }

409:   for (i=0; i<mbs; i++) {
410:     n  = ii[1] - ii[0]; ii++;
411:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
412:     nonzerorow += (n>0);
413:     for (j=0; j<n; j++) {
414:       xb = x + 4*(*idx++);
415:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
416:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
417:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
418:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
419:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
420:       v += 16;
421:     }
422:     if (usecprow) z = zarray + 4*ridx[i];
423:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
424:     if (!usecprow) z += 4;
425:   }
426:   VecRestoreArray(xx,(PetscScalar**)&x);
427:   VecRestoreArray(zz,&zarray);
428:   PetscLogFlops(32*a->nz - 4*nonzerorow);
429:   return(0);
430: }

434: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
435: {
436:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
437:   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
438:   const PetscScalar *xb,*x;
439:   const MatScalar   *v;
440:   PetscErrorCode    ierr;
441:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
442:   PetscTruth        usecprow=a->compressedrow.use;

445:   VecGetArray(xx,(PetscScalar**)&x);
446:   VecGetArray(zz,&zarray);

448:   idx = a->j;
449:   v   = a->a;
450:   if (usecprow){
451:     mbs  = a->compressedrow.nrows;
452:     ii   = a->compressedrow.i;
453:     ridx = a->compressedrow.rindex;
454:   } else {
455:     mbs = a->mbs;
456:     ii  = a->i;
457:     z   = zarray;
458:   }

460:   for (i=0; i<mbs; i++) {
461:     n  = ii[1] - ii[0]; ii++;
462:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
463:     nonzerorow += (n>0);
464:     for (j=0; j<n; j++) {
465:       xb = x + 5*(*idx++);
466:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
467:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
468:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
469:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
470:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
471:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
472:       v += 25;
473:     }
474:     if (usecprow) z = zarray + 5*ridx[i];
475:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
476:     if (!usecprow) z += 5;
477:   }
478:   VecRestoreArray(xx,(PetscScalar**)&x);
479:   VecRestoreArray(zz,&zarray);
480:   PetscLogFlops(50*a->nz - 5*nonzerorow);
481:   return(0);
482: }


487: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
488: {
489:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
490:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
491:   const PetscScalar *x,*xb;
492:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
493:   const MatScalar   *v;
494:   PetscErrorCode    ierr;
495:   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
496:   PetscTruth        usecprow=a->compressedrow.use;

499:   VecGetArray(xx,(PetscScalar**)&x);
500:   VecGetArray(zz,&zarray);

502:   idx = a->j;
503:   v   = a->a;
504:   if (usecprow){
505:     mbs  = a->compressedrow.nrows;
506:     ii   = a->compressedrow.i;
507:     ridx = a->compressedrow.rindex;
508:   } else {
509:     mbs = a->mbs;
510:     ii  = a->i;
511:     z   = zarray;
512:   }

514:   for (i=0; i<mbs; i++) {
515:     n  = ii[1] - ii[0]; ii++;
516:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
517:     nonzerorow += (n>0);
518:     for (j=0; j<n; j++) {
519:       xb = x + 6*(*idx++);
520:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
521:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
522:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
523:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
524:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
525:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
526:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
527:       v += 36;
528:     }
529:     if (usecprow) z = zarray + 6*ridx[i];
530:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
531:     if (!usecprow) z += 6;
532:   }

534:   VecRestoreArray(xx,(PetscScalar**)&x);
535:   VecRestoreArray(zz,&zarray);
536:   PetscLogFlops(72*a->nz - 6*nonzerorow);
537:   return(0);
538: }
541: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
542: {
543:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
544:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
545:   const PetscScalar *x,*xb;
546:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
547:   const MatScalar   *v;
548:   PetscErrorCode    ierr;
549:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
550:   PetscTruth        usecprow=a->compressedrow.use;

553:   VecGetArray(xx,(PetscScalar**)&x);
554:   VecGetArray(zz,&zarray);

556:   idx = a->j;
557:   v   = a->a;
558:   if (usecprow){
559:     mbs    = a->compressedrow.nrows;
560:     ii     = a->compressedrow.i;
561:     ridx = a->compressedrow.rindex;
562:   } else {
563:     mbs = a->mbs;
564:     ii  = a->i;
565:     z   = zarray;
566:   }

568:   for (i=0; i<mbs; i++) {
569:     n  = ii[1] - ii[0]; ii++;
570:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
571:     nonzerorow += (n>0);
572:     for (j=0; j<n; j++) {
573:       xb = x + 7*(*idx++);
574:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
575:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
576:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
577:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
578:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
579:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
580:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
581:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
582:       v += 49;
583:     }
584:     if (usecprow) z = zarray + 7*ridx[i];
585:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
586:     if (!usecprow) z += 7;
587:   }

589:   VecRestoreArray(xx,(PetscScalar**)&x);
590:   VecRestoreArray(zz,&zarray);
591:   PetscLogFlops(98*a->nz - 7*nonzerorow);
592:   return(0);
593: }

595: /*
596:     This will not work with MatScalar == float because it calls the BLAS
597: */
600: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
601: {
602:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
603:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
604:   MatScalar      *v;
606:   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
607:   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
608:   PetscTruth     usecprow=a->compressedrow.use;

611:   VecGetArray(xx,&x);
612:   VecGetArray(zz,&zarray);

614:   idx = a->j;
615:   v   = a->a;
616:   if (usecprow){
617:     mbs  = a->compressedrow.nrows;
618:     ii   = a->compressedrow.i;
619:     ridx = a->compressedrow.rindex;
620:   } else {
621:     mbs = a->mbs;
622:     ii  = a->i;
623:     z   = zarray;
624:   }

626:   if (!a->mult_work) {
627:     k    = PetscMax(A->rmap->n,A->cmap->n);
628:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
629:   }
630:   work = a->mult_work;
631:   for (i=0; i<mbs; i++) {
632:     n     = ii[1] - ii[0]; ii++;
633:     ncols = n*bs;
634:     workt = work;
635:     nonzerorow += (n>0);
636:     for (j=0; j<n; j++) {
637:       xb = x + bs*(*idx++);
638:       for (k=0; k<bs; k++) workt[k] = xb[k];
639:       workt += bs;
640:     }
641:     if (usecprow) z = zarray + bs*ridx[i];
642:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
643:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
644:     v += n*bs2;
645:     if (!usecprow) z += bs;
646:   }
647:   VecRestoreArray(xx,&x);
648:   VecRestoreArray(zz,&zarray);
649:   PetscLogFlops(2*a->nz*bs2 - bs*nonzerorow);
650:   return(0);
651: }

656: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
657: {
658:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
659:   PetscScalar    *x,*y = 0,*z = 0,sum,*yarray,*zarray;
660:   MatScalar      *v;
662:   PetscInt       mbs=a->mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
663:   PetscTruth     usecprow=a->compressedrow.use;

666:   VecGetArray(xx,&x);
667:   VecGetArray(yy,&yarray);
668:   if (zz != yy) {
669:     VecGetArray(zz,&zarray);
670:   } else {
671:     zarray = yarray;
672:   }

674:   idx = a->j;
675:   v   = a->a;
676:   if (usecprow){
677:     if (zz != yy){
678:       PetscMemcpy(zarray,yarray,mbs*sizeof(PetscScalar));
679:     }
680:     mbs  = a->compressedrow.nrows;
681:     ii   = a->compressedrow.i;
682:     ridx = a->compressedrow.rindex;
683:   } else {
684:     ii  = a->i;
685:     y   = yarray;
686:     z   = zarray;
687:   }

689:   for (i=0; i<mbs; i++) {
690:     n    = ii[1] - ii[0]; ii++;
691:     if (usecprow){
692:       z = zarray + ridx[i];
693:       y = yarray + ridx[i];
694:     }
695:     sum = y[0];
696:     while (n--) sum += *v++ * x[*idx++];
697:     z[0] = sum;
698:     if (!usecprow){
699:       z++; y++;
700:     }
701:   }
702:   VecRestoreArray(xx,&x);
703:   VecRestoreArray(yy,&yarray);
704:   if (zz != yy) {
705:     VecRestoreArray(zz,&zarray);
706:   }
707:   PetscLogFlops(2*a->nz);
708:   return(0);
709: }

713: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
714: {
715:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
716:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
717:   PetscScalar    x1,x2,*yarray,*zarray;
718:   MatScalar      *v;
720:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
721:   PetscTruth     usecprow=a->compressedrow.use;

724:   VecGetArray(xx,&x);
725:   VecGetArray(yy,&yarray);
726:   if (zz != yy) {
727:     VecGetArray(zz,&zarray);
728:   } else {
729:     zarray = yarray;
730:   }

732:   idx = a->j;
733:   v   = a->a;
734:   if (usecprow){
735:     if (zz != yy){
736:       PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
737:     }
738:     mbs  = a->compressedrow.nrows;
739:     ii   = a->compressedrow.i;
740:     ridx = a->compressedrow.rindex;
741:     if (zz != yy){
742:       PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
743:     }
744:   } else {
745:     ii  = a->i;
746:     y   = yarray;
747:     z   = zarray;
748:   }

750:   for (i=0; i<mbs; i++) {
751:     n  = ii[1] - ii[0]; ii++;
752:     if (usecprow){
753:       z = zarray + 2*ridx[i];
754:       y = yarray + 2*ridx[i];
755:     }
756:     sum1 = y[0]; sum2 = y[1];
757:     for (j=0; j<n; j++) {
758:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
759:       sum1 += v[0]*x1 + v[2]*x2;
760:       sum2 += v[1]*x1 + v[3]*x2;
761:       v += 4;
762:     }
763:     z[0] = sum1; z[1] = sum2;
764:     if (!usecprow){
765:       z += 2; y += 2;
766:     }
767:   }
768:   VecRestoreArray(xx,&x);
769:   VecRestoreArray(yy,&yarray);
770:   if (zz != yy) {
771:     VecRestoreArray(zz,&zarray);
772:   }
773:   PetscLogFlops(4*a->nz);
774:   return(0);
775: }

779: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
780: {
781:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
782:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
783:   MatScalar      *v;
785:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
786:   PetscTruth     usecprow=a->compressedrow.use;

789:   VecGetArray(xx,&x);
790:   VecGetArray(yy,&yarray);
791:   if (zz != yy) {
792:     VecGetArray(zz,&zarray);
793:   } else {
794:     zarray = yarray;
795:   }

797:   idx = a->j;
798:   v   = a->a;
799:   if (usecprow){
800:     if (zz != yy){
801:       PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
802:     }
803:     mbs  = a->compressedrow.nrows;
804:     ii   = a->compressedrow.i;
805:     ridx = a->compressedrow.rindex;
806:   } else {
807:     ii  = a->i;
808:     y   = yarray;
809:     z   = zarray;
810:   }

812:   for (i=0; i<mbs; i++) {
813:     n  = ii[1] - ii[0]; ii++;
814:     if (usecprow){
815:       z = zarray + 3*ridx[i];
816:       y = yarray + 3*ridx[i];
817:     }
818:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
819:     for (j=0; j<n; j++) {
820:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
821:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
822:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
823:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
824:       v += 9;
825:     }
826:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
827:     if (!usecprow){
828:       z += 3; y += 3;
829:     }
830:   }
831:   VecRestoreArray(xx,&x);
832:   VecRestoreArray(yy,&yarray);
833:   if (zz != yy) {
834:     VecRestoreArray(zz,&zarray);
835:   }
836:   PetscLogFlops(18*a->nz);
837:   return(0);
838: }

842: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
843: {
844:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
845:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
846:   MatScalar      *v;
848:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
849:   PetscTruth     usecprow=a->compressedrow.use;

852:   VecGetArray(xx,&x);
853:   VecGetArray(yy,&yarray);
854:   if (zz != yy) {
855:     VecGetArray(zz,&zarray);
856:   } else {
857:     zarray = yarray;
858:   }

860:   idx   = a->j;
861:   v     = a->a;
862:   if (usecprow){
863:     if (zz != yy){
864:       PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
865:     }
866:     mbs  = a->compressedrow.nrows;
867:     ii   = a->compressedrow.i;
868:     ridx = a->compressedrow.rindex;
869:   } else {
870:     ii  = a->i;
871:     y   = yarray;
872:     z   = zarray;
873:   }

875:   for (i=0; i<mbs; i++) {
876:     n  = ii[1] - ii[0]; ii++;
877:     if (usecprow){
878:       z = zarray + 4*ridx[i];
879:       y = yarray + 4*ridx[i];
880:     }
881:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
882:     for (j=0; j<n; j++) {
883:       xb = x + 4*(*idx++);
884:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
885:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
886:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
887:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
888:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
889:       v += 16;
890:     }
891:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
892:     if (!usecprow){
893:       z += 4; y += 4;
894:     }
895:   }
896:   VecRestoreArray(xx,&x);
897:   VecRestoreArray(yy,&yarray);
898:   if (zz != yy) {
899:     VecRestoreArray(zz,&zarray);
900:   }
901:   PetscLogFlops(32*a->nz);
902:   return(0);
903: }

907: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
908: {
909:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
910:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
911:   PetscScalar    *yarray,*zarray;
912:   MatScalar      *v;
914:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
915:   PetscTruth     usecprow=a->compressedrow.use;

918:   VecGetArray(xx,&x);
919:   VecGetArray(yy,&yarray);
920:   if (zz != yy) {
921:     VecGetArray(zz,&zarray);
922:   } else {
923:     zarray = yarray;
924:   }

926:   idx = a->j;
927:   v   = a->a;
928:   if (usecprow){
929:     if (zz != yy){
930:       PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
931:     }
932:     mbs  = a->compressedrow.nrows;
933:     ii   = a->compressedrow.i;
934:     ridx = a->compressedrow.rindex;
935:   } else {
936:     ii  = a->i;
937:     y   = yarray;
938:     z   = zarray;
939:   }

941:   for (i=0; i<mbs; i++) {
942:     n  = ii[1] - ii[0]; ii++;
943:     if (usecprow){
944:       z = zarray + 5*ridx[i];
945:       y = yarray + 5*ridx[i];
946:     }
947:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
948:     for (j=0; j<n; j++) {
949:       xb = x + 5*(*idx++);
950:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
951:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
952:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
953:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
954:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
955:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
956:       v += 25;
957:     }
958:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
959:     if (!usecprow){
960:       z += 5; y += 5;
961:     }
962:   }
963:   VecRestoreArray(xx,&x);
964:   VecRestoreArray(yy,&yarray);
965:   if (zz != yy) {
966:     VecRestoreArray(zz,&zarray);
967:   }
968:   PetscLogFlops(50*a->nz);
969:   return(0);
970: }
973: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
974: {
975:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
976:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
977:   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
978:   MatScalar      *v;
980:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
981:   PetscTruth     usecprow=a->compressedrow.use;

984:   VecGetArray(xx,&x);
985:   VecGetArray(yy,&yarray);
986:   if (zz != yy) {
987:     VecGetArray(zz,&zarray);
988:   } else {
989:     zarray = yarray;
990:   }

992:   idx = a->j;
993:   v   = a->a;
994:   if (usecprow){
995:     if (zz != yy){
996:       PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
997:     }
998:     mbs  = a->compressedrow.nrows;
999:     ii   = a->compressedrow.i;
1000:     ridx = a->compressedrow.rindex;
1001:   } else {
1002:     ii  = a->i;
1003:     y   = yarray;
1004:     z   = zarray;
1005:   }

1007:   for (i=0; i<mbs; i++) {
1008:     n  = ii[1] - ii[0]; ii++;
1009:     if (usecprow){
1010:       z = zarray + 6*ridx[i];
1011:       y = yarray + 6*ridx[i];
1012:     }
1013:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1014:     for (j=0; j<n; j++) {
1015:       xb = x + 6*(*idx++);
1016:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1017:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1018:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1019:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1020:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1021:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1022:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1023:       v += 36;
1024:     }
1025:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1026:     if (!usecprow){
1027:       z += 6; y += 6;
1028:     }
1029:   }
1030:   VecRestoreArray(xx,&x);
1031:   VecRestoreArray(yy,&yarray);
1032:   if (zz != yy) {
1033:     VecRestoreArray(zz,&zarray);
1034:   }
1035:   PetscLogFlops(72*a->nz);
1036:   return(0);
1037: }

1041: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1042: {
1043:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1044:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1045:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1046:   MatScalar      *v;
1048:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1049:   PetscTruth     usecprow=a->compressedrow.use;

1052:   VecGetArray(xx,&x);
1053:   VecGetArray(yy,&yarray);
1054:   if (zz != yy) {
1055:     VecGetArray(zz,&zarray);
1056:   } else {
1057:     zarray = yarray;
1058:   }

1060:   idx = a->j;
1061:   v   = a->a;
1062:   if (usecprow){
1063:     if (zz != yy){
1064:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1065:     }
1066:     mbs  = a->compressedrow.nrows;
1067:     ii   = a->compressedrow.i;
1068:     ridx = a->compressedrow.rindex;
1069:   } else {
1070:     ii  = a->i;
1071:     y   = yarray;
1072:     z   = zarray;
1073:   }

1075:   for (i=0; i<mbs; i++) {
1076:     n  = ii[1] - ii[0]; ii++;
1077:     if (usecprow){
1078:       z = zarray + 7*ridx[i];
1079:       y = yarray + 7*ridx[i];
1080:     }
1081:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1082:     for (j=0; j<n; j++) {
1083:       xb = x + 7*(*idx++);
1084:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1085:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1086:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1087:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1088:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1089:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1090:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1091:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1092:       v += 49;
1093:     }
1094:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1095:     if (!usecprow){
1096:       z += 7; y += 7;
1097:     }
1098:   }
1099:   VecRestoreArray(xx,&x);
1100:   VecRestoreArray(yy,&yarray);
1101:   if (zz != yy) {
1102:     VecRestoreArray(zz,&zarray);
1103:   }
1104:   PetscLogFlops(98*a->nz);
1105:   return(0);
1106: }

1110: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1111: {
1112:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1113:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1114:   MatScalar      *v;
1116:   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1117:   PetscInt       ncols,k,*ridx=PETSC_NULL;
1118:   PetscTruth     usecprow=a->compressedrow.use;

1121:   VecCopy_Seq(yy,zz);
1122:   VecGetArray(xx,&x);
1123:   VecGetArray(zz,&zarray);

1125:   idx = a->j;
1126:   v   = a->a;
1127:   if (usecprow){
1128:     mbs    = a->compressedrow.nrows;
1129:     ii     = a->compressedrow.i;
1130:     ridx = a->compressedrow.rindex;
1131:   } else {
1132:     mbs = a->mbs;
1133:     ii  = a->i;
1134:     z   = zarray;
1135:   }

1137:   if (!a->mult_work) {
1138:     k    = PetscMax(A->rmap->n,A->cmap->n);
1139:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1140:   }
1141:   work = a->mult_work;
1142:   for (i=0; i<mbs; i++) {
1143:     n     = ii[1] - ii[0]; ii++;
1144:     ncols = n*bs;
1145:     workt = work;
1146:     for (j=0; j<n; j++) {
1147:       xb = x + bs*(*idx++);
1148:       for (k=0; k<bs; k++) workt[k] = xb[k];
1149:       workt += bs;
1150:     }
1151:     if (usecprow) z = zarray + bs*ridx[i];
1152:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1153:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1154:     v += n*bs2;
1155:     if (!usecprow){
1156:       z += bs;
1157:     }
1158:   }
1159:   VecRestoreArray(xx,&x);
1160:   VecRestoreArray(zz,&zarray);
1161:   PetscLogFlops(2*a->nz*bs2);
1162:   return(0);
1163: }

1167: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1168: {
1169:   PetscScalar    zero = 0.0;

1173:   VecSet(zz,zero);
1174:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1175:   return(0);
1176: }

1180: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1182: {
1183:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1184:   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1185:   MatScalar         *v;
1186:   PetscErrorCode    ierr;
1187:   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1188:   Mat_CompressedRow cprow = a->compressedrow;
1189:   PetscTruth        usecprow=cprow.use;

1192:   if (yy != zz) { VecCopy_Seq(yy,zz); }
1193:   VecGetArray(xx,&x);
1194:   VecGetArray(zz,&z);

1196:   idx = a->j;
1197:   v   = a->a;
1198:   if (usecprow){
1199:     mbs  = cprow.nrows;
1200:     ii   = cprow.i;
1201:     ridx = cprow.rindex;
1202:   } else {
1203:     mbs=a->mbs;
1204:     ii = a->i;
1205:     xb = x;
1206:   }

1208:   switch (bs) {
1209:   case 1:
1210:     for (i=0; i<mbs; i++) {
1211:       if (usecprow) xb = x + ridx[i];
1212:       x1 = xb[0];
1213:       ib = idx + ii[0];
1214:       n  = ii[1] - ii[0]; ii++;
1215:       for (j=0; j<n; j++) {
1216:         rval    = ib[j];
1217:         z[rval] += *v * x1;
1218:         v++;
1219:       }
1220:       if (!usecprow) xb++;
1221:     }
1222:     break;
1223:   case 2:
1224:     for (i=0; i<mbs; i++) {
1225:       if (usecprow) xb = x + 2*ridx[i];
1226:       x1 = xb[0]; x2 = xb[1];
1227:       ib = idx + ii[0];
1228:       n  = ii[1] - ii[0]; ii++;
1229:       for (j=0; j<n; j++) {
1230:         rval      = ib[j]*2;
1231:         z[rval++] += v[0]*x1 + v[1]*x2;
1232:         z[rval++] += v[2]*x1 + v[3]*x2;
1233:         v  += 4;
1234:       }
1235:       if (!usecprow) xb += 2;
1236:     }
1237:     break;
1238:   case 3:
1239:     for (i=0; i<mbs; i++) {
1240:       if (usecprow) xb = x + 3*ridx[i];
1241:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1242:       ib = idx + ii[0];
1243:       n  = ii[1] - ii[0]; ii++;
1244:       for (j=0; j<n; j++) {
1245:         rval      = ib[j]*3;
1246:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1247:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1248:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1249:         v  += 9;
1250:       }
1251:       if (!usecprow) xb += 3;
1252:     }
1253:     break;
1254:   case 4:
1255:     for (i=0; i<mbs; i++) {
1256:       if (usecprow) xb = x + 4*ridx[i];
1257:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1258:       ib = idx + ii[0];
1259:       n  = ii[1] - ii[0]; ii++;
1260:       for (j=0; j<n; j++) {
1261:         rval      = ib[j]*4;
1262:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1263:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1264:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1265:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1266:         v  += 16;
1267:       }
1268:       if (!usecprow) xb += 4;
1269:     }
1270:     break;
1271:   case 5:
1272:     for (i=0; i<mbs; i++) {
1273:       if (usecprow) xb = x + 5*ridx[i];
1274:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1275:       x4 = xb[3]; x5 = xb[4];
1276:       ib = idx + ii[0];
1277:       n  = ii[1] - ii[0]; ii++;
1278:       for (j=0; j<n; j++) {
1279:         rval      = ib[j]*5;
1280:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1281:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1282:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1283:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1284:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1285:         v  += 25;
1286:       }
1287:       if (!usecprow) xb += 5;
1288:     }
1289:     break;
1290:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1291:       PetscInt     ncols,k;
1292:       PetscScalar  *work,*workt,*xtmp;

1294:       if (!a->mult_work) {
1295:         k = PetscMax(A->rmap->n,A->cmap->n);
1296:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1297:       }
1298:       work = a->mult_work;
1299:       xtmp = x;
1300:       for (i=0; i<mbs; i++) {
1301:         n     = ii[1] - ii[0]; ii++;
1302:         ncols = n*bs;
1303:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1304:         if (usecprow) {
1305:           xtmp = x + bs*ridx[i];
1306:         }
1307:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1308:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1309:         v += n*bs2;
1310:         if (!usecprow) xtmp += bs;
1311:         workt = work;
1312:         for (j=0; j<n; j++) {
1313:           zb = z + bs*(*idx++);
1314:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1315:           workt += bs;
1316:         }
1317:       }
1318:     }
1319:   }
1320:   VecRestoreArray(xx,&x);
1321:   VecRestoreArray(zz,&z);
1322:   PetscLogFlops(2*a->nz*a->bs2);
1323:   return(0);
1324: }

1328: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1329: {
1330:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
1331:   PetscInt       totalnz = a->bs2*a->nz;
1332:   PetscScalar    oalpha = alpha;
1334:   PetscBLASInt   one = 1,tnz = PetscBLASIntCast(totalnz);

1337:   BLASscal_(&tnz,&oalpha,a->a,&one);
1338:   PetscLogFlops(totalnz);
1339:   return(0);
1340: }

1344: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1345: {
1347:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1348:   MatScalar      *v = a->a;
1349:   PetscReal      sum = 0.0;
1350:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

1353:   if (type == NORM_FROBENIUS) {
1354:     for (i=0; i< bs2*nz; i++) {
1355: #if defined(PETSC_USE_COMPLEX)
1356:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1357: #else
1358:       sum += (*v)*(*v); v++;
1359: #endif
1360:     }
1361:     *norm = sqrt(sum);
1362:   } else if (type == NORM_1) { /* maximum column sum */
1363:     PetscReal *tmp;
1364:     PetscInt  *bcol = a->j;
1365:     PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);
1366:     PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));
1367:     for (i=0; i<nz; i++){
1368:       for (j=0; j<bs; j++){
1369:         k1 = bs*(*bcol) + j; /* column index */
1370:         for (k=0; k<bs; k++){
1371:           tmp[k1] += PetscAbsScalar(*v); v++;
1372:         }
1373:       }
1374:       bcol++;
1375:     }
1376:     *norm = 0.0;
1377:     for (j=0; j<A->cmap->n; j++) {
1378:       if (tmp[j] > *norm) *norm = tmp[j];
1379:     }
1380:     PetscFree(tmp);
1381:   } else if (type == NORM_INFINITY) { /* maximum row sum */
1382:     *norm = 0.0;
1383:     for (k=0; k<bs; k++) {
1384:       for (j=0; j<a->mbs; j++) {
1385:         v = a->a + bs2*a->i[j] + k;
1386:         sum = 0.0;
1387:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1388:           for (k1=0; k1<bs; k1++){
1389:             sum += PetscAbsScalar(*v);
1390:             v   += bs;
1391:           }
1392:         }
1393:         if (sum > *norm) *norm = sum;
1394:       }
1395:     }
1396:   } else {
1397:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1398:   }
1399:   return(0);
1400: }


1405: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1406: {
1407:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;

1411:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1412:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1413:     *flg = PETSC_FALSE;
1414:     return(0);
1415:   }
1416: 
1417:   /* if the a->i are the same */
1418:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1419:   if (!*flg) {
1420:     return(0);
1421:   }
1422: 
1423:   /* if a->j are the same */
1424:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1425:   if (!*flg) {
1426:     return(0);
1427:   }
1428:   /* if a->a are the same */
1429:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
1430:   return(0);
1431: 
1432: }

1436: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1437: {
1438:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1440:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1441:   PetscScalar    *x,zero = 0.0;
1442:   MatScalar      *aa,*aa_j;

1445:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1446:   bs   = A->rmap->bs;
1447:   aa   = a->a;
1448:   ai   = a->i;
1449:   aj   = a->j;
1450:   ambs = a->mbs;
1451:   bs2  = a->bs2;

1453:   VecSet(v,zero);
1454:   VecGetArray(v,&x);
1455:   VecGetLocalSize(v,&n);
1456:   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1457:   for (i=0; i<ambs; i++) {
1458:     for (j=ai[i]; j<ai[i+1]; j++) {
1459:       if (aj[j] == i) {
1460:         row  = i*bs;
1461:         aa_j = aa+j*bs2;
1462:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1463:         break;
1464:       }
1465:     }
1466:   }
1467:   VecRestoreArray(v,&x);
1468:   return(0);
1469: }

1473: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1474: {
1475:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1476:   PetscScalar    *l,*r,x,*li,*ri;
1477:   MatScalar      *aa,*v;
1479:   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

1482:   ai  = a->i;
1483:   aj  = a->j;
1484:   aa  = a->a;
1485:   m   = A->rmap->n;
1486:   n   = A->cmap->n;
1487:   bs  = A->rmap->bs;
1488:   mbs = a->mbs;
1489:   bs2 = a->bs2;
1490:   if (ll) {
1491:     VecGetArray(ll,&l);
1492:     VecGetLocalSize(ll,&lm);
1493:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1494:     for (i=0; i<mbs; i++) { /* for each block row */
1495:       M  = ai[i+1] - ai[i];
1496:       li = l + i*bs;
1497:       v  = aa + bs2*ai[i];
1498:       for (j=0; j<M; j++) { /* for each block */
1499:         for (k=0; k<bs2; k++) {
1500:           (*v++) *= li[k%bs];
1501:         }
1502:       }
1503:     }
1504:     VecRestoreArray(ll,&l);
1505:     PetscLogFlops(a->nz);
1506:   }
1507: 
1508:   if (rr) {
1509:     VecGetArray(rr,&r);
1510:     VecGetLocalSize(rr,&rn);
1511:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1512:     for (i=0; i<mbs; i++) { /* for each block row */
1513:       M  = ai[i+1] - ai[i];
1514:       v  = aa + bs2*ai[i];
1515:       for (j=0; j<M; j++) { /* for each block */
1516:         ri = r + bs*aj[ai[i]+j];
1517:         for (k=0; k<bs; k++) {
1518:           x = ri[k];
1519:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1520:         }
1521:       }
1522:     }
1523:     VecRestoreArray(rr,&r);
1524:     PetscLogFlops(a->nz);
1525:   }
1526:   return(0);
1527: }


1532: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1533: {
1534:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

1537:   info->block_size     = a->bs2;
1538:   info->nz_allocated   = a->maxnz;
1539:   info->nz_used        = a->bs2*a->nz;
1540:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1541:   info->assemblies   = A->num_ass;
1542:   info->mallocs      = a->reallocs;
1543:   info->memory       = ((PetscObject)A)->mem;
1544:   if (A->factor) {
1545:     info->fill_ratio_given  = A->info.fill_ratio_given;
1546:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1547:     info->factor_mallocs    = A->info.factor_mallocs;
1548:   } else {
1549:     info->fill_ratio_given  = 0;
1550:     info->fill_ratio_needed = 0;
1551:     info->factor_mallocs    = 0;
1552:   }
1553:   return(0);
1554: }


1559: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1560: {
1561:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

1565:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1566:   return(0);
1567: }