Actual source code: baij2.c
1: #define PETSCMAT_DLL
3: #include ../src/mat/impls/baij/seq/baij.h
4: #include ../src/inline/spops.h
5: #include ../src/inline/ilu.h
6: #include petscbt.h
10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
14: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
15: const PetscInt *idx;
16: PetscInt start,end,*ai,*aj,bs,*nidx2;
17: PetscBT table;
20: m = a->mbs;
21: ai = a->i;
22: aj = a->j;
23: bs = A->rmap->bs;
25: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
27: PetscBTCreate(m,table);
28: PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
29: PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);
31: for (i=0; i<is_max; i++) {
32: /* Initialise the two local arrays */
33: isz = 0;
34: PetscBTMemzero(m,table);
35:
36: /* Extract the indices, assume there can be duplicate entries */
37: ISGetIndices(is[i],&idx);
38: ISGetLocalSize(is[i],&n);
40: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41: for (j=0; j<n ; ++j){
42: ival = idx[j]/bs; /* convert the indices into block indices */
43: if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
45: }
46: ISRestoreIndices(is[i],&idx);
47: ISDestroy(is[i]);
48:
49: k = 0;
50: for (j=0; j<ov; j++){ /* for each overlap*/
51: n = isz;
52: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
53: row = nidx[k];
54: start = ai[row];
55: end = ai[row+1];
56: for (l = start; l<end ; l++){
57: val = aj[l];
58: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
59: }
60: }
61: }
62: /* expand the Index Set */
63: for (j=0; j<isz; j++) {
64: for (k=0; k<bs; k++)
65: nidx2[j*bs+k] = nidx[j]*bs+k;
66: }
67: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
68: }
69: PetscBTDestroy(table);
70: PetscFree(nidx);
71: PetscFree(nidx2);
72: return(0);
73: }
77: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
78: {
79: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
81: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
82: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
83: const PetscInt *irow,*icol;
84: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
85: PetscInt *aj = a->j,*ai = a->i;
86: MatScalar *mat_a;
87: Mat C;
88: PetscTruth flag,sorted;
91: ISSorted(iscol,&sorted);
92: if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
94: ISGetIndices(isrow,&irow);
95: ISGetIndices(iscol,&icol);
96: ISGetLocalSize(isrow,&nrows);
97: ISGetLocalSize(iscol,&ncols);
99: PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
100: ssmap = smap;
101: PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
102: PetscMemzero(smap,oldcols*sizeof(PetscInt));
103: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104: /* determine lens of each row */
105: for (i=0; i<nrows; i++) {
106: kstart = ai[irow[i]];
107: kend = kstart + a->ilen[irow[i]];
108: lens[i] = 0;
109: for (k=kstart; k<kend; k++) {
110: if (ssmap[aj[k]]) {
111: lens[i]++;
112: }
113: }
114: }
115: /* Create and fill new matrix */
116: if (scall == MAT_REUSE_MATRIX) {
117: c = (Mat_SeqBAIJ *)((*B)->data);
119: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
120: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
121: if (!flag) {
122: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
123: }
124: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
125: C = *B;
126: } else {
127: MatCreate(((PetscObject)A)->comm,&C);
128: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
129: MatSetType(C,((PetscObject)A)->type_name);
130: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
131: }
132: c = (Mat_SeqBAIJ *)(C->data);
133: for (i=0; i<nrows; i++) {
134: row = irow[i];
135: kstart = ai[row];
136: kend = kstart + a->ilen[row];
137: mat_i = c->i[i];
138: mat_j = c->j + mat_i;
139: mat_a = c->a + mat_i*bs2;
140: mat_ilen = c->ilen + i;
141: for (k=kstart; k<kend; k++) {
142: if ((tcol=ssmap[a->j[k]])) {
143: *mat_j++ = tcol - 1;
144: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
145: mat_a += bs2;
146: (*mat_ilen)++;
147: }
148: }
149: }
150:
151: /* Free work space */
152: ISRestoreIndices(iscol,&icol);
153: PetscFree(smap);
154: PetscFree(lens);
155: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
156: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
157:
158: ISRestoreIndices(isrow,&irow);
159: *B = C;
160: return(0);
161: }
165: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
166: {
167: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
168: IS is1,is2;
170: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
171: const PetscInt *irow,*icol;
174: ISGetIndices(isrow,&irow);
175: ISGetIndices(iscol,&icol);
176: ISGetLocalSize(isrow,&nrows);
177: ISGetLocalSize(iscol,&ncols);
178:
179: /* Verify if the indices corespond to each element in a block
180: and form the IS with compressed IS */
181: PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
182: iary = vary + a->mbs;
183: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
184: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
185: count = 0;
186: for (i=0; i<a->mbs; i++) {
187: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
188: if (vary[i]==bs) iary[count++] = i;
189: }
190: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
191:
192: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
193: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
194: count = 0;
195: for (i=0; i<a->mbs; i++) {
196: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
197: if (vary[i]==bs) iary[count++] = i;
198: }
199: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
200: ISRestoreIndices(isrow,&irow);
201: ISRestoreIndices(iscol,&icol);
202: PetscFree(vary);
204: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
205: ISDestroy(is1);
206: ISDestroy(is2);
207: return(0);
208: }
212: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
213: {
215: PetscInt i;
218: if (scall == MAT_INITIAL_MATRIX) {
219: PetscMalloc((n+1)*sizeof(Mat),B);
220: }
222: for (i=0; i<n; i++) {
223: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
224: }
225: return(0);
226: }
229: /* -------------------------------------------------------*/
230: /* Should check that shapes of vectors and matrices match */
231: /* -------------------------------------------------------*/
232: #include petscblaslapack.h
236: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
237: {
238: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
239: PetscScalar *z,sum;
240: const PetscScalar *x;
241: const MatScalar *v;
242: PetscErrorCode ierr;
243: PetscInt mbs,i,*idx,*ii,n,*ridx=PETSC_NULL,nonzerorow=0;
244: PetscTruth usecprow=a->compressedrow.use;
247: VecGetArray(xx,(PetscScalar**)&x);
248: VecGetArray(zz,&z);
250: idx = a->j;
251: v = a->a;
252: if (usecprow){
253: mbs = a->compressedrow.nrows;
254: ii = a->compressedrow.i;
255: ridx = a->compressedrow.rindex;
256: } else {
257: mbs = a->mbs;
258: ii = a->i;
259: }
261: for (i=0; i<mbs; i++) {
262: n = ii[1] - ii[0]; ii++;
263: sum = 0.0;
264: nonzerorow += (n>0);
265: while (n--) sum += *v++ * x[*idx++];
266: if (usecprow){
267: z[ridx[i]] = sum;
268: } else {
269: z[i] = sum;
270: }
271: }
272: VecRestoreArray(xx,(PetscScalar**)&x);
273: VecRestoreArray(zz,&z);
274: PetscLogFlops(2*a->nz - nonzerorow);
275: return(0);
276: }
280: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
281: {
282: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
283: PetscScalar *z = 0,sum1,sum2,*zarray;
284: const PetscScalar *x,*xb;
285: PetscScalar x1,x2;
286: const MatScalar *v;
287: PetscErrorCode ierr;
288: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
289: PetscTruth usecprow=a->compressedrow.use;
292: VecGetArray(xx,(PetscScalar**)&x);
293: VecGetArray(zz,&zarray);
295: idx = a->j;
296: v = a->a;
297: if (usecprow){
298: mbs = a->compressedrow.nrows;
299: ii = a->compressedrow.i;
300: ridx = a->compressedrow.rindex;
301: } else {
302: mbs = a->mbs;
303: ii = a->i;
304: z = zarray;
305: }
307: for (i=0; i<mbs; i++) {
308: n = ii[1] - ii[0]; ii++;
309: sum1 = 0.0; sum2 = 0.0;
310: nonzerorow += (n>0);
311: for (j=0; j<n; j++) {
312: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
313: sum1 += v[0]*x1 + v[2]*x2;
314: sum2 += v[1]*x1 + v[3]*x2;
315: v += 4;
316: }
317: if (usecprow) z = zarray + 2*ridx[i];
318: z[0] = sum1; z[1] = sum2;
319: if (!usecprow) z += 2;
320: }
321: VecRestoreArray(xx,(PetscScalar**)&x);
322: VecRestoreArray(zz,&zarray);
323: PetscLogFlops(8*a->nz - 2*nonzerorow);
324: return(0);
325: }
329: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
330: {
331: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
332: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
333: const PetscScalar *x,*xb;
334: const MatScalar *v;
335: PetscErrorCode ierr;
336: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
337: PetscTruth usecprow=a->compressedrow.use;
338:
340: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
341: #pragma disjoint(*v,*z,*xb)
342: #endif
345: VecGetArray(xx,(PetscScalar**)&x);
346: VecGetArray(zz,&zarray);
348: idx = a->j;
349: v = a->a;
350: if (usecprow){
351: mbs = a->compressedrow.nrows;
352: ii = a->compressedrow.i;
353: ridx = a->compressedrow.rindex;
354: } else {
355: mbs = a->mbs;
356: ii = a->i;
357: z = zarray;
358: }
360: for (i=0; i<mbs; i++) {
361: n = ii[1] - ii[0]; ii++;
362: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
363: nonzerorow += (n>0);
364: for (j=0; j<n; j++) {
365: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
366: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
367: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
368: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
369: v += 9;
370: }
371: if (usecprow) z = zarray + 3*ridx[i];
372: z[0] = sum1; z[1] = sum2; z[2] = sum3;
373: if (!usecprow) z += 3;
374: }
375: VecRestoreArray(xx,(PetscScalar**)&x);
376: VecRestoreArray(zz,&zarray);
377: PetscLogFlops(18*a->nz - 3*nonzerorow);
378: return(0);
379: }
383: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
384: {
385: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
386: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
387: const PetscScalar *x,*xb;
388: const MatScalar *v;
389: PetscErrorCode ierr;
390: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
391: PetscTruth usecprow=a->compressedrow.use;
394: VecGetArray(xx,(PetscScalar**)&x);
395: VecGetArray(zz,&zarray);
397: idx = a->j;
398: v = a->a;
399: if (usecprow){
400: mbs = a->compressedrow.nrows;
401: ii = a->compressedrow.i;
402: ridx = a->compressedrow.rindex;
403: } else {
404: mbs = a->mbs;
405: ii = a->i;
406: z = zarray;
407: }
409: for (i=0; i<mbs; i++) {
410: n = ii[1] - ii[0]; ii++;
411: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
412: nonzerorow += (n>0);
413: for (j=0; j<n; j++) {
414: xb = x + 4*(*idx++);
415: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
416: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
417: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
418: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
419: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
420: v += 16;
421: }
422: if (usecprow) z = zarray + 4*ridx[i];
423: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
424: if (!usecprow) z += 4;
425: }
426: VecRestoreArray(xx,(PetscScalar**)&x);
427: VecRestoreArray(zz,&zarray);
428: PetscLogFlops(32*a->nz - 4*nonzerorow);
429: return(0);
430: }
434: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
435: {
436: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
437: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
438: const PetscScalar *xb,*x;
439: const MatScalar *v;
440: PetscErrorCode ierr;
441: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
442: PetscTruth usecprow=a->compressedrow.use;
445: VecGetArray(xx,(PetscScalar**)&x);
446: VecGetArray(zz,&zarray);
448: idx = a->j;
449: v = a->a;
450: if (usecprow){
451: mbs = a->compressedrow.nrows;
452: ii = a->compressedrow.i;
453: ridx = a->compressedrow.rindex;
454: } else {
455: mbs = a->mbs;
456: ii = a->i;
457: z = zarray;
458: }
460: for (i=0; i<mbs; i++) {
461: n = ii[1] - ii[0]; ii++;
462: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
463: nonzerorow += (n>0);
464: for (j=0; j<n; j++) {
465: xb = x + 5*(*idx++);
466: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
467: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
468: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
469: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
470: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
471: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
472: v += 25;
473: }
474: if (usecprow) z = zarray + 5*ridx[i];
475: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
476: if (!usecprow) z += 5;
477: }
478: VecRestoreArray(xx,(PetscScalar**)&x);
479: VecRestoreArray(zz,&zarray);
480: PetscLogFlops(50*a->nz - 5*nonzerorow);
481: return(0);
482: }
487: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
488: {
489: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
490: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
491: const PetscScalar *x,*xb;
492: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
493: const MatScalar *v;
494: PetscErrorCode ierr;
495: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
496: PetscTruth usecprow=a->compressedrow.use;
499: VecGetArray(xx,(PetscScalar**)&x);
500: VecGetArray(zz,&zarray);
502: idx = a->j;
503: v = a->a;
504: if (usecprow){
505: mbs = a->compressedrow.nrows;
506: ii = a->compressedrow.i;
507: ridx = a->compressedrow.rindex;
508: } else {
509: mbs = a->mbs;
510: ii = a->i;
511: z = zarray;
512: }
514: for (i=0; i<mbs; i++) {
515: n = ii[1] - ii[0]; ii++;
516: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
517: nonzerorow += (n>0);
518: for (j=0; j<n; j++) {
519: xb = x + 6*(*idx++);
520: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
521: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
522: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
523: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
524: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
525: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
526: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
527: v += 36;
528: }
529: if (usecprow) z = zarray + 6*ridx[i];
530: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
531: if (!usecprow) z += 6;
532: }
534: VecRestoreArray(xx,(PetscScalar**)&x);
535: VecRestoreArray(zz,&zarray);
536: PetscLogFlops(72*a->nz - 6*nonzerorow);
537: return(0);
538: }
541: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
542: {
543: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
544: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
545: const PetscScalar *x,*xb;
546: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
547: const MatScalar *v;
548: PetscErrorCode ierr;
549: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
550: PetscTruth usecprow=a->compressedrow.use;
553: VecGetArray(xx,(PetscScalar**)&x);
554: VecGetArray(zz,&zarray);
556: idx = a->j;
557: v = a->a;
558: if (usecprow){
559: mbs = a->compressedrow.nrows;
560: ii = a->compressedrow.i;
561: ridx = a->compressedrow.rindex;
562: } else {
563: mbs = a->mbs;
564: ii = a->i;
565: z = zarray;
566: }
568: for (i=0; i<mbs; i++) {
569: n = ii[1] - ii[0]; ii++;
570: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
571: nonzerorow += (n>0);
572: for (j=0; j<n; j++) {
573: xb = x + 7*(*idx++);
574: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
575: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
576: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
577: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
578: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
579: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
580: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
581: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
582: v += 49;
583: }
584: if (usecprow) z = zarray + 7*ridx[i];
585: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
586: if (!usecprow) z += 7;
587: }
589: VecRestoreArray(xx,(PetscScalar**)&x);
590: VecRestoreArray(zz,&zarray);
591: PetscLogFlops(98*a->nz - 7*nonzerorow);
592: return(0);
593: }
595: /*
596: This will not work with MatScalar == float because it calls the BLAS
597: */
600: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
601: {
602: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
603: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
604: MatScalar *v;
606: PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
607: PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
608: PetscTruth usecprow=a->compressedrow.use;
611: VecGetArray(xx,&x);
612: VecGetArray(zz,&zarray);
614: idx = a->j;
615: v = a->a;
616: if (usecprow){
617: mbs = a->compressedrow.nrows;
618: ii = a->compressedrow.i;
619: ridx = a->compressedrow.rindex;
620: } else {
621: mbs = a->mbs;
622: ii = a->i;
623: z = zarray;
624: }
626: if (!a->mult_work) {
627: k = PetscMax(A->rmap->n,A->cmap->n);
628: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
629: }
630: work = a->mult_work;
631: for (i=0; i<mbs; i++) {
632: n = ii[1] - ii[0]; ii++;
633: ncols = n*bs;
634: workt = work;
635: nonzerorow += (n>0);
636: for (j=0; j<n; j++) {
637: xb = x + bs*(*idx++);
638: for (k=0; k<bs; k++) workt[k] = xb[k];
639: workt += bs;
640: }
641: if (usecprow) z = zarray + bs*ridx[i];
642: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
643: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
644: v += n*bs2;
645: if (!usecprow) z += bs;
646: }
647: VecRestoreArray(xx,&x);
648: VecRestoreArray(zz,&zarray);
649: PetscLogFlops(2*a->nz*bs2 - bs*nonzerorow);
650: return(0);
651: }
656: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
657: {
658: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
659: PetscScalar *x,*y = 0,*z = 0,sum,*yarray,*zarray;
660: MatScalar *v;
662: PetscInt mbs=a->mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
663: PetscTruth usecprow=a->compressedrow.use;
666: VecGetArray(xx,&x);
667: VecGetArray(yy,&yarray);
668: if (zz != yy) {
669: VecGetArray(zz,&zarray);
670: } else {
671: zarray = yarray;
672: }
674: idx = a->j;
675: v = a->a;
676: if (usecprow){
677: if (zz != yy){
678: PetscMemcpy(zarray,yarray,mbs*sizeof(PetscScalar));
679: }
680: mbs = a->compressedrow.nrows;
681: ii = a->compressedrow.i;
682: ridx = a->compressedrow.rindex;
683: } else {
684: ii = a->i;
685: y = yarray;
686: z = zarray;
687: }
689: for (i=0; i<mbs; i++) {
690: n = ii[1] - ii[0]; ii++;
691: if (usecprow){
692: z = zarray + ridx[i];
693: y = yarray + ridx[i];
694: }
695: sum = y[0];
696: while (n--) sum += *v++ * x[*idx++];
697: z[0] = sum;
698: if (!usecprow){
699: z++; y++;
700: }
701: }
702: VecRestoreArray(xx,&x);
703: VecRestoreArray(yy,&yarray);
704: if (zz != yy) {
705: VecRestoreArray(zz,&zarray);
706: }
707: PetscLogFlops(2*a->nz);
708: return(0);
709: }
713: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
714: {
715: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
716: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2;
717: PetscScalar x1,x2,*yarray,*zarray;
718: MatScalar *v;
720: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
721: PetscTruth usecprow=a->compressedrow.use;
724: VecGetArray(xx,&x);
725: VecGetArray(yy,&yarray);
726: if (zz != yy) {
727: VecGetArray(zz,&zarray);
728: } else {
729: zarray = yarray;
730: }
732: idx = a->j;
733: v = a->a;
734: if (usecprow){
735: if (zz != yy){
736: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
737: }
738: mbs = a->compressedrow.nrows;
739: ii = a->compressedrow.i;
740: ridx = a->compressedrow.rindex;
741: if (zz != yy){
742: PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
743: }
744: } else {
745: ii = a->i;
746: y = yarray;
747: z = zarray;
748: }
750: for (i=0; i<mbs; i++) {
751: n = ii[1] - ii[0]; ii++;
752: if (usecprow){
753: z = zarray + 2*ridx[i];
754: y = yarray + 2*ridx[i];
755: }
756: sum1 = y[0]; sum2 = y[1];
757: for (j=0; j<n; j++) {
758: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
759: sum1 += v[0]*x1 + v[2]*x2;
760: sum2 += v[1]*x1 + v[3]*x2;
761: v += 4;
762: }
763: z[0] = sum1; z[1] = sum2;
764: if (!usecprow){
765: z += 2; y += 2;
766: }
767: }
768: VecRestoreArray(xx,&x);
769: VecRestoreArray(yy,&yarray);
770: if (zz != yy) {
771: VecRestoreArray(zz,&zarray);
772: }
773: PetscLogFlops(4*a->nz);
774: return(0);
775: }
779: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
780: {
781: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
782: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
783: MatScalar *v;
785: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
786: PetscTruth usecprow=a->compressedrow.use;
789: VecGetArray(xx,&x);
790: VecGetArray(yy,&yarray);
791: if (zz != yy) {
792: VecGetArray(zz,&zarray);
793: } else {
794: zarray = yarray;
795: }
797: idx = a->j;
798: v = a->a;
799: if (usecprow){
800: if (zz != yy){
801: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
802: }
803: mbs = a->compressedrow.nrows;
804: ii = a->compressedrow.i;
805: ridx = a->compressedrow.rindex;
806: } else {
807: ii = a->i;
808: y = yarray;
809: z = zarray;
810: }
812: for (i=0; i<mbs; i++) {
813: n = ii[1] - ii[0]; ii++;
814: if (usecprow){
815: z = zarray + 3*ridx[i];
816: y = yarray + 3*ridx[i];
817: }
818: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
819: for (j=0; j<n; j++) {
820: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
821: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
822: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
823: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
824: v += 9;
825: }
826: z[0] = sum1; z[1] = sum2; z[2] = sum3;
827: if (!usecprow){
828: z += 3; y += 3;
829: }
830: }
831: VecRestoreArray(xx,&x);
832: VecRestoreArray(yy,&yarray);
833: if (zz != yy) {
834: VecRestoreArray(zz,&zarray);
835: }
836: PetscLogFlops(18*a->nz);
837: return(0);
838: }
842: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
843: {
844: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
845: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
846: MatScalar *v;
848: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
849: PetscTruth usecprow=a->compressedrow.use;
852: VecGetArray(xx,&x);
853: VecGetArray(yy,&yarray);
854: if (zz != yy) {
855: VecGetArray(zz,&zarray);
856: } else {
857: zarray = yarray;
858: }
860: idx = a->j;
861: v = a->a;
862: if (usecprow){
863: if (zz != yy){
864: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
865: }
866: mbs = a->compressedrow.nrows;
867: ii = a->compressedrow.i;
868: ridx = a->compressedrow.rindex;
869: } else {
870: ii = a->i;
871: y = yarray;
872: z = zarray;
873: }
875: for (i=0; i<mbs; i++) {
876: n = ii[1] - ii[0]; ii++;
877: if (usecprow){
878: z = zarray + 4*ridx[i];
879: y = yarray + 4*ridx[i];
880: }
881: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
882: for (j=0; j<n; j++) {
883: xb = x + 4*(*idx++);
884: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
885: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
886: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
887: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
888: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
889: v += 16;
890: }
891: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
892: if (!usecprow){
893: z += 4; y += 4;
894: }
895: }
896: VecRestoreArray(xx,&x);
897: VecRestoreArray(yy,&yarray);
898: if (zz != yy) {
899: VecRestoreArray(zz,&zarray);
900: }
901: PetscLogFlops(32*a->nz);
902: return(0);
903: }
907: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
908: {
909: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
910: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
911: PetscScalar *yarray,*zarray;
912: MatScalar *v;
914: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
915: PetscTruth usecprow=a->compressedrow.use;
918: VecGetArray(xx,&x);
919: VecGetArray(yy,&yarray);
920: if (zz != yy) {
921: VecGetArray(zz,&zarray);
922: } else {
923: zarray = yarray;
924: }
926: idx = a->j;
927: v = a->a;
928: if (usecprow){
929: if (zz != yy){
930: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
931: }
932: mbs = a->compressedrow.nrows;
933: ii = a->compressedrow.i;
934: ridx = a->compressedrow.rindex;
935: } else {
936: ii = a->i;
937: y = yarray;
938: z = zarray;
939: }
941: for (i=0; i<mbs; i++) {
942: n = ii[1] - ii[0]; ii++;
943: if (usecprow){
944: z = zarray + 5*ridx[i];
945: y = yarray + 5*ridx[i];
946: }
947: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
948: for (j=0; j<n; j++) {
949: xb = x + 5*(*idx++);
950: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
951: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
952: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
953: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
954: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
955: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
956: v += 25;
957: }
958: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
959: if (!usecprow){
960: z += 5; y += 5;
961: }
962: }
963: VecRestoreArray(xx,&x);
964: VecRestoreArray(yy,&yarray);
965: if (zz != yy) {
966: VecRestoreArray(zz,&zarray);
967: }
968: PetscLogFlops(50*a->nz);
969: return(0);
970: }
973: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
974: {
975: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
976: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
977: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
978: MatScalar *v;
980: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
981: PetscTruth usecprow=a->compressedrow.use;
984: VecGetArray(xx,&x);
985: VecGetArray(yy,&yarray);
986: if (zz != yy) {
987: VecGetArray(zz,&zarray);
988: } else {
989: zarray = yarray;
990: }
992: idx = a->j;
993: v = a->a;
994: if (usecprow){
995: if (zz != yy){
996: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
997: }
998: mbs = a->compressedrow.nrows;
999: ii = a->compressedrow.i;
1000: ridx = a->compressedrow.rindex;
1001: } else {
1002: ii = a->i;
1003: y = yarray;
1004: z = zarray;
1005: }
1007: for (i=0; i<mbs; i++) {
1008: n = ii[1] - ii[0]; ii++;
1009: if (usecprow){
1010: z = zarray + 6*ridx[i];
1011: y = yarray + 6*ridx[i];
1012: }
1013: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1014: for (j=0; j<n; j++) {
1015: xb = x + 6*(*idx++);
1016: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1017: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1018: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1019: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1020: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1021: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1022: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1023: v += 36;
1024: }
1025: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1026: if (!usecprow){
1027: z += 6; y += 6;
1028: }
1029: }
1030: VecRestoreArray(xx,&x);
1031: VecRestoreArray(yy,&yarray);
1032: if (zz != yy) {
1033: VecRestoreArray(zz,&zarray);
1034: }
1035: PetscLogFlops(72*a->nz);
1036: return(0);
1037: }
1041: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1042: {
1043: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1044: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1045: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1046: MatScalar *v;
1048: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1049: PetscTruth usecprow=a->compressedrow.use;
1052: VecGetArray(xx,&x);
1053: VecGetArray(yy,&yarray);
1054: if (zz != yy) {
1055: VecGetArray(zz,&zarray);
1056: } else {
1057: zarray = yarray;
1058: }
1060: idx = a->j;
1061: v = a->a;
1062: if (usecprow){
1063: if (zz != yy){
1064: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1065: }
1066: mbs = a->compressedrow.nrows;
1067: ii = a->compressedrow.i;
1068: ridx = a->compressedrow.rindex;
1069: } else {
1070: ii = a->i;
1071: y = yarray;
1072: z = zarray;
1073: }
1075: for (i=0; i<mbs; i++) {
1076: n = ii[1] - ii[0]; ii++;
1077: if (usecprow){
1078: z = zarray + 7*ridx[i];
1079: y = yarray + 7*ridx[i];
1080: }
1081: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1082: for (j=0; j<n; j++) {
1083: xb = x + 7*(*idx++);
1084: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1085: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1086: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1087: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1088: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1089: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1090: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1091: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1092: v += 49;
1093: }
1094: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1095: if (!usecprow){
1096: z += 7; y += 7;
1097: }
1098: }
1099: VecRestoreArray(xx,&x);
1100: VecRestoreArray(yy,&yarray);
1101: if (zz != yy) {
1102: VecRestoreArray(zz,&zarray);
1103: }
1104: PetscLogFlops(98*a->nz);
1105: return(0);
1106: }
1110: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1111: {
1112: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1113: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
1114: MatScalar *v;
1116: PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1117: PetscInt ncols,k,*ridx=PETSC_NULL;
1118: PetscTruth usecprow=a->compressedrow.use;
1121: VecCopy_Seq(yy,zz);
1122: VecGetArray(xx,&x);
1123: VecGetArray(zz,&zarray);
1125: idx = a->j;
1126: v = a->a;
1127: if (usecprow){
1128: mbs = a->compressedrow.nrows;
1129: ii = a->compressedrow.i;
1130: ridx = a->compressedrow.rindex;
1131: } else {
1132: mbs = a->mbs;
1133: ii = a->i;
1134: z = zarray;
1135: }
1137: if (!a->mult_work) {
1138: k = PetscMax(A->rmap->n,A->cmap->n);
1139: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1140: }
1141: work = a->mult_work;
1142: for (i=0; i<mbs; i++) {
1143: n = ii[1] - ii[0]; ii++;
1144: ncols = n*bs;
1145: workt = work;
1146: for (j=0; j<n; j++) {
1147: xb = x + bs*(*idx++);
1148: for (k=0; k<bs; k++) workt[k] = xb[k];
1149: workt += bs;
1150: }
1151: if (usecprow) z = zarray + bs*ridx[i];
1152: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1153: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1154: v += n*bs2;
1155: if (!usecprow){
1156: z += bs;
1157: }
1158: }
1159: VecRestoreArray(xx,&x);
1160: VecRestoreArray(zz,&zarray);
1161: PetscLogFlops(2*a->nz*bs2);
1162: return(0);
1163: }
1167: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1168: {
1169: PetscScalar zero = 0.0;
1173: VecSet(zz,zero);
1174: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1175: return(0);
1176: }
1180: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1182: {
1183: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1184: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1185: MatScalar *v;
1186: PetscErrorCode ierr;
1187: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1188: Mat_CompressedRow cprow = a->compressedrow;
1189: PetscTruth usecprow=cprow.use;
1192: if (yy != zz) { VecCopy_Seq(yy,zz); }
1193: VecGetArray(xx,&x);
1194: VecGetArray(zz,&z);
1196: idx = a->j;
1197: v = a->a;
1198: if (usecprow){
1199: mbs = cprow.nrows;
1200: ii = cprow.i;
1201: ridx = cprow.rindex;
1202: } else {
1203: mbs=a->mbs;
1204: ii = a->i;
1205: xb = x;
1206: }
1208: switch (bs) {
1209: case 1:
1210: for (i=0; i<mbs; i++) {
1211: if (usecprow) xb = x + ridx[i];
1212: x1 = xb[0];
1213: ib = idx + ii[0];
1214: n = ii[1] - ii[0]; ii++;
1215: for (j=0; j<n; j++) {
1216: rval = ib[j];
1217: z[rval] += *v * x1;
1218: v++;
1219: }
1220: if (!usecprow) xb++;
1221: }
1222: break;
1223: case 2:
1224: for (i=0; i<mbs; i++) {
1225: if (usecprow) xb = x + 2*ridx[i];
1226: x1 = xb[0]; x2 = xb[1];
1227: ib = idx + ii[0];
1228: n = ii[1] - ii[0]; ii++;
1229: for (j=0; j<n; j++) {
1230: rval = ib[j]*2;
1231: z[rval++] += v[0]*x1 + v[1]*x2;
1232: z[rval++] += v[2]*x1 + v[3]*x2;
1233: v += 4;
1234: }
1235: if (!usecprow) xb += 2;
1236: }
1237: break;
1238: case 3:
1239: for (i=0; i<mbs; i++) {
1240: if (usecprow) xb = x + 3*ridx[i];
1241: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1242: ib = idx + ii[0];
1243: n = ii[1] - ii[0]; ii++;
1244: for (j=0; j<n; j++) {
1245: rval = ib[j]*3;
1246: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1247: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1248: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1249: v += 9;
1250: }
1251: if (!usecprow) xb += 3;
1252: }
1253: break;
1254: case 4:
1255: for (i=0; i<mbs; i++) {
1256: if (usecprow) xb = x + 4*ridx[i];
1257: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1258: ib = idx + ii[0];
1259: n = ii[1] - ii[0]; ii++;
1260: for (j=0; j<n; j++) {
1261: rval = ib[j]*4;
1262: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1263: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1264: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1265: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1266: v += 16;
1267: }
1268: if (!usecprow) xb += 4;
1269: }
1270: break;
1271: case 5:
1272: for (i=0; i<mbs; i++) {
1273: if (usecprow) xb = x + 5*ridx[i];
1274: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1275: x4 = xb[3]; x5 = xb[4];
1276: ib = idx + ii[0];
1277: n = ii[1] - ii[0]; ii++;
1278: for (j=0; j<n; j++) {
1279: rval = ib[j]*5;
1280: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1281: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1282: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1283: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1284: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1285: v += 25;
1286: }
1287: if (!usecprow) xb += 5;
1288: }
1289: break;
1290: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1291: PetscInt ncols,k;
1292: PetscScalar *work,*workt,*xtmp;
1294: if (!a->mult_work) {
1295: k = PetscMax(A->rmap->n,A->cmap->n);
1296: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1297: }
1298: work = a->mult_work;
1299: xtmp = x;
1300: for (i=0; i<mbs; i++) {
1301: n = ii[1] - ii[0]; ii++;
1302: ncols = n*bs;
1303: PetscMemzero(work,ncols*sizeof(PetscScalar));
1304: if (usecprow) {
1305: xtmp = x + bs*ridx[i];
1306: }
1307: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1308: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1309: v += n*bs2;
1310: if (!usecprow) xtmp += bs;
1311: workt = work;
1312: for (j=0; j<n; j++) {
1313: zb = z + bs*(*idx++);
1314: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1315: workt += bs;
1316: }
1317: }
1318: }
1319: }
1320: VecRestoreArray(xx,&x);
1321: VecRestoreArray(zz,&z);
1322: PetscLogFlops(2*a->nz*a->bs2);
1323: return(0);
1324: }
1328: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1329: {
1330: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1331: PetscInt totalnz = a->bs2*a->nz;
1332: PetscScalar oalpha = alpha;
1334: PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz);
1337: BLASscal_(&tnz,&oalpha,a->a,&one);
1338: PetscLogFlops(totalnz);
1339: return(0);
1340: }
1344: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1345: {
1347: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1348: MatScalar *v = a->a;
1349: PetscReal sum = 0.0;
1350: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1353: if (type == NORM_FROBENIUS) {
1354: for (i=0; i< bs2*nz; i++) {
1355: #if defined(PETSC_USE_COMPLEX)
1356: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1357: #else
1358: sum += (*v)*(*v); v++;
1359: #endif
1360: }
1361: *norm = sqrt(sum);
1362: } else if (type == NORM_1) { /* maximum column sum */
1363: PetscReal *tmp;
1364: PetscInt *bcol = a->j;
1365: PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);
1366: PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));
1367: for (i=0; i<nz; i++){
1368: for (j=0; j<bs; j++){
1369: k1 = bs*(*bcol) + j; /* column index */
1370: for (k=0; k<bs; k++){
1371: tmp[k1] += PetscAbsScalar(*v); v++;
1372: }
1373: }
1374: bcol++;
1375: }
1376: *norm = 0.0;
1377: for (j=0; j<A->cmap->n; j++) {
1378: if (tmp[j] > *norm) *norm = tmp[j];
1379: }
1380: PetscFree(tmp);
1381: } else if (type == NORM_INFINITY) { /* maximum row sum */
1382: *norm = 0.0;
1383: for (k=0; k<bs; k++) {
1384: for (j=0; j<a->mbs; j++) {
1385: v = a->a + bs2*a->i[j] + k;
1386: sum = 0.0;
1387: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1388: for (k1=0; k1<bs; k1++){
1389: sum += PetscAbsScalar(*v);
1390: v += bs;
1391: }
1392: }
1393: if (sum > *norm) *norm = sum;
1394: }
1395: }
1396: } else {
1397: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1398: }
1399: return(0);
1400: }
1405: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1406: {
1407: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1411: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1412: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1413: *flg = PETSC_FALSE;
1414: return(0);
1415: }
1416:
1417: /* if the a->i are the same */
1418: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1419: if (!*flg) {
1420: return(0);
1421: }
1422:
1423: /* if a->j are the same */
1424: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1425: if (!*flg) {
1426: return(0);
1427: }
1428: /* if a->a are the same */
1429: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
1430: return(0);
1431:
1432: }
1436: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1437: {
1438: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1440: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1441: PetscScalar *x,zero = 0.0;
1442: MatScalar *aa,*aa_j;
1445: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1446: bs = A->rmap->bs;
1447: aa = a->a;
1448: ai = a->i;
1449: aj = a->j;
1450: ambs = a->mbs;
1451: bs2 = a->bs2;
1453: VecSet(v,zero);
1454: VecGetArray(v,&x);
1455: VecGetLocalSize(v,&n);
1456: if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1457: for (i=0; i<ambs; i++) {
1458: for (j=ai[i]; j<ai[i+1]; j++) {
1459: if (aj[j] == i) {
1460: row = i*bs;
1461: aa_j = aa+j*bs2;
1462: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1463: break;
1464: }
1465: }
1466: }
1467: VecRestoreArray(v,&x);
1468: return(0);
1469: }
1473: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1474: {
1475: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1476: PetscScalar *l,*r,x,*li,*ri;
1477: MatScalar *aa,*v;
1479: PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1482: ai = a->i;
1483: aj = a->j;
1484: aa = a->a;
1485: m = A->rmap->n;
1486: n = A->cmap->n;
1487: bs = A->rmap->bs;
1488: mbs = a->mbs;
1489: bs2 = a->bs2;
1490: if (ll) {
1491: VecGetArray(ll,&l);
1492: VecGetLocalSize(ll,&lm);
1493: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1494: for (i=0; i<mbs; i++) { /* for each block row */
1495: M = ai[i+1] - ai[i];
1496: li = l + i*bs;
1497: v = aa + bs2*ai[i];
1498: for (j=0; j<M; j++) { /* for each block */
1499: for (k=0; k<bs2; k++) {
1500: (*v++) *= li[k%bs];
1501: }
1502: }
1503: }
1504: VecRestoreArray(ll,&l);
1505: PetscLogFlops(a->nz);
1506: }
1507:
1508: if (rr) {
1509: VecGetArray(rr,&r);
1510: VecGetLocalSize(rr,&rn);
1511: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1512: for (i=0; i<mbs; i++) { /* for each block row */
1513: M = ai[i+1] - ai[i];
1514: v = aa + bs2*ai[i];
1515: for (j=0; j<M; j++) { /* for each block */
1516: ri = r + bs*aj[ai[i]+j];
1517: for (k=0; k<bs; k++) {
1518: x = ri[k];
1519: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1520: }
1521: }
1522: }
1523: VecRestoreArray(rr,&r);
1524: PetscLogFlops(a->nz);
1525: }
1526: return(0);
1527: }
1532: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1533: {
1534: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1537: info->block_size = a->bs2;
1538: info->nz_allocated = a->maxnz;
1539: info->nz_used = a->bs2*a->nz;
1540: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1541: info->assemblies = A->num_ass;
1542: info->mallocs = a->reallocs;
1543: info->memory = ((PetscObject)A)->mem;
1544: if (A->factor) {
1545: info->fill_ratio_given = A->info.fill_ratio_given;
1546: info->fill_ratio_needed = A->info.fill_ratio_needed;
1547: info->factor_mallocs = A->info.factor_mallocs;
1548: } else {
1549: info->fill_ratio_given = 0;
1550: info->fill_ratio_needed = 0;
1551: info->factor_mallocs = 0;
1552: }
1553: return(0);
1554: }
1559: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1560: {
1561: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1565: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1566: return(0);
1567: }