Actual source code: inode.c
1: #define PETSCMAT_DLL
3: /*
4: This file provides high performance routines for the Inode format (compressed sparse row)
5: by taking advantage of rows with identical nonzero structure (I-nodes).
6: */
7: #include ../src/mat/impls/aij/seq/aij.h
11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12: {
13: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
15: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
18: n = A->cmap->n;
19: m = A->rmap->n;
20: ns_row = a->inode.size;
21:
22: min_mn = (m < n) ? m : n;
23: if (!ns) {
24: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25: for(; count+1 < n; count++,i++);
26: if (count < n) {
27: i++;
28: }
29: *size = i;
30: return(0);
31: }
32: PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
33:
34: /* Use the same row structure wherever feasible. */
35: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36: ns_col[i] = ns_row[i];
37: }
39: /* if m < n; pad up the remainder with inode_limit */
40: for(; count+1 < n; count++,i++) {
41: ns_col[i] = 1;
42: }
43: /* The last node is the odd ball. padd it up with the remaining rows; */
44: if (count < n) {
45: ns_col[i] = n - count;
46: i++;
47: } else if (count > n) {
48: /* Adjust for the over estimation */
49: ns_col[i-1] += n - count;
50: }
51: *size = i;
52: *ns = ns_col;
53: return(0);
54: }
57: /*
58: This builds symmetric version of nonzero structure,
59: */
62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63: {
64: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
66: PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap->n;
72: n = A->cmap->n;
73: if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74:
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
81: PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1){
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col)
87: tvc[col] = i1;
88: }
89: /* allocate space for row pointers */
90: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
91: *iia = ia;
92: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
93: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
95: /* determine the number of columns in each row */
96: ia[0] = oshift;
97: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
99: j = aj + ai[row] + ishift;
100: jmax = aj + ai[row+1] + ishift;
101: i2 = 0;
102: col = *j++ + ishift;
103: i2 = tvc[col];
104: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105: ia[i1+1]++;
106: ia[i2+1]++;
107: i2++; /* Start col of next node */
108: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109: i2 = tvc[col];
110: }
111: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
112: }
114: /* shift ia[i] to point to next row */
115: for (i1=1; i1<nslim_row+1; i1++) {
116: row = ia[i1-1];
117: ia[i1] += row;
118: work[i1-1] = row - oshift;
119: }
121: /* allocate space for column pointers */
122: nz = ia[nslim_row] + (!ishift);
123: PetscMalloc(nz*sizeof(PetscInt),&ja);
124: *jja = ja;
126: /* loop over lower triangular part putting into ja */
127: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128: j = aj + ai[row] + ishift;
129: jmax = aj + ai[row+1] + ishift;
130: i2 = 0; /* Col inode index */
131: col = *j++ + ishift;
132: i2 = tvc[col];
133: while (i2<i1 && j<jmax) {
134: ja[work[i2]++] = i1 + oshift;
135: ja[work[i1]++] = i2 + oshift;
136: ++i2;
137: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138: i2 = tvc[col];
139: }
140: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
142: }
143: PetscFree(work);
144: PetscFree(tns);
145: PetscFree(tvc);
146: return(0);
147: }
149: /*
150: This builds nonsymmetric version of nonzero structure,
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
158: PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
162: nslim_row = a->inode.node_count;
163: n = A->cmap->n;
165: /* Create The column_inode for this matrix */
166: Mat_CreateColInode(A,&nslim_col,&ns_col);
167:
168: /* allocate space for reformated column_inode structure */
169: PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170: PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
173: for (i1=0,col=0; i1<nslim_col; ++i1){
174: nsz = ns_col[i1];
175: for (i2=0; i2<nsz; ++i2,++col)
176: tvc[col] = i1;
177: }
178: /* allocate space for row pointers */
179: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180: *iia = ia;
181: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
184: /* determine the number of columns in each row */
185: ia[0] = oshift;
186: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187: j = aj + ai[row] + ishift;
188: col = *j++ + ishift;
189: i2 = tvc[col];
190: nz = ai[row+1] - ai[row];
191: while (nz-- > 0) { /* off-diagonal elemets */
192: ia[i1+1]++;
193: i2++; /* Start col of next node */
194: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195: if (nz > 0) i2 = tvc[col];
196: }
197: }
199: /* shift ia[i] to point to next row */
200: for (i1=1; i1<nslim_row+1; i1++) {
201: row = ia[i1-1];
202: ia[i1] += row;
203: work[i1-1] = row - oshift;
204: }
206: /* allocate space for column pointers */
207: nz = ia[nslim_row] + (!ishift);
208: PetscMalloc(nz*sizeof(PetscInt),&ja);
209: *jja = ja;
211: /* loop over matrix putting into ja */
212: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213: j = aj + ai[row] + ishift;
214: i2 = 0; /* Col inode index */
215: col = *j++ + ishift;
216: i2 = tvc[col];
217: nz = ai[row+1] - ai[row];
218: while (nz-- > 0) {
219: ja[work[i1]++] = i2 + oshift;
220: ++i2;
221: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222: if (nz > 0) i2 = tvc[col];
223: }
224: }
225: PetscFree(ns_col);
226: PetscFree(work);
227: PetscFree(tns);
228: PetscFree(tvc);
229: return(0);
230: }
234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
240: *n = a->inode.node_count;
241: if (!ia) return(0);
242: if (!blockcompressed) {
243: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244: } else if (symmetric) {
245: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246: } else {
247: MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248: }
249: return(0);
250: }
254: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255: {
259: if (!ia) return(0);
261: if (!blockcompressed) {
262: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263: } else {
264: PetscFree(*ia);
265: PetscFree(*ja);
266: }
268: return(0);
269: }
271: /* ----------------------------------------------------------- */
275: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276: {
277: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
279: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
283: nslim_row = a->inode.node_count;
284: n = A->cmap->n;
286: /* Create The column_inode for this matrix */
287: Mat_CreateColInode(A,&nslim_col,&ns_col);
288:
289: /* allocate space for reformated column_inode structure */
290: PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
291: PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
292: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
294: for (i1=0,col=0; i1<nslim_col; ++i1){
295: nsz = ns_col[i1];
296: for (i2=0; i2<nsz; ++i2,++col)
297: tvc[col] = i1;
298: }
299: /* allocate space for column pointers */
300: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
301: *iia = ia;
302: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
303: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);
305: /* determine the number of columns in each row */
306: ia[0] = oshift;
307: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308: j = aj + ai[row] + ishift;
309: col = *j++ + ishift;
310: i2 = tvc[col];
311: nz = ai[row+1] - ai[row];
312: while (nz-- > 0) { /* off-diagonal elemets */
313: /* ia[i1+1]++; */
314: ia[i2+1]++;
315: i2++;
316: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317: if (nz > 0) i2 = tvc[col];
318: }
319: }
321: /* shift ia[i] to point to next col */
322: for (i1=1; i1<nslim_col+1; i1++) {
323: col = ia[i1-1];
324: ia[i1] += col;
325: work[i1-1] = col - oshift;
326: }
328: /* allocate space for column pointers */
329: nz = ia[nslim_col] + (!ishift);
330: PetscMalloc(nz*sizeof(PetscInt),&ja);
331: *jja = ja;
333: /* loop over matrix putting into ja */
334: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335: j = aj + ai[row] + ishift;
336: i2 = 0; /* Col inode index */
337: col = *j++ + ishift;
338: i2 = tvc[col];
339: nz = ai[row+1] - ai[row];
340: while (nz-- > 0) {
341: /* ja[work[i1]++] = i2 + oshift; */
342: ja[work[i2]++] = i1 + oshift;
343: i2++;
344: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345: if (nz > 0) i2 = tvc[col];
346: }
347: }
348: PetscFree(ns_col);
349: PetscFree(work);
350: PetscFree(tns);
351: PetscFree(tvc);
352: return(0);
353: }
357: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358: {
362: Mat_CreateColInode(A,n,PETSC_NULL);
363: if (!ia) return(0);
365: if (!blockcompressed) {
366: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367: } else if (symmetric) {
368: /* Since the indices are symmetric it does'nt matter */
369: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370: } else {
371: MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372: }
373: return(0);
374: }
378: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379: {
383: if (!ia) return(0);
384: if (!blockcompressed) {
385: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
386: } else {
387: PetscFree(*ia);
388: PetscFree(*ja);
389: }
390: return(0);
391: }
393: /* ----------------------------------------------------------- */
397: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398: {
399: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
400: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401: PetscScalar *y;
402: const PetscScalar *x;
403: const MatScalar *v1,*v2,*v3,*v4,*v5;
404: PetscErrorCode ierr;
405: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406:
407: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409: #endif
412: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413: node_max = a->inode.node_count;
414: ns = a->inode.size; /* Node Size array */
415: VecGetArray(xx,(PetscScalar**)&x);
416: VecGetArray(yy,&y);
417: idx = a->j;
418: v1 = a->a;
419: ii = a->i;
421: for (i = 0,row = 0; i< node_max; ++i){
422: nsz = ns[i];
423: n = ii[1] - ii[0];
424: nonzerorow += (n>0)*nsz;
425: ii += nsz;
426: sz = n; /* No of non zeros in this row */
427: /* Switch on the size of Node */
428: switch (nsz){ /* Each loop in 'case' is unrolled */
429: case 1 :
430: sum1 = 0;
431:
432: for(n = 0; n< sz-1; n+=2) {
433: i1 = idx[0]; /* The instructions are ordered to */
434: i2 = idx[1]; /* make the compiler's job easy */
435: idx += 2;
436: tmp0 = x[i1];
437: tmp1 = x[i2];
438: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439: }
440:
441: if (n == sz-1){ /* Take care of the last nonzero */
442: tmp0 = x[*idx++];
443: sum1 += *v1++ * tmp0;
444: }
445: y[row++]=sum1;
446: break;
447: case 2:
448: sum1 = 0;
449: sum2 = 0;
450: v2 = v1 + n;
451:
452: for (n = 0; n< sz-1; n+=2) {
453: i1 = idx[0];
454: i2 = idx[1];
455: idx += 2;
456: tmp0 = x[i1];
457: tmp1 = x[i2];
458: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460: }
461: if (n == sz-1){
462: tmp0 = x[*idx++];
463: sum1 += *v1++ * tmp0;
464: sum2 += *v2++ * tmp0;
465: }
466: y[row++]=sum1;
467: y[row++]=sum2;
468: v1 =v2; /* Since the next block to be processed starts there*/
469: idx +=sz;
470: break;
471: case 3:
472: sum1 = 0;
473: sum2 = 0;
474: sum3 = 0;
475: v2 = v1 + n;
476: v3 = v2 + n;
477:
478: for (n = 0; n< sz-1; n+=2) {
479: i1 = idx[0];
480: i2 = idx[1];
481: idx += 2;
482: tmp0 = x[i1];
483: tmp1 = x[i2];
484: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487: }
488: if (n == sz-1){
489: tmp0 = x[*idx++];
490: sum1 += *v1++ * tmp0;
491: sum2 += *v2++ * tmp0;
492: sum3 += *v3++ * tmp0;
493: }
494: y[row++]=sum1;
495: y[row++]=sum2;
496: y[row++]=sum3;
497: v1 =v3; /* Since the next block to be processed starts there*/
498: idx +=2*sz;
499: break;
500: case 4:
501: sum1 = 0;
502: sum2 = 0;
503: sum3 = 0;
504: sum4 = 0;
505: v2 = v1 + n;
506: v3 = v2 + n;
507: v4 = v3 + n;
508:
509: for (n = 0; n< sz-1; n+=2) {
510: i1 = idx[0];
511: i2 = idx[1];
512: idx += 2;
513: tmp0 = x[i1];
514: tmp1 = x[i2];
515: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519: }
520: if (n == sz-1){
521: tmp0 = x[*idx++];
522: sum1 += *v1++ * tmp0;
523: sum2 += *v2++ * tmp0;
524: sum3 += *v3++ * tmp0;
525: sum4 += *v4++ * tmp0;
526: }
527: y[row++]=sum1;
528: y[row++]=sum2;
529: y[row++]=sum3;
530: y[row++]=sum4;
531: v1 =v4; /* Since the next block to be processed starts there*/
532: idx +=3*sz;
533: break;
534: case 5:
535: sum1 = 0;
536: sum2 = 0;
537: sum3 = 0;
538: sum4 = 0;
539: sum5 = 0;
540: v2 = v1 + n;
541: v3 = v2 + n;
542: v4 = v3 + n;
543: v5 = v4 + n;
544:
545: for (n = 0; n<sz-1; n+=2) {
546: i1 = idx[0];
547: i2 = idx[1];
548: idx += 2;
549: tmp0 = x[i1];
550: tmp1 = x[i2];
551: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556: }
557: if (n == sz-1){
558: tmp0 = x[*idx++];
559: sum1 += *v1++ * tmp0;
560: sum2 += *v2++ * tmp0;
561: sum3 += *v3++ * tmp0;
562: sum4 += *v4++ * tmp0;
563: sum5 += *v5++ * tmp0;
564: }
565: y[row++]=sum1;
566: y[row++]=sum2;
567: y[row++]=sum3;
568: y[row++]=sum4;
569: y[row++]=sum5;
570: v1 =v5; /* Since the next block to be processed starts there */
571: idx +=4*sz;
572: break;
573: default :
574: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
575: }
576: }
577: VecRestoreArray(xx,(PetscScalar**)&x);
578: VecRestoreArray(yy,&y);
579: PetscLogFlops(2*a->nz - nonzerorow);
580: return(0);
581: }
582: /* ----------------------------------------------------------- */
583: /* Almost same code as the MatMult_Inode() */
586: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587: {
588: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
589: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590: MatScalar *v1,*v2,*v3,*v4,*v5;
591: PetscScalar *x,*y,*z,*zt;
593: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
594:
596: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
597: node_max = a->inode.node_count;
598: ns = a->inode.size; /* Node Size array */
599: VecGetArray(xx,&x);
600: VecGetArray(yy,&y);
601: if (zz != yy) {
602: VecGetArray(zz,&z);
603: } else {
604: z = y;
605: }
606: zt = z;
608: idx = a->j;
609: v1 = a->a;
610: ii = a->i;
612: for (i = 0,row = 0; i< node_max; ++i){
613: nsz = ns[i];
614: n = ii[1] - ii[0];
615: ii += nsz;
616: sz = n; /* No of non zeros in this row */
617: /* Switch on the size of Node */
618: switch (nsz){ /* Each loop in 'case' is unrolled */
619: case 1 :
620: sum1 = *zt++;
621:
622: for(n = 0; n< sz-1; n+=2) {
623: i1 = idx[0]; /* The instructions are ordered to */
624: i2 = idx[1]; /* make the compiler's job easy */
625: idx += 2;
626: tmp0 = x[i1];
627: tmp1 = x[i2];
628: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
629: }
630:
631: if(n == sz-1){ /* Take care of the last nonzero */
632: tmp0 = x[*idx++];
633: sum1 += *v1++ * tmp0;
634: }
635: y[row++]=sum1;
636: break;
637: case 2:
638: sum1 = *zt++;
639: sum2 = *zt++;
640: v2 = v1 + n;
641:
642: for(n = 0; n< sz-1; n+=2) {
643: i1 = idx[0];
644: i2 = idx[1];
645: idx += 2;
646: tmp0 = x[i1];
647: tmp1 = x[i2];
648: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
649: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
650: }
651: if(n == sz-1){
652: tmp0 = x[*idx++];
653: sum1 += *v1++ * tmp0;
654: sum2 += *v2++ * tmp0;
655: }
656: y[row++]=sum1;
657: y[row++]=sum2;
658: v1 =v2; /* Since the next block to be processed starts there*/
659: idx +=sz;
660: break;
661: case 3:
662: sum1 = *zt++;
663: sum2 = *zt++;
664: sum3 = *zt++;
665: v2 = v1 + n;
666: v3 = v2 + n;
667:
668: for (n = 0; n< sz-1; n+=2) {
669: i1 = idx[0];
670: i2 = idx[1];
671: idx += 2;
672: tmp0 = x[i1];
673: tmp1 = x[i2];
674: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
675: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
676: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
677: }
678: if (n == sz-1){
679: tmp0 = x[*idx++];
680: sum1 += *v1++ * tmp0;
681: sum2 += *v2++ * tmp0;
682: sum3 += *v3++ * tmp0;
683: }
684: y[row++]=sum1;
685: y[row++]=sum2;
686: y[row++]=sum3;
687: v1 =v3; /* Since the next block to be processed starts there*/
688: idx +=2*sz;
689: break;
690: case 4:
691: sum1 = *zt++;
692: sum2 = *zt++;
693: sum3 = *zt++;
694: sum4 = *zt++;
695: v2 = v1 + n;
696: v3 = v2 + n;
697: v4 = v3 + n;
698:
699: for (n = 0; n< sz-1; n+=2) {
700: i1 = idx[0];
701: i2 = idx[1];
702: idx += 2;
703: tmp0 = x[i1];
704: tmp1 = x[i2];
705: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
706: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
707: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
708: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
709: }
710: if (n == sz-1){
711: tmp0 = x[*idx++];
712: sum1 += *v1++ * tmp0;
713: sum2 += *v2++ * tmp0;
714: sum3 += *v3++ * tmp0;
715: sum4 += *v4++ * tmp0;
716: }
717: y[row++]=sum1;
718: y[row++]=sum2;
719: y[row++]=sum3;
720: y[row++]=sum4;
721: v1 =v4; /* Since the next block to be processed starts there*/
722: idx +=3*sz;
723: break;
724: case 5:
725: sum1 = *zt++;
726: sum2 = *zt++;
727: sum3 = *zt++;
728: sum4 = *zt++;
729: sum5 = *zt++;
730: v2 = v1 + n;
731: v3 = v2 + n;
732: v4 = v3 + n;
733: v5 = v4 + n;
734:
735: for (n = 0; n<sz-1; n+=2) {
736: i1 = idx[0];
737: i2 = idx[1];
738: idx += 2;
739: tmp0 = x[i1];
740: tmp1 = x[i2];
741: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
742: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
743: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
744: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
745: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
746: }
747: if(n == sz-1){
748: tmp0 = x[*idx++];
749: sum1 += *v1++ * tmp0;
750: sum2 += *v2++ * tmp0;
751: sum3 += *v3++ * tmp0;
752: sum4 += *v4++ * tmp0;
753: sum5 += *v5++ * tmp0;
754: }
755: y[row++]=sum1;
756: y[row++]=sum2;
757: y[row++]=sum3;
758: y[row++]=sum4;
759: y[row++]=sum5;
760: v1 =v5; /* Since the next block to be processed starts there */
761: idx +=4*sz;
762: break;
763: default :
764: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
765: }
766: }
767: VecRestoreArray(xx,&x);
768: VecRestoreArray(yy,&y);
769: if (zz != yy) {
770: VecRestoreArray(zz,&z);
771: }
772: PetscLogFlops(2*a->nz);
773: return(0);
774: }
776: /* ----------------------------------------------------------- */
779: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
780: {
781: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
782: IS iscol = a->col,isrow = a->row;
783: PetscErrorCode ierr;
784: const PetscInt *r,*c,*rout,*cout;
785: PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
786: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
787: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
788: PetscScalar sum1,sum2,sum3,sum4,sum5;
789: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790: const PetscScalar *b;
793: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
794: node_max = a->inode.node_count;
795: ns = a->inode.size; /* Node Size array */
797: VecGetArray(bb,(PetscScalar**)&b);
798: VecGetArray(xx,&x);
799: tmp = a->solve_work;
800:
801: ISGetIndices(isrow,&rout); r = rout;
802: ISGetIndices(iscol,&cout); c = cout + (n-1);
803:
804: /* forward solve the lower triangular */
805: tmps = tmp ;
806: aa = a_a ;
807: aj = a_j ;
808: ad = a->diag;
810: for (i = 0,row = 0; i< node_max; ++i){
811: nsz = ns[i];
812: aii = ai[row];
813: v1 = aa + aii;
814: vi = aj + aii;
815: nz = ad[row]- aii;
816:
817: switch (nsz){ /* Each loop in 'case' is unrolled */
818: case 1 :
819: sum1 = b[*r++];
820: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
821: for(j=0; j<nz-1; j+=2){
822: i0 = vi[0];
823: i1 = vi[1];
824: vi +=2;
825: tmp0 = tmps[i0];
826: tmp1 = tmps[i1];
827: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828: }
829: if(j == nz-1){
830: tmp0 = tmps[*vi++];
831: sum1 -= *v1++ *tmp0;
832: }
833: tmp[row ++]=sum1;
834: break;
835: case 2:
836: sum1 = b[*r++];
837: sum2 = b[*r++];
838: v2 = aa + ai[row+1];
840: for(j=0; j<nz-1; j+=2){
841: i0 = vi[0];
842: i1 = vi[1];
843: vi +=2;
844: tmp0 = tmps[i0];
845: tmp1 = tmps[i1];
846: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
847: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
848: }
849: if(j == nz-1){
850: tmp0 = tmps[*vi++];
851: sum1 -= *v1++ *tmp0;
852: sum2 -= *v2++ *tmp0;
853: }
854: sum2 -= *v2++ * sum1;
855: tmp[row ++]=sum1;
856: tmp[row ++]=sum2;
857: break;
858: case 3:
859: sum1 = b[*r++];
860: sum2 = b[*r++];
861: sum3 = b[*r++];
862: v2 = aa + ai[row+1];
863: v3 = aa + ai[row+2];
864:
865: for (j=0; j<nz-1; j+=2){
866: i0 = vi[0];
867: i1 = vi[1];
868: vi +=2;
869: tmp0 = tmps[i0];
870: tmp1 = tmps[i1];
871: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
872: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
873: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
874: }
875: if (j == nz-1){
876: tmp0 = tmps[*vi++];
877: sum1 -= *v1++ *tmp0;
878: sum2 -= *v2++ *tmp0;
879: sum3 -= *v3++ *tmp0;
880: }
881: sum2 -= *v2++ * sum1;
882: sum3 -= *v3++ * sum1;
883: sum3 -= *v3++ * sum2;
884: tmp[row ++]=sum1;
885: tmp[row ++]=sum2;
886: tmp[row ++]=sum3;
887: break;
888:
889: case 4:
890: sum1 = b[*r++];
891: sum2 = b[*r++];
892: sum3 = b[*r++];
893: sum4 = b[*r++];
894: v2 = aa + ai[row+1];
895: v3 = aa + ai[row+2];
896: v4 = aa + ai[row+3];
897:
898: for (j=0; j<nz-1; j+=2){
899: i0 = vi[0];
900: i1 = vi[1];
901: vi +=2;
902: tmp0 = tmps[i0];
903: tmp1 = tmps[i1];
904: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908: }
909: if (j == nz-1){
910: tmp0 = tmps[*vi++];
911: sum1 -= *v1++ *tmp0;
912: sum2 -= *v2++ *tmp0;
913: sum3 -= *v3++ *tmp0;
914: sum4 -= *v4++ *tmp0;
915: }
916: sum2 -= *v2++ * sum1;
917: sum3 -= *v3++ * sum1;
918: sum4 -= *v4++ * sum1;
919: sum3 -= *v3++ * sum2;
920: sum4 -= *v4++ * sum2;
921: sum4 -= *v4++ * sum3;
922:
923: tmp[row ++]=sum1;
924: tmp[row ++]=sum2;
925: tmp[row ++]=sum3;
926: tmp[row ++]=sum4;
927: break;
928: case 5:
929: sum1 = b[*r++];
930: sum2 = b[*r++];
931: sum3 = b[*r++];
932: sum4 = b[*r++];
933: sum5 = b[*r++];
934: v2 = aa + ai[row+1];
935: v3 = aa + ai[row+2];
936: v4 = aa + ai[row+3];
937: v5 = aa + ai[row+4];
938:
939: for (j=0; j<nz-1; j+=2){
940: i0 = vi[0];
941: i1 = vi[1];
942: vi +=2;
943: tmp0 = tmps[i0];
944: tmp1 = tmps[i1];
945: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950: }
951: if (j == nz-1){
952: tmp0 = tmps[*vi++];
953: sum1 -= *v1++ *tmp0;
954: sum2 -= *v2++ *tmp0;
955: sum3 -= *v3++ *tmp0;
956: sum4 -= *v4++ *tmp0;
957: sum5 -= *v5++ *tmp0;
958: }
960: sum2 -= *v2++ * sum1;
961: sum3 -= *v3++ * sum1;
962: sum4 -= *v4++ * sum1;
963: sum5 -= *v5++ * sum1;
964: sum3 -= *v3++ * sum2;
965: sum4 -= *v4++ * sum2;
966: sum5 -= *v5++ * sum2;
967: sum4 -= *v4++ * sum3;
968: sum5 -= *v5++ * sum3;
969: sum5 -= *v5++ * sum4;
970:
971: tmp[row ++]=sum1;
972: tmp[row ++]=sum2;
973: tmp[row ++]=sum3;
974: tmp[row ++]=sum4;
975: tmp[row ++]=sum5;
976: break;
977: default:
978: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
979: }
980: }
981: /* backward solve the upper triangular */
982: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
983: nsz = ns[i];
984: aii = ai[row+1] -1;
985: v1 = aa + aii;
986: vi = aj + aii;
987: nz = aii- ad[row];
988: switch (nsz){ /* Each loop in 'case' is unrolled */
989: case 1 :
990: sum1 = tmp[row];
992: for(j=nz ; j>1; j-=2){
993: vi -=2;
994: i0 = vi[2];
995: i1 = vi[1];
996: tmp0 = tmps[i0];
997: tmp1 = tmps[i1];
998: v1 -= 2;
999: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000: }
1001: if (j==1){
1002: tmp0 = tmps[*vi--];
1003: sum1 -= *v1-- * tmp0;
1004: }
1005: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006: break;
1007: case 2 :
1008: sum1 = tmp[row];
1009: sum2 = tmp[row -1];
1010: v2 = aa + ai[row]-1;
1011: for (j=nz ; j>1; j-=2){
1012: vi -=2;
1013: i0 = vi[2];
1014: i1 = vi[1];
1015: tmp0 = tmps[i0];
1016: tmp1 = tmps[i1];
1017: v1 -= 2;
1018: v2 -= 2;
1019: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021: }
1022: if (j==1){
1023: tmp0 = tmps[*vi--];
1024: sum1 -= *v1-- * tmp0;
1025: sum2 -= *v2-- * tmp0;
1026: }
1027:
1028: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029: sum2 -= *v2-- * tmp0;
1030: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031: break;
1032: case 3 :
1033: sum1 = tmp[row];
1034: sum2 = tmp[row -1];
1035: sum3 = tmp[row -2];
1036: v2 = aa + ai[row]-1;
1037: v3 = aa + ai[row -1]-1;
1038: for (j=nz ; j>1; j-=2){
1039: vi -=2;
1040: i0 = vi[2];
1041: i1 = vi[1];
1042: tmp0 = tmps[i0];
1043: tmp1 = tmps[i1];
1044: v1 -= 2;
1045: v2 -= 2;
1046: v3 -= 2;
1047: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050: }
1051: if (j==1){
1052: tmp0 = tmps[*vi--];
1053: sum1 -= *v1-- * tmp0;
1054: sum2 -= *v2-- * tmp0;
1055: sum3 -= *v3-- * tmp0;
1056: }
1057: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058: sum2 -= *v2-- * tmp0;
1059: sum3 -= *v3-- * tmp0;
1060: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061: sum3 -= *v3-- * tmp0;
1062: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1063:
1064: break;
1065: case 4 :
1066: sum1 = tmp[row];
1067: sum2 = tmp[row -1];
1068: sum3 = tmp[row -2];
1069: sum4 = tmp[row -3];
1070: v2 = aa + ai[row]-1;
1071: v3 = aa + ai[row -1]-1;
1072: v4 = aa + ai[row -2]-1;
1074: for (j=nz ; j>1; j-=2){
1075: vi -=2;
1076: i0 = vi[2];
1077: i1 = vi[1];
1078: tmp0 = tmps[i0];
1079: tmp1 = tmps[i1];
1080: v1 -= 2;
1081: v2 -= 2;
1082: v3 -= 2;
1083: v4 -= 2;
1084: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088: }
1089: if (j==1){
1090: tmp0 = tmps[*vi--];
1091: sum1 -= *v1-- * tmp0;
1092: sum2 -= *v2-- * tmp0;
1093: sum3 -= *v3-- * tmp0;
1094: sum4 -= *v4-- * tmp0;
1095: }
1097: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098: sum2 -= *v2-- * tmp0;
1099: sum3 -= *v3-- * tmp0;
1100: sum4 -= *v4-- * tmp0;
1101: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102: sum3 -= *v3-- * tmp0;
1103: sum4 -= *v4-- * tmp0;
1104: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105: sum4 -= *v4-- * tmp0;
1106: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107: break;
1108: case 5 :
1109: sum1 = tmp[row];
1110: sum2 = tmp[row -1];
1111: sum3 = tmp[row -2];
1112: sum4 = tmp[row -3];
1113: sum5 = tmp[row -4];
1114: v2 = aa + ai[row]-1;
1115: v3 = aa + ai[row -1]-1;
1116: v4 = aa + ai[row -2]-1;
1117: v5 = aa + ai[row -3]-1;
1118: for (j=nz ; j>1; j-=2){
1119: vi -= 2;
1120: i0 = vi[2];
1121: i1 = vi[1];
1122: tmp0 = tmps[i0];
1123: tmp1 = tmps[i1];
1124: v1 -= 2;
1125: v2 -= 2;
1126: v3 -= 2;
1127: v4 -= 2;
1128: v5 -= 2;
1129: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134: }
1135: if (j==1){
1136: tmp0 = tmps[*vi--];
1137: sum1 -= *v1-- * tmp0;
1138: sum2 -= *v2-- * tmp0;
1139: sum3 -= *v3-- * tmp0;
1140: sum4 -= *v4-- * tmp0;
1141: sum5 -= *v5-- * tmp0;
1142: }
1144: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145: sum2 -= *v2-- * tmp0;
1146: sum3 -= *v3-- * tmp0;
1147: sum4 -= *v4-- * tmp0;
1148: sum5 -= *v5-- * tmp0;
1149: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150: sum3 -= *v3-- * tmp0;
1151: sum4 -= *v4-- * tmp0;
1152: sum5 -= *v5-- * tmp0;
1153: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154: sum4 -= *v4-- * tmp0;
1155: sum5 -= *v5-- * tmp0;
1156: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157: sum5 -= *v5-- * tmp0;
1158: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159: break;
1160: default:
1161: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1162: }
1163: }
1164: ISRestoreIndices(isrow,&rout);
1165: ISRestoreIndices(iscol,&cout);
1166: VecRestoreArray(bb,(PetscScalar**)&b);
1167: VecRestoreArray(xx,&x);
1168: PetscLogFlops(2*a->nz - A->cmap->n);
1169: return(0);
1170: }
/*
   MatLUFactorNumeric_Inode - Numeric LU factorization that eliminates the rows of
   each inode (a group of 1, 2 or 3 consecutive rows) together.

   Input Parameters:
+  B    - factor matrix; its i, j and diag arrays and its row, col and icol index
          sets are read, and its value array (b->a) is overwritten with the factors
.  A    - matrix being factored; its i, j, a arrays and inode information are read
-  info - factorization options; info->shiftpd and info->shiftnz are consulted here

   Returns 0 on the normal path.

   Notes:
   Three dense work rows (rtmp11, rtmp22, rtmp33) hold the rows of the current
   inode while they are eliminated. The diagonal of every finished row is stored
   inverted, so later eliminations multiply by it instead of dividing.

   Inodes with more than 3 rows are split in two before factoring (the code assumes
   sizes of at most 5); any remaining size other than 1, 2 or 3 ends in SETERRQ.

   The whole factorization is wrapped in a do/while loop that repeats while
   sctx.lushift is set; presumably MatLUCheckShift_inline() sets that flag and
   updates sctx.shift_amount when a pivot is rejected - confirm against the macro.

   NOTE(review): none of the return codes of the PETSc calls below are checked and
   the declared ierr is never used.
   NOTE(review): the "Matrix without inode information" error is raised after
   rtmp11 has been allocated and the index sets have been acquired; if SETERRQ
   returns from the function these are not released - confirm.
*/
1174: PetscErrorCode MatLUFactorNumeric_Inode(Mat B,Mat A,const MatFactorInfo *info)
1175: {
1176: Mat C = B;
1177: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1178: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1179: PetscErrorCode ierr;
1180: const PetscInt *r,*ic,*c,*ics;
1181: PetscInt n = A->rmap->n,*bi = b->i;
/* nbj = bj + 1, so nbj + bd[k] addresses the column indices just past the diagonal of row k */
1182: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1183: PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1184: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1185: PetscScalar mul1,mul2,mul3,tmp;
1186: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1187: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1188: PetscReal rs=0.0;
1189: LUShift_Ctx sctx;
1190: PetscInt newshift;
/* start from a shift context with every field cleared */
1193: sctx.shift_top = 0;
1194: sctx.nshift_max = 0;
1195: sctx.shift_lo = 0;
1196: sctx.shift_hi = 0;
1197: sctx.shift_fraction = 0;
1199: /* if both shift schemes are chosen by user, only use info->shiftpd */
1200: if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1201: sctx.shift_top = 0;
1202: for (i=0; i<n; i++) {
1203: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1204: rs = 0.0;
1205: ajtmp = aj + ai[i];
1206: rtmp1 = aa + ai[i];
1207: nz = ai[i+1] - ai[i];
1208: for (j=0; j<nz; j++){
1209: if (*ajtmp != i){
1210: rs += PetscAbsScalar(*rtmp1++);
1211: } else {
1212: rs -= PetscRealPart(*rtmp1++);
1213: }
1214: ajtmp++;
1215: }
1216: if (rs>sctx.shift_top) sctx.shift_top = rs;
1217: }
/* never leave shift_top at exactly zero, then add a 10 percent margin */
1218: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1219: sctx.shift_top *= 1.1;
1220: sctx.nshift_max = 5;
1221: sctx.shift_lo = 0.;
1222: sctx.shift_hi = 1.;
1223: }
1224: sctx.shift_amount = 0;
1225: sctx.nshift = 0;
1227: ISGetIndices(isrow,&r);
1228: ISGetIndices(iscol,&c);
1229: ISGetIndices(isicol,&ic);
/* a single zeroed allocation of 3*n+1 scalars supplies the three dense work rows of length n */
1230: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);
1231: PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));
1232: ics = ic ;
1233: rtmp22 = rtmp11 + n;
1234: rtmp33 = rtmp22 + n;
1235:
1236: node_max = a->inode.node_count;
1237: ns = a->inode.size;
1238: if (!ns){
1239: SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1240: }
1242: /* If max inode size > 3, split it into two inodes.*/
1243: /* also map the inode sizes according to the ordering */
1244: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
/* i walks the original inodes, j the (possibly longer) list of split inodes */
1245: for (i=0,j=0; i<node_max; ++i,++j){
1246: if (ns[i]>3) {
1247: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1248: ++j;
1249: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1250: } else {
1251: tmp_vec1[j] = ns[i];
1252: }
1253: }
1254: /* Use the correct node_max */
1255: node_max = j;
1257: /* Now reorder the inode info based on mat re-ordering info */
1258: /* First create a row -> inode_size_array_index map */
1259: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1260: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1261: for (i=0,row=0; i<node_max; i++) {
1262: nodesz = tmp_vec1[i];
1263: for (j=0; j<nodesz; j++,row++) {
1264: nsmap[row] = i;
1265: }
1266: }
1267: /* Using nsmap, create a reordered ns structure */
1268: for (i=0,j=0; i< node_max; i++) {
1269: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1270: tmp_vec2[i] = nodesz;
1271: j += nodesz;
1272: }
1273: PetscFree(nsmap);
1274: PetscFree(tmp_vec1);
1275: /* Now use the correct ns */
1276: ns = tmp_vec2;
/* Factor; a shift check that yields newshift == 1 jumps to endofwhile, and the loop
   condition then decides whether the whole factorization is repeated */
1278: do {
1279: sctx.lushift = PETSC_FALSE;
1280: /* Now loop over each block-row, and do the factorization */
1281: for (i=0,row=0; i<node_max; i++) {
1282: nodesz = ns[i];
/* nz and bjtmp describe the factor pattern of the first row of the inode; the same
   pattern is used for every row of the inode below */
1283: nz = bi[row+1] - bi[row];
1284: bjtmp = bj + bi[row];
1286: switch (nodesz){
1287: case 1:
/* clear the work row at the positions of this row's factor pattern */
1288: for (j=0; j<nz; j++){
1289: idx = bjtmp[j];
1290: rtmp11[idx] = 0.0;
1291: }
1292:
1293: /* load in initial (unfactored row) */
1294: idx = r[row];
1295: nz_tmp = ai[idx+1] - ai[idx];
1296: ajtmp = aj + ai[idx];
1297: v1 = aa + ai[idx];
1299: for (j=0; j<nz_tmp; j++) {
1300: idx = ics[ajtmp[j]];
1301: rtmp11[idx] = v1[j];
1302: }
/* add the current shift to the diagonal entry of the permuted row */
1303: rtmp11[ics[r[row]]] += sctx.shift_amount;
/* eliminate with every pivot row prow < row that appears in the pattern */
1305: prow = *bjtmp++ ;
1306: while (prow < row) {
1307: pc1 = rtmp11 + prow;
1308: if (*pc1 != 0.0){
/* pv starts at the stored (already inverted) diagonal of the pivot row */
1309: pv = ba + bd[prow];
1310: pj = nbj + bd[prow];
1311: mul1 = *pc1 * *pv++;
1312: *pc1 = mul1;
1313: nz_tmp = bi[prow+1] - bd[prow] - 1;
1314: PetscLogFlops(2*nz_tmp);
1315: for (j=0; j<nz_tmp; j++) {
1316: tmp = pv[j];
1317: idx = pj[j];
1318: rtmp11[idx] -= mul1 * tmp;
1319: }
1320: }
1321: prow = *bjtmp++ ;
1322: }
1323: pj = bj + bi[row];
1324: pc1 = ba + bi[row];
/* remember the uninverted pivot for the shift check */
1326: sctx.pv = rtmp11[row];
1327: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1328: rs = 0.0;
1329: for (j=0; j<nz; j++) {
1330: idx = pj[j];
1331: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1332: if (idx != row) rs += PetscAbsScalar(pc1[j]);
1333: }
1334: sctx.rs = rs;
1335: MatLUCheckShift_inline(info,sctx,row,newshift);
1336: if (newshift == 1) goto endofwhile;
1337: break;
1338:
1339: case 2:
1340: for (j=0; j<nz; j++) {
1341: idx = bjtmp[j];
1342: rtmp11[idx] = 0.0;
1343: rtmp22[idx] = 0.0;
1344: }
1345:
1346: /* load in initial (unfactored row) */
1347: idx = r[row];
1348: nz_tmp = ai[idx+1] - ai[idx];
1349: ajtmp = aj + ai[idx];
1350: v1 = aa + ai[idx];
/* the second row is read from row r[row]+1 of A using the column indices of row r[row];
   NOTE(review): this relies on both rows being adjacent in A with the same pattern - confirm */
1351: v2 = aa + ai[idx+1];
1352: for (j=0; j<nz_tmp; j++) {
1353: idx = ics[ajtmp[j]];
1354: rtmp11[idx] = v1[j];
1355: rtmp22[idx] = v2[j];
1356: }
1357: rtmp11[ics[r[row]]] += sctx.shift_amount;
1358: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1360: prow = *bjtmp++ ;
1361: while (prow < row) {
1362: pc1 = rtmp11 + prow;
1363: pc2 = rtmp22 + prow;
1364: if (*pc1 != 0.0 || *pc2 != 0.0){
1365: pv = ba + bd[prow];
1366: pj = nbj + bd[prow];
1367: mul1 = *pc1 * *pv;
1368: mul2 = *pc2 * *pv;
1369: ++pv;
1370: *pc1 = mul1;
1371: *pc2 = mul2;
1372:
1373: nz_tmp = bi[prow+1] - bd[prow] - 1;
1374: for (j=0; j<nz_tmp; j++) {
1375: tmp = pv[j];
1376: idx = pj[j];
1377: rtmp11[idx] -= mul1 * tmp;
1378: rtmp22[idx] -= mul2 * tmp;
1379: }
1380: PetscLogFlops(4*nz_tmp);
1381: }
1382: prow = *bjtmp++ ;
1383: }
1385: /* Now take care of diagonal 2x2 block. Note: prow = row here */
1386: pc1 = rtmp11 + prow;
1387: pc2 = rtmp22 + prow;
/* check the pivot of the first row before it is used to eliminate the second row */
1389: sctx.pv = *pc1;
1390: pj = bj + bi[prow];
1391: rs = 0.0;
1392: for (j=0; j<nz; j++){
1393: idx = pj[j];
1394: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1395: }
1396: sctx.rs = rs;
1397: MatLUCheckShift_inline(info,sctx,row,newshift);
1398: if (newshift == 1) goto endofwhile;
1400: if (*pc2 != 0.0){
1401: pj = nbj + bd[prow];
1402: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1403: *pc2 = mul2;
1404: nz_tmp = bi[prow+1] - bd[prow] - 1;
1405: for (j=0; j<nz_tmp; j++) {
1406: idx = pj[j] ;
1407: tmp = rtmp11[idx];
1408: rtmp22[idx] -= mul2 * tmp;
1409: }
1410: PetscLogFlops(2*nz_tmp);
1411: }
1412:
1413: pj = bj + bi[row];
1414: pc1 = ba + bi[row];
1415: pc2 = ba + bi[row+1];
1417: sctx.pv = rtmp22[row+1];
1418: rs = 0.0;
/* store both diagonals inverted */
1419: rtmp11[row] = 1.0/rtmp11[row];
1420: rtmp22[row+1] = 1.0/rtmp22[row+1];
1421: /* copy row entries from dense representation to sparse */
1422: for (j=0; j<nz; j++) {
1423: idx = pj[j];
1424: pc1[j] = rtmp11[idx];
1425: pc2[j] = rtmp22[idx];
1426: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1427: }
1428: sctx.rs = rs;
1429: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1430: if (newshift == 1) goto endofwhile;
1431: break;
1433: case 3:
1434: for (j=0; j<nz; j++) {
1435: idx = bjtmp[j];
1436: rtmp11[idx] = 0.0;
1437: rtmp22[idx] = 0.0;
1438: rtmp33[idx] = 0.0;
1439: }
1440: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1441: idx = r[row];
1442: nz_tmp = ai[idx+1] - ai[idx];
1443: ajtmp = aj + ai[idx];
1444: v1 = aa + ai[idx];
1445: v2 = aa + ai[idx+1];
1446: v3 = aa + ai[idx+2];
1447: for (j=0; j<nz_tmp; j++) {
1448: idx = ics[ajtmp[j]];
1449: rtmp11[idx] = v1[j];
1450: rtmp22[idx] = v2[j];
1451: rtmp33[idx] = v3[j];
1452: }
1453: rtmp11[ics[r[row]]] += sctx.shift_amount;
1454: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1455: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1457: /* loop over all pivot row blocks above this row block */
1458: prow = *bjtmp++ ;
1459: while (prow < row) {
1460: pc1 = rtmp11 + prow;
1461: pc2 = rtmp22 + prow;
1462: pc3 = rtmp33 + prow;
1463: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1464: pv = ba + bd[prow];
1465: pj = nbj + bd[prow];
1466: mul1 = *pc1 * *pv;
1467: mul2 = *pc2 * *pv;
1468: mul3 = *pc3 * *pv;
1469: ++pv;
1470: *pc1 = mul1;
1471: *pc2 = mul2;
1472: *pc3 = mul3;
1473:
1474: nz_tmp = bi[prow+1] - bd[prow] - 1;
1475: /* update this row based on pivot row */
1476: for (j=0; j<nz_tmp; j++) {
1477: tmp = pv[j];
1478: idx = pj[j];
1479: rtmp11[idx] -= mul1 * tmp;
1480: rtmp22[idx] -= mul2 * tmp;
1481: rtmp33[idx] -= mul3 * tmp;
1482: }
1483: PetscLogFlops(6*nz_tmp);
1484: }
1485: prow = *bjtmp++ ;
1486: }
1488: /* Now take care of diagonal 3x3 block in this set of rows */
1489: /* note: prow = row here */
1490: pc1 = rtmp11 + prow;
1491: pc2 = rtmp22 + prow;
1492: pc3 = rtmp33 + prow;
1494: sctx.pv = *pc1;
1495: pj = bj + bi[prow];
1496: rs = 0.0;
1497: for (j=0; j<nz; j++){
1498: idx = pj[j];
1499: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1500: }
1501: sctx.rs = rs;
1502: MatLUCheckShift_inline(info,sctx,row,newshift);
1503: if (newshift == 1) goto endofwhile;
/* use the first row of the inode to eliminate rows two and three */
1505: if (*pc2 != 0.0 || *pc3 != 0.0){
1506: mul2 = (*pc2)/(*pc1);
1507: mul3 = (*pc3)/(*pc1);
1508: *pc2 = mul2;
1509: *pc3 = mul3;
1510: nz_tmp = bi[prow+1] - bd[prow] - 1;
1511: pj = nbj + bd[prow];
1512: for (j=0; j<nz_tmp; j++) {
1513: idx = pj[j] ;
1514: tmp = rtmp11[idx];
1515: rtmp22[idx] -= mul2 * tmp;
1516: rtmp33[idx] -= mul3 * tmp;
1517: }
1518: PetscLogFlops(4*nz_tmp);
1519: }
/* move on to the second row of the inode */
1520: ++prow;
1522: pc2 = rtmp22 + prow;
1523: pc3 = rtmp33 + prow;
1524: sctx.pv = *pc2;
1525: pj = bj + bi[prow];
1526: rs = 0.0;
1527: for (j=0; j<nz; j++){
1528: idx = pj[j];
1529: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1530: }
1531: sctx.rs = rs;
1532: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1533: if (newshift == 1) goto endofwhile;
/* use the second row to eliminate the third row */
1535: if (*pc3 != 0.0){
1536: mul3 = (*pc3)/(*pc2);
1537: *pc3 = mul3;
1538: pj = nbj + bd[prow];
1539: nz_tmp = bi[prow+1] - bd[prow] - 1;
1540: for (j=0; j<nz_tmp; j++) {
1541: idx = pj[j] ;
1542: tmp = rtmp22[idx];
1543: rtmp33[idx] -= mul3 * tmp;
1544: }
/* NOTE(review): only rtmp33 is updated in the loop above, yet 4*nz_tmp flops are logged;
   the analogous single-row update in the 2-row case logs 2*nz_tmp - verify the count */
1545: PetscLogFlops(4*nz_tmp);
1546: }
1548: pj = bj + bi[row];
1549: pc1 = ba + bi[row];
1550: pc2 = ba + bi[row+1];
1551: pc3 = ba + bi[row+2];
1553: sctx.pv = rtmp33[row+2];
1554: rs = 0.0;
/* store all three diagonals inverted */
1555: rtmp11[row] = 1.0/rtmp11[row];
1556: rtmp22[row+1] = 1.0/rtmp22[row+1];
1557: rtmp33[row+2] = 1.0/rtmp33[row+2];
1558: /* copy row entries from dense representation to sparse */
1559: for (j=0; j<nz; j++) {
1560: idx = pj[j];
1561: pc1[j] = rtmp11[idx];
1562: pc2[j] = rtmp22[idx];
1563: pc3[j] = rtmp33[idx];
1564: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1565: }
1567: sctx.rs = rs;
1568: MatLUCheckShift_inline(info,sctx,row+2,newshift);
1569: if (newshift == 1) goto endofwhile;
1570: break;
1572: default:
1573: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1574: }
1575: row += nodesz; /* Update the row */
1576: }
1577: endofwhile:;
1578: } while (sctx.lushift);
/* release the work rows, the reordered inode sizes and the index arrays */
1579: PetscFree(rtmp11);
1580: PetscFree(tmp_vec2);
1581: ISRestoreIndices(isicol,&ic);
1582: ISRestoreIndices(isrow,&r);
1583: ISRestoreIndices(iscol,&c);
/* install the solve routines on the factor and mark it assembled and preallocated */
1584: (B)->ops->solve = MatSolve_Inode;
1585: /* do not set solve add, since MatSolve_Inode + Add is faster */
1586: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1587: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1588: C->assembled = PETSC_TRUE;
1589: C->preallocated = PETSC_TRUE;
/* report the shifting that was applied, if any */
1590: if (sctx.nshift) {
1591: if (info->shiftpd) {
1592: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
1593: } else if (info->shiftnz) {
1594: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1595: }
1596: }
1597: PetscLogFlops(C->cmap->n);
1598: return(0);
1599: }
1601: /*
1602: Makes a longer coloring[] array and calls the usual code with that
1603: */
1606: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1607: {
1608: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
1609: PetscErrorCode ierr;
1610: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1611: PetscInt *colorused,i;
1612: ISColoringValue *newcolor;
1615: PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1616: /* loop over inodes, marking a color for each column*/
1617: row = 0;
1618: for (i=0; i<m; i++){
1619: for (j=0; j<ns[i]; j++) {
1620: newcolor[row++] = coloring[i] + j*ncolors;
1621: }
1622: }
1624: /* eliminate unneeded colors */
1625: PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1626: PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1627: for (i=0; i<n; i++) {
1628: colorused[newcolor[i]] = 1;
1629: }
1631: for (i=1; i<5*ncolors; i++) {
1632: colorused[i] += colorused[i-1];
1633: }
1634: ncolors = colorused[5*ncolors-1];
1635: for (i=0; i<n; i++) {
1636: newcolor[i] = colorused[newcolor[i]]-1;
1637: }
1638: PetscFree(colorused);
1639: ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);
1640: PetscFree(coloring);
1641: return(0);
1642: }
1644: #include ../src/inline/ilu.h
/*
   MatRelax_Inode - One relaxation sweep (forward, backward or both) that treats
   each inode as a block: the off-diagonal part of all rows of an inode is applied
   together and the result is multiplied by the inverse of the diagonal block.

   Input Parameters:
+  A      - the matrix; its inode sizes, i, j, a and diag arrays are read
.  bb     - right hand side vector
.  omega  - must be 1.0, otherwise an error is raised
.  flag   - SOR option bits; SOR_EISENSTAT is rejected
.  fshift - must be 0.0, otherwise an error is raised
.  its    - when larger than 1 the call is passed on to MatRelax_SeqAIJ()
-  lits   - only forwarded to MatRelax_SeqAIJ()

   Input/Output Parameter:
.  xx - the vector that is updated

   Notes:
   The diagonal blocks (bdiag) and their inverses (ibdiag) are cached in a->inode
   and rebuilt when a->inode.ibdiagvalid is false. Entry (j,k) of a block is kept
   at offset k*size+j.

   Each inner product loop consumes two matrix entries per iteration and handles
   an odd leftover entry after the loop.

   NOTE(review): every sweep below is inside the branch taken when
   SOR_ZERO_INITIAL_GUESS is set; without that bit (and its <= 1) xx appears to be
   returned unchanged - confirm this is intended.
   NOTE(review): return codes of the PETSc calls are not checked and the declared
   ierr is never used.
*/
1648: PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1649: {
1650: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1651: PetscScalar *x,*xs,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1652: MatScalar *ibdiag,*bdiag;
1653: PetscScalar *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1654: const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5;
1655: PetscReal zeropivot = 1.0e-15, shift = 0.0;
1656: PetscErrorCode ierr;
1657: PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1658: PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k;
1661: if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1662: if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1663: if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");
1664: if (its > 1) {
1665: /* switch to non-inode version */
1666: MatRelax_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);
1667: return(0);
1668: }
/* (re)build the cached diagonal blocks and their inverses when the cache is not valid */
1670: if (!a->inode.ibdiagvalid) {
1671: if (!a->inode.ibdiag) {
1672: /* calculate space needed for diagonal blocks */
1673: for (i=0; i<m; i++) {
1674: cnt += sizes[i]*sizes[i];
1675: }
1676: a->inode.bdiagsize = cnt;
1677: PetscMalloc2(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag);
1678: }
1680: /* copy over the diagonal blocks and invert them */
1681: ibdiag = a->inode.ibdiag;
1682: bdiag = a->inode.bdiag;
1683: cnt = 0;
1684: for (i=0, row = 0; i<m; i++) {
/* j is the row inside the block, k the column; diag[row+j] - j is the stored position of
   column 'row' in row row+j, so the k-th entry after it is column row+k */
1685: for (j=0; j<sizes[i]; j++) {
1686: for (k=0; k<sizes[i]; k++) {
1687: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1688: }
1689: }
1690: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
1691:
1692: switch(sizes[i]) {
1693: case 1:
1694: /* 1x1 block: reject a pivot smaller in magnitude than zeropivot, then invert the scalar */
1695: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1696: ibdiag[cnt] = 1.0/ibdiag[cnt];
1697: break;
/* larger blocks are inverted in place by the fixed-size kernels; shift is 0.0 here */
1698: case 2:
1699: Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
1700: break;
1701: case 3:
1702: Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
1703: break;
1704: case 4:
1705: Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
1706: break;
1707: case 5:
1708: Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);
1709: break;
1710: default:
1711: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1712: }
1713: cnt += sizes[i]*sizes[i];
1714: row += sizes[i];
1715: }
1716: a->inode.ibdiagvalid = PETSC_TRUE;
1717: }
1718: ibdiag = a->inode.ibdiag;
1719: bdiag = a->inode.bdiag;
1721: VecGetArray(xx,&x);
/* when xx and bb are the same vector the array is obtained only once */
1722: if (xx != bb) {
1723: VecGetArray(bb,(PetscScalar**)&b);
1724: } else {
1725: b = x;
1726: }
1728: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1729: xs = x;
1730: if (flag & SOR_ZERO_INITIAL_GUESS) {
1731: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
/* forward sweep: walk the inodes top to bottom; ibdiag advances by one block per inode */
1733: for (i=0, row=0; i<m; i++) {
/* sz = number of stored entries of the first row of the inode that precede its diagonal */
1734: sz = diag[row] - ii[row];
1735: v1 = a->a + ii[row];
1736: idx = a->j + ii[row];
1738: /* see comments for MatMult_Inode() for how this is coded */
1739: switch (sizes[i]){
1740: case 1:
1741:
1742: sum1 = b[row];
1743: for(n = 0; n<sz-1; n+=2) {
1744: i1 = idx[0];
1745: i2 = idx[1];
1746: idx += 2;
1747: tmp0 = x[i1];
1748: tmp1 = x[i2];
1749: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1750: }
1751:
/* odd number of entries: one is left over */
1752: if (n == sz-1){
1753: tmp0 = x[*idx];
1754: sum1 -= *v1 * tmp0;
1755: }
1756: x[row++] = sum1*(*ibdiag++);
1757: break;
1758: case 2:
1759: v2 = a->a + ii[row+1];
1760: sum1 = b[row];
1761: sum2 = b[row+1];
1762: for(n = 0; n<sz-1; n+=2) {
1763: i1 = idx[0];
1764: i2 = idx[1];
1765: idx += 2;
1766: tmp0 = x[i1];
1767: tmp1 = x[i2];
1768: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1769: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1770: }
1771:
1772: if (n == sz-1){
1773: tmp0 = x[*idx];
1774: sum1 -= v1[0] * tmp0;
1775: sum2 -= v2[0] * tmp0;
1776: }
/* multiply by the inverse diagonal block */
1777: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1778: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1779: ibdiag += 4;
1780: break;
1781: case 3:
1782: v2 = a->a + ii[row+1];
1783: v3 = a->a + ii[row+2];
1784: sum1 = b[row];
1785: sum2 = b[row+1];
1786: sum3 = b[row+2];
1787: for(n = 0; n<sz-1; n+=2) {
1788: i1 = idx[0];
1789: i2 = idx[1];
1790: idx += 2;
1791: tmp0 = x[i1];
1792: tmp1 = x[i2];
1793: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1794: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1795: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1796: }
1797:
1798: if (n == sz-1){
1799: tmp0 = x[*idx];
1800: sum1 -= v1[0] * tmp0;
1801: sum2 -= v2[0] * tmp0;
1802: sum3 -= v3[0] * tmp0;
1803: }
1804: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1805: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1806: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1807: ibdiag += 9;
1808: break;
1809: case 4:
1810: v2 = a->a + ii[row+1];
1811: v3 = a->a + ii[row+2];
1812: v4 = a->a + ii[row+3];
1813: sum1 = b[row];
1814: sum2 = b[row+1];
1815: sum3 = b[row+2];
1816: sum4 = b[row+3];
1817: for(n = 0; n<sz-1; n+=2) {
1818: i1 = idx[0];
1819: i2 = idx[1];
1820: idx += 2;
1821: tmp0 = x[i1];
1822: tmp1 = x[i2];
1823: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1824: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1825: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1826: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1827: }
1828:
1829: if (n == sz-1){
1830: tmp0 = x[*idx];
1831: sum1 -= v1[0] * tmp0;
1832: sum2 -= v2[0] * tmp0;
1833: sum3 -= v3[0] * tmp0;
1834: sum4 -= v4[0] * tmp0;
1835: }
1836: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1837: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1838: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1839: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1840: ibdiag += 16;
1841: break;
1842: case 5:
1843: v2 = a->a + ii[row+1];
1844: v3 = a->a + ii[row+2];
1845: v4 = a->a + ii[row+3];
1846: v5 = a->a + ii[row+4];
1847: sum1 = b[row];
1848: sum2 = b[row+1];
1849: sum3 = b[row+2];
1850: sum4 = b[row+3];
1851: sum5 = b[row+4];
1852: for(n = 0; n<sz-1; n+=2) {
1853: i1 = idx[0];
1854: i2 = idx[1];
1855: idx += 2;
1856: tmp0 = x[i1];
1857: tmp1 = x[i2];
1858: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1859: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1860: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1861: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1862: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1863: }
1864:
1865: if (n == sz-1){
1866: tmp0 = x[*idx];
1867: sum1 -= v1[0] * tmp0;
1868: sum2 -= v2[0] * tmp0;
1869: sum3 -= v3[0] * tmp0;
1870: sum4 -= v4[0] * tmp0;
1871: sum5 -= v5[0] * tmp0;
1872: }
1873: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1874: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1875: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1876: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1877: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1878: ibdiag += 25;
1879: break;
1880: default:
1881: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1882: }
1883: }
/* after a forward sweep the backward sweep starts from x, otherwise from b */
1885: xb = x;
1886: PetscLogFlops(a->nz);
1887: } else xb = b;
/* both sweeps requested: multiply x by the (uninverted) diagonal blocks in between */
1888: if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1889: (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1890: cnt = 0;
1891: for (i=0, row=0; i<m; i++) {
1893: switch (sizes[i]){
1894: case 1:
1895: x[row++] *= bdiag[cnt++];
1896: break;
1897: case 2:
1898: x1 = x[row]; x2 = x[row+1];
1899: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1900: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1901: x[row++] = tmp1;
1902: x[row++] = tmp2;
1903: cnt += 4;
1904: break;
1905: case 3:
1906: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
1907: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1908: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1909: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1910: x[row++] = tmp1;
1911: x[row++] = tmp2;
1912: x[row++] = tmp3;
1913: cnt += 9;
1914: break;
1915: case 4:
1916: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1917: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1918: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1919: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1920: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1921: x[row++] = tmp1;
1922: x[row++] = tmp2;
1923: x[row++] = tmp3;
1924: x[row++] = tmp4;
1925: cnt += 16;
1926: break;
1927: case 5:
1928: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1929: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1930: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1931: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1932: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1933: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1934: x[row++] = tmp1;
1935: x[row++] = tmp2;
1936: x[row++] = tmp3;
1937: x[row++] = tmp4;
1938: x[row++] = tmp5;
1939: cnt += 25;
1940: break;
1941: default:
1942: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1943: }
1944: }
1945: PetscLogFlops(m);
1946: }
1947: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
/* backward sweep: start one past the last inverse block and step back one block per inode;
   row is the LAST row of the current inode */
1949: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1950: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1951: ibdiag -= sizes[i]*sizes[i];
/* sz = number of stored entries of the last row of the inode that follow its diagonal */
1952: sz = ii[row+1] - diag[row] - 1;
1953: v1 = a->a + diag[row] + 1;
1954: idx = a->j + diag[row] + 1;
1956: /* see comments for MatMult_Inode() for how this is coded */
1957: switch (sizes[i]){
1958: case 1:
1959:
1960: sum1 = xb[row];
1961: for(n = 0; n<sz-1; n+=2) {
1962: i1 = idx[0];
1963: i2 = idx[1];
1964: idx += 2;
1965: tmp0 = x[i1];
1966: tmp1 = x[i2];
1967: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1968: }
1969:
1970: if (n == sz-1){
1971: tmp0 = x[*idx];
1972: sum1 -= *v1*tmp0;
1973: }
1974: x[row--] = sum1*(*ibdiag);
1975: break;
1977: case 2:
1978:
1979: sum1 = xb[row];
1980: sum2 = xb[row-1];
1981: /* note that sum1 is associated with the second of the two rows */
/* in row-1 the diagonal and the one remaining block column are skipped (+2) */
1982: v2 = a->a + diag[row-1] + 2;
1983: for(n = 0; n<sz-1; n+=2) {
1984: i1 = idx[0];
1985: i2 = idx[1];
1986: idx += 2;
1987: tmp0 = x[i1];
1988: tmp1 = x[i2];
1989: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1990: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1991: }
1992:
1993: if (n == sz-1){
1994: tmp0 = x[*idx];
1995: sum1 -= *v1*tmp0;
1996: sum2 -= *v2*tmp0;
1997: }
1998: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1999: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2000: break;
2001: case 3:
2002:
2003: sum1 = xb[row];
2004: sum2 = xb[row-1];
2005: sum3 = xb[row-2];
2006: v2 = a->a + diag[row-1] + 2;
2007: v3 = a->a + diag[row-2] + 3;
2008: for(n = 0; n<sz-1; n+=2) {
2009: i1 = idx[0];
2010: i2 = idx[1];
2011: idx += 2;
2012: tmp0 = x[i1];
2013: tmp1 = x[i2];
2014: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2015: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2016: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2017: }
2018:
2019: if (n == sz-1){
2020: tmp0 = x[*idx];
2021: sum1 -= *v1*tmp0;
2022: sum2 -= *v2*tmp0;
2023: sum3 -= *v3*tmp0;
2024: }
2025: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2026: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2027: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2028: break;
2029: case 4:
2030:
2031: sum1 = xb[row];
2032: sum2 = xb[row-1];
2033: sum3 = xb[row-2];
2034: sum4 = xb[row-3];
2035: v2 = a->a + diag[row-1] + 2;
2036: v3 = a->a + diag[row-2] + 3;
2037: v4 = a->a + diag[row-3] + 4;
2038: for(n = 0; n<sz-1; n+=2) {
2039: i1 = idx[0];
2040: i2 = idx[1];
2041: idx += 2;
2042: tmp0 = x[i1];
2043: tmp1 = x[i2];
2044: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2045: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2046: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2047: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2048: }
2049:
2050: if (n == sz-1){
2051: tmp0 = x[*idx];
2052: sum1 -= *v1*tmp0;
2053: sum2 -= *v2*tmp0;
2054: sum3 -= *v3*tmp0;
2055: sum4 -= *v4*tmp0;
2056: }
2057: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2058: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2059: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2060: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2061: break;
2062: case 5:
2063:
2064: sum1 = xb[row];
2065: sum2 = xb[row-1];
2066: sum3 = xb[row-2];
2067: sum4 = xb[row-3];
2068: sum5 = xb[row-4];
2069: v2 = a->a + diag[row-1] + 2;
2070: v3 = a->a + diag[row-2] + 3;
2071: v4 = a->a + diag[row-3] + 4;
2072: v5 = a->a + diag[row-4] + 5;
2073: for(n = 0; n<sz-1; n+=2) {
2074: i1 = idx[0];
2075: i2 = idx[1];
2076: idx += 2;
2077: tmp0 = x[i1];
2078: tmp1 = x[i2];
2079: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2080: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2081: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2082: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2083: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2084: }
2085:
2086: if (n == sz-1){
2087: tmp0 = x[*idx];
2088: sum1 -= *v1*tmp0;
2089: sum2 -= *v2*tmp0;
2090: sum3 -= *v3*tmp0;
2091: sum4 -= *v4*tmp0;
2092: sum5 -= *v5*tmp0;
2093: }
2094: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2095: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2096: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2097: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2098: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2099: break;
2100: default:
2101: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2102: }
2103: }
2105: PetscLogFlops(a->nz);
2106: }
/* its is not read again after this decrement */
2107: its--;
2108: }
2109: VecRestoreArray(xx,&x);
2110: if (bb != xx) {VecRestoreArray(bb,(PetscScalar**)&b);}
2111: return(0);
2112: }
2115: /*
2116: samestructure indicates that the matrix has not changed its nonzero structure so we
2117: do not need to recompute the inodes
2118: */
2121: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2122: {
2123: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2125: PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2126: PetscTruth flag;
2129: if (!a->inode.use) return(0);
2130: if (a->inode.checked && samestructure) return(0);
2133: m = A->rmap->n;
2134: if (a->inode.size) {ns = a->inode.size;}
2135: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
2137: i = 0;
2138: node_count = 0;
2139: idx = a->j;
2140: ii = a->i;
2141: while (i < m){ /* For each row */
2142: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
2143: /* Limits the number of elements in a node to 'a->inode.limit' */
2144: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2145: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
2146: if (nzy != nzx) break;
2147: idy += nzx; /* Same nonzero pattern */
2148: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
2149: if (!flag) break;
2150: }
2151: ns[node_count++] = blk_size;
2152: idx += blk_size*nzx;
2153: i = j;
2154: }
2155: /* If not enough inodes found,, do not use inode version of the routines */
2156: if (!a->inode.size && m && node_count > .9*m) {
2157: PetscFree(ns);
2158: a->inode.node_count = 0;
2159: a->inode.size = PETSC_NULL;
2160: a->inode.use = PETSC_FALSE;
2161: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
2162: } else {
2163: A->ops->mult = MatMult_Inode;
2164: A->ops->relax = MatRelax_Inode;
2165: A->ops->multadd = MatMultAdd_Inode;
2166: A->ops->getrowij = MatGetRowIJ_Inode;
2167: A->ops->restorerowij = MatRestoreRowIJ_Inode;
2168: A->ops->getcolumnij = MatGetColumnIJ_Inode;
2169: A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2170: A->ops->coloringpatch = MatColoringPatch_Inode;
2171: a->inode.node_count = node_count;
2172: a->inode.size = ns;
2173: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
2174: }
2175: return(0);
2176: }
2178: /*
2179: This is really ugly. if inodes are used this replaces the
2180: permutations with ones that correspond to rows/cols of the matrix
2181: rather then inode blocks
2182: */
2185: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2186: {
2187: PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2190: PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
2191: if (f) {
2192: (*f)(A,rperm,cperm);
2193: }
2194: return(0);
2195: }
2200: PetscErrorCode MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2201: {
2202: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
2204: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
2205: const PetscInt *ridx,*cidx;
2206: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
2207: PetscInt nslim_col,*ns_col;
2208: IS ris = *rperm,cis = *cperm;
2211: if (!a->inode.size) return(0); /* no inodes so return */
2212: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
2214: Mat_CreateColInode(A,&nslim_col,&ns_col);
2215: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
2216: PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
2217: permc = permr + m;
2219: ISGetIndices(ris,&ridx);
2220: ISGetIndices(cis,&cidx);
2222: /* Form the inode structure for the rows of permuted matric using inv perm*/
2223: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2225: /* Construct the permutations for rows*/
2226: for (i=0,row = 0; i<nslim_row; ++i){
2227: indx = ridx[i];
2228: start_val = tns[indx];
2229: end_val = tns[indx + 1];
2230: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2231: }
2233: /* Form the inode structure for the columns of permuted matrix using inv perm*/
2234: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2236: /* Construct permutations for columns */
2237: for (i=0,col=0; i<nslim_col; ++i){
2238: indx = cidx[i];
2239: start_val = tns[indx];
2240: end_val = tns[indx + 1];
2241: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2242: }
2244: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
2245: ISSetPermutation(*rperm);
2246: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
2247: ISSetPermutation(*cperm);
2248:
2249: ISRestoreIndices(ris,&ridx);
2250: ISRestoreIndices(cis,&cidx);
2252: PetscFree(ns_col);
2253: PetscFree(permr);
2254: ISDestroy(cis);
2255: ISDestroy(ris);
2256: PetscFree(tns);
2257: return(0);
2258: }
2263: /*@C
2264: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2266: Collective on Mat
2268: Input Parameter:
2269: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2271: Output Parameter:
2272: + node_count - no of inodes present in the matrix.
2273: . sizes - an array of size node_count,with sizes of each inode.
2274: - limit - the max size used to generate the inodes.
2276: Level: advanced
2278: Notes: This routine returns some internal storage information
2279: of the matrix, it is intended to be used by advanced users.
2280: It should be called after the matrix is assembled.
2281: The contents of the sizes[] array should not be changed.
2282: PETSC_NULL may be passed for information not requested.
2284: .keywords: matrix, seqaij, get, inode
2286: .seealso: MatGetInfo()
2287: @*/
2288: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2289: {
2290: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2293: if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2294: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
2295: if (f) {
2296: (*f)(A,node_count,sizes,limit);
2297: }
2298: return(0);
2299: }
2304: PetscErrorCode MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2305: {
2306: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2309: if (node_count) *node_count = a->inode.node_count;
2310: if (sizes) *sizes = a->inode.size;
2311: if (limit) *limit = a->inode.limit;
2312: return(0);
2313: }