Actual source code: baijfact2.c
1: #define PETSCMAT_DLL
3: /*
4: Factorization code for BAIJ format.
5: */
7: #include ../src/mat/impls/baij/seq/baij.h
8: #include ../src/inline/ilu.h
9: #include ../src/inline/dot.h
13: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14: {
15: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
17: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18: PetscInt *diag = a->diag;
19: MatScalar *aa=a->a,*v;
20: PetscScalar s1,*x,*b;
23: VecCopy(bb,xx);
24: VecGetArray(bb,&b);
25: VecGetArray(xx,&x);
26:
27: /* forward solve the U^T */
28: for (i=0; i<n; i++) {
30: v = aa + diag[i];
31: /* multiply by the inverse of the block diagonal */
32: s1 = (*v++)*x[i];
33: vi = aj + diag[i] + 1;
34: nz = ai[i+1] - diag[i] - 1;
35: while (nz--) {
36: x[*vi++] -= (*v++)*s1;
37: }
38: x[i] = s1;
39: }
40: /* backward solve the L^T */
41: for (i=n-1; i>=0; i--){
42: v = aa + diag[i] - 1;
43: vi = aj + diag[i] - 1;
44: nz = diag[i] - ai[i];
45: s1 = x[i];
46: while (nz--) {
47: x[*vi--] -= (*v--)*s1;
48: }
49: }
50: VecRestoreArray(bb,&b);
51: VecRestoreArray(xx,&x);
52: PetscLogFlops(2*(a->nz) - A->cmap->n);
53: return(0);
54: }
58: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59: {
60: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
62: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63: PetscInt *diag = a->diag,oidx;
64: MatScalar *aa=a->a,*v;
65: PetscScalar s1,s2,x1,x2;
66: PetscScalar *x,*b;
69: VecCopy(bb,xx);
70: VecGetArray(bb,&b);
71: VecGetArray(xx,&x);
73: /* forward solve the U^T */
74: idx = 0;
75: for (i=0; i<n; i++) {
77: v = aa + 4*diag[i];
78: /* multiply by the inverse of the block diagonal */
79: x1 = x[idx]; x2 = x[1+idx];
80: s1 = v[0]*x1 + v[1]*x2;
81: s2 = v[2]*x1 + v[3]*x2;
82: v += 4;
84: vi = aj + diag[i] + 1;
85: nz = ai[i+1] - diag[i] - 1;
86: while (nz--) {
87: oidx = 2*(*vi++);
88: x[oidx] -= v[0]*s1 + v[1]*s2;
89: x[oidx+1] -= v[2]*s1 + v[3]*s2;
90: v += 4;
91: }
92: x[idx] = s1;x[1+idx] = s2;
93: idx += 2;
94: }
95: /* backward solve the L^T */
96: for (i=n-1; i>=0; i--){
97: v = aa + 4*diag[i] - 4;
98: vi = aj + diag[i] - 1;
99: nz = diag[i] - ai[i];
100: idt = 2*i;
101: s1 = x[idt]; s2 = x[1+idt];
102: while (nz--) {
103: idx = 2*(*vi--);
104: x[idx] -= v[0]*s1 + v[1]*s2;
105: x[idx+1] -= v[2]*s1 + v[3]*s2;
106: v -= 4;
107: }
108: }
109: VecRestoreArray(bb,&b);
110: VecRestoreArray(xx,&x);
111: PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);
112: return(0);
113: }
117: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118: {
119: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
121: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122: PetscInt *diag = a->diag,oidx;
123: MatScalar *aa=a->a,*v;
124: PetscScalar s1,s2,s3,x1,x2,x3;
125: PetscScalar *x,*b;
128: VecCopy(bb,xx);
129: VecGetArray(bb,&b);
130: VecGetArray(xx,&x);
132: /* forward solve the U^T */
133: idx = 0;
134: for (i=0; i<n; i++) {
136: v = aa + 9*diag[i];
137: /* multiply by the inverse of the block diagonal */
138: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
139: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
140: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
141: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
142: v += 9;
144: vi = aj + diag[i] + 1;
145: nz = ai[i+1] - diag[i] - 1;
146: while (nz--) {
147: oidx = 3*(*vi++);
148: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
149: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
150: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151: v += 9;
152: }
153: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
154: idx += 3;
155: }
156: /* backward solve the L^T */
157: for (i=n-1; i>=0; i--){
158: v = aa + 9*diag[i] - 9;
159: vi = aj + diag[i] - 1;
160: nz = diag[i] - ai[i];
161: idt = 3*i;
162: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
163: while (nz--) {
164: idx = 3*(*vi--);
165: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
166: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
167: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168: v -= 9;
169: }
170: }
171: VecRestoreArray(bb,&b);
172: VecRestoreArray(xx,&x);
173: PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);
174: return(0);
175: }
179: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180: {
181: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
183: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184: PetscInt *diag = a->diag,oidx;
185: MatScalar *aa=a->a,*v;
186: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
187: PetscScalar *x,*b;
190: VecCopy(bb,xx);
191: VecGetArray(bb,&b);
192: VecGetArray(xx,&x);
194: /* forward solve the U^T */
195: idx = 0;
196: for (i=0; i<n; i++) {
198: v = aa + 16*diag[i];
199: /* multiply by the inverse of the block diagonal */
200: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
201: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
202: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
203: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
204: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205: v += 16;
207: vi = aj + diag[i] + 1;
208: nz = ai[i+1] - diag[i] - 1;
209: while (nz--) {
210: oidx = 4*(*vi++);
211: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
212: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
213: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215: v += 16;
216: }
217: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218: idx += 4;
219: }
220: /* backward solve the L^T */
221: for (i=n-1; i>=0; i--){
222: v = aa + 16*diag[i] - 16;
223: vi = aj + diag[i] - 1;
224: nz = diag[i] - ai[i];
225: idt = 4*i;
226: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227: while (nz--) {
228: idx = 4*(*vi--);
229: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
230: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
231: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233: v -= 16;
234: }
235: }
236: VecRestoreArray(bb,&b);
237: VecRestoreArray(xx,&x);
238: PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
239: return(0);
240: }
244: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245: {
246: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
248: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249: PetscInt *diag = a->diag,oidx;
250: MatScalar *aa=a->a,*v;
251: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252: PetscScalar *x,*b;
255: VecCopy(bb,xx);
256: VecGetArray(bb,&b);
257: VecGetArray(xx,&x);
259: /* forward solve the U^T */
260: idx = 0;
261: for (i=0; i<n; i++) {
263: v = aa + 25*diag[i];
264: /* multiply by the inverse of the block diagonal */
265: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
267: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
268: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271: v += 25;
273: vi = aj + diag[i] + 1;
274: nz = ai[i+1] - diag[i] - 1;
275: while (nz--) {
276: oidx = 5*(*vi++);
277: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
278: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
279: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282: v += 25;
283: }
284: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285: idx += 5;
286: }
287: /* backward solve the L^T */
288: for (i=n-1; i>=0; i--){
289: v = aa + 25*diag[i] - 25;
290: vi = aj + diag[i] - 1;
291: nz = diag[i] - ai[i];
292: idt = 5*i;
293: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294: while (nz--) {
295: idx = 5*(*vi--);
296: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
297: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
298: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301: v -= 25;
302: }
303: }
304: VecRestoreArray(bb,&b);
305: VecRestoreArray(xx,&x);
306: PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);
307: return(0);
308: }
312: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313: {
314: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
316: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317: PetscInt *diag = a->diag,oidx;
318: MatScalar *aa=a->a,*v;
319: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320: PetscScalar *x,*b;
323: VecCopy(bb,xx);
324: VecGetArray(bb,&b);
325: VecGetArray(xx,&x);
327: /* forward solve the U^T */
328: idx = 0;
329: for (i=0; i<n; i++) {
331: v = aa + 36*diag[i];
332: /* multiply by the inverse of the block diagonal */
333: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334: x6 = x[5+idx];
335: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
336: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
337: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341: v += 36;
343: vi = aj + diag[i] + 1;
344: nz = ai[i+1] - diag[i] - 1;
345: while (nz--) {
346: oidx = 6*(*vi++);
347: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
348: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
349: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353: v += 36;
354: }
355: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356: x[5+idx] = s6;
357: idx += 6;
358: }
359: /* backward solve the L^T */
360: for (i=n-1; i>=0; i--){
361: v = aa + 36*diag[i] - 36;
362: vi = aj + diag[i] - 1;
363: nz = diag[i] - ai[i];
364: idt = 6*i;
365: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366: s6 = x[5+idt];
367: while (nz--) {
368: idx = 6*(*vi--);
369: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
370: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
371: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375: v -= 36;
376: }
377: }
378: VecRestoreArray(bb,&b);
379: VecRestoreArray(xx,&x);
380: PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);
381: return(0);
382: }
386: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387: {
388: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
390: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391: PetscInt *diag = a->diag,oidx;
392: MatScalar *aa=a->a,*v;
393: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394: PetscScalar *x,*b;
397: VecCopy(bb,xx);
398: VecGetArray(bb,&b);
399: VecGetArray(xx,&x);
401: /* forward solve the U^T */
402: idx = 0;
403: for (i=0; i<n; i++) {
405: v = aa + 49*diag[i];
406: /* multiply by the inverse of the block diagonal */
407: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408: x6 = x[5+idx]; x7 = x[6+idx];
409: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
410: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416: v += 49;
418: vi = aj + diag[i] + 1;
419: nz = ai[i+1] - diag[i] - 1;
420: while (nz--) {
421: oidx = 7*(*vi++);
422: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
423: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429: v += 49;
430: }
431: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432: x[5+idx] = s6;x[6+idx] = s7;
433: idx += 7;
434: }
435: /* backward solve the L^T */
436: for (i=n-1; i>=0; i--){
437: v = aa + 49*diag[i] - 49;
438: vi = aj + diag[i] - 1;
439: nz = diag[i] - ai[i];
440: idt = 7*i;
441: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442: s6 = x[5+idt];s7 = x[6+idt];
443: while (nz--) {
444: idx = 7*(*vi--);
445: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
446: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452: v -= 49;
453: }
454: }
455: VecRestoreArray(bb,&b);
456: VecRestoreArray(xx,&x);
457: PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);
458: return(0);
459: }
461: /*---------------------------------------------------------------------------------------------*/
464: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465: {
466: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
467: IS iscol=a->col,isrow=a->row;
469: const PetscInt *r,*c,*rout,*cout;
470: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
471: PetscInt *diag = a->diag;
472: MatScalar *aa=a->a,*v;
473: PetscScalar s1,*x,*b,*t;
476: VecGetArray(bb,&b);
477: VecGetArray(xx,&x);
478: t = a->solve_work;
480: ISGetIndices(isrow,&rout); r = rout;
481: ISGetIndices(iscol,&cout); c = cout;
483: /* copy the b into temp work space according to permutation */
484: for (i=0; i<n; i++) {
485: t[i] = b[c[i]];
486: }
488: /* forward solve the U^T */
489: for (i=0; i<n; i++) {
491: v = aa + diag[i];
492: /* multiply by the inverse of the block diagonal */
493: s1 = (*v++)*t[i];
494: vi = aj + diag[i] + 1;
495: nz = ai[i+1] - diag[i] - 1;
496: while (nz--) {
497: t[*vi++] -= (*v++)*s1;
498: }
499: t[i] = s1;
500: }
501: /* backward solve the L^T */
502: for (i=n-1; i>=0; i--){
503: v = aa + diag[i] - 1;
504: vi = aj + diag[i] - 1;
505: nz = diag[i] - ai[i];
506: s1 = t[i];
507: while (nz--) {
508: t[*vi--] -= (*v--)*s1;
509: }
510: }
512: /* copy t into x according to permutation */
513: for (i=0; i<n; i++) {
514: x[r[i]] = t[i];
515: }
517: ISRestoreIndices(isrow,&rout);
518: ISRestoreIndices(iscol,&cout);
519: VecRestoreArray(bb,&b);
520: VecRestoreArray(xx,&x);
521: PetscLogFlops(2*(a->nz) - A->cmap->n);
522: return(0);
523: }
527: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
528: {
529: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
530: IS iscol=a->col,isrow=a->row;
532: const PetscInt *r,*c,*rout,*cout;
533: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
534: PetscInt *diag = a->diag,ii,ic,ir,oidx;
535: MatScalar *aa=a->a,*v;
536: PetscScalar s1,s2,x1,x2;
537: PetscScalar *x,*b,*t;
540: VecGetArray(bb,&b);
541: VecGetArray(xx,&x);
542: t = a->solve_work;
544: ISGetIndices(isrow,&rout); r = rout;
545: ISGetIndices(iscol,&cout); c = cout;
547: /* copy the b into temp work space according to permutation */
548: ii = 0;
549: for (i=0; i<n; i++) {
550: ic = 2*c[i];
551: t[ii] = b[ic];
552: t[ii+1] = b[ic+1];
553: ii += 2;
554: }
556: /* forward solve the U^T */
557: idx = 0;
558: for (i=0; i<n; i++) {
560: v = aa + 4*diag[i];
561: /* multiply by the inverse of the block diagonal */
562: x1 = t[idx]; x2 = t[1+idx];
563: s1 = v[0]*x1 + v[1]*x2;
564: s2 = v[2]*x1 + v[3]*x2;
565: v += 4;
567: vi = aj + diag[i] + 1;
568: nz = ai[i+1] - diag[i] - 1;
569: while (nz--) {
570: oidx = 2*(*vi++);
571: t[oidx] -= v[0]*s1 + v[1]*s2;
572: t[oidx+1] -= v[2]*s1 + v[3]*s2;
573: v += 4;
574: }
575: t[idx] = s1;t[1+idx] = s2;
576: idx += 2;
577: }
578: /* backward solve the L^T */
579: for (i=n-1; i>=0; i--){
580: v = aa + 4*diag[i] - 4;
581: vi = aj + diag[i] - 1;
582: nz = diag[i] - ai[i];
583: idt = 2*i;
584: s1 = t[idt]; s2 = t[1+idt];
585: while (nz--) {
586: idx = 2*(*vi--);
587: t[idx] -= v[0]*s1 + v[1]*s2;
588: t[idx+1] -= v[2]*s1 + v[3]*s2;
589: v -= 4;
590: }
591: }
593: /* copy t into x according to permutation */
594: ii = 0;
595: for (i=0; i<n; i++) {
596: ir = 2*r[i];
597: x[ir] = t[ii];
598: x[ir+1] = t[ii+1];
599: ii += 2;
600: }
602: ISRestoreIndices(isrow,&rout);
603: ISRestoreIndices(iscol,&cout);
604: VecRestoreArray(bb,&b);
605: VecRestoreArray(xx,&x);
606: PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);
607: return(0);
608: }
612: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
613: {
614: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
615: IS iscol=a->col,isrow=a->row;
617: const PetscInt *r,*c,*rout,*cout;
618: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
619: PetscInt *diag = a->diag,ii,ic,ir,oidx;
620: MatScalar *aa=a->a,*v;
621: PetscScalar s1,s2,s3,x1,x2,x3;
622: PetscScalar *x,*b,*t;
625: VecGetArray(bb,&b);
626: VecGetArray(xx,&x);
627: t = a->solve_work;
629: ISGetIndices(isrow,&rout); r = rout;
630: ISGetIndices(iscol,&cout); c = cout;
632: /* copy the b into temp work space according to permutation */
633: ii = 0;
634: for (i=0; i<n; i++) {
635: ic = 3*c[i];
636: t[ii] = b[ic];
637: t[ii+1] = b[ic+1];
638: t[ii+2] = b[ic+2];
639: ii += 3;
640: }
642: /* forward solve the U^T */
643: idx = 0;
644: for (i=0; i<n; i++) {
646: v = aa + 9*diag[i];
647: /* multiply by the inverse of the block diagonal */
648: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
649: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
650: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
651: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
652: v += 9;
654: vi = aj + diag[i] + 1;
655: nz = ai[i+1] - diag[i] - 1;
656: while (nz--) {
657: oidx = 3*(*vi++);
658: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
659: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
660: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
661: v += 9;
662: }
663: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
664: idx += 3;
665: }
666: /* backward solve the L^T */
667: for (i=n-1; i>=0; i--){
668: v = aa + 9*diag[i] - 9;
669: vi = aj + diag[i] - 1;
670: nz = diag[i] - ai[i];
671: idt = 3*i;
672: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
673: while (nz--) {
674: idx = 3*(*vi--);
675: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
676: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
677: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
678: v -= 9;
679: }
680: }
682: /* copy t into x according to permutation */
683: ii = 0;
684: for (i=0; i<n; i++) {
685: ir = 3*r[i];
686: x[ir] = t[ii];
687: x[ir+1] = t[ii+1];
688: x[ir+2] = t[ii+2];
689: ii += 3;
690: }
692: ISRestoreIndices(isrow,&rout);
693: ISRestoreIndices(iscol,&cout);
694: VecRestoreArray(bb,&b);
695: VecRestoreArray(xx,&x);
696: PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);
697: return(0);
698: }
702: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
703: {
704: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
705: IS iscol=a->col,isrow=a->row;
707: const PetscInt *r,*c,*rout,*cout;
708: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
709: PetscInt *diag = a->diag,ii,ic,ir,oidx;
710: MatScalar *aa=a->a,*v;
711: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
712: PetscScalar *x,*b,*t;
715: VecGetArray(bb,&b);
716: VecGetArray(xx,&x);
717: t = a->solve_work;
719: ISGetIndices(isrow,&rout); r = rout;
720: ISGetIndices(iscol,&cout); c = cout;
722: /* copy the b into temp work space according to permutation */
723: ii = 0;
724: for (i=0; i<n; i++) {
725: ic = 4*c[i];
726: t[ii] = b[ic];
727: t[ii+1] = b[ic+1];
728: t[ii+2] = b[ic+2];
729: t[ii+3] = b[ic+3];
730: ii += 4;
731: }
733: /* forward solve the U^T */
734: idx = 0;
735: for (i=0; i<n; i++) {
737: v = aa + 16*diag[i];
738: /* multiply by the inverse of the block diagonal */
739: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
740: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
741: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
742: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
743: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
744: v += 16;
746: vi = aj + diag[i] + 1;
747: nz = ai[i+1] - diag[i] - 1;
748: while (nz--) {
749: oidx = 4*(*vi++);
750: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
751: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
752: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
753: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
754: v += 16;
755: }
756: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
757: idx += 4;
758: }
759: /* backward solve the L^T */
760: for (i=n-1; i>=0; i--){
761: v = aa + 16*diag[i] - 16;
762: vi = aj + diag[i] - 1;
763: nz = diag[i] - ai[i];
764: idt = 4*i;
765: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
766: while (nz--) {
767: idx = 4*(*vi--);
768: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
769: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
770: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
771: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
772: v -= 16;
773: }
774: }
776: /* copy t into x according to permutation */
777: ii = 0;
778: for (i=0; i<n; i++) {
779: ir = 4*r[i];
780: x[ir] = t[ii];
781: x[ir+1] = t[ii+1];
782: x[ir+2] = t[ii+2];
783: x[ir+3] = t[ii+3];
784: ii += 4;
785: }
787: ISRestoreIndices(isrow,&rout);
788: ISRestoreIndices(iscol,&cout);
789: VecRestoreArray(bb,&b);
790: VecRestoreArray(xx,&x);
791: PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
792: return(0);
793: }
797: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
798: {
799: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
800: IS iscol=a->col,isrow=a->row;
802: const PetscInt *r,*c,*rout,*cout;
803: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
804: PetscInt *diag = a->diag,ii,ic,ir,oidx;
805: MatScalar *aa=a->a,*v;
806: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
807: PetscScalar *x,*b,*t;
810: VecGetArray(bb,&b);
811: VecGetArray(xx,&x);
812: t = a->solve_work;
814: ISGetIndices(isrow,&rout); r = rout;
815: ISGetIndices(iscol,&cout); c = cout;
817: /* copy the b into temp work space according to permutation */
818: ii = 0;
819: for (i=0; i<n; i++) {
820: ic = 5*c[i];
821: t[ii] = b[ic];
822: t[ii+1] = b[ic+1];
823: t[ii+2] = b[ic+2];
824: t[ii+3] = b[ic+3];
825: t[ii+4] = b[ic+4];
826: ii += 5;
827: }
829: /* forward solve the U^T */
830: idx = 0;
831: for (i=0; i<n; i++) {
833: v = aa + 25*diag[i];
834: /* multiply by the inverse of the block diagonal */
835: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
836: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
837: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
838: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
839: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
840: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
841: v += 25;
843: vi = aj + diag[i] + 1;
844: nz = ai[i+1] - diag[i] - 1;
845: while (nz--) {
846: oidx = 5*(*vi++);
847: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
848: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
849: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
850: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
851: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
852: v += 25;
853: }
854: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
855: idx += 5;
856: }
857: /* backward solve the L^T */
858: for (i=n-1; i>=0; i--){
859: v = aa + 25*diag[i] - 25;
860: vi = aj + diag[i] - 1;
861: nz = diag[i] - ai[i];
862: idt = 5*i;
863: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
864: while (nz--) {
865: idx = 5*(*vi--);
866: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
867: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
868: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
869: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
870: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
871: v -= 25;
872: }
873: }
875: /* copy t into x according to permutation */
876: ii = 0;
877: for (i=0; i<n; i++) {
878: ir = 5*r[i];
879: x[ir] = t[ii];
880: x[ir+1] = t[ii+1];
881: x[ir+2] = t[ii+2];
882: x[ir+3] = t[ii+3];
883: x[ir+4] = t[ii+4];
884: ii += 5;
885: }
887: ISRestoreIndices(isrow,&rout);
888: ISRestoreIndices(iscol,&cout);
889: VecRestoreArray(bb,&b);
890: VecRestoreArray(xx,&x);
891: PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);
892: return(0);
893: }
897: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
898: {
899: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
900: IS iscol=a->col,isrow=a->row;
902: const PetscInt *r,*c,*rout,*cout;
903: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
904: PetscInt *diag = a->diag,ii,ic,ir,oidx;
905: MatScalar *aa=a->a,*v;
906: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
907: PetscScalar *x,*b,*t;
910: VecGetArray(bb,&b);
911: VecGetArray(xx,&x);
912: t = a->solve_work;
914: ISGetIndices(isrow,&rout); r = rout;
915: ISGetIndices(iscol,&cout); c = cout;
917: /* copy the b into temp work space according to permutation */
918: ii = 0;
919: for (i=0; i<n; i++) {
920: ic = 6*c[i];
921: t[ii] = b[ic];
922: t[ii+1] = b[ic+1];
923: t[ii+2] = b[ic+2];
924: t[ii+3] = b[ic+3];
925: t[ii+4] = b[ic+4];
926: t[ii+5] = b[ic+5];
927: ii += 6;
928: }
930: /* forward solve the U^T */
931: idx = 0;
932: for (i=0; i<n; i++) {
934: v = aa + 36*diag[i];
935: /* multiply by the inverse of the block diagonal */
936: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
937: x6 = t[5+idx];
938: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
939: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
940: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
941: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
942: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
943: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
944: v += 36;
946: vi = aj + diag[i] + 1;
947: nz = ai[i+1] - diag[i] - 1;
948: while (nz--) {
949: oidx = 6*(*vi++);
950: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
951: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
952: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
953: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
954: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
955: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
956: v += 36;
957: }
958: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
959: t[5+idx] = s6;
960: idx += 6;
961: }
962: /* backward solve the L^T */
963: for (i=n-1; i>=0; i--){
964: v = aa + 36*diag[i] - 36;
965: vi = aj + diag[i] - 1;
966: nz = diag[i] - ai[i];
967: idt = 6*i;
968: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
969: s6 = t[5+idt];
970: while (nz--) {
971: idx = 6*(*vi--);
972: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
973: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
974: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
975: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
976: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
977: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
978: v -= 36;
979: }
980: }
982: /* copy t into x according to permutation */
983: ii = 0;
984: for (i=0; i<n; i++) {
985: ir = 6*r[i];
986: x[ir] = t[ii];
987: x[ir+1] = t[ii+1];
988: x[ir+2] = t[ii+2];
989: x[ir+3] = t[ii+3];
990: x[ir+4] = t[ii+4];
991: x[ir+5] = t[ii+5];
992: ii += 6;
993: }
995: ISRestoreIndices(isrow,&rout);
996: ISRestoreIndices(iscol,&cout);
997: VecRestoreArray(bb,&b);
998: VecRestoreArray(xx,&x);
999: PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);
1000: return(0);
1001: }
1005: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1006: {
1007: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1008: IS iscol=a->col,isrow=a->row;
1010: const PetscInt *r,*c,*rout,*cout;
1011: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1012: PetscInt *diag = a->diag,ii,ic,ir,oidx;
1013: MatScalar *aa=a->a,*v;
1014: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1015: PetscScalar *x,*b,*t;
1018: VecGetArray(bb,&b);
1019: VecGetArray(xx,&x);
1020: t = a->solve_work;
1022: ISGetIndices(isrow,&rout); r = rout;
1023: ISGetIndices(iscol,&cout); c = cout;
1025: /* copy the b into temp work space according to permutation */
1026: ii = 0;
1027: for (i=0; i<n; i++) {
1028: ic = 7*c[i];
1029: t[ii] = b[ic];
1030: t[ii+1] = b[ic+1];
1031: t[ii+2] = b[ic+2];
1032: t[ii+3] = b[ic+3];
1033: t[ii+4] = b[ic+4];
1034: t[ii+5] = b[ic+5];
1035: t[ii+6] = b[ic+6];
1036: ii += 7;
1037: }
1039: /* forward solve the U^T */
1040: idx = 0;
1041: for (i=0; i<n; i++) {
1043: v = aa + 49*diag[i];
1044: /* multiply by the inverse of the block diagonal */
1045: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1046: x6 = t[5+idx]; x7 = t[6+idx];
1047: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1048: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1049: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1050: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1051: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1052: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1053: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1054: v += 49;
1056: vi = aj + diag[i] + 1;
1057: nz = ai[i+1] - diag[i] - 1;
1058: while (nz--) {
1059: oidx = 7*(*vi++);
1060: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1061: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1062: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1063: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1064: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1065: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1066: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1067: v += 49;
1068: }
1069: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1070: t[5+idx] = s6;t[6+idx] = s7;
1071: idx += 7;
1072: }
1073: /* backward solve the L^T */
1074: for (i=n-1; i>=0; i--){
1075: v = aa + 49*diag[i] - 49;
1076: vi = aj + diag[i] - 1;
1077: nz = diag[i] - ai[i];
1078: idt = 7*i;
1079: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1080: s6 = t[5+idt];s7 = t[6+idt];
1081: while (nz--) {
1082: idx = 7*(*vi--);
1083: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1084: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1085: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1086: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1087: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1088: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1089: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1090: v -= 49;
1091: }
1092: }
1094: /* copy t into x according to permutation */
1095: ii = 0;
1096: for (i=0; i<n; i++) {
1097: ir = 7*r[i];
1098: x[ir] = t[ii];
1099: x[ir+1] = t[ii+1];
1100: x[ir+2] = t[ii+2];
1101: x[ir+3] = t[ii+3];
1102: x[ir+4] = t[ii+4];
1103: x[ir+5] = t[ii+5];
1104: x[ir+6] = t[ii+6];
1105: ii += 7;
1106: }
1108: ISRestoreIndices(isrow,&rout);
1109: ISRestoreIndices(iscol,&cout);
1110: VecRestoreArray(bb,&b);
1111: VecRestoreArray(xx,&x);
1112: PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);
1113: return(0);
1114: }
1116: /* ----------------------------------------------------------- */
1119: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1120: {
1121: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1122: IS iscol=a->col,isrow=a->row;
1124: const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1125: PetscInt i,n=a->mbs;
1126: PetscInt nz,bs=A->rmap->bs,bs2=a->bs2;
1127: MatScalar *aa=a->a,*v;
1128: PetscScalar *x,*b,*s,*t,*ls;
1131: VecGetArray(bb,&b);
1132: VecGetArray(xx,&x);
1133: t = a->solve_work;
1135: ISGetIndices(isrow,&rout); r = rout;
1136: ISGetIndices(iscol,&cout); c = cout + (n-1);
1138: /* forward solve the lower triangular */
1139: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1140: for (i=1; i<n; i++) {
1141: v = aa + bs2*ai[i];
1142: vi = aj + ai[i];
1143: nz = a->diag[i] - ai[i];
1144: s = t + bs*i;
1145: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1146: while (nz--) {
1147: Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1148: v += bs2;
1149: }
1150: }
1151: /* backward solve the upper triangular */
1152: ls = a->solve_work + A->cmap->n;
1153: for (i=n-1; i>=0; i--){
1154: v = aa + bs2*(a->diag[i] + 1);
1155: vi = aj + a->diag[i] + 1;
1156: nz = ai[i+1] - a->diag[i] - 1;
1157: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1158: while (nz--) {
1159: Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1160: v += bs2;
1161: }
1162: Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1163: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1164: }
1166: ISRestoreIndices(isrow,&rout);
1167: ISRestoreIndices(iscol,&cout);
1168: VecRestoreArray(bb,&b);
1169: VecRestoreArray(xx,&x);
1170: PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
1171: return(0);
1172: }
1176: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1177: {
1178: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1179: IS iscol=a->col,isrow=a->row;
1181: const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1182: PetscInt i,n=a->mbs,nz,idx,idt,idc;
1183: MatScalar *aa=a->a,*v;
1184: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1185: PetscScalar *x,*b,*t;
1188: VecGetArray(bb,&b);
1189: VecGetArray(xx,&x);
1190: t = a->solve_work;
1192: ISGetIndices(isrow,&rout); r = rout;
1193: ISGetIndices(iscol,&cout); c = cout + (n-1);
1195: /* forward solve the lower triangular */
1196: idx = 7*(*r++);
1197: t[0] = b[idx]; t[1] = b[1+idx];
1198: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1199: t[5] = b[5+idx]; t[6] = b[6+idx];
1201: for (i=1; i<n; i++) {
1202: v = aa + 49*ai[i];
1203: vi = aj + ai[i];
1204: nz = diag[i] - ai[i];
1205: idx = 7*(*r++);
1206: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1207: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1208: while (nz--) {
1209: idx = 7*(*vi++);
1210: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1211: x4 = t[3+idx];x5 = t[4+idx];
1212: x6 = t[5+idx];x7 = t[6+idx];
1213: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1214: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1215: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1216: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1217: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1218: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1219: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1220: v += 49;
1221: }
1222: idx = 7*i;
1223: t[idx] = s1;t[1+idx] = s2;
1224: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1225: t[5+idx] = s6;t[6+idx] = s7;
1226: }
1227: /* backward solve the upper triangular */
1228: for (i=n-1; i>=0; i--){
1229: v = aa + 49*diag[i] + 49;
1230: vi = aj + diag[i] + 1;
1231: nz = ai[i+1] - diag[i] - 1;
1232: idt = 7*i;
1233: s1 = t[idt]; s2 = t[1+idt];
1234: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1235: s6 = t[5+idt];s7 = t[6+idt];
1236: while (nz--) {
1237: idx = 7*(*vi++);
1238: x1 = t[idx]; x2 = t[1+idx];
1239: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1240: x6 = t[5+idx]; x7 = t[6+idx];
1241: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1242: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1243: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1244: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1245: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1246: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1247: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1248: v += 49;
1249: }
1250: idc = 7*(*c--);
1251: v = aa + 49*diag[i];
1252: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
1253: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1254: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1255: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1256: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1257: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1258: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1259: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1260: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1261: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1262: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1263: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1264: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1265: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1266: }
1268: ISRestoreIndices(isrow,&rout);
1269: ISRestoreIndices(iscol,&cout);
1270: VecRestoreArray(bb,&b);
1271: VecRestoreArray(xx,&x);
1272: PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);
1273: return(0);
1274: }
1278: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1279: {
1280: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1281: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1282: PetscErrorCode ierr;
1283: PetscInt *diag = a->diag,jdx;
1284: const MatScalar *aa=a->a,*v;
1285: PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1286: const PetscScalar *b;
1289: VecGetArray(bb,(PetscScalar**)&b);
1290: VecGetArray(xx,&x);
1291: /* forward solve the lower triangular */
1292: idx = 0;
1293: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1294: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1295: x[6] = b[6+idx];
1296: for (i=1; i<n; i++) {
1297: v = aa + 49*ai[i];
1298: vi = aj + ai[i];
1299: nz = diag[i] - ai[i];
1300: idx = 7*i;
1301: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1302: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1303: s7 = b[6+idx];
1304: while (nz--) {
1305: jdx = 7*(*vi++);
1306: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1307: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1308: x7 = x[6+jdx];
1309: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1310: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1311: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1312: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1313: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1314: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1315: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1316: v += 49;
1317: }
1318: x[idx] = s1;
1319: x[1+idx] = s2;
1320: x[2+idx] = s3;
1321: x[3+idx] = s4;
1322: x[4+idx] = s5;
1323: x[5+idx] = s6;
1324: x[6+idx] = s7;
1325: }
1326: /* backward solve the upper triangular */
1327: for (i=n-1; i>=0; i--){
1328: v = aa + 49*diag[i] + 49;
1329: vi = aj + diag[i] + 1;
1330: nz = ai[i+1] - diag[i] - 1;
1331: idt = 7*i;
1332: s1 = x[idt]; s2 = x[1+idt];
1333: s3 = x[2+idt]; s4 = x[3+idt];
1334: s5 = x[4+idt]; s6 = x[5+idt];
1335: s7 = x[6+idt];
1336: while (nz--) {
1337: idx = 7*(*vi++);
1338: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1339: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1340: x7 = x[6+idx];
1341: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1342: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1343: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1344: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1345: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1346: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1347: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1348: v += 49;
1349: }
1350: v = aa + 49*diag[i];
1351: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
1352: + v[28]*s5 + v[35]*s6 + v[42]*s7;
1353: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
1354: + v[29]*s5 + v[36]*s6 + v[43]*s7;
1355: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
1356: + v[30]*s5 + v[37]*s6 + v[44]*s7;
1357: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
1358: + v[31]*s5 + v[38]*s6 + v[45]*s7;
1359: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
1360: + v[32]*s5 + v[39]*s6 + v[46]*s7;
1361: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
1362: + v[33]*s5 + v[40]*s6 + v[47]*s7;
1363: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
1364: + v[34]*s5 + v[41]*s6 + v[48]*s7;
1365: }
1367: VecRestoreArray(bb,(PetscScalar**)&b);
1368: VecRestoreArray(xx,&x);
1369: PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);
1370: return(0);
1371: }
1375: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1376: {
1377: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1378: IS iscol=a->col,isrow=a->row;
1379: PetscErrorCode ierr;
1380: const PetscInt *r,*c,*rout,*cout;
1381: PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1382: const MatScalar *aa=a->a,*v;
1383: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1384: const PetscScalar *b;
1386: VecGetArray(bb,(PetscScalar**)&b);
1387: VecGetArray(xx,&x);
1388: t = a->solve_work;
1390: ISGetIndices(isrow,&rout); r = rout;
1391: ISGetIndices(iscol,&cout); c = cout + (n-1);
1393: /* forward solve the lower triangular */
1394: idx = 6*(*r++);
1395: t[0] = b[idx]; t[1] = b[1+idx];
1396: t[2] = b[2+idx]; t[3] = b[3+idx];
1397: t[4] = b[4+idx]; t[5] = b[5+idx];
1398: for (i=1; i<n; i++) {
1399: v = aa + 36*ai[i];
1400: vi = aj + ai[i];
1401: nz = diag[i] - ai[i];
1402: idx = 6*(*r++);
1403: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1404: s5 = b[4+idx]; s6 = b[5+idx];
1405: while (nz--) {
1406: idx = 6*(*vi++);
1407: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1408: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1409: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415: v += 36;
1416: }
1417: idx = 6*i;
1418: t[idx] = s1;t[1+idx] = s2;
1419: t[2+idx] = s3;t[3+idx] = s4;
1420: t[4+idx] = s5;t[5+idx] = s6;
1421: }
1422: /* backward solve the upper triangular */
1423: for (i=n-1; i>=0; i--){
1424: v = aa + 36*diag[i] + 36;
1425: vi = aj + diag[i] + 1;
1426: nz = ai[i+1] - diag[i] - 1;
1427: idt = 6*i;
1428: s1 = t[idt]; s2 = t[1+idt];
1429: s3 = t[2+idt];s4 = t[3+idt];
1430: s5 = t[4+idt];s6 = t[5+idt];
1431: while (nz--) {
1432: idx = 6*(*vi++);
1433: x1 = t[idx]; x2 = t[1+idx];
1434: x3 = t[2+idx]; x4 = t[3+idx];
1435: x5 = t[4+idx]; x6 = t[5+idx];
1436: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1437: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1438: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1439: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1440: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1441: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1442: v += 36;
1443: }
1444: idc = 6*(*c--);
1445: v = aa + 36*diag[i];
1446: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
1447: v[18]*s4+v[24]*s5+v[30]*s6;
1448: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1449: v[19]*s4+v[25]*s5+v[31]*s6;
1450: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1451: v[20]*s4+v[26]*s5+v[32]*s6;
1452: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1453: v[21]*s4+v[27]*s5+v[33]*s6;
1454: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1455: v[22]*s4+v[28]*s5+v[34]*s6;
1456: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1457: v[23]*s4+v[29]*s5+v[35]*s6;
1458: }
1460: ISRestoreIndices(isrow,&rout);
1461: ISRestoreIndices(iscol,&cout);
1462: VecRestoreArray(bb,(PetscScalar**)&b);
1463: VecRestoreArray(xx,&x);
1464: PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);
1465: return(0);
1466: }
1470: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1471: {
1472: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1473: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1474: PetscErrorCode ierr;
1475: PetscInt *diag = a->diag,jdx;
1476: const MatScalar *aa=a->a,*v;
1477: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1478: const PetscScalar *b;
1481: VecGetArray(bb,(PetscScalar**)&b);
1482: VecGetArray(xx,&x);
1483: /* forward solve the lower triangular */
1484: idx = 0;
1485: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1486: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1487: for (i=1; i<n; i++) {
1488: v = aa + 36*ai[i];
1489: vi = aj + ai[i];
1490: nz = diag[i] - ai[i];
1491: idx = 6*i;
1492: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1493: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1494: while (nz--) {
1495: jdx = 6*(*vi++);
1496: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1497: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1498: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1499: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1500: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1501: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1502: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1503: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1504: v += 36;
1505: }
1506: x[idx] = s1;
1507: x[1+idx] = s2;
1508: x[2+idx] = s3;
1509: x[3+idx] = s4;
1510: x[4+idx] = s5;
1511: x[5+idx] = s6;
1512: }
1513: /* backward solve the upper triangular */
1514: for (i=n-1; i>=0; i--){
1515: v = aa + 36*diag[i] + 36;
1516: vi = aj + diag[i] + 1;
1517: nz = ai[i+1] - diag[i] - 1;
1518: idt = 6*i;
1519: s1 = x[idt]; s2 = x[1+idt];
1520: s3 = x[2+idt]; s4 = x[3+idt];
1521: s5 = x[4+idt]; s6 = x[5+idt];
1522: while (nz--) {
1523: idx = 6*(*vi++);
1524: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1525: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1526: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1527: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1528: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1529: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1530: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1531: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1532: v += 36;
1533: }
1534: v = aa + 36*diag[i];
1535: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1536: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1537: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1538: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1539: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1540: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1541: }
1543: VecRestoreArray(bb,(PetscScalar**)&b);
1544: VecRestoreArray(xx,&x);
1545: PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);
1546: return(0);
1547: }
1551: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1552: {
1553: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1554: IS iscol=a->col,isrow=a->row;
1555: PetscErrorCode ierr;
1556: const PetscInt *r,*c,*rout,*cout,*diag = a->diag;
1557: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1558: const MatScalar *aa=a->a,*v;
1559: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1560: const PetscScalar *b;
1563: VecGetArray(bb,(PetscScalar**)&b);
1564: VecGetArray(xx,&x);
1565: t = a->solve_work;
1567: ISGetIndices(isrow,&rout); r = rout;
1568: ISGetIndices(iscol,&cout); c = cout + (n-1);
1570: /* forward solve the lower triangular */
1571: idx = 5*(*r++);
1572: t[0] = b[idx]; t[1] = b[1+idx];
1573: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1574: for (i=1; i<n; i++) {
1575: v = aa + 25*ai[i];
1576: vi = aj + ai[i];
1577: nz = diag[i] - ai[i];
1578: idx = 5*(*r++);
1579: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1580: s5 = b[4+idx];
1581: while (nz--) {
1582: idx = 5*(*vi++);
1583: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1584: x4 = t[3+idx];x5 = t[4+idx];
1585: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1586: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1587: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1588: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1589: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1590: v += 25;
1591: }
1592: idx = 5*i;
1593: t[idx] = s1;t[1+idx] = s2;
1594: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1595: }
1596: /* backward solve the upper triangular */
1597: for (i=n-1; i>=0; i--){
1598: v = aa + 25*diag[i] + 25;
1599: vi = aj + diag[i] + 1;
1600: nz = ai[i+1] - diag[i] - 1;
1601: idt = 5*i;
1602: s1 = t[idt]; s2 = t[1+idt];
1603: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1604: while (nz--) {
1605: idx = 5*(*vi++);
1606: x1 = t[idx]; x2 = t[1+idx];
1607: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1608: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1609: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1610: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1611: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1612: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1613: v += 25;
1614: }
1615: idc = 5*(*c--);
1616: v = aa + 25*diag[i];
1617: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
1618: v[15]*s4+v[20]*s5;
1619: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1620: v[16]*s4+v[21]*s5;
1621: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1622: v[17]*s4+v[22]*s5;
1623: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1624: v[18]*s4+v[23]*s5;
1625: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1626: v[19]*s4+v[24]*s5;
1627: }
1629: ISRestoreIndices(isrow,&rout);
1630: ISRestoreIndices(iscol,&cout);
1631: VecRestoreArray(bb,(PetscScalar**)&b);
1632: VecRestoreArray(xx,&x);
1633: PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);
1634: return(0);
1635: }
1639: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1640: {
1641: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1642: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1643: PetscErrorCode ierr;
1644: PetscInt *diag = a->diag,jdx;
1645: const MatScalar *aa=a->a,*v;
1646: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1647: const PetscScalar *b;
1650: VecGetArray(bb,(PetscScalar**)&b);
1651: VecGetArray(xx,&x);
1652: /* forward solve the lower triangular */
1653: idx = 0;
1654: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1655: for (i=1; i<n; i++) {
1656: v = aa + 25*ai[i];
1657: vi = aj + ai[i];
1658: nz = diag[i] - ai[i];
1659: idx = 5*i;
1660: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1661: while (nz--) {
1662: jdx = 5*(*vi++);
1663: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1664: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1665: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1666: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1667: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1668: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1669: v += 25;
1670: }
1671: x[idx] = s1;
1672: x[1+idx] = s2;
1673: x[2+idx] = s3;
1674: x[3+idx] = s4;
1675: x[4+idx] = s5;
1676: }
1677: /* backward solve the upper triangular */
1678: for (i=n-1; i>=0; i--){
1679: v = aa + 25*diag[i] + 25;
1680: vi = aj + diag[i] + 1;
1681: nz = ai[i+1] - diag[i] - 1;
1682: idt = 5*i;
1683: s1 = x[idt]; s2 = x[1+idt];
1684: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1685: while (nz--) {
1686: idx = 5*(*vi++);
1687: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1688: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1689: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1690: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1691: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1692: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1693: v += 25;
1694: }
1695: v = aa + 25*diag[i];
1696: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1697: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1698: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1699: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1700: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1701: }
1703: VecRestoreArray(bb,(PetscScalar**)&b);
1704: VecRestoreArray(xx,&x);
1705: PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);
1706: return(0);
1707: }
1711: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1712: {
1713: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1714: IS iscol=a->col,isrow=a->row;
1715: PetscErrorCode ierr;
1716: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1717: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1718: const MatScalar *aa=a->a,*v;
1719: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1720: const PetscScalar *b;
1723: VecGetArray(bb,(PetscScalar**)&b);
1724: VecGetArray(xx,&x);
1725: t = a->solve_work;
1727: ISGetIndices(isrow,&rout); r = rout;
1728: ISGetIndices(iscol,&cout); c = cout + (n-1);
1730: /* forward solve the lower triangular */
1731: idx = 4*(*r++);
1732: t[0] = b[idx]; t[1] = b[1+idx];
1733: t[2] = b[2+idx]; t[3] = b[3+idx];
1734: for (i=1; i<n; i++) {
1735: v = aa + 16*ai[i];
1736: vi = aj + ai[i];
1737: nz = diag[i] - ai[i];
1738: idx = 4*(*r++);
1739: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1740: while (nz--) {
1741: idx = 4*(*vi++);
1742: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1743: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1744: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1745: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1746: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1747: v += 16;
1748: }
1749: idx = 4*i;
1750: t[idx] = s1;t[1+idx] = s2;
1751: t[2+idx] = s3;t[3+idx] = s4;
1752: }
1753: /* backward solve the upper triangular */
1754: for (i=n-1; i>=0; i--){
1755: v = aa + 16*diag[i] + 16;
1756: vi = aj + diag[i] + 1;
1757: nz = ai[i+1] - diag[i] - 1;
1758: idt = 4*i;
1759: s1 = t[idt]; s2 = t[1+idt];
1760: s3 = t[2+idt];s4 = t[3+idt];
1761: while (nz--) {
1762: idx = 4*(*vi++);
1763: x1 = t[idx]; x2 = t[1+idx];
1764: x3 = t[2+idx]; x4 = t[3+idx];
1765: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1766: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1767: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1768: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1769: v += 16;
1770: }
1771: idc = 4*(*c--);
1772: v = aa + 16*diag[i];
1773: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1774: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1775: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1776: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1777: }
1779: ISRestoreIndices(isrow,&rout);
1780: ISRestoreIndices(iscol,&cout);
1781: VecRestoreArray(bb,(PetscScalar**)&b);
1782: VecRestoreArray(xx,&x);
1783: PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
1784: return(0);
1785: }
1789: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1790: {
1791: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1792: IS iscol=a->col,isrow=a->row;
1793: PetscErrorCode ierr;
1794: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1795: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1796: const MatScalar *aa=a->a,*v;
1797: MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t;
1798: PetscScalar *x;
1799: const PetscScalar *b;
1802: VecGetArray(bb,(PetscScalar**)&b);
1803: VecGetArray(xx,&x);
1804: t = (MatScalar *)a->solve_work;
1806: ISGetIndices(isrow,&rout); r = rout;
1807: ISGetIndices(iscol,&cout); c = cout + (n-1);
1809: /* forward solve the lower triangular */
1810: idx = 4*(*r++);
1811: t[0] = (MatScalar)b[idx];
1812: t[1] = (MatScalar)b[1+idx];
1813: t[2] = (MatScalar)b[2+idx];
1814: t[3] = (MatScalar)b[3+idx];
1815: for (i=1; i<n; i++) {
1816: v = aa + 16*ai[i];
1817: vi = aj + ai[i];
1818: nz = diag[i] - ai[i];
1819: idx = 4*(*r++);
1820: s1 = (MatScalar)b[idx];
1821: s2 = (MatScalar)b[1+idx];
1822: s3 = (MatScalar)b[2+idx];
1823: s4 = (MatScalar)b[3+idx];
1824: while (nz--) {
1825: idx = 4*(*vi++);
1826: x1 = t[idx];
1827: x2 = t[1+idx];
1828: x3 = t[2+idx];
1829: x4 = t[3+idx];
1830: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1831: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1832: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1833: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1834: v += 16;
1835: }
1836: idx = 4*i;
1837: t[idx] = s1;
1838: t[1+idx] = s2;
1839: t[2+idx] = s3;
1840: t[3+idx] = s4;
1841: }
1842: /* backward solve the upper triangular */
1843: for (i=n-1; i>=0; i--){
1844: v = aa + 16*diag[i] + 16;
1845: vi = aj + diag[i] + 1;
1846: nz = ai[i+1] - diag[i] - 1;
1847: idt = 4*i;
1848: s1 = t[idt];
1849: s2 = t[1+idt];
1850: s3 = t[2+idt];
1851: s4 = t[3+idt];
1852: while (nz--) {
1853: idx = 4*(*vi++);
1854: x1 = t[idx];
1855: x2 = t[1+idx];
1856: x3 = t[2+idx];
1857: x4 = t[3+idx];
1858: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1859: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1860: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1861: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1862: v += 16;
1863: }
1864: idc = 4*(*c--);
1865: v = aa + 16*diag[i];
1866: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1867: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1868: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1869: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1870: x[idc] = (PetscScalar)t[idt];
1871: x[1+idc] = (PetscScalar)t[1+idt];
1872: x[2+idc] = (PetscScalar)t[2+idt];
1873: x[3+idc] = (PetscScalar)t[3+idt];
1874: }
1876: ISRestoreIndices(isrow,&rout);
1877: ISRestoreIndices(iscol,&cout);
1878: VecRestoreArray(bb,(PetscScalar**)&b);
1879: VecRestoreArray(xx,&x);
1880: PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
1881: return(0);
1882: }
1884: #if defined (PETSC_HAVE_SSE)
1886: #include PETSC_HAVE_SSE
1890: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1891: {
1892: /*
1893: Note: This code uses demotion of double
1894: to float when performing the mixed-mode computation.
1895: This may not be numerically reasonable for all applications.
1896: */
1897: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1898: IS iscol=a->col,isrow=a->row;
1900: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
1901: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1902: MatScalar *aa=a->a,*v;
1903: PetscScalar *x,*b,*t;
1905: /* Make space in temp stack for 16 Byte Aligned arrays */
1906: float ssealignedspace[11],*tmps,*tmpx;
1907: unsigned long offset;
1908:
1910: SSE_SCOPE_BEGIN;
1912: offset = (unsigned long)ssealignedspace % 16;
1913: if (offset) offset = (16 - offset)/4;
1914: tmps = &ssealignedspace[offset];
1915: tmpx = &ssealignedspace[offset+4];
1916: PREFETCH_NTA(aa+16*ai[1]);
1918: VecGetArray(bb,&b);
1919: VecGetArray(xx,&x);
1920: t = a->solve_work;
1922: ISGetIndices(isrow,&rout); r = rout;
1923: ISGetIndices(iscol,&cout); c = cout + (n-1);
1925: /* forward solve the lower triangular */
1926: idx = 4*(*r++);
1927: t[0] = b[idx]; t[1] = b[1+idx];
1928: t[2] = b[2+idx]; t[3] = b[3+idx];
1929: v = aa + 16*ai[1];
1931: for (i=1; i<n;) {
1932: PREFETCH_NTA(&v[8]);
1933: vi = aj + ai[i];
1934: nz = diag[i] - ai[i];
1935: idx = 4*(*r++);
1937: /* Demote sum from double to float */
1938: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1939: LOAD_PS(tmps,XMM7);
1941: while (nz--) {
1942: PREFETCH_NTA(&v[16]);
1943: idx = 4*(*vi++);
1944:
1945: /* Demote solution (so far) from double to float */
1946: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1948: /* 4x4 Matrix-Vector product with negative accumulation: */
1949: SSE_INLINE_BEGIN_2(tmpx,v)
1950: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1952: /* First Column */
1953: SSE_COPY_PS(XMM0,XMM6)
1954: SSE_SHUFFLE(XMM0,XMM0,0x00)
1955: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1956: SSE_SUB_PS(XMM7,XMM0)
1957:
1958: /* Second Column */
1959: SSE_COPY_PS(XMM1,XMM6)
1960: SSE_SHUFFLE(XMM1,XMM1,0x55)
1961: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1962: SSE_SUB_PS(XMM7,XMM1)
1963:
1964: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1965:
1966: /* Third Column */
1967: SSE_COPY_PS(XMM2,XMM6)
1968: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1969: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1970: SSE_SUB_PS(XMM7,XMM2)
1972: /* Fourth Column */
1973: SSE_COPY_PS(XMM3,XMM6)
1974: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1975: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1976: SSE_SUB_PS(XMM7,XMM3)
1977: SSE_INLINE_END_2
1978:
1979: v += 16;
1980: }
1981: idx = 4*i;
1982: v = aa + 16*ai[++i];
1983: PREFETCH_NTA(v);
1984: STORE_PS(tmps,XMM7);
1986: /* Promote result from float to double */
1987: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1988: }
1989: /* backward solve the upper triangular */
1990: idt = 4*(n-1);
1991: ai16 = 16*diag[n-1];
1992: v = aa + ai16 + 16;
1993: for (i=n-1; i>=0;){
1994: PREFETCH_NTA(&v[8]);
1995: vi = aj + diag[i] + 1;
1996: nz = ai[i+1] - diag[i] - 1;
1997:
1998: /* Demote accumulator from double to float */
1999: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
2000: LOAD_PS(tmps,XMM7);
2002: while (nz--) {
2003: PREFETCH_NTA(&v[16]);
2004: idx = 4*(*vi++);
2006: /* Demote solution (so far) from double to float */
2007: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2009: /* 4x4 Matrix-Vector Product with negative accumulation: */
2010: SSE_INLINE_BEGIN_2(tmpx,v)
2011: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2013: /* First Column */
2014: SSE_COPY_PS(XMM0,XMM6)
2015: SSE_SHUFFLE(XMM0,XMM0,0x00)
2016: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2017: SSE_SUB_PS(XMM7,XMM0)
2019: /* Second Column */
2020: SSE_COPY_PS(XMM1,XMM6)
2021: SSE_SHUFFLE(XMM1,XMM1,0x55)
2022: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2023: SSE_SUB_PS(XMM7,XMM1)
2025: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2026:
2027: /* Third Column */
2028: SSE_COPY_PS(XMM2,XMM6)
2029: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2030: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2031: SSE_SUB_PS(XMM7,XMM2)
2033: /* Fourth Column */
2034: SSE_COPY_PS(XMM3,XMM6)
2035: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2036: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2037: SSE_SUB_PS(XMM7,XMM3)
2038: SSE_INLINE_END_2
2039: v += 16;
2040: }
2041: v = aa + ai16;
2042: ai16 = 16*diag[--i];
2043: PREFETCH_NTA(aa+ai16+16);
2044: /*
2045: Scale the result by the diagonal 4x4 block,
2046: which was inverted as part of the factorization
2047: */
2048: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2049: /* First Column */
2050: SSE_COPY_PS(XMM0,XMM7)
2051: SSE_SHUFFLE(XMM0,XMM0,0x00)
2052: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2054: /* Second Column */
2055: SSE_COPY_PS(XMM1,XMM7)
2056: SSE_SHUFFLE(XMM1,XMM1,0x55)
2057: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2058: SSE_ADD_PS(XMM0,XMM1)
2060: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2061:
2062: /* Third Column */
2063: SSE_COPY_PS(XMM2,XMM7)
2064: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2065: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2066: SSE_ADD_PS(XMM0,XMM2)
2068: /* Fourth Column */
2069: SSE_COPY_PS(XMM3,XMM7)
2070: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2071: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2072: SSE_ADD_PS(XMM0,XMM3)
2073:
2074: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2075: SSE_INLINE_END_3
2077: /* Promote solution from float to double */
2078: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2080: /* Apply reordering to t and stream into x. */
2081: /* This way, x doesn't pollute the cache. */
2082: /* Be careful with size: 2 doubles = 4 floats! */
2083: idc = 4*(*c--);
2084: SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2085: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
2086: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2087: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2088: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2089: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2090: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2091: SSE_INLINE_END_2
2092: v = aa + ai16 + 16;
2093: idt -= 4;
2094: }
2096: ISRestoreIndices(isrow,&rout);
2097: ISRestoreIndices(iscol,&cout);
2098: VecRestoreArray(bb,&b);
2099: VecRestoreArray(xx,&x);
2100: PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
2101: SSE_SCOPE_END;
2102: return(0);
2103: }
2105: #endif
2108: /*
2109: Special case where the matrix was ILU(0) factored in the natural
2110: ordering. This eliminates the need for the column and row permutation.
2111: */
2114: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2115: {
2116: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2117: PetscInt n=a->mbs;
2118: const PetscInt *ai=a->i,*aj=a->j;
2119: PetscErrorCode ierr;
2120: const PetscInt *diag = a->diag;
2121: const MatScalar *aa=a->a;
2122: PetscScalar *x;
2123: const PetscScalar *b;
2126: VecGetArray(bb,(PetscScalar**)&b);
2127: VecGetArray(xx,&x);
2129: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2130: {
2131: static PetscScalar w[2000]; /* very BAD need to fix */
2132: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2133: }
2134: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2135: {
2136: static PetscScalar w[2000]; /* very BAD need to fix */
2137: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2138: }
2139: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2140: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2141: #else
2142: {
2143: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
2144: const MatScalar *v;
2145: PetscInt jdx,idt,idx,nz,i,ai16;
2146: const PetscInt *vi;
2148: /* forward solve the lower triangular */
2149: idx = 0;
2150: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2151: for (i=1; i<n; i++) {
2152: v = aa + 16*ai[i];
2153: vi = aj + ai[i];
2154: nz = diag[i] - ai[i];
2155: idx += 4;
2156: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2157: while (nz--) {
2158: jdx = 4*(*vi++);
2159: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2160: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2161: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2162: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2163: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2164: v += 16;
2165: }
2166: x[idx] = s1;
2167: x[1+idx] = s2;
2168: x[2+idx] = s3;
2169: x[3+idx] = s4;
2170: }
2171: /* backward solve the upper triangular */
2172: idt = 4*(n-1);
2173: for (i=n-1; i>=0; i--){
2174: ai16 = 16*diag[i];
2175: v = aa + ai16 + 16;
2176: vi = aj + diag[i] + 1;
2177: nz = ai[i+1] - diag[i] - 1;
2178: s1 = x[idt]; s2 = x[1+idt];
2179: s3 = x[2+idt];s4 = x[3+idt];
2180: while (nz--) {
2181: idx = 4*(*vi++);
2182: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
2183: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2184: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2185: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2186: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2187: v += 16;
2188: }
2189: v = aa + ai16;
2190: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2191: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
2192: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2193: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2194: idt -= 4;
2195: }
2196: }
2197: #endif
2199: VecRestoreArray(bb,(PetscScalar**)&b);
2200: VecRestoreArray(xx,&x);
2201: PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
2202: return(0);
2203: }
2207: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2208: {
2209: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2210: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2212: PetscInt *diag = a->diag;
2213: MatScalar *aa=a->a;
2214: PetscScalar *x,*b;
2217: VecGetArray(bb,&b);
2218: VecGetArray(xx,&x);
2220: {
2221: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
2222: MatScalar *v,*t=(MatScalar *)x;
2223: PetscInt jdx,idt,idx,nz,*vi,i,ai16;
2225: /* forward solve the lower triangular */
2226: idx = 0;
2227: t[0] = (MatScalar)b[0];
2228: t[1] = (MatScalar)b[1];
2229: t[2] = (MatScalar)b[2];
2230: t[3] = (MatScalar)b[3];
2231: for (i=1; i<n; i++) {
2232: v = aa + 16*ai[i];
2233: vi = aj + ai[i];
2234: nz = diag[i] - ai[i];
2235: idx += 4;
2236: s1 = (MatScalar)b[idx];
2237: s2 = (MatScalar)b[1+idx];
2238: s3 = (MatScalar)b[2+idx];
2239: s4 = (MatScalar)b[3+idx];
2240: while (nz--) {
2241: jdx = 4*(*vi++);
2242: x1 = t[jdx];
2243: x2 = t[1+jdx];
2244: x3 = t[2+jdx];
2245: x4 = t[3+jdx];
2246: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2247: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2248: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2249: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2250: v += 16;
2251: }
2252: t[idx] = s1;
2253: t[1+idx] = s2;
2254: t[2+idx] = s3;
2255: t[3+idx] = s4;
2256: }
2257: /* backward solve the upper triangular */
2258: idt = 4*(n-1);
2259: for (i=n-1; i>=0; i--){
2260: ai16 = 16*diag[i];
2261: v = aa + ai16 + 16;
2262: vi = aj + diag[i] + 1;
2263: nz = ai[i+1] - diag[i] - 1;
2264: s1 = t[idt];
2265: s2 = t[1+idt];
2266: s3 = t[2+idt];
2267: s4 = t[3+idt];
2268: while (nz--) {
2269: idx = 4*(*vi++);
2270: x1 = (MatScalar)x[idx];
2271: x2 = (MatScalar)x[1+idx];
2272: x3 = (MatScalar)x[2+idx];
2273: x4 = (MatScalar)x[3+idx];
2274: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2275: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2276: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2277: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2278: v += 16;
2279: }
2280: v = aa + ai16;
2281: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
2282: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
2283: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2284: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2285: idt -= 4;
2286: }
2287: }
2289: VecRestoreArray(bb,&b);
2290: VecRestoreArray(xx,&x);
2291: PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
2292: return(0);
2293: }
2295: #if defined (PETSC_HAVE_SSE)
2297: #include PETSC_HAVE_SSE
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj - forward/backward
   triangular solve for block size 4, written with the SSE_* inline macros.

   A  - factored matrix (Mat_SeqBAIJ data: a->i, a->j, a->diag, a->a)
   bb - right-hand side
   xx - solution

   The block column indices a->j are read through an unsigned short pointer
   and the offsets derived from them are unsigned int.

   The solution is first built in MatScalar precision in t, which views the
   storage of x, and is widened to PetscScalar in place by the final loop.
   XMM7 carries the running 4-vector of the current block row.

   NOTE(review): ai[1] and diag[n-1] are read unconditionally, so this
   presumably requires n > 0 -- confirm against the callers.
   NOTE(review): presumably a->j must really hold unsigned short entries
   when this routine is selected -- confirm where it is installed.
*/
2300: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2301: {
2302: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2303: unsigned short *aj=(unsigned short *)a->j;
2305: int *ai=a->i,n=a->mbs,*diag = a->diag;
2306: MatScalar *aa=a->a;
2307: PetscScalar *x,*b;
2310: SSE_SCOPE_BEGIN;
2311: /*
2312: Note: This code currently uses demotion of double
2313: to float when performing the mixed-mode computation.
2314: This may not be numerically reasonable for all applications.
2315: */
2316: PREFETCH_NTA(aa+16*ai[1]);
2318: VecGetArray(bb,&b);
2319: VecGetArray(xx,&x);
2320: {
2321: /* x will first be computed in single precision then promoted inplace to double */
2322: MatScalar *v,*t=(MatScalar *)x;
2323: int nz,i,idt,ai16;
2324: unsigned int jdx,idx;
2325: unsigned short *vi;
2326: /* Forward solve the lower triangular factor. */
2328: /* First block is the identity. */
2329: idx = 0;
2330: CONVERT_DOUBLE4_FLOAT4(t,b);
2331: v = aa + 16*((unsigned int)ai[1]);
/* The loop header has no increment: i is advanced by the ++i at the bottom of the body. */
2333: for (i=1; i<n;) {
2334: PREFETCH_NTA(&v[8]);
2335: vi = aj + ai[i];
2336: nz = diag[i] - ai[i];
2337: idx += 4;
2339: /* Demote RHS from double to float. */
2340: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2341: LOAD_PS(&t[idx],XMM7);
2343: while (nz--) {
2344: PREFETCH_NTA(&v[16]);
2345: jdx = 4*((unsigned int)(*vi++));
2346:
2347: /* 4x4 Matrix-Vector product with negative accumulation: */
2348: SSE_INLINE_BEGIN_2(&t[jdx],v)
2349: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2351: /* First Column */
2352: SSE_COPY_PS(XMM0,XMM6)
2353: SSE_SHUFFLE(XMM0,XMM0,0x00)
2354: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2355: SSE_SUB_PS(XMM7,XMM0)
2357: /* Second Column */
2358: SSE_COPY_PS(XMM1,XMM6)
2359: SSE_SHUFFLE(XMM1,XMM1,0x55)
2360: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2361: SSE_SUB_PS(XMM7,XMM1)
2363: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2364:
2365: /* Third Column */
2366: SSE_COPY_PS(XMM2,XMM6)
2367: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2368: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2369: SSE_SUB_PS(XMM7,XMM2)
2371: /* Fourth Column */
2372: SSE_COPY_PS(XMM3,XMM6)
2373: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2374: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2375: SSE_SUB_PS(XMM7,XMM3)
2376: SSE_INLINE_END_2
2377:
2378: v += 16;
2379: }
/* Step to the next block row; on the last pass this reads ai[n]. */
2380: v = aa + 16*ai[++i];
2381: PREFETCH_NTA(v);
2382: STORE_PS(&t[idx],XMM7);
2383: }
2385: /* Backward solve the upper triangular factor.*/
2387: idt = 4*(n-1);
2388: ai16 = 16*diag[n-1];
2389: v = aa + ai16 + 16;
/* The loop header has no decrement: i is stepped by the --i inside the body. */
2390: for (i=n-1; i>=0;){
2391: PREFETCH_NTA(&v[8]);
2392: vi = aj + diag[i] + 1;
2393: nz = ai[i+1] - diag[i] - 1;
2394:
2395: LOAD_PS(&t[idt],XMM7);
2397: while (nz--) {
2398: PREFETCH_NTA(&v[16]);
2399: idx = 4*((unsigned int)(*vi++));
2401: /* 4x4 Matrix-Vector Product with negative accumulation: */
2402: SSE_INLINE_BEGIN_2(&t[idx],v)
2403: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2405: /* First Column */
2406: SSE_COPY_PS(XMM0,XMM6)
2407: SSE_SHUFFLE(XMM0,XMM0,0x00)
2408: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2409: SSE_SUB_PS(XMM7,XMM0)
2411: /* Second Column */
2412: SSE_COPY_PS(XMM1,XMM6)
2413: SSE_SHUFFLE(XMM1,XMM1,0x55)
2414: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2415: SSE_SUB_PS(XMM7,XMM1)
2417: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2418:
2419: /* Third Column */
2420: SSE_COPY_PS(XMM2,XMM6)
2421: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2422: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2423: SSE_SUB_PS(XMM7,XMM2)
2425: /* Fourth Column */
2426: SSE_COPY_PS(XMM3,XMM6)
2427: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2428: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2429: SSE_SUB_PS(XMM7,XMM3)
2430: SSE_INLINE_END_2
2431: v += 16;
2432: }
/* v now addresses the diagonal block of the current row; ai16 is then moved on to the previous row. */
/* NOTE(review): on the last pass (i == 0) the --i below yields -1, so diag[-1] is read. */
/* That value only feeds the prefetch addresses and the final update of v, which is not used after the loop -- confirm the read is acceptable. */
2433: v = aa + ai16;
2434: ai16 = 16*diag[--i];
2435: PREFETCH_NTA(aa+ai16+16);
2436: /*
2437: Scale the result by the diagonal 4x4 block,
2438: which was inverted as part of the factorization
2439: */
2440: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2441: /* First Column */
2442: SSE_COPY_PS(XMM0,XMM7)
2443: SSE_SHUFFLE(XMM0,XMM0,0x00)
2444: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2446: /* Second Column */
2447: SSE_COPY_PS(XMM1,XMM7)
2448: SSE_SHUFFLE(XMM1,XMM1,0x55)
2449: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2450: SSE_ADD_PS(XMM0,XMM1)
2452: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2453:
2454: /* Third Column */
2455: SSE_COPY_PS(XMM2,XMM7)
2456: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2457: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2458: SSE_ADD_PS(XMM0,XMM2)
2460: /* Fourth Column */
2461: SSE_COPY_PS(XMM3,XMM7)
2462: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2463: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2464: SSE_ADD_PS(XMM0,XMM3)
2466: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2467: SSE_INLINE_END_3
2469: v = aa + ai16 + 16;
2470: idt -= 4;
2471: }
2473: /* Convert t from single precision back to double precision (inplace)*/
/* Blocks are visited last to first and entries 3 down to 0; presumably so that each wider store into x only covers entries of t that were already read. */
2474: idt = 4*(n-1);
2475: for (i=n-1;i>=0;i--) {
2476: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2477: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2478: PetscScalar *xtemp=&x[idt];
2479: MatScalar *ttemp=&t[idt];
2480: xtemp[3] = (PetscScalar)ttemp[3];
2481: xtemp[2] = (PetscScalar)ttemp[2];
2482: xtemp[1] = (PetscScalar)ttemp[1];
2483: xtemp[0] = (PetscScalar)ttemp[0];
2484: idt -= 4;
2485: }
2487: } /* End of artificial scope. */
2488: VecRestoreArray(bb,&b);
2489: VecRestoreArray(xx,&x);
2490: PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
2491: SSE_SCOPE_END;
2492: return(0);
2493: }
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - forward/backward
   triangular solve for block size 4, written with the SSE_* inline macros.

   A  - factored matrix (Mat_SeqBAIJ data: a->i, a->j, a->diag, a->a)
   bb - right-hand side
   xx - solution

   The index arrays a->i, a->j and a->diag are read through plain int
   pointers.

   The solution is first built in MatScalar precision in t, which views the
   storage of x, and is widened to PetscScalar in place by the final loop.
   XMM7 carries the running 4-vector of the current block row.

   NOTE(review): ai[1] and diag[n-1] are read unconditionally, so this
   presumably requires n > 0 -- confirm against the callers.
*/
2497: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2498: {
2499: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2500: int *aj=a->j;
2502: int *ai=a->i,n=a->mbs,*diag = a->diag;
2503: MatScalar *aa=a->a;
2504: PetscScalar *x,*b;
2507: SSE_SCOPE_BEGIN;
2508: /*
2509: Note: This code currently uses demotion of double
2510: to float when performing the mixed-mode computation.
2511: This may not be numerically reasonable for all applications.
2512: */
2513: PREFETCH_NTA(aa+16*ai[1]);
2515: VecGetArray(bb,&b);
2516: VecGetArray(xx,&x);
2517: {
2518: /* x will first be computed in single precision then promoted inplace to double */
2519: MatScalar *v,*t=(MatScalar *)x;
2520: int nz,i,idt,ai16;
2521: int jdx,idx;
2522: int *vi;
2523: /* Forward solve the lower triangular factor. */
2525: /* First block is the identity. */
2526: idx = 0;
2527: CONVERT_DOUBLE4_FLOAT4(t,b);
2528: v = aa + 16*ai[1];
/* The loop header has no increment: i is advanced by the ++i at the bottom of the body. */
2530: for (i=1; i<n;) {
2531: PREFETCH_NTA(&v[8]);
2532: vi = aj + ai[i];
2533: nz = diag[i] - ai[i];
2534: idx += 4;
2536: /* Demote RHS from double to float. */
2537: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2538: LOAD_PS(&t[idx],XMM7);
2540: while (nz--) {
2541: PREFETCH_NTA(&v[16]);
2542: jdx = 4*(*vi++);
2543: /* jdx = *vi++; */
2544:
2545: /* 4x4 Matrix-Vector product with negative accumulation: */
2546: SSE_INLINE_BEGIN_2(&t[jdx],v)
2547: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2549: /* First Column */
2550: SSE_COPY_PS(XMM0,XMM6)
2551: SSE_SHUFFLE(XMM0,XMM0,0x00)
2552: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2553: SSE_SUB_PS(XMM7,XMM0)
2555: /* Second Column */
2556: SSE_COPY_PS(XMM1,XMM6)
2557: SSE_SHUFFLE(XMM1,XMM1,0x55)
2558: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2559: SSE_SUB_PS(XMM7,XMM1)
2561: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2562:
2563: /* Third Column */
2564: SSE_COPY_PS(XMM2,XMM6)
2565: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2566: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2567: SSE_SUB_PS(XMM7,XMM2)
2569: /* Fourth Column */
2570: SSE_COPY_PS(XMM3,XMM6)
2571: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2572: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2573: SSE_SUB_PS(XMM7,XMM3)
2574: SSE_INLINE_END_2
2575:
2576: v += 16;
2577: }
/* Step to the next block row; on the last pass this reads ai[n]. */
2578: v = aa + 16*ai[++i];
2579: PREFETCH_NTA(v);
2580: STORE_PS(&t[idx],XMM7);
2581: }
2583: /* Backward solve the upper triangular factor.*/
2585: idt = 4*(n-1);
2586: ai16 = 16*diag[n-1];
2587: v = aa + ai16 + 16;
/* The loop header has no decrement: i is stepped by the --i inside the body. */
2588: for (i=n-1; i>=0;){
2589: PREFETCH_NTA(&v[8]);
2590: vi = aj + diag[i] + 1;
2591: nz = ai[i+1] - diag[i] - 1;
2592:
2593: LOAD_PS(&t[idt],XMM7);
2595: while (nz--) {
2596: PREFETCH_NTA(&v[16]);
2597: idx = 4*(*vi++);
2598: /* idx = *vi++; */
2600: /* 4x4 Matrix-Vector Product with negative accumulation: */
2601: SSE_INLINE_BEGIN_2(&t[idx],v)
2602: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2604: /* First Column */
2605: SSE_COPY_PS(XMM0,XMM6)
2606: SSE_SHUFFLE(XMM0,XMM0,0x00)
2607: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2608: SSE_SUB_PS(XMM7,XMM0)
2610: /* Second Column */
2611: SSE_COPY_PS(XMM1,XMM6)
2612: SSE_SHUFFLE(XMM1,XMM1,0x55)
2613: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2614: SSE_SUB_PS(XMM7,XMM1)
2616: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2617:
2618: /* Third Column */
2619: SSE_COPY_PS(XMM2,XMM6)
2620: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2621: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2622: SSE_SUB_PS(XMM7,XMM2)
2624: /* Fourth Column */
2625: SSE_COPY_PS(XMM3,XMM6)
2626: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2627: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2628: SSE_SUB_PS(XMM7,XMM3)
2629: SSE_INLINE_END_2
2630: v += 16;
2631: }
/* v now addresses the diagonal block of the current row; ai16 is then moved on to the previous row. */
/* NOTE(review): on the last pass (i == 0) the --i below yields -1, so diag[-1] is read. */
/* That value only feeds the prefetch addresses and the final update of v, which is not used after the loop -- confirm the read is acceptable. */
2632: v = aa + ai16;
2633: ai16 = 16*diag[--i];
2634: PREFETCH_NTA(aa+ai16+16);
2635: /*
2636: Scale the result by the diagonal 4x4 block,
2637: which was inverted as part of the factorization
2638: */
2639: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2640: /* First Column */
2641: SSE_COPY_PS(XMM0,XMM7)
2642: SSE_SHUFFLE(XMM0,XMM0,0x00)
2643: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2645: /* Second Column */
2646: SSE_COPY_PS(XMM1,XMM7)
2647: SSE_SHUFFLE(XMM1,XMM1,0x55)
2648: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2649: SSE_ADD_PS(XMM0,XMM1)
2651: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2652:
2653: /* Third Column */
2654: SSE_COPY_PS(XMM2,XMM7)
2655: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2656: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2657: SSE_ADD_PS(XMM0,XMM2)
2659: /* Fourth Column */
2660: SSE_COPY_PS(XMM3,XMM7)
2661: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2662: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2663: SSE_ADD_PS(XMM0,XMM3)
2665: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2666: SSE_INLINE_END_3
2668: v = aa + ai16 + 16;
2669: idt -= 4;
2670: }
2672: /* Convert t from single precision back to double precision (inplace)*/
/* Blocks are visited last to first and entries 3 down to 0; presumably so that each wider store into x only covers entries of t that were already read. */
2673: idt = 4*(n-1);
2674: for (i=n-1;i>=0;i--) {
2675: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2676: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2677: PetscScalar *xtemp=&x[idt];
2678: MatScalar *ttemp=&t[idt];
2679: xtemp[3] = (PetscScalar)ttemp[3];
2680: xtemp[2] = (PetscScalar)ttemp[2];
2681: xtemp[1] = (PetscScalar)ttemp[1];
2682: xtemp[0] = (PetscScalar)ttemp[0];
2683: idt -= 4;
2684: }
2686: } /* End of artificial scope. */
2687: VecRestoreArray(bb,&b);
2688: VecRestoreArray(xx,&x);
2689: PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
2690: SSE_SCOPE_END;
2691: return(0);
2692: }
2694: #endif
2697: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2698: {
2699: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2700: IS iscol=a->col,isrow=a->row;
2701: PetscErrorCode ierr;
2702: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2703: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2704: const MatScalar *aa=a->a,*v;
2705: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
2706: const PetscScalar *b;
2709: VecGetArray(bb,(PetscScalar**)&b);
2710: VecGetArray(xx,&x);
2711: t = a->solve_work;
2713: ISGetIndices(isrow,&rout); r = rout;
2714: ISGetIndices(iscol,&cout); c = cout + (n-1);
2716: /* forward solve the lower triangular */
2717: idx = 3*(*r++);
2718: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2719: for (i=1; i<n; i++) {
2720: v = aa + 9*ai[i];
2721: vi = aj + ai[i];
2722: nz = diag[i] - ai[i];
2723: idx = 3*(*r++);
2724: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2725: while (nz--) {
2726: idx = 3*(*vi++);
2727: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2728: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2729: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2730: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2731: v += 9;
2732: }
2733: idx = 3*i;
2734: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2735: }
2736: /* backward solve the upper triangular */
2737: for (i=n-1; i>=0; i--){
2738: v = aa + 9*diag[i] + 9;
2739: vi = aj + diag[i] + 1;
2740: nz = ai[i+1] - diag[i] - 1;
2741: idt = 3*i;
2742: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2743: while (nz--) {
2744: idx = 3*(*vi++);
2745: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2746: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2747: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2748: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2749: v += 9;
2750: }
2751: idc = 3*(*c--);
2752: v = aa + 9*diag[i];
2753: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2754: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2755: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2756: }
2757: ISRestoreIndices(isrow,&rout);
2758: ISRestoreIndices(iscol,&cout);
2759: VecRestoreArray(bb,(PetscScalar**)&b);
2760: VecRestoreArray(xx,&x);
2761: PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);
2762: return(0);
2763: }
2765: /*
2766: Special case where the matrix was ILU(0) factored in the natural
2767: ordering. This eliminates the need for the column and row permutation.
2768: */
2771: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2772: {
2773: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2774: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2775: PetscErrorCode ierr;
2776: PetscInt *diag = a->diag;
2777: const MatScalar *aa=a->a,*v;
2778: PetscScalar *x,s1,s2,s3,x1,x2,x3;
2779: const PetscScalar *b;
2780: PetscInt jdx,idt,idx,nz,*vi,i;
2783: VecGetArray(bb,(PetscScalar**)&b);
2784: VecGetArray(xx,&x);
2786: /* forward solve the lower triangular */
2787: idx = 0;
2788: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
2789: for (i=1; i<n; i++) {
2790: v = aa + 9*ai[i];
2791: vi = aj + ai[i];
2792: nz = diag[i] - ai[i];
2793: idx += 3;
2794: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
2795: while (nz--) {
2796: jdx = 3*(*vi++);
2797: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2798: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2799: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2800: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2801: v += 9;
2802: }
2803: x[idx] = s1;
2804: x[1+idx] = s2;
2805: x[2+idx] = s3;
2806: }
2807: /* backward solve the upper triangular */
2808: for (i=n-1; i>=0; i--){
2809: v = aa + 9*diag[i] + 9;
2810: vi = aj + diag[i] + 1;
2811: nz = ai[i+1] - diag[i] - 1;
2812: idt = 3*i;
2813: s1 = x[idt]; s2 = x[1+idt];
2814: s3 = x[2+idt];
2815: while (nz--) {
2816: idx = 3*(*vi++);
2817: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
2818: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2819: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2820: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2821: v += 9;
2822: }
2823: v = aa + 9*diag[i];
2824: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2825: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2826: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2827: }
2829: VecRestoreArray(bb,(PetscScalar**)&b);
2830: VecRestoreArray(xx,&x);
2831: PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);
2832: return(0);
2833: }
2837: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2838: {
2839: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2840: IS iscol=a->col,isrow=a->row;
2841: PetscErrorCode ierr;
2842: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2843: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2844: const MatScalar *aa=a->a,*v;
2845: PetscScalar *x,s1,s2,x1,x2,*t;
2846: const PetscScalar *b;
2849: VecGetArray(bb,(PetscScalar**)&b);
2850: VecGetArray(xx,&x);
2851: t = a->solve_work;
2853: ISGetIndices(isrow,&rout); r = rout;
2854: ISGetIndices(iscol,&cout); c = cout + (n-1);
2856: /* forward solve the lower triangular */
2857: idx = 2*(*r++);
2858: t[0] = b[idx]; t[1] = b[1+idx];
2859: for (i=1; i<n; i++) {
2860: v = aa + 4*ai[i];
2861: vi = aj + ai[i];
2862: nz = diag[i] - ai[i];
2863: idx = 2*(*r++);
2864: s1 = b[idx]; s2 = b[1+idx];
2865: while (nz--) {
2866: idx = 2*(*vi++);
2867: x1 = t[idx]; x2 = t[1+idx];
2868: s1 -= v[0]*x1 + v[2]*x2;
2869: s2 -= v[1]*x1 + v[3]*x2;
2870: v += 4;
2871: }
2872: idx = 2*i;
2873: t[idx] = s1; t[1+idx] = s2;
2874: }
2875: /* backward solve the upper triangular */
2876: for (i=n-1; i>=0; i--){
2877: v = aa + 4*diag[i] + 4;
2878: vi = aj + diag[i] + 1;
2879: nz = ai[i+1] - diag[i] - 1;
2880: idt = 2*i;
2881: s1 = t[idt]; s2 = t[1+idt];
2882: while (nz--) {
2883: idx = 2*(*vi++);
2884: x1 = t[idx]; x2 = t[1+idx];
2885: s1 -= v[0]*x1 + v[2]*x2;
2886: s2 -= v[1]*x1 + v[3]*x2;
2887: v += 4;
2888: }
2889: idc = 2*(*c--);
2890: v = aa + 4*diag[i];
2891: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
2892: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2893: }
2894: ISRestoreIndices(isrow,&rout);
2895: ISRestoreIndices(iscol,&cout);
2896: VecRestoreArray(bb,(PetscScalar**)&b);
2897: VecRestoreArray(xx,&x);
2898: PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);
2899: return(0);
2900: }
2902: /*
2903: Special case where the matrix was ILU(0) factored in the natural
2904: ordering. This eliminates the need for the column and row permutation.
2905: */
2908: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2909: {
2910: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2911: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2912: PetscErrorCode ierr;
2913: PetscInt *diag = a->diag;
2914: const MatScalar *aa=a->a,*v;
2915: PetscScalar *x,s1,s2,x1,x2;
2916: const PetscScalar *b;
2917: PetscInt jdx,idt,idx,nz,*vi,i;
2920: VecGetArray(bb,(PetscScalar**)&b);
2921: VecGetArray(xx,&x);
2923: /* forward solve the lower triangular */
2924: idx = 0;
2925: x[0] = b[0]; x[1] = b[1];
2926: for (i=1; i<n; i++) {
2927: v = aa + 4*ai[i];
2928: vi = aj + ai[i];
2929: nz = diag[i] - ai[i];
2930: idx += 2;
2931: s1 = b[idx];s2 = b[1+idx];
2932: while (nz--) {
2933: jdx = 2*(*vi++);
2934: x1 = x[jdx];x2 = x[1+jdx];
2935: s1 -= v[0]*x1 + v[2]*x2;
2936: s2 -= v[1]*x1 + v[3]*x2;
2937: v += 4;
2938: }
2939: x[idx] = s1;
2940: x[1+idx] = s2;
2941: }
2942: /* backward solve the upper triangular */
2943: for (i=n-1; i>=0; i--){
2944: v = aa + 4*diag[i] + 4;
2945: vi = aj + diag[i] + 1;
2946: nz = ai[i+1] - diag[i] - 1;
2947: idt = 2*i;
2948: s1 = x[idt]; s2 = x[1+idt];
2949: while (nz--) {
2950: idx = 2*(*vi++);
2951: x1 = x[idx]; x2 = x[1+idx];
2952: s1 -= v[0]*x1 + v[2]*x2;
2953: s2 -= v[1]*x1 + v[3]*x2;
2954: v += 4;
2955: }
2956: v = aa + 4*diag[i];
2957: x[idt] = v[0]*s1 + v[2]*s2;
2958: x[1+idt] = v[1]*s1 + v[3]*s2;
2959: }
2961: VecRestoreArray(bb,(PetscScalar**)&b);
2962: VecRestoreArray(xx,&x);
2963: PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);
2964: return(0);
2965: }
2969: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2970: {
2971: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2972: IS iscol=a->col,isrow=a->row;
2974: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2975: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2976: MatScalar *aa=a->a,*v;
2977: PetscScalar *x,*b,s1,*t;
2980: if (!n) return(0);
2982: VecGetArray(bb,&b);
2983: VecGetArray(xx,&x);
2984: t = a->solve_work;
2986: ISGetIndices(isrow,&rout); r = rout;
2987: ISGetIndices(iscol,&cout); c = cout + (n-1);
2989: /* forward solve the lower triangular */
2990: t[0] = b[*r++];
2991: for (i=1; i<n; i++) {
2992: v = aa + ai[i];
2993: vi = aj + ai[i];
2994: nz = diag[i] - ai[i];
2995: s1 = b[*r++];
2996: while (nz--) {
2997: s1 -= (*v++)*t[*vi++];
2998: }
2999: t[i] = s1;
3000: }
3001: /* backward solve the upper triangular */
3002: for (i=n-1; i>=0; i--){
3003: v = aa + diag[i] + 1;
3004: vi = aj + diag[i] + 1;
3005: nz = ai[i+1] - diag[i] - 1;
3006: s1 = t[i];
3007: while (nz--) {
3008: s1 -= (*v++)*t[*vi++];
3009: }
3010: x[*c--] = t[i] = aa[diag[i]]*s1;
3011: }
3013: ISRestoreIndices(isrow,&rout);
3014: ISRestoreIndices(iscol,&cout);
3015: VecRestoreArray(bb,&b);
3016: VecRestoreArray(xx,&x);
3017: PetscLogFlops(2*1*(a->nz) - A->cmap->n);
3018: return(0);
3019: }
3020: /*
3021: Special case where the matrix was ILU(0) factored in the natural
3022: ordering. This eliminates the need for the column and row permutation.
3023: */
3026: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3027: {
3028: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3029: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
3031: PetscInt *diag = a->diag;
3032: MatScalar *aa=a->a;
3033: PetscScalar *x,*b;
3034: PetscScalar s1,x1;
3035: MatScalar *v;
3036: PetscInt jdx,idt,idx,nz,*vi,i;
3039: VecGetArray(bb,&b);
3040: VecGetArray(xx,&x);
3042: /* forward solve the lower triangular */
3043: idx = 0;
3044: x[0] = b[0];
3045: for (i=1; i<n; i++) {
3046: v = aa + ai[i];
3047: vi = aj + ai[i];
3048: nz = diag[i] - ai[i];
3049: idx += 1;
3050: s1 = b[idx];
3051: while (nz--) {
3052: jdx = *vi++;
3053: x1 = x[jdx];
3054: s1 -= v[0]*x1;
3055: v += 1;
3056: }
3057: x[idx] = s1;
3058: }
3059: /* backward solve the upper triangular */
3060: for (i=n-1; i>=0; i--){
3061: v = aa + diag[i] + 1;
3062: vi = aj + diag[i] + 1;
3063: nz = ai[i+1] - diag[i] - 1;
3064: idt = i;
3065: s1 = x[idt];
3066: while (nz--) {
3067: idx = *vi++;
3068: x1 = x[idx];
3069: s1 -= v[0]*x1;
3070: v += 1;
3071: }
3072: v = aa + diag[i];
3073: x[idt] = v[0]*s1;
3074: }
3075: VecRestoreArray(bb,&b);
3076: VecRestoreArray(xx,&x);
3077: PetscLogFlops(2*(a->nz) - A->cmap->n);
3078: return(0);
3079: }
3081: /* ----------------------------------------------------------------*/
3082: /*
3083: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3084: except that the data structure of Mat_SeqAIJ is slightly different.
3085: Not a good example of code reuse.
3086: */
3087: EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
3088: EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
3092: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3093: {
3094: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3095: IS isicol;
3097: const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
3098: PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3099: PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3100: PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3101: PetscTruth col_identity,row_identity,both_identity,flg;
3102: PetscReal f;
3105: f = info->fill;
3106: levels = (PetscInt)info->levels;
3107: diagonal_fill = (PetscInt)info->diagonal_fill;
3108: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3109: ISIdentity(isrow,&row_identity);
3110: ISIdentity(iscol,&col_identity);
3111: both_identity = (PetscTruth) (row_identity && col_identity);
3113: if (!levels && both_identity) { /* special case copy the nonzero structure */
3114: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);
3115: fact->factor = MAT_FACTOR_ILU;
3116: b = (Mat_SeqBAIJ*)(fact)->data;
3117: MatMissingDiagonal_SeqBAIJ(fact,&flg,&dd);
3118: if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd);
3119: b->row = isrow;
3120: b->col = iscol;
3121: PetscObjectReference((PetscObject)isrow);
3122: PetscObjectReference((PetscObject)iscol);
3123: b->icol = isicol;
3124: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3125: PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);
3126: } else { /* general case perform the symbolic factorization */
3127: ISGetIndices(isrow,&r);
3128: ISGetIndices(isicol,&ic);
3130: /* get new row pointers */
3131: PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
3132: ainew[0] = 0;
3133: /* don't know how many column pointers are needed so estimate */
3134: jmax = (PetscInt)(f*ai[n] + 1);
3135: PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
3136: /* ajfill is level of fill for each fill entry */
3137: PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
3138: /* fill is a linked list of nonzeros in active row */
3139: PetscMalloc((n+1)*sizeof(PetscInt),&fill);
3140: /* im is level for each filled value */
3141: PetscMalloc((n+1)*sizeof(PetscInt),&im);
3142: /* dloc is location of diagonal in factor */
3143: PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
3144: dloc[0] = 0;
3145: for (prow=0; prow<n; prow++) {
3147: /* copy prow into linked list */
3148: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
3149: if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3150: xi = aj + ai[r[prow]];
3151: fill[n] = n;
3152: fill[prow] = -1; /* marker for diagonal entry */
3153: while (nz--) {
3154: fm = n;
3155: idx = ic[*xi++];
3156: do {
3157: m = fm;
3158: fm = fill[m];
3159: } while (fm < idx);
3160: fill[m] = idx;
3161: fill[idx] = fm;
3162: im[idx] = 0;
3163: }
3165: /* make sure diagonal entry is included */
3166: if (diagonal_fill && fill[prow] == -1) {
3167: fm = n;
3168: while (fill[fm] < prow) fm = fill[fm];
3169: fill[prow] = fill[fm]; /* insert diagonal into linked list */
3170: fill[fm] = prow;
3171: im[prow] = 0;
3172: nzf++;
3173: dcount++;
3174: }
3176: nzi = 0;
3177: row = fill[n];
3178: while (row < prow) {
3179: incrlev = im[row] + 1;
3180: nz = dloc[row];
3181: xi = ajnew + ainew[row] + nz + 1;
3182: flev = ajfill + ainew[row] + nz + 1;
3183: nnz = ainew[row+1] - ainew[row] - nz - 1;
3184: fm = row;
3185: while (nnz-- > 0) {
3186: idx = *xi++;
3187: if (*flev + incrlev > levels) {
3188: flev++;
3189: continue;
3190: }
3191: do {
3192: m = fm;
3193: fm = fill[m];
3194: } while (fm < idx);
3195: if (fm != idx) {
3196: im[idx] = *flev + incrlev;
3197: fill[m] = idx;
3198: fill[idx] = fm;
3199: fm = idx;
3200: nzf++;
3201: } else {
3202: if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3203: }
3204: flev++;
3205: }
3206: row = fill[row];
3207: nzi++;
3208: }
3209: /* copy new filled row into permanent storage */
3210: ainew[prow+1] = ainew[prow] + nzf;
3211: if (ainew[prow+1] > jmax) {
3213: /* estimate how much additional space we will need */
3214: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3215: /* just double the memory each time */
3216: PetscInt maxadd = jmax;
3217: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3218: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3219: jmax += maxadd;
3221: /* allocate a longer ajnew and ajfill */
3222: PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
3223: PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));
3224: PetscFree(ajnew);
3225: ajnew = xitmp;
3226: PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
3227: PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));
3228: PetscFree(ajfill);
3229: ajfill = xitmp;
3230: reallocate++; /* count how many reallocations are needed */
3231: }
3232: xitmp = ajnew + ainew[prow];
3233: flev = ajfill + ainew[prow];
3234: dloc[prow] = nzi;
3235: fm = fill[n];
3236: while (nzf--) {
3237: *xitmp++ = fm;
3238: *flev++ = im[fm];
3239: fm = fill[fm];
3240: }
3241: /* make sure row has diagonal entry */
3242: if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3243: SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3244: try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3245: }
3246: }
3247: PetscFree(ajfill);
3248: ISRestoreIndices(isrow,&r);
3249: ISRestoreIndices(isicol,&ic);
3250: PetscFree(fill);
3251: PetscFree(im);
3253: #if defined(PETSC_USE_INFO)
3254: {
3255: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3256: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
3257: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
3258: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
3259: PetscInfo(A,"for best performance.\n");
3260: if (diagonal_fill) {
3261: PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
3262: }
3263: }
3264: #endif
3266: /* put together the new matrix */
3267: MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
3268: PetscLogObjectParent(fact,isicol);
3269: b = (Mat_SeqBAIJ*)(fact)->data;
3270: b->free_a = PETSC_TRUE;
3271: b->free_ij = PETSC_TRUE;
3272: b->singlemalloc = PETSC_FALSE;
3273: PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
3274: b->j = ajnew;
3275: b->i = ainew;
3276: for (i=0; i<n; i++) dloc[i] += ainew[i];
3277: b->diag = dloc;
3278: b->ilen = 0;
3279: b->imax = 0;
3280: b->row = isrow;
3281: b->col = iscol;
3282: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3283: PetscObjectReference((PetscObject)isrow);
3284: PetscObjectReference((PetscObject)iscol);
3285: b->icol = isicol;
3286: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3287: /* In b structure: Free imax, ilen, old a, old j.
3288: Allocate dloc, solve_work, new a, new j */
3289: PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
3290: b->maxnz = b->nz = ainew[n];
3292: (fact)->info.factor_mallocs = reallocate;
3293: (fact)->info.fill_ratio_given = f;
3294: (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3295: }
3296: MatSeqBAIJSetNumericFactorization(fact,both_identity);
3297: return(0);
3298: }
3302: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3303: {
3304: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3305: /* int i,*AJ=a->j,nz=a->nz; */
3307: /* Undo Column scaling */
3308: /* while (nz--) { */
3309: /* AJ[i] = AJ[i]/4; */
3310: /* } */
3311: /* This should really invoke a push/pop logic, but we don't have that yet. */
3312: A->ops->setunfactored = PETSC_NULL;
3313: return(0);
3314: }
3318: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3319: {
3320: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3321: PetscInt *AJ=a->j,nz=a->nz;
3322: unsigned short *aj=(unsigned short *)AJ;
3324: /* Is this really necessary? */
3325: while (nz--) {
3326: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3327: }
3328: A->ops->setunfactored = PETSC_NULL;
3329: return(0);
3330: }