Actual source code: baijfact2.c

  1: #define PETSCMAT_DLL

  3: /*
  4:     Factorization code for BAIJ format. 
  5: */

 7:  #include ../src/mat/impls/baij/seq/baij.h
 8:  #include ../src/inline/ilu.h
 9:  #include ../src/inline/dot.h

 13: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 14: {
 15:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
 17:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
 18:   PetscInt       *diag = a->diag;
 19:   MatScalar      *aa=a->a,*v;
 20:   PetscScalar    s1,*x,*b;

 23:   VecCopy(bb,xx);
 24:   VecGetArray(bb,&b);
 25:   VecGetArray(xx,&x);
 26: 
 27:   /* forward solve the U^T */
 28:   for (i=0; i<n; i++) {

 30:     v     = aa + diag[i];
 31:     /* multiply by the inverse of the block diagonal */
 32:     s1    = (*v++)*x[i];
 33:     vi    = aj + diag[i] + 1;
 34:     nz    = ai[i+1] - diag[i] - 1;
 35:     while (nz--) {
 36:       x[*vi++]  -= (*v++)*s1;
 37:     }
 38:     x[i]   = s1;
 39:   }
 40:   /* backward solve the L^T */
 41:   for (i=n-1; i>=0; i--){
 42:     v    = aa + diag[i] - 1;
 43:     vi   = aj + diag[i] - 1;
 44:     nz   = diag[i] - ai[i];
 45:     s1   = x[i];
 46:     while (nz--) {
 47:       x[*vi--]   -=  (*v--)*s1;
 48:     }
 49:   }
 50:   VecRestoreArray(bb,&b);
 51:   VecRestoreArray(xx,&x);
 52:   PetscLogFlops(2*(a->nz) - A->cmap->n);
 53:   return(0);
 54: }

 58: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 59: {
 60:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
 62:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
 63:   PetscInt       *diag = a->diag,oidx;
 64:   MatScalar      *aa=a->a,*v;
 65:   PetscScalar    s1,s2,x1,x2;
 66:   PetscScalar    *x,*b;

 69:   VecCopy(bb,xx);
 70:   VecGetArray(bb,&b);
 71:   VecGetArray(xx,&x);

 73:   /* forward solve the U^T */
 74:   idx = 0;
 75:   for (i=0; i<n; i++) {

 77:     v     = aa + 4*diag[i];
 78:     /* multiply by the inverse of the block diagonal */
 79:     x1 = x[idx];   x2 = x[1+idx];
 80:     s1 = v[0]*x1  +  v[1]*x2;
 81:     s2 = v[2]*x1  +  v[3]*x2;
 82:     v += 4;

 84:     vi    = aj + diag[i] + 1;
 85:     nz    = ai[i+1] - diag[i] - 1;
 86:     while (nz--) {
 87:       oidx = 2*(*vi++);
 88:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
 89:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
 90:       v  += 4;
 91:     }
 92:     x[idx]   = s1;x[1+idx] = s2;
 93:     idx += 2;
 94:   }
 95:   /* backward solve the L^T */
 96:   for (i=n-1; i>=0; i--){
 97:     v    = aa + 4*diag[i] - 4;
 98:     vi   = aj + diag[i] - 1;
 99:     nz   = diag[i] - ai[i];
100:     idt  = 2*i;
101:     s1   = x[idt];  s2 = x[1+idt];
102:     while (nz--) {
103:       idx   = 2*(*vi--);
104:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106:       v -= 4;
107:     }
108:   }
109:   VecRestoreArray(bb,&b);
110:   VecRestoreArray(xx,&x);
111:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);
112:   return(0);
113: }

117: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118: {
119:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
121:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122:   PetscInt       *diag = a->diag,oidx;
123:   MatScalar      *aa=a->a,*v;
124:   PetscScalar    s1,s2,s3,x1,x2,x3;
125:   PetscScalar    *x,*b;

128:   VecCopy(bb,xx);
129:   VecGetArray(bb,&b);
130:   VecGetArray(xx,&x);

132:   /* forward solve the U^T */
133:   idx = 0;
134:   for (i=0; i<n; i++) {

136:     v     = aa + 9*diag[i];
137:     /* multiply by the inverse of the block diagonal */
138:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142:     v += 9;

144:     vi    = aj + diag[i] + 1;
145:     nz    = ai[i+1] - diag[i] - 1;
146:     while (nz--) {
147:       oidx = 3*(*vi++);
148:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151:       v  += 9;
152:     }
153:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154:     idx += 3;
155:   }
156:   /* backward solve the L^T */
157:   for (i=n-1; i>=0; i--){
158:     v    = aa + 9*diag[i] - 9;
159:     vi   = aj + diag[i] - 1;
160:     nz   = diag[i] - ai[i];
161:     idt  = 3*i;
162:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163:     while (nz--) {
164:       idx   = 3*(*vi--);
165:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168:       v -= 9;
169:     }
170:   }
171:   VecRestoreArray(bb,&b);
172:   VecRestoreArray(xx,&x);
173:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);
174:   return(0);
175: }

179: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180: {
181:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
183:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184:   PetscInt       *diag = a->diag,oidx;
185:   MatScalar      *aa=a->a,*v;
186:   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
187:   PetscScalar    *x,*b;

190:   VecCopy(bb,xx);
191:   VecGetArray(bb,&b);
192:   VecGetArray(xx,&x);

194:   /* forward solve the U^T */
195:   idx = 0;
196:   for (i=0; i<n; i++) {

198:     v     = aa + 16*diag[i];
199:     /* multiply by the inverse of the block diagonal */
200:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205:     v += 16;

207:     vi    = aj + diag[i] + 1;
208:     nz    = ai[i+1] - diag[i] - 1;
209:     while (nz--) {
210:       oidx = 4*(*vi++);
211:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215:       v  += 16;
216:     }
217:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218:     idx += 4;
219:   }
220:   /* backward solve the L^T */
221:   for (i=n-1; i>=0; i--){
222:     v    = aa + 16*diag[i] - 16;
223:     vi   = aj + diag[i] - 1;
224:     nz   = diag[i] - ai[i];
225:     idt  = 4*i;
226:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227:     while (nz--) {
228:       idx   = 4*(*vi--);
229:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233:       v -= 16;
234:     }
235:   }
236:   VecRestoreArray(bb,&b);
237:   VecRestoreArray(xx,&x);
238:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
239:   return(0);
240: }

244: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245: {
246:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
248:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249:   PetscInt       *diag = a->diag,oidx;
250:   MatScalar      *aa=a->a,*v;
251:   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252:   PetscScalar    *x,*b;

255:   VecCopy(bb,xx);
256:   VecGetArray(bb,&b);
257:   VecGetArray(xx,&x);

259:   /* forward solve the U^T */
260:   idx = 0;
261:   for (i=0; i<n; i++) {

263:     v     = aa + 25*diag[i];
264:     /* multiply by the inverse of the block diagonal */
265:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271:     v += 25;

273:     vi    = aj + diag[i] + 1;
274:     nz    = ai[i+1] - diag[i] - 1;
275:     while (nz--) {
276:       oidx = 5*(*vi++);
277:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282:       v  += 25;
283:     }
284:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285:     idx += 5;
286:   }
287:   /* backward solve the L^T */
288:   for (i=n-1; i>=0; i--){
289:     v    = aa + 25*diag[i] - 25;
290:     vi   = aj + diag[i] - 1;
291:     nz   = diag[i] - ai[i];
292:     idt  = 5*i;
293:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294:     while (nz--) {
295:       idx   = 5*(*vi--);
296:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301:       v -= 25;
302:     }
303:   }
304:   VecRestoreArray(bb,&b);
305:   VecRestoreArray(xx,&x);
306:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);
307:   return(0);
308: }

312: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313: {
314:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
316:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317:   PetscInt       *diag = a->diag,oidx;
318:   MatScalar      *aa=a->a,*v;
319:   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320:   PetscScalar    *x,*b;

323:   VecCopy(bb,xx);
324:   VecGetArray(bb,&b);
325:   VecGetArray(xx,&x);

327:   /* forward solve the U^T */
328:   idx = 0;
329:   for (i=0; i<n; i++) {

331:     v     = aa + 36*diag[i];
332:     /* multiply by the inverse of the block diagonal */
333:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334:     x6    = x[5+idx];
335:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341:     v += 36;

343:     vi    = aj + diag[i] + 1;
344:     nz    = ai[i+1] - diag[i] - 1;
345:     while (nz--) {
346:       oidx = 6*(*vi++);
347:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353:       v  += 36;
354:     }
355:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356:     x[5+idx] = s6;
357:     idx += 6;
358:   }
359:   /* backward solve the L^T */
360:   for (i=n-1; i>=0; i--){
361:     v    = aa + 36*diag[i] - 36;
362:     vi   = aj + diag[i] - 1;
363:     nz   = diag[i] - ai[i];
364:     idt  = 6*i;
365:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366:     s6 = x[5+idt];
367:     while (nz--) {
368:       idx   = 6*(*vi--);
369:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375:       v -= 36;
376:     }
377:   }
378:   VecRestoreArray(bb,&b);
379:   VecRestoreArray(xx,&x);
380:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);
381:   return(0);
382: }

386: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387: {
388:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
390:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391:   PetscInt       *diag = a->diag,oidx;
392:   MatScalar      *aa=a->a,*v;
393:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394:   PetscScalar    *x,*b;

397:   VecCopy(bb,xx);
398:   VecGetArray(bb,&b);
399:   VecGetArray(xx,&x);

401:   /* forward solve the U^T */
402:   idx = 0;
403:   for (i=0; i<n; i++) {

405:     v     = aa + 49*diag[i];
406:     /* multiply by the inverse of the block diagonal */
407:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408:     x6    = x[5+idx]; x7 = x[6+idx];
409:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416:     v += 49;

418:     vi    = aj + diag[i] + 1;
419:     nz    = ai[i+1] - diag[i] - 1;
420:     while (nz--) {
421:       oidx = 7*(*vi++);
422:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429:       v  += 49;
430:     }
431:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432:     x[5+idx] = s6;x[6+idx] = s7;
433:     idx += 7;
434:   }
435:   /* backward solve the L^T */
436:   for (i=n-1; i>=0; i--){
437:     v    = aa + 49*diag[i] - 49;
438:     vi   = aj + diag[i] - 1;
439:     nz   = diag[i] - ai[i];
440:     idt  = 7*i;
441:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442:     s6 = x[5+idt];s7 = x[6+idt];
443:     while (nz--) {
444:       idx   = 7*(*vi--);
445:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452:       v -= 49;
453:     }
454:   }
455:   VecRestoreArray(bb,&b);
456:   VecRestoreArray(xx,&x);
457:   PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);
458:   return(0);
459: }

461: /*---------------------------------------------------------------------------------------------*/
464: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465: {
466:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467:   IS             iscol=a->col,isrow=a->row;
469:   const PetscInt *r,*c,*rout,*cout;
470:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
471:   PetscInt       *diag = a->diag;
472:   MatScalar      *aa=a->a,*v;
473:   PetscScalar    s1,*x,*b,*t;

476:   VecGetArray(bb,&b);
477:   VecGetArray(xx,&x);
478:   t  = a->solve_work;

480:   ISGetIndices(isrow,&rout); r = rout;
481:   ISGetIndices(iscol,&cout); c = cout;

483:   /* copy the b into temp work space according to permutation */
484:   for (i=0; i<n; i++) {
485:     t[i] = b[c[i]];
486:   }

488:   /* forward solve the U^T */
489:   for (i=0; i<n; i++) {

491:     v     = aa + diag[i];
492:     /* multiply by the inverse of the block diagonal */
493:     s1    = (*v++)*t[i];
494:     vi    = aj + diag[i] + 1;
495:     nz    = ai[i+1] - diag[i] - 1;
496:     while (nz--) {
497:       t[*vi++]  -= (*v++)*s1;
498:     }
499:     t[i]   = s1;
500:   }
501:   /* backward solve the L^T */
502:   for (i=n-1; i>=0; i--){
503:     v    = aa + diag[i] - 1;
504:     vi   = aj + diag[i] - 1;
505:     nz   = diag[i] - ai[i];
506:     s1   = t[i];
507:     while (nz--) {
508:       t[*vi--]   -=  (*v--)*s1;
509:     }
510:   }

512:   /* copy t into x according to permutation */
513:   for (i=0; i<n; i++) {
514:     x[r[i]]   = t[i];
515:   }

517:   ISRestoreIndices(isrow,&rout);
518:   ISRestoreIndices(iscol,&cout);
519:   VecRestoreArray(bb,&b);
520:   VecRestoreArray(xx,&x);
521:   PetscLogFlops(2*(a->nz) - A->cmap->n);
522:   return(0);
523: }

527: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
528: {
529:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
530:   IS             iscol=a->col,isrow=a->row;
532:   const PetscInt *r,*c,*rout,*cout;
533:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
534:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
535:   MatScalar      *aa=a->a,*v;
536:   PetscScalar    s1,s2,x1,x2;
537:   PetscScalar    *x,*b,*t;

540:   VecGetArray(bb,&b);
541:   VecGetArray(xx,&x);
542:   t  = a->solve_work;

544:   ISGetIndices(isrow,&rout); r = rout;
545:   ISGetIndices(iscol,&cout); c = cout;

547:   /* copy the b into temp work space according to permutation */
548:   ii = 0;
549:   for (i=0; i<n; i++) {
550:     ic      = 2*c[i];
551:     t[ii]   = b[ic];
552:     t[ii+1] = b[ic+1];
553:     ii += 2;
554:   }

556:   /* forward solve the U^T */
557:   idx = 0;
558:   for (i=0; i<n; i++) {

560:     v     = aa + 4*diag[i];
561:     /* multiply by the inverse of the block diagonal */
562:     x1    = t[idx];   x2 = t[1+idx];
563:     s1 = v[0]*x1  +  v[1]*x2;
564:     s2 = v[2]*x1  +  v[3]*x2;
565:     v += 4;

567:     vi    = aj + diag[i] + 1;
568:     nz    = ai[i+1] - diag[i] - 1;
569:     while (nz--) {
570:       oidx = 2*(*vi++);
571:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
572:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
573:       v  += 4;
574:     }
575:     t[idx]   = s1;t[1+idx] = s2;
576:     idx += 2;
577:   }
578:   /* backward solve the L^T */
579:   for (i=n-1; i>=0; i--){
580:     v    = aa + 4*diag[i] - 4;
581:     vi   = aj + diag[i] - 1;
582:     nz   = diag[i] - ai[i];
583:     idt  = 2*i;
584:     s1 = t[idt];  s2 = t[1+idt];
585:     while (nz--) {
586:       idx   = 2*(*vi--);
587:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
588:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
589:       v -= 4;
590:     }
591:   }

593:   /* copy t into x according to permutation */
594:   ii = 0;
595:   for (i=0; i<n; i++) {
596:     ir      = 2*r[i];
597:     x[ir]   = t[ii];
598:     x[ir+1] = t[ii+1];
599:     ii += 2;
600:   }

602:   ISRestoreIndices(isrow,&rout);
603:   ISRestoreIndices(iscol,&cout);
604:   VecRestoreArray(bb,&b);
605:   VecRestoreArray(xx,&x);
606:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);
607:   return(0);
608: }

612: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
613: {
614:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
615:   IS             iscol=a->col,isrow=a->row;
617:   const PetscInt *r,*c,*rout,*cout;
618:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
619:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
620:   MatScalar      *aa=a->a,*v;
621:   PetscScalar    s1,s2,s3,x1,x2,x3;
622:   PetscScalar    *x,*b,*t;

625:   VecGetArray(bb,&b);
626:   VecGetArray(xx,&x);
627:   t  = a->solve_work;

629:   ISGetIndices(isrow,&rout); r = rout;
630:   ISGetIndices(iscol,&cout); c = cout;

632:   /* copy the b into temp work space according to permutation */
633:   ii = 0;
634:   for (i=0; i<n; i++) {
635:     ic      = 3*c[i];
636:     t[ii]   = b[ic];
637:     t[ii+1] = b[ic+1];
638:     t[ii+2] = b[ic+2];
639:     ii += 3;
640:   }

642:   /* forward solve the U^T */
643:   idx = 0;
644:   for (i=0; i<n; i++) {

646:     v     = aa + 9*diag[i];
647:     /* multiply by the inverse of the block diagonal */
648:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
649:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
650:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
651:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
652:     v += 9;

654:     vi    = aj + diag[i] + 1;
655:     nz    = ai[i+1] - diag[i] - 1;
656:     while (nz--) {
657:       oidx = 3*(*vi++);
658:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
659:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
660:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
661:       v  += 9;
662:     }
663:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
664:     idx += 3;
665:   }
666:   /* backward solve the L^T */
667:   for (i=n-1; i>=0; i--){
668:     v    = aa + 9*diag[i] - 9;
669:     vi   = aj + diag[i] - 1;
670:     nz   = diag[i] - ai[i];
671:     idt  = 3*i;
672:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
673:     while (nz--) {
674:       idx   = 3*(*vi--);
675:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
676:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
677:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
678:       v -= 9;
679:     }
680:   }

682:   /* copy t into x according to permutation */
683:   ii = 0;
684:   for (i=0; i<n; i++) {
685:     ir      = 3*r[i];
686:     x[ir]   = t[ii];
687:     x[ir+1] = t[ii+1];
688:     x[ir+2] = t[ii+2];
689:     ii += 3;
690:   }

692:   ISRestoreIndices(isrow,&rout);
693:   ISRestoreIndices(iscol,&cout);
694:   VecRestoreArray(bb,&b);
695:   VecRestoreArray(xx,&x);
696:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);
697:   return(0);
698: }

702: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
703: {
704:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
705:   IS             iscol=a->col,isrow=a->row;
707:   const PetscInt *r,*c,*rout,*cout;
708:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
709:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
710:   MatScalar      *aa=a->a,*v;
711:   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
712:   PetscScalar    *x,*b,*t;

715:   VecGetArray(bb,&b);
716:   VecGetArray(xx,&x);
717:   t  = a->solve_work;

719:   ISGetIndices(isrow,&rout); r = rout;
720:   ISGetIndices(iscol,&cout); c = cout;

722:   /* copy the b into temp work space according to permutation */
723:   ii = 0;
724:   for (i=0; i<n; i++) {
725:     ic      = 4*c[i];
726:     t[ii]   = b[ic];
727:     t[ii+1] = b[ic+1];
728:     t[ii+2] = b[ic+2];
729:     t[ii+3] = b[ic+3];
730:     ii += 4;
731:   }

733:   /* forward solve the U^T */
734:   idx = 0;
735:   for (i=0; i<n; i++) {

737:     v     = aa + 16*diag[i];
738:     /* multiply by the inverse of the block diagonal */
739:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
740:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
741:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
742:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
743:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
744:     v += 16;

746:     vi    = aj + diag[i] + 1;
747:     nz    = ai[i+1] - diag[i] - 1;
748:     while (nz--) {
749:       oidx = 4*(*vi++);
750:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
751:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
752:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
753:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
754:       v  += 16;
755:     }
756:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
757:     idx += 4;
758:   }
759:   /* backward solve the L^T */
760:   for (i=n-1; i>=0; i--){
761:     v    = aa + 16*diag[i] - 16;
762:     vi   = aj + diag[i] - 1;
763:     nz   = diag[i] - ai[i];
764:     idt  = 4*i;
765:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
766:     while (nz--) {
767:       idx   = 4*(*vi--);
768:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
769:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
770:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
771:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
772:       v -= 16;
773:     }
774:   }

776:   /* copy t into x according to permutation */
777:   ii = 0;
778:   for (i=0; i<n; i++) {
779:     ir      = 4*r[i];
780:     x[ir]   = t[ii];
781:     x[ir+1] = t[ii+1];
782:     x[ir+2] = t[ii+2];
783:     x[ir+3] = t[ii+3];
784:     ii += 4;
785:   }

787:   ISRestoreIndices(isrow,&rout);
788:   ISRestoreIndices(iscol,&cout);
789:   VecRestoreArray(bb,&b);
790:   VecRestoreArray(xx,&x);
791:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
792:   return(0);
793: }

797: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
798: {
799:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
800:   IS             iscol=a->col,isrow=a->row;
802:   const PetscInt *r,*c,*rout,*cout;
803:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
804:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
805:   MatScalar      *aa=a->a,*v;
806:   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
807:   PetscScalar    *x,*b,*t;

810:   VecGetArray(bb,&b);
811:   VecGetArray(xx,&x);
812:   t  = a->solve_work;

814:   ISGetIndices(isrow,&rout); r = rout;
815:   ISGetIndices(iscol,&cout); c = cout;

817:   /* copy the b into temp work space according to permutation */
818:   ii = 0;
819:   for (i=0; i<n; i++) {
820:     ic      = 5*c[i];
821:     t[ii]   = b[ic];
822:     t[ii+1] = b[ic+1];
823:     t[ii+2] = b[ic+2];
824:     t[ii+3] = b[ic+3];
825:     t[ii+4] = b[ic+4];
826:     ii += 5;
827:   }

829:   /* forward solve the U^T */
830:   idx = 0;
831:   for (i=0; i<n; i++) {

833:     v     = aa + 25*diag[i];
834:     /* multiply by the inverse of the block diagonal */
835:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
836:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
837:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
838:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
839:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
840:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
841:     v += 25;

843:     vi    = aj + diag[i] + 1;
844:     nz    = ai[i+1] - diag[i] - 1;
845:     while (nz--) {
846:       oidx = 5*(*vi++);
847:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
848:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
849:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
850:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
851:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
852:       v  += 25;
853:     }
854:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
855:     idx += 5;
856:   }
857:   /* backward solve the L^T */
858:   for (i=n-1; i>=0; i--){
859:     v    = aa + 25*diag[i] - 25;
860:     vi   = aj + diag[i] - 1;
861:     nz   = diag[i] - ai[i];
862:     idt  = 5*i;
863:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
864:     while (nz--) {
865:       idx   = 5*(*vi--);
866:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
867:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
868:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
869:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
870:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
871:       v -= 25;
872:     }
873:   }

875:   /* copy t into x according to permutation */
876:   ii = 0;
877:   for (i=0; i<n; i++) {
878:     ir      = 5*r[i];
879:     x[ir]   = t[ii];
880:     x[ir+1] = t[ii+1];
881:     x[ir+2] = t[ii+2];
882:     x[ir+3] = t[ii+3];
883:     x[ir+4] = t[ii+4];
884:     ii += 5;
885:   }

887:   ISRestoreIndices(isrow,&rout);
888:   ISRestoreIndices(iscol,&cout);
889:   VecRestoreArray(bb,&b);
890:   VecRestoreArray(xx,&x);
891:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);
892:   return(0);
893: }

897: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
898: {
899:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
900:   IS             iscol=a->col,isrow=a->row;
902:   const PetscInt *r,*c,*rout,*cout;
903:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
904:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
905:   MatScalar      *aa=a->a,*v;
906:   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
907:   PetscScalar    *x,*b,*t;

910:   VecGetArray(bb,&b);
911:   VecGetArray(xx,&x);
912:   t  = a->solve_work;

914:   ISGetIndices(isrow,&rout); r = rout;
915:   ISGetIndices(iscol,&cout); c = cout;

917:   /* copy the b into temp work space according to permutation */
918:   ii = 0;
919:   for (i=0; i<n; i++) {
920:     ic      = 6*c[i];
921:     t[ii]   = b[ic];
922:     t[ii+1] = b[ic+1];
923:     t[ii+2] = b[ic+2];
924:     t[ii+3] = b[ic+3];
925:     t[ii+4] = b[ic+4];
926:     t[ii+5] = b[ic+5];
927:     ii += 6;
928:   }

930:   /* forward solve the U^T */
931:   idx = 0;
932:   for (i=0; i<n; i++) {

934:     v     = aa + 36*diag[i];
935:     /* multiply by the inverse of the block diagonal */
936:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
937:     x6    = t[5+idx];
938:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
939:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
940:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
941:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
942:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
943:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
944:     v += 36;

946:     vi    = aj + diag[i] + 1;
947:     nz    = ai[i+1] - diag[i] - 1;
948:     while (nz--) {
949:       oidx = 6*(*vi++);
950:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
951:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
952:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
953:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
954:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
955:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
956:       v  += 36;
957:     }
958:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
959:     t[5+idx] = s6;
960:     idx += 6;
961:   }
962:   /* backward solve the L^T */
963:   for (i=n-1; i>=0; i--){
964:     v    = aa + 36*diag[i] - 36;
965:     vi   = aj + diag[i] - 1;
966:     nz   = diag[i] - ai[i];
967:     idt  = 6*i;
968:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
969:     s6 = t[5+idt];
970:     while (nz--) {
971:       idx   = 6*(*vi--);
972:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
973:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
974:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
975:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
976:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
977:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
978:       v -= 36;
979:     }
980:   }

982:   /* copy t into x according to permutation */
983:   ii = 0;
984:   for (i=0; i<n; i++) {
985:     ir      = 6*r[i];
986:     x[ir]   = t[ii];
987:     x[ir+1] = t[ii+1];
988:     x[ir+2] = t[ii+2];
989:     x[ir+3] = t[ii+3];
990:     x[ir+4] = t[ii+4];
991:     x[ir+5] = t[ii+5];
992:     ii += 6;
993:   }

995:   ISRestoreIndices(isrow,&rout);
996:   ISRestoreIndices(iscol,&cout);
997:   VecRestoreArray(bb,&b);
998:   VecRestoreArray(xx,&x);
999:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);
1000:   return(0);
1001: }

1005: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1006: {
1007:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1008:   IS             iscol=a->col,isrow=a->row;
1010:   const PetscInt *r,*c,*rout,*cout;
1011:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1012:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1013:   MatScalar      *aa=a->a,*v;
1014:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1015:   PetscScalar    *x,*b,*t;

1018:   VecGetArray(bb,&b);
1019:   VecGetArray(xx,&x);
1020:   t  = a->solve_work;

1022:   ISGetIndices(isrow,&rout); r = rout;
1023:   ISGetIndices(iscol,&cout); c = cout;

1025:   /* copy the b into temp work space according to permutation */
1026:   ii = 0;
1027:   for (i=0; i<n; i++) {
1028:     ic      = 7*c[i];
1029:     t[ii]   = b[ic];
1030:     t[ii+1] = b[ic+1];
1031:     t[ii+2] = b[ic+2];
1032:     t[ii+3] = b[ic+3];
1033:     t[ii+4] = b[ic+4];
1034:     t[ii+5] = b[ic+5];
1035:     t[ii+6] = b[ic+6];
1036:     ii += 7;
1037:   }

1039:   /* forward solve the U^T */
1040:   idx = 0;
1041:   for (i=0; i<n; i++) {

1043:     v     = aa + 49*diag[i];
1044:     /* multiply by the inverse of the block diagonal */
1045:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1046:     x6    = t[5+idx]; x7 = t[6+idx];
1047:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1048:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1049:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1050:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1051:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1052:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1053:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1054:     v += 49;

1056:     vi    = aj + diag[i] + 1;
1057:     nz    = ai[i+1] - diag[i] - 1;
1058:     while (nz--) {
1059:       oidx = 7*(*vi++);
1060:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1061:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1062:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1063:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1064:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1065:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1066:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1067:       v  += 49;
1068:     }
1069:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1070:     t[5+idx] = s6;t[6+idx] = s7;
1071:     idx += 7;
1072:   }
1073:   /* backward solve the L^T */
1074:   for (i=n-1; i>=0; i--){
1075:     v    = aa + 49*diag[i] - 49;
1076:     vi   = aj + diag[i] - 1;
1077:     nz   = diag[i] - ai[i];
1078:     idt  = 7*i;
1079:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1080:     s6 = t[5+idt];s7 = t[6+idt];
1081:     while (nz--) {
1082:       idx   = 7*(*vi--);
1083:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1084:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1085:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1086:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1087:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1088:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1089:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1090:       v -= 49;
1091:     }
1092:   }

1094:   /* copy t into x according to permutation */
1095:   ii = 0;
1096:   for (i=0; i<n; i++) {
1097:     ir      = 7*r[i];
1098:     x[ir]   = t[ii];
1099:     x[ir+1] = t[ii+1];
1100:     x[ir+2] = t[ii+2];
1101:     x[ir+3] = t[ii+3];
1102:     x[ir+4] = t[ii+4];
1103:     x[ir+5] = t[ii+5];
1104:     x[ir+6] = t[ii+6];
1105:     ii += 7;
1106:   }

1108:   ISRestoreIndices(isrow,&rout);
1109:   ISRestoreIndices(iscol,&cout);
1110:   VecRestoreArray(bb,&b);
1111:   VecRestoreArray(xx,&x);
1112:   PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);
1113:   return(0);
1114: }

1116: /* ----------------------------------------------------------- */
1119: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1120: {
1121:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1122:   IS             iscol=a->col,isrow=a->row;
1124:   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1125:   PetscInt       i,n=a->mbs;
1126:   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1127:   MatScalar      *aa=a->a,*v;
1128:   PetscScalar    *x,*b,*s,*t,*ls;

1131:   VecGetArray(bb,&b);
1132:   VecGetArray(xx,&x);
1133:   t  = a->solve_work;

1135:   ISGetIndices(isrow,&rout); r = rout;
1136:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1138:   /* forward solve the lower triangular */
1139:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1140:   for (i=1; i<n; i++) {
1141:     v   = aa + bs2*ai[i];
1142:     vi  = aj + ai[i];
1143:     nz  = a->diag[i] - ai[i];
1144:     s = t + bs*i;
1145:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1146:     while (nz--) {
1147:       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1148:       v += bs2;
1149:     }
1150:   }
1151:   /* backward solve the upper triangular */
1152:   ls = a->solve_work + A->cmap->n;
1153:   for (i=n-1; i>=0; i--){
1154:     v   = aa + bs2*(a->diag[i] + 1);
1155:     vi  = aj + a->diag[i] + 1;
1156:     nz  = ai[i+1] - a->diag[i] - 1;
1157:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1158:     while (nz--) {
1159:       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1160:       v += bs2;
1161:     }
1162:     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1163:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1164:   }

1166:   ISRestoreIndices(isrow,&rout);
1167:   ISRestoreIndices(iscol,&cout);
1168:   VecRestoreArray(bb,&b);
1169:   VecRestoreArray(xx,&x);
1170:   PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
1171:   return(0);
1172: }

1176: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1177: {
1178:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1179:   IS             iscol=a->col,isrow=a->row;
1181:   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1182:   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1183:   MatScalar      *aa=a->a,*v;
1184:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1185:   PetscScalar    *x,*b,*t;

1188:   VecGetArray(bb,&b);
1189:   VecGetArray(xx,&x);
1190:   t  = a->solve_work;

1192:   ISGetIndices(isrow,&rout); r = rout;
1193:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1195:   /* forward solve the lower triangular */
1196:   idx    = 7*(*r++);
1197:   t[0] = b[idx];   t[1] = b[1+idx];
1198:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1199:   t[5] = b[5+idx]; t[6] = b[6+idx];

1201:   for (i=1; i<n; i++) {
1202:     v     = aa + 49*ai[i];
1203:     vi    = aj + ai[i];
1204:     nz    = diag[i] - ai[i];
1205:     idx   = 7*(*r++);
1206:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1207:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1208:     while (nz--) {
1209:       idx   = 7*(*vi++);
1210:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1211:       x4    = t[3+idx];x5 = t[4+idx];
1212:       x6    = t[5+idx];x7 = t[6+idx];
1213:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1214:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1215:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1216:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1217:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1218:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1219:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1220:       v += 49;
1221:     }
1222:     idx = 7*i;
1223:     t[idx]   = s1;t[1+idx] = s2;
1224:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1225:     t[5+idx] = s6;t[6+idx] = s7;
1226:   }
1227:   /* backward solve the upper triangular */
1228:   for (i=n-1; i>=0; i--){
1229:     v    = aa + 49*diag[i] + 49;
1230:     vi   = aj + diag[i] + 1;
1231:     nz   = ai[i+1] - diag[i] - 1;
1232:     idt  = 7*i;
1233:     s1 = t[idt];  s2 = t[1+idt];
1234:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1235:     s6 = t[5+idt];s7 = t[6+idt];
1236:     while (nz--) {
1237:       idx   = 7*(*vi++);
1238:       x1    = t[idx];   x2 = t[1+idx];
1239:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1240:       x6    = t[5+idx]; x7 = t[6+idx];
1241:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1242:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1243:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1244:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1245:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1246:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1247:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1248:       v += 49;
1249:     }
1250:     idc = 7*(*c--);
1251:     v   = aa + 49*diag[i];
1252:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1253:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1254:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1255:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1256:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1257:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1258:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1259:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1260:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1261:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1262:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1263:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1264:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1265:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1266:   }

1268:   ISRestoreIndices(isrow,&rout);
1269:   ISRestoreIndices(iscol,&cout);
1270:   VecRestoreArray(bb,&b);
1271:   VecRestoreArray(xx,&x);
1272:   PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);
1273:   return(0);
1274: }

1278: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1279: {
1280:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1281:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1282:   PetscErrorCode    ierr;
1283:   PetscInt          *diag = a->diag,jdx;
1284:   const MatScalar   *aa=a->a,*v;
1285:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1286:   const PetscScalar *b;

1289:   VecGetArray(bb,(PetscScalar**)&b);
1290:   VecGetArray(xx,&x);
1291:   /* forward solve the lower triangular */
1292:   idx    = 0;
1293:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1294:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1295:   x[6] = b[6+idx];
1296:   for (i=1; i<n; i++) {
1297:     v     =  aa + 49*ai[i];
1298:     vi    =  aj + ai[i];
1299:     nz    =  diag[i] - ai[i];
1300:     idx   =  7*i;
1301:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1302:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1303:     s7  =  b[6+idx];
1304:     while (nz--) {
1305:       jdx   = 7*(*vi++);
1306:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1307:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1308:       x7    = x[6+jdx];
1309:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1310:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1311:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1312:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1313:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1314:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1315:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1316:       v += 49;
1317:      }
1318:     x[idx]   = s1;
1319:     x[1+idx] = s2;
1320:     x[2+idx] = s3;
1321:     x[3+idx] = s4;
1322:     x[4+idx] = s5;
1323:     x[5+idx] = s6;
1324:     x[6+idx] = s7;
1325:   }
1326:   /* backward solve the upper triangular */
1327:   for (i=n-1; i>=0; i--){
1328:     v    = aa + 49*diag[i] + 49;
1329:     vi   = aj + diag[i] + 1;
1330:     nz   = ai[i+1] - diag[i] - 1;
1331:     idt  = 7*i;
1332:     s1 = x[idt];   s2 = x[1+idt];
1333:     s3 = x[2+idt]; s4 = x[3+idt];
1334:     s5 = x[4+idt]; s6 = x[5+idt];
1335:     s7 = x[6+idt];
1336:     while (nz--) {
1337:       idx   = 7*(*vi++);
1338:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1339:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1340:       x7    = x[6+idx];
1341:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1342:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1343:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1344:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1345:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1346:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1347:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1348:       v += 49;
1349:     }
1350:     v        = aa + 49*diag[i];
1351:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1352:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1353:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1354:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1355:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1356:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1357:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1358:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1359:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1360:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1361:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1362:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1363:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1364:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1365:   }

1367:   VecRestoreArray(bb,(PetscScalar**)&b);
1368:   VecRestoreArray(xx,&x);
1369:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);
1370:   return(0);
1371: }

1375: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1376: {
1377:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1378:   IS                iscol=a->col,isrow=a->row;
1379:   PetscErrorCode    ierr;
1380:   const PetscInt    *r,*c,*rout,*cout;
1381:   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1382:   const MatScalar   *aa=a->a,*v;
1383:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1384:   const PetscScalar *b;
1386:   VecGetArray(bb,(PetscScalar**)&b);
1387:   VecGetArray(xx,&x);
1388:   t  = a->solve_work;

1390:   ISGetIndices(isrow,&rout); r = rout;
1391:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1393:   /* forward solve the lower triangular */
1394:   idx    = 6*(*r++);
1395:   t[0] = b[idx];   t[1] = b[1+idx];
1396:   t[2] = b[2+idx]; t[3] = b[3+idx];
1397:   t[4] = b[4+idx]; t[5] = b[5+idx];
1398:   for (i=1; i<n; i++) {
1399:     v     = aa + 36*ai[i];
1400:     vi    = aj + ai[i];
1401:     nz    = diag[i] - ai[i];
1402:     idx   = 6*(*r++);
1403:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1404:     s5  = b[4+idx]; s6 = b[5+idx];
1405:     while (nz--) {
1406:       idx   = 6*(*vi++);
1407:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1408:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1409:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415:       v += 36;
1416:     }
1417:     idx = 6*i;
1418:     t[idx]   = s1;t[1+idx] = s2;
1419:     t[2+idx] = s3;t[3+idx] = s4;
1420:     t[4+idx] = s5;t[5+idx] = s6;
1421:   }
1422:   /* backward solve the upper triangular */
1423:   for (i=n-1; i>=0; i--){
1424:     v    = aa + 36*diag[i] + 36;
1425:     vi   = aj + diag[i] + 1;
1426:     nz   = ai[i+1] - diag[i] - 1;
1427:     idt  = 6*i;
1428:     s1 = t[idt];  s2 = t[1+idt];
1429:     s3 = t[2+idt];s4 = t[3+idt];
1430:     s5 = t[4+idt];s6 = t[5+idt];
1431:     while (nz--) {
1432:       idx   = 6*(*vi++);
1433:       x1    = t[idx];   x2 = t[1+idx];
1434:       x3    = t[2+idx]; x4 = t[3+idx];
1435:       x5    = t[4+idx]; x6 = t[5+idx];
1436:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1437:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1438:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1439:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1440:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1441:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1442:       v += 36;
1443:     }
1444:     idc = 6*(*c--);
1445:     v   = aa + 36*diag[i];
1446:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1447:                                  v[18]*s4+v[24]*s5+v[30]*s6;
1448:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1449:                                  v[19]*s4+v[25]*s5+v[31]*s6;
1450:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1451:                                  v[20]*s4+v[26]*s5+v[32]*s6;
1452:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1453:                                  v[21]*s4+v[27]*s5+v[33]*s6;
1454:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1455:                                  v[22]*s4+v[28]*s5+v[34]*s6;
1456:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1457:                                  v[23]*s4+v[29]*s5+v[35]*s6;
1458:   }

1460:   ISRestoreIndices(isrow,&rout);
1461:   ISRestoreIndices(iscol,&cout);
1462:   VecRestoreArray(bb,(PetscScalar**)&b);
1463:   VecRestoreArray(xx,&x);
1464:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);
1465:   return(0);
1466: }

1470: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1471: {
1472:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1473:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1474:   PetscErrorCode    ierr;
1475:   PetscInt          *diag = a->diag,jdx;
1476:   const MatScalar   *aa=a->a,*v;
1477:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1478:   const PetscScalar *b;

1481:   VecGetArray(bb,(PetscScalar**)&b);
1482:   VecGetArray(xx,&x);
1483:   /* forward solve the lower triangular */
1484:   idx    = 0;
1485:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1486:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1487:   for (i=1; i<n; i++) {
1488:     v     =  aa + 36*ai[i];
1489:     vi    =  aj + ai[i];
1490:     nz    =  diag[i] - ai[i];
1491:     idx   =  6*i;
1492:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1493:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1494:     while (nz--) {
1495:       jdx   = 6*(*vi++);
1496:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1497:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1498:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1499:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1500:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1501:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1502:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1503:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1504:       v += 36;
1505:      }
1506:     x[idx]   = s1;
1507:     x[1+idx] = s2;
1508:     x[2+idx] = s3;
1509:     x[3+idx] = s4;
1510:     x[4+idx] = s5;
1511:     x[5+idx] = s6;
1512:   }
1513:   /* backward solve the upper triangular */
1514:   for (i=n-1; i>=0; i--){
1515:     v    = aa + 36*diag[i] + 36;
1516:     vi   = aj + diag[i] + 1;
1517:     nz   = ai[i+1] - diag[i] - 1;
1518:     idt  = 6*i;
1519:     s1 = x[idt];   s2 = x[1+idt];
1520:     s3 = x[2+idt]; s4 = x[3+idt];
1521:     s5 = x[4+idt]; s6 = x[5+idt];
1522:     while (nz--) {
1523:       idx   = 6*(*vi++);
1524:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1525:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1526:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1527:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1528:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1529:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1530:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1531:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1532:       v += 36;
1533:     }
1534:     v        = aa + 36*diag[i];
1535:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1536:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1537:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1538:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1539:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1540:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1541:   }

1543:   VecRestoreArray(bb,(PetscScalar**)&b);
1544:   VecRestoreArray(xx,&x);
1545:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);
1546:   return(0);
1547: }

1551: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1552: {
1553:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1554:   IS                iscol=a->col,isrow=a->row;
1555:   PetscErrorCode    ierr;
1556:   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
1557:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1558:   const MatScalar   *aa=a->a,*v;
1559:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1560:   const PetscScalar *b;

1563:   VecGetArray(bb,(PetscScalar**)&b);
1564:   VecGetArray(xx,&x);
1565:   t  = a->solve_work;

1567:   ISGetIndices(isrow,&rout); r = rout;
1568:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1570:   /* forward solve the lower triangular */
1571:   idx    = 5*(*r++);
1572:   t[0] = b[idx];   t[1] = b[1+idx];
1573:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1574:   for (i=1; i<n; i++) {
1575:     v     = aa + 25*ai[i];
1576:     vi    = aj + ai[i];
1577:     nz    = diag[i] - ai[i];
1578:     idx   = 5*(*r++);
1579:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1580:     s5  = b[4+idx];
1581:     while (nz--) {
1582:       idx   = 5*(*vi++);
1583:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1584:       x4    = t[3+idx];x5 = t[4+idx];
1585:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1586:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1587:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1588:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1589:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1590:       v += 25;
1591:     }
1592:     idx = 5*i;
1593:     t[idx]   = s1;t[1+idx] = s2;
1594:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1595:   }
1596:   /* backward solve the upper triangular */
1597:   for (i=n-1; i>=0; i--){
1598:     v    = aa + 25*diag[i] + 25;
1599:     vi   = aj + diag[i] + 1;
1600:     nz   = ai[i+1] - diag[i] - 1;
1601:     idt  = 5*i;
1602:     s1 = t[idt];  s2 = t[1+idt];
1603:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1604:     while (nz--) {
1605:       idx   = 5*(*vi++);
1606:       x1    = t[idx];   x2 = t[1+idx];
1607:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1608:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1609:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1610:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1611:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1612:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1613:       v += 25;
1614:     }
1615:     idc = 5*(*c--);
1616:     v   = aa + 25*diag[i];
1617:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1618:                                  v[15]*s4+v[20]*s5;
1619:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1620:                                  v[16]*s4+v[21]*s5;
1621:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1622:                                  v[17]*s4+v[22]*s5;
1623:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1624:                                  v[18]*s4+v[23]*s5;
1625:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1626:                                  v[19]*s4+v[24]*s5;
1627:   }

1629:   ISRestoreIndices(isrow,&rout);
1630:   ISRestoreIndices(iscol,&cout);
1631:   VecRestoreArray(bb,(PetscScalar**)&b);
1632:   VecRestoreArray(xx,&x);
1633:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);
1634:   return(0);
1635: }

1639: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1640: {
1641:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1642:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1643:   PetscErrorCode    ierr;
1644:   PetscInt          *diag = a->diag,jdx;
1645:   const MatScalar   *aa=a->a,*v;
1646:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1647:   const PetscScalar *b;

1650:   VecGetArray(bb,(PetscScalar**)&b);
1651:   VecGetArray(xx,&x);
1652:   /* forward solve the lower triangular */
1653:   idx    = 0;
1654:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1655:   for (i=1; i<n; i++) {
1656:     v     =  aa + 25*ai[i];
1657:     vi    =  aj + ai[i];
1658:     nz    =  diag[i] - ai[i];
1659:     idx   =  5*i;
1660:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1661:     while (nz--) {
1662:       jdx   = 5*(*vi++);
1663:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1664:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1665:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1666:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1667:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1668:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1669:       v    += 25;
1670:     }
1671:     x[idx]   = s1;
1672:     x[1+idx] = s2;
1673:     x[2+idx] = s3;
1674:     x[3+idx] = s4;
1675:     x[4+idx] = s5;
1676:   }
1677:   /* backward solve the upper triangular */
1678:   for (i=n-1; i>=0; i--){
1679:     v    = aa + 25*diag[i] + 25;
1680:     vi   = aj + diag[i] + 1;
1681:     nz   = ai[i+1] - diag[i] - 1;
1682:     idt  = 5*i;
1683:     s1 = x[idt];  s2 = x[1+idt];
1684:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1685:     while (nz--) {
1686:       idx   = 5*(*vi++);
1687:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1688:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1689:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1690:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1691:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1692:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1693:       v    += 25;
1694:     }
1695:     v        = aa + 25*diag[i];
1696:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1697:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1698:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1699:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1700:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1701:   }

1703:   VecRestoreArray(bb,(PetscScalar**)&b);
1704:   VecRestoreArray(xx,&x);
1705:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);
1706:   return(0);
1707: }

1711: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1712: {
1713:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1714:   IS                iscol=a->col,isrow=a->row;
1715:   PetscErrorCode    ierr;
1716:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1717:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1718:   const MatScalar   *aa=a->a,*v;
1719:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1720:   const PetscScalar *b;

1723:   VecGetArray(bb,(PetscScalar**)&b);
1724:   VecGetArray(xx,&x);
1725:   t  = a->solve_work;

1727:   ISGetIndices(isrow,&rout); r = rout;
1728:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1730:   /* forward solve the lower triangular */
1731:   idx    = 4*(*r++);
1732:   t[0] = b[idx];   t[1] = b[1+idx];
1733:   t[2] = b[2+idx]; t[3] = b[3+idx];
1734:   for (i=1; i<n; i++) {
1735:     v     = aa + 16*ai[i];
1736:     vi    = aj + ai[i];
1737:     nz    = diag[i] - ai[i];
1738:     idx   = 4*(*r++);
1739:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1740:     while (nz--) {
1741:       idx   = 4*(*vi++);
1742:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1743:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1744:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1745:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1746:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1747:       v    += 16;
1748:     }
1749:     idx        = 4*i;
1750:     t[idx]   = s1;t[1+idx] = s2;
1751:     t[2+idx] = s3;t[3+idx] = s4;
1752:   }
1753:   /* backward solve the upper triangular */
1754:   for (i=n-1; i>=0; i--){
1755:     v    = aa + 16*diag[i] + 16;
1756:     vi   = aj + diag[i] + 1;
1757:     nz   = ai[i+1] - diag[i] - 1;
1758:     idt  = 4*i;
1759:     s1 = t[idt];  s2 = t[1+idt];
1760:     s3 = t[2+idt];s4 = t[3+idt];
1761:     while (nz--) {
1762:       idx   = 4*(*vi++);
1763:       x1    = t[idx];   x2 = t[1+idx];
1764:       x3    = t[2+idx]; x4 = t[3+idx];
1765:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1766:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1767:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1768:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1769:       v += 16;
1770:     }
1771:     idc      = 4*(*c--);
1772:     v        = aa + 16*diag[i];
1773:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1774:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1775:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1776:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1777:   }

1779:   ISRestoreIndices(isrow,&rout);
1780:   ISRestoreIndices(iscol,&cout);
1781:   VecRestoreArray(bb,(PetscScalar**)&b);
1782:   VecRestoreArray(xx,&x);
1783:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
1784:   return(0);
1785: }

1789: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1790: {
1791:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1792:   IS                iscol=a->col,isrow=a->row;
1793:   PetscErrorCode    ierr;
1794:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1795:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1796:   const MatScalar   *aa=a->a,*v;
1797:   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1798:   PetscScalar       *x;
1799:   const PetscScalar *b;

1802:   VecGetArray(bb,(PetscScalar**)&b);
1803:   VecGetArray(xx,&x);
1804:   t  = (MatScalar *)a->solve_work;

1806:   ISGetIndices(isrow,&rout); r = rout;
1807:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1809:   /* forward solve the lower triangular */
1810:   idx    = 4*(*r++);
1811:   t[0] = (MatScalar)b[idx];
1812:   t[1] = (MatScalar)b[1+idx];
1813:   t[2] = (MatScalar)b[2+idx];
1814:   t[3] = (MatScalar)b[3+idx];
1815:   for (i=1; i<n; i++) {
1816:     v     = aa + 16*ai[i];
1817:     vi    = aj + ai[i];
1818:     nz    = diag[i] - ai[i];
1819:     idx   = 4*(*r++);
1820:     s1 = (MatScalar)b[idx];
1821:     s2 = (MatScalar)b[1+idx];
1822:     s3 = (MatScalar)b[2+idx];
1823:     s4 = (MatScalar)b[3+idx];
1824:     while (nz--) {
1825:       idx   = 4*(*vi++);
1826:       x1  = t[idx];
1827:       x2  = t[1+idx];
1828:       x3  = t[2+idx];
1829:       x4  = t[3+idx];
1830:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1831:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1832:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1833:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1834:       v    += 16;
1835:     }
1836:     idx        = 4*i;
1837:     t[idx]   = s1;
1838:     t[1+idx] = s2;
1839:     t[2+idx] = s3;
1840:     t[3+idx] = s4;
1841:   }
1842:   /* backward solve the upper triangular */
1843:   for (i=n-1; i>=0; i--){
1844:     v    = aa + 16*diag[i] + 16;
1845:     vi   = aj + diag[i] + 1;
1846:     nz   = ai[i+1] - diag[i] - 1;
1847:     idt  = 4*i;
1848:     s1 = t[idt];
1849:     s2 = t[1+idt];
1850:     s3 = t[2+idt];
1851:     s4 = t[3+idt];
1852:     while (nz--) {
1853:       idx   = 4*(*vi++);
1854:       x1  = t[idx];
1855:       x2  = t[1+idx];
1856:       x3  = t[2+idx];
1857:       x4  = t[3+idx];
1858:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1859:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1860:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1861:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1862:       v += 16;
1863:     }
1864:     idc      = 4*(*c--);
1865:     v        = aa + 16*diag[i];
1866:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1867:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1868:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1869:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1870:     x[idc]   = (PetscScalar)t[idt];
1871:     x[1+idc] = (PetscScalar)t[1+idt];
1872:     x[2+idc] = (PetscScalar)t[2+idt];
1873:     x[3+idc] = (PetscScalar)t[3+idt];
1874:  }

1876:   ISRestoreIndices(isrow,&rout);
1877:   ISRestoreIndices(iscol,&cout);
1878:   VecRestoreArray(bb,(PetscScalar**)&b);
1879:   VecRestoreArray(xx,&x);
1880:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
1881:   return(0);
1882: }

1884: #if defined (PETSC_HAVE_SSE)

1886: #include PETSC_HAVE_SSE

1890: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1891: {
1892:   /* 
1893:      Note: This code uses demotion of double
1894:      to float when performing the mixed-mode computation.
1895:      This may not be numerically reasonable for all applications.
1896:   */
1897:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1898:   IS             iscol=a->col,isrow=a->row;
1900:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
1901:   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1902:   MatScalar      *aa=a->a,*v;
1903:   PetscScalar    *x,*b,*t;

1905:   /* Make space in temp stack for 16 Byte Aligned arrays */
1906:   float           ssealignedspace[11],*tmps,*tmpx;
1907:   unsigned long   offset;
1908: 
1910:   SSE_SCOPE_BEGIN;

1912:     offset = (unsigned long)ssealignedspace % 16;
1913:     if (offset) offset = (16 - offset)/4;
1914:     tmps = &ssealignedspace[offset];
1915:     tmpx = &ssealignedspace[offset+4];
1916:     PREFETCH_NTA(aa+16*ai[1]);

1918:     VecGetArray(bb,&b);
1919:     VecGetArray(xx,&x);
1920:     t  = a->solve_work;

1922:     ISGetIndices(isrow,&rout); r = rout;
1923:     ISGetIndices(iscol,&cout); c = cout + (n-1);

1925:     /* forward solve the lower triangular */
1926:     idx  = 4*(*r++);
1927:     t[0] = b[idx];   t[1] = b[1+idx];
1928:     t[2] = b[2+idx]; t[3] = b[3+idx];
1929:     v    =  aa + 16*ai[1];

1931:     for (i=1; i<n;) {
1932:       PREFETCH_NTA(&v[8]);
1933:       vi   =  aj      + ai[i];
1934:       nz   =  diag[i] - ai[i];
1935:       idx  =  4*(*r++);

1937:       /* Demote sum from double to float */
1938:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1939:       LOAD_PS(tmps,XMM7);

1941:       while (nz--) {
1942:         PREFETCH_NTA(&v[16]);
1943:         idx = 4*(*vi++);
1944: 
1945:         /* Demote solution (so far) from double to float */
1946:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

1948:         /* 4x4 Matrix-Vector product with negative accumulation: */
1949:         SSE_INLINE_BEGIN_2(tmpx,v)
1950:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1952:           /* First Column */
1953:           SSE_COPY_PS(XMM0,XMM6)
1954:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1955:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1956:           SSE_SUB_PS(XMM7,XMM0)
1957: 
1958:           /* Second Column */
1959:           SSE_COPY_PS(XMM1,XMM6)
1960:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1961:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1962:           SSE_SUB_PS(XMM7,XMM1)
1963: 
1964:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1965: 
1966:           /* Third Column */
1967:           SSE_COPY_PS(XMM2,XMM6)
1968:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1969:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1970:           SSE_SUB_PS(XMM7,XMM2)

1972:           /* Fourth Column */
1973:           SSE_COPY_PS(XMM3,XMM6)
1974:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1975:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1976:           SSE_SUB_PS(XMM7,XMM3)
1977:         SSE_INLINE_END_2
1978: 
1979:         v  += 16;
1980:       }
1981:       idx = 4*i;
1982:       v   = aa + 16*ai[++i];
1983:       PREFETCH_NTA(v);
1984:       STORE_PS(tmps,XMM7);

1986:       /* Promote result from float to double */
1987:       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1988:     }
1989:     /* backward solve the upper triangular */
1990:     idt  = 4*(n-1);
1991:     ai16 = 16*diag[n-1];
1992:     v    = aa + ai16 + 16;
1993:     for (i=n-1; i>=0;){
1994:       PREFETCH_NTA(&v[8]);
1995:       vi = aj + diag[i] + 1;
1996:       nz = ai[i+1] - diag[i] - 1;
1997: 
1998:       /* Demote accumulator from double to float */
1999:       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
2000:       LOAD_PS(tmps,XMM7);

2002:       while (nz--) {
2003:         PREFETCH_NTA(&v[16]);
2004:         idx = 4*(*vi++);

2006:         /* Demote solution (so far) from double to float */
2007:         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

2009:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2010:         SSE_INLINE_BEGIN_2(tmpx,v)
2011:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2013:           /* First Column */
2014:           SSE_COPY_PS(XMM0,XMM6)
2015:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2016:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2017:           SSE_SUB_PS(XMM7,XMM0)

2019:           /* Second Column */
2020:           SSE_COPY_PS(XMM1,XMM6)
2021:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2022:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2023:           SSE_SUB_PS(XMM7,XMM1)

2025:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2026: 
2027:           /* Third Column */
2028:           SSE_COPY_PS(XMM2,XMM6)
2029:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2030:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2031:           SSE_SUB_PS(XMM7,XMM2)

2033:           /* Fourth Column */
2034:           SSE_COPY_PS(XMM3,XMM6)
2035:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2036:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2037:           SSE_SUB_PS(XMM7,XMM3)
2038:         SSE_INLINE_END_2
2039:         v  += 16;
2040:       }
2041:       v    = aa + ai16;
2042:       ai16 = 16*diag[--i];
2043:       PREFETCH_NTA(aa+ai16+16);
2044:       /* 
2045:          Scale the result by the diagonal 4x4 block, 
2046:          which was inverted as part of the factorization
2047:       */
2048:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2049:         /* First Column */
2050:         SSE_COPY_PS(XMM0,XMM7)
2051:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2052:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2054:         /* Second Column */
2055:         SSE_COPY_PS(XMM1,XMM7)
2056:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2057:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2058:         SSE_ADD_PS(XMM0,XMM1)

2060:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2061: 
2062:         /* Third Column */
2063:         SSE_COPY_PS(XMM2,XMM7)
2064:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2065:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2066:         SSE_ADD_PS(XMM0,XMM2)

2068:         /* Fourth Column */
2069:         SSE_COPY_PS(XMM3,XMM7)
2070:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2071:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2072:         SSE_ADD_PS(XMM0,XMM3)
2073: 
2074:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2075:       SSE_INLINE_END_3

2077:       /* Promote solution from float to double */
2078:       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

2080:       /* Apply reordering to t and stream into x.    */
2081:       /* This way, x doesn't pollute the cache.      */
2082:       /* Be careful with size: 2 doubles = 4 floats! */
2083:       idc  = 4*(*c--);
2084:       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2085:         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2086:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2087:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2088:         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2089:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2090:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2091:       SSE_INLINE_END_2
2092:       v    = aa + ai16 + 16;
2093:       idt -= 4;
2094:     }

2096:     ISRestoreIndices(isrow,&rout);
2097:     ISRestoreIndices(iscol,&cout);
2098:     VecRestoreArray(bb,&b);
2099:     VecRestoreArray(xx,&x);
2100:     PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
2101:   SSE_SCOPE_END;
2102:   return(0);
2103: }

2105: #endif


2108: /*
2109:       Special case where the matrix was ILU(0) factored in the natural
2110:    ordering. This eliminates the need for the column and row permutation.
2111: */
2114: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2115: {
2116:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2117:   PetscInt          n=a->mbs;
2118:   const PetscInt    *ai=a->i,*aj=a->j;
2119:   PetscErrorCode    ierr;
2120:   const PetscInt    *diag = a->diag;
2121:   const MatScalar   *aa=a->a;
2122:   PetscScalar       *x;
2123:   const PetscScalar *b;

2126:   VecGetArray(bb,(PetscScalar**)&b);
2127:   VecGetArray(xx,&x);

2129: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2130:   {
2131:     static PetscScalar w[2000]; /* very BAD need to fix */
2132:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2133:   }
2134: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2135:   {
2136:     static PetscScalar w[2000]; /* very BAD need to fix */
2137:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2138:   }
2139: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2140:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2141: #else
2142:   {
2143:     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2144:     const MatScalar *v;
2145:     PetscInt        jdx,idt,idx,nz,i,ai16;
2146:     const PetscInt  *vi;

2148:   /* forward solve the lower triangular */
2149:   idx    = 0;
2150:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2151:   for (i=1; i<n; i++) {
2152:     v     =  aa      + 16*ai[i];
2153:     vi    =  aj      + ai[i];
2154:     nz    =  diag[i] - ai[i];
2155:     idx   +=  4;
2156:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2157:     while (nz--) {
2158:       jdx   = 4*(*vi++);
2159:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2160:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2161:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2162:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2163:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2164:       v    += 16;
2165:     }
2166:     x[idx]   = s1;
2167:     x[1+idx] = s2;
2168:     x[2+idx] = s3;
2169:     x[3+idx] = s4;
2170:   }
2171:   /* backward solve the upper triangular */
2172:   idt = 4*(n-1);
2173:   for (i=n-1; i>=0; i--){
2174:     ai16 = 16*diag[i];
2175:     v    = aa + ai16 + 16;
2176:     vi   = aj + diag[i] + 1;
2177:     nz   = ai[i+1] - diag[i] - 1;
2178:     s1 = x[idt];  s2 = x[1+idt];
2179:     s3 = x[2+idt];s4 = x[3+idt];
2180:     while (nz--) {
2181:       idx   = 4*(*vi++);
2182:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2183:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2184:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2185:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2186:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2187:       v    += 16;
2188:     }
2189:     v        = aa + ai16;
2190:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2191:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2192:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2193:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2194:     idt -= 4;
2195:   }
2196:   }
2197: #endif

2199:   VecRestoreArray(bb,(PetscScalar**)&b);
2200:   VecRestoreArray(xx,&x);
2201:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
2202:   return(0);
2203: }

2207: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2208: {
2209:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2210:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2212:   PetscInt       *diag = a->diag;
2213:   MatScalar      *aa=a->a;
2214:   PetscScalar    *x,*b;

2217:   VecGetArray(bb,&b);
2218:   VecGetArray(xx,&x);

2220:   {
2221:     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2222:     MatScalar  *v,*t=(MatScalar *)x;
2223:     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;

2225:     /* forward solve the lower triangular */
2226:     idx  = 0;
2227:     t[0] = (MatScalar)b[0];
2228:     t[1] = (MatScalar)b[1];
2229:     t[2] = (MatScalar)b[2];
2230:     t[3] = (MatScalar)b[3];
2231:     for (i=1; i<n; i++) {
2232:       v     =  aa      + 16*ai[i];
2233:       vi    =  aj      + ai[i];
2234:       nz    =  diag[i] - ai[i];
2235:       idx   +=  4;
2236:       s1 = (MatScalar)b[idx];
2237:       s2 = (MatScalar)b[1+idx];
2238:       s3 = (MatScalar)b[2+idx];
2239:       s4 = (MatScalar)b[3+idx];
2240:       while (nz--) {
2241:         jdx = 4*(*vi++);
2242:         x1  = t[jdx];
2243:         x2  = t[1+jdx];
2244:         x3  = t[2+jdx];
2245:         x4  = t[3+jdx];
2246:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2247:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2248:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2249:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2250:         v    += 16;
2251:       }
2252:       t[idx]   = s1;
2253:       t[1+idx] = s2;
2254:       t[2+idx] = s3;
2255:       t[3+idx] = s4;
2256:     }
2257:     /* backward solve the upper triangular */
2258:     idt = 4*(n-1);
2259:     for (i=n-1; i>=0; i--){
2260:       ai16 = 16*diag[i];
2261:       v    = aa + ai16 + 16;
2262:       vi   = aj + diag[i] + 1;
2263:       nz   = ai[i+1] - diag[i] - 1;
2264:       s1   = t[idt];
2265:       s2   = t[1+idt];
2266:       s3   = t[2+idt];
2267:       s4   = t[3+idt];
2268:       while (nz--) {
2269:         idx = 4*(*vi++);
2270:         x1  = (MatScalar)x[idx];
2271:         x2  = (MatScalar)x[1+idx];
2272:         x3  = (MatScalar)x[2+idx];
2273:         x4  = (MatScalar)x[3+idx];
2274:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2275:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2276:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2277:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2278:         v    += 16;
2279:       }
2280:       v        = aa + ai16;
2281:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2282:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2283:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2284:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2285:       idt -= 4;
2286:     }
2287:   }

2289:   VecRestoreArray(bb,&b);
2290:   VecRestoreArray(xx,&x);
2291:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
2292:   return(0);
2293: }

2295: #if defined (PETSC_HAVE_SSE)

2297: #include PETSC_HAVE_SSE
2300: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2301: {
2302:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2303:   unsigned short *aj=(unsigned short *)a->j;
2305:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2306:   MatScalar      *aa=a->a;
2307:   PetscScalar    *x,*b;

2310:   SSE_SCOPE_BEGIN;
2311:   /* 
2312:      Note: This code currently uses demotion of double
2313:      to float when performing the mixed-mode computation.
2314:      This may not be numerically reasonable for all applications.
2315:   */
2316:   PREFETCH_NTA(aa+16*ai[1]);

2318:   VecGetArray(bb,&b);
2319:   VecGetArray(xx,&x);
2320:   {
2321:     /* x will first be computed in single precision then promoted inplace to double */
2322:     MatScalar      *v,*t=(MatScalar *)x;
2323:     int            nz,i,idt,ai16;
2324:     unsigned int   jdx,idx;
2325:     unsigned short *vi;
2326:     /* Forward solve the lower triangular factor. */

2328:     /* First block is the identity. */
2329:     idx  = 0;
2330:     CONVERT_DOUBLE4_FLOAT4(t,b);
2331:     v    =  aa + 16*((unsigned int)ai[1]);

2333:     for (i=1; i<n;) {
2334:       PREFETCH_NTA(&v[8]);
2335:       vi   =  aj      + ai[i];
2336:       nz   =  diag[i] - ai[i];
2337:       idx +=  4;

2339:       /* Demote RHS from double to float. */
2340:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2341:       LOAD_PS(&t[idx],XMM7);

2343:       while (nz--) {
2344:         PREFETCH_NTA(&v[16]);
2345:         jdx = 4*((unsigned int)(*vi++));
2346: 
2347:         /* 4x4 Matrix-Vector product with negative accumulation: */
2348:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2349:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2351:           /* First Column */
2352:           SSE_COPY_PS(XMM0,XMM6)
2353:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2354:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2355:           SSE_SUB_PS(XMM7,XMM0)

2357:           /* Second Column */
2358:           SSE_COPY_PS(XMM1,XMM6)
2359:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2360:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2361:           SSE_SUB_PS(XMM7,XMM1)

2363:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2364: 
2365:           /* Third Column */
2366:           SSE_COPY_PS(XMM2,XMM6)
2367:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2368:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2369:           SSE_SUB_PS(XMM7,XMM2)

2371:           /* Fourth Column */
2372:           SSE_COPY_PS(XMM3,XMM6)
2373:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2374:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2375:           SSE_SUB_PS(XMM7,XMM3)
2376:         SSE_INLINE_END_2
2377: 
2378:         v  += 16;
2379:       }
2380:       v    =  aa + 16*ai[++i];
2381:       PREFETCH_NTA(v);
2382:       STORE_PS(&t[idx],XMM7);
2383:     }

2385:     /* Backward solve the upper triangular factor.*/

2387:     idt  = 4*(n-1);
2388:     ai16 = 16*diag[n-1];
2389:     v    = aa + ai16 + 16;
2390:     for (i=n-1; i>=0;){
2391:       PREFETCH_NTA(&v[8]);
2392:       vi = aj + diag[i] + 1;
2393:       nz = ai[i+1] - diag[i] - 1;
2394: 
2395:       LOAD_PS(&t[idt],XMM7);

2397:       while (nz--) {
2398:         PREFETCH_NTA(&v[16]);
2399:         idx = 4*((unsigned int)(*vi++));

2401:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2402:         SSE_INLINE_BEGIN_2(&t[idx],v)
2403:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2405:           /* First Column */
2406:           SSE_COPY_PS(XMM0,XMM6)
2407:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2408:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2409:           SSE_SUB_PS(XMM7,XMM0)

2411:           /* Second Column */
2412:           SSE_COPY_PS(XMM1,XMM6)
2413:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2414:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2415:           SSE_SUB_PS(XMM7,XMM1)

2417:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2418: 
2419:           /* Third Column */
2420:           SSE_COPY_PS(XMM2,XMM6)
2421:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2422:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2423:           SSE_SUB_PS(XMM7,XMM2)

2425:           /* Fourth Column */
2426:           SSE_COPY_PS(XMM3,XMM6)
2427:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2428:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2429:           SSE_SUB_PS(XMM7,XMM3)
2430:         SSE_INLINE_END_2
2431:         v  += 16;
2432:       }
2433:       v    = aa + ai16;
2434:       ai16 = 16*diag[--i];
2435:       PREFETCH_NTA(aa+ai16+16);
2436:       /* 
2437:          Scale the result by the diagonal 4x4 block, 
2438:          which was inverted as part of the factorization
2439:       */
2440:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2441:         /* First Column */
2442:         SSE_COPY_PS(XMM0,XMM7)
2443:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2444:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2446:         /* Second Column */
2447:         SSE_COPY_PS(XMM1,XMM7)
2448:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2449:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2450:         SSE_ADD_PS(XMM0,XMM1)

2452:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2453: 
2454:         /* Third Column */
2455:         SSE_COPY_PS(XMM2,XMM7)
2456:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2457:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2458:         SSE_ADD_PS(XMM0,XMM2)

2460:         /* Fourth Column */
2461:         SSE_COPY_PS(XMM3,XMM7)
2462:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2463:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2464:         SSE_ADD_PS(XMM0,XMM3)

2466:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2467:       SSE_INLINE_END_3

2469:       v    = aa + ai16 + 16;
2470:       idt -= 4;
2471:     }

2473:     /* Convert t from single precision back to double precision (inplace)*/
2474:     idt = 4*(n-1);
2475:     for (i=n-1;i>=0;i--) {
2476:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2477:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2478:       PetscScalar *xtemp=&x[idt];
2479:       MatScalar   *ttemp=&t[idt];
2480:       xtemp[3] = (PetscScalar)ttemp[3];
2481:       xtemp[2] = (PetscScalar)ttemp[2];
2482:       xtemp[1] = (PetscScalar)ttemp[1];
2483:       xtemp[0] = (PetscScalar)ttemp[0];
2484:       idt -= 4;
2485:     }

2487:   } /* End of artificial scope. */
2488:   VecRestoreArray(bb,&b);
2489:   VecRestoreArray(xx,&x);
2490:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
2491:   SSE_SCOPE_END;
2492:   return(0);
2493: }

2497: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2498: {
2499:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2500:   int            *aj=a->j;
2502:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2503:   MatScalar      *aa=a->a;
2504:   PetscScalar    *x,*b;

2507:   SSE_SCOPE_BEGIN;
2508:   /* 
2509:      Note: This code currently uses demotion of double
2510:      to float when performing the mixed-mode computation.
2511:      This may not be numerically reasonable for all applications.
2512:   */
2513:   PREFETCH_NTA(aa+16*ai[1]);

2515:   VecGetArray(bb,&b);
2516:   VecGetArray(xx,&x);
2517:   {
2518:     /* x will first be computed in single precision then promoted inplace to double */
2519:     MatScalar *v,*t=(MatScalar *)x;
2520:     int       nz,i,idt,ai16;
2521:     int       jdx,idx;
2522:     int       *vi;
2523:     /* Forward solve the lower triangular factor. */

2525:     /* First block is the identity. */
2526:     idx  = 0;
2527:     CONVERT_DOUBLE4_FLOAT4(t,b);
2528:     v    =  aa + 16*ai[1];

2530:     for (i=1; i<n;) {
2531:       PREFETCH_NTA(&v[8]);
2532:       vi   =  aj      + ai[i];
2533:       nz   =  diag[i] - ai[i];
2534:       idx +=  4;

2536:       /* Demote RHS from double to float. */
2537:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2538:       LOAD_PS(&t[idx],XMM7);

2540:       while (nz--) {
2541:         PREFETCH_NTA(&v[16]);
2542:         jdx = 4*(*vi++);
2543: /*          jdx = *vi++; */
2544: 
2545:         /* 4x4 Matrix-Vector product with negative accumulation: */
2546:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2547:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2549:           /* First Column */
2550:           SSE_COPY_PS(XMM0,XMM6)
2551:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2552:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2553:           SSE_SUB_PS(XMM7,XMM0)

2555:           /* Second Column */
2556:           SSE_COPY_PS(XMM1,XMM6)
2557:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2558:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2559:           SSE_SUB_PS(XMM7,XMM1)

2561:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2562: 
2563:           /* Third Column */
2564:           SSE_COPY_PS(XMM2,XMM6)
2565:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2566:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2567:           SSE_SUB_PS(XMM7,XMM2)

2569:           /* Fourth Column */
2570:           SSE_COPY_PS(XMM3,XMM6)
2571:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2572:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2573:           SSE_SUB_PS(XMM7,XMM3)
2574:         SSE_INLINE_END_2
2575: 
2576:         v  += 16;
2577:       }
2578:       v    =  aa + 16*ai[++i];
2579:       PREFETCH_NTA(v);
2580:       STORE_PS(&t[idx],XMM7);
2581:     }

2583:     /* Backward solve the upper triangular factor.*/

2585:     idt  = 4*(n-1);
2586:     ai16 = 16*diag[n-1];
2587:     v    = aa + ai16 + 16;
2588:     for (i=n-1; i>=0;){
2589:       PREFETCH_NTA(&v[8]);
2590:       vi = aj + diag[i] + 1;
2591:       nz = ai[i+1] - diag[i] - 1;
2592: 
2593:       LOAD_PS(&t[idt],XMM7);

2595:       while (nz--) {
2596:         PREFETCH_NTA(&v[16]);
2597:         idx = 4*(*vi++);
2598: /*          idx = *vi++; */

2600:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2601:         SSE_INLINE_BEGIN_2(&t[idx],v)
2602:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2604:           /* First Column */
2605:           SSE_COPY_PS(XMM0,XMM6)
2606:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2607:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2608:           SSE_SUB_PS(XMM7,XMM0)

2610:           /* Second Column */
2611:           SSE_COPY_PS(XMM1,XMM6)
2612:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2613:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2614:           SSE_SUB_PS(XMM7,XMM1)

2616:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2617: 
2618:           /* Third Column */
2619:           SSE_COPY_PS(XMM2,XMM6)
2620:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2621:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2622:           SSE_SUB_PS(XMM7,XMM2)

2624:           /* Fourth Column */
2625:           SSE_COPY_PS(XMM3,XMM6)
2626:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2627:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2628:           SSE_SUB_PS(XMM7,XMM3)
2629:         SSE_INLINE_END_2
2630:         v  += 16;
2631:       }
2632:       v    = aa + ai16;
2633:       ai16 = 16*diag[--i];
2634:       PREFETCH_NTA(aa+ai16+16);
2635:       /* 
2636:          Scale the result by the diagonal 4x4 block, 
2637:          which was inverted as part of the factorization
2638:       */
2639:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2640:         /* First Column */
2641:         SSE_COPY_PS(XMM0,XMM7)
2642:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2643:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2645:         /* Second Column */
2646:         SSE_COPY_PS(XMM1,XMM7)
2647:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2648:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2649:         SSE_ADD_PS(XMM0,XMM1)

2651:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2652: 
2653:         /* Third Column */
2654:         SSE_COPY_PS(XMM2,XMM7)
2655:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2656:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2657:         SSE_ADD_PS(XMM0,XMM2)

2659:         /* Fourth Column */
2660:         SSE_COPY_PS(XMM3,XMM7)
2661:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2662:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2663:         SSE_ADD_PS(XMM0,XMM3)

2665:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2666:       SSE_INLINE_END_3

2668:       v    = aa + ai16 + 16;
2669:       idt -= 4;
2670:     }

2672:     /* Convert t from single precision back to double precision (inplace)*/
2673:     idt = 4*(n-1);
2674:     for (i=n-1;i>=0;i--) {
2675:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2676:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2677:       PetscScalar *xtemp=&x[idt];
2678:       MatScalar   *ttemp=&t[idt];
2679:       xtemp[3] = (PetscScalar)ttemp[3];
2680:       xtemp[2] = (PetscScalar)ttemp[2];
2681:       xtemp[1] = (PetscScalar)ttemp[1];
2682:       xtemp[0] = (PetscScalar)ttemp[0];
2683:       idt -= 4;
2684:     }

2686:   } /* End of artificial scope. */
2687:   VecRestoreArray(bb,&b);
2688:   VecRestoreArray(xx,&x);
2689:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);
2690:   SSE_SCOPE_END;
2691:   return(0);
2692: }

2694: #endif
2697: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2698: {
2699:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2700:   IS                iscol=a->col,isrow=a->row;
2701:   PetscErrorCode    ierr;
2702:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2703:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2704:   const MatScalar   *aa=a->a,*v;
2705:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2706:   const PetscScalar *b;

2709:   VecGetArray(bb,(PetscScalar**)&b);
2710:   VecGetArray(xx,&x);
2711:   t  = a->solve_work;

2713:   ISGetIndices(isrow,&rout); r = rout;
2714:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2716:   /* forward solve the lower triangular */
2717:   idx    = 3*(*r++);
2718:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2719:   for (i=1; i<n; i++) {
2720:     v     = aa + 9*ai[i];
2721:     vi    = aj + ai[i];
2722:     nz    = diag[i] - ai[i];
2723:     idx   = 3*(*r++);
2724:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2725:     while (nz--) {
2726:       idx   = 3*(*vi++);
2727:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2728:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2729:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2730:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2731:       v += 9;
2732:     }
2733:     idx = 3*i;
2734:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2735:   }
2736:   /* backward solve the upper triangular */
2737:   for (i=n-1; i>=0; i--){
2738:     v    = aa + 9*diag[i] + 9;
2739:     vi   = aj + diag[i] + 1;
2740:     nz   = ai[i+1] - diag[i] - 1;
2741:     idt  = 3*i;
2742:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2743:     while (nz--) {
2744:       idx   = 3*(*vi++);
2745:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2746:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2747:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2748:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2749:       v += 9;
2750:     }
2751:     idc = 3*(*c--);
2752:     v   = aa + 9*diag[i];
2753:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2754:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2755:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2756:   }
2757:   ISRestoreIndices(isrow,&rout);
2758:   ISRestoreIndices(iscol,&cout);
2759:   VecRestoreArray(bb,(PetscScalar**)&b);
2760:   VecRestoreArray(xx,&x);
2761:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);
2762:   return(0);
2763: }

2765: /*
2766:       Special case where the matrix was ILU(0) factored in the natural
2767:    ordering. This eliminates the need for the column and row permutation.
2768: */
2771: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2772: {
2773:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2774:   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2775:   PetscErrorCode    ierr;
2776:   PetscInt          *diag = a->diag;
2777:   const MatScalar   *aa=a->a,*v;
2778:   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2779:   const PetscScalar *b;
2780:   PetscInt          jdx,idt,idx,nz,*vi,i;

2783:   VecGetArray(bb,(PetscScalar**)&b);
2784:   VecGetArray(xx,&x);

2786:   /* forward solve the lower triangular */
2787:   idx    = 0;
2788:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2789:   for (i=1; i<n; i++) {
2790:     v     =  aa      + 9*ai[i];
2791:     vi    =  aj      + ai[i];
2792:     nz    =  diag[i] - ai[i];
2793:     idx   +=  3;
2794:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2795:     while (nz--) {
2796:       jdx   = 3*(*vi++);
2797:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2798:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2799:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2800:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2801:       v    += 9;
2802:     }
2803:     x[idx]   = s1;
2804:     x[1+idx] = s2;
2805:     x[2+idx] = s3;
2806:   }
2807:   /* backward solve the upper triangular */
2808:   for (i=n-1; i>=0; i--){
2809:     v    = aa + 9*diag[i] + 9;
2810:     vi   = aj + diag[i] + 1;
2811:     nz   = ai[i+1] - diag[i] - 1;
2812:     idt  = 3*i;
2813:     s1 = x[idt];  s2 = x[1+idt];
2814:     s3 = x[2+idt];
2815:     while (nz--) {
2816:       idx   = 3*(*vi++);
2817:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2818:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2819:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2820:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2821:       v    += 9;
2822:     }
2823:     v        = aa +  9*diag[i];
2824:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2825:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2826:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2827:   }

2829:   VecRestoreArray(bb,(PetscScalar**)&b);
2830:   VecRestoreArray(xx,&x);
2831:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);
2832:   return(0);
2833: }

2837: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2838: {
2839:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2840:   IS                iscol=a->col,isrow=a->row;
2841:   PetscErrorCode    ierr;
2842:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2843:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2844:   const MatScalar   *aa=a->a,*v;
2845:   PetscScalar       *x,s1,s2,x1,x2,*t;
2846:   const PetscScalar *b;

2849:   VecGetArray(bb,(PetscScalar**)&b);
2850:   VecGetArray(xx,&x);
2851:   t  = a->solve_work;

2853:   ISGetIndices(isrow,&rout); r = rout;
2854:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2856:   /* forward solve the lower triangular */
2857:   idx    = 2*(*r++);
2858:   t[0] = b[idx]; t[1] = b[1+idx];
2859:   for (i=1; i<n; i++) {
2860:     v     = aa + 4*ai[i];
2861:     vi    = aj + ai[i];
2862:     nz    = diag[i] - ai[i];
2863:     idx   = 2*(*r++);
2864:     s1  = b[idx]; s2 = b[1+idx];
2865:     while (nz--) {
2866:       idx   = 2*(*vi++);
2867:       x1    = t[idx]; x2 = t[1+idx];
2868:       s1 -= v[0]*x1 + v[2]*x2;
2869:       s2 -= v[1]*x1 + v[3]*x2;
2870:       v += 4;
2871:     }
2872:     idx = 2*i;
2873:     t[idx] = s1; t[1+idx] = s2;
2874:   }
2875:   /* backward solve the upper triangular */
2876:   for (i=n-1; i>=0; i--){
2877:     v    = aa + 4*diag[i] + 4;
2878:     vi   = aj + diag[i] + 1;
2879:     nz   = ai[i+1] - diag[i] - 1;
2880:     idt  = 2*i;
2881:     s1 = t[idt]; s2 = t[1+idt];
2882:     while (nz--) {
2883:       idx   = 2*(*vi++);
2884:       x1    = t[idx]; x2 = t[1+idx];
2885:       s1 -= v[0]*x1 + v[2]*x2;
2886:       s2 -= v[1]*x1 + v[3]*x2;
2887:       v += 4;
2888:     }
2889:     idc = 2*(*c--);
2890:     v   = aa + 4*diag[i];
2891:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2892:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2893:   }
2894:   ISRestoreIndices(isrow,&rout);
2895:   ISRestoreIndices(iscol,&cout);
2896:   VecRestoreArray(bb,(PetscScalar**)&b);
2897:   VecRestoreArray(xx,&x);
2898:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);
2899:   return(0);
2900: }

2902: /*
2903:       Special case where the matrix was ILU(0) factored in the natural
2904:    ordering. This eliminates the need for the column and row permutation.
2905: */
2908: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2909: {
2910:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2911:   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2912:   PetscErrorCode    ierr;
2913:   PetscInt          *diag = a->diag;
2914:   const MatScalar   *aa=a->a,*v;
2915:   PetscScalar       *x,s1,s2,x1,x2;
2916:   const PetscScalar *b;
2917:   PetscInt          jdx,idt,idx,nz,*vi,i;

2920:   VecGetArray(bb,(PetscScalar**)&b);
2921:   VecGetArray(xx,&x);

2923:   /* forward solve the lower triangular */
2924:   idx    = 0;
2925:   x[0]   = b[0]; x[1] = b[1];
2926:   for (i=1; i<n; i++) {
2927:     v     =  aa      + 4*ai[i];
2928:     vi    =  aj      + ai[i];
2929:     nz    =  diag[i] - ai[i];
2930:     idx   +=  2;
2931:     s1  =  b[idx];s2 = b[1+idx];
2932:     while (nz--) {
2933:       jdx   = 2*(*vi++);
2934:       x1    = x[jdx];x2 = x[1+jdx];
2935:       s1 -= v[0]*x1 + v[2]*x2;
2936:       s2 -= v[1]*x1 + v[3]*x2;
2937:       v    += 4;
2938:     }
2939:     x[idx]   = s1;
2940:     x[1+idx] = s2;
2941:   }
2942:   /* backward solve the upper triangular */
2943:   for (i=n-1; i>=0; i--){
2944:     v    = aa + 4*diag[i] + 4;
2945:     vi   = aj + diag[i] + 1;
2946:     nz   = ai[i+1] - diag[i] - 1;
2947:     idt  = 2*i;
2948:     s1 = x[idt];  s2 = x[1+idt];
2949:     while (nz--) {
2950:       idx   = 2*(*vi++);
2951:       x1    = x[idx];   x2 = x[1+idx];
2952:       s1 -= v[0]*x1 + v[2]*x2;
2953:       s2 -= v[1]*x1 + v[3]*x2;
2954:       v    += 4;
2955:     }
2956:     v        = aa +  4*diag[i];
2957:     x[idt]   = v[0]*s1 + v[2]*s2;
2958:     x[1+idt] = v[1]*s1 + v[3]*s2;
2959:   }

2961:   VecRestoreArray(bb,(PetscScalar**)&b);
2962:   VecRestoreArray(xx,&x);
2963:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);
2964:   return(0);
2965: }

2969: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2970: {
2971:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2972:   IS             iscol=a->col,isrow=a->row;
2974:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2975:   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2976:   MatScalar      *aa=a->a,*v;
2977:   PetscScalar    *x,*b,s1,*t;

2980:   if (!n) return(0);

2982:   VecGetArray(bb,&b);
2983:   VecGetArray(xx,&x);
2984:   t  = a->solve_work;

2986:   ISGetIndices(isrow,&rout); r = rout;
2987:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2989:   /* forward solve the lower triangular */
2990:   t[0] = b[*r++];
2991:   for (i=1; i<n; i++) {
2992:     v     = aa + ai[i];
2993:     vi    = aj + ai[i];
2994:     nz    = diag[i] - ai[i];
2995:     s1  = b[*r++];
2996:     while (nz--) {
2997:       s1 -= (*v++)*t[*vi++];
2998:     }
2999:     t[i] = s1;
3000:   }
3001:   /* backward solve the upper triangular */
3002:   for (i=n-1; i>=0; i--){
3003:     v    = aa + diag[i] + 1;
3004:     vi   = aj + diag[i] + 1;
3005:     nz   = ai[i+1] - diag[i] - 1;
3006:     s1 = t[i];
3007:     while (nz--) {
3008:       s1 -= (*v++)*t[*vi++];
3009:     }
3010:     x[*c--] = t[i] = aa[diag[i]]*s1;
3011:   }

3013:   ISRestoreIndices(isrow,&rout);
3014:   ISRestoreIndices(iscol,&cout);
3015:   VecRestoreArray(bb,&b);
3016:   VecRestoreArray(xx,&x);
3017:   PetscLogFlops(2*1*(a->nz) - A->cmap->n);
3018:   return(0);
3019: }
3020: /*
3021:       Special case where the matrix was ILU(0) factored in the natural
3022:    ordering. This eliminates the need for the column and row permutation.
3023: */
3026: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3027: {
3028:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3029:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3031:   PetscInt       *diag = a->diag;
3032:   MatScalar      *aa=a->a;
3033:   PetscScalar    *x,*b;
3034:   PetscScalar    s1,x1;
3035:   MatScalar      *v;
3036:   PetscInt       jdx,idt,idx,nz,*vi,i;

3039:   VecGetArray(bb,&b);
3040:   VecGetArray(xx,&x);

3042:   /* forward solve the lower triangular */
3043:   idx    = 0;
3044:   x[0]   = b[0];
3045:   for (i=1; i<n; i++) {
3046:     v     =  aa      + ai[i];
3047:     vi    =  aj      + ai[i];
3048:     nz    =  diag[i] - ai[i];
3049:     idx   +=  1;
3050:     s1  =  b[idx];
3051:     while (nz--) {
3052:       jdx   = *vi++;
3053:       x1    = x[jdx];
3054:       s1 -= v[0]*x1;
3055:       v    += 1;
3056:     }
3057:     x[idx]   = s1;
3058:   }
3059:   /* backward solve the upper triangular */
3060:   for (i=n-1; i>=0; i--){
3061:     v    = aa + diag[i] + 1;
3062:     vi   = aj + diag[i] + 1;
3063:     nz   = ai[i+1] - diag[i] - 1;
3064:     idt  = i;
3065:     s1 = x[idt];
3066:     while (nz--) {
3067:       idx   = *vi++;
3068:       x1    = x[idx];
3069:       s1 -= v[0]*x1;
3070:       v    += 1;
3071:     }
3072:     v        = aa +  diag[i];
3073:     x[idt]   = v[0]*s1;
3074:   }
3075:   VecRestoreArray(bb,&b);
3076:   VecRestoreArray(xx,&x);
3077:   PetscLogFlops(2*(a->nz) - A->cmap->n);
3078:   return(0);
3079: }

3081: /* ----------------------------------------------------------------*/
3082: /*
3083:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3084:    except that the data structure of Mat_SeqAIJ is slightly different.
3085:    Not a good example of code reuse.
3086: */
3087: EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
3088: EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);

3092: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3093: {
3094:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3095:   IS             isicol;
3097:   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
3098:   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3099:   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3100:   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3101:   PetscTruth     col_identity,row_identity,both_identity,flg;
3102:   PetscReal      f;

3105:   f             = info->fill;
3106:   levels        = (PetscInt)info->levels;
3107:   diagonal_fill = (PetscInt)info->diagonal_fill;
3108:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3109:   ISIdentity(isrow,&row_identity);
3110:   ISIdentity(iscol,&col_identity);
3111:   both_identity = (PetscTruth) (row_identity && col_identity);

3113:   if (!levels && both_identity) {  /* special case copy the nonzero structure */
3114:     MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);
3115:     fact->factor = MAT_FACTOR_ILU;
3116:     b               = (Mat_SeqBAIJ*)(fact)->data;
3117:     MatMissingDiagonal_SeqBAIJ(fact,&flg,&dd);
3118:     if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd);
3119:     b->row        = isrow;
3120:     b->col        = iscol;
3121:     PetscObjectReference((PetscObject)isrow);
3122:     PetscObjectReference((PetscObject)iscol);
3123:     b->icol       = isicol;
3124:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3125:     PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);
3126:   } else { /* general case perform the symbolic factorization */
3127:     ISGetIndices(isrow,&r);
3128:     ISGetIndices(isicol,&ic);

3130:     /* get new row pointers */
3131:     PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
3132:     ainew[0] = 0;
3133:     /* don't know how many column pointers are needed so estimate */
3134:     jmax = (PetscInt)(f*ai[n] + 1);
3135:     PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
3136:     /* ajfill is level of fill for each fill entry */
3137:     PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
3138:     /* fill is a linked list of nonzeros in active row */
3139:     PetscMalloc((n+1)*sizeof(PetscInt),&fill);
3140:     /* im is level for each filled value */
3141:     PetscMalloc((n+1)*sizeof(PetscInt),&im);
3142:     /* dloc is location of diagonal in factor */
3143:     PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
3144:     dloc[0]  = 0;
3145:     for (prow=0; prow<n; prow++) {

3147:       /* copy prow into linked list */
3148:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3149:       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3150:       xi         = aj + ai[r[prow]];
3151:       fill[n]    = n;
3152:       fill[prow] = -1; /* marker for diagonal entry */
3153:       while (nz--) {
3154:         fm  = n;
3155:         idx = ic[*xi++];
3156:         do {
3157:           m  = fm;
3158:           fm = fill[m];
3159:         } while (fm < idx);
3160:         fill[m]   = idx;
3161:         fill[idx] = fm;
3162:         im[idx]   = 0;
3163:       }

3165:       /* make sure diagonal entry is included */
3166:       if (diagonal_fill && fill[prow] == -1) {
3167:         fm = n;
3168:         while (fill[fm] < prow) fm = fill[fm];
3169:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
3170:         fill[fm]   = prow;
3171:         im[prow]   = 0;
3172:         nzf++;
3173:         dcount++;
3174:       }

3176:       nzi = 0;
3177:       row = fill[n];
3178:       while (row < prow) {
3179:         incrlev = im[row] + 1;
3180:         nz      = dloc[row];
3181:         xi      = ajnew  + ainew[row] + nz + 1;
3182:         flev    = ajfill + ainew[row] + nz + 1;
3183:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
3184:         fm      = row;
3185:         while (nnz-- > 0) {
3186:           idx = *xi++;
3187:           if (*flev + incrlev > levels) {
3188:             flev++;
3189:             continue;
3190:           }
3191:           do {
3192:             m  = fm;
3193:             fm = fill[m];
3194:           } while (fm < idx);
3195:           if (fm != idx) {
3196:             im[idx]   = *flev + incrlev;
3197:             fill[m]   = idx;
3198:             fill[idx] = fm;
3199:             fm        = idx;
3200:             nzf++;
3201:           } else {
3202:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3203:           }
3204:           flev++;
3205:         }
3206:         row = fill[row];
3207:         nzi++;
3208:       }
3209:       /* copy new filled row into permanent storage */
3210:       ainew[prow+1] = ainew[prow] + nzf;
3211:       if (ainew[prow+1] > jmax) {

3213:         /* estimate how much additional space we will need */
3214:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3215:         /* just double the memory each time */
3216:         PetscInt maxadd = jmax;
3217:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3218:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3219:         jmax += maxadd;

3221:         /* allocate a longer ajnew and ajfill */
3222:         PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
3223:         PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));
3224:         PetscFree(ajnew);
3225:         ajnew = xitmp;
3226:         PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
3227:         PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));
3228:         PetscFree(ajfill);
3229:         ajfill = xitmp;
3230:         reallocate++; /* count how many reallocations are needed */
3231:       }
3232:       xitmp       = ajnew + ainew[prow];
3233:       flev        = ajfill + ainew[prow];
3234:       dloc[prow]  = nzi;
3235:       fm          = fill[n];
3236:       while (nzf--) {
3237:         *xitmp++ = fm;
3238:         *flev++ = im[fm];
3239:         fm      = fill[fm];
3240:       }
3241:       /* make sure row has diagonal entry */
3242:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3243:         SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3244:     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3245:       }
3246:     }
3247:     PetscFree(ajfill);
3248:     ISRestoreIndices(isrow,&r);
3249:     ISRestoreIndices(isicol,&ic);
3250:     PetscFree(fill);
3251:     PetscFree(im);

3253: #if defined(PETSC_USE_INFO)
3254:     {
3255:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3256:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
3257:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
3258:       PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
3259:       PetscInfo(A,"for best performance.\n");
3260:       if (diagonal_fill) {
3261:         PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
3262:       }
3263:     }
3264: #endif

3266:     /* put together the new matrix */
3267:     MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
3268:     PetscLogObjectParent(fact,isicol);
3269:     b    = (Mat_SeqBAIJ*)(fact)->data;
3270:     b->free_a       = PETSC_TRUE;
3271:     b->free_ij      = PETSC_TRUE;
3272:     b->singlemalloc = PETSC_FALSE;
3273:     PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
3274:     b->j          = ajnew;
3275:     b->i          = ainew;
3276:     for (i=0; i<n; i++) dloc[i] += ainew[i];
3277:     b->diag       = dloc;
3278:     b->ilen       = 0;
3279:     b->imax       = 0;
3280:     b->row        = isrow;
3281:     b->col        = iscol;
3282:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3283:     PetscObjectReference((PetscObject)isrow);
3284:     PetscObjectReference((PetscObject)iscol);
3285:     b->icol       = isicol;
3286:     PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3287:     /* In b structure:  Free imax, ilen, old a, old j.  
3288:        Allocate dloc, solve_work, new a, new j */
3289:     PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
3290:     b->maxnz          = b->nz = ainew[n];

3292:     (fact)->info.factor_mallocs    = reallocate;
3293:     (fact)->info.fill_ratio_given  = f;
3294:     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3295:   }
3296:   MatSeqBAIJSetNumericFactorization(fact,both_identity);
3297:   return(0);
3298: }

3302: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3303: {
3304:   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3305:   /* int i,*AJ=a->j,nz=a->nz; */
3307:   /* Undo Column scaling */
3308: /*    while (nz--) { */
3309: /*      AJ[i] = AJ[i]/4; */
3310: /*    } */
3311:   /* This should really invoke a push/pop logic, but we don't have that yet. */
3312:   A->ops->setunfactored = PETSC_NULL;
3313:   return(0);
3314: }

3318: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3319: {
3320:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3321:   PetscInt       *AJ=a->j,nz=a->nz;
3322:   unsigned short *aj=(unsigned short *)AJ;
3324:   /* Is this really necessary? */
3325:   while (nz--) {
3326:     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3327:   }
3328:   A->ops->setunfactored = PETSC_NULL;
3329:   return(0);
3330: }