Actual source code: sbaijfact2.c

  1: #define PETSCMAT_DLL

  3: /*
  4:     Factorization code for SBAIJ format. 
  5: */

 7:  #include ../src/mat/impls/sbaij/seq/sbaij.h
 8:  #include ../src/mat/impls/baij/seq/baij.h
 9:  #include ../src/inline/ilu.h
 10:  #include ../src/inline/dot.h

 14: PetscErrorCode MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
 15: {
 16:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
 17:   IS              isrow=a->row;
 18:   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j;
 19:   PetscErrorCode  ierr;
 20:   const PetscInt  *r;
 21:   PetscInt        nz,*vj,k,idx,k1;
 22:   PetscInt        bs=A->rmap->bs,bs2 = a->bs2;
 23:   MatScalar       *aa=a->a,*v,*diag;
 24:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;

 27:   VecGetArray(bb,&b);
 28:   VecGetArray(xx,&x);
 29:   t  = a->solve_work;
 30:   ISGetIndices(isrow,&r);
 31:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

 33:   /* solve U^T * D * y = b by forward substitution */
 34:   xk = t;
 35:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 36:     idx   = bs*r[k];
 37:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 38:   }
 39:   for (k=0; k<mbs; k++){
 40:     v  = aa + bs2*ai[k];
 41:     xk = t + k*bs;      /* Dk*xk = k-th block of x */
 42:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
 43:     nz = ai[k+1] - ai[k];
 44:     vj = aj + ai[k];
 45:     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
 46:     while (nz--) {
 47:       /* x(:) += U(k,:)^T*(Dk*xk) */
 48:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 49:       vj++; xj = t + (*vj)*bs;
 50:       v += bs2;
 51:     }
 52:     /* xk = inv(Dk)*(Dk*xk) */
 53:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 54:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 55:   }

 57:   /* solve U*x = y by back substitution */
 58:   for (k=mbs-1; k>=0; k--){
 59:     v  = aa + bs2*ai[k];
 60:     xk = t + k*bs;        /* xk */
 61:     nz = ai[k+1] - ai[k];
 62:     vj = aj + ai[k];
 63:     xj = t + (*vj)*bs;
 64:     while (nz--) {
 65:       /* xk += U(k,:)*x(:) */
 66:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 67:       vj++;
 68:       v += bs2; xj = t + (*vj)*bs;
 69:     }
 70:     idx   = bs*r[k];
 71:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 72:   }

 74:   PetscFree(xk_tmp);
 75:   ISRestoreIndices(isrow,&r);
 76:   VecRestoreArray(bb,&b);
 77:   VecRestoreArray(xx,&x);
 78:   PetscLogFlops(bs2*(2*a->nz + mbs));
 79:   return(0);
 80: }

 84: PetscErrorCode MatForwardSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
 85: {
 87:   SETERRQ(1,"not implemented yet");
 88:   return(0);
 89: }

 93: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
 94: {
 96:   SETERRQ(1,"not implemented yet");
 97:   return(0);
 98: }

102: PetscErrorCode ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
103: {
105:   PetscInt       nz,*vj,k;
106:   PetscInt       bs2 = bs*bs;
107:   MatScalar      *v,*diag;
108:   PetscScalar    *xk,*xj,*xk_tmp;
109: 
111:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
112:   for (k=0; k<mbs; k++){
113:     v  = aa + bs2*ai[k];
114:     xk = x + k*bs;      /* Dk*xk = k-th block of x */
115:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
116:     nz = ai[k+1] - ai[k];
117:     vj = aj + ai[k];
118:     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
119:     while (nz--) {
120:       /* x(:) += U(k,:)^T*(Dk*xk) */
121:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
122:       vj++; xj = x + (*vj)*bs;
123:       v += bs2;
124:     }
125:     /* xk = inv(Dk)*(Dk*xk) */
126:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
127:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
128:   }
129:   PetscFree(xk_tmp);
130:   return(0);
131: }

135: PetscErrorCode BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
136: {
137:   PetscInt       nz,*vj,k;
138:   PetscInt       bs2 = bs*bs;
139:   MatScalar      *v;
140:   PetscScalar    *xk,*xj;

143:   for (k=mbs-1; k>=0; k--){
144:     v  = aa + bs2*ai[k];
145:     xk = x + k*bs;        /* xk */
146:     nz = ai[k+1] - ai[k];
147:     vj = aj + ai[k];
148:     xj = x + (*vj)*bs;
149:     while (nz--) {
150:       /* xk += U(k,:)*x(:) */
151:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
152:       vj++;
153:       v += bs2; xj = x + (*vj)*bs;
154:     }
155:   }
156:   return(0);
157: }

161: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
162: {
163:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
165:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
166:   PetscInt       bs=A->rmap->bs;
167:   MatScalar      *aa=a->a;
168:   PetscScalar    *x,*b;
169: #if defined(PETSC_USE_LOG)
170:   PetscInt       bs2 = a->bs2;
171: #endif

174:   VecGetArray(bb,&b);
175:   VecGetArray(xx,&x);

177:   /* solve U^T * D * y = b by forward substitution */
178:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
179:   ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);

181:   /* solve U*x = y by back substitution */
182:   BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);

184:   VecRestoreArray(bb,&b);
185:   VecRestoreArray(xx,&x);
186:   PetscLogFlops(bs2*(2*a->nz + mbs));
187:   return(0);
188: }

192: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
193: {
194:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
196:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
197:   PetscInt       bs=A->rmap->bs;
198:   MatScalar      *aa=a->a;
199:   PetscScalar    *x,*b;
200: #if defined(PETSC_USE_LOG)
201:   PetscInt       bs2 = a->bs2;
202: #endif

205:   VecGetArray(bb,&b);
206:   VecGetArray(xx,&x);
207:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
208:   ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
209:   VecRestoreArray(bb,&b);
210:   VecRestoreArray(xx,&x);
211:   PetscLogFlops(bs2*a->nz + A->rmap->N);
212:   return(0);
213: }

217: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
218: {
219:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
221:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
222:   PetscInt       bs=A->rmap->bs;
223:   MatScalar      *aa=a->a;
224:   PetscScalar    *x,*b;
225: #if defined(PETSC_USE_LOG)
226:   PetscInt       bs2 = a->bs2;
227: #endif

230:   VecGetArray(bb,&b);
231:   VecGetArray(xx,&x);
232:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
233:   BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
234:   VecRestoreArray(bb,&b);
235:   VecRestoreArray(xx,&x);
236:   PetscLogFlops(bs2*a->nz);
237:   return(0);
238: }

242: PetscErrorCode MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
243: {
244:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
245:   IS             isrow=a->row;
246:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
248:   const PetscInt *r;
249:   PetscInt       nz,*vj,k,idx;
250:   MatScalar      *aa=a->a,*v,*d;
251:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;

254:   VecGetArray(bb,&b);
255:   VecGetArray(xx,&x);
256:   t  = a->solve_work;
257:   ISGetIndices(isrow,&r);

259:   /* solve U^T * D * y = b by forward substitution */
260:   tp = t;
261:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
262:     idx   = 7*r[k];
263:     tp[0] = b[idx];
264:     tp[1] = b[idx+1];
265:     tp[2] = b[idx+2];
266:     tp[3] = b[idx+3];
267:     tp[4] = b[idx+4];
268:     tp[5] = b[idx+5];
269:     tp[6] = b[idx+6];
270:     tp += 7;
271:   }
272: 
273:   for (k=0; k<mbs; k++){
274:     v  = aa + 49*ai[k];
275:     vj = aj + ai[k];
276:     tp = t + k*7;
277:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
278:     nz = ai[k+1] - ai[k];
279:     tp = t + (*vj)*7;
280:     while (nz--) {
281:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
282:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
283:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
284:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
285:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
286:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
287:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
288:       vj++; tp = t + (*vj)*7;
289:       v += 49;
290:     }

292:     /* xk = inv(Dk)*(Dk*xk) */
293:     d  = aa+k*49;          /* ptr to inv(Dk) */
294:     tp    = t + k*7;
295:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
296:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
297:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
298:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
299:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
300:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
301:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
302:   }

304:   /* solve U*x = y by back substitution */
305:   for (k=mbs-1; k>=0; k--){
306:     v  = aa + 49*ai[k];
307:     vj = aj + ai[k];
308:     tp    = t + k*7;
309:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
310:     nz = ai[k+1] - ai[k];
311: 
312:     tp = t + (*vj)*7;
313:     while (nz--) {
314:       /* xk += U(k,:)*x(:) */
315:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
316:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
317:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
318:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
319:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
320:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
321:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
322:       vj++; tp = t + (*vj)*7;
323:       v += 49;
324:     }
325:     tp    = t + k*7;
326:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
327:     idx   = 7*r[k];
328:     x[idx]     = x0;
329:     x[idx+1]   = x1;
330:     x[idx+2]   = x2;
331:     x[idx+3]   = x3;
332:     x[idx+4]   = x4;
333:     x[idx+5]   = x5;
334:     x[idx+6]   = x6;
335:   }

337:   ISRestoreIndices(isrow,&r);
338:   VecRestoreArray(bb,&b);
339:   VecRestoreArray(xx,&x);
340:   PetscLogFlops(49*(2*a->nz + mbs));
341:   return(0);
342: }

346: PetscErrorCode ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
347: {
348:   MatScalar      *v,*d;
349:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
350:   PetscInt       nz,*vj,k;

353:   for (k=0; k<mbs; k++){
354:     v  = aa + 49*ai[k];
355:     xp = x + k*7;
356:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
357:     nz = ai[k+1] - ai[k];
358:     vj = aj + ai[k];
359:     xp = x + (*vj)*7;
360:     while (nz--) {
361:       /* x(:) += U(k,:)^T*(Dk*xk) */
362:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
363:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
364:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
365:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
366:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
367:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
368:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
369:       vj++; xp = x + (*vj)*7;
370:       v += 49;
371:     }
372:     /* xk = inv(Dk)*(Dk*xk) */
373:     d  = aa+k*49;          /* ptr to inv(Dk) */
374:     xp = x + k*7;
375:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
376:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
377:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
378:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
379:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
380:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
381:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
382:   }
383:   return(0);
384: }

388: PetscErrorCode BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
389: {
390:   MatScalar      *v;
391:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
392:   PetscInt       nz,*vj,k;

395:   for (k=mbs-1; k>=0; k--){
396:     v  = aa + 49*ai[k];
397:     xp = x + k*7;
398:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
399:     nz = ai[k+1] - ai[k];
400:     vj = aj + ai[k];
401:     xp = x + (*vj)*7;
402:     while (nz--) {
403:       /* xk += U(k,:)*x(:) */
404:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
405:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
406:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
407:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
408:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
409:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
410:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
411:       vj++;
412:       v += 49; xp = x + (*vj)*7;
413:     }
414:     xp = x + k*7;
415:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
416:   }
417:   return(0);
418: }

422: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
423: {
424:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
426:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
427:   MatScalar      *aa=a->a;
428:   PetscScalar    *x,*b;

431:   VecGetArray(bb,&b);
432:   VecGetArray(xx,&x);
433: 
434:   /* solve U^T * D * y = b by forward substitution */
435:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
436:   ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);

438:   /* solve U*x = y by back substitution */
439:   BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);

441:   VecRestoreArray(bb,&b);
442:   VecRestoreArray(xx,&x);
443:   PetscLogFlops(49*(2*a->nz + mbs));
444:   return(0);
445: }

449: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
450: {
451:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
453:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
454:   MatScalar      *aa=a->a;
455:   PetscScalar    *x,*b;

458:   VecGetArray(bb,&b);
459:   VecGetArray(xx,&x);
460:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
461:   ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
462:   VecRestoreArray(bb,&b);
463:   VecRestoreArray(xx,&x);
464:   PetscLogFlops(49*a->nz + mbs);
465:   return(0);
466: }

470: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
471: {
472:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
474:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
475:   MatScalar      *aa=a->a;
476:   PetscScalar    *x,*b;

479:   VecGetArray(bb,&b);
480:   VecGetArray(xx,&x);
481:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
482:   BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
483:   VecRestoreArray(bb,&b);
484:   VecRestoreArray(xx,&x);
485:   PetscLogFlops(49*a->nz);
486:   return(0);
487: }

491: PetscErrorCode MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
492: {
493:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
494:   IS             isrow=a->row;
495:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
497:   const PetscInt *r;
498:   PetscInt       nz,*vj,k,idx;
499:   MatScalar      *aa=a->a,*v,*d;
500:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;

503:   VecGetArray(bb,&b);
504:   VecGetArray(xx,&x);
505:   t  = a->solve_work;
506:   ISGetIndices(isrow,&r);

508:   /* solve U^T * D * y = b by forward substitution */
509:   tp = t;
510:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
511:     idx   = 6*r[k];
512:     tp[0] = b[idx];
513:     tp[1] = b[idx+1];
514:     tp[2] = b[idx+2];
515:     tp[3] = b[idx+3];
516:     tp[4] = b[idx+4];
517:     tp[5] = b[idx+5];
518:     tp += 6;
519:   }
520: 
521:   for (k=0; k<mbs; k++){
522:     v  = aa + 36*ai[k];
523:     vj = aj + ai[k];
524:     tp = t + k*6;
525:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
526:     nz = ai[k+1] - ai[k];
527:     tp = t + (*vj)*6;
528:     while (nz--) {
529:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
530:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
531:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
532:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
533:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
534:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
535:       vj++; tp = t + (*vj)*6;
536:       v += 36;
537:     }

539:     /* xk = inv(Dk)*(Dk*xk) */
540:     d  = aa+k*36;          /* ptr to inv(Dk) */
541:     tp    = t + k*6;
542:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
543:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
544:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
545:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
546:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
547:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
548:   }

550:   /* solve U*x = y by back substitution */
551:   for (k=mbs-1; k>=0; k--){
552:     v  = aa + 36*ai[k];
553:     vj = aj + ai[k];
554:     tp    = t + k*6;
555:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
556:     nz = ai[k+1] - ai[k];
557: 
558:     tp = t + (*vj)*6;
559:     while (nz--) {
560:       /* xk += U(k,:)*x(:) */
561:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
562:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
563:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
564:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
565:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
566:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
567:       vj++; tp = t + (*vj)*6;
568:       v += 36;
569:     }
570:     tp    = t + k*6;
571:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
572:     idx   = 6*r[k];
573:     x[idx]     = x0;
574:     x[idx+1]   = x1;
575:     x[idx+2]   = x2;
576:     x[idx+3]   = x3;
577:     x[idx+4]   = x4;
578:     x[idx+5]   = x5;
579:   }

581:   ISRestoreIndices(isrow,&r);
582:   VecRestoreArray(bb,&b);
583:   VecRestoreArray(xx,&x);
584:   PetscLogFlops(36*(2*a->nz + mbs));
585:   return(0);
586: }

590: PetscErrorCode ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
591: {
592:   MatScalar      *v,*d;
593:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
594:   PetscInt       nz,*vj,k;

597:   for (k=0; k<mbs; k++){
598:     v  = aa + 36*ai[k];
599:     xp = x + k*6;
600:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
601:     nz = ai[k+1] - ai[k];
602:     vj = aj + ai[k];
603:     xp = x + (*vj)*6;
604:     while (nz--) {
605:       /* x(:) += U(k,:)^T*(Dk*xk) */
606:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
607:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
608:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
609:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
610:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
611:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
612:       vj++; xp = x + (*vj)*6;
613:       v += 36;
614:     }
615:     /* xk = inv(Dk)*(Dk*xk) */
616:     d  = aa+k*36;          /* ptr to inv(Dk) */
617:     xp = x + k*6;
618:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
619:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
620:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
621:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
622:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
623:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
624:   }
625:   return(0);
626: }
629: PetscErrorCode BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
630: {
631:   MatScalar      *v;
632:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
633:   PetscInt       nz,*vj,k;

636:   for (k=mbs-1; k>=0; k--){
637:     v  = aa + 36*ai[k];
638:     xp = x + k*6;
639:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
640:     nz = ai[k+1] - ai[k];
641:     vj = aj + ai[k];
642:     xp = x + (*vj)*6;
643:     while (nz--) {
644:       /* xk += U(k,:)*x(:) */
645:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
646:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
647:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
648:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
649:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
650:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
651:       vj++;
652:       v += 36; xp = x + (*vj)*6;
653:     }
654:     xp = x + k*6;
655:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
656:   }
657:   return(0);
658: }


663: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
664: {
665:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
666:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
667:   MatScalar      *aa=a->a;
668:   PetscScalar    *x,*b;

672:   VecGetArray(bb,&b);
673:   VecGetArray(xx,&x);
674: 
675:   /* solve U^T * D * y = b by forward substitution */
676:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
677:   ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);

679:   /* solve U*x = y by back substitution */
680:   BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);

682:   VecRestoreArray(bb,&b);
683:   VecRestoreArray(xx,&x);
684:   PetscLogFlops(36*(2*a->nz + mbs));
685:   return(0);
686: }

690: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
691: {
692:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
693:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
694:   MatScalar      *aa=a->a;
695:   PetscScalar    *x,*b;

699:   VecGetArray(bb,&b);
700:   VecGetArray(xx,&x);
701:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
702:   ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
703:   VecRestoreArray(bb,&b);
704:   VecRestoreArray(xx,&x);
705:   PetscLogFlops(36*a->nz + mbs);
706:   return(0);
707: }

711: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
712: {
713:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
714:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
715:   MatScalar      *aa=a->a;
716:   PetscScalar    *x,*b;

720:   VecGetArray(bb,&b);
721:   VecGetArray(xx,&x);
722:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
723:   BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
724:   VecRestoreArray(bb,&b);
725:   VecRestoreArray(xx,&x);
726:   PetscLogFlops(36*a->nz);
727:   return(0);
728: }

732: PetscErrorCode MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
733: {
734:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
735:   IS             isrow=a->row;
736:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
738:   const PetscInt *r;
739:   PetscInt       nz,*vj,k,idx;
740:   MatScalar      *aa=a->a,*v,*diag;
741:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;

744:   VecGetArray(bb,&b);
745:   VecGetArray(xx,&x);
746:   t  = a->solve_work;
747:   ISGetIndices(isrow,&r);

749:   /* solve U^T * D * y = b by forward substitution */
750:   tp = t;
751:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
752:     idx   = 5*r[k];
753:     tp[0] = b[idx];
754:     tp[1] = b[idx+1];
755:     tp[2] = b[idx+2];
756:     tp[3] = b[idx+3];
757:     tp[4] = b[idx+4];
758:     tp += 5;
759:   }
760: 
761:   for (k=0; k<mbs; k++){
762:     v  = aa + 25*ai[k];
763:     vj = aj + ai[k];
764:     tp = t + k*5;
765:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
766:     nz = ai[k+1] - ai[k];

768:     tp = t + (*vj)*5;
769:     while (nz--) {
770:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
771:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
772:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
773:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
774:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
775:       vj++; tp = t + (*vj)*5;
776:       v += 25;
777:     }

779:     /* xk = inv(Dk)*(Dk*xk) */
780:     diag  = aa+k*25;          /* ptr to inv(Dk) */
781:     tp    = t + k*5;
782:       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
783:       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
784:       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
785:       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
786:       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
787:   }

789:   /* solve U*x = y by back substitution */
790:   for (k=mbs-1; k>=0; k--){
791:     v  = aa + 25*ai[k];
792:     vj = aj + ai[k];
793:     tp    = t + k*5;
794:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
795:     nz = ai[k+1] - ai[k];
796: 
797:     tp = t + (*vj)*5;
798:     while (nz--) {
799:       /* xk += U(k,:)*x(:) */
800:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
801:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
802:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
803:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
804:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
805:       vj++; tp = t + (*vj)*5;
806:       v += 25;
807:     }
808:     tp    = t + k*5;
809:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
810:     idx   = 5*r[k];
811:     x[idx]     = x0;
812:     x[idx+1]   = x1;
813:     x[idx+2]   = x2;
814:     x[idx+3]   = x3;
815:     x[idx+4]   = x4;
816:   }

818:   ISRestoreIndices(isrow,&r);
819:   VecRestoreArray(bb,&b);
820:   VecRestoreArray(xx,&x);
821:   PetscLogFlops(25*(2*a->nz + mbs));
822:   return(0);
823: }

827: PetscErrorCode ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
828: {
829:   MatScalar      *v,*diag;
830:   PetscScalar    *xp,x0,x1,x2,x3,x4;
831:   PetscInt       nz,*vj,k;

834:   for (k=0; k<mbs; k++){
835:     v  = aa + 25*ai[k];
836:     xp = x + k*5;
837:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
838:     nz = ai[k+1] - ai[k];
839:     vj = aj + ai[k];
840:     xp = x + (*vj)*5;
841:     while (nz--) {
842:       /* x(:) += U(k,:)^T*(Dk*xk) */
843:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
844:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
845:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
846:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
847:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
848:       vj++; xp = x + (*vj)*5;
849:       v += 25;
850:     }
851:     /* xk = inv(Dk)*(Dk*xk) */
852:     diag = aa+k*25;          /* ptr to inv(Dk) */
853:     xp   = x + k*5;
854:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
855:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
856:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
857:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
858:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
859:   }
860:   return(0);
861: }

865: PetscErrorCode BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
866: {
867:   MatScalar      *v;
868:   PetscScalar    *xp,x0,x1,x2,x3,x4;
869:   PetscInt       nz,*vj,k;

872:   for (k=mbs-1; k>=0; k--){
873:     v  = aa + 25*ai[k];
874:     xp = x + k*5;
875:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
876:     nz = ai[k+1] - ai[k];
877:     vj = aj + ai[k];
878:     xp = x + (*vj)*5;
879:     while (nz--) {
880:       /* xk += U(k,:)*x(:) */
881:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
882:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
883:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
884:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
885:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
886:       vj++;
887:       v += 25; xp = x + (*vj)*5;
888:     }
889:     xp = x + k*5;
890:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
891:   }
892:   return(0);
893: }

897: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
898: {
899:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
900:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
901:   MatScalar      *aa=a->a;
902:   PetscScalar    *x,*b;

906:   VecGetArray(bb,&b);
907:   VecGetArray(xx,&x);

909:   /* solve U^T * D * y = b by forward substitution */
910:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
911:   ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);

913:   /* solve U*x = y by back substitution */
914:   BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);

916:   VecRestoreArray(bb,&b);
917:   VecRestoreArray(xx,&x);
918:   PetscLogFlops(25*(2*a->nz + mbs));
919:   return(0);
920: }

924: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
925: {
926:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
927:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
928:   MatScalar      *aa=a->a;
929:   PetscScalar    *x,*b;

933:   VecGetArray(bb,&b);
934:   VecGetArray(xx,&x);
935:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
936:   ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
937:   VecRestoreArray(bb,&b);
938:   VecRestoreArray(xx,&x);
939:   PetscLogFlops(25*(a->nz + mbs));
940:   return(0);
941: }

945: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
946: {
947:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
948:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
949:   MatScalar      *aa=a->a;
950:   PetscScalar    *x,*b;

954:   VecGetArray(bb,&b);
955:   VecGetArray(xx,&x);
956:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
957:   BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
958:   VecRestoreArray(bb,&b);
959:   VecRestoreArray(xx,&x);
960:   PetscLogFlops(25*a->nz);
961:   return(0);
962: }

966: PetscErrorCode MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
967: {
968:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
969:   IS             isrow=a->row;
970:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
972:   const PetscInt *r;
973:   PetscInt       nz,*vj,k,idx;
974:   MatScalar      *aa=a->a,*v,*diag;
975:   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;

978:   VecGetArray(bb,&b);
979:   VecGetArray(xx,&x);
980:   t  = a->solve_work;
981:   ISGetIndices(isrow,&r);

983:   /* solve U^T * D * y = b by forward substitution */
984:   tp = t;
985:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
986:     idx   = 4*r[k];
987:     tp[0] = b[idx];
988:     tp[1] = b[idx+1];
989:     tp[2] = b[idx+2];
990:     tp[3] = b[idx+3];
991:     tp += 4;
992:   }
993: 
994:   for (k=0; k<mbs; k++){
995:     v  = aa + 16*ai[k];
996:     vj = aj + ai[k];
997:     tp = t + k*4;
998:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
999:     nz = ai[k+1] - ai[k];

1001:     tp = t + (*vj)*4;
1002:     while (nz--) {
1003:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1004:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1005:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1006:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1007:       vj++; tp = t + (*vj)*4;
1008:       v += 16;
1009:     }

1011:     /* xk = inv(Dk)*(Dk*xk) */
1012:     diag  = aa+k*16;          /* ptr to inv(Dk) */
1013:     tp    = t + k*4;
1014:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1015:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1016:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1017:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1018:   }

1020:   /* solve U*x = y by back substitution */
1021:   for (k=mbs-1; k>=0; k--){
1022:     v  = aa + 16*ai[k];
1023:     vj = aj + ai[k];
1024:     tp    = t + k*4;
1025:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1026:     nz = ai[k+1] - ai[k];
1027: 
1028:     tp = t + (*vj)*4;
1029:     while (nz--) {
1030:       /* xk += U(k,:)*x(:) */
1031:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1032:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1033:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1034:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1035:       vj++; tp = t + (*vj)*4;
1036:       v += 16;
1037:     }
1038:     tp    = t + k*4;
1039:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1040:     idx        = 4*r[k];
1041:     x[idx]     = x0;
1042:     x[idx+1]   = x1;
1043:     x[idx+2]   = x2;
1044:     x[idx+3]   = x3;
1045:   }

1047:   ISRestoreIndices(isrow,&r);
1048:   VecRestoreArray(bb,&b);
1049:   VecRestoreArray(xx,&x);
1050:   PetscLogFlops(16*(2*a->nz + mbs));
1051:   return(0);
1052: }

1056: PetscErrorCode ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1057: {
1058:   MatScalar      *v,*diag;
1059:   PetscScalar    *xp,x0,x1,x2,x3;
1060:   PetscInt       nz,*vj,k;

1063:   for (k=0; k<mbs; k++){
1064:     v  = aa + 16*ai[k];
1065:     xp = x + k*4;
1066:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1067:     nz = ai[k+1] - ai[k];
1068:     vj = aj + ai[k];
1069:     xp = x + (*vj)*4;
1070:     while (nz--) {
1071:       /* x(:) += U(k,:)^T*(Dk*xk) */
1072:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1073:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1074:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1075:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1076:       vj++; xp = x + (*vj)*4;
1077:       v += 16;
1078:     }
1079:     /* xk = inv(Dk)*(Dk*xk) */
1080:     diag = aa+k*16;          /* ptr to inv(Dk) */
1081:     xp   = x + k*4;
1082:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1083:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1084:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1085:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1086:   }
1087:   return(0);
1088: }

1092: PetscErrorCode BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1093: {
1094:   MatScalar      *v;
1095:   PetscScalar    *xp,x0,x1,x2,x3;
1096:   PetscInt       nz,*vj,k;

1099:   for (k=mbs-1; k>=0; k--){
1100:     v  = aa + 16*ai[k];
1101:     xp = x + k*4;
1102:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1103:     nz = ai[k+1] - ai[k];
1104:     vj = aj + ai[k];
1105:     xp = x + (*vj)*4;
1106:     while (nz--) {
1107:       /* xk += U(k,:)*x(:) */
1108:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1109:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1110:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1111:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1112:       vj++;
1113:       v += 16; xp = x + (*vj)*4;
1114:     }
1115:     xp = x + k*4;
1116:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1117:   }
1118:   return(0);
1119: }

1123: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1124: {
1125:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1126:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1127:   MatScalar      *aa=a->a;
1128:   PetscScalar    *x,*b;

1132:   VecGetArray(bb,&b);
1133:   VecGetArray(xx,&x);

1135:   /* solve U^T * D * y = b by forward substitution */
1136:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1137:   ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);

1139:   /* solve U*x = y by back substitution */
1140:   BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1141:   VecRestoreArray(bb,&b);
1142:   VecRestoreArray(xx,&x);
1143:   PetscLogFlops(16*(2*a->nz + mbs));
1144:   return(0);
1145: }

1149: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1150: {
1151:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1152:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1153:   MatScalar      *aa=a->a;
1154:   PetscScalar    *x,*b;

1158:   VecGetArray(bb,&b);
1159:   VecGetArray(xx,&x);
1160:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1161:   ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1162:   VecRestoreArray(bb,&b);
1163:   VecRestoreArray(xx,&x);
1164:   PetscLogFlops(16*a->nz + mbs);
1165:   return(0);
1166: }

1170: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1171: {
1172:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1173:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1174:   MatScalar      *aa=a->a;
1175:   PetscScalar    *x,*b;

1179:   VecGetArray(bb,&b);
1180:   VecGetArray(xx,&x);
1181:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1182:   BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1183:   VecRestoreArray(bb,&b);
1184:   VecRestoreArray(xx,&x);
1185:   PetscLogFlops(16*a->nz);
1186:   return(0);
1187: }

1191: PetscErrorCode MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
1192: {
1193:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1194:   IS             isrow=a->row;
1195:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1197:   const PetscInt *r;
1198:   PetscInt       nz,*vj,k,idx;
1199:   MatScalar      *aa=a->a,*v,*diag;
1200:   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;

1203:   VecGetArray(bb,&b);
1204:   VecGetArray(xx,&x);
1205:   t  = a->solve_work;
1206:   ISGetIndices(isrow,&r);

1208:   /* solve U^T * D * y = b by forward substitution */
1209:   tp = t;
1210:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1211:     idx   = 3*r[k];
1212:     tp[0] = b[idx];
1213:     tp[1] = b[idx+1];
1214:     tp[2] = b[idx+2];
1215:     tp += 3;
1216:   }
1217: 
1218:   for (k=0; k<mbs; k++){
1219:     v  = aa + 9*ai[k];
1220:     vj = aj + ai[k];
1221:     tp = t + k*3;
1222:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1223:     nz = ai[k+1] - ai[k];

1225:     tp = t + (*vj)*3;
1226:     while (nz--) {
1227:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1228:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1229:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1230:       vj++; tp = t + (*vj)*3;
1231:       v += 9;
1232:     }

1234:     /* xk = inv(Dk)*(Dk*xk) */
1235:     diag  = aa+k*9;          /* ptr to inv(Dk) */
1236:     tp    = t + k*3;
1237:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1238:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1239:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1240:   }

1242:   /* solve U*x = y by back substitution */
1243:   for (k=mbs-1; k>=0; k--){
1244:     v  = aa + 9*ai[k];
1245:     vj = aj + ai[k];
1246:     tp    = t + k*3;
1247:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1248:     nz = ai[k+1] - ai[k];
1249: 
1250:     tp = t + (*vj)*3;
1251:     while (nz--) {
1252:       /* xk += U(k,:)*x(:) */
1253:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1254:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1255:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1256:       vj++; tp = t + (*vj)*3;
1257:       v += 9;
1258:     }
1259:     tp    = t + k*3;
1260:     tp[0] = x0; tp[1] = x1; tp[2] = x2;
1261:     idx      = 3*r[k];
1262:     x[idx]   = x0;
1263:     x[idx+1] = x1;
1264:     x[idx+2] = x2;
1265:   }

1267:   ISRestoreIndices(isrow,&r);
1268:   VecRestoreArray(bb,&b);
1269:   VecRestoreArray(xx,&x);
1270:   PetscLogFlops(9*(2*a->nz + mbs));
1271:   return(0);
1272: }

1276: PetscErrorCode ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1277: {
1278:   MatScalar      *v,*diag;
1279:   PetscScalar    *xp,x0,x1,x2;
1280:   PetscInt       nz,*vj,k;

1283:   for (k=0; k<mbs; k++){
1284:     v  = aa + 9*ai[k];
1285:     xp = x + k*3;
1286:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1287:     nz = ai[k+1] - ai[k];
1288:     vj = aj + ai[k];
1289:     xp = x + (*vj)*3;
1290:     while (nz--) {
1291:       /* x(:) += U(k,:)^T*(Dk*xk) */
1292:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1293:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1294:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1295:       vj++; xp = x + (*vj)*3;
1296:       v += 9;
1297:     }
1298:     /* xk = inv(Dk)*(Dk*xk) */
1299:     diag = aa+k*9;          /* ptr to inv(Dk) */
1300:     xp   = x + k*3;
1301:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1302:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1303:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1304:   }
1305:   return(0);
1306: }

1310: PetscErrorCode BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1311: {
1312:   MatScalar      *v;
1313:   PetscScalar    *xp,x0,x1,x2;
1314:   PetscInt       nz,*vj,k;

1317:   for (k=mbs-1; k>=0; k--){
1318:     v  = aa + 9*ai[k];
1319:     xp = x + k*3;
1320:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1321:     nz = ai[k+1] - ai[k];
1322:     vj = aj + ai[k];
1323:     xp = x + (*vj)*3;
1324:     while (nz--) {
1325:       /* xk += U(k,:)*x(:) */
1326:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1327:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1328:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1329:       vj++;
1330:       v += 9; xp = x + (*vj)*3;
1331:     }
1332:     xp = x + k*3;
1333:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1334:   }
1335:   return(0);
1336: }

1340: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1341: {
1342:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1343:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1344:   MatScalar      *aa=a->a;
1345:   PetscScalar    *x,*b;
1347: 
1349:   VecGetArray(bb,&b);
1350:   VecGetArray(xx,&x);

1352:   /* solve U^T * D * y = b by forward substitution */
1353:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1354:   ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);

1356:   /* solve U*x = y by back substitution */
1357:   BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);

1359:   VecRestoreArray(bb,&b);
1360:   VecRestoreArray(xx,&x);
1361:   PetscLogFlops(9*(2*a->nz + mbs));
1362:   return(0);
1363: }

1367: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1368: {
1369:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1370:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1371:   MatScalar      *aa=a->a;
1372:   PetscScalar    *x,*b;

1376:   VecGetArray(bb,&b);
1377:   VecGetArray(xx,&x);
1378:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1379:   ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1380:   VecRestoreArray(bb,&b);
1381:   VecRestoreArray(xx,&x);
1382:   PetscLogFlops(9*(a->nz + mbs));
1383:   return(0);
1384: }

1388: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1389: {
1390:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1391:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1392:   MatScalar      *aa=a->a;
1393:   PetscScalar    *x,*b;

1397:   VecGetArray(bb,&b);
1398:   VecGetArray(xx,&x);
1399:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1400:   BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1401:   VecRestoreArray(bb,&b);
1402:   VecRestoreArray(xx,&x);
1403:   PetscLogFlops(9*a->nz);
1404:   return(0);
1405: }

1409: PetscErrorCode MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
1410: {
1411:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ *)A->data;
1412:   IS             isrow=a->row;
1413:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1415:   const PetscInt *r;
1416:   PetscInt       nz,*vj,k,k2,idx;
1417:   MatScalar      *aa=a->a,*v,*diag;
1418:   PetscScalar    *x,*b,x0,x1,*t;

1421:   VecGetArray(bb,&b);
1422:   VecGetArray(xx,&x);
1423:   t  = a->solve_work;
1424:   ISGetIndices(isrow,&r);

1426:   /* solve U^T * D * y = perm(b) by forward substitution */
1427:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1428:     idx = 2*r[k];
1429:     t[k*2]   = b[idx];
1430:     t[k*2+1] = b[idx+1];
1431:   }
1432:   for (k=0; k<mbs; k++){
1433:     v  = aa + 4*ai[k];
1434:     vj = aj + ai[k];
1435:     k2 = k*2;
1436:     x0 = t[k2]; x1 = t[k2+1];
1437:     nz = ai[k+1] - ai[k];
1438:     while (nz--) {
1439:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1440:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1441:       vj++; v += 4;
1442:     }
1443:     diag = aa+k*4;  /* ptr to inv(Dk) */
1444:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1445:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1446:   }

1448:   /* solve U*x = y by back substitution */
1449:   for (k=mbs-1; k>=0; k--){
1450:     v  = aa + 4*ai[k];
1451:     vj = aj + ai[k];
1452:     k2 = k*2;
1453:     x0 = t[k2]; x1 = t[k2+1];
1454:     nz = ai[k+1] - ai[k];
1455:     while (nz--) {
1456:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1457:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1458:       vj++; v += 4;
1459:     }
1460:     t[k2]    = x0;
1461:     t[k2+1]  = x1;
1462:     idx      = 2*r[k];
1463:     x[idx]   = x0;
1464:     x[idx+1] = x1;
1465:   }

1467:   ISRestoreIndices(isrow,&r);
1468:   VecRestoreArray(bb,&b);
1469:   VecRestoreArray(xx,&x);
1470:   PetscLogFlops(4*(2*a->nz + mbs));
1471:   return(0);
1472: }

1476: PetscErrorCode ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1477: {
1478:   MatScalar      *v,*diag;
1479:   PetscScalar    x0,x1;
1480:   PetscInt       nz,*vj,k,k2;

1483:   for (k=0; k<mbs; k++){
1484:     v  = aa + 4*ai[k];
1485:     vj = aj + ai[k];
1486:     k2 = k*2;
1487:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1488:     nz = ai[k+1] - ai[k];
1489: 
1490:     while (nz--) {
1491:       /* x(:) += U(k,:)^T*(Dk*xk) */
1492:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1493:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1494:       vj++; v += 4;
1495:     }
1496:     /* xk = inv(Dk)*(Dk*xk) */
1497:     diag = aa+k*4;          /* ptr to inv(Dk) */
1498:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1499:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1500:   }
1501:   return(0);
1502: }

1506: PetscErrorCode BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1507: {
1508:   MatScalar      *v;
1509:   PetscScalar    x0,x1;
1510:   PetscInt       nz,*vj,k,k2;

1513:   for (k=mbs-1; k>=0; k--){
1514:     v  = aa + 4*ai[k];
1515:     vj = aj + ai[k];
1516:     k2 = k*2;
1517:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1518:     nz = ai[k+1] - ai[k];
1519:     while (nz--) {
1520:       /* xk += U(k,:)*x(:) */
1521:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1522:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1523:       vj++; v += 4;
1524:     }
1525:     x[k2]     = x0;
1526:     x[k2+1]   = x1;
1527:   }
1528:   return(0);
1529: }

1533: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1534: {
1535:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1536:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1537:   MatScalar      *aa=a->a;
1538:   PetscScalar    *x,*b;

1542:   VecGetArray(bb,&b);
1543:   VecGetArray(xx,&x);

1545:   /* solve U^T * D * y = b by forward substitution */
1546:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1547:   ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);

1549:   /* solve U*x = y by back substitution */
1550:   BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);

1552:   VecRestoreArray(bb,&b);
1553:   VecRestoreArray(xx,&x);
1554:   PetscLogFlops(4*(2*a->nz + mbs)); /* bs2*(2*a->nz + mbs) */
1555:   return(0);
1556: }

1560: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1561: {
1562:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1563:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1564:   MatScalar      *aa=a->a;
1565:   PetscScalar    *x,*b;

1569:   VecGetArray(bb,&b);
1570:   VecGetArray(xx,&x);
1571:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1572:   ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1573:   VecRestoreArray(bb,&b);
1574:   VecRestoreArray(xx,&x);
1575:   PetscLogFlops(4*(a->nz + mbs));
1576:   return(0);
1577: }

1581: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1582: {
1583:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1584:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1585:   MatScalar      *aa=a->a;
1586:   PetscScalar    *x,*b;

1590:   VecGetArray(bb,&b);
1591:   VecGetArray(xx,&x);
1592:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1593:   BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1594:   VecRestoreArray(bb,&b);
1595:   VecRestoreArray(xx,&x);
1596:   PetscLogFlops(4*a->nz);
1597:   return(0);
1598: }

1602: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1603: {
1604:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1605:   IS              isrow=a->row;
1606:   PetscErrorCode  ierr;
1607:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1608:   const MatScalar *aa=a->a,*v;
1609:   PetscScalar     *x,*b,xk,*t;
1610:   PetscInt        nz,k;

1613:   VecGetArray(bb,&b);
1614:   VecGetArray(xx,&x);
1615:   t    = a->solve_work;
1616:   ISGetIndices(isrow,&rp);
1617: 
1618:   /* solve U^T*D^(1/2)*y = perm(b) by forward substitution */
1619:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1620:   for (k=0; k<mbs; k++){
1621:     v  = aa + ai[k] + 1;
1622:     vj = aj + ai[k] + 1;
1623:     xk = t[k];
1624:     nz = ai[k+1] - ai[k] - 1;
1625:     while (nz--) t[*vj++] += (*v++) * xk;
1626:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1627:   }

1629:   /* solve U*perm(x) = y by back substitution */
1630:   for (k=mbs-1; k>=0; k--){
1631:     v  = aa + ai[k] + 1;
1632:     vj = aj + ai[k] + 1;
1633:     nz = ai[k+1] - ai[k] - 1;
1634:     while (nz--) t[k] += (*v++) * t[*vj++];
1635:     x[rp[k]] = t[k];
1636:   }

1638:   ISRestoreIndices(isrow,&rp);
1639:   VecRestoreArray(bb,&b);
1640:   VecRestoreArray(xx,&x);
1641:   PetscLogFlops(4*a->nz);
1642:   return(0);
1643: }

1647: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1648: {
1649:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1650:   IS              isrow=a->row;
1651:   PetscErrorCode  ierr;
1652:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1653:   const MatScalar *aa=a->a,*v;
1654:   PetscReal       diagk;
1655:   PetscScalar     *x,*b,xk;
1656:   PetscInt        nz,k;

1659:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1660:   VecGetArray(bb,&b);
1661:   VecGetArray(xx,&x);
1662:   ISGetIndices(isrow,&rp);
1663: 
1664:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1665:   for (k=0; k<mbs; k++){
1666:     v  = aa + ai[k] + 1;
1667:     vj = aj + ai[k] + 1;
1668:     xk = x[k];
1669:     nz = ai[k+1] - ai[k] - 1;
1670:     while (nz--) x[*vj++] += (*v++) * xk;

1672:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1673:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1674:     x[k] = xk*sqrt(diagk);
1675:   }
1676:   ISRestoreIndices(isrow,&rp);
1677:   VecRestoreArray(bb,&b);
1678:   VecRestoreArray(xx,&x);
1679:   PetscLogFlops(2*a->nz);
1680:   return(0);
1681: }

1685: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1686: {
1687:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1688:   IS              isrow=a->row;
1689:   PetscErrorCode  ierr;
1690:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1691:   const MatScalar *aa=a->a,*v;
1692:   PetscReal       diagk;
1693:   PetscScalar     *x,*b,*t;
1694:   PetscInt        nz,k;

1697:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1698:   VecGetArray(bb,&b);
1699:   VecGetArray(xx,&x);
1700:   t    = a->solve_work;
1701:   ISGetIndices(isrow,&rp);

1703:   for (k=mbs-1; k>=0; k--){
1704:     v  = aa + ai[k] + 1;
1705:     vj = aj + ai[k] + 1;
1706:     diagk = PetscRealPart(aa[ai[k]]);
1707:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1708:     t[k] = b[k] * sqrt(diagk);
1709:     nz = ai[k+1] - ai[k] - 1;
1710:     while (nz--) t[k] += (*v++) * t[*vj++];
1711:     x[rp[k]] = t[k];
1712:   }
1713:   ISRestoreIndices(isrow,&rp);
1714:   VecRestoreArray(bb,&b);
1715:   VecRestoreArray(xx,&x);
1716:   PetscLogFlops(2*a->nz);
1717:   return(0);
1718: }

1722: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1723: {
1724:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;

1728:   if (A->rmap->bs == 1) {
1729:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1730:   } else {
1731:     IS              isrow=a->row;
1732:     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1733:     const MatScalar *aa=a->a,*v;
1734:     PetscScalar     *x,*b,*t;
1735:     PetscInt        nz,k,n,i;
1736:     if (bb->n > a->solves_work_n) {
1737:       PetscFree(a->solves_work);
1738:       PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1739:       a->solves_work_n = bb->n;
1740:     }
1741:     n    = bb->n;
1742:     VecGetArray(bb->v,&b);
1743:     VecGetArray(xx->v,&x);
1744:     t    = a->solves_work;

1746:     ISGetIndices(isrow,&rp);
1747: 
1748:     /* solve U^T*D*y = perm(b) by forward substitution */
1749:     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1750:     for (k=0; k<mbs; k++){
1751:       v  = aa + ai[k];
1752:       vj = aj + ai[k];
1753:       nz = ai[k+1] - ai[k];
1754:       while (nz--) {
1755:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1756:         v++;vj++;
1757:       }
1758:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1759:     }
1760: 
1761:     /* solve U*perm(x) = y by back substitution */
1762:     for (k=mbs-1; k>=0; k--){
1763:       v  = aa + ai[k];
1764:       vj = aj + ai[k];
1765:       nz = ai[k+1] - ai[k];
1766:       while (nz--) {
1767:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1768:         v++;vj++;
1769:       }
1770:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1771:     }

1773:     ISRestoreIndices(isrow,&rp);
1774:     VecRestoreArray(bb->v,&b);
1775:     VecRestoreArray(xx->v,&x);
1776:     PetscLogFlops(bb->n*(4*a->nz + A->rmap->N));
1777:   }
1778:   return(0);
1779: }

1781: /*
1782:       Special case where the matrix was ILU(0) factored in the natural
1783:    ordering. This eliminates the need for the column and row permutation.
1784: */
1787: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1788: {
1789:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1791:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1792:   MatScalar      *aa=a->a,*v;
1793:   PetscScalar    *x,*b,xk;
1794:   PetscInt       nz,*vj,k;

1797:   VecGetArray(bb,&b);
1798:   VecGetArray(xx,&x);
1799: 
1800:   /* solve U^T*D*y = b by forward substitution */
1801:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1802:   for (k=0; k<mbs; k++){
1803:     v  = aa + ai[k] + 1;
1804:     vj = aj + ai[k] + 1;
1805:     xk = x[k];
1806:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
1807:     while (nz--) x[*vj++] += (*v++) * xk;
1808:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
1809:   }

1811:   /* solve U*x = y by back substitution */
1812:   for (k=mbs-2; k>=0; k--){
1813:     v  = aa + ai[k] + 1;
1814:     vj = aj + ai[k] + 1;
1815:     xk = x[k];
1816:     nz = ai[k+1] - ai[k] - 1;
1817:     while (nz--) xk += (*v++) * x[*vj++];
1818:     x[k] = xk;
1819:   }

1821:   VecRestoreArray(bb,&b);
1822:   VecRestoreArray(xx,&x);
1823:   PetscLogFlops(4*a->nz);
1824:   return(0);
1825: }

1829: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1830: {
1831:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1833:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1834:   MatScalar      *aa=a->a,*v;
1835:   PetscReal      diagk;
1836:   PetscScalar    *x,*b;
1837:   PetscInt       nz,*vj,k;

1840:   /* solve U^T*D^(1/2)*x = b by forward substitution */
1841:   VecGetArray(bb,&b);
1842:   VecGetArray(xx,&x);
1843:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1844:   for (k=0; k<mbs; k++){
1845:     v  = aa + ai[k] + 1;
1846:     vj = aj + ai[k] + 1;
1847:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
1848:     while (nz--) x[*vj++] += (*v++) * x[k];
1849:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1850:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
1851:     x[k] *= sqrt(diagk);
1852:   }
1853:   VecRestoreArray(bb,&b);
1854:   VecRestoreArray(xx,&x);
1855:   PetscLogFlops(2*a->nz);
1856:   return(0);
1857: }

1861: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1862: {
1863:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1865:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1866:   MatScalar      *aa=a->a,*v;
1867:   PetscReal      diagk;
1868:   PetscScalar    *x,*b;
1869:   PetscInt       nz,*vj,k;

1872:   /* solve D^(1/2)*U*x = b by back substitution */
1873:   VecGetArray(bb,&b);
1874:   VecGetArray(xx,&x);

1876:   for (k=mbs-1; k>=0; k--){
1877:     v  = aa + ai[k] + 1;
1878:     vj = aj + ai[k] + 1;
1879:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1880:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1881:     x[k] = sqrt(diagk)*b[k];
1882:     nz = ai[k+1] - ai[k] - 1;
1883:     while (nz--) x[k] += (*v++) * x[*vj++];
1884:   }
1885:   VecRestoreArray(bb,&b);
1886:   VecRestoreArray(xx,&x);
1887:   PetscLogFlops(2*a->nz);
1888:   return(0);
1889: }


1893: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
1896: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
1897: {
1898:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
1900:   const PetscInt *rip,mbs = a->mbs,*ai = a->i,*aj = a->j;
1901:   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
1902:   PetscInt       m,reallocs = 0,*levtmp;
1903:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
1904:   PetscInt       incrlev,*lev,shift,prow,nz;
1905:   PetscReal      f = info->fill,levels = info->levels;
1906:   PetscTruth     perm_identity;

1909:   /* check whether perm is the identity mapping */
1910:   ISIdentity(perm,&perm_identity);

1912:   if (perm_identity){
1913:     a->permute = PETSC_FALSE;
1914:     ai = a->i; aj = a->j;
1915:   } else { /*  non-trivial permutation */
1916:     a->permute = PETSC_TRUE;
1917:     MatReorderingSeqSBAIJ(A, perm);
1918:     ai = a->inew; aj = a->jnew;
1919:   }
1920: 
1921:   /* initialization */
1922:   ISGetIndices(perm,&rip);
1923:   umax  = (PetscInt)(f*ai[mbs] + 1);
1924:   PetscMalloc(umax*sizeof(PetscInt),&lev);
1925:   umax += mbs + 1;
1926:   shift = mbs + 1;
1927:   PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
1928:   PetscMalloc(umax*sizeof(PetscInt),&ju);
1929:   iu[0] = mbs + 1;
1930:   juidx = mbs + 1;
1931:   /* prowl: linked list for pivot row */
1932:   PetscMalloc((3*mbs+1)*sizeof(PetscInt),&prowl);
1933:   /* q: linked list for col index */
1934:   q       = prowl + mbs;
1935:   levtmp  = q     + mbs;
1936: 
1937:   for (i=0; i<mbs; i++){
1938:     prowl[i] = mbs;
1939:     q[i] = 0;
1940:   }

1942:   /* for each row k */
1943:   for (k=0; k<mbs; k++){
1944:     nzk  = 0;
1945:     q[k] = mbs;
1946:     /* copy current row into linked list */
1947:     nz = ai[rip[k]+1] - ai[rip[k]];
1948:     j = ai[rip[k]];
1949:     while (nz--){
1950:       vj = rip[aj[j++]];
1951:       if (vj > k){
1952:         qm = k;
1953:         do {
1954:           m  = qm; qm = q[m];
1955:         } while(qm < vj);
1956:         if (qm == vj) {
1957:           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
1958:         }
1959:         nzk++;
1960:         q[m]       = vj;
1961:         q[vj]      = qm;
1962:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
1963:       }
1964:     }

1966:     /* modify nonzero structure of k-th row by computing fill-in
1967:        for each row prow to be merged in */
1968:     prow = k;
1969:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
1970: 
1971:     while (prow < k){
1972:       /* merge row prow into k-th row */
1973:       jmin = iu[prow] + 1;
1974:       jmax = iu[prow+1];
1975:       qm = k;
1976:       for (j=jmin; j<jmax; j++){
1977:         incrlev = lev[j-shift] + 1;
1978:         if (incrlev > levels) continue;
1979:         vj      = ju[j];
1980:         do {
1981:           m = qm; qm = q[m];
1982:         } while (qm < vj);
1983:         if (qm != vj){      /* a fill */
1984:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1985:           levtmp[vj] = incrlev;
1986:         } else {
1987:           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1988:         }
1989:       }
1990:       prow = prowl[prow]; /* next pivot row */
1991:     }
1992: 
1993:     /* add k to row list for first nonzero element in k-th row */
1994:     if (nzk > 1){
1995:       i = q[k]; /* col value of first nonzero element in k_th row of U */
1996:       prowl[k] = prowl[i]; prowl[i] = k;
1997:     }
1998:     iu[k+1] = iu[k] + nzk;

2000:     /* allocate more space to ju and lev if needed */
2001:     if (iu[k+1] > umax) {
2002:       /* estimate how much additional space we will need */
2003:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2004:       /* just double the memory each time */
2005:       maxadd = umax;
2006:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2007:       umax += maxadd;

2009:       /* allocate a longer ju */
2010:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2011:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2012:       PetscFree(ju);
2013:       ju       = jutmp;

2015:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2016:       PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2017:       PetscFree(lev);
2018:       lev      = jutmp;
2019:       reallocs += 2; /* count how many times we realloc */
2020:     }

2022:     /* save nonzero structure of k-th row in ju */
2023:     i=k;
2024:     while (nzk --) {
2025:       i                = q[i];
2026:       ju[juidx]        = i;
2027:       lev[juidx-shift] = levtmp[i];
2028:       juidx++;
2029:     }
2030:   }
2031: 
2032: #if defined(PETSC_USE_INFO)
2033:   if (ai[mbs] != 0) {
2034:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2035:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
2036:     PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2037:     PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
2038:     PetscInfo(A,"for best performance.\n");
2039:   } else {
2040:     PetscInfo(A,"Empty matrix.\n");
2041:   }
2042: #endif

2044:   ISRestoreIndices(perm,&rip);
2045:   PetscFree(prowl);
2046:   PetscFree(lev);

2048:   /* put together the new matrix */
2049:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);

2051:   /* PetscLogObjectParent(B,iperm); */
2052:   b    = (Mat_SeqSBAIJ*)(B)->data;
2053:   PetscFree2(b->imax,b->ilen);
2054:   b->singlemalloc = PETSC_FALSE;
2055:   b->free_a       = PETSC_TRUE;
2056:   b->free_ij      = PETSC_TRUE;
2057:   /* the next line frees the default space generated by the Create() */
2058:   PetscFree3(b->a,b->j,b->i);
2059:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
2060:   b->j    = ju;
2061:   b->i    = iu;
2062:   b->diag = 0;
2063:   b->ilen = 0;
2064:   b->imax = 0;
2065: 
2066:   if (b->row) {
2067:     ISDestroy(b->row);
2068:   }
2069:   if (b->icol) {
2070:     ISDestroy(b->icol);
2071:   }
2072:   b->row  = perm;
2073:   b->icol = perm;
2074:   PetscObjectReference((PetscObject)perm);
2075:   PetscObjectReference((PetscObject)perm);
2076:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
2077:   /* In b structure:  Free imax, ilen, old a, old j.  
2078:      Allocate idnew, solve_work, new a, new j */
2079:   PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2080:   b->maxnz = b->nz = iu[mbs];
2081: 
2082:   (B)->info.factor_mallocs    = reallocs;
2083:   (B)->info.fill_ratio_given  = f;
2084:   if (ai[mbs] != 0) {
2085:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2086:   } else {
2087:     (B)->info.fill_ratio_needed = 0.0;
2088:   }
2089:   MatSeqSBAIJSetNumericFactorization(B,perm_identity);
2090:   return(0);
2091: }

2093:  #include petscbt.h
2094:  #include ../src/mat/utils/freespace.h
2097: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2098: {
2099:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2100:   Mat_SeqSBAIJ       *b;
2101:   PetscErrorCode     ierr;
2102:   PetscTruth         perm_identity,free_ij = PETSC_TRUE,missing;
2103:   PetscInt           bs=A->rmap->bs,am=a->mbs,d;
2104:   const PetscInt     *cols,*rip,*ai,*aj;
2105:   PetscInt           reallocs=0,i,*ui;
2106:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2107:   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2108:   PetscReal          fill=info->fill,levels=info->levels,ratio_needed;
2109:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2110:   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2111:   PetscBT            lnkbt;

2114:   MatMissingDiagonal(A,&missing,&d);
2115:   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);

2117:   /*  
2118:    This code originally uses Modified Sparse Row (MSR) storage
2119:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2120:    Then it is rewritten so the factor B takes seqsbaij format. However the associated 
2121:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 
2122:    thus the original code in MSR format is still used for these cases. 
2123:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 
2124:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2125:   */
2126:   if (bs > 1){
2127:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2128:     return(0);
2129:   }

2131:   /* check whether perm is the identity mapping */
2132:   ISIdentity(perm,&perm_identity);
2133: 
2134:   /* special case that simply copies fill pattern */
2135:   if (!levels && perm_identity) {
2136:     a->permute = PETSC_FALSE;
2137:     /* reuse the column pointers and row offsets for memory savings */
2138:     ui           = a->i;
2139:     uj           = a->j;
2140:     free_ij      = PETSC_FALSE;
2141:     ratio_needed = 1.0;
2142:   } else { /* case: levels>0 || (levels=0 && !perm_identity) */
2143:     if (perm_identity){
2144:       a->permute = PETSC_FALSE;
2145:       ai = a->i; aj = a->j;
2146:     } else { /*  non-trivial permutation */
2147:       a->permute = PETSC_TRUE;
2148:       MatReorderingSeqSBAIJ(A, perm);
2149:       ai   = a->inew; aj = a->jnew;
2150:     }
2151:     ISGetIndices(perm,&rip);

2153:     /* initialization */
2154:     PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2155:     ui[0] = 0;

2157:     /* jl: linked list for storing indices of the pivot rows 
2158:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2159:     PetscMalloc((2*am+1)*sizeof(PetscInt)+2*am*sizeof(PetscInt*),&jl);
2160:     il         = jl + am;
2161:     uj_ptr     = (PetscInt**)(il + am);
2162:     uj_lvl_ptr = (PetscInt**)(uj_ptr + am);
2163:     for (i=0; i<am; i++){
2164:       jl[i] = am; il[i] = 0;
2165:     }

2167:     /* create and initialize a linked list for storing column indices of the active row k */
2168:     nlnk = am + 1;
2169:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2171:     /* initial FreeSpace size is fill*(ai[am]+1) */
2172:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2173:     current_space = free_space;
2174:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2175:     current_space_lvl = free_space_lvl;

2177:     for (k=0; k<am; k++){  /* for each active row k */
2178:       /* initialize lnk by the column indices of row rip[k] */
2179:       nzk   = 0;
2180:       ncols = ai[rip[k]+1] - ai[rip[k]];
2181:       cols  = aj+ai[rip[k]];
2182:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2183:       nzk += nlnk;

2185:       /* update lnk by computing fill-in for each pivot row to be merged in */
2186:       prow = jl[k]; /* 1st pivot row */
2187: 
2188:       while (prow < k){
2189:         nextprow = jl[prow];
2190: 
2191:         /* merge prow into k-th row */
2192:         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2193:         jmax = ui[prow+1];
2194:         ncols = jmax-jmin;
2195:         i     = jmin - ui[prow];
2196:         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2197:         j     = *(uj_lvl_ptr[prow] + i - 1);
2198:         cols_lvl = uj_lvl_ptr[prow]+i;
2199:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2200:         nzk += nlnk;

2202:         /* update il and jl for prow */
2203:         if (jmin < jmax){
2204:           il[prow] = jmin;
2205:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2206:         }
2207:         prow = nextprow;
2208:       }

2210:       /* if free space is not available, make more free space */
2211:       if (current_space->local_remaining<nzk) {
2212:         i = am - k + 1; /* num of unfactored rows */
2213:         i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2214:         PetscFreeSpaceGet(i,&current_space);
2215:         PetscFreeSpaceGet(i,&current_space_lvl);
2216:         reallocs++;
2217:       }

2219:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2220:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2222:       /* add the k-th row into il and jl */
2223:       if (nzk-1 > 0){
2224:         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2225:         jl[k] = jl[i]; jl[i] = k;
2226:         il[k] = ui[k] + 1;
2227:       }
2228:       uj_ptr[k]     = current_space->array;
2229:       uj_lvl_ptr[k] = current_space_lvl->array;

2231:       current_space->array           += nzk;
2232:       current_space->local_used      += nzk;
2233:       current_space->local_remaining -= nzk;
2234:       current_space_lvl->array           += nzk;
2235:       current_space_lvl->local_used      += nzk;
2236:       current_space_lvl->local_remaining -= nzk;

2238:       ui[k+1] = ui[k] + nzk;
2239:     }

2241: #if defined(PETSC_USE_INFO)
2242:     if (ai[am] != 0) {
2243:       PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2244:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2245:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2246:       PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2247:     } else {
2248:       PetscInfo(A,"Empty matrix.\n");
2249:     }
2250: #endif

2252:     ISRestoreIndices(perm,&rip);
2253:     PetscFree(jl);

2255:     /* destroy list of free space and other temporary array(s) */
2256:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2257:     PetscFreeSpaceContiguous(&free_space,uj);
2258:     PetscIncompleteLLDestroy(lnk,lnkbt);
2259:     PetscFreeSpaceDestroy(free_space_lvl);
2260:     if (ai[am] != 0) {
2261:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2262:     } else {
2263:       ratio_needed = 0.0;
2264:     }
2265:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2267:   /* put together the new matrix in MATSEQSBAIJ format */
2268:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);

2270:   b = (Mat_SeqSBAIJ*)(fact)->data;
2271:   PetscFree2(b->imax,b->ilen);
2272:   b->singlemalloc = PETSC_FALSE;
2273:   b->free_a  = PETSC_TRUE;
2274:   b->free_ij = free_ij;
2275:   PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2276:   b->j    = uj;
2277:   b->i    = ui;
2278:   b->diag    = 0;
2279:   b->ilen    = 0;
2280:   b->imax    = 0;
2281:   b->row     = perm;
2282:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2283:   PetscObjectReference((PetscObject)perm);
2284:   b->icol = perm;
2285:   PetscObjectReference((PetscObject)perm);
2286:   PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2287:   b->maxnz = b->nz = ui[am];
2288: 
2289:   (fact)->info.factor_mallocs    = reallocs;
2290:   (fact)->info.fill_ratio_given  = fill;
2291:   (fact)->info.fill_ratio_needed = ratio_needed;
2292:   MatSeqSBAIJSetNumericFactorization(fact,perm_identity);
2293:   return(0);
2294: }