Actual source code: sbaijfact2.c
1: #define PETSCMAT_DLL
3: /*
4: Factorization code for SBAIJ format.
5: */
7: #include ../src/mat/impls/sbaij/seq/sbaij.h
8: #include ../src/mat/impls/baij/seq/baij.h
9: #include ../src/inline/ilu.h
10: #include ../src/inline/dot.h
14: PetscErrorCode MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
15: {
16: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
17: IS isrow=a->row;
18: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
19: PetscErrorCode ierr;
20: const PetscInt *r;
21: PetscInt nz,*vj,k,idx,k1;
22: PetscInt bs=A->rmap->bs,bs2 = a->bs2;
23: MatScalar *aa=a->a,*v,*diag;
24: PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t;
27: VecGetArray(bb,&b);
28: VecGetArray(xx,&x);
29: t = a->solve_work;
30: ISGetIndices(isrow,&r);
31: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
33: /* solve U^T * D * y = b by forward substitution */
34: xk = t;
35: for (k=0; k<mbs; k++) { /* t <- perm(b) */
36: idx = bs*r[k];
37: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
38: }
39: for (k=0; k<mbs; k++){
40: v = aa + bs2*ai[k];
41: xk = t + k*bs; /* Dk*xk = k-th block of x */
42: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
43: nz = ai[k+1] - ai[k];
44: vj = aj + ai[k];
45: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
46: while (nz--) {
47: /* x(:) += U(k,:)^T*(Dk*xk) */
48: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
49: vj++; xj = t + (*vj)*bs;
50: v += bs2;
51: }
52: /* xk = inv(Dk)*(Dk*xk) */
53: diag = aa+k*bs2; /* ptr to inv(Dk) */
54: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
55: }
57: /* solve U*x = y by back substitution */
58: for (k=mbs-1; k>=0; k--){
59: v = aa + bs2*ai[k];
60: xk = t + k*bs; /* xk */
61: nz = ai[k+1] - ai[k];
62: vj = aj + ai[k];
63: xj = t + (*vj)*bs;
64: while (nz--) {
65: /* xk += U(k,:)*x(:) */
66: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
67: vj++;
68: v += bs2; xj = t + (*vj)*bs;
69: }
70: idx = bs*r[k];
71: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
72: }
74: PetscFree(xk_tmp);
75: ISRestoreIndices(isrow,&r);
76: VecRestoreArray(bb,&b);
77: VecRestoreArray(xx,&x);
78: PetscLogFlops(bs2*(2*a->nz + mbs));
79: return(0);
80: }
84: PetscErrorCode MatForwardSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
85: {
87: SETERRQ(1,"not implemented yet");
88: return(0);
89: }
93: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
94: {
96: SETERRQ(1,"not implemented yet");
97: return(0);
98: }
102: PetscErrorCode ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
103: {
105: PetscInt nz,*vj,k;
106: PetscInt bs2 = bs*bs;
107: MatScalar *v,*diag;
108: PetscScalar *xk,*xj,*xk_tmp;
109:
111: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
112: for (k=0; k<mbs; k++){
113: v = aa + bs2*ai[k];
114: xk = x + k*bs; /* Dk*xk = k-th block of x */
115: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
116: nz = ai[k+1] - ai[k];
117: vj = aj + ai[k];
118: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
119: while (nz--) {
120: /* x(:) += U(k,:)^T*(Dk*xk) */
121: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
122: vj++; xj = x + (*vj)*bs;
123: v += bs2;
124: }
125: /* xk = inv(Dk)*(Dk*xk) */
126: diag = aa+k*bs2; /* ptr to inv(Dk) */
127: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
128: }
129: PetscFree(xk_tmp);
130: return(0);
131: }
135: PetscErrorCode BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
136: {
137: PetscInt nz,*vj,k;
138: PetscInt bs2 = bs*bs;
139: MatScalar *v;
140: PetscScalar *xk,*xj;
143: for (k=mbs-1; k>=0; k--){
144: v = aa + bs2*ai[k];
145: xk = x + k*bs; /* xk */
146: nz = ai[k+1] - ai[k];
147: vj = aj + ai[k];
148: xj = x + (*vj)*bs;
149: while (nz--) {
150: /* xk += U(k,:)*x(:) */
151: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
152: vj++;
153: v += bs2; xj = x + (*vj)*bs;
154: }
155: }
156: return(0);
157: }
161: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
162: {
163: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
165: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
166: PetscInt bs=A->rmap->bs;
167: MatScalar *aa=a->a;
168: PetscScalar *x,*b;
169: #if defined(PETSC_USE_LOG)
170: PetscInt bs2 = a->bs2;
171: #endif
174: VecGetArray(bb,&b);
175: VecGetArray(xx,&x);
177: /* solve U^T * D * y = b by forward substitution */
178: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
179: ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
181: /* solve U*x = y by back substitution */
182: BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
184: VecRestoreArray(bb,&b);
185: VecRestoreArray(xx,&x);
186: PetscLogFlops(bs2*(2*a->nz + mbs));
187: return(0);
188: }
192: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
193: {
194: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
196: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
197: PetscInt bs=A->rmap->bs;
198: MatScalar *aa=a->a;
199: PetscScalar *x,*b;
200: #if defined(PETSC_USE_LOG)
201: PetscInt bs2 = a->bs2;
202: #endif
205: VecGetArray(bb,&b);
206: VecGetArray(xx,&x);
207: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
208: ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
209: VecRestoreArray(bb,&b);
210: VecRestoreArray(xx,&x);
211: PetscLogFlops(bs2*a->nz + A->rmap->N);
212: return(0);
213: }
217: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
218: {
219: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
221: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
222: PetscInt bs=A->rmap->bs;
223: MatScalar *aa=a->a;
224: PetscScalar *x,*b;
225: #if defined(PETSC_USE_LOG)
226: PetscInt bs2 = a->bs2;
227: #endif
230: VecGetArray(bb,&b);
231: VecGetArray(xx,&x);
232: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
233: BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
234: VecRestoreArray(bb,&b);
235: VecRestoreArray(xx,&x);
236: PetscLogFlops(bs2*a->nz);
237: return(0);
238: }
242: PetscErrorCode MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
243: {
244: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
245: IS isrow=a->row;
246: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
248: const PetscInt *r;
249: PetscInt nz,*vj,k,idx;
250: MatScalar *aa=a->a,*v,*d;
251: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
254: VecGetArray(bb,&b);
255: VecGetArray(xx,&x);
256: t = a->solve_work;
257: ISGetIndices(isrow,&r);
259: /* solve U^T * D * y = b by forward substitution */
260: tp = t;
261: for (k=0; k<mbs; k++) { /* t <- perm(b) */
262: idx = 7*r[k];
263: tp[0] = b[idx];
264: tp[1] = b[idx+1];
265: tp[2] = b[idx+2];
266: tp[3] = b[idx+3];
267: tp[4] = b[idx+4];
268: tp[5] = b[idx+5];
269: tp[6] = b[idx+6];
270: tp += 7;
271: }
272:
273: for (k=0; k<mbs; k++){
274: v = aa + 49*ai[k];
275: vj = aj + ai[k];
276: tp = t + k*7;
277: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
278: nz = ai[k+1] - ai[k];
279: tp = t + (*vj)*7;
280: while (nz--) {
281: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
282: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
283: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
284: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
285: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
286: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
287: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
288: vj++; tp = t + (*vj)*7;
289: v += 49;
290: }
292: /* xk = inv(Dk)*(Dk*xk) */
293: d = aa+k*49; /* ptr to inv(Dk) */
294: tp = t + k*7;
295: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
296: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
297: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
298: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
299: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
300: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
301: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
302: }
304: /* solve U*x = y by back substitution */
305: for (k=mbs-1; k>=0; k--){
306: v = aa + 49*ai[k];
307: vj = aj + ai[k];
308: tp = t + k*7;
309: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
310: nz = ai[k+1] - ai[k];
311:
312: tp = t + (*vj)*7;
313: while (nz--) {
314: /* xk += U(k,:)*x(:) */
315: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
316: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
317: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
318: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
319: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
320: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
321: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
322: vj++; tp = t + (*vj)*7;
323: v += 49;
324: }
325: tp = t + k*7;
326: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
327: idx = 7*r[k];
328: x[idx] = x0;
329: x[idx+1] = x1;
330: x[idx+2] = x2;
331: x[idx+3] = x3;
332: x[idx+4] = x4;
333: x[idx+5] = x5;
334: x[idx+6] = x6;
335: }
337: ISRestoreIndices(isrow,&r);
338: VecRestoreArray(bb,&b);
339: VecRestoreArray(xx,&x);
340: PetscLogFlops(49*(2*a->nz + mbs));
341: return(0);
342: }
346: PetscErrorCode ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
347: {
348: MatScalar *v,*d;
349: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
350: PetscInt nz,*vj,k;
353: for (k=0; k<mbs; k++){
354: v = aa + 49*ai[k];
355: xp = x + k*7;
356: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
357: nz = ai[k+1] - ai[k];
358: vj = aj + ai[k];
359: xp = x + (*vj)*7;
360: while (nz--) {
361: /* x(:) += U(k,:)^T*(Dk*xk) */
362: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
363: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
364: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
365: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
366: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
367: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
368: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
369: vj++; xp = x + (*vj)*7;
370: v += 49;
371: }
372: /* xk = inv(Dk)*(Dk*xk) */
373: d = aa+k*49; /* ptr to inv(Dk) */
374: xp = x + k*7;
375: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
376: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
377: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
378: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
379: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
380: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
381: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
382: }
383: return(0);
384: }
388: PetscErrorCode BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
389: {
390: MatScalar *v;
391: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
392: PetscInt nz,*vj,k;
395: for (k=mbs-1; k>=0; k--){
396: v = aa + 49*ai[k];
397: xp = x + k*7;
398: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
399: nz = ai[k+1] - ai[k];
400: vj = aj + ai[k];
401: xp = x + (*vj)*7;
402: while (nz--) {
403: /* xk += U(k,:)*x(:) */
404: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
405: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
406: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
407: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
408: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
409: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
410: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
411: vj++;
412: v += 49; xp = x + (*vj)*7;
413: }
414: xp = x + k*7;
415: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
416: }
417: return(0);
418: }
422: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
423: {
424: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
426: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
427: MatScalar *aa=a->a;
428: PetscScalar *x,*b;
431: VecGetArray(bb,&b);
432: VecGetArray(xx,&x);
433:
434: /* solve U^T * D * y = b by forward substitution */
435: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
436: ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
438: /* solve U*x = y by back substitution */
439: BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
441: VecRestoreArray(bb,&b);
442: VecRestoreArray(xx,&x);
443: PetscLogFlops(49*(2*a->nz + mbs));
444: return(0);
445: }
449: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
450: {
451: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
453: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
454: MatScalar *aa=a->a;
455: PetscScalar *x,*b;
458: VecGetArray(bb,&b);
459: VecGetArray(xx,&x);
460: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
461: ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
462: VecRestoreArray(bb,&b);
463: VecRestoreArray(xx,&x);
464: PetscLogFlops(49*a->nz + mbs);
465: return(0);
466: }
470: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
471: {
472: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
474: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
475: MatScalar *aa=a->a;
476: PetscScalar *x,*b;
479: VecGetArray(bb,&b);
480: VecGetArray(xx,&x);
481: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
482: BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
483: VecRestoreArray(bb,&b);
484: VecRestoreArray(xx,&x);
485: PetscLogFlops(49*a->nz);
486: return(0);
487: }
491: PetscErrorCode MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
492: {
493: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
494: IS isrow=a->row;
495: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
497: const PetscInt *r;
498: PetscInt nz,*vj,k,idx;
499: MatScalar *aa=a->a,*v,*d;
500: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
503: VecGetArray(bb,&b);
504: VecGetArray(xx,&x);
505: t = a->solve_work;
506: ISGetIndices(isrow,&r);
508: /* solve U^T * D * y = b by forward substitution */
509: tp = t;
510: for (k=0; k<mbs; k++) { /* t <- perm(b) */
511: idx = 6*r[k];
512: tp[0] = b[idx];
513: tp[1] = b[idx+1];
514: tp[2] = b[idx+2];
515: tp[3] = b[idx+3];
516: tp[4] = b[idx+4];
517: tp[5] = b[idx+5];
518: tp += 6;
519: }
520:
521: for (k=0; k<mbs; k++){
522: v = aa + 36*ai[k];
523: vj = aj + ai[k];
524: tp = t + k*6;
525: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
526: nz = ai[k+1] - ai[k];
527: tp = t + (*vj)*6;
528: while (nz--) {
529: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
530: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
531: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
532: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
533: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
534: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
535: vj++; tp = t + (*vj)*6;
536: v += 36;
537: }
539: /* xk = inv(Dk)*(Dk*xk) */
540: d = aa+k*36; /* ptr to inv(Dk) */
541: tp = t + k*6;
542: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
543: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
544: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
545: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
546: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
547: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
548: }
550: /* solve U*x = y by back substitution */
551: for (k=mbs-1; k>=0; k--){
552: v = aa + 36*ai[k];
553: vj = aj + ai[k];
554: tp = t + k*6;
555: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
556: nz = ai[k+1] - ai[k];
557:
558: tp = t + (*vj)*6;
559: while (nz--) {
560: /* xk += U(k,:)*x(:) */
561: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
562: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
563: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
564: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
565: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
566: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
567: vj++; tp = t + (*vj)*6;
568: v += 36;
569: }
570: tp = t + k*6;
571: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
572: idx = 6*r[k];
573: x[idx] = x0;
574: x[idx+1] = x1;
575: x[idx+2] = x2;
576: x[idx+3] = x3;
577: x[idx+4] = x4;
578: x[idx+5] = x5;
579: }
581: ISRestoreIndices(isrow,&r);
582: VecRestoreArray(bb,&b);
583: VecRestoreArray(xx,&x);
584: PetscLogFlops(36*(2*a->nz + mbs));
585: return(0);
586: }
590: PetscErrorCode ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
591: {
592: MatScalar *v,*d;
593: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
594: PetscInt nz,*vj,k;
597: for (k=0; k<mbs; k++){
598: v = aa + 36*ai[k];
599: xp = x + k*6;
600: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
601: nz = ai[k+1] - ai[k];
602: vj = aj + ai[k];
603: xp = x + (*vj)*6;
604: while (nz--) {
605: /* x(:) += U(k,:)^T*(Dk*xk) */
606: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
607: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
608: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
609: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
610: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
611: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
612: vj++; xp = x + (*vj)*6;
613: v += 36;
614: }
615: /* xk = inv(Dk)*(Dk*xk) */
616: d = aa+k*36; /* ptr to inv(Dk) */
617: xp = x + k*6;
618: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
619: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
620: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
621: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
622: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
623: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
624: }
625: return(0);
626: }
629: PetscErrorCode BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
630: {
631: MatScalar *v;
632: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
633: PetscInt nz,*vj,k;
636: for (k=mbs-1; k>=0; k--){
637: v = aa + 36*ai[k];
638: xp = x + k*6;
639: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
640: nz = ai[k+1] - ai[k];
641: vj = aj + ai[k];
642: xp = x + (*vj)*6;
643: while (nz--) {
644: /* xk += U(k,:)*x(:) */
645: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
646: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
647: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
648: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
649: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
650: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
651: vj++;
652: v += 36; xp = x + (*vj)*6;
653: }
654: xp = x + k*6;
655: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
656: }
657: return(0);
658: }
663: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
664: {
665: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
666: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
667: MatScalar *aa=a->a;
668: PetscScalar *x,*b;
672: VecGetArray(bb,&b);
673: VecGetArray(xx,&x);
674:
675: /* solve U^T * D * y = b by forward substitution */
676: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
677: ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
679: /* solve U*x = y by back substitution */
680: BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
682: VecRestoreArray(bb,&b);
683: VecRestoreArray(xx,&x);
684: PetscLogFlops(36*(2*a->nz + mbs));
685: return(0);
686: }
690: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
691: {
692: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
693: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
694: MatScalar *aa=a->a;
695: PetscScalar *x,*b;
699: VecGetArray(bb,&b);
700: VecGetArray(xx,&x);
701: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
702: ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
703: VecRestoreArray(bb,&b);
704: VecRestoreArray(xx,&x);
705: PetscLogFlops(36*a->nz + mbs);
706: return(0);
707: }
711: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
712: {
713: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
714: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
715: MatScalar *aa=a->a;
716: PetscScalar *x,*b;
720: VecGetArray(bb,&b);
721: VecGetArray(xx,&x);
722: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
723: BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
724: VecRestoreArray(bb,&b);
725: VecRestoreArray(xx,&x);
726: PetscLogFlops(36*a->nz);
727: return(0);
728: }
732: PetscErrorCode MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
733: {
734: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
735: IS isrow=a->row;
736: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
738: const PetscInt *r;
739: PetscInt nz,*vj,k,idx;
740: MatScalar *aa=a->a,*v,*diag;
741: PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp;
744: VecGetArray(bb,&b);
745: VecGetArray(xx,&x);
746: t = a->solve_work;
747: ISGetIndices(isrow,&r);
749: /* solve U^T * D * y = b by forward substitution */
750: tp = t;
751: for (k=0; k<mbs; k++) { /* t <- perm(b) */
752: idx = 5*r[k];
753: tp[0] = b[idx];
754: tp[1] = b[idx+1];
755: tp[2] = b[idx+2];
756: tp[3] = b[idx+3];
757: tp[4] = b[idx+4];
758: tp += 5;
759: }
760:
761: for (k=0; k<mbs; k++){
762: v = aa + 25*ai[k];
763: vj = aj + ai[k];
764: tp = t + k*5;
765: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
766: nz = ai[k+1] - ai[k];
768: tp = t + (*vj)*5;
769: while (nz--) {
770: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
771: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
772: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
773: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
774: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
775: vj++; tp = t + (*vj)*5;
776: v += 25;
777: }
779: /* xk = inv(Dk)*(Dk*xk) */
780: diag = aa+k*25; /* ptr to inv(Dk) */
781: tp = t + k*5;
782: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
783: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
784: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
785: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
786: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
787: }
789: /* solve U*x = y by back substitution */
790: for (k=mbs-1; k>=0; k--){
791: v = aa + 25*ai[k];
792: vj = aj + ai[k];
793: tp = t + k*5;
794: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
795: nz = ai[k+1] - ai[k];
796:
797: tp = t + (*vj)*5;
798: while (nz--) {
799: /* xk += U(k,:)*x(:) */
800: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
801: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
802: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
803: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
804: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
805: vj++; tp = t + (*vj)*5;
806: v += 25;
807: }
808: tp = t + k*5;
809: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
810: idx = 5*r[k];
811: x[idx] = x0;
812: x[idx+1] = x1;
813: x[idx+2] = x2;
814: x[idx+3] = x3;
815: x[idx+4] = x4;
816: }
818: ISRestoreIndices(isrow,&r);
819: VecRestoreArray(bb,&b);
820: VecRestoreArray(xx,&x);
821: PetscLogFlops(25*(2*a->nz + mbs));
822: return(0);
823: }
827: PetscErrorCode ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
828: {
829: MatScalar *v,*diag;
830: PetscScalar *xp,x0,x1,x2,x3,x4;
831: PetscInt nz,*vj,k;
834: for (k=0; k<mbs; k++){
835: v = aa + 25*ai[k];
836: xp = x + k*5;
837: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
838: nz = ai[k+1] - ai[k];
839: vj = aj + ai[k];
840: xp = x + (*vj)*5;
841: while (nz--) {
842: /* x(:) += U(k,:)^T*(Dk*xk) */
843: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
844: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
845: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
846: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
847: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
848: vj++; xp = x + (*vj)*5;
849: v += 25;
850: }
851: /* xk = inv(Dk)*(Dk*xk) */
852: diag = aa+k*25; /* ptr to inv(Dk) */
853: xp = x + k*5;
854: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
855: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
856: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
857: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
858: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
859: }
860: return(0);
861: }
865: PetscErrorCode BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
866: {
867: MatScalar *v;
868: PetscScalar *xp,x0,x1,x2,x3,x4;
869: PetscInt nz,*vj,k;
872: for (k=mbs-1; k>=0; k--){
873: v = aa + 25*ai[k];
874: xp = x + k*5;
875: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
876: nz = ai[k+1] - ai[k];
877: vj = aj + ai[k];
878: xp = x + (*vj)*5;
879: while (nz--) {
880: /* xk += U(k,:)*x(:) */
881: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
882: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
883: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
884: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
885: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
886: vj++;
887: v += 25; xp = x + (*vj)*5;
888: }
889: xp = x + k*5;
890: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
891: }
892: return(0);
893: }
897: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
898: {
899: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
900: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
901: MatScalar *aa=a->a;
902: PetscScalar *x,*b;
906: VecGetArray(bb,&b);
907: VecGetArray(xx,&x);
909: /* solve U^T * D * y = b by forward substitution */
910: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
911: ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
913: /* solve U*x = y by back substitution */
914: BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
916: VecRestoreArray(bb,&b);
917: VecRestoreArray(xx,&x);
918: PetscLogFlops(25*(2*a->nz + mbs));
919: return(0);
920: }
924: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
925: {
926: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
927: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
928: MatScalar *aa=a->a;
929: PetscScalar *x,*b;
933: VecGetArray(bb,&b);
934: VecGetArray(xx,&x);
935: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
936: ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
937: VecRestoreArray(bb,&b);
938: VecRestoreArray(xx,&x);
939: PetscLogFlops(25*(a->nz + mbs));
940: return(0);
941: }
945: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
946: {
947: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
948: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
949: MatScalar *aa=a->a;
950: PetscScalar *x,*b;
954: VecGetArray(bb,&b);
955: VecGetArray(xx,&x);
956: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
957: BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
958: VecRestoreArray(bb,&b);
959: VecRestoreArray(xx,&x);
960: PetscLogFlops(25*a->nz);
961: return(0);
962: }
966: PetscErrorCode MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
967: {
968: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
969: IS isrow=a->row;
970: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
972: const PetscInt *r;
973: PetscInt nz,*vj,k,idx;
974: MatScalar *aa=a->a,*v,*diag;
975: PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp;
978: VecGetArray(bb,&b);
979: VecGetArray(xx,&x);
980: t = a->solve_work;
981: ISGetIndices(isrow,&r);
983: /* solve U^T * D * y = b by forward substitution */
984: tp = t;
985: for (k=0; k<mbs; k++) { /* t <- perm(b) */
986: idx = 4*r[k];
987: tp[0] = b[idx];
988: tp[1] = b[idx+1];
989: tp[2] = b[idx+2];
990: tp[3] = b[idx+3];
991: tp += 4;
992: }
993:
994: for (k=0; k<mbs; k++){
995: v = aa + 16*ai[k];
996: vj = aj + ai[k];
997: tp = t + k*4;
998: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
999: nz = ai[k+1] - ai[k];
1001: tp = t + (*vj)*4;
1002: while (nz--) {
1003: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1004: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1005: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1006: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1007: vj++; tp = t + (*vj)*4;
1008: v += 16;
1009: }
1011: /* xk = inv(Dk)*(Dk*xk) */
1012: diag = aa+k*16; /* ptr to inv(Dk) */
1013: tp = t + k*4;
1014: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1015: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1016: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1017: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1018: }
1020: /* solve U*x = y by back substitution */
1021: for (k=mbs-1; k>=0; k--){
1022: v = aa + 16*ai[k];
1023: vj = aj + ai[k];
1024: tp = t + k*4;
1025: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1026: nz = ai[k+1] - ai[k];
1027:
1028: tp = t + (*vj)*4;
1029: while (nz--) {
1030: /* xk += U(k,:)*x(:) */
1031: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1032: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1033: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1034: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1035: vj++; tp = t + (*vj)*4;
1036: v += 16;
1037: }
1038: tp = t + k*4;
1039: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1040: idx = 4*r[k];
1041: x[idx] = x0;
1042: x[idx+1] = x1;
1043: x[idx+2] = x2;
1044: x[idx+3] = x3;
1045: }
1047: ISRestoreIndices(isrow,&r);
1048: VecRestoreArray(bb,&b);
1049: VecRestoreArray(xx,&x);
1050: PetscLogFlops(16*(2*a->nz + mbs));
1051: return(0);
1052: }
1056: PetscErrorCode ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1057: {
1058: MatScalar *v,*diag;
1059: PetscScalar *xp,x0,x1,x2,x3;
1060: PetscInt nz,*vj,k;
1063: for (k=0; k<mbs; k++){
1064: v = aa + 16*ai[k];
1065: xp = x + k*4;
1066: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1067: nz = ai[k+1] - ai[k];
1068: vj = aj + ai[k];
1069: xp = x + (*vj)*4;
1070: while (nz--) {
1071: /* x(:) += U(k,:)^T*(Dk*xk) */
1072: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1073: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1074: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1075: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1076: vj++; xp = x + (*vj)*4;
1077: v += 16;
1078: }
1079: /* xk = inv(Dk)*(Dk*xk) */
1080: diag = aa+k*16; /* ptr to inv(Dk) */
1081: xp = x + k*4;
1082: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1083: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1084: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1085: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1086: }
1087: return(0);
1088: }
1092: PetscErrorCode BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1093: {
1094: MatScalar *v;
1095: PetscScalar *xp,x0,x1,x2,x3;
1096: PetscInt nz,*vj,k;
1099: for (k=mbs-1; k>=0; k--){
1100: v = aa + 16*ai[k];
1101: xp = x + k*4;
1102: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1103: nz = ai[k+1] - ai[k];
1104: vj = aj + ai[k];
1105: xp = x + (*vj)*4;
1106: while (nz--) {
1107: /* xk += U(k,:)*x(:) */
1108: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1109: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1110: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1111: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1112: vj++;
1113: v += 16; xp = x + (*vj)*4;
1114: }
1115: xp = x + k*4;
1116: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1117: }
1118: return(0);
1119: }
1123: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1124: {
1125: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1126: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1127: MatScalar *aa=a->a;
1128: PetscScalar *x,*b;
1132: VecGetArray(bb,&b);
1133: VecGetArray(xx,&x);
1135: /* solve U^T * D * y = b by forward substitution */
1136: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1137: ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1139: /* solve U*x = y by back substitution */
1140: BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1141: VecRestoreArray(bb,&b);
1142: VecRestoreArray(xx,&x);
1143: PetscLogFlops(16*(2*a->nz + mbs));
1144: return(0);
1145: }
1149: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1150: {
1151: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1152: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1153: MatScalar *aa=a->a;
1154: PetscScalar *x,*b;
1158: VecGetArray(bb,&b);
1159: VecGetArray(xx,&x);
1160: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1161: ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1162: VecRestoreArray(bb,&b);
1163: VecRestoreArray(xx,&x);
1164: PetscLogFlops(16*a->nz + mbs);
1165: return(0);
1166: }
1170: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1171: {
1172: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1173: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1174: MatScalar *aa=a->a;
1175: PetscScalar *x,*b;
1179: VecGetArray(bb,&b);
1180: VecGetArray(xx,&x);
1181: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1182: BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1183: VecRestoreArray(bb,&b);
1184: VecRestoreArray(xx,&x);
1185: PetscLogFlops(16*a->nz);
1186: return(0);
1187: }
1191: PetscErrorCode MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
1192: {
1193: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1194: IS isrow=a->row;
1195: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1197: const PetscInt *r;
1198: PetscInt nz,*vj,k,idx;
1199: MatScalar *aa=a->a,*v,*diag;
1200: PetscScalar *x,*b,x0,x1,x2,*t,*tp;
1203: VecGetArray(bb,&b);
1204: VecGetArray(xx,&x);
1205: t = a->solve_work;
1206: ISGetIndices(isrow,&r);
1208: /* solve U^T * D * y = b by forward substitution */
1209: tp = t;
1210: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1211: idx = 3*r[k];
1212: tp[0] = b[idx];
1213: tp[1] = b[idx+1];
1214: tp[2] = b[idx+2];
1215: tp += 3;
1216: }
1217:
1218: for (k=0; k<mbs; k++){
1219: v = aa + 9*ai[k];
1220: vj = aj + ai[k];
1221: tp = t + k*3;
1222: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1223: nz = ai[k+1] - ai[k];
1225: tp = t + (*vj)*3;
1226: while (nz--) {
1227: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1228: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1229: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1230: vj++; tp = t + (*vj)*3;
1231: v += 9;
1232: }
1234: /* xk = inv(Dk)*(Dk*xk) */
1235: diag = aa+k*9; /* ptr to inv(Dk) */
1236: tp = t + k*3;
1237: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1238: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1239: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1240: }
1242: /* solve U*x = y by back substitution */
1243: for (k=mbs-1; k>=0; k--){
1244: v = aa + 9*ai[k];
1245: vj = aj + ai[k];
1246: tp = t + k*3;
1247: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
1248: nz = ai[k+1] - ai[k];
1249:
1250: tp = t + (*vj)*3;
1251: while (nz--) {
1252: /* xk += U(k,:)*x(:) */
1253: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1254: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1255: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1256: vj++; tp = t + (*vj)*3;
1257: v += 9;
1258: }
1259: tp = t + k*3;
1260: tp[0] = x0; tp[1] = x1; tp[2] = x2;
1261: idx = 3*r[k];
1262: x[idx] = x0;
1263: x[idx+1] = x1;
1264: x[idx+2] = x2;
1265: }
1267: ISRestoreIndices(isrow,&r);
1268: VecRestoreArray(bb,&b);
1269: VecRestoreArray(xx,&x);
1270: PetscLogFlops(9*(2*a->nz + mbs));
1271: return(0);
1272: }
1276: PetscErrorCode ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1277: {
1278: MatScalar *v,*diag;
1279: PetscScalar *xp,x0,x1,x2;
1280: PetscInt nz,*vj,k;
1283: for (k=0; k<mbs; k++){
1284: v = aa + 9*ai[k];
1285: xp = x + k*3;
1286: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1287: nz = ai[k+1] - ai[k];
1288: vj = aj + ai[k];
1289: xp = x + (*vj)*3;
1290: while (nz--) {
1291: /* x(:) += U(k,:)^T*(Dk*xk) */
1292: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1293: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1294: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1295: vj++; xp = x + (*vj)*3;
1296: v += 9;
1297: }
1298: /* xk = inv(Dk)*(Dk*xk) */
1299: diag = aa+k*9; /* ptr to inv(Dk) */
1300: xp = x + k*3;
1301: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1302: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1303: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1304: }
1305: return(0);
1306: }
1310: PetscErrorCode BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1311: {
1312: MatScalar *v;
1313: PetscScalar *xp,x0,x1,x2;
1314: PetscInt nz,*vj,k;
1317: for (k=mbs-1; k>=0; k--){
1318: v = aa + 9*ai[k];
1319: xp = x + k*3;
1320: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1321: nz = ai[k+1] - ai[k];
1322: vj = aj + ai[k];
1323: xp = x + (*vj)*3;
1324: while (nz--) {
1325: /* xk += U(k,:)*x(:) */
1326: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1327: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1328: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1329: vj++;
1330: v += 9; xp = x + (*vj)*3;
1331: }
1332: xp = x + k*3;
1333: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1334: }
1335: return(0);
1336: }
1340: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1341: {
1342: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1343: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1344: MatScalar *aa=a->a;
1345: PetscScalar *x,*b;
1347:
1349: VecGetArray(bb,&b);
1350: VecGetArray(xx,&x);
1352: /* solve U^T * D * y = b by forward substitution */
1353: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1354: ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1356: /* solve U*x = y by back substitution */
1357: BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1359: VecRestoreArray(bb,&b);
1360: VecRestoreArray(xx,&x);
1361: PetscLogFlops(9*(2*a->nz + mbs));
1362: return(0);
1363: }
1367: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1368: {
1369: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1370: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1371: MatScalar *aa=a->a;
1372: PetscScalar *x,*b;
1376: VecGetArray(bb,&b);
1377: VecGetArray(xx,&x);
1378: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1379: ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1380: VecRestoreArray(bb,&b);
1381: VecRestoreArray(xx,&x);
1382: PetscLogFlops(9*(a->nz + mbs));
1383: return(0);
1384: }
1388: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1389: {
1390: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1391: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1392: MatScalar *aa=a->a;
1393: PetscScalar *x,*b;
1397: VecGetArray(bb,&b);
1398: VecGetArray(xx,&x);
1399: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1400: BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1401: VecRestoreArray(bb,&b);
1402: VecRestoreArray(xx,&x);
1403: PetscLogFlops(9*a->nz);
1404: return(0);
1405: }
1409: PetscErrorCode MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
1410: {
1411: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ *)A->data;
1412: IS isrow=a->row;
1413: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1415: const PetscInt *r;
1416: PetscInt nz,*vj,k,k2,idx;
1417: MatScalar *aa=a->a,*v,*diag;
1418: PetscScalar *x,*b,x0,x1,*t;
1421: VecGetArray(bb,&b);
1422: VecGetArray(xx,&x);
1423: t = a->solve_work;
1424: ISGetIndices(isrow,&r);
1426: /* solve U^T * D * y = perm(b) by forward substitution */
1427: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1428: idx = 2*r[k];
1429: t[k*2] = b[idx];
1430: t[k*2+1] = b[idx+1];
1431: }
1432: for (k=0; k<mbs; k++){
1433: v = aa + 4*ai[k];
1434: vj = aj + ai[k];
1435: k2 = k*2;
1436: x0 = t[k2]; x1 = t[k2+1];
1437: nz = ai[k+1] - ai[k];
1438: while (nz--) {
1439: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1440: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1441: vj++; v += 4;
1442: }
1443: diag = aa+k*4; /* ptr to inv(Dk) */
1444: t[k2] = diag[0]*x0 + diag[2]*x1;
1445: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1446: }
1448: /* solve U*x = y by back substitution */
1449: for (k=mbs-1; k>=0; k--){
1450: v = aa + 4*ai[k];
1451: vj = aj + ai[k];
1452: k2 = k*2;
1453: x0 = t[k2]; x1 = t[k2+1];
1454: nz = ai[k+1] - ai[k];
1455: while (nz--) {
1456: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1457: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1458: vj++; v += 4;
1459: }
1460: t[k2] = x0;
1461: t[k2+1] = x1;
1462: idx = 2*r[k];
1463: x[idx] = x0;
1464: x[idx+1] = x1;
1465: }
1467: ISRestoreIndices(isrow,&r);
1468: VecRestoreArray(bb,&b);
1469: VecRestoreArray(xx,&x);
1470: PetscLogFlops(4*(2*a->nz + mbs));
1471: return(0);
1472: }
1476: PetscErrorCode ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1477: {
1478: MatScalar *v,*diag;
1479: PetscScalar x0,x1;
1480: PetscInt nz,*vj,k,k2;
1483: for (k=0; k<mbs; k++){
1484: v = aa + 4*ai[k];
1485: vj = aj + ai[k];
1486: k2 = k*2;
1487: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1488: nz = ai[k+1] - ai[k];
1489:
1490: while (nz--) {
1491: /* x(:) += U(k,:)^T*(Dk*xk) */
1492: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1493: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1494: vj++; v += 4;
1495: }
1496: /* xk = inv(Dk)*(Dk*xk) */
1497: diag = aa+k*4; /* ptr to inv(Dk) */
1498: x[k2] = diag[0]*x0 + diag[2]*x1;
1499: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1500: }
1501: return(0);
1502: }
1506: PetscErrorCode BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1507: {
1508: MatScalar *v;
1509: PetscScalar x0,x1;
1510: PetscInt nz,*vj,k,k2;
1513: for (k=mbs-1; k>=0; k--){
1514: v = aa + 4*ai[k];
1515: vj = aj + ai[k];
1516: k2 = k*2;
1517: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1518: nz = ai[k+1] - ai[k];
1519: while (nz--) {
1520: /* xk += U(k,:)*x(:) */
1521: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1522: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1523: vj++; v += 4;
1524: }
1525: x[k2] = x0;
1526: x[k2+1] = x1;
1527: }
1528: return(0);
1529: }
1533: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1534: {
1535: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1536: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1537: MatScalar *aa=a->a;
1538: PetscScalar *x,*b;
1542: VecGetArray(bb,&b);
1543: VecGetArray(xx,&x);
1545: /* solve U^T * D * y = b by forward substitution */
1546: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1547: ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1549: /* solve U*x = y by back substitution */
1550: BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1552: VecRestoreArray(bb,&b);
1553: VecRestoreArray(xx,&x);
1554: PetscLogFlops(4*(2*a->nz + mbs)); /* bs2*(2*a->nz + mbs) */
1555: return(0);
1556: }
1560: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1561: {
1562: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1563: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1564: MatScalar *aa=a->a;
1565: PetscScalar *x,*b;
1569: VecGetArray(bb,&b);
1570: VecGetArray(xx,&x);
1571: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1572: ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1573: VecRestoreArray(bb,&b);
1574: VecRestoreArray(xx,&x);
1575: PetscLogFlops(4*(a->nz + mbs));
1576: return(0);
1577: }
1581: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1582: {
1583: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1584: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1585: MatScalar *aa=a->a;
1586: PetscScalar *x,*b;
1590: VecGetArray(bb,&b);
1591: VecGetArray(xx,&x);
1592: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1593: BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1594: VecRestoreArray(bb,&b);
1595: VecRestoreArray(xx,&x);
1596: PetscLogFlops(4*a->nz);
1597: return(0);
1598: }
1602: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1603: {
1604: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1605: IS isrow=a->row;
1606: PetscErrorCode ierr;
1607: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1608: const MatScalar *aa=a->a,*v;
1609: PetscScalar *x,*b,xk,*t;
1610: PetscInt nz,k;
1613: VecGetArray(bb,&b);
1614: VecGetArray(xx,&x);
1615: t = a->solve_work;
1616: ISGetIndices(isrow,&rp);
1617:
1618: /* solve U^T*D^(1/2)*y = perm(b) by forward substitution */
1619: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1620: for (k=0; k<mbs; k++){
1621: v = aa + ai[k] + 1;
1622: vj = aj + ai[k] + 1;
1623: xk = t[k];
1624: nz = ai[k+1] - ai[k] - 1;
1625: while (nz--) t[*vj++] += (*v++) * xk;
1626: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1627: }
1629: /* solve U*perm(x) = y by back substitution */
1630: for (k=mbs-1; k>=0; k--){
1631: v = aa + ai[k] + 1;
1632: vj = aj + ai[k] + 1;
1633: nz = ai[k+1] - ai[k] - 1;
1634: while (nz--) t[k] += (*v++) * t[*vj++];
1635: x[rp[k]] = t[k];
1636: }
1638: ISRestoreIndices(isrow,&rp);
1639: VecRestoreArray(bb,&b);
1640: VecRestoreArray(xx,&x);
1641: PetscLogFlops(4*a->nz);
1642: return(0);
1643: }
1647: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1648: {
1649: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1650: IS isrow=a->row;
1651: PetscErrorCode ierr;
1652: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1653: const MatScalar *aa=a->a,*v;
1654: PetscReal diagk;
1655: PetscScalar *x,*b,xk;
1656: PetscInt nz,k;
1659: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1660: VecGetArray(bb,&b);
1661: VecGetArray(xx,&x);
1662: ISGetIndices(isrow,&rp);
1663:
1664: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1665: for (k=0; k<mbs; k++){
1666: v = aa + ai[k] + 1;
1667: vj = aj + ai[k] + 1;
1668: xk = x[k];
1669: nz = ai[k+1] - ai[k] - 1;
1670: while (nz--) x[*vj++] += (*v++) * xk;
1672: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1673: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1674: x[k] = xk*sqrt(diagk);
1675: }
1676: ISRestoreIndices(isrow,&rp);
1677: VecRestoreArray(bb,&b);
1678: VecRestoreArray(xx,&x);
1679: PetscLogFlops(2*a->nz);
1680: return(0);
1681: }
1685: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1686: {
1687: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1688: IS isrow=a->row;
1689: PetscErrorCode ierr;
1690: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1691: const MatScalar *aa=a->a,*v;
1692: PetscReal diagk;
1693: PetscScalar *x,*b,*t;
1694: PetscInt nz,k;
1697: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1698: VecGetArray(bb,&b);
1699: VecGetArray(xx,&x);
1700: t = a->solve_work;
1701: ISGetIndices(isrow,&rp);
1703: for (k=mbs-1; k>=0; k--){
1704: v = aa + ai[k] + 1;
1705: vj = aj + ai[k] + 1;
1706: diagk = PetscRealPart(aa[ai[k]]);
1707: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1708: t[k] = b[k] * sqrt(diagk);
1709: nz = ai[k+1] - ai[k] - 1;
1710: while (nz--) t[k] += (*v++) * t[*vj++];
1711: x[rp[k]] = t[k];
1712: }
1713: ISRestoreIndices(isrow,&rp);
1714: VecRestoreArray(bb,&b);
1715: VecRestoreArray(xx,&x);
1716: PetscLogFlops(2*a->nz);
1717: return(0);
1718: }
1722: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1723: {
1724: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1728: if (A->rmap->bs == 1) {
1729: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1730: } else {
1731: IS isrow=a->row;
1732: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1733: const MatScalar *aa=a->a,*v;
1734: PetscScalar *x,*b,*t;
1735: PetscInt nz,k,n,i;
1736: if (bb->n > a->solves_work_n) {
1737: PetscFree(a->solves_work);
1738: PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1739: a->solves_work_n = bb->n;
1740: }
1741: n = bb->n;
1742: VecGetArray(bb->v,&b);
1743: VecGetArray(xx->v,&x);
1744: t = a->solves_work;
1746: ISGetIndices(isrow,&rp);
1747:
1748: /* solve U^T*D*y = perm(b) by forward substitution */
1749: for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1750: for (k=0; k<mbs; k++){
1751: v = aa + ai[k];
1752: vj = aj + ai[k];
1753: nz = ai[k+1] - ai[k];
1754: while (nz--) {
1755: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1756: v++;vj++;
1757: }
1758: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1759: }
1760:
1761: /* solve U*perm(x) = y by back substitution */
1762: for (k=mbs-1; k>=0; k--){
1763: v = aa + ai[k];
1764: vj = aj + ai[k];
1765: nz = ai[k+1] - ai[k];
1766: while (nz--) {
1767: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1768: v++;vj++;
1769: }
1770: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1771: }
1773: ISRestoreIndices(isrow,&rp);
1774: VecRestoreArray(bb->v,&b);
1775: VecRestoreArray(xx->v,&x);
1776: PetscLogFlops(bb->n*(4*a->nz + A->rmap->N));
1777: }
1778: return(0);
1779: }
1781: /*
1782: Special case where the matrix was ILU(0) factored in the natural
1783: ordering. This eliminates the need for the column and row permutation.
1784: */
1787: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1788: {
1789: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1791: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1792: MatScalar *aa=a->a,*v;
1793: PetscScalar *x,*b,xk;
1794: PetscInt nz,*vj,k;
1797: VecGetArray(bb,&b);
1798: VecGetArray(xx,&x);
1799:
1800: /* solve U^T*D*y = b by forward substitution */
1801: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1802: for (k=0; k<mbs; k++){
1803: v = aa + ai[k] + 1;
1804: vj = aj + ai[k] + 1;
1805: xk = x[k];
1806: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
1807: while (nz--) x[*vj++] += (*v++) * xk;
1808: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
1809: }
1811: /* solve U*x = y by back substitution */
1812: for (k=mbs-2; k>=0; k--){
1813: v = aa + ai[k] + 1;
1814: vj = aj + ai[k] + 1;
1815: xk = x[k];
1816: nz = ai[k+1] - ai[k] - 1;
1817: while (nz--) xk += (*v++) * x[*vj++];
1818: x[k] = xk;
1819: }
1821: VecRestoreArray(bb,&b);
1822: VecRestoreArray(xx,&x);
1823: PetscLogFlops(4*a->nz);
1824: return(0);
1825: }
1829: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1830: {
1831: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1833: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1834: MatScalar *aa=a->a,*v;
1835: PetscReal diagk;
1836: PetscScalar *x,*b;
1837: PetscInt nz,*vj,k;
1840: /* solve U^T*D^(1/2)*x = b by forward substitution */
1841: VecGetArray(bb,&b);
1842: VecGetArray(xx,&x);
1843: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1844: for (k=0; k<mbs; k++){
1845: v = aa + ai[k] + 1;
1846: vj = aj + ai[k] + 1;
1847: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
1848: while (nz--) x[*vj++] += (*v++) * x[k];
1849: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1850: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
1851: x[k] *= sqrt(diagk);
1852: }
1853: VecRestoreArray(bb,&b);
1854: VecRestoreArray(xx,&x);
1855: PetscLogFlops(2*a->nz);
1856: return(0);
1857: }
1861: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1862: {
1863: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1865: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1866: MatScalar *aa=a->a,*v;
1867: PetscReal diagk;
1868: PetscScalar *x,*b;
1869: PetscInt nz,*vj,k;
1872: /* solve D^(1/2)*U*x = b by back substitution */
1873: VecGetArray(bb,&b);
1874: VecGetArray(xx,&x);
1876: for (k=mbs-1; k>=0; k--){
1877: v = aa + ai[k] + 1;
1878: vj = aj + ai[k] + 1;
1879: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1880: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1881: x[k] = sqrt(diagk)*b[k];
1882: nz = ai[k+1] - ai[k] - 1;
1883: while (nz--) x[k] += (*v++) * x[*vj++];
1884: }
1885: VecRestoreArray(bb,&b);
1886: VecRestoreArray(xx,&x);
1887: PetscLogFlops(2*a->nz);
1888: return(0);
1889: }
1893: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
1896: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
1897: {
1898: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
1900: const PetscInt *rip,mbs = a->mbs,*ai = a->i,*aj = a->j;
1901: PetscInt *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
1902: PetscInt m,reallocs = 0,*levtmp;
1903: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
1904: PetscInt incrlev,*lev,shift,prow,nz;
1905: PetscReal f = info->fill,levels = info->levels;
1906: PetscTruth perm_identity;
1909: /* check whether perm is the identity mapping */
1910: ISIdentity(perm,&perm_identity);
1912: if (perm_identity){
1913: a->permute = PETSC_FALSE;
1914: ai = a->i; aj = a->j;
1915: } else { /* non-trivial permutation */
1916: a->permute = PETSC_TRUE;
1917: MatReorderingSeqSBAIJ(A, perm);
1918: ai = a->inew; aj = a->jnew;
1919: }
1920:
1921: /* initialization */
1922: ISGetIndices(perm,&rip);
1923: umax = (PetscInt)(f*ai[mbs] + 1);
1924: PetscMalloc(umax*sizeof(PetscInt),&lev);
1925: umax += mbs + 1;
1926: shift = mbs + 1;
1927: PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
1928: PetscMalloc(umax*sizeof(PetscInt),&ju);
1929: iu[0] = mbs + 1;
1930: juidx = mbs + 1;
1931: /* prowl: linked list for pivot row */
1932: PetscMalloc((3*mbs+1)*sizeof(PetscInt),&prowl);
1933: /* q: linked list for col index */
1934: q = prowl + mbs;
1935: levtmp = q + mbs;
1936:
1937: for (i=0; i<mbs; i++){
1938: prowl[i] = mbs;
1939: q[i] = 0;
1940: }
1942: /* for each row k */
1943: for (k=0; k<mbs; k++){
1944: nzk = 0;
1945: q[k] = mbs;
1946: /* copy current row into linked list */
1947: nz = ai[rip[k]+1] - ai[rip[k]];
1948: j = ai[rip[k]];
1949: while (nz--){
1950: vj = rip[aj[j++]];
1951: if (vj > k){
1952: qm = k;
1953: do {
1954: m = qm; qm = q[m];
1955: } while(qm < vj);
1956: if (qm == vj) {
1957: SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
1958: }
1959: nzk++;
1960: q[m] = vj;
1961: q[vj] = qm;
1962: levtmp[vj] = 0; /* initialize lev for nonzero element */
1963: }
1964: }
1966: /* modify nonzero structure of k-th row by computing fill-in
1967: for each row prow to be merged in */
1968: prow = k;
1969: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
1970:
1971: while (prow < k){
1972: /* merge row prow into k-th row */
1973: jmin = iu[prow] + 1;
1974: jmax = iu[prow+1];
1975: qm = k;
1976: for (j=jmin; j<jmax; j++){
1977: incrlev = lev[j-shift] + 1;
1978: if (incrlev > levels) continue;
1979: vj = ju[j];
1980: do {
1981: m = qm; qm = q[m];
1982: } while (qm < vj);
1983: if (qm != vj){ /* a fill */
1984: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1985: levtmp[vj] = incrlev;
1986: } else {
1987: if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1988: }
1989: }
1990: prow = prowl[prow]; /* next pivot row */
1991: }
1992:
1993: /* add k to row list for first nonzero element in k-th row */
1994: if (nzk > 1){
1995: i = q[k]; /* col value of first nonzero element in k_th row of U */
1996: prowl[k] = prowl[i]; prowl[i] = k;
1997: }
1998: iu[k+1] = iu[k] + nzk;
2000: /* allocate more space to ju and lev if needed */
2001: if (iu[k+1] > umax) {
2002: /* estimate how much additional space we will need */
2003: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2004: /* just double the memory each time */
2005: maxadd = umax;
2006: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2007: umax += maxadd;
2009: /* allocate a longer ju */
2010: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2011: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2012: PetscFree(ju);
2013: ju = jutmp;
2015: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2016: PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2017: PetscFree(lev);
2018: lev = jutmp;
2019: reallocs += 2; /* count how many times we realloc */
2020: }
2022: /* save nonzero structure of k-th row in ju */
2023: i=k;
2024: while (nzk --) {
2025: i = q[i];
2026: ju[juidx] = i;
2027: lev[juidx-shift] = levtmp[i];
2028: juidx++;
2029: }
2030: }
2031:
2032: #if defined(PETSC_USE_INFO)
2033: if (ai[mbs] != 0) {
2034: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2035: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
2036: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2037: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
2038: PetscInfo(A,"for best performance.\n");
2039: } else {
2040: PetscInfo(A,"Empty matrix.\n");
2041: }
2042: #endif
2044: ISRestoreIndices(perm,&rip);
2045: PetscFree(prowl);
2046: PetscFree(lev);
2048: /* put together the new matrix */
2049: MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);
2051: /* PetscLogObjectParent(B,iperm); */
2052: b = (Mat_SeqSBAIJ*)(B)->data;
2053: PetscFree2(b->imax,b->ilen);
2054: b->singlemalloc = PETSC_FALSE;
2055: b->free_a = PETSC_TRUE;
2056: b->free_ij = PETSC_TRUE;
2057: /* the next line frees the default space generated by the Create() */
2058: PetscFree3(b->a,b->j,b->i);
2059: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
2060: b->j = ju;
2061: b->i = iu;
2062: b->diag = 0;
2063: b->ilen = 0;
2064: b->imax = 0;
2065:
2066: if (b->row) {
2067: ISDestroy(b->row);
2068: }
2069: if (b->icol) {
2070: ISDestroy(b->icol);
2071: }
2072: b->row = perm;
2073: b->icol = perm;
2074: PetscObjectReference((PetscObject)perm);
2075: PetscObjectReference((PetscObject)perm);
2076: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
2077: /* In b structure: Free imax, ilen, old a, old j.
2078: Allocate idnew, solve_work, new a, new j */
2079: PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2080: b->maxnz = b->nz = iu[mbs];
2081:
2082: (B)->info.factor_mallocs = reallocs;
2083: (B)->info.fill_ratio_given = f;
2084: if (ai[mbs] != 0) {
2085: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2086: } else {
2087: (B)->info.fill_ratio_needed = 0.0;
2088: }
2089: MatSeqSBAIJSetNumericFactorization(B,perm_identity);
2090: return(0);
2091: }
2093: #include petscbt.h
2094: #include ../src/mat/utils/freespace.h
2097: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2098: {
2099: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2100: Mat_SeqSBAIJ *b;
2101: PetscErrorCode ierr;
2102: PetscTruth perm_identity,free_ij = PETSC_TRUE,missing;
2103: PetscInt bs=A->rmap->bs,am=a->mbs,d;
2104: const PetscInt *cols,*rip,*ai,*aj;
2105: PetscInt reallocs=0,i,*ui;
2106: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2107: PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2108: PetscReal fill=info->fill,levels=info->levels,ratio_needed;
2109: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2110: PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2111: PetscBT lnkbt;
2114: MatMissingDiagonal(A,&missing,&d);
2115: if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2117: /*
2118: This code originally uses Modified Sparse Row (MSR) storage
2119: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2120: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2121: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2122: thus the original code in MSR format is still used for these cases.
2123: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2124: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2125: */
2126: if (bs > 1){
2127: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2128: return(0);
2129: }
2131: /* check whether perm is the identity mapping */
2132: ISIdentity(perm,&perm_identity);
2133:
2134: /* special case that simply copies fill pattern */
2135: if (!levels && perm_identity) {
2136: a->permute = PETSC_FALSE;
2137: /* reuse the column pointers and row offsets for memory savings */
2138: ui = a->i;
2139: uj = a->j;
2140: free_ij = PETSC_FALSE;
2141: ratio_needed = 1.0;
2142: } else { /* case: levels>0 || (levels=0 && !perm_identity) */
2143: if (perm_identity){
2144: a->permute = PETSC_FALSE;
2145: ai = a->i; aj = a->j;
2146: } else { /* non-trivial permutation */
2147: a->permute = PETSC_TRUE;
2148: MatReorderingSeqSBAIJ(A, perm);
2149: ai = a->inew; aj = a->jnew;
2150: }
2151: ISGetIndices(perm,&rip);
2153: /* initialization */
2154: PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2155: ui[0] = 0;
2157: /* jl: linked list for storing indices of the pivot rows
2158: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2159: PetscMalloc((2*am+1)*sizeof(PetscInt)+2*am*sizeof(PetscInt*),&jl);
2160: il = jl + am;
2161: uj_ptr = (PetscInt**)(il + am);
2162: uj_lvl_ptr = (PetscInt**)(uj_ptr + am);
2163: for (i=0; i<am; i++){
2164: jl[i] = am; il[i] = 0;
2165: }
2167: /* create and initialize a linked list for storing column indices of the active row k */
2168: nlnk = am + 1;
2169: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2171: /* initial FreeSpace size is fill*(ai[am]+1) */
2172: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2173: current_space = free_space;
2174: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2175: current_space_lvl = free_space_lvl;
2177: for (k=0; k<am; k++){ /* for each active row k */
2178: /* initialize lnk by the column indices of row rip[k] */
2179: nzk = 0;
2180: ncols = ai[rip[k]+1] - ai[rip[k]];
2181: cols = aj+ai[rip[k]];
2182: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2183: nzk += nlnk;
2185: /* update lnk by computing fill-in for each pivot row to be merged in */
2186: prow = jl[k]; /* 1st pivot row */
2187:
2188: while (prow < k){
2189: nextprow = jl[prow];
2190:
2191: /* merge prow into k-th row */
2192: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2193: jmax = ui[prow+1];
2194: ncols = jmax-jmin;
2195: i = jmin - ui[prow];
2196: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2197: j = *(uj_lvl_ptr[prow] + i - 1);
2198: cols_lvl = uj_lvl_ptr[prow]+i;
2199: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2200: nzk += nlnk;
2202: /* update il and jl for prow */
2203: if (jmin < jmax){
2204: il[prow] = jmin;
2205: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2206: }
2207: prow = nextprow;
2208: }
2210: /* if free space is not available, make more free space */
2211: if (current_space->local_remaining<nzk) {
2212: i = am - k + 1; /* num of unfactored rows */
2213: i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2214: PetscFreeSpaceGet(i,&current_space);
2215: PetscFreeSpaceGet(i,&current_space_lvl);
2216: reallocs++;
2217: }
2219: /* copy data into free_space and free_space_lvl, then initialize lnk */
2220: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2222: /* add the k-th row into il and jl */
2223: if (nzk-1 > 0){
2224: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2225: jl[k] = jl[i]; jl[i] = k;
2226: il[k] = ui[k] + 1;
2227: }
2228: uj_ptr[k] = current_space->array;
2229: uj_lvl_ptr[k] = current_space_lvl->array;
2231: current_space->array += nzk;
2232: current_space->local_used += nzk;
2233: current_space->local_remaining -= nzk;
2234: current_space_lvl->array += nzk;
2235: current_space_lvl->local_used += nzk;
2236: current_space_lvl->local_remaining -= nzk;
2238: ui[k+1] = ui[k] + nzk;
2239: }
2241: #if defined(PETSC_USE_INFO)
2242: if (ai[am] != 0) {
2243: PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2244: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2245: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2246: PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2247: } else {
2248: PetscInfo(A,"Empty matrix.\n");
2249: }
2250: #endif
2252: ISRestoreIndices(perm,&rip);
2253: PetscFree(jl);
2255: /* destroy list of free space and other temporary array(s) */
2256: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2257: PetscFreeSpaceContiguous(&free_space,uj);
2258: PetscIncompleteLLDestroy(lnk,lnkbt);
2259: PetscFreeSpaceDestroy(free_space_lvl);
2260: if (ai[am] != 0) {
2261: ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2262: } else {
2263: ratio_needed = 0.0;
2264: }
2265: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2267: /* put together the new matrix in MATSEQSBAIJ format */
2268: MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
2270: b = (Mat_SeqSBAIJ*)(fact)->data;
2271: PetscFree2(b->imax,b->ilen);
2272: b->singlemalloc = PETSC_FALSE;
2273: b->free_a = PETSC_TRUE;
2274: b->free_ij = free_ij;
2275: PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2276: b->j = uj;
2277: b->i = ui;
2278: b->diag = 0;
2279: b->ilen = 0;
2280: b->imax = 0;
2281: b->row = perm;
2282: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2283: PetscObjectReference((PetscObject)perm);
2284: b->icol = perm;
2285: PetscObjectReference((PetscObject)perm);
2286: PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2287: b->maxnz = b->nz = ui[am];
2288:
2289: (fact)->info.factor_mallocs = reallocs;
2290: (fact)->info.fill_ratio_given = fill;
2291: (fact)->info.fill_ratio_needed = ratio_needed;
2292: MatSeqSBAIJSetNumericFactorization(fact,perm_identity);
2293: return(0);
2294: }