Actual source code: dgefa4.c

#define PETSCMAT_DLL

/*
       Inverts a 4 by 4 matrix using partial pivoting.

       Used by the sparse factorization routines in
     src/mat/impls/baij/seq

       See also src/inline/ilu.h

       This is a combination of the Linpack routines
    dgefa() and dgedi() specialized for a size of 4.

*/
#include "petsc.h"

PetscErrorCode Kernel_A_gets_inverse_A_4(MatScalar *a,PetscReal shift)
{
    PetscInt   i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
    PetscInt   k4,j3;
    MatScalar  *aa,*ax,*ay,work[16],stmp;
    MatReal    tmp,max;

/*     Gaussian elimination with partial pivoting */

    shift = .25*shift*(PetscAbsScalar(a[0]) + PetscAbsScalar(a[5]) + PetscAbsScalar(a[10]) + PetscAbsScalar(a[15]));
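
    /* The caller-supplied shift is rescaled by one quarter of the sum of the
       absolute values of the diagonal entries, so the value substituted for an
       exactly zero pivot below is proportional to the overall magnitude of the
       matrix. */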
    /* Parameter adjustments */
    a       -= 5;
    for (k = 1; k <= 3; ++k) {
        kp1 = k + 1;
        k3  = 4*k;
        k4  = k3 + k;
/*        find l = pivot index */

        i__2 = 5 - k;
        aa = &a[k4];
        max = PetscAbsScalar(aa[0]);
        l = 1;
        for (ll=1; ll<i__2; ll++) {
          tmp = PetscAbsScalar(aa[ll]);
          if (tmp > max) { max = tmp; l = ll+1;}
        }
        l       += k - 1;
        ipvt[k-1] = l;
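        /* aa points at the diagonal entry a(k,k); the loop above scans the
           5-k entries of column k from the diagonal downward, so l is now the
           row index of the entry of largest magnitude, recorded in ipvt. */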

        if (a[l + k3] == 0.0) {
          if (shift == 0.0) {
            SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
          } else {
            /* SHIFT is applied to SINGLE diagonal entry; does this make any sense? */
            a[l + k3] = shift;
          }
        }

/*           interchange if necessary */

        if (l != k) {
          stmp      = a[l + k3];
          a[l + k3] = a[k4];
          a[k4]     = stmp;
        }

/*           compute multipliers */

        stmp = -1. / a[k4];
        i__2 = 4 - k;
        aa = &a[1 + k4];
        for (ll=0; ll<i__2; ll++) {
          aa[ll] *= stmp;
        }
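        /* The 4-k entries of column k below the diagonal are scaled by
           -1/pivot; these negated multipliers are the column of L stored in
           place and reused by the rank-1 update below. */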

/*           row elimination with column indexing */

        ax = &a[k4+1];
        for (j = kp1; j <= 4; ++j) {
            j3   = 4*j;
            stmp = a[l + j3];
            if (l != k) {
              a[l + j3] = a[k + j3];
              a[k + j3] = stmp;
            }

            i__3 = 4 - k;
            ay = &a[1+k+j3];
            for (ll=0; ll<i__3; ll++) {
              ay[ll] += stmp*ax[ll];
            }
        }
    }
    ipvt[3] = 4;
    if (a[20] == 0.0) {
      SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);
    }
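
    /* At this point a holds the LU factorization of the row-permuted matrix:
       the negated multipliers of L below the diagonal and U on and above it,
       with the pivot rows recorded in ipvt. */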

    /*
         Now form the inverse
    */

    /*     compute inverse(u) */
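    /* Column-oriented inversion of the upper triangle, as in Linpack dgedi:
       each diagonal entry is inverted, the column above it is scaled by the
       negated reciprocal, and the columns to its right are updated so that the
       upper triangle of a is overwritten with inverse(U). */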

    for (k = 1; k <= 4; ++k) {
        k3    = 4*k;
        k4    = k3 + k;
        a[k4] = 1.0 / a[k4];
        stmp  = -a[k4];
        i__2  = k - 1;
        aa    = &a[k3 + 1];
        for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
        kp1 = k + 1;
        if (4 < kp1) continue;
        ax = aa;
        for (j = kp1; j <= 4; ++j) {
            j3        = 4*j;
            stmp      = a[k + j3];
            a[k + j3] = 0.0;
            ay        = &a[j3 + 1];
            for (ll=0; ll<k; ll++) {
              ay[ll] += stmp*ax[ll];
            }
        }
    }

    /*    form inverse(u)*inverse(l) */
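    /* Backward sweep over the pivot columns k = 3,2,1: the stored multipliers
       of column k are saved in work[] and zeroed, inverse(U)*inverse(L) is
       accumulated one column at a time, and columns k and ipvt[k-1] are then
       swapped to undo the row interchanges, leaving inverse(A) in a. */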

    for (kb = 1; kb <= 3; ++kb) {
        k   = 4 - kb;
        k3  = 4*k;
        kp1 = k + 1;
        aa  = a + k3;
        for (i = kp1; i <= 4; ++i) {
            work[i-1] = aa[i];
            aa[i]   = 0.0;
        }
        for (j = kp1; j <= 4; ++j) {
            stmp  = work[j-1];
            ax    = &a[4*j + 1];
            ay    = &a[k3 + 1];
            ay[0] += stmp*ax[0];
            ay[1] += stmp*ax[1];
            ay[2] += stmp*ax[2];
            ay[3] += stmp*ax[3];
        }
        l = ipvt[k-1];
        if (l != k) {
            ax = &a[k3 + 1];
            ay = &a[4*l + 1];
            stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
            stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
            stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
            stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
        }
    }
    return(0);
}
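
/*
   Illustrative usage sketch (not part of the original file).  The 16 MatScalar
   entries are stored contiguously, column major as in the Linpack original;
   since inv(A^T) = (inv(A))^T, a row-major matrix passed here simply comes
   back with its inverse in row-major order.  The helper name below is
   hypothetical and the plain return-code check stands in for the usual PETSc
   error macros.
*/
static PetscErrorCode test_Kernel_A_gets_inverse_A_4(void)
{
  /* diag(4,3,2,1); after the call a holds diag(0.25, 1/3, 0.5, 1.0) */
  MatScalar      a[16] = {4.0,0.0,0.0,0.0, 0.0,3.0,0.0,0.0,
                          0.0,0.0,2.0,0.0, 0.0,0.0,0.0,1.0};
  PetscErrorCode ierr;

  ierr = Kernel_A_gets_inverse_A_4(a,0.0);
  if (ierr) return ierr;
  return 0;
}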

#if defined(PETSC_HAVE_SSE)
#include PETSC_HAVE_SSE

PetscErrorCode Kernel_A_gets_inverse_A_4_SSE(float *a)
{
  /*
     This routine is converted from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
     Order Number: 245043-001
     March 1999
     http://www.intel.com

     Inverse of a 4x4 matrix via Cramer's Rule:
     bool Invert4x4(SMLXMatrix &);
  */
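
  /*
     In outline: the shuffles below gather the matrix in transposed form, the
     rows of the adjugate (cofactor) matrix are accumulated from products of
     2x2 sub-determinants, the determinant is reduced to the low lane as a dot
     product, its reciprocal is obtained from SSE_RECIP_SS plus one
     Newton-Raphson refinement, and the adjugate rows scaled by that
     reciprocal overwrite a with the inverse.
  */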

  SSE_SCOPE_BEGIN;
    SSE_INLINE_BEGIN_1(a)

/* ----------------------------------------------- */

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

      SSE_COPY_PS(XMM3,XMM0)
      SSE_SHUFFLE(XMM3,XMM5,0x88)

      SSE_SHUFFLE(XMM5,XMM0,0xDD)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

      SSE_COPY_PS(XMM4,XMM0)
      SSE_SHUFFLE(XMM4,XMM6,0x88)

      SSE_SHUFFLE(XMM6,XMM0,0xDD)
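
      /* The loads and shuffles above gather the 16 entries into XMM3..XMM6 in
         a transposed (and partly lane-rotated) layout; the cofactor products
         below are expressed against this layout, following Intel's reference
         code. */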

/* ----------------------------------------------- */

      SSE_COPY_PS(XMM7,XMM4)
      SSE_MULT_PS(XMM7,XMM6)

      SSE_SHUFFLE(XMM7,XMM7,0xB1)

      SSE_COPY_PS(XMM0,XMM5)
      SSE_MULT_PS(XMM0,XMM7)

      SSE_COPY_PS(XMM2,XMM3)
      SSE_MULT_PS(XMM2,XMM7)

      SSE_SHUFFLE(XMM7,XMM7,0x4E)

      SSE_COPY_PS(XMM1,XMM5)
      SSE_MULT_PS(XMM1,XMM7)
      SSE_SUB_PS(XMM1,XMM0)

      SSE_MULT_PS(XMM7,XMM3)
      SSE_SUB_PS(XMM7,XMM2)

      SSE_SHUFFLE(XMM7,XMM7,0x4E)
      SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)
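
      /* The input array doubles as scratch space here: partially accumulated
         cofactor rows are parked in a[4..7] (and later a[12..15]) and reloaded
         further down, before the final results overwrite them. */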

/* ----------------------------------------------- */

      SSE_COPY_PS(XMM0,XMM5)
      SSE_MULT_PS(XMM0,XMM4)

      SSE_SHUFFLE(XMM0,XMM0,0xB1)

      SSE_COPY_PS(XMM2,XMM6)
      SSE_MULT_PS(XMM2,XMM0)
      SSE_ADD_PS(XMM2,XMM1)

      SSE_COPY_PS(XMM7,XMM3)
      SSE_MULT_PS(XMM7,XMM0)

      SSE_SHUFFLE(XMM0,XMM0,0x4E)

      SSE_COPY_PS(XMM1,XMM6)
      SSE_MULT_PS(XMM1,XMM0)
      SSE_SUB_PS(XMM2,XMM1)

      SSE_MULT_PS(XMM0,XMM3)
      SSE_SUB_PS(XMM0,XMM7)

      SSE_SHUFFLE(XMM0,XMM0,0x4E)
      SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

      /* ----------------------------------------------- */

      SSE_COPY_PS(XMM7,XMM5)
      SSE_SHUFFLE(XMM7,XMM5,0x4E)
      SSE_MULT_PS(XMM7,XMM6)

      SSE_SHUFFLE(XMM7,XMM7,0xB1)

      SSE_SHUFFLE(XMM4,XMM4,0x4E)

      SSE_COPY_PS(XMM0,XMM4)
      SSE_MULT_PS(XMM0,XMM7)
      SSE_ADD_PS(XMM0,XMM2)

      SSE_COPY_PS(XMM2,XMM3)
      SSE_MULT_PS(XMM2,XMM7)

      SSE_SHUFFLE(XMM7,XMM7,0x4E)

      SSE_COPY_PS(XMM1,XMM4)
      SSE_MULT_PS(XMM1,XMM7)
      SSE_SUB_PS(XMM0,XMM1)
      SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

      SSE_MULT_PS(XMM7,XMM3)
      SSE_SUB_PS(XMM7,XMM2)

      SSE_SHUFFLE(XMM7,XMM7,0x4E)

      /* ----------------------------------------------- */

      SSE_COPY_PS(XMM1,XMM3)
      SSE_MULT_PS(XMM1,XMM5)

      SSE_SHUFFLE(XMM1,XMM1,0xB1)

      SSE_COPY_PS(XMM0,XMM6)
      SSE_MULT_PS(XMM0,XMM1)
      SSE_ADD_PS(XMM0,XMM7)

      SSE_COPY_PS(XMM2,XMM4)
      SSE_MULT_PS(XMM2,XMM1)
      SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

      SSE_SHUFFLE(XMM1,XMM1,0x4E)

      SSE_COPY_PS(XMM7,XMM6)
      SSE_MULT_PS(XMM7,XMM1)
      SSE_SUB_PS(XMM7,XMM0)

      SSE_MULT_PS(XMM1,XMM4)
      SSE_SUB_PS(XMM2,XMM1)
      SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

      /* ----------------------------------------------- */

      SSE_COPY_PS(XMM1,XMM3)
      SSE_MULT_PS(XMM1,XMM6)

      SSE_SHUFFLE(XMM1,XMM1,0xB1)

      SSE_COPY_PS(XMM2,XMM4)
      SSE_MULT_PS(XMM2,XMM1)
      SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
      SSE_SUB_PS(XMM0,XMM2)

      SSE_COPY_PS(XMM2,XMM5)
      SSE_MULT_PS(XMM2,XMM1)
      SSE_ADD_PS(XMM2,XMM7)

      SSE_SHUFFLE(XMM1,XMM1,0x4E)

      SSE_COPY_PS(XMM7,XMM4)
      SSE_MULT_PS(XMM7,XMM1)
      SSE_ADD_PS(XMM7,XMM0)

      SSE_MULT_PS(XMM1,XMM5)
      SSE_SUB_PS(XMM2,XMM1)

      /* ----------------------------------------------- */

      SSE_MULT_PS(XMM4,XMM3)

      SSE_SHUFFLE(XMM4,XMM4,0xB1)

      SSE_COPY_PS(XMM1,XMM6)
      SSE_MULT_PS(XMM1,XMM4)
      SSE_ADD_PS(XMM1,XMM7)

      SSE_COPY_PS(XMM0,XMM5)
      SSE_MULT_PS(XMM0,XMM4)
      SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_SUB_PS(XMM7,XMM0)

      SSE_SHUFFLE(XMM4,XMM4,0x4E)

      SSE_MULT_PS(XMM6,XMM4)
      SSE_SUB_PS(XMM1,XMM6)

      SSE_MULT_PS(XMM5,XMM4)
      SSE_ADD_PS(XMM5,XMM7)

      /* ----------------------------------------------- */

      SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_MULT_PS(XMM3,XMM0)

      SSE_COPY_PS(XMM4,XMM3)
      SSE_SHUFFLE(XMM4,XMM3,0x4E)
      SSE_ADD_PS(XMM4,XMM3)

      SSE_COPY_PS(XMM6,XMM4)
      SSE_SHUFFLE(XMM6,XMM4,0xB1)
      SSE_ADD_SS(XMM6,XMM4)

      SSE_COPY_PS(XMM3,XMM6)
      SSE_RECIP_SS(XMM3,XMM6)
      SSE_COPY_SS(XMM4,XMM3)
      SSE_ADD_SS(XMM4,XMM3)
      SSE_MULT_SS(XMM3,XMM3)
      SSE_MULT_SS(XMM6,XMM3)
      SSE_SUB_SS(XMM4,XMM6)
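
      /* The shuffles and adds above reduce the elementwise product of the
         first cofactor row with the corresponding row of the transpose to the
         determinant in the low lane; SSE_RECIP_SS gives an approximate 1/det,
         which the multiply/subtract sequence refines with one Newton-Raphson
         step (r' = 2r - det*r*r) before it is broadcast and used to scale the
         four cofactor rows below. */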

      SSE_SHUFFLE(XMM4,XMM4,0x00)

      SSE_MULT_PS(XMM0,XMM4)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_MULT_PS(XMM1,XMM4)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_MULT_PS(XMM2,XMM4)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_MULT_PS(XMM4,XMM5)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

      /* ----------------------------------------------- */

      SSE_INLINE_END_1;
  SSE_SCOPE_END;

  return(0);
}

#endif