Actual source code: dgefa4.c
#define PETSCMAT_DLL

/*
     Inverts 4 by 4 matrix using partial pivoting.

     Used by the sparse factorization routines in
       src/mat/impls/baij/seq

     See also src/inline/ilu.h

     This is a combination of the Linpack routines
     dgefa() and dgedi() specialized for a size of 4.
*/
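/*
   Outline of the routine below (following LINPACK dgefa/dgedi): factor the
   4x4 block as A = P*L*U in place with partial pivoting, recording the row
   interchanges in ipvt[]; then compute inv(U), form inv(U)*inv(L), and
   finally undo the interchanges by column swaps so that the array holds
   inv(A) on return.  A nonzero shift is substituted for a zero pivot
   instead of raising an error.
*/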
#include "petsc.h"

PetscErrorCode Kernel_A_gets_inverse_A_4(MatScalar *a,PetscReal shift)
{
  PetscInt  i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
  PetscInt  k4,j3;
  MatScalar *aa,*ax,*ay,work[16],stmp;
  MatReal   tmp,max;
  /* gaussian elimination with partial pivoting */

  shift = .25*shift*(PetscAbsScalar(a[0]) + PetscAbsScalar(a[5]) + PetscAbsScalar(a[10]) + PetscAbsScalar(a[15]));

  /* Parameter adjustments */
  a -= 5;

  for (k = 1; k <= 3; ++k) {
    kp1 = k + 1;
    k3  = 4*k;
    k4  = k3 + k;

    /* find l = pivot index */
    i__2 = 5 - k;
    aa   = &a[k4];
    max  = PetscAbsScalar(aa[0]);
    l    = 1;
    for (ll=1; ll<i__2; ll++) {
      tmp = PetscAbsScalar(aa[ll]);
      if (tmp > max) { max = tmp; l = ll+1;}
    }
    l         += k - 1;
    ipvt[k-1]  = l;

    if (a[l + k3] == 0.0) {
      if (shift == 0.0) {
        SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
      } else {
        /* SHIFT is applied to SINGLE diagonal entry; does this make any sense? */
        a[l + k3] = shift;
      }
    }

    /* interchange if necessary */
    if (l != k) {
      stmp      = a[l + k3];
      a[l + k3] = a[k4];
      a[k4]     = stmp;
    }

    /* compute multipliers */
    stmp = -1. / a[k4];
    i__2 = 4 - k;
    aa   = &a[1 + k4];
    for (ll=0; ll<i__2; ll++) {
      aa[ll] *= stmp;
    }

    /* row elimination with column indexing */
    ax = &a[k4+1];
    for (j = kp1; j <= 4; ++j) {
      j3   = 4*j;
      stmp = a[l + j3];
      if (l != k) {
        a[l + j3] = a[k + j3];
        a[k + j3] = stmp;
      }

      i__3 = 4 - k;
      ay   = &a[1+k+j3];
      for (ll=0; ll<i__3; ll++) {
        ay[ll] += stmp*ax[ll];
      }
    }
  }
  ipvt[3] = 4;
  if (a[20] == 0.0) {
    SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);
  }

  /*
     Now form the inverse
  */

  /* compute inverse(u) */
  for (k = 1; k <= 4; ++k) {
    k3    = 4*k;
    k4    = k3 + k;
    a[k4] = 1.0 / a[k4];
    stmp  = -a[k4];
    i__2  = k - 1;
    aa    = &a[k3 + 1];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
    kp1 = k + 1;
    if (4 < kp1) continue;
    ax = aa;
    for (j = kp1; j <= 4; ++j) {
      j3        = 4*j;
      stmp      = a[k + j3];
      a[k + j3] = 0.0;
      ay        = &a[j3 + 1];
      for (ll=0; ll<k; ll++) {
        ay[ll] += stmp*ax[ll];
      }
    }
  }

  /* form inverse(u)*inverse(l) */
  for (kb = 1; kb <= 3; ++kb) {
    k   = 4 - kb;
    k3  = 4*k;
    kp1 = k + 1;
    aa  = a + k3;
    for (i = kp1; i <= 4; ++i) {
      work[i-1] = aa[i];
      aa[i]     = 0.0;
    }
    for (j = kp1; j <= 4; ++j) {
      stmp   = work[j-1];
      ax     = &a[4*j + 1];
      ay     = &a[k3 + 1];
      ay[0] += stmp*ax[0];
      ay[1] += stmp*ax[1];
      ay[2] += stmp*ax[2];
      ay[3] += stmp*ax[3];
    }
    l = ipvt[k-1];
    if (l != k) {
      ax = &a[k3 + 1];
      ay = &a[4*l + 1];
      stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
      stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
      stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
      stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
    }
  }
  return(0);
}
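
/*
   Editor's usage sketch, not part of the original dgefa4.c: a minimal,
   hypothetical driver that inverts one 4x4 block in place with the kernel
   above.  The include, the extern prototype and the test matrix are
   assumptions; as in the shift computation above, the diagonal of the block
   sits at entries 0, 5, 10 and 15.
*/
#include "petsc.h"    /* assumed, as in the file above, to declare MatScalar */

extern PetscErrorCode Kernel_A_gets_inverse_A_4(MatScalar*,PetscReal);

static PetscErrorCode invert_block_example(void)
{
  /* diag(4,3,2,1); any dense 4x4 block in this layout works the same way */
  MatScalar      a[16] = {4.,0.,0.,0., 0.,3.,0.,0., 0.,0.,2.,0., 0.,0.,0.,1.};
  PetscErrorCode ierr;

  ierr = Kernel_A_gets_inverse_A_4(a,0.0);  /* shift = 0: error on a zero pivot */
  if (ierr) return ierr;
  /* a now holds diag(1/4, 1/3, 1/2, 1) */
  return 0;
}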

#if defined(PETSC_HAVE_SSE)
#include PETSC_HAVE_SSE

PetscErrorCode Kernel_A_gets_inverse_A_4_SSE(float *a)
{
  /*
     This routine is converted from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
     Order Number: 245043-001
     March 1999
     http://www.intel.com

     Inverse of a 4x4 matrix via Cramer's Rule:
     bool Invert4x4(SMLXMatrix &);
  */

  SSE_SCOPE_BEGIN;
  SSE_INLINE_BEGIN_1(a)

  /* ----------------------------------------------- */

  /* Load the 16 coefficients and regroup them into XMM3-XMM6 for the
     cofactor computation below, following the Intel sample code. */
  SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

  SSE_COPY_PS(XMM3,XMM0)
  SSE_SHUFFLE(XMM3,XMM5,0x88)
  SSE_SHUFFLE(XMM5,XMM0,0xDD)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

  SSE_COPY_PS(XMM4,XMM0)
  SSE_SHUFFLE(XMM4,XMM6,0x88)
  SSE_SHUFFLE(XMM6,XMM0,0xDD)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM6)
  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM5)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM1,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)
  SSE_SHUFFLE(XMM0,XMM0,0xB1)

  SSE_COPY_PS(XMM2,XMM6)
  SSE_MULT_PS(XMM2,XMM0)
  SSE_ADD_PS(XMM2,XMM1)

  SSE_COPY_PS(XMM7,XMM3)
  SSE_MULT_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM0)
  SSE_SUB_PS(XMM2,XMM1)

  SSE_MULT_PS(XMM0,XMM3)
  SSE_SUB_PS(XMM0,XMM7)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM5)
  SSE_SHUFFLE(XMM7,XMM5,0x4E)
  SSE_MULT_PS(XMM7,XMM6)
  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_COPY_PS(XMM0,XMM4)
  SSE_MULT_PS(XMM0,XMM7)
  SSE_ADD_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM4)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM0,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM5)
  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM0,XMM6)
  SSE_MULT_PS(XMM0,XMM1)
  SSE_ADD_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM6)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_SUB_PS(XMM2,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM6)
  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
  SSE_SUB_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM5)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_ADD_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_ADD_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM5)
  SSE_SUB_PS(XMM2,XMM1)

  /* ----------------------------------------------- */

  SSE_MULT_PS(XMM4,XMM3)
  SSE_SHUFFLE(XMM4,XMM4,0xB1)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM4)
  SSE_ADD_PS(XMM1,XMM7)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_MULT_PS(XMM6,XMM4)
  SSE_SUB_PS(XMM1,XMM6)

  SSE_MULT_PS(XMM5,XMM4)
  SSE_ADD_PS(XMM5,XMM7)

  /* ----------------------------------------------- */

  /* Determinant: dot product of the cofactors stored at FLOAT_0 with the
     corresponding entries of A (XMM3), reduced horizontally into the low
     word of XMM6. */
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_MULT_PS(XMM3,XMM0)

  SSE_COPY_PS(XMM4,XMM3)
  SSE_SHUFFLE(XMM4,XMM3,0x4E)
  SSE_ADD_PS(XMM4,XMM3)

  SSE_COPY_PS(XMM6,XMM4)
  SSE_SHUFFLE(XMM6,XMM4,0xB1)
  SSE_ADD_SS(XMM6,XMM4)

  /* Approximate 1/det with the reciprocal instruction, then refine it with
     one Newton-Raphson step: r' = 2*r - det*r*r. */
  SSE_COPY_PS(XMM3,XMM6)
  SSE_RECIP_SS(XMM3,XMM6)
  SSE_COPY_SS(XMM4,XMM3)
  SSE_ADD_SS(XMM4,XMM3)
  SSE_MULT_SS(XMM3,XMM3)
  SSE_MULT_SS(XMM6,XMM3)
  SSE_SUB_SS(XMM4,XMM6)

  SSE_SHUFFLE(XMM4,XMM4,0x00)

  /* Scale the four cofactor rows by 1/det and store the inverse. */
  SSE_MULT_PS(XMM0,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

  SSE_MULT_PS(XMM2,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

  SSE_MULT_PS(XMM4,XMM5)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

  /* ----------------------------------------------- */

  SSE_INLINE_END_1;
  SSE_SCOPE_END;

  return(0);
}

#endif
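
/*
   Editor's sketch, not part of the PETSc sources: a plain-C reference for the
   Cramer's-rule approach the SSE routine above implements (adjugate divided
   by the determinant).  Function names are hypothetical; the layout is
   row-major, and no pivoting or shifting is performed.
*/
static float minor3(const float m[16],int r,int c)
{
  /* determinant of the 3x3 minor obtained by deleting row r and column c */
  int   rows[3],cols[3],i,j,k;
  float s[9];
  for (i=0,k=0; i<4; i++) if (i != r) rows[k++] = i;
  for (j=0,k=0; j<4; j++) if (j != c) cols[k++] = j;
  for (i=0; i<3; i++) for (j=0; j<3; j++) s[3*i+j] = m[4*rows[i]+cols[j]];
  return s[0]*(s[4]*s[8]-s[5]*s[7])
       - s[1]*(s[3]*s[8]-s[5]*s[6])
       + s[2]*(s[3]*s[7]-s[4]*s[6]);
}

static int invert4x4_cramer(float a[16])
{
  /* cofactors, determinant by expansion along row 0, then a = adj(a)/det */
  float cof[16],det = 0.0f;
  int   i,j;
  for (i=0; i<4; i++)
    for (j=0; j<4; j++) cof[4*i+j] = (((i+j)%2) ? -1.0f : 1.0f)*minor3(a,i,j);
  for (j=0; j<4; j++) det += a[j]*cof[j];
  if (det == 0.0f) return -1;                  /* singular matrix */
  for (i=0; i<4; i++)
    for (j=0; j<4; j++) a[4*i+j] = cof[4*j+i]/det;
  return 0;
}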