Actual source code: baijfact12.c
1: #define PETSCMAT_DLL
3: /*
4:     Numeric LU factorization kernels for the sequential BAIJ format (4x4 blocks, natural ordering).
5: */
6: #include "../src/mat/impls/baij/seq/baij.h"
7: #include "../src/inline/ilu.h"
11: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
12: {
13: /*
14: Default (plain C) version for 4x4 blocks using the natural ordering
15: */
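/*
   Overview of the loop below: for each block row i,
     1. scatter the 4x4 blocks of row i of A into the dense work row rtmp;
     2. for every already-factored row r < i in the pattern, form the block
        multiplier m = rtmp(i,r) * inv(U(r,r)) (diagonal blocks are stored
        already inverted) and subtract m times the blocks to the right of the
        diagonal of row r from rtmp;
     3. gather rtmp back into row i of the factor and invert its diagonal block.
*/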
16: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
18: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
19: PetscInt *ajtmpold,*ajtmp,nz,row;
20: PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
21: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
22: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
23: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
24: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
25: MatScalar m13,m14,m15,m16;
26: MatScalar *ba = b->a,*aa = a->a;
27: PetscTruth pivotinblocks = b->pivotinblocks;
28: PetscReal shift = info->shiftinblocks;
31: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
33: for (i=0; i<n; i++) {
34: nz = bi[i+1] - bi[i];
35: ajtmp = bj + bi[i];
36: for (j=0; j<nz; j++) {
37: x = rtmp+16*ajtmp[j];
38: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
39: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
40: }
41: /* load in the initial (unfactored) row */
42: nz = ai[i+1] - ai[i];
43: ajtmpold = aj + ai[i];
44: v = aa + 16*ai[i];
45: for (j=0; j<nz; j++) {
46: x = rtmp+16*ajtmpold[j];
47: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
48: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
49: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
50: x[14] = v[14]; x[15] = v[15];
51: v += 16;
52: }
53: row = *ajtmp++;
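/* eliminate row i against each previously factored block row that appears in its pattern */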
54: while (row < i) {
55: pc = rtmp + 16*row;
56: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
57: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
58: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
59: p15 = pc[14]; p16 = pc[15];
60: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
61: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
62: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
63: || p16 != 0.0) {
64: pv = ba + 16*diag_offset[row];
65: pj = bj + diag_offset[row] + 1;
66: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
67: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
68: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
69: x15 = pv[14]; x16 = pv[15];
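/* form the multiplier m = pc * inv(U(row,row)); pv holds the already-inverted
   diagonal block of row 'row', with the 4x4 blocks laid out column by column */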
70: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
71: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
72: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
73: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
75: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
76: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
77: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
78: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
80: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
81: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
82: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
83: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
85: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
86: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
87: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
88: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
89: nz = bi[row+1] - diag_offset[row] - 1;
90: pv += 16;
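/* update: for each block to the right of the diagonal in row 'row',
   subtract m * U(row,col) from the corresponding block of rtmp */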
91: for (j=0; j<nz; j++) {
92: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
93: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
94: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
95: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
96: x = rtmp + 16*pj[j];
97: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
98: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
99: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
100: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
102: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
103: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
104: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
105: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
107: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
108: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
109: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
110: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
112: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
113: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
114: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
115: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
117: pv += 16;
118: }
119: PetscLogFlops(128*nz+112);
120: }
121: row = *ajtmp++;
122: }
123: /* finished row so stick it into b->a */
124: pv = ba + 16*bi[i];
125: pj = bj + bi[i];
126: nz = bi[i+1] - bi[i];
127: for (j=0; j<nz; j++) {
128: x = rtmp+16*pj[j];
129: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
130: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
131: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
132: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
133: pv += 16;
134: }
135: /* invert diagonal block */
136: w = ba + 16*diag_offset[i];
137: if (pivotinblocks) {
138: Kernel_A_gets_inverse_A_4(w,shift);
139: } else {
140: Kernel_A_gets_inverse_A_4_nopivot(w);
141: }
142: }
144: PetscFree(rtmp);
145: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
146: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
147: C->assembled = PETSC_TRUE;
148: PetscLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
149: return(0);
150: }
153: #if defined(PETSC_HAVE_SSE)
155: #include PETSC_HAVE_SSE
157: /* SSE version for 4x4 blocks using the natural ordering */
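/*
   Same algorithm as the plain C routine above, hand-vectorized with PETSc's
   SSE macro layer. Each 4x4 block is processed as four 4-wide SSE registers,
   which presumes MatScalar is single precision in this configuration; the
   arrays aa, ba and rtmp must be 16-byte aligned (checked below).
*/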
160: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C,Mat A,const MatFactorInfo *info)
161: {
162: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
164: int i,j,n = a->mbs;
165: int *bj = b->j,*bjtmp,*pj;
166: int row;
167: int *ajtmpold,nz,*bi=b->i;
168: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
169: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
170: MatScalar *ba = b->a,*aa = a->a;
171: int nonzero=0;
172: /* int nonzero=0,colscale = 16; */
173: PetscTruth pivotinblocks = b->pivotinblocks;
174: PetscReal shift = info->shiftinblocks;
177: SSE_SCOPE_BEGIN;
179: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
180: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
181: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
182: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
183: /* if ((unsigned long)bj==(unsigned long)aj) { */
184: /* colscale = 4; */
185: /* } */
186: for (i=0; i<n; i++) {
187: nz = bi[i+1] - bi[i];
188: bjtmp = bj + bi[i];
189: /* zero out the 4x4 block accumulators */
190: /* zero out one register */
191: XOR_PS(XMM7,XMM7);
192: for (j=0; j<nz; j++) {
193: x = rtmp+16*bjtmp[j];
194: /* x = rtmp+4*bjtmp[j]; */
195: SSE_INLINE_BEGIN_1(x)
196: /* Copy zero register to memory locations */
197: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
198: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
199: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
200: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
201: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
202: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
203: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
204: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
205: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
206: SSE_INLINE_END_1;
207: }
208: /* load in the initial (unfactored) row */
209: nz = ai[i+1] - ai[i];
210: ajtmpold = aj + ai[i];
211: v = aa + 16*ai[i];
212: for (j=0; j<nz; j++) {
213: x = rtmp+16*ajtmpold[j];
214: /* x = rtmp+colscale*ajtmpold[j]; */
215: /* Copy v block into x block */
216: SSE_INLINE_BEGIN_2(v,x)
217: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
218: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
219: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
221: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
222: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
224: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
225: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
227: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
228: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
230: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
231: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
233: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
234: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
236: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
237: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
239: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
240: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
241: SSE_INLINE_END_2;
243: v += 16;
244: }
245: /* row = (*bjtmp++)/4; */
246: row = *bjtmp++;
247: while (row < i) {
248: pc = rtmp + 16*row;
249: SSE_INLINE_BEGIN_1(pc)
250: /* Load block from lower triangle */
251: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
252: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
253: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
255: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
256: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
258: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
259: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
261: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
262: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
264: /* Compare block to zero block */
266: SSE_COPY_PS(XMM4,XMM7)
267: SSE_CMPNEQ_PS(XMM4,XMM0)
269: SSE_COPY_PS(XMM5,XMM7)
270: SSE_CMPNEQ_PS(XMM5,XMM1)
272: SSE_COPY_PS(XMM6,XMM7)
273: SSE_CMPNEQ_PS(XMM6,XMM2)
275: SSE_CMPNEQ_PS(XMM7,XMM3)
277: /* Reduce the comparisons to one SSE register */
278: SSE_OR_PS(XMM6,XMM7)
279: SSE_OR_PS(XMM5,XMM4)
280: SSE_OR_PS(XMM5,XMM6)
281: SSE_INLINE_END_1;
283: /* Reduce the one SSE register to an integer register for branching */
284: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
285: MOVEMASK(nonzero,XMM5);
287: /* If block is nonzero ... */
288: if (nonzero) {
289: pv = ba + 16*diag_offset[row];
290: PREFETCH_L1(&pv[16]);
291: pj = bj + diag_offset[row] + 1;
293: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
294: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
295: /* but the diagonal was inverted already */
296: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
298: SSE_INLINE_BEGIN_2(pv,pc)
299: /* Column 0, product is accumulated in XMM4 */
300: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
301: SSE_SHUFFLE(XMM4,XMM4,0x00)
302: SSE_MULT_PS(XMM4,XMM0)
304: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
305: SSE_SHUFFLE(XMM5,XMM5,0x00)
306: SSE_MULT_PS(XMM5,XMM1)
307: SSE_ADD_PS(XMM4,XMM5)
309: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
310: SSE_SHUFFLE(XMM6,XMM6,0x00)
311: SSE_MULT_PS(XMM6,XMM2)
312: SSE_ADD_PS(XMM4,XMM6)
314: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
315: SSE_SHUFFLE(XMM7,XMM7,0x00)
316: SSE_MULT_PS(XMM7,XMM3)
317: SSE_ADD_PS(XMM4,XMM7)
319: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
320: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
322: /* Column 1, product is accumulated in XMM5 */
323: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
324: SSE_SHUFFLE(XMM5,XMM5,0x00)
325: SSE_MULT_PS(XMM5,XMM0)
327: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
328: SSE_SHUFFLE(XMM6,XMM6,0x00)
329: SSE_MULT_PS(XMM6,XMM1)
330: SSE_ADD_PS(XMM5,XMM6)
332: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
333: SSE_SHUFFLE(XMM7,XMM7,0x00)
334: SSE_MULT_PS(XMM7,XMM2)
335: SSE_ADD_PS(XMM5,XMM7)
337: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
338: SSE_SHUFFLE(XMM6,XMM6,0x00)
339: SSE_MULT_PS(XMM6,XMM3)
340: SSE_ADD_PS(XMM5,XMM6)
342: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
343: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
345: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
347: /* Column 2, product is accumulated in XMM6 */
348: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
349: SSE_SHUFFLE(XMM6,XMM6,0x00)
350: SSE_MULT_PS(XMM6,XMM0)
352: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
353: SSE_SHUFFLE(XMM7,XMM7,0x00)
354: SSE_MULT_PS(XMM7,XMM1)
355: SSE_ADD_PS(XMM6,XMM7)
357: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
358: SSE_SHUFFLE(XMM7,XMM7,0x00)
359: SSE_MULT_PS(XMM7,XMM2)
360: SSE_ADD_PS(XMM6,XMM7)
362: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
363: SSE_SHUFFLE(XMM7,XMM7,0x00)
364: SSE_MULT_PS(XMM7,XMM3)
365: SSE_ADD_PS(XMM6,XMM7)
366:
367: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
368: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
370: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
371: /* Column 3, product is accumulated in XMM0 */
372: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
373: SSE_SHUFFLE(XMM7,XMM7,0x00)
374: SSE_MULT_PS(XMM0,XMM7)
376: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
377: SSE_SHUFFLE(XMM7,XMM7,0x00)
378: SSE_MULT_PS(XMM1,XMM7)
379: SSE_ADD_PS(XMM0,XMM1)
381: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
382: SSE_SHUFFLE(XMM1,XMM1,0x00)
383: SSE_MULT_PS(XMM1,XMM2)
384: SSE_ADD_PS(XMM0,XMM1)
386: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
387: SSE_SHUFFLE(XMM7,XMM7,0x00)
388: SSE_MULT_PS(XMM3,XMM7)
389: SSE_ADD_PS(XMM0,XMM3)
391: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
392: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
394: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
395: /* This is code to be maintained and read by humans, after all. */
396: /* Copy Multiplier Col 3 into XMM3 */
397: SSE_COPY_PS(XMM3,XMM0)
398: /* Copy Multiplier Col 2 into XMM2 */
399: SSE_COPY_PS(XMM2,XMM6)
400: /* Copy Multiplier Col 1 into XMM1 */
401: SSE_COPY_PS(XMM1,XMM5)
402: /* Copy Multiplier Col 0 into XMM0 */
403: SSE_COPY_PS(XMM0,XMM4)
404: SSE_INLINE_END_2;
406: /* Update the row: */
407: nz = bi[row+1] - diag_offset[row] - 1;
408: pv += 16;
409: for (j=0; j<nz; j++) {
410: PREFETCH_L1(&pv[16]);
411: x = rtmp + 16*pj[j];
412: /* x = rtmp + 4*pj[j]; */
414: /* X:=X-M*PV, One column at a time */
415: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
416: SSE_INLINE_BEGIN_2(x,pv)
417: /* Load First Column of X*/
418: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
419: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
421: /* Matrix-Vector Product: */
422: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
423: SSE_SHUFFLE(XMM5,XMM5,0x00)
424: SSE_MULT_PS(XMM5,XMM0)
425: SSE_SUB_PS(XMM4,XMM5)
427: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
428: SSE_SHUFFLE(XMM6,XMM6,0x00)
429: SSE_MULT_PS(XMM6,XMM1)
430: SSE_SUB_PS(XMM4,XMM6)
432: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
433: SSE_SHUFFLE(XMM7,XMM7,0x00)
434: SSE_MULT_PS(XMM7,XMM2)
435: SSE_SUB_PS(XMM4,XMM7)
437: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
438: SSE_SHUFFLE(XMM5,XMM5,0x00)
439: SSE_MULT_PS(XMM5,XMM3)
440: SSE_SUB_PS(XMM4,XMM5)
442: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
443: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
445: /* Second Column */
446: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
447: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
449: /* Matrix-Vector Product: */
450: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
451: SSE_SHUFFLE(XMM6,XMM6,0x00)
452: SSE_MULT_PS(XMM6,XMM0)
453: SSE_SUB_PS(XMM5,XMM6)
455: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
456: SSE_SHUFFLE(XMM7,XMM7,0x00)
457: SSE_MULT_PS(XMM7,XMM1)
458: SSE_SUB_PS(XMM5,XMM7)
460: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
461: SSE_SHUFFLE(XMM4,XMM4,0x00)
462: SSE_MULT_PS(XMM4,XMM2)
463: SSE_SUB_PS(XMM5,XMM4)
465: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
466: SSE_SHUFFLE(XMM6,XMM6,0x00)
467: SSE_MULT_PS(XMM6,XMM3)
468: SSE_SUB_PS(XMM5,XMM6)
469:
470: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
471: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
473: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
475: /* Third Column */
476: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
477: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
479: /* Matrix-Vector Product: */
480: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
481: SSE_SHUFFLE(XMM7,XMM7,0x00)
482: SSE_MULT_PS(XMM7,XMM0)
483: SSE_SUB_PS(XMM6,XMM7)
485: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
486: SSE_SHUFFLE(XMM4,XMM4,0x00)
487: SSE_MULT_PS(XMM4,XMM1)
488: SSE_SUB_PS(XMM6,XMM4)
490: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
491: SSE_SHUFFLE(XMM5,XMM5,0x00)
492: SSE_MULT_PS(XMM5,XMM2)
493: SSE_SUB_PS(XMM6,XMM5)
495: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
496: SSE_SHUFFLE(XMM7,XMM7,0x00)
497: SSE_MULT_PS(XMM7,XMM3)
498: SSE_SUB_PS(XMM6,XMM7)
499:
500: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
501: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
502:
503: /* Fourth Column */
504: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
505: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
507: /* Matrix-Vector Product: */
508: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
509: SSE_SHUFFLE(XMM5,XMM5,0x00)
510: SSE_MULT_PS(XMM5,XMM0)
511: SSE_SUB_PS(XMM4,XMM5)
513: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
514: SSE_SHUFFLE(XMM6,XMM6,0x00)
515: SSE_MULT_PS(XMM6,XMM1)
516: SSE_SUB_PS(XMM4,XMM6)
518: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
519: SSE_SHUFFLE(XMM7,XMM7,0x00)
520: SSE_MULT_PS(XMM7,XMM2)
521: SSE_SUB_PS(XMM4,XMM7)
523: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
524: SSE_SHUFFLE(XMM5,XMM5,0x00)
525: SSE_MULT_PS(XMM5,XMM3)
526: SSE_SUB_PS(XMM4,XMM5)
527:
528: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
529: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
530: SSE_INLINE_END_2;
531: pv += 16;
532: }
533: PetscLogFlops(128*nz+112);
534: }
535: row = *bjtmp++;
536: /* row = (*bjtmp++)/4; */
537: }
538: /* finished row so stick it into b->a */
539: pv = ba + 16*bi[i];
540: pj = bj + bi[i];
541: nz = bi[i+1] - bi[i];
543: /* Copy x block back into pv block */
544: for (j=0; j<nz; j++) {
545: x = rtmp+16*pj[j];
546: /* x = rtmp+4*pj[j]; */
548: SSE_INLINE_BEGIN_2(x,pv)
549: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
550: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
551: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
553: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
554: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
556: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
557: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
559: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
560: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
562: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
563: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
565: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
566: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
568: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
569: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
571: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
572: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
573: SSE_INLINE_END_2;
574: pv += 16;
575: }
576: /* invert diagonal block */
577: w = ba + 16*diag_offset[i];
578: if (pivotinblocks) {
579: Kernel_A_gets_inverse_A_4(w,shift);
580: } else {
581: Kernel_A_gets_inverse_A_4_nopivot(w);
582: }
583: /* Kernel_A_gets_inverse_A_4_SSE(w); */
584: /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
585: }
587: PetscFree(rtmp);
588: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
589: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
590: C->assembled = PETSC_TRUE;
591: PetscLogFlops(1.3333*64*b->mbs);
592: /* Flop Count from inverting diagonal blocks */
593: SSE_SCOPE_END;
594: return(0);
595: }
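/*
   In-place variant, used by the _usj kernel below when the factor shares the
   (unsigned short) column-index array of A, i.e. the factor reuses A's
   nonzero pattern (bj == aj).
*/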
599: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
600: {
601: Mat A=C;
602: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
604: int i,j,n = a->mbs;
605: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
606: unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
607: unsigned int row;
608: int nz,*bi=b->i;
609: int *diag_offset = b->diag,*ai=a->i;
610: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
611: MatScalar *ba = b->a,*aa = a->a;
612: int nonzero=0;
613: /* int nonzero=0,colscale = 16; */
614: PetscTruth pivotinblocks = b->pivotinblocks;
615: PetscReal shift = info->shiftinblocks;
618: SSE_SCOPE_BEGIN;
620: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
621: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
622: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
623: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
624: /* if ((unsigned long)bj==(unsigned long)aj) { */
625: /* colscale = 4; */
626: /* } */
627:
628: for (i=0; i<n; i++) {
629: nz = bi[i+1] - bi[i];
630: bjtmp = bj + bi[i];
631: /* zero out the 4x4 block accumulators */
632: /* zero out one register */
633: XOR_PS(XMM7,XMM7);
634: for (j=0; j<nz; j++) {
635: x = rtmp+16*((unsigned int)bjtmp[j]);
636: /* x = rtmp+4*bjtmp[j]; */
637: SSE_INLINE_BEGIN_1(x)
638: /* Copy zero register to memory locations */
639: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
640: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
641: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
642: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
643: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
644: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
645: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
646: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
647: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
648: SSE_INLINE_END_1;
649: }
650: /* load in the initial (unfactored) row */
651: nz = ai[i+1] - ai[i];
652: ajtmp = aj + ai[i];
653: v = aa + 16*ai[i];
654: for (j=0; j<nz; j++) {
655: x = rtmp+16*((unsigned int)ajtmp[j]);
656: /* x = rtmp+colscale*ajtmp[j]; */
657: /* Copy v block into x block */
658: SSE_INLINE_BEGIN_2(v,x)
659: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
660: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
661: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
663: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
664: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
666: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
667: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
669: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
670: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
672: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
673: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
675: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
676: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
678: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
679: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
681: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
682: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
683: SSE_INLINE_END_2;
685: v += 16;
686: }
687: /* row = (*bjtmp++)/4; */
688: row = (unsigned int)(*bjtmp++);
689: while (row < i) {
690: pc = rtmp + 16*row;
691: SSE_INLINE_BEGIN_1(pc)
692: /* Load block from lower triangle */
693: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
694: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
695: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
697: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
698: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
700: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
701: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
703: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
704: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
706: /* Compare block to zero block */
708: SSE_COPY_PS(XMM4,XMM7)
709: SSE_CMPNEQ_PS(XMM4,XMM0)
711: SSE_COPY_PS(XMM5,XMM7)
712: SSE_CMPNEQ_PS(XMM5,XMM1)
714: SSE_COPY_PS(XMM6,XMM7)
715: SSE_CMPNEQ_PS(XMM6,XMM2)
717: SSE_CMPNEQ_PS(XMM7,XMM3)
719: /* Reduce the comparisons to one SSE register */
720: SSE_OR_PS(XMM6,XMM7)
721: SSE_OR_PS(XMM5,XMM4)
722: SSE_OR_PS(XMM5,XMM6)
723: SSE_INLINE_END_1;
725: /* Reduce the one SSE register to an integer register for branching */
726: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
727: MOVEMASK(nonzero,XMM5);
729: /* If block is nonzero ... */
730: if (nonzero) {
731: pv = ba + 16*diag_offset[row];
732: PREFETCH_L1(&pv[16]);
733: pj = bj + diag_offset[row] + 1;
735: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
736: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
737: /* but the diagonal was inverted already */
738: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
740: SSE_INLINE_BEGIN_2(pv,pc)
741: /* Column 0, product is accumulated in XMM4 */
742: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
743: SSE_SHUFFLE(XMM4,XMM4,0x00)
744: SSE_MULT_PS(XMM4,XMM0)
746: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
747: SSE_SHUFFLE(XMM5,XMM5,0x00)
748: SSE_MULT_PS(XMM5,XMM1)
749: SSE_ADD_PS(XMM4,XMM5)
751: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
752: SSE_SHUFFLE(XMM6,XMM6,0x00)
753: SSE_MULT_PS(XMM6,XMM2)
754: SSE_ADD_PS(XMM4,XMM6)
756: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
757: SSE_SHUFFLE(XMM7,XMM7,0x00)
758: SSE_MULT_PS(XMM7,XMM3)
759: SSE_ADD_PS(XMM4,XMM7)
761: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
762: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
764: /* Column 1, product is accumulated in XMM5 */
765: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
766: SSE_SHUFFLE(XMM5,XMM5,0x00)
767: SSE_MULT_PS(XMM5,XMM0)
769: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
770: SSE_SHUFFLE(XMM6,XMM6,0x00)
771: SSE_MULT_PS(XMM6,XMM1)
772: SSE_ADD_PS(XMM5,XMM6)
774: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
775: SSE_SHUFFLE(XMM7,XMM7,0x00)
776: SSE_MULT_PS(XMM7,XMM2)
777: SSE_ADD_PS(XMM5,XMM7)
779: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
780: SSE_SHUFFLE(XMM6,XMM6,0x00)
781: SSE_MULT_PS(XMM6,XMM3)
782: SSE_ADD_PS(XMM5,XMM6)
784: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
785: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
787: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
789: /* Column 2, product is accumulated in XMM6 */
790: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
791: SSE_SHUFFLE(XMM6,XMM6,0x00)
792: SSE_MULT_PS(XMM6,XMM0)
794: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
795: SSE_SHUFFLE(XMM7,XMM7,0x00)
796: SSE_MULT_PS(XMM7,XMM1)
797: SSE_ADD_PS(XMM6,XMM7)
799: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
800: SSE_SHUFFLE(XMM7,XMM7,0x00)
801: SSE_MULT_PS(XMM7,XMM2)
802: SSE_ADD_PS(XMM6,XMM7)
804: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
805: SSE_SHUFFLE(XMM7,XMM7,0x00)
806: SSE_MULT_PS(XMM7,XMM3)
807: SSE_ADD_PS(XMM6,XMM7)
808:
809: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
810: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
812: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
813: /* Column 3, product is accumulated in XMM0 */
814: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
815: SSE_SHUFFLE(XMM7,XMM7,0x00)
816: SSE_MULT_PS(XMM0,XMM7)
818: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
819: SSE_SHUFFLE(XMM7,XMM7,0x00)
820: SSE_MULT_PS(XMM1,XMM7)
821: SSE_ADD_PS(XMM0,XMM1)
823: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
824: SSE_SHUFFLE(XMM1,XMM1,0x00)
825: SSE_MULT_PS(XMM1,XMM2)
826: SSE_ADD_PS(XMM0,XMM1)
828: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
829: SSE_SHUFFLE(XMM7,XMM7,0x00)
830: SSE_MULT_PS(XMM3,XMM7)
831: SSE_ADD_PS(XMM0,XMM3)
833: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
834: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
836: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
837: /* This is code to be maintained and read by humans, after all. */
838: /* Copy Multiplier Col 3 into XMM3 */
839: SSE_COPY_PS(XMM3,XMM0)
840: /* Copy Multiplier Col 2 into XMM2 */
841: SSE_COPY_PS(XMM2,XMM6)
842: /* Copy Multiplier Col 1 into XMM1 */
843: SSE_COPY_PS(XMM1,XMM5)
844: /* Copy Multiplier Col 0 into XMM0 */
845: SSE_COPY_PS(XMM0,XMM4)
846: SSE_INLINE_END_2;
848: /* Update the row: */
849: nz = bi[row+1] - diag_offset[row] - 1;
850: pv += 16;
851: for (j=0; j<nz; j++) {
852: PREFETCH_L1(&pv[16]);
853: x = rtmp + 16*((unsigned int)pj[j]);
854: /* x = rtmp + 4*pj[j]; */
856: /* X:=X-M*PV, One column at a time */
857: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
858: SSE_INLINE_BEGIN_2(x,pv)
859: /* Load First Column of X*/
860: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
861: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
863: /* Matrix-Vector Product: */
864: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
865: SSE_SHUFFLE(XMM5,XMM5,0x00)
866: SSE_MULT_PS(XMM5,XMM0)
867: SSE_SUB_PS(XMM4,XMM5)
869: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
870: SSE_SHUFFLE(XMM6,XMM6,0x00)
871: SSE_MULT_PS(XMM6,XMM1)
872: SSE_SUB_PS(XMM4,XMM6)
874: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
875: SSE_SHUFFLE(XMM7,XMM7,0x00)
876: SSE_MULT_PS(XMM7,XMM2)
877: SSE_SUB_PS(XMM4,XMM7)
879: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
880: SSE_SHUFFLE(XMM5,XMM5,0x00)
881: SSE_MULT_PS(XMM5,XMM3)
882: SSE_SUB_PS(XMM4,XMM5)
884: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
885: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
887: /* Second Column */
888: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
889: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
891: /* Matrix-Vector Product: */
892: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
893: SSE_SHUFFLE(XMM6,XMM6,0x00)
894: SSE_MULT_PS(XMM6,XMM0)
895: SSE_SUB_PS(XMM5,XMM6)
897: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
898: SSE_SHUFFLE(XMM7,XMM7,0x00)
899: SSE_MULT_PS(XMM7,XMM1)
900: SSE_SUB_PS(XMM5,XMM7)
902: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
903: SSE_SHUFFLE(XMM4,XMM4,0x00)
904: SSE_MULT_PS(XMM4,XMM2)
905: SSE_SUB_PS(XMM5,XMM4)
907: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
908: SSE_SHUFFLE(XMM6,XMM6,0x00)
909: SSE_MULT_PS(XMM6,XMM3)
910: SSE_SUB_PS(XMM5,XMM6)
911:
912: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
913: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
915: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
917: /* Third Column */
918: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
919: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
921: /* Matrix-Vector Product: */
922: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
923: SSE_SHUFFLE(XMM7,XMM7,0x00)
924: SSE_MULT_PS(XMM7,XMM0)
925: SSE_SUB_PS(XMM6,XMM7)
927: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
928: SSE_SHUFFLE(XMM4,XMM4,0x00)
929: SSE_MULT_PS(XMM4,XMM1)
930: SSE_SUB_PS(XMM6,XMM4)
932: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
933: SSE_SHUFFLE(XMM5,XMM5,0x00)
934: SSE_MULT_PS(XMM5,XMM2)
935: SSE_SUB_PS(XMM6,XMM5)
937: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
938: SSE_SHUFFLE(XMM7,XMM7,0x00)
939: SSE_MULT_PS(XMM7,XMM3)
940: SSE_SUB_PS(XMM6,XMM7)
941:
942: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
943: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
944:
945: /* Fourth Column */
946: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
947: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
949: /* Matrix-Vector Product: */
950: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
951: SSE_SHUFFLE(XMM5,XMM5,0x00)
952: SSE_MULT_PS(XMM5,XMM0)
953: SSE_SUB_PS(XMM4,XMM5)
955: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
956: SSE_SHUFFLE(XMM6,XMM6,0x00)
957: SSE_MULT_PS(XMM6,XMM1)
958: SSE_SUB_PS(XMM4,XMM6)
960: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
961: SSE_SHUFFLE(XMM7,XMM7,0x00)
962: SSE_MULT_PS(XMM7,XMM2)
963: SSE_SUB_PS(XMM4,XMM7)
965: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
966: SSE_SHUFFLE(XMM5,XMM5,0x00)
967: SSE_MULT_PS(XMM5,XMM3)
968: SSE_SUB_PS(XMM4,XMM5)
969:
970: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
971: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
972: SSE_INLINE_END_2;
973: pv += 16;
974: }
975: PetscLogFlops(128*nz+112);
976: }
977: row = (unsigned int)(*bjtmp++);
978: /* row = (*bjtmp++)/4; */
979: /* bjtmp++; */
980: }
981: /* finished row so stick it into b->a */
982: pv = ba + 16*bi[i];
983: pj = bj + bi[i];
984: nz = bi[i+1] - bi[i];
986: /* Copy x block back into pv block */
987: for (j=0; j<nz; j++) {
988: x = rtmp+16*((unsigned int)pj[j]);
989: /* x = rtmp+4*pj[j]; */
991: SSE_INLINE_BEGIN_2(x,pv)
992: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
993: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
994: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
996: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
997: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
999: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1000: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1002: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1003: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1005: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1006: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1008: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1009: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1011: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1012: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1014: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1015: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1016: SSE_INLINE_END_2;
1017: pv += 16;
1018: }
1019: /* invert diagonal block */
1020: w = ba + 16*diag_offset[i];
1021: if (pivotinblocks) {
1022: Kernel_A_gets_inverse_A_4(w,shift);
1023: } else {
1024: Kernel_A_gets_inverse_A_4_nopivot(w);
1025: }
1026: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1027: /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1028: }
1030: PetscFree(rtmp);
1031: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1032: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1033: C->assembled = PETSC_TRUE;
1034: PetscLogFlops(1.3333*64*b->mbs);
1035: /* Flop Count from inverting diagonal blocks */
1036: SSE_SCOPE_END;
1037: return(0);
1038: }
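/*
   "usj" variant: the factor's column indices are stored as unsigned short,
   presumably to reduce index storage and memory traffic. Falls back to the
   in-place kernel above when A and the factor share the same index array.
*/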
1042: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1043: {
1044: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1046: int i,j,n = a->mbs;
1047: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1048: unsigned int row;
1049: int *ajtmpold,nz,*bi=b->i;
1050: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1051: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1052: MatScalar *ba = b->a,*aa = a->a;
1053: int nonzero=0;
1054: /* int nonzero=0,colscale = 16; */
1055: PetscTruth pivotinblocks = b->pivotinblocks;
1056: PetscReal shift = info->shiftinblocks;
1059: SSE_SCOPE_BEGIN;
1061: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1062: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1063: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1064: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1065: /* if ((unsigned long)bj==(unsigned long)aj) { */
1066: /* colscale = 4; */
1067: /* } */
1068: if ((unsigned long)bj==(unsigned long)aj) {
1069: return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1070: }
1071:
1072: for (i=0; i<n; i++) {
1073: nz = bi[i+1] - bi[i];
1074: bjtmp = bj + bi[i];
1075: /* zero out the 4x4 block accumulators */
1076: /* zero out one register */
1077: XOR_PS(XMM7,XMM7);
1078: for (j=0; j<nz; j++) {
1079: x = rtmp+16*((unsigned int)bjtmp[j]);
1080: /* x = rtmp+4*bjtmp[j]; */
1081: SSE_INLINE_BEGIN_1(x)
1082: /* Copy zero register to memory locations */
1083: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1084: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1085: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1086: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1087: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1088: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1089: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1090: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1091: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1092: SSE_INLINE_END_1;
1093: }
1094: /* load in the initial (unfactored) row */
1095: nz = ai[i+1] - ai[i];
1096: ajtmpold = aj + ai[i];
1097: v = aa + 16*ai[i];
1098: for (j=0; j<nz; j++) {
1099: x = rtmp+16*ajtmpold[j];
1100: /* x = rtmp+colscale*ajtmpold[j]; */
1101: /* Copy v block into x block */
1102: SSE_INLINE_BEGIN_2(v,x)
1103: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1104: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1105: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1107: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1108: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1110: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1111: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1113: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1114: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1116: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1117: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1119: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1120: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1122: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1123: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1125: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1126: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1127: SSE_INLINE_END_2;
1129: v += 16;
1130: }
1131: /* row = (*bjtmp++)/4; */
1132: row = (unsigned int)(*bjtmp++);
1133: while (row < i) {
1134: pc = rtmp + 16*row;
1135: SSE_INLINE_BEGIN_1(pc)
1136: /* Load block from lower triangle */
1137: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1138: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1139: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1141: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1142: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1144: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1145: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1147: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1148: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1150: /* Compare block to zero block */
1152: SSE_COPY_PS(XMM4,XMM7)
1153: SSE_CMPNEQ_PS(XMM4,XMM0)
1155: SSE_COPY_PS(XMM5,XMM7)
1156: SSE_CMPNEQ_PS(XMM5,XMM1)
1158: SSE_COPY_PS(XMM6,XMM7)
1159: SSE_CMPNEQ_PS(XMM6,XMM2)
1161: SSE_CMPNEQ_PS(XMM7,XMM3)
1163: /* Reduce the comparisons to one SSE register */
1164: SSE_OR_PS(XMM6,XMM7)
1165: SSE_OR_PS(XMM5,XMM4)
1166: SSE_OR_PS(XMM5,XMM6)
1167: SSE_INLINE_END_1;
1169: /* Reduce the one SSE register to an integer register for branching */
1170: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1171: MOVEMASK(nonzero,XMM5);
1173: /* If block is nonzero ... */
1174: if (nonzero) {
1175: pv = ba + 16*diag_offset[row];
1176: PREFETCH_L1(&pv[16]);
1177: pj = bj + diag_offset[row] + 1;
1179: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1180: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1181: /* but the diagonal was inverted already */
1182: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1184: SSE_INLINE_BEGIN_2(pv,pc)
1185: /* Column 0, product is accumulated in XMM4 */
1186: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1187: SSE_SHUFFLE(XMM4,XMM4,0x00)
1188: SSE_MULT_PS(XMM4,XMM0)
1190: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1191: SSE_SHUFFLE(XMM5,XMM5,0x00)
1192: SSE_MULT_PS(XMM5,XMM1)
1193: SSE_ADD_PS(XMM4,XMM5)
1195: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1196: SSE_SHUFFLE(XMM6,XMM6,0x00)
1197: SSE_MULT_PS(XMM6,XMM2)
1198: SSE_ADD_PS(XMM4,XMM6)
1200: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1201: SSE_SHUFFLE(XMM7,XMM7,0x00)
1202: SSE_MULT_PS(XMM7,XMM3)
1203: SSE_ADD_PS(XMM4,XMM7)
1205: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1206: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1208: /* Column 1, product is accumulated in XMM5 */
1209: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1210: SSE_SHUFFLE(XMM5,XMM5,0x00)
1211: SSE_MULT_PS(XMM5,XMM0)
1213: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1214: SSE_SHUFFLE(XMM6,XMM6,0x00)
1215: SSE_MULT_PS(XMM6,XMM1)
1216: SSE_ADD_PS(XMM5,XMM6)
1218: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1219: SSE_SHUFFLE(XMM7,XMM7,0x00)
1220: SSE_MULT_PS(XMM7,XMM2)
1221: SSE_ADD_PS(XMM5,XMM7)
1223: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1224: SSE_SHUFFLE(XMM6,XMM6,0x00)
1225: SSE_MULT_PS(XMM6,XMM3)
1226: SSE_ADD_PS(XMM5,XMM6)
1228: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1229: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1231: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1233: /* Column 2, product is accumulated in XMM6 */
1234: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1235: SSE_SHUFFLE(XMM6,XMM6,0x00)
1236: SSE_MULT_PS(XMM6,XMM0)
1238: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1239: SSE_SHUFFLE(XMM7,XMM7,0x00)
1240: SSE_MULT_PS(XMM7,XMM1)
1241: SSE_ADD_PS(XMM6,XMM7)
1243: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1244: SSE_SHUFFLE(XMM7,XMM7,0x00)
1245: SSE_MULT_PS(XMM7,XMM2)
1246: SSE_ADD_PS(XMM6,XMM7)
1248: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1249: SSE_SHUFFLE(XMM7,XMM7,0x00)
1250: SSE_MULT_PS(XMM7,XMM3)
1251: SSE_ADD_PS(XMM6,XMM7)
1252:
1253: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1254: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1256: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1257: /* Column 3, product is accumulated in XMM0 */
1258: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1259: SSE_SHUFFLE(XMM7,XMM7,0x00)
1260: SSE_MULT_PS(XMM0,XMM7)
1262: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1263: SSE_SHUFFLE(XMM7,XMM7,0x00)
1264: SSE_MULT_PS(XMM1,XMM7)
1265: SSE_ADD_PS(XMM0,XMM1)
1267: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1268: SSE_SHUFFLE(XMM1,XMM1,0x00)
1269: SSE_MULT_PS(XMM1,XMM2)
1270: SSE_ADD_PS(XMM0,XMM1)
1272: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1273: SSE_SHUFFLE(XMM7,XMM7,0x00)
1274: SSE_MULT_PS(XMM3,XMM7)
1275: SSE_ADD_PS(XMM0,XMM3)
1277: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1278: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1280: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1281: /* This is code to be maintained and read by humans, after all. */
1282: /* Copy Multiplier Col 3 into XMM3 */
1283: SSE_COPY_PS(XMM3,XMM0)
1284: /* Copy Multiplier Col 2 into XMM2 */
1285: SSE_COPY_PS(XMM2,XMM6)
1286: /* Copy Multiplier Col 1 into XMM1 */
1287: SSE_COPY_PS(XMM1,XMM5)
1288: /* Copy Multiplier Col 0 into XMM0 */
1289: SSE_COPY_PS(XMM0,XMM4)
1290: SSE_INLINE_END_2;
1292: /* Update the row: */
1293: nz = bi[row+1] - diag_offset[row] - 1;
1294: pv += 16;
1295: for (j=0; j<nz; j++) {
1296: PREFETCH_L1(&pv[16]);
1297: x = rtmp + 16*((unsigned int)pj[j]);
1298: /* x = rtmp + 4*pj[j]; */
1300: /* X:=X-M*PV, One column at a time */
1301: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1302: SSE_INLINE_BEGIN_2(x,pv)
1303: /* Load First Column of X*/
1304: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1305: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1307: /* Matrix-Vector Product: */
1308: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1309: SSE_SHUFFLE(XMM5,XMM5,0x00)
1310: SSE_MULT_PS(XMM5,XMM0)
1311: SSE_SUB_PS(XMM4,XMM5)
1313: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1314: SSE_SHUFFLE(XMM6,XMM6,0x00)
1315: SSE_MULT_PS(XMM6,XMM1)
1316: SSE_SUB_PS(XMM4,XMM6)
1318: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1319: SSE_SHUFFLE(XMM7,XMM7,0x00)
1320: SSE_MULT_PS(XMM7,XMM2)
1321: SSE_SUB_PS(XMM4,XMM7)
1323: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1324: SSE_SHUFFLE(XMM5,XMM5,0x00)
1325: SSE_MULT_PS(XMM5,XMM3)
1326: SSE_SUB_PS(XMM4,XMM5)
1328: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1329: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1331: /* Second Column */
1332: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1333: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1335: /* Matrix-Vector Product: */
1336: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1337: SSE_SHUFFLE(XMM6,XMM6,0x00)
1338: SSE_MULT_PS(XMM6,XMM0)
1339: SSE_SUB_PS(XMM5,XMM6)
1341: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1342: SSE_SHUFFLE(XMM7,XMM7,0x00)
1343: SSE_MULT_PS(XMM7,XMM1)
1344: SSE_SUB_PS(XMM5,XMM7)
1346: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1347: SSE_SHUFFLE(XMM4,XMM4,0x00)
1348: SSE_MULT_PS(XMM4,XMM2)
1349: SSE_SUB_PS(XMM5,XMM4)
1351: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1352: SSE_SHUFFLE(XMM6,XMM6,0x00)
1353: SSE_MULT_PS(XMM6,XMM3)
1354: SSE_SUB_PS(XMM5,XMM6)
1355:
1356: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1357: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1359: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1361: /* Third Column */
1362: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1363: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1365: /* Matrix-Vector Product: */
1366: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1367: SSE_SHUFFLE(XMM7,XMM7,0x00)
1368: SSE_MULT_PS(XMM7,XMM0)
1369: SSE_SUB_PS(XMM6,XMM7)
1371: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1372: SSE_SHUFFLE(XMM4,XMM4,0x00)
1373: SSE_MULT_PS(XMM4,XMM1)
1374: SSE_SUB_PS(XMM6,XMM4)
1376: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1377: SSE_SHUFFLE(XMM5,XMM5,0x00)
1378: SSE_MULT_PS(XMM5,XMM2)
1379: SSE_SUB_PS(XMM6,XMM5)
1381: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1382: SSE_SHUFFLE(XMM7,XMM7,0x00)
1383: SSE_MULT_PS(XMM7,XMM3)
1384: SSE_SUB_PS(XMM6,XMM7)
1385:
1386: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1387: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1388:
1389: /* Fourth Column */
1390: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1391: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1393: /* Matrix-Vector Product: */
1394: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1395: SSE_SHUFFLE(XMM5,XMM5,0x00)
1396: SSE_MULT_PS(XMM5,XMM0)
1397: SSE_SUB_PS(XMM4,XMM5)
1399: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1400: SSE_SHUFFLE(XMM6,XMM6,0x00)
1401: SSE_MULT_PS(XMM6,XMM1)
1402: SSE_SUB_PS(XMM4,XMM6)
1404: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1405: SSE_SHUFFLE(XMM7,XMM7,0x00)
1406: SSE_MULT_PS(XMM7,XMM2)
1407: SSE_SUB_PS(XMM4,XMM7)
1409: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1410: SSE_SHUFFLE(XMM5,XMM5,0x00)
1411: SSE_MULT_PS(XMM5,XMM3)
1412: SSE_SUB_PS(XMM4,XMM5)
1413:
1414: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1415: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1416: SSE_INLINE_END_2;
1417: pv += 16;
1418: }
1419: PetscLogFlops(128*nz+112);
1420: }
1421: row = (unsigned int)(*bjtmp++);
1422: /* row = (*bjtmp++)/4; */
1423: /* bjtmp++; */
1424: }
1425: /* finished row so stick it into b->a */
1426: pv = ba + 16*bi[i];
1427: pj = bj + bi[i];
1428: nz = bi[i+1] - bi[i];
1430: /* Copy x block back into pv block */
1431: for (j=0; j<nz; j++) {
1432: x = rtmp+16*((unsigned int)pj[j]);
1433: /* x = rtmp+4*pj[j]; */
1435: SSE_INLINE_BEGIN_2(x,pv)
1436: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1437: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1438: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1440: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1441: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1443: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1444: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1446: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1447: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1449: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1450: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1452: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1453: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1455: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1456: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1458: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1459: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1460: SSE_INLINE_END_2;
1461: pv += 16;
1462: }
1463: /* invert diagonal block */
1464: w = ba + 16*diag_offset[i];
1465: if (pivotinblocks) {
1466: Kernel_A_gets_inverse_A_4(w,shift);
1467: } else {
1468: Kernel_A_gets_inverse_A_4_nopivot(w);
1469: }
1470: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1471: /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1472: }
1474: PetscFree(rtmp);
1475: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1476: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1477: C->assembled = PETSC_TRUE;
1478: PetscLogFlops(1.3333*64*b->mbs);
1479: /* Flop Count from inverting diagonal blocks */
1480: SSE_SCOPE_END;
1481: return(0);
1482: }
1484: #endif