Actual source code: csrperm.c

  1: #define PETSCMAT_DLL

  3: /*
  4:   Defines basic operations for the MATSEQCSRPERM matrix class.
  5:   This class is derived from the MATSEQAIJ class and retains the 
  6:   compressed row storage (aka Yale sparse matrix format) but augments 
  7:   it with some permutation information that enables some operations 
  8:   to be more vectorizable.  A physically rearranged copy of the matrix 
  9:   may be stored if the user desires.

 11:   Eventually a variety of permutations may be supported.
 12: */

 14:  #include ../src/mat/impls/aij/seq/aij.h

 16: #define NDIM 512
 17:     /* NDIM specifies how many rows at a time we should work with when 
 18:      * performing the vectorized mat-vec.  This depends on various factors 
 19:      * such as vector register length, etc., and I really need to add a 
 20:      * way for the user (or the library) to tune this.  I'm setting it to  
 21:      * 512 for now since that is what Ed D'Azevedo was using in his Fortran 
 22:      * routines. */

 24: typedef struct {
 25:   PetscInt  ngroup;
 26:   PetscInt *xgroup;
 27:     /* Denotes where groups of rows with same number of nonzeros 
 28:      * begin and end, i.e., xgroup[i] gives us the position in iperm[] 
 29:      * where the ith group begins. */
 30:   PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
 31:   PetscInt *iperm;  /* The permutation vector. */

 33:   /* Flag that indicates whether we need to clean up permutation 
 34:    * information during the MatDestroy. */
 35:   PetscTruth CleanUpCSRPERM;

 37:   /* Some of this stuff is for Ed's recursive triangular solve.
 38:    * I'm not sure what I need yet. */
 39:   PetscInt blocksize;
 40:   PetscInt nstep;
 41:   PetscInt *jstart_list;
 42:   PetscInt *jend_list;
 43:   PetscInt *action_list;
 44:   PetscInt *ngroup_list;
 45:   PetscInt **ipointer_list;
 46:   PetscInt **xgroup_list;
 47:   PetscInt **nzgroup_list;
 48:   PetscInt **iperm_list;

 50:   /* We need to keep a pointer to MatAssemblyEnd_SeqAIJ because we 
 51:    * actually want to call this function from within the 
 52:    * MatAssemblyEnd_SeqCSRPERM function.  Similarly, we also need 
 53:    * MatDestroy_SeqAIJ and MatDuplicate_SeqAIJ. */
 54:   PetscErrorCode (*AssemblyEnd_SeqAIJ)(Mat,MatAssemblyType);
 55:   PetscErrorCode (*MatDestroy_SeqAIJ)(Mat);
 56:   PetscErrorCode (*MatDuplicate_SeqAIJ)(Mat,MatDuplicateOption,Mat*);
 57: } Mat_SeqCSRPERM;

 62: PetscErrorCode  MatConvert_SeqCSRPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
 63: {
 64:   /* This routine is only called to convert a MATCSRPERM to its base PETSc type, */
 65:   /* so we will ignore 'MatType type'. */
 67:   Mat            B = *newmat;
 68:   Mat_SeqCSRPERM *csrperm=(Mat_SeqCSRPERM*)A->spptr;

 71:   if (reuse == MAT_INITIAL_MATRIX) {
 72:     MatDuplicate(A,MAT_COPY_VALUES,&B);
 73:   }

 75:   /* Reset the original function pointers. */
 76:   B->ops->assemblyend = csrperm->AssemblyEnd_SeqAIJ;
 77:   B->ops->destroy   = csrperm->MatDestroy_SeqAIJ;
 78:   B->ops->duplicate = csrperm->MatDuplicate_SeqAIJ;

 80:   /* Free everything in the Mat_SeqCSRPERM data structure. 
 81:    * We don't free the Mat_SeqCSRPERM struct itself, as this will 
 82:    * cause problems later when MatDestroy() tries to free it. */
 83:   if(csrperm->CleanUpCSRPERM) {
 84:     PetscFree(csrperm->xgroup);
 85:     PetscFree(csrperm->nzgroup);
 86:     PetscFree(csrperm->iperm);
 87:   }

 89:   /* Change the type of B to MATSEQAIJ. */
 90:   PetscObjectChangeTypeName( (PetscObject)B, MATSEQAIJ);
 91: 
 92:   *newmat = B;
 93:   return(0);
 94: }

 99: PetscErrorCode MatDestroy_SeqCSRPERM(Mat A)
100: {
102:   Mat_SeqCSRPERM *csrperm = (Mat_SeqCSRPERM *) A->spptr;

104:   /* We are going to convert A back into a SEQAIJ matrix, since we are 
105:    * eventually going to use MatDestroy_SeqAIJ() to destroy everything 
106:    * that is not specific to CSRPERM.
107:    * In preparation for this, reset the operations pointers in A to 
108:    * their SeqAIJ versions. */
109:   A->ops->assemblyend = csrperm->AssemblyEnd_SeqAIJ;
110:     /* I suppose I don't actually need to point A->ops->assemblyend back 
111:      * to the right thing, but I do it anyway for completeness. */
112:   A->ops->destroy   = csrperm->MatDestroy_SeqAIJ;
113:   A->ops->duplicate = csrperm->MatDuplicate_SeqAIJ;

115:   /* Free everything in the Mat_SeqCSRPERM data structure. 
116:    * Note that we don't need to free the Mat_SeqCSRPERM struct 
117:    * itself, as MatDestroy() will do so. */
118:   if(csrperm->CleanUpCSRPERM) {
119:     PetscFree(csrperm->xgroup);
120:     PetscFree(csrperm->nzgroup);
121:     PetscFree(csrperm->iperm);
122:   }

124:   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 
125:    * to destroy everything that remains. */
126:   PetscObjectChangeTypeName( (PetscObject)A, MATSEQAIJ);
127:   /* Note that I don't call MatSetType().  I believe this is because that 
128:    * is only to be called when *building* a matrix.  I could be wrong, but 
129:    * that is how things work for the SuperLU matrix class. */
130:   (*A->ops->destroy)(A);
131:   return(0);
132: }

134: PetscErrorCode MatDuplicate_SeqCSRPERM(Mat A, MatDuplicateOption op, Mat *M)
135: {
137:   Mat_SeqCSRPERM *csrperm = (Mat_SeqCSRPERM *) A->spptr;
138:   Mat_SeqCSRPERM *csrperm_dest = (Mat_SeqCSRPERM *) (*M)->spptr;

141:   (*csrperm->MatDuplicate_SeqAIJ)(A,op,M);
142:   PetscMemcpy((*M)->spptr,csrperm,sizeof(Mat_SeqCSRPERM));
143:   /* Allocate space for, and copy the grouping and permutation info. 
144:    * I note that when the groups are initially determined in 
145:    * SeqCSRPERM_create_perm, xgroup and nzgroup may be sized larger than 
146:    * necessary.  But at this point, we know how large they need to be, and 
147:    * allocate only the necessary amount of memory.  So the duplicated matrix 
148:    * may actually use slightly less storage than the original! */
149:   PetscMalloc(A->rmap->n*sizeof(PetscInt), csrperm_dest->iperm);
150:   PetscMalloc((csrperm->ngroup+1)*sizeof(PetscInt), csrperm_dest->xgroup);
151:   PetscMalloc((csrperm->ngroup)*sizeof(PetscInt), csrperm_dest->nzgroup);
152:   PetscMemcpy(csrperm_dest->iperm,csrperm->iperm,sizeof(PetscInt)*A->rmap->n);
153:   PetscMemcpy(csrperm_dest->xgroup,csrperm->xgroup,sizeof(PetscInt)*(csrperm->ngroup+1));
154:   PetscMemcpy(csrperm_dest->nzgroup,csrperm->nzgroup,sizeof(PetscInt)*csrperm->ngroup);
155:   return(0);
156: }

160: PetscErrorCode SeqCSRPERM_create_perm(Mat A)
161: {
162:   PetscInt   m;  /* Number of rows in the matrix. */
163:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)(A)->data;
164:   PetscInt   *ia;  /* From the CSR representation; points to the beginning  of each row. */
165:   PetscInt   maxnz; /* Maximum number of nonzeros in any row. */
166:   PetscInt   *rows_in_bucket;
167:     /* To construct the permutation, we sort each row into one of maxnz 
168:      * buckets based on how many nonzeros are in the row. */
169:   PetscInt   nz;
170:   PetscInt   *nz_in_row;  /* the number of nonzero elements in row k. */
171:   PetscInt   *ipnz;
172:     /* When constructing the iperm permutation vector, 
173:      * ipnz[nz] is used to point to the next place in the permutation vector 
174:      * that a row with nz nonzero elements should be placed.*/
175:   Mat_SeqCSRPERM *csrperm = (Mat_SeqCSRPERM*) A->spptr;
176:     /* Points to the MATSEQCSRPERM-specific data in the matrix B. */
178:   PetscInt       i, ngroup, istart, ipos;

180:   /* I really ought to put something in here to check if B is of 
181:    * type MATSEQCSRPERM and return an error code if it is not.
182:    * Come back and do this! */
183: 
184:   m  = A->rmap->n;
185:   ia = a->i;
186: 
187:   /* Allocate the arrays that will hold the permutation vector. */
188:   PetscMalloc(m*sizeof(PetscInt), &csrperm->iperm);

190:   /* Allocate some temporary work arrays that will be used in 
191:    * calculating the permuation vector and groupings. */
192:   PetscMalloc((m+1)*sizeof(PetscInt), &rows_in_bucket);
193:   PetscMalloc((m+1)*sizeof(PetscInt), &ipnz);
194:   PetscMalloc(m*sizeof(PetscInt), &nz_in_row);

196:   /* Now actually figure out the permutation and grouping. */

198:   /* First pass: Determine number of nonzeros in each row, maximum 
199:    * number of nonzeros in any row, and how many rows fall into each  
200:    * "bucket" of rows with same number of nonzeros. */
201:   maxnz = 0;
202:   for (i=0; i<m; i++) {
203:     nz_in_row[i] = ia[i+1]-ia[i];
204:     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
205:   }

207:   for (i=0; i<=maxnz; i++) {
208:     rows_in_bucket[i] = 0;
209:   }
210:   for (i=0; i<m; i++) {
211:     nz = nz_in_row[i];
212:     rows_in_bucket[nz]++;
213:   }

215:   /* Allocate space for the grouping info.  There will be at most (maxnz + 1) 
216:    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be 
217:    * rows with no nonzero elements.)  If there are (maxnz + 1) groups, 
218:    * then xgroup[] must consist of (maxnz + 2) elements, since the last 
219:    * element of xgroup will tell us where the (maxnz + 1)th group ends.
220:    * We allocate space for the maximum number of groups; 
221:    * that is potentially a little wasteful, but not too much so.  
222:    * Perhaps I should fix it later. */
223:   PetscMalloc((maxnz+2)*sizeof(PetscInt), &csrperm->xgroup);
224:   PetscMalloc((maxnz+1)*sizeof(PetscInt), &csrperm->nzgroup);

226:   /* Second pass.  Look at what is in the buckets and create the groupings.
227:    * Note that it is OK to have a group of rows with no non-zero values. */
228:   ngroup = 0;
229:   istart = 0;
230:   for (i=0; i<=maxnz; i++) {
231:     if (rows_in_bucket[i] > 0) {
232:       csrperm->nzgroup[ngroup] = i;
233:       csrperm->xgroup[ngroup] = istart;
234:       ngroup++;
235:       istart += rows_in_bucket[i];
236:     }
237:   }

239:   csrperm->xgroup[ngroup] = istart;
240:   csrperm->ngroup = ngroup;

242:   /* Now fill in the permutation vector iperm. */
243:   ipnz[0] = 0;
244:   for (i=0; i<maxnz; i++) {
245:     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
246:   }

248:   for (i=0; i<m; i++) {
249:     nz = nz_in_row[i];
250:     ipos = ipnz[nz];
251:     csrperm->iperm[ipos] = i;
252:     ipnz[nz]++;
253:   }

255:   /* Clean up temporary work arrays. */
256:   PetscFree(rows_in_bucket);
257:   PetscFree(ipnz);
258:   PetscFree(nz_in_row);

260:   /* Since we've allocated some memory to hold permutation info, 
261:    * flip the CleanUpCSRPERM flag to true. */
262:   csrperm->CleanUpCSRPERM = PETSC_TRUE;
263:   return(0);
264: }


269: PetscErrorCode MatAssemblyEnd_SeqCSRPERM(Mat A, MatAssemblyType mode)
270: {
272:   Mat_SeqCSRPERM *csrperm = (Mat_SeqCSRPERM*) A->spptr;
273:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

275:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
276: 
277:   /* Since a MATSEQCSRPERM matrix is really just a MATSEQAIJ with some 
278:    * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 
279:    * I'm not sure if this is the best way to do this, but it avoids 
280:    * a lot of code duplication.
281:    * I also note that currently MATSEQCSRPERM doesn't know anything about 
282:    * the Mat_CompressedRow data structure that SeqAIJ now uses when there 
283:    * are many zero rows.  If the SeqAIJ assembly end routine decides to use 
284:    * this, this may break things.  (Don't know... haven't looked at it.) */
285:   a->inode.use = PETSC_FALSE;
286:   (*csrperm->AssemblyEnd_SeqAIJ)(A, mode);

288:   /* Now calculate the permutation and grouping information. */
289:   SeqCSRPERM_create_perm(A);
290:   return(0);
291: }

293:  #include ../src/inline/dot.h

297: PetscErrorCode MatMult_SeqCSRPERM(Mat A,Vec xx,Vec yy)
298: {
299:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
300:   PetscScalar     *x,*y;
301:   const MatScalar *aa;
302:   PetscErrorCode  ierr;
303:   PetscInt        *aj,*ai;
304: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTCSRPERM) && defined(notworking))
305:   PetscInt        i,j;
306: #endif

308:   /* Variables that don't appear in MatMult_SeqAIJ. */
309:   Mat_SeqCSRPERM *csrperm = (Mat_SeqCSRPERM *) A->spptr;
310:   PetscInt       *iperm;  /* Points to the permutation vector. */
311:   PetscInt       *xgroup;
312:     /* Denotes where groups of rows with same number of nonzeros 
313:      * begin and end in iperm. */
314:   PetscInt *nzgroup;
315:   PetscInt ngroup;
316:   PetscInt igroup;
317:   PetscInt jstart,jend;
318:     /* jstart is used in loops to denote the position in iperm where a 
319:      * group starts; jend denotes the position where it ends.
320:      * (jend + 1 is where the next group starts.) */
321:   PetscInt iold,nz;
322:   PetscInt istart,iend,isize;
323:   PetscInt ipos;
324:   PetscScalar yp[NDIM];
325:   PetscInt    ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

327: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
328: #pragma disjoint(*x,*y,*aa)
329: #endif

332:   VecGetArray(xx,&x);
333:   VecGetArray(yy,&y);
334:   aj  = a->j;    /* aj[k] gives column index for element aa[k]. */
335:   aa  = a->a;  /* Nonzero elements stored row-by-row. */
336:   ai  = a->i;   /* ai[k] is the position in aa and aj where row k starts. */

338:   /* Get the info we need about the permutations and groupings. */
339:   iperm  = csrperm->iperm;
340:   ngroup = csrperm->ngroup;
341:   xgroup = csrperm->xgroup;
342:   nzgroup = csrperm->nzgroup;
343: 
344: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTCSRPERM) && defined(notworking)
345:   fortranmultcsrperm_(&m,x,ii,aj,aa,y);
346: #else

348:   for (igroup=0; igroup<ngroup; igroup++) {
349:     jstart = xgroup[igroup];
350:     jend   = xgroup[igroup+1] - 1;
351:     nz     = nzgroup[igroup];

353:     /* Handle the special cases where the number of nonzeros per row 
354:      * in the group is either 0 or 1. */
355:     if (nz == 0) {
356:       for (i=jstart; i<=jend; i++) {
357:         y[iperm[i]] = 0.0;
358:       }
359:     } else if (nz == 1) {
360:       for (i=jstart; i<=jend; i++) {
361:         iold = iperm[i];
362:         ipos = ai[iold];
363:         y[iold] = aa[ipos] * x[aj[ipos]];
364:       }
365:     } else {
366: 
367:       /* We work our way through the current group in chunks of NDIM rows 
368:        * at a time. */

370:       for (istart=jstart; istart<=jend; istart+=NDIM) {
371:         /* Figure out where the chunk of 'isize' rows ends in iperm.
372:          * 'isize may of course be less than NDIM for the last chunk. */
373:         iend = istart + (NDIM - 1);
374:         if (iend > jend) { iend = jend; }
375:         isize = iend - istart + 1;

377:         /* Initialize the yp[] array that will be used to hold part of 
378:          * the permuted results vector, and figure out where in aa each 
379:          * row of the chunk will begin. */
380:         for (i=0; i<isize; i++) {
381:           iold = iperm[istart + i];
382:             /* iold is a row number from the matrix A *before* reordering. */
383:           ip[i] = ai[iold];
384:             /* ip[i] tells us where the ith row of the chunk begins in aa. */
385:           yp[i] = (PetscScalar) 0.0;
386:         }

388:         /* If the number of zeros per row exceeds the number of rows in 
389:          * the chunk, we should vectorize along nz, that is, perform the 
390:          * mat-vec one row at a time as in the usual CSR case. */
391:         if (nz > isize) {
392: #if defined(PETSC_HAVE_CRAYC)
393: #pragma _CRI preferstream
394: #endif
395:           for (i=0; i<isize; i++) {
396: #if defined(PETSC_HAVE_CRAYC)
397: #pragma _CRI prefervector
398: #endif
399:             for (j=0; j<nz; j++) {
400:               ipos = ip[i] + j;
401:               yp[i] += aa[ipos] * x[aj[ipos]];
402:             }
403:           }
404:         } else {
405:         /* Otherwise, there are enough rows in the chunk to make it 
406:          * worthwhile to vectorize across the rows, that is, to do the 
407:          * matvec by operating with "columns" of the chunk. */
408:           for (j=0; j<nz; j++) {
409:             for(i=0; i<isize; i++) {
410:               ipos = ip[i] + j;
411:               yp[i] += aa[ipos] * x[aj[ipos]];
412:             }
413:           }
414:         }

416: #if defined(PETSC_HAVE_CRAYC)
417: #pragma _CRI ivdep
418: #endif
419:         /* Put results from yp[] into non-permuted result vector y. */
420:         for (i=0; i<isize; i++) {
421:           y[iperm[istart+i]] = yp[i];
422:         }
423:       } /* End processing chunk of isize rows of a group. */
424:     } /* End handling matvec for chunk with nz > 1. */
425:   } /* End loop over igroup. */
426: #endif
427:   PetscLogFlops(2*a->nz - A->rmap->n);
428:   VecRestoreArray(xx,&x);
429:   VecRestoreArray(yy,&y);
430:   return(0);
431: }


434: /* MatMultAdd_SeqCSRPERM() calculates yy = ww + A * xx.
435:  * Note that the names I used to designate the vectors differs from that 
436:  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent 
437:  * with the MatMult_SeqCSRPERM() routine, which is very similar to this one. */
438: /*
439:     I hate having virtually identical code for the mult and the multadd!!!
440: */
443: PetscErrorCode MatMultAdd_SeqCSRPERM(Mat A,Vec xx,Vec ww,Vec yy)
444: {
445:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
446:   PetscScalar     *x,*y,*w;
447:   const MatScalar *aa;
448:   PetscErrorCode  ierr;
449:   PetscInt        *aj,*ai;
450: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDCSRPERM)
451:   PetscInt        i,j;
452: #endif

454:   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
455:   Mat_SeqCSRPERM *csrperm;
456:   PetscInt *iperm;  /* Points to the permutation vector. */
457:   PetscInt *xgroup;
458:     /* Denotes where groups of rows with same number of nonzeros 
459:      * begin and end in iperm. */
460:   PetscInt *nzgroup;
461:   PetscInt ngroup;
462:   PetscInt igroup;
463:   PetscInt jstart,jend;
464:     /* jstart is used in loops to denote the position in iperm where a 
465:      * group starts; jend denotes the position where it ends.
466:      * (jend + 1 is where the next group starts.) */
467:   PetscInt iold,nz;
468:   PetscInt istart,iend,isize;
469:   PetscInt ipos;
470:   PetscScalar yp[NDIM];
471:   PetscInt ip[NDIM];
472:     /* yp[] and ip[] are treated as vector "registers" for performing 
473:      * the mat-vec. */

475: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
476: #pragma disjoint(*x,*y,*aa)
477: #endif

480:   VecGetArray(xx,&x);
481:   VecGetArray(yy,&y);
482:   if (yy != ww) {
483:     VecGetArray(ww,&w);
484:   } else {
485:     w = y;
486:   }

488:   aj  = a->j;  /* aj[k] gives column index for element aa[k]. */
489:   aa  = a->a;  /* Nonzero elements stored row-by-row. */
490:   ai  = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

492:   /* Get the info we need about the permutations and groupings. */
493:   csrperm = (Mat_SeqCSRPERM *) A->spptr;
494:   iperm = csrperm->iperm;
495:   ngroup = csrperm->ngroup;
496:   xgroup = csrperm->xgroup;
497:   nzgroup = csrperm->nzgroup;
498: 
499: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDCSRPERM)
500:   fortranmultaddcsrperm_(&m,x,ii,aj,aa,y,w);
501: #else

503:   for(igroup=0; igroup<ngroup; igroup++) {
504:     jstart = xgroup[igroup];
505:     jend = xgroup[igroup+1] - 1;

507:     nz = nzgroup[igroup];

509:     /* Handle the special cases where the number of nonzeros per row 
510:      * in the group is either 0 or 1. */
511:     if(nz == 0) {
512:       for(i=jstart; i<=jend; i++) {
513:         iold = iperm[i];
514:         y[iold] = w[iold];
515:       }
516:     }
517:     else if(nz == 1) {
518:       for(i=jstart; i<=jend; i++) {
519:         iold = iperm[i];
520:         ipos = ai[iold];
521:         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
522:       }
523:     }
524:     /* For the general case: */
525:     else {
526: 
527:       /* We work our way through the current group in chunks of NDIM rows 
528:        * at a time. */

530:       for(istart=jstart; istart<=jend; istart+=NDIM) {
531:         /* Figure out where the chunk of 'isize' rows ends in iperm.
532:          * 'isize may of course be less than NDIM for the last chunk. */
533:         iend = istart + (NDIM - 1);
534:         if(iend > jend) { iend = jend; }
535:         isize = iend - istart + 1;

537:         /* Initialize the yp[] array that will be used to hold part of 
538:          * the permuted results vector, and figure out where in aa each 
539:          * row of the chunk will begin. */
540:         for(i=0; i<isize; i++) {
541:           iold = iperm[istart + i];
542:             /* iold is a row number from the matrix A *before* reordering. */
543:           ip[i] = ai[iold];
544:             /* ip[i] tells us where the ith row of the chunk begins in aa. */
545:           yp[i] = w[iold];
546:         }

548:         /* If the number of zeros per row exceeds the number of rows in 
549:          * the chunk, we should vectorize along nz, that is, perform the 
550:          * mat-vec one row at a time as in the usual CSR case. */
551:         if(nz > isize) {
552: #if defined(PETSC_HAVE_CRAYC)
553: #pragma _CRI preferstream
554: #endif
555:           for(i=0; i<isize; i++) {
556: #if defined(PETSC_HAVE_CRAYC)
557: #pragma _CRI prefervector
558: #endif
559:             for(j=0; j<nz; j++) {
560:               ipos = ip[i] + j;
561:               yp[i] += aa[ipos] * x[aj[ipos]];
562:             }
563:           }
564:         }
565:         /* Otherwise, there are enough rows in the chunk to make it 
566:          * worthwhile to vectorize across the rows, that is, to do the 
567:          * matvec by operating with "columns" of the chunk. */
568:         else {
569:           for(j=0; j<nz; j++) {
570:             for(i=0; i<isize; i++) {
571:               ipos = ip[i] + j;
572:               yp[i] += aa[ipos] * x[aj[ipos]];
573:             }
574:           }
575:         }

577: #if defined(PETSC_HAVE_CRAYC)
578: #pragma _CRI ivdep
579: #endif
580:         /* Put results from yp[] into non-permuted result vector y. */
581:         for(i=0; i<isize; i++) {
582:           y[iperm[istart+i]] = yp[i];
583:         }
584:       } /* End processing chunk of isize rows of a group. */
585: 
586:     } /* End handling matvec for chunk with nz > 1. */
587:   } /* End loop over igroup. */

589: #endif
590:   PetscLogFlops(2*a->nz - A->rmap->n);
591:   VecRestoreArray(xx,&x);
592:   VecRestoreArray(yy,&y);
593:   if (yy != ww) {
594:     VecRestoreArray(ww,&w);
595:   }
596:   return(0);
597: }


600: /* MatConvert_SeqAIJ_SeqCSRPERM converts a SeqAIJ matrix into a 
601:  * SeqCSRPERM matrix.  This routine is called by the MatCreate_SeqCSRPERM() 
602:  * routine, but can also be used to convert an assembled SeqAIJ matrix 
603:  * into a SeqCSRPERM one. */
607: PetscErrorCode  MatConvert_SeqAIJ_SeqCSRPERM(Mat A,const MatType type,MatReuse reuse,Mat *newmat)
608: {
610:   Mat            B = *newmat;
611:   Mat_SeqCSRPERM *csrperm;

614:   if (reuse == MAT_INITIAL_MATRIX) {
615:     MatDuplicate(A,MAT_COPY_VALUES,&B);
616:   }

618:   PetscNewLog(B,Mat_SeqCSRPERM,&csrperm);
619:   B->spptr = (void *) csrperm;

621:   csrperm->AssemblyEnd_SeqAIJ  = A->ops->assemblyend;
622:   csrperm->MatDestroy_SeqAIJ   = A->ops->destroy;
623:   csrperm->MatDuplicate_SeqAIJ = A->ops->duplicate;

625:   /* Set function pointers for methods that we inherit from AIJ but override. */
626:   B->ops->duplicate   = MatDuplicate_SeqCSRPERM;
627:   B->ops->assemblyend = MatAssemblyEnd_SeqCSRPERM;
628:   B->ops->destroy     = MatDestroy_SeqCSRPERM;
629:   B->ops->mult        = MatMult_SeqCSRPERM;
630:   B->ops->multadd     = MatMultAdd_SeqCSRPERM;

632:   /* If A has already been assembled, compute the permutation. */
633:   if (A->assembled == PETSC_TRUE) {
634:     SeqCSRPERM_create_perm(B);
635:   }
636: 
637:   PetscObjectComposeFunctionDynamic((PetscObject)B,"MatConvert_seqcsrperm_seqaij_C",
638:                                            "MatConvert_SeqCSRPERM_SeqAIJ",MatConvert_SeqCSRPERM_SeqAIJ);

640:   PetscObjectChangeTypeName((PetscObject)B,MATSEQCSRPERM);
641:   *newmat = B;
642:   return(0);
643: }


649: /*@C
650:    MatCreateSeqCSRPERM - Creates a sparse matrix of type SEQCSRPERM.
651:    This type inherits from AIJ, but calculates some additional permutation 
652:    information that is used to allow better vectorization of some 
653:    operations.  At the cost of increased storage, the AIJ formatted 
654:    matrix can be copied to a format in which pieces of the matrix are 
655:    stored in ELLPACK format, allowing the vectorized matrix multiply 
656:    routine to use stride-1 memory accesses.  As with the AIJ type, it is 
657:    important to preallocate matrix storage in order to get good assembly 
658:    performance.
659:    
660:    Collective on MPI_Comm

662:    Input Parameters:
663: +  comm - MPI communicator, set to PETSC_COMM_SELF
664: .  m - number of rows
665: .  n - number of columns
666: .  nz - number of nonzeros per row (same for all rows)
667: -  nnz - array containing the number of nonzeros in the various rows 
668:          (possibly different for each row) or PETSC_NULL

670:    Output Parameter:
671: .  A - the matrix 

673:    Notes:
674:    If nnz is given then nz is ignored

676:    Level: intermediate

678: .keywords: matrix, cray, sparse, parallel

680: .seealso: MatCreate(), MatCreateMPICSRPERM(), MatSetValues()
681: @*/
682: PetscErrorCode  MatCreateSeqCSRPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
683: {

687:   MatCreate(comm,A);
688:   MatSetSizes(*A,m,n,m,n);
689:   MatSetType(*A,MATSEQCSRPERM);
690:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
691:   return(0);
692: }


698: PetscErrorCode  MatCreate_SeqCSRPERM(Mat A)
699: {

703:   MatSetType(A,MATSEQAIJ);
704:   MatConvert_SeqAIJ_SeqCSRPERM(A,MATSEQCSRPERM,MAT_REUSE_MATRIX,&A);
705:   return(0);
706: }