Actual source code: baijfact2.c

  1: #define PETSCMAT_DLL

  3: /*
  4:     Factorization code for BAIJ format. 
  5: */

 7:  #include src/mat/impls/baij/seq/baij.h
 8:  #include src/inline/ilu.h
 9:  #include src/inline/dot.h

 13: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 14: {
 15:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
 17:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
 18:   PetscInt       *diag = a->diag;
 19:   MatScalar      *aa=a->a,*v;
 20:   PetscScalar    s1,*x,*b;

 23:   VecCopy(bb,xx);
 24:   VecGetArray(bb,&b);
 25:   VecGetArray(xx,&x);
 26: 
 27:   /* forward solve the U^T */
 28:   for (i=0; i<n; i++) {

 30:     v     = aa + diag[i];
 31:     /* multiply by the inverse of the block diagonal */
 32:     s1    = (*v++)*x[i];
 33:     vi    = aj + diag[i] + 1;
 34:     nz    = ai[i+1] - diag[i] - 1;
 35:     while (nz--) {
 36:       x[*vi++]  -= (*v++)*s1;
 37:     }
 38:     x[i]   = s1;
 39:   }
 40:   /* backward solve the L^T */
 41:   for (i=n-1; i>=0; i--){
 42:     v    = aa + diag[i] - 1;
 43:     vi   = aj + diag[i] - 1;
 44:     nz   = diag[i] - ai[i];
 45:     s1   = x[i];
 46:     while (nz--) {
 47:       x[*vi--]   -=  (*v--)*s1;
 48:     }
 49:   }
 50:   VecRestoreArray(bb,&b);
 51:   VecRestoreArray(xx,&x);
 52:   PetscLogFlops(2*(a->nz) - A->cmap.n);
 53:   return(0);
 54: }

 58: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 59: {
 60:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
 62:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
 63:   PetscInt       *diag = a->diag,oidx;
 64:   MatScalar      *aa=a->a,*v;
 65:   PetscScalar    s1,s2,x1,x2;
 66:   PetscScalar    *x,*b;

 69:   VecCopy(bb,xx);
 70:   VecGetArray(bb,&b);
 71:   VecGetArray(xx,&x);

 73:   /* forward solve the U^T */
 74:   idx = 0;
 75:   for (i=0; i<n; i++) {

 77:     v     = aa + 4*diag[i];
 78:     /* multiply by the inverse of the block diagonal */
 79:     x1 = x[idx];   x2 = x[1+idx];
 80:     s1 = v[0]*x1  +  v[1]*x2;
 81:     s2 = v[2]*x1  +  v[3]*x2;
 82:     v += 4;

 84:     vi    = aj + diag[i] + 1;
 85:     nz    = ai[i+1] - diag[i] - 1;
 86:     while (nz--) {
 87:       oidx = 2*(*vi++);
 88:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
 89:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
 90:       v  += 4;
 91:     }
 92:     x[idx]   = s1;x[1+idx] = s2;
 93:     idx += 2;
 94:   }
 95:   /* backward solve the L^T */
 96:   for (i=n-1; i>=0; i--){
 97:     v    = aa + 4*diag[i] - 4;
 98:     vi   = aj + diag[i] - 1;
 99:     nz   = diag[i] - ai[i];
100:     idt  = 2*i;
101:     s1   = x[idt];  s2 = x[1+idt];
102:     while (nz--) {
103:       idx   = 2*(*vi--);
104:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106:       v -= 4;
107:     }
108:   }
109:   VecRestoreArray(bb,&b);
110:   VecRestoreArray(xx,&x);
111:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
112:   return(0);
113: }

117: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118: {
119:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
121:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122:   PetscInt       *diag = a->diag,oidx;
123:   MatScalar      *aa=a->a,*v;
124:   PetscScalar    s1,s2,s3,x1,x2,x3;
125:   PetscScalar    *x,*b;

128:   VecCopy(bb,xx);
129:   VecGetArray(bb,&b);
130:   VecGetArray(xx,&x);

132:   /* forward solve the U^T */
133:   idx = 0;
134:   for (i=0; i<n; i++) {

136:     v     = aa + 9*diag[i];
137:     /* multiply by the inverse of the block diagonal */
138:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142:     v += 9;

144:     vi    = aj + diag[i] + 1;
145:     nz    = ai[i+1] - diag[i] - 1;
146:     while (nz--) {
147:       oidx = 3*(*vi++);
148:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151:       v  += 9;
152:     }
153:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154:     idx += 3;
155:   }
156:   /* backward solve the L^T */
157:   for (i=n-1; i>=0; i--){
158:     v    = aa + 9*diag[i] - 9;
159:     vi   = aj + diag[i] - 1;
160:     nz   = diag[i] - ai[i];
161:     idt  = 3*i;
162:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163:     while (nz--) {
164:       idx   = 3*(*vi--);
165:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168:       v -= 9;
169:     }
170:   }
171:   VecRestoreArray(bb,&b);
172:   VecRestoreArray(xx,&x);
173:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
174:   return(0);
175: }

179: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180: {
181:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
183:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184:   PetscInt       *diag = a->diag,oidx;
185:   MatScalar      *aa=a->a,*v;
186:   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
187:   PetscScalar    *x,*b;

190:   VecCopy(bb,xx);
191:   VecGetArray(bb,&b);
192:   VecGetArray(xx,&x);

194:   /* forward solve the U^T */
195:   idx = 0;
196:   for (i=0; i<n; i++) {

198:     v     = aa + 16*diag[i];
199:     /* multiply by the inverse of the block diagonal */
200:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205:     v += 16;

207:     vi    = aj + diag[i] + 1;
208:     nz    = ai[i+1] - diag[i] - 1;
209:     while (nz--) {
210:       oidx = 4*(*vi++);
211:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215:       v  += 16;
216:     }
217:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218:     idx += 4;
219:   }
220:   /* backward solve the L^T */
221:   for (i=n-1; i>=0; i--){
222:     v    = aa + 16*diag[i] - 16;
223:     vi   = aj + diag[i] - 1;
224:     nz   = diag[i] - ai[i];
225:     idt  = 4*i;
226:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227:     while (nz--) {
228:       idx   = 4*(*vi--);
229:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233:       v -= 16;
234:     }
235:   }
236:   VecRestoreArray(bb,&b);
237:   VecRestoreArray(xx,&x);
238:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
239:   return(0);
240: }

244: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245: {
246:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
248:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249:   PetscInt       *diag = a->diag,oidx;
250:   MatScalar      *aa=a->a,*v;
251:   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252:   PetscScalar    *x,*b;

255:   VecCopy(bb,xx);
256:   VecGetArray(bb,&b);
257:   VecGetArray(xx,&x);

259:   /* forward solve the U^T */
260:   idx = 0;
261:   for (i=0; i<n; i++) {

263:     v     = aa + 25*diag[i];
264:     /* multiply by the inverse of the block diagonal */
265:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271:     v += 25;

273:     vi    = aj + diag[i] + 1;
274:     nz    = ai[i+1] - diag[i] - 1;
275:     while (nz--) {
276:       oidx = 5*(*vi++);
277:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282:       v  += 25;
283:     }
284:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285:     idx += 5;
286:   }
287:   /* backward solve the L^T */
288:   for (i=n-1; i>=0; i--){
289:     v    = aa + 25*diag[i] - 25;
290:     vi   = aj + diag[i] - 1;
291:     nz   = diag[i] - ai[i];
292:     idt  = 5*i;
293:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294:     while (nz--) {
295:       idx   = 5*(*vi--);
296:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301:       v -= 25;
302:     }
303:   }
304:   VecRestoreArray(bb,&b);
305:   VecRestoreArray(xx,&x);
306:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
307:   return(0);
308: }

312: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313: {
314:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
316:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317:   PetscInt       *diag = a->diag,oidx;
318:   MatScalar      *aa=a->a,*v;
319:   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320:   PetscScalar    *x,*b;

323:   VecCopy(bb,xx);
324:   VecGetArray(bb,&b);
325:   VecGetArray(xx,&x);

327:   /* forward solve the U^T */
328:   idx = 0;
329:   for (i=0; i<n; i++) {

331:     v     = aa + 36*diag[i];
332:     /* multiply by the inverse of the block diagonal */
333:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334:     x6    = x[5+idx];
335:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341:     v += 36;

343:     vi    = aj + diag[i] + 1;
344:     nz    = ai[i+1] - diag[i] - 1;
345:     while (nz--) {
346:       oidx = 6*(*vi++);
347:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353:       v  += 36;
354:     }
355:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356:     x[5+idx] = s6;
357:     idx += 6;
358:   }
359:   /* backward solve the L^T */
360:   for (i=n-1; i>=0; i--){
361:     v    = aa + 36*diag[i] - 36;
362:     vi   = aj + diag[i] - 1;
363:     nz   = diag[i] - ai[i];
364:     idt  = 6*i;
365:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366:     s6 = x[5+idt];
367:     while (nz--) {
368:       idx   = 6*(*vi--);
369:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375:       v -= 36;
376:     }
377:   }
378:   VecRestoreArray(bb,&b);
379:   VecRestoreArray(xx,&x);
380:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
381:   return(0);
382: }

386: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387: {
388:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
390:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391:   PetscInt       *diag = a->diag,oidx;
392:   MatScalar      *aa=a->a,*v;
393:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394:   PetscScalar    *x,*b;

397:   VecCopy(bb,xx);
398:   VecGetArray(bb,&b);
399:   VecGetArray(xx,&x);

401:   /* forward solve the U^T */
402:   idx = 0;
403:   for (i=0; i<n; i++) {

405:     v     = aa + 49*diag[i];
406:     /* multiply by the inverse of the block diagonal */
407:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408:     x6    = x[5+idx]; x7 = x[6+idx];
409:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416:     v += 49;

418:     vi    = aj + diag[i] + 1;
419:     nz    = ai[i+1] - diag[i] - 1;
420:     while (nz--) {
421:       oidx = 7*(*vi++);
422:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429:       v  += 49;
430:     }
431:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432:     x[5+idx] = s6;x[6+idx] = s7;
433:     idx += 7;
434:   }
435:   /* backward solve the L^T */
436:   for (i=n-1; i>=0; i--){
437:     v    = aa + 49*diag[i] - 49;
438:     vi   = aj + diag[i] - 1;
439:     nz   = diag[i] - ai[i];
440:     idt  = 7*i;
441:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442:     s6 = x[5+idt];s7 = x[6+idt];
443:     while (nz--) {
444:       idx   = 7*(*vi--);
445:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452:       v -= 49;
453:     }
454:   }
455:   VecRestoreArray(bb,&b);
456:   VecRestoreArray(xx,&x);
457:   PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
458:   return(0);
459: }

461: /*---------------------------------------------------------------------------------------------*/
464: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465: {
466:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467:   IS             iscol=a->col,isrow=a->row;
469:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
470:   PetscInt       *diag = a->diag;
471:   MatScalar      *aa=a->a,*v;
472:   PetscScalar    s1,*x,*b,*t;

475:   VecGetArray(bb,&b);
476:   VecGetArray(xx,&x);
477:   t  = a->solve_work;

479:   ISGetIndices(isrow,&rout); r = rout;
480:   ISGetIndices(iscol,&cout); c = cout;

482:   /* copy the b into temp work space according to permutation */
483:   for (i=0; i<n; i++) {
484:     t[i] = b[c[i]];
485:   }

487:   /* forward solve the U^T */
488:   for (i=0; i<n; i++) {

490:     v     = aa + diag[i];
491:     /* multiply by the inverse of the block diagonal */
492:     s1    = (*v++)*t[i];
493:     vi    = aj + diag[i] + 1;
494:     nz    = ai[i+1] - diag[i] - 1;
495:     while (nz--) {
496:       t[*vi++]  -= (*v++)*s1;
497:     }
498:     t[i]   = s1;
499:   }
500:   /* backward solve the L^T */
501:   for (i=n-1; i>=0; i--){
502:     v    = aa + diag[i] - 1;
503:     vi   = aj + diag[i] - 1;
504:     nz   = diag[i] - ai[i];
505:     s1   = t[i];
506:     while (nz--) {
507:       t[*vi--]   -=  (*v--)*s1;
508:     }
509:   }

511:   /* copy t into x according to permutation */
512:   for (i=0; i<n; i++) {
513:     x[r[i]]   = t[i];
514:   }

516:   ISRestoreIndices(isrow,&rout);
517:   ISRestoreIndices(iscol,&cout);
518:   VecRestoreArray(bb,&b);
519:   VecRestoreArray(xx,&x);
520:   PetscLogFlops(2*(a->nz) - A->cmap.n);
521:   return(0);
522: }

526: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
527: {
528:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
529:   IS             iscol=a->col,isrow=a->row;
531:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
532:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
533:   MatScalar      *aa=a->a,*v;
534:   PetscScalar    s1,s2,x1,x2;
535:   PetscScalar    *x,*b,*t;

538:   VecGetArray(bb,&b);
539:   VecGetArray(xx,&x);
540:   t  = a->solve_work;

542:   ISGetIndices(isrow,&rout); r = rout;
543:   ISGetIndices(iscol,&cout); c = cout;

545:   /* copy the b into temp work space according to permutation */
546:   ii = 0;
547:   for (i=0; i<n; i++) {
548:     ic      = 2*c[i];
549:     t[ii]   = b[ic];
550:     t[ii+1] = b[ic+1];
551:     ii += 2;
552:   }

554:   /* forward solve the U^T */
555:   idx = 0;
556:   for (i=0; i<n; i++) {

558:     v     = aa + 4*diag[i];
559:     /* multiply by the inverse of the block diagonal */
560:     x1    = t[idx];   x2 = t[1+idx];
561:     s1 = v[0]*x1  +  v[1]*x2;
562:     s2 = v[2]*x1  +  v[3]*x2;
563:     v += 4;

565:     vi    = aj + diag[i] + 1;
566:     nz    = ai[i+1] - diag[i] - 1;
567:     while (nz--) {
568:       oidx = 2*(*vi++);
569:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
570:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
571:       v  += 4;
572:     }
573:     t[idx]   = s1;t[1+idx] = s2;
574:     idx += 2;
575:   }
576:   /* backward solve the L^T */
577:   for (i=n-1; i>=0; i--){
578:     v    = aa + 4*diag[i] - 4;
579:     vi   = aj + diag[i] - 1;
580:     nz   = diag[i] - ai[i];
581:     idt  = 2*i;
582:     s1 = t[idt];  s2 = t[1+idt];
583:     while (nz--) {
584:       idx   = 2*(*vi--);
585:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
586:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
587:       v -= 4;
588:     }
589:   }

591:   /* copy t into x according to permutation */
592:   ii = 0;
593:   for (i=0; i<n; i++) {
594:     ir      = 2*r[i];
595:     x[ir]   = t[ii];
596:     x[ir+1] = t[ii+1];
597:     ii += 2;
598:   }

600:   ISRestoreIndices(isrow,&rout);
601:   ISRestoreIndices(iscol,&cout);
602:   VecRestoreArray(bb,&b);
603:   VecRestoreArray(xx,&x);
604:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
605:   return(0);
606: }

610: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
611: {
612:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
613:   IS             iscol=a->col,isrow=a->row;
615:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
616:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
617:   MatScalar      *aa=a->a,*v;
618:   PetscScalar    s1,s2,s3,x1,x2,x3;
619:   PetscScalar    *x,*b,*t;

622:   VecGetArray(bb,&b);
623:   VecGetArray(xx,&x);
624:   t  = a->solve_work;

626:   ISGetIndices(isrow,&rout); r = rout;
627:   ISGetIndices(iscol,&cout); c = cout;

629:   /* copy the b into temp work space according to permutation */
630:   ii = 0;
631:   for (i=0; i<n; i++) {
632:     ic      = 3*c[i];
633:     t[ii]   = b[ic];
634:     t[ii+1] = b[ic+1];
635:     t[ii+2] = b[ic+2];
636:     ii += 3;
637:   }

639:   /* forward solve the U^T */
640:   idx = 0;
641:   for (i=0; i<n; i++) {

643:     v     = aa + 9*diag[i];
644:     /* multiply by the inverse of the block diagonal */
645:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
646:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
647:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
648:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
649:     v += 9;

651:     vi    = aj + diag[i] + 1;
652:     nz    = ai[i+1] - diag[i] - 1;
653:     while (nz--) {
654:       oidx = 3*(*vi++);
655:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
656:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
657:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658:       v  += 9;
659:     }
660:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
661:     idx += 3;
662:   }
663:   /* backward solve the L^T */
664:   for (i=n-1; i>=0; i--){
665:     v    = aa + 9*diag[i] - 9;
666:     vi   = aj + diag[i] - 1;
667:     nz   = diag[i] - ai[i];
668:     idt  = 3*i;
669:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
670:     while (nz--) {
671:       idx   = 3*(*vi--);
672:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
673:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
674:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
675:       v -= 9;
676:     }
677:   }

679:   /* copy t into x according to permutation */
680:   ii = 0;
681:   for (i=0; i<n; i++) {
682:     ir      = 3*r[i];
683:     x[ir]   = t[ii];
684:     x[ir+1] = t[ii+1];
685:     x[ir+2] = t[ii+2];
686:     ii += 3;
687:   }

689:   ISRestoreIndices(isrow,&rout);
690:   ISRestoreIndices(iscol,&cout);
691:   VecRestoreArray(bb,&b);
692:   VecRestoreArray(xx,&x);
693:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
694:   return(0);
695: }

699: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
700: {
701:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
702:   IS             iscol=a->col,isrow=a->row;
704:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
705:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
706:   MatScalar      *aa=a->a,*v;
707:   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
708:   PetscScalar    *x,*b,*t;

711:   VecGetArray(bb,&b);
712:   VecGetArray(xx,&x);
713:   t  = a->solve_work;

715:   ISGetIndices(isrow,&rout); r = rout;
716:   ISGetIndices(iscol,&cout); c = cout;

718:   /* copy the b into temp work space according to permutation */
719:   ii = 0;
720:   for (i=0; i<n; i++) {
721:     ic      = 4*c[i];
722:     t[ii]   = b[ic];
723:     t[ii+1] = b[ic+1];
724:     t[ii+2] = b[ic+2];
725:     t[ii+3] = b[ic+3];
726:     ii += 4;
727:   }

729:   /* forward solve the U^T */
730:   idx = 0;
731:   for (i=0; i<n; i++) {

733:     v     = aa + 16*diag[i];
734:     /* multiply by the inverse of the block diagonal */
735:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
736:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
737:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
738:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
739:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
740:     v += 16;

742:     vi    = aj + diag[i] + 1;
743:     nz    = ai[i+1] - diag[i] - 1;
744:     while (nz--) {
745:       oidx = 4*(*vi++);
746:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750:       v  += 16;
751:     }
752:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
753:     idx += 4;
754:   }
755:   /* backward solve the L^T */
756:   for (i=n-1; i>=0; i--){
757:     v    = aa + 16*diag[i] - 16;
758:     vi   = aj + diag[i] - 1;
759:     nz   = diag[i] - ai[i];
760:     idt  = 4*i;
761:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
762:     while (nz--) {
763:       idx   = 4*(*vi--);
764:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
765:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
766:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
767:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
768:       v -= 16;
769:     }
770:   }

772:   /* copy t into x according to permutation */
773:   ii = 0;
774:   for (i=0; i<n; i++) {
775:     ir      = 4*r[i];
776:     x[ir]   = t[ii];
777:     x[ir+1] = t[ii+1];
778:     x[ir+2] = t[ii+2];
779:     x[ir+3] = t[ii+3];
780:     ii += 4;
781:   }

783:   ISRestoreIndices(isrow,&rout);
784:   ISRestoreIndices(iscol,&cout);
785:   VecRestoreArray(bb,&b);
786:   VecRestoreArray(xx,&x);
787:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
788:   return(0);
789: }

793: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
794: {
795:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796:   IS             iscol=a->col,isrow=a->row;
798:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
799:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
800:   MatScalar      *aa=a->a,*v;
801:   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
802:   PetscScalar    *x,*b,*t;

805:   VecGetArray(bb,&b);
806:   VecGetArray(xx,&x);
807:   t  = a->solve_work;

809:   ISGetIndices(isrow,&rout); r = rout;
810:   ISGetIndices(iscol,&cout); c = cout;

812:   /* copy the b into temp work space according to permutation */
813:   ii = 0;
814:   for (i=0; i<n; i++) {
815:     ic      = 5*c[i];
816:     t[ii]   = b[ic];
817:     t[ii+1] = b[ic+1];
818:     t[ii+2] = b[ic+2];
819:     t[ii+3] = b[ic+3];
820:     t[ii+4] = b[ic+4];
821:     ii += 5;
822:   }

824:   /* forward solve the U^T */
825:   idx = 0;
826:   for (i=0; i<n; i++) {

828:     v     = aa + 25*diag[i];
829:     /* multiply by the inverse of the block diagonal */
830:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
831:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
832:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
833:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
834:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
835:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
836:     v += 25;

838:     vi    = aj + diag[i] + 1;
839:     nz    = ai[i+1] - diag[i] - 1;
840:     while (nz--) {
841:       oidx = 5*(*vi++);
842:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847:       v  += 25;
848:     }
849:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
850:     idx += 5;
851:   }
852:   /* backward solve the L^T */
853:   for (i=n-1; i>=0; i--){
854:     v    = aa + 25*diag[i] - 25;
855:     vi   = aj + diag[i] - 1;
856:     nz   = diag[i] - ai[i];
857:     idt  = 5*i;
858:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
859:     while (nz--) {
860:       idx   = 5*(*vi--);
861:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
862:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
863:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
864:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
865:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
866:       v -= 25;
867:     }
868:   }

870:   /* copy t into x according to permutation */
871:   ii = 0;
872:   for (i=0; i<n; i++) {
873:     ir      = 5*r[i];
874:     x[ir]   = t[ii];
875:     x[ir+1] = t[ii+1];
876:     x[ir+2] = t[ii+2];
877:     x[ir+3] = t[ii+3];
878:     x[ir+4] = t[ii+4];
879:     ii += 5;
880:   }

882:   ISRestoreIndices(isrow,&rout);
883:   ISRestoreIndices(iscol,&cout);
884:   VecRestoreArray(bb,&b);
885:   VecRestoreArray(xx,&x);
886:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
887:   return(0);
888: }

892: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
893: {
894:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
895:   IS             iscol=a->col,isrow=a->row;
897:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
898:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
899:   MatScalar      *aa=a->a,*v;
900:   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
901:   PetscScalar    *x,*b,*t;

904:   VecGetArray(bb,&b);
905:   VecGetArray(xx,&x);
906:   t  = a->solve_work;

908:   ISGetIndices(isrow,&rout); r = rout;
909:   ISGetIndices(iscol,&cout); c = cout;

911:   /* copy the b into temp work space according to permutation */
912:   ii = 0;
913:   for (i=0; i<n; i++) {
914:     ic      = 6*c[i];
915:     t[ii]   = b[ic];
916:     t[ii+1] = b[ic+1];
917:     t[ii+2] = b[ic+2];
918:     t[ii+3] = b[ic+3];
919:     t[ii+4] = b[ic+4];
920:     t[ii+5] = b[ic+5];
921:     ii += 6;
922:   }

924:   /* forward solve the U^T */
925:   idx = 0;
926:   for (i=0; i<n; i++) {

928:     v     = aa + 36*diag[i];
929:     /* multiply by the inverse of the block diagonal */
930:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
931:     x6    = t[5+idx];
932:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
933:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
934:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
935:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
936:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
937:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
938:     v += 36;

940:     vi    = aj + diag[i] + 1;
941:     nz    = ai[i+1] - diag[i] - 1;
942:     while (nz--) {
943:       oidx = 6*(*vi++);
944:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
945:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
946:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
947:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
948:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
949:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
950:       v  += 36;
951:     }
952:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
953:     t[5+idx] = s6;
954:     idx += 6;
955:   }
956:   /* backward solve the L^T */
957:   for (i=n-1; i>=0; i--){
958:     v    = aa + 36*diag[i] - 36;
959:     vi   = aj + diag[i] - 1;
960:     nz   = diag[i] - ai[i];
961:     idt  = 6*i;
962:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
963:     s6 = t[5+idt];
964:     while (nz--) {
965:       idx   = 6*(*vi--);
966:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
967:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
968:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
969:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
970:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
971:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
972:       v -= 36;
973:     }
974:   }

976:   /* copy t into x according to permutation */
977:   ii = 0;
978:   for (i=0; i<n; i++) {
979:     ir      = 6*r[i];
980:     x[ir]   = t[ii];
981:     x[ir+1] = t[ii+1];
982:     x[ir+2] = t[ii+2];
983:     x[ir+3] = t[ii+3];
984:     x[ir+4] = t[ii+4];
985:     x[ir+5] = t[ii+5];
986:     ii += 6;
987:   }

989:   ISRestoreIndices(isrow,&rout);
990:   ISRestoreIndices(iscol,&cout);
991:   VecRestoreArray(bb,&b);
992:   VecRestoreArray(xx,&x);
993:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
994:   return(0);
995: }

999: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1000: {
1001:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1002:   IS             iscol=a->col,isrow=a->row;
1004:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1005:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1006:   MatScalar      *aa=a->a,*v;
1007:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1008:   PetscScalar    *x,*b,*t;

1011:   VecGetArray(bb,&b);
1012:   VecGetArray(xx,&x);
1013:   t  = a->solve_work;

1015:   ISGetIndices(isrow,&rout); r = rout;
1016:   ISGetIndices(iscol,&cout); c = cout;

1018:   /* copy the b into temp work space according to permutation */
1019:   ii = 0;
1020:   for (i=0; i<n; i++) {
1021:     ic      = 7*c[i];
1022:     t[ii]   = b[ic];
1023:     t[ii+1] = b[ic+1];
1024:     t[ii+2] = b[ic+2];
1025:     t[ii+3] = b[ic+3];
1026:     t[ii+4] = b[ic+4];
1027:     t[ii+5] = b[ic+5];
1028:     t[ii+6] = b[ic+6];
1029:     ii += 7;
1030:   }

1032:   /* forward solve the U^T */
1033:   idx = 0;
1034:   for (i=0; i<n; i++) {

1036:     v     = aa + 49*diag[i];
1037:     /* multiply by the inverse of the block diagonal */
1038:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1039:     x6    = t[5+idx]; x7 = t[6+idx];
1040:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1041:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1042:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1043:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1044:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1045:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1046:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1047:     v += 49;

1049:     vi    = aj + diag[i] + 1;
1050:     nz    = ai[i+1] - diag[i] - 1;
1051:     while (nz--) {
1052:       oidx = 7*(*vi++);
1053:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1054:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1055:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1056:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1057:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1058:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1059:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1060:       v  += 49;
1061:     }
1062:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1063:     t[5+idx] = s6;t[6+idx] = s7;
1064:     idx += 7;
1065:   }
1066:   /* backward solve the L^T */
1067:   for (i=n-1; i>=0; i--){
1068:     v    = aa + 49*diag[i] - 49;
1069:     vi   = aj + diag[i] - 1;
1070:     nz   = diag[i] - ai[i];
1071:     idt  = 7*i;
1072:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1073:     s6 = t[5+idt];s7 = t[6+idt];
1074:     while (nz--) {
1075:       idx   = 7*(*vi--);
1076:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1077:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1078:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1079:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1080:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1081:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1082:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1083:       v -= 49;
1084:     }
1085:   }

1087:   /* copy t into x according to permutation */
1088:   ii = 0;
1089:   for (i=0; i<n; i++) {
1090:     ir      = 7*r[i];
1091:     x[ir]   = t[ii];
1092:     x[ir+1] = t[ii+1];
1093:     x[ir+2] = t[ii+2];
1094:     x[ir+3] = t[ii+3];
1095:     x[ir+4] = t[ii+4];
1096:     x[ir+5] = t[ii+5];
1097:     x[ir+6] = t[ii+6];
1098:     ii += 7;
1099:   }

1101:   ISRestoreIndices(isrow,&rout);
1102:   ISRestoreIndices(iscol,&cout);
1103:   VecRestoreArray(bb,&b);
1104:   VecRestoreArray(xx,&x);
1105:   PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
1106:   return(0);
1107: }

1109: /* ----------------------------------------------------------- */
1112: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1113: {
1114:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1115:   IS             iscol=a->col,isrow=a->row;
1117:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1118:   PetscInt       nz,bs=A->rmap.bs,bs2=a->bs2,*rout,*cout;
1119:   MatScalar      *aa=a->a,*v;
1120:   PetscScalar    *x,*b,*s,*t,*ls;

1123:   VecGetArray(bb,&b);
1124:   VecGetArray(xx,&x);
1125:   t  = a->solve_work;

1127:   ISGetIndices(isrow,&rout); r = rout;
1128:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1130:   /* forward solve the lower triangular */
1131:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1132:   for (i=1; i<n; i++) {
1133:     v   = aa + bs2*ai[i];
1134:     vi  = aj + ai[i];
1135:     nz  = a->diag[i] - ai[i];
1136:     s = t + bs*i;
1137:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1138:     while (nz--) {
1139:       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1140:       v += bs2;
1141:     }
1142:   }
1143:   /* backward solve the upper triangular */
1144:   ls = a->solve_work + A->cmap.n;
1145:   for (i=n-1; i>=0; i--){
1146:     v   = aa + bs2*(a->diag[i] + 1);
1147:     vi  = aj + a->diag[i] + 1;
1148:     nz  = ai[i+1] - a->diag[i] - 1;
1149:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1150:     while (nz--) {
1151:       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1152:       v += bs2;
1153:     }
1154:     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1155:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1156:   }

1158:   ISRestoreIndices(isrow,&rout);
1159:   ISRestoreIndices(iscol,&cout);
1160:   VecRestoreArray(bb,&b);
1161:   VecRestoreArray(xx,&x);
1162:   PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap.bs*A->cmap.n);
1163:   return(0);
1164: }

1168: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1169: {
1170:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1171:   IS             iscol=a->col,isrow=a->row;
1173:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1174:   PetscInt       *diag = a->diag;
1175:   MatScalar      *aa=a->a,*v;
1176:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1177:   PetscScalar    *x,*b,*t;

1180:   VecGetArray(bb,&b);
1181:   VecGetArray(xx,&x);
1182:   t  = a->solve_work;

1184:   ISGetIndices(isrow,&rout); r = rout;
1185:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1187:   /* forward solve the lower triangular */
1188:   idx    = 7*(*r++);
1189:   t[0] = b[idx];   t[1] = b[1+idx];
1190:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1191:   t[5] = b[5+idx]; t[6] = b[6+idx];

1193:   for (i=1; i<n; i++) {
1194:     v     = aa + 49*ai[i];
1195:     vi    = aj + ai[i];
1196:     nz    = diag[i] - ai[i];
1197:     idx   = 7*(*r++);
1198:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1199:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1200:     while (nz--) {
1201:       idx   = 7*(*vi++);
1202:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1203:       x4    = t[3+idx];x5 = t[4+idx];
1204:       x6    = t[5+idx];x7 = t[6+idx];
1205:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1206:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1207:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1208:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1209:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1210:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1211:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1212:       v += 49;
1213:     }
1214:     idx = 7*i;
1215:     t[idx]   = s1;t[1+idx] = s2;
1216:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1217:     t[5+idx] = s6;t[6+idx] = s7;
1218:   }
1219:   /* backward solve the upper triangular */
1220:   for (i=n-1; i>=0; i--){
1221:     v    = aa + 49*diag[i] + 49;
1222:     vi   = aj + diag[i] + 1;
1223:     nz   = ai[i+1] - diag[i] - 1;
1224:     idt  = 7*i;
1225:     s1 = t[idt];  s2 = t[1+idt];
1226:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1227:     s6 = t[5+idt];s7 = t[6+idt];
1228:     while (nz--) {
1229:       idx   = 7*(*vi++);
1230:       x1    = t[idx];   x2 = t[1+idx];
1231:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1232:       x6    = t[5+idx]; x7 = t[6+idx];
1233:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1234:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1235:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1236:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1237:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1238:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1239:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1240:       v += 49;
1241:     }
1242:     idc = 7*(*c--);
1243:     v   = aa + 49*diag[i];
1244:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1245:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1246:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1247:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1248:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1249:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1250:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1251:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1252:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1253:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1254:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1255:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1256:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1257:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1258:   }

1260:   ISRestoreIndices(isrow,&rout);
1261:   ISRestoreIndices(iscol,&cout);
1262:   VecRestoreArray(bb,&b);
1263:   VecRestoreArray(xx,&x);
1264:   PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
1265:   return(0);
1266: }

1270: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1271: {
1272:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1273:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1275:   PetscInt       *diag = a->diag,jdx;
1276:   MatScalar      *aa=a->a,*v;
1277:   PetscScalar    *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

1280:   VecGetArray(bb,&b);
1281:   VecGetArray(xx,&x);
1282:   /* forward solve the lower triangular */
1283:   idx    = 0;
1284:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1285:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1286:   x[6] = b[6+idx];
1287:   for (i=1; i<n; i++) {
1288:     v     =  aa + 49*ai[i];
1289:     vi    =  aj + ai[i];
1290:     nz    =  diag[i] - ai[i];
1291:     idx   =  7*i;
1292:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1293:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1294:     s7  =  b[6+idx];
1295:     while (nz--) {
1296:       jdx   = 7*(*vi++);
1297:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1298:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1299:       x7    = x[6+jdx];
1300:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1301:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1302:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1303:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1304:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1305:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1306:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1307:       v += 49;
1308:      }
1309:     x[idx]   = s1;
1310:     x[1+idx] = s2;
1311:     x[2+idx] = s3;
1312:     x[3+idx] = s4;
1313:     x[4+idx] = s5;
1314:     x[5+idx] = s6;
1315:     x[6+idx] = s7;
1316:   }
1317:   /* backward solve the upper triangular */
1318:   for (i=n-1; i>=0; i--){
1319:     v    = aa + 49*diag[i] + 49;
1320:     vi   = aj + diag[i] + 1;
1321:     nz   = ai[i+1] - diag[i] - 1;
1322:     idt  = 7*i;
1323:     s1 = x[idt];   s2 = x[1+idt];
1324:     s3 = x[2+idt]; s4 = x[3+idt];
1325:     s5 = x[4+idt]; s6 = x[5+idt];
1326:     s7 = x[6+idt];
1327:     while (nz--) {
1328:       idx   = 7*(*vi++);
1329:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1330:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1331:       x7    = x[6+idx];
1332:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1333:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1334:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1335:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1336:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1337:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1338:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1339:       v += 49;
1340:     }
1341:     v        = aa + 49*diag[i];
1342:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1343:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1344:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1345:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1346:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1347:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1348:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1349:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1350:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1351:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1352:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1353:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1354:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1355:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1356:   }

1358:   VecRestoreArray(bb,&b);
1359:   VecRestoreArray(xx,&x);
1360:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1361:   return(0);
1362: }

1366: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1367: {
1368:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1369:   IS             iscol=a->col,isrow=a->row;
1371:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1372:   PetscInt       *diag = a->diag;
1373:   MatScalar      *aa=a->a,*v;
1374:   PetscScalar    *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;

1377:   VecGetArray(bb,&b);
1378:   VecGetArray(xx,&x);
1379:   t  = a->solve_work;

1381:   ISGetIndices(isrow,&rout); r = rout;
1382:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1384:   /* forward solve the lower triangular */
1385:   idx    = 6*(*r++);
1386:   t[0] = b[idx];   t[1] = b[1+idx];
1387:   t[2] = b[2+idx]; t[3] = b[3+idx];
1388:   t[4] = b[4+idx]; t[5] = b[5+idx];
1389:   for (i=1; i<n; i++) {
1390:     v     = aa + 36*ai[i];
1391:     vi    = aj + ai[i];
1392:     nz    = diag[i] - ai[i];
1393:     idx   = 6*(*r++);
1394:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1395:     s5  = b[4+idx]; s6 = b[5+idx];
1396:     while (nz--) {
1397:       idx   = 6*(*vi++);
1398:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1399:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1400:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1401:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1402:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1403:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1404:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1405:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1406:       v += 36;
1407:     }
1408:     idx = 6*i;
1409:     t[idx]   = s1;t[1+idx] = s2;
1410:     t[2+idx] = s3;t[3+idx] = s4;
1411:     t[4+idx] = s5;t[5+idx] = s6;
1412:   }
1413:   /* backward solve the upper triangular */
1414:   for (i=n-1; i>=0; i--){
1415:     v    = aa + 36*diag[i] + 36;
1416:     vi   = aj + diag[i] + 1;
1417:     nz   = ai[i+1] - diag[i] - 1;
1418:     idt  = 6*i;
1419:     s1 = t[idt];  s2 = t[1+idt];
1420:     s3 = t[2+idt];s4 = t[3+idt];
1421:     s5 = t[4+idt];s6 = t[5+idt];
1422:     while (nz--) {
1423:       idx   = 6*(*vi++);
1424:       x1    = t[idx];   x2 = t[1+idx];
1425:       x3    = t[2+idx]; x4 = t[3+idx];
1426:       x5    = t[4+idx]; x6 = t[5+idx];
1427:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1428:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1429:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1430:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1431:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1432:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1433:       v += 36;
1434:     }
1435:     idc = 6*(*c--);
1436:     v   = aa + 36*diag[i];
1437:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1438:                                  v[18]*s4+v[24]*s5+v[30]*s6;
1439:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1440:                                  v[19]*s4+v[25]*s5+v[31]*s6;
1441:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1442:                                  v[20]*s4+v[26]*s5+v[32]*s6;
1443:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1444:                                  v[21]*s4+v[27]*s5+v[33]*s6;
1445:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1446:                                  v[22]*s4+v[28]*s5+v[34]*s6;
1447:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1448:                                  v[23]*s4+v[29]*s5+v[35]*s6;
1449:   }

1451:   ISRestoreIndices(isrow,&rout);
1452:   ISRestoreIndices(iscol,&cout);
1453:   VecRestoreArray(bb,&b);
1454:   VecRestoreArray(xx,&x);
1455:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1456:   return(0);
1457: }

1461: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1462: {
1463:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1464:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1466:   PetscInt       *diag = a->diag,jdx;
1467:   MatScalar      *aa=a->a,*v;
1468:   PetscScalar    *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

1471:   VecGetArray(bb,&b);
1472:   VecGetArray(xx,&x);
1473:   /* forward solve the lower triangular */
1474:   idx    = 0;
1475:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1476:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1477:   for (i=1; i<n; i++) {
1478:     v     =  aa + 36*ai[i];
1479:     vi    =  aj + ai[i];
1480:     nz    =  diag[i] - ai[i];
1481:     idx   =  6*i;
1482:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1483:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1484:     while (nz--) {
1485:       jdx   = 6*(*vi++);
1486:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1487:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1488:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1489:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1490:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1491:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1492:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1493:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1494:       v += 36;
1495:      }
1496:     x[idx]   = s1;
1497:     x[1+idx] = s2;
1498:     x[2+idx] = s3;
1499:     x[3+idx] = s4;
1500:     x[4+idx] = s5;
1501:     x[5+idx] = s6;
1502:   }
1503:   /* backward solve the upper triangular */
1504:   for (i=n-1; i>=0; i--){
1505:     v    = aa + 36*diag[i] + 36;
1506:     vi   = aj + diag[i] + 1;
1507:     nz   = ai[i+1] - diag[i] - 1;
1508:     idt  = 6*i;
1509:     s1 = x[idt];   s2 = x[1+idt];
1510:     s3 = x[2+idt]; s4 = x[3+idt];
1511:     s5 = x[4+idt]; s6 = x[5+idt];
1512:     while (nz--) {
1513:       idx   = 6*(*vi++);
1514:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1515:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1516:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1517:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1518:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1519:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1520:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1521:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1522:       v += 36;
1523:     }
1524:     v        = aa + 36*diag[i];
1525:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1526:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1527:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1528:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1529:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1530:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1531:   }

1533:   VecRestoreArray(bb,&b);
1534:   VecRestoreArray(xx,&x);
1535:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1536:   return(0);
1537: }

1541: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1542: {
1543:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1544:   IS             iscol=a->col,isrow=a->row;
1546:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1547:   PetscInt       *diag = a->diag;
1548:   MatScalar      *aa=a->a,*v;
1549:   PetscScalar    *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;

1552:   VecGetArray(bb,&b);
1553:   VecGetArray(xx,&x);
1554:   t  = a->solve_work;

1556:   ISGetIndices(isrow,&rout); r = rout;
1557:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1559:   /* forward solve the lower triangular */
1560:   idx    = 5*(*r++);
1561:   t[0] = b[idx];   t[1] = b[1+idx];
1562:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1563:   for (i=1; i<n; i++) {
1564:     v     = aa + 25*ai[i];
1565:     vi    = aj + ai[i];
1566:     nz    = diag[i] - ai[i];
1567:     idx   = 5*(*r++);
1568:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1569:     s5  = b[4+idx];
1570:     while (nz--) {
1571:       idx   = 5*(*vi++);
1572:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1573:       x4    = t[3+idx];x5 = t[4+idx];
1574:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1575:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1576:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1577:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1578:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1579:       v += 25;
1580:     }
1581:     idx = 5*i;
1582:     t[idx]   = s1;t[1+idx] = s2;
1583:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1584:   }
1585:   /* backward solve the upper triangular */
1586:   for (i=n-1; i>=0; i--){
1587:     v    = aa + 25*diag[i] + 25;
1588:     vi   = aj + diag[i] + 1;
1589:     nz   = ai[i+1] - diag[i] - 1;
1590:     idt  = 5*i;
1591:     s1 = t[idt];  s2 = t[1+idt];
1592:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1593:     while (nz--) {
1594:       idx   = 5*(*vi++);
1595:       x1    = t[idx];   x2 = t[1+idx];
1596:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1597:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1598:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1599:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1600:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1601:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1602:       v += 25;
1603:     }
1604:     idc = 5*(*c--);
1605:     v   = aa + 25*diag[i];
1606:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1607:                                  v[15]*s4+v[20]*s5;
1608:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1609:                                  v[16]*s4+v[21]*s5;
1610:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1611:                                  v[17]*s4+v[22]*s5;
1612:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1613:                                  v[18]*s4+v[23]*s5;
1614:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1615:                                  v[19]*s4+v[24]*s5;
1616:   }

1618:   ISRestoreIndices(isrow,&rout);
1619:   ISRestoreIndices(iscol,&cout);
1620:   VecRestoreArray(bb,&b);
1621:   VecRestoreArray(xx,&x);
1622:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
1623:   return(0);
1624: }

1628: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1629: {
1630:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1631:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1633:   PetscInt       *diag = a->diag,jdx;
1634:   MatScalar      *aa=a->a,*v;
1635:   PetscScalar    *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;

1638:   VecGetArray(bb,&b);
1639:   VecGetArray(xx,&x);
1640:   /* forward solve the lower triangular */
1641:   idx    = 0;
1642:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1643:   for (i=1; i<n; i++) {
1644:     v     =  aa + 25*ai[i];
1645:     vi    =  aj + ai[i];
1646:     nz    =  diag[i] - ai[i];
1647:     idx   =  5*i;
1648:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1649:     while (nz--) {
1650:       jdx   = 5*(*vi++);
1651:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1652:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1653:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1654:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1655:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1656:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1657:       v    += 25;
1658:     }
1659:     x[idx]   = s1;
1660:     x[1+idx] = s2;
1661:     x[2+idx] = s3;
1662:     x[3+idx] = s4;
1663:     x[4+idx] = s5;
1664:   }
1665:   /* backward solve the upper triangular */
1666:   for (i=n-1; i>=0; i--){
1667:     v    = aa + 25*diag[i] + 25;
1668:     vi   = aj + diag[i] + 1;
1669:     nz   = ai[i+1] - diag[i] - 1;
1670:     idt  = 5*i;
1671:     s1 = x[idt];  s2 = x[1+idt];
1672:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1673:     while (nz--) {
1674:       idx   = 5*(*vi++);
1675:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1676:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1677:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1678:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1679:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1680:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1681:       v    += 25;
1682:     }
1683:     v        = aa + 25*diag[i];
1684:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1685:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1686:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1687:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1688:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1689:   }

1691:   VecRestoreArray(bb,&b);
1692:   VecRestoreArray(xx,&x);
1693:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
1694:   return(0);
1695: }

1699: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1700: {
1701:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1702:   IS             iscol=a->col,isrow=a->row;
1704:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1705:   PetscInt       *diag = a->diag;
1706:   MatScalar      *aa=a->a,*v;
1707:   PetscScalar    *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;

1710:   VecGetArray(bb,&b);
1711:   VecGetArray(xx,&x);
1712:   t  = a->solve_work;

1714:   ISGetIndices(isrow,&rout); r = rout;
1715:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1717:   /* forward solve the lower triangular */
1718:   idx    = 4*(*r++);
1719:   t[0] = b[idx];   t[1] = b[1+idx];
1720:   t[2] = b[2+idx]; t[3] = b[3+idx];
1721:   for (i=1; i<n; i++) {
1722:     v     = aa + 16*ai[i];
1723:     vi    = aj + ai[i];
1724:     nz    = diag[i] - ai[i];
1725:     idx   = 4*(*r++);
1726:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1727:     while (nz--) {
1728:       idx   = 4*(*vi++);
1729:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1730:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1731:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1732:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1733:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1734:       v    += 16;
1735:     }
1736:     idx        = 4*i;
1737:     t[idx]   = s1;t[1+idx] = s2;
1738:     t[2+idx] = s3;t[3+idx] = s4;
1739:   }
1740:   /* backward solve the upper triangular */
1741:   for (i=n-1; i>=0; i--){
1742:     v    = aa + 16*diag[i] + 16;
1743:     vi   = aj + diag[i] + 1;
1744:     nz   = ai[i+1] - diag[i] - 1;
1745:     idt  = 4*i;
1746:     s1 = t[idt];  s2 = t[1+idt];
1747:     s3 = t[2+idt];s4 = t[3+idt];
1748:     while (nz--) {
1749:       idx   = 4*(*vi++);
1750:       x1    = t[idx];   x2 = t[1+idx];
1751:       x3    = t[2+idx]; x4 = t[3+idx];
1752:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1753:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1754:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1755:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1756:       v += 16;
1757:     }
1758:     idc      = 4*(*c--);
1759:     v        = aa + 16*diag[i];
1760:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1761:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1762:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1763:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1764:   }

1766:   ISRestoreIndices(isrow,&rout);
1767:   ISRestoreIndices(iscol,&cout);
1768:   VecRestoreArray(bb,&b);
1769:   VecRestoreArray(xx,&x);
1770:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
1771:   return(0);
1772: }

1776: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1777: {
1778:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1779:   IS             iscol=a->col,isrow=a->row;
1781:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1782:   PetscInt       *diag = a->diag;
1783:   MatScalar      *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1784:   PetscScalar    *x,*b;

1787:   VecGetArray(bb,&b);
1788:   VecGetArray(xx,&x);
1789:   t  = (MatScalar *)a->solve_work;

1791:   ISGetIndices(isrow,&rout); r = rout;
1792:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1794:   /* forward solve the lower triangular */
1795:   idx    = 4*(*r++);
1796:   t[0] = (MatScalar)b[idx];
1797:   t[1] = (MatScalar)b[1+idx];
1798:   t[2] = (MatScalar)b[2+idx];
1799:   t[3] = (MatScalar)b[3+idx];
1800:   for (i=1; i<n; i++) {
1801:     v     = aa + 16*ai[i];
1802:     vi    = aj + ai[i];
1803:     nz    = diag[i] - ai[i];
1804:     idx   = 4*(*r++);
1805:     s1 = (MatScalar)b[idx];
1806:     s2 = (MatScalar)b[1+idx];
1807:     s3 = (MatScalar)b[2+idx];
1808:     s4 = (MatScalar)b[3+idx];
1809:     while (nz--) {
1810:       idx   = 4*(*vi++);
1811:       x1  = t[idx];
1812:       x2  = t[1+idx];
1813:       x3  = t[2+idx];
1814:       x4  = t[3+idx];
1815:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1816:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1817:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1818:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1819:       v    += 16;
1820:     }
1821:     idx        = 4*i;
1822:     t[idx]   = s1;
1823:     t[1+idx] = s2;
1824:     t[2+idx] = s3;
1825:     t[3+idx] = s4;
1826:   }
1827:   /* backward solve the upper triangular */
1828:   for (i=n-1; i>=0; i--){
1829:     v    = aa + 16*diag[i] + 16;
1830:     vi   = aj + diag[i] + 1;
1831:     nz   = ai[i+1] - diag[i] - 1;
1832:     idt  = 4*i;
1833:     s1 = t[idt];
1834:     s2 = t[1+idt];
1835:     s3 = t[2+idt];
1836:     s4 = t[3+idt];
1837:     while (nz--) {
1838:       idx   = 4*(*vi++);
1839:       x1  = t[idx];
1840:       x2  = t[1+idx];
1841:       x3  = t[2+idx];
1842:       x4  = t[3+idx];
1843:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1844:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1845:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1846:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1847:       v += 16;
1848:     }
1849:     idc      = 4*(*c--);
1850:     v        = aa + 16*diag[i];
1851:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1852:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1853:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1854:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1855:     x[idc]   = (PetscScalar)t[idt];
1856:     x[1+idc] = (PetscScalar)t[1+idt];
1857:     x[2+idc] = (PetscScalar)t[2+idt];
1858:     x[3+idc] = (PetscScalar)t[3+idt];
1859:  }

1861:   ISRestoreIndices(isrow,&rout);
1862:   ISRestoreIndices(iscol,&cout);
1863:   VecRestoreArray(bb,&b);
1864:   VecRestoreArray(xx,&x);
1865:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
1866:   return(0);
1867: }

1869: #if defined (PETSC_HAVE_SSE)

1871: #include PETSC_HAVE_SSE

1875: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1876: {
1877:   /* 
1878:      Note: This code uses demotion of double
1879:      to float when performing the mixed-mode computation.
1880:      This may not be numerically reasonable for all applications.
1881:   */
1882:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1883:   IS             iscol=a->col,isrow=a->row;
1885:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1886:   PetscInt       *diag = a->diag,ai16;
1887:   MatScalar      *aa=a->a,*v;
1888:   PetscScalar    *x,*b,*t;

1890:   /* Make space in temp stack for 16 Byte Aligned arrays */
1891:   float           ssealignedspace[11],*tmps,*tmpx;
1892:   unsigned long   offset;
1893: 
1895:   SSE_SCOPE_BEGIN;

1897:     offset = (unsigned long)ssealignedspace % 16;
1898:     if (offset) offset = (16 - offset)/4;
1899:     tmps = &ssealignedspace[offset];
1900:     tmpx = &ssealignedspace[offset+4];
1901:     PREFETCH_NTA(aa+16*ai[1]);

1903:     VecGetArray(bb,&b);
1904:     VecGetArray(xx,&x);
1905:     t  = a->solve_work;

1907:     ISGetIndices(isrow,&rout); r = rout;
1908:     ISGetIndices(iscol,&cout); c = cout + (n-1);

1910:     /* forward solve the lower triangular */
1911:     idx  = 4*(*r++);
1912:     t[0] = b[idx];   t[1] = b[1+idx];
1913:     t[2] = b[2+idx]; t[3] = b[3+idx];
1914:     v    =  aa + 16*ai[1];

1916:     for (i=1; i<n;) {
1917:       PREFETCH_NTA(&v[8]);
1918:       vi   =  aj      + ai[i];
1919:       nz   =  diag[i] - ai[i];
1920:       idx  =  4*(*r++);

1922:       /* Demote sum from double to float */
1923:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1924:       LOAD_PS(tmps,XMM7);

1926:       while (nz--) {
1927:         PREFETCH_NTA(&v[16]);
1928:         idx = 4*(*vi++);
1929: 
1930:         /* Demote solution (so far) from double to float */
1931:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

1933:         /* 4x4 Matrix-Vector product with negative accumulation: */
1934:         SSE_INLINE_BEGIN_2(tmpx,v)
1935:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1937:           /* First Column */
1938:           SSE_COPY_PS(XMM0,XMM6)
1939:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1940:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1941:           SSE_SUB_PS(XMM7,XMM0)
1942: 
1943:           /* Second Column */
1944:           SSE_COPY_PS(XMM1,XMM6)
1945:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1946:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1947:           SSE_SUB_PS(XMM7,XMM1)
1948: 
1949:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1950: 
1951:           /* Third Column */
1952:           SSE_COPY_PS(XMM2,XMM6)
1953:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1954:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1955:           SSE_SUB_PS(XMM7,XMM2)

1957:           /* Fourth Column */
1958:           SSE_COPY_PS(XMM3,XMM6)
1959:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1960:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1961:           SSE_SUB_PS(XMM7,XMM3)
1962:         SSE_INLINE_END_2
1963: 
1964:         v  += 16;
1965:       }
1966:       idx = 4*i;
1967:       v   = aa + 16*ai[++i];
1968:       PREFETCH_NTA(v);
1969:       STORE_PS(tmps,XMM7);

1971:       /* Promote result from float to double */
1972:       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1973:     }
1974:     /* backward solve the upper triangular */
1975:     idt  = 4*(n-1);
1976:     ai16 = 16*diag[n-1];
1977:     v    = aa + ai16 + 16;
1978:     for (i=n-1; i>=0;){
1979:       PREFETCH_NTA(&v[8]);
1980:       vi = aj + diag[i] + 1;
1981:       nz = ai[i+1] - diag[i] - 1;
1982: 
1983:       /* Demote accumulator from double to float */
1984:       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1985:       LOAD_PS(tmps,XMM7);

1987:       while (nz--) {
1988:         PREFETCH_NTA(&v[16]);
1989:         idx = 4*(*vi++);

1991:         /* Demote solution (so far) from double to float */
1992:         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

1994:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1995:         SSE_INLINE_BEGIN_2(tmpx,v)
1996:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1998:           /* First Column */
1999:           SSE_COPY_PS(XMM0,XMM6)
2000:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2001:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2002:           SSE_SUB_PS(XMM7,XMM0)

2004:           /* Second Column */
2005:           SSE_COPY_PS(XMM1,XMM6)
2006:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2007:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2008:           SSE_SUB_PS(XMM7,XMM1)

2010:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2011: 
2012:           /* Third Column */
2013:           SSE_COPY_PS(XMM2,XMM6)
2014:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2015:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2016:           SSE_SUB_PS(XMM7,XMM2)

2018:           /* Fourth Column */
2019:           SSE_COPY_PS(XMM3,XMM6)
2020:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2021:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2022:           SSE_SUB_PS(XMM7,XMM3)
2023:         SSE_INLINE_END_2
2024:         v  += 16;
2025:       }
2026:       v    = aa + ai16;
2027:       ai16 = 16*diag[--i];
2028:       PREFETCH_NTA(aa+ai16+16);
2029:       /* 
2030:          Scale the result by the diagonal 4x4 block, 
2031:          which was inverted as part of the factorization
2032:       */
2033:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2034:         /* First Column */
2035:         SSE_COPY_PS(XMM0,XMM7)
2036:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2037:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2039:         /* Second Column */
2040:         SSE_COPY_PS(XMM1,XMM7)
2041:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2042:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2043:         SSE_ADD_PS(XMM0,XMM1)

2045:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2046: 
2047:         /* Third Column */
2048:         SSE_COPY_PS(XMM2,XMM7)
2049:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2050:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2051:         SSE_ADD_PS(XMM0,XMM2)

2053:         /* Fourth Column */
2054:         SSE_COPY_PS(XMM3,XMM7)
2055:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2056:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2057:         SSE_ADD_PS(XMM0,XMM3)
2058: 
2059:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2060:       SSE_INLINE_END_3

2062:       /* Promote solution from float to double */
2063:       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

2065:       /* Apply reordering to t and stream into x.    */
2066:       /* This way, x doesn't pollute the cache.      */
2067:       /* Be careful with size: 2 doubles = 4 floats! */
2068:       idc  = 4*(*c--);
2069:       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2070:         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2071:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2072:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2073:         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2074:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2075:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2076:       SSE_INLINE_END_2
2077:       v    = aa + ai16 + 16;
2078:       idt -= 4;
2079:     }

2081:     ISRestoreIndices(isrow,&rout);
2082:     ISRestoreIndices(iscol,&cout);
2083:     VecRestoreArray(bb,&b);
2084:     VecRestoreArray(xx,&x);
2085:     PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2086:   SSE_SCOPE_END;
2087:   return(0);
2088: }

2090: #endif


2093: /*
2094:       Special case where the matrix was ILU(0) factored in the natural
2095:    ordering. This eliminates the need for the column and row permutation.
2096: */
2099: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2100: {
2101:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2102:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2104:   PetscInt       *diag = a->diag;
2105:   MatScalar      *aa=a->a;
2106:   PetscScalar    *x,*b;

2109:   VecGetArray(bb,&b);
2110:   VecGetArray(xx,&x);

2112: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2113:   {
2114:     static PetscScalar w[2000]; /* very BAD need to fix */
2115:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2116:   }
2117: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2118:   {
2119:     static PetscScalar w[2000]; /* very BAD need to fix */
2120:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2121:   }
2122: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2123:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2124: #else
2125:   {
2126:     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2127:     MatScalar    *v;
2128:     PetscInt     jdx,idt,idx,nz,*vi,i,ai16;

2130:   /* forward solve the lower triangular */
2131:   idx    = 0;
2132:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2133:   for (i=1; i<n; i++) {
2134:     v     =  aa      + 16*ai[i];
2135:     vi    =  aj      + ai[i];
2136:     nz    =  diag[i] - ai[i];
2137:     idx   +=  4;
2138:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2139:     while (nz--) {
2140:       jdx   = 4*(*vi++);
2141:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2142:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2143:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2144:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2145:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2146:       v    += 16;
2147:     }
2148:     x[idx]   = s1;
2149:     x[1+idx] = s2;
2150:     x[2+idx] = s3;
2151:     x[3+idx] = s4;
2152:   }
2153:   /* backward solve the upper triangular */
2154:   idt = 4*(n-1);
2155:   for (i=n-1; i>=0; i--){
2156:     ai16 = 16*diag[i];
2157:     v    = aa + ai16 + 16;
2158:     vi   = aj + diag[i] + 1;
2159:     nz   = ai[i+1] - diag[i] - 1;
2160:     s1 = x[idt];  s2 = x[1+idt];
2161:     s3 = x[2+idt];s4 = x[3+idt];
2162:     while (nz--) {
2163:       idx   = 4*(*vi++);
2164:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2165:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2166:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2167:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2168:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2169:       v    += 16;
2170:     }
2171:     v        = aa + ai16;
2172:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2173:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2174:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2175:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2176:     idt -= 4;
2177:   }
2178:   }
2179: #endif

2181:   VecRestoreArray(bb,&b);
2182:   VecRestoreArray(xx,&x);
2183:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2184:   return(0);
2185: }

2189: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2190: {
2191:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2192:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2194:   PetscInt       *diag = a->diag;
2195:   MatScalar      *aa=a->a;
2196:   PetscScalar    *x,*b;

2199:   VecGetArray(bb,&b);
2200:   VecGetArray(xx,&x);

2202:   {
2203:     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2204:     MatScalar  *v,*t=(MatScalar *)x;
2205:     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;

2207:     /* forward solve the lower triangular */
2208:     idx  = 0;
2209:     t[0] = (MatScalar)b[0];
2210:     t[1] = (MatScalar)b[1];
2211:     t[2] = (MatScalar)b[2];
2212:     t[3] = (MatScalar)b[3];
2213:     for (i=1; i<n; i++) {
2214:       v     =  aa      + 16*ai[i];
2215:       vi    =  aj      + ai[i];
2216:       nz    =  diag[i] - ai[i];
2217:       idx   +=  4;
2218:       s1 = (MatScalar)b[idx];
2219:       s2 = (MatScalar)b[1+idx];
2220:       s3 = (MatScalar)b[2+idx];
2221:       s4 = (MatScalar)b[3+idx];
2222:       while (nz--) {
2223:         jdx = 4*(*vi++);
2224:         x1  = t[jdx];
2225:         x2  = t[1+jdx];
2226:         x3  = t[2+jdx];
2227:         x4  = t[3+jdx];
2228:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2229:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2230:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2231:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2232:         v    += 16;
2233:       }
2234:       t[idx]   = s1;
2235:       t[1+idx] = s2;
2236:       t[2+idx] = s3;
2237:       t[3+idx] = s4;
2238:     }
2239:     /* backward solve the upper triangular */
2240:     idt = 4*(n-1);
2241:     for (i=n-1; i>=0; i--){
2242:       ai16 = 16*diag[i];
2243:       v    = aa + ai16 + 16;
2244:       vi   = aj + diag[i] + 1;
2245:       nz   = ai[i+1] - diag[i] - 1;
2246:       s1   = t[idt];
2247:       s2   = t[1+idt];
2248:       s3   = t[2+idt];
2249:       s4   = t[3+idt];
2250:       while (nz--) {
2251:         idx = 4*(*vi++);
2252:         x1  = (MatScalar)x[idx];
2253:         x2  = (MatScalar)x[1+idx];
2254:         x3  = (MatScalar)x[2+idx];
2255:         x4  = (MatScalar)x[3+idx];
2256:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2257:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2258:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2259:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2260:         v    += 16;
2261:       }
2262:       v        = aa + ai16;
2263:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2264:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2265:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2266:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2267:       idt -= 4;
2268:     }
2269:   }

2271:   VecRestoreArray(bb,&b);
2272:   VecRestoreArray(xx,&x);
2273:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2274:   return(0);
2275: }

2277: #if defined (PETSC_HAVE_SSE)

2279: #include PETSC_HAVE_SSE
2282: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2283: {
2284:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2285:   unsigned short *aj=(unsigned short *)a->j;
2287:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2288:   MatScalar      *aa=a->a;
2289:   PetscScalar    *x,*b;

2292:   SSE_SCOPE_BEGIN;
2293:   /* 
2294:      Note: This code currently uses demotion of double
2295:      to float when performing the mixed-mode computation.
2296:      This may not be numerically reasonable for all applications.
2297:   */
2298:   PREFETCH_NTA(aa+16*ai[1]);

2300:   VecGetArray(bb,&b);
2301:   VecGetArray(xx,&x);
2302:   {
2303:     /* x will first be computed in single precision then promoted inplace to double */
2304:     MatScalar      *v,*t=(MatScalar *)x;
2305:     int            nz,i,idt,ai16;
2306:     unsigned int   jdx,idx;
2307:     unsigned short *vi;
2308:     /* Forward solve the lower triangular factor. */

2310:     /* First block is the identity. */
2311:     idx  = 0;
2312:     CONVERT_DOUBLE4_FLOAT4(t,b);
2313:     v    =  aa + 16*((unsigned int)ai[1]);

2315:     for (i=1; i<n;) {
2316:       PREFETCH_NTA(&v[8]);
2317:       vi   =  aj      + ai[i];
2318:       nz   =  diag[i] - ai[i];
2319:       idx +=  4;

2321:       /* Demote RHS from double to float. */
2322:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2323:       LOAD_PS(&t[idx],XMM7);

2325:       while (nz--) {
2326:         PREFETCH_NTA(&v[16]);
2327:         jdx = 4*((unsigned int)(*vi++));
2328: 
2329:         /* 4x4 Matrix-Vector product with negative accumulation: */
2330:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2331:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2333:           /* First Column */
2334:           SSE_COPY_PS(XMM0,XMM6)
2335:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2336:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2337:           SSE_SUB_PS(XMM7,XMM0)

2339:           /* Second Column */
2340:           SSE_COPY_PS(XMM1,XMM6)
2341:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2342:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2343:           SSE_SUB_PS(XMM7,XMM1)

2345:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2346: 
2347:           /* Third Column */
2348:           SSE_COPY_PS(XMM2,XMM6)
2349:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2350:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2351:           SSE_SUB_PS(XMM7,XMM2)

2353:           /* Fourth Column */
2354:           SSE_COPY_PS(XMM3,XMM6)
2355:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2356:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2357:           SSE_SUB_PS(XMM7,XMM3)
2358:         SSE_INLINE_END_2
2359: 
2360:         v  += 16;
2361:       }
2362:       v    =  aa + 16*ai[++i];
2363:       PREFETCH_NTA(v);
2364:       STORE_PS(&t[idx],XMM7);
2365:     }

2367:     /* Backward solve the upper triangular factor.*/

2369:     idt  = 4*(n-1);
2370:     ai16 = 16*diag[n-1];
2371:     v    = aa + ai16 + 16;
2372:     for (i=n-1; i>=0;){
2373:       PREFETCH_NTA(&v[8]);
2374:       vi = aj + diag[i] + 1;
2375:       nz = ai[i+1] - diag[i] - 1;
2376: 
2377:       LOAD_PS(&t[idt],XMM7);

2379:       while (nz--) {
2380:         PREFETCH_NTA(&v[16]);
2381:         idx = 4*((unsigned int)(*vi++));

2383:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2384:         SSE_INLINE_BEGIN_2(&t[idx],v)
2385:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2387:           /* First Column */
2388:           SSE_COPY_PS(XMM0,XMM6)
2389:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2390:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2391:           SSE_SUB_PS(XMM7,XMM0)

2393:           /* Second Column */
2394:           SSE_COPY_PS(XMM1,XMM6)
2395:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2396:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2397:           SSE_SUB_PS(XMM7,XMM1)

2399:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2400: 
2401:           /* Third Column */
2402:           SSE_COPY_PS(XMM2,XMM6)
2403:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2404:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2405:           SSE_SUB_PS(XMM7,XMM2)

2407:           /* Fourth Column */
2408:           SSE_COPY_PS(XMM3,XMM6)
2409:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2410:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2411:           SSE_SUB_PS(XMM7,XMM3)
2412:         SSE_INLINE_END_2
2413:         v  += 16;
2414:       }
2415:       v    = aa + ai16;
2416:       ai16 = 16*diag[--i];
2417:       PREFETCH_NTA(aa+ai16+16);
2418:       /* 
2419:          Scale the result by the diagonal 4x4 block, 
2420:          which was inverted as part of the factorization
2421:       */
2422:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2423:         /* First Column */
2424:         SSE_COPY_PS(XMM0,XMM7)
2425:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2426:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2428:         /* Second Column */
2429:         SSE_COPY_PS(XMM1,XMM7)
2430:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2431:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2432:         SSE_ADD_PS(XMM0,XMM1)

2434:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2435: 
2436:         /* Third Column */
2437:         SSE_COPY_PS(XMM2,XMM7)
2438:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2439:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2440:         SSE_ADD_PS(XMM0,XMM2)

2442:         /* Fourth Column */
2443:         SSE_COPY_PS(XMM3,XMM7)
2444:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2445:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2446:         SSE_ADD_PS(XMM0,XMM3)

2448:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2449:       SSE_INLINE_END_3

2451:       v    = aa + ai16 + 16;
2452:       idt -= 4;
2453:     }

2455:     /* Convert t from single precision back to double precision (inplace)*/
2456:     idt = 4*(n-1);
2457:     for (i=n-1;i>=0;i--) {
2458:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2459:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2460:       PetscScalar *xtemp=&x[idt];
2461:       MatScalar   *ttemp=&t[idt];
2462:       xtemp[3] = (PetscScalar)ttemp[3];
2463:       xtemp[2] = (PetscScalar)ttemp[2];
2464:       xtemp[1] = (PetscScalar)ttemp[1];
2465:       xtemp[0] = (PetscScalar)ttemp[0];
2466:       idt -= 4;
2467:     }

2469:   } /* End of artificial scope. */
2470:   VecRestoreArray(bb,&b);
2471:   VecRestoreArray(xx,&x);
2472:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2473:   SSE_SCOPE_END;
2474:   return(0);
2475: }

2479: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2480: {
2481:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2482:   int            *aj=a->j;
2484:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2485:   MatScalar      *aa=a->a;
2486:   PetscScalar    *x,*b;

2489:   SSE_SCOPE_BEGIN;
2490:   /* 
2491:      Note: This code currently uses demotion of double
2492:      to float when performing the mixed-mode computation.
2493:      This may not be numerically reasonable for all applications.
2494:   */
2495:   PREFETCH_NTA(aa+16*ai[1]);

2497:   VecGetArray(bb,&b);
2498:   VecGetArray(xx,&x);
2499:   {
2500:     /* x will first be computed in single precision then promoted inplace to double */
2501:     MatScalar *v,*t=(MatScalar *)x;
2502:     int       nz,i,idt,ai16;
2503:     int       jdx,idx;
2504:     int       *vi;
2505:     /* Forward solve the lower triangular factor. */

2507:     /* First block is the identity. */
2508:     idx  = 0;
2509:     CONVERT_DOUBLE4_FLOAT4(t,b);
2510:     v    =  aa + 16*ai[1];

2512:     for (i=1; i<n;) {
2513:       PREFETCH_NTA(&v[8]);
2514:       vi   =  aj      + ai[i];
2515:       nz   =  diag[i] - ai[i];
2516:       idx +=  4;

2518:       /* Demote RHS from double to float. */
2519:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2520:       LOAD_PS(&t[idx],XMM7);

2522:       while (nz--) {
2523:         PREFETCH_NTA(&v[16]);
2524:         jdx = 4*(*vi++);
2525: /*          jdx = *vi++; */
2526: 
2527:         /* 4x4 Matrix-Vector product with negative accumulation: */
2528:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2529:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2531:           /* First Column */
2532:           SSE_COPY_PS(XMM0,XMM6)
2533:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2534:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2535:           SSE_SUB_PS(XMM7,XMM0)

2537:           /* Second Column */
2538:           SSE_COPY_PS(XMM1,XMM6)
2539:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2540:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2541:           SSE_SUB_PS(XMM7,XMM1)

2543:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2544: 
2545:           /* Third Column */
2546:           SSE_COPY_PS(XMM2,XMM6)
2547:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2548:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2549:           SSE_SUB_PS(XMM7,XMM2)

2551:           /* Fourth Column */
2552:           SSE_COPY_PS(XMM3,XMM6)
2553:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2554:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2555:           SSE_SUB_PS(XMM7,XMM3)
2556:         SSE_INLINE_END_2
2557: 
2558:         v  += 16;
2559:       }
2560:       v    =  aa + 16*ai[++i];
2561:       PREFETCH_NTA(v);
2562:       STORE_PS(&t[idx],XMM7);
2563:     }

2565:     /* Backward solve the upper triangular factor.*/

2567:     idt  = 4*(n-1);
2568:     ai16 = 16*diag[n-1];
2569:     v    = aa + ai16 + 16;
2570:     for (i=n-1; i>=0;){
2571:       PREFETCH_NTA(&v[8]);
2572:       vi = aj + diag[i] + 1;
2573:       nz = ai[i+1] - diag[i] - 1;
2574: 
2575:       LOAD_PS(&t[idt],XMM7);

2577:       while (nz--) {
2578:         PREFETCH_NTA(&v[16]);
2579:         idx = 4*(*vi++);
2580: /*          idx = *vi++; */

2582:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2583:         SSE_INLINE_BEGIN_2(&t[idx],v)
2584:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2586:           /* First Column */
2587:           SSE_COPY_PS(XMM0,XMM6)
2588:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2589:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2590:           SSE_SUB_PS(XMM7,XMM0)

2592:           /* Second Column */
2593:           SSE_COPY_PS(XMM1,XMM6)
2594:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2595:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2596:           SSE_SUB_PS(XMM7,XMM1)

2598:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2599: 
2600:           /* Third Column */
2601:           SSE_COPY_PS(XMM2,XMM6)
2602:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2603:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2604:           SSE_SUB_PS(XMM7,XMM2)

2606:           /* Fourth Column */
2607:           SSE_COPY_PS(XMM3,XMM6)
2608:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2609:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2610:           SSE_SUB_PS(XMM7,XMM3)
2611:         SSE_INLINE_END_2
2612:         v  += 16;
2613:       }
2614:       v    = aa + ai16;
2615:       ai16 = 16*diag[--i];
2616:       PREFETCH_NTA(aa+ai16+16);
2617:       /* 
2618:          Scale the result by the diagonal 4x4 block, 
2619:          which was inverted as part of the factorization
2620:       */
2621:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2622:         /* First Column */
2623:         SSE_COPY_PS(XMM0,XMM7)
2624:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2625:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2627:         /* Second Column */
2628:         SSE_COPY_PS(XMM1,XMM7)
2629:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2630:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2631:         SSE_ADD_PS(XMM0,XMM1)

2633:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2634: 
2635:         /* Third Column */
2636:         SSE_COPY_PS(XMM2,XMM7)
2637:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2638:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2639:         SSE_ADD_PS(XMM0,XMM2)

2641:         /* Fourth Column */
2642:         SSE_COPY_PS(XMM3,XMM7)
2643:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2644:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2645:         SSE_ADD_PS(XMM0,XMM3)

2647:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2648:       SSE_INLINE_END_3

2650:       v    = aa + ai16 + 16;
2651:       idt -= 4;
2652:     }

2654:     /* Convert t from single precision back to double precision (inplace)*/
2655:     idt = 4*(n-1);
2656:     for (i=n-1;i>=0;i--) {
2657:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2658:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2659:       PetscScalar *xtemp=&x[idt];
2660:       MatScalar   *ttemp=&t[idt];
2661:       xtemp[3] = (PetscScalar)ttemp[3];
2662:       xtemp[2] = (PetscScalar)ttemp[2];
2663:       xtemp[1] = (PetscScalar)ttemp[1];
2664:       xtemp[0] = (PetscScalar)ttemp[0];
2665:       idt -= 4;
2666:     }

2668:   } /* End of artificial scope. */
2669:   VecRestoreArray(bb,&b);
2670:   VecRestoreArray(xx,&x);
2671:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2672:   SSE_SCOPE_END;
2673:   return(0);
2674: }

2676: #endif
2679: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2680: {
2681:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2682:   IS             iscol=a->col,isrow=a->row;
2684:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2685:   PetscInt       *diag = a->diag;
2686:   MatScalar      *aa=a->a,*v;
2687:   PetscScalar    *x,*b,s1,s2,s3,x1,x2,x3,*t;

2690:   VecGetArray(bb,&b);
2691:   VecGetArray(xx,&x);
2692:   t  = a->solve_work;

2694:   ISGetIndices(isrow,&rout); r = rout;
2695:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2697:   /* forward solve the lower triangular */
2698:   idx    = 3*(*r++);
2699:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2700:   for (i=1; i<n; i++) {
2701:     v     = aa + 9*ai[i];
2702:     vi    = aj + ai[i];
2703:     nz    = diag[i] - ai[i];
2704:     idx   = 3*(*r++);
2705:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2706:     while (nz--) {
2707:       idx   = 3*(*vi++);
2708:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2709:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2710:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2711:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2712:       v += 9;
2713:     }
2714:     idx = 3*i;
2715:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2716:   }
2717:   /* backward solve the upper triangular */
2718:   for (i=n-1; i>=0; i--){
2719:     v    = aa + 9*diag[i] + 9;
2720:     vi   = aj + diag[i] + 1;
2721:     nz   = ai[i+1] - diag[i] - 1;
2722:     idt  = 3*i;
2723:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2724:     while (nz--) {
2725:       idx   = 3*(*vi++);
2726:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2727:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2728:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2729:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2730:       v += 9;
2731:     }
2732:     idc = 3*(*c--);
2733:     v   = aa + 9*diag[i];
2734:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2735:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2736:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2737:   }
2738:   ISRestoreIndices(isrow,&rout);
2739:   ISRestoreIndices(iscol,&cout);
2740:   VecRestoreArray(bb,&b);
2741:   VecRestoreArray(xx,&x);
2742:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
2743:   return(0);
2744: }

2746: /*
2747:       Special case where the matrix was ILU(0) factored in the natural
2748:    ordering. This eliminates the need for the column and row permutation.
2749: */
2752: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2753: {
2754:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2755:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2757:   PetscInt       *diag = a->diag;
2758:   MatScalar      *aa=a->a,*v;
2759:   PetscScalar    *x,*b,s1,s2,s3,x1,x2,x3;
2760:   PetscInt       jdx,idt,idx,nz,*vi,i;

2763:   VecGetArray(bb,&b);
2764:   VecGetArray(xx,&x);


2767:   /* forward solve the lower triangular */
2768:   idx    = 0;
2769:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2770:   for (i=1; i<n; i++) {
2771:     v     =  aa      + 9*ai[i];
2772:     vi    =  aj      + ai[i];
2773:     nz    =  diag[i] - ai[i];
2774:     idx   +=  3;
2775:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2776:     while (nz--) {
2777:       jdx   = 3*(*vi++);
2778:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2779:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2780:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2781:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2782:       v    += 9;
2783:     }
2784:     x[idx]   = s1;
2785:     x[1+idx] = s2;
2786:     x[2+idx] = s3;
2787:   }
2788:   /* backward solve the upper triangular */
2789:   for (i=n-1; i>=0; i--){
2790:     v    = aa + 9*diag[i] + 9;
2791:     vi   = aj + diag[i] + 1;
2792:     nz   = ai[i+1] - diag[i] - 1;
2793:     idt  = 3*i;
2794:     s1 = x[idt];  s2 = x[1+idt];
2795:     s3 = x[2+idt];
2796:     while (nz--) {
2797:       idx   = 3*(*vi++);
2798:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2799:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2800:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2801:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2802:       v    += 9;
2803:     }
2804:     v        = aa +  9*diag[i];
2805:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2806:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2807:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2808:   }

2810:   VecRestoreArray(bb,&b);
2811:   VecRestoreArray(xx,&x);
2812:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
2813:   return(0);
2814: }

2818: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2819: {
2820:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2821:   IS             iscol=a->col,isrow=a->row;
2823:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2824:   PetscInt       *diag = a->diag;
2825:   MatScalar      *aa=a->a,*v;
2826:   PetscScalar    *x,*b,s1,s2,x1,x2,*t;

2829:   VecGetArray(bb,&b);
2830:   VecGetArray(xx,&x);
2831:   t  = a->solve_work;

2833:   ISGetIndices(isrow,&rout); r = rout;
2834:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2836:   /* forward solve the lower triangular */
2837:   idx    = 2*(*r++);
2838:   t[0] = b[idx]; t[1] = b[1+idx];
2839:   for (i=1; i<n; i++) {
2840:     v     = aa + 4*ai[i];
2841:     vi    = aj + ai[i];
2842:     nz    = diag[i] - ai[i];
2843:     idx   = 2*(*r++);
2844:     s1  = b[idx]; s2 = b[1+idx];
2845:     while (nz--) {
2846:       idx   = 2*(*vi++);
2847:       x1    = t[idx]; x2 = t[1+idx];
2848:       s1 -= v[0]*x1 + v[2]*x2;
2849:       s2 -= v[1]*x1 + v[3]*x2;
2850:       v += 4;
2851:     }
2852:     idx = 2*i;
2853:     t[idx] = s1; t[1+idx] = s2;
2854:   }
2855:   /* backward solve the upper triangular */
2856:   for (i=n-1; i>=0; i--){
2857:     v    = aa + 4*diag[i] + 4;
2858:     vi   = aj + diag[i] + 1;
2859:     nz   = ai[i+1] - diag[i] - 1;
2860:     idt  = 2*i;
2861:     s1 = t[idt]; s2 = t[1+idt];
2862:     while (nz--) {
2863:       idx   = 2*(*vi++);
2864:       x1    = t[idx]; x2 = t[1+idx];
2865:       s1 -= v[0]*x1 + v[2]*x2;
2866:       s2 -= v[1]*x1 + v[3]*x2;
2867:       v += 4;
2868:     }
2869:     idc = 2*(*c--);
2870:     v   = aa + 4*diag[i];
2871:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2872:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2873:   }
2874:   ISRestoreIndices(isrow,&rout);
2875:   ISRestoreIndices(iscol,&cout);
2876:   VecRestoreArray(bb,&b);
2877:   VecRestoreArray(xx,&x);
2878:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
2879:   return(0);
2880: }

2882: /*
2883:       Special case where the matrix was ILU(0) factored in the natural
2884:    ordering. This eliminates the need for the column and row permutation.
2885: */
2888: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2889: {
2890:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2891:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2893:   PetscInt       *diag = a->diag;
2894:   MatScalar      *aa=a->a,*v;
2895:   PetscScalar    *x,*b,s1,s2,x1,x2;
2896:   PetscInt       jdx,idt,idx,nz,*vi,i;

2899:   VecGetArray(bb,&b);
2900:   VecGetArray(xx,&x);

2902:   /* forward solve the lower triangular */
2903:   idx    = 0;
2904:   x[0]   = b[0]; x[1] = b[1];
2905:   for (i=1; i<n; i++) {
2906:     v     =  aa      + 4*ai[i];
2907:     vi    =  aj      + ai[i];
2908:     nz    =  diag[i] - ai[i];
2909:     idx   +=  2;
2910:     s1  =  b[idx];s2 = b[1+idx];
2911:     while (nz--) {
2912:       jdx   = 2*(*vi++);
2913:       x1    = x[jdx];x2 = x[1+jdx];
2914:       s1 -= v[0]*x1 + v[2]*x2;
2915:       s2 -= v[1]*x1 + v[3]*x2;
2916:       v    += 4;
2917:     }
2918:     x[idx]   = s1;
2919:     x[1+idx] = s2;
2920:   }
2921:   /* backward solve the upper triangular */
2922:   for (i=n-1; i>=0; i--){
2923:     v    = aa + 4*diag[i] + 4;
2924:     vi   = aj + diag[i] + 1;
2925:     nz   = ai[i+1] - diag[i] - 1;
2926:     idt  = 2*i;
2927:     s1 = x[idt];  s2 = x[1+idt];
2928:     while (nz--) {
2929:       idx   = 2*(*vi++);
2930:       x1    = x[idx];   x2 = x[1+idx];
2931:       s1 -= v[0]*x1 + v[2]*x2;
2932:       s2 -= v[1]*x1 + v[3]*x2;
2933:       v    += 4;
2934:     }
2935:     v        = aa +  4*diag[i];
2936:     x[idt]   = v[0]*s1 + v[2]*s2;
2937:     x[1+idt] = v[1]*s1 + v[3]*s2;
2938:   }

2940:   VecRestoreArray(bb,&b);
2941:   VecRestoreArray(xx,&x);
2942:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
2943:   return(0);
2944: }

2948: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2949: {
2950:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2951:   IS             iscol=a->col,isrow=a->row;
2953:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2954:   PetscInt       *diag = a->diag;
2955:   MatScalar      *aa=a->a,*v;
2956:   PetscScalar    *x,*b,s1,*t;

2959:   if (!n) return(0);

2961:   VecGetArray(bb,&b);
2962:   VecGetArray(xx,&x);
2963:   t  = a->solve_work;

2965:   ISGetIndices(isrow,&rout); r = rout;
2966:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2968:   /* forward solve the lower triangular */
2969:   t[0] = b[*r++];
2970:   for (i=1; i<n; i++) {
2971:     v     = aa + ai[i];
2972:     vi    = aj + ai[i];
2973:     nz    = diag[i] - ai[i];
2974:     s1  = b[*r++];
2975:     while (nz--) {
2976:       s1 -= (*v++)*t[*vi++];
2977:     }
2978:     t[i] = s1;
2979:   }
2980:   /* backward solve the upper triangular */
2981:   for (i=n-1; i>=0; i--){
2982:     v    = aa + diag[i] + 1;
2983:     vi   = aj + diag[i] + 1;
2984:     nz   = ai[i+1] - diag[i] - 1;
2985:     s1 = t[i];
2986:     while (nz--) {
2987:       s1 -= (*v++)*t[*vi++];
2988:     }
2989:     x[*c--] = t[i] = aa[diag[i]]*s1;
2990:   }

2992:   ISRestoreIndices(isrow,&rout);
2993:   ISRestoreIndices(iscol,&cout);
2994:   VecRestoreArray(bb,&b);
2995:   VecRestoreArray(xx,&x);
2996:   PetscLogFlops(2*1*(a->nz) - A->cmap.n);
2997:   return(0);
2998: }
2999: /*
3000:       Special case where the matrix was ILU(0) factored in the natural
3001:    ordering. This eliminates the need for the column and row permutation.
3002: */
3005: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3006: {
3007:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3008:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3010:   PetscInt       *diag = a->diag;
3011:   MatScalar      *aa=a->a;
3012:   PetscScalar    *x,*b;
3013:   PetscScalar    s1,x1;
3014:   MatScalar      *v;
3015:   PetscInt       jdx,idt,idx,nz,*vi,i;

3018:   VecGetArray(bb,&b);
3019:   VecGetArray(xx,&x);

3021:   /* forward solve the lower triangular */
3022:   idx    = 0;
3023:   x[0]   = b[0];
3024:   for (i=1; i<n; i++) {
3025:     v     =  aa      + ai[i];
3026:     vi    =  aj      + ai[i];
3027:     nz    =  diag[i] - ai[i];
3028:     idx   +=  1;
3029:     s1  =  b[idx];
3030:     while (nz--) {
3031:       jdx   = *vi++;
3032:       x1    = x[jdx];
3033:       s1 -= v[0]*x1;
3034:       v    += 1;
3035:     }
3036:     x[idx]   = s1;
3037:   }
3038:   /* backward solve the upper triangular */
3039:   for (i=n-1; i>=0; i--){
3040:     v    = aa + diag[i] + 1;
3041:     vi   = aj + diag[i] + 1;
3042:     nz   = ai[i+1] - diag[i] - 1;
3043:     idt  = i;
3044:     s1 = x[idt];
3045:     while (nz--) {
3046:       idx   = *vi++;
3047:       x1    = x[idx];
3048:       s1 -= v[0]*x1;
3049:       v    += 1;
3050:     }
3051:     v        = aa +  diag[i];
3052:     x[idt]   = v[0]*s1;
3053:   }
3054:   VecRestoreArray(bb,&b);
3055:   VecRestoreArray(xx,&x);
3056:   PetscLogFlops(2*(a->nz) - A->cmap.n);
3057:   return(0);
3058: }

3060: /* ----------------------------------------------------------------*/
3061: /*
3062:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3063:    except that the data structure of Mat_SeqAIJ is slightly different.
3064:    Not a good example of code reuse.
3065: */
3066: EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat);

3070: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3071: {
3072:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3073:   IS             isicol;
3075:   PetscInt       *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3076:   PetscInt       *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3077:   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3078:   PetscInt       incrlev,nnz,i,bs = A->rmap.bs,bs2 = a->bs2,levels,diagonal_fill;
3079:   PetscTruth     col_identity,row_identity;
3080:   PetscReal      f;

3083:   f             = info->fill;
3084:   levels        = (PetscInt)info->levels;
3085:   diagonal_fill = (PetscInt)info->diagonal_fill;
3086:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3087:   ISIdentity(isrow,&row_identity);
3088:   ISIdentity(iscol,&col_identity);

3090:   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3091:     MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
3092:     (*fact)->factor = FACTOR_LU;
3093:     b               = (Mat_SeqBAIJ*)(*fact)->data;
3094:     MatMissingDiagonal_SeqBAIJ(*fact);
3095:     b->row        = isrow;
3096:     b->col        = iscol;
3097:     PetscObjectReference((PetscObject)isrow);
3098:     PetscObjectReference((PetscObject)iscol);
3099:     b->icol       = isicol;
3100:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3101:     PetscMalloc(((*fact)->rmap.N+1+(*fact)->rmap.bs)*sizeof(PetscScalar),&b->solve_work);
3102:   } else { /* general case perform the symbolic factorization */
3103:     ISGetIndices(isrow,&r);
3104:     ISGetIndices(isicol,&ic);

3106:     /* get new row pointers */
3107:     PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
3108:     ainew[0] = 0;
3109:     /* don't know how many column pointers are needed so estimate */
3110:     jmax = (PetscInt)(f*ai[n] + 1);
3111:     PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
3112:     /* ajfill is level of fill for each fill entry */
3113:     PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
3114:     /* fill is a linked list of nonzeros in active row */
3115:     PetscMalloc((n+1)*sizeof(PetscInt),&fill);
3116:     /* im is level for each filled value */
3117:     PetscMalloc((n+1)*sizeof(PetscInt),&im);
3118:     /* dloc is location of diagonal in factor */
3119:     PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
3120:     dloc[0]  = 0;
3121:     for (prow=0; prow<n; prow++) {

3123:       /* copy prow into linked list */
3124:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3125:       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3126:       xi         = aj + ai[r[prow]];
3127:       fill[n]    = n;
3128:       fill[prow] = -1; /* marker for diagonal entry */
3129:       while (nz--) {
3130:         fm  = n;
3131:         idx = ic[*xi++];
3132:         do {
3133:           m  = fm;
3134:           fm = fill[m];
3135:         } while (fm < idx);
3136:         fill[m]   = idx;
3137:         fill[idx] = fm;
3138:         im[idx]   = 0;
3139:       }

3141:       /* make sure diagonal entry is included */
3142:       if (diagonal_fill && fill[prow] == -1) {
3143:         fm = n;
3144:         while (fill[fm] < prow) fm = fill[fm];
3145:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
3146:         fill[fm]   = prow;
3147:         im[prow]   = 0;
3148:         nzf++;
3149:         dcount++;
3150:       }

3152:       nzi = 0;
3153:       row = fill[n];
3154:       while (row < prow) {
3155:         incrlev = im[row] + 1;
3156:         nz      = dloc[row];
3157:         xi      = ajnew  + ainew[row] + nz + 1;
3158:         flev    = ajfill + ainew[row] + nz + 1;
3159:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
3160:         fm      = row;
3161:         while (nnz-- > 0) {
3162:           idx = *xi++;
3163:           if (*flev + incrlev > levels) {
3164:             flev++;
3165:             continue;
3166:           }
3167:           do {
3168:             m  = fm;
3169:             fm = fill[m];
3170:           } while (fm < idx);
3171:           if (fm != idx) {
3172:             im[idx]   = *flev + incrlev;
3173:             fill[m]   = idx;
3174:             fill[idx] = fm;
3175:             fm        = idx;
3176:             nzf++;
3177:           } else {
3178:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3179:           }
3180:           flev++;
3181:         }
3182:         row = fill[row];
3183:         nzi++;
3184:       }
3185:       /* copy new filled row into permanent storage */
3186:       ainew[prow+1] = ainew[prow] + nzf;
3187:       if (ainew[prow+1] > jmax) {

3189:         /* estimate how much additional space we will need */
3190:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3191:         /* just double the memory each time */
3192:         PetscInt maxadd = jmax;
3193:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3194:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3195:         jmax += maxadd;

3197:         /* allocate a longer ajnew and ajfill */
3198:         PetscMalloc(jmax*sizeof(PetscInt),&xi);
3199:         PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));
3200:         PetscFree(ajnew);
3201:         ajnew = xi;
3202:         PetscMalloc(jmax*sizeof(PetscInt),&xi);
3203:         PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));
3204:         PetscFree(ajfill);
3205:         ajfill = xi;
3206:         reallocate++; /* count how many reallocations are needed */
3207:       }
3208:       xi          = ajnew + ainew[prow];
3209:       flev        = ajfill + ainew[prow];
3210:       dloc[prow]  = nzi;
3211:       fm          = fill[n];
3212:       while (nzf--) {
3213:         *xi++   = fm;
3214:         *flev++ = im[fm];
3215:         fm      = fill[fm];
3216:       }
3217:       /* make sure row has diagonal entry */
3218:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3219:         SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3220:     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3221:       }
3222:     }
3223:     PetscFree(ajfill);
3224:     ISRestoreIndices(isrow,&r);
3225:     ISRestoreIndices(isicol,&ic);
3226:     PetscFree(fill);
3227:     PetscFree(im);

3229: #if defined(PETSC_USE_INFO)
3230:     {
3231:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3232:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
3233:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
3234:       PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
3235:       PetscInfo(A,"for best performance.\n");
3236:       if (diagonal_fill) {
3237:         PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
3238:       }
3239:     }
3240: #endif

3242:     /* put together the new matrix */
3243:     MatCreate(A->comm,fact);
3244:     MatSetSizes(*fact,bs*n,bs*n,bs*n,bs*n);
3245:     MatSetType(*fact,A->type_name);
3246:     MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
3247:     PetscLogObjectParent(*fact,isicol);
3248:     b    = (Mat_SeqBAIJ*)(*fact)->data;
3249:     b->free_a       = PETSC_TRUE;
3250:     b->free_ij      = PETSC_TRUE;
3251:     b->singlemalloc = PETSC_FALSE;
3252:     PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
3253:     b->j          = ajnew;
3254:     b->i          = ainew;
3255:     for (i=0; i<n; i++) dloc[i] += ainew[i];
3256:     b->diag       = dloc;
3257:     b->ilen       = 0;
3258:     b->imax       = 0;
3259:     b->row        = isrow;
3260:     b->col        = iscol;
3261:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3262:     PetscObjectReference((PetscObject)isrow);
3263:     PetscObjectReference((PetscObject)iscol);
3264:     b->icol       = isicol;
3265:     PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3266:     /* In b structure:  Free imax, ilen, old a, old j.  
3267:        Allocate dloc, solve_work, new a, new j */
3268:     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
3269:     b->maxnz          = b->nz = ainew[n];
3270:     (*fact)->factor   = FACTOR_LU;

3272:     (*fact)->info.factor_mallocs    = reallocate;
3273:     (*fact)->info.fill_ratio_given  = f;
3274:     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3275:   }

3277:   if (row_identity && col_identity) {
3278:     MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3279:   }
3280:   return(0);
3281: }

3285: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3286: {
3287:   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3288:   /* int i,*AJ=a->j,nz=a->nz; */
3290:   /* Undo Column scaling */
3291: /*    while (nz--) { */
3292: /*      AJ[i] = AJ[i]/4; */
3293: /*    } */
3294:   /* This should really invoke a push/pop logic, but we don't have that yet. */
3295:   A->ops->setunfactored = PETSC_NULL;
3296:   return(0);
3297: }

3301: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3302: {
3303:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3304:   PetscInt       *AJ=a->j,nz=a->nz;
3305:   unsigned short *aj=(unsigned short *)AJ;
3307:   /* Is this really necessary? */
3308:   while (nz--) {
3309:     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3310:   }
3311:   A->ops->setunfactored = PETSC_NULL;
3312:   return(0);
3313: }

3317: PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3318: {
3320:   /*
3321:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3322:       with natural ordering
3323:   */
3325:   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3326:   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3327:   switch (inA->rmap.bs) {
3328:   case 1:
3329:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3330:     PetscInfo(inA,"Using special in-place natural ordering factor BS=1\n");
3331:     break;
3332:   case 2:
3333:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3334:     PetscInfo(inA,"Using special in-place natural ordering factor BS=2\n");
3335:     break;
3336:   case 3:
3337:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3338:     PetscInfo(inA,"Using special in-place natural ordering factor BS=3\n");
3339:     break;
3340:   case 4:
3341: #if defined(PETSC_USE_MAT_SINGLE)
3342:     {
3343:       PetscTruth  sse_enabled_local;
3345:       PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);
3346:       if (sse_enabled_local) {
3347: #  if defined(PETSC_HAVE_SSE)
3348:         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3349:         if (n==(unsigned short)n) {
3350:           unsigned short *aj=(unsigned short *)AJ;
3351:           for (i=0;i<nz;i++) {
3352:             aj[i] = (unsigned short)AJ[i];
3353:           }
3354:           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3355:           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3356:           PetscInfo(inA,"Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3357:         } else {
3358:         /* Scale the column indices for easier indexing in MatSolve. */
3359: /*            for (i=0;i<nz;i++) { */
3360: /*              AJ[i] = AJ[i]*4; */
3361: /*            } */
3362:           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3363:           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3364:           PetscInfo(inA,"Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3365:         }
3366: #  else
3367:       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3368:         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3369: #  endif
3370:       } else {
3371:         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3372:         PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");
3373:       }
3374:     }
3375: #else
3376:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3377:     PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");
3378: #endif
3379:     break;
3380:   case 5:
3381:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3382:     PetscInfo(inA,"Using special in-place natural ordering factor BS=5\n");
3383:     break;
3384:   case 6:
3385:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3386:     PetscInfo(inA,"Using special in-place natural ordering factor BS=6\n");
3387:     break;
3388:   case 7:
3389:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3390:     PetscInfo(inA,"Using special in-place natural ordering factor BS=7\n");
3391:     break;
3392:   }
3393:   return(0);
3394: }

3398: PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3399: {
3400:   /*
3401:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3402:       with natural ordering
3403:   */
3404:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
3405:   IS             row = a->row, col = a->col;
3406:   PetscTruth     row_identity, col_identity;
3407:   PetscTruth     use_natural;


3412:   use_natural = PETSC_FALSE;
3413:   if (row && col) {
3414:     ISIdentity(row,&row_identity);
3415:     ISIdentity(col,&col_identity);

3417:     if (row_identity && col_identity) {
3418:       use_natural = PETSC_TRUE;
3419:     }
3420:   } else {
3421:     use_natural = PETSC_TRUE;
3422:   }

3424:   switch (A->rmap.bs) {
3425:   case 1:
3426:     if (use_natural) {
3427:       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3428:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3429:       PetscInfo(A,"Using special in-place natural ordering solve BS=1\n");
3430:       PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3431:     } else {
3432:       A->ops->solve           = MatSolve_SeqBAIJ_1;
3433:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3434:     }
3435:     break;
3436:   case 2:
3437:     if (use_natural) {
3438:       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3439:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3440:       PetscInfo(A,"Using special in-place natural ordering solve BS=2\n");
3441:       PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3442:     } else {
3443:       A->ops->solve           = MatSolve_SeqBAIJ_2;
3444:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3445:     }
3446:     break;
3447:   case 3:
3448:     if (use_natural) {
3449:       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3450:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3451:       PetscInfo(A,"Using special in-place natural ordering solve BS=3\n");
3452:       PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3453:     } else {
3454:       A->ops->solve           = MatSolve_SeqBAIJ_3;
3455:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3456:     }
3457:     break;
3458:   case 4:
3459:     {
3460:       PetscTruth sse_enabled_local;
3461:       PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);
3462:       if (use_natural) {
3463: #if defined(PETSC_USE_MAT_SINGLE)
3464:         if (sse_enabled_local) { /* Natural + Single + SSE */
3465: #  if defined(PETSC_HAVE_SSE)
3466:           PetscInt n=a->mbs;
3467:           if (n==(unsigned short)n) {
3468:             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3469:             PetscInfo(A,"Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3470:           } else {
3471:             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3472:             PetscInfo(A,"Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3473:           }
3474: #  else
3475:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3476:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3477: #  endif
3478:         } else { /* Natural + Single */
3479:           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3480:           PetscInfo(A,"Using single precision, in-place, natural ordering solve BS=4\n");
3481:         }
3482: #else
3483:         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3484:         PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");
3485: #endif
3486:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3487:         PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");
3488:       } else { /* Arbitrary ordering */
3489: #if defined(PETSC_USE_MAT_SINGLE)
3490:         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3491: #  if defined(PETSC_HAVE_SSE)
3492:           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3493:           PetscInfo(A,"Using single precision, SSE solve BS=4\n");
3494: #  else
3495:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3496:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3497: #  endif
3498:         } else { /* Arbitrary + Single */
3499:           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3500:           PetscInfo(A,"Using single precision solve BS=4\n");
3501:         }
3502: #else
3503:         A->ops->solve           = MatSolve_SeqBAIJ_4;
3504: #endif
3505:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3506:       }
3507:     }
3508:     break;
3509:   case 5:
3510:     if (use_natural) {
3511:       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3512:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3513:       PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");
3514:       PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");
3515:     } else {
3516:       A->ops->solve           = MatSolve_SeqBAIJ_5;
3517:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3518:     }
3519:     break;
3520:   case 6:
3521:     if (use_natural) {
3522:       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3523:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3524:       PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");
3525:       PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");
3526:     } else {
3527:       A->ops->solve           = MatSolve_SeqBAIJ_6;
3528:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3529:     }
3530:     break;
3531:   case 7:
3532:     if (use_natural) {
3533:       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3534:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3535:       PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");
3536:       PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");
3537:     } else {
3538:       A->ops->solve           = MatSolve_SeqBAIJ_7;
3539:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3540:     }
3541:     break;
3542:   default:
3543:     A->ops->solve             = MatSolve_SeqBAIJ_N;
3544:     break;
3545:   }
3546:   return(0);
3547: }

3551: PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3552: {

3556:   MatSeqBAIJ_UpdateSolvers(A);
3557:   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3558:     (*A->ops->solve)(A,x,y);
3559:   } else {
3560:     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3561:   }
3562:   return(0);
3563: }

3567: PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3568: {

3572:   MatSeqBAIJ_UpdateSolvers(A);
3573:   (*A->ops->solvetranspose)(A,x,y);
3574:   return(0);
3575: }