#define max(i,j) ((i)>(j)?(i):(j)) #define min(i,j) ((i)<(j)?(i):(j)) #ifdef __STDC__ #include /* include this if available */ #include #define FABSMASK INT_MAX #else #define FABSMASK 0x7fffffff #endif #ifndef fabsf #ifndef fabs double fabs(); #endif #define fabsf(x) fabs((double)x) #endif float lufact(a,n,indx) float **a; /* a[1..n][1..n] as in Press, Flannery et al */ int n,indx[]; /* return indx[1..n],factored a[][] */ { static float *p,*p2,*p3,accest; float *scale=(float *)&indx[1-sizeof(float)/sizeof(int)]; /* put first scale[] then indx[] in caller's array */ static union intflt{float f;int i;} xmax,xx; /* smart compilers will use register variables */ static int i,j,k,ipiv,newmax; static double d,atemp,atmp2; for(i=1;i<=n;++i){ /* determine scale of each row */ xmax.f=fabsf(a[i][1]); for(j=2;j<=n;++j){ #ifdef ICMP /* if comparison is to be done by int arithmetic, ** noting that only + numbers are possible */ xx.f=a[i][j]; /* consistent style so that this loop will not dominate ** time of the other search loop; ** pointer casting has been tried here, ** but does not agree with compiler optimizers */ xmax.i=max(xx.i&FABSMASK,xmax.i); #else xmax.f=max(fabsf(a[i][j]),xmax.f); #endif } /* except for this assignment, ** compilers could choose concurrent outer code ** (assign each loop iteration to independent process) */ scale[i]=1/xmax.f; } accest=1; /* initialize to "no loss of accuracy" */ for(j=2;j<=n;++j){ /* main factorization loop */ xmax.f=0.; /* search down column for max scaled pivot */ for(i=j-1;i<=n;++i){ #ifdef ICMP xx.f=a[i][j-1]*scale[i]; /* give opportunity for pipelining by using ? ** instead of if(){} as preferred by certain compilers ** in 1988 */ xmax.i=(newmax=(xx.i&=FABSMASK)>xmax.i)?xx.i:xmax.i; #else xmax.f=(newmax=(xx.f=fabsf(a[i][j-1]*scale[i]))>xmax.f) ?xx.f:xmax.f; #endif ipiv=newmax?i:ipiv; } p=a[ipiv]; d=p[j-1]; /* this is the pivot element */ a[ipiv]=a[j-1]; /* swap row pointers */ a[j-1]=p; accest=min(accest,xmax.f); /* track cancellation error */ scale[ipiv]=scale[j-1]; indx[j-1] = ipiv; /* record changes for caller */ /* column j and row j-1 calculation combined, unrolled into pairs */ for(i=j;i<=n;i += 2){ double sum11=0.,sum12=0.,sum21=0.,sum22=0.; p2=a[i]; p3=a[i+1]; /* may point beyond a[n] */ for(k=1;k=0) /* rounded normal precision best */ atemp=p2[j-1]/p[j-1]; atmp2=p3[j-1]/p[j-1]; #else /* force into double and encourage compiler to preinvert */ atemp=p2[j-1]/d; atmp2=p3[j-1]/d; #endif /* reassociation of additions is OK for mixed precision, not for full double */ p[i] -= sum21; /* completing upper factor */ /* these results will be discarded in the case i==n */ sum22 = p[i+1]-sum22; /* note i>=j ; observe sequence */ sum12 = p3[j]-sum12-atmp2*p[j]; /* storage assignments last to allow more parallel computation */ p2[j] -= sum11+atemp*p[j]; p2[j-1]=atemp; if(i==n)break; p[i+1]=sum22; p3[j-1]=atmp2; p3[j]=sum12; } } accest = min(fabsf(a[n][n]*scale[n]),accest); indx[n]=n; return accest; }