| Matrix-vector multipler
|
SUBROUTINE matvec8a(n,m,a,x,y)
INTEGER i,j,n,m
DOUBLE PRECISION x, y, a
DIMENSION x(8,m), y(8,n), a(m,n)
DO i=1, n
DO j=1, m
y(1,i)=y(1,i)+a(j,i)*x(1,j)
y(2,i)=y(2,i)+a(j,i)*x(2,j)
y(3,i)=y(3,i)+a(j,i)*x(3,j)
y(4,i)=y(4,i)+a(j,i)*x(4,j)
y(5,i)=y(5,i)+a(j,i)*x(5,j)
y(6,i)=y(6,i)+a(j,i)*x(6,j)
y(7,i)=y(7,i)+a(j,i)*x(7,j)
y(8,i)=y(8,i)+a(j,i)*x(8,j)
END DO
END DO
RETURN
END
|
| main C++ program
|
#include <iomanip.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
extern "C" {
int mp_my_threadnum_();
int mp_numthreads_();
#define MATVEC matvec8a_
void MATVEC(int*,int*,double*,double*,double*);
}
main() {
clock_t time1, time2;
double tt1=0.0,tt2=0.0;
unsigned sec = 1;
const int DIM=1440;
const int ORD = 8;
int dim=DIM;
int dim1;
int thds = mp_numthreads_();
const int LOOP=int(100.e9/(DIM*DIM)/ORD);
cout &/lt;< "\n*******************************************\n";
cout &/lt;< "pragM: DIM = " << DIM << ", LOOP = " << LOOP << ", "
&/lt;< "Threads = " << thds << "\n"; cout.flush();
int i,j,k,pnum;
int pflags[64], flags_tot;
double *out, tot_out=0.0;
out=new double[64];
for(i=0;i<64;i++) out[i] = 0;
for(i=0;i<64;i++) pflags[i] = 0;
double B[DIM], a[DIM][ORD], c[DIM][ORD], A[DIM][DIM];
#pragma parallel shared(tt1,tt2,a,c,A,dim,dim1,pflags) \
local(time1,time2,flags_tot,i,j,k,pnum)
{
time1=clock();
#pragma pfor
for(i=0;i<DIM;i++) {
for(j=0;j<ORD;j++) {
a[i][j]=1.0+1.2*(i+j)*1e-14;
}
}
#pragma pfor
for(i=0;i<DIM;i++) {
for(j=0;j<ORD;j++) {
c[i][j]=a[i][j];
}
}
#pragma pfor
for(i=0;i<DIM;i++) {
for(j=0;j<DIM;j++) {
A[i][j]=0.51*(i+j+1)*1e-14;
}
}
int pnum=mp_my_threadnum_();
int dim1=dim/thds;
#pragma synchronize
for(k=0;k<LOOP;k++) {
MATVEC(&dim1,&dim,&A[pnum*dim1][0],&a[0][0],&c[pnum*dim1][0]);
#pragma synchronize //removed in unsync run
#pragma pfor //removed in unsync run
for(i=0;i<DIM;i++) { //removed in unsync run
for(j=0;j<ORD;j++) { //removed in unsync run
a[i][j]=c[i][j]; //removed in unsync run
} //removed in unsync run
} //removed in unsync run
}
#pragma pfor
for(i=0;i<DIM;i++) {
for(j=0;j<ORD;j++) {
out[pnum]+=a[i][j];
}
}
time2=clock();
#pragma critical
tt1+=double(time2-time1)/double(CLOCKS_PER_SEC);
} //end parallel
for(i=0;i<64;i++) tot_out += out[i];
cout << "tot_out = " << setiosflags(ios::fixed | ios::showpoint) <<
setprecision
(8) << tot_out << "\n";
cout << "Total time = " << setprecision(2) << tt1 << " seconds" << endl;
}
|