Performance Comparison, 6 February 1998

Parallel Scaling on the Cray T3E and SGI Origin2000.

Note:
(a) Data are shown on a Mflops-per-processor basis.
(b) The Origin2000 scheduler fails completely once the
	number of processors reaches 32.
(c) The data incorporate the most recent SGI patch.
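
How the plotted quantity relates to the run parameters, as a minimal
C++ sketch (the function name, the 60-second timing, and the
multiply+add = 2 flops convention are illustrative assumptions, not
taken from this page):

#include <iostream>

// Mflops per processor for one run: total flops / (time * nproc).
// Each y(k,i) += a(j,i)*x(k,j) update is counted as 2 flops.
double mflops_per_proc(double dim, double ord, double loop,
                       double seconds, int nproc)
{
  double flops = 2.0 * ord * dim * dim * loop;
  return flops / (seconds * nproc * 1.0e6);
}

int main()
{
  const double DIM = 1440, ORD = 8;
  const double LOOP = int(100.e9/(DIM*DIM)/ORD);  // as in the main code
  // e.g. an 8-processor run that took 60 s (made-up numbers):
  std::cout << mflops_per_proc(DIM, ORD, LOOP, 60.0, 8)
            << " Mflops per processor\n";
  return 0;
}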

Below the figure are:
(1) the subroutine that dominates the application;
(2) the main code that calls the subroutine;
(3) the Origin batch script that runs the main code.
[Figure: Mflops per processor versus number of processors on the T3E and Origin2000; image not reproduced here.]
Matrix-vector multiplier
      SUBROUTINE matvec8a(n,m,a,x,y)
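C     Computes y(k,i) = y(k,i) + a(j,i)*x(k,j) for k = 1..8:
C     eight matrix-vector products at once, with the k loop
C     unrolled by hand so each a(j,i) is reused eight times.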
      INTEGER i,j,n,m
      DOUBLE PRECISION x, y, a
      DIMENSION x(8,m), y(8,n),  a(m,n)
      DO i=1, n
        DO j=1, m
          y(1,i)=y(1,i)+a(j,i)*x(1,j)
          y(2,i)=y(2,i)+a(j,i)*x(2,j)
          y(3,i)=y(3,i)+a(j,i)*x(3,j)
          y(4,i)=y(4,i)+a(j,i)*x(4,j)
          y(5,i)=y(5,i)+a(j,i)*x(5,j)
          y(6,i)=y(6,i)+a(j,i)*x(6,j)
          y(7,i)=y(7,i)+a(j,i)*x(7,j)
          y(8,i)=y(8,i)+a(j,i)*x(8,j)
        END DO
      END DO
      RETURN
      END
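
For reference, this is what matvec8a computes, written as plain C++
(a sketch, not part of the benchmark; C arrays are row-major, so the
Fortran subscripts reverse):

// C++ sketch of matvec8a.  Column-major Fortran x(8,m), y(8,n), a(m,n)
// become flat row-major offsets here; Fortran a(j,i) is a[i*m + j].
void matvec8a(int n, int m, const double* a, const double* x, double* y)
{
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j)
      for (int k = 0; k < 8; ++k)   // the Fortran unrolls this loop
        y[i*8 + k] += a[i*m + j] * x[j*8 + k];
}

Unrolling the k loop by hand lets each a(j,i) be loaded once and reused
for all eight right-hand-side vectors, which is the point of the kernel.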
main C++ program
#include <iostream.h>
#include <iomanip.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <unistd.h>

extern "C" {
  int mp_my_threadnum_();
  int mp_numthreads_();
#define MATVEC matvec8a_
  void MATVEC(int*,int*,double*,double*,double*);
}

int main() {

clock_t time1, time2;
double tt1=0.0,tt2=0.0;
unsigned sec = 1;
const int DIM=1440;
const int ORD = 8;
int dim=DIM;
int dim1;
int thds = mp_numthreads_();
const int LOOP=int(100.e9/(DIM*DIM)/ORD);
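// LOOP is scaled so each run does ~1e11 multiply-adds in total
// (LOOP * ORD * DIM * DIM), independent of DIM.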
cout << "\n*******************************************\n";
cout << "pragM: DIM = " << DIM << ", LOOP = " << LOOP << ", "
     << "Threads = " << thds << "\n"; cout.flush();
int i,j,k,pnum;
int pflags[64], flags_tot;
double *out, tot_out=0.0;
out=new double[64];
for(i=0;i<64;i++) out[i] = 0;
for(i=0;i<64;i++) pflags[i] = 0;
double B[DIM], a[DIM][ORD], c[DIM][ORD], A[DIM][DIM];

#pragma parallel shared(tt1,tt2,a,c,A,dim,dim1,pflags) \
	local(time1,time2,flags_tot,i,j,k,pnum)
{
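// SGI MP pragmas: this block is executed by every thread; each
// "pfor" below work-shares its loop's iterations across the threads.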
time1=clock();
#pragma pfor
for(i=0;i<DIM;i++) {
  for(j=0;j<ORD;j++) {
    a[i][j]=1.0+1.2*(i+j)*1e-14;
  }
}

#pragma pfor
for(i=0;i<DIM;i++) {
  for(j=0;j<ORD;j++) {
    c[i][j]=a[i][j];
  }
}

#pragma pfor
for(i=0;i<DIM;i++) {
  for(j=0;j<DIM;j++) {
    A[i][j]=0.51*(i+j+1)*1e-14;
  }
}

pnum = mp_my_threadnum_();   // assign, do not redeclare: pnum is in the
dim1 = dim/thds;             // pragma's local list, dim1 in its shared list

#pragma synchronize
for(k=0;k<LOOP;k++) {
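  // Thread pnum multiplies its own dim1-row block of A by all eight
  // vectors: rows pnum*dim1 through (pnum+1)*dim1 - 1.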
  MATVEC(&dim1,&dim,&A[pnum*dim1][0],&a[0][0],&c[pnum*dim1][0]);
#pragma synchronize               //removed in unsync run
#pragma pfor                      //removed in unsync run
  for(i=0;i<DIM;i++) {            //removed in unsync run
    for(j=0;j<ORD;j++) {          //removed in unsync run
      a[i][j]=c[i][j];            //removed in unsync run
    }                             //removed in unsync run
  }                               //removed in unsync run
}

#pragma pfor
for(i=0;i<DIM;i++) {
  for(j=0;j<ORD;j++) {
    out[pnum]+=a[i][j];
  }
}
time2=clock();
#pragma critical
  tt1+=double(time2-time1)/double(CLOCKS_PER_SEC);
} //end parallel


for(i=0;i<64;i++) tot_out += out[i];

cout << "tot_out = " << setiosflags(ios::fixed | ios::showpoint) <<
setprecision
(8) << tot_out << "\n";
cout << "Total time = " << setprecision(2) << tt1 << " seconds" << endl;

}
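
The pragmas require the MIPSpro compilers' multiprocessing support.
The build line is not shown on this page; it would be something like
the following (file names assumed):

      f77 -mp -O3 -c matvec8a.f
      CC  -mp -O3 pragsync.C matvec8a.o -o pragsync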
batch file for Origin
#!/bin/csh
#BSUB -q debug    
#BSUB -m "jord1"
#BSUB -o pragsync.jo
#BSUB -J pragsync 
#BSUB -c 05:00
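
# MP_SET_NUMTHREADS sets the thread count for the SGI MP runtime;
# ssusage runs pragsync and reports its CPU time and memory usage.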

setenv MP_SET_NUMTHREADS 8       
ssusage pragsync

setenv MP_SET_NUMTHREADS 16       
ssusage pragsync

setenv MP_SET_NUMTHREADS 24       
ssusage pragsync

setenv MP_SET_NUMTHREADS 32      
ssusage pragsync

setenv MP_SET_NUMTHREADS 40       
ssusage pragsync

setenv MP_SET_NUMTHREADS 48       
ssusage pragsync

setenv MP_SET_NUMTHREADS 56       
ssusage pragsync

setenv MP_SET_NUMTHREADS 64       
ssusage pragsync

Your comments and suggestions are appreciated.

Edited by: wilkins@mps.ohio-state.edu [February 1998]