Commit 2475b2a2 authored by petros.anastasiadis

Code comments, Some Graphs

parent e054f618
......@@ -11,11 +11,11 @@
#SBATCH --job-name=run_GPU # Job name
#SBATCH --output=GPU.out
#SBATCH --error=GPU.err
#SBATCH --ntasks=1 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=1 # Number of nodes requested
#SBATCH --ntasks-per-node=1 # Tasks per node
#SBATCH --ntasks=32 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=16 # Number of nodes requested
#SBATCH --ntasks-per-node=2 # Tasks per node
#SBATCH --cpus-per-task=1 # Threads per task
#SBATCH --gres=gpu:1 # GPUs per node
#SBATCH --gres=gpu:2 # GPUs per node
#SBATCH --time=00:40:00 # walltime
#SBATCH --mem=32G # memory per NODE
#SBATCH --partition=gpu # Partition
......@@ -40,9 +40,9 @@ gpu_prog2="./cuBLAS_MultiGPU.exe"
for n;
do
srun $gpu_prog $n $n
srun $gpu_prog1 $n $n
#srun $gpu_prog $n $n >> temp.out
#srun $gpu_prog1 $n $n >> temp.out
# Important note: In the MultiGPU version, the --gres GPU count must equal --ntasks-per-node in order to utilize all GPUs!
# srun $gpu_prog2 $n $n
srun $gpu_prog2 $n $n >> temp.out
done
......@@ -25,17 +25,9 @@ int main(int argc, char **argv)
{
/* Initializations */
int i, j, n, m;
int *I, *cooCol, n_z, sparse=0;
double *cooVal, timer;
double timer;
/* File Input to COO */
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File */
{
if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
sparse = 1;
}
if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
......@@ -44,31 +36,28 @@ int main(int argc, char **argv)
/* Allocate space */
double *x = (double *) malloc(m * sizeof(*x));
double *y = (double *) malloc(n * sizeof(*y));
double *M = (double *) malloc(n * m * sizeof(*M));
if( !y || !x || !M ) error("memory allocation failed");
if( !x || !M ) error("memory allocation failed");
/* Initialize matrices */
if (sparse) {
; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
vec_init(y, n, 0.0);
/* Initialize cuda/cublas variables */
int device_num=0;
cudaGetDeviceCount(&device_num);
if (!device_num) printf("No available Cuda Devices");
else {
printf("Single GPU cuBLAS Version(N=%d, M=%d): ", n, m);
if (!device_num) {
printf("No available Cuda Devices...terminating");
return 0;
}
double alf=1.0; /* Y=a*A*x+b */
double beta=0.0;
cublasHandle_t handle;
double *A, * y, *x_c;
printf("Single GPU cuBLAS Version(N=%d, M=%d): ", n, m);
/* Initialize Unified memory, visible and accessible from both CPU and GPU */
cudaMallocManaged(&A, m*n * sizeof(double));
......@@ -81,7 +70,7 @@ int main(int argc, char **argv)
matrix_col_major(M, A, n, m); /* We transpose the matrix because cuBLAS works with column-major format */
cublasCreate(&handle);
/* Warmup */
/*GPU Warmup */
cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
cudaDeviceSynchronize();
......@@ -102,7 +91,7 @@ int main(int argc, char **argv)
fclose(fp) ;
#endif
report_results(timer);
}
return 0;
}
......
......@@ -27,27 +27,22 @@ int main(int argc, char ** argv)
int rank,size;
int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
int i, j, sparse=0, *cooCol, n_z, *I;
double * M, *M_cl, * A, * x, * y, *local_y, *x_c, * cooVal, comm_t, comp_t;
int i, j;
double * M, *M_cl, * A, * x, * y, *local_y, *x_c, comm_t, comp_t;
/* MPI basic initializations */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
{
if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed");
sparse = 1;
}
else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
global_nm[0]=atoi(argv[1]);
global_nm[1]=atoi(argv[2]);
if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
global_nm[0] = atoi(argv[1]);
global_nm[1] = atoi(argv[2]);
}
else error("Too many Arguments");
/* Padd M if needed */
/* Pad M into 'size' equal pieces */
local_nm[0]=global_nm[0];
global_padded_nm[0]=global_nm[0];
......@@ -72,10 +67,7 @@ int main(int argc, char ** argv)
if( !y || !x || !M || !M_cl ) error("memory allocation failed");
/* Initialize matrices */
if (sparse) {
; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
}
//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local_nm[0],local_nm[1],global_padded_nm[0],global_padded_nm[1]);
......@@ -87,9 +79,10 @@ int main(int argc, char ** argv)
x_c = (double *) malloc(local_nm[1] * sizeof(*x_c));
if ( !A || !local_y || !x_c) error("Process local alloc failed");
/* Unlike the MPI code, we want each process's data to be in a good shape for GPU utilization. That's why we transpose the matrix and scatter it M dimension-wise */
if(rank == 0) matrix_col_major(M, M_cl, global_padded_nm[0], global_padded_nm[1]);
/* Rank 0 scatters the global matrix and x vector */
/* Rank 0 scatters the global matrix and broadcasts x vector */
double * gsendbuf;
if (rank == 0){
gsendbuf = &(M_cl[0]);
......@@ -113,7 +106,7 @@ int main(int argc, char ** argv)
cublasHandle_t handle;
cublasCreate(&handle);
/* Initialize local GPU memmory. Unified memmory not recomended for MultiGPU+Multinode because data size tends to be large */
/* Initialize local GPU memory. Unified memory is not recommended for MultiGPU+Multinode because data size tends to be large (possible performance degradation) */
double * gpu_y = (double *) gpu_alloc(local_nm[0] * sizeof(*gpu_y)) ;
double * gpu_xc = (double *) gpu_alloc(local_nm[1] * sizeof(*gpu_xc)) ;
double * gpu_A = (double *) gpu_alloc(local_nm[0] * local_nm[1] * sizeof(*gpu_A)) ;
......@@ -146,7 +139,6 @@ int main(int argc, char ** argv)
if (rank==0) comm_t= MPI_Wtime() - comm_t;
MPI_Reduce(local_y, y, local_nm[0], MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
//MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) comm_t = MPI_Wtime() - comm_t;
if (rank == 0) {
......
......@@ -28,17 +28,10 @@ int main(int argc, char **argv)
{
/* Initializations */
int i, j, n, m;
int *I, *cooCol, n_z, sparse=0;
double *cooVal, timer;
double timer;
/* File Input to COO */
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File */
{
if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
sparse = 1;
}
if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
......@@ -48,6 +41,7 @@ int main(int argc, char **argv)
int grid_size = (n-1)/block_size + 1;
size_t shmem_size = 0;
/* GPU kernel block/grid sizes */
dim3 gpu_block(block_size, 1);
dim3 gpu_grid(grid_size, 1);
......@@ -60,12 +54,7 @@ int main(int argc, char **argv)
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
if (sparse) {
; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else ser_matrix_init_rand(M, n, m, 1.0); /* Normal matrices generated randomly */
ser_matrix_init_rand(M, n, m, 1.0); /* Normal matrices generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
......@@ -90,7 +79,6 @@ int main(int argc, char **argv)
for (i = 0; i < m; i++) x_c[i] = x[i];
/* First naive kernel */
for ( i = 0; i < n*m; i++) A[i] = M[i] ;
timer=csecond();
for (j = 0; j < NR_ITER; ++j) {
......
......@@ -20,23 +20,18 @@ int main(int argc, char ** argv) {
int rank,size;
int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
int i,j,k, sparse=0, *cooCol, n_z, *I;
double * M, * A, * x, * y, *local_y, * cooVal, comm_t, comp_t;
int i,j,k;
double * M, * A, * x, * y, *local_y, comm_t, comp_t;
/* MPI basic initializations */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
{
if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed");
sparse = 1;
}
else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
global_nm[0]=atoi(argv[1]);
global_nm[1]=atoi(argv[2]);
if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
global_nm[0] = atoi(argv[1]);
global_nm[1] = atoi(argv[2]);
}
else error("Too many Arguments");
......@@ -62,10 +57,7 @@ int main(int argc, char ** argv) {
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
if (sparse) {
; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
}
......
......@@ -20,27 +20,22 @@ int main(int argc, char ** argv) {
int rank,size;
int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
int i,j,k, sparse=0, *cooCol, n_z, *I;
double * M, * A, * x, * y, *local_y, * cooVal, comm_t, comp_t;
int i,j,k;
double * M, * A, * x, * y, *local_y, comm_t, comp_t;
/* MPI basic initializations */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
{
if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed");
sparse = 1;
}
else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
global_nm[0]=atoi(argv[1]);
global_nm[1]=atoi(argv[2]);
if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
global_nm[0] = atoi(argv[1]);
global_nm[1] = atoi(argv[2]);
}
else error("Too many Arguments");
/* Padd N if needed */
/* Pad N into 'size' equal pieces */
local_nm[1]=global_nm[1];
global_padded_nm[1]=global_nm[1];
......@@ -62,10 +57,7 @@ int main(int argc, char ** argv) {
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
if (sparse) {
; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
}
......
......@@ -11,8 +11,8 @@
#SBATCH --job-name=run_mpi # Job name
#SBATCH --output=MPI.out
#SBATCH --error=MPI.err
#SBATCH --ntasks=4 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=4 # Number of nodes requested
#SBATCH --ntasks=16 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=16 # Number of nodes requested
#SBATCH --ntasks-per-node=1 # Tasks per node
#SBATCH --cpus-per-task=20 # Threads per task
#SBATCH --time=00:10:00 # walltime
......@@ -43,11 +43,11 @@ export OMP_PROC_BIND=spread # OpenMP thread affinity variable
for n;
do
srun $gpu_prog $n $n
srun $gpu_prog $n $n >> mpi.out
for tr in 1 2 5 10 20 # Run for different OpenMP thread numbers ( tr <= cpus-per-task )
do
export OMP_NUM_THREADS=$tr
srun $gpu_prog1 $n $n
srun $gpu_prog1 $n $n >> mpi.out
done
done
......
......@@ -22,16 +22,10 @@ int main(int argc, char **argv)
{
/* Initializations */
int i, j, k, n, m;
int *I, *cooCol, n_z, sparse=0;
double *cooVal, timer;
double timer;
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
{
if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
sparse = 1;
}
else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
}
......@@ -45,10 +39,7 @@ int main(int argc, char **argv)
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
if (sparse) {
regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
......
......@@ -20,16 +20,10 @@ int main(int argc, char **argv)
{
/* Initializations */
int i, j, k, n, m;
int *I, *cooCol, n_z, sparse=0;
double *cooVal, timer;
double timer;
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
{
if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
sparse = 1;
}
else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
}
......@@ -50,10 +44,7 @@ int main(int argc, char **argv)
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
if (sparse) {
; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
......
......@@ -17,17 +17,10 @@ int main(int argc, char **argv)
{
/* Initializations */
int i, j, k, n, m;
int *I, *cooCol, n_z, sparse=0;
double *cooVal, timer;
double timer;
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
{
if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
sparse = 1;
}
else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
}
......@@ -41,10 +34,7 @@ int main(int argc, char **argv)
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
if (sparse) {
regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment