From 2475b2a243b43b84ddce74156f05276552aad9eb Mon Sep 17 00:00:00 2001
From: "petros.anastasiadis"
Date: Tue, 17 Oct 2017 14:53:28 +0300
Subject: [PATCH] Code comments, Some Graphs
---
GPUs/GPU.slurm | 14 +++++++-------
GPUs/cuBLAS.cu | 35 ++++++++++++-----------------------
GPUs/cuBLAS_MultiGPU.cu | 30 +++++++++++-------------------
GPUs/cuda_SingleGPU.cu | 20 ++++----------------
MPI/MPI-OpenMP.c | 22 +++++++---------------
MPI/MPI.c | 24 ++++++++----------------
MPI/MPI.slurm | 8 ++++----
OpenMP/OpenMP.c | 17 ++++-------------
OpenMP/OpenMP_aff.c | 17 ++++-------------
Serial/Serial.c | 18 ++++--------------
10 files changed, 65 insertions(+), 140 deletions(-)
diff --git a/GPUs/GPU.slurm b/GPUs/GPU.slurm
index 6a516e1..86653c5 100644
--- a/GPUs/GPU.slurm
+++ b/GPUs/GPU.slurm
@@ -11,11 +11,11 @@
#SBATCH --job-name=run_GPU # Job name
#SBATCH --output=GPU.out
#SBATCH --error=GPU.err
-#SBATCH --ntasks=1 # Number of processor cores (i.e. tasks)
-#SBATCH --nodes=1 # Number of nodes requested
-#SBATCH --ntasks-per-node=1 # Tasks per node
+#SBATCH --ntasks=32 # Number of processor cores (i.e. tasks)
+#SBATCH --nodes=16 # Number of nodes requested
+#SBATCH --ntasks-per-node=2 # Tasks per node
#SBATCH --cpus-per-task=1 # Threads per task
-#SBATCH --gres=gpu:1 # GPUs per node
+#SBATCH --gres=gpu:2 # GPUs per node
#SBATCH --time=00:40:00 # walltime
#SBATCH --mem=32G # memory per NODE
#SBATCH --partition=gpu # Partition
@@ -40,9 +40,9 @@ gpu_prog2="./cuBLAS_MultiGPU.exe"
for n;
do
- srun $gpu_prog $n $n
- srun $gpu_prog1 $n $n
+ #srun $gpu_prog $n $n >> temp.out
+ #srun $gpu_prog1 $n $n >> temp.out
# Important note: in the MultiGPU version, gres must equal ntasks-per-node in order to utilize all GPUs!
-# srun $gpu_prog2 $n $n
+ srun $gpu_prog2 $n $n >> temp.out
done
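
The note above matters because each MPI rank typically binds to one GPU on its node: with --ntasks-per-node=2 and --gres=gpu:2, every local rank gets its own device. A minimal sketch of such a per-rank binding, assuming a modulo device-selection scheme (the actual selection logic in cuBLAS_MultiGPU.cu is not shown in this patch):

    #include <mpi.h>
    #include <cuda_runtime.h>

    /* Bind each rank to one of the GPUs visible on its node (assumed scheme). */
    void bind_rank_to_gpu(int rank)
    {
        int device_num = 0;
        cudaGetDeviceCount(&device_num);      /* GPUs granted by --gres */
        if (device_num > 0)
            cudaSetDevice(rank % device_num); /* one device per local rank */
    }

If gres is smaller than ntasks-per-node, two ranks end up sharing a device; if it is larger, the extra GPUs stay idle.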
diff --git a/GPUs/cuBLAS.cu b/GPUs/cuBLAS.cu
index d05d5f2..d2c10ef 100644
--- a/GPUs/cuBLAS.cu
+++ b/GPUs/cuBLAS.cu
@@ -25,17 +25,9 @@ int main(int argc, char **argv)
{
/* Initializations */
int i, j, n, m;
- int *I, *cooCol, n_z, sparse=0;
- double *cooVal, timer;
+ double timer;
-
- /* File Input to COO */
- if (argc < 2) error("Too few Arguments");
- else if ( argc == 2) /* ./Program Input_File */
- {
- if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
- sparse = 1;
- }
+ if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
@@ -44,31 +36,28 @@ int main(int argc, char **argv)
/* Allocate space */
double *x = (double *) malloc(m * sizeof(*x));
- double *y = (double *) malloc(n * sizeof(*y));
double *M = (double *) malloc(n * m * sizeof(*M));
-
- if( !y || !x || !M ) error("memory allocation failed");
+ if( !x || !M ) error("memory allocation failed");
/* Initialize matrices */
- if (sparse) {
- ; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
- }
- else ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+ ser_matrix_init_rand(M,n,m,1.0); /* Dense matrix generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
- vec_init(y, n, 0.0);
/* Initialize cuda/cublas variables */
int device_num=0;
cudaGetDeviceCount(&device_num);
- if (!device_num) printf("No available Cuda Devices");
- else {
- printf("Single GPU cuBLAS Version(N=%d, M=%d): ", n, m);
+ if (!device_num) {
+ printf("No available Cuda Devices...terminating");
+ return 0;
+ }
double alf=1.0; /* y = alf*A*x + beta*y */
double beta=0.0;
cublasHandle_t handle;
double *A, * y, *x_c;
+
+ printf("Single GPU cuBLAS Version(N=%d, M=%d): ", n, m);
/* Initialize Unified memmory visible and accesible from both CPU and GPU */
cudaMallocManaged(&A, m*n * sizeof(double));
@@ -81,7 +70,7 @@ int main(int argc, char **argv)
matrix_col_major(M, A, n, m); /* We transpose the matrix because cuBLAS works with column-major format */
cublasCreate(&handle);
- /* Warmup */
+ /* GPU warmup */
cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
cudaDeviceSynchronize();
@@ -102,7 +91,7 @@ int main(int argc, char **argv)
fclose(fp) ;
#endif
report_results(timer);
- }
+
return 0;
}
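
For reference, the call pattern this file uses, unified memory plus a column-major DGEMV computing y = alf*A*x + beta*y, reduces to the following sketch; error checking and the file's helpers (matrix_col_major, the timing code) are omitted, and the function name is illustrative:

    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    /* y = 1.0 * A * x + 0.0 * y, with A stored column-major (n rows, m cols) */
    void gemv_unified(int n, int m)
    {
        double *A, *x, *y, alf = 1.0, beta = 0.0;
        cudaMallocManaged(&A, (size_t)n * m * sizeof(*A)); /* visible to CPU and GPU */
        cudaMallocManaged(&x, m * sizeof(*x));
        cudaMallocManaged(&y, n * sizeof(*y));
        /* ... fill A (column-major!) and x on the CPU ... */
        cublasHandle_t handle;
        cublasCreate(&handle);
        cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A, n, x, 1, &beta, y, 1);
        cudaDeviceSynchronize(); /* required before reading y on the CPU or timing */
        cublasDestroy(handle);
        cudaFree(A); cudaFree(x); cudaFree(y);
    }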
diff --git a/GPUs/cuBLAS_MultiGPU.cu b/GPUs/cuBLAS_MultiGPU.cu
index 429c1fe..f8258dc 100644
--- a/GPUs/cuBLAS_MultiGPU.cu
+++ b/GPUs/cuBLAS_MultiGPU.cu
@@ -27,27 +27,22 @@ int main(int argc, char ** argv)
int rank,size;
int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
- int i, j, sparse=0, *cooCol, n_z, *I;
- double * M, *M_cl, * A, * x, * y, *local_y, *x_c, * cooVal, comm_t, comp_t;
+ int i, j;
+ double * M, *M_cl, * A, * x, * y, *local_y, *x_c, comm_t, comp_t;
/* MPI basic initializations */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
- if (argc < 2) error("Too few Arguments");
- else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
- {
- if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed");
- sparse = 1;
- }
- else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
- global_nm[0]=atoi(argv[1]);
- global_nm[1]=atoi(argv[2]);
+ if (argc < 3) error("Usage: ./Program N M");
+ else if ( argc == 3) { /*./Program N M */
+ global_nm[0] = atoi(argv[1]);
+ global_nm[1] = atoi(argv[2]);
}
else error("Too many Arguments");
- /* Padd M if needed */
+ /* Pad M so it splits into 'size' equal pieces */
local_nm[0]=global_nm[0];
global_padded_nm[0]=global_nm[0];
@@ -72,10 +67,7 @@ int main(int argc, char ** argv)
if( !y || !x || !M || !M_cl ) error("memory allocation failed");
/* Initialize matrices */
- if (sparse) {
- ; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
- }
- else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
+ ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Dense matrix generated randomly */
}
//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local_nm[0],local_nm[1],global_padded_nm[0],global_padded_nm[1]);
@@ -87,9 +79,10 @@ int main(int argc, char ** argv)
x_c = (double *) malloc(local_nm[1] * sizeof(*x_c));
if ( !A || !local_y || !x_c) error("Process local alloc failed");
+ /* Unlike the MPI code, each process's data must be laid out for efficient GPU use; that is why we transpose the matrix and scatter it along the M dimension */
if(rank == 0) matrix_col_major(M, M_cl, global_padded_nm[0], global_padded_nm[1]);
- /* Rank 0 scatters the global matrix and x vector */
+ /* Rank 0 scatters the global matrix and broadcasts the x vector */
double * gsendbuf;
if (rank == 0){
gsendbuf = &(M_cl[0]);
@@ -113,7 +106,7 @@ int main(int argc, char ** argv)
cublasHandle_t handle;
cublasCreate(&handle);
- /* Initialize local GPU memmory. Unified memmory not recomended for MultiGPU+Multinode because data size tends to be large */
+ /* Initialize local GPU memory. Unified memory is not recommended for multi-GPU/multi-node runs because the data size tends to be large (possible performance degradation) */
double * gpu_y = (double *) gpu_alloc(local_nm[0] * sizeof(*gpu_y)) ;
double * gpu_xc = (double *) gpu_alloc(local_nm[1] * sizeof(*gpu_xc)) ;
double * gpu_A = (double *) gpu_alloc(local_nm[0] * local_nm[1] * sizeof(*gpu_A)) ;
@@ -146,7 +139,6 @@ int main(int argc, char ** argv)
if (rank==0) comm_t= MPI_Wtime() - comm_t;
MPI_Reduce(local_y, y, local_nm[0], MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
- //MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) comm_t = MPI_Wtime() - comm_t;
if (rank == 0) {
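
The two comments above fit together: after the transpose to column-major, each rank's share of the matrix (all n rows, m/size columns) is one contiguous block, so a single MPI_Scatter distributes it, and each partial product covers the full-length y, which is why results are combined with MPI_Reduce(MPI_SUM) rather than MPI_Gather. A sketch under those assumptions, with illustrative buffer names:

    #include <mpi.h>

    /* Scatter contiguous column slabs of the column-major matrix, then
     * sum the full-length partial products on rank 0. */
    void scatter_and_reduce(const double *M_cl, double *A_local,
                            const double *local_y, double *y,
                            int n, int m_padded, int size)
    {
        int m_local = m_padded / size;             /* columns per rank */
        MPI_Scatter(M_cl, n * m_local, MPI_DOUBLE, /* one slab per rank */
                    A_local, n * m_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        /* ... each rank computes local_y = A_local * x_local on its GPU ... */
        MPI_Reduce(local_y, y, n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    }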
diff --git a/GPUs/cuda_SingleGPU.cu b/GPUs/cuda_SingleGPU.cu
index db61050..f580485 100644
--- a/GPUs/cuda_SingleGPU.cu
+++ b/GPUs/cuda_SingleGPU.cu
@@ -28,17 +28,10 @@ int main(int argc, char **argv)
{
/* Initializations */
int i, j, n, m;
- int *I, *cooCol, n_z, sparse=0;
- double *cooVal, timer;
+ double timer;
- /* File Input to COO */
- if (argc < 2) error("Too few Arguments");
- else if ( argc == 2) /* ./Program Input_File */
- {
- if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
- sparse = 1;
- }
+ if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
@@ -48,6 +41,7 @@ int main(int argc, char **argv)
int grid_size = (n-1)/block_size + 1;
size_t shmem_size = 0;
+ /* GPU kernel block/grid sizes */
dim3 gpu_block(block_size, 1);
dim3 gpu_grid(grid_size, 1);
@@ -60,12 +54,7 @@ int main(int argc, char **argv)
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
- if (sparse) {
- ; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
- }
- else ser_matrix_init_rand(M, n, m, 1.0); /* Normal matrices generated randomly */
-
-
+ ser_matrix_init_rand(M, n, m, 1.0); /* Dense matrix generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
@@ -90,7 +79,6 @@ int main(int argc, char **argv)
for (i = 0; i < m; i++) x_c[i] = x[i];
/* First naive kernel */
-
for ( i = 0; i < n*m; i++) A[i] = M[i] ;
timer=csecond();
for (j = 0; j < NR_ITER; ++j) {
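
The grid sizing above, grid_size = (n-1)/block_size + 1, is a ceiling division: it guarantees grid_size * block_size >= n, and the overhang threads are masked by a bounds check in the kernel. A naive row-per-thread kernel in that style (illustrative, not the exact kernel in cuda_SingleGPU.cu):

    /* One thread per output row; threads with row >= n do nothing. */
    __global__ void dgemv_naive(const double *A, const double *x, double *y,
                                int n, int m)
    {
        int row = blockIdx.x * blockDim.x + threadIdx.x;
        if (row < n) {                        /* guard the padded threads */
            double sum = 0.0;
            for (int j = 0; j < m; j++)
                sum += A[row * m + j] * x[j]; /* row-major A */
            y[row] = sum;
        }
    }
    /* launched as dgemv_naive<<<gpu_grid, gpu_block, shmem_size>>>(A, x_c, y, n, m) */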
diff --git a/MPI/MPI-OpenMP.c b/MPI/MPI-OpenMP.c
index 22e2530..372e79b 100644
--- a/MPI/MPI-OpenMP.c
+++ b/MPI/MPI-OpenMP.c
@@ -20,23 +20,18 @@ int main(int argc, char ** argv) {
int rank,size;
int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
- int i,j,k, sparse=0, *cooCol, n_z, *I;
- double * M, * A, * x, * y, *local_y, * cooVal, comm_t, comp_t;
+ int i,j,k;
+ double * M, * A, * x, * y, *local_y, comm_t, comp_t;
/* MPI basic initializations */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
- if (argc < 2) error("Too few Arguments");
- else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
- {
- if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed");
- sparse = 1;
- }
- else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
- global_nm[0]=atoi(argv[1]);
- global_nm[1]=atoi(argv[2]);
+ if (argc < 3) error("Usage: ./Program N M");
+ else if ( argc == 3) { /*./Program N M */
+ global_nm[0] = atoi(argv[1]);
+ global_nm[1] = atoi(argv[2]);
}
else error("Too many Arguments");
@@ -62,10 +57,7 @@ int main(int argc, char ** argv) {
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
- if (sparse) {
- ; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
- }
- else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
+ ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Dense matrix generated randomly */
}
diff --git a/MPI/MPI.c b/MPI/MPI.c
index 2741b86..ac56f49 100644
--- a/MPI/MPI.c
+++ b/MPI/MPI.c
@@ -20,27 +20,22 @@ int main(int argc, char ** argv) {
int rank,size;
int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
- int i,j,k, sparse=0, *cooCol, n_z, *I;
- double * M, * A, * x, * y, *local_y, * cooVal, comm_t, comp_t;
+ int i,j,k;
+ double * M, * A, * x, * y, *local_y, comm_t, comp_t;
/* MPI basic initializations */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
- if (argc < 2) error("Too few Arguments");
- else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
- {
- if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed");
- sparse = 1;
- }
- else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
- global_nm[0]=atoi(argv[1]);
- global_nm[1]=atoi(argv[2]);
+ if (argc < 3) error("Usage: ./Program N M");
+ else if ( argc == 3) { /*./Program N M */
+ global_nm[0] = atoi(argv[1]);
+ global_nm[1] = atoi(argv[2]);
}
else error("Too many Arguments");
- /* Padd N if needed */
+ /* Pad N so it splits into 'size' equal pieces */
local_nm[1]=global_nm[1];
global_padded_nm[1]=global_nm[1];
@@ -62,10 +57,7 @@ int main(int argc, char ** argv) {
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
- if (sparse) {
- ; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
- }
- else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
+ ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Dense matrix generated randomly */
}
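
The padding in both MPI versions follows the same rule: round the split dimension up to the next multiple of 'size' so every rank receives an equal, contiguous slab; the pad rows are zero and drop out of the final result. A sketch of that computation (the helper itself is assumed, not part of the file):

    /* Round n up to the next multiple of size. */
    static int pad_to_multiple(int n, int size)
    {
        if (n % size == 0)
            return n;                   /* already divisible, no padding */
        return (n / size + 1) * size;   /* round up */
    }
    /* e.g. n = 1000, size = 16 -> padded 1008, i.e. 63 rows per rank */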
diff --git a/MPI/MPI.slurm b/MPI/MPI.slurm
index deaf636..34eedc7 100644
--- a/MPI/MPI.slurm
+++ b/MPI/MPI.slurm
@@ -11,8 +11,8 @@
#SBATCH --job-name=run_mpi # Job name
#SBATCH --output=MPI.out
#SBATCH --error=MPI.err
-#SBATCH --ntasks=4 # Number of processor cores (i.e. tasks)
-#SBATCH --nodes=4 # Number of nodes requested
+#SBATCH --ntasks=16 # Number of processor cores (i.e. tasks)
+#SBATCH --nodes=16 # Number of nodes requested
#SBATCH --ntasks-per-node=1 # Tasks per node
#SBATCH --cpus-per-task=20 # Threads per task
#SBATCH --time=00:10:00 # walltime
@@ -43,11 +43,11 @@ export OMP_PROC_BIND=spread # OpenMP thread affinity variable
for n;
do
- srun $gpu_prog $n $n
+ srun $gpu_prog $n $n >> mpi.out
for tr in 1 2 5 10 20 # Run for different OpenMP thread numbers ( tr <= cpus-per-task )
do
export OMP_NUM_THREADS=$tr
- srun $gpu_prog1 $n $n
+ srun $gpu_prog1 $n $n >> mpi.out
done
done
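
Each srun in the loop above inherits the exported OMP_NUM_THREADS, so the hybrid binary's parallel regions run with 1, 2, 5, 10, or 20 threads per task, while OMP_PROC_BIND=spread spaces them across the cores. A small check of what the binary actually sees (illustrative helper, not part of MPI-OpenMP.c):

    #include <stdio.h>
    #include <omp.h>

    /* Print the team size a parallel region actually gets. */
    void report_thread_env(int rank)
    {
        #pragma omp parallel
        {
            #pragma omp master
            printf("rank %d: %d OpenMP threads\n", rank, omp_get_num_threads());
        }
    }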
diff --git a/OpenMP/OpenMP.c b/OpenMP/OpenMP.c
index 642d321..4fb7725 100644
--- a/OpenMP/OpenMP.c
+++ b/OpenMP/OpenMP.c
@@ -22,16 +22,10 @@ int main(int argc, char **argv)
{
/* Initializations */
int i, j, k, n, m;
- int *I, *cooCol, n_z, sparse=0;
- double *cooVal, timer;
+ double timer;
- if (argc < 2) error("Too few Arguments");
- else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
- {
- if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
- sparse = 1;
- }
- else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
+ if (argc < 3) error("Usage: ./Program N M");
+ else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
}
@@ -45,10 +39,7 @@ int main(int argc, char **argv)
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
- if (sparse) {
- regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
- }
- else matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+ matrix_init_rand(M,n,m,1.0); /* Dense matrix generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
diff --git a/OpenMP/OpenMP_aff.c b/OpenMP/OpenMP_aff.c
index 0dc29aa..9b59ece 100644
--- a/OpenMP/OpenMP_aff.c
+++ b/OpenMP/OpenMP_aff.c
@@ -20,16 +20,10 @@ int main(int argc, char **argv)
{
/* Initializations */
int i, j, k, n, m;
- int *I, *cooCol, n_z, sparse=0;
- double *cooVal, timer;
+ double timer;
- if (argc < 2) error("Too few Arguments");
- else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
- {
- if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
- sparse = 1;
- }
- else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
+ if (argc < 3) error("Usage: ./Program N M");
+ else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
}
@@ -50,10 +44,7 @@ int main(int argc, char **argv)
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
- if (sparse) {
- ; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
- }
- else ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+ ser_matrix_init_rand(M,n,m,1.0); /* Dense matrix generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
diff --git a/Serial/Serial.c b/Serial/Serial.c
index a6dbfc6..66720c5 100644
--- a/Serial/Serial.c
+++ b/Serial/Serial.c
@@ -17,17 +17,10 @@ int main(int argc, char **argv)
{
/* Initializations */
int i, j, k, n, m;
- int *I, *cooCol, n_z, sparse=0;
- double *cooVal, timer;
+ double timer;
-
- if (argc < 2) error("Too few Arguments");
- else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
- {
- if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
- sparse = 1;
- }
- else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
+ if (argc < 3) error("Usage: ./Program N M");
+ else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
}
@@ -41,10 +34,7 @@ int main(int argc, char **argv)
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
- if (sparse) {
- regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
- }
- else matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+ matrix_init_rand(M,n,m,1.0); /* Dense matrix generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
--
GitLab