From 2475b2a243b43b84ddce74156f05276552aad9eb Mon Sep 17 00:00:00 2001
From: "petros.anastasiadis"
Date: Tue, 17 Oct 2017 14:53:28 +0300
Subject: [PATCH] Code comments, Some Graphs

---
 GPUs/GPU.slurm          | 14 +++++++-------
 GPUs/cuBLAS.cu          | 35 ++++++++++++-----------------------
 GPUs/cuBLAS_MultiGPU.cu | 30 +++++++++++-------------------
 GPUs/cuda_SingleGPU.cu  | 20 ++++----------------
 MPI/MPI-OpenMP.c        | 22 +++++++---------------
 MPI/MPI.c               | 24 ++++++++----------------
 MPI/MPI.slurm           |  8 ++++----
 OpenMP/OpenMP.c         | 17 ++++-------------
 OpenMP/OpenMP_aff.c     | 17 ++++-------------
 Serial/Serial.c         | 18 ++++--------------
 10 files changed, 65 insertions(+), 140 deletions(-)

diff --git a/GPUs/GPU.slurm b/GPUs/GPU.slurm
index 6a516e1..86653c5 100644
--- a/GPUs/GPU.slurm
+++ b/GPUs/GPU.slurm
@@ -11,11 +11,11 @@
 #SBATCH --job-name=run_GPU # Job name
 #SBATCH --output=GPU.out
 #SBATCH --error=GPU.err
-#SBATCH --ntasks=1 # Number of processor cores (i.e. tasks)
-#SBATCH --nodes=1 # Number of nodes requested
-#SBATCH --ntasks-per-node=1 # Tasks per node
+#SBATCH --ntasks=32 # Number of processor cores (i.e. tasks)
+#SBATCH --nodes=16 # Number of nodes requested
+#SBATCH --ntasks-per-node=2 # Tasks per node
 #SBATCH --cpus-per-task=1 # Threads per task
-#SBATCH --gres=gpu:1 # GPUs per node
+#SBATCH --gres=gpu:2 # GPUs per node
 #SBATCH --time=00:40:00 # walltime
 #SBATCH --mem=32G # memory per NODE
 #SBATCH --partition=gpu # Partition
@@ -40,9 +40,9 @@ gpu_prog2="./cuBLAS_MultiGPU.exe"
 
 for n;
 do
-	srun $gpu_prog $n $n
-	srun $gpu_prog1 $n $n
+	#srun $gpu_prog $n $n >> temp.out
+	#srun $gpu_prog1 $n $n >> temp.out
 # Important note: In MultiGPU version you must use gres=ntasks-per-node values in order to utilize all GPUs !!!
-#	srun $gpu_prog2 $n $n
+	srun $gpu_prog2 $n $n >> temp.out
 done
 
diff --git a/GPUs/cuBLAS.cu b/GPUs/cuBLAS.cu
index d05d5f2..d2c10ef 100644
--- a/GPUs/cuBLAS.cu
+++ b/GPUs/cuBLAS.cu
@@ -25,17 +25,9 @@ int main(int argc, char **argv)
 {
 	/* Initializations */
 	int i, j, n, m;
-	int *I, *cooCol, n_z, sparse=0;
-	double *cooVal, timer;
+	double timer;
 
-
-	/* File Input to COO */
-	if (argc < 2) error("Too few Arguments");
-	else if ( argc == 2) /* ./Program Input_File */
-	{
-		if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
-		sparse = 1;
-	}
+	if (argc < 3) error("Usage: ./Program N M");
 	else if ( argc == 3) { /*./Program N M */
 		n = atoi(argv[1]);
 		m = atoi(argv[2]);
@@ -44,31 +36,28 @@ int main(int argc, char **argv)
 
 	/* Allocate space */
 	double *x = (double *) malloc(m * sizeof(*x));
-	double *y = (double *) malloc(n * sizeof(*y));
 	double *M = (double *) malloc(n * m * sizeof(*M));
-
-	if( !y || !x || !M ) error("memory allocation failed");
+	if( !x || !M ) error("memory allocation failed");
 
 	/* Initialize matrices */
-	if (sparse) {
-		; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
-	}
-	else ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+	ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
 
 	/* Initialize vectors */
 	vec_init_rand(x, m, 1.0);
-	vec_init(y, n, 0.0);
 
 	/* Initialize cuda/cublas variables */
 	int device_num=0;
 	cudaGetDeviceCount(&device_num);
-	if (!device_num) printf("No available Cuda Devices");
-	else {
-		printf("Single GPU cuBLAS Version(N=%d, M=%d): ", n, m);
+	if (!device_num) {
+		printf("No available Cuda Devices...terminating");
+		return 0;
+	}
 	double alf=1.0; /* Y=a*A*x+b */
 	double beta=0.0;
 	cublasHandle_t handle;
 	double *A, * y, *x_c;
+
+	printf("Single GPU cuBLAS Version(N=%d, M=%d): ", n, m);
 
 	/* Initialize Unified memmory visible and accesible from both CPU and GPU */
 	cudaMallocManaged(&A, m*n * sizeof(double));
@@ -81,7 +70,7 @@ int main(int argc, char **argv)
 	matrix_col_major(M, A, n, m); /* We transpose the matrix because cuBLAS works with column-major format */
 
 	cublasCreate(&handle);
-	/* Warmup */
+	/* GPU Warmup */
 	cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
 	cudaDeviceSynchronize();
 
@@ -102,7 +91,7 @@ int main(int argc, char **argv)
 	fclose(fp) ;
 #endif
 	report_results(timer);
-	}
+	return 0;
 
 }
 
diff --git a/GPUs/cuBLAS_MultiGPU.cu b/GPUs/cuBLAS_MultiGPU.cu
index 429c1fe..f8258dc 100644
--- a/GPUs/cuBLAS_MultiGPU.cu
+++ b/GPUs/cuBLAS_MultiGPU.cu
@@ -27,27 +27,22 @@ int main(int argc, char ** argv)
 	int rank,size;
 	int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
 	int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
-	int i, j, sparse=0, *cooCol, n_z, *I;
-	double * M, *M_cl, * A, * x, * y, *local_y, *x_c, * cooVal, comm_t, comp_t;
+	int i, j;
+	double * M, *M_cl, * A, * x, * y, *local_y, *x_c, comm_t, comp_t;
 
 	/* MPI basic initializations */
 	MPI_Init(&argc,&argv);
 	MPI_Comm_size(MPI_COMM_WORLD,&size);
 	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
 
-	if (argc < 2) error("Too few Arguments");
-	else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
-	{
-		if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed");
-		sparse = 1;
-	}
-	else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
-		global_nm[0]=atoi(argv[1]);
-		global_nm[1]=atoi(argv[2]);
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		global_nm[0] = atoi(argv[1]);
+		global_nm[1] = atoi(argv[2]);
 	}
 	else error("Too many Arguments");
 
 
-	/* Padd M if needed */
+	/* Pad M so it can be split into 'size' equal pieces */
 	local_nm[0]=global_nm[0];
 	global_padded_nm[0]=global_nm[0];
@@ -72,10 +67,7 @@ int main(int argc, char ** argv)
 		if( !y || !x || !M || !M_cl ) error("memory allocation failed");
 
 		/* Initialize matrices */
-		if (sparse) {
-			; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
-		}
-		else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
+		ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
 	}
 
 	//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local_nm[0],local_nm[1],global_padded_nm[0],global_padded_nm[1]);
@@ -87,9 +79,10 @@ int main(int argc, char ** argv)
 	x_c = (double *) malloc(local_nm[1] * sizeof(*x_c));
 	if ( !A || !local_y || !x_c) error("Process local alloc failed");
 
+	/* Unlike the MPI code, we want each process's data to be in a shape suited for GPU utilization. That's why we transpose the matrix and scatter it M dimension-wise */
 	if(rank == 0) matrix_col_major(M, M_cl, global_padded_nm[0], global_padded_nm[1]);
 
-	/* Rank 0 scatters the global matrix and x vector */
+	/* Rank 0 scatters the global matrix and broadcasts the x vector */
 	double * gsendbuf;
 	if (rank == 0){
 		gsendbuf = &(M_cl[0]);
@@ -113,7 +106,7 @@ int main(int argc, char ** argv)
 	cublasHandle_t handle;
 	cublasCreate(&handle);
 
-	/* Initialize local GPU memmory. Unified memmory not recomended for MultiGPU+Multinode because data size tends to be large */
+	/* Initialize local GPU memory. Unified memory is not recommended for MultiGPU+Multinode because data size tends to be large (possible performance degradation) */
 	double * gpu_y = (double *) gpu_alloc(local_nm[0] * sizeof(*gpu_y)) ;
 	double * gpu_xc = (double *) gpu_alloc(local_nm[1] * sizeof(*gpu_xc)) ;
 	double * gpu_A = (double *) gpu_alloc(local_nm[0] * local_nm[1] * sizeof(*gpu_A)) ;
@@ -146,7 +139,6 @@ int main(int argc, char ** argv)
 
 	if (rank==0) comm_t= MPI_Wtime() - comm_t;
 	MPI_Reduce(local_y, y, local_nm[0], MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-	//MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
 	if (rank==0) comm_t = MPI_Wtime() - comm_t;
 
 	if (rank == 0) {
diff --git a/GPUs/cuda_SingleGPU.cu b/GPUs/cuda_SingleGPU.cu
index db61050..f580485 100644
--- a/GPUs/cuda_SingleGPU.cu
+++ b/GPUs/cuda_SingleGPU.cu
@@ -28,17 +28,10 @@ int main(int argc, char **argv)
 {
 	/* Initializations */
 	int i, j, n, m;
-	int *I, *cooCol, n_z, sparse=0;
-	double *cooVal, timer;
+	double timer;
 
 
-	/* File Input to COO */
-	if (argc < 2) error("Too few Arguments");
-	else if ( argc == 2) /* ./Program Input_File */
-	{
-		if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
-		sparse = 1;
-	}
+	if (argc < 3) error("Usage: ./Program N M");
 	else if ( argc == 3) { /*./Program N M */
 		n = atoi(argv[1]);
 		m = atoi(argv[2]);
@@ -48,6 +41,7 @@ int main(int argc, char **argv)
 	int grid_size = (n-1)/block_size + 1;
 	size_t shmem_size = 0;
 
+	/* GPU kernel block/grid sizes */
 	dim3 gpu_block(block_size, 1);
 	dim3 gpu_grid(grid_size, 1);
 
@@ -60,12 +54,7 @@ int main(int argc, char **argv)
 	if( !y || !x || !M ) error("memory allocation failed");
 
 	/* Initialize matrices */
-	if (sparse) {
-		; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
-	}
-	else ser_matrix_init_rand(M, n, m, 1.0); /* Normal matrices generated randomly */
-
-
+	ser_matrix_init_rand(M, n, m, 1.0); /* Normal matrices generated randomly */
 
 	/* Initialize vectors */
 	vec_init_rand(x, m, 1.0);
@@ -90,7 +79,6 @@ int main(int argc, char **argv)
 	for (i = 0; i < m; i++) x_c[i] = x[i];
 
 	/* First naive kernel */
-	for ( i = 0; i < n*m; i++) A[i] = M[i] ;
 	timer=csecond();
 	for (j = 0; j < NR_ITER; ++j) {
diff --git a/MPI/MPI-OpenMP.c b/MPI/MPI-OpenMP.c
index 22e2530..372e79b 100644
--- a/MPI/MPI-OpenMP.c
+++ b/MPI/MPI-OpenMP.c
@@ -20,23 +20,18 @@ int main(int argc, char ** argv) {
 	int rank,size;
 	int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
 	int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
-	int i,j,k, sparse=0, *cooCol, n_z, *I;
-	double * M, * A, * x, * y, *local_y, * cooVal, comm_t, comp_t;
+	int i,j,k;
+	double * M, * A, * x, * y, *local_y, comm_t, comp_t;
 
 	/* MPI basic initializations */
 	MPI_Init(&argc,&argv);
 	MPI_Comm_size(MPI_COMM_WORLD,&size);
 	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
 
-	if (argc < 2) error("Too few Arguments");
-	else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
-	{
-		if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed");
-		sparse = 1;
-	}
-	else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
-		global_nm[0]=atoi(argv[1]);
-		global_nm[1]=atoi(argv[2]);
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		global_nm[0] = atoi(argv[1]);
+		global_nm[1] = atoi(argv[2]);
 	}
 	else error("Too many Arguments");
 
@@ -62,10 +57,7 @@ int main(int argc, char ** argv) {
 		if( !y || !x || !M ) error("memory allocation failed");
 
 		/* Initialize matrices */
-		if (sparse) {
-			; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
-		}
-		else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
+		ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
 	}
 
 
diff --git a/MPI/MPI.c b/MPI/MPI.c
index 2741b86..ac56f49 100644
--- a/MPI/MPI.c
+++ b/MPI/MPI.c
@@ -20,27 +20,22 @@ int main(int argc, char ** argv) {
 	int rank,size;
 	int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
 	int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
-	int i,j,k, sparse=0, *cooCol, n_z, *I;
-	double * M, * A, * x, * y, *local_y, * cooVal, comm_t, comp_t;
+	int i,j,k;
+	double * M, * A, * x, * y, *local_y, comm_t, comp_t;
 
 	/* MPI basic initializations */
 	MPI_Init(&argc,&argv);
 	MPI_Comm_size(MPI_COMM_WORLD,&size);
 	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
 
-	if (argc < 2) error("Too few Arguments");
-	else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
-	{
-		if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed");
-		sparse = 1;
-	}
-	else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
-		global_nm[0]=atoi(argv[1]);
-		global_nm[1]=atoi(argv[2]);
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		global_nm[0] = atoi(argv[1]);
+		global_nm[1] = atoi(argv[2]);
 	}
 	else error("Too many Arguments");
 
 
-	/* Padd N if needed */
+	/* Pad N so it can be split into 'size' equal pieces */
 	local_nm[1]=global_nm[1];
 	global_padded_nm[1]=global_nm[1];
@@ -62,10 +57,7 @@ int main(int argc, char ** argv) {
 		if( !y || !x || !M ) error("memory allocation failed");
 
 		/* Initialize matrices */
-		if (sparse) {
-			; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
-		}
-		else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
+		ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
 	}
 
 
diff --git a/MPI/MPI.slurm b/MPI/MPI.slurm
index deaf636..34eedc7 100644
--- a/MPI/MPI.slurm
+++ b/MPI/MPI.slurm
@@ -11,8 +11,8 @@
 #SBATCH --job-name=run_mpi # Job name
 #SBATCH --output=MPI.out
 #SBATCH --error=MPI.err
-#SBATCH --ntasks=4 # Number of processor cores (i.e. tasks)
-#SBATCH --nodes=4 # Number of nodes requested
+#SBATCH --ntasks=16 # Number of processor cores (i.e. tasks)
+#SBATCH --nodes=16 # Number of nodes requested
 #SBATCH --ntasks-per-node=1 # Tasks per node
 #SBATCH --cpus-per-task=20 # Threads per task
 #SBATCH --time=00:10:00 # walltime
@@ -43,11 +43,11 @@ export OMP_PROC_BIND=spread # OpenMP thread affinity variable
 
 for n;
 do
-	srun $gpu_prog $n $n
+	srun $gpu_prog $n $n >> mpi.out
 	for tr in 1 2 5 10 20 # Run for different OpenMP thread numbers ( tr <= cpus-per-task )
 	do
 		export OMP_NUM_THREADS=$tr
-		srun $gpu_prog1 $n $n
+		srun $gpu_prog1 $n $n >> mpi.out
 	done
 done
 
diff --git a/OpenMP/OpenMP.c b/OpenMP/OpenMP.c
index 642d321..4fb7725 100644
--- a/OpenMP/OpenMP.c
+++ b/OpenMP/OpenMP.c
@@ -22,16 +22,10 @@ int main(int argc, char **argv)
 {
 	/* Initializations */
 	int i, j, k, n, m;
-	int *I, *cooCol, n_z, sparse=0;
-	double *cooVal, timer;
+	double timer;
 
-	if (argc < 2) error("Too few Arguments");
-	else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
-	{
-		if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
-		sparse = 1;
-	}
-	else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
 		n = atoi(argv[1]);
 		m = atoi(argv[2]);
 	}
@@ -45,10 +39,7 @@ int main(int argc, char **argv)
 	if( !y || !x || !M ) error("memory allocation failed");
 
 	/* Initialize matrices */
-	if (sparse) {
-		regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
-	}
-	else matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+	matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
 
 	/* Initialize vectors */
 	vec_init_rand(x, m, 1.0);
diff --git a/OpenMP/OpenMP_aff.c b/OpenMP/OpenMP_aff.c
index 0dc29aa..9b59ece 100644
--- a/OpenMP/OpenMP_aff.c
+++ b/OpenMP/OpenMP_aff.c
@@ -20,16 +20,10 @@ int main(int argc, char **argv)
 {
 	/* Initializations */
 	int i, j, k, n, m;
-	int *I, *cooCol, n_z, sparse=0;
-	double *cooVal, timer;
+	double timer;
 
-	if (argc < 2) error("Too few Arguments");
-	else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
-	{
-		if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
-		sparse = 1;
-	}
-	else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
 		n = atoi(argv[1]);
 		m = atoi(argv[2]);
 	}
@@ -50,10 +44,7 @@ int main(int argc, char **argv)
 	if( !y || !x || !M ) error("memory allocation failed");
 
 	/* Initialize matrices */
-	if (sparse) {
-		; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
-	}
-	else ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+	ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
 
 	/* Initialize vectors */
 	vec_init_rand(x, m, 1.0);
diff --git a/Serial/Serial.c b/Serial/Serial.c
index a6dbfc6..66720c5 100644
--- a/Serial/Serial.c
+++ b/Serial/Serial.c
@@ -17,17 +17,10 @@ int main(int argc, char **argv)
 {
 	/* Initializations */
 	int i, j, k, n, m;
-	int *I, *cooCol, n_z, sparse=0;
-	double *cooVal, timer;
+	double timer;
 
-
-	if (argc < 2) error("Too few Arguments");
-	else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
-	{
-		if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
failed"); - sparse = 1; - } - else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */ + if (argc < 3) error("Usage: ./Program N M"); + else if ( argc == 3) { /*./Program N M */ n = atoi(argv[1]); m = atoi(argv[2]); } @@ -41,10 +34,7 @@ int main(int argc, char **argv) if( !y || !x || !M ) error("memory allocation failed"); /* Initialize matrices */ - if (sparse) { - regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */ - } - else matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */ + matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */ /* Initialize vectors */ vec_init_rand(x, m, 1.0); -- GitLab