diff --git a/GPUs/S_GPU_cuB.slurm b/GPUs/GPU.slurm similarity index 86% rename from GPUs/S_GPU_cuB.slurm rename to GPUs/GPU.slurm index a7c17469e54c5d9ec968760d2f68bc9543c3a893..62b325c501de332c5c626520c83dd2fa7aa31342 100644 --- a/GPUs/S_GPU_cuB.slurm +++ b/GPUs/GPU.slurm @@ -36,9 +36,10 @@ output="/users/guest/petyros/Training/Outputs" ##/Inputs partition="gpu" ## Change this to the directory of your executable! gpu_prog="/users/guest/petyros/Training/GPUs/cuBLAS" -rm -f "$output/Single_GPU.$partition" +gpu_prog1="/users/guest/petyros/Training/GPUs/cuBLAS_MultiGPU" +rm -f "$output/Single_GPU.$partition" "$output/Multi_GPU.$partition" for n; do - srun $gpu_prog $n $n >> "$output/Single_GPU.$partition" + srun $gpu_prog1 $n $n >> "$output/Multi_GPU.$partition" done diff --git a/GPUs/Makefile b/GPUs/Makefile index 9848d62998fc1f6bd2ba751d02f6b862d35e3567..375faa97aca2d127018f73a7819c6ea826543efd 100644 --- a/GPUs/Makefile +++ b/GPUs/Makefile @@ -10,14 +10,16 @@ CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp #CFLAGS=-O3 -Wall -xCORE-AVX2 #ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I +MPI_PREFIX = $(I_MPI_ROOT) CUDA_PREFIX = $(CUDAROOT) -GPU_OPENMP_CXX = nvcc -Xcompiler -fopenmp -lgomp +GPU_MPI_CXX = nvcc -L $(I_MPI_ROOT)/lib64 -lmpi GPU_CXX = nvcc LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt GPU_COMPILE = nvcc -I $(CUDA_PREFIX)/include -arch sm_35 -CPU_COMPILE= $(CC) $(CFLAGS) +GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -arch sm_35 +CPU_COMPILE = $(CC) $(CFLAGS) ifeq ($(DEBUG), 1) CPU_COMPILE += -D_DEBUG_ @@ -29,12 +31,15 @@ GPU_COMPILE_OBJ= $(GPU_COMPILE) -c EXT_DIR = /users/guest/petyros/Training/External_Functions/ -SOURCE = cuBLAS.cu +SOURCE = cuBLAS.cu cuBLAS_MultiGPU.cu OBJECTS = util.o matrix_op.o timer.o input.o gpu_util.o -PROGRAMS= cuBLAS +PROGRAMS= cuBLAS cuBLAS_MultiGPU all: $(PROGRAMS) +cuBLAS_MultiGPU: $(OBJECTS) cuBLAS_MultiGPU.cu + $(GPU_MPI_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) $@.cu + cuBLAS: $(OBJECTS) cuBLAS.cu $(GPU_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) $@.cu diff --git a/GPUs/cuBLAS b/GPUs/cuBLAS index a4f89ae43261d12356d9200e91dca87bd8a62178..2fdcdd01c33867b92a5f788e60607b6575ea6294 100755 Binary files a/GPUs/cuBLAS and b/GPUs/cuBLAS differ diff --git a/GPUs/cuBLAS.cu b/GPUs/cuBLAS.cu index 02408daf02c787394be7f39ba7953b9eb7a8fbfb..dfffef9920e6faf48a64a059c3692f8684a6025a 100644 --- a/GPUs/cuBLAS.cu +++ b/GPUs/cuBLAS.cu @@ -38,6 +38,7 @@ int main(int argc, char **argv) } else error("Too many Arguments"); + /* Allocate space */ double *x = (double *) malloc(m * sizeof(*x)); double *y = (double *) malloc(n * sizeof(*y)); double *M = (double *) malloc(n * m * sizeof(*M)); @@ -54,6 +55,7 @@ int main(int argc, char **argv) vec_init_rand(x, m, 1.0); vec_init(y, n, 0.0); + /* Initialize cuda/cublas variables */ int device_num=0; cudaGetDeviceCount(&device_num); if (!device_num) printf("No available Cuda Devices"); @@ -64,6 +66,8 @@ int main(int argc, char **argv) cublasStatus_t stat; cublasHandle_t handle; double *A, * y, *x_c; + + /* Initialize Unified memmory */ cudaMallocManaged(&A, m*n * sizeof(double)); cudaMallocManaged(&y, n * sizeof(double)); cudaMallocManaged(&x_c, m * sizeof(double)); @@ -86,12 +90,13 @@ int main(int argc, char **argv) timer = csecond() - timer; cudaCheckErrors("cublasDgemv failed"); -#ifdef _DEBUG_ /* Output y in a file for debug purposes */ - FILE * fp; - char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS.out" ; - if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n"); - for (k = 0; k < n; ++k) fprintf(fp, "%lf ", y[k]) ; - fclose(fp) ; +#ifdef _DEBUG_ + /* Output y vector to a file for debugging */ + FILE * fp; + char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS.out" ; + if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n"); + for (k = 0; k < n; ++k) fprintf(fp, "%lf ", y[k]) ; + fclose(fp) ; #endif report_results(timer); } diff --git a/GPUs/cuBLAS_MultiGPU b/GPUs/cuBLAS_MultiGPU new file mode 100755 index 0000000000000000000000000000000000000000..d7cf7745cfa6d457b305458f378f46353e618113 Binary files /dev/null and b/GPUs/cuBLAS_MultiGPU differ diff --git a/GPUs/cuBLAS_MultiGPU.cu b/GPUs/cuBLAS_MultiGPU.cu new file mode 100644 index 0000000000000000000000000000000000000000..05bbb74f0e897b48842ae15d4fd4a8e249a2c8c5 --- /dev/null +++ b/GPUs/cuBLAS_MultiGPU.cu @@ -0,0 +1,156 @@ +/* + * A Serial implementation of the Matrix-Vector multiplication + * + * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "/users/guest/petyros/Training/External_Functions/matrix_op.h" +#include "/users/guest/petyros/Training/External_Functions/util.h" +#include "/users/guest/petyros/Training/External_Functions/input.h" +#include "/users/guest/petyros/Training/External_Functions/gpu_util.h" + + + +int main(int argc, char ** argv) +{ + int rank,size; + int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain) + int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global) + int i,j,k, sparse=0, *cooCol, n_z, *I; + double * M, *M_cl, * A, * x, * y, *local_y, *x_c, * cooVal, comm_t, comp_t; + + + MPI_Init(&argc,&argv); + MPI_Comm_size(MPI_COMM_WORLD,&size); + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + + if (argc < 2) error("Too few Arguments"); + else if ( argc == 2) /* ./Program Input_File -> File Input to COO */ + { + if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed"); + sparse = 1; + } + else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */ + global_nm[0]=atoi(argv[1]); + global_nm[1]=atoi(argv[2]); + } + else error("Too many Arguments"); + + /* Padd N if needed */ + local_nm[1]=global_nm[1]; + global_padded_nm[1]=global_nm[1]; + + if (global_nm[0]%size==0) { + local_nm[0]=global_nm[0]/size; + global_padded_nm[0]=global_nm[0]; + } + else { + local_nm[0]=(global_nm[0]/size)+1; + global_padded_nm[0]=local_nm[0]*size; + } + + x = (double *) malloc(global_padded_nm[1] * sizeof(*x)); + if (rank==0) { + M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M)); + M_cl = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M_cl)); + vec_init_rand(x, global_padded_nm[1], 1.0); + y = (double *) malloc(global_padded_nm[0] * sizeof(*y)); + vec_init(y, global_padded_nm[0], 0.0); + if( !y || !x || !M || !M_cl ) error("memory allocation failed"); + + /* Initialize matrices */ + if (sparse) { + ; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */ + } + else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */ + } + //if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local[0],local[1],global_padded[0],global_padded[1]); + + + /* Initialize Unified memmory */ + cudaMallocManaged(&A, local_nm[0] * local_nm[1] * sizeof(double)); + + cudaMallocManaged(&local_y, local_nm[0] * sizeof(*local_y)); + vec_init(local_y, local_nm[0], 0.0); + + cudaMallocManaged(&x_c, m * sizeof(double)); + cudaDeviceSynchronize(); + cudaCheckErrors("Unified Alloc failed"); + if ( !A || !y || !x_c) error("unified alloc failed"); + matrix_col_major(M, M_cl, n, m); + + /* Rank 0 scatters the global matrix */ + double * gsendbuf; + if (rank == 0){ + gsendbuf = &(M_cl[0]); + comm_t= MPI_Wtime(); + } + + MPI_Scatter(gsendbuf, local_nm[1] * local_nm[0], MPI_DOUBLE, A, local_nm[1] * local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD); + //if (rank == 0) printf("Scatter complete\n"); + MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD); + for (i = 0; i < m; i++) x_c[i] = x[i]; + + /* Initialize cuda/cublas variables */ + int device_num=0; + cudaGetDeviceCount(&device_num); + if (!device_num) printf("No available Cuda Devices"); + else { + + double alf=1.0; /* Y=a*A*x+b */ + double beta=0.0; + cublasStatus_t stat; + cublasHandle_t handle; + stat = cublasCreate(&handle); + + /* Warmup */ + stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, A , local_nm[0], x_c, 1, &beta, local_y, 1); + cudaDeviceSynchronize(); + + if (rank==0) { + printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", n, m, device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE")) ; + comp_t= MPI_Wtime(); + } + + for (j = 0; j < NR_ITER; ++j) { + stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, A , local_nm[0], x_c, 1, &beta, local_y, 1); + cudaDeviceSynchronize(); + } + cudaCheckErrors("cublasDgemv failed"); + + MPI_Barrier(MPI_COMM_WORLD); + if (rank==0) comp_t= MPI_Wtime() - comp_t; + MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD); + if (rank==0) comm_t = MPI_Wtime() - comm_t - comp_t; + +#ifdef _DEBUG_ + /* Output y vector to a file for debugging */ + if (rank == 0) { + FILE * fp; + char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS_MultiGPU.out" ; + if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n"); + for (k = 0; k < global_nm[0]; ++k) fprintf(fp, "%lf ", y[k]) ; + fclose(fp) ; +#endif + report_mpi_results(comm_t, comp_t); + free(M); + free(y); + } + free(x); + free(local_y); + free(A); + + MPI_Finalize(); + return 0; +} + + + diff --git a/GPUs/gpu_util.o b/GPUs/gpu_util.o index 7fa63c29355575a2adfd7d780dcf4607ffe8f4d5..fe0d98b88d49068ba0415350cc29cec8b89f16cb 100644 Binary files a/GPUs/gpu_util.o and b/GPUs/gpu_util.o differ diff --git a/GPUs/input.o b/GPUs/input.o index 6151b28af5a1adaddbc320baafafc7a6e77cae1f..3e045d3080b9c47c4c2e73b225b8229efef46427 100644 Binary files a/GPUs/input.o and b/GPUs/input.o differ diff --git a/GPUs/matrix_op.o b/GPUs/matrix_op.o index 3bf0cbda49d8badffe50c1522fb9f7894f54f8fc..fe41f317bd01913a48c353e014bb98f244977c4b 100644 Binary files a/GPUs/matrix_op.o and b/GPUs/matrix_op.o differ diff --git a/GPUs/timer.o b/GPUs/timer.o index 413acb8cb7879b51ebf1595766945e62cf0869e2..eb6db0f8036cd34a6c8b544aa26049d26f0de053 100644 Binary files a/GPUs/timer.o and b/GPUs/timer.o differ diff --git a/GPUs/util.o b/GPUs/util.o index 26ba20a666e79a65dc63bc46da818b49938c90ab..c7553f6c77631def65ffd99e2827628cbd03ddda 100644 Binary files a/GPUs/util.o and b/GPUs/util.o differ diff --git a/MPI/MPI b/MPI/MPI deleted file mode 100755 index e471b22f4e92ecacbe5a072943d5cc1dd67b3839..0000000000000000000000000000000000000000 Binary files a/MPI/MPI and /dev/null differ diff --git a/MPI/MPI-OpenMP b/MPI/MPI-OpenMP deleted file mode 100755 index 83b9b416469611a79fdda1d54630a93a4a7ef1d4..0000000000000000000000000000000000000000 Binary files a/MPI/MPI-OpenMP and /dev/null differ diff --git a/MPI/input.o b/MPI/input.o deleted file mode 100644 index 43c907c26e26ddf12b95de99eeb502441bf6f9df..0000000000000000000000000000000000000000 Binary files a/MPI/input.o and /dev/null differ diff --git a/MPI/matrix_op.o b/MPI/matrix_op.o deleted file mode 100644 index dec6c20d24f531a2d445d5435244f3b28883b45f..0000000000000000000000000000000000000000 Binary files a/MPI/matrix_op.o and /dev/null differ diff --git a/MPI/timer.o b/MPI/timer.o deleted file mode 100644 index da3a10d0d4a7907c4fb78c303e360254b6c5442e..0000000000000000000000000000000000000000 Binary files a/MPI/timer.o and /dev/null differ diff --git a/MPI/util.o b/MPI/util.o deleted file mode 100644 index 0ae3366f4fe19c1f4f84d7fba90e7b9314ae63db..0000000000000000000000000000000000000000 Binary files a/MPI/util.o and /dev/null differ diff --git a/Outputs/Multi_GPU.gpu b/Outputs/Multi_GPU.gpu new file mode 100644 index 0000000000000000000000000000000000000000..2910fc56f50ce845c16d2f96323adb41d4f7a91c --- /dev/null +++ b/Outputs/Multi_GPU.gpu @@ -0,0 +1 @@ +Single GPU CUDA Version(N=10000, M=10000): t= 4.058859 ms diff --git a/Outputs/Single_GPU.gpu b/Outputs/Single_GPU.gpu deleted file mode 100644 index 2f9c371bc0d2916cbe375003784bc9ff52854f81..0000000000000000000000000000000000000000 --- a/Outputs/Single_GPU.gpu +++ /dev/null @@ -1,2 +0,0 @@ -Single GPU CUDA Version(N=1000, M=1000): t= 0.092299 ms -Single GPU CUDA Version(N=10000, M=10000): t= 4.060280 ms