Commit 326a4535 authored by petros.anastasiadis

Update 26/09/2017 - Added Hybrid MPI-CUDA Version

parent 79a37947
@@ -11,8 +11,8 @@
#SBATCH --job-name=run_GPU # Job name
#SBATCH --output=J.out # Stdout (%j expands to jobId)
#SBATCH --error=J.err # Stderr (%j expands to jobId)
-#SBATCH --ntasks=1 # Number of processor cores (i.e. tasks)
-#SBATCH --nodes=1 # Number of nodes requested
+#SBATCH --ntasks=4 # Number of processor cores (i.e. tasks)
+#SBATCH --nodes=4 # Number of nodes requested
#SBATCH --ntasks-per-node=1 # Tasks per node
#SBATCH --cpus-per-task=1 # Threads per task
#SBATCH --gres=gpu:1 # GPUs per node
@@ -39,6 +39,7 @@ gpu_prog="/users/guest/petyros/Training/GPUs/cuBLAS"
gpu_prog1="/users/guest/petyros/Training/GPUs/cuBLAS_MultiGPU"
rm -f "$output/Single_GPU.$partition" "$output/Multi_GPU.$partition"
## Important note: for full GPU utilization in the MultiGPU version, gres must match the ntasks-per-node value!
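## e.g. with "#SBATCH --ntasks-per-node=1" above, keep "--gres=gpu:1"; for 2 tasks per node request "--gres=gpu:2"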
for n;
do
srun $gpu_prog1 $n $n >> "$output/Multi_GPU.$partition"
......
@@ -12,18 +12,19 @@ CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
MPI_PREFIX = $(I_MPI_ROOT)
CUDA_PREFIX = $(CUDAROOT)
-GPU_MPI_CXX = nvcc -L $(I_MPI_ROOT)/lib64 -lmpi
+GPU_MPI_CXX = nvcc -L $(I_MPI_ROOT)/lib64 -lmpi -ccbin mpiicc
GPU_CXX = nvcc
LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt
-GPU_COMPILE = nvcc -I $(CUDA_PREFIX)/include -arch sm_35
-GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -arch sm_35
+GPU_COMPILE = nvcc -I $(CUDA_PREFIX)/include -arch sm_35
+GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35
CPU_COMPILE = $(CC) $(CFLAGS)
ifeq ($(DEBUG), 1)
-CPU_COMPILE += -D_DEBUG_
-GPU_COMPILE += -D_DEBUG_
+CPU_COMPILE += -D_DEBUG_
+GPU_COMPILE += -D_DEBUG_
+GPU_MPI_COMPILE += -D_DEBUG_
endif
CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
......
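Note: the updated GPU_MPI_COMPILE rule drives nvcc with Intel MPI's compiler wrapper as the host compiler (-ccbin mpiicc) and adds the MPI include path, so one .cu file can include both mpi.h and cuda_runtime.h. A minimal sketch of such a mixed translation unit (hypothetical file name, not part of this commit) that the rule could build:

/* mpi_cuda_check.cu -- hypothetical sanity check, built roughly as:
 *   nvcc -ccbin mpiicc -I$(CUDA_PREFIX)/include -I$(I_MPI_ROOT)/include \
 *        -L$(CUDA_PREFIX)/lib64 -lcudart mpi_cuda_check.cu -o mpi_cuda_check
 */
#include <stdio.h>
#include <mpi.h>
#include <cuda_runtime.h>

int main(int argc, char **argv)
{
    int rank = 0, size = 1, device_num = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    cudaGetDeviceCount(&device_num);
    /* same rank-to-GPU mapping as cuBLAS_MultiGPU uses below */
    if (device_num > 0) cudaSetDevice(rank % device_num);
    printf("rank %d of %d sees %d CUDA device(s)\n", rank, size, device_num);
    MPI_Finalize();
    return 0;
}

Running it with the SLURM settings above (one task per node, gres=gpu:1) confirms the rank-to-GPU mapping before launching cuBLAS_MultiGPU.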
@@ -44,24 +44,24 @@ int main(int argc, char ** argv)
}
else error("Too many Arguments");
/* Pad N if needed */
local_nm[1]=global_nm[1];
global_padded_nm[1]=global_nm[1];
/* Pad M if needed */
local_nm[0]=global_nm[0];
global_padded_nm[0]=global_nm[0];
if (global_nm[0]%size==0) {
local_nm[0]=global_nm[0]/size;
global_padded_nm[0]=global_nm[0];
if (global_nm[1]%size==0) {
local_nm[1]=global_nm[1]/size;
global_padded_nm[1]=global_nm[1];
}
else {
local_nm[0]=(global_nm[0]/size)+1;
global_padded_nm[0]=local_nm[0]*size;
local_nm[1]=(global_nm[1]/size)+1;
global_padded_nm[1]=local_nm[1]*size;
}
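/* e.g. with size=4 ranks: a dimension of 10000 gives chunks of 2500 with no padding, while 10001 gives chunks of 2501 padded up to 10004 */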
x = (double *) malloc(global_padded_nm[1] * sizeof(*x));
if (rank==0) {
M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M));
M_cl = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M_cl));
vec_init_rand(x, global_padded_nm[1], 1.0);
vec_init_rand_p(x, global_nm[1], global_padded_nm[1] - global_nm[1], 1.0);
y = (double *) malloc(global_padded_nm[0] * sizeof(*y));
vec_init(y, global_padded_nm[0], 0.0);
if( !y || !x || !M || !M_cl ) error("memory allocation failed");
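/* buffers use the padded sizes so every rank later gets an equally sized chunk in MPI_Scatter, with no special-casing of the last rank */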
@@ -72,22 +72,19 @@ int main(int argc, char ** argv)
}
else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
}
//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local[0],local[1],global_padded[0],global_padded[1]);
//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local_nm[0],local_nm[1],global_padded_nm[0],global_padded_nm[1]);
/* Initialize Unified memory */
cudaMallocManaged(&A, local_nm[0] * local_nm[1] * sizeof(double));
cudaMallocManaged(&local_y, local_nm[0] * sizeof(*local_y));
/* Initialize process local memory */
local_y = (double *) malloc(local_nm[0] * sizeof(*local_y));
vec_init(local_y, local_nm[0], 0.0);
A = (double *) malloc(local_nm[0] * local_nm[1] * sizeof(*A));
x_c = (double *) malloc(local_nm[1] * sizeof(*x_c));
if ( !A || !local_y || !x_c) error("Process local alloc failed");
if(rank == 0) matrix_col_major(M, M_cl, global_padded_nm[0], global_padded_nm[1]);
cudaMallocManaged(&x_c, m * sizeof(double));
cudaDeviceSynchronize();
cudaCheckErrors("Unified Alloc failed");
if ( !A || !y || !x_c) error("unified alloc failed");
matrix_col_major(M, M_cl, n, m);
/* Rank 0 scatters the global matrix */
/* Rank 0 scatters the global matrix and x vector */
double * gsendbuf;
if (rank == 0){
gsendbuf = &(M_cl[0]);
@@ -97,43 +94,57 @@ int main(int argc, char ** argv)
MPI_Scatter(gsendbuf, local_nm[1] * local_nm[0], MPI_DOUBLE, A, local_nm[1] * local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
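/* rank 0 sends equal contiguous chunks of the column-major padded matrix: local_nm[0]*local_nm[1] doubles per rank */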
//if (rank == 0) printf("Scatter complete\n");
MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
for (i = 0; i < m; i++) x_c[i] = x[i];
if (rank == 0) comm_t= MPI_Wtime() - comm_t;
for (i = 0; i < local_nm[1]; i++) x_c[i] = x[rank*local_nm[1] + i];
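/* each rank keeps only the local_nm[1]-long slice of x that multiplies its own columns of the matrix */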
/* Initialize cuda/cublas variables */
int device_num=0;
cudaGetDeviceCount(&device_num);
if (!device_num) printf("No available Cuda Devices");
else {
cudaSetDevice(rank%device_num);
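/* one MPI task per node with gres=gpu:1 (see the SLURM script), so rank % device_num selects that node's GPU */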
double alf=1.0; /* Y=a*A*x+b */
double beta=0.0;
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
/* Initialize local GPU memory */
double * gpu_y = (double *) gpu_alloc(local_nm[0] * sizeof(*gpu_y)) ;
double * gpu_xc = (double *) gpu_alloc(local_nm[1] * sizeof(*gpu_xc)) ;
double * gpu_A = (double *) gpu_alloc(local_nm[0] * local_nm[1] * sizeof(*gpu_A)) ;
copy_to_gpu(local_y, gpu_y, local_nm[0] * sizeof(*local_y));
copy_to_gpu(x_c, gpu_xc, local_nm[1] * sizeof(*x_c));
copy_to_gpu(A, gpu_A, local_nm[0] * local_nm[1] * sizeof(*A));
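/* gpu_alloc/copy_to_gpu are project helpers (presumably thin wrappers over cudaMalloc/cudaMemcpy); explicit device buffers replace the earlier cudaMallocManaged unified-memory allocations */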
/* Warmup */
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, A , local_nm[0], x_c, 1, &beta, local_y, 1);
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cudaDeviceSynchronize();
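/* cublasDgemv with CUBLAS_OP_N computes y = alf*A*x + beta*y on the local block; A is column-major with leading dimension local_nm[0] */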
if (rank==0) {
printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", n, m, device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE")) ;
printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", local_nm[0], local_nm[1], device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE")) ;
comp_t= MPI_Wtime();
}
for (j = 0; j < NR_ITER; ++j) {
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, A , local_nm[0], x_c, 1, &beta, local_y, 1);
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cudaDeviceSynchronize();
}
cudaCheckErrors("cublasDgemv failed");
MPI_Barrier(MPI_COMM_WORLD);
if (rank==0) comp_t= MPI_Wtime() - comp_t;
MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) comm_t = MPI_Wtime() - comm_t - comp_t;
copy_from_gpu(local_y, gpu_y, local_nm[0] * sizeof(*local_y));
cudaDeviceSynchronize();
MPI_Barrier(MPI_COMM_WORLD);
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
if (rank==0) comm_t= MPI_Wtime() - comm_t;
MPI_Reduce(local_y, y, local_nm[0], MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
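/* with the matrix split by columns, every rank holds a full-length partial y, so MPI_SUM on rank 0 combines the contributions element-wise */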
//MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) comm_t = MPI_Wtime() - comm_t;
if (rank == 0) {
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
FILE * fp;
char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS_MultiGPU.out" ;
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
@@ -142,11 +153,14 @@ int main(int argc, char ** argv)
#endif
report_mpi_results(comm_t, comp_t);
free(M);
free(M_cl);
free(y);
free(x);
}
gpu_free(local_y);
gpu_free(A);
gpu_free(x_c);
}
free(x);
free(local_y);
free(A);
MPI_Finalize();
return 0;
......
@@ -103,11 +103,10 @@ int main(int argc, char ** argv) {
MPI_Barrier(MPI_COMM_WORLD);
if (rank==0) comp_t= MPI_Wtime() - comp_t;
MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) comm_t = MPI_Wtime() - comm_t - comp_t;
if (rank==0) {
comm_t = MPI_Wtime() - comm_t - comp_t;
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
if (rank == 0) {
/* Output y vector to a file for debugging */
FILE * fp;
char * filename = "/users/guest/petyros/Training/Outputs/Debug/MPI-OpenMP.out" ;
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
......
@@ -98,9 +98,9 @@ int main(int argc, char ** argv) {
MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) comm_t = MPI_Wtime() - comm_t - comp_t;
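/* comm_t presumably holds an MPI_Wtime() stamp taken before the scatter (above this hunk); subtracting comp_t leaves the communication share of the elapsed time */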
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
if (rank == 0) {
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
FILE * fp;
char * filename = "/users/guest/petyros/Training/Outputs/Debug/MPI.out" ;
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
......
Single GPU CUDA Version(N=10000, M=10000): t= 4.058859 ms
Multi GPU CUDA-MPI Version(N=10000, M=2500, GPUs/Node=1, Nodes=4, Tasks/Node=1): comp_t= 1.087799 ms, comm_t= 213.823795 ms