Commit 79a37947 authored by petros.anastasiadis's avatar petros.anastasiadis

Update 25/09/2017

parent f25189ff
@@ -36,9 +36,10 @@ output="/users/guest/petyros/Training/Outputs" ##/Inputs
 partition="gpu"
 ## Change this to the directory of your executable!
 gpu_prog="/users/guest/petyros/Training/GPUs/cuBLAS"
-rm -f "$output/Single_GPU.$partition"
+gpu_prog1="/users/guest/petyros/Training/GPUs/cuBLAS_MultiGPU"
+rm -f "$output/Single_GPU.$partition" "$output/Multi_GPU.$partition"
 for n;
 do
 	srun $gpu_prog $n $n >> "$output/Single_GPU.$partition"
+	srun $gpu_prog1 $n $n >> "$output/Multi_GPU.$partition"
 done
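Note: "for n;" is shell shorthand for "for n in "$@"", so both executables run once for every matrix size passed as a positional argument. A hypothetical submission (the actual script name is not shown in this diff) would be

    sbatch GPU_run.sh 1000 5000 10000

which appends one timing line per size to Single_GPU.gpu and one to Multi_GPU.gpu.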
@@ -10,14 +10,16 @@ CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
 #CFLAGS=-O3 -Wall -xCORE-AVX2
 #ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I
+MPI_PREFIX = $(I_MPI_ROOT)
 CUDA_PREFIX = $(CUDAROOT)
 GPU_OPENMP_CXX = nvcc -Xcompiler -fopenmp -lgomp
+GPU_MPI_CXX = nvcc -L $(I_MPI_ROOT)/lib64 -lmpi
 GPU_CXX = nvcc
 LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt
 GPU_COMPILE = nvcc -I $(CUDA_PREFIX)/include -arch sm_35
-CPU_COMPILE= $(CC) $(CFLAGS)
+GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -arch sm_35
+CPU_COMPILE = $(CC) $(CFLAGS)
 ifeq ($(DEBUG), 1)
 	CPU_COMPILE += -D_DEBUG_
@@ -29,12 +31,15 @@ GPU_COMPILE_OBJ= $(GPU_COMPILE) -c
 EXT_DIR = /users/guest/petyros/Training/External_Functions/
-SOURCE = cuBLAS.cu
+SOURCE = cuBLAS.cu cuBLAS_MultiGPU.cu
 OBJECTS = util.o matrix_op.o timer.o input.o gpu_util.o
-PROGRAMS= cuBLAS
+PROGRAMS= cuBLAS cuBLAS_MultiGPU
 
 all: $(PROGRAMS)
 
+cuBLAS_MultiGPU: $(OBJECTS) cuBLAS_MultiGPU.cu
+	$(GPU_MPI_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) $@.cu
+
 cuBLAS: $(OBJECTS) cuBLAS.cu
 	$(GPU_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) $@.cu
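Note: with these rules, plain "make" builds both programs. The new cuBLAS_MultiGPU target is compiled and linked by nvcc directly, taking the MPI library from $(I_MPI_ROOT)/lib64; I_MPI_ROOT and CUDAROOT are assumed to be provided by the cluster environment (e.g. loaded modules). Judging from the visible ifeq block, "make DEBUG=1" adds -D_DEBUG_ so the debug-output sections in the sources below are compiled in.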
@@ -38,6 +38,7 @@ int main(int argc, char **argv)
 	}
 	else error("Too many Arguments");
 
+	/* Allocate space */
 	double *x = (double *) malloc(m * sizeof(*x));
 	double *y = (double *) malloc(n * sizeof(*y));
 	double *M = (double *) malloc(n * m * sizeof(*M));
@@ -54,6 +55,7 @@ int main(int argc, char **argv)
 	vec_init_rand(x, m, 1.0);
 	vec_init(y, n, 0.0);
 
+	/* Initialize cuda/cublas variables */
 	int device_num=0;
 	cudaGetDeviceCount(&device_num);
 	if (!device_num) printf("No available Cuda Devices");
@@ -64,6 +66,8 @@ int main(int argc, char **argv)
 	cublasStatus_t stat;
 	cublasHandle_t handle;
 	double *A, * y, *x_c;
+
+	/* Initialize Unified memory */
 	cudaMallocManaged(&A, m*n * sizeof(double));
 	cudaMallocManaged(&y, n * sizeof(double));
 	cudaMallocManaged(&x_c, m * sizeof(double));
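Both programs rely on the cudaCheckErrors() helper from gpu_util.h, whose implementation is not part of this commit. For orientation only, a minimal sketch of the usual pattern behind such a helper (the real one in External_Functions may differ):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Sketch of a cudaCheckErrors-style macro: abort with a message if the
 * most recent CUDA runtime call failed. */
#define cudaCheckErrors(msg) do { \
	cudaError_t e = cudaGetLastError(); \
	if (e != cudaSuccess) { \
		fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(e)); \
		exit(EXIT_FAILURE); \
	} \
} while (0)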
@@ -86,12 +90,13 @@ int main(int argc, char **argv)
 	timer = csecond() - timer;
 	cudaCheckErrors("cublasDgemv failed");
 
-#ifdef _DEBUG_ /* Output y in a file for debug purposes */
-	FILE * fp;
-	char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS.out" ;
-	if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
-	for (k = 0; k < n; ++k) fprintf(fp, "%lf ", y[k]) ;
-	fclose(fp) ;
+#ifdef _DEBUG_
+	/* Output y vector to a file for debugging */
+	FILE * fp;
+	char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS.out" ;
+	if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
+	for (k = 0; k < n; ++k) fprintf(fp, "%lf ", y[k]) ;
+	fclose(fp) ;
 #endif
 	report_results(timer);
 }
/*
 * A Multi-GPU (CUDA + MPI) implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*/
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse_v2.h>
#include <mpi.h>
#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
#include "/users/guest/petyros/Training/External_Functions/util.h"
#include "/users/guest/petyros/Training/External_Functions/input.h"
#include "/users/guest/petyros/Training/External_Functions/gpu_util.h"
int main(int argc, char ** argv)
{
int rank,size;
int global_nm[2],local_nm[2]; //global and per-rank (local) matrix dimensions: [0] = rows (N), [1] = columns (M)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
int i,j,k, sparse=0, *cooCol, n_z, *I;
double * M, *M_cl, * A, * x, * y, *local_y, *x_c, * cooVal, comm_t, comp_t;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
{
if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO conversion failed");
sparse = 1;
}
else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
global_nm[0]=atoi(argv[1]);
global_nm[1]=atoi(argv[2]);
}
else error("Too many Arguments");
/* Pad N (rows) if needed, so it divides evenly among the MPI ranks */
local_nm[1]=global_nm[1];
global_padded_nm[1]=global_nm[1];
if (global_nm[0]%size==0) {
local_nm[0]=global_nm[0]/size;
global_padded_nm[0]=global_nm[0];
}
else {
local_nm[0]=(global_nm[0]/size)+1;
global_padded_nm[0]=local_nm[0]*size;
}
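The padding arithmetic, as a worked example: with N = 10000 rows on size = 3 ranks, 10000 % 3 != 0, so every rank gets 10000/3 + 1 = 3334 rows and the global matrix is padded to 3334 * 3 = 10002 rows (the trailing argument of ser_matrix_init_rand_p below is presumably the number of padding elements to initialize). The same computation as a standalone sketch:

/* Sketch: row-block partition with padding, N rows over P ranks */
int rows_per_rank(int N, int P) { return (N % P == 0) ? N / P : N / P + 1; }
int padded_rows(int N, int P) { return rows_per_rank(N, P) * P; }
/* e.g. N = 10000, P = 3 -> rows_per_rank = 3334, padded_rows = 10002 */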
x = (double *) malloc(global_padded_nm[1] * sizeof(*x));
if (rank==0) {
M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M));
M_cl = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M_cl));
vec_init_rand(x, global_padded_nm[1], 1.0);
y = (double *) malloc(global_padded_nm[0] * sizeof(*y));
vec_init(y, global_padded_nm[0], 0.0);
if( !y || !x || !M || !M_cl ) error("memory allocation failed");
/* Initialize matrices */
if (sparse) {
; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
}
//if(rank==0) printf("local_nm[0]=%d local_nm[1]=%d global_padded_nm[0]=%d global_padded_nm[1]=%d\n",local_nm[0],local_nm[1],global_padded_nm[0],global_padded_nm[1]);
/* Initialize Unified memory */
cudaMallocManaged(&A, local_nm[0] * local_nm[1] * sizeof(double));
cudaMallocManaged(&local_y, local_nm[0] * sizeof(*local_y));
vec_init(local_y, local_nm[0], 0.0);
cudaMallocManaged(&x_c, global_padded_nm[1] * sizeof(double));
cudaDeviceSynchronize();
cudaCheckErrors("Unified Alloc failed");
if ( !A || !local_y || !x_c) error("unified alloc failed");
if (rank == 0) matrix_col_major(M, M_cl, global_padded_nm[0], global_padded_nm[1]); /* cuBLAS expects column-major storage; only rank 0 holds M */
/* Rank 0 scatters the global matrix */
double * gsendbuf = NULL; /* significant only on the root rank */
if (rank == 0){
gsendbuf = &(M_cl[0]);
comm_t= MPI_Wtime();
}
MPI_Scatter(gsendbuf, local_nm[1] * local_nm[0], MPI_DOUBLE, A, local_nm[1] * local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
//if (rank == 0) printf("Scatter complete\n");
MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
for (i = 0; i < global_padded_nm[1]; i++) x_c[i] = x[i];
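At this point the distribution is complete: every rank holds one contiguous local_nm[0] x local_nm[1] block of the column-major matrix (assuming matrix_col_major from External_Functions, not shown here, stores each rank's row block contiguously), and the broadcast vector has been copied into the managed buffer x_c so the device can access it. Each rank now computes its slice of y = A*x independently; the MPI_Gather below reassembles the full result on rank 0.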
/* Initialize cuda/cublas variables */
int device_num=0;
cudaGetDeviceCount(&device_num);
if (!device_num) printf("No available CUDA devices\n");
else {
double alf=1.0; /* local_y = alf*A*x_c + beta*local_y */
double beta=0.0;
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
/* Warmup */
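/* cublasDgemv computes local_y = alf * A * x_c + beta * local_y, where A is
 * the column-major local_nm[0] x local_nm[1] block with lda = local_nm[0].
 * This untimed call warms the GPU up and lets unified-memory pages migrate
 * to the device before the measured iterations start. */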
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, A , local_nm[0], x_c, 1, &beta, local_y, 1);
cudaDeviceSynchronize();
if (rank==0) {
printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", n, m, device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE")) ;
comp_t= MPI_Wtime();
}
for (j = 0; j < NR_ITER; ++j) {
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, A , local_nm[0], x_c, 1, &beta, local_y, 1);
cudaDeviceSynchronize();
}
cudaCheckErrors("cublasDgemv failed");
MPI_Barrier(MPI_COMM_WORLD);
if (rank==0) comp_t= MPI_Wtime() - comp_t;
MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) comm_t = MPI_Wtime() - comm_t - comp_t;
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
if (rank == 0) {
FILE * fp;
char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS_MultiGPU.out" ;
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
for (k = 0; k < global_nm[0]; ++k) fprintf(fp, "%lf ", y[k]) ;
fclose(fp) ;
}
#endif
if (rank == 0) {
report_mpi_results(comm_t, comp_t);
free(M);
free(M_cl);
free(y);
}
cublasDestroy(handle); /* release the cuBLAS context */
}
free(x);
cudaFree(A); /* managed allocations must be released with cudaFree, not free */
cudaFree(local_y);
cudaFree(x_c);
MPI_Finalize();
return 0;
}
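For completeness: under SLURM the multi-GPU binary is launched through srun, as in the updated script at the top of this commit (one MPI rank per task). Outside SLURM a hypothetical equivalent launch would be "mpirun -np 4 ./cuBLAS_MultiGPU 10000 10000", with the two arguments giving the N and M dimensions of the randomly generated matrix.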
Single GPU CUDA Version(N=10000, M=10000): t= 4.058859 ms
Single GPU CUDA Version(N=1000, M=1000): t= 0.092299 ms
Single GPU CUDA Version(N=10000, M=10000): t= 4.060280 ms