Commit d98c1b89 authored by petros.anastasiadis

Update 03/10/2017 - Added CUDA version and .gitignore

parent 2c57323d
*.o
*.exe
*.out
*.err
/*
* dmv.h -- Declarations and definitions related to the DMV
* multiplication kernels.
*
* Copyright (C) 2010-2012, Computing Systems Laboratory (CSLab)
* Copyright (C) 2010-2012, Vasileios Karakasis
*/
#include <stddef.h>
void vec_init(double *v, size_t n, double val);
void vec_init_rand(double *v, size_t n, double max);
void vec_init_rand_p(double *v, size_t n, size_t np, double max);
int vec_equals(const double *v1, const double *v2, size_t n, double eps);
void vec_print(const double *v, size_t n);
void dmv_csr(int * csrPtr, int *csrCol, double * csrVal, double *x, double *ys, int n);
void dmv_serial(float **a, const float *x, float *y, size_t n);
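For orientation, a minimal usage sketch of the vector helpers declared above; the implementations live in External_Functions, and the tolerance value and the [0, max] range assumed for vec_init_rand are illustrative only:

```
#include <stdio.h>
#include <stdlib.h>
#include "dmv.h"

int main(void)
{
	size_t n = 8;
	double *v1 = malloc(n * sizeof(*v1));
	double *v2 = malloc(n * sizeof(*v2));
	if (!v1 || !v2) return EXIT_FAILURE;

	vec_init(v1, n, 1.0);      /* constant vector */
	vec_init_rand(v2, n, 1.0); /* random vector, values bounded by max (assumed) */

	/* compare within a tolerance, as one would compare CPU vs GPU results */
	if (vec_equals(v1, v2, n, 1e-9))
		printf("vectors match\n");
	else
		vec_print(v2, n);

	free(v1);
	free(v2);
	return 0;
}
```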
#!/bin/bash
##############################################
# ARIS slurm script template #
# #
# Submit script: sbatch GPU.slurm n1 n2 ... #
# #
##############################################
#SBATCH --job-name=run_GPU # Job name
#SBATCH --output=GPU.out
#SBATCH --error=GPU.err
#SBATCH --ntasks=1 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=1 # Number of nodes requested
#SBATCH --ntasks-per-node=1 # Tasks per node
#SBATCH --cpus-per-task=1 # Threads per task
#SBATCH --gres=gpu:1 # GPUs per node
#SBATCH --time=00:40:00 # walltime
#SBATCH --mem=32G # memory per NODE
#SBATCH --partition=gpu # Partition
module load cuda
export I_MPI_FABRICS=shm:dapl
output="/users/guest/petyros/Training/Outputs" ##/Inputs
partition="gpu"
## Change this to the directory of your executable!
gpu_prog="/users/guest/petyros/Training/GPUs/cuBLAS"
gpu_prog1="/users/guest/petyros/Training/GPUs/cuBLAS_MultiGPU"
#rm -f "$output/Multi_GPU.$partition" "$output/Single_GPU.$partition"
gpu_prog="./cuda_SingleGPU.exe"
gpu_prog1="./cuBLAS.exe"
gpu_prog2="./cuBLAS_MultiGPU.exe"
for n; ## loop over the matrix sizes n1 n2 ... passed to sbatch
do
srun $gpu_prog $n $n
srun $gpu_prog1 $n $n
# Important note: in the MultiGPU version, gres must equal ntasks-per-node in order to utilize all GPUs!
# srun $gpu_prog2 $n $n
done
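# Example: "sbatch GPU.slurm 10000 20000" runs each enabled executable
# once with N=M=10000 and once with N=M=20000.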
CC=g++
ICC = icc
DEBUG ?= 0 # Set to 1 for debug
CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
#CFLAGS=-O3 -lm -Wall -mavx2 -mfma -march=haswell -mtune=haswell
#CFLAGS=-O3 -Wall -xCORE-AVX2
#ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I
# The External_Functions directory must be added with -I for the user-defined helpers to resolve
EXT_DIR = ../External_Functions/
MPI_PREFIX = $(I_MPI_ROOT)
CUDA_PREFIX = $(CUDAROOT)
GPU_MPI_CXX = nvcc -L $(I_MPI_ROOT)/lib64 -lmpi -ccbin mpiicc
GPU_CXX = nvcc
LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt
GPU_COMPILE = nvcc -I $(CUDA_PREFIX)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
CPU_COMPILE = $(CC) $(CFLAGS) -I$(EXT_DIR) $(LDFLAGS)
ifeq ($(DEBUG), 1)
CPU_COMPILE += -D_DEBUG_
endif
CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
GPU_COMPILE_OBJ= $(GPU_COMPILE) -c
SOURCE = cuBLAS.cu cuBLAS_MultiGPU.cu cuda_SingleGPU.cu
OBJECTS = util.o matrix_op.o timer.o input.o gpu_util.o dmv_gpu.o
PROGRAMS= cuBLAS.exe cuBLAS_MultiGPU.exe cuda_SingleGPU.exe
all: $(PROGRAMS)
cuda_SingleGPU.exe: $(OBJECTS) cuda_SingleGPU.cu
$(GPU_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) cuda_SingleGPU.cu
cuBLAS_MultiGPU.exe: $(OBJECTS) cuBLAS_MultiGPU.cu
$(GPU_MPI_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) cuBLAS_MultiGPU.cu
cuBLAS.exe: $(OBJECTS) cuBLAS.cu
$(GPU_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) cuBLAS.cu
gpu_util.o: $(EXT_DIR)gpu_util.cu
$(GPU_COMPILE_OBJ) -o $@ $<
dmv_gpu.o: dmv_gpu.cu
$(GPU_COMPILE_OBJ) -o $@ $<
%.o: $(EXT_DIR)%.c
$(CPU_COMPILE_OBJ) -o $@ $<
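# In the rules above, $@ expands to the rule's target and $< to its first
# prerequisite, so each External_Functions source compiles into a local object file.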
07/09/2017: Completed
13/09/2017: Modified to use unified memory
->cuBLAS_MultiGPU(cuBLAS implementation on multiple GPUs/Nodes)
26/09/2017: Completed
->cuda_SingleGPU(3 CUDA kernels showing successive optimization steps in writing GPU code)
02/10/2017: Completed kernel 1
03/10/2017: Completed kernel 2 & 3
/*
* A cuBLAS implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*
* For more info about cuBLAS see http://docs.nvidia.com/cuda/cublas/index.html
*
*/
#include <errno.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse_v2.h>
#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
#include "/users/guest/petyros/Training/External_Functions/util.h"
#include "/users/guest/petyros/Training/External_Functions/input.h"
#include "/users/guest/petyros/Training/External_Functions/gpu_util.h"
/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
#include "gpu_util.h"
int main(int argc, char **argv)
{
/* Initializations */
int i, j, n, m;
int *I, *cooCol, n_z, sparse=0;
double *cooVal, timer;
cudaGetDeviceCount(&device_num);
if (!device_num) printf("No available CUDA devices\n");
else {
printf("Single GPU CUDA Version(N=%d, M=%d): ", n, m);
printf("Single GPU cuBLAS Version(N=%d, M=%d): ", n, m);
double alf=1.0; /* y = alf*A*x + beta*y */
double beta=0.0;
cublasHandle_t handle;
double *A, * y, *x_c;
/* Initialize unified memory, visible and accessible from both the CPU and the GPU */
cudaMallocManaged(&A, m*n * sizeof(double));
cudaMallocManaged(&y, n * sizeof(double));
cudaMallocManaged(&x_c, m * sizeof(double));
......@@ -75,16 +78,16 @@ int main(int argc, char **argv)
cudaCheckErrors("Unified Alloc failed");
if ( !A || !y || !x_c) error("unified alloc failed");
for (i = 0; i < m; i++) x_c[i] = x[i];
matrix_col_major(M, A, n, m); /* Transpose the matrix to column-major, the layout cuBLAS expects */
cublasCreate(&handle);
/* Warmup */
cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
cudaDeviceSynchronize();
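/* cublasDgemv computes y = alf*A*x_c + beta*y for the n x m column-major
 * matrix A with leading dimension n; the 1s are the vector strides. */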
timer=csecond();
for (j = 0; j < NR_ITER; ++j) {
stat=cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
cudaDeviceSynchronize();
}
timer = csecond() - timer;
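/* timer now holds the total wall time of the NR_ITER calls above; report_results() presumably reduces it to per-iteration figures. */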
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
FILE * fp;
char filename[] = "cuBLAS.debug" ; /* Common directory for all implementations, change if needed */
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
for (i = 0; i < n; ++i) fprintf(fp, "%lf ", y[i]) ;
fclose(fp) ;
#endif
report_results(timer);
/*
* A Hybrid MPI-CUDA implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*
* For more info about hybrid MPI-CUDA (cublas here) see https://devblogs.nvidia.com/parallelforall/introduction-cuda-aware-mpi/
*/
#include <errno.h>
#include <cublas_v2.h>
#include <cusparse_v2.h>
#include <mpi.h>
#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
#include "/users/guest/petyros/Training/External_Functions/util.h"
#include "/users/guest/petyros/Training/External_Functions/input.h"
#include "/users/guest/petyros/Training/External_Functions/gpu_util.h"
/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
#include "gpu_util.h"
int main(int argc, char ** argv)
{
int rank,size;
int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
int i, j, sparse=0, *cooCol, n_z, *I;
double * M, *M_cl, * A, * x, * y, *local_y, *x_c, * cooVal, comm_t, comp_t;
/* MPI basic initializations */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
cudaSetDevice(rank%device_num);
double alf=1.0; /* y = alf*A*x + beta*y */
double beta=0.0;
cublasHandle_t handle;
cublasCreate(&handle);
/* Initialize local GPU memory. Unified memory is not recommended for multi-GPU/multi-node runs because the data size tends to be large */
double * gpu_y = (double *) gpu_alloc(local_nm[0] * sizeof(*gpu_y)) ;
double * gpu_xc = (double *) gpu_alloc(local_nm[1] * sizeof(*gpu_xc)) ;
double * gpu_A = (double *) gpu_alloc(local_nm[0] * local_nm[1] * sizeof(*gpu_A)) ;
/* Copy data to GPU memory */
copy_to_gpu(local_y, gpu_y, local_nm[0] * sizeof(*local_y));
copy_to_gpu(x_c, gpu_xc, local_nm[1] * sizeof(*x_c));
copy_to_gpu(A, gpu_A, local_nm[0] * local_nm[1] * sizeof(*A));
/* Warmup */
cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cudaDeviceSynchronize();
if (rank==0) {
printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", local_nm[0], local_nm[1], device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE")) ;
printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", local_nm[0], local_nm[1], device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE")) ;
comp_t= MPI_Wtime();
}
for (j = 0; j < NR_ITER; ++j) {
cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cudaDeviceSynchronize();
}
cudaCheckErrors("cublasDgemv failed");
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
FILE * fp;
char filename[] = "cuBLAS_MultiGPU.debug" ; /* Common directory for all implementations, change if needed */
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
for (i = 0; i < global_nm[0]; ++i) fprintf(fp, "%lf ", y[i]) ;
fclose(fp) ;
#endif
report_mpi_results(comm_t, comp_t);
/* Free rank 0 local memory */
free(M);
free(M_cl);
free(y);
free(x);
}
/* Free GPU memory (the device buffers allocated with gpu_alloc above) */
gpu_free(gpu_y);
gpu_free(gpu_A);
gpu_free(gpu_xc);
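The diff elides the actual data-distribution code. Purely as a hypothetical sketch of the row-block scheme the variable names suggest (each rank owning a local_nm[0] x local_nm[1] block, with M_cl packed on rank 0 so that every rank's block is contiguous), the communication might look like:

```
/* Hypothetical sketch only -- the real distribution code is elided above.
 * Assumes global_padded_nm[0] == size * local_nm[0]. */
MPI_Bcast(x_c, local_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Scatter(M_cl, local_nm[0]*local_nm[1], MPI_DOUBLE,
            A,    local_nm[0]*local_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
/* ...GPU compute as above, then collect the partial results... */
MPI_Gather(local_y, local_nm[0], MPI_DOUBLE,
           y,       local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
```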
/*
* A cuda implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*
* The device code for the GPU implementations can be found in the 'dmv_gpu.cu' file
*
* For more info about coalesced memory access and shmem, see https://cvw.cac.cornell.edu/gpu/coalesced
*/
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse_v2.h>
#include "dmv_gpu.h"
/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
#include "gpu_util.h"
int main(int argc, char **argv)
{
/* Initializations */
int i, j, n, m;
int *I, *cooCol, n_z, sparse=0;
double *cooVal, timer;
/* File Input to COO */
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File */
{
if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO conversion failed");
sparse = 1;
}
else if ( argc == 3) { /*./Program N M */
n = atoi(argv[1]);
m = atoi(argv[2]);
}
else error("Too many Arguments");
int block_size = 256; /* Number of GPU threads per block */
int grid_size = (n-1)/block_size + 1;
size_t shmem_size = 0;
dim3 gpu_block(block_size, 1);
dim3 gpu_grid(grid_size, 1);
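/* Launch-geometry example: n = 10000, block_size = 256 gives
 * grid_size = (9999/256) + 1 = 40 blocks (10240 threads); the kernels
 * guard with "if (tid >= n) return" so surplus threads do no work.
 * Note the kernels take a single size argument, so these launches
 * effectively assume a square (n == m) matrix. */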
/* Allocate space */
double *x = (double *) malloc(m * sizeof(*x));
double *y = (double *) malloc(n * sizeof(*y));
double *M = (double *) malloc(n * m * sizeof(*M));
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
if (sparse) {
//regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else ser_matrix_init_rand(M, n, m, 1.0); /* Normal matrices generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
vec_init(y, n, 0.0);
/* Initialize cuda variables */
int device_num=0;
cudaGetDeviceCount(&device_num);
if (!device_num) printf("No available CUDA devices\n");
else {
cudaSetDevice(0);
printf("Single GPU CUDA Version(N=%d, M=%d): ", n, m);
double *A, * y, *x_c;
/* Initialize unified memory, visible and accessible from both the CPU and the GPU */
cudaMallocManaged(&A, m*n * sizeof(double));
cudaMallocManaged(&y, n * sizeof(double));
cudaMallocManaged(&x_c, m * sizeof(double));
cudaDeviceSynchronize();
cudaCheckErrors("Unified Alloc failed");
if ( !A || !y || !x_c) error("unified alloc failed");
for (i = 0; i < m; i++) x_c[i] = x[i];
/* First naive kernel */
for ( i = 0; i < n*m; i++) A[i] = M[i] ;
timer=csecond();
for (j = 0; j < NR_ITER; ++j) {
dmv_gpu_naive<<<gpu_grid,gpu_block,shmem_size>>>(A, x_c, y, m);
cudaDeviceSynchronize();
}
timer = csecond() - timer;
cudaCheckErrors("naive kernel failed");
report_results(timer);
/* Second kernel: coalesced memory accesses on the GPU, obtained by transposing the matrix. */
printf("Single GPU CUDA Coalesced Version(N=%d, M=%d): ", n, m);
matrix_col_major(M, A, n, m); /* Transpose the matrix to match the GPU coalesced memory access pattern */
timer=csecond();
for (j = 0; j < NR_ITER; ++j) {
dmv_gpu_coalesced<<<gpu_grid,gpu_block,shmem_size>>>(A, x_c, y, m);
cudaDeviceSynchronize();
}
timer = csecond() - timer;
cudaCheckErrors("coalesced kernel failed");
report_results(timer);
/* The third and final kernel further improves effective memory bandwidth by staging x in block-local shared memory (shmem) */
printf("Single GPU CUDA shmem Version(N=%d, M=%d): ", n, m);
shmem_size = block_size*sizeof(double); /* one double per thread; must match the shmem_buff element type in dmv_gpu_shmem() */
timer=csecond();
for (j = 0; j < NR_ITER; ++j) {
dmv_gpu_shmem<<<gpu_grid,gpu_block,shmem_size>>>(A, x_c, y, m);
cudaDeviceSynchronize();
}
timer = csecond() - timer;
cudaCheckErrors("shmem kernel failed");
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
FILE * fp;
char filename[] = "CUDA.debug" ; /* Common directory for all implementations, change if needed */
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
for (i = 0; i < n; ++i) fprintf(fp, "%lf ", y[i]) ;
fclose(fp) ;
#endif
report_results(timer);
}
return 0;
}
#include <stdio.h>
/*
* Utility function to get the thread ID within the
* global working space.
*/
__device__ int get_global_tid()
{
return (gridDim.x*blockIdx.y + blockIdx.x)*blockDim.x*blockDim.y +
blockDim.x*threadIdx.y + threadIdx.x;
}
/*
* Utility function to get the thread ID within the
* local/block working space.
*/
__device__ int get_local_tid()
{
return blockDim.x*threadIdx.y + threadIdx.x;
}
/*
* Naive kernel
*/
__global__ void dmv_gpu_naive(const double *a, const double *x, double *y,
size_t n)
{
int tid = get_global_tid();
double yi = 0.0;
if(tid >= n)
return ;
for ( int j = 0 ; j < n; j++ )
yi += a[tid*n+j]*x[j];
y[tid]=yi;
}
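/* Each thread accumulates one row of y. With the row-major layout,
 * neighbouring threads read a[tid*n+j] at addresses n doubles apart,
 * so the loads are not coalesced -- the motivation for the next kernel. */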
/*
* Coalesced memory accesses
*/
__global__ void dmv_gpu_coalesced(const double *a, const double *x,
double *y, size_t n)
{
int tid = get_global_tid();
double yi = 0.0;
if(tid >= n)
return ;
for ( int j = 0 ; j < n; j++ )
yi += a[j*n+tid]*x[j];
y[tid]=yi;
}
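/* Same computation on the transposed (column-major) matrix: neighbouring
 * threads now load consecutive addresses a[j*n+tid], so accesses coalesce. */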
/*
* Use of shared memory
*/
__global__ void dmv_gpu_shmem(const double *a, const double *x, double *y, size_t n)
{
extern __shared__ double shmem_buff[] ; /* block_size doubles, sized at launch; double to match the staged x values */
int tid = get_global_tid(), i, j;
double yi = 0.0;
if(tid >= n)
return ;
int block_s=blockDim.x*blockDim.y;
int lid=get_local_tid(), last_id = n/block_s ; /* assumes n is a multiple of the block size, which also keeps the __syncthreads() calls non-divergent */
for( j = 0; j< last_id; j++) {
shmem_buff[lid] = x[block_s*j + lid]; /* each thread stages one element of x into shared memory */
__syncthreads();
for( i = 0 ; i < block_s; i++ ) {
yi += a[tid+ (i+j*block_s)*n]*shmem_buff[i];
}
__syncthreads();
}
y[tid]=yi;
}
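/* Launch sketch (mirroring cuda_SingleGPU.cu): the third launch parameter is
 * the dynamic shared-memory size, block_size * sizeof(double) to match
 * shmem_buff above:
 *   dmv_gpu_shmem<<<gpu_grid, gpu_block, block_size*sizeof(double)>>>(A, x_c, y, m);
 */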
#include <stdio.h>
/*
* Utility function to get the thread ID within the
* global working space.
*/
__device__ int get_global_tid();
/*
* Utility function to get the thread ID within the
* local/block working space.
*/
__device__ int get_local_tid();
/*
* Naive kernel
*/
__global__ void dmv_gpu_naive(const double *a, const double *x, double *y, size_t n);
/*
* Coalesced memory accesses
*/
__global__ void dmv_gpu_coalesced(const double *a, const double *x, double *y, size_t n);
/*
 * Use of shared memory
 */
__global__ void dmv_gpu_shmem(const double *a, const double *x, double *y, size_t n);