Commit 0ef3b512 authored by petros.anastasiadis

added compile.mk global var initializer

parent 427bcc5b
DEBUG ?= 0 # Set to 1 for debug
# Need to -I this for user-defined functions to work
EXT_DIR = ../External_Functions/
MPI_PREFIX = $(I_MPI_ROOT)
CUDA_PREFIX = $(CUDAROOT)
#compile with gcc
CC=gcc
CPP=g++
MPICC=mpicc
NVCC=nvcc
CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -lrt
#CPU_COMPILE= $(CC) $(CFLAGS) -I$(EXT_DIR)
#CPU_COMPILE_OMP = $(CPU_COMPILE) -fopenmp
#MPI_COMPILE= $(MPICC) -I$(EXT_DIR)
#MPI_OMP_COMPILE= $(MPI_COMPILE) -fopenmp
GPU_MPI_CXX = $(NVCC) -L $(I_MPI_ROOT)/lib64 -lmpi -ccbin mpiicc
LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt
GPU_COMPILE = $(NVCC) -I $(CUDA_PREFIX)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
CPU_COMPILE_CUDA = $(CPP) $(CFLAGS) -I$(EXT_DIR) $(LDFLAGS)
#compile with icc
ICC = icc
MPICC=mpiicc
ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I -lrt -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm
CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR)
CPU_COMPILE_OMP = $(CPU_COMPILE) -qopenmp
MPI_COMPILE= $(MPICC) $(ICFLAGS) -I$(EXT_DIR)
MPI_OMP_COMPILE= $(MPI_COMPILE) -mt_mpi -qopenmp
ifeq ($(DEBUG), 1)
CPU_COMPILE += -D_DEBUG_
endif
CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
CPU_COMPILE_OMP_OBJ= $(CPU_COMPILE_OMP) -c
MPI_COMPILE_OBJ= $(MPI_COMPILE) -c
CPU_COMPILE_CUDA_OBJ= $(CPU_COMPILE_CUDA) -c
GPU_COMPILE_OBJ= $(GPU_COMPILE) -c
...
CC=g++
ICC = icc
NVCC = nvcc
DEBUG ?= 0 # Set to 1 for debug
CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
#CFLAGS=-O3 -lm -Wall -mavx2 -mfma -march=haswell -mtune=haswell
#CFLAGS=-O3 -Wall -xCORE-AVX-I
#CFLAGS=-O3 -Wall -xCORE-AVX2
#ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I
# Need to -I this for user-defined functions to work
EXT_DIR = ../External_Functions/
MPI_PREFIX = $(I_MPI_ROOT)
CUDA_PREFIX = $(CUDAROOT)
GPU_MPI_CXX = $(NVCC) -L $(I_MPI_ROOT)/lib64 -lmpi -ccbin mpiicc
GPU_CXX = $(NVCC)
LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt
GPU_COMPILE = $(NVCC) -I $(CUDA_PREFIX)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
CPU_COMPILE = $(CC) $(CFLAGS) -I$(EXT_DIR) $(LDFLAGS)
ifeq ($(DEBUG), 1)
CPU_COMPILE += -D_DEBUG_
GPU_COMPILE += -D_DEBUG_
GPU_MPI_COMPILE += -D_DEBUG_
endif
CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
GPU_COMPILE_OBJ= $(GPU_COMPILE) -c
include ../External_Functions/compile.mk
SOURCE = cuBLAS.cu cuBLAS_MultiGPU.cu cuda_SingleGPU.cu
OBJECTS = util.o matrix_op.o timer.o input.o gpu_util.o dmv_gpu.o
@@ -58,7 +22,7 @@ dmv_gpu.o: dmv_gpu.cu
$(GPU_COMPILE_OBJ) -o $@ $<
%.o: $(EXT_DIR)%.c
$(CPU_COMPILE_OBJ) -o $@ $<
$(CPU_COMPILE_CUDA_OBJ) -o $@ $<
%.o: %.h
...
#CC=gcc
MPICC=mpiicc
DEBUG ?= 0 # Set to 1 for debug
#CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge
#CFLAGS=-O3 -lm -Wall -mavx2 -mfma -march=haswell -mtune=haswell
#CFLAGS=-O3 -Wall -xCORE-AVX-I
#CFLAGS=-O3 -Wall -xCORE-AVX2
# Need to -I this for user-defined functions to work
EXT_DIR = ../External_Functions/
ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I
MPI_COMPILE= $(MPICC) $(ICFLAGS) -I$(EXT_DIR)
MPI_OMP_COMPILE= $(MPI_COMPILE) -mt_mpi -qopenmp
ifeq ($(DEBUG), 1)
MPI_COMPILE += -D_DEBUG_
endif
MPI_COMPILE_OBJ= $(MPI_COMPILE) -c
include ../External_Functions/compile.mk
SOURCE = MPI.c MPI-OpenMP.c
OBJECTS = util.o matrix_op.o timer.o input.o
...
CC=gcc
ICC = icc
DEBUG ?= 0 # Set to 1 for debug
CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
#CFLAGS=-O3 -lm -Wall -mavx2 -mfma -march=haswell -mtune=haswell
#CFLAGS=-O3 -Wall -xCORE-AVX-I
#CFLAGS=-O3 -Wall -xCORE-AVX2
ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I -lrt
# Need to -I this for user-defined functions to work
EXT_DIR = ../External_Functions/
CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR)
ifeq ($(DEBUG), 1)
CPU_COMPILE += -D_DEBUG_
endif
CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
include ../External_Functions/compile.mk
SOURCE = OpenMP.c OpenMP_aff.c
OBJECTS = util.o matrix_op.o timer.o input.o
@@ -29,13 +7,13 @@ PROGRAMS= OpenMP.exe OpenMP_aff.exe
all: $(PROGRAMS)
OpenMP.exe: $(OBJECTS) OpenMP.c
$(CPU_COMPILE) OpenMP.c -o $@ $(OBJECTS)
$(CPU_COMPILE_OMP) OpenMP.c -o $@ $(OBJECTS)
OpenMP_aff.exe: $(OBJECTS) OpenMP_aff.c
$(CPU_COMPILE) OpenMP_aff.c -o $@ $(OBJECTS)
$(CPU_COMPILE_OMP) OpenMP_aff.c -o $@ $(OBJECTS)
%.o: $(EXT_DIR)%.c
$(CPU_COMPILE_OBJ) -o $@ $<
$(CPU_COMPILE_OMP_OBJ) -o $@ $<
%.o: %.h
...
@@ -15,7 +15,6 @@ Training
├── MPI
├── OpenMP
├── Outputs
│   └── Debug
└── Serial
```
@@ -36,5 +35,5 @@ To further scale in multiple nodes, we use a non-shared memory model tool, MPI (
Finally, we implement our base algorithm with CUDA on an NVIDIA GPU (cuda_SingleGPU.cu + dmv_gpu.cu). We invoke 3 different kernels, starting from a simple, naive one and improving it as we go: in the second kernel we transpose the matrix to achieve coalesced memory access, and in the third we also use the block shared memory (shmem) to utilize bandwidth better. To test our implementations we also implement a cuBLAS version (cuBLAS_SingleGPU.cu), based on NVIDIA's parallel BLAS routine library. Then we create a final hybrid cuBLAS-MPI version (cuBLAS_MultiGPU.cu) in order to utilize a possible multi-GPU/node architecture (MPI inter-process communication is still a serious bottleneck for the matrix-vector kernel, but in a more computationally intensive scenario a huge scale-up is possible).
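As a rough sketch of the first two kernel ideas (illustrative only, not the actual dmv_gpu.cu code; the kernel names and the transposed copy `M_T` are assumptions):

```
/* Naive kernel: one thread per output element y[i]. At a given step j,
   neighboring threads read M[i*m+j] and M[(i+1)*m+j], which lie m doubles
   apart in the row-major matrix, so the loads are not coalesced. */
__global__ void dmv_naive(const double *M, const double *x, double *y, int n, int m)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= n) return;
	double yi = 0.0;
	for (int j = 0; j < m; ++j)
		yi += M[i * m + j] * x[j];
	y[i] = yi;
}

/* Second kernel: the matrix is transposed beforehand, so at step j the
   warp reads the consecutive elements M_T[j*n + i] -- coalesced accesses. */
__global__ void dmv_transposed(const double *M_T, const double *x, double *y, int n, int m)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= n) return;
	double yi = 0.0;
	for (int j = 0; j < m; ++j)
		yi += M_T[j * n + i] * x[j];
	y[i] = yi;
}
```

The third kernel described above additionally stages data in block shared memory to cut redundant global-memory traffic.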
## Compilation/Running
All executables can be created by running the Makefiles in the corresponding directories. There is also a global Makefile in the project root directory. Every program directory contains a SLURM file for execution on the ARIS system (for other systems, corresponding adjustments must be made). Compilation is performed with the Intel and CUDA compilers ( icc, mpicc, nvcc ), so on a system without the above the Makefiles must be modified accordingly ( icc -> gcc; nvcc cannot be replaced ), and additional compile options might be required.
All executables can be created by running the Makefiles in the corresponding directories. There is also a global Makefile in the project root directory. Every program directory contains a SLURM file for execution on the ARIS system (for other systems, corresponding adjustments must be made). See compile.mk in the External_Functions directory for all the available compiler options.
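For reference, the `DEBUG ?= 0` switch in compile.mk appends `-D_DEBUG_` to the compile lines, and the sources use it to gate diagnostic output (see the `#ifdef _DEBUG_` block in Serial.c below). A minimal sketch of the pattern, with a hypothetical `dump_vector` helper that is not part of the repository:

```
#include <stdio.h>

/* Write a result vector to a file only when built with `make DEBUG=1`,
   which adds -D_DEBUG_ (see compile.mk). */
static void dump_vector(const double *y, int n, const char *path)
{
#ifdef _DEBUG_
	FILE *fp = fopen(path, "w");
	if (fp) {
		for (int i = 0; i < n; ++i)
			fprintf(fp, "%lf\n", y[i]);
		fclose(fp);
	}
#else
	(void)y; (void)n; (void)path; /* compiled out in release builds */
#endif
}
```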
...
CC=gcc
ICC = icc
DEBUG ?= 0 # Set to 1 for debug
CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge
#CFLAGS=-O3 -lm -Wall -mavx2 -mfma -march=haswell -mtune=haswell
#CFLAGS=-O3 -Wall -xCORE-AVX-I
#CFLAGS=-O3 -Wall -xCORE-AVX2
ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I -lrt
# Need to -I this for user-defined functions to work
EXT_DIR = ../External_Functions/
CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR)
ifeq ($(DEBUG), 1)
CPU_COMPILE += -D_DEBUG_
endif
CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
EXT_DIR = ../External_Functions/
include ../External_Functions/compile.mk
SOURCE = Serial.c
OBJECTS = util.o matrix_op.o timer.o input.o
...
@@ -12,6 +12,8 @@
#include "matrix_op.h"
#include "util.h"
#include "input.h"
#include "mkl.h"
#include "mkl_blas.h"
int main(int argc, char **argv)
{
@@ -29,12 +31,11 @@ int main(int argc, char **argv)
/* Allocate space */
double *x = (double *) malloc(m * sizeof(*x));
double *y = (double *) malloc(n * sizeof(*y));
double **M = (double **) malloc(n * sizeof(*M));
for( i=0 ; i<n ; ++i) M[i] = (double *) calloc(m, sizeof(double));
double *M = (double *) malloc(n * m * sizeof(*M));
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
/* Initialize vectors */
vec_init_rand(x, m, 1.0);
@@ -48,11 +49,24 @@ int main(int argc, char **argv)
register double yi;
for (k = 0; k < n; ++k) {
yi = 0.0 ;
for (j = 0; j < m; ++j) yi += M[k][j]*x[j];
for (j = 0; j < m; ++j) yi += M[k*m+j]*x[j]; /* row-major element (k,j) of the n x m matrix */
y[k] = yi;
}
}
timer = csecond() - timer ;
report_results(timer);
/* BLAS Kernel */
printf("BLAS dgemv Version(N=%d, M=%d): ", n, m);
const double a=1.0, b=0.0;
/* dgemv_ follows the Fortran column-major convention, so the row-major
   n x m matrix M is seen by BLAS as its m x n transpose; requesting the
   transposed product ('T') therefore yields y = M*x. */
const char trans='T';
const int inc=1;
timer = csecond();
for (i = 0; i < NR_ITER; ++i){
dgemv_(&trans, &m, &n, &a, M, &m, x, &inc, &b, y, &inc);
}
timer = csecond() - timer ;
report_results(timer);
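/* Editor's sketch (not part of this commit): MKL's CBLAS interface can make
   the row-major layout explicit instead of relying on the Fortran transpose
   trick above, e.g.:
   cblas_dgemv(CblasRowMajor, CblasNoTrans, n, m, 1.0, M, m, x, 1, 0.0, y, 1);
   (cblas_dgemv is declared via mkl.h / mkl_cblas.h) */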
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
@@ -63,8 +77,6 @@ int main(int argc, char **argv)
fclose(fp) ;
#endif
report_results(timer);
return 0;
}