From 0ef3b512d718178a3332c80df76d7d27eada1286 Mon Sep 17 00:00:00 2001 From: "petros.anastasiadis" Date: Fri, 20 Oct 2017 14:46:08 +0300 Subject: [PATCH] added compile.mk global var initializer --- External_Functions/compile.mk | 46 +++++++++++++++++++++++++++++++++++ GPUs/Makefile | 40 ++---------------------------- MPI/Makefile | 26 +------------------- OpenMP/Makefile | 30 +++-------------------- README.md | 3 +-- Serial/Makefile | 26 +------------------- Serial/Serial.c | 24 +++++++++++++----- 7 files changed, 73 insertions(+), 122 deletions(-) create mode 100644 External_Functions/compile.mk diff --git a/External_Functions/compile.mk b/External_Functions/compile.mk new file mode 100644 index 0000000..ac3ddeb --- /dev/null +++ b/External_Functions/compile.mk @@ -0,0 +1,46 @@ +DEBUG ?= 0 # Set to 1 for debug + +# Need to -I this for user-defined functions to work +EXT_DIR = ../External_Functions/ + +MPI_PREFIX = $(I_MPI_ROOT) +CUDA_PREFIX = $(CUDAROOT) + +#compile with gcc +CC=gcc +CPP=g++ +MPICC=mpicc +NVCC=nvcc +CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -lrt +#CPU_COMPILE= $(CC) $(CFLAGS) -I$(EXT_DIR) +#CPU_COMPILE_OMP = $(CPU_COMPILE) -fopenmp +#MPI_COMPILE= $(MPICC) -I$(EXT_DIR) +#MPI_OMP_COMPILE= $(MPI_COMPILE) -fopenmp +GPU_MPI_CXX = $(NVCC) -L $(I_MPI_ROOT)/lib64 -lmpi -ccbin mpiicc +LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt +GPU_COMPILE = $(NVCC) -I $(CUDA_PREFIX)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS) +GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS) +CPU_COMPILE_CUDA = $(CPP) $(CFLAGS) -I$(EXT_DIR) $(LDFLAGS) + +#compile with icc +ICC =icc +MPICC=mpiicc +ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I -lrt -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm +CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR) +CPU_COMPILE_OMP = $(CPU_COMPILE) -qopenmp +MPI_COMPILE= $(MPICC) $(ICFLAGS) -I$(EXT_DIR) +MPI_OMP_COMPILE= $(MPI_COMPILE) 
-mt_mpi -qopenmp +# Debug builds: define _DEBUG_ for every compile command (CPU, MPI, CUDA host, nvcc); DEBUG is stripped so whitespace before its inline comment cannot defeat the test +ifeq ($(strip $(DEBUG)), 1) + CPU_COMPILE += -D_DEBUG_ + MPI_COMPILE += -D_DEBUG_ + CPU_COMPILE_CUDA += -D_DEBUG_ + GPU_COMPILE += -D_DEBUG_ + GPU_MPI_COMPILE += -D_DEBUG_ +endif + +CPU_COMPILE_OBJ= $(CPU_COMPILE) -c +CPU_COMPILE_OMP_OBJ= $(CPU_COMPILE_OMP) -c +MPI_COMPILE_OBJ= $(MPI_COMPILE) -c +CPU_COMPILE_CUDA_OBJ= $(CPU_COMPILE_CUDA) -c +GPU_COMPILE_OBJ= $(GPU_COMPILE) -c diff --git a/GPUs/Makefile b/GPUs/Makefile index 6830281..e3f12a2 100644 --- a/GPUs/Makefile +++ b/GPUs/Makefile @@ -1,40 +1,4 @@ -CC=g++ -ICC =icc -NVCC = nvcc - -DEBUG ?= 0 # Set to 1 for debug - -CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp -#CFLAGS=-O3 -lm -Wall -mavx2 -mfma -march=haswell -mtune=haswell - -#CFLAGS=-O3 -Wall -xCORE-AVX-I -#CFLAGS=-O3 -Wall -xCORE-AVX2 -#ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I - -# Need to -I this for user-defined functions to work -EXT_DIR = ../External_Functions/ - -MPI_PREFIX = $(I_MPI_ROOT) -CUDA_PREFIX = $(CUDAROOT) -GPU_MPI_CXX = $(NVCC) -L $(I_MPI_ROOT)/lib64 -lmpi -ccbin mpiicc -GPU_CXX = $(NVCC) - -LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt - -GPU_COMPILE = $(NVCC) -I $(CUDA_PREFIX)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS) -GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS) -CPU_COMPILE = $(CC) $(CFLAGS) -I$(EXT_DIR) $(LDFLAGS) - -ifeq ($(DEBUG), 1) - CPU_COMPILE += -D_DEBUG_ - GPU_COMPILE += -D_DEBUG_ - GPU_MPI_COMPILE += -D_DEBUG_ -endif - -CPU_COMPILE_OBJ= $(CPU_COMPILE) -c -GPU_COMPILE_OBJ= $(GPU_COMPILE) -c - - +include ../External_Functions/compile.mk SOURCE = cuBLAS.cu cuBLAS_MultiGPU.cu cuda_SingleGPU.cu OBJECTS = util.o matrix_op.o timer.o input.o gpu_util.o dmv_gpu.o @@ -58,7 +22,7 @@ dmv_gpu.o: dmv_gpu.cu $(GPU_COMPILE_OBJ) -o $@ $< %.o: $(EXT_DIR)%.c - $(CPU_COMPILE_OBJ) -o $@ $< + $(CPU_COMPILE_CUDA_OBJ) -o $@ $< %.o: %.h diff --git a/MPI/Makefile b/MPI/Makefile index 4b25531..c292a24 100644 --- a/MPI/Makefile +++ b/MPI/Makefile @@ -1,28 +1,4 @@ -#CC=gcc -MPICC=mpiicc - -DEBUG ?= 0 # Set to 1 for 
debug - -#CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -#CFLAGS=-O3 -lm -Wall -mavx2 -mfma -march=haswell -mtune=haswell -#CFLAGS=-O3 -Wall -xCORE-AVX-I -#CFLAGS=-O3 -Wall -xCORE-AVX2 - -# Need to -I this for user-defined functions to work -EXT_DIR = ../External_Functions/ - -ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I - -MPI_COMPILE= $(MPICC) $(ICFLAGS) -I$(EXT_DIR) -MPI_OMP_COMPILE= $(MPI_COMPILE) -mt_mpi -qopenmp - -ifeq ($(DEBUG), 1) - MPI_COMPILE += -D_DEBUG_ -endif - -MPI_COMPILE_OBJ= $(MPI_COMPILE) -c - - +include ../External_Functions/compile.mk SOURCE = MPI.c MPI-OpenMP.c OBJECTS = util.o matrix_op.o timer.o input.o diff --git a/OpenMP/Makefile b/OpenMP/Makefile index adc8b2a..796fc6e 100644 --- a/OpenMP/Makefile +++ b/OpenMP/Makefile @@ -1,26 +1,4 @@ -CC=gcc -ICC =icc - -DEBUG ?= 0 # Set to 1 for debug - -CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp -#CFLAGS=-O3 -lm -Wall -mavx2 -mfma -march=haswell -mtune=haswell - -#CFLAGS=-O3 -Wall -xCORE-AVX-I -#CFLAGS=-O3 -Wall -xCORE-AVX2 -ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I -lrt - -# Need to -I this for user-defined functions to work -EXT_DIR = ../External_Functions/ - -CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR) - -ifeq ($(DEBUG), 1) - CPU_COMPILE += -D_DEBUG_ -endif - -CPU_COMPILE_OBJ= $(CPU_COMPILE) -c - +include ../External_Functions/compile.mk SOURCE = OpenMP.c OpenMP_aff.c OBJECTS = util.o matrix_op.o timer.o input.o @@ -29,13 +7,13 @@ PROGRAMS= OpenMP.exe OpenMP_aff.exe all: $(PROGRAMS) OpenMP.exe: $(OBJECTS) OpenMP.c - $(CPU_COMPILE) OpenMP.c -o $@ $(OBJECTS) + $(CPU_COMPILE_OMP) OpenMP.c -o $@ $(OBJECTS) OpenMP_aff.exe: $(OBJECTS) OpenMP_aff.c - $(CPU_COMPILE) OpenMP_aff.c -o $@ $(OBJECTS) + $(CPU_COMPILE_OMP) OpenMP_aff.c -o $@ $(OBJECTS) %.o: $(EXT_DIR)%.c - $(CPU_COMPILE_OBJ) -o $@ $< + $(CPU_COMPILE_OMP_OBJ) -o $@ $< %.o: %.h diff --git a/README.md b/README.md index 40f6996..6ec6fe0 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ Training 
├── MPI ├── OpenMP ├── Outputs -│   └── Debug └── Serial ``` @@ -36,5 +35,5 @@ To further scale in multiple nodes, we use a non-shared memory model tool, MPI ( Finally, we implement our base-algorithm with CUDA in a Nvidia GPU(cuda_SingleGPU.cu + dmv_gpu.cu). We invoke 3 different kernels, starting from a simple-naive one and improving him as we go (in the second kernel we transpose the matrix to achieve coalesced memory access, and in the third one we also use the block shared memory (shmem) to utilize bandwidth better). To test our implementations we also implement a cuBLAS (Nvidia parallel BLAS routine library) version (cuBLAS_SingleGPU.cu). Then, we create a final hybrid cuBlAS-MPI version (cuBLAS_MultiGPU.cu) in order to utilize a possible multi-gpu/node architecture (MPI inter-process communication is still a big problem for the Matrix-Vector kernel, but in a more computational intensive scenario a huge scale-up is possible). ## Compilation/Running -All executables can be created by running the Makefiles in the corresponding directories. There is also a global-maker in the project root directory. Every program directory contains a slurm file for execution in the ARIS system (for other systems corresponding adjustments must be made). Compilation is performed with intel and cuda compilers ( icc, mpicc, nvcc ), so in a system without the above the makefiles must be modified accordingly ( icc -> gcc, nvcc cannot be replaced), and aditional compile options might be required. +All executables can be created by running the Makefiles in the corresponding directories. There is also a global-maker in the project root directory. Every program directory contains a slurm file for execution in the ARIS system (for other systems corresponding adjustments must be made). See compile.mk in the External_Functions directory for all the available compiler options. 
diff --git a/Serial/Makefile b/Serial/Makefile index b5fdb77..e650cf8 100644 --- a/Serial/Makefile +++ b/Serial/Makefile @@ -1,28 +1,4 @@ -CC=gcc -ICC =icc - -DEBUG ?= 0 # Set to 1 for debug - -CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -#CFLAGS=-O3 -lm -Wall -mavx2 -mfma -march=haswell -mtune=haswell - -#CFLAGS=-O3 -Wall -xCORE-AVX-I -#CFLAGS=-O3 -Wall -xCORE-AVX2 -ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I -lrt - - -# Need to -I this for user-defined functions to work -EXT_DIR = ../External_Functions/ - -CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR) - -ifeq ($(DEBUG), 1) - CPU_COMPILE += -D_DEBUG_ -endif - -CPU_COMPILE_OBJ= $(CPU_COMPILE) -c - -EXT_DIR = ../External_Functions/ +include ../External_Functions/compile.mk SOURCE = Serial.c OBJECTS = util.o matrix_op.o timer.o input.o diff --git a/Serial/Serial.c b/Serial/Serial.c index 66720c5..f4b06de 100644 --- a/Serial/Serial.c +++ b/Serial/Serial.c @@ -12,6 +12,8 @@ #include "matrix_op.h" #include "util.h" #include "input.h" +#include "mkl.h" +#include "mkl_blas.h" int main(int argc, char **argv) { @@ -29,12 +31,11 @@ int main(int argc, char **argv) /* Allocate space */ double *x = (double *) malloc(m * sizeof(*x)); double *y = (double *) malloc(n * sizeof(*y)); - double **M = (double **) malloc(n * sizeof(*M)); - for( i=0 ; i