diff --git a/External_Functions/compile.mk b/External_Functions/compile.mk
new file mode 100644
index 0000000000000000000000000000000000000000..ac3ddeb6eb78b559bc0d6b397443033557c0ebe1
--- /dev/null
+++ b/External_Functions/compile.mk
@@ -0,0 +1,46 @@
+# Set DEBUG=1 (e.g. `make DEBUG=1`) for a debug build; the comment lives on its own line so no trailing blanks leak into the value
+DEBUG ?= 0
+# Directory of the shared user-defined functions; every compile command below passes it with -I
+EXT_DIR = ../External_Functions/
+
+MPI_PREFIX = $(I_MPI_ROOT)
+CUDA_PREFIX = $(CUDAROOT)
+
+# GNU/CUDA toolchain: in this file the gcc-style CFLAGS are consumed only by CPU_COMPILE_CUDA (g++)
+CC=gcc
+CPP=g++
+MPICC=mpicc
+NVCC=nvcc
+CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -lrt
+#CPU_COMPILE= $(CC) $(CFLAGS) -I$(EXT_DIR)
+#CPU_COMPILE_OMP = $(CPU_COMPILE) -fopenmp
+#MPI_COMPILE= $(MPICC) -I$(EXT_DIR)
+#MPI_OMP_COMPILE= $(MPI_COMPILE) -fopenmp
+GPU_MPI_CXX = $(NVCC) -L $(I_MPI_ROOT)/lib64 -lmpi -ccbin mpiicc
+LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt 
+GPU_COMPILE = $(NVCC) -I $(CUDA_PREFIX)/include  -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
+GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
+CPU_COMPILE_CUDA = $(CPP) $(CFLAGS) -I$(EXT_DIR) $(LDFLAGS)
+
+# Intel toolchain (icc/mpiicc + MKL) used by CPU_COMPILE and MPI_COMPILE; MPICC below replaces the mpicc value set above
+ICC =icc
+MPICC=mpiicc
+ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I -lrt -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm
+CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR)
+CPU_COMPILE_OMP = $(CPU_COMPILE) -qopenmp
+MPI_COMPILE= $(MPICC) $(ICFLAGS) -I$(EXT_DIR)
+MPI_OMP_COMPILE= $(MPI_COMPILE) -mt_mpi -qopenmp
+
+# DEBUG=1 defines _DEBUG_ for every base compile command (derived *_OMP/*_OBJ pick it up); strip tolerates stray blanks
+ifeq ($(strip $(DEBUG)),1)
+  CPU_COMPILE      += -D_DEBUG_
+  CPU_COMPILE_CUDA += -D_DEBUG_
+  MPI_COMPILE      += -D_DEBUG_
+  GPU_COMPILE      += -D_DEBUG_
+  GPU_MPI_COMPILE  += -D_DEBUG_
+endif
+CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
+CPU_COMPILE_OMP_OBJ= $(CPU_COMPILE_OMP) -c
+MPI_COMPILE_OBJ= $(MPI_COMPILE) -c
+CPU_COMPILE_CUDA_OBJ= $(CPU_COMPILE_CUDA) -c
+GPU_COMPILE_OBJ= $(GPU_COMPILE) -c
diff --git a/GPUs/Makefile b/GPUs/Makefile
index 6830281798a9c40ffe50dec8bebb907fadeb2618..e3f12a26de6659194ae62c893bcf97f8bcb2c1f8 100644
--- a/GPUs/Makefile
+++ b/GPUs/Makefile
@@ -1,40 +1,4 @@
-CC=g++
-ICC =icc
-NVCC = nvcc
-
-DEBUG ?= 0  # Set to 1 for debug
-
-CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
-#CFLAGS=-O3 -lm -Wall -mavx2 -mfma  -march=haswell -mtune=haswell 
-
-#CFLAGS=-O3 -Wall -xCORE-AVX-I
-#CFLAGS=-O3 -Wall -xCORE-AVX2 
-#ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I
-
-# Need to -I this for user-defined functions to work
-EXT_DIR = ../External_Functions/
-
-MPI_PREFIX = $(I_MPI_ROOT)
-CUDA_PREFIX = $(CUDAROOT)
-GPU_MPI_CXX = $(NVCC) -L $(I_MPI_ROOT)/lib64 -lmpi -ccbin mpiicc
-GPU_CXX = $(NVCC)
-
-LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt 
-
-GPU_COMPILE = $(NVCC) -I $(CUDA_PREFIX)/include  -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
-GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
-CPU_COMPILE = $(CC) $(CFLAGS) -I$(EXT_DIR) $(LDFLAGS)
-
-ifeq ($(DEBUG), 1)
-	CPU_COMPILE 	+= -D_DEBUG_
-	GPU_COMPILE 	+= -D_DEBUG_
-	GPU_MPI_COMPILE += -D_DEBUG_
-endif
-
-CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
-GPU_COMPILE_OBJ= $(GPU_COMPILE) -c
-
-
+include ../External_Functions/compile.mk
 
 SOURCE = cuBLAS.cu cuBLAS_MultiGPU.cu cuda_SingleGPU.cu
 OBJECTS = util.o matrix_op.o timer.o input.o gpu_util.o dmv_gpu.o
@@ -58,7 +22,7 @@ dmv_gpu.o: dmv_gpu.cu
 	$(GPU_COMPILE_OBJ) -o $@ $<
 
 %.o: $(EXT_DIR)%.c
-	$(CPU_COMPILE_OBJ) -o $@ $<
+	$(CPU_COMPILE_CUDA_OBJ) -o $@ $<
 
 %.o: %.h
 
diff --git a/MPI/Makefile b/MPI/Makefile
index 4b25531a8d1aa0be5614045f0c8eff13dff2983b..c292a24db6961dd7f462a2982e0fc5872f5ee6fe 100644
--- a/MPI/Makefile
+++ b/MPI/Makefile
@@ -1,28 +1,4 @@
-#CC=gcc
-MPICC=mpiicc
-
-DEBUG ?= 0  # Set to 1 for debug
-
-#CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge 
-#CFLAGS=-O3 -lm -Wall -mavx2 -mfma  -march=haswell -mtune=haswell 
-#CFLAGS=-O3 -Wall -xCORE-AVX-I
-#CFLAGS=-O3 -Wall -xCORE-AVX2 
-
-# Need to -I this for user-defined functions to work
-EXT_DIR = ../External_Functions/
-
-ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I
-
-MPI_COMPILE= $(MPICC) $(ICFLAGS) -I$(EXT_DIR)
-MPI_OMP_COMPILE= $(MPI_COMPILE) -mt_mpi -qopenmp
-
-ifeq ($(DEBUG), 1)
-	MPI_COMPILE += -D_DEBUG_
-endif
-
-MPI_COMPILE_OBJ= $(MPI_COMPILE) -c
-
-
+include ../External_Functions/compile.mk
 
 SOURCE = MPI.c MPI-OpenMP.c
 OBJECTS = util.o matrix_op.o timer.o input.o
diff --git a/OpenMP/Makefile b/OpenMP/Makefile
index adc8b2ad92d48a7f8b35bd471f844054304a88f2..796fc6e5e3edb556e3f56a5a5553612e565709ff 100644
--- a/OpenMP/Makefile
+++ b/OpenMP/Makefile
@@ -1,26 +1,4 @@
-CC=gcc
-ICC =icc
-
-DEBUG ?= 0 # Set to 1 for debug
-
-CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
-#CFLAGS=-O3 -lm -Wall -mavx2 -mfma  -march=haswell -mtune=haswell 
- 
-#CFLAGS=-O3 -Wall -xCORE-AVX-I
-#CFLAGS=-O3 -Wall -xCORE-AVX2 
-ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I -lrt
-
-# Need to -I this for user-defined functions to work
-EXT_DIR = ../External_Functions/
-
-CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR)
-
-ifeq ($(DEBUG), 1)
-	CPU_COMPILE += -D_DEBUG_
-endif
-
-CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
-
+include ../External_Functions/compile.mk
 
 SOURCE = OpenMP.c OpenMP_aff.c 
 OBJECTS = util.o matrix_op.o timer.o input.o
@@ -29,13 +7,13 @@ PROGRAMS= OpenMP.exe OpenMP_aff.exe
 all: $(PROGRAMS)
 
 OpenMP.exe: $(OBJECTS) OpenMP.c
-	$(CPU_COMPILE) OpenMP.c -o $@ $(OBJECTS)
+	$(CPU_COMPILE_OMP) OpenMP.c -o $@ $(OBJECTS)
 
 OpenMP_aff.exe: $(OBJECTS) OpenMP_aff.c
-	$(CPU_COMPILE) OpenMP_aff.c -o $@ $(OBJECTS)
+	$(CPU_COMPILE_OMP) OpenMP_aff.c -o $@ $(OBJECTS)
 
 %.o: $(EXT_DIR)%.c
-	$(CPU_COMPILE_OBJ) -o $@ $<
+	$(CPU_COMPILE_OMP_OBJ) -o $@ $<
 
 %.o: %.h
 
diff --git a/README.md b/README.md
index 40f69969ee5033ac5ff8c37671bd17822767d36f..6ec6fe0946dcd055f36a6b4041053c6fb541c13b 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,6 @@ Training
 ├── MPI
 ├── OpenMP
 ├── Outputs
-│   └── Debug
 └── Serial
 ```
 
@@ -36,5 +35,5 @@ To further scale in multiple nodes, we use a non-shared memory model tool, MPI (
 Finally, we implement our base-algorithm with CUDA in a Nvidia GPU(cuda_SingleGPU.cu + dmv_gpu.cu). We invoke 3 different kernels, starting from a simple-naive one and improving him as we go (in the second kernel we transpose the matrix to achieve coalesced memory access, and in the third one we also use the block shared memory (shmem) to utilize bandwidth better). To test our implementations we also implement a cuBLAS (Nvidia parallel BLAS routine library) version (cuBLAS_SingleGPU.cu). Then, we create a final hybrid cuBlAS-MPI version (cuBLAS_MultiGPU.cu) in order to utilize a possible multi-gpu/node architecture (MPI inter-process communication is still a big problem for the Matrix-Vector kernel, but in a more computational intensive scenario a huge scale-up is possible). 
 
 ## Compilation/Running
-All executables can be created by running the Makefiles in the corresponding directories. There is also a global-maker in the project root directory. Every program directory contains a slurm file for execution in the ARIS system (for other systems corresponding adjustments must be made). Compilation is  performed with intel and cuda compilers ( icc, mpicc, nvcc ), so in a system without the above the makefiles must be modified accordingly ( icc -> gcc, nvcc cannot be replaced), and aditional compile options might be required.  
+All executables can be created by running the Makefiles in the corresponding directories. There is also a global-maker in the project root directory. Every program directory contains a slurm file for execution in the ARIS system (for other systems corresponding adjustments must be made). All Makefiles include `External_Functions/compile.mk`, which defines the compilers and flags in one place (by default Intel `icc`/`mpiicc` with MKL, plus `nvcc` for the GPU code); on a system without these compilers, edit that file accordingly.
 
diff --git a/Serial/Makefile b/Serial/Makefile
index b5fdb771894e6fa614055e2fb678de24ab7daae5..e650cf858745d84e7552d25e8bf8c3d143246efd 100644
--- a/Serial/Makefile
+++ b/Serial/Makefile
@@ -1,28 +1,4 @@
-CC=gcc
-ICC =icc
-
-DEBUG ?= 0  # Set to 1 for debug
-
-CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge 
-#CFLAGS=-O3 -lm -Wall -mavx2 -mfma  -march=haswell -mtune=haswell 
- 
-#CFLAGS=-O3 -Wall -xCORE-AVX-I
-#CFLAGS=-O3 -Wall -xCORE-AVX2 
-ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I -lrt
-
-
-# Need to -I this for user-defined functions to work
-EXT_DIR = ../External_Functions/
-
-CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR)
-
-ifeq ($(DEBUG), 1)
-	CPU_COMPILE += -D_DEBUG_
-endif
-
-CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
-
-EXT_DIR = ../External_Functions/
+include ../External_Functions/compile.mk
 
 SOURCE = Serial.c
 OBJECTS = util.o matrix_op.o timer.o input.o
diff --git a/Serial/Serial.c b/Serial/Serial.c
index 66720c584212a1c449ba2bc95f975385dcb421ff..f4b06deea10e1dfe2aa2ff22ad12203c39438451 100644
--- a/Serial/Serial.c
+++ b/Serial/Serial.c
@@ -12,6 +12,8 @@
 #include "matrix_op.h"
 #include "util.h"
 #include "input.h"
+#include "mkl.h"
+#include "mkl_blas.h"
 
 int main(int argc, char **argv)
 {
@@ -29,12 +31,11 @@ int main(int argc, char **argv)
 	/* Allocate space */
 	double *x 	= (double *) malloc(m * sizeof(*x));
 	double *y	= (double *) malloc(n * sizeof(*y));
-	double **M 	= (double **) malloc(n * sizeof(*M));
-	for( i=0 ; i<n ; ++i) M[i] = (double *) calloc(m, sizeof(double));
+	double *M 	= (double *) malloc((size_t) n * m * sizeof(*M)); /* flat n x m buffer; size_t math so n*m cannot overflow int */
 	if( !y || !x || !M ) error("memory allocation failed");
 
 	/* Initialize matrices */
-	matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+	ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
 
 	/* Initialize vectors */
 	vec_init_rand(x, m, 1.0);
@@ -48,11 +49,24 @@ int main(int argc, char **argv)
 		register double yi;
 		for (k = 0; k < n; ++k) {
         	yi = 0.0 ;
-        	for (j = 0; j < m; ++j) yi += M[k][j]*x[j];
+        	for (j = 0; j < m; ++j) yi += M[(size_t) k*m + j]*x[j]; /* row-major: row k starts at k*m (m entries per row) */
         	y[k] = yi;
     	}
 	}
 	timer = csecond() - timer ;
+	report_results(timer);
+
+	/* BLAS Kernel: the row-major n x m buffer is a column-major m x n matrix (lda = m), so y = M*x is dgemv with trans='T' */
+	printf("BLAS dgemv Version(N=%d, M=%d): ", n, m);
+	const double  a=1.0,b=0.0;
+	const char trans='T';
+	const int inc=1;
+	timer = csecond();
+	for (i = 0; i < NR_ITER; ++i){
+		dgemv_(&trans, &m, &n, &a, M, &m, x, &inc, &b, y, &inc);
+	}
+	timer = csecond() - timer ;
+	report_results(timer);
 
 #ifdef _DEBUG_
 	/* Output y vector to a file for debugging */
@@ -63,8 +77,6 @@ int main(int argc, char **argv)
 	fclose(fp) ;
 #endif
 
-	report_results(timer);
-
 	return 0;
 }