diff --git a/GPUs/S_GPU_cuB.slurm b/GPUs/GPU.slurm
similarity index 86%
rename from GPUs/S_GPU_cuB.slurm
rename to GPUs/GPU.slurm
index a7c17469e54c5d9ec968760d2f68bc9543c3a893..62b325c501de332c5c626520c83dd2fa7aa31342 100644
--- a/GPUs/S_GPU_cuB.slurm
+++ b/GPUs/GPU.slurm
@@ -36,9 +36,10 @@ output="/users/guest/petyros/Training/Outputs" ##/Inputs
 partition="gpu"
 ## Change this to the directory of your executable!
 gpu_prog="/users/guest/petyros/Training/GPUs/cuBLAS"
-rm -f "$output/Single_GPU.$partition"
+gpu_prog1="/users/guest/petyros/Training/GPUs/cuBLAS_MultiGPU"
+rm -f "$output/Single_GPU.$partition" "$output/Multi_GPU.$partition"
 
 for n;
 do
-	srun $gpu_prog $n $n >> "$output/Single_GPU.$partition"
+	srun $gpu_prog1 $n $n >> "$output/Multi_GPU.$partition"
 done
diff --git a/GPUs/Makefile b/GPUs/Makefile
index 9848d62998fc1f6bd2ba751d02f6b862d35e3567..375faa97aca2d127018f73a7819c6ea826543efd 100644
--- a/GPUs/Makefile
+++ b/GPUs/Makefile
@@ -10,14 +10,16 @@ CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
 #CFLAGS=-O3 -Wall -xCORE-AVX2 
 #ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I
 
+MPI_PREFIX = $(I_MPI_ROOT)
 CUDA_PREFIX = $(CUDAROOT)
-GPU_OPENMP_CXX = nvcc -Xcompiler -fopenmp -lgomp
+GPU_MPI_CXX = nvcc -L $(I_MPI_ROOT)/lib64 -lmpi
 GPU_CXX = nvcc
 
 LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt
 
 GPU_COMPILE = nvcc -I $(CUDA_PREFIX)/include  -arch sm_35 
-CPU_COMPILE= $(CC) $(CFLAGS)
+GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include  -arch sm_35
+CPU_COMPILE = $(CC) $(CFLAGS) 
 
 ifeq ($(DEBUG), 1)
 	CPU_COMPILE += -D_DEBUG_
@@ -29,12 +31,15 @@ GPU_COMPILE_OBJ= $(GPU_COMPILE) -c
 
 EXT_DIR = /users/guest/petyros/Training/External_Functions/
 
-SOURCE = cuBLAS.cu 
+SOURCE = cuBLAS.cu cuBLAS_MultiGPU.cu
 OBJECTS = util.o matrix_op.o timer.o input.o gpu_util.o
-PROGRAMS= cuBLAS
+PROGRAMS= cuBLAS cuBLAS_MultiGPU
 
 all: $(PROGRAMS)
 
+cuBLAS_MultiGPU: $(OBJECTS) cuBLAS_MultiGPU.cu
+	$(GPU_MPI_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) $@.cu
+
 cuBLAS: $(OBJECTS) cuBLAS.cu
 	$(GPU_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) $@.cu
 
diff --git a/GPUs/cuBLAS b/GPUs/cuBLAS
index a4f89ae43261d12356d9200e91dca87bd8a62178..2fdcdd01c33867b92a5f788e60607b6575ea6294 100755
Binary files a/GPUs/cuBLAS and b/GPUs/cuBLAS differ
diff --git a/GPUs/cuBLAS.cu b/GPUs/cuBLAS.cu
index 02408daf02c787394be7f39ba7953b9eb7a8fbfb..dfffef9920e6faf48a64a059c3692f8684a6025a 100644
--- a/GPUs/cuBLAS.cu
+++ b/GPUs/cuBLAS.cu
@@ -38,6 +38,7 @@ int main(int argc, char **argv)
 	}
 	else error("Too many Arguments");
 
+	/* Allocate space */
 	double *x 			= (double *) malloc(m * sizeof(*x));
 	double *y	= (double *) malloc(n * sizeof(*y));
 	double *M 			= (double *) malloc(n * m * sizeof(*M));
@@ -54,6 +55,7 @@ int main(int argc, char **argv)
 	vec_init_rand(x, m, 1.0);
 	vec_init(y, n, 0.0);
 
+	/* Initialize cuda/cublas variables */
 	int device_num=0;
 	cudaGetDeviceCount(&device_num);
 	if (!device_num) printf("No available Cuda Devices");
@@ -64,6 +66,8 @@ int main(int argc, char **argv)
 	cublasStatus_t stat;
 	cublasHandle_t handle;
 	double *A, * y, *x_c;
+
+	/* Initialize Unified memmory */
 	cudaMallocManaged(&A, m*n * sizeof(double));
 	cudaMallocManaged(&y, n * sizeof(double));
 	cudaMallocManaged(&x_c, m * sizeof(double));
@@ -86,12 +90,13 @@ int main(int argc, char **argv)
 	timer = csecond() - timer;	
 	cudaCheckErrors("cublasDgemv failed");
 	
-#ifdef _DEBUG_ /* Output y in a file for debug purposes */
-        		FILE * fp;
-				char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS.out" ;
-				if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
-        		for (k = 0; k < n; ++k) fprintf(fp, "%lf ", y[k]) ;
-				fclose(fp) ;
+#ifdef _DEBUG_ 
+	/* Output y vector to a file for debugging */
+	FILE * fp;
+	char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS.out" ;
+	if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
+	for (k = 0; k < n; ++k) fprintf(fp, "%lf ", y[k]) ;
+	fclose(fp) ;
 #endif
 	report_results(timer);
 	}
diff --git a/GPUs/cuBLAS_MultiGPU b/GPUs/cuBLAS_MultiGPU
new file mode 100755
index 0000000000000000000000000000000000000000..d7cf7745cfa6d457b305458f378f46353e618113
Binary files /dev/null and b/GPUs/cuBLAS_MultiGPU differ
diff --git a/GPUs/cuBLAS_MultiGPU.cu b/GPUs/cuBLAS_MultiGPU.cu
new file mode 100644
index 0000000000000000000000000000000000000000..05bbb74f0e897b48842ae15d4fd4a8e249a2c8c5
--- /dev/null
+++ b/GPUs/cuBLAS_MultiGPU.cu
@@ -0,0 +1,156 @@
+/*
+ * A Serial implementation of the Matrix-Vector multiplication
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cusparse_v2.h>
+#include <mpi.h>
+#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
+#include "/users/guest/petyros/Training/External_Functions/util.h"
+#include "/users/guest/petyros/Training/External_Functions/input.h"
+#include "/users/guest/petyros/Training/External_Functions/gpu_util.h"
+
+
+
+int main(int argc, char ** argv) 
+{
+    int rank,size;
+    int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
+    int global_padded_nm[2];   //padded global matrix dimensions (if padding is not needed, global_padded=global)
+    int i,j,k, sparse=0, *cooCol, n_z, *I;
+    double * M, *M_cl, * A, * x, * y, *local_y, *x_c, * cooVal, comm_t, comp_t;
+    
+	
+    MPI_Init(&argc,&argv);
+    MPI_Comm_size(MPI_COMM_WORLD,&size);
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+	
+	if (argc < 2) error("Too few Arguments");
+	else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
+	{
+		if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO convertion failed");
+		sparse = 1;
+	}
+	else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
+        global_nm[0]=atoi(argv[1]);
+        global_nm[1]=atoi(argv[2]);	
+	}
+	else error("Too many Arguments");
+
+	/* Padd N if needed */
+   	local_nm[1]=global_nm[1];
+	global_padded_nm[1]=global_nm[1];
+
+	if (global_nm[0]%size==0) {
+		local_nm[0]=global_nm[0]/size;
+		global_padded_nm[0]=global_nm[0];
+	}
+	else {
+		local_nm[0]=(global_nm[0]/size)+1;
+		global_padded_nm[0]=local_nm[0]*size;
+	}
+
+	x =	(double *) malloc(global_padded_nm[1] * sizeof(*x));
+    if (rank==0) {
+		M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M));
+		M_cl = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M_cl));
+		vec_init_rand(x, global_padded_nm[1], 1.0);
+		y = (double *) malloc(global_padded_nm[0] * sizeof(*y));
+		vec_init(y, global_padded_nm[0], 0.0);
+		if( !y || !x || !M || !M_cl ) error("memory allocation failed");
+		
+		/* Initialize matrices */
+		if (sparse) {
+		; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
+		}
+		else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
+	}
+	//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local[0],local[1],global_padded[0],global_padded[1]);
+
+
+	/* Initialize Unified memmory */
+	cudaMallocManaged(&A, local_nm[0] * local_nm[1] * sizeof(double));
+
+	cudaMallocManaged(&local_y, local_nm[0] * sizeof(*local_y));
+	vec_init(local_y, local_nm[0], 0.0);
+
+	cudaMallocManaged(&x_c, m * sizeof(double));
+	cudaDeviceSynchronize();
+	cudaCheckErrors("Unified Alloc failed");
+	if ( !A || !y || !x_c) error("unified alloc failed");
+	matrix_col_major(M, M_cl, n, m);
+
+    /* Rank 0 scatters the global matrix */
+	double * gsendbuf;
+	if (rank == 0){
+		gsendbuf = &(M_cl[0]);
+		comm_t= MPI_Wtime(); 
+	}
+
+	MPI_Scatter(gsendbuf, local_nm[1] * local_nm[0], MPI_DOUBLE, A, local_nm[1] * local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	//if (rank == 0) printf("Scatter complete\n");
+	MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	for (i = 0; i < m; i++) x_c[i] = x[i];
+
+	/* Initialize cuda/cublas variables */
+	int device_num=0;
+	cudaGetDeviceCount(&device_num);
+	if (!device_num) printf("No available Cuda Devices");
+	else {	
+
+	double alf=1.0; /* Y=a*A*x+b */
+	double beta=0.0;
+	cublasStatus_t stat;
+	cublasHandle_t handle;
+	stat = cublasCreate(&handle);
+
+	/* Warmup */
+	stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, A , local_nm[0], x_c, 1, &beta, local_y, 1);
+	cudaDeviceSynchronize();
+
+	if (rank==0) {
+		printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", n, m, device_num, getenv("SLURM_JOB_NUM_NODES"),                    getenv("SLURM_NTASKS_PER_NODE")) ;
+		comp_t= MPI_Wtime(); 
+	}
+
+	for (j = 0; j < NR_ITER; ++j) {	
+			stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, A , local_nm[0], x_c, 1, &beta, local_y, 1);
+			cudaDeviceSynchronize();
+	}	
+	cudaCheckErrors("cublasDgemv failed");
+	
+	MPI_Barrier(MPI_COMM_WORLD);
+	if (rank==0) comp_t= MPI_Wtime() - comp_t;
+	MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	if (rank==0) comm_t = MPI_Wtime() - comm_t - comp_t;
+
+#ifdef _DEBUG_ 
+	/* Output y vector to a file for debugging */
+	if (rank == 0) {
+        FILE * fp;
+		char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS_MultiGPU.out" ;
+		if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
+        for (k = 0; k < global_nm[0]; ++k) fprintf(fp, "%lf ", y[k]) ;
+		fclose(fp) ;
+#endif
+		report_mpi_results(comm_t, comp_t);
+		free(M);
+		free(y);
+	}
+	free(x);
+	free(local_y);
+	free(A);
+
+    MPI_Finalize();
+    return 0;
+}
+
+
+
diff --git a/GPUs/gpu_util.o b/GPUs/gpu_util.o
index 7fa63c29355575a2adfd7d780dcf4607ffe8f4d5..fe0d98b88d49068ba0415350cc29cec8b89f16cb 100644
Binary files a/GPUs/gpu_util.o and b/GPUs/gpu_util.o differ
diff --git a/GPUs/input.o b/GPUs/input.o
index 6151b28af5a1adaddbc320baafafc7a6e77cae1f..3e045d3080b9c47c4c2e73b225b8229efef46427 100644
Binary files a/GPUs/input.o and b/GPUs/input.o differ
diff --git a/GPUs/matrix_op.o b/GPUs/matrix_op.o
index 3bf0cbda49d8badffe50c1522fb9f7894f54f8fc..fe41f317bd01913a48c353e014bb98f244977c4b 100644
Binary files a/GPUs/matrix_op.o and b/GPUs/matrix_op.o differ
diff --git a/GPUs/timer.o b/GPUs/timer.o
index 413acb8cb7879b51ebf1595766945e62cf0869e2..eb6db0f8036cd34a6c8b544aa26049d26f0de053 100644
Binary files a/GPUs/timer.o and b/GPUs/timer.o differ
diff --git a/GPUs/util.o b/GPUs/util.o
index 26ba20a666e79a65dc63bc46da818b49938c90ab..c7553f6c77631def65ffd99e2827628cbd03ddda 100644
Binary files a/GPUs/util.o and b/GPUs/util.o differ
diff --git a/MPI/MPI b/MPI/MPI
deleted file mode 100755
index e471b22f4e92ecacbe5a072943d5cc1dd67b3839..0000000000000000000000000000000000000000
Binary files a/MPI/MPI and /dev/null differ
diff --git a/MPI/MPI-OpenMP b/MPI/MPI-OpenMP
deleted file mode 100755
index 83b9b416469611a79fdda1d54630a93a4a7ef1d4..0000000000000000000000000000000000000000
Binary files a/MPI/MPI-OpenMP and /dev/null differ
diff --git a/MPI/input.o b/MPI/input.o
deleted file mode 100644
index 43c907c26e26ddf12b95de99eeb502441bf6f9df..0000000000000000000000000000000000000000
Binary files a/MPI/input.o and /dev/null differ
diff --git a/MPI/matrix_op.o b/MPI/matrix_op.o
deleted file mode 100644
index dec6c20d24f531a2d445d5435244f3b28883b45f..0000000000000000000000000000000000000000
Binary files a/MPI/matrix_op.o and /dev/null differ
diff --git a/MPI/timer.o b/MPI/timer.o
deleted file mode 100644
index da3a10d0d4a7907c4fb78c303e360254b6c5442e..0000000000000000000000000000000000000000
Binary files a/MPI/timer.o and /dev/null differ
diff --git a/MPI/util.o b/MPI/util.o
deleted file mode 100644
index 0ae3366f4fe19c1f4f84d7fba90e7b9314ae63db..0000000000000000000000000000000000000000
Binary files a/MPI/util.o and /dev/null differ
diff --git a/Outputs/Multi_GPU.gpu b/Outputs/Multi_GPU.gpu
new file mode 100644
index 0000000000000000000000000000000000000000..2910fc56f50ce845c16d2f96323adb41d4f7a91c
--- /dev/null
+++ b/Outputs/Multi_GPU.gpu
@@ -0,0 +1 @@
+Single GPU CUDA Version(N=10000, M=10000): t= 4.058859 ms
diff --git a/Outputs/Single_GPU.gpu b/Outputs/Single_GPU.gpu
deleted file mode 100644
index 2f9c371bc0d2916cbe375003784bc9ff52854f81..0000000000000000000000000000000000000000
--- a/Outputs/Single_GPU.gpu
+++ /dev/null
@@ -1,2 +0,0 @@
-Single GPU CUDA Version(N=1000, M=1000): t= 0.092299 ms
-Single GPU CUDA Version(N=10000, M=10000): t= 4.060280 ms