diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f7b86366735468071392fe87d3db500d6652993f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+**/*.o
+**/*.exe
+**/*.out
+**/*.err
+**/*.debug
+Outputs/
diff --git a/External_Functions/README.md b/External_Functions/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e2be16ac1fb3a79ba1369ef7a95ed8063453d5cd
--- /dev/null
+++ b/External_Functions/README.md
@@ -0,0 +1,25 @@
+
+# Basic helpfull functions used by some of the programs. 
+
+### Function explanation and usage included in corresponding header(.h) files.
+
+Modify at your own risk!
+
+```
+->input.c
+20/05/2017: Completed
+
+->util.c
+06/09/2017: Completed
+
+->matrix_op.c
+13/09/2017: Completed
+14/09/2017: Modified for array transpose
+
+->gpu_util.c
+14/09/2017: Completed
+
+->timer.c
+06/09/2017: Completed
+19/09/2017: Modified for better precision
+```
diff --git a/External_Functions/gpu_util.cu b/External_Functions/gpu_util.cu
new file mode 100755
index 0000000000000000000000000000000000000000..95db97b70f8a94052a593431d22c446cf0be1768
--- /dev/null
+++ b/External_Functions/gpu_util.cu
@@ -0,0 +1,64 @@
+/*
+ *  Some GPU utility functions for SpMV multiplication
+ *  Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ */ 
+
+#include <cuda.h>
+#include <stdio.h>
+#include <cuda_runtime.h>
+#include "gpu_util.h"
+
+const char *gpu_get_errmsg(cudaError_t err)
+{
+    return cudaGetErrorString(err);
+}
+
+const char *gpu_get_last_errmsg()
+{
+    return gpu_get_errmsg(cudaGetLastError());
+}
+
+void cudaCheckErrors(const char * msg)
+{
+        cudaError_t __err = cudaGetLastError();
+        if (__err != cudaSuccess) { 
+            printf("\nFatal error: %s (%s)\n", msg, cudaGetErrorString(__err));
+            exit(1); 
+        }
+}
+
+void *gpu_alloc(size_t count)
+{
+	void *ret;
+	if (cudaMalloc(&ret, count) != cudaSuccess) {
+		printf("Gpu alloc failed: %s\n", gpu_get_last_errmsg());
+		exit(1);
+	}
+	return ret;
+}
+
+void gpu_free(void *gpuptr)
+{
+    cudaFree(gpuptr);
+}
+
+int copy_to_gpu(const void *host, void *gpu, size_t count)
+{
+	if (cudaMemcpy(gpu, host, count, cudaMemcpyHostToDevice) != cudaSuccess){
+		printf("Copy to GPU failed: %s\n", gpu_get_last_errmsg());
+		exit(1);
+	}   
+	return 1;
+}
+
+int copy_from_gpu(void *host, const void *gpu, size_t count)
+{
+	if (cudaMemcpy(host, gpu, count, cudaMemcpyDeviceToHost) != cudaSuccess){
+		printf("Copy to Host failed: %s\n", gpu_get_last_errmsg());
+		exit(1);
+	}   
+	return 1;
+}
+
+
+
diff --git a/External_Functions/gpu_util.h b/External_Functions/gpu_util.h
new file mode 100755
index 0000000000000000000000000000000000000000..b9c41741f27cf5e2ea069366caffed075e11dac7
--- /dev/null
+++ b/External_Functions/gpu_util.h
@@ -0,0 +1,14 @@
+/*
+ *  some GPU utility functions
+ *  Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ *	Use of Unified memory is advised, but not always optimal
+ */  
+
+void gpu_free(void *gpuptr);									/* Free GPU memory*/
+void *gpu_alloc(size_t count);									/* Allocate 'count' bytes in GPU memory (error safe) */
+int copy_to_gpu(const void *host, void *gpu, size_t count);		/* Copy 'count' bytes from host to gpu memory (error safe)  */
+int copy_from_gpu(void *host, const void *gpu, size_t count);	/* Copy 'count' bytes from gpu to host memory (error safe) */
+
+void cudaCheckErrors(const char * msg);							/* Check GPU for errors */
+const char *gpu_get_errmsg(cudaError_t err);
+const char *gpu_get_last_errmsg();
diff --git a/External_Functions/input.c b/External_Functions/input.c
new file mode 100644
index 0000000000000000000000000000000000000000..0ad6c5d10b614971289b24d8daf46bbb3d4ecf1c
--- /dev/null
+++ b/External_Functions/input.c
@@ -0,0 +1,178 @@
+/*
+ * Some basic functions for mtx reading and formating
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ */
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include "matrix_op.h"
+#include "util.h"
+#include "input.h"
+
+int mtx_read(int ** I, int ** cooCol, double ** cooVal, int * n, int * m, int * n_z, char * name)
+{
+	
+	char c;
+	char *type, *format, *var_type, *symmetry, *string=NULL;
+	FILE *fp ;
+	size_t len=0;
+
+	if ((fp=fopen(name, "r"))==NULL) error("Invalid File");
+	else if ((strstr(name, "mtx"))==NULL) error("Invalid File Type (*.mtx)");
+
+	getline(&string, &len, fp);
+	strtok(string," ");
+	type = strtok(NULL," ");
+	format = strtok(NULL," ");
+	var_type = strtok(NULL," ");
+	symmetry = strtok(NULL,"\n");
+	//printf("type=%s, format=%s, var_type=%s, ", type, format, var_type);
+	if (strcmp(type,"matrix")){
+		printf("type=%s unsupported...terminating\n\n\n\n\n\n\n\n\n\n\n\n", type);
+		exit(1);
+	}
+	if (strcmp(format,"coordinate")){
+		printf("format=%s unsupported...terminating\n\n\n\n\n\n\n\n\n\n\n\n", format);
+		exit(1);
+	}
+	if (strcmp(var_type,"integer") && strcmp(var_type,"real")){
+		printf("Var_type=%s unsupported...terminating\n\n\n\n\n\n\n\n\n\n\n\n", var_type);
+		exit(1);
+	}
+	while((c=getc(fp))=='%') while( (c=getc(fp))!='\n') ; 
+	ungetc(c, fp);
+	int k, lines = 0, sym_k=0;
+	fscanf(fp,"%d %d %d", n, m, &lines);
+	//printf("n=%d, m=%d, lines=%d, ", *n, *m, lines);
+	
+	*n_z = 0;
+	if (!strcmp(symmetry,"symmetric")){
+		get_nz_symmetric(n_z, name);
+		//printf("symmetry=symmetric\n");
+	}
+	else if (!strcmp(symmetry,"general")) {
+		*n_z=lines;
+		//printf("symmetry=general\n");
+	}
+	else {
+		printf("Invalid symmetry value:%s\n", symmetry); 
+		return 0; 
+	}
+	//printf("n_z=%d\n", *n_z);
+	*I = (int *) malloc(*n_z * sizeof(int));
+	*cooCol = (int *) malloc(*n_z * sizeof(int));
+	*cooVal = (double *) malloc(*n_z * sizeof(double));
+	double dum;
+	if ( !*I || !*cooCol || !*cooVal ) return 0;
+	if (!strcmp(symmetry,"symmetric")){
+		for (k = 0; k < lines; k++) {
+			fscanf(fp,"%d %d %lf", &((*I)[sym_k]), &((*cooCol)[sym_k]), &dum);
+			(*cooVal)[sym_k]=(double) dum;
+			(*I)[sym_k]--;
+			(*cooCol)[sym_k]--;
+			sym_k++;
+			if ((*I)[sym_k-1] != (*cooCol)[sym_k-1]) {
+				(*I)[sym_k] = (*cooCol)[sym_k-1];
+				(*cooCol)[sym_k] = (*I)[sym_k-1];
+				(*cooVal)[sym_k] = (*cooVal)[sym_k-1];
+				sym_k++;
+			}
+		}
+		if (sym_k!=*n_z){
+			printf("Error in symmetric read: sym_k=%d n_z=%d\n", sym_k, *n_z);
+			return 0;
+		}
+	}
+	else if (!strcmp(symmetry,"general")) for (k = 0; k < lines; k++){
+		fscanf(fp,"%d %d %lf", &((*I)[k]), &((*cooCol)[k]), &dum);
+		(*cooVal)[k] = (double) dum;
+		(*I)[k]--;
+		(*cooCol)[k]--;
+	}
+	quickSort( *I, *cooCol, *cooVal, 0, *n_z-1);
+	fclose(fp);
+	return 1;
+}
+
+void get_nz_symmetric( int * n_z, char* name)
+{
+	char c;
+	FILE *fp ;
+	if ((fp=fopen(name, "r"))==NULL){
+		printf("Problem in symmetric read pass\n");
+		exit(1);
+	}
+
+	while((c=getc(fp))=='%') while( (c=getc(fp))!='\n') ; 
+	ungetc(c, fp);
+	int k, i, j, n, m, lines;
+	double x;
+	fscanf(fp,"%d %d %d", &n, &m, &lines);
+	for (k = 0; k < lines; k++){
+		fscanf(fp,"%d %d %lf", &i, &j, &x);
+		(*n_z)++;
+		if(i!=j) (*n_z)++;
+	}
+}
+
+void csr_transform(double ** A, int n, int m, int n_z, double  *csrValA, int *csrRowPtrA, int *csrColIndA)
+{
+	int i,j,k=0;
+	for (i = 0; i < n; i++){
+		csrRowPtrA[i]=k;
+		for (j = 0; j < m; j++){
+			if (A[i][j]!=0.0){
+				csrValA[k]=A[i][j];
+				csrColIndA[k]= j;
+				k++;
+			}
+		}
+	}
+	csrRowPtrA[i]=k;
+	if (k!=n_z) printf("Error at non zeroes: %d\n", k-n_z);
+	return;
+}			
+
+void quickSort( int *a, int * b, double * c, int l, int r)
+{
+	int j;
+	if( l < r ) 
+	{	// divide and conquer		
+		j = partition( a, b, c, l, r);
+		quickSort( a, b, c, l, j-1);
+		quickSort( a, b, c, j+1, r);
+	}
+}
+
+int partition( int *a, int * b, double * c, int l, int r) 
+{
+	int pivot, i, j, t;
+	double t1;
+	pivot = a[l];
+	i = l; j = r+1;
+		
+	while(1)
+	{
+		do ++i; while( a[i] <= pivot && i <= r );
+   		do --j; while( a[j] > pivot );
+   		if( i >= j ) break;
+   		t = a[i]; a[i] = a[j]; a[j] = t;
+		t = b[i]; b[i] = b[j]; b[j] = t;
+		t1 = c[i]; c[i] = c[j]; c[j] = t1;
+   	}
+   	t = a[l]; a[l] = a[j]; a[j] = t;
+	t = b[l]; b[l] = b[j]; b[j] = t;
+	t1 = c[l]; c[l] = c[j]; c[j] = t1;
+   	return j;
+}
+
+
+
+
+
+
+
+
+
diff --git a/External_Functions/input.h b/External_Functions/input.h
new file mode 100644
index 0000000000000000000000000000000000000000..c86b655ca47844ffc71a88b67ef4ebedd96dd5b4
--- /dev/null
+++ b/External_Functions/input.h
@@ -0,0 +1,19 @@
+/*
+ * Some basic functions for mtx reading and formating
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ */
+
+/* a quicksort implementation (required for sorted coo impementation) */
+void quickSort( int *a, int * b, double * c, int l, int r);
+int partition( int *a, int * b, double * c, int l, int r);
+
+/* Read functions for sparce matrices (mtx format) */
+void get_nz_symmetric( int * n_z, char* name);
+int mtx_read(int ** I, int ** cooCol, double ** cooVal, int * n, int * m, int * n_z, char * name);
+void csr_transform(double **, int, int, int, double *, int *, int *);
+int read_mtx_coo(char *, int *, int * , double *, int *, int * , int * );
+
+
+
+
diff --git a/External_Functions/matrix_op.c b/External_Functions/matrix_op.c
new file mode 100755
index 0000000000000000000000000000000000000000..309aaa1f8df8fc1498a4106a687bba8ce3896717
--- /dev/null
+++ b/External_Functions/matrix_op.c
@@ -0,0 +1,96 @@
+/*
+ *  
+ *  matrix_op.c -- Basic Matrix transform/split operations
+ *  
+ *	Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ *
+ */ 
+
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include "matrix_op.h"
+
+void vec_init(double *v, size_t n, double val)
+{
+    size_t  i;
+    for (i = 0; i < n; ++i) {
+        v[i] = val;
+    }
+}
+
+void vec_init_rand(double *v, size_t n, double max)
+{
+    srand48(42);   // should only be called once
+    size_t  i;
+    for (i = 0; i < n; ++i) {
+        v[i] = max*(double) drand48();
+    }
+}
+
+void vec_init_rand_p(double *v, size_t n, size_t np, double max)
+{
+    srand48(42);   // should only be called once
+    size_t  i;
+    for (i = 0; i < n; ++i) {
+        v[i] = (double) drand48();
+    }
+	for (i = n; i < np; ++i) {
+        v[i] = 0.0;
+    }
+}
+
+void matrix_init_rand(double **v, size_t n, size_t m, double max)
+{
+    srand48(42);   // should only be called once
+    size_t  i,j;
+    for (i = 0; i < n; ++i) 
+		for (j = 0; j < m; ++j) 
+        	v[i][j] = max*(double) drand48();
+    
+}
+
+void ser_matrix_init_rand(double *v, size_t n, size_t m, double max)
+{
+    srand48(42);   // should only be called once
+    size_t  i,j;
+    for (i = 0; i < n; ++i) 
+		for (j = 0; j < m; ++j) 
+        	v[i*m+j] = max*(double) drand48();
+    
+}
+
+void ser_matrix_init_rand_p(double *v, size_t n, size_t m, size_t np, double max)
+{
+    srand48(42);   // should only be called once
+    size_t  i,j;
+    for (i = 0; i < n; ++i) 
+		for (j = 0; j < m; ++j) 
+        	v[i*m+j] = max*(double) drand48();
+    for (i = n*m; i < n*m+np; ++i) v[i] = 0.0;
+}
+
+void matrix_col_major(double *M, double *A, size_t n, size_t m)
+{
+	size_t  i,j;
+    for (i = 0; i < n; ++i) 
+		for (j = 0; j < m; ++j)
+			A[j*n+i] = M[i*m+j];
+}
+	
+void matrix_row_major(double **M, double *A, size_t n, size_t m)
+{
+	size_t  i,j;
+    for (i = 0; i < n; ++i) 
+		for (j = 0; j < m; ++j)
+			A[i*n+j] = M[i][j];
+}
+	
+
+void regenerate_matrix_coo(double **M, int *I, int *cooCol, double *cooVal, int n, int m, int n_z)
+{
+	int  i;
+    for (i = 0; i < n_z; ++i) M[I[i]][cooCol[i]] = cooVal[i];
+}
+
+
diff --git a/External_Functions/matrix_op.h b/External_Functions/matrix_op.h
new file mode 100755
index 0000000000000000000000000000000000000000..3b763f70c1e9b551fc6fa93fe7935b82ff74c4b5
--- /dev/null
+++ b/External_Functions/matrix_op.h
@@ -0,0 +1,19 @@
+/*
+ *  
+ *  matrix_op.h -- Basic Matrix transform/split operations
+ *  
+ *	Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ *
+ */ 
+
+void vec_init(double *v, size_t n, double val); 						/* Initialize n vector to 'val' */
+void vec_init_rand(double *v, size_t n, double max);					/* Initialize n vector to random values between 0 and 'max' (Constant seed for error checking) */
+void vec_init_rand_p(double *v, size_t n, size_t np, double max);		/* Initialize n+np vector to random values between 0 and 'max' for n elems and 0.0 for padding */
+void matrix_init_rand(double **v, size_t n, size_t m, double max);		/* Initialize v[n][n] matrix to random values between 0 and 'max' */
+void ser_matrix_init_rand(double *v, size_t n, size_t m, double max);	/* Initialize v[n*m] matrix to random values between 0 and 'max' */
+void ser_matrix_init_rand_p(double *v, size_t n, size_t m, size_t np, double max); /* Initialize v[n*m+np] matrix to random values between 0 and 'max' for n*m elems and 0.0 for padding */
+
+void matrix_col_major(double *M, double *A, size_t n, size_t m);		/* Transform row major M[n*m] to column major A[n*m]  */
+void matrix_row_major(double **M, double *A, size_t n, size_t m);		/* Transform column major M[n][m] to column major A[n*m]  */
+
+void regenerate_matrix_coo(double **M, int * I, int * cooCol, double * cooVal, int n, int m, int n_z);	/* Generate (sparse?) M[n][m] matrix from given COO format  */
diff --git a/External_Functions/timer.c b/External_Functions/timer.c
new file mode 100755
index 0000000000000000000000000000000000000000..24141df73429715446cdaf4fb01e30da8b4a7082
--- /dev/null
+++ b/External_Functions/timer.c
@@ -0,0 +1,23 @@
+#include <stdio.h>
+#include <time.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+double csecond(void) {
+
+    struct timespec tms;
+
+    if (clock_gettime(CLOCK_REALTIME,&tms)) {
+        return (0.0);
+    }
+    /* seconds, multiplied with 1 million */
+    int64_t micros = tms.tv_sec * 1000000;
+    /* Add full microseconds */
+    micros += tms.tv_nsec/1000;
+    /* round up if necessary */
+    if (tms.tv_nsec % 1000 >= 500) {
+        ++micros;
+    }
+    return( (double) micros /1000000.0) ;
+}
+
diff --git a/External_Functions/timer.h b/External_Functions/timer.h
new file mode 100755
index 0000000000000000000000000000000000000000..0b08bf43f5bf06a6c77bbe24f8d9e9606758e43e
--- /dev/null
+++ b/External_Functions/timer.h
@@ -0,0 +1 @@
+double csecond(void); /* A function to calculate current time. Use: double x = csecond(); ... y = csecond(); time = y-x; */
diff --git a/External_Functions/util.c b/External_Functions/util.c
new file mode 100644
index 0000000000000000000000000000000000000000..ee21bb51d82b36a863eecf71d01ce587daf635b1
--- /dev/null
+++ b/External_Functions/util.c
@@ -0,0 +1,44 @@
+/*
+ * util.c -- Some usefull functions for error checking
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "util.h"
+
+void error(const char * msg)
+{
+	perror(msg);
+	exit(1);
+}
+
+void check_result(double *test, double *orig, size_t n)
+{
+	size_t  i_fail = vec_equals(test, orig, n, 0.00001);
+	if (!i_fail) ; //printf("Checked, ");
+	else printf("FAILED %ld times", i_fail );
+}
+
+void report_results(double timer)
+{
+	printf("t= %lf ms\n",1000.0/NR_ITER*timer);
+}
+
+void report_mpi_results(double comm_timer, double comp_timer)
+{
+	printf("comp_t= %lf ms, comm_t= %lf ms\n",1000.0/NR_ITER*comp_timer, 1000.0*comm_timer);
+}
+
+int vec_equals(const double *v1, const double *v2, size_t n, double eps)
+{
+	size_t  i,k=0;
+    	for (i = 0; i < n; ++i) {
+		if (fabs(v1[i] - v2[i]) > eps) k++;	
+    	}
+	return k;
+}
diff --git a/External_Functions/util.h b/External_Functions/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..19d2d40c9f6168471d564c27f1d1be5c70b13a14
--- /dev/null
+++ b/External_Functions/util.h
@@ -0,0 +1,16 @@
+/*
+ * util.h -- Some usefull functions for error checking
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ */
+
+#include "timer.h"
+
+#define NR_ITER 100
+
+void error(const char * msg); /* A function for error printing and exiting */
+void report_results(double time); /* Print timer results */
+void report_mpi_results(double comm_timer, double comp_timer);
+
+void check_result(double *test, double *orig, size_t n); /* Check vector result  */
+int vec_equals(const double *v1, const double *v2, size_t n, double eps); /* Check vector v1[n], v2[n] equality with 'eps' precision  */
diff --git a/GPUs/GPU.slurm b/GPUs/GPU.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..86653c5d524e6fd371140f6aa9b48081b05b393a
--- /dev/null
+++ b/GPUs/GPU.slurm
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+##############################################
+#     ARIS slurm script template  			 #
+#                                  			 #
+# Submit script: sbatch GPU.slurm n1 n2 ...  #
+#                                  			 #
+##############################################
+
+
+#SBATCH --job-name=run_GPU   # Job name
+#SBATCH --output=GPU.out 
+#SBATCH --error=GPU.err 
+#SBATCH --ntasks=32   # Number of processor cores (i.e. tasks)
+#SBATCH --nodes=16    # Number of nodes requested
+#SBATCH --ntasks-per-node=2     # Tasks per node
+#SBATCH --cpus-per-task=1     # Threads per task
+#SBATCH --gres=gpu:2 # GPUs per node
+#SBATCH --time=00:40:00   # walltime
+#SBATCH --mem=32G   # memory per NODE
+#SBATCH --partition=gpu  # Partition
+#SBATCH --account=testproj  # Accounting project
+
+## LOAD MODULES ##
+module purge		# clean up loaded modules 
+
+# load necessary modules
+module load gnu
+module load intel
+module load intelmpi
+module load binutils
+module load cuda
+
+export I_MPI_FABRICS=shm:dapl
+
+## Change this to the directory of your executable!
+gpu_prog="./cuda_SingleGPU.exe"
+gpu_prog1="./cuBLAS.exe"
+gpu_prog2="./cuBLAS_MultiGPU.exe"
+
+for n; 
+do
+	#srun $gpu_prog $n $n >> temp.out
+	#srun $gpu_prog1 $n $n >> temp.out
+#	Important note: In MultiGPU version you must use gres=ntasks-per-node values in order to utilize all GPUs !!!
+  	srun $gpu_prog2 $n $n >> temp.out
+	
+done
diff --git a/GPUs/Makefile b/GPUs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..66ca83207858a2cf7967523146599e0dca3a9a9b
--- /dev/null
+++ b/GPUs/Makefile
@@ -0,0 +1,66 @@
+CC=g++
+ICC =icc
+
+DEBUG ?= 0  # Set to 1 for debug
+
+CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
+#CFLAGS=-O3 -lm -Wall -mavx2 -mfma  -march=haswell -mtune=haswell 
+
+#CFLAGS=-O3 -Wall -xCORE-AVX-I
+#CFLAGS=-O3 -Wall -xCORE-AVX2 
+#ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I
+
+# Need to -I this for user-defined functions to work
+EXT_DIR = ../External_Functions/
+
+MPI_PREFIX = $(I_MPI_ROOT)
+CUDA_PREFIX = $(CUDAROOT)
+GPU_MPI_CXX = nvcc -L $(I_MPI_ROOT)/lib64 -lmpi -ccbin mpiicc
+GPU_CXX = nvcc
+
+LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt 
+
+GPU_COMPILE = nvcc -I $(CUDA_PREFIX)/include  -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
+GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
+CPU_COMPILE = $(CC) $(CFLAGS) -I$(EXT_DIR) $(LDFLAGS)
+
+ifeq ($(DEBUG), 1)
+	CPU_COMPILE 	+= -D_DEBUG_
+	GPU_COMPILE 	+= -D_DEBUG_
+	GPU_MPI_COMPILE += -D_DEBUG_
+endif
+
+CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
+GPU_COMPILE_OBJ= $(GPU_COMPILE) -c
+
+
+
+SOURCE = cuBLAS.cu cuBLAS_MultiGPU.cu cuda_SingleGPU.cu
+OBJECTS = util.o matrix_op.o timer.o input.o gpu_util.o dmv_gpu.o
+PROGRAMS= cuBLAS.exe cuBLAS_MultiGPU.exe cuda_SingleGPU.exe
+
+all: $(PROGRAMS)
+
+cuda_SingleGPU.exe: $(OBJECTS) cuda_SingleGPU.cu
+	$(GPU_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) cuda_SingleGPU.cu
+
+cuBLAS_MultiGPU.exe: $(OBJECTS) cuBLAS_MultiGPU.cu
+	$(GPU_MPI_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) cuBLAS_MultiGPU.cu
+
+cuBLAS.exe: $(OBJECTS) cuBLAS.cu
+	$(GPU_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) cuBLAS.cu
+
+gpu_util.o: $(EXT_DIR)gpu_util.cu
+	$(GPU_COMPILE_OBJ) -o $@ $<
+
+dmv_gpu.o: dmv_gpu.cu
+	$(GPU_COMPILE_OBJ) -o $@ $<
+
+%.o: $(EXT_DIR)%.c
+	$(CPU_COMPILE_OBJ) -o $@ $<
+
+%.o: %.h
+
+clean:
+	$(RM) $(PROGRAMS) $(OBJECTS)
+
diff --git a/GPUs/README.md b/GPUs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7f5716d7e57a125ff949a5c02edf626d4e63bb74
--- /dev/null
+++ b/GPUs/README.md
@@ -0,0 +1,19 @@
+# A single GPU impementation of the Matrix-Vector algorithm with:
+
+```
+->cuBLAS(BLAS routines implemented on the GPU by NVIDIA)
+07/09/2017: Completed
+13/09/2017: Modified to use unified memory
+
+->cuBLAS_MultiGPU(cuBLAS implementation in multiple GPUs/Nodes)
+26/09/2017: Completed
+
+->cuda_SingleGPU(3 cuda kernels showing the optimization steps in writing GPU code)
+02/10/2017: Completed kernel 1
+03/10/2017: Completed kernel 2 & 3
+
+Tested environments:
+- Haswell Intel Xeon E5-2660v3 CPU with Linux x86_64 + Nvidia Tesla K40 GPUs and cuda/8.0.61
+
+```
+
diff --git a/GPUs/cuBLAS.cu b/GPUs/cuBLAS.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d2c10ef33400013eef769ea5f56669ec7910a397
--- /dev/null
+++ b/GPUs/cuBLAS.cu
@@ -0,0 +1,100 @@
+/*
+ * A cuBLAS implementation of the Matrix-Vector multiplication
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ *
+ * For more info about cuBLAS see http://docs.nvidia.com/cuda/cublas/index.html 
+ *
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cusparse_v2.h>
+/* Need to include External_Functions for these */
+#include "matrix_op.h"
+#include "util.h"
+#include "input.h"
+#include "gpu_util.h"
+
+
+int main(int argc, char **argv)
+{
+	/* Initializations */
+	int i, j, n, m;
+	double timer;
+
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);		
+	}
+	else error("Too many Arguments");
+
+	/* Allocate space */
+	double *x 			= (double *) malloc(m * sizeof(*x));
+	double *M 			= (double *) malloc(n * m * sizeof(*M));
+	if( !x || !M ) error("memory allocation failed");
+
+	/* Initialize matrices */
+	ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+
+	/* Initialize vectors */
+	vec_init_rand(x, m, 1.0);
+
+	/* Initialize cuda/cublas variables */
+	int device_num=0;
+	cudaGetDeviceCount(&device_num);
+	if (!device_num) {
+		printf("No available Cuda Devices...terminating");
+		return 0;
+	}
+	double alf=1.0; /* Y=a*A*x+b */
+	double beta=0.0;
+	cublasHandle_t handle;
+	double *A, * y, *x_c;
+	
+	printf("Single GPU cuBLAS Version(N=%d, M=%d): ", n, m);
+
+	/* Initialize Unified memmory visible and accesible from both CPU and GPU */
+	cudaMallocManaged(&A, m*n * sizeof(double));
+	cudaMallocManaged(&y, n * sizeof(double));
+	cudaMallocManaged(&x_c, m * sizeof(double));
+	cudaDeviceSynchronize();
+	cudaCheckErrors("Unified Alloc failed");
+	if ( !A || !y || !x_c) error("unified alloc failed");
+	for (i = 0; i < m; i++) x_c[i] = x[i];
+	matrix_col_major(M, A, n, m); /* We transpose the matrix because cuBLAS works with column-major format */
+	cublasCreate(&handle);
+
+	/*GPU Warmup */
+	cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
+	cudaDeviceSynchronize();
+
+	timer=csecond();
+	for (j = 0; j < NR_ITER; ++j) {	
+			cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
+			cudaDeviceSynchronize();
+	}
+	timer = csecond() - timer;	
+	cudaCheckErrors("cublasDgemv failed");
+	
+#ifdef _DEBUG_ 
+	/* Output y vector to a file for debugging */
+	FILE * fp;
+	char filename[] = "cuBLAS.debug" ; /* Common directory for all implementations, change if needed */
+	if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
+	for (i = 0; i < n; ++i) fprintf(fp, "%lf ", y[i]) ;
+	fclose(fp) ;
+#endif
+	report_results(timer);
+
+	return 0;
+}
+
+
+
+
diff --git a/GPUs/cuBLAS_MultiGPU.cu b/GPUs/cuBLAS_MultiGPU.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f8258dc46e7045b7eaf26e3e7a12dad31c79c106
--- /dev/null
+++ b/GPUs/cuBLAS_MultiGPU.cu
@@ -0,0 +1,171 @@
+/*
+ * A Hybrid MPI-CUDA implementation of the Matrix-Vector multiplication
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ *
+ * For more info about hybrid MPI-CUDA (cublas here) see https://devblogs.nvidia.com/parallelforall/introduction-cuda-aware-mpi/
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cusparse_v2.h>
+#include <mpi.h>
+/* Need to include External_Functions for these */
+#include "matrix_op.h"
+#include "util.h"
+#include "input.h"
+#include "gpu_util.h"
+
+
+
+int main(int argc, char ** argv) 
+{
+    int rank,size;
+    int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
+    int global_padded_nm[2];   //padded global matrix dimensions (if padding is not needed, global_padded=global)
+    int i, j;
+    double * M, *M_cl, * A, * x, * y, *local_y, *x_c, comm_t, comp_t;
+    
+	/* MPI basic initializations */
+    MPI_Init(&argc,&argv);
+    MPI_Comm_size(MPI_COMM_WORLD,&size);
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+	
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		global_nm[0] = atoi(argv[1]);
+		global_nm[1] = atoi(argv[2]);		
+	}
+	else error("Too many Arguments");
+
+	/* Padd M in 'size' equal pieces */
+   	local_nm[0]=global_nm[0];
+	global_padded_nm[0]=global_nm[0];
+
+	if (global_nm[1]%size==0) {
+		local_nm[1]=global_nm[1]/size;
+		global_padded_nm[1]=global_nm[1];
+	}
+	else {
+		local_nm[1]=(global_nm[1]/size)+1;
+		global_padded_nm[1]=local_nm[1]*size;
+	}
+
+	x =	(double *) malloc(global_padded_nm[1] * sizeof(*x));
+
+    if (rank==0) {
+		/* Initialize proc 0 memmory/data */
+		M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M));
+		M_cl = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M_cl));
+		vec_init_rand_p(x, global_nm[1], global_padded_nm[1] - global_nm[1], 1.0);
+		y = (double *) malloc(global_padded_nm[0] * sizeof(*y));
+		vec_init(y, global_padded_nm[0], 0.0);
+		if( !y || !x || !M || !M_cl ) error("memory allocation failed");
+		
+		/* Initialize matrices */
+		ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
+	}
+
+	//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local_nm[0],local_nm[1],global_padded_nm[0],global_padded_nm[1]);
+
+	/* Initialize process local memmory */
+	local_y = (double *) malloc(local_nm[0] * sizeof(*local_y));
+	vec_init(local_y, local_nm[0], 0.0);
+	A = (double *) malloc(local_nm[0] * local_nm[1] * sizeof(*A));
+	x_c = (double *) malloc(local_nm[1] * sizeof(*x_c));
+	if ( !A || !local_y || !x_c) error("Process local alloc failed");
+	
+	/* Unlike the MPI code, we want each proccess data to be in a good shape for GPU utilization. Thats why we transpose the matrix and we scatter it M dimension-wise */
+	if(rank == 0) matrix_col_major(M, M_cl, global_padded_nm[0], global_padded_nm[1]);
+
+    /* Rank 0 scatters the global matrix and broadcasts x vector */
+	double * gsendbuf;
+	if (rank == 0){
+		gsendbuf = &(M_cl[0]);
+		comm_t= MPI_Wtime(); 
+	}
+
+	MPI_Scatter(gsendbuf, local_nm[1] * local_nm[0], MPI_DOUBLE, A, local_nm[1] * local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	//if (rank == 0) printf("Scatter complete\n");
+	MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	if (rank == 0) comm_t= MPI_Wtime() - comm_t; 	
+	for (i = 0; i < local_nm[1]; i++) x_c[i] = x[rank*local_nm[1] + i];
+
+	/* Initialize cuda/cublas variables */
+	int device_num=0;
+	cudaGetDeviceCount(&device_num);
+	if (!device_num) printf("No available Cuda Devices");
+	else {	
+	cudaSetDevice(rank%device_num);
+	double alf=1.0; /* Y=a*A*x+b */
+	double beta=0.0;
+	cublasHandle_t handle;
+	cublasCreate(&handle);
+
+	/* Initialize local GPU memmory. Unified memmory not recomended for MultiGPU+Multinode because data size tends to be large (possible performance degradation) */
+	double * gpu_y 	= (double *) gpu_alloc(local_nm[0] * sizeof(*gpu_y)) ;
+	double * gpu_xc = (double *) gpu_alloc(local_nm[1] * sizeof(*gpu_xc)) ;
+	double * gpu_A 	= (double *) gpu_alloc(local_nm[0] * local_nm[1] * sizeof(*gpu_A)) ;
+	
+	/* Copy data to GPU memmory */
+	copy_to_gpu(local_y, gpu_y, local_nm[0] * sizeof(*local_y));
+	copy_to_gpu(x_c, gpu_xc, local_nm[1] * sizeof(*x_c));
+	copy_to_gpu(A, gpu_A, local_nm[0] * local_nm[1] * sizeof(*A));
+
+	/* Warmup */
+	cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
+	cudaDeviceSynchronize();
+
+	if (rank==0) {
+		printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", local_nm[0], local_nm[1], device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE")) ;
+		comp_t= MPI_Wtime(); 
+	}
+	
+	for (j = 0; j < NR_ITER; ++j) {	
+			cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
+			cudaDeviceSynchronize();
+	}	
+	cudaCheckErrors("cublasDgemv failed");
+
+	MPI_Barrier(MPI_COMM_WORLD);
+	if (rank==0) comp_t= MPI_Wtime() - comp_t;
+	copy_from_gpu(local_y, gpu_y, local_nm[0] * sizeof(*local_y));
+	cudaDeviceSynchronize();
+	MPI_Barrier(MPI_COMM_WORLD);
+
+	if (rank==0) comm_t= MPI_Wtime() - comm_t;
+	MPI_Reduce(local_y, y, local_nm[0], MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	if (rank==0) comm_t = MPI_Wtime() - comm_t;
+
+	if (rank == 0) {
+#ifdef _DEBUG_ 
+		/* Output y vector to a file for debugging */
+        FILE * fp;
+		char filename[] = "cuBLAS_MultiGPU.debug" ; /* Common directory for all implementations, change if needed */
+		if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
+        for (i = 0; i < global_nm[0]; ++i) fprintf(fp, "%lf ", y[i]) ;
+		fclose(fp) ;
+#endif
+		report_mpi_results(comm_t, comp_t);
+		/* Free rank 0 local memmory */
+		free(M);
+		free(M_cl);
+		free(y);
+		free(x);
+	}
+	/* Free GPU memmory */
+	gpu_free(local_y);
+	gpu_free(A);
+	gpu_free(x_c);
+	}
+
+    MPI_Finalize();
+    return 0;
+}
+
+
+
diff --git a/GPUs/cuda_SingleGPU.cu b/GPUs/cuda_SingleGPU.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f580485260d3c1b910a5715eda9d0aac2dc20875
--- /dev/null
+++ b/GPUs/cuda_SingleGPU.cu
@@ -0,0 +1,132 @@
+/*
+ * A cuda implementation of the Matrix-Vector multiplication
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ *
+ * The device code for the GPU implemntations can be found in the 'dmv_gpu.cu' file
+ *
+ * For more info about coalesced memmory access and shmem, see https://cvw.cac.cornell.edu/gpu/coalesced
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cusparse_v2.h>
+#include "dmv_gpu.h"
+/* Need to include External_Functions for these */
+#include "matrix_op.h"
+#include "util.h"
+#include "input.h"
+#include "gpu_util.h"
+
+#define block_size 256 /* Number of GPU threads per block. Modifying this value might lead to performance issues */ 
+
+int main(int argc, char **argv)
+{
+	/* Initializations */
+	int i, j, n, m;
+	double timer;
+
+
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);		
+	}
+	else error("Too many Arguments");
+
+	int grid_size = (n-1)/block_size + 1; 
+	size_t shmem_size = 0;
+
+	/* GPU kernel block/grid sizes */
+	dim3 gpu_block(block_size, 1);
+    dim3 gpu_grid(grid_size, 1);
+
+
+	/* Allocate space */
+	double *x 			= (double *) malloc(m * sizeof(*x));
+	double *y	= (double *) malloc(n * sizeof(*y));
+	double *M 			= (double *) malloc(n * m * sizeof(*M));
+
+	if( !y || !x || !M ) error("memory allocation failed");
+
+	/* Initialize matrices */
+	ser_matrix_init_rand(M, n, m, 1.0); /* Normal matrices generated randomly */
+
+	/* Initialize vectors */
+	vec_init_rand(x, m, 1.0);
+	vec_init(y, n, 0.0);
+
+	/* Initialize cuda variables */
+	int device_num=0;
+	cudaGetDeviceCount(&device_num);
+	if (!device_num) printf("No available Cuda Devices");
+	else {	
+	cudaSetDevice(0);
+	printf("Single GPU CUDA Version(N=%d, M=%d): ", n, m);
+	double *A, * y, *x_c;
+	
+	/* Initialize Unified memmory visible and accesible from both CPU and GPU */
+	cudaMallocManaged(&A, m*n * sizeof(double));
+	cudaMallocManaged(&y, n * sizeof(double));
+	cudaMallocManaged(&x_c, m * sizeof(double));
+	cudaDeviceSynchronize();
+	cudaCheckErrors("Unified Alloc failed");
+	if ( !A || !y || !x_c) error("unified alloc failed");
+	for (i = 0; i < m; i++) x_c[i] = x[i];
+
+	/* First naive kernel */
+	for ( i = 0; i < n*m; i++) A[i] = M[i] ;	
+	timer=csecond();
+	for (j = 0; j < NR_ITER; ++j) {	
+			dmv_gpu_naive<<<gpu_grid,gpu_block,shmem_size>>>(A, x_c, y, m);
+			cudaDeviceSynchronize();
+	}
+	timer = csecond() - timer;
+	cudaCheckErrors("naive kernel failed");
+	report_results(timer);
+	
+	/* Second kernel, using coalesced memmory accesses in the GPU by transposing the matrix. */
+	printf("Single GPU CUDA Coalesced Version(N=%d, M=%d): ", n, m);
+	matrix_col_major(M, A, n, m); /* We transpose the matrix to better match the GPU coalesced memmory access logic */
+
+	timer=csecond();
+	for (j = 0; j < NR_ITER; ++j) {	
+			dmv_gpu_coalesced<<<gpu_grid,gpu_block,shmem_size>>>(A, x_c, y, m);
+			cudaDeviceSynchronize();
+	}
+	timer = csecond() - timer;
+	cudaCheckErrors("coalesced kernel failed");
+	report_results(timer);
+	
+	/* Third and final kernel further improves memmory access speed by using block exclusive shmem */
+	printf("Single GPU CUDA shmem Version(N=%d, M=%d): ", n, m);
+	shmem_size= block_size*sizeof(float);
+	
+	timer=csecond();
+	for (j = 0; j < NR_ITER; ++j) {	
+			dmv_gpu_shmem<<<gpu_grid,gpu_block,shmem_size>>>(A, x_c, y, m);
+			cudaDeviceSynchronize();
+	}
+	timer = csecond() - timer;
+	cudaCheckErrors("shmem kernel failed");
+
+#ifdef _DEBUG_ 
+	/* Output y vector to a file for debugging */
+	FILE * fp;
+	char filename[] = "CUDA.debug" ; /* Common directory for all implementations, change if needed */
+	if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
+	for (i = 0; i < n; ++i) fprintf(fp, "%lf ", y[i]) ;
+	fclose(fp) ;
+#endif
+	report_results(timer);
+	}
+	return 0;
+}
+
+
+
+
diff --git a/GPUs/dmv_gpu.cu b/GPUs/dmv_gpu.cu
new file mode 100755
index 0000000000000000000000000000000000000000..9441cfe820d0a095652303e72d0252b364d03848
--- /dev/null
+++ b/GPUs/dmv_gpu.cu
@@ -0,0 +1,79 @@
+#include <stdio.h>
+
+/*
+ *  Utility function to get the thread ID within the
+ *  global working space.
+ */ 
+__device__ int get_global_tid()
+{
+    return (gridDim.x*blockIdx.y + blockIdx.x)*blockDim.x*blockDim.y +
+        blockDim.x*threadIdx.y + threadIdx.x;
+}
+
+/*
+ *  Utility function to get the thread ID within the
+ *  local/block working space.
+ */ 
+__device__ int get_local_tid()
+{
+    return blockDim.x*threadIdx.y + threadIdx.x;
+}
+
+/*
+ *  Naive kernel
+ */ 
+__global__ void dmv_gpu_naive(const double *a, const double *x, double *y,
+                              size_t n)
+{
+	int tid = get_global_tid();
+	double yi = 0.0;
+	if(tid >= n)
+		return ;
+	for ( int j = 0 ; j < n; j++ )
+		yi += + a[tid*n+j]*x[j];	
+	y[tid]=yi;
+}
+
+/*
+ *  Coalesced memory accesses kernel (requires transposed matrix a)
+ */
+__global__ void dmv_gpu_coalesced(const double *a, const double *x,
+                                  double *y, size_t n)
+{
+	int tid = get_global_tid();
+	double yi = 0.0;
+	if(tid >= n)
+		return ;
+	for ( int j = 0 ; j < n; j++ )
+		yi += + a[j*n+tid]*x[j];	
+	y[tid]=yi;	
+}
+
+/*
+ *  Final kernel making use of shared memory to improve memory bandwidth utilization and access pattern
+ */
+__global__ void dmv_gpu_shmem(const double *a, const double *x, double *y, size_t n)
+{
+	extern __shared__ float shmem_buff[] ;
+
+	int tid = get_global_tid(), i, j;
+	double yi = 0.0;
+	if(tid >= n)
+		return ;
+
+	int block_s=blockDim.x*blockDim.y;
+	int lid=get_local_tid(), last_id = n/block_s ;
+
+	for( j = 0; j< last_id; j++) {
+		shmem_buff[lid] = x[block_s*j + lid];
+        	__syncthreads();
+		for( i = 0 ; i < block_s; i++ ) {
+			yi += a[tid+ (i+j*block_s)*n]*shmem_buff[i];
+		}
+		__syncthreads();
+	}	
+	y[tid]=yi;
+}
+
+
+
diff --git a/GPUs/dmv_gpu.h b/GPUs/dmv_gpu.h
new file mode 100755
index 0000000000000000000000000000000000000000..24bceb247e4b6d595f23d4744d8ca444213fe8a3
--- /dev/null
+++ b/GPUs/dmv_gpu.h
@@ -0,0 +1,33 @@
+
+#include <stdio.h>
+
+/*
+ *  Utility function to get the thread ID within the
+ *  global working space.
+ */ 
+__device__ int get_global_tid();
+
+/*
+ *  Utility function to get the thread ID within the
+ *  local/block working space.
+ */ 
+__device__ int get_local_tid();
+
+/*
+ *  Naive kernel
+ */ 
+__global__ void dmv_gpu_naive(const double *a, const double *x, double *y, size_t n);
+
+/*
+ *  Coalesced memory acceses
+ */
+__global__ void dmv_gpu_coalesced(const double *a, const double *x, double *y, size_t n);
+
+
+/*
+ *  Use of shared memory
+ */
+__global__ void dmv_gpu_shmem(const double *a, const double *x, double *y, size_t n);
+
+
+
diff --git a/Global_make.sh b/Global_make.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6f02708a88babe91102bf48becbddf1874b5336c
--- /dev/null
+++ b/Global_make.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+## LOAD MODULES ##
+module purge		# clean up loaded modules 
+
+# load necessary modules
+module load gnu
+module load intel
+module load intelmpi
+module load binutils
+module load cuda
+
+for n;
+do
+	if cd "$n" ; 
+	then
+		make
+		cd ../
+	else
+		if [ "$n" == "-all" ];
+		then
+			cd GPUs  
+			make  
+			cd ../MPI  
+			make  
+			cd ../OpenMP  
+			make
+		else
+			echo "Use: ./Global_make.sh Prog_dir_name or ./Global_make.sh -all"
+		fi		
+	fi
+done
diff --git a/MPI/MPI-OpenMP.c b/MPI/MPI-OpenMP.c
new file mode 100644
index 0000000000000000000000000000000000000000..372e79bf9263f1cde707bb7bcc1ff7353abf0bf3
--- /dev/null
+++ b/MPI/MPI-OpenMP.c
@@ -0,0 +1,128 @@
+/*
+ * A Hybrid MPI-OpenMP implementation of the Matrix-Vector multiplication
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ *
+ * For more info about hybrid MPI-OpenMP see http://openmp.org/wp-content/uploads/HybridPP_Slides.pdf
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <sys/time.h>
+#include <mpi.h>
+/* Need to include External_Functions for these */
+#include "matrix_op.h"
+#include "util.h"
+#include "input.h"
+
+int main(int argc, char ** argv) {
+    int rank,size;
+    int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
+    int global_padded_nm[2];   //padded global matrix dimensions (if padding is not needed, global_padded=global)
+    int i,j,k;
+    double * M, * A, * x, * y, *local_y, comm_t, comp_t;
+    
+	/* MPI basic initializations */
+    MPI_Init(&argc,&argv);
+    MPI_Comm_size(MPI_COMM_WORLD,&size);
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+	
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		global_nm[0] = atoi(argv[1]);
+		global_nm[1] = atoi(argv[2]);		
+	}
+	else error("Too many Arguments");
+
+	/* Padd N if needed */
+   	local_nm[1]=global_nm[1];
+	global_padded_nm[1]=global_nm[1];
+
+	if (global_nm[0]%size==0) {
+		local_nm[0]=global_nm[0]/size;
+		global_padded_nm[0]=global_nm[0];
+	}
+	else {
+		local_nm[0]=(global_nm[0]/size)+1;
+		global_padded_nm[0]=local_nm[0]*size;
+	}
+
+	x =	(double *) malloc(global_padded_nm[1] * sizeof(*x));
+    if (rank==0) {
+		M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M));
+		vec_init_rand(x, global_padded_nm[1], 1.0);
+		y = (double *) malloc(global_padded_nm[0] * sizeof(*y));
+		vec_init(y, global_padded_nm[0], 0.0);
+		if( !y || !x || !M ) error("memory allocation failed");
+		
+		/* Initialize matrices */
+		ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
+	}
+
+	
+	local_y = (double *) malloc(local_nm[0] * sizeof(*local_y));
+	vec_init(local_y, local_nm[0], 0.0);
+
+	//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local[0],local[1],global_padded[0],global_padded[1]);
+
+	A = (double *) malloc(local_nm[0] * local_nm[1] * sizeof(*A));
+
+	#pragma omp parallel for schedule(static) /* Initialize data for each thread in corresponding socket with first-touch policy */
+	for( i=0 ; i<local_nm[0] ; ++i){
+		for ( j=0 ; j< local_nm[1] ; ++j) A[i*local_nm[1]+j]=0.0;	
+		//printf( "Initialize data Thread=%d i=%d\n", omp_get_thread_num(), i);
+	}
+
+    /* Rank 0 scatters the global matrix */
+	double * gsendbuf;
+	if (rank == 0){
+		gsendbuf = &(M[0]);
+		comm_t= MPI_Wtime(); 
+	}
+
+	MPI_Scatter(gsendbuf, local_nm[1] * local_nm[0], MPI_DOUBLE, A, local_nm[1] * local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	//if (rank == 0) printf("Scatter complete\n");
+	MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	
+	if (rank==0) {
+		printf("MPI-OpenMP Version(N=%d, M=%d, Tasks=%d, Nodes=%s, Tasks/Node=%s, threads=%s): ", global_nm[0],  global_nm[1], size, 
+		getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE"), getenv("OMP_NUM_THREADS"));
+		comp_t= MPI_Wtime(); 
+	}
+	for (i = 0; i < NR_ITER; ++i){
+		register double	yi = 0;
+		#pragma omp parallel for private(j,yi) shared(local_nm,A,local_y) schedule(static)
+		for (k = 0; k < local_nm[0]; ++k) {
+			//printf( "Compute Thread=%d i=%d\n", omp_get_thread_num(), k);
+        	yi = 0.0;
+        	for (j = 0; j < local_nm[1]; ++j) yi += A[k*local_nm[1]+j]*x[j];
+        	local_y[k] = yi;
+    	}
+	}
+
+	MPI_Barrier(MPI_COMM_WORLD);
+	if (rank==0) comp_t= MPI_Wtime() - comp_t;
+	MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	if (rank==0) {
+		comm_t = MPI_Wtime() - comm_t - comp_t;
+#ifdef _DEBUG_
+		/* Output y vector to a file for debugging */
+        FILE * fp;
+		char filename[] = "MPI-OpenMP.debug" ;
+		if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
+        for (k = 0; k < global_nm[0]; ++k) fprintf(fp, "%lf ", y[k]) ;
+		fclose(fp) ;
+#endif
+		report_mpi_results(comm_t, comp_t);
+		/* Free rank 0 local memmory */
+		free(M);
+		free(y);
+	}
+	free(x);
+	free(local_y);
+	free(A);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/MPI/MPI.c b/MPI/MPI.c
new file mode 100644
index 0000000000000000000000000000000000000000..ac56f49499f286cc810bd9726c88c88ab93b738b
--- /dev/null
+++ b/MPI/MPI.c
@@ -0,0 +1,121 @@
+/*
+ * An MPI implementation of the Matrix-Vector multiplication
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ *
+ * For more info about MPI see https://computing.llnl.gov/tutorials/mpi/
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <sys/time.h>
+#include <mpi.h>
+/* Need to include External_Functions for these */
+#include "matrix_op.h"
+#include "util.h"
+#include "input.h"
+
+int main(int argc, char ** argv) {
+    int rank,size;
+    int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
+    int global_padded_nm[2];   //padded global matrix dimensions (if padding is not needed, global_padded=global)
+    int i,j,k;
+    double * M, * A, * x, * y, *local_y, comm_t, comp_t;
+    
+	/* MPI basic initializations */
+    MPI_Init(&argc,&argv);
+    MPI_Comm_size(MPI_COMM_WORLD,&size);
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+	
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		global_nm[0] = atoi(argv[1]);
+		global_nm[1] = atoi(argv[2]);		
+	}
+	else error("Too many Arguments");
+
+	/* Padd N in 'size' equal pieces */
+   	local_nm[1]=global_nm[1];
+	global_padded_nm[1]=global_nm[1];
+
+	if (global_nm[0]%size==0) {
+		local_nm[0]=global_nm[0]/size;
+		global_padded_nm[0]=global_nm[0];
+	}
+	else {
+		local_nm[0]=(global_nm[0]/size)+1;
+		global_padded_nm[0]=local_nm[0]*size;
+	}
+
+	x =	(double *) malloc(global_padded_nm[1] * sizeof(*x));
+    if (rank==0) {
+		M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M));
+		vec_init_rand(x, global_padded_nm[1], 1.0);
+		y = (double *) malloc(global_padded_nm[0] * sizeof(*y));
+		vec_init(y, global_padded_nm[0], 0.0);
+		if( !y || !x || !M ) error("memory allocation failed");
+		
+		/* Initialize matrices */
+		ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
+	}
+
+	
+	local_y = (double *) malloc(local_nm[0] * sizeof(*local_y));
+	vec_init(local_y, local_nm[0], 0.0);
+
+	//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local[0],local[1],global_padded[0],global_padded[1]);
+
+	A = (double *) malloc(local_nm[0] * local_nm[1] * sizeof(*M));
+
+    /* Rank 0 scatters the global matrix */
+	double * gsendbuf;
+	if (rank == 0){
+		gsendbuf = &(M[0]);
+		comm_t= MPI_Wtime(); 
+	}
+
+	MPI_Scatter(gsendbuf, local_nm[1] * local_nm[0], MPI_DOUBLE, A, local_nm[1] * local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	//if (rank == 0) printf("Scatter complete\n");
+	MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	
+	if (rank==0) {
+		printf("MPI Version(N=%d, M=%d, Tasks=%d, Nodes=%s, Tasks/Node=%s): ", global_nm[0],  global_nm[1], size, 
+		getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE"));
+		comp_t= MPI_Wtime(); 
+	}
+	for (i = 0; i < NR_ITER; ++i){
+		register double	yi = 0;
+		for (k = 0; k < local_nm[0]; ++k) {
+        	yi = 0.0;
+        	for (j = 0; j < local_nm[1]; ++j) yi += A[k*local_nm[1]+j]*x[j];
+        	local_y[k] = yi;
+    	}
+	}
+
+	MPI_Barrier(MPI_COMM_WORLD);
+	if (rank==0) comp_t= MPI_Wtime() - comp_t;
+	MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
+	if (rank==0) comm_t = MPI_Wtime() - comm_t - comp_t;
+
+	if (rank == 0) {
+#ifdef _DEBUG_ 
+		/* Output y vector to a file for debugging */
+        FILE * fp;
+		char filename[] = "MPI.debug" ;
+		if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
+        for (k = 0; k < global_nm[0]; ++k) fprintf(fp, "%lf ", y[k]) ;
+		fclose(fp) ;
+#endif
+		report_mpi_results(comm_t, comp_t);
+		/* Free rank 0 local memmory */
+		free(M);
+		free(y);
+	}
+	free(x);
+	free(local_y);
+	free(A);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/MPI/MPI.slurm b/MPI/MPI.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..34eedc75241211013c5b294eb8fc2adbdaf60fd2
--- /dev/null
+++ b/MPI/MPI.slurm
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+##############################################
+#     ARIS slurm script template  			 #
+#                                  			 #
+# Submit script: sbatch MPI.slurm n1 n2 ...  #
+#                                  			 #
+##############################################
+
+
+#SBATCH --job-name=run_mpi    # Job name
+#SBATCH --output=MPI.out 
+#SBATCH --error=MPI.err 
+#SBATCH --ntasks=16    # Number of processor cores (i.e. tasks)
+#SBATCH --nodes=16 # Number of nodes requested
+#SBATCH --ntasks-per-node=1    # Tasks per node
+#SBATCH --cpus-per-task=20    # Threads per task
+#SBATCH --time=00:10:00   # walltime
+#SBATCH --mem=50G   # memory per NODE
+#SBATCH --partition=compute # Partition
+#SBATCH --account=testproj  # Accounting project
+
+
+export I_MPI_FABRICS=shm:dapl
+
+
+## LOAD MODULES ##
+module purge		# clean up loaded modules 
+
+# load necessary modules
+module load gnu ##/7.2.0
+module load intel ##/17.0.4
+module load intelmpi ##/5.1.3.258
+module load binutils
+module load cuda
+
+
+## Change this to the directory of your executable!
+gpu_prog="./MPI.exe"
+gpu_prog1="./MPI-OpenMP.exe"
+
+export OMP_PROC_BIND=spread # OpenMP thread affinity variable
+
+for n;
+do
+	srun $gpu_prog $n $n >> mpi.out
+	for tr in 1 2 5 10 20 # Run for different OpenMP thread numbers ( tr <= cpus-per-task ) 
+	do
+		export OMP_NUM_THREADS=$tr
+		srun $gpu_prog1 $n $n >> mpi.out
+	done
+done
+
+
+
+
+
+
diff --git a/MPI/Makefile b/MPI/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..45807e5ac7c94fc110390aecaa06a4569dd541dd
--- /dev/null
+++ b/MPI/Makefile
@@ -0,0 +1,46 @@
+#CC=gcc
+ICC=mpiicc
+
+DEBUG ?= 0  # Set to 1 for debug
+
+#CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge 
+#CFLAGS=-O3 -lm -Wall -mavx2 -mfma  -march=haswell -mtune=haswell 
+#CFLAGS=-O3 -Wall -xCORE-AVX-I
+#CFLAGS=-O3 -Wall -xCORE-AVX2 
+
+# Need to -I this for user-defined functions to work
+EXT_DIR = ../External_Functions/
+
+ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I
+
+MPI_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR)
+MPI_OMP_COMPILE= $(MPI_COMPILE) -mt_mpi -qopenmp
+
+ifeq ($(DEBUG), 1)
+	MPI_COMPILE += -D_DEBUG_
+endif
+
+MPI_COMPILE_OBJ= $(MPI_COMPILE) -c
+
+
+
+SOURCE = MPI.c MPI-OpenMP.c
+OBJECTS = util.o matrix_op.o timer.o input.o
+PROGRAMS= MPI.exe MPI-OpenMP.exe
+
+all: $(PROGRAMS)
+
+MPI-OpenMP.exe: $(OBJECTS) MPI-OpenMP.c
+	$(MPI_OMP_COMPILE) MPI-OpenMP.c -o $@ $(OBJECTS)	
+
+MPI.exe: $(OBJECTS) MPI.c
+	$(MPI_COMPILE) MPI.c -o $@ $(OBJECTS)
+
+%.o: $(EXT_DIR)%.c
+	$(MPI_COMPILE_OBJ) -o $@ $<
+
+%.o: %.h
+
+clean:
+	$(RM) $(PROGRAMS) $(OBJECTS)
+
diff --git a/MPI/README.md b/MPI/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e24eb84d58917da76f56e282121019256fa612be
--- /dev/null
+++ b/MPI/README.md
@@ -0,0 +1,13 @@
+# A parallel MPI implementation of the matrix-vector multiplication algorithm
+
+```
+->MPI(Basic implementation using intel mpi for compilation)
+19/09/2017: Completed
+
+->MPI-OpenMP(Hybrid implementation with MPI for data management between nodes and OpenMP for computations)
+20/09/2017: Completed
+
+Tested environments:
+- Ivy Bridge Intel Xeon E5-2680v2 CPU with Linux x86_64 and intelmpi/5.0.3, intel/15.0.3
+
+```
diff --git a/OpenMP/Makefile b/OpenMP/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..adc8b2ad92d48a7f8b35bd471f844054304a88f2
--- /dev/null
+++ b/OpenMP/Makefile
@@ -0,0 +1,44 @@
+CC=gcc
+ICC =icc
+
+DEBUG ?= 0 # Set to 1 for debug
+
+CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
+#CFLAGS=-O3 -lm -Wall -mavx2 -mfma  -march=haswell -mtune=haswell 
+ 
+#CFLAGS=-O3 -Wall -xCORE-AVX-I
+#CFLAGS=-O3 -Wall -xCORE-AVX2 
+ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I -lrt
+
+# Need to -I this for user-defined functions to work
+EXT_DIR = ../External_Functions/
+
+CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR)
+
+ifeq ($(DEBUG), 1)
+	CPU_COMPILE += -D_DEBUG_
+endif
+
+CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
+
+
+SOURCE = OpenMP.c OpenMP_aff.c 
+OBJECTS = util.o matrix_op.o timer.o input.o
+PROGRAMS= OpenMP.exe OpenMP_aff.exe
+
+all: $(PROGRAMS)
+
+OpenMP.exe: $(OBJECTS) OpenMP.c
+	$(CPU_COMPILE) OpenMP.c -o $@ $(OBJECTS)
+
+OpenMP_aff.exe: $(OBJECTS) OpenMP_aff.c
+	$(CPU_COMPILE) OpenMP_aff.c -o $@ $(OBJECTS)
+
+%.o: $(EXT_DIR)%.c
+	$(CPU_COMPILE_OBJ) -o $@ $<
+
+%.o: %.h
+
+clean:
+	$(RM) $(PROGRAMS) $(OBJECTS)
+
diff --git a/OpenMP/OpenMP.c b/OpenMP/OpenMP.c
new file mode 100644
index 0000000000000000000000000000000000000000..4fb7725e23a5a121e46833f94db2ed9a14c0e5a3
--- /dev/null
+++ b/OpenMP/OpenMP.c
@@ -0,0 +1,75 @@
+/*
+ * A simple OpenMP implementation of the Matrix-Vector multiplication
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ *
+ * For more info about OpenMP programming see http://bisqwit.iki.fi/story/howto/openmp/
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <omp.h>
+/* Need to include External_Functions for these */
+#include "matrix_op.h"
+#include "util.h"
+#include "input.h"
+
+
+int main(int argc, char **argv)
+{
+	/* Initializations */
+	int i, j, k, n, m;
+	double timer;
+
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);		
+	}
+	else error("Too many Arguments");
+
+	/* Allocate space */
+	double *x 			= (double *) malloc(m * sizeof(*x));
+	double *y	= (double *) malloc(n * sizeof(*y));
+	double **M 			= (double **) malloc(n * sizeof(*M));
+	for( i=0 ; i<n ; ++i) M[i] = (double *) calloc(m, sizeof(double));
+	if( !y || !x || !M ) error("memory allocation failed");
+
+	/* Initialize matrices */
+	matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+
+	/* Initialize vectors */
+	vec_init_rand(x, m, 1.0);
+	vec_init(y, n, 0.0);
+	
+
+	/* OpenMP Kernel */
+	printf("OpenMP Version(N=%d, M=%d, Threads=%s): ", n, m, getenv("OMP_NUM_THREADS"));
+	timer = csecond();
+	for (i = 0; i < NR_ITER; ++i){
+		register double	yi = 0;
+		#pragma omp parallel for private(j,yi) shared(n,m,M,y) schedule(dynamic)
+		for (k = 0; k < n; ++k) {
+        	yi = 0.0;
+        	for (j = 0; j < m; ++j) yi += M[k][j]*x[j];
+        	y[k] = yi;
+    	}
+	}
+	timer = csecond() - timer;
+
+#ifdef _DEBUG_
+	/* Output y vector to a file for debugging */
+    FILE * fp;
+	char * filename = "OpenMP.debug" ;
+	if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
+    for (k = 0; k < n; ++k) fprintf(fp, "%lf ", y[k]) ;
+	fclose(fp) ;
+#endif
+	report_results(timer);
+
+	return 0;
+}
+
diff --git a/OpenMP/OpenMP.slurm b/OpenMP/OpenMP.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..30296f2e273025495c23f8b7b1486f1dca8d810e
--- /dev/null
+++ b/OpenMP/OpenMP.slurm
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+#################################################
+#     ARIS slurm script template  			 	#
+#                                  			 	#
+# Submit script: sbatch OpenMP.slurm n1 n2 ...  #
+#                                  			 	#
+#################################################
+
+
+#SBATCH --job-name=run_omp    # Job name
+#SBATCH --output=OpenMP.out
+#SBATCH --error=OpenMP.err
+#SBATCH --ntasks=1     # Number of processor cores (i.e. tasks)
+#SBATCH --nodes=1    # Number of nodes requested
+#SBATCH --ntasks-per-node=1     # Tasks per node
+#SBATCH --cpus-per-task=40     # Threads per task
+#SBATCH --time=00:40:00   # walltime
+#SBATCH --mem=120G   # memory per NODE
+#SBATCH --partition=fat # Partition
+#SBATCH --account=testproj  # Accounting project
+
+## LOAD MODULES ##
+module purge		# clean up loaded modules 
+
+# load necessary modules
+module load gnu
+module load intel
+module load intelmpi
+module load binutils
+module load cuda
+
+## Change this to the directory of your executable!
+gpu_prog="./OpenMP.exe"
+gpu_prog1="./OpenMP_aff.exe"
+
+export OMP_PROC_BIND=spread
+
+for n;
+do
+	for threads in 1 2 5 10 20 40
+	do
+		export OMP_NUM_THREADS=$threads
+		srun $gpu_prog $n $n 
+		srun $gpu_prog1 $n $n
+	done
+done
diff --git a/OpenMP/OpenMP_aff.c b/OpenMP/OpenMP_aff.c
new file mode 100644
index 0000000000000000000000000000000000000000..9b59ece5ebad896678a8c14e9c7677f4e4c77fbc
--- /dev/null
+++ b/OpenMP/OpenMP_aff.c
@@ -0,0 +1,82 @@
+/*
+ * A first-touch/affinity frendly OpenMP implementation of the Matrix-Vector multiplication
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ *
+ * For more info about OpenMP thread affinity see http://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-affinity.html
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <omp.h>
+/* Need to include External_Functions for these */
+#include "matrix_op.h"
+#include "util.h"
+#include "input.h"
+
+int main(int argc, char **argv)
+{
+	/* Initializations */
+	int i, j, k, n, m;
+	double timer;
+
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);		
+	}
+	else error("Too many Arguments");
+
+	/* Allocate space */
+	double *x 			= (double *) malloc(m * sizeof(*x));
+	double *y	= (double *) malloc(n * sizeof(*y));
+	double *M 			= (double *) malloc(n * m * sizeof(*M));
+
+	#pragma omp parallel for schedule(static) /* Initialize data for each thread in corresponding socket/cache with first-touch policy */
+	for( i=0 ; i<n ; ++i){
+		for ( j=0 ; j<m ; ++j) M[i*m+j]=0.0;	
+		//printf( "Initialize data Thread=%d i=%d\n", omp_get_thread_num(), i);
+	}
+
+
+	if( !y || !x || !M ) error("memory allocation failed");
+
+	/* Initialize matrices */
+	ser_matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+
+	/* Initialize vectors */
+	vec_init_rand(x, m, 1.0);
+	vec_init(y, n, 0.0);
+	
+
+	/* OpenMP Affinity Kernel */
+	printf("OpenMP_aff Version(N=%d, M=%d, Threads=%s): ", n, m, getenv("OMP_NUM_THREADS"));
+	timer = csecond();
+	for (i = 0; i < NR_ITER; ++i){
+		register double	yi = 0;
+		#pragma omp parallel for private(j,yi) shared(n,m,M,y) schedule(static) /* Each thread computes n/thread_num contiguous elements of y */
+		for (k = 0; k < n; ++k) {
+			//printf( "Compute Thread=%d i=%d\n", omp_get_thread_num(), k);
+        	yi = 0.0;
+        	for (j = 0; j < m; ++j) yi += M[k*m+j]*x[j];
+        	y[k] = yi;
+    	}
+	}
+	timer = csecond() - timer;
+
+#ifdef _DEBUG_ 
+	/* Output y vector to a file for debugging */
+	FILE * fp;
+	char * filename = "OpenMP_aff.debug" ;
+	if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
+	for (k = 0; k < n; ++k) fprintf(fp, "%lf ", y[k]) ;
+	fclose(fp) ;
+#endif
+	report_results(timer);
+
+	return 0;
+}
+
+	
diff --git a/OpenMP/README.md b/OpenMP/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..274b146121660994bea216f6dcb90c6e7d1a898e
--- /dev/null
+++ b/OpenMP/README.md
@@ -0,0 +1,17 @@
+# Two OpenMP impementations of the Matrix-Vector algorithm:
+
+```
+->OpenMP (a simple naive parallel for implementation)
+07/09/2017: Completed
+
+->OpenMP_aff(matrix initialization with first touch policy to minimize socket memory transactions. Threads are bind to certain cores)
+13/09/2017: Completed
+18/09/2017: Added thread binding to match memory alocation pattern
+
+Tested environments:
+- Ivy Bridge Intel Xeon E5-2680v2 CPU with Linux x86_64 and intelmpi/5.0.3, intel/15.0.3
+- SandyBridge Intel Xeon E5-4650v2 CPU with Linux x86_64 and intelmpi/5.0.3, intel/15.0.3
+
+```
+
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c76ee42d977a8af3b986edd73ff685a8a05f21a7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,39 @@
+
+# From Serial to Parallel: A simple training using the Martix-Vector multiplication algorithm
+
+
+## Intro
+
+This training's purpose is to select a simple algorithm, and starting from a basic serial implementation to explore multiple parallel options and implementations.
+In this case, we used the simple Matrix-Vector multiplication algorithm, because of its simplicity and parallelization posibilities. All our experiments were implemented and tested on the GRNET's ARIS HPC. Each program subdirectory ( GPUs, MPI, OpenMP ) contains the corresponding programs source code, submit scripts and makefiles. 
+
+## Directory breakdown
+```
+Training
+├── External_Functions
+├── GPUs
+├── MPI
+├── OpenMP
+├── Outputs
+│   └── Debug
+└── Serial
+```
+
+## External_Functions
+
+This directory contains basic helper functions used by most of our programs. These are included and compiled along with the programs in their own directories. Changing this directory's location requires updating the program makefiles.
+
+## Serial
+A basic Serial Implementation of the Matrix-Vector multiplication algorithm, mostly used for error-checking and speedup calculation.
+
+## OpenMP
+OpenMP is the simplest parallelization tool for shared memory architectures, and thus this is where we start from. In this directory we start with a simple OpenMP 'parallel for' implementation (OpenMP.c), which scales only for a small number of cores. Then, we update this simple program to utilize thread affinity/binding (which is done externally by the calling script), by initializing data to the correct sockets/caches with first touch policy (OpenMP_aff.c). 
+
+## MPI
+To further scale in multiple nodes, we use a non-shared memory model tool, MPI (Intel MPI in our case for compiling). We start with a bacic MPI implementation which scales (theoritically) to any number of nodes/cores (MPI.c). Then, in order to utilize shared memory better we implement a hybrid MPI-OpenMP version (MPI-OpenMP.c - MPI for multinode and OpenMP internally in every node for shared memory multicore utilization). In both cases, computation time scales smoothly, but inter-process communication time poses a big problem (because of the small computational intensity of the Matrix-Vector kernel).
+
+## GPUs
+Finally, we implement our base-algorithm with CUDA in a Nvidia GPU(cuda_SingleGPU.cu + dmv_gpu.cu). We invoke 3 different kernels, starting from a simple-naive one and improving him as we go (in the second kernel we transpose the matrix to achieve coalesced memory access, and in the third one we also use the block shared memory (shmem) to utilize bandwidth better). To test our implementations we also implement a cuBLAS (Nvidia parallel BLAS routine library) version (cuBLAS_SingleGPU.cu). Then, we create a final hybrid cuBlAS-MPI version (cuBLAS_MultiGPU.cu) in order to utilize a possible multi-gpu/node architecture (MPI inter-process communication is still a big problem for the Matrix-Vector kernel, but in a more computational intensive scenario a huge scale-up is possible). 
+
+## Compilation/Running
+All executables can be created by running the Makefiles in the corresponding directories. There is also a global-maker in the project root directory. Every program directory contains a slurm file for execution in the ARIS system (for other systems corresponding adjustments must be made).
diff --git a/Serial/Makefile b/Serial/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..b5fdb771894e6fa614055e2fb678de24ab7daae5
--- /dev/null
+++ b/Serial/Makefile
@@ -0,0 +1,43 @@
+CC=gcc
+ICC =icc
+
+DEBUG ?= 0  # Set to 1 for debug
+
+CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge 
+#CFLAGS=-O3 -lm -Wall -mavx2 -mfma  -march=haswell -mtune=haswell 
+ 
+#CFLAGS=-O3 -Wall -xCORE-AVX-I
+#CFLAGS=-O3 -Wall -xCORE-AVX2 
+ICFLAGS=-O3 -Wall -axCORE-AVX2,CORE-AVX-I -lrt
+
+
+# Need to -I this for user-defined functions to work
+EXT_DIR = ../External_Functions/
+
+CPU_COMPILE= $(ICC) $(ICFLAGS) -I$(EXT_DIR)
+
+ifeq ($(DEBUG), 1)
+	CPU_COMPILE += -D_DEBUG_
+endif
+
+CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
+
+EXT_DIR = ../External_Functions/
+
+SOURCE = Serial.c
+OBJECTS = util.o matrix_op.o timer.o input.o
+PROGRAMS= Serial.exe
+
+all: $(PROGRAMS)
+
+Serial.exe: $(OBJECTS) $(SOURCE)
+	$(CPU_COMPILE) $(SOURCE) -o $@ $(OBJECTS)
+
+%.o: $(EXT_DIR)%.c
+	$(CPU_COMPILE_OBJ) -o $@ $<
+
+%.o: %.h
+
+clean:
+	$(RM) $(PROGRAMS) $(OBJECTS)
+
diff --git a/Serial/README.md b/Serial/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2cd2a468bfd4504e438ba1032504350a00ff01a5
--- /dev/null
+++ b/Serial/README.md
@@ -0,0 +1,6 @@
+# A serial impementation of the Matrix-Vector algorithm:
+
+```
+->Serial(Mostly used for time comparison and error checking)
+05/09/2017: Completed
+```
diff --git a/Serial/Serial.c b/Serial/Serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..66720c584212a1c449ba2bc95f975385dcb421ff
--- /dev/null
+++ b/Serial/Serial.c
@@ -0,0 +1,70 @@
+/*
+ * A Serial implementation of the Matrix-Vector multiplication
+ * 
+ * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+/* Need to include External_Functions for these */
+#include "matrix_op.h"
+#include "util.h"
+#include "input.h"
+
+int main(int argc, char **argv)
+{
+	/* Initializations */
+	int i, j, k, n, m;
+	double timer;
+
+	if (argc < 3) error("Usage: ./Program N M");
+	else if ( argc == 3) { /*./Program N M */
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);		
+	}
+	else error("Too many Arguments");
+	
+	/* Allocate space */
+	double *x 	= (double *) malloc(m * sizeof(*x));
+	double *y	= (double *) malloc(n * sizeof(*y));
+	double **M 	= (double **) malloc(n * sizeof(*M));
+	for( i=0 ; i<n ; ++i) M[i] = (double *) calloc(m, sizeof(double));
+	if( !y || !x || !M ) error("memory allocation failed");
+
+	/* Initialize matrices */
+	matrix_init_rand(M,n,m,1.0); /* Normal matrices generated randomly */
+
+	/* Initialize vectors */
+	vec_init_rand(x, m, 1.0);
+	vec_init(y, n, 0.0);
+
+
+	/* Serial Kernel */
+	printf("Serial Version(N=%d, M=%d): ", n, m);
+	timer = csecond();
+	for (i = 0; i < NR_ITER; ++i){
+		register double yi;
+		for (k = 0; k < n; ++k) {
+        	yi = 0.0 ;
+        	for (j = 0; j < m; ++j) yi += M[k][j]*x[j];
+        	y[k] = yi;
+    	}
+	}
+	timer = csecond() - timer ;
+
+#ifdef _DEBUG_
+	/* Output y vector to a file for debugging */
+	FILE * fp;
+	char filename[] = "Serial.debug" ;
+	if(( fp = fopen( filename, "w")) == NULL)  error("Output file creation failed\n");
+	for (k = 0; k < n; ++k) fprintf(fp, "%lf ", y[k]) ;
+	fclose(fp) ;
+#endif
+
+	report_results(timer);
+
+	return 0;
+}
+
diff --git a/Serial/Serial.slurm b/Serial/Serial.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..b967290e7458d087c2095a0d967802915d494674
--- /dev/null
+++ b/Serial/Serial.slurm
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+#################################################
+#     ARIS slurm script template   				#
+#                                  				#
+# Submit script: sbatch Serial.slurm n1 n2 ...  #
+#                                  				#
+#################################################
+
+
+#SBATCH --job-name=run_Serial    # Job name
+#SBATCH --output=Serial.out
+#SBATCH --error=Serial.err
+#SBATCH --ntasks=1     # Number of processor cores (i.e. tasks)
+#SBATCH --nodes=1    # Number of nodes requested
+#SBATCH --ntasks-per-node=1     # Tasks per node
+#SBATCH --cpus-per-task=1     # Threads per task
+#SBATCH --time=00:40:00   # walltime
+#SBATCH --mem=50G   # memory per NODE
+#SBATCH --partition=taskp  # Partition
+#SBATCH --account=testproj  # Accounting project
+
+## LOAD MODULES ##
+module purge		# clean up loaded modules 
+
+# load necessary modules
+module load gnu
+module load intel
+module load intelmpi
+module load binutils
+module load cuda
+
+## Change this to the directory of your executable!
+gpu_prog="./Serial.exe"
+
+for n;
+do
+	srun $gpu_prog $n $n
+done