From 25b76e212085b95c82fc33ebc7692b7fe02aad26 Mon Sep 17 00:00:00 2001
From: "petros.anastasiadis" <p.anastasiadis.ece@gmail.com>
Date: Mon, 9 Oct 2017 14:05:00 +0300
Subject: [PATCH] Readme updates, Global_maker

---
 .gitignore               |  1 +
 External_Functions/dmv.c | 67 ----------------------------------------
 External_Functions/dmv.h | 19 ------------
 GPUs/README.md           |  4 +++
 GPUs/cuda_SingleGPU.cu   |  2 +-
 GPUs/dmv_gpu.cu          |  4 +--
 Global_make.sh           | 32 +++++++++++++++++++
 MPI/README.md            |  3 ++
 OpenMP/README.md         |  8 ++++-
 Outputs/Runner.sh        | 16 +++++++---
 README.md                | 26 +++++++++++++---
 11 files changed, 84 insertions(+), 98 deletions(-)
 delete mode 100755 External_Functions/dmv.c
 delete mode 100755 External_Functions/dmv.h
 create mode 100755 Global_make.sh

diff --git a/.gitignore b/.gitignore
index 92937d9..f7b8636 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 **/*.out
 **/*.err
 **/*.debug
+Outputs/
diff --git a/External_Functions/dmv.c b/External_Functions/dmv.c
deleted file mode 100755
index 957734f..0000000
--- a/External_Functions/dmv.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Helpfull functions for SpMV multiplication
- * 
- * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr) 
- */
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "dmv.h"
-
-void dmv_serial(double **a, const double *x, double *y,
-                size_t n, size_t m)
-{
-    size_t  i, j;
-	double yi;
-    for (i = 0; i < n; ++i) {
-        yi = 0.0 ;
-        for (j = 0; j < m; ++j) {
-            yi += a[i][j]*x[j];
-        }
-        y[i] = yi;
-    }
-}
-
-void dmv_omp(double **a, const double *x, double *y,
-                size_t n, size_t m)
-{
-    size_t  i, j;
-	#pragma omp parallel for private(i,j) shared(n,m,a,y) schedule(dynamic)
-    for (i = 0; i < n; ++i) {
-        register double    _yi = 0;
-        for (j = 0; j < m; ++j) {
-            _yi += a[i][j]*x[j];
-        }
-        y[i] = _yi;
-    }
-}
-
-
-void dmv_csr(int * csrPtr, int *csrCol, double * csrVal, double *x, double *ys, int n)
-{
-	int  i, j;
-	for (i = 0; i < n; ++i) {
-        	double yi = 0;
-        	for (j = csrPtr[i]; j < csrPtr[i + 1]; j++) yi += csrVal[j] * x[csrCol[j]];
-        	ys[i] = yi;
-    	}
-}
-
-int vec_equals(const double *v1, const double *v2, size_t n, double eps)
-{
-	size_t  i,k=0;
-    	for (i = 0; i < n; ++i) {
-		if (fabs(v1[i] - v2[i]) > eps) k++;	
-    	}
-	return k;
-}
-
-
-
-
-void vec_print(const double *v, size_t n)
-{
-    size_t  i;
-    for (i = 0; i < n; ++i)
-        printf("%f\n", v[i]);
-}
diff --git a/External_Functions/dmv.h b/External_Functions/dmv.h
deleted file mode 100755
index 44f8c7e..0000000
--- a/External_Functions/dmv.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- *  dmv.h -- Declarations and definitions related to the DMV
- *           multiplication kernels.
- *
- *  Copyright (C) 2010-2012, Computing Systems Laboratory (CSLab)
- *  Copyright (C) 2010-2012, Vasileios Karakasis
- */ 
-
-#include <stddef.h>
-
-void vec_init(double *v, size_t n, double val);
-void vec_init_rand(double *v, size_t n, double max);
-void vec_init_rand_p(double *v, size_t n, size_t np, double max);
-int vec_equals(const double *v1, const double *v2, size_t n, double eps);
-void vec_print(const double *v, size_t n);
-void dmv_csr(int * csrPtr, int *csrCol, double * csrVal, double *x, double *ys, int n);
-void dmv_serial(float **a, const float *x, float *y, size_t n);
-
-
diff --git a/GPUs/README.md b/GPUs/README.md
index 929edda..7f5716d 100644
--- a/GPUs/README.md
+++ b/GPUs/README.md
@@ -11,5 +11,9 @@
 ->cuda_SingleGPU(3 cuda kernels showing the optimization steps in writing GPU code)
 02/10/2017: Completed kernel 1
 03/10/2017: Completed kernel 2 & 3
+
+Tested environments:
+- Haswell Intel Xeon E5-2660v3 CPU with Linux x86_64 + Nvidia Tesla K40 GPUs and cuda/8.0.61
+
 ```
 
diff --git a/GPUs/cuda_SingleGPU.cu b/GPUs/cuda_SingleGPU.cu
index 03cde0b..db61050 100644
--- a/GPUs/cuda_SingleGPU.cu
+++ b/GPUs/cuda_SingleGPU.cu
@@ -22,6 +22,7 @@
 #include "input.h"
 #include "gpu_util.h"
 
+#define block_size 256 /* Number of GPU threads per block. Modifying this value might lead to performance issues */ 
 
 int main(int argc, char **argv)
 {
@@ -44,7 +45,6 @@ int main(int argc, char **argv)
 	}
 	else error("Too many Arguments");
 
-	int block_size = 256; /* Number of GPU threads per block */ 
 	int grid_size = (n-1)/block_size + 1; 
 	size_t shmem_size = 0;
 
diff --git a/GPUs/dmv_gpu.cu b/GPUs/dmv_gpu.cu
index e5b7b67..9441cfe 100755
--- a/GPUs/dmv_gpu.cu
+++ b/GPUs/dmv_gpu.cu
@@ -35,7 +35,7 @@ __global__ void dmv_gpu_naive(const double *a, const double *x, double *y,
 }
 
 /*
- *  Coalesced memory acceses
+ *  Coalesced memory accesses kernel (requires transposed matrix a)
  */
 __global__ void dmv_gpu_coalesced(const double *a, const double *x,
                                   double *y, size_t n)
@@ -50,7 +50,7 @@ __global__ void dmv_gpu_coalesced(const double *a, const double *x,
 }
 
 /*
- *  Use of shared memory
+ *  Final kernel making use of shared memory to improve memory bandwidth utilization and access pattern
  */
 __global__ void dmv_gpu_shmem(const double *a, const double *x, double *y, size_t n)
 {
diff --git a/Global_make.sh b/Global_make.sh
new file mode 100755
index 0000000..6f02708
--- /dev/null
+++ b/Global_make.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Build the training programs: pass program directories, or -all for GPUs, MPI and OpenMP.
+
+## LOAD MODULES ##
+module purge		# clean up loaded modules
+
+# load necessary modules
+module load gnu
+module load intel
+module load intelmpi
+module load binutils
+module load cuda
+
+usage="Use: ./Global_make.sh Prog_dir_name or ./Global_make.sh -all"
+
+# Each build runs in a subshell, so the working directory is restored for the next argument.
+for n;
+do
+	# -all is tested first so that it is never handed to cd as an (invalid) option
+	if [ "$n" == "-all" ];
+	then
+		for d in GPUs MPI OpenMP;
+		do
+			( cd "$d" && make )
+		done
+	elif [ -d "$n" ];
+	then
+		( cd -- "$n" && make )
+	else
+		echo "$usage"
+	fi
+done
diff --git a/MPI/README.md b/MPI/README.md
index b40d1dc..e24eb84 100644
--- a/MPI/README.md
+++ b/MPI/README.md
@@ -7,4 +7,7 @@
 ->MPI-OpenMP(Hybrid implementation with MPI for data management between nodes and OpenMP for computations)
 20/09/2017: Completed
 
+Tested environments:
+- Ivy Bridge Intel Xeon E5-2680v2 CPU with Linux x86_64 and intelmpi/5.0.3, intel/15.0.3
+
 ```
diff --git a/OpenMP/README.md b/OpenMP/README.md
index dc8c855..274b146 100644
--- a/OpenMP/README.md
+++ b/OpenMP/README.md
@@ -6,6 +6,12 @@
 
 ->OpenMP_aff(matrix initialization with first touch policy to minimize socket memory transactions. Threads are bind to certain cores)
 13/09/2017: Completed
-18/09/2017: Added thread binding to match memmory alocation pattern
+18/09/2017: Added thread binding to match memory allocation pattern
+
+Tested environments:
+- Ivy Bridge Intel Xeon E5-2680v2 CPU with Linux x86_64 and intelmpi/5.0.3, intel/15.0.3
+- SandyBridge Intel Xeon E5-4650v2 CPU with Linux x86_64 and intelmpi/5.0.3, intel/15.0.3
+
 ```
 
+
diff --git a/Outputs/Runner.sh b/Outputs/Runner.sh
index 9f1d46e..57d20de 100755
--- a/Outputs/Runner.sh
+++ b/Outputs/Runner.sh
@@ -1,7 +1,15 @@
 # Run all files. Must update if changes are made in directory tree
 submit=sbatch
-$submit ../Serial/Serial.slurm 25000
-$submit ../OpenMP/OpenMP.slurm 25000
-$submit ../GPUs/GPU.slurm 25000
-$submit ../MPI/MPI.slurm 25000
+# Each program is built and submitted in its own subshell: a failed cd or make skips the submit
+( cd ../Serial && make &&
+	$submit Serial.slurm 25000 )
+# Paths stay relative to this directory because the subshell's cd does not leak out
+( cd ../OpenMP && make &&
+	$submit OpenMP.slurm 25000 )
+# GPU programs
+( cd ../GPUs && make &&
+	$submit GPU.slurm 25000 )
+# MPI programs
+( cd ../MPI && make &&
+	$submit MPI.slurm 25000 )
 
diff --git a/README.md b/README.md
index 6d0798b..be062a9 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,13 @@
 
 # From Serial to Parallel: A simple training using the Martix-Vector multiplication algorithm
 
-## Directory layout(tree -d):
 
-```
+## Intro
+The purpose of this training is to take a simple algorithm and, starting from a basic serial implementation, explore multiple parallelization options and implementations.
+In this case we use the Matrix-Vector multiplication algorithm, because of its simplicity and parallelization possibilities. All our experiments were implemented and tested on GRNET's ARIS HPC system. Each program subdirectory (GPUs, MPI, OpenMP) contains the corresponding program's source code, submit scripts and makefiles.
+
+## Directory breakdown
+
 Training
 ├── External_Functions
 ├── GPUs
@@ -12,7 +16,21 @@ Training
 ├── Outputs
 │   └── Debug
 └── Serial
-```
 
-Loader contains required modules for all makefiles. Create executables with 'make' in corresponding directories. 
+## External_Functions
+This directory contains basic helper functions used by most of our programs. These are included and compiled along with the programs in their own directories. Changing this directory's location requires updating the program makefiles.
+
+## Serial
+A basic Serial Implementation of the Matrix-Vector multiplication algorithm, mostly used for error-checking and speedup calculation.
+
+## OpenMP
+OpenMP is the simplest parallelization tool for shared memory architectures, and thus this is where we start from. In this directory we start with a simple OpenMP 'parallel for' implementation (OpenMP.c), which scales only for a small number of cores. Then, we update this simple program to utilize thread affinity/binding (which is done externally by the calling script), by initializing data to the correct sockets/caches with first touch policy (OpenMP_aff.c). 
+
+## MPI
+To scale further across multiple nodes, we use a distributed-memory programming model, MPI (compiled with Intel MPI in our case). We start with a basic MPI implementation which scales (theoretically) to any number of nodes/cores (MPI.c). Then, in order to utilize shared memory better, we implement a hybrid MPI-OpenMP version (MPI-OpenMP.c - MPI across nodes and OpenMP inside every node for shared-memory multicore utilization). In both cases, computation time scales smoothly, but inter-process communication time poses a big problem (because of the low computational intensity of the Matrix-Vector kernel).
+
+## GPUs
+Finally, we implement our base algorithm with CUDA on an Nvidia GPU (cuda_SingleGPU.cu + dmv_gpu.cu). We invoke 3 different kernels, starting from a simple, naive one and improving it as we go (in the second kernel we transpose the matrix to achieve coalesced memory access, and in the third one we also use the block shared memory (shmem) to utilize bandwidth better). To test our implementations we also implement a cuBLAS (Nvidia parallel BLAS routine library) version (cuBLAS_SingleGPU.cu). Then, we create a final hybrid cuBLAS-MPI version (cuBLAS_MultiGPU.cu) in order to utilize a possible multi-GPU/multi-node architecture (MPI inter-process communication is still a big problem for the Matrix-Vector kernel, but in a more computationally intensive scenario a huge scale-up is possible).
 
+## Compilation/Running
+All executables can be created by running the Makefiles in the corresponding directories. There is also a global-maker in the project root directory. Every program directory contains a slurm file for execution in the ARIS system (for other systems corresponding adjustments must be made).
-- 
GitLab