Commit d98c1b89 authored by petros.anastasiadis's avatar petros.anastasiadis

Update 03/10/2017 - Added cuda version, gitignore

parent 2c57323d
*.o
*.exe
*.out
*.err
/*
* dmv.h -- Declarations and definitions related to the DMV
* multiplication kernels.
*
* Copyright (C) 2010-2012, Computing Systems Laboratory (CSLab)
* Copyright (C) 2010-2012, Vasileios Karakasis
*/
#include <stddef.h>
/* Set all n entries of v to val. */
void vec_init(double *v, size_t n, double val);
/* Fill v[0..n-1] with random values; max presumably bounds their magnitude -- confirm in implementation. */
void vec_init_rand(double *v, size_t n, double max);
/* Random init of v; np is presumably a padded length distinct from n -- TODO confirm against implementation. */
void vec_init_rand_p(double *v, size_t n, size_t np, double max);
/* Elementwise comparison of v1 and v2 with tolerance eps; return-value convention not visible here -- confirm. */
int vec_equals(const double *v1, const double *v2, size_t n, double eps);
/* Print the n entries of v. */
void vec_print(const double *v, size_t n);
/* Sparse matrix-vector multiply in CSR format (csrPtr/csrCol/csrVal), n rows: ys = A*x. */
void dmv_csr(int * csrPtr, int *csrCol, double * csrVal, double *x, double *ys, int n);
/* Dense serial matrix-vector multiply: y = a*x, a given as row pointers, size n. */
void dmv_serial(float **a, const float *x, float *y, size_t n);
#!/bin/bash
####################################
# ARIS slurm script template #
# #
# Submit script: sbatch filename #
# #
####################################
##############################################
# ARIS slurm script template #
# #
# Submit script: sbatch GPU.slurm n1 n2 ... #
# #
##############################################
#SBATCH --job-name=run_GPU # Job name
#SBATCH --output=J.out # Stdout (%j expands to jobId)
#SBATCH --error=J.err # Stderr (%j expands to jobId)
#SBATCH --ntasks=16 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=8 # Number of nodes requested
#SBATCH --ntasks-per-node=2 # Tasks per node
#SBATCH --output=GPU.out
#SBATCH --error=GPU.err
#SBATCH --ntasks=1 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=1 # Number of nodes requested
#SBATCH --ntasks-per-node=1 # Tasks per node
#SBATCH --cpus-per-task=1 # Threads per task
#SBATCH --gres=gpu:2 # GPUs per node
#SBATCH --gres=gpu:1 # GPUs per node
#SBATCH --time=00:40:00 # walltime
#SBATCH --mem=32G # memory per NODE
#SBATCH --partition=gpu # Partition
......@@ -33,17 +33,16 @@ module load cuda
export I_MPI_FABRICS=shm:dapl
output="/users/guest/petyros/Training/Outputs" ##/Inputs
partition="gpu"
## Change this to the directory of your executable!
gpu_prog="/users/guest/petyros/Training/GPUs/cuBLAS"
gpu_prog1="/users/guest/petyros/Training/GPUs/cuBLAS_MultiGPU"
#rm -f "$output/Multi_GPU.$partition" "$output/Single_GPU.$partition"
gpu_prog="./cuda_SingleGPU.exe"
gpu_prog1="./cuBLAS.exe"
gpu_prog2="./cuBLAS_MultiGPU.exe"
## Important note!!! For full GPU utilization in MultiGPU version you must use gres=ntasks-per-node values!!!
for n;
for n;
do
#srun $gpu_prog $n $n >> "$output/Single_GPU.$partition"
srun $gpu_prog1 $n $n >> "$output/Multi_GPU.$partition"
srun $gpu_prog $n $n
srun $gpu_prog1 $n $n
# Important note: In MultiGPU version you must use gres=ntasks-per-node values in order to utilize all GPUs !!!
# srun $gpu_prog2 $n $n
done
CC=g++
ICC =icc
DEBUG ?= 1
DEBUG ?= 0 # Set to 1 for debug
CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
#CFLAGS=-O3 -lm -Wall -mavx2 -mfma -march=haswell -mtune=haswell
......@@ -10,16 +10,19 @@ CFLAGS=-O3 -lm -Wall -mavx -march=ivybridge -mtune=ivybridge -fopenmp
#CFLAGS=-O3 -Wall -xCORE-AVX2
#ICFLAGS=-O3 -Wall -qopenmp -axCORE-AVX2,CORE-AVX-I
# Need to -I this for user-defined functions to work
EXT_DIR = ../External_Functions/
MPI_PREFIX = $(I_MPI_ROOT)
CUDA_PREFIX = $(CUDAROOT)
GPU_MPI_CXX = nvcc -L $(I_MPI_ROOT)/lib64 -lmpi -ccbin mpiicc
GPU_CXX = nvcc
LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt
LDFLAGS ?=-L $(CUDA_PREFIX)/lib64 -lcudart -lcublas -lcusparse -lm -lrt
GPU_COMPILE = nvcc -I $(CUDA_PREFIX)/include -arch sm_35
GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35
CPU_COMPILE = $(CC) $(CFLAGS)
GPU_COMPILE = nvcc -I $(CUDA_PREFIX)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
GPU_MPI_COMPILE = $(GPU_MPI_CXX) -I $(CUDA_PREFIX)/include -I $(I_MPI_ROOT)/include -arch sm_35 -I$(EXT_DIR) $(LDFLAGS)
CPU_COMPILE = $(CC) $(CFLAGS) -I$(EXT_DIR) $(LDFLAGS)
ifeq ($(DEBUG), 1)
CPU_COMPILE += -D_DEBUG_
......@@ -30,23 +33,29 @@ endif
CPU_COMPILE_OBJ= $(CPU_COMPILE) -c
GPU_COMPILE_OBJ= $(GPU_COMPILE) -c
EXT_DIR = /users/guest/petyros/Training/External_Functions/
SOURCE = cuBLAS.cu cuBLAS_MultiGPU.cu
OBJECTS = util.o matrix_op.o timer.o input.o gpu_util.o
PROGRAMS= cuBLAS cuBLAS_MultiGPU
SOURCE = cuBLAS.cu cuBLAS_MultiGPU.cu cuda_SingleGPU.cu
OBJECTS = util.o matrix_op.o timer.o input.o gpu_util.o dmv_gpu.o
PROGRAMS= cuBLAS.exe cuBLAS_MultiGPU.exe cuda_SingleGPU.exe
all: $(PROGRAMS)
cuBLAS_MultiGPU: $(OBJECTS) cuBLAS_MultiGPU.cu
$(GPU_MPI_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) $@.cu
cuda_SingleGPU.exe: $(OBJECTS) cuda_SingleGPU.cu
$(GPU_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) cuda_SingleGPU.cu
cuBLAS_MultiGPU.exe: $(OBJECTS) cuBLAS_MultiGPU.cu
$(GPU_MPI_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) cuBLAS_MultiGPU.cu
cuBLAS: $(OBJECTS) cuBLAS.cu
$(GPU_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) $@.cu
cuBLAS.exe: $(OBJECTS) cuBLAS.cu
$(GPU_COMPILE) -o $@ $(OBJECTS) $(LDFLAGS) cuBLAS.cu
gpu_util.o: $(EXT_DIR)gpu_util.cu
$(GPU_COMPILE_OBJ) -o $@ $<
dmv_gpu.o: dmv_gpu.cu
$(GPU_COMPILE_OBJ) -o $@ $<
%.o: $(EXT_DIR)%.c
$(CPU_COMPILE_OBJ) -o $@ $<
......
......@@ -5,8 +5,11 @@
07/09/2017: Completed
13/09/2017: Modified to use unified memory
->cuBLAS_MultiGPU( cuBLAS implementation in multiple GPUs/Nodes)
->cuBLAS_MultiGPU(cuBLAS implementation in multiple GPUs/Nodes)
26/09/2017: Completed
->cuda_SingleGPU(3 cuda kernels showing the optimization steps in writing GPU code)
02/10/2017: Completed kernel 1
03/10/2017: Completed kernel 2 & 3
```
/*
* A Serial implementation of the Matrix-Vector multiplication
* A cuBLAS implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*
* For more info about cuBLAS see http://docs.nvidia.com/cuda/cublas/index.html
*
*/
#include <errno.h>
......@@ -11,16 +14,17 @@
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse_v2.h>
#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
#include "/users/guest/petyros/Training/External_Functions/util.h"
#include "/users/guest/petyros/Training/External_Functions/input.h"
#include "/users/guest/petyros/Training/External_Functions/gpu_util.h"
/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
#include "gpu_util.h"
int main(int argc, char **argv)
{
/* Initializations */
int i, j, k, n, m;
int i, j, n, m;
int *I, *cooCol, n_z, sparse=0;
double *cooVal, timer;
......@@ -60,14 +64,13 @@ int main(int argc, char **argv)
cudaGetDeviceCount(&device_num);
if (!device_num) printf("No available Cuda Devices");
else {
printf("Single GPU CUDA Version(N=%d, M=%d): ", n, m);
printf("Single GPU cuBLAS Version(N=%d, M=%d): ", n, m);
double alf=1.0; /* Y=a*A*x+b */
double beta=0.0;
cublasStatus_t stat;
cublasHandle_t handle;
double *A, * y, *x_c;
/* Initialize Unified memmory */
/* Initialize Unified memmory visible and accesible from both CPU and GPU */
cudaMallocManaged(&A, m*n * sizeof(double));
cudaMallocManaged(&y, n * sizeof(double));
cudaMallocManaged(&x_c, m * sizeof(double));
......@@ -75,16 +78,16 @@ int main(int argc, char **argv)
cudaCheckErrors("Unified Alloc failed");
if ( !A || !y || !x_c) error("unified alloc failed");
for (i = 0; i < m; i++) x_c[i] = x[i];
matrix_col_major(M, A, n, m);
stat = cublasCreate(&handle);
matrix_col_major(M, A, n, m); /* We transpose the matrix because cuBLAS works with column-major format */
cublasCreate(&handle);
/* Warmup */
stat=cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
cudaDeviceSynchronize();
timer=csecond();
for (j = 0; j < NR_ITER; ++j) {
stat=cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A , n, x_c, 1, &beta, y, 1);
cudaDeviceSynchronize();
}
timer = csecond() - timer;
......@@ -93,9 +96,9 @@ int main(int argc, char **argv)
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
FILE * fp;
char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS.out" ;
char filename[] = "cuBLAS.debug" ; /* Common directory for all implementations, change if needed */
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
for (k = 0; k < n; ++k) fprintf(fp, "%lf ", y[k]) ;
for (i = 0; i < n; ++i) fprintf(fp, "%lf ", y[i]) ;
fclose(fp) ;
#endif
report_results(timer);
......
/*
* A Serial implementation of the Matrix-Vector multiplication
* A Hybrid MPI-CUDA implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*
* For more info about hybrid MPI-CUDA (cublas here) see https://devblogs.nvidia.com/parallelforall/introduction-cuda-aware-mpi/
*/
#include <errno.h>
......@@ -12,10 +14,11 @@
#include <cublas_v2.h>
#include <cusparse_v2.h>
#include <mpi.h>
#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
#include "/users/guest/petyros/Training/External_Functions/util.h"
#include "/users/guest/petyros/Training/External_Functions/input.h"
#include "/users/guest/petyros/Training/External_Functions/gpu_util.h"
/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
#include "gpu_util.h"
......@@ -24,10 +27,10 @@ int main(int argc, char ** argv)
int rank,size;
int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
int i,j,k, sparse=0, *cooCol, n_z, *I;
int i, j, sparse=0, *cooCol, n_z, *I;
double * M, *M_cl, * A, * x, * y, *local_y, *x_c, * cooVal, comm_t, comp_t;
/* MPI basic initializations */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
......@@ -107,30 +110,30 @@ int main(int argc, char ** argv)
cudaSetDevice(rank%device_num);
double alf=1.0; /* Y=a*A*x+b */
double beta=0.0;
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
cublasCreate(&handle);
/* Initialize local GPU memmory. Unified memmory not recomended for MultiGPU+Multinode because data size tends to be large */
double * gpu_y = (double *) gpu_alloc(local_nm[0] * sizeof(*gpu_y)) ;
double * gpu_xc = (double *) gpu_alloc(local_nm[1] * sizeof(*gpu_xc)) ;
double * gpu_A = (double *) gpu_alloc(local_nm[0] * local_nm[1] * sizeof(*gpu_A)) ;
/* Copy data to GPU memmory */
copy_to_gpu(local_y, gpu_y, local_nm[0] * sizeof(*local_y));
copy_to_gpu(x_c, gpu_xc, local_nm[1] * sizeof(*x_c));
copy_to_gpu(A, gpu_A, local_nm[0] * local_nm[1] * sizeof(*A));
/* Warmup */
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cudaDeviceSynchronize();
if (rank==0) {
printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", local_nm[0], local_nm[1], device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE")) ;
printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", local_nm[0], local_nm[1], device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE")) ;
comp_t= MPI_Wtime();
}
for (j = 0; j < NR_ITER; ++j) {
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cudaDeviceSynchronize();
}
cudaCheckErrors("cublasDgemv failed");
......@@ -150,17 +153,19 @@ int main(int argc, char ** argv)
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
FILE * fp;
char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS_MultiGPU.out" ;
char filename[] = "cuBLAS_MultiGPU.debug" ; /* Common directory for all implementations, change if needed */
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
for (k = 0; k < global_nm[0]; ++k) fprintf(fp, "%lf ", y[k]) ;
for (i = 0; i < global_nm[0]; ++i) fprintf(fp, "%lf ", y[i]) ;
fclose(fp) ;
#endif
report_mpi_results(comm_t, comp_t);
/* Free rank 0 local memmory */
free(M);
free(M_cl);
free(y);
free(x);
}
/* Free GPU memmory */
gpu_free(local_y);
gpu_free(A);
gpu_free(x_c);
......
/*
* A cuda implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*
 * The device code for the GPU implementations can be found in the 'dmv_gpu.cu' file
*
 * For more info about coalesced memory access and shmem, see https://cvw.cac.cornell.edu/gpu/coalesced
*/
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse_v2.h>
#include "dmv_gpu.h"
/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
#include "gpu_util.h"
/*
 * Driver for three CUDA matrix-vector (y = A*x) kernels: naive, coalesced,
 * and shared-memory.  Reads a sparse .mtx file (COO) or generates a random
 * dense N x M matrix, then times each kernel over NR_ITER iterations.
 *
 * NOTE(review): the grid is sized from n (rows) but every kernel below
 * receives m as its single size argument and indexes with that one value in
 * both dimensions -- as written this is only correct for square inputs
 * (n == m); confirm intended usage.
 */
int main(int argc, char **argv)
{
	/* Initializations */
	int i, j, n, m;
	int *I, *cooCol, n_z, sparse = 0;
	double *cooVal, timer;

	/* File Input to COO */
	if (argc < 2) error("Too few Arguments");
	else if ( argc == 2) /* ./Program Input_File */
	{
		if(!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1])) error("input and/or COO convertion failed");
		sparse = 1;
	}
	else if ( argc == 3) { /*./Program N M */
		n = atoi(argv[1]);
		m = atoi(argv[2]);
	}
	else error("Too many Arguments");

	int block_size = 256;                      /* GPU threads per block */
	int grid_size = (n-1)/block_size + 1;      /* ceil(n / block_size) blocks: one thread per output row */
	size_t shmem_size = 0;                     /* dynamic shared memory; set per kernel below */
	dim3 gpu_block(block_size, 1);
	dim3 gpu_grid(grid_size, 1);

	/* Allocate host space */
	double *x = (double *) malloc(m * sizeof(*x));
	double *y = (double *) malloc(n * sizeof(*y));
	double *M = (double *) malloc(n * m * sizeof(*M));
	if( !y || !x || !M ) error("memory allocation failed");

	/* Initialize matrices */
	if (sparse) {
		; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
	}
	else ser_matrix_init_rand(M, n, m, 1.0); /* Normal matrices generated randomly */

	/* Initialize vectors */
	vec_init_rand(x, m, 1.0);
	vec_init(y, n, 0.0);

	/* Initialize cuda variables */
	int device_num = 0;
	cudaGetDeviceCount(&device_num);
	if (!device_num) printf("No available Cuda Devices");
	else {
		cudaSetDevice(0);
		printf("Single GPU CUDA Version(N=%d, M=%d): ", n, m);
		double *A, *y, *x_c; /* this y intentionally shadows the host y above */
		/* Unified memory: visible and accessible from both CPU and GPU */
		cudaMallocManaged(&A, m*n * sizeof(double));
		cudaMallocManaged(&y, n * sizeof(double));
		cudaMallocManaged(&x_c, m * sizeof(double));
		cudaDeviceSynchronize();
		cudaCheckErrors("Unified Alloc failed");
		if ( !A || !y || !x_c) error("unified alloc failed");
		for (i = 0; i < m; i++) x_c[i] = x[i];

		/* First naive kernel: row-major A, strided (uncoalesced) global reads */
		for ( i = 0; i < n*m; i++) A[i] = M[i] ;
		timer = csecond();
		for (j = 0; j < NR_ITER; ++j) {
			dmv_gpu_naive<<<gpu_grid,gpu_block,shmem_size>>>(A, x_c, y, m);
			cudaDeviceSynchronize();
		}
		timer = csecond() - timer;
		cudaCheckErrors("naive kernel failed");
		report_results(timer);

		/* Second kernel: coalesced accesses obtained by transposing the matrix */
		printf("Single GPU CUDA Coalesced Version(N=%d, M=%d): ", n, m);
		matrix_col_major(M, A, n, m); /* transpose so warps read consecutive addresses */
		timer = csecond();
		for (j = 0; j < NR_ITER; ++j) {
			dmv_gpu_coalesced<<<gpu_grid,gpu_block,shmem_size>>>(A, x_c, y, m);
			cudaDeviceSynchronize();
		}
		timer = csecond() - timer;
		cudaCheckErrors("coalesced kernel failed");
		report_results(timer);

		/* Third kernel: x staged through block-exclusive shared memory */
		printf("Single GPU CUDA shmem Version(N=%d, M=%d): ", n, m);
		/* NOTE(review): sized in floats to match the kernel's float shared
		 * buffer, which truncates the double x values to single precision
		 * inside dmv_gpu_shmem -- confirm this is intentional. */
		shmem_size = block_size*sizeof(float);
		timer = csecond();
		for (j = 0; j < NR_ITER; ++j) {
			dmv_gpu_shmem<<<gpu_grid,gpu_block,shmem_size>>>(A, x_c, y, m);
			cudaDeviceSynchronize();
		}
		timer = csecond() - timer;
		cudaCheckErrors("shmem kernel failed");
#ifdef _DEBUG_
		/* Output y vector to a file for debugging */
		FILE * fp;
		char filename[] = "CUDA.debug" ; /* Common directory for all implementations, change if needed */
		if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
		for (i = 0; i < n; ++i) fprintf(fp, "%lf ", y[i]) ;
		fclose(fp) ;
#endif
		report_results(timer);

		/* Release unified-memory buffers (previously leaked) */
		cudaFree(A);
		cudaFree(y);
		cudaFree(x_c);
	}
	/* Release host buffers (previously leaked; kept tools like cuda-memcheck/valgrind quiet) */
	free(M);
	free(y);
	free(x);
	return 0;
}
#include <stdio.h>
/*
 * Flattened global thread index: treats the (up to 2-D) grid of (up to
 * 2-D) blocks as one linear working space.
 */
__device__ int get_global_tid()
{
    int block_id   = gridDim.x * blockIdx.y + blockIdx.x;
    int block_size = blockDim.x * blockDim.y;
    int local_id   = blockDim.x * threadIdx.y + threadIdx.x;
    return block_id * block_size + local_id;
}
/*
 * Linear thread index inside the current (up to 2-D) block.
 */
__device__ int get_local_tid()
{
    int tid_in_block = threadIdx.y * blockDim.x + threadIdx.x;
    return tid_in_block;
}
/*
 * Naive kernel: one thread per output row, each reading its row of the
 * row-major matrix sequentially.  Adjacent threads touch addresses n
 * elements apart, so global loads are NOT coalesced.
 * NOTE(review): the single size parameter n serves as both row count and
 * row length -- only correct for square operands as launched by main.
 */
__global__ void dmv_gpu_naive(const double *a, const double *x, double *y,
                              size_t n)
{
    const int row = get_global_tid();

    if (row >= n)
        return;

    double acc = 0.0;
    for (int k = 0; k < n; ++k)
        acc += a[row * n + k] * x[k];

    y[row] = acc;
}
/*
 * Coalesced-access kernel: the matrix is stored column-major (transposed
 * by the host), so at step k the threads of a warp read the consecutive
 * addresses a[k*n + row], a[k*n + row + 1], ... -- coalesced global loads.
 * NOTE(review): as with the naive kernel, one n serves as both dimensions.
 */
__global__ void dmv_gpu_coalesced(const double *a, const double *x,
                                  double *y, size_t n)
{
    const int row = get_global_tid();

    if (row >= n)
        return;

    double acc = 0.0;
    for (int k = 0; k < n; ++k)
        acc += a[k * n + row] * x[k];

    y[row] = acc;
}
/*
 * Shared-memory kernel: x is staged through shared memory in block-sized
 * tiles, so each element of x is fetched from global memory once per block
 * instead of once per thread.
 *
 * NOTE(review): the shared buffer is float while a/x/y are double, so x is
 * truncated to single precision here (host sizes the buffer with
 * block_size*sizeof(float) to match) -- results will differ slightly from
 * the other kernels; confirm intentional.
 * NOTE(review): threads with tid >= n return before reaching the
 * __syncthreads() calls below -- a divergent barrier whenever n is not a
 * multiple of the block size.
 * NOTE(review): last_id = n/block_s truncates, so a non-multiple tail of x
 * is never processed.
 */
__global__ void dmv_gpu_shmem(const double *a, const double *x, double *y, size_t n)
{
extern __shared__ float shmem_buff[] ;
int tid = get_global_tid(), i, j;
double yi = 0.0;
if(tid >= n)
return ;
/* threads per block, this thread's slot in the tile, number of full tiles */
int block_s=blockDim.x*blockDim.y;
int lid=get_local_tid(), last_id = n/block_s ;
for( j = 0; j< last_id; j++) {
/* each thread loads one element of the current tile of x */
shmem_buff[lid] = x[block_s*j + lid];
__syncthreads(); /* tile fully written before anyone reads it */
for( i = 0 ; i < block_s; i++ ) {
yi += a[tid+ (i+j*block_s)*n]*shmem_buff[i];
}
__syncthreads(); /* all reads done before the tile is overwritten */
}
y[tid]=yi;
}
#include <stdio.h>
/*
 * Utility function to get the thread ID within the
 * global working space (flattens the grid of blocks to one linear index).
 */
__device__ int get_global_tid();
/*
 * Utility function to get the thread ID within the
 * local/block working space.
 */
__device__ int get_local_tid();
/*
 * Naive kernel: one thread per output element; row-major reads of 'a'
 * are uncoalesced.  The single size n is used for both dimensions --
 * square operands assumed (see dmv_gpu.cu).
 */
__global__ void dmv_gpu_naive(const double *a, const double *x, double *y, size_t n);
/*
 * Coalesced memory acceses: expects 'a' transposed to column-major so a
 * warp reads consecutive addresses.
 */
__global__ void dmv_gpu_coalesced(const double *a, const double *x, double *y, size_t n);
/*
 * Use of shared memory: caller must launch with dynamic shared memory of
 * block_size*sizeof(float) bytes (the tile buffer is float -- see the
 * precision note in dmv_gpu.cu).
 */
__global__ void dmv_gpu_shmem(const double *a, const double *x, double *y, size_t n);
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
module load gnu/7.2.0
module load intel/17.0.4
module load intelmpi/5.1.3.258
module load binutils
module load cuda
File deleted
/*
* A Hybrid MPI-OpenMP implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*
* For more info about hybrid MPI-OpenMP see http://openmp.org/wp-content/uploads/HybridPP_Slides.pdf
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <mpi.h>
#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
#include "/users/guest/petyros/Training/External_Functions/util.h"
#include "/users/guest/petyros/Training/External_Functions/input.h"
/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
int main(int argc, char ** argv) {
int rank,size;
......@@ -14,7 +23,7 @@ int main(int argc, char ** argv) {
int i,j,k, sparse=0, *cooCol, n_z, *I;
double * M, * A, * x, * y, *local_y, * cooVal, comm_t, comp_t;
/* MPI basic initializations */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
......@@ -85,7 +94,7 @@ int main(int argc, char ** argv) {
MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) {
printf("MPI Version(N=%d, M=%d, Tasks=%d, Nodes=%s, Tasks/Node=%s, threads=%s): ", global_nm[0], global_nm[1], size,
printf("MPI-OpenMP Version(N=%d, M=%d, Tasks=%d, Nodes=%s, Tasks/Node=%s, threads=%s): ", global_nm[0], global_nm[1], size,
getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE"), getenv("OMP_NUM_THREADS"));
comp_t= MPI_Wtime();
}
......@@ -108,12 +117,13 @@ int main(int argc, char ** argv) {
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
FILE * fp;
char * filename = "/users/guest/petyros/Training/Outputs/Debug/MPI-OpenMP.out" ;
char filename[] = "MPI-OpenMP.debug" ;
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
for (k = 0; k < global_nm[0]; ++k) fprintf(fp, "%lf ", y[k]) ;
fclose(fp) ;
#endif
report_mpi_results(comm_t, comp_t);
/* Free rank 0 local memmory */
free(M);
free(y);
}
......
/*
* An MPI implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*
* For more info about MPI see https://computing.llnl.gov/tutorials/mpi/
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <mpi.h>
#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
#include "/users/guest/petyros/Training/External_Functions/util.h"
#include "/users/guest/petyros/Training/External_Functions/input.h"