/*
 * A multi-GPU MPI + cuBLAS implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*/
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse_v2.h>
#include <mpi.h>
#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
#include "/users/guest/petyros/Training/External_Functions/util.h"
#include "/users/guest/petyros/Training/External_Functions/input.h"
#include "/users/guest/petyros/Training/External_Functions/gpu_util.h"
int main(int argc, char ** argv)
{
int rank,size;
int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
int i,j,k, sparse=0, *cooCol, n_z, *I;
double *M = NULL, *M_cl = NULL, *A = NULL, *x = NULL, *y = NULL, *local_y = NULL, *x_c = NULL, *cooVal = NULL, comm_t, comp_t;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
if (argc < 2) error("Too few Arguments");
else if ( argc == 2) /* ./Program Input_File -> File Input to COO */
{
if(!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1])) error("input and/or COO conversion failed");
sparse = 1;
}
else if ( argc == 3) { /*./Program N M -> Generate random NxM matrix */
global_nm[0]=atoi(argv[1]);
global_nm[1]=atoi(argv[2]);
}
else error("Too many Arguments");
/* Pad the matrix columns if needed, so that global_nm[1] divides evenly among the processes */
local_nm[0]=global_nm[0];
global_padded_nm[0]=global_nm[0];
if (global_nm[1]%size==0) {
local_nm[1]=global_nm[1]/size;
global_padded_nm[1]=global_nm[1];
}
else { /* pad so that every process gets the same number of columns */
local_nm[1]=(global_nm[1]/size)+1;
global_padded_nm[1]=local_nm[1]*size;
}
x = (double *) malloc(global_padded_nm[1] * sizeof(*x));
if (rank==0) {
M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M));
M_cl = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M_cl));
y = (double *) malloc(global_padded_nm[0] * sizeof(*y));
if( !y || !x || !M || !M_cl ) error("memory allocation failed"); /* check allocations before touching the buffers */
vec_init_rand_p(x, global_nm[1], global_padded_nm[1] - global_nm[1], 1.0);
vec_init(y, global_padded_nm[0], 0.0);
/* Initialize matrices */
if (sparse) {
; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
}
else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
}
//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local_nm[0],local_nm[1],global_padded_nm[0],global_padded_nm[1]);
/* Initialize process local memory */
local_y = (double *) malloc(local_nm[0] * sizeof(*local_y));
A = (double *) malloc(local_nm[0] * local_nm[1] * sizeof(*A));
x_c = (double *) malloc(local_nm[1] * sizeof(*x_c));
if ( !A || !local_y || !x_c) error("Process local alloc failed");
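/* Convert M to column-major (M_cl) so that MPI_Scatter hands each process a contiguous block of whole columns, which column-major cuBLAS can use directly */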
if(rank == 0) matrix_col_major(M, M_cl, global_padded_nm[0], global_padded_nm[1]);
/* Rank 0 scatters the global matrix and x vector */
double * gsendbuf = NULL; /* only meaningful on rank 0 */
if (rank == 0){
gsendbuf = &(M_cl[0]);
comm_t= MPI_Wtime();
}
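/* Each process receives a local_nm[0] x local_nm[1] column block (local_nm[1] whole columns) of the padded, column-major matrix */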
MPI_Scatter(gsendbuf, local_nm[1] * local_nm[0], MPI_DOUBLE, A, local_nm[1] * local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
//if (rank == 0) printf("Scatter complete\n");
MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank == 0) comm_t= MPI_Wtime() - comm_t;
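/* Keep only the slice of x that multiplies this process' columns */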
for (i = 0; i < local_nm[1]; i++) x_c[i] = x[rank*local_nm[1] + i];
/* Initialize cuda/cublas variables */
int device_num=0;
cudaGetDeviceCount(&device_num);
if (!device_num) printf("No available CUDA devices\n");
else {
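/* Map MPI ranks to the node's GPUs round-robin, so several ranks per node can use different devices */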
cudaSetDevice(rank%device_num);
double alf=1.0; /* y = alf*A*x + beta*y */
double beta=0.0;
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
/* Initialize local GPU memory */
double * gpu_y = (double *) gpu_alloc(local_nm[0] * sizeof(*gpu_y)) ;
double * gpu_xc = (double *) gpu_alloc(local_nm[1] * sizeof(*gpu_xc)) ;
double * gpu_A = (double *) gpu_alloc(local_nm[0] * local_nm[1] * sizeof(*gpu_A)) ;
copy_to_gpu(local_y, gpu_y, local_nm[0] * sizeof(*local_y));
copy_to_gpu(x_c, gpu_xc, local_nm[1] * sizeof(*x_c));
copy_to_gpu(A, gpu_A, local_nm[0] * local_nm[1] * sizeof(*A));
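/* cublasDgemv computes gpu_y = alf*gpu_A*gpu_xc + beta*gpu_y; gpu_A is the column-major local block with leading dimension local_nm[0] */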
/* Warm-up call (not timed) */
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A, local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cudaDeviceSynchronize();
if (rank == 0) {
printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ", local_nm[0], local_nm[1], device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE"));
comp_t = MPI_Wtime(); /* start the computation timer after the warm-up */
}
/* Timed call */
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A, local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cudaDeviceSynchronize();
cudaCheckErrors("cublasDgemv failed");
MPI_Barrier(MPI_COMM_WORLD);
if (rank==0) comp_t= MPI_Wtime() - comp_t;
copy_from_gpu(local_y, gpu_y, local_nm[0] * sizeof(*local_y));
cudaDeviceSynchronize();
MPI_Barrier(MPI_COMM_WORLD);
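/* Re-open the communication timer: subtracting the previously measured elapsed time makes the reduce below accumulate on top of the scatter/bcast time */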
if (rank==0) comm_t= MPI_Wtime() - comm_t;
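/* Every process holds a full-length partial y (it multiplied only its column block), so an element-wise sum on rank 0 gives the final result */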
MPI_Reduce(local_y, y, local_nm[0], MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
//MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) comm_t = MPI_Wtime() - comm_t;
if (rank == 0) {
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
FILE * fp;
char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS_MultiGPU.out" ;
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
for (k = 0; k < global_nm[0]; ++k) fprintf(fp, "%lf ", y[k]) ;
fclose(fp) ;
#endif
report_mpi_results(comm_t, comp_t);
free(M);
free(x);
}
gpu_free(gpu_y);
gpu_free(gpu_xc);
gpu_free(gpu_A);
}
free(local_y);
free(A);
free(x_c);
MPI_Finalize();
return 0;
}