/*
 * A Multi-GPU (MPI + CUDA/cuBLAS) implementation of the Matrix-Vector multiplication
 *
 * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
 */

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
#include "/users/guest/petyros/Training/External_Functions/util.h"
#include "/users/guest/petyros/Training/External_Functions/input.h"
#include "/users/guest/petyros/Training/External_Functions/gpu_util.h"

int main(int argc, char ** argv)
{
    int rank, size;
    int global_nm[2], local_nm[2];   /* global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain) */
    int global_padded_nm[2];         /* padded global matrix dimensions (if padding is not needed, global_padded=global) */
    int i, j, k, n, m, sparse = 0, *cooCol, n_z, *I;
    double *M, *M_cl, *A, *x, *y, *local_y, *x_c, *cooVal, comm_t, comp_t;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (argc < 2) error("Too few Arguments");
    else if (argc == 2) {   /* ./Program Input_File -> File input converted to COO format */
        if (!mtx_read(&I, &cooCol, &cooVal, &global_nm[0], &global_nm[1], &n_z, argv[1]))
            error("input and/or COO conversion failed");
        sparse = 1;
    }
    else if (argc == 3) {   /* ./Program N M -> Generate random NxM matrix */
        global_nm[0] = atoi(argv[1]);
        global_nm[1] = atoi(argv[2]);
    }
    else error("Too many Arguments");

    /* Pad N if needed, so that the rows divide evenly among the processes */
    local_nm[1] = global_nm[1];
    global_padded_nm[1] = global_nm[1];
    if (global_nm[0] % size == 0) {
        local_nm[0] = global_nm[0] / size;
        global_padded_nm[0] = global_nm[0];
    }
    else {
        local_nm[0] = (global_nm[0] / size) + 1;
        global_padded_nm[0] = local_nm[0] * size;
    }
    /* Convenience aliases for the padded dimensions, used below */
    n = global_padded_nm[0];
    m = global_padded_nm[1];
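
    /*
     * A short worked example of the padding arithmetic above (the numbers are
     * illustrative only): with global_nm[0] = 1000 rows and size = 3 processes,
     * 1000 % 3 != 0, so local_nm[0] = 1000/3 + 1 = 334 and
     * global_padded_nm[0] = 334 * 3 = 1002. The two extra padded rows exist only
     * so that every rank receives an equally sized block in the MPI_Scatter below.
     */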

    x = (double *) malloc(global_padded_nm[1] * sizeof(*x));

    if (rank == 0) {
        M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M));
        M_cl = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M_cl));
        y = (double *) malloc(global_padded_nm[0] * sizeof(*y));
        if (!y || !x || !M || !M_cl) error("memory allocation failed");
        vec_init_rand(x, global_padded_nm[1], 1.0);
        vec_init(y, global_padded_nm[0], 0.0);

        /* Initialize matrices */
        if (sparse) {
            ; //regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z); /* Sparse matrices read from .mtx format */
        }
        else ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */

        /* cuBLAS expects column-major storage */
        matrix_col_major(M, M_cl, n, m);
    }
    //if (rank == 0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n", local_nm[0], local_nm[1], global_padded_nm[0], global_padded_nm[1]);

    /* Initialize Unified memory */
    cudaMallocManaged(&A, local_nm[0] * local_nm[1] * sizeof(double));
    cudaMallocManaged(&local_y, local_nm[0] * sizeof(*local_y));
    cudaMallocManaged(&x_c, m * sizeof(double));
    cudaDeviceSynchronize();
    cudaCheckErrors("Unified Alloc failed");
    if (!A || !local_y || !x_c) error("unified alloc failed");
    vec_init(local_y, local_nm[0], 0.0);

    /* Rank 0 scatters the global (column-major) matrix */
    double * gsendbuf = NULL;
    if (rank == 0) {
        gsendbuf = &(M_cl[0]);
        comm_t = MPI_Wtime();
    }
    MPI_Scatter(gsendbuf, local_nm[1] * local_nm[0], MPI_DOUBLE, A, local_nm[1] * local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
    //if (rank == 0) printf("Scatter complete\n");

    MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
    for (i = 0; i < m; i++) x_c[i] = x[i];   /* copy x into Unified memory so the GPU can access it */

    /* Initialize cuda/cublas variables */
    int device_num = 0;
    cudaGetDeviceCount(&device_num);
    if (!device_num) printf("No available Cuda Devices");
    else {
        double alf = 1.0;   /* y = alf*A*x + beta*y */
        double beta = 0.0;
        cublasStatus_t stat;
        cublasHandle_t handle;
        stat = cublasCreate(&handle);

        /* Warmup call, excluded from the timed loop */
        stat = cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, A, local_nm[0], x_c, 1, &beta, local_y, 1);
        cudaDeviceSynchronize();

        if (rank == 0) {
            printf("Multi GPU CUDA-MPI Version(N=%d, M=%d, GPUs/Node=%d, Nodes=%s, Tasks/Node=%s): ",
                n, m, device_num, getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE"));
            comp_t = MPI_Wtime();
        }
        for (j = 0; j < NR_ITER; ++j) {
            stat = cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, A, local_nm[0], x_c, 1, &beta, local_y, 1);
            cudaDeviceSynchronize();
        }
        cudaCheckErrors("cublasDgemv failed");

        MPI_Barrier(MPI_COMM_WORLD);
        if (rank == 0) comp_t = MPI_Wtime() - comp_t;

        /* Gather the partial results back to rank 0 */
        MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
        if (rank == 0) comm_t = MPI_Wtime() - comm_t - comp_t;

#ifdef _DEBUG_
        /* Output y vector to a file for debugging */
        if (rank == 0) {
            FILE * fp;
            char * filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS_MultiGPU.out";
            if ((fp = fopen(filename, "w")) == NULL) error("Output file creation failed\n");
            for (k = 0; k < global_nm[0]; ++k) fprintf(fp, "%lf ", y[k]);
            fclose(fp);
        }
#endif
        if (rank == 0) {
            report_mpi_results(comm_t, comp_t);
            free(M);
            free(M_cl);
            free(y);
        }
        cublasDestroy(handle);
    }
    free(x);
    cudaFree(local_y);   /* Unified memory must be released with cudaFree, not free */
    cudaFree(A);
    cudaFree(x_c);
    MPI_Finalize();
    return 0;
}
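
/*
 * Example build/run invocation (a sketch only; the compiler wrapper, the objects
 * linked from External_Functions, the Slurm flags and the source file name
 * "cuBLAS_MultiGPU.cu" are assumptions, not part of the original source):
 *
 *   nvcc -O3 -ccbin=mpicc cuBLAS_MultiGPU.cu -lcublas -o cuBLAS_MultiGPU
 *   srun -N 2 --ntasks-per-node=1 ./cuBLAS_MultiGPU 10000 10000
 *
 * The second command multiplies a randomly generated 10000x10000 matrix with a
 * random vector; passing a single .mtx file instead reads a sparse matrix in COO form.
 */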