/*
 * A CUDA implementation of the Matrix-Vector multiplication (y = A*x).
 *
 * Author: Petros Anastasiadis (panastas@cslab.ece.ntua.gr)
 *
 * The device code for the GPU implementations can be found in 'dmv_gpu.cu'.
 *
 * For more info about coalesced memory access and shmem, see
 * https://cvw.cac.cornell.edu/gpu/coalesced
 */

/* NOTE(review): the original header names were lost (bare "#include" lines);
 * restored to the standard C/CUDA set this file's code actually uses —
 * confirm against the project's build. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include "dmv_gpu.h"

/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
#include "gpu_util.h"

/* Number of GPU threads per block. Modifying this value might lead to
 * performance issues (grid sizing below assumes it is >= 1). */
#define block_size 256

int main(int argc, char **argv)
{
	/* Initializations */
	int i, j, n = 0, m = 0;
	double timer;

	/* Parse "./Program N M"; error() is assumed to terminate the program
	 * (n/m would otherwise be used uninitialized). */
	if (argc < 3) error("Usage: ./Program N M");
	else if (argc == 3) { /* ./Program N M */
		n = atoi(argv[1]);
		m = atoi(argv[2]);
	}
	else error("Too many Arguments");

	/* FIX: guard against non-positive sizes; (n-1)/block_size would be
	 * nonsense for n <= 0 and the mallocs below would be 0-sized. */
	if (n < 1 || m < 1) error("N and M must be positive integers");

	int grid_size = (n - 1) / block_size + 1; /* ceil(n / block_size) */
	size_t shmem_size = 0;                    /* dynamic shmem; only the third kernel uses it */

	/* GPU kernel block/grid sizes */
	dim3 gpu_block(block_size, 1);
	dim3 gpu_grid(grid_size, 1);

	/* Allocate host-side buffers */
	double *x = (double *) malloc(m * sizeof(*x));
	double *y = (double *) malloc(n * sizeof(*y));
	double *M = (double *) malloc(n * m * sizeof(*M));
	if (!y || !x || !M) error("memory allocation failed");

	/* Initialize matrix (randomly) and vectors */
	ser_matrix_init_rand(M, n, m, 1.0);
	vec_init_rand(x, m, 1.0);
	vec_init(y, n, 0.0);

	/* Initialize cuda variables */
	int device_num = 0;
	cudaGetDeviceCount(&device_num);
	if (!device_num) printf("No available Cuda Devices");
	else {
		cudaSetDevice(0);
		printf("Single GPU CUDA Version(N=%d, M=%d): ", n, m);

		/* Unified memory, visible and accessible from both CPU and GPU.
		 * FIX: renamed the managed output vector to y_c — it previously
		 * shadowed the host 'y' declared above. */
		double *A, *y_c, *x_c;
		cudaMallocManaged(&A, m * n * sizeof(double));
		cudaMallocManaged(&y_c, n * sizeof(double));
		cudaMallocManaged(&x_c, m * sizeof(double));
		cudaDeviceSynchronize();
		cudaCheckErrors("Unified Alloc failed");
		if (!A || !y_c || !x_c) error("unified alloc failed");

		for (i = 0; i < m; i++) x_c[i] = x[i];

		/* First, naive kernel: row-major matrix.
		 * FIX: the launch configuration was garbled ("<<>>") — restored
		 * to <<<gpu_grid, gpu_block>>> for all three launches. */
		for (i = 0; i < n * m; i++) A[i] = M[i];
		timer = csecond();
		for (j = 0; j < NR_ITER; ++j) {
			dmv_gpu_naive<<<gpu_grid, gpu_block>>>(A, x_c, y_c, m);
			/* per-iteration sync so the wall-clock timer measures
			 * completed kernel work, not just launch overhead */
			cudaDeviceSynchronize();
		}
		timer = csecond() - timer;
		cudaCheckErrors("naive kernel failed");
		report_results(timer);

		/* Second kernel, using coalesced memory accesses in the GPU by
		 * transposing the matrix. */
		printf("Single GPU CUDA Coalesced Version(N=%d, M=%d): ", n, m);
		/* We transpose the matrix to better match the GPU coalesced
		 * memory access logic */
		matrix_col_major(M, A, n, m);
		timer = csecond();
		for (j = 0; j < NR_ITER; ++j) {
			dmv_gpu_coalesced<<<gpu_grid, gpu_block>>>(A, x_c, y_c, m);
			cudaDeviceSynchronize();
		}
		timer = csecond() - timer;
		cudaCheckErrors("coalesced kernel failed");
		report_results(timer);

		/* Third and final kernel further improves memory access speed by
		 * using block-exclusive shmem. */
		printf("Single GPU CUDA shmem Version(N=%d, M=%d): ", n, m);
		/* FIX: was block_size*sizeof(float) — all data in this program is
		 * double, so that would under-allocate the dynamic shared buffer
		 * by half. NOTE(review): assumes dmv_gpu_shmem stages doubles in
		 * shared memory — confirm against dmv_gpu.cu. */
		shmem_size = block_size * sizeof(double);
		timer = csecond();
		for (j = 0; j < NR_ITER; ++j) {
			dmv_gpu_shmem<<<gpu_grid, gpu_block, shmem_size>>>(A, x_c, y_c, m);
			cudaDeviceSynchronize();
		}
		timer = csecond() - timer;
		cudaCheckErrors("shmem kernel failed");

#ifdef _DEBUG_
		/* Output y vector to a file for debugging */
		FILE *fp;
		char filename[] = "CUDA.debug"; /* Common directory for all implementations, change if needed */
		if ((fp = fopen(filename, "w")) == NULL)
			error("Output file creation failed\n");
		for (i = 0; i < n; ++i) fprintf(fp, "%lf ", y_c[i]);
		fclose(fp);
#endif
		report_results(timer);

		/* FIX: release unified-memory allocations (previously leaked) */
		cudaFree(A);
		cudaFree(y_c);
		cudaFree(x_c);
	}

	/* FIX: release host buffers (previously leaked) */
	free(x);
	free(y);
	free(M);
	return 0;
}