/*
 * A cuBLAS (single GPU, unified memory) implementation of the Matrix-Vector multiplication
 *
 * Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
 */

/*
 * NOTE(review): the names of the seven system headers below were missing from
 * the reviewed copy of this file; they are reconstructed from what the code
 * uses (stdio/stdlib, CUDA runtime, cuBLAS v2) -- confirm against the build.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "/users/guest/petyros/Training/External_Functions/matrix_op.h"
#include "/users/guest/petyros/Training/External_Functions/util.h"
#include "/users/guest/petyros/Training/External_Functions/input.h"
#include "/users/guest/petyros/Training/External_Functions/gpu_util.h"

/*
 * Benchmark y = alf * A * x + beta * y with cublasDgemv on a single GPU.
 *
 * Usage:
 *   ./Program Input_File   read the matrix through mtx_read() (COO arrays)
 *   ./Program N M          generate a random N x M dense matrix
 *
 * The operands are placed in unified (managed) memory, one warm-up call is
 * issued, then NR_ITER timed calls are run and the elapsed time is handed to
 * report_results(). When _DEBUG_ is defined the result vector is also written
 * to a file.
 *
 * Fatal conditions are reported through error(); this code assumes error()
 * does not return (TODO confirm against util.h).
 *
 * Returns 0, including when no CUDA device is available.
 */
int main(int argc, char **argv)
{
	/* Initializations */
	int i, j, n = 0, m = 0;
	int *I = NULL, *cooCol = NULL, n_z = 0, sparse = 0;
	double *cooVal = NULL, timer;

	/* File Input to COO */
	if (argc < 2) error("Too few Arguments");
	else if (argc == 2) { /* ./Program Input_File */
		if (!mtx_read(&I, &cooCol, &cooVal, &n, &m, &n_z, argv[1]))
			error("input and/or COO convertion failed");
		sparse = 1;
	}
	else if (argc == 3) { /* ./Program N M */
		n = atoi(argv[1]);
		m = atoi(argv[2]);
	}
	else error("Too many Arguments");

	/* atoi() yields 0 on garbage and accepts negatives; neither is a usable size */
	if (n <= 0 || m <= 0) error("matrix dimensions must be positive");

	/* Element count computed in size_t so that n * m cannot overflow int */
	const size_t elems = (size_t) n * (size_t) m;

	/* Allocate space */
	double *x = (double *) malloc((size_t) m * sizeof(*x));
	double *y = (double *) malloc((size_t) n * sizeof(*y));
	/* Zero-filled: in sparse mode nothing below writes M before it is read */
	double *M = (double *) calloc(elems, sizeof(*M));
	if (!y || !x || !M) error("memory allocation failed");

	/* Initialize matrices */
	if (sparse) {
		/*
		 * Sparse matrices read from .mtx format.
		 * NOTE(review): the COO -> dense expansion is disabled, so the
		 * benchmark currently runs on an all-zero matrix in this mode:
		 * regenerate_matrix_coo(M, I, cooCol, cooVal, n, m, n_z);
		 */
		;
	}
	else ser_matrix_init_rand(M, n, m, 1.0); /* Normal matrices generated randomly */

	/* Initialize vectors */
	vec_init_rand(x, m, 1.0);
	vec_init(y, n, 0.0);

	/* Initialize cuda/cublas variables */
	int device_num = 0;
	/* A failed query is treated the same as "no device" */
	if (cudaGetDeviceCount(&device_num) != cudaSuccess) device_num = 0;

	if (!device_num) printf("No available Cuda Devices");
	else {
		printf("Single GPU CUDA Version(N=%d, M=%d): ", n, m);
		double alf = 1.0; /* Y=a*A*x+b */
		double beta = 0.0;
		cublasStatus_t stat;
		cublasHandle_t handle;
		/* NULL-initialized so the pointer test below is meaningful on failure */
		double *A = NULL, *y_c = NULL, *x_c = NULL;

		/* Initialize Unified memory */
		cudaMallocManaged(&A, elems * sizeof(double));
		cudaMallocManaged(&y_c, (size_t) n * sizeof(double));
		cudaMallocManaged(&x_c, (size_t) m * sizeof(double));
		cudaDeviceSynchronize();
		cudaCheckErrors("Unified Alloc failed");
		if (!A || !y_c || !x_c) error("unified alloc failed");

		/* Copy the host operands into the managed buffers */
		for (i = 0; i < m; i++) x_c[i] = x[i];
		for (i = 0; i < n; i++) y_c[i] = y[i];
		/* presumably converts M into the column-major layout cuBLAS expects -- see matrix_op.h */
		matrix_col_major(M, A, n, m);

		stat = cublasCreate(&handle);
		if (stat != CUBLAS_STATUS_SUCCESS) error("cublasCreate failed");

		/* Warmup */
		stat = cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A, n, x_c, 1, &beta, y_c, 1);
		cudaDeviceSynchronize();
		if (stat != CUBLAS_STATUS_SUCCESS) error("cublasDgemv warmup failed");

		timer = csecond();
		for (j = 0; j < NR_ITER; ++j) {
			const cublasStatus_t iter_stat =
				cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A, n, x_c, 1, &beta, y_c, 1);
			/* Remember a failure but defer reporting until the timed region ends */
			if (iter_stat != CUBLAS_STATUS_SUCCESS) stat = iter_stat;
			/* Each call is completed before the next one is issued */
			cudaDeviceSynchronize();
		}
		timer = csecond() - timer;
		cudaCheckErrors("cublasDgemv failed");
		if (stat != CUBLAS_STATUS_SUCCESS) error("cublasDgemv failed");

#ifdef _DEBUG_
		/* Output y vector to a file for debugging */
		FILE *fp;
		const char *filename = "/users/guest/petyros/Training/Outputs/Debug/cuBLAS.out";
		if ((fp = fopen(filename, "w")) == NULL) error("Output file creation failed\n");
		for (int k = 0; k < n; ++k) fprintf(fp, "%lf ", y_c[k]);
		fclose(fp);
#endif
		report_results(timer);

		/* Release GPU-side resources */
		cublasDestroy(handle);
		cudaFree(A);
		cudaFree(y_c);
		cudaFree(x_c);
	}

	/*
	 * Release host buffers. I, cooCol and cooVal are owned by mtx_read();
	 * they are left alone because its allocator is not visible here.
	 */
	free(x);
	free(y);
	free(M);
	return 0;
}