Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/*
* A cuBLAS implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*
* For more info about cuBLAS see http://docs.nvidia.com/cuda/cublas/index.html
*
*/
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse_v2.h>
/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
#include "gpu_util.h"
/*
 * Parse a strictly positive decimal integer that fits in an int.
 *
 * Returns the parsed value, or 0 when the text is empty, has trailing
 * garbage, is not positive, or does not fit in an int.
 */
static int parse_positive_int(const char *text)
{
    char *end = NULL;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (errno != 0 || end == text || *end != '\0') return 0;
    /* Round-trip through int rejects values that would be truncated. */
    if (value <= 0 || (long)(int)value != value) return 0;
    return (int)value;
}

/*
 * Single-GPU cuBLAS benchmark of the dense matrix-vector product y = A*x.
 *
 * Usage: ./Program N M   (N rows, M columns, both positive integers)
 *
 * Builds a random N x M matrix and a random M-vector on the host, copies
 * them into unified (managed) memory, runs one warm-up cublasDgemv and then
 * NR_ITER timed ones, and passes the elapsed time to report_results().
 * With _DEBUG_ defined the result vector is also written to "cuBLAS.debug".
 *
 * NOTE(review): error() and cudaCheckErrors() come from the project headers;
 * this code relies on them terminating the program on failure - confirm.
 */
int main(int argc, char **argv)
{
    /* Initializations */
    int n = 0, m = 0;
    double timer;

    if (argc < 3) error("Usage: ./Program N M");
    else if (argc == 3) { /*./Program N M */
        n = parse_positive_int(argv[1]);
        m = parse_positive_int(argv[2]);
    }
    else error("Too many Arguments");

    if (n <= 0 || m <= 0) error("N and M must be positive integers");

    /* Do all size arithmetic in size_t: n * m overflows int for large inputs. */
    const size_t rows = (size_t)n;
    const size_t cols = (size_t)m;
    if (rows > ((size_t)-1) / sizeof(double) / cols) error("matrix size overflows size_t");
    const size_t elems = rows * cols;

    /* Allocate space */
    double *x = (double *) malloc(cols * sizeof(*x));
    double *M = (double *) malloc(elems * sizeof(*M));
    if (!x || !M) error("memory allocation failed");

    /* Initialize matrices */
    ser_matrix_init_rand(M, n, m, 1.0); /* Normal matrices generated randomly */
    /* Initialize vectors */
    vec_init_rand(x, m, 1.0);

    /* Initialize cuda/cublas variables */
    int device_num = 0;
    if (cudaGetDeviceCount(&device_num) != cudaSuccess || device_num < 1) {
        printf("No available Cuda Devices...terminating");
        free(x);
        free(M);
        return EXIT_FAILURE; /* nothing was computed, so do not report success */
    }

    double alf = 1.0;  /* y = alf * A * x + beta * y */
    double beta = 0.0;
    cublasHandle_t handle;
    cublasStatus_t stat;
    /* Start from NULL so the post-allocation check never reads garbage. */
    double *A = NULL, *y = NULL, *x_c = NULL;

    printf("Single GPU cuBLAS Version(N=%d, M=%d): ", n, m);

    /* Initialize Unified memory visible and accessible from both CPU and GPU */
    cudaMallocManaged(&A, elems * sizeof(double));
    cudaMallocManaged(&y, rows * sizeof(double));
    cudaMallocManaged(&x_c, cols * sizeof(double));
    cudaDeviceSynchronize();
    cudaCheckErrors("Unified Alloc failed");
    if (!A || !y || !x_c) error("unified alloc failed");

    /* Stage the inputs in managed memory (written from the host side). */
    memcpy(x_c, x, cols * sizeof(double));
    matrix_col_major(M, A, n, m); /* We transpose the matrix because cuBLAS works with column-major format */

    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) error("cublasCreate failed");

    /* GPU Warmup */
    stat = cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A, n, x_c, 1, &beta, y, 1);
    if (stat != CUBLAS_STATUS_SUCCESS) error("cublasDgemv failed");
    cudaDeviceSynchronize();
    cudaCheckErrors("cublasDgemv failed");

    /*
     * Timed section. The synchronize stays inside the loop on purpose so
     * that every iteration is measured to completion, as in the original
     * benchmark.
     */
    timer = csecond();
    for (int iter = 0; iter < NR_ITER; ++iter) {
        stat = cublasDgemv(handle, CUBLAS_OP_N, n, m, &alf, A, n, x_c, 1, &beta, y, 1);
        if (stat != CUBLAS_STATUS_SUCCESS) error("cublasDgemv failed");
        cudaDeviceSynchronize();
    }
    timer = csecond() - timer;
    cudaCheckErrors("cublasDgemv failed");

#ifdef _DEBUG_
    /* Output y vector to a file for debugging */
    FILE * fp;
    char filename[] = "cuBLAS.debug" ; /* Common directory for all implementations, change if needed */
    if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
    for (int i = 0; i < n; ++i) fprintf(fp, "%lf ", y[i]) ;
    fclose(fp) ;
#endif

    report_results(timer);

    /* Release the cuBLAS handle, the managed buffers and the host buffers. */
    cublasDestroy(handle);
    cudaFree(A);
    cudaFree(y);
    cudaFree(x_c);
    free(x);
    free(M);
    return 0;
}