/*
 * Dense matrix-vector multiplication (y = A*x) CUDA kernels:
 * naive, coalesced (transposed A), and shared-memory tiled variants.
 */
#include <stdio.h>
/*
* Utility function to get the thread ID within the
* global working space.
*/
__device__ int get_global_tid()
{
    /* Flatten the (2-D grid, 2-D block) launch into one linear thread id:
     * linear block index * threads per block + linear id within the block. */
    int linear_block  = gridDim.x * blockIdx.y + blockIdx.x;
    int block_threads = blockDim.x * blockDim.y;
    int local_id      = blockDim.x * threadIdx.y + threadIdx.x;
    return linear_block * block_threads + local_id;
}
/*
* Utility function to get the thread ID within the
* local/block working space.
*/
__device__ int get_local_tid()
{
    /* Row-major flattening of the 2-D thread index inside its block. */
    return threadIdx.y * blockDim.x + threadIdx.x;
}
/*
* Naive kernel
*/
/*
 * Naive kernel: one thread per output row, each thread walking an entire
 * row of a (row-major). Adjacent threads therefore access global memory
 * with stride n — deliberately uncoalesced, as a baseline.
 * Expects a 1-D/2-D launch covering at least n threads; extra threads exit.
 */
__global__ void dmv_gpu_naive(const double *a, const double *x, double *y,
                              size_t n)
{
    int row = get_global_tid();
    if (row < n) {
        double acc = 0.0;
        for (size_t col = 0; col < n; ++col)
            acc += a[row * n + col] * x[col];
        y[row] = acc;
    }
}
/*
* Coalesced memory accesses kernel (requires transposed matrix a)
*/
/*
 * Coalesced kernel: requires a stored TRANSPOSED, so element (row, col) of
 * the logical matrix lives at a[col*n + row]. Adjacent threads then read
 * adjacent addresses each iteration — one memory transaction per warp.
 * Expects a launch covering at least n threads; extra threads exit.
 */
__global__ void dmv_gpu_coalesced(const double *a, const double *x,
                                  double *y, size_t n)
{
    int row = get_global_tid();
    if (row >= n)
        return;

    double acc = 0.0;
    for (size_t k = 0; k < n; ++k)
        acc += a[k * n + row] * x[k];   /* contiguous across the warp */
    y[row] = acc;
}
/*
* Final kernel making use of shared memory to improve memory bandwidth utilization and access pattern
*/
/*
 * Shared-memory kernel: requires a stored TRANSPOSED (as in dmv_gpu_coalesced).
 * The vector x is staged through shared memory in tiles of blockDim.x*blockDim.y
 * elements so each value of x is read from global memory once per block
 * instead of once per thread.
 *
 * Launch requirements:
 *   - dynamic shared memory: blockDim.x * blockDim.y * sizeof(double) bytes
 *     (NOTE: the buffer must be double, not float — staging doubles through a
 *     float buffer silently truncated precision vs. the other kernels)
 *   - grid covering at least n threads
 *
 * Fixes vs. the previous version:
 *   1. Shared buffer type float -> double (precision bug).
 *   2. Threads with tid >= n no longer return before __syncthreads();
 *      a barrier inside divergent control flow is undefined behavior, so
 *      every thread now participates in staging and both barriers.
 *   3. Tail handling: n/block_s truncated, dropping the last partial tile
 *      of x whenever n was not a multiple of the block size.
 */
__global__ void dmv_gpu_shmem(const double *a, const double *x, double *y, size_t n)
{
    extern __shared__ double shmem_buff[];

    int tid     = get_global_tid();
    int lid     = get_local_tid();
    int block_s = blockDim.x * blockDim.y;
    double yi   = 0.0;

    /* Walk x in tiles of block_s elements. ALL threads execute every
     * iteration so the __syncthreads() calls are never divergent; per-thread
     * guards protect the global loads/stores instead of an early return. */
    for (size_t start = 0; start < n; start += (size_t)block_s) {
        size_t idx = start + (size_t)lid;
        /* Stage this tile of x; pad out-of-range slots with 0.0 so they
         * contribute nothing if read. */
        shmem_buff[lid] = (idx < n) ? x[idx] : 0.0;
        __syncthreads();

        /* Clamp the tile length at the tail when n % block_s != 0. */
        size_t tile = (start + (size_t)block_s <= n) ? (size_t)block_s
                                                     : n - start;
        if ((size_t)tid < n) {
            for (size_t i = 0; i < tile; ++i)
                yi += a[(start + i) * n + (size_t)tid] * shmem_buff[i];
        }
        __syncthreads();
    }

    if ((size_t)tid < n)
        y[tid] = yi;
}