#include <stddef.h>  /* for size_t; the original header name was lost, stddef.h assumed */

/*
 * Utility function to get the thread ID within the
 * global working space.
 */
__device__ int get_global_tid()
{
    return (gridDim.x*blockIdx.y + blockIdx.x)*blockDim.x*blockDim.y +
           blockDim.x*threadIdx.y + threadIdx.x;
}

/*
 * Utility function to get the thread ID within the
 * local/block working space.
 */
__device__ int get_local_tid()
{
    return blockDim.x*threadIdx.y + threadIdx.x;
}

/*
 * Naive kernel: each thread computes one element of y by walking its
 * row of a, which leads to strided (non-coalesced) global loads.
 */
__global__ void dmv_gpu_naive(const double *a, const double *x, double *y,
                              size_t n)
{
    int tid = get_global_tid();
    double yi = 0.0;

    if (tid >= n)
        return;

    for (int j = 0; j < n; j++)
        yi += a[tid*n + j]*x[j];

    y[tid] = yi;
}

/*
 * Coalesced memory accesses kernel (requires transposed matrix a):
 * consecutive threads read consecutive elements of each matrix column.
 */
__global__ void dmv_gpu_coalesced(const double *a, const double *x, double *y,
                                  size_t n)
{
    int tid = get_global_tid();
    double yi = 0.0;

    if (tid >= n)
        return;

    for (int j = 0; j < n; j++)
        yi += a[j*n + tid]*x[j];

    y[tid] = yi;
}

/*
 * Final kernel making use of shared memory to improve memory bandwidth
 * utilization and access pattern: the vector x is staged into shared
 * memory one block-sized tile at a time.
 *
 * Note: n is assumed to be a multiple of the block size, so that no
 * thread of a block returns early before reaching __syncthreads().
 */
__global__ void dmv_gpu_shmem(const double *a, const double *x, double *y,
                              size_t n)
{
    extern __shared__ double shmem_buff[];

    int tid = get_global_tid(), i, j;
    double yi = 0.0;

    if (tid >= n)
        return;

    int block_s = blockDim.x*blockDim.y;
    int lid = get_local_tid(), last_id = n/block_s;

    for (j = 0; j < last_id; j++) {
        /* Each thread loads one element of the current tile of x. */
        shmem_buff[lid] = x[block_s*j + lid];
        __syncthreads();

        for (i = 0; i < block_s; i++)
            yi += a[tid + (i + j*block_s)*n]*shmem_buff[i];
        __syncthreads();
    }

    y[tid] = yi;
}
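
/*
 * A minimal host-side launch sketch, not part of the original file, showing
 * one way these kernels could be invoked.  The problem size N, the block
 * size, and the DMV_EXAMPLE_MAIN guard are assumptions for illustration
 * only.  dmv_gpu_coalesced and dmv_gpu_shmem expect the matrix stored
 * transposed, and dmv_gpu_shmem needs block_size*sizeof(double) bytes of
 * dynamic shared memory and an n divisible by the block size.
 */
#ifdef DMV_EXAMPLE_MAIN   /* hypothetical guard so the sketch does not clash with an existing driver */
#include <cuda_runtime.h>

int main(void)
{
    const size_t N = 1024;          /* assumed problem size */
    const size_t block_size = 256;  /* assumed 1-D block size (divides N) */

    double *a_d, *x_d, *y_d;
    cudaMalloc((void **)&a_d, N*N*sizeof(double));
    cudaMalloc((void **)&x_d, N*sizeof(double));
    cudaMalloc((void **)&y_d, N*sizeof(double));
    /* ... copy the matrix (transposed for the coalesced/shmem kernels)
       and the vector x to a_d and x_d here ... */

    dim3 block(block_size);
    dim3 grid((N + block_size - 1) / block_size);

    /* Naive version: matrix stored row-major. */
    dmv_gpu_naive<<<grid, block>>>(a_d, x_d, y_d, N);

    /* Shared-memory version: pass the tile size as dynamic shared memory. */
    dmv_gpu_shmem<<<grid, block, block_size*sizeof(double)>>>(a_d, x_d, y_d, N);
    cudaDeviceSynchronize();

    cudaFree(a_d);
    cudaFree(x_d);
    cudaFree(y_d);
    return 0;
}
#endif /* DMV_EXAMPLE_MAIN */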