#include <stddef.h>  /* for size_t; the original header name was lost, stddef.h assumed */

/*
 * Utility function to get the thread ID within the
 * global working space.
 */
__device__ int get_global_tid()
{
    return (gridDim.x*blockIdx.y + blockIdx.x)*blockDim.x*blockDim.y +
           blockDim.x*threadIdx.y + threadIdx.x;
}

/*
 * Utility function to get the thread ID within the
 * local/block working space.
 */
__device__ int get_local_tid()
{
    return blockDim.x*threadIdx.y + threadIdx.x;
}

/*
 * Naive kernel: each thread computes one element of y by walking its
 * row of a, which leads to strided (non-coalesced) global loads.
 */
__global__ void dmv_gpu_naive(const double *a, const double *x, double *y,
                              size_t n)
{
    int tid = get_global_tid();
    double yi = 0.0;

    if (tid >= n)
        return;

    for (int j = 0; j < n; j++)
        yi += a[tid*n + j]*x[j];

    y[tid] = yi;
}

/*
 * Coalesced memory accesses kernel (requires transposed matrix a):
 * consecutive threads read consecutive elements of each matrix column.
 */
__global__ void dmv_gpu_coalesced(const double *a, const double *x, double *y,
                                  size_t n)
{
    int tid = get_global_tid();
    double yi = 0.0;

    if (tid >= n)
        return;

    for (int j = 0; j < n; j++)
        yi += a[j*n + tid]*x[j];

    y[tid] = yi;
}

/*
 * Final kernel making use of shared memory to improve memory bandwidth
 * utilization and access pattern: the vector x is staged into shared
 * memory one block-sized tile at a time.
 *
 * Note: n is assumed to be a multiple of the block size, so that no
 * thread of a block returns early before reaching __syncthreads().
 */
__global__ void dmv_gpu_shmem(const double *a, const double *x, double *y,
                              size_t n)
{
    extern __shared__ double shmem_buff[];

    int tid = get_global_tid(), i, j;
    double yi = 0.0;

    if (tid >= n)
        return;

    int block_s = blockDim.x*blockDim.y;
    int lid = get_local_tid(), last_id = n/block_s;

    for (j = 0; j < last_id; j++) {
        /* Each thread loads one element of the current tile of x. */
        shmem_buff[lid] = x[block_s*j + lid];
        __syncthreads();

        for (i = 0; i < block_s; i++)
            yi += a[tid + (i + j*block_s)*n]*shmem_buff[i];
        __syncthreads();
    }

    y[tid] = yi;
}
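
/*
 * A minimal host-side launch sketch, not part of the original file, showing
 * one way these kernels could be invoked.  The problem size N, the block
 * size, and the DMV_EXAMPLE_MAIN guard are assumptions for illustration
 * only.  dmv_gpu_coalesced and dmv_gpu_shmem expect the matrix stored
 * transposed, and dmv_gpu_shmem needs block_size*sizeof(double) bytes of
 * dynamic shared memory and an n divisible by the block size.
 */
#ifdef DMV_EXAMPLE_MAIN   /* hypothetical guard so the sketch does not clash with an existing driver */
#include <cuda_runtime.h>

int main(void)
{
    const size_t N = 1024;          /* assumed problem size */
    const size_t block_size = 256;  /* assumed 1-D block size (divides N) */

    double *a_d, *x_d, *y_d;
    cudaMalloc((void **)&a_d, N*N*sizeof(double));
    cudaMalloc((void **)&x_d, N*sizeof(double));
    cudaMalloc((void **)&y_d, N*sizeof(double));
    /* ... copy the matrix (transposed for the coalesced/shmem kernels)
       and the vector x to a_d and x_d here ... */

    dim3 block(block_size);
    dim3 grid((N + block_size - 1) / block_size);

    /* Naive version: matrix stored row-major. */
    dmv_gpu_naive<<<grid, block>>>(a_d, x_d, y_d, N);

    /* Shared-memory version: pass the tile size as dynamic shared memory. */
    dmv_gpu_shmem<<<grid, block, block_size*sizeof(double)>>>(a_d, x_d, y_d, N);
    cudaDeviceSynchronize();

    cudaFree(a_d);
    cudaFree(x_d);
    cudaFree(y_d);
    return 0;
}
#endif /* DMV_EXAMPLE_MAIN */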