/*
 * A CUDA implementation of the Matrix-Vector multiplication (y = A*x).
 *
 * Author: Petros Anastasiadis (panastas@cslab.ece.ntua.gr)
 *
 * The device code for the GPU implementations can be found in 'dmv_gpu.cu'.
 *
 * For more info about coalesced memory access and shmem, see
 * https://cvw.cac.cornell.edu/gpu/coalesced
 */

/* NOTE(review): the original header names were lost (bare "#include" lines);
 * restored to the standard C/CUDA set this file's code actually uses —
 * confirm against the project's build. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include "dmv_gpu.h"

/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
#include "gpu_util.h"

/* Number of GPU threads per block. Modifying this value might lead to
 * performance issues (grid sizing below assumes it is >= 1). */
#define block_size 256

int main(int argc, char **argv)
{
	/* Initializations */
	int i, j, n = 0, m = 0;
	double timer;

	/* Parse "./Program N M"; error() is assumed to terminate the program
	 * (n/m would otherwise be used uninitialized). */
	if (argc < 3) error("Usage: ./Program N M");
	else if (argc == 3) { /* ./Program N M */
		n = atoi(argv[1]);
		m = atoi(argv[2]);
	}
	else error("Too many Arguments");

	/* FIX: guard against non-positive sizes; (n-1)/block_size would be
	 * nonsense for n <= 0 and the mallocs below would be 0-sized. */
	if (n < 1 || m < 1) error("N and M must be positive integers");

	int grid_size = (n - 1) / block_size + 1; /* ceil(n / block_size) */
	size_t shmem_size = 0;                    /* dynamic shmem; only the third kernel uses it */

	/* GPU kernel block/grid sizes */
	dim3 gpu_block(block_size, 1);
	dim3 gpu_grid(grid_size, 1);

	/* Allocate host-side buffers */
	double *x = (double *) malloc(m * sizeof(*x));
	double *y = (double *) malloc(n * sizeof(*y));
	double *M = (double *) malloc(n * m * sizeof(*M));
	if (!y || !x || !M) error("memory allocation failed");

	/* Initialize matrix (randomly) and vectors */
	ser_matrix_init_rand(M, n, m, 1.0);
	vec_init_rand(x, m, 1.0);
	vec_init(y, n, 0.0);

	/* Initialize cuda variables */
	int device_num = 0;
	cudaGetDeviceCount(&device_num);
	if (!device_num) printf("No available Cuda Devices");
	else {
		cudaSetDevice(0);
		printf("Single GPU CUDA Version(N=%d, M=%d): ", n, m);

		/* Unified memory, visible and accessible from both CPU and GPU.
		 * FIX: renamed the managed output vector to y_c — it previously
		 * shadowed the host 'y' declared above. */
		double *A, *y_c, *x_c;
		cudaMallocManaged(&A, m * n * sizeof(double));
		cudaMallocManaged(&y_c, n * sizeof(double));
		cudaMallocManaged(&x_c, m * sizeof(double));
		cudaDeviceSynchronize();
		cudaCheckErrors("Unified Alloc failed");
		if (!A || !y_c || !x_c) error("unified alloc failed");

		for (i = 0; i < m; i++) x_c[i] = x[i];

		/* First, naive kernel: row-major matrix.
		 * FIX: the launch configuration was garbled ("<<>>") — restored
		 * to <<<gpu_grid, gpu_block>>> for all three launches. */
		for (i = 0; i < n * m; i++) A[i] = M[i];
		timer = csecond();
		for (j = 0; j < NR_ITER; ++j) {
			dmv_gpu_naive<<<gpu_grid, gpu_block>>>(A, x_c, y_c, m);
			/* per-iteration sync so the wall-clock timer measures
			 * completed kernel work, not just launch overhead */
			cudaDeviceSynchronize();
		}
		timer = csecond() - timer;
		cudaCheckErrors("naive kernel failed");
		report_results(timer);

		/* Second kernel, using coalesced memory accesses in the GPU by
		 * transposing the matrix. */
		printf("Single GPU CUDA Coalesced Version(N=%d, M=%d): ", n, m);
		/* We transpose the matrix to better match the GPU coalesced
		 * memory access logic */
		matrix_col_major(M, A, n, m);
		timer = csecond();
		for (j = 0; j < NR_ITER; ++j) {
			dmv_gpu_coalesced<<<gpu_grid, gpu_block>>>(A, x_c, y_c, m);
			cudaDeviceSynchronize();
		}
		timer = csecond() - timer;
		cudaCheckErrors("coalesced kernel failed");
		report_results(timer);

		/* Third and final kernel further improves memory access speed by
		 * using block-exclusive shmem. */
		printf("Single GPU CUDA shmem Version(N=%d, M=%d): ", n, m);
		/* FIX: was block_size*sizeof(float) — all data in this program is
		 * double, so that would under-allocate the dynamic shared buffer
		 * by half. NOTE(review): assumes dmv_gpu_shmem stages doubles in
		 * shared memory — confirm against dmv_gpu.cu. */
		shmem_size = block_size * sizeof(double);
		timer = csecond();
		for (j = 0; j < NR_ITER; ++j) {
			dmv_gpu_shmem<<<gpu_grid, gpu_block, shmem_size>>>(A, x_c, y_c, m);
			cudaDeviceSynchronize();
		}
		timer = csecond() - timer;
		cudaCheckErrors("shmem kernel failed");

#ifdef _DEBUG_
		/* Output y vector to a file for debugging */
		FILE *fp;
		char filename[] = "CUDA.debug"; /* Common directory for all implementations, change if needed */
		if ((fp = fopen(filename, "w")) == NULL)
			error("Output file creation failed\n");
		for (i = 0; i < n; ++i) fprintf(fp, "%lf ", y_c[i]);
		fclose(fp);
#endif
		report_results(timer);

		/* FIX: release unified-memory allocations (previously leaked) */
		cudaFree(A);
		cudaFree(y_c);
		cudaFree(x_c);
	}

	/* FIX: release host buffers (previously leaked) */
	free(x);
	free(y);
	free(M);
	return 0;
}