#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// CUDA Kernel
/**
 * Element-wise vector addition: C[i] = A[i] + B[i] for every i in
 * [0, numElements).
 *
 * Launch layout: 1D grid of 1D blocks. Each thread handles at most one
 * element, so gridDim.x * blockDim.x must be >= numElements for the whole
 * vector to be processed. No shared memory is used.
 *
 * A, B and C must be device-accessible pointers to at least numElements floats.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    // Define a target element "i" in terms of block and thread identifiers
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // Perform the vector addition checking the limits of the array: the grid
    // is rounded up to a whole number of blocks, so trailing threads in the
    // last block may fall past the end and must not touch memory.
    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

/**
 * Abort the program with a diagnostic if a CUDA runtime call failed.
 *
 * status: value returned by the CUDA runtime call.
 * what:   short description of the call, included in the error message.
 */
static void
checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host main routine
 *
 * Fills two host vectors (a[i] = i*i, b[i] = i), copies them to the device,
 * launches vectorAdd over them, copies the result back and prints every
 * element of the result.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int
main(void)
{
    int numElements = 150000;
    size_t size = (size_t)numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Host buffers live on the heap: three arrays of this size are too large
    // to place safely on the stack, and variable-length arrays are not
    // standard C++.
    float *a = (float *)malloc(size);
    float *b = (float *)malloc(size);
    float *c = (float *)malloc(size);
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    float *a_gpu = NULL, *b_gpu = NULL, *c_gpu = NULL;

    // Allocate device global memory
    checkCuda(cudaMalloc((void **)&a_gpu, size), "cudaMalloc a_gpu");
    checkCuda(cudaMalloc((void **)&b_gpu, size), "cudaMalloc b_gpu");
    checkCuda(cudaMalloc((void **)&c_gpu, size), "cudaMalloc c_gpu");

    for (int i = 0; i < numElements; ++i)
    {
        // Square in floating point: i*i as int overflows once i > 46340.
        a[i] = (float)i * (float)i;
        b[i] = (float)i;
    }

    // Copy the host input vectors A and B in host memory to the device input vectors in
    // device memory
    checkCuda(cudaMemcpy(a_gpu, a, size, cudaMemcpyHostToDevice), "cudaMemcpy a -> a_gpu");
    checkCuda(cudaMemcpy(b_gpu, b, size, cudaMemcpyHostToDevice), "cudaMemcpy b -> b_gpu");

    // Start the output from a known state so any element the kernel does not
    // write reads back as 0 rather than indeterminate data.
    checkCuda(cudaMemset(c_gpu, 0, size), "cudaMemset c_gpu");

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    // Ceiling division so the grid covers every element.
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a_gpu, b_gpu, c_gpu, numElements);
    // Launches do not return a status; configuration errors surface here.
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    // Copy the device result vector in device memory to the host result vector
    // in host memory. This blocking copy also waits for the kernel to finish
    // and reports any error raised during its execution.
    checkCuda(cudaMemcpy(c, c_gpu, size, cudaMemcpyDeviceToHost), "cudaMemcpy c_gpu -> c");

    // Free device global memory
    checkCuda(cudaFree(a_gpu), "cudaFree a_gpu");
    checkCuda(cudaFree(b_gpu), "cudaFree b_gpu");
    checkCuda(cudaFree(c_gpu), "cudaFree c_gpu");

    for (int i = 0; i < numElements; ++i)
    {
        printf("%f \n", c[i]);
    }

    // Free host memory
    free(a);
    free(b);
    free(c);

    printf("Done\n");
    return 0;
}