#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

// Abort with a diagnostic if a CUDA runtime call fails. Kernel launches do not
// return errors directly, so launches are followed by cudaGetLastError().
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/**
 * CUDA kernel: element-wise vector addition C = A + B.
 *
 * Expects a 1D grid of 1D blocks covering at least numElements threads.
 * Each thread handles one element; the bounds check guards the grid tail
 * when numElements is not a multiple of the block size.
 */
__global__ void vectorAdd(const float *A, const float *B, float *C,
                          int numElements) {
    // Flat global index of this thread across the whole grid.
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard: the last block may extend past the end of the arrays.
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}

/**
 * Host main routine: allocate, initialize, add two vectors on the GPU,
 * copy the result back, and verify it against the CPU expectation.
 */
int main(void) {
    int numElements = 150000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Host buffers on the heap: ~1.8 MB total would risk stack overflow
    // (and C++ does not allow runtime-sized stack arrays).
    float *a = (float *)malloc(size);
    float *b = (float *)malloc(size);
    float *c = (float *)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the input vectors with reproducible pseudo-random data.
    for (int i = 0; i < numElements; ++i) {
        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
    }

    // Allocate device global memory.
    float *a_gpu, *b_gpu, *c_gpu;
    CUDA_CHECK(cudaMalloc((void **)&a_gpu, size));
    CUDA_CHECK(cudaMalloc((void **)&b_gpu, size));
    CUDA_CHECK(cudaMalloc((void **)&c_gpu, size));

    // Copy the host input vectors to device memory.
    printf("Copy input data from the host memory to the CUDA device\n");
    CUDA_CHECK(cudaMemcpy(a_gpu, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_gpu, b, size, cudaMemcpyHostToDevice));

    // Launch configuration: 256 threads per block, ceiling division so the
    // grid covers every element even when numElements % 256 != 0.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n",
           blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a_gpu, b_gpu, c_gpu,
                                                  numElements);
    CUDA_CHECK(cudaGetLastError());   // catch launch-configuration errors

    // Copy the device result vector in device memory to the host result
    // vector in host memory (cudaMemcpy blocks until the kernel finishes).
    printf("Copy output data from the CUDA device to the host memory\n");
    CUDA_CHECK(cudaMemcpy(c, c_gpu, size, cudaMemcpyDeviceToHost));

    // Verify the result against the CPU expectation (tolerant float compare).
    for (int i = 0; i < numElements; ++i) {
        if (fabsf(a[i] + b[i] - c[i]) > 1e-5f) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");

    // Free device global memory.
    CUDA_CHECK(cudaFree(a_gpu));
    CUDA_CHECK(cudaFree(b_gpu));
    CUDA_CHECK(cudaFree(c_gpu));

    // Free host memory.
    free(a);
    free(b);
    free(c);

    printf("Done\n");
    return 0;
}