Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
/**
 * CUDA kernel: element-wise vector addition, C[i] = A[i] + B[i].
 *
 * Launch layout: 1D grid of 1D blocks. Each thread handles at most one
 * element, so the launch must supply at least numElements threads in total
 * (gridDim.x * blockDim.x >= numElements) for every element to be written.
 * No shared memory is used.
 *
 * A, B        device pointers to the input vectors (numElements floats each)
 * C           device pointer to the output vector (numElements floats)
 * numElements number of elements to process; values <= 0 make this a no-op
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    // Flat global index of this thread. Computed in 64-bit because the
    // unsigned 32-bit product blockIdx.x * blockDim.x could wrap (or turn
    // negative when narrowed to int) on very large grids and defeat the guard.
    const long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to a whole number of blocks, so threads in the
    // tail of the last block fall past the end of the arrays and must not write.
    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}
/**
 * Report the outcome of a CUDA runtime call.
 *
 * Prints the CUDA error string to stderr, prefixed with `what`, when `status`
 * is not cudaSuccess. Returns true on success and false on failure.
 */
static bool
cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Host main routine
 *
 * Builds two input vectors on the host (a[i] = i*i, b[i] = i), copies them to
 * the device, launches vectorAdd, copies the result back and prints it.
 * Returns 0 on success and EXIT_FAILURE if a host allocation or any CUDA call
 * fails; all host and device buffers are released on every path.
 */
int
main(void)
{
    const int numElements = 150000;
    const size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Host vectors live on the heap: three 150000-float arrays (~1.8 MB) are
    // too large to place safely on the stack.
    float *a = (float *)malloc(size);
    float *b = (float *)malloc(size);
    float *c = (float *)malloc(size);
    float *a_gpu = NULL, *b_gpu = NULL, *c_gpu = NULL;

    // `ok` is chained with && below so that after the first failure every
    // later step is skipped and control falls through to the cleanup code.
    bool ok = (a != NULL && b != NULL && c != NULL);
    if (!ok)
    {
        fprintf(stderr, "Failed to allocate host vectors\n");
    }

    if (ok)
    {
        for (int i = 0; i < numElements; ++i)
        {
            // Square in float: the int product i*i overflows for i > 46340.
            a[i] = (float)i * (float)i;
            b[i] = (float)i;
        }
    }

    // Allocate device global memory
    ok = ok && cudaSucceeded(cudaMalloc((void **)&a_gpu, size), "cudaMalloc(a_gpu)");
    ok = ok && cudaSucceeded(cudaMalloc((void **)&b_gpu, size), "cudaMalloc(b_gpu)");
    ok = ok && cudaSucceeded(cudaMalloc((void **)&c_gpu, size), "cudaMalloc(c_gpu)");

    // Zero the output buffer so any element the kernel does not write reads
    // back as 0 rather than as uninitialized device memory.
    ok = ok && cudaSucceeded(cudaMemset(c_gpu, 0, size), "cudaMemset(c_gpu)");

    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory
    ok = ok && cudaSucceeded(cudaMemcpy(a_gpu, a, size, cudaMemcpyHostToDevice),
                             "cudaMemcpy(a -> a_gpu)");
    ok = ok && cudaSucceeded(cudaMemcpy(b_gpu, b, size, cudaMemcpyHostToDevice),
                             "cudaMemcpy(b -> b_gpu)");

    // Launch the Vector Add CUDA Kernel
    if (ok)
    {
        const int threadsPerBlock = 256;
        // Ceiling division so a partial final block covers the tail elements.
        const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
        printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a_gpu, b_gpu, c_gpu, numElements);
        // A kernel launch returns no status; launch errors are read back here.
        ok = cudaSucceeded(cudaGetLastError(), "vectorAdd launch");
    }

    // Copy the device result vector in device memory to the host result vector
    // in host memory. This blocking copy also waits for the kernel to finish
    // and reports any error raised while it ran.
    ok = ok && cudaSucceeded(cudaMemcpy(c, c_gpu, size, cudaMemcpyDeviceToHost),
                             "cudaMemcpy(c_gpu -> c)");

    // Free device global memory. Every buffer is released even if an earlier
    // step failed; cudaFree on a null pointer performs no operation.
    ok = cudaSucceeded(cudaFree(a_gpu), "cudaFree(a_gpu)") && ok;
    ok = cudaSucceeded(cudaFree(b_gpu), "cudaFree(b_gpu)") && ok;
    ok = cudaSucceeded(cudaFree(c_gpu), "cudaFree(c_gpu)") && ok;

    if (ok)
    {
        for (int i = 0; i < numElements; ++i)
        {
            printf("%f \n", c[i]);
        }
        printf("Done\n");
    }

    free(a);
    free(b);
    free(c);
    return ok ? 0 : EXIT_FAILURE;
}