# NOTE(review): removed non-code artifacts pasted from the source web page
# ("Newer"/"Older" pagination links and the line numbers 1-41); they were
# not part of the program and would raise a NameError at runtime.
# Required imports for the exercise
from pycuda import autoinit
from pycuda import gpuarray
import numpy as np
from pycuda.compiler import SourceModule
# Initialize vectors "a" and "b" with some numbers, and vector "c" with
# zeros.  All vectors are single precision (float32) so they match the
# CUDA kernel's C `float*` arguments.
aux = range(150000)
a = np.array(aux, dtype=np.float32)
b = (a * a).astype(np.float32)
c = np.zeros(len(aux), dtype=np.float32)

# Copy the host vectors into GPU memory.
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
c_gpu = gpuarray.to_gpu(c)

# Read the CUDA C source (use a context manager so the file handle is
# always closed; the original leaked it).
with open("vector_add.cu", "r") as cudaCode:
    myCUDACode = cudaCode.read()

# Compile the CUDA source and fetch the kernel entry point.
myCode = SourceModule(myCUDACode)
importedKernel = myCode.get_function("vectorAdd")

# Launch configuration.  The original hard-coded block=(256,1,1) with the
# default grid of (1,1), so only the first 256 of the 150000 elements were
# ever computed.  Compute enough blocks to cover the whole vector instead
# (ceiling division), and actually use the named parameters.
nThreadsPerBlock = 256
nBlockPerGrid = (len(aux) + nThreadsPerBlock - 1) // nThreadsPerBlock
nGridsPerBlock = 1  # kept for compatibility; CUDA launches take one grid

# Execute the imported kernel over the full vector.
# NOTE(review): the last partial block launches a few threads past the end
# of the arrays; this assumes vectorAdd bounds-checks its index (or that
# the extra threads are harmless) -- verify against vector_add.cu, which
# is not visible from here.
importedKernel(a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata,
               block=(nThreadsPerBlock, 1, 1),
               grid=(nBlockPerGrid, 1))

# Copy the result back to the host.
c = c_gpu.get()