# NOTE(review): removed non-code artifacts pasted from the source web page
# ("Newer"/"Older" pagination links and the line numbers 1-41); they were
# not part of the program and would raise a NameError at runtime.
# Required imports for the exercise
from pycuda import autoinit
from pycuda import gpuarray
import numpy as np
from pycuda.compiler import SourceModule
# Initialize vectors "a" and "b" with some numbers, and vector "c" with
# zeros.  All vectors are single precision (float32) so they match the
# CUDA kernel's C `float*` arguments.
aux = range(150000)
a = np.array(aux, dtype=np.float32)
b = (a * a).astype(np.float32)
c = np.zeros(len(aux), dtype=np.float32)

# Copy the host vectors into GPU memory.
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
c_gpu = gpuarray.to_gpu(c)

# Read the CUDA C source (use a context manager so the file handle is
# always closed; the original leaked it).
with open("vector_add.cu", "r") as cudaCode:
    myCUDACode = cudaCode.read()

# Compile the CUDA source and fetch the kernel entry point.
myCode = SourceModule(myCUDACode)
importedKernel = myCode.get_function("vectorAdd")

# Launch configuration.  The original hard-coded block=(256,1,1) with the
# default grid of (1,1), so only the first 256 of the 150000 elements were
# ever computed.  Compute enough blocks to cover the whole vector instead
# (ceiling division), and actually use the named parameters.
nThreadsPerBlock = 256
nBlockPerGrid = (len(aux) + nThreadsPerBlock - 1) // nThreadsPerBlock
nGridsPerBlock = 1  # kept for compatibility; CUDA launches take one grid

# Execute the imported kernel over the full vector.
# NOTE(review): the last partial block launches a few threads past the end
# of the arrays; this assumes vectorAdd bounds-checks its index (or that
# the extra threads are harmless) -- verify against vector_add.cu, which
# is not visible from here.
importedKernel(a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata,
               block=(nThreadsPerBlock, 1, 1),
               grid=(nBlockPerGrid, 1))

# Copy the result back to the host.
c = c_gpu.get()