Commit f8a9ff6a authored by Carlos Teijeiro's avatar Carlos Teijeiro

New CUDA codes: vector addition

parent 0b2a1f0a
Pipeline #77 failed with stages
# CUDA
# GPU programming with CUDA
The present project contains exercises on GPU programming with CUDA C and PyCUDA, written in C and Python respectively.
In particular, the Python codes are presented without any print statement for compatibility purposes, so that they can be directly executed in an interactive session or inside a Jupyter notebook.
## Exercises
- [Vector addition](vector_add) Simple vector addition. Level: **basic**
CUDA codes
\ No newline at end of file
#include <stdio.h>
#include <cuda_runtime.h>
// Element-wise vector addition kernel: C[i] = A[i] + B[i].
// Expects a 1-D launch whose total thread count is >= numElements;
// threads whose global index falls past the end of the arrays return
// immediately (the grid rarely divides the data exactly).
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    // Flat global index of this thread within the 1-D grid.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: out-of-range threads do no work.
    if (idx >= numElements)
        return;

    C[idx] = A[idx] + B[idx];
}
/**
 * Host main routine: adds two float vectors on the GPU.
 *
 * Allocates pinned host buffers and device buffers, fills the inputs,
 * launches vectorAdd, copies the result back, prints it and cleans up.
 * Returns 0 on success, non-zero on any CUDA error.
 */

// Abort main with a readable message whenever a CUDA runtime call fails.
// Without this, an early failure (e.g. no device) makes every later call
// fail silently and the program prints garbage.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                     \
                    __FILE__, __LINE__, cudaGetErrorString(err_));        \
            return 1;                                                     \
        }                                                                 \
    } while (0)

int
main(void)
{
    int numElements = 150000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Pinned host buffers instead of the original ~600 KB-per-array stack
    // VLAs (non-standard in C++ and a stack-overflow risk); pinned memory
    // also gives full-bandwidth transfers.
    float *a, *b, *c;
    CUDA_CHECK(cudaMallocHost((void **)&a, size));
    CUDA_CHECK(cudaMallocHost((void **)&b, size));
    CUDA_CHECK(cudaMallocHost((void **)&c, size));

    // Allocate device global memory
    float *a_gpu, *b_gpu, *c_gpu;
    CUDA_CHECK(cudaMalloc((void **)&a_gpu, size));
    CUDA_CHECK(cudaMalloc((void **)&b_gpu, size));
    CUDA_CHECK(cudaMalloc((void **)&c_gpu, size));

    // Fill the host input vectors: a[i] = i^2, b[i] = i.
    for (int i = 0; i < numElements; ++i) {
        // Square in float: the original int product i*i overflows (UB)
        // for i > 46340, and 150000^2 far exceeds INT_MAX.
        a[i] = (float)i * (float)i;
        b[i] = (float)i;
    }

    // Copy the host input vectors A and B in host memory to the device input vectors in
    // device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    CUDA_CHECK(cudaMemcpy(a_gpu, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_gpu, b, size, cudaMemcpyHostToDevice));

    // Launch the Vector Add CUDA Kernel: ceiling division so the grid
    // covers every element.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a_gpu, b_gpu, c_gpu, numElements);
    // Kernel launches return no status; check for configuration errors here.
    CUDA_CHECK(cudaGetLastError());

    // Copy the device result vector in device memory to the host result vector
    // in host memory (blocking copy, so it also synchronizes with the kernel).
    printf("Copy output data from the CUDA device to the host memory\n");
    CUDA_CHECK(cudaMemcpy(c, c_gpu, size, cudaMemcpyDeviceToHost));

    // Free device global memory
    CUDA_CHECK(cudaFree(a_gpu));
    CUDA_CHECK(cudaFree(b_gpu));
    CUDA_CHECK(cudaFree(c_gpu));

    // Print the result vector.
    for (int i = 0; i < numElements; ++i) {
        printf("%f \n", c[i]);
    }

    // Release the pinned host buffers (the original leaked nothing only
    // because it used stack arrays).
    CUDA_CHECK(cudaFreeHost(a));
    CUDA_CHECK(cudaFreeHost(b));
    CUDA_CHECK(cudaFreeHost(c));

    printf("Done\n");
    return 0;
}
#include <stdio.h>
#include <cuda_runtime.h>
// CUDA Kernel (EXERCISE SKELETON)
// TODO: implement element-wise C[i] = A[i] + B[i] for i in [0, numElements).
// A, B: input vectors in device global memory; C: output vector.
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
// Define a target element "i" in terms of block and thread identifiers
// (hint: combine blockDim.x, blockIdx.x and threadIdx.x into a flat index)
// Perform the vector addition checking the limits of the array!
// (hint: guard with i < numElements -- the grid may have more threads
// than elements)
}
/**
 * Host main routine (EXERCISE SKELETON).
 *
 * Fill in the "// ..." sections below: allocate device memory, copy the
 * inputs to the GPU, launch vectorAdd, copy the result back, and free
 * the device memory.  Until the blanks are completed, the final loop
 * prints an uninitialized "c".
 */
int
main(void)
{
int numElements = 150000;
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
// NOTE(review): these are ~600 KB stack arrays each; fine for this
// exercise size, but heap or pinned allocation scales better.
float a[numElements],b[numElements],c[numElements];
// Device pointers, to be set by cudaMalloc below.
float *a_gpu,*b_gpu,*c_gpu;
// Allocate device global memory
// (hint: cudaMalloc((void **)&a_gpu, size); likewise for b_gpu and c_gpu)
// ...
// ...
// ...
// Fill the host input vectors: a[i] = i^2, b[i] = i.
// NOTE(review): i*i overflows int for i > 46340, so large entries of "a"
// are garbage; consider (float)i*(float)i instead.
for (int i=0;i<numElements;++i ){
a[i] = i*i;
b[i] = i;
}
// Copy the host input vectors A and B in host memory to the device input vectors in
// device memory
// (hint: cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice))
// ...
// ...
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
// Ceiling division so the grid covers every element.
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
// (hint: vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a_gpu, b_gpu, c_gpu, numElements);)
///...
// Copy the device result vector in device memory to the host result vector
// in host memory.
// (hint: cudaMemcpy(..., cudaMemcpyDeviceToHost))
// ...
// Free device global memory
// (hint: cudaFree each device pointer)
// ...
// ...
// ...
// Print the result vector.
for (int i=0;i<numElements;++i ){
printf("%f \n",c[i]);
}
printf("Done\n");
return 0;
}
# Vector addition with PyCUDA
This exercise is intended to implement the vector addition using three different approaches (GPUArray operations, element-wise kernels, and a SourceModule kernel function).
The goal is to see the different possibilities of implementation and be able to compare them in terms of simplicity and flexibility.
from pycuda import autoinit
from pycuda import gpuarray
import numpy as np

# Vector addition with GPUArrays: the overloaded "+" operator launches an
# element-wise kernel on the device, so no explicit CUDA code is needed.

# Initialize single-precision host vectors "a" (0..149999) and "b" (a^2).
aux = range(150000)
a = np.array(aux).astype(np.float32)
b = (a*a).astype(np.float32)

# Upload the inputs to GPU memory.
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)

# Perform the vector addition on the GPU.  "+" allocates the result array
# on the device itself, so the original pre-allocation and upload of a
# zero-filled "c" was a wasted 600 KB allocation and host-to-device copy.
c_gpu = a_gpu + b_gpu

# Collect the result from GPU memory into a host numpy array.
c = c_gpu.get()
# Required imports for the exercise
from pycuda import autoinit
from pycuda import gpuarray
import numpy as np
from pycuda.elementwise import ElementwiseKernel

# Vector addition via an ElementwiseKernel: PyCUDA generates and compiles
# a CUDA kernel from a C snippet that is applied at every index "i".

# Host-side single-precision vectors: a = 0..149999, b = a^2, c = zeros.
aux = range(150000)
a = np.array(aux).astype(np.float32)
b = (a*a).astype(np.float32)
c = np.zeros(len(aux)).astype(np.float32)

# Device copies of the three vectors.
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
c_gpu = gpuarray.to_gpu(c)

# Build the element-wise vector-addition kernel; PyCUDA chooses the
# launch configuration from the array length.
sum_kernel = ElementwiseKernel(arguments = "float *a, float *b, float *c",
                               operation = "c[i] = a[i]+b[i]",
                               name = "mySumK")

# Run the generated kernel on the device arrays.
sum_kernel(a_gpu, b_gpu, c_gpu)

# Download the result to the host.
c = c_gpu.get()
# Required imports for the exercise
from pycuda import autoinit
from pycuda import gpuarray
import numpy as np
from pycuda.compiler import SourceModule

# Vector addition by compiling the CUDA C kernel from vector_add.cu and
# launching it through PyCUDA's SourceModule interface.

# Initialize vectors "a" and "b" with some numbers,
# and vector "c" with zeros
# Use numpy to define single precision vectors
numElements = 150000
aux = range(numElements)
a = np.array(aux).astype(np.float32)
b = (a*a).astype(np.float32)
c = np.zeros(len(aux)).astype(np.float32)

# Create the corresponding vectors in GPU memory
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
c_gpu = gpuarray.to_gpu(c)

# Open and read the file with CUDA code (use CUDA C version);
# "with" guarantees the file handle is closed (the original leaked it).
with open("vector_add.cu","r") as cudaCode:
    myCUDACode = cudaCode.read()

# Create the source module for the code
myCode = SourceModule(myCUDACode)

# Import the kernel
importedKernel = myCode.get_function("vectorAdd")

# Launch configuration: enough blocks to cover every element.  The
# original launched a single 256-thread block (no grid= argument), so
# only the first 256 of the 150000 elements were ever computed.
nThreadsPerBlock = 256
nBlocksPerGrid = (numElements + nThreadsPerBlock - 1) // nThreadsPerBlock

# Execute the imported kernel.  vectorAdd expects (A, B, C, numElements);
# the original call omitted the length argument.  It must be passed as a
# 32-bit integer to match the kernel's "int" parameter.
importedKernel(a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata,
               np.int32(numElements),
               block=(nThreadsPerBlock, 1, 1),
               grid=(nBlocksPerGrid, 1))

# Get the result
c = c_gpu.get()
# Required imports for the exercise
from pycuda import autoinit
from pycuda import gpuarray
import numpy as np

# EXERCISE SKELETON (GPUArray version): implement vector addition using
# the overloaded arithmetic operators on pycuda.gpuarray arrays.

# Initialize vectors "a" and "b" with some numbers,
# and vector "c" with zeros
# Use numpy to define single precision vectors
# (hint: build arrays with np.array/np.zeros and convert with .astype(np.float32))
# a = ...
# b = ...
# c = ...
# Create the corresponding vectors in GPU memory
# (hint: gpuarray.to_gpu(host_array))
# a_gpu = ...
# b_gpu = ...
# c_gpu = ...
# Perform the vector addition on the GPU
# (hint: "+" on two GPUArrays runs element-wise on the device)
# ...
# Collect the result from GPU memory
# (hint: the .get() method returns a host numpy array)
#...
# Required imports for the exercise
from pycuda import autoinit
from pycuda import gpuarray
import numpy as np
from pycuda.elementwise import ElementwiseKernel

# EXERCISE SKELETON (ElementwiseKernel version): implement vector addition
# with a kernel generated from a C operation string.

# Initialize vectors "a" and "b" with some numbers,
# and vector "c" with zeros
# Use numpy to define single precision vectors
# (hint: .astype(np.float32))
# a = ...
# b = ...
# c = ...
# Create the corresponding vectors in GPU memory
# (hint: gpuarray.to_gpu)
# a_gpu = ...
# b_gpu = ...
# c_gpu = ...
# Define a CUDA function for vector addition
# using an element-wise kernel
# (hint: ElementwiseKernel(arguments=..., operation=..., name=...);
#  the operation is a C expression indexed by "i")
# ...
# Call the created function
# (hint: pass the three GPU arrays as arguments)
# ...
# Get the result
# (hint: .get())
# ...
# Required imports for the exercise
from pycuda import autoinit
from pycuda import gpuarray
import numpy as np
from pycuda.compiler import SourceModule

# EXERCISE SKELETON (SourceModule version): compile a CUDA C kernel from a
# file and launch it explicitly with a block/grid configuration.

# Initialize vectors "a" and "b" with some numbers,
# and vector "c" with zeros
# Use numpy to define single precision vectors
# (hint: .astype(np.float32))
# a = ...
# b = ...
# c = ...
# Create the corresponding vectors in GPU memory
# (hint: gpuarray.to_gpu)
# a_gpu = ...
# b_gpu = ...
# c_gpu = ...
# CREATE A CUDA C FILE WITH A VECTOR ADDITION KERNEL
# (call it vector_add.cu)
# Open and read the file with CUDA code
# (hint: open(...).read())
# ...
# Create the source module for the code
# (hint: SourceModule(source_string))
# ...
# Import the kernel
# (hint: module.get_function("vectorAdd"))
# ...
# Define the desired number of threads per block,
# blocks per grid and grids per block
# (hint: make blocks-per-grid large enough to cover ALL elements,
#  not just one block)
# ...
# Execute the imported kernel with the previous data layout
# (hint: kernel(..., block=(threads,1,1), grid=(blocks,1)); pass GPUArray
#  data via .gpudata and integer arguments as np.int32)
# ...
# Get the result
# (hint: .get())
# ...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment