From f8a9ff6afafccfa0785d0b488d0a8a0e598ec558 Mon Sep 17 00:00:00 2001
From: Carlos Teijeiro
Date: Tue, 12 Mar 2019 10:46:16 +0100
Subject: [PATCH] New CUDA codes: vector addition

---
 README.md                                  | 11 +++-
 vector_add/c/solution/vector_add.cu        | 70 ++++++++++++++++++++++
 vector_add/c/vector_add.cu                 | 63 +++++++++++++++++++
 vector_add/python/README.md                |  6 ++
 vector_add/python/solution/vector_add_1.py | 22 +++++++
 vector_add/python/solution/vector_add_2.py | 31 ++++++++++
 vector_add/python/solution/vector_add_3.py | 41 +++++++++++++
 vector_add/python/vector_add_1.py          | 22 +++++++
 vector_add/python/vector_add_2.py          | 28 +++++++++
 vector_add/python/vector_add_3.py          | 40 +++++++++++++
 10 files changed, 332 insertions(+), 2 deletions(-)
 create mode 100644 vector_add/c/solution/vector_add.cu
 create mode 100644 vector_add/c/vector_add.cu
 create mode 100644 vector_add/python/README.md
 create mode 100644 vector_add/python/solution/vector_add_1.py
 create mode 100644 vector_add/python/solution/vector_add_2.py
 create mode 100644 vector_add/python/solution/vector_add_3.py
 create mode 100644 vector_add/python/vector_add_1.py
 create mode 100644 vector_add/python/vector_add_2.py
 create mode 100644 vector_add/python/vector_add_3.py

diff --git a/README.md b/README.md
index 85380fd..ebc27f3 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,10 @@
-# CUDA
+# GPU programming with CUDA
+
+This project contains GPU programming exercises in CUDA C and PyCUDA, for C and Python respectively.
+
+The Python codes are presented without print statements so that they can be executed directly in an interactive session or inside a Jupyter notebook.
+
+## Exercises
+
+ - [Vector addition](vector_add): simple vector addition. Level: **basic**
-CUDA codes
\ No newline at end of file
diff --git a/vector_add/c/solution/vector_add.cu b/vector_add/c/solution/vector_add.cu
new file mode 100644
index 0000000..e432242
--- /dev/null
+++ b/vector_add/c/solution/vector_add.cu
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+// CUDA Kernel
+__global__ void
+vectorAdd(const float *A, const float *B, float *C, int numElements)
+{
+    // Define a target element "i" in terms of block and thread identifiers
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Perform the vector addition checking the limits of the array!
+    if (i < numElements)
+    {
+        C[i] = A[i] + B[i];
+    }
+}
+
+/**
+ * Host main routine
+ */
+int
+main(void)
+{
+    int numElements = 150000;
+    size_t size = numElements * sizeof(float);
+    printf("[Vector addition of %d elements]\n", numElements);
+
+    float a[numElements],b[numElements],c[numElements];
+    float *a_gpu,*b_gpu,*c_gpu;
+
+    // Allocate device global memory
+    cudaMalloc((void **)&a_gpu, size);
+    cudaMalloc((void **)&b_gpu, size);
+    cudaMalloc((void **)&c_gpu, size);
+
+    // Initialise the host input vectors (example values)
+    for (int i=0;i<numElements;i++)
+    {
+        a[i] = (float)i;
+        b[i] = 2.0f*(float)i;
+    }
+
+    // Copy the host input vectors to the device input vectors
+    cudaMemcpy(a_gpu, a, size, cudaMemcpyHostToDevice);
+    cudaMemcpy(b_gpu, b, size, cudaMemcpyHostToDevice);
+
+    // Launch the vectorAdd kernel with enough blocks to cover all elements
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a_gpu, b_gpu, c_gpu, numElements);
+
+    // Copy the device result vector in device memory to the host result vector
+    // in host memory.
+    printf("Copy output data from the CUDA device to the host memory\n");
+    cudaMemcpy(c, c_gpu, size, cudaMemcpyDeviceToHost);
+
+    // Free device global memory
+    cudaFree(a_gpu);
+    cudaFree(b_gpu);
+    cudaFree(c_gpu);
+
+    // Check the result on the host
+    for (int i=0;i<numElements;i++)
+    {
+        float expected = a[i] + b[i];
+        if (c[i] != expected)
+        {
+            printf("Result verification failed at element %d!\n", i);
+            return 1;
+        }
+    }
+    printf("Test PASSED\n");
+    return 0;
+}
diff --git a/vector_add/c/vector_add.cu b/vector_add/c/vector_add.cu
new file mode 100644
--- /dev/null
+++ b/vector_add/c/vector_add.cu
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+// CUDA Kernel
+__global__ void
+vectorAdd(const float *A, const float *B, float *C, int numElements)
+{
+    // Define a target element "i" in terms of block and thread identifiers
+
+    // Perform the vector addition checking the limits of the array!
+}
+
+/**
+ * Host main routine
+ */
+int
+main(void)
+{
+    int numElements = 150000;
+    size_t size = numElements * sizeof(float);
+    printf("[Vector addition of %d elements]\n", numElements);
+
+    float a[numElements],b[numElements],c[numElements];
+    float *a_gpu,*b_gpu,*c_gpu;
+
+    // Allocate device global memory
+    // ...
+    // ...
+    // ...
+
+    for (int i=0;i<numElements;i++)
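
The PyCUDA exercise files listed in the diffstat (vector_add/python/*.py) are not shown in this excerpt. As a rough sketch of the kind of code those exercises target, and not the repository's actual files, a minimal PyCUDA vector addition using SourceModule could look like the following. It assumes numpy and PyCUDA are installed and a CUDA-capable GPU is available; like the Python codes described in the README, it avoids print statements so it can run in an interactive session or a notebook.

    # Minimal PyCUDA sketch (illustrative only; not one of the repository's vector_add_*.py files)
    import numpy as np
    import pycuda.autoinit  # creates a CUDA context on the default device
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    # Compile the same kernel used in the CUDA C exercise at runtime
    mod = SourceModule("""
    __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < numElements)
            C[i] = A[i] + B[i];
    }
    """)
    vector_add = mod.get_function("vectorAdd")

    num_elements = 150000
    a = np.random.rand(num_elements).astype(np.float32)
    b = np.random.rand(num_elements).astype(np.float32)
    c = np.empty_like(a)

    threads_per_block = 256
    blocks_per_grid = (num_elements + threads_per_block - 1) // threads_per_block

    # drv.In/drv.Out wrap the numpy arrays and handle the host/device copies
    vector_add(drv.In(a), drv.In(b), drv.Out(c), np.int32(num_elements),
               block=(threads_per_block, 1, 1), grid=(blocks_per_grid, 1))

    assert np.allclose(c, a + b)

The block and grid arguments mirror the threadsPerBlock/blocksPerGrid calculation in the CUDA C solution, and drv.In/drv.Out take care of the host-to-device and device-to-host copies around the kernel launch.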