diff --git a/gemm/cuda/CMakeLists.txt b/gemm/cuda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bac94b025753d9449418fdb2461b7953bececac4
--- /dev/null
+++ b/gemm/cuda/CMakeLists.txt
@@ -0,0 +1,60 @@
+
+# ==================================================================================================
+# This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
+# CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
+#
+# Author(s):
+#   Valeriu Codreanu <valeriu.codreanu@surfsara.nl>
+#
+# ==================================================================================================
+
+# Packages are optional: if they are not present, certain code samples are not compiled
+find_package(CUDA)   # Built-in in CMake
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/common.cmake)
+# ==================================================================================================
+if ("${DWARF_PREFIX}" STREQUAL "")
+  set(DWARF_PREFIX 1_dense)
+endif()
+
+# C++ compiler settings
+
+find_package(Common)
+select_compiler_flags(cxx_flags
+                      GNU "-march=native"    # the optimisation level is controlled by CMAKE_BUILD_TYPE
+                      CLANG "-march=native"
+                      Intel "-axavx2,avx")
+set(CXX_FLAGS ${cxx_flags})
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+  set(CXX_FLAGS "${CXX_FLAGS} -Wall -Wno-comment")
+  if(APPLE)
+    set(CXX_FLAGS "${CXX_FLAGS} -Wa,-q")
+  endif()
+endif()
+if (OPENMP_FOUND)
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS}")
+
+# NVCC compiler settings
+if (CUDA_FOUND)
+  set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3")
+  set(CUDA_HOST_COMPILER "g++")
+endif()
+
+# ==================================================================================================
+
+# GEMM with the CUDA library
+set(NAME ${DWARF_PREFIX}_gemm_cuda)
+if (CUDA_FOUND)
+  cuda_add_executable(${NAME} src/sgemm_cuda_matrixmul.cu)
+  target_link_libraries(${NAME} ${CUDA_curand_LIBRARY})
+  install(TARGETS ${NAME} DESTINATION bin)
+else()
+  message("** Skipping '${NAME}': no CUDA")
+  dummy_install(${NAME} "CUDA")
+endif()
+
+unset(NAME)
+
+# ==================================================================================================
diff --git a/gemm/cuda/README.md b/gemm/cuda/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7d9bb5f01da38250a42c8867ce72e8f0fc5ffdf6
--- /dev/null
+++ b/gemm/cuda/README.md
@@ -0,0 +1,54 @@
+=======
+README
+=======
+
+# 1. Code sample name
+gemm
+
+# 2. Description of the code sample package
+This example demonstrates matrix-matrix multiplication implemented with NVIDIA's CUDA, using multi-dimensional thread blocks. The computation is performed on both the CPU and the GPU, and the two results are compared at the end.
+
+Additional pre-requisites:
+* CUDA
+
+See http://docs.nvidia.com/cuda/ for more information about CUDA.
+
+# 3. Release date
+6 Dec 2016
+
+# 4. Version history
+1.0
+
+# 5. Contributor(s) / Maintainer(s)
+Damian Podareanu <damian.podareanu@surfsara.nl>
+
+# 6. Copyright / License of the code sample
+Apache 2.0
+
+# 7. Language(s)
+C++
+CUDA
+
+# 8. Parallelisation Implementation(s)
+GPU
+
+# 9. Level of the code sample complexity
+Basic level
+
+# 10. Instructions on how to compile the code
+Uses the CodeVault CMake infrastructure, see main README.md
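+
+For example, a minimal sketch, assuming a CMake build directory at the repository root as described in the main README (the executable name follows from the DWARF_PREFIX, here "1_dense"):
+
+    mkdir build && cd build
+    cmake ..
+    make 1_dense_gemm_cuda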
+
+# 11. Instructions on how to run the code
+Run the executable with a single command-line argument, the matrix size N (the matrices are NxN)
+
+# 12. Sample input(s)
+Input data is generated automatically when the program runs, by filling the matrices with sin and cos values.
+
+# 13. Sample output(s)
+Output data is verified programmatically using a CPU implementation of matrix-matrix multiplication.
diff --git a/gemm/cuda/src/dev_array.h b/gemm/cuda/src/dev_array.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad7d2132e6f384553bdc92dfc6887041b3e53612
--- /dev/null
+++ b/gemm/cuda/src/dev_array.h
@@ -0,0 +1,107 @@
+#ifndef DEV_ARRAY_H_
+#define DEV_ARRAY_H_
+
+#include <stdexcept>
+#include <algorithm>
+#include <cuda_runtime.h>
+
+template <class T>
+class dev_array
+{
+// public functions
+public:
+    dev_array()
+        : start_(0),
+          end_(0)
+    {}
+
+    // constructor
+    explicit dev_array(size_t size)
+    {
+        allocate(size);
+    }
+    // destructor
+    ~dev_array()
+    {
+        free();
+    }
+
+    // resize the vector
+    void resize(size_t size)
+    {
+        free();
+        allocate(size);
+    }
+
+    // get the size of the array
+    size_t getSize() const
+    {
+        return end_ - start_;
+    }
+
+    // get data
+    const T* getData() const
+    {
+        return start_;
+    }
+
+    T* getData()
+    {
+        return start_;
+    }
+
+    // set: copy at most getSize() elements from host to device
+    void set(const T* src, size_t size)
+    {
+        size_t min = std::min(size, getSize());
+        cudaError_t result = cudaMemcpy(start_, src, min * sizeof(T), cudaMemcpyHostToDevice);
+        if (result != cudaSuccess)
+        {
+            throw std::runtime_error("failed to copy to device memory");
+        }
+    }
+    // get: copy at most getSize() elements from device to host
+    void get(T* dest, size_t size)
+    {
+        size_t min = std::min(size, getSize());
+        cudaError_t result = cudaMemcpy(dest, start_, min * sizeof(T), cudaMemcpyDeviceToHost);
+        if (result != cudaSuccess)
+        {
+            throw std::runtime_error("failed to copy to host memory");
+        }
+    }
+
+
+// private functions
+private:
+    // not copyable: a copy would free the device pointer twice
+    dev_array(const dev_array&);
+    dev_array& operator=(const dev_array&);
+
+    // allocate memory on the device
+    void allocate(size_t size)
+    {
+        cudaError_t result = cudaMalloc((void**)&start_, size * sizeof(T));
+        if (result != cudaSuccess)
+        {
+            start_ = end_ = 0;
+            throw std::runtime_error("failed to allocate device memory");
+        }
+        end_ = start_ + size;
+    }
+
+    // free memory on the device
+    void free()
+    {
+        if (start_ != 0)
+        {
+            cudaFree(start_);
+            start_ = end_ = 0;
+        }
+    }
+
+    T* start_;
+    T* end_;
+};
+
+#endif
diff --git a/gemm/cuda/src/sgemm_cuda_kernel.cu b/gemm/cuda/src/sgemm_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3485a531637eae654166f247d15125136f582908
--- /dev/null
+++ b/gemm/cuda/src/sgemm_cuda_kernel.cu
@@ -0,0 +1,44 @@
+#include <math.h>
+#include <iostream>
+#include "cuda_runtime.h"
+#include "sgemm_cuda_kernel.h"
+#include <stdlib.h>
+
+using namespace std;
+
+__global__ void matrixMultiplicationKernel(float* A, float* B, float* C, int N) {
+
+    int ROW = blockIdx.y*blockDim.y+threadIdx.y;
+    int COL = blockIdx.x*blockDim.x+threadIdx.x;
+
+    float tmpSum = 0;
+
+    if (ROW < N && COL < N) {
+        // each thread computes one element of the result matrix
+        for (int i = 0; i < N; i++) {
+            tmpSum += A[ROW * N + i] * B[i * N + COL];
+        }
+        C[ROW * N + COL] = tmpSum;
+    }
+}
+
+
+void matrixMultiplication(float *A, float *B, float *C, int N){
+
+    // declare the number of blocks per grid and the number of threads per block
+    // a block may hold at most 1024 threads in total, so use 16x16 for large N
+    dim3 threadsPerBlock(N, N);
+    dim3 blocksPerGrid(1, 1);
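+    // For larger matrices, cap the block size and tile the matrix with a grid
+    // of blocks: ceil(N / blockDim) blocks per dimension ensures every element
+    // is covered, while the kernel's bounds check masks out the surplus
+    // threads in the last row/column of blocks.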
+    if (N*N > 512){
+        threadsPerBlock.x = 16;
+        threadsPerBlock.y = 16;
+        blocksPerGrid.x = ceil(double(N)/double(threadsPerBlock.x));
+        blocksPerGrid.y = ceil(double(N)/double(threadsPerBlock.y));
+    }
+
+    matrixMultiplicationKernel<<<blocksPerGrid,threadsPerBlock>>>(A, B, C, N);
+}
diff --git a/gemm/cuda/src/sgemm_cuda_kernel.h b/gemm/cuda/src/sgemm_cuda_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..40b46493b71af47fee5193b8851f6402e80a5a4f
--- /dev/null
+++ b/gemm/cuda/src/sgemm_cuda_kernel.h
@@ -0,0 +1,6 @@
+#ifndef SGEMM_CUDA_KERNEL_H_
+#define SGEMM_CUDA_KERNEL_H_
+
+void matrixMultiplication(float *A, float *B, float *C, int N);
+
+#endif
diff --git a/gemm/cuda/src/sgemm_cuda_matrixmul.cu b/gemm/cuda/src/sgemm_cuda_matrixmul.cu
new file mode 100644
index 0000000000000000000000000000000000000000..477a57f0fc6337ecf99c9780f153bd471cee42a8
--- /dev/null
+++ b/gemm/cuda/src/sgemm_cuda_matrixmul.cu
@@ -0,0 +1,104 @@
+#include <cstdio>
+#include <iostream>
+#include <cstdlib>
+#include <vector>
+#include <cuda_runtime.h>
+#include "sgemm_cuda_kernel.h"
+#include "sgemm_cuda_kernel.cu" // compiled as part of this translation unit (see CMakeLists.txt)
+#include "dev_array.h"
+#include <math.h>
+
+using namespace std;
+
+int main(int argc, char **argv)
+{
+    int N;
+
+    if (argc != 2) {
+        printf("Usage: %s <matrix_size>\n", argv[0]);
+        return 1;
+    } else {
+        N = atoi(argv[1]);
+    }
+
+    // Perform matrix multiplication C = A*B
+    // where A, B and C are NxN matrices
+    int SIZE = N*N;
+
+    cudaEvent_t start, stop;
+
+    // create cuda timer events
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    // Allocate memory on the host
+    vector<float> h_A(SIZE);
+    vector<float> h_B(SIZE);
+    vector<float> h_C(SIZE);
+
+    // Initialize matrices on the host
+    for (int i=0; i<N; i++){
+        for (int j=0; j<N; j++){
+            h_A[i*N+j] = sin(i);
+            h_B[i*N+j] = cos(j);
+        }
+    }
+
+    // Allocate memory on the device
+    dev_array<float> d_A(SIZE);
+    dev_array<float> d_B(SIZE);
+    dev_array<float> d_C(SIZE);
+
+    d_A.set(&h_A[0], SIZE);
+    d_B.set(&h_B[0], SIZE);
+
+    // start the timer: measure the GPU multiplication and the copy back only
+    cudaEventRecord(start, NULL);
+
+    matrixMultiplication(d_A.getData(), d_B.getData(), d_C.getData(), N);
+    cudaDeviceSynchronize();
+
+    d_C.get(&h_C[0], SIZE);
+    cudaDeviceSynchronize();
+
+    cudaEventRecord(stop, NULL);
+    cudaEventSynchronize(stop);
+
+    float msec_total = 0.0f;
+    cudaEventElapsedTime(&msec_total, start, stop);
+
+    // Compute and print the performance
+    float msec_per_matrix_mul = msec_total;
+    double flops_per_matrix_mul = 2.0 * (double)N * (double)N * (double)N;
+    double giga_flops = (flops_per_matrix_mul * 1.0e-9f) / (msec_per_matrix_mul / 1000.0f);
+    printf(
+        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
+        giga_flops,
+        msec_per_matrix_mul,
+        flops_per_matrix_mul);
+
+    // Now do the matrix multiplication on the CPU
+    vector<float> cpu_C(SIZE);
+    float sum;
+    for (int row=0; row<N; row++){
+        for (int col=0; col<N; col++){
+            sum = 0.f;
+            for (int n=0; n<N; n++){
+                sum += h_A[row*N+n]*h_B[n*N+col];
+            }
+            cpu_C[row*N+col] = sum;
+        }
+    }
+
+    // Check the result: accumulate the absolute element-wise differences
+    double err = 0;
+    for (int ROW=0; ROW < N; ROW++){
+        for (int COL=0; COL < N; COL++){
+            err += fabs(cpu_C[ROW * N + COL] - h_C[ROW * N + COL]);
+        }
+    }
+
+    cout << "Error: " << err << endl;
+
+    return 0;
+}