Skip to content
Snippets Groups Projects
Commit 8cd2c62b authored by Damian Podareanu's avatar Damian Podareanu
Browse files

CUDA only implementation of batch multiplication - with shared memory

parent 868e635a
Branches
No related merge requests found
# ==================================================================================================
# This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
# CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
#
# Author(s):
# Valeriu Codreanu <valeriu.codreanu@surfsara.nl>
#
# ==================================================================================================
# Packages are optional: if they are not present, certain code samples are not compiled
find_package(CUDA) # Built-in in CMake
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/common.cmake)
# ==================================================================================================
if ("${DWARF_PREFIX}" STREQUAL "")
set(DWARF_PREFIX 1_dense)
endif()
# C++ compiler settings
find_package(Common)
select_compiler_flags(cxx_flags
GNU "-march=native" # I suggest remove "-O3" as this is controlled by the CMAKE_BUILD_TYPE
CLANG "-march=native" # same here
Intel "-axavx2,avx")
set(CXX_FLAGS ${cxx_flags})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(CXX_FLAGS "${CXX_FLAGS} -Wall -Wno-comment")
if(APPLE)
set(CXX_FLAGS "${CXX_FLAGS} -Wa,-q")
endif()
endif()
if (OPENMP_FOUND)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS}")
# NVCC compiler settings
if (CUDA_FOUND)
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3")
set(CUDA_HOST_COMPILER "g++")
endif()
# ==================================================================================================
# GEMM with the CUDA library
set(NAME ${DWARF_PREFIX}_gemm_cuda)
if (CUDA_FOUND)
cuda_add_executable(${NAME} src/sgemm_cuda_matrixmul.cu)
target_link_libraries(${NAME} ${CUDA_curand_LIBRARY})
install(TARGETS ${NAME} DESTINATION bin)
else()
message("** Skipping '${NAME}': no CUDA")
dummy_install(${NAME} "CUDA")
endif()
unset(NAME)
# ==================================================================================================
=======
README
=======
# 1. Code sample name
gemm
# 2. Description of the code sample package
This example demonstrates a multi-dimensional thread blocks and shared memory implementation of matrix-matrix multiplication using NVIDIA's CUDA. The example is set-up to perform the computation of both CPU and GPU and in the end to verify the results.
Additional pre-requisites:
* CUDA
See http://docs.nvidia.com/cuda/ about CUDA information
# 3. Release date
6 Dec 2016
# 4. Version history
1.0
Damian Podareanu <damian.podareanu@surfsara.nl>
# 6. Copyright / License of the code sample
Apache 2.0
# 7. Language(s)
C++
CUDA
# 8. Parallelisation Implementation(s)
GPU
# 9. Level of the code sample complexity
Basic level
# 10. Instructions on how to compile the code
Uses the CodeVault CMake infrastructure, see main README.md
# 11. Instructions on how to run the code
Run the executable with a single command-line option, the matrix size
# 12. Sample input(s)
Input-data is generated automatically when running the program by computing sin and cos values.
# 13. Sample output(s)
Output data is verified programmatically using a CPU implementation of matrix-matrix multiplication.
#ifndef _DEV_ARRAY_H_
#define _DEV_ARRAY_H_
#include <stdexcept>
#include <algorithm>
#include <cuda_runtime.h>
template <class T>
class dev_array
{
// public functions
public:
explicit dev_array()
: start_(0),
end_(0)
{}
// constructor
explicit dev_array(size_t size)
{
allocate(size);
}
// destructor
~dev_array()
{
free();
}
// resize the vector
void resize(size_t size)
{
free();
allocate(size);
}
// get the size of the array
size_t getSize() const
{
return end_ - start_;
}
// get data
const T* getData() const
{
return start_;
}
T* getData()
{
return start_;
}
// set
void set(const T* src, size_t size)
{
size_t min = std::min(size, getSize());
cudaError_t result = cudaMemcpy(start_, src, min * sizeof(T), cudaMemcpyHostToDevice);
if (result != cudaSuccess)
{
throw std::runtime_error("failed to copy to device memory");
}
}
// get
void get(T* dest, size_t size)
{
size_t min = std::min(size, getSize());
cudaError_t result = cudaMemcpy(dest, start_, min * sizeof(T), cudaMemcpyDeviceToHost);
if (result != cudaSuccess)
{
throw std::runtime_error("failed to copy to host memory");
}
}
// private functions
private:
// allocate memory on the device
void allocate(size_t size)
{
cudaError_t result = cudaMalloc((void**)&start_, size * sizeof(T));
if (result != cudaSuccess)
{
start_ = end_ = 0;
throw std::runtime_error("failed to allocate device memory");
}
end_ = start_ + size;
}
// free memory on the device
void free()
{
if (start_ != 0)
{
cudaFree(start_);
start_ = end_ = 0;
}
}
T* start_;
T* end_;
};
#endif
\ No newline at end of file
#include <math.h>
#include <iostream>
#include "cuda_runtime.h"
#include "sgemm_cuda_kernel.h"
#include <stdlib.h>
using namespace std;
__global__ void matrixMultiplicationKernel(float* A, float* B, float* C, int N) {
int ROW = blockIdx.y*blockDim.y+threadIdx.y;
int COL = blockIdx.x*blockDim.x+threadIdx.x;
float tmpSum = 0;
if (ROW < N && COL < N) {
// each thread computes one element of the block sub-matrix
for (int i = 0; i < N; i++) {
tmpSum += A[ROW * N + i] * B[i * N + COL];
}
}
C[ROW * N + COL] = tmpSum;
}
void matrixMultiplication(float *A, float *B, float *C, int N){
// declare the number of blocks per grid and the number of threads per block
// use 1 to 512 threads per block
dim3 threadsPerBlock(N, N);
dim3 blocksPerGrid(1, 1);
if (N*N > 512){
threadsPerBlock.x = 512;
threadsPerBlock.y = 512;
blocksPerGrid.x = ceil(double(N)/double(threadsPerBlock.x));
blocksPerGrid.y = ceil(double(N)/double(threadsPerBlock.y));
}
matrixMultiplicationKernel<<<blocksPerGrid,threadsPerBlock>>>(A, B, C, N);
}
\ No newline at end of file
#ifndef KERNEL_CUH_
#define KERNEL_CUH_
void matrixMultiplication(float *A, float *B, float *C, int N);
#endif
\ No newline at end of file
#include <cstdio>
#include <iostream>
#include <cstdlib>
#include <vector>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>
#include "sgemm_cuda_kernel.h"
#include "sgemm_cuda_kernel.cu"
#include "dev_array.h"
#include <math.h>
using namespace std;
int main(int argc, char **argv)
{
int N;
if (argc !=2) {
printf("Usage: ./1_dense_cuda <matrix_size> \n");
return 1;
} else {
N = atoi(argv[1]);
}
// Perform matrix multiplication C = A*B
// where A, B and C are NxN matrices
int SIZE = N*N;
cudaEvent_t start, stop;
// create cuda timer events
cudaEventCreate(&start);
cudaEventCreate(&stop);
// start the timer
cudaEventRecord(start, NULL);
// Allocate memory on the host
vector<float> h_A(SIZE);
vector<float> h_B(SIZE);
vector<float> h_C(SIZE);
// Initialize matrices on the host
for (int i=0; i<N; i++){
for (int j=0; j<N; j++){
h_A[i*N+j] = sin(i);
h_B[i*N+j] = cos(j);
}
}
// Allocate memory on the device
dev_array<float> d_A(SIZE);
dev_array<float> d_B(SIZE);
dev_array<float> d_C(SIZE);
d_A.set(&h_A[0], SIZE);
d_B.set(&h_B[0], SIZE);
matrixMultiplication(d_A.getData(), d_B.getData(), d_C.getData(), N);
cudaDeviceSynchronize();
d_C.get(&h_C[0], SIZE);
cudaDeviceSynchronize();
cudaEventRecord(stop, NULL);
cudaEventSynchronize(stop);
float msec_total = 0.0f;
cudaEventElapsedTime(&msec_total, start, stop);
// Compute and print the performance
float msec_per_matrix_mul = msec_total;
double flops_per_matrix_mul = 2.0 * (double)N * (double)N * (double)N;
double giga_flops = (flops_per_matrix_mul * 1.0e-9f) / (msec_per_matrix_mul / 1000.0f);
printf(
"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
giga_flops,
msec_per_matrix_mul,
flops_per_matrix_mul);
float *cpu_C;
cpu_C=new float[SIZE];
// Now do the matrix multiplication on the CPU
float sum;
for (int row=0; row<N; row++){
for (int col=0; col<N; col++){
sum = 0.f;
for (int n=0; n<N; n++){
sum += h_A[row*N+n]*h_B[n*N+col];
}
cpu_C[row*N+col] = sum;
}
}
double err = 0;
// Check the result and make sure it is correct
for (int ROW=0; ROW < N; ROW++){
for (int COL=0; COL < N; COL++){
err += cpu_C[ROW * N + COL] - h_C[ROW * N + COL];
}
}
cout << "Error: " << err << endl;
return 0;
}
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment