Commit 868e635a authored by Damian Podareanu's avatar Damian Podareanu
Browse files

Initial batch matrix multiplication commit. Contains only the cuBLAS implementation for the moment

parent f0fe3794
# ==================================================================================================
# This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
# CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
#
# Author(s):
# Damian Podareanu <damian.podareanu@surfsara.nl>
#
# ==================================================================================================
# Packages are optional: if they are not present, certain code samples are not compiled
find_package(OpenMP) # Built-in in CMake
find_package(MPI) # Built-in in CMake
find_package(CUDA) # Built-in in CMake
# Pulls in shared CodeVault helpers (select_compiler_flags, dummy_install, ...)
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/common.cmake)
# ==================================================================================================
# DWARF_PREFIX names the dwarf category; default to 1_dense when not set by a parent scope
if ("${DWARF_PREFIX}" STREQUAL "")
set(DWARF_PREFIX 1_dense)
endif()
# C++ compiler settings
find_package(Common)
# Per-compiler optimisation flags (helper from common.cmake)
select_compiler_flags(cxx_flags
GNU "-march=native" # I suggest remove "-O3" as this is controlled by the CMAKE_BUILD_TYPE
CLANG "-march=native" # same here
Intel "-axavx2,avx")
set(CXX_FLAGS ${cxx_flags})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(CXX_FLAGS "${CXX_FLAGS} -Wall -Wno-comment")
if(APPLE)
# Route GNU assembly through clang's integrated assembler on macOS (-Wa,-q)
set(CXX_FLAGS "${CXX_FLAGS} -Wa,-q")
endif()
endif()
if (OPENMP_FOUND)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS}")
# NVCC compiler settings
if (CUDA_FOUND)
# Do not forward host CXX flags to nvcc (e.g. -march=native confuses it)
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# NOTE(review): only sm_35 is targeted here; newer GPUs rely on the embedded
# compute_35 PTX being JIT-compiled — consider adding more -gencode entries.
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3 -gencode=arch=compute_35,code=\"sm_35,compute_35\"")
set(CUDA_HOST_COMPILER "g++")
endif()
# ==================================================================================================
# Batch GEMM with the CUDA cuBLAS library
set(NAME ${DWARF_PREFIX}_cublas_bgemm)
if (CUDA_FOUND)
cuda_add_executable(${NAME} src/cublas_bgemm.cpp)
target_link_libraries(${NAME} ${CUDA_CUBLAS_LIBRARIES})
target_link_libraries(${NAME} ${CUDA_curand_LIBRARY})
install(TARGETS ${NAME} DESTINATION bin)
if (OPENMP_FOUND)
message("** Enabling '${NAME}': with OpenMP")
else()
message("** Enabling '${NAME}': without OpenMP")
endif()
else()
# No CUDA toolkit: register a placeholder target so the global build still works
message("** Skipping '${NAME}': no CUDA")
dummy_install(${NAME} "CUDA")
endif()
unset(NAME)
# ==================================================================================================
=======
README
=======
# 1. Code sample name
gemm
# 2. Description of the code sample package
This example demonstrates the use of NVIDIA's linear algebra library for CUDA: cuBLAS. The example is set up to perform a batch matrix multiplication.
Additional pre-requisites:
* CUDA (includes the cuBLAS library)
See http://docs.nvidia.com/cuda/cublas for the full cuBLAS documentation.
# 3. Release date
5 Dec 2016
# 4. Version history
1.0
# 5. Contributor (s) / Maintainer(s)
Damian Podareanu <damian.podareanu@surfsara.nl>
# 6. Copyright / License of the code sample
Apache 2.0
# 7. Language(s)
C++
CUDA
# 8. Parallelisation Implementation(s)
GPU
# 9. Level of the code sample complexity
Basic level, uses library calls only
# 10. Instructions on how to compile the code
Uses the CodeVault CMake infrastructure, see main README.md
# 11. Instructions on how to run the code
Run the executable with a single command-line option, the matrix size
# 12. Sample input(s)
Input-data is generated automatically when running the program.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Problem dimensions: each M is ROWM x COLM, each N is COLM x COLN,
// so each product P is ROWM x COLN.
#define ROWM 2000
#define COLM 2000
#define COLN 2000
// Abort with a diagnostic if the most recent CUDA runtime call failed.
// Reads (and clears) the sticky CUDA error state via cudaGetLastError,
// printing `msg` plus the CUDA error string and the call site.
// (No comments inside the macro: a // would swallow the line-continuation backslash.)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// Matrix element type; to switch to double, also change the
// cublasSgemmBatched call in GPU_Multi to cublasDgemmBatched.
typedef float mytype;
// Pi = Mi x Ni
// pr = P rows = M rows
// pc = P cols = N cols
// mc = M cols = N rows
// Computes P[i] = alpha * M[i] x N[i] + beta * P[i] for i in [0, num_mat)
// with a single batched cuBLAS call (cublasSgemmBatched).
//
// M, N, P: host arrays of num_mat pointers; M[i] is pr x mc, N[i] is mc x pc,
//          P[i] is pr x pc. Results are copied back into P[i].
// pr/pc/mc: GEMM dimensions (see the legend above); alpha/beta: GEMM scalars.
// Aborts via cudaCheckErrors/assert on any CUDA or cuBLAS failure.
//
// NOTE(review): cuBLAS treats matrices as column-major; the caller's data is
// filled row-major. This is harmless for the constant test matrices used here,
// but general inputs would need transposes — confirm before reuse.
void GPU_Multi(mytype **M, mytype **N, mytype **P, size_t pr, size_t pc, size_t mc, size_t num_mat, mytype alpha, mytype beta)
{
    // Host-side lists of per-matrix device buffers. std::vector replaces the
    // original variable-length arrays, which are not standard C++.
    std::vector<mytype *> devM(num_mat), devN(num_mat), devP(num_mat);
    size_t p_size = sizeof(mytype) * pr * pc;
    size_t m_size = sizeof(mytype) * pr * mc;
    size_t n_size = sizeof(mytype) * mc * pc;
    // Device-resident arrays of device pointers, the form cublasSgemmBatched expects.
    const mytype **d_Marray, **d_Narray;
    mytype **d_Parray;
    cublasHandle_t myhandle;
    cublasStatus_t cublas_result;

    // Allocate one device buffer per input/output matrix.
    for (size_t i = 0; i < num_mat; i++) {
        cudaMalloc((void**)&devM[i], m_size);
        cudaMalloc((void**)&devN[i], n_size);
        cudaMalloc((void**)&devP[i], p_size);
    }
    cudaMalloc((void**)&d_Marray, num_mat * sizeof(mytype *));
    cudaMalloc((void**)&d_Narray, num_mat * sizeof(mytype *));
    cudaMalloc((void**)&d_Parray, num_mat * sizeof(mytype *));
    cudaCheckErrors("cudaMalloc fail");

    // Copy matrix contents, then the pointer tables, to the device.
    for (size_t i = 0; i < num_mat; i++) {
        cudaMemcpy(devM[i], M[i], m_size, cudaMemcpyHostToDevice);
        cudaMemcpy(devN[i], N[i], n_size, cudaMemcpyHostToDevice);
        cudaMemcpy(devP[i], P[i], p_size, cudaMemcpyHostToDevice);
    }
    cudaMemcpy(d_Marray, devM.data(), num_mat * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Narray, devN.data(), num_mat * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Parray, devP.data(), num_mat * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy H2D fail");

    cublas_result = cublasCreate(&myhandle);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // change to cublasDgemmBatched for double
    cublas_result = cublasSgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N, pr, pc, mc, &alpha, d_Marray, pr, d_Narray, mc, &beta, d_Parray, pr, num_mat);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // Fix: release the cuBLAS handle — the original leaked it on every call.
    cublas_result = cublasDestroy(myhandle);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);

    // Copy results back, and check BEFORE freeing so a failed copy is not
    // masked by (or misattributed to) the frees below.
    for (size_t i = 0; i < num_mat; i++) {
        cudaMemcpy(P[i], devP[i], p_size, cudaMemcpyDeviceToHost);
    }
    cudaCheckErrors("cudaMemcpy D2H fail");

    for (size_t i = 0; i < num_mat; i++) {
        cudaFree(devM[i]);
        cudaFree(devN[i]);
        cudaFree(devP[i]);
    }
    cudaFree(d_Marray);
    cudaFree(d_Narray);
    cudaFree(d_Parray);
    cudaCheckErrors("cudaFree fail");
}
// Driver: builds two batches of constant matrices, multiplies them on the GPU
// via GPU_Multi, and verifies every output element on the host.
// Returns 0 on success, 1 on the first mismatch.
int main(){
    const size_t m_elems = (size_t)ROWM * COLM;
    const size_t n_elems = (size_t)COLM * COLN;
    const size_t p_elems = (size_t)ROWM * COLN;
    // Fix: heap-allocate the matrices. The original declared six 2000x2000
    // float arrays (~16 MB each, ~96 MB total) as locals, which overflows a
    // typical 8 MB thread stack before any code runs.
    std::vector<mytype> h_M1(m_elems, 1.0f), h_M2(m_elems, 2.0f);
    std::vector<mytype> h_N1(n_elems, 1.0f), h_N2(n_elems, 1.0f);
    std::vector<mytype> h_P1(p_elems, 0.0f), h_P2(p_elems, 0.0f);
    // Pointer batches handed to GPU_Multi: two matrices per operand.
    mytype *h_Marray[2] = { h_M1.data(), h_M2.data() };
    mytype *h_Narray[2] = { h_N1.data(), h_N2.data() };
    mytype *h_Parray[2] = { h_P1.data(), h_P2.data() };

    GPU_Multi(h_Marray, h_Narray, h_Parray, ROWM, COLN, COLM, 2, 1.0f, 0.0f);

    // Every entry of P is a dot product of a constant row with a column of
    // ones: COLM * 1 for batch 0 (M = ones) and COLM * 2 for batch 1 (M = twos).
    for (int i = 0; i < ROWM; i++)
        for (int j = 0; j < COLN; j++){
            size_t idx = (size_t)i * COLN + j;
            if (h_P1[idx] != COLM*1.0f) {printf("h_P1 mismatch at %d,%d was: %f should be: %f\n", i, j, h_P1[idx], COLM*1.0f); return 1;}
            if (h_P2[idx] != COLM*2.0f) {printf("h_P2 mismatch at %d,%d was: %f should be: %f\n", i, j, h_P2[idx], COLM*2.0f); return 1;}
        }
    printf("Success!\n");
    return 0;
}
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment