diff --git a/gemm/cuda/CMakeLists.txt b/gemm/cuda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bac94b025753d9449418fdb2461b7953bececac4
--- /dev/null
+++ b/gemm/cuda/CMakeLists.txt
@@ -0,0 +1,60 @@
+
+# ==================================================================================================
+# This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
+# CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
+#
+# Author(s):
+#   Valeriu Codreanu <valeriu.codreanu@surfsara.nl>
+#
+# ==================================================================================================
+
+# Packages are optional: if they are not present, certain code samples are not compiled
+find_package(CUDA)     # Built-in in CMake
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/common.cmake)
+# ==================================================================================================
+if ("${DWARF_PREFIX}" STREQUAL "")
+  set(DWARF_PREFIX 1_dense)
+endif()
+
+# C++ compiler settings
+
+find_package(Common)
+select_compiler_flags(cxx_flags
+  GNU "-march=native"   # I suggest remove "-O3" as this is controlled by the CMAKE_BUILD_TYPE
+  CLANG "-march=native" # same here
+  Intel "-axavx2,avx")
+set(CXX_FLAGS ${cxx_flags})
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+  set(CXX_FLAGS "${CXX_FLAGS} -Wall -Wno-comment")
+  if(APPLE)
+    set(CXX_FLAGS "${CXX_FLAGS} -Wa,-q")
+  endif()
+endif()
+if (OPENMP_FOUND)
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS}")
+
+# NVCC compiler settings
+if (CUDA_FOUND)
+  set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3")
+  set(CUDA_HOST_COMPILER "g++")
+endif()
+
+# ==================================================================================================
+
+# GEMM with the CUDA library
+set(NAME ${DWARF_PREFIX}_gemm_cuda)
+if (CUDA_FOUND)
+  cuda_add_executable(${NAME} src/sgemm_cuda_matrixmul.cu)
+  target_link_libraries(${NAME} ${CUDA_curand_LIBRARY})
+  install(TARGETS ${NAME} DESTINATION bin)
+else()
+  message("** Skipping '${NAME}': no CUDA")
+  dummy_install(${NAME} "CUDA")
+endif()
+
+unset(NAME)
+
+# ==================================================================================================
diff --git a/gemm/cuda/README.md b/gemm/cuda/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7d9bb5f01da38250a42c8867ce72e8f0fc5ffdf6
--- /dev/null
+++ b/gemm/cuda/README.md
@@ -0,0 +1,56 @@
+=======
+README
+=======
+
+# 1. Code sample name
+gemm
+
+# 2. Description of the code sample package
+This example demonstrates matrix-matrix multiplication implemented with multi-dimensional thread blocks in NVIDIA's CUDA. The example is set up to perform the computation on both the CPU and the GPU and to verify at the end that the two results agree.
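+
+Each GPU thread computes one element of the result as an inner product: with the row-major layout used throughout the sample, C[i*N+j] is the sum over k of A[i*N+k] * B[k*N+j].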
+
+Additional pre-requisites:
+* CUDA 
+
+See http://docs.nvidia.com/cuda/ for more information about CUDA
+
+# 3. Release date
+6 Dec 2016
+
+# 4. Version history 
+1.0
+
+# 5. Contributor (name and affiliation)
+Damian Podareanu <damian.podareanu@surfsara.nl>
+
+# 6. Copyright / License of the code sample
+Apache 2.0
+
+# 7. Language(s) 
+C++
+CUDA
+
+# 8. Parallelisation Implementation(s)
+GPU
+
+# 9. Level of the code sample complexity 
+Basic level
+
+# 10. Instructions on how to compile the code
+Uses the CodeVault CMake infrastructure; see the main README.md
+
+# 11. Instructions on how to run the code
+Run the executable with a single command-line argument, the matrix size N
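+
+For example, to multiply two 1024x1024 matrices (the binary name assumes the default DWARF_PREFIX from the CMake configuration above):
+
+    ./1_dense_gemm_cuda 1024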
+
+# 12. Sample input(s)
+Input data is generated automatically at run time: A is filled with sin(i) and B with cos(j) values.
+
+# 13. Sample output(s)
+Output data is verified programmatically using a CPU implementation of matrix-matrix multiplication.
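+
+The program prints a line of the form "Performance= X GFlop/s, Time= Y msec, Size= Z Ops", followed by the accumulated absolute difference between the CPU and the GPU result ("Error: ...").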
diff --git a/gemm/cuda/src/dev_array.h b/gemm/cuda/src/dev_array.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad7d2132e6f384553bdc92dfc6887041b3e53612
--- /dev/null
+++ b/gemm/cuda/src/dev_array.h
@@ -0,0 +1,106 @@
+#ifndef _DEV_ARRAY_H_
+#define _DEV_ARRAY_H_
+
+#include <stdexcept>
+#include <algorithm>
+#include <cuda_runtime.h>
+
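+// dev_array: a small RAII wrapper around a device-memory buffer. It owns the
+// allocation (cudaMalloc in allocate(), cudaFree in the destructor) and offers
+// set()/get() helpers for copying data between host and device.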
+template <class T>
+class dev_array
+{
+// public functions
+public:
+    explicit dev_array()
+        : start_(0),
+          end_(0)
+    {}
+
+    // constructor
+    explicit dev_array(size_t size)
+    {
+        allocate(size);
+    }
+    // destructor
+    ~dev_array()
+    {
+        free();
+    }
+
+    // resize the vector
+    void resize(size_t size)
+    {
+        free();
+        allocate(size);
+    }
+
+    // get the size of the array
+    size_t getSize() const
+    {
+        return end_ - start_;
+    }
+
+    // get data
+    const T* getData() const
+    {
+        return start_;
+    }
+
+    T* getData()
+    {
+        return start_;
+    }
+
+    // copy up to 'size' elements from host memory to the device (clamped to the allocated size)
+    void set(const T* src, size_t size)
+    {
+        size_t min = std::min(size, getSize());
+        cudaError_t result = cudaMemcpy(start_, src, min * sizeof(T), cudaMemcpyHostToDevice);
+        if (result != cudaSuccess)
+        {
+            throw std::runtime_error("failed to copy to device memory");
+        }
+    }
+    // copy up to 'size' elements from the device back to host memory (clamped to the allocated size)
+    void get(T* dest, size_t size)
+    {
+        size_t min = std::min(size, getSize());
+        cudaError_t result = cudaMemcpy(dest, start_, min * sizeof(T), cudaMemcpyDeviceToHost);
+        if (result != cudaSuccess)
+        {
+            throw std::runtime_error("failed to copy to host memory");
+        }
+    }
+
+
+// private functions
+private:
+    // allocate memory on the device
+    void allocate(size_t size)
+    {
+        cudaError_t result = cudaMalloc((void**)&start_, size * sizeof(T));
+        if (result != cudaSuccess)
+        {
+            start_ = end_ = 0;
+            throw std::runtime_error("failed to allocate device memory");
+        }
+        end_ = start_ + size;
+    }
+
+    // free memory on the device
+    void free()
+    {
+        if (start_ != 0)
+        {
+            cudaFree(start_);
+            start_ = end_ = 0;
+        }
+    }
+
+    T* start_;
+    T* end_;
+};
+
+#endif
\ No newline at end of file
diff --git a/gemm/cuda/src/sgemm_cuda_kernel.cu b/gemm/cuda/src/sgemm_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3485a531637eae654166f247d15125136f582908
--- /dev/null
+++ b/gemm/cuda/src/sgemm_cuda_kernel.cu
@@ -0,0 +1,44 @@
+#include <math.h>
+#include <iostream>
+#include "cuda_runtime.h"
+#include "sgemm_cuda_kernel.h"
+#include <stdlib.h>
+
+using namespace std;
+
+__global__ void matrixMultiplicationKernel(float* A, float* B, float* C, int N) {
+
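+    // global row and column of the output element computed by this thread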
+    int ROW = blockIdx.y*blockDim.y+threadIdx.y;
+    int COL = blockIdx.x*blockDim.x+threadIdx.x;
+
+    float tmpSum = 0;
+
+    if (ROW < N && COL < N) {
+        // each thread computes one element of the output matrix
+        for (int i = 0; i < N; i++) {
+            tmpSum += A[ROW * N + i] * B[i * N + COL];
+        }
+        // the store is guarded as well: threads outside the matrix must not write
+        C[ROW * N + COL] = tmpSum;
+    }
+}
+
+
+void matrixMultiplication(float *A, float *B, float *C, int N){
+
+    // declare the number of blocks per grid and the number of threads per block;
+    // a block may hold at most 1024 threads in total (512 on older hardware), so
+    // for larger matrices use 16x16 = 256 threads per block and enough blocks to
+    // cover the whole matrix
+    dim3 threadsPerBlock(N, N);
+    dim3 blocksPerGrid(1, 1);
+    if (N*N > 512) {
+        threadsPerBlock.x = 16;
+        threadsPerBlock.y = 16;
+        blocksPerGrid.x = ceil(double(N)/double(threadsPerBlock.x));
+        blocksPerGrid.y = ceil(double(N)/double(threadsPerBlock.y));
+    }
+
+    matrixMultiplicationKernel<<<blocksPerGrid,threadsPerBlock>>>(A, B, C, N);
+}
\ No newline at end of file
diff --git a/gemm/cuda/src/sgemm_cuda_kernel.h b/gemm/cuda/src/sgemm_cuda_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..40b46493b71af47fee5193b8851f6402e80a5a4f
--- /dev/null
+++ b/gemm/cuda/src/sgemm_cuda_kernel.h
@@ -0,0 +1,6 @@
+#ifndef KERNEL_CUH_
+#define KERNEL_CUH_
+
+void matrixMultiplication(float *A, float *B, float *C, int N);
+
+#endif
\ No newline at end of file
diff --git a/gemm/cuda/src/sgemm_cuda_matrixmul.cu b/gemm/cuda/src/sgemm_cuda_matrixmul.cu
new file mode 100644
index 0000000000000000000000000000000000000000..477a57f0fc6337ecf99c9780f153bd471cee42a8
--- /dev/null
+++ b/gemm/cuda/src/sgemm_cuda_matrixmul.cu
@@ -0,0 +1,113 @@
+#include <cstdio>
+#include <iostream>
+#include <cstdlib>
+#include <vector>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
+#include "sgemm_cuda_kernel.h"
+#include "sgemm_cuda_kernel.cu"
+#include "dev_array.h"
+#include <math.h>
+
+using namespace std;
+
+int main(int argc, char **argv)
+{
+    int N;
+
+    if (argc != 2) {
+        printf("Usage: %s <matrix_size>\n", argv[0]);
+        return 1;
+    } else {
+        N = atoi(argv[1]);
+    }
+
+    // Perform matrix multiplication C = A*B
+    // where A, B and C are NxN matrices
+    int SIZE = N*N;
+
+    cudaEvent_t start, stop;
+
+    // create cuda timer events
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    // start the timer
+    cudaEventRecord(start, NULL);
+
+    // Allocate memory on the host
+    vector<float> h_A(SIZE);
+    vector<float> h_B(SIZE);
+    vector<float> h_C(SIZE);
+
+    // Initialize matrices on the host
+    for (int i=0; i<N; i++){
+        for (int j=0; j<N; j++){
+            h_A[i*N+j] = sin(i);
+            h_B[i*N+j] = cos(j);
+        }
+    }
+
+    // Allocate memory on the device
+    dev_array<float> d_A(SIZE);
+    dev_array<float> d_B(SIZE);
+    dev_array<float> d_C(SIZE);
+
+    d_A.set(&h_A[0], SIZE);
+    d_B.set(&h_B[0], SIZE);
+
+    matrixMultiplication(d_A.getData(), d_B.getData(), d_C.getData(), N);
+    cudaDeviceSynchronize();
+
+    d_C.get(&h_C[0], SIZE);
+    cudaDeviceSynchronize();
+
+    cudaEventRecord(stop, NULL);
+    cudaEventSynchronize(stop);
+
+    float msec_total = 0.0f;
+    cudaEventElapsedTime(&msec_total, start, stop);
+
+    // Compute and print the performance
+    float msec_per_matrix_mul = msec_total;
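+    // each of the N*N output elements requires N multiply-add pairs: 2*N^3 flops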
+    double flops_per_matrix_mul = 2.0 * (double)N * (double)N * (double)N;
+    double giga_flops = (flops_per_matrix_mul * 1.0e-9f) / (msec_per_matrix_mul / 1000.0f);
+    printf(
+        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
+        giga_flops,
+        msec_per_matrix_mul,
+        flops_per_matrix_mul);
+
+
+    float *cpu_C;
+    cpu_C=new float[SIZE];
+
+    // Now do the matrix multiplication on the CPU
+    float sum;
+    for (int row=0; row<N; row++){
+        for (int col=0; col<N; col++){
+            sum = 0.f;
+            for (int n=0; n<N; n++){
+                sum += h_A[row*N+n]*h_B[n*N+col];
+            }
+            cpu_C[row*N+col] = sum;
+        }
+    }
+
+    double err = 0;
+    // Check the result: accumulate the absolute element-wise differences,
+    // since signed differences could cancel each other out
+    for (int ROW=0; ROW < N; ROW++){
+        for (int COL=0; COL < N; COL++){
+            err += fabs(cpu_C[ROW * N + COL] - h_C[ROW * N + COL]);
+        }
+    }
+
+    cout << "Error: " << err << endl;
+
+    delete[] cpu_C;
+
+    return 0;
+}
\ No newline at end of file