Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#include <math.h>
#include <iostream>
#include "cuda_runtime.h"
#include "sgemm_cuda_kernel.h"
#include <stdlib.h>
using namespace std;
__global__ void matrixMultiplicationKernel(float* A, float* B, float* C, int N) {
int ROW = blockIdx.y*blockDim.y+threadIdx.y;
int COL = blockIdx.x*blockDim.x+threadIdx.x;
float tmpSum = 0;
if (ROW < N && COL < N) {
// each thread computes one element of the block sub-matrix
for (int i = 0; i < N; i++) {
tmpSum += A[ROW * N + i] * B[i * N + COL];
}
}
C[ROW * N + COL] = tmpSum;
}
void matrixMultiplication(float *A, float *B, float *C, int N){
// declare the number of blocks per grid and the number of threads per block
// use 1 to 512 threads per block
dim3 threadsPerBlock(N, N);
dim3 blocksPerGrid(1, 1);
if (N*N > 512){
threadsPerBlock.x = 512;
threadsPerBlock.y = 512;
blocksPerGrid.x = ceil(double(N)/double(threadsPerBlock.x));
blocksPerGrid.y = ceil(double(N)/double(threadsPerBlock.y));
}
matrixMultiplicationKernel<<<blocksPerGrid,threadsPerBlock>>>(A, B, C, N);
}