#include #include #include #include #include #include #include #include "sgemm_cuda_kernel.h" #include "sgemm_cuda_kernel.cu" #include "dev_array.h" #include using namespace std; int main(int argc, char **argv) { int N; if (argc !=2) { printf("Usage: ./1_dense_cuda \n"); return 1; } else { N = atoi(argv[1]); } // Perform matrix multiplication C = A*B // where A, B and C are NxN matrices int SIZE = N*N; cudaEvent_t start, stop; // create cuda timer events cudaEventCreate(&start); cudaEventCreate(&stop); // start the timer cudaEventRecord(start, NULL); // Allocate memory on the host vector h_A(SIZE); vector h_B(SIZE); vector h_C(SIZE); // Initialize matrices on the host for (int i=0; i d_A(SIZE); dev_array d_B(SIZE); dev_array d_C(SIZE); d_A.set(&h_A[0], SIZE); d_B.set(&h_B[0], SIZE); matrixMultiplication(d_A.getData(), d_B.getData(), d_C.getData(), N); cudaDeviceSynchronize(); d_C.get(&h_C[0], SIZE); cudaDeviceSynchronize(); cudaEventRecord(stop, NULL); cudaEventSynchronize(stop); float msec_total = 0.0f; cudaEventElapsedTime(&msec_total, start, stop); // Compute and print the performance float msec_per_matrix_mul = msec_total; double flops_per_matrix_mul = 2.0 * (double)N * (double)N * (double)N; double giga_flops = (flops_per_matrix_mul * 1.0e-9f) / (msec_per_matrix_mul / 1000.0f); printf( "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", giga_flops, msec_per_matrix_mul, flops_per_matrix_mul); float *cpu_C; cpu_C=new float[SIZE]; // Now do the matrix multiplication on the CPU float sum; for (int row=0; row