Commit 5f1d5c8f authored by Valeriu Codreanu's avatar Valeriu Codreanu
Browse files

Merge branch 'master' into 'master'

Added OpenMP code samples for gemm and LU decomposition(Doolittle algorithm)



See merge request !72
parents 541cc903 84ebf503
# ==================================================================================================
# This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
# CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
#
# Author(s):
# Rafal Gandecki <rafal.gandecki@pwr.edu.pl>
#
# ==================================================================================================
# Build script for the gemm_openmp code sample (src/gemm_openmp.cpp).
# The executable is only built and installed when find_package(OpenMP) reports support;
# otherwise a message is printed and no target is created.
cmake_minimum_required(VERSION 2.8.7 FATAL_ERROR)
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/common.cmake)

# ==================================================================================================

# Use a default target name when DWARF_PREFIX is empty or unset.
if ("${DWARF_PREFIX}" STREQUAL "")
  set(DWARF_PREFIX gemm_openmp)
endif()

find_package(Common)
find_package(OpenMP)
# NOTE(review): the only source file is C++; enabling C is kept from the original - confirm it is needed.
enable_language(C)
set(NAME ${DWARF_PREFIX})

if (OPENMP_FOUND)
  # Use the flag FindOpenMP detected for the C++ compiler instead of hard-coding the
  # GCC/Clang spelling "-fopenmp", so compilers with a different OpenMP switch also work.
  set(CXX_FLAGS "${CXX_FLAGS} -Wall -Wno-comment -std=c++0x ${OpenMP_CXX_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS}")
  add_executable(${NAME} src/gemm_openmp.cpp)
  install(TARGETS ${NAME} DESTINATION bin)
else()
  message("## Skipping '${NAME}': no OpenMP support found")
endif()

# Do not leak the helper variable into other scripts processed in the same scope.
unset(NAME)
README
=======
# 1. Code sample name
gemm_openmp
# 2. Description of the code sample package
This example demonstrates the use of OpenMP for matrix-matrix multiplication.
# 3. Release date
19 August 2016
# 4. Version history
1.0
# 6. Copyright / License of the code sample
Apache Version 2.0
# 5. Contributor (s) / Maintainer(s)
Rafal Gandecki <rafal.gandecki@pwr.edu.pl>
# 7. Language(s)
C++ 11
# 8. Parallelisation Implementation(s)
OpenMP
# 9. Level of the code sample complexity
basic
# 10. Instructions on how to compile the code
Uses the CodeVault CMake infrastructure, see main README.md
# 11. Instructions on how to run the code
Just run compiled executable
# 12. Sample input(s)
3 arguments:
- matrix A rows
- matrix A cols
- matrix B cols
# 13. Sample output(s)
execution time of the algorithm with and without OpenMP
// =================================================================================================
// This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
// CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
//
// Author(s):
// Rafal Gandecki <rafal.gandecki@pwr.edu.pl>
//
// This example demonstrates the use of OpenMP for matrix-matrix multiplication and
// compares execution time of algorithms.
// The example is set-up to perform single precision matrix-matrix multiplication.
// The example takes three input arguments (matrix A rows, matrix A cols, matrix B cols),
// specifying the size of the matrices.
// See [http://www.openmp.org/] for the full OpenMP documentation.
//
// =================================================================================================
#include <omp.h>
#include <random>
#include <iostream>
// Fills the n-by-m row-major matrix A with pseudo-random floats produced by a
// default-constructed std::uniform_real_distribution<float>.
//
//   A - destination buffer holding at least n*m floats
//   n - number of rows
//   m - number of columns
//
// The Mersenne Twister engine is created once (function-local static) and seeded from
// std::random_device. Re-seeding from the wall clock on every call made two calls within
// the same second (as main does for A and B) start from the same seed and repeat the same
// number stream. The shared engine is not synchronised, so call this from one thread only.
void fill_random(float *A, const int &n, const int &m)
{
    static std::mt19937 engine{std::random_device{}()};
    std::uniform_real_distribution<float> dist;
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < m; ++col)
        {
            A[row*m + col] = dist(engine);
        }
    }
}
// Single-threaded reference matrix product C = A * B on row-major float arrays.
//
//   A      - left operand, A_rows x A_cols
//   B      - right operand, read with a row stride of B_rows
//   C      - result, written with a row stride of B_rows
//   A_rows - number of rows of A (and of C)
//   A_cols - number of columns of A, i.e. the length of each dot product
//   B_rows - despite its name this is used as the number of COLUMNS of B and C
//            (main passes B_cols here)
void gemm(float *A, float *B, float *C,
          const int &A_rows, const int &A_cols, const int &B_rows)
{
    for (int row = 0; row < A_rows; ++row)
    {
        for (int col = 0; col < B_rows; ++col)
        {
            // Dot product of row `row` of A with column `col` of B.
            float dot = 0.0;
            int inner = 0;
            while (inner < A_cols)
            {
                dot += A[row*A_cols + inner] * B[inner*B_rows + col];
                ++inner;
            }
            C[row*B_rows + col] = dot;
        }
    }
}
// OpenMP matrix product C = A * B on row-major float arrays.
//
// The rows of C are distributed over the threads by the `parallel for` on the outer loop;
// every (row, col) element is written by exactly one iteration.
//
//   A      - left operand, A_rows x A_cols
//   B      - right operand, read with a row stride of B_rows
//   C      - result, written with a row stride of B_rows
//   B_rows - despite its name this is used as the number of COLUMNS of B and C
//            (main passes B_cols here)
void gemm_OpenMP(float *A, float *B, float *C,
                 const int &A_rows, const int &A_cols, const int &B_rows)
{
    // The loop counters are declared inside the loops, so each thread has its own copies.
#pragma omp parallel for shared(A, B, C, A_rows, A_cols, B_rows)
    for (int row = 0; row < A_rows; ++row)
    {
        for (int col = 0; col < B_rows; ++col)
        {
            float dot = 0.0;
            for (int inner = 0; inner < A_cols; ++inner)
            {
                dot += A[row*A_cols + inner] * B[inner*B_rows + col];
            }
            C[row*B_rows + col] = dot;
        }
    }
}
// Entry point of the gemm sample.
//
// Expects exactly three command-line arguments: rows of A, columns of A (which is also
// the number of rows of B) and columns of B. Fills A and B with random values, multiplies
// them once with gemm_OpenMP and once with gemm, and prints the wall-clock time of each run.
//
// Returns 0 on success, or 1 when the argument count is wrong or a dimension is not a
// positive integer.
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        std::cout << "Usage: 3 arguments: matrix A rows, matrix A cols and matrix B cols"<< std::endl;
        return 1;
    }

    const int A_rows = atoi(argv[1]);
    const int A_cols = atoi(argv[2]);
    const int B_rows = A_cols;          // inner dimensions must agree
    const int B_cols = atoi(argv[3]);

    // atoi yields 0 for non-numeric text and accepts negative numbers; neither is a usable
    // array size, so reject them before they reach new[].
    if (A_rows <= 0 || A_cols <= 0 || B_cols <= 0)
    {
        std::cout << "Error: all matrix dimensions must be positive integers" << std::endl;
        return 1;
    }

    float *A = new float[A_rows*A_cols];
    float *B = new float[B_rows*B_cols];
    float *C = new float[A_rows*B_cols];
    fill_random(A, A_rows, A_cols);
    fill_random(B, B_rows, B_cols);

    // The last argument of both kernels is the column count of B (see their parameter notes).
    double dtime = omp_get_wtime();
    gemm_OpenMP(A, B, C, A_rows, A_cols, B_cols);
    dtime = omp_get_wtime() - dtime;
    std::cout << "Time with OpenMp: " << dtime << std::endl;

    dtime = omp_get_wtime();
    gemm(A,B,C, A_rows, A_cols, B_cols);
    dtime = omp_get_wtime() - dtime;
    std::cout << "Time without OpenMP: " << dtime << std::endl;

    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
# ==================================================================================================
# This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
# CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
#
# Author(s):
# Rafal Gandecki <rafal.gandecki@pwr.edu.pl>
#
# ==================================================================================================
# Build script for the lud_openmp code sample (src/lud_openmp.cpp).
# The executable is only built and installed when find_package(OpenMP) reports support;
# otherwise a message is printed and no target is created.
cmake_minimum_required(VERSION 2.8.7 FATAL_ERROR)
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/common.cmake)

# ==================================================================================================

# Use a default target name when DWARF_PREFIX is empty or unset.
if ("${DWARF_PREFIX}" STREQUAL "")
  set(DWARF_PREFIX lud_openmp)
endif()

find_package(Common)
find_package(OpenMP)
# NOTE(review): the only source file is C++; enabling C is kept from the original - confirm it is needed.
enable_language(C)
set(NAME ${DWARF_PREFIX})

if (OPENMP_FOUND)
  # Use the flag FindOpenMP detected for the C++ compiler instead of hard-coding the
  # GCC/Clang spelling "-fopenmp", so compilers with a different OpenMP switch also work.
  set(CXX_FLAGS "${CXX_FLAGS} -Wall -Wno-comment -std=c++0x ${OpenMP_CXX_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS}")
  add_executable(${NAME} src/lud_openmp.cpp)
  install(TARGETS ${NAME} DESTINATION bin)
else()
  message("## Skipping '${NAME}': no OpenMP support found")
endif()

# Do not leak the helper variable into other scripts processed in the same scope.
unset(NAME)
README
=======
# 1. Code sample name
lud_openmp
# 2. Description of the code sample package
This example demonstrates the use of OpenMP for LU decomposition (Doolittle algorithm).
# 3. Release date
19 August 2016
# 4. Version history
1.0
# 6. Copyright / License of the code sample
Apache Version 2.0
# 5. Contributor (s) / Maintainer(s)
Rafal Gandecki <rafal.gandecki@pwr.edu.pl>
# 7. Language(s)
C++ 11
# 8. Parallelisation Implementation(s)
OpenMP
# 9. Level of the code sample complexity
basic
# 10. Instructions on how to compile the code
Uses the CodeVault CMake infrastructure, see main README.md
# 11. Instructions on how to run the code
Just run compiled executable
# 12. Sample input(s)
1 argument:
- matrix size
# 13. Sample output(s)
execution time of algorithms with and without OpenMP
~
~
~
~
~
~
// =================================================================================================
// This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
// CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
//
// Author(s):
// Rafal Gandecki <rafal.gandecki@pwr.edu.pl>
//
// This example demonstrates the use of OpenMP for LU decomposition (Doolittle algorithm) and
// compares execution time.
// The example takes a single input argument, specifying the size of the matrices.
//
// See [http://www.openmp.org/] for the full OpenMP documentation.
//
// =================================================================================================
#include <omp.h>
#include <random>
#include <iostream>
// Fills the n-by-m row-major matrix A with pseudo-random floats produced by a
// default-constructed std::uniform_real_distribution<float>.
//
//   A - destination buffer holding at least n*m floats
//   n - number of rows
//   m - number of columns
//
// The Mersenne Twister engine is created once (function-local static) and seeded from
// std::random_device. Re-seeding from the wall clock on every call would make two calls
// within the same second repeat the same number stream, and relied on time() without
// including <ctime>. The shared engine is not synchronised, so call this from one thread only.
void fill_random(float *A, const int &n, const int &m)
{
    static std::mt19937 engine{std::random_device{}()};
    std::uniform_real_distribution<float> dist;
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < m; ++col)
        {
            A[row*m + col] = dist(engine);
        }
    }
}
// Sequential LU decomposition (Doolittle scheme, no pivoting) of the n-by-n row-major
// matrix A into a unit lower-triangular L and an upper-triangular U.
//
//   A - input matrix, only read
//   L - output, n*n floats; diagonal set to 1, entries above the diagonal set to 0
//   U - output, n*n floats
//   n - matrix order
//
// Step `step` first produces row `step` of U, then column `step` of L; both use values
// written by earlier steps, so the steps must run in order. Every column of the U row is
// recomputed, including those left of the diagonal. The code divides by U[step][step]
// without checking it for zero.
void lud(float *A, float *L, float *U, const int &n)
{
    for (int step = 0; step < n; ++step)
    {
        // Row `step` of U.
        for (int col = 0; col < n; ++col)
        {
            if (col > step)
            {
                U[col*n + step] = 0;    // clear the mirrored entry below the diagonal
            }
            const int here = step*n + col;
            U[here] = A[here];
            for (int k = 0; k < step; ++k)
            {
                U[here] -= U[k*n + col] * L[step*n + k];
            }
        }

        // Column `step` of L.
        for (int row = 0; row < n; ++row)
        {
            const int here = row*n + step;
            if (row > step)
            {
                L[here] = A[here] / U[step*n + step];
                for (int k = 0; k < step; ++k)
                {
                    L[here] -= ((U[k*n + step] * L[row*n + k]) / U[step*n + step]);
                }
            }
            else
            {
                // On the diagonal L is 1, above it L is 0.
                L[here] = (row == step) ? 1 : 0;
            }
        }
    }
}
// OpenMP LU decomposition (Doolittle scheme, no pivoting) of the n-by-n row-major
// matrix A into a unit lower-triangular L and an upper-triangular U.
//
//   A - input matrix, only read
//   L - output, n*n floats; diagonal set to 1, entries above the diagonal set to 0
//   U - output, n*n floats
//   n - matrix order
//
// Step i reads rows of U and columns of L that were written by steps k < i, so the outer
// loop carries a dependency and must NOT be distributed over threads (doing so is a data
// race that yields wrong factors). Instead every thread walks the outer loop in lock-step
// and the two inner loops, whose iterations are independent, are work-shared. The implicit
// barrier at the end of each `omp for` guarantees that row i of U is complete before
// column i of L is computed, and that column i of L is complete before step i+1 starts.
// The code divides by U[i][i] without checking it for zero.
void lud_OpenMP(float *A, float *L, float *U, const int &n)
{
#pragma omp parallel shared(A, L, U, n)
    {
        for (int i = 0; i < n; ++i)
        {
            // Row i of U: iteration j writes only U[i][j] (and U[j][i] for j > i).
#pragma omp for
            for (int j = 0; j < n; ++j)
            {
                if (j > i)
                    U[j*n+i] = 0;
                U[i*n+j] = A[i*n+j];
                for (int k = 0; k < i; ++k)
                {
                    U[i*n+j] -= U[k*n+j] * L[i*n+k];
                }
            }

            // Column i of L: iteration j writes only L[j][i].
#pragma omp for
            for (int j = 0; j < n; ++j)
            {
                if (i > j)
                    L[j*n+i] = 0;
                else if (j == i)
                    L[j*n+i] = 1;
                else
                {
                    L[j*n+i] = A[j*n+i] / U[i*n+i];
                    for (int k = 0; k < i; ++k)
                    {
                        L[j*n+i] -= ((U[k*n+i] * L[j*n+k]) / U[i*n+i]);
                    }
                }
            }
        }
    }
}
// Entry point of the LU decomposition sample.
//
// Expects exactly one command-line argument, the matrix order n. Fills an n-by-n matrix
// with random values, factorises it once with lud and once with lud_OpenMP (both write
// into the same L and U buffers), and prints the wall-clock time of each run.
//
// Returns 0 on success, or 1 when the argument count is wrong or n is not a positive
// integer.
int main(int argc, char **argv)
{
    if (argc != 2)
    {
        std::cout << "Usage: 1 argument: matrix size" << std::endl;
        return 1;
    }

    const int n = atoi(argv[1]);
    // atoi yields 0 for non-numeric text and accepts negative numbers; neither is a usable
    // array size, so reject them before they reach new[].
    if (n <= 0)
    {
        std::cout << "Error: matrix size must be a positive integer" << std::endl;
        return 1;
    }

    float *A = new float[n*n];
    float *L = new float[n*n];
    float *U = new float[n*n];
    fill_random(A, n, n);

    double dtime = omp_get_wtime();
    lud(A, L, U, n);
    dtime = omp_get_wtime() - dtime;
    std::cout << "Time without OpenMP: " << dtime << std::endl;

    dtime = omp_get_wtime();
    lud_OpenMP(A, L, U, n);
    dtime = omp_get_wtime() - dtime;
    std::cout << "Time with OpenMP: " << dtime << std::endl;

    delete[] A;
    delete[] L;
    delete[] U;
    return 0;
}
# Build script for the Fortran LU decomposition sample (src/lud.f90).
# The executable is only built and installed when find_package(OpenMP) reports support;
# otherwise a message is printed and dummy_install is called for the target name.
cmake_minimum_required(VERSION 2.8.7 FATAL_ERROR)
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/common.cmake)

# ==================================================================================================

# Use a default prefix when DWARF_PREFIX is empty or unset.
if ("${DWARF_PREFIX}" STREQUAL "")
  set(DWARF_PREFIX dense_linear_algebra)
endif()

find_package(Common)
find_package(OpenMP)
enable_language(Fortran)
set(NAME ${DWARF_PREFIX}_lud_openmp_fortran)

if (OPENMP_FOUND)
  # Extend the existing Fortran flags. Building them from CMAKE_CXX_FLAGS dropped any
  # user-supplied Fortran flags and passed C++-only options to the Fortran compiler.
  set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fopenmp")
  add_executable(${NAME} src/lud.f90)
  install(TARGETS ${NAME} DESTINATION bin)
else()
  message("## Skipping '${NAME}_omp': no OpenMP support found")
  dummy_install(${NAME} "OpenMP")
endif()

# Do not leak the helper variable into other scripts processed in the same scope.
unset(NAME)
=======
README
=======
# 1. Code sample name
lud_openmp
# 2. Description of the code sample package
This example demonstrates the use of Fortran 90 with OpenMP for LU decomposition (Doolittle algorithm).
# 3. Release date
25 August 2016
# 4. Version history
1.0
# 6. Copyright / License of the code sample
Apache Version 2.0
# 5. Contributor (s) / Maintainer(s)
Rafal Gandecki <rafal.gandecki@pwr.edu.pl>
# 7. Language(s)
Fortran 90
# 8. Parallelisation Implementation(s)
OpenMP
# 9. Level of the code sample complexity
basic
# 10. Instructions on how to compile the code
Uses the CodeVault CMake infrastructure, see main README.md
# 11. Instructions on how to run the code
Just run compiled executable
# 12. Sample input(s)
Randomly generated on sample code execution
# 13. Sample output(s)
execution time of algorithms with and without OpenMP
! =================================================================================================
! This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
! CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
!
! Author(s):
! Rafal Gandecki <rafal.gandecki@pwr.edu.pl>
!
! This example demonstrates the use of Fortran 90 with OpenMP for LU decomposition (Doolittle algorithm).
!
! See [http://www.openmp.org/] for the full OpenMP documentation.
!
! =================================================================================================
! Driver of the Fortran LU decomposition sample.
!
! Allocates three n*n real arrays (A, L, U) with n fixed at 3000, fills A with random
! values, factorises it once with lud_algorithm and once with lud_openmp_algorithm
! (both write into the same L and U), and prints the elapsed time of each run as
! measured with system_clock.
program lud_openmp
  implicit none

  integer, parameter :: n = 3000
  real, allocatable :: A(:)
  real, allocatable :: L(:)
  real, allocatable :: U(:)
  double precision :: t_config
  integer :: t1, t2, clock_rate, clock_max

  allocate(A(n*n))
  allocate(L(n*n))
  allocate(U(n*n))

  call fill_random(A, n, n)

  call system_clock (t1, clock_rate, clock_max )
  call lud_algorithm(A, L, U, n)
  call system_clock (t2, clock_rate, clock_max )
  ! Convert clock ticks to seconds in double precision, matching the type of t_config.
  t_config = dble ( t2 - t1 ) / dble ( clock_rate )
  print '("Time without OpenMp: ",f6.3," seconds.")', t_config

  call system_clock (t1, clock_rate, clock_max )
  call lud_openmp_algorithm(A, L, U, n)
  call system_clock (t2, clock_rate, clock_max )
  t_config = dble ( t2 - t1 ) / dble ( clock_rate )
  print '("Time with OpenMp: ",f6.3," seconds.")', t_config

  deallocate(A)
  deallocate(L)
  deallocate(U)
end program lud_openmp
! Fills the array a, which stores an n-by-m matrix in n*m consecutive elements, with
! pseudo-random numbers.
!
!   a - destination array of n*m reals
!   n - number of rows
!   m - number of columns
!
! Uses the standard random_number intrinsic (uniform values with 0 <= x < 1) in place of
! rand(), which is a compiler extension and not part of the Fortran standard.
subroutine fill_random(a, n, m)
  implicit none
  integer, intent(in) :: n, m
  real, dimension(n*m), intent(inout) :: a

  ! random_number fills every element of an array argument in a single call.
  call random_number(a)
end subroutine fill_random
! Sequential LU decomposition (Doolittle scheme, no pivoting) of an n-by-n matrix.
!
!   A - input matrix, n*n reals; element (row, col) is stored at (row-1)*n + col
!   L - output, unit lower-triangular factor (diagonal 1, entries above the diagonal 0)
!   U - output, upper-triangular factor
!   n - matrix order
!
! Step i first produces row i of U, then column i of L; both use values written by
! earlier steps, so the steps must run in order. Every column of the U row is recomputed,
! including those left of the diagonal. The code divides by U(i,i) without checking it
! for zero.
subroutine lud_algorithm(A, L, U, n)
  implicit none
  integer, intent(in) :: n
  real, dimension(n*n), intent(in) :: A
  real, dimension(n*n), intent(inout) :: L
  real, dimension(n*n), intent(inout) :: U
  integer :: i, j, k

  do i=1, n
    ! Row i of U.
    do j=1, n
      if (j>i) then
        U((j-1)*n+i) = 0
      end if
      U((i-1)*n+j) = A((i-1)*n+j)
      do k=1, i-1
        U((i-1)*n+j) = U((i-1)*n+j) - (U((k-1)*n+j) * L((i-1)*n+k))
      end do
    end do

    ! Column i of L.
    do j=1, n
      if(i>j) then
        L((j-1)*n+i) = 0
      else if (j==i) then
        L((j-1)*n+i) = 1
      else
        L((j-1)*n+i) = A((j-1)*n+i) / U((i-1)*n+i)
        do k=1, i-1
          L((j-1)*n+i) = L((j-1)*n+i) - ((U((k-1)*n+i) * L((j-1)*n+k)) / U((i-1)*n+i))
        end do
      end if
    end do
  end do
end subroutine lud_algorithm
subroutine lud_openmp_algorithm(A, L, U, n)
integer, intent(in) :: n
real, dimension(n*n), intent(in) :: A
real, dimension(n*n), intent(inout) :: L
real, dimension(n*n), intent(inout) :: U
integer :: i, j, k
!$OMP PARALLEL DO DEFAULT (SHARED) PRIVATE(i,j,k)
do i=1, n
do j=1, n
if (j>i) then
U((j-1)*n+i) = 0
end if
U((i-1)*n+j) = A((i-1)*n+j)
do k=1, i-1
U((i-1)*n+j) = U((i-1)*n+j) - (U((k-1)*n+j) * L((i-1)*n+k))
end do
end do