Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
CodeVault
hpc-kernels
dense_linear_algebra
Commits
6c88643d
Commit
6c88643d
authored
Oct 12, 2016
by
Thomas Steinreiter
Browse files
initial commit of gemm benchmark: serial, OpenMP and MKL BLAS
parent
85db803d
Changes
3
Hide whitespace changes
Inline
Side-by-side
gemm/gemm_openmp_mkl/CMakeLists.txt
0 → 100755
View file @
6c88643d
# Build script for the GEMM benchmark executable built from main.cpp.
# Packages are optional: if they are not present, certain code samples are not compiled
cmake_minimum_required(VERSION 2.8.10 FATAL_ERROR)

# None of the packages is REQUIRED; a missing one only causes the target to be
# skipped in the if/else further down.
find_package(OpenMP)
find_package(Boost 1.58.0)

# Shared project CMake file (contents are not visible from this script).
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/common.cmake)

find_package(MKL)
# Unconditionally sets the MKL link line, replacing whatever value
# find_package(MKL) may have stored in MKL_LIBRARIES.
# NOTE(review): mkl_gnu_thread is hard-coded even though an Intel compiler
# branch exists below - confirm this is the intended library for both compilers.
set(MKL_LIBRARIES mkl_intel_lp64 mkl_gnu_thread mkl_core)

# ==================================================================================================

# Fall back to the default prefix when DWARF_PREFIX is unset or empty.
if("${DWARF_PREFIX}" STREQUAL "")
  set(DWARF_PREFIX 1_dense)
endif()

# Name of the executable target.
# NOTE(review): the suffix says "tbb_lapack" although this target is built with
# OpenMP and MKL - looks like a copy-paste leftover. Renaming would change the
# installed binary name, so confirm with users of the binary first.
set(NAME ${DWARF_PREFIX}_gemm_tbb_lapack)

if(OPENMP_FOUND AND Boost_FOUND AND MKL_FOUND)
  enable_language(CXX)
  include_directories(${Boost_INCLUDE_DIR} ${MKL_INCLUDE_DIR})
  link_directories(${MKL_LIBRARY_DIR})
  add_executable(${NAME} main.cpp)

  # Overwrites any build type previously selected (e.g. on the command line).
  set(CMAKE_BUILD_TYPE RelWithDebInfo)
  add_definitions(-DGSL_UNENFORCED_ON_CONTRACT_VIOLATION)

  # Compiler-specific flags: tune for the build host in both branches;
  # warnings are enabled for GNU only.
  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Wextra")
  elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
    # -std=c++14 is passed explicitly here in addition to CXX_STANDARD below.
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xHost -std=c++14")
  endif()

  set_target_properties(${NAME} PROPERTIES CXX_STANDARD 14 CXX_STANDARD_REQUIRED YES)
  # The OpenMP flags are needed both when compiling and when linking.
  set_target_properties(${NAME} PROPERTIES COMPILE_FLAGS "${OpenMP_CXX_FLAGS}")
  set_target_properties(${NAME} PROPERTIES LINK_FLAGS "${OpenMP_CXX_FLAGS}")
  target_link_libraries(${NAME} ${Boost_LIBRARIES} ${MKL_LIBRARIES})
  install(TARGETS ${NAME} DESTINATION bin)
  message("** Enabling '${NAME}': with OpenMP and MKL and Boost")
else()
  message("## Skipping '${NAME}': OpenMP or MKL or Boost support missing")
  # dummy_install(${NAME} "MPI")
endif()

# C_FLAGS is not defined in this file; presumably it comes from common.cmake
# or a parent scope - TODO confirm.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS}")

# Do not leak the helper variable to whoever includes this directory.
unset(NAME)

# ==================================================================================================
gemm/gemm_openmp_mkl/benchmarks/chart.xlsx
0 → 100755
View file @
6c88643d
File added
gemm/gemm_openmp_mkl/main.cpp
0 → 100755
View file @
6c88643d
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <random>
#include <string>
#include <vector>

#include <boost/iterator/counting_iterator.hpp>
#include <mkl_cblas.h>
// Calls f(i) once for every i in the half-open range [first, last). The loop
// carries an OpenMP "parallel for" directive, so the iterations may run
// concurrently when the file is compiled with OpenMP enabled.
// In C++17, std::for_each(std::execution::par, ...) can be used instead.
template <typename UnaryFunction>
void omp_parallel_for(int first, int last, UnaryFunction f) {
    // The loop variable is a signed int for OpenMP 2.0 compatibility.
#pragma omp parallel for
    for (int index = first; index < last; index++) {
        f(index);
    }
}
/// Overwrites every element of [begin, end) with a pseudo-random value.
///
/// Values come from a std::uniform_real_distribution over the iterator's
/// value type, constructed with the bounds -1.0 and 1.0. A fresh
/// std::mt19937 engine is seeded from std::random_device on every call, so
/// the produced sequence is not reproducible between calls.
template <typename ForwardIt>
void fill_random(ForwardIt begin, ForwardIt end) {
    using Value = typename std::iterator_traits<ForwardIt>::value_type;

    std::random_device seedSource;
    std::mt19937 engine{seedSource()};
    std::uniform_real_distribution<Value> distribution{-1.0, 1.0};

    for (ForwardIt it = begin; it != end; ++it) {
        *it = distribution(engine);
    }
}
/// Selects which matrix-multiplication implementation gemm() dispatches to.
enum class Mode {
    SerialIkj,  ///< SerialIkj(): single-threaded loops nested in i-k-j order
    SerialIjk,  ///< SerialIjk(): single-threaded loops nested in i-j-k order
    Parallel,   ///< parallelGemm(): rows handed to omp_parallel_for
    Blas        ///< blasGemm(): cblas_sgemm / cblas_dgemm
};
/// Single-threaded matrix product with loops nested in i-k-j order.
///
/// All matrices are dense and indexed row-major: a is aRows x aCols, b is
/// aCols x bCols and c is aRows x bCols. The products are *added* to the
/// existing contents of c (c += a * b), so the caller has to zero c first to
/// obtain the plain product.
template <typename T>
void SerialIkj(const T* a, const T* b, T* c, std::size_t aRows, std::size_t aCols,
               std::size_t bCols) {
    for (std::size_t row = 0; row != aRows; ++row) {
        const T* const aRow = a + row * aCols;
        T* const cRow = c + row * bCols;
        for (std::size_t inner = 0; inner != aCols; ++inner) {
            const T* const bRow = b + inner * bCols;
            // Innermost loop walks a row of b and a row of c contiguously.
            for (std::size_t col = 0; col != bCols; ++col) {
                cRow[col] += aRow[inner] * bRow[col];
            }
        }
    }
}
/// Single-threaded matrix product with loops nested in i-j-k order.
///
/// All matrices are dense and indexed row-major: a is aRows x aCols, b is
/// aCols x bCols and c is aRows x bCols. Each output element receives the
/// dot product of a row of a with a column of b, *added* to its previous
/// value (c += a * b); the caller has to zero c to obtain the plain product.
template <typename T>
void SerialIjk(const T* a, const T* b, T* c, std::size_t aRows, std::size_t aCols,
               std::size_t bCols) {
    for (std::size_t row = 0; row != aRows; ++row) {
        const T* const aRow = a + row * aCols;
        T* const cRow = c + row * bCols;
        for (std::size_t col = 0; col != bCols; ++col) {
            T& cell = cRow[col];
            // Innermost loop strides through b column-wise (step bCols).
            for (std::size_t inner = 0; inner != aCols; ++inner) {
                cell += aRow[inner] * b[inner * bCols + col];
            }
        }
    }
}
/// Matrix product c += a * b with the rows of c handed to omp_parallel_for.
///
/// Matrices are dense and indexed row-major: a is aRows x aCols, b is
/// aCols x bCols, c is aRows x bCols. Every invocation of the row lambda
/// writes only to its own row of c. Within a row the loops run in k-j order,
/// like SerialIkj.
///
/// Note: omp_parallel_for takes int bounds, so aRows is implicitly converted
/// to int at the call.
template <typename T>
void parallelGemm(const T* a, const T* b, T* c, std::size_t aRows, std::size_t aCols,
                  std::size_t bCols) {
    // Computes one complete row of c; `row` arrives as the helper's int index.
    const auto multiplyRow = [&](auto row) {
        for (std::size_t inner = 0; inner != aCols; ++inner) {
            for (std::size_t col = 0; col != bCols; ++col) {
                c[row * bCols + col] += a[row * aCols + inner] * b[inner * bCols + col];
            }
        }
    };
    omp_parallel_for(0, aRows, multiplyRow);
}
/// Single-precision matrix product delegated to CBLAS (cblas_sgemm).
///
/// a is aRows x aCols, b is aCols x bCols and c is aRows x bCols, all row-major
/// and untransposed, with the leading dimension equal to the column count.
/// alpha is 1 and beta is 0, i.e. the call is made with the scaling factor for
/// the previous contents of c set to zero - unlike the loop-based variants in
/// this file, which add to c.
///
/// The std::size_t dimensions are implicitly converted to the integer type of
/// the CBLAS interface.
void blasGemm(const float* a, const float* b, float* c, std::size_t aRows, std::size_t aCols,
              std::size_t bCols) {
    cblas_sgemm(CblasRowMajor,  // layout
                CblasNoTrans,   // TransA
                CblasNoTrans,   // TransB
                aRows,          // M
                bCols,          // N
                aCols,          // K
                1.0f,           // alpha
                a,              // A
                aCols,          // lda
                b,              // B
                bCols,          // ldb
                0.0f,           // beta
                c,              // C
                bCols);         // ldc
}
/// Double-precision matrix product delegated to CBLAS (cblas_dgemm).
///
/// a is aRows x aCols, b is aCols x bCols and c is aRows x bCols, all row-major
/// and untransposed, with the leading dimension equal to the column count.
/// alpha is 1 and beta is 0, i.e. the call is made with the scaling factor for
/// the previous contents of c set to zero - unlike the loop-based variants in
/// this file, which add to c.
///
/// The std::size_t dimensions are implicitly converted to the integer type of
/// the CBLAS interface.
void blasGemm(const double* a, const double* b, double* c, std::size_t aRows, std::size_t aCols,
              std::size_t bCols) {
    cblas_dgemm(CblasRowMajor,  // layout
                CblasNoTrans,   // TransA
                CblasNoTrans,   // TransB
                aRows,          // M
                bCols,          // N
                aCols,          // K
                1.0,            // alpha (double)
                a,              // A (const double*)
                aCols,          // lda
                b,              // B (const double*)
                bCols,          // ldb
                0.0,            // beta (double)
                c,              // C (double*)
                bCols);         // ldc
}
/// Invokes f() exactly once and returns the elapsed time in seconds (double).
///
/// The interval is measured with std::chrono::steady_clock, which is
/// monotonic. std::chrono::high_resolution_clock may be an alias of
/// system_clock, in which case a wall-clock adjustment during the measurement
/// could distort the result or even make it negative.
///
/// @param f  callable taking no arguments; its return value is ignored.
/// @return   seconds spent inside f(), never negative.
template <typename T>
auto timed(T&& f) {
    using Clock = std::chrono::steady_clock;

    const auto start = Clock::now();
    f();
    const auto stop = Clock::now();

    return std::chrono::duration<double>{stop - start}.count();
}
/// Dispatches a matrix product to the implementation selected by `mode`.
///
/// a, b and c together with the dimensions are forwarded unchanged to
/// SerialIkj, SerialIjk, parallelGemm or blasGemm. A `mode` value that matches
/// none of the enumerators prints "unsupported mode!" to std::cerr and
/// terminates the process with EXIT_FAILURE.
template <typename T>
void gemm(const Mode mode, const T* a, const T* b, T* c, std::size_t aRows, std::size_t aCols,
          std::size_t bCols) {
    switch (mode) {
        case Mode::SerialIkj:
            SerialIkj(a, b, c, aRows, aCols, bCols);
            return;
        case Mode::SerialIjk:
            SerialIjk(a, b, c, aRows, aCols, bCols);
            return;
        case Mode::Parallel:
            parallelGemm(a, b, c, aRows, aCols, bCols);
            return;
        case Mode::Blas:
            blasGemm(a, b, c, aRows, aCols, bCols);
            return;
    }

    // Only reachable when `mode` holds a value outside the enumerators above.
    std::cerr << "unsupported mode!\n";
    std::exit(EXIT_FAILURE);
}
/// Reports invalid command-line arguments and terminates with EXIT_FAILURE.
///
/// The usage text lists exactly the mode names main() accepts. The message
/// goes to std::cerr so that it cannot end up in the "size;time" benchmark
/// output that main() writes to std::cout.
[[noreturn]] void failInvalidArgs() {
    std::cerr << "invalid args!\nusage: prog "
                 "serialikj|serialijk|parallel|blas\n";
    std::exit(EXIT_FAILURE);
}
int
main
(
int
argc
,
char
*
argv
[])
{
using
Real
=
float
;
if
(
argc
!=
2
)
{
failInvalidArgs
();
}
const
auto
mode
=
[
&
]()
{
auto
const
m
{
std
::
string
(
argv
[
1
])};
if
(
m
==
"serialikj"
)
{
return
Mode
::
SerialIkj
;
}
else
if
(
m
==
"serialijk"
)
{
return
Mode
::
SerialIjk
;
}
else
if
(
m
==
"parallel"
)
{
return
Mode
::
Parallel
;
}
else
if
(
m
==
"blas"
)
{
return
Mode
::
Blas
;
}
else
{
failInvalidArgs
();
}
}();
const
auto
minSize
=
2u
;
const
auto
maxSize
=
2048
;
const
auto
maxRepetitions
=
2048u
;
// do measurements with increasing matrix dimensions and decreasing
// repetitions to keep wall clock time short
auto
repetitions
=
maxRepetitions
;
for
(
std
::
size_t
size
{
minSize
};
size
<=
maxSize
;
size
*=
2
,
repetitions
/=
2
)
{
/// set up data
const
auto
aRows
=
size
;
const
auto
aCols
=
size
;
const
auto
bRows
=
size
;
const
auto
bCols
=
size
;
std
::
vector
<
Real
>
a
(
aRows
*
aCols
);
// m * k
std
::
vector
<
Real
>
b
(
bRows
*
bCols
);
// k * n
std
::
vector
<
Real
>
c
(
aRows
*
bCols
);
// m * n
fill_random
(
std
::
begin
(
a
),
std
::
end
(
a
));
fill_random
(
std
::
begin
(
b
),
std
::
end
(
b
));
/// warm up the caches
gemm
(
mode
,
a
.
data
(),
b
.
data
(),
c
.
data
(),
aRows
,
aCols
,
bCols
);
/// timed computations
auto
time
=
0.0
;
for
(
std
::
size_t
r
{
0
};
r
<
repetitions
;
++
r
)
{
std
::
fill
(
std
::
begin
(
c
),
std
::
end
(
c
),
0
);
time
+=
timed
([
&
]()
{
gemm
(
mode
,
a
.
data
(),
b
.
data
(),
c
.
data
(),
aRows
,
aCols
,
bCols
);
});
// access the output to prevent unwanted compiler optimizations
std
::
ostream
null
{
nullptr
};
std
::
copy
(
std
::
begin
(
c
),
std
::
end
(
c
),
std
::
ostream_iterator
<
Real
>
{
null
});
}
time
/=
repetitions
;
// get avg time per call
std
::
cout
<<
size
<<
";"
<<
time
<<
'\n'
;
}
}
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment