Commit 575fd4d4 authored by Thomas Steinreiter's avatar Thomas Steinreiter
Browse files

refactoring, reformating, added .clang-format

parent 6c88643d
Language: Cpp
DerivePointerAlignment: false
PointerAlignment: Left
IndentWidth: 4
TabWidth: 4
UseTab: ForIndentation
ColumnLimit: 80
AllowShortFunctionsOnASingleLine: true
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AllowShortBlocksOnASingleLine: true
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <functional>
#include <iostream>
#include <iterator>
#include <map>
#include <random>
#include <string>
#include <vector>
#include <boost/iterator/counting_iterator.hpp>
#include <boost/algorithm/string/join.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <mkl_cblas.h>
// use std::for_each(std::execution::par, ... ) in C++17 instead
template <typename UnaryFunction>
void omp_parallel_for(int first, int last, UnaryFunction f) {
#pragma omp parallel for // OpenMP 2.0 compatibility: signed loop variable
for (int i = first; i < last; ++i) {
f(i);
}
}
/// helper functions
template <typename ForwardIt> void fill_random(ForwardIt begin, ForwardIt end) {
std::random_device rndDev;
std::mt19937 rndEng{rndDev()};
using T = typename std::iterator_traits<ForwardIt>::value_type;
std::uniform_real_distribution<T> dist{-1.0, 1.0};
std::random_device rndDev;
std::mt19937 rndEng{rndDev()};
using T = typename std::iterator_traits<ForwardIt>::value_type;
std::uniform_real_distribution<T> dist{-1.0, 1.0};
std::generate(begin, end, [&] { return dist(rndEng); });
}
std::generate(begin, end, [&] { return dist(rndEng); });
template <typename T> auto timed(T&& f) {
const auto starttime = std::chrono::high_resolution_clock::now();
f();
return std::chrono::duration<double>{
std::chrono::high_resolution_clock::now() - starttime}
.count();
}
enum class Mode {
SerialIkj, //
SerialIjk, //
Parallel, //
Blas //
};
template <typename Map>[[noreturn]] void failInvalidArgs(const Map& modes) {
std::cerr << "invalid args!\nusage: prog "
<< boost::algorithm::join(
modes | boost::adaptors::transformed(
[](const auto& p) { return p.first; }),
"|")
<< "\n";
std::exit(EXIT_FAILURE);
}
/// gemm implementations
template <typename T>
void SerialIkj(const T *a, const T *b, T *c, std::size_t aRows,
void serialIkj(const T* a, const T* b, T* c, std::size_t aRows,
std::size_t aCols, std::size_t bCols) {
for (std::size_t i{0}; i < aRows; ++i) {
for (std::size_t k{0}; k < aCols; ++k) {
for (std::size_t j{0}; j < bCols; ++j) {
c[i * bCols + j] += a[i * aCols + k] * b[k * bCols + j];
}
}
}
for (std::size_t i{0}; i < aRows; ++i) {
for (std::size_t k{0}; k < aCols; ++k) {
for (std::size_t j{0}; j < bCols; ++j) {
c[i * bCols + j] += a[i * aCols + k] * b[k * bCols + j];
}
}
}
}
template <typename T>
void SerialIjk(const T *a, const T *b, T *c, std::size_t aRows,
void serialIjk(const T* a, const T* b, T* c, std::size_t aRows,
std::size_t aCols, std::size_t bCols) {
for (std::size_t i{0}; i < aRows; ++i) {
for (std::size_t j{0}; j < bCols; ++j) {
for (std::size_t k{0}; k < aCols; ++k) {
c[i * bCols + j] += a[i * aCols + k] * b[k * bCols + j];
}
}
}
for (std::size_t i{0}; i < aRows; ++i) {
for (std::size_t j{0}; j < bCols; ++j) {
for (std::size_t k{0}; k < aCols; ++k) {
c[i * bCols + j] += a[i * aCols + k] * b[k * bCols + j];
}
}
}
}
template <typename T>
void parallelGemm(const T *a, const T *b, T *c, std::size_t aRows,
void parallelGemm(const T* a, const T* b, T* c, std::size_t aRows,
std::size_t aCols, std::size_t bCols) {
omp_parallel_for(0, aRows, [&](auto i) {
for (std::size_t k{0}; k < aCols; ++k) {
for (std::size_t j{0}; j < bCols; ++j) {
c[i * bCols + j] += a[i * aCols + k] * b[k * bCols + j];
}
}
});
// use std::for_each(std::execution::par, ... ) in
// C++17 instead
#pragma omp parallel for
for (std::size_t i{0}; i < aRows; ++i) {
for (std::size_t k{0}; k < aCols; ++k) {
for (std::size_t j{0}; j < bCols; ++j) {
c[i * bCols + j] += a[i * aCols + k] * b[k * bCols + j];
}
}
}
}
void blasGemm(const float *a, const float *b, float *c, std::size_t aRows,
// float overload
void blasGemm(const float* a, const float* b, float* c, std::size_t aRows,
std::size_t aCols, std::size_t bCols) {
const auto m = aRows;
const auto k = aCols;
const auto n = bCols;
const float alf = 1;
const float bet = 0;
cblas_sgemm(CblasRowMajor, // CBLAS_LAYOUT layout,
CblasNoTrans, // CBLAS_TRANSPOSE TransA,
CblasNoTrans, // CBLAS_TRANSPOSE TransB,
m, // const int M,
n, // const int N,
k, // const int K,
alf, // const float alpha,
a, // const float *A,
k, // const int lda,
b, // const float *B,
n, // const int ldb,
bet, // const float beta,
c, // float *C,
n // const int ldc
);
const auto m = aRows;
const auto k = aCols;
const auto n = bCols;
const float alf = 1;
const float bet = 0;
cblas_sgemm(CblasRowMajor, // CBLAS_LAYOUT layout,
CblasNoTrans, // CBLAS_TRANSPOSE TransA,
CblasNoTrans, // CBLAS_TRANSPOSE TransB,
m, // const int M,
n, // const int N,
k, // const int K,
alf, // const float alpha,
a, // const float *A,
k, // const int lda,
b, // const float *B,
n, // const int ldb,
bet, // const float beta,
c, // float *C,
n // const int ldc
);
}
void blasGemm(const double *a, const double *b, double *c, std::size_t aRows,
// double overload
void blasGemm(const double* a, const double* b, double* c, std::size_t aRows,
std::size_t aCols, std::size_t bCols) {
const auto m = aRows;
const auto k = aCols;
const auto n = bCols;
const double alf = 1;
const double bet = 0;
cblas_dgemm(CblasRowMajor, // CBLAS_LAYOUT layout,
CblasNoTrans, // CBLAS_TRANSPOSE TransA,
CblasNoTrans, // CBLAS_TRANSPOSE TransB,
m, // const int M,
n, // const int N,
k, // const int K,
alf, // const float alpha,
a, // const float *A,
k, // const int lda,
b, // const float *B,
n, // const int ldb,
bet, // const float beta,
c, // float *C,
n // const int ldc
);
}
template <typename T> auto timed(T &&f) {
const auto starttime = std::chrono::high_resolution_clock::now();
f();
return std::chrono::duration<double>{
std::chrono::high_resolution_clock::now() - starttime}
.count();
}
// dispatching to the impls
template <typename T>
void gemm(const Mode mode, const T *a, const T *b, T *c, std::size_t aRows,
std::size_t aCols, std::size_t bCols) {
switch (mode) {
case Mode::SerialIkj:
SerialIkj(a, b, c, aRows, aCols, bCols);
break;
case Mode::SerialIjk:
SerialIjk(a, b, c, aRows, aCols, bCols);
break;
case Mode::Parallel:
parallelGemm(a, b, c, aRows, aCols, bCols);
break;
case Mode::Blas:
blasGemm(a, b, c, aRows, aCols, bCols);
break;
default:
std::cerr << "unsupported mode!\n";
std::exit(EXIT_FAILURE);
}
}
[[noreturn]] void failInvalidArgs() {
std::cout << "invalid args!\nusage: prog "
"serialikj|serialijk|parallel|optimized|blas\n";
std::exit(EXIT_FAILURE);
const auto m = aRows;
const auto k = aCols;
const auto n = bCols;
const double alf = 1;
const double bet = 0;
cblas_dgemm(CblasRowMajor, // CBLAS_LAYOUT layout,
CblasNoTrans, // CBLAS_TRANSPOSE TransA,
CblasNoTrans, // CBLAS_TRANSPOSE TransB,
m, // const int M,
n, // const int N,
k, // const int K,
alf, // const double alpha,
a, // const double *A,
k, // const int lda,
b, // const double *B,
n, // const int ldb,
bet, // const double beta,
c, // double *C,
n // const int ldc
);
}
int main(int argc, char *argv[]) {
using Real = float;
if (argc != 2) {
failInvalidArgs();
}
const auto mode = [&]() {
auto const m{std::string(argv[1])};
if (m == "serialikj") {
return Mode::SerialIkj;
} else if (m == "serialijk") {
return Mode::SerialIjk;
} else if (m == "parallel") {
return Mode::Parallel;
} else if (m == "blas") {
return Mode::Blas;
} else {
failInvalidArgs();
}
}();
const auto minSize = 2u;
const auto maxSize = 2048;
const auto maxRepetitions = 2048u;
// do measurements with increasing matrix dimensions and decreasing
// repetitions to keep wall clock time short
auto repetitions = maxRepetitions;
for (std::size_t size{minSize}; size <= maxSize;
size *= 2, repetitions /= 2) {
/// set up data
const auto aRows = size;
const auto aCols = size;
const auto bRows = size;
const auto bCols = size;
std::vector<Real> a(aRows * aCols); // m * k
std::vector<Real> b(bRows * bCols); // k * n
std::vector<Real> c(aRows * bCols); // m * n
fill_random(std::begin(a), std::end(a));
fill_random(std::begin(b), std::end(b));
/// warm up the caches
gemm(mode, a.data(), b.data(), c.data(), aRows, aCols, bCols);
/// timed computations
auto time = 0.0;
for (std::size_t r{0}; r < repetitions; ++r) {
std::fill(std::begin(c), std::end(c), 0);
time += timed([&]() {
gemm(mode, a.data(), b.data(), c.data(), aRows, aCols, bCols);
});
// access the output to prevent unwanted compiler optimizations
std::ostream null{nullptr};
std::copy(std::begin(c), std::end(c), std::ostream_iterator<Real>{null});
}
time /= repetitions; // get avg time per call
std::cout << size << ";" << time << '\n';
}
int main(int argc, char* argv[]) {
using Real = float;
using gemmFuncType = void(const Real*, const Real*, Real*, std::size_t,
std::size_t, std::size_t);
using gemmFuncPrtType = void (*)(const Real*, const Real*, Real*,
std::size_t, std::size_t, std::size_t);
// this map manages all implementations
std::map<std::string, std::function<gemmFuncType>> modes{
{"serialikj", &serialIkj<Real>},
{"serialijk", &serialIjk<Real>},
{"parallel", &parallelGemm<Real>},
{"blas", static_cast<gemmFuncPrtType>(&blasGemm)}};
// validate cmd args and select gemm implementation
if (argc != 2) { failInvalidArgs(modes); }
const auto modeArg{std::string(argv[1])};
if (modes.count(modeArg) != 1) { failInvalidArgs(modes); }
const auto gemm = modes[modeArg];
// benchmark sizes
const auto minSize = 2u;
const auto maxSize = 2048u;
const auto maxRepetitions = 2048u;
// do measurements with increasing matrix dimensions and decreasing
// repetitions to keep wall clock time short
std::size_t repetitions{};
std::size_t size{};
for (size = minSize, repetitions = maxRepetitions; size <= maxSize;
size *= 2, repetitions /= 2) {
/// set up data
const auto aRows = size;
const auto aCols = size;
const auto bRows = size;
const auto bCols = size;
std::vector<Real> a(aRows * aCols); // m * k
std::vector<Real> b(bRows * bCols); // k * n
std::vector<Real> c(aRows * bCols); // m * n
fill_random(std::begin(a), std::end(a));
fill_random(std::begin(b), std::end(b));
// create reference solution matrix using serialIkj
std::vector<Real> reference(c.size());
serialIkj(a.data(), b.data(), reference.data(), aRows, aCols, bCols);
/// warm up the caches
gemm(a.data(), b.data(), c.data(), aRows, aCols, bCols);
/// timed computations
auto time = 0.0;
for (std::size_t r{0}; r < repetitions; ++r) {
std::fill(std::begin(c), std::end(c), 0);
time += timed([&]() {
gemm(a.data(), b.data(), c.data(), aRows, aCols, bCols);
});
// validate the result c
if (!std::equal(std::begin(c), std::end(c), std::begin(reference),
std::end(reference),
[](const Real& l, const Real& r) {
return std::abs(l - r) < 1.0e-3;
})) {
std::cerr << "validation error!\n";
return EXIT_FAILURE;
}
}
time /= repetitions; // get avg time per call
std::cout << size << ";" << time << '\n';
}
}
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment