-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark.cpp
More file actions
155 lines (137 loc) · 4.42 KB
/
benchmark.cpp
File metadata and controls
155 lines (137 loc) · 4.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/*
* Playing around with parallelizing std::partial_sum
* by Paul Dreik https://www.pauldreik.se/
* LICENSE: http://www.boost.org/LICENSE_1_0.txt
*/
#include "include/parpartial.h"
#include <chrono>
#include <iostream>
#include <vector>
// gnu parallel stl - see
// https://gcc.gnu.org/onlinedocs/libstdc++/manual/parallel_mode_using.html
#if defined(__GNUC__)
#include <parallel/numeric>
#endif
// is fast math enabled?
constexpr char isFastMathEnabled() {
#if defined(__GNUC__) || defined(__clang__)
#ifdef __FAST_MATH__
return 'y';
#else
return 'n';
#endif
#endif
return '?';
}
enum AlgoName {
MY_PARTIAL_SUM = 1,
STD_PARTIAL_SUM,
MY_PARTIAL_SUM_STDASYNC,
MY_PARTIAL_SUM_V2,
GNU_PARALLEL_PARTIAL_SUM,
};
// function for knowing which of the algorithms need a number of threads/block
// size hint.
static constexpr bool needsBlockSize(const AlgoName algo) {
return (algo == MY_PARTIAL_SUM || algo == MY_PARTIAL_SUM_STDASYNC ||
algo == MY_PARTIAL_SUM_V2);
}
template <typename Numeric>
void doBenchmarkCase(const std::size_t Ndata, const std::size_t Nthreads,
const std::vector<Numeric> &input,
std::vector<Numeric> &output, const int algo) {
// make sure arrays are valid
output.at(Ndata - 1) = input.at(Ndata - 1);
whatever::Options options;
options.Nthreads = Nthreads;
options.execute_in_threads = true;
switch (algo) {
case AlgoName::MY_PARTIAL_SUM:
// uses my implementation with threads
whatever::par_partial_sum(input.cbegin(), input.cbegin() + Ndata,
output.begin(), options);
break;
case AlgoName::STD_PARTIAL_SUM:
// uses std implementation, singlethreaded
std::partial_sum(input.cbegin(), input.cbegin() + Ndata, output.begin());
break;
case AlgoName::MY_PARTIAL_SUM_STDASYNC:
// my implementation, with std::async instead of explicit threads.
whatever::par_partial_sum_async(input.cbegin(), input.cbegin() + Ndata,
output.begin(), Nthreads);
break;
case AlgoName::MY_PARTIAL_SUM_V2:
// uses my implementation with threads, tweaked a bit
whatever::par_partial_sum_v2(input.cbegin(), input.cbegin() + Ndata,
output.begin(), options);
break;
#if defined(__GNUC__)
case AlgoName::GNU_PARALLEL_PARTIAL_SUM:
// gnu parallel
__gnu_parallel::partial_sum(input.cbegin(), input.cbegin() + Ndata,
output.begin());
break;
#endif
default:
throw "invalid algo";
}
}
template <class Duration> double toSeconds(const Duration &d) {
return std::chrono::duration_cast<std::chrono::duration<double>>(d).count();
}
// runs a single case multiple times
template <typename Numeric>
void runCase(const std::size_t Ndata, const std::size_t Nthreads,
const std::vector<Numeric> &input, std::vector<Numeric> &output,
const int algo) {
const auto Nruns = input.size() / Ndata;
const auto t1 = std::chrono::steady_clock::now();
for (std::size_t i = 0; i < Nruns; ++i) {
doBenchmarkCase(Ndata, Nthreads, input, output, algo);
(void)&output.back();
}
const auto t2 = std::chrono::steady_clock::now();
auto Nvaluesrun = Nruns * Ndata;
// ns per element
const auto ns_per_element = toSeconds(t2 - t1) * 1e9 / Nvaluesrun;
std::cout << algo << ',' << Nruns << ',' << Nthreads << ',' << Ndata << ','
<< ns_per_element << ',' << isFastMathEnabled() << '\n';
std::cout.flush();
}
int main() {
std::vector<float> input;
const std::size_t N = 160'000'000;
// fill with garbage
input.resize(N);
// touch the memory
// const auto t0 = std::chrono::steady_clock::now();
for (auto &e : input) {
e = 1;
}
// also allocate results
auto output = input;
// which algos to test
const auto algos = {
MY_PARTIAL_SUM,
STD_PARTIAL_SUM,
MY_PARTIAL_SUM_STDASYNC,
MY_PARTIAL_SUM_V2,
#if defined(__GNUC__)
GNU_PARALLEL_PARTIAL_SUM,
#endif
};
// Start of benchmarking
std::cout << "algo,nruns,nthreads,ndata,nsperelement,fastmath\n";
for (const auto algo : algos) {
for (const auto Nblock : {100'000, 1'000'000, 10'000'000, 20'000'000,
40'000'000, 160'000'000}) {
if (needsBlockSize(algo)) {
for (const auto Nthreads : {1, 2, 4, 8}) {
runCase(Nblock, Nthreads, input, output, algo);
}
} else {
runCase(Nblock, 1, input, output, algo);
}
}
}
}