Skip to content

Commit 96e8966

Browse files
authored
comparison for load/store vs loadu/storeu (#21)
* comparison for load/store vs loadu/storeu
1 parent c29de5f commit 96e8966

2 files changed

Lines changed: 257 additions & 0 deletions

File tree

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ target_link_libraries(benchmarks
5959
benchmark
6060
benchmark_main
6161
pasta_bit_vector)
62+
6263
add_test(
6364
NAME Benchmarks
6465
COMMAND bt_benchmarks --benchmark_out=bm_report.csv --benchmark_out_format=csv
@@ -104,6 +105,17 @@ target_link_libraries(test_rmm
104105
gtest_main)
105106

106107

108+
# Stand-alone micro-benchmark executable comparing the aligned (load/store)
# and unaligned (loadu/storeu) SIMD memory intrinsics.
add_executable(alignment_comparison src/alignment_comparison.cpp)

# Put the include/ directory on the header search path of the target.
target_include_directories(alignment_comparison PUBLIC include)

# Google Benchmark itself plus the library that supplies its main().
target_link_libraries(alignment_comparison benchmark benchmark_main)
107119
FetchContent_Declare(
108120
doxygen-awesome-css
109121
URL https://github.com/jothepro/doxygen-awesome-css/archive/refs/heads/main.zip

src/alignment_comparison.cpp

Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
#include <benchmark/benchmark.h>
2+
#include <immintrin.h>
3+
4+
#include <chrono>
5+
#include <random>
6+
#include <vector>
7+
8+
#include "bits.h"
9+
10+
// Scratch buffer that every benchmark in this file reads from or writes to.
//
// It is aligned to 64 bytes so that any offset that is a multiple of 64 is a
// valid address for the aligned load/store intrinsics.
//
// The benchmarks address up to 2^23 slots of 64 bytes (k <= 23, i.e. 2^29
// bytes). The "crossing" variants start their access 48 bytes into a slot, so
// a 64-byte access in the last slot ends 48 bytes past 2^29. A full extra
// 64-byte line of tail padding keeps that access inside the array.
alignas(64) uint8_t data[(1 << 29) + 64];
11+
12+
#ifdef PIXIE_AVX512_SUPPORT
13+
14+
// Times the unaligned 512-bit load intrinsic on addresses that are multiples
// of 64 bytes from the start of `data`.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k 64-byte slots using the low `k` bits of a fixed-seed Mersenne Twister.
// The random draw happens inside the timed loop.
static void BM_Loadu512_aligned(benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    // Slot index times 64 gives the byte offset of the slot.
    const size_t byte_offset = (generator() & slot_mask) << 6;
    const auto* src = reinterpret_cast<const __m512i*>(data + byte_offset);

    benchmark::DoNotOptimize(_mm512_loadu_si512(src));
  }
}
25+
26+
// Times the unaligned 512-bit load intrinsic on addresses that sit 48 bytes
// into a 64-byte slot of `data`, so each 64-byte load spans two slots.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k slots using the low `k` bits of a fixed-seed Mersenne Twister. The
// random draw happens inside the timed loop.
static void BM_Loadu512_unaligned_crossing_64byte_border(
    benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    const size_t slot_start = (generator() & slot_mask) << 6;
    // Shift by 48 bytes so the access straddles the next 64-byte boundary.
    const size_t byte_offset = slot_start + 48;
    const auto* src = reinterpret_cast<const __m512i*>(data + byte_offset);

    benchmark::DoNotOptimize(_mm512_loadu_si512(src));
  }
}
38+
39+
// Times the aligned 512-bit load intrinsic on addresses that are multiples of
// 64 bytes from the start of `data`.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k 64-byte slots using the low `k` bits of a fixed-seed Mersenne Twister.
// The random draw happens inside the timed loop.
static void BM_Load512_aligned(benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    // Slot index times 64 gives the byte offset of the slot.
    const size_t byte_offset = (generator() & slot_mask) << 6;
    const auto* src = reinterpret_cast<const __m512i*>(data + byte_offset);

    benchmark::DoNotOptimize(_mm512_load_si512(src));
  }
}
50+
51+
// Times the unaligned 512-bit store intrinsic on addresses that are multiples
// of 64 bytes from the start of `data`. A zero vector is written each time.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k 64-byte slots using the low `k` bits of a fixed-seed Mersenne Twister.
// The random draw happens inside the timed loop.
static void BM_Storeu512_aligned(benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    // Slot index times 64 gives the byte offset of the slot.
    const size_t byte_offset = (generator() & slot_mask) << 6;
    __m512i* dst = reinterpret_cast<__m512i*>(data + byte_offset);
    const __m512i zeros = _mm512_setzero_si512();

    _mm512_storeu_si512(dst, zeros);

    benchmark::DoNotOptimize(dst);
  }
}
65+
66+
// Times the unaligned 512-bit store intrinsic on addresses that sit 48 bytes
// into a 64-byte slot of `data`, so each 64-byte store spans two slots. A zero
// vector is written each time.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k slots using the low `k` bits of a fixed-seed Mersenne Twister. The
// random draw happens inside the timed loop.
static void BM_Storeu512_unaligned_crossing_64byte_border(
    benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    const size_t slot_start = (generator() & slot_mask) << 6;
    // Shift by 48 bytes so the access straddles the next 64-byte boundary.
    const size_t byte_offset = slot_start + 48;
    __m512i* dst = reinterpret_cast<__m512i*>(data + byte_offset);
    const __m512i zeros = _mm512_setzero_si512();

    _mm512_storeu_si512(dst, zeros);

    benchmark::DoNotOptimize(dst);
  }
}
81+
82+
// Times the aligned 512-bit store intrinsic on addresses that are multiples of
// 64 bytes from the start of `data`. A zero vector is written each time.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k 64-byte slots using the low `k` bits of a fixed-seed Mersenne Twister.
// The random draw happens inside the timed loop.
static void BM_Store512_aligned(benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    // Slot index times 64 gives the byte offset of the slot.
    const size_t byte_offset = (generator() & slot_mask) << 6;
    __m512i* dst = reinterpret_cast<__m512i*>(data + byte_offset);
    const __m512i zeros = _mm512_setzero_si512();

    _mm512_store_si512(dst, zeros);

    benchmark::DoNotOptimize(dst);
  }
}
96+
97+
// Register the AVX-512 benchmarks. Each one is run with the argument "k"
// taking the values 1, 3, 5, ..., 23.

// Loads.
BENCHMARK(BM_Loadu512_aligned)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

BENCHMARK(BM_Loadu512_unaligned_crossing_64byte_border)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

BENCHMARK(BM_Load512_aligned)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

// Stores.
BENCHMARK(BM_Storeu512_aligned)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

BENCHMARK(BM_Storeu512_unaligned_crossing_64byte_border)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

BENCHMARK(BM_Store512_aligned)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);
112+
113+
#else
114+
115+
// Times the unaligned 256-bit load intrinsic on addresses that are multiples
// of 64 bytes from the start of `data`.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k 64-byte slots using the low `k` bits of a fixed-seed Mersenne Twister.
// The random draw happens inside the timed loop.
static void BM_Loadu256_aligned(benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    // Slot index times 64 gives the byte offset of the slot.
    const size_t byte_offset = (generator() & slot_mask) << 6;
    const auto* src = reinterpret_cast<const __m256i*>(data + byte_offset);

    benchmark::DoNotOptimize(_mm256_loadu_si256(src));
  }
}
126+
127+
// Times the unaligned 256-bit load intrinsic on addresses that sit 16 bytes
// into a 64-byte slot of `data`: not 32-byte aligned, but the 32-byte load
// stays inside a single slot.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k slots using the low `k` bits of a fixed-seed Mersenne Twister. The
// random draw happens inside the timed loop.
static void BM_Loadu256_unaligned(benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    const size_t slot_start = (generator() & slot_mask) << 6;
    // Shift by 16 bytes to break the 32-byte alignment.
    const size_t byte_offset = slot_start + 16;
    const auto* src = reinterpret_cast<const __m256i*>(data + byte_offset);

    benchmark::DoNotOptimize(_mm256_loadu_si256(src));
  }
}
138+
139+
// Times the unaligned 256-bit load intrinsic on addresses that sit 48 bytes
// into a 64-byte slot of `data`, so each 32-byte load spans two slots.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k slots using the low `k` bits of a fixed-seed Mersenne Twister. The
// random draw happens inside the timed loop.
static void BM_Loadu256_unaligned_crossing_64byte_border(
    benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    const size_t slot_start = (generator() & slot_mask) << 6;
    // Shift by 48 bytes so the access straddles the next 64-byte boundary.
    const size_t byte_offset = slot_start + 48;
    const auto* src = reinterpret_cast<const __m256i*>(data + byte_offset);

    benchmark::DoNotOptimize(_mm256_loadu_si256(src));
  }
}
151+
152+
// Times the aligned 256-bit load intrinsic on addresses that are multiples of
// 64 bytes from the start of `data`.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k 64-byte slots using the low `k` bits of a fixed-seed Mersenne Twister.
// The random draw happens inside the timed loop.
static void BM_Load256_aligned(benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    // Slot index times 64 gives the byte offset of the slot.
    const size_t byte_offset = (generator() & slot_mask) << 6;
    const auto* src = reinterpret_cast<const __m256i*>(data + byte_offset);

    benchmark::DoNotOptimize(_mm256_load_si256(src));
  }
}
163+
164+
// Times the unaligned 256-bit store intrinsic on addresses that are multiples
// of 64 bytes from the start of `data`. A zero vector is written each time.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k 64-byte slots using the low `k` bits of a fixed-seed Mersenne Twister.
// The random draw happens inside the timed loop.
static void BM_Storeu256_aligned(benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    // Slot index times 64 gives the byte offset of the slot.
    const size_t byte_offset = (generator() & slot_mask) << 6;
    __m256i* dst = reinterpret_cast<__m256i*>(data + byte_offset);
    const __m256i zeros = _mm256_setzero_si256();

    _mm256_storeu_si256(dst, zeros);

    benchmark::DoNotOptimize(dst);
  }
}
178+
179+
// Times the unaligned 256-bit store intrinsic on addresses that sit 16 bytes
// into a 64-byte slot of `data`: not 32-byte aligned, but the 32-byte store
// stays inside a single slot. A zero vector is written each time.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k slots using the low `k` bits of a fixed-seed Mersenne Twister. The
// random draw happens inside the timed loop.
static void BM_Storeu256_unaligned(benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    const size_t slot_start = (generator() & slot_mask) << 6;
    // Shift by 16 bytes to break the 32-byte alignment.
    const size_t byte_offset = slot_start + 16;
    __m256i* dst = reinterpret_cast<__m256i*>(data + byte_offset);
    const __m256i zeros = _mm256_setzero_si256();

    _mm256_storeu_si256(dst, zeros);

    benchmark::DoNotOptimize(dst);
  }
}
193+
194+
// Times the unaligned 256-bit store intrinsic on addresses that sit 48 bytes
// into a 64-byte slot of `data`, so each 32-byte store spans two slots. A zero
// vector is written each time.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k slots using the low `k` bits of a fixed-seed Mersenne Twister. The
// random draw happens inside the timed loop.
static void BM_Storeu256_unaligned_crossing_64byte_border(
    benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    const size_t slot_start = (generator() & slot_mask) << 6;
    // Shift by 48 bytes so the access straddles the next 64-byte boundary.
    const size_t byte_offset = slot_start + 48;
    __m256i* dst = reinterpret_cast<__m256i*>(data + byte_offset);
    const __m256i zeros = _mm256_setzero_si256();

    _mm256_storeu_si256(dst, zeros);

    benchmark::DoNotOptimize(dst);
  }
}
209+
210+
// Times the aligned 256-bit store intrinsic on addresses that are multiples of
// 64 bytes from the start of `data`. A zero vector is written each time.
//
// The benchmark argument `k` bounds the working set: every iteration picks one
// of 2^k 64-byte slots using the low `k` bits of a fixed-seed Mersenne Twister.
// The random draw happens inside the timed loop.
static void BM_Store256_aligned(benchmark::State& state) {
  const size_t k = state.range(0);
  std::mt19937_64 generator(42);

  for (auto _ : state) {
    const auto slot_mask = (1 << k) - 1;
    // Slot index times 64 gives the byte offset of the slot.
    const size_t byte_offset = (generator() & slot_mask) << 6;
    __m256i* dst = reinterpret_cast<__m256i*>(data + byte_offset);
    const __m256i zeros = _mm256_setzero_si256();

    _mm256_store_si256(dst, zeros);

    benchmark::DoNotOptimize(dst);
  }
}
224+
225+
// Register the 256-bit benchmarks. Each one is run with the argument "k"
// taking the values 1, 3, 5, ..., 23.

// Loads.
BENCHMARK(BM_Loadu256_aligned)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

BENCHMARK(BM_Loadu256_unaligned)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

BENCHMARK(BM_Loadu256_unaligned_crossing_64byte_border)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

BENCHMARK(BM_Load256_aligned)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

// Stores.
BENCHMARK(BM_Storeu256_aligned)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

BENCHMARK(BM_Storeu256_unaligned)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

BENCHMARK(BM_Storeu256_unaligned_crossing_64byte_border)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);

BENCHMARK(BM_Store256_aligned)
    ->ArgNames({"k"})
    ->DenseRange(1, 23, 2);
244+
245+
#endif

0 commit comments

Comments
 (0)