-
Notifications
You must be signed in to change notification settings - Fork 0
⚡ Thunderbolt: max_v3 — AVX2 Vectorized Max Reduction (8x unroll) #33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -462,3 +462,59 @@ class MaxV2Benchmark : public MaxBenchmarkBase { | |
| std::size_t current_idx_ = 0; | ||
| }; | ||
| REGISTER_BENCHMARK(MaxV2Benchmark); | ||
|
|
||
| class MaxV3Benchmark : public MaxBenchmarkBase { | ||
| public: | ||
| const char *name() const override { return "max_v3"; } | ||
|
|
||
| void setup(int n) override { | ||
| size_t bytes_per_iteration = n * sizeof(float); | ||
| size_t target_pool_bytes = 100ULL * 1024 * 1024; | ||
| pool_size_ = g_use_pool ? std::max<std::size_t>(1, target_pool_bytes / bytes_per_iteration) : 1; | ||
|
|
||
| inputs_.resize(pool_size_); | ||
| std::mt19937 rng(12345); | ||
| std::uniform_real_distribution<float> dist(-4.0f, 4.0f); | ||
| for (std::size_t i = 0; i < pool_size_; ++i) { | ||
| inputs_[i].resize(n); | ||
| for (float &value : inputs_[i]) { | ||
| value = dist(rng); | ||
| } | ||
| } | ||
|
|
||
| result_ref_ = inputs_[0].size() == 0 | ||
| ? 0.0f | ||
| : *std::max_element(inputs_[0].begin(), inputs_[0].end()); | ||
| result_ = 0.0f; | ||
| current_idx_ = 0; | ||
| } | ||
|
|
||
| void run() override { | ||
| result_ = ml_kernels::max_v3(inputs_[current_idx_].data(), inputs_[current_idx_].size()); | ||
| current_idx_ = (current_idx_ + 1) % pool_size_; | ||
| } | ||
|
|
||
| bool verify() override { | ||
| current_idx_ = 0; | ||
| run(); | ||
| return std::fabs(result_ - result_ref_) <= 1e-6f; | ||
| } | ||
|
|
||
| void teardown() override { | ||
| inputs_.clear(); | ||
| result_ = 0.0f; | ||
| result_ref_ = 0.0f; | ||
| } | ||
|
|
||
| double flops(int n) const override { | ||
| return static_cast<double>(n); // 1 comparison per element | ||
| } | ||
|
|
||
| private: | ||
| std::vector<AlignedBuffer<float>> inputs_; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Inconsistent input buffer type vs.
Recommend updating 🤖 Prompt for AI Agents |
||
// Scalar state. Every member carries a default so that reading it before
// setup() has run is well-defined; pool_size_ defaults to 1 because run()
// uses it as a modulus.
float result_ = 0.0f;          // most recent kernel output
float result_ref_ = 0.0f;      // reference maximum computed in setup()
std::size_t pool_size_ = 1;    // number of input buffers in the pool
std::size_t current_idx_ = 0;  // pool slot the next run() will use
| }; | ||
| REGISTER_BENCHMARK(MaxV3Benchmark); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| #include <iostream> | ||
| #include <vector> | ||
| #include <chrono> | ||
| #include <immintrin.h> | ||
| #include "include/aligned_buffer.h" | ||
|
|
||
/// Scalar reference dot product.
///
/// @param n Number of elements to read from each array.
/// @param x First operand; must have at least `n` readable elements.
/// @param y Second operand; must have at least `n` readable elements.
/// @return Sum of x[i] * y[i] for i in [0, n); 0.0 when n <= 0.
double myddot_original(int n, const double *x, const double *y) {
  // The `register` storage-class specifier was removed in C++17, so plain
  // locals are used instead.
  double sum = 0.0;
  for (int i = 0; i < n; ++i) {
    sum += x[i] * y[i];
  }
  return sum;
}
|
|
||
/// Dot product using 256-bit vectors with fused multiply-add.
///
/// Processes four doubles per iteration with unaligned loads, folds the four
/// lanes of the vector accumulator into one scalar, then handles the remaining
/// (at most three) elements with a scalar loop.
///
/// @param n Number of elements to read from each array.
/// @param x First operand; must have at least `n` readable elements.
/// @param y Second operand; must have at least `n` readable elements.
/// @return Sum of x[i] * y[i] for i in [0, n); 0.0 when n <= 0.
double myddot_avx2(int n, const double *x, const double *y) {
  // The `register` storage-class specifier was removed in C++17, so plain
  // locals are used instead.
  int i = 0;
  __m256d sum_v = _mm256_setzero_pd();

  // `n - i >= 4` selects the same iterations as `i + 3 < n` but cannot
  // overflow int when n is close to INT_MAX.
  for (; n - i >= 4; i += 4) {
    sum_v = _mm256_fmadd_pd(_mm256_loadu_pd(x + i), _mm256_loadu_pd(y + i), sum_v);
  }

  // Horizontal reduction: add the low and high 128-bit halves, then add the
  // two doubles of the resulting pair.
  const __m128d half_sum =
      _mm_add_pd(_mm256_extractf128_pd(sum_v, 0), _mm256_extractf128_pd(sum_v, 1));
  const __m128d total = _mm_add_pd(half_sum, _mm_shuffle_pd(half_sum, half_sum, 1));
  double sum = _mm_cvtsd_f64(total);

  // Scalar tail for the elements that did not fill a whole vector.
  for (; i < n; ++i) {
    sum += x[i] * y[i];
  }
  return sum;
}
|
|
||
/// Dot product using 512-bit vectors with fused multiply-add.
///
/// Processes eight doubles per iteration with unaligned loads, reduces the
/// vector accumulator to one scalar, then handles the remaining (at most
/// seven) elements with a scalar loop.
///
/// @param n Number of elements to read from each array.
/// @param x First operand; must have at least `n` readable elements.
/// @param y Second operand; must have at least `n` readable elements.
/// @return Sum of x[i] * y[i] for i in [0, n); 0.0 when n <= 0.
double myddot_avx512(int n, const double *x, const double *y) {
  // The `register` storage-class specifier was removed in C++17, so plain
  // locals are used instead.
  int i = 0;
  __m512d sum_v = _mm512_setzero_pd();

  // `n - i >= 8` selects the same iterations as `i + 7 < n` but cannot
  // overflow int when n is close to INT_MAX.
  for (; n - i >= 8; i += 8) {
    sum_v = _mm512_fmadd_pd(_mm512_loadu_pd(x + i), _mm512_loadu_pd(y + i), sum_v);
  }

  double sum = _mm512_reduce_add_pd(sum_v);

  // Scalar tail for the elements that did not fill a whole vector.
  for (; i < n; ++i) {
    sum += x[i] * y[i];
  }
  return sum;
}
|
|
||
| int main() { | ||
| std::size_t n = 16384; | ||
| std::vector<double> data1(n, 1.0); | ||
| std::vector<double> data2(n, 1.0); | ||
|
|
||
| auto t1 = std::chrono::high_resolution_clock::now(); | ||
| for (int k = 0; k < 100000; ++k) myddot_original(n, data1.data(), data2.data()); | ||
| auto t2 = std::chrono::high_resolution_clock::now(); | ||
| std::cout << "myddot_original: " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << " ms\n"; | ||
|
|
||
| auto t3 = std::chrono::high_resolution_clock::now(); | ||
| for (int k = 0; k < 100000; ++k) myddot_avx2(n, data1.data(), data2.data()); | ||
| auto t4 = std::chrono::high_resolution_clock::now(); | ||
| std::cout << "myddot_avx2: " << std::chrono::duration_cast<std::chrono::milliseconds>(t4 - t3).count() << " ms\n"; | ||
|
|
||
| auto t5 = std::chrono::high_resolution_clock::now(); | ||
| for (int k = 0; k < 100000; ++k) myddot_avx512(n, data1.data(), data2.data()); | ||
| auto t6 = std::chrono::high_resolution_clock::now(); | ||
| std::cout << "myddot_avx512: " << std::chrono::duration_cast<std::chrono::milliseconds>(t6 - t5).count() << " ms\n"; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Evidence numbers don't match PR results.
The journal records 4.03 -> 4.36 GFLOP/s (~8%) for N=6553600, but the PR description reports 4.50 -> 4.68 GFLOP/s (~4%) for the same configuration. Please reconcile so the documented "Action" recommendation (default to 8x unrolling for >2-cycle reductions) rests on accurate evidence — the magnitude of the end-to-end win materially affects how strongly that guideline should be applied. 🤖 Prompt for AI Agents