-
Notifications
You must be signed in to change notification settings - Fork 0
⚡ Thunderbolt: max_v2 — AVX2 4x unrolled horizontal reduction #32
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| #pragma once | ||
|
|
||
| #include <cstddef> | ||
| #include <limits> | ||
| #include <immintrin.h> | ||
| #include <algorithm> | ||
|
|
||
| namespace ml_kernels { | ||
|
|
||
| // ⚡ Thunderbolt: AVX2 Vectorized Max Reduction | ||
| // Target: AVX2 (Haswell+) | ||
| // Reason: The naive scalar max reduction (max_naive) is bottlenecked by a loop-carried dependency and low ILP. | ||
| // Vectorizing it with AVX2 and unrolling 4x allows 32 elements to be processed per iteration across multiple execution ports. | ||
| // The final reduction is done efficiently in-register using shuffles, avoiding a scalar extraction loop. | ||
| // Expected gain: ~4-5x throughput vs max_naive. | ||
| inline float max_v2(const float *input, std::size_t n) { | ||
| if (n == 0) return 0.0f; | ||
|
|
||
| std::size_t i = 0; | ||
| __m256 max_v = _mm256_set1_ps(std::numeric_limits<float>::lowest()); | ||
| __m256 max0 = max_v, max1 = max_v, max2 = max_v, max3 = max_v; | ||
|
|
||
| // Unroll 4x for 32 elements per iteration | ||
| for (; i + 31 < n; i += 32) { | ||
| max0 = _mm256_max_ps(max0, _mm256_loadu_ps(input + i)); | ||
| max1 = _mm256_max_ps(max1, _mm256_loadu_ps(input + i + 8)); | ||
| max2 = _mm256_max_ps(max2, _mm256_loadu_ps(input + i + 16)); | ||
| max3 = _mm256_max_ps(max3, _mm256_loadu_ps(input + i + 24)); | ||
| } | ||
|
|
||
| // Reduce the 4 vectors into 1 | ||
| max0 = _mm256_max_ps(max0, max1); | ||
| max2 = _mm256_max_ps(max2, max3); | ||
| max0 = _mm256_max_ps(max0, max2); | ||
|
|
||
| // Remainder loop for multiples of 8 elements | ||
| for (; i + 7 < n; i += 8) { | ||
| max0 = _mm256_max_ps(max0, _mm256_loadu_ps(input + i)); | ||
| } | ||
|
|
||
| // In-register horizontal reduction | ||
| __m128 lo = _mm256_castps256_ps128(max0); | ||
| __m128 hi = _mm256_extractf128_ps(max0, 1); | ||
| lo = _mm_max_ps(lo, hi); | ||
|
|
||
| __m128 shuf = _mm_shuffle_ps(lo, lo, _MM_SHUFFLE(2, 3, 0, 1)); | ||
| lo = _mm_max_ps(lo, shuf); | ||
| shuf = _mm_shuffle_ps(lo, lo, _MM_SHUFFLE(1, 0, 3, 2)); | ||
| lo = _mm_max_ps(lo, shuf); | ||
|
|
||
| float max_val = _mm_cvtss_f32(lo); | ||
|
|
||
| // Scalar epilogue | ||
| for (; i < n; ++i) { | ||
| if (input[i] > max_val) { | ||
| max_val = input[i]; | ||
| } | ||
| } | ||
| return max_val; | ||
| } | ||
|
|
||
| } // namespace ml_kernels |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -138,7 +138,12 @@ REGISTER_RELU_BENCHMARK(relu_v2_6); | |
| REGISTER_RELU_BENCHMARK(relu_v2_7); | ||
| REGISTER_RELU_BENCHMARK(relu_v2_8); | ||
|
|
||
| class MaxBenchmark : public BenchmarkBase { | ||
// Shared base for all max-reduction benchmarks. A max reduction performs one
// comparison per element, so the reported "flops" figure is simply n.
// Subclasses provide name()/setup()/run()/verify() and inherit this metric.
class MaxBenchmarkBase : public BenchmarkBase {
public:
  double flops(int n) const override { return static_cast<double>(n); }
};
|
|
||
| class MaxBenchmark : public MaxBenchmarkBase { | ||
| public: | ||
| const char *name() const override { return "max_naive"; } | ||
|
|
||
|
|
@@ -399,3 +404,61 @@ int main(int argc, char **argv) { | |
|
|
||
| return 0; | ||
| } | ||
|
|
||
| #include "ml_kernels/max.h" | ||
|
|
||
| class MaxV2Benchmark : public MaxBenchmarkBase { | ||
|
Comment on lines
+407
to
+410
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion | 🟠 Major Move The ♻️ Suggested organizationMove the include up to the other kernel headers (near Line 16–18): `#include` "ml_kernels/naive_ops.h"
+#include "ml_kernels/max.h"
`#include` "ml_kernels/relu.h"
`#include` "ml_kernels/softmax.h"Move the Also applies to: 464-464 🤖 Prompt for AI Agents |
||
| public: | ||
| const char *name() const override { return "max_v2"; } | ||
|
|
||
| void setup(int n) override { | ||
| size_t bytes_per_iteration = n * sizeof(float); | ||
| size_t target_pool_bytes = 100ULL * 1024 * 1024; | ||
| pool_size_ = g_use_pool ? std::max<std::size_t>(1, target_pool_bytes / bytes_per_iteration) : 1; | ||
|
|
||
| inputs_.resize(pool_size_); | ||
| std::mt19937 rng(12345); | ||
| std::uniform_real_distribution<float> dist(-4.0f, 4.0f); | ||
| for (std::size_t i = 0; i < pool_size_; ++i) { | ||
| inputs_[i].resize(n); | ||
| for (float &value : inputs_[i]) { | ||
| value = dist(rng); | ||
| } | ||
| } | ||
|
|
||
| result_ref_ = inputs_[0].size() == 0 | ||
| ? 0.0f | ||
| : *std::max_element(inputs_[0].begin(), inputs_[0].end()); | ||
| result_ = 0.0f; | ||
| current_idx_ = 0; | ||
| } | ||
|
|
||
| void run() override { | ||
| result_ = ml_kernels::max_v2(inputs_[current_idx_].data(), inputs_[current_idx_].size()); | ||
| current_idx_ = (current_idx_ + 1) % pool_size_; | ||
| } | ||
|
|
||
| bool verify() override { | ||
| current_idx_ = 0; | ||
| run(); | ||
| return std::fabs(result_ - result_ref_) <= 1e-6f; | ||
| } | ||
|
|
||
| void teardown() override { | ||
| inputs_.clear(); | ||
| result_ = 0.0f; | ||
| result_ref_ = 0.0f; | ||
| } | ||
|
|
||
| double flops(int n) const override { | ||
| return static_cast<double>(n); // 1 comparison per element | ||
| } | ||
|
|
||
| private: | ||
| std::vector<std::vector<float>> inputs_; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Separately, 🛠️ Minimal fix- double flops(int n) const override {
- return static_cast<double>(n); // 1 comparison per element
- }
+ double bytes_accessed(int n) const override { return n * sizeof(float); }
private:
- std::vector<std::vector<float>> inputs_;
+ std::vector<AlignedBuffer<float>> inputs_;Better still, hoist 🤖 Prompt for AI Agents |
||
| float result_; | ||
| float result_ref_; | ||
| std::size_t pool_size_; | ||
| std::size_t current_idx_ = 0; | ||
| }; | ||
| REGISTER_BENCHMARK(MaxV2Benchmark); | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Date is off by two years.
This PR was created on 2026-04-24, but the new entry is dated
2024-04-24.📝 Committable suggestion
🤖 Prompt for AI Agents