From d444ae7ab9f5e691f007eb092cd57bf8d165fd39 Mon Sep 17 00:00:00 2001 From: Jonathan Peppers Date: Sat, 2 May 2026 19:06:14 -0500 Subject: [PATCH 1/4] Emit branchless Math.Min/Max in ScalarEmitter for supported types Update ScalarEmitter.EmitSortMethod to generate Math.Min/Math.Max compare-and-swap patterns instead of branching if/swap for types with direct System.Math overloads (byte, sbyte, short, ushort, int, uint, long, ulong, float, double). The JIT lowers these to branchless cmov instructions on x64. Types without Math.Min/Max overloads (char, nint, nuint, string) and custom IComparable types retain the existing branching pattern. Fixes #32 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- SortingNetworks.Generators/ScalarEmitter.cs | 33 ++++++++++++++++----- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/SortingNetworks.Generators/ScalarEmitter.cs b/SortingNetworks.Generators/ScalarEmitter.cs index 707a3a0..f40e5c2 100644 --- a/SortingNetworks.Generators/ScalarEmitter.cs +++ b/SortingNetworks.Generators/ScalarEmitter.cs @@ -1,3 +1,4 @@ +using System.Collections.Generic; using System.Text; namespace SortingNetworks.Generators @@ -9,6 +10,15 @@ namespace SortingNetworks.Generators /// internal static class ScalarEmitter { + /// + /// Types that have direct System.Math.Min/Max overloads. + /// The JIT lowers these to branchless cmov instructions on x64. + /// + private static readonly HashSet MathMinMaxTypes = new HashSet + { + "byte", "sbyte", "short", "ushort", "int", "uint", "long", "ulong", "float", "double" + }; + /// /// Emits a scalar Sort method for the given network size and element type. /// @@ -28,18 +38,27 @@ internal static string EmitSortMethod(int size, string typeName, int[] network, // Emit compare-and-swap for each pair bool isString = typeName == "string"; + bool useMathMinMax = !useCompareTo && !isString && MathMinMaxTypes.Contains(typeName); for (int i = 0; i < network.Length; i += 2) { int a = network[i]; int b = network[i + 1]; - string condition; - if (useCompareTo) - condition = $"e{a}.CompareTo(e{b}) > 0"; - else if (isString) - condition = $"string.CompareOrdinal(e{a}, e{b}) > 0"; + if (useMathMinMax) + { + // Branchless: Math.Min/Max → JIT emits cmov on x64 + sb.AppendLine($" {{ {typeName} t0 = System.Math.Min(e{a}, e{b}); {typeName} t1 = System.Math.Max(e{a}, e{b}); e{a} = t0; e{b} = t1; }}"); + } else - condition = $"e{a} > e{b}"; - sb.AppendLine($" if ({condition}) {{ {typeName} temp = e{a}; e{a} = e{b}; e{b} = temp; }}"); + { + string condition; + if (useCompareTo) + condition = $"e{a}.CompareTo(e{b}) > 0"; + else if (isString) + condition = $"string.CompareOrdinal(e{a}, e{b}) > 0"; + else + condition = $"e{a} > e{b}"; + sb.AppendLine($" if ({condition}) {{ {typeName} temp = e{a}; e{a} = e{b}; e{b} = temp; }}"); + } } sb.AppendLine(); From d6d084cc2e12b2e95175a6081c626b898392b076 Mon Sep 17 00:00:00 2001 From: Jonathan Peppers Date: Sat, 2 May 2026 20:03:21 -0500 Subject: [PATCH 2/4] Address PR review: use SpecialType, add tests, update docs - Replace string-based MathMinMaxTypes HashSet with SpecialType-based SupportsBranchlessMinMax() method, avoiding duplication with the generator's existing type metadata - Keep float/double in branchless path (NaN is unsupported per #10/#11) - Add generator tests asserting Math.Min/Max emitted for numeric types and branching if/swap for char - Update class-level XML docs to reflect branchless vs branching paths - Update performance.instructions.md and README.md scalar examples Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../instructions/performance.instructions.md | 2 +- README.md | 30 ++++++++++-------- SortingNetworks.Generators/ScalarEmitter.cs | 31 +++++++++++++------ .../SortingNetworkGenerator.cs | 6 ++-- 4 files changed, 43 insertions(+), 26 deletions(-) diff --git a/.github/instructions/performance.instructions.md b/.github/instructions/performance.instructions.md index 7f07a20..63b0ea4 100644 --- a/.github/instructions/performance.instructions.md +++ b/.github/instructions/performance.instructions.md @@ -10,7 +10,7 @@ This is a high-performance sorting library. All code in the hot path must follow - Use `MemoryMarshal.GetReference(span)` to get a ref to the first element. - Mark hot-path private methods with `[MethodImpl(MethodImplOptions.AggressiveInlining)]`. - Avoid heap allocations in sort methods — no LINQ, no closures, no boxing. -- Use inline compare-and-swap (`if (a > b) { T temp = a; a = b; b = temp; }`) for primitive types rather than `IComparer`. +- Use branchless `Math.Min`/`Math.Max` compare-and-swap for numeric primitive types (`byte`, `sbyte`, `short`, `ushort`, `int`, `uint`, `long`, `ulong`, `float`, `double`). The JIT lowers these to `cmov` on x64. For `char` and custom types, use branching `if (a > b) { T temp = a; a = b; b = temp; }`. - **NaN is not supported** for `float`/`double` sorting. Sorting networks use ordered comparisons where `NaN > x` is always false, so NaN values disrupt sort order. See [#10](https://github.com/jonathanpeppers/SortingNetworks/issues/10) and [#11](https://github.com/jonathanpeppers/SortingNetworks/issues/11). - `IComparer` overloads use the loop-based `ApplyNetworkWithComparer` path and are not unrolled. - Fallback to `span.Sort()` or `Array.Sort()` for sizes outside the network range. diff --git a/README.md b/README.md index 51b5eaf..9b3e4ed 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ MySorter.Sort(otherData); // any other size → OnFallback the comparer path still throws. The source generator emits optimized sort methods with: -- **Scalar unrolled** compare-and-swap for all sizes/types +- **Scalar unrolled** compare-and-swap for all sizes/types (branchless `Math.Min`/`Math.Max` for numeric types) - **x86 SIMD** (AVX2, AVX-512) when the type and size fit in SIMD registers - **ARM64 SIMD** (AdvSimd/NEON) for supported types - **IComparer<T>** overloads using loop-based network application @@ -171,27 +171,31 @@ comparators that involve channel 27. ### Scalar implementation The simplest path unrolls every compare-and-swap from the network into -straight-line code. For a 3-element example, a depth-3 network looks like: +straight-line code. For numeric types, branchless `Math.Min`/`Math.Max` calls +are used (the JIT lowers these to `cmov` instructions). For a 3-element +example, a depth-3 network looks like: ```csharp // Sort 3 elements with a sorting network (depth 3, 3 comparators) static void Sort3(ref int e0, ref int e1, ref int e2) { - // Layer 1 — two independent comparators could go here, but - // for 3 elements there is only one pair per layer. - if (e0 > e1) { int t = e0; e0 = e1; e1 = t; } + // Layer 1 + { int t0 = Math.Min(e0, e1); int t1 = Math.Max(e0, e1); e0 = t0; e1 = t1; } // Layer 2 - if (e1 > e2) { int t = e1; e1 = e2; e2 = t; } + { int t0 = Math.Min(e1, e2); int t1 = Math.Max(e1, e2); e1 = t0; e2 = t1; } // Layer 3 - if (e0 > e1) { int t = e0; e0 = e1; e1 = t; } + { int t0 = Math.Min(e0, e1); int t1 = Math.Max(e0, e1); e0 = t0; e1 = t1; } } ``` +For char and custom types, branching `if (a > b) swap` is used instead +(char lacks `Math.Min`/`Math.Max` overloads). + For the real 27/28-element networks the same pattern is used — the code -generator emits all ~185 comparators across 13 layers as a flat `if`/swap -sequence. Elements are loaded into local variables via `Unsafe.Add(ref T, n)` +generator emits all ~185 comparators across 13 layers as a flat sequence. +Elements are loaded into local variables via `Unsafe.Add(ref T, n)` to avoid bounds checks: ```csharp @@ -200,10 +204,10 @@ int e0 = first; int e1 = Unsafe.Add(ref first, 1); // ... load e2 through e26 ... -// Layer 1 comparators: -if (e1 > e26) { int temp = e1; e1 = e26; e26 = temp; } -if (e2 > e25) { int temp = e2; e2 = e25; e25 = temp; } -if (e3 > e24) { int temp = e3; e3 = e24; e24 = temp; } +// Layer 1 comparators (branchless Math.Min/Max for integer types): +{ int t0 = Math.Min(e1, e26); int t1 = Math.Max(e1, e26); e1 = t0; e26 = t1; } +{ int t0 = Math.Min(e2, e25); int t1 = Math.Max(e2, e25); e2 = t0; e25 = t1; } +{ int t0 = Math.Min(e3, e24); int t1 = Math.Max(e3, e24); e3 = t0; e24 = t1; } // ... remaining comparators in layers 2–13 ... ``` diff --git a/SortingNetworks.Generators/ScalarEmitter.cs b/SortingNetworks.Generators/ScalarEmitter.cs index f40e5c2..ad710a3 100644 --- a/SortingNetworks.Generators/ScalarEmitter.cs +++ b/SortingNetworks.Generators/ScalarEmitter.cs @@ -1,28 +1,41 @@ -using System.Collections.Generic; using System.Text; +using Microsoft.CodeAnalysis; namespace SortingNetworks.Generators { /// - /// Emits unrolled scalar compare-and-swap sorting network code. /// Emits unrolled scalar sorting code for a given network size and element type. - /// uses ref + Unsafe.Add for element access, inline compare-and-swap. + /// Uses ref + Unsafe.Add for element access. For numeric types with + /// System.Math.Min/Max overloads, emits branchless min/max swaps; for all + /// other types (char, string, custom) emits branching if/swap. /// internal static class ScalarEmitter { /// - /// Types that have direct System.Math.Min/Max overloads. - /// The JIT lowers these to branchless cmov instructions on x64. + /// Returns true if the given has direct + /// System.Math.Min/Max overloads suitable for branchless emission. + /// Excludes char/nint/nuint (no Math.Min/Max overloads). + /// Float/double are included — NaN is unsupported (see issues #10, #11). /// - private static readonly HashSet MathMinMaxTypes = new HashSet + internal static bool SupportsBranchlessMinMax(SpecialType specialType) => specialType switch { - "byte", "sbyte", "short", "ushort", "int", "uint", "long", "ulong", "float", "double" + SpecialType.System_Byte => true, + SpecialType.System_SByte => true, + SpecialType.System_Int16 => true, + SpecialType.System_UInt16 => true, + SpecialType.System_Int32 => true, + SpecialType.System_UInt32 => true, + SpecialType.System_Int64 => true, + SpecialType.System_UInt64 => true, + SpecialType.System_Single => true, + SpecialType.System_Double => true, + _ => false, }; /// /// Emits a scalar Sort method for the given network size and element type. /// - internal static string EmitSortMethod(int size, string typeName, int[] network, bool useCompareTo = false) + internal static string EmitSortMethod(int size, string typeName, SpecialType specialType, int[] network, bool useCompareTo = false) { var sb = new StringBuilder(); sb.AppendLine($" private static void Sort{size}(ref {typeName} first)"); @@ -38,7 +51,7 @@ internal static string EmitSortMethod(int size, string typeName, int[] network, // Emit compare-and-swap for each pair bool isString = typeName == "string"; - bool useMathMinMax = !useCompareTo && !isString && MathMinMaxTypes.Contains(typeName); + bool useMathMinMax = !useCompareTo && !isString && SupportsBranchlessMinMax(specialType); for (int i = 0; i < network.Length; i += 2) { int a = network[i]; diff --git a/SortingNetworks.Generators/SortingNetworkGenerator.cs b/SortingNetworks.Generators/SortingNetworkGenerator.cs index 6a4841e..67b231f 100644 --- a/SortingNetworks.Generators/SortingNetworkGenerator.cs +++ b/SortingNetworks.Generators/SortingNetworkGenerator.cs @@ -862,7 +862,7 @@ private static void Execute(SourceProductionContext context, ImmutableArray Date: Sat, 2 May 2026 21:27:04 -0500 Subject: [PATCH 3/4] Platform-specific scalar codegen: branchless on x86, branching on ARM - ScalarEmitter now emits a runtime X86Base.IsSupported check for numeric types: Math.Min/Max (cmov) on x86, if/swap on ARM where branch prediction outperforms csel data-dependency chains. The JIT dead-code-eliminates the unused path. - Added Branchless property to SortingNetworkAttribute for explicit control: Branchless = true forces Math.Min/Max, false forces if/swap, unset (default) uses runtime auto-detection. - Generator reads the Branchless named argument via Roslyn NamedArguments and passes it through NetworkRequest to ScalarEmitter. - Updated tests: PlatformSpecific_EmittedForNumericTypes (10 types), BranchlessTrue_EmitsOnlyMinMax, BranchlessFalse_EmitsOnlyBranching. - Updated performance.instructions.md and README.md with platform-specific documentation and Branchless attribute examples. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../instructions/performance.instructions.md | 2 +- README.md | 24 +++- SortingNetworks.Generators/ScalarEmitter.cs | 73 +++++++--- .../SortingNetworkGenerator.cs | 25 +++- SortingNetworks.Tests/GeneratorTests.cs | 127 ++++++++++++++++++ SortingNetworks/PublicAPI.Unshipped.txt | 2 + SortingNetworks/SortingNetworkAttribute.cs | 13 ++ 7 files changed, 236 insertions(+), 30 deletions(-) diff --git a/.github/instructions/performance.instructions.md b/.github/instructions/performance.instructions.md index 63b0ea4..cecaa54 100644 --- a/.github/instructions/performance.instructions.md +++ b/.github/instructions/performance.instructions.md @@ -10,7 +10,7 @@ This is a high-performance sorting library. All code in the hot path must follow - Use `MemoryMarshal.GetReference(span)` to get a ref to the first element. - Mark hot-path private methods with `[MethodImpl(MethodImplOptions.AggressiveInlining)]`. - Avoid heap allocations in sort methods — no LINQ, no closures, no boxing. -- Use branchless `Math.Min`/`Math.Max` compare-and-swap for numeric primitive types (`byte`, `sbyte`, `short`, `ushort`, `int`, `uint`, `long`, `ulong`, `float`, `double`). The JIT lowers these to `cmov` on x64. For `char` and custom types, use branching `if (a > b) { T temp = a; a = b; b = temp; }`. +- Use platform-specific compare-and-swap for numeric primitive types (`byte`, `sbyte`, `short`, `ushort`, `int`, `uint`, `long`, `ulong`, `float`, `double`). By default, the generator emits a runtime `X86Base.IsSupported` check: branchless `Math.Min`/`Math.Max` on x86 (JIT lowers to `cmov`), branching `if/swap` on ARM (where branch prediction outperforms `csel` chains). The `Branchless` attribute property can force one strategy: `[SortingNetwork(27, typeof(int), Branchless = true)]`. For `char` and custom types, always use branching `if (a > b) { T temp = a; a = b; b = temp; }`. - **NaN is not supported** for `float`/`double` sorting. Sorting networks use ordered comparisons where `NaN > x` is always false, so NaN values disrupt sort order. See [#10](https://github.com/jonathanpeppers/SortingNetworks/issues/10) and [#11](https://github.com/jonathanpeppers/SortingNetworks/issues/11). - `IComparer` overloads use the loop-based `ApplyNetworkWithComparer` path and are not unrolled. - Fallback to `span.Sort()` or `Array.Sort()` for sizes outside the network range. diff --git a/README.md b/README.md index 9b3e4ed..487bbd3 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ MySorter.Sort(otherData); // any other size → OnFallback the comparer path still throws. The source generator emits optimized sort methods with: -- **Scalar unrolled** compare-and-swap for all sizes/types (branchless `Math.Min`/`Math.Max` for numeric types) +- **Scalar unrolled** compare-and-swap for all sizes/types (platform-adaptive: branchless `Math.Min`/`Math.Max` on x86, branching `if/swap` on ARM) - **x86 SIMD** (AVX2, AVX-512) when the type and size fit in SIMD registers - **ARM64 SIMD** (AdvSimd/NEON) for supported types - **IComparer<T>** overloads using loop-based network application @@ -171,9 +171,11 @@ comparators that involve channel 27. ### Scalar implementation The simplest path unrolls every compare-and-swap from the network into -straight-line code. For numeric types, branchless `Math.Min`/`Math.Max` calls -are used (the JIT lowers these to `cmov` instructions). For a 3-element -example, a depth-3 network looks like: +straight-line code. For numeric types, the generator emits a runtime platform +check: on x86, branchless `Math.Min`/`Math.Max` calls are used (the JIT lowers +these to `cmov` instructions); on ARM, branching `if/swap` is used (branch +prediction outperforms `csel` data-dependency chains). The JIT +dead-code-eliminates the unused path. For a 3-element example on x86: ```csharp // Sort 3 elements with a sorting network (depth 3, 3 comparators) @@ -191,7 +193,19 @@ static void Sort3(ref int e0, ref int e1, ref int e2) ``` For char and custom types, branching `if (a > b) swap` is used instead -(char lacks `Math.Min`/`Math.Max` overloads). +(char lacks `Math.Min`/`Math.Max` overloads). The swap strategy can also be +controlled explicitly with the `Branchless` attribute property: + +```csharp +// Force branchless Math.Min/Max on all platforms +[SortingNetwork(27, typeof(int), Branchless = true)] + +// Force branching if/swap on all platforms +[SortingNetwork(27, typeof(int), Branchless = false)] + +// Default: auto-detect at runtime (recommended) +[SortingNetwork(27, typeof(int))] +``` For the real 27/28-element networks the same pattern is used — the code generator emits all ~185 comparators across 13 layers as a flat sequence. diff --git a/SortingNetworks.Generators/ScalarEmitter.cs b/SortingNetworks.Generators/ScalarEmitter.cs index ad710a3..31e7691 100644 --- a/SortingNetworks.Generators/ScalarEmitter.cs +++ b/SortingNetworks.Generators/ScalarEmitter.cs @@ -6,8 +6,12 @@ namespace SortingNetworks.Generators /// /// Emits unrolled scalar sorting code for a given network size and element type. /// Uses ref + Unsafe.Add for element access. For numeric types with - /// System.Math.Min/Max overloads, emits branchless min/max swaps; for all - /// other types (char, string, custom) emits branching if/swap. + /// System.Math.Min/Max overloads: + /// - Default (branchless=null): emits a runtime X86Base.IsSupported check with + /// branchless min/max on x86 and branching if/swap elsewhere. + /// - Branchless=true: emits only Math.Min/Max swaps. + /// - Branchless=false: emits only branching if/swap. + /// For char, string, and custom types, always emits branching if/swap. /// internal static class ScalarEmitter { @@ -34,8 +38,9 @@ internal static class ScalarEmitter /// /// Emits a scalar Sort method for the given network size and element type. + /// : null = auto (runtime platform check), true = force branchless, false = force branching. /// - internal static string EmitSortMethod(int size, string typeName, SpecialType specialType, int[] network, bool useCompareTo = false) + internal static string EmitSortMethod(int size, string typeName, SpecialType specialType, int[] network, bool useCompareTo = false, bool? branchless = null) { var sb = new StringBuilder(); sb.AppendLine($" private static void Sort{size}(ref {typeName} first)"); @@ -49,17 +54,56 @@ internal static string EmitSortMethod(int size, string typeName, SpecialType spe } sb.AppendLine(); - // Emit compare-and-swap for each pair + // Determine swap strategy bool isString = typeName == "string"; - bool useMathMinMax = !useCompareTo && !isString && SupportsBranchlessMinMax(specialType); + bool canUseMathMinMax = !useCompareTo && !isString && SupportsBranchlessMinMax(specialType); + + if (canUseMathMinMax && branchless == true) + { + // Force branchless: Math.Min/Max only + EmitComparators(sb, network, typeName, branchless: true, indent: " "); + } + else if (canUseMathMinMax && branchless != false) + { + // Auto-detect (branchless == null): emit runtime platform check + // The JIT treats X86Base.IsSupported as a constant and dead-code-eliminates the unused branch + sb.AppendLine(" if (System.Runtime.Intrinsics.X86.X86Base.IsSupported)"); + sb.AppendLine(" {"); + EmitComparators(sb, network, typeName, branchless: true, indent: " "); + sb.AppendLine(" }"); + sb.AppendLine(" else"); + sb.AppendLine(" {"); + EmitComparators(sb, network, typeName, branchless: false, indent: " "); + sb.AppendLine(" }"); + } + else + { + // Force branching, or type doesn't support Math.Min/Max + EmitComparators(sb, network, typeName, branchless: false, indent: " ", + useCompareTo: useCompareTo, isString: isString); + } + sb.AppendLine(); + + // Write back + sb.AppendLine($" first = e0;"); + for (int i = 1; i < size; i++) + { + sb.AppendLine($" System.Runtime.CompilerServices.Unsafe.Add(ref first, {i}) = e{i};"); + } + + sb.AppendLine($" }}"); + return sb.ToString(); + } + + private static void EmitComparators(StringBuilder sb, int[] network, string typeName, bool branchless, string indent, bool useCompareTo = false, bool isString = false) + { for (int i = 0; i < network.Length; i += 2) { int a = network[i]; int b = network[i + 1]; - if (useMathMinMax) + if (branchless) { - // Branchless: Math.Min/Max → JIT emits cmov on x64 - sb.AppendLine($" {{ {typeName} t0 = System.Math.Min(e{a}, e{b}); {typeName} t1 = System.Math.Max(e{a}, e{b}); e{a} = t0; e{b} = t1; }}"); + sb.AppendLine($"{indent}{{ {typeName} t0 = System.Math.Min(e{a}, e{b}); {typeName} t1 = System.Math.Max(e{a}, e{b}); e{a} = t0; e{b} = t1; }}"); } else { @@ -70,20 +114,9 @@ internal static string EmitSortMethod(int size, string typeName, SpecialType spe condition = $"string.CompareOrdinal(e{a}, e{b}) > 0"; else condition = $"e{a} > e{b}"; - sb.AppendLine($" if ({condition}) {{ {typeName} temp = e{a}; e{a} = e{b}; e{b} = temp; }}"); + sb.AppendLine($"{indent}if ({condition}) {{ {typeName} temp = e{a}; e{a} = e{b}; e{b} = temp; }}"); } } - sb.AppendLine(); - - // Write back - sb.AppendLine($" first = e0;"); - for (int i = 1; i < size; i++) - { - sb.AppendLine($" System.Runtime.CompilerServices.Unsafe.Add(ref first, {i}) = e{i};"); - } - - sb.AppendLine($" }}"); - return sb.ToString(); } /// diff --git a/SortingNetworks.Generators/SortingNetworkGenerator.cs b/SortingNetworks.Generators/SortingNetworkGenerator.cs index 67b231f..8002f91 100644 --- a/SortingNetworks.Generators/SortingNetworkGenerator.cs +++ b/SortingNetworks.Generators/SortingNetworkGenerator.cs @@ -57,6 +57,20 @@ public class SortingNetworkGenerator : IIncrementalGenerator _ => null, }; + /// + /// Reads the optional Branchless named argument from an attribute. + /// Returns null if not specified (auto-detect), true/false if explicitly set. + /// + private static bool? GetBranchlessArg(AttributeData attr) + { + foreach (var arg in attr.NamedArguments) + { + if (arg.Key == "Branchless" && arg.Value.Value is bool value) + return value; + } + return null; + } + public void Initialize(IncrementalGeneratorInitializationContext context) { // Find all class declarations with [SortingNetwork] attributes @@ -108,7 +122,7 @@ public void Initialize(IncrementalGeneratorInitializationContext context) } } - attributes.Add(new NetworkRequest(size, typeName, typeSymbol.SpecialType, isComparable)); + attributes.Add(new NetworkRequest(size, typeName, typeSymbol.SpecialType, isComparable, branchless: GetBranchlessArg(attr))); } } @@ -862,7 +876,7 @@ private static void Execute(SourceProductionContext context, ImmutableArray public SpecialType SpecialType { get; } public bool IsCustomType { get; } public bool IsComparable { get; } + /// null = auto (platform-detect), true = force branchless, false = force branching + public bool? Branchless { get; } - public NetworkRequest(int size, string typeName, SpecialType specialType, bool isComparable) + public NetworkRequest(int size, string typeName, SpecialType specialType, bool isComparable, bool? branchless = null) { Size = size; TypeName = typeName; SpecialType = specialType; IsCustomType = !SupportedSpecialTypes.Contains(specialType); IsComparable = isComparable; + Branchless = branchless; } public bool Equals(NetworkRequest? other) diff --git a/SortingNetworks.Tests/GeneratorTests.cs b/SortingNetworks.Tests/GeneratorTests.cs index c3612a4..dd3a47e 100644 --- a/SortingNetworks.Tests/GeneratorTests.cs +++ b/SortingNetworks.Tests/GeneratorTests.cs @@ -912,6 +912,133 @@ public partial class MySorter {{ }} Assert.Contains($"private static void Sort16(ref {type32} first)", generatedSource); } + [Theory] + [InlineData("byte")] + [InlineData("sbyte")] + [InlineData("short")] + [InlineData("ushort")] + [InlineData("int")] + [InlineData("uint")] + [InlineData("long")] + [InlineData("ulong")] + [InlineData("float")] + [InlineData("double")] + public void PlatformSpecific_EmittedForNumericTypes(string typeName) + { + var source = $@" +using SortingNetworks; + +[SortingNetwork(4, typeof({typeName}))] +public partial class MySorter {{ }} +"; + var compilation = SourceGeneratorDriver.CreateCompilation(source); + var (result, updatedCompilation) = SourceGeneratorDriver.RunGeneratorWithCompilation(compilation); + + var errors = result.Diagnostics.Where(d => d.Severity == DiagnosticSeverity.Error).ToArray(); + Assert.Empty(errors); + + var compilationErrors = SourceGeneratorDriver.GetErrors(updatedCompilation); + Assert.Empty(compilationErrors); + + var generatedSource = result.GeneratedTrees + .Select(t => t.GetText().ToString()) + .FirstOrDefault(s => s.Contains("Sort4")); + Assert.NotNull(generatedSource); + // Default: emits runtime platform check with both paths + Assert.Contains("X86Base.IsSupported", generatedSource); + Assert.Contains("System.Math.Min", generatedSource); + Assert.Contains("System.Math.Max", generatedSource); + Assert.Contains("if (e", generatedSource); + } + + [Theory] + [InlineData("int")] + [InlineData("float")] + public void BranchlessTrue_EmitsOnlyMinMax(string typeName) + { + var source = $@" +using SortingNetworks; + +[SortingNetwork(4, typeof({typeName}), Branchless = true)] +public partial class MySorter {{ }} +"; + var compilation = SourceGeneratorDriver.CreateCompilation(source); + var (result, updatedCompilation) = SourceGeneratorDriver.RunGeneratorWithCompilation(compilation); + + var errors = result.Diagnostics.Where(d => d.Severity == DiagnosticSeverity.Error).ToArray(); + Assert.Empty(errors); + + var compilationErrors = SourceGeneratorDriver.GetErrors(updatedCompilation); + Assert.Empty(compilationErrors); + + var generatedSource = result.GeneratedTrees + .Select(t => t.GetText().ToString()) + .FirstOrDefault(s => s.Contains("Sort4")); + Assert.NotNull(generatedSource); + Assert.Contains("System.Math.Min", generatedSource); + Assert.Contains("System.Math.Max", generatedSource); + Assert.DoesNotContain("X86Base.IsSupported", generatedSource); + Assert.DoesNotContain("if (e", generatedSource); + } + + [Theory] + [InlineData("int")] + [InlineData("float")] + public void BranchlessFalse_EmitsOnlyBranching(string typeName) + { + var source = $@" +using SortingNetworks; + +[SortingNetwork(4, typeof({typeName}), Branchless = false)] +public partial class MySorter {{ }} +"; + var compilation = SourceGeneratorDriver.CreateCompilation(source); + var (result, updatedCompilation) = SourceGeneratorDriver.RunGeneratorWithCompilation(compilation); + + var errors = result.Diagnostics.Where(d => d.Severity == DiagnosticSeverity.Error).ToArray(); + Assert.Empty(errors); + + var compilationErrors = SourceGeneratorDriver.GetErrors(updatedCompilation); + Assert.Empty(compilationErrors); + + var generatedSource = result.GeneratedTrees + .Select(t => t.GetText().ToString()) + .FirstOrDefault(s => s.Contains("Sort4")); + Assert.NotNull(generatedSource); + Assert.Contains("if (e", generatedSource); + Assert.DoesNotContain("System.Math.Min", generatedSource); + Assert.DoesNotContain("System.Math.Max", generatedSource); + Assert.DoesNotContain("X86Base.IsSupported", generatedSource); + } + + [Theory] + [InlineData("char")] + public void BranchingSwap_EmittedForNonMinMaxTypes(string typeName) + { + var source = $@" +using SortingNetworks; + +[SortingNetwork(4, typeof({typeName}))] +public partial class MySorter {{ }} +"; + var compilation = SourceGeneratorDriver.CreateCompilation(source); + var (result, updatedCompilation) = SourceGeneratorDriver.RunGeneratorWithCompilation(compilation); + + var errors = result.Diagnostics.Where(d => d.Severity == DiagnosticSeverity.Error).ToArray(); + Assert.Empty(errors); + + var compilationErrors = SourceGeneratorDriver.GetErrors(updatedCompilation); + Assert.Empty(compilationErrors); + + var generatedSource = result.GeneratedTrees + .Select(t => t.GetText().ToString()) + .FirstOrDefault(s => s.Contains("Sort4")); + Assert.NotNull(generatedSource); + Assert.Contains("if (e", generatedSource); + Assert.DoesNotContain("System.Math.Min", generatedSource); + Assert.DoesNotContain("System.Math.Max", generatedSource); + } + [Fact] public void IncrementalCache_SameCompilation_OutputIsCached() { diff --git a/SortingNetworks/PublicAPI.Unshipped.txt b/SortingNetworks/PublicAPI.Unshipped.txt index 7dc5c58..3fef6a7 100644 --- a/SortingNetworks/PublicAPI.Unshipped.txt +++ b/SortingNetworks/PublicAPI.Unshipped.txt @@ -1 +1,3 @@ #nullable enable +SortingNetworks.SortingNetworkAttribute.Branchless.get -> bool +SortingNetworks.SortingNetworkAttribute.Branchless.set -> void diff --git a/SortingNetworks/SortingNetworkAttribute.cs b/SortingNetworks/SortingNetworkAttribute.cs index 06694f3..8826f95 100644 --- a/SortingNetworks/SortingNetworkAttribute.cs +++ b/SortingNetworks/SortingNetworkAttribute.cs @@ -20,6 +20,19 @@ public sealed class SortingNetworkAttribute : Attribute /// public Type ElementType { get; } + /// + /// Controls the compare-and-swap strategy for scalar sorting network methods. + /// When true, emits branchless Math.Min/Math.Max swaps + /// (optimal on x86/x64 where the JIT lowers to cmov). + /// When false, emits branching if/swap + /// (optimal on ARM where branch prediction outperforms csel chains). + /// When not set (default), the generator emits a runtime platform check + /// that selects the best strategy automatically. + /// Only applies to numeric types with Math.Min/Math.Max overloads; + /// ignored for char, string, and custom types. + /// + public bool Branchless { get; set; } + /// /// Initializes a new instance of the class. /// From 57acd68bb77238b08e4b658f6bcd17b540901136 Mon Sep 17 00:00:00 2001 From: Jonathan Peppers Date: Sun, 3 May 2026 10:39:45 -0500 Subject: [PATCH 4/4] Update README benchmarks with platform-adaptive scalar results Add int scalar sizes 23-32 benchmark table showing platform-specific results across Ubuntu x64, Windows x64, and macOS ARM. Update ARM64 int detailed results with latest CI run data. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 487bbd3..0542073 100644 --- a/README.md +++ b/README.md @@ -593,17 +593,39 @@ multi-stage TBL overhead exceeds SIMD benefit at these sizes: | Size | Kind | GeneratedSort | Ratio vs ArraySort | |---|---|---|---| -| 27 | Random | 74 ns | **0.74x** (26% faster) | -| 27 | Sorted | 30 ns | **0.52x** (48% faster) | -| 27 | Reversed | 78 ns | 1.22x | -| 27 | Duplicates | 77 ns | **0.72x** (28% faster) | -| 28 | Random | 80 ns | **0.65x** (35% faster) | -| 28 | Sorted | 30 ns | **0.51x** (49% faster) | -| 28 | Reversed | 73 ns | 1.15x | -| 28 | Duplicates | 80 ns | **0.73x** (27% faster) | +| 27 | Random | 74 ns | **0.76x** (24% faster) | +| 27 | Sorted | 28 ns | **0.52x** (48% faster) | +| 27 | Reversed | 73 ns | 1.21x | +| 27 | Duplicates | 77 ns | **0.77x** (23% faster) | +| 28 | Random | 72 ns | **0.60x** (40% faster) | +| 28 | Sorted | 30 ns | **0.52x** (48% faster) | +| 28 | Reversed | 72 ns | 1.17x | +| 28 | Duplicates | 71 ns | **0.68x** (32% faster) | > With AVX2 SIMD, GeneratedSort is consistently faster than Array.Sort for `int` across all input patterns. On ARM64, the early-exit sorted check makes sorted input ~2x faster than ArraySort. Reversed input is slightly slower due to the overhead of cross-vector TBL/TBX shuffles with 7 registers. +### int scalar sizes 23-32 (platform-adaptive) + +For sizes outside the SIMD range, the scalar unrolled network uses a runtime +`X86Base.IsSupported` check: branchless `Math.Min`/`Math.Max` on x86 (JIT +lowers to `cmov`), branching `if/swap` on ARM (where branch prediction +outperforms `csel` data-dependency chains): + +| Size | Ubuntu x64 (EPYC) | Windows x64 (EPYC) | macOS ARM (M1) | +|---|---|---|---| +| 23 | **0.82x** (18% faster) | **0.68x** (32% faster) | **0.75x** (25% faster) | +| 24 | **0.73x** (27% faster) | 1.02x | **0.77x** (23% faster) | +| 25 | **0.87x** (13% faster) | **0.85x** (15% faster) | **0.75x** (25% faster) | +| 26 | **0.81x** (19% faster) | **0.84x** (16% faster) | **0.76x** (24% faster) | +| 29 | **0.95x** | — | — | +| 30 | **0.93x** | — | — | +| 31 | **0.75x** (25% faster) | — | — | +| 32 | **0.78x** (22% faster) | — | — | + +> **Note:** The `Branchless` attribute property can force one strategy on all +> platforms: `[SortingNetwork(27, typeof(int), Branchless = true)]` for +> branchless-only, `Branchless = false` for branching-only. + ### Sizes 33-64 (x86, scalar unrolled) Networks for sizes 33-64 use best-known networks from [Dobbelaere's SorterHunter](https://github.com/bertdobbelaere/SorterHunter).