From 3f5d7d1123c38dec3f0119cc610dc8e0a1b4089f Mon Sep 17 00:00:00 2001 From: ThomasNing Date: Wed, 28 Jan 2026 10:25:48 -0600 Subject: [PATCH 1/3] add padding to cshuffle epilogue to avoid bank conflict --- .../ops/epilogue/cshuffle_epilogue.hpp | 118 ++++++++++++++++-- 1 file changed, 111 insertions(+), 7 deletions(-) diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 4f636b59625..a7238b4d853 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -296,19 +296,123 @@ struct CShuffleEpilogue template CK_TILE_HOST_DEVICE static constexpr auto MakeLdsBlockDescriptor() { + constexpr auto DataTypeSize = sizeof(ODataType); + constexpr index_t VectorLen = GetVectorSizeC(); + constexpr index_t BytesPerBank = 4; + constexpr index_t Banks = get_n_lds_banks(); + if constexpr(std::is_same_v) + { + static_assert(NPerIterationShuffle % VectorLen == 0, + "NPerIterationShuffle must be divisible by VectorLen."); + } + else if constexpr(std::is_same_v) + { + static_assert(MPerIterationShuffle % VectorLen == 0, + "MPerIterationShuffle must be divisible by VectorLen."); + } + constexpr auto compute_padding = [](index_t stride_elems) constexpr -> index_t { + constexpr index_t elem_bytes = sizeof(ODataType); + constexpr index_t banks = get_n_lds_banks(); + constexpr index_t bytes_per_bk = 4; + + const index_t stride_bytes = stride_elems * elem_bytes; + const bool bad_stride = (stride_bytes % (banks * bytes_per_bk) == 0); + + return bad_stride ? GetVectorSizeC() : 0; + }; // N is contiguous dimension if constexpr(std::is_same_v) { - return make_naive_tensor_descriptor( - make_tuple(number{}, number{}), - make_tuple(number{}, number<1>{})); + // Layering spreads row-start addresses across banks when N is small (or stride + // periodic). + constexpr index_t MLdsLayerRequired = + Banks * BytesPerBank / (NPerIterationShuffle * DataTypeSize); + + constexpr index_t MLdsLayer = max(index_t{1}, MLdsLayerRequired); + + static_assert(MPerIterationShuffle % MLdsLayer == 0, + "MPerIterationShuffle must be divisible by MLdsLayer."); + + // Decide padding based on the row stride *without* padding (in elements). + // Here stride is NPerIterationShuffle * MLdsLayer. + constexpr index_t PaddingAmount = compute_padding(NPerIterationShuffle * MLdsLayer); + + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, + number<(NPerIterationShuffle / VectorLen) * MLdsLayer>{}, + number{}), + make_tuple(number{}, + number{}, + number<1>{}), + number{}, + number<1>{}); + + // Split the middle dim into (layer, N/VectorLen) + constexpr auto lds_block_desc_1 = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple(make_pass_through_transform(number{}), + make_unmerge_transform(make_tuple( + number{}, number{})), + make_pass_through_transform(number{})), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), + make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{})); + + // Merge back to logical (M,N) while preserving the physical bank-friendly layout + constexpr auto lds_block_desc = transform_tensor_descriptor( + lds_block_desc_1, + make_tuple(make_merge_transform_v3_division_mod(make_tuple( + number{}, number{})), + make_merge_transform_v3_division_mod(make_tuple( + number{}, number{}))), + make_tuple(sequence<0, 1>{}, sequence<2, 3>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return lds_block_desc; } // M is contiguous dimension else if constexpr(std::is_same_v) { - return make_naive_tensor_descriptor( - make_tuple(number{}, number{}), - make_tuple(number<1>{}, number{})); + constexpr index_t NLdsLayerRequired = + Banks * BytesPerBank / (MPerIterationShuffle * DataTypeSize); + + constexpr index_t NLdsLayer = max(index_t{1}, NLdsLayerRequired); + + static_assert(NPerIterationShuffle % NLdsLayer == 0, + "NPerIterationShuffle must be divisible by NLdsLayer."); + + // Decide padding based on the row stride (now MPerIterationShuffle * NLdsLayer). + constexpr index_t PaddingAmount = compute_padding(MPerIterationShuffle * NLdsLayer); + + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, + number<(MPerIterationShuffle / VectorLen) * NLdsLayer>{}, + number{}), + make_tuple(number{}, + number{}, + number<1>{}), + number{}, + number<1>{}); + + // Split the middle dim into (layer, M/VectorLen) + constexpr auto lds_block_desc_1 = transform_tensor_descriptor( + lds_block_desc_0, + make_tuple(make_pass_through_transform(number{}), + make_unmerge_transform(make_tuple( + number{}, number{})), + make_pass_through_transform(number{})), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), + make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{})); + + constexpr auto lds_block_desc = transform_tensor_descriptor( + lds_block_desc_1, + make_tuple(make_merge_transform_v3_division_mod(make_tuple( + number{}, number{})), + make_merge_transform_v3_division_mod(make_tuple( + number{}, number{}))), + make_tuple(sequence<0, 1>{}, sequence<2, 3>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return lds_block_desc; } else { @@ -697,7 +801,7 @@ struct CShuffleEpilogue MPerIterationShuffle, NPerIterationShuffle, GetVectorSizeC(), - tile_distribution_pattern::thread_raked, + tile_distribution_pattern::warp_raked, Problem::kNumWaveGroups>; constexpr auto dram_tile_distribution = TileEncodingPattern::make_2d_static_tile_distribution(); From 5b6cbd329c8e0cee3fc9891ea5f514f88e8b9bc6 Mon Sep 17 00:00:00 2001 From: ThomasNing Date: Wed, 28 Jan 2026 14:31:54 -0600 Subject: [PATCH 2/3] Fix gfx942 and 90a --- .../ops/epilogue/cshuffle_epilogue.hpp | 65 +++++-------------- 1 file changed, 16 insertions(+), 49 deletions(-) diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index a7238b4d853..040b8a4eb81 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -296,50 +296,27 @@ struct CShuffleEpilogue template CK_TILE_HOST_DEVICE static constexpr auto MakeLdsBlockDescriptor() { - constexpr auto DataTypeSize = sizeof(ODataType); - constexpr index_t VectorLen = GetVectorSizeC(); - constexpr index_t BytesPerBank = 4; - constexpr index_t Banks = get_n_lds_banks(); - if constexpr(std::is_same_v) - { - static_assert(NPerIterationShuffle % VectorLen == 0, - "NPerIterationShuffle must be divisible by VectorLen."); - } - else if constexpr(std::is_same_v) - { - static_assert(MPerIterationShuffle % VectorLen == 0, - "MPerIterationShuffle must be divisible by VectorLen."); - } - constexpr auto compute_padding = [](index_t stride_elems) constexpr -> index_t { - constexpr index_t elem_bytes = sizeof(ODataType); - constexpr index_t banks = get_n_lds_banks(); - constexpr index_t bytes_per_bk = 4; + constexpr auto DataTypeSize = sizeof(ODataType); + constexpr index_t VectorLen = GetVectorSizeC(); + constexpr index_t Banks = get_n_lds_banks(); - const index_t stride_bytes = stride_elems * elem_bytes; - const bool bad_stride = (stride_bytes % (banks * bytes_per_bk) == 0); - - return bad_stride ? GetVectorSizeC() : 0; - }; + // calculate how many elements to pad to avoid bank conflict +#if defined(__gfx950__) + constexpr auto PaddingAmount = VectorLen; +#else + constexpr auto PaddingAmount = 0; +#endif + constexpr index_t BytesPerBank = 4; // N is contiguous dimension if constexpr(std::is_same_v) { - // Layering spreads row-start addresses across banks when N is small (or stride - // periodic). constexpr index_t MLdsLayerRequired = - Banks * BytesPerBank / (NPerIterationShuffle * DataTypeSize); - - constexpr index_t MLdsLayer = max(index_t{1}, MLdsLayerRequired); - - static_assert(MPerIterationShuffle % MLdsLayer == 0, - "MPerIterationShuffle must be divisible by MLdsLayer."); - - // Decide padding based on the row stride *without* padding (in elements). - // Here stride is NPerIterationShuffle * MLdsLayer. - constexpr index_t PaddingAmount = compute_padding(NPerIterationShuffle * MLdsLayer); + Banks * BytesPerBank / NPerIterationShuffle / DataTypeSize; + constexpr auto MLdsLayer = max(1, MLdsLayerRequired); constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( make_tuple(number{}, - number<(NPerIterationShuffle / VectorLen) * MLdsLayer>{}, + number{}, number{}), make_tuple(number{}, number{}, @@ -347,7 +324,6 @@ struct CShuffleEpilogue number{}, number<1>{}); - // Split the middle dim into (layer, N/VectorLen) constexpr auto lds_block_desc_1 = transform_tensor_descriptor( lds_block_desc_0, make_tuple(make_pass_through_transform(number{}), @@ -357,7 +333,6 @@ struct CShuffleEpilogue make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{})); - // Merge back to logical (M,N) while preserving the physical bank-friendly layout constexpr auto lds_block_desc = transform_tensor_descriptor( lds_block_desc_1, make_tuple(make_merge_transform_v3_division_mod(make_tuple( @@ -373,19 +348,12 @@ struct CShuffleEpilogue else if constexpr(std::is_same_v) { constexpr index_t NLdsLayerRequired = - Banks * BytesPerBank / (MPerIterationShuffle * DataTypeSize); - - constexpr index_t NLdsLayer = max(index_t{1}, NLdsLayerRequired); - - static_assert(NPerIterationShuffle % NLdsLayer == 0, - "NPerIterationShuffle must be divisible by NLdsLayer."); - - // Decide padding based on the row stride (now MPerIterationShuffle * NLdsLayer). - constexpr index_t PaddingAmount = compute_padding(MPerIterationShuffle * NLdsLayer); + get_n_lds_banks() * BytesPerBank / MPerIterationShuffle / DataTypeSize; + constexpr auto NLdsLayer = max(1, NLdsLayerRequired); constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( make_tuple(number{}, - number<(MPerIterationShuffle / VectorLen) * NLdsLayer>{}, + number{}, number{}), make_tuple(number{}, number{}, @@ -393,7 +361,6 @@ struct CShuffleEpilogue number{}, number<1>{}); - // Split the middle dim into (layer, M/VectorLen) constexpr auto lds_block_desc_1 = transform_tensor_descriptor( lds_block_desc_0, make_tuple(make_pass_through_transform(number{}), From 24baa5245fbf894c167f5269d96008eaf7331fc3 Mon Sep 17 00:00:00 2001 From: ThomasNing Date: Wed, 28 Jan 2026 23:15:21 -0600 Subject: [PATCH 3/3] add up the padding algorithm --- .../ops/epilogue/cshuffle_epilogue.hpp | 46 +++++++++++++++---- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 040b8a4eb81..f9bd18e0be6 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -298,22 +298,33 @@ struct CShuffleEpilogue { constexpr auto DataTypeSize = sizeof(ODataType); constexpr index_t VectorLen = GetVectorSizeC(); - constexpr index_t Banks = get_n_lds_banks(); + constexpr index_t banks = get_n_lds_banks(); - // calculate how many elements to pad to avoid bank conflict -#if defined(__gfx950__) - constexpr auto PaddingAmount = VectorLen; -#else - constexpr auto PaddingAmount = 0; -#endif constexpr index_t BytesPerBank = 4; + // N is contiguous dimension if constexpr(std::is_same_v) { constexpr index_t MLdsLayerRequired = - Banks * BytesPerBank / NPerIterationShuffle / DataTypeSize; + banks * BytesPerBank / NPerIterationShuffle / DataTypeSize; constexpr auto MLdsLayer = max(1, MLdsLayerRequired); + constexpr index_t BaseStrideElems = NPerIterationShuffle * MLdsLayer; + static_assert((BaseStrideElems * DataTypeSize) % BytesPerBank == 0, + "LDS row stride must be 4B-aligned for bank-word padding logic"); + // calculate how many elements to pad to avoid bank conflict +#if defined(__gfx950__) + constexpr index_t ElemsPer4B = BytesPerBank / ck_tile::gcd(BytesPerBank, DataTypeSize); + constexpr auto ToWords = [](index_t elems) constexpr { + return (elems * DataTypeSize) / BytesPerBank; + }; + constexpr index_t BaseWords = ToWords(BaseStrideElems); + constexpr index_t PadWords = ((BaseWords % 2) == 0) ? 1 : 0; + constexpr auto PaddingAmount = PadWords * ElemsPer4B; +#else + constexpr auto PaddingAmount = 0; +#endif + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( make_tuple(number{}, number{}, @@ -351,6 +362,23 @@ struct CShuffleEpilogue get_n_lds_banks() * BytesPerBank / MPerIterationShuffle / DataTypeSize; constexpr auto NLdsLayer = max(1, NLdsLayerRequired); + constexpr index_t BaseStrideElems = MPerIterationShuffle * NLdsLayer; + + static_assert((BaseStrideElems * DataTypeSize) % BytesPerBank == 0, + "LDS row stride must be 4B-aligned for bank-word padding logic"); + +#if defined(__gfx950__) + constexpr index_t ElemsPer4B = BytesPerBank / ck_tile::gcd(BytesPerBank, DataTypeSize); + constexpr auto ToWords = [](index_t elems) constexpr { + return (elems * DataTypeSize) / BytesPerBank; + }; + constexpr index_t BaseWords = ToWords(BaseStrideElems); + constexpr index_t PadWords = ((BaseWords % 2) == 0) ? 1 : 0; + constexpr auto PaddingAmount = PadWords * ElemsPer4B; +#else + constexpr auto PaddingAmount = 0; +#endif + constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor( make_tuple(number{}, number{}, @@ -768,7 +796,7 @@ struct CShuffleEpilogue MPerIterationShuffle, NPerIterationShuffle, GetVectorSizeC(), - tile_distribution_pattern::warp_raked, + tile_distribution_pattern::thread_raked, Problem::kNumWaveGroups>; constexpr auto dram_tile_distribution = TileEncodingPattern::make_2d_static_tile_distribution();