From b33b130916760fc86a027dd832cfd0b6ffe0dcdc Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Fri, 27 Feb 2026 09:54:40 -0800
Subject: [PATCH] Remove legacy indexer fallback

---
 csrc/index_compute.cpp | 189 +++++++++++++++--------
 csrc/index_compute.h   |   1 +
 2 files changed, 69 insertions(+), 121 deletions(-)

diff --git a/csrc/index_compute.cpp b/csrc/index_compute.cpp
index 1829afa8138..512104d777a 100644
--- a/csrc/index_compute.cpp
+++ b/csrc/index_compute.cpp
@@ -1413,52 +1413,42 @@ std::vector<Val*> Index::getNonGlobalProducerStridedIndices(
 Val* Index::getLinearLogicalIndex(
     TensorView* consumer_tv,
     const std::vector<ForLoop*>& loops) {
-  if (!ir_utils::hasRootToLoopLinearTransformations(consumer_tv) ||
-      ir_utils::isCpAsyncBulkLoad(consumer_tv->definition()) ||
-      GpuLower::current()->idModelOptions().isTensorIndexerEnabled() ||
-      GpuLower::current()->tmemInfo().hasTMemTensor()) {
-    const TensorIndexer& indexer = GpuLower::current()->tensorIndexer();
-    auto per_dim_indices = indexer.getIndexFor(
-        consumer_tv->definition(),
-        /*as_consumer=*/true,
-        consumer_tv->getLogicalDomain(),
-        loops,
-        /*use_magic_zero=*/true);
-    Val* stride = consumer_tv->fusion()->oneVal();
-    for (const auto [i, logical_id] :
-         enumerate(consumer_tv->getLogicalDomain()) | std::views::reverse) {
-      auto per_dim_index = per_dim_indices.at(i);
-      auto per_dim_strided_index =
-          SimplifyingIrBuilder::mulExpr(per_dim_index, stride);
-      per_dim_indices.at(i) = per_dim_strided_index;
-      stride = SimplifyingIrBuilder::mulExpr(stride, logical_id->extent());
-    }
-    return sumVals(per_dim_indices);
-  } else {
-    auto guard = ir_utils::allocateToLogicalDomainGuard(consumer_tv, true);
-    return sumVals(getGlobalConsumerStridedIndices(consumer_tv, loops));
-  }
+  NVF_ERROR(
+      GpuLower::current()->idModelOptions().isTensorIndexerEnabled(),
+      "Legacy indexer no longer available");
+
+  const TensorIndexer& indexer = GpuLower::current()->tensorIndexer();
+  auto per_dim_indices = indexer.getIndexFor(
+      consumer_tv->definition(),
+      /*as_consumer=*/true,
+      consumer_tv->getLogicalDomain(),
+      loops,
+      /*use_magic_zero=*/true);
+  Val* stride = consumer_tv->fusion()->oneVal();
+  for (const auto [i, logical_id] :
+       enumerate(consumer_tv->getLogicalDomain()) | std::views::reverse) {
+    auto per_dim_index = per_dim_indices.at(i);
+    auto per_dim_strided_index =
+        SimplifyingIrBuilder::mulExpr(per_dim_index, stride);
+    per_dim_indices.at(i) = per_dim_strided_index;
+    stride = SimplifyingIrBuilder::mulExpr(stride, logical_id->extent());
+  }
+  return sumVals(per_dim_indices);
 }
 
 std::vector<Val*> Index::getConsumerPerDimLogicalIndex(
     TensorView* consumer_tv,
     const std::vector<ForLoop*>& loops) {
-  if (!ir_utils::hasRootToLoopLinearTransformations(consumer_tv) ||
-      GpuLower::current()->idModelOptions().isTensorIndexerEnabled() ||
-      GpuLower::current()->tmemInfo().hasTMemTensor()) {
-    const TensorIndexer& indexer = GpuLower::current()->tensorIndexer();
-    return indexer.getIndexFor(
-        consumer_tv->definition(),
-        /*as_consumer=*/true,
-        consumer_tv->getLogicalDomain(),
-        loops);
-  } else {
-    auto guard = ir_utils::allocateToLogicalDomainGuard(consumer_tv, false);
-    IndexFromIdGraph index_from_id_graph =
-        getTensorIndexFromIdGraph(loops, consumer_tv);
-    return getConsumerAllocationIndices(
-        consumer_tv, loops, index_from_id_graph);
-  }
+  NVF_ERROR(
+      GpuLower::current()->idModelOptions().isTensorIndexerEnabled(),
+      "Legacy indexer no longer available");
+
+  const TensorIndexer& indexer = GpuLower::current()->tensorIndexer();
+  return indexer.getIndexFor(
+      consumer_tv->definition(),
+      /*as_consumer=*/true,
+      consumer_tv->getLogicalDomain(),
+      loops);
 }
 
 std::vector<Val*> Index::getProducerPerDimLogicalIndex(
@@ -1913,38 +1903,6 @@ Val* Index::getProducerStridedIndices(
   }
 }
 
-namespace {
-
-bool shouldUseTensorIndexer(
-    const TensorView* producer,
-    const TensorView* consumer) {
-  // Check if TensorIndexer is definitely required
-  auto is_tensor_indexer_required = [&]() -> bool {
-    bool is_producer_tma_op = producer->definition() != nullptr &&
-        producer->definition()->isA<LoadStoreOp>() &&
-        ir_utils::isCpAsyncBulkLoad(producer->definition());
-    bool is_consumer_tma_op = consumer->definition() != nullptr &&
-        consumer->definition()->isA<LoadStoreOp>() &&
-        ir_utils::isCpAsyncBulkLoad(consumer->definition());
-
-    return !ir_utils::hasRootToLoopLinearTransformations(producer) ||
-        (consumer->definition()->isA<MmaOp>() &&
-         isHopper(consumer->definition()->as<MmaOp>()->macro())) ||
-        is_producer_tma_op || is_consumer_tma_op ||
-        GpuLower::current()->tmemInfo().hasTMemTensor();
-  };
-
-  // TensorIndexer is always used when required or if not disabled.
-  // Note: Previously, ldmatrix and stmatrix were first introduced
-  // with Ampere, their indexing were only implemented in the legacy
-  // indexer in a rather manual way. The current implementation uses
-  // the alternate loop domain to enable TensorIndexer-based indexing.
-  return is_tensor_indexer_required() ||
-      GpuLower::current()->idModelOptions().isTensorIndexerEnabled();
-}
-
-} // namespace
-
 // Producer is the inputs of an expression
 kir::TensorIndex* Index::getProducerIndex(
     TensorView* producer,
@@ -1954,28 +1912,25 @@ kir::TensorIndex* Index::getProducerIndex(
     bool generate_pointer,
     DataType as_type,
     bool ld_st_matrix) {
-  Val* index = nullptr;
-
-  if (shouldUseTensorIndexer(producer, consumer)) {
-    index = GpuLower::current()->tensorIndexer().getLinearIndex(
-        producer, consumer->definition(), loops, override_index, ld_st_matrix);
-    if (generate_pointer) {
-      auto address_offset = index;
-      if (producer->getMemoryType() == MemoryType::Shared) {
-        auto producer_dt = producer->getDataType();
-        NVF_ERROR(producer_dt.has_value());
-        auto index_dt = index->getDataType();
-        NVF_ERROR(index_dt.has_value());
-        address_offset = SimplifyingIrBuilder::mulExpr(
-            address_offset,
-            IrBuilder::create<Val>(dataTypeSizeByte(*producer_dt), *index_dt));
-      }
-      index = SimplifyingIrBuilder::addExpr(
-          IrBuilder::baseAddressExpr(producer), address_offset);
-    }
-  } else {
-    index = getProducerStridedIndices(
-        producer, consumer, loops, override_index, generate_pointer);
+  NVF_ERROR(
+      GpuLower::current()->idModelOptions().isTensorIndexerEnabled(),
+      "Legacy indexer no longer available");
+
+  Val* index = GpuLower::current()->tensorIndexer().getLinearIndex(
+      producer, consumer->definition(), loops, override_index, ld_st_matrix);
+  if (generate_pointer) {
+    auto address_offset = index;
+    if (producer->getMemoryType() == MemoryType::Shared) {
+      auto producer_dt = producer->getDataType();
+      NVF_ERROR(producer_dt.has_value());
+      auto index_dt = index->getDataType();
+      NVF_ERROR(index_dt.has_value());
+      address_offset = SimplifyingIrBuilder::mulExpr(
+          address_offset,
+          IrBuilder::create<Val>(dataTypeSizeByte(*producer_dt), *index_dt));
+    }
+    index = SimplifyingIrBuilder::addExpr(
+        IrBuilder::baseAddressExpr(producer), address_offset);
   }
 
   index = GpuLower::current()->commonScalarMap().hoistScalar(index, loops);
@@ -2050,33 +2005,25 @@ kir::TensorIndex* Index::getConsumerIndex(
     bool generate_pointer,
     DataType as_type,
     bool ld_st_matrix) {
-  Val* index = nullptr;
-  if (!ir_utils::hasRootToLoopLinearTransformations(consumer) ||
-      ir_utils::isCpAsyncBulkLoad(consumer->definition()) ||
-      GpuLower::current()->idModelOptions().isTensorIndexerEnabled() ||
-      GpuLower::current()->tmemInfo().hasTMemTensor()) {
-    index = GpuLower::current()->tensorIndexer().getLinearIndex(
-        consumer, consumer->definition(), loops, override_index, ld_st_matrix);
-    if (generate_pointer) {
-      auto address_offset = index;
-      if (consumer->getMemoryType() == MemoryType::Shared) {
-        auto consumer_dt = consumer->getDataType();
-        NVF_ERROR(consumer_dt.has_value());
-        auto index_dt = index->getDataType();
-        NVF_ERROR(index_dt.has_value());
-        address_offset = SimplifyingIrBuilder::mulExpr(
-            index,
-            IrBuilder::create<Val>(dataTypeSizeByte(*consumer_dt), *index_dt));
-      }
-      index = SimplifyingIrBuilder::addExpr(
-          IrBuilder::baseAddressExpr(consumer), address_offset);
+  NVF_ERROR(
+      GpuLower::current()->idModelOptions().isTensorIndexerEnabled(),
+      "Legacy indexer no longer available");
+
+  Val* index = GpuLower::current()->tensorIndexer().getLinearIndex(
+      consumer, consumer->definition(), loops, override_index, ld_st_matrix);
+  if (generate_pointer) {
+    auto address_offset = index;
+    if (consumer->getMemoryType() == MemoryType::Shared) {
+      auto consumer_dt = consumer->getDataType();
+      NVF_ERROR(consumer_dt.has_value());
+      auto index_dt = index->getDataType();
+      NVF_ERROR(index_dt.has_value());
+      address_offset = SimplifyingIrBuilder::mulExpr(
+          index,
+          IrBuilder::create<Val>(dataTypeSizeByte(*consumer_dt), *index_dt));
     }
-  } else {
-    NVF_ERROR(
-        override_index.empty(),
-        "Overriding of consumer indexing with the legacy indexer is not "
-        "supported");
-    index = getConsumerStridedIndices(consumer, loops, generate_pointer);
+    index = SimplifyingIrBuilder::addExpr(
+        IrBuilder::baseAddressExpr(consumer), address_offset);
   }
 
   index = GpuLower::current()->commonScalarMap().hoistScalar(index, loops);
diff --git a/csrc/index_compute.h b/csrc/index_compute.h
index 544388557c0..8ee430bd373 100644
--- a/csrc/index_compute.h
+++ b/csrc/index_compute.h
@@ -480,6 +480,7 @@ class Index {
       const std::unordered_map<IterDomain*, Val*>& override_index = {},
      bool generate_pointer = false);
 
+  // TODO: Remove
   //! Returns a vector of strided indices mapped onto the
   //! allocation domain of a consumer tensor. The size of the returned
   //! vector is guaranteed to be equal to the number of axes of the
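
Review note, not part of the patch itself: the same NVF_ERROR precondition is now repeated verbatim in getLinearLogicalIndex, getConsumerPerDimLogicalIndex, getProducerIndex, and getConsumerIndex. Below is a minimal sketch of how the check could be factored into a shared helper; it uses only calls that already appear in this diff, and the helper name is hypothetical.

    namespace {
    // Hypothetical helper (not in this patch): asserts the precondition that
    // every indexing entry point now requires, namely that lowering runs with
    // the IdModel-based TensorIndexer enabled.
    void assertTensorIndexerEnabled() {
      NVF_ERROR(
          GpuLower::current()->idModelOptions().isTensorIndexerEnabled(),
          "Legacy indexer no longer available");
    }
    } // namespace

Each of the four entry points would then open with assertTensorIndexerEnabled() instead of the inlined three-line check, keeping the error message in one place if it ever changes.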