From 87cb9a0c72d8e27fcded4a98e7b3f885ba0016f0 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Thu, 5 Mar 2026 12:51:04 +0800 Subject: [PATCH 01/10] feat(taskflow): integrate MapTaskOnCgraPass for multi-CGRA placements --- include/TaskflowDialect/TaskflowPasses.h | 6 + .../Transforms/MapTaskOnCgraPass.cpp | 183 +++++++++++++----- .../ResourceAwareTaskOptimizationPass.cpp | 7 + .../irregular-loop/irregular-loop.mlir | 6 +- .../taskflow/multi-nested/multi-nested.mlir | 19 +- .../parallel-nested/parallel-nested.mlir | 3 +- .../taskflow/resnet/simple_resnet_tosa.mlir | 18 +- .../resource-heavy/resource-heavy.mlir | 3 +- 8 files changed, 183 insertions(+), 62 deletions(-) diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index a23c5b02..866365eb 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -5,6 +5,7 @@ #include "TaskflowDialect/TaskflowDialect.h" #include "TaskflowDialect/TaskflowOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" @@ -23,6 +24,11 @@ std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); std::unique_ptr createMapTaskOnCgraPass(); +// Runs the CGRA task placement logic directly on a function. +// grid_rows/grid_cols default to 4x4 (kCgraGridRows/kCgraGridCols). 
+void runMapTaskOnCgra(mlir::func::FuncOp func, + int grid_rows = 4, int grid_cols = 4); + //=========================================================// // Optimization Passes //=========================================================// diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index c04df0b7..3ed417c4 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -407,25 +407,97 @@ class TaskMapper { } - /// Finds best placement for a task. - /// TODO: Currently defaults to single-CGRA placement. Multi-CGRA binding logic - /// (cgra_count > 1) is experimental/placeholder and should ideally be handled - /// by an upstream resource binding pass. + // Parses a tile_shape string like "2x2" or "2x2[(0,0)(1,0)(0,1)]". + // Returns a list of (col, row) offsets relative to the placement origin. + // For rectangular shapes "NxM", generates all NxM positions. + // For non-rectangular shapes with explicit positions, uses the listed coords. + SmallVector> parseTileShapeOffsets( + StringRef tile_shape, int cgra_count) { + SmallVector> offsets; + + if (tile_shape.empty() || cgra_count <= 1) { + offsets.push_back({0, 0}); + return offsets; + } + + // Checks for explicit position list: "NxM[(c0,r0)(c1,r1)...]" + size_t bracket_pos = tile_shape.find('['); + if (bracket_pos != StringRef::npos) { + StringRef positions_str = tile_shape.substr(bracket_pos); + // Parses each (c,r) pair. 
+ size_t pos = 0; + while (pos < positions_str.size()) { + size_t open = positions_str.find('(', pos); + if (open == StringRef::npos) break; + size_t close = positions_str.find(')', open); + if (close == StringRef::npos) break; + StringRef pair_str = positions_str.slice(open + 1, close); + auto [col_str, row_str] = pair_str.split(','); + int col_off = 0, row_off = 0; + col_str.getAsInteger(10, col_off); + row_str.getAsInteger(10, row_off); + offsets.push_back({col_off, row_off}); + pos = close + 1; + } + } else { + // Rectangular shape: "NxM" — parse rows × cols. + auto [rows_str, cols_str] = tile_shape.split('x'); + int rows = 1, cols = 1; + rows_str.getAsInteger(10, rows); + cols_str.getAsInteger(10, cols); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + offsets.push_back({c, r}); + } + } + } + + // Sanity: if parsing failed, at least return a single cell. + if (offsets.empty()) { + offsets.push_back({0, 0}); + } + return offsets; + } + + // Finds best placement for a task on the CGRA grid. + // For cgra_count > 1, reads the tile_shape attribute to determine the + // physical layout (rectangular or L/T-shape) and validates that all + // required positions fit within the grid boundary and are unoccupied. TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, TaskMemoryGraph &graph) { int best_score = INT_MIN; TaskPlacement best_placement; - // Baseline: For cgra_count=1, finds single best position. + // Reads tile_shape attribute if present. + StringRef tile_shape; + if (auto attr = task_node->op->getAttrOfType("tile_shape")) { + tile_shape = attr.getValue(); + } + + // Parses shape offsets from tile_shape string. + SmallVector> shape_offsets = + parseTileShapeOffsets(tile_shape, cgra_count); + + // Tries every valid placement origin on the grid. for (int r = 0; r < grid_rows_; ++r) { for (int c = 0; c < grid_cols_; ++c) { - if (occupied_[r][c]) { + // Checks if ALL positions in the shape fit within bounds and are free. 
+ bool valid = true; + TaskPlacement candidate; + for (auto &[col_off, row_off] : shape_offsets) { + int pr = r + row_off; + int pc = c + col_off; + if (pr < 0 || pr >= grid_rows_ || pc < 0 || pc >= grid_cols_ || + occupied_[pr][pc]) { + valid = false; + break; + } + candidate.cgra_positions.push_back({pr, pc}); + } + if (!valid) { continue; } - TaskPlacement candidate; - candidate.cgra_positions.push_back({r, c}); - int score = computeScore(task_node, candidate, graph); if (score > best_score) { best_score = score; @@ -436,21 +508,35 @@ class TaskMapper { // Error handling: No available position found (grid over-subscribed). if (best_placement.cgra_positions.empty()) { - assert(false && "No available CGRA position found (grid over-subscribed)."); + llvm::errs() << "[MapTaskOnCgra] WARNING: No valid placement for task " + << task_node->op.getTaskName() + << " with cgra_count=" << cgra_count + << " tile_shape=" << tile_shape << "\n"; + // Fallback: place on any single free cell. + for (int r = 0; r < grid_rows_ && best_placement.cgra_positions.empty(); ++r) { + for (int c = 0; c < grid_cols_ && best_placement.cgra_positions.empty(); ++c) { + if (!occupied_[r][c]) { + best_placement.cgra_positions.push_back({r, c}); + } + } + } + if (best_placement.cgra_positions.empty()) { + assert(false && "No available CGRA position found (grid over-subscribed)."); + } } return best_placement; } - /// Computes placement score based on Task-Memory Graph. - /// TODO: Introduce explicit 'direct_wires' attributes in the IR for - /// downstream hardware generators to configure fast bypass paths between - /// adjacent PEs with dependencies. - /// - /// Score = α·SSA_Dist + β·Mem_Dist. - /// - /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands). - /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs. + // Computes placement score based on Task-Memory Graph. 
+ // For multi-CGRA placements, uses the minimum distance from any position + // in the placement to the target, since adjacent CGRAs can communicate + // via fast bypass paths. + // + // Score = α·SSA_Dist + β·Mem_Dist. + // + // SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands). + // Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs. int computeScore(TaskNode *task_node, const TaskPlacement &placement, TaskMemoryGraph &graph) { // Weight constants (tunable). @@ -459,40 +545,44 @@ class TaskMapper { int ssa_score = 0; int mem_score = 0; - - CGRAPosition current_pos = placement.primary(); + + // Helper: minimum Manhattan distance from any position in this placement + // to a target position. + auto minDistToTarget = [&](const CGRAPosition &target) -> int { + int min_dist = INT_MAX; + for (const auto &pos : placement.cgra_positions) { + int d = pos.manhattanDistance(target); + min_dist = std::min(min_dist, d); + } + return min_dist; + }; // 1. SSA proximity (predecessors & successors). for (TaskNode *producer : task_node->ssa_operands) { - if (!producer->placement.empty()) { - int dist = current_pos.manhattanDistance(producer->placement[0]); - // Uses negative distance to penalize far-away placements. - ssa_score -= dist; - } + if (!producer->placement.empty()) { + int dist = minDistToTarget(producer->placement[0]); + ssa_score -= dist; + } } for (TaskNode *consumer : task_node->ssa_users) { - if (!consumer->placement.empty()) { - int dist = current_pos.manhattanDistance(consumer->placement[0]); - ssa_score -= dist; - } + if (!consumer->placement.empty()) { + int dist = minDistToTarget(consumer->placement[0]); + ssa_score -= dist; + } } // 2. Memory proximity. - // For read memrefs. 
for (MemoryNode *mem : task_node->read_memrefs) { - if (mem->assigned_sram_pos) { - int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); - mem_score -= dist; - } + if (mem->assigned_sram_pos) { + int dist = minDistToTarget(*mem->assigned_sram_pos); + mem_score -= dist; + } } - // For write memrefs. - // If we write to a memory that is already assigned (e.g. read by previous task), - // we want to be close to it too. for (MemoryNode *mem : task_node->write_memrefs) { - if (mem->assigned_sram_pos) { - int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); - mem_score -= dist; - } + if (mem->assigned_sram_pos) { + int dist = minDistToTarget(*mem->assigned_sram_pos); + mem_score -= dist; + } } return kAlpha * ssa_score + kBeta * mem_score; @@ -564,8 +654,8 @@ struct MapTaskOnCgraPass void runOnOperation() override { func::FuncOp func = getOperation(); - constexpr int kDefaultGridRows = 3; - constexpr int kDefaultGridCols = 3; + constexpr int kDefaultGridRows = 4; + constexpr int kDefaultGridCols = 4; TaskMapper mapper(kDefaultGridRows, kDefaultGridCols); mapper.place(func); } @@ -580,5 +670,10 @@ std::unique_ptr createMapTaskOnCgraPass() { return std::make_unique(); } +void runMapTaskOnCgra(func::FuncOp func, int grid_rows, int grid_cols) { + TaskMapper mapper(grid_rows, grid_cols); + mapper.place(func); +} + } // namespace taskflow } // namespace mlir diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index c5052b83..2ce2c18e 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -1768,6 +1768,13 @@ struct ResourceAwareTaskOptimizationPass std::string shape_str = node->shape.irAttr(); node->op->setAttr("tile_shape", b.getStringAttr(shape_str)); } + + // Runs MapTaskOnCgraPass to 
produce global placement (task_mapping_info) + // with multi-CGRA support. The pass reads cgra_count and tile_shape + // from each task and places them on the 4x4 grid, validating that + // shapes physically fit and don't overlap. + taskflow::runMapTaskOnCgra(func, kCgraGridRows, kCgraGridCols); + break; } } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 364bcadc..13c56ab1 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -395,6 +395,8 @@ module attributes {} { // 0=Task_0_Task_1_utilfused, 1=Task_2; 2/16 CGRAs used // RESOPT: taskflow.task @Task_0_Task_1_utilfused -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 3 : i32, steps = 5 : i32, tile_shape = "1x1", trip_count = 32 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 3 : i32, steps = 5 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 32 : i32} // RESOPT: taskflow.task @Task_2 -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32, tile_shape = "1x1", trip_count = 32 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 32 : i32} diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index 42f99361..c4e7b76c 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -523,22 +523,25 @@ module attributes {} { // HYPERBLOCK-NEXT:} // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], 
read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 0 : i32}]} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 0 : i32}]} // PLACEMENT: taskflow.task @Task_2 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}, {col = 0 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 3 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}, {col = 2 : i32, row = 0 : i32}, {col = 3 : i32, row = 0 : i32}], write_sram_locations = [{col = 3 : i32, row = 0 : i32}]} // PLACEMENT: taskflow.task @Task_3 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} // PLACEMENT: taskflow.task @Task_4 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 
: i32}], read_sram_locations = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} // RESOPT: taskflow.task @Task_1 -// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 160 : i32 +// RESOPT-SAME: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 160 : i32 // RESOPT: taskflow.task @Task_0_Task_2_fused_Task_3_utilfused -// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 5 : i32, tile_shape = "1x1", trip_count = 192 : i32 +// RESOPT-SAME: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 5 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 192 : i32 // RESOPT: taskflow.task @Task_4 -// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 36 : i32 +// RESOPT-SAME: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 36 : i32 // RESOPT: return // CGRA Tile Occupation after RESOPT (4x4 grid, col x row): diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index 3d63f767..881d81ec 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -161,7 +161,8 @@ module { // PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 0 : i32}]} // RESOPT: taskflow.task @Task_0_Task_1_utilfused -// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 64 : i32 +// RESOPT-SAME: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 64 : i32 
// RESOPT: return // CGRA Tile Occupation after RESOPT (4x4 grid, col x row): diff --git a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir index f1741b0a..f6974c26 100644 --- a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir +++ b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir @@ -704,17 +704,23 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // RESOPT: taskflow.task @Task_1_Task_0_Task_2_utilfused_utilfused -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 4 : i32, steps = 3 : i32, tile_shape = "1x1", trip_count = 6400 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 4 : i32, steps = 3 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 6400 : i32} // RESOPT: taskflow.task @Task_3 -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32, tile_shape = "1x1", trip_count = 2359296 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 2359296 : i32} // RESOPT: taskflow.task @Task_4_Task_5_fused_Task_7_utilfused -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32, tile_shape = "1x1", trip_count = 6400 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 6400 : i32} // RESOPT: taskflow.task @Task_6_Task_8_utilfused -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 3 : i32, tile_shape = "1x1", trip_count = 4096 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 3 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 4096 : i32} // RESOPT: taskflow.task @Task_9 -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32, tile_shape = "1x1", trip_count = 2359296 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32 +// RESOPT-SAME: 
tile_shape = "1x1", trip_count = 2359296 : i32} // RESOPT: taskflow.task @Task_10_Task_11_Task_12_fused_fused -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 8 : i32, tile_shape = "1x1", trip_count = 4096 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 8 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 4096 : i32} // RESOPT: return diff --git a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir index ffc37f2d..3e253b8c 100644 --- a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir +++ b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir @@ -192,7 +192,8 @@ module { // TASKFLOW: return // RESOPT: taskflow.task @Task_0_Task_1_utilfused -// RESOPT-SAME: {cgra_count = 3 : i32, compiled_ii = 1 : i32, steps = 10 : i32, tile_shape = "2x2[(0,0)(1,0)(0,1)]", trip_count = 64 : i32} +// RESOPT-SAME: {cgra_count = 3 : i32, compiled_ii = 1 : i32, steps = 10 : i32 +// RESOPT-SAME: tile_shape = "2x2[(0,0)(1,0)(0,1)]", trip_count = 64 : i32} // RESOPT: return // CGRA Tile Occupation after RESOPT (4x4 grid, col x row): From 273610ac91ec32639ab6c99033641f4798eab747 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Thu, 5 Mar 2026 13:33:27 +0800 Subject: [PATCH 02/10] Refined fallback logic --- .../Transforms/MapTaskOnCgraPass.cpp | 110 +++++++++++++++++- 1 file changed, 106 insertions(+), 4 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index 3ed417c4..c321545f 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -22,7 +22,9 @@ #include #include #include +#include #include +#include #include #include @@ -453,6 +455,8 @@ class TaskMapper { } // Sanity: if parsing failed, at least return a single cell. + // USER COMMENT: Assert here. 
+ assert(!offsets.empty() && "tile_shape parsing yielded empty offsets"); if (offsets.empty()) { offsets.push_back({0, 0}); } @@ -513,13 +517,111 @@ class TaskMapper { << " with cgra_count=" << cgra_count << " tile_shape=" << tile_shape << "\n"; // Fallback: place on any single free cell. - for (int r = 0; r < grid_rows_ && best_placement.cgra_positions.empty(); ++r) { - for (int c = 0; c < grid_cols_ && best_placement.cgra_positions.empty(); ++c) { - if (!occupied_[r][c]) { - best_placement.cgra_positions.push_back({r, c}); + // USER COMMENT: The logic should be: 1.tires rectangular shape 2. If fails, try other shapes 3. If all fail, fallback to current cell -1 numbers of cells. + for (int k = cgra_count; k >= 1 && best_placement.cgra_positions.empty(); --k) { + + // 1. Try rectangular shapes of size k + SmallVector>> rect_shapes; + for (int r = 1; r <= k; ++r) { + if (k % r == 0) { + int c = k / r; + SmallVector> shape; + for (int i = 0; i < r; ++i) { + for (int j = 0; j < c; ++j) { + shape.push_back({j, i}); + } + } + rect_shapes.push_back(shape); + } + } + + int current_best_score = INT_MIN; + TaskPlacement current_best_placement; + + for (const auto &shape : rect_shapes) { + for (int r = 0; r < grid_rows_; ++r) { + for (int c = 0; c < grid_cols_; ++c) { + bool valid = true; + TaskPlacement candidate; + for (auto &[col_off, row_off] : shape) { + int pr = r + row_off; + int pc = c + col_off; + if (pr < 0 || pr >= grid_rows_ || pc < 0 || pc >= grid_cols_ || + occupied_[pr][pc]) { + valid = false; + break; + } + candidate.cgra_positions.push_back({pr, pc}); + } + if (valid) { + int score = computeScore(task_node, candidate, graph); + if (score > current_best_score) { + current_best_score = score; + current_best_placement = candidate; + } + } + } } } + + if (!current_best_placement.cgra_positions.empty()) { + best_placement = current_best_placement; + break; // Found valid rectangular placement + } + + // 2. 
Try other (non-rectangular) connected shapes of size k + std::set visited_masks; + int other_best_score = INT_MIN; + TaskPlacement other_best_placement; + + std::function&, uint64_t)> searchShapes = + [&](SmallVector& current, uint64_t mask) { + if (current.size() == (size_t)k) { + if (visited_masks.insert(mask).second) { + TaskPlacement candidate; + candidate.cgra_positions = current; + int score = computeScore(task_node, candidate, graph); + if (score > other_best_score) { + other_best_score = score; + other_best_placement = candidate; + } + } + return; + } + for (size_t i = 0; i < current.size(); ++i) { + auto pos = current[i]; + const int dr[] = {-1, 1, 0, 0}; + const int dc[] = {0, 0, -1, 1}; + for (int d = 0; d < 4; ++d) { + int nr = pos.row + dr[d]; + int nc = pos.col + dc[d]; + if (nr >= 0 && nr < grid_rows_ && nc >= 0 && nc < grid_cols_ && !occupied_[nr][nc]) { + uint64_t bit = 1ULL << (nr * grid_cols_ + nc); + if ((mask & bit) == 0) { + current.push_back({nr, nc}); + searchShapes(current, mask | bit); + current.pop_back(); + } + } + } + } + }; + + for (int r = 0; r < grid_rows_; ++r) { + for (int c = 0; c < grid_cols_; ++c) { + if (!occupied_[r][c]) { + SmallVector start = {{r, c}}; + searchShapes(start, 1ULL << (r * grid_cols_ + c)); + } + } + } + + if (!other_best_placement.cgra_positions.empty()) { + best_placement = other_best_placement; + break; // Found valid non-rectangular connected placement + } } + if (best_placement.cgra_positions.empty()) { assert(false && "No available CGRA position found (grid over-subscribed)."); } From 798fe841bcd8c601d87bf85007c2c27f11d4798f Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Mar 2026 09:01:05 +0800 Subject: [PATCH 03/10] refactor(taskflow): cascading shape search in MapTaskOnCgraPass placement - findBestPlacement now tries rectangular shapes first, then non-rectangular connected shapes, then falls back to k-1 CGRAs (down to 1). 
- Removed outdated TODO comment about MapTaskOnCgraPass not supporting multi-CGRA placement. - Added assert for empty tile_shape offsets. - Cleaned up USER COMMENT annotations. --- .../Transforms/MapTaskOnCgraPass.cpp | 200 ++++++++---------- .../ResourceAwareTaskOptimizationPass.cpp | 4 - 2 files changed, 88 insertions(+), 116 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index c321545f..081a3417 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -454,38 +454,21 @@ class TaskMapper { } } - // Sanity: if parsing failed, at least return a single cell. - // USER COMMENT: Assert here. + // Sanity: if parsing failed, assert. assert(!offsets.empty() && "tile_shape parsing yielded empty offsets"); - if (offsets.empty()) { - offsets.push_back({0, 0}); - } return offsets; } - // Finds best placement for a task on the CGRA grid. - // For cgra_count > 1, reads the tile_shape attribute to determine the - // physical layout (rectangular or L/T-shape) and validates that all - // required positions fit within the grid boundary and are unoccupied. - TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, - TaskMemoryGraph &graph) { + // Tries placing a shape (given as col/row offsets) at every grid origin. + // Returns the best-scoring valid placement, or empty if none fits. + TaskPlacement tryPlaceShape( + TaskNode *task_node, + const SmallVector> &shape_offsets, + TaskMemoryGraph &graph) { int best_score = INT_MIN; TaskPlacement best_placement; - - // Reads tile_shape attribute if present. - StringRef tile_shape; - if (auto attr = task_node->op->getAttrOfType("tile_shape")) { - tile_shape = attr.getValue(); - } - - // Parses shape offsets from tile_shape string. - SmallVector> shape_offsets = - parseTileShapeOffsets(tile_shape, cgra_count); - - // Tries every valid placement origin on the grid. 
for (int r = 0; r < grid_rows_; ++r) { for (int c = 0; c < grid_cols_; ++c) { - // Checks if ALL positions in the shape fit within bounds and are free. bool valid = true; TaskPlacement candidate; for (auto &[col_off, row_off] : shape_offsets) { @@ -498,10 +481,7 @@ class TaskMapper { } candidate.cgra_positions.push_back({pr, pc}); } - if (!valid) { - continue; - } - + if (!valid) continue; int score = computeScore(task_node, candidate, graph); if (score > best_score) { best_score = score; @@ -509,97 +489,60 @@ class TaskMapper { } } } + return best_placement; + } - // Error handling: No available position found (grid over-subscribed). - if (best_placement.cgra_positions.empty()) { - llvm::errs() << "[MapTaskOnCgra] WARNING: No valid placement for task " - << task_node->op.getTaskName() - << " with cgra_count=" << cgra_count - << " tile_shape=" << tile_shape << "\n"; - // Fallback: place on any single free cell. - // USER COMMENT: The logic should be: 1.tires rectangular shape 2. If fails, try other shapes 3. If all fail, fallback to current cell -1 numbers of cells. - for (int k = cgra_count; k >= 1 && best_placement.cgra_positions.empty(); --k) { - - // 1. 
Try rectangular shapes of size k - SmallVector>> rect_shapes; - for (int r = 1; r <= k; ++r) { - if (k % r == 0) { - int c = k / r; - SmallVector> shape; - for (int i = 0; i < r; ++i) { - for (int j = 0; j < c; ++j) { - shape.push_back({j, i}); - } - } - rect_shapes.push_back(shape); - } - } - - int current_best_score = INT_MIN; - TaskPlacement current_best_placement; - - for (const auto &shape : rect_shapes) { - for (int r = 0; r < grid_rows_; ++r) { - for (int c = 0; c < grid_cols_; ++c) { - bool valid = true; - TaskPlacement candidate; - for (auto &[col_off, row_off] : shape) { - int pr = r + row_off; - int pc = c + col_off; - if (pr < 0 || pr >= grid_rows_ || pc < 0 || pc >= grid_cols_ || - occupied_[pr][pc]) { - valid = false; - break; - } - candidate.cgra_positions.push_back({pr, pc}); - } - if (valid) { - int score = computeScore(task_node, candidate, graph); - if (score > current_best_score) { - current_best_score = score; - current_best_placement = candidate; - } - } - } - } - } - - if (!current_best_placement.cgra_positions.empty()) { - best_placement = current_best_placement; - break; // Found valid rectangular placement - } + // Generates all rectangular shapes (as col/row offset lists) of size k. + // E.g. k=4 → 1×4, 2×2, 4×1. + SmallVector>> getRectShapes(int k) { + SmallVector>> shapes; + for (int rows = 1; rows <= k; ++rows) { + if (k % rows != 0) continue; + int cols = k / rows; + SmallVector> offsets; + for (int r = 0; r < rows; ++r) + for (int c = 0; c < cols; ++c) + offsets.push_back({c, r}); // {col_off, row_off} + shapes.push_back(offsets); + } + return shapes; + } - // 2. Try other (non-rectangular) connected shapes of size k - std::set visited_masks; - int other_best_score = INT_MIN; - TaskPlacement other_best_placement; + // Searches all connected non-rectangular shapes of size k on the grid + // and returns the best-scoring valid placement, or empty if none found. 
+ TaskPlacement tryNonRectShapes(TaskNode *task_node, int k, + TaskMemoryGraph &graph) { + std::set visited_masks; + int best_score = INT_MIN; + TaskPlacement best_placement; - std::function&, uint64_t)> searchShapes = - [&](SmallVector& current, uint64_t mask) { - if (current.size() == (size_t)k) { + std::function &, uint64_t)> search = + [&](SmallVector ¤t, uint64_t mask) { + if ((int)current.size() == k) { if (visited_masks.insert(mask).second) { TaskPlacement candidate; candidate.cgra_positions = current; int score = computeScore(task_node, candidate, graph); - if (score > other_best_score) { - other_best_score = score; - other_best_placement = candidate; + if (score > best_score) { + best_score = score; + best_placement = candidate; } } return; } + constexpr int dr[] = {-1, 1, 0, 0}; + constexpr int dc[] = {0, 0, -1, 1}; for (size_t i = 0; i < current.size(); ++i) { auto pos = current[i]; - const int dr[] = {-1, 1, 0, 0}; - const int dc[] = {0, 0, -1, 1}; for (int d = 0; d < 4; ++d) { int nr = pos.row + dr[d]; int nc = pos.col + dc[d]; - if (nr >= 0 && nr < grid_rows_ && nc >= 0 && nc < grid_cols_ && !occupied_[nr][nc]) { + if (nr >= 0 && nr < grid_rows_ && nc >= 0 && nc < grid_cols_ && + !occupied_[nr][nc]) { uint64_t bit = 1ULL << (nr * grid_cols_ + nc); if ((mask & bit) == 0) { current.push_back({nr, nc}); - searchShapes(current, mask | bit); + search(current, mask | bit); current.pop_back(); } } @@ -607,26 +550,59 @@ class TaskMapper { } }; - for (int r = 0; r < grid_rows_; ++r) { - for (int c = 0; c < grid_cols_; ++c) { - if (!occupied_[r][c]) { - SmallVector start = {{r, c}}; - searchShapes(start, 1ULL << (r * grid_cols_ + c)); - } - } + for (int r = 0; r < grid_rows_; ++r) { + for (int c = 0; c < grid_cols_; ++c) { + if (!occupied_[r][c]) { + SmallVector start = {{r, c}}; + search(start, 1ULL << (r * grid_cols_ + c)); } + } + } + return best_placement; + } + + // Finds best placement for a task on the CGRA grid. 
+ // + // Search order (for k = cgra_count down to 1): + // 1. Try all rectangular shapes of size k (1×k, 2×(k/2), …, k×1). + // 2. If none fits, try all connected non-rectangular shapes of size k. + // 3. If still nothing, decrement k and repeat. + // This guarantees the task gets the largest possible contiguous CGRA + // allocation that physically fits on the current grid. + TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, + TaskMemoryGraph &graph) { + TaskPlacement best_placement; - if (!other_best_placement.cgra_positions.empty()) { - best_placement = other_best_placement; - break; // Found valid non-rectangular connected placement + for (int k = cgra_count; k >= 1; --k) { + // 1. Rectangular shapes of size k. + for (auto &shape : getRectShapes(k)) { + best_placement = tryPlaceShape(task_node, shape, graph); + if (!best_placement.cgra_positions.empty()) { + if (k < cgra_count) { + llvm::errs() << "[MapTaskOnCgra] Fallback: placed " + << task_node->op.getTaskName() + << " on " << k << " CGRAs (requested " + << cgra_count << ")\n"; + } + return best_placement; } } - if (best_placement.cgra_positions.empty()) { - assert(false && "No available CGRA position found (grid over-subscribed)."); + // 2. Non-rectangular connected shapes of size k. + best_placement = tryNonRectShapes(task_node, k, graph); + if (!best_placement.cgra_positions.empty()) { + if (k < cgra_count) { + llvm::errs() << "[MapTaskOnCgra] Fallback (non-rect): placed " + << task_node->op.getTaskName() + << " on " << k << " CGRAs (requested " + << cgra_count << ")\n"; + } + return best_placement; } } + // Should never reach here on a valid grid. 
+ assert(false && "No available CGRA position found (grid over-subscribed)."); return best_placement; } diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index 2ce2c18e..5a7e0d63 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -852,10 +852,6 @@ class PipelineBalancer { // verify if the speculatively increased CGRA count and its proposed shape // actually fit on the 4x4 grid alongside other previously allocated tasks. // - // Currently, MapTaskOnCgraPass does not support multi-CGRA task placement. - // Once it does, we should call it here; if global placement fails for the - // "best" shape, we should backtrack and try alternative shapes before - // saturating the node. if (!canFitOnGrid(new_cgra_count)) { saturated_nodes.insert(bottleneck); continue; From 1b41e8dd32226db022db36d7475538fb4af2142d Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Mar 2026 09:08:39 +0800 Subject: [PATCH 04/10] refactor(taskflow): cascading placement search with fallback, clean up comments - findBestPlacement tries rect then non-rect shapes for requested cgra_count. - If placement fails, caller falls back to cgra_count-1 (reject extra CGRA). - Normalize /// to // comment style throughout MapTaskOnCgraPass. - Remove outdated TODO comments. 
--- .../Transforms/MapTaskOnCgraPass.cpp | 118 ++++++++---------- 1 file changed, 53 insertions(+), 65 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index 081a3417..1c37c39b 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -36,7 +36,7 @@ namespace { //===----------------------------------------------------------------------===// // CGRA Grid Position //===----------------------------------------------------------------------===// -/// Represents a position on the 2D CGRA grid. +// Represents a position on the 2D CGRA grid. struct CGRAPosition { int row; int col; @@ -49,12 +49,12 @@ struct CGRAPosition { return !(*this == other); } - /// Computes Manhattan distance to another position. + // Computes Manhattan distance to another position. int manhattanDistance(const CGRAPosition &other) const { return std::abs(row - other.row) + std::abs(col - other.col); } - /// Checks if adjacent (Manhattan distance = 1). + // Checks if adjacent (Manhattan distance = 1). bool isAdjacent(const CGRAPosition &other) const { return manhattanDistance(other) == 1; } @@ -63,19 +63,19 @@ struct CGRAPosition { //===----------------------------------------------------------------------===// // Task Placement Info //===----------------------------------------------------------------------===// -/// Stores placement info for a task: can span multiple combined CGRAs. +// Stores placement info for a task: can span multiple combined CGRAs. struct TaskPlacement { SmallVector cgra_positions; // CGRAs assigned to this task. - /// Returns the primary (first) position. + // Returns the primary (first) position. CGRAPosition primary() const { return cgra_positions.empty() ? CGRAPosition{-1, -1} : cgra_positions[0]; } - /// Returns the number of CGRAs assigned. + // Returns the number of CGRAs assigned. 
size_t cgraCount() const { return cgra_positions.size(); } - /// Checks if any CGRA in this task is adjacent to any in other task. + // Checks if any CGRA in this task is adjacent to any in other task. bool hasAdjacentCGRA(const TaskPlacement &other) const { for (const auto &pos : cgra_positions) { for (const auto &other_pos : other.cgra_positions) { @@ -94,7 +94,7 @@ struct TaskPlacement { struct MemoryNode; -/// Represents a Task node in the graph. +// Represents a Task node in the graph. struct TaskNode { size_t id; TaskflowTaskOp op; @@ -112,7 +112,7 @@ struct TaskNode { TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {} }; -/// Represents a Memory node (MemRef) in the graph. +// Represents a Memory node (MemRef) in the graph. struct MemoryNode { Value memref; @@ -126,7 +126,7 @@ struct MemoryNode { MemoryNode(Value memref) : memref(memref) {} }; -/// The Task-Memory Dependency Graph. +// The Task-Memory Dependency Graph. class TaskMemoryGraph { public: SmallVector> task_nodes; @@ -196,7 +196,7 @@ class TaskMemoryGraph { //===----------------------------------------------------------------------===// // Task Mapper //===----------------------------------------------------------------------===// -/// Maps a task-memory graph onto a 2D CGRA grid. +// Maps a task-memory graph onto a 2D CGRA grid. class TaskMapper { public: @@ -208,7 +208,7 @@ class TaskMapper { } } - /// Places all tasks and performs memory mapping. + // Places all tasks and performs memory mapping. void place(func::FuncOp func) { SmallVector tasks; func.walk([&](TaskflowTaskOp task) { tasks.push_back(task); }); @@ -268,11 +268,20 @@ class TaskMapper { // Finds best placement using SRAM positions from previous iter (or -1/default). TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph); - + + // If the requested cgra_count doesn't fit, fall back to cgra_count-1 + // (i.e. reject the extra CGRA and keep previous allocation). 
+ if (placement.cgra_positions.empty() && cgra_count > 1) { + int fallback = cgra_count - 1; + llvm::errs() << "[MapTaskOnCgra] Cannot place " + << task_node->op.getTaskName() + << " with cgra_count=" << cgra_count + << ", falling back to " << fallback << "\n"; + placement = findBestPlacement(task_node, fallback, graph); + } + // Commits Placement. task_node->placement.push_back(placement.primary()); - // Handles mapping one task on multi-CGRAs. - // TODO: Introduce explicit multi-CGRA binding logic. for (size_t i = 1; i < placement.cgra_positions.size(); ++i) { task_node->placement.push_back(placement.cgra_positions[i]); } @@ -359,7 +368,7 @@ class TaskMapper { } private: - /// Clears task placement and occupied grid. + // Clears task placement and occupied grid. void resetTaskPlacements(TaskMemoryGraph &graph) { for (auto &task : graph.task_nodes) { task->placement.clear(); @@ -370,8 +379,8 @@ class TaskMapper { } } - /// Assigns all memory nodes to SRAMs based on centroid of accessing tasks. - /// Returns true if any SRAM assignment changed. + // Assigns all memory nodes to SRAMs based on centroid of accessing tasks. + // Returns true if any SRAM assignment changed. bool assignAllSRAMs(TaskMemoryGraph &graph) { bool changed = false; for (auto &mem_node : graph.memory_nodes) { @@ -563,47 +572,26 @@ class TaskMapper { // Finds best placement for a task on the CGRA grid. // - // Search order (for k = cgra_count down to 1): - // 1. Try all rectangular shapes of size k (1×k, 2×(k/2), …, k×1). - // 2. If none fits, try all connected non-rectangular shapes of size k. - // 3. If still nothing, decrement k and repeat. - // This guarantees the task gets the largest possible contiguous CGRA - // allocation that physically fits on the current grid. + // Search order: + // 1. Try all rectangular shapes of size cgra_count. + // 2. If none fits, try all connected non-rectangular shapes of size cgra_count. + // 3. 
If still nothing, return empty (caller handles fallback to cgra_count-1). TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, TaskMemoryGraph &graph) { - TaskPlacement best_placement; - - for (int k = cgra_count; k >= 1; --k) { - // 1. Rectangular shapes of size k. - for (auto &shape : getRectShapes(k)) { - best_placement = tryPlaceShape(task_node, shape, graph); - if (!best_placement.cgra_positions.empty()) { - if (k < cgra_count) { - llvm::errs() << "[MapTaskOnCgra] Fallback: placed " - << task_node->op.getTaskName() - << " on " << k << " CGRAs (requested " - << cgra_count << ")\n"; - } - return best_placement; - } - } + // 1. Rectangular shapes. + for (auto &shape : getRectShapes(cgra_count)) { + TaskPlacement p = tryPlaceShape(task_node, shape, graph); + if (!p.cgra_positions.empty()) return p; + } - // 2. Non-rectangular connected shapes of size k. - best_placement = tryNonRectShapes(task_node, k, graph); - if (!best_placement.cgra_positions.empty()) { - if (k < cgra_count) { - llvm::errs() << "[MapTaskOnCgra] Fallback (non-rect): placed " - << task_node->op.getTaskName() - << " on " << k << " CGRAs (requested " - << cgra_count << ")\n"; - } - return best_placement; - } + // 2. Non-rectangular connected shapes. + if (cgra_count > 1) { + TaskPlacement p = tryNonRectShapes(task_node, cgra_count, graph); + if (!p.cgra_positions.empty()) return p; } - // Should never reach here on a valid grid. - assert(false && "No available CGRA position found (grid over-subscribed)."); - return best_placement; + // Nothing fits — return empty so caller can decide. + return {}; } // Computes placement score based on Task-Memory Graph. @@ -666,16 +654,16 @@ class TaskMapper { return kAlpha * ssa_score + kBeta * mem_score; } - /// Computes dependency depth for all tasks in the graph. - /// - /// Dependency depth = longest path from this node to any sink node in the - /// dependency graph (via SSA or memory edges). 
- /// - /// Tasks with higher dependency depth have longer chains of dependent tasks - /// after them. By placing these tasks first: - /// 1. They get priority access to good grid positions. - /// 2. Their dependent tasks can then be positioned adjacent to them, - ///    minimizing inter-task communication distance. + // Computes dependency depth for all tasks in the graph. + // + // Dependency depth = longest path from this node to any sink node in the + // dependency graph (via SSA or memory edges). + // + // Tasks with higher dependency depth have longer chains of dependent tasks + // after them. By placing these tasks first: + // 1. They get priority access to good grid positions. + // 2. Their dependent tasks can then be positioned adjacent to them, + //    minimizing inter-task communication distance. void computeDependencyDepth(TaskMemoryGraph &graph) { DenseMap<TaskNode *, int> depth_cache; for (auto &node : graph.task_nodes) { @@ -683,7 +671,7 @@ class TaskMapper { } } -  /// Recursively calculates dependency depth for a single task. +  // Recursively calculates dependency depth for a single task. int calculateDepth(TaskNode *node, DenseMap<TaskNode *, int> &depth_cache) { if (depth_cache.count(node)) { return depth_cache[node]; From 8f37f89254c68c349daec89624a3378f7c34b63d Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Mar 2026 09:30:03 +0800 Subject: [PATCH 05/10] fix(taskflow): multi-CGRA aware SRAM centroid and SSA scoring - SRAM centroid now includes ALL CGRA positions of multi-CGRA tasks, not just placement[0]. - SSA proximity scoring uses min distance between two multi-CGRA placements (minDistToPlacement) instead of only comparing to the other task's primary position.
--- .../Transforms/MapTaskOnCgraPass.cpp | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index 1c37c39b..029404a8 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -387,16 +387,16 @@ class TaskMapper { // Computes centroid of all tasks that access this memory. int total_row = 0, total_col = 0, count = 0; for (TaskNode *reader : mem_node->readers) { -        if (!reader->placement.empty()) { -          total_row += reader->placement[0].row; -          total_col += reader->placement[0].col; +        for (const auto &pos : reader->placement) { +          total_row += pos.row; +          total_col += pos.col; count++; } } for (TaskNode *writer : mem_node->writers) { -        if (!writer->placement.empty()) { -          total_row += writer->placement[0].row; -          total_col += writer->placement[0].col; +        for (const auto &pos : writer->placement) { +          total_row += pos.row; +          total_col += pos.col; count++; } } @@ -612,13 +612,24 @@ class TaskMapper { int ssa_score = 0; int mem_score = 0; +    // Helper: minimum Manhattan distance between any position in this +    // placement and any position in another task's placement. +    auto minDistToPlacement = [&](const SmallVector<CGRAPosition> &other) -> int { +      int min_dist = INT_MAX; +      for (const auto &pos : placement.cgra_positions) { +        for (const auto &opos : other) { +          min_dist = std::min(min_dist, pos.manhattanDistance(opos)); +        } +      } +      return min_dist; +    }; + // Helper: minimum Manhattan distance from any position in this placement -    // to a target position. +    // to a single target position.
auto minDistToTarget = [&](const CGRAPosition &target) -> int { int min_dist = INT_MAX; for (const auto &pos : placement.cgra_positions) { - int d = pos.manhattanDistance(target); - min_dist = std::min(min_dist, d); + min_dist = std::min(min_dist, pos.manhattanDistance(target)); } return min_dist; }; @@ -626,13 +637,13 @@ class TaskMapper { // 1. SSA proximity (predecessors & successors). for (TaskNode *producer : task_node->ssa_operands) { if (!producer->placement.empty()) { - int dist = minDistToTarget(producer->placement[0]); + int dist = minDistToPlacement(producer->placement); ssa_score -= dist; } } for (TaskNode *consumer : task_node->ssa_users) { if (!consumer->placement.empty()) { - int dist = minDistToTarget(consumer->placement[0]); + int dist = minDistToPlacement(consumer->placement); ssa_score -= dist; } } From c17d7fcfa08913a989e88067d1baa949b998a0b6 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Sat, 7 Mar 2026 12:06:51 +0800 Subject: [PATCH 06/10] refactor: rename MapToAcceleratorPass to MapOperationOnTilePass --- .../NeuraDialect/Architecture/Architecture.h | 2 +- include/NeuraDialect/NeuraPasses.h | 6 +- include/NeuraDialect/NeuraPasses.td | 10 +-- lib/NeuraDialect/NeuraPasses.cpp | 2 +- lib/NeuraDialect/Transforms/CMakeLists.txt | 2 +- ...torPass.cpp => MapOperationOnTilePass.cpp} | 68 +++++++++---------- .../ResourceAwareTaskOptimizationPass.cpp | 20 +++--- test/arch_spec/README.md | 2 +- test/c2llvm2mlir/nested_loop/test.mlir | 2 +- test/c2llvm2mlir/simple_loop/test.mlir | 2 +- test/code_gen/test_code_generate.mlir | 2 +- .../perfect_nested/perfect_nested.mlir | 2 +- .../simple_loop/simple_loop.mlir | 2 +- .../simple_loop_reduction.mlir | 2 +- test/e2e/axpy/axpy_kernel.mlir | 2 +- test/e2e/bicg/bicg_int_kernel.mlir | 2 +- test/e2e/bicg/bicg_kernel.mlir | 2 +- test/e2e/fft/fft_kernel.mlir | 2 +- test/e2e/fir/fir_kernel.mlir | 2 +- test/e2e/fir/fir_kernel_vec.mlir | 2 +- test/e2e/gemm/gemm_kernel.mlir | 2 +- test/e2e/gemv/gemv_kernel.mlir | 2 +- 
test/e2e/histogram/histogram_kernel.mlir | 2 +- test/e2e/relu/relu_kernel.mlir | 2 +- test/e2e/spmv/spmv_kernel.mlir | 2 +- test/honor_arch/fir_removed_tiles_test.mlir | 2 +- test/mapping_quality/branch_for.mlir | 4 +- test/mapping_quality/tiny_loop.mlir | 4 +- test/multi-cgra/kernel_mapping/fir/fir.mlir | 2 +- .../loop-in-kernel/loop-in-kernel.mlir | 2 +- test/multi-cgra/kernel_mapping/relu/relu.mlir | 2 +- test/neura/ctrl/branch_for.mlir | 4 +- test/neura/for_loop/relu_test.mlir | 2 +- test/neura/fusion/test.mlir | 4 +- .../steer_ctrl/loop_with_return_value.mlir | 2 +- 35 files changed, 87 insertions(+), 87 deletions(-) rename lib/NeuraDialect/Transforms/{MapToAcceleratorPass.cpp => MapOperationOnTilePass.cpp} (87%) diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h index 4f8e5cc2..cef36626 100644 --- a/include/NeuraDialect/Architecture/Architecture.h +++ b/include/NeuraDialect/Architecture/Architecture.h @@ -505,7 +505,7 @@ class Architecture { // specific tiles are valid: // std::vector overrides; // // First mark all tiles as non-existent, then mark valid ones existent. - // // (see MapToAcceleratorPass for the full valid_tiles parsing logic) + // // (see MapOperationOnTilePass for the full valid_tiles parsing logic) // auto arch_T = getArchitecture().cloneWithNewDimensions(8, 12, overrides); std::unique_ptr cloneWithNewDimensions( int new_per_cgra_rows, int new_per_cgra_columns, diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 56a9e785..f06568a1 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -23,10 +23,10 @@ std::unique_ptr createInsertCtrlMovPass(); std::unique_ptr createAssignAcceleratorPass(); std::unique_ptr createTransformCtrlToDataFlowPass(); std::unique_ptr createLeveragePredicatedValuePass(); -// Creates the MapToAccelerator pass. Tile dimensions default to 0 (use +// Creates the MapOperationOnTile pass. 
Tile dimensions default to 0 (use // architecture singleton) when not specified via options. -std::unique_ptr createMapToAcceleratorPass( - const MapToAcceleratorOptions &options = MapToAcceleratorOptions{}); +std::unique_ptr createMapOperationOnTilePass( + const MapOperationOnTileOptions &options = MapOperationOnTileOptions{}); std::unique_ptr createGenerateCodePass(); std::unique_ptr createCanonicalizeReturnPass(); std::unique_ptr createCanonicalizeLiveInPass(); diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index f7fc06a3..965aaa2e 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -50,7 +50,7 @@ def LeveragePredicatedValue : Pass<"leverage-predicated-value", "ModuleOp"> { let constructor = "neura::createLeveragePredicatedValuePass()"; } -def MapToAccelerator : Pass<"map-to-accelerator", "ModuleOp"> { +def MapOperationOnTile : Pass<"map-operation-on-tile", "ModuleOp"> { let summary = "Map Neura operations onto a given accelerator"; let description = [{ This pass performs mapping from Neura operations to accelerator. @@ -65,11 +65,11 @@ def MapToAccelerator : Pass<"map-to-accelerator", "ModuleOp"> { Examples: Single CGRA (default): - --map-to-accelerator + --map-operation-on-tile 1×3 rectangular (3 CGRAs in a row): - --map-to-accelerator x-tiles=12 y-tiles=4 + --map-operation-on-tile x-tiles=12 y-tiles=4 T-shape (4 CGRAs: top row of 3 + centre below): - --map-to-accelerator x-tiles=12 y-tiles=8 \ + --map-operation-on-tile x-tiles=12 y-tiles=8 \ valid-tiles="0_0,1_0,2_0,3_0,4_0,5_0,6_0,7_0,8_0,9_0,10_0,11_0,\ 4_1,5_1,6_1,7_1,4_4,5_4,6_4,7_4,4_5,5_5,6_5,7_5" }]; @@ -89,7 +89,7 @@ def MapToAccelerator : Pass<"map-to-accelerator", "ModuleOp"> { "x-tiles x y-tiles rectangle are valid. 
" "Example: 0_0,1_0,0_1 selects three tiles forming an L-shape."> ]; - let constructor = "neura::createMapToAcceleratorPass()"; + let constructor = "neura::createMapOperationOnTilePass()"; } def GenerateCode : Pass<"generate-code", "ModuleOp"> { diff --git a/lib/NeuraDialect/NeuraPasses.cpp b/lib/NeuraDialect/NeuraPasses.cpp index 80b6a6f1..296626b1 100644 --- a/lib/NeuraDialect/NeuraPasses.cpp +++ b/lib/NeuraDialect/NeuraPasses.cpp @@ -43,7 +43,7 @@ void mlir::neura::registerNeuraConversionPassPipeline() { pm.addPass(mlir::neura::createInsertDataMovPass()); pm.addPass(mlir::createPrintOpGraphPass(os)); - pm.addPass(mlir::neura::createMapToAcceleratorPass()); + pm.addPass(mlir::neura::createMapOperationOnTilePass()); pm.addPass(mlir::neura::createGenerateCodePass()); }); } diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt index 010fc3c7..703f3360 100644 --- a/lib/NeuraDialect/Transforms/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/CMakeLists.txt @@ -8,7 +8,7 @@ add_mlir_library( AssignAcceleratorPass.cpp TransformCtrlToDataFlowPass.cpp LeveragePredicatedValuePass.cpp - MapToAcceleratorPass.cpp + MapOperationOnTilePass.cpp GenerateCodePass.cpp CanonicalizeReturnPass.cpp CanonicalizeLiveInPass.cpp diff --git a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/MapOperationOnTilePass.cpp similarity index 87% rename from lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp rename to lib/NeuraDialect/Transforms/MapOperationOnTilePass.cpp index f6166968..ae55ff73 100644 --- a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp +++ b/lib/NeuraDialect/Transforms/MapOperationOnTilePass.cpp @@ -32,11 +32,11 @@ using namespace mlir::neura::yamlkeys; #include "NeuraDialect/NeuraPasses.h.inc" namespace { -struct MapToAcceleratorPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapToAcceleratorPass) +struct MapOperationOnTilePass + : public PassWrapper> { + 
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapOperationOnTilePass) - StringRef getArgument() const override { return "map-to-accelerator"; } + StringRef getArgument() const override { return "map-operation-on-tile"; } StringRef getDescription() const override { return "Maps IR to the target accelerator."; } @@ -45,14 +45,14 @@ struct MapToAcceleratorPass registry.insert(); } - MapToAcceleratorPass() = default; - MapToAcceleratorPass(const MapToAcceleratorOptions &options) : MapToAcceleratorPass() { + MapOperationOnTilePass() = default; + MapOperationOnTilePass(const MapOperationOnTileOptions &options) : MapOperationOnTilePass() { this->x_tiles = options.x_tiles; this->y_tiles = options.y_tiles; this->valid_tiles = options.valid_tiles; } - MapToAcceleratorPass(const MapToAcceleratorPass &pass) - : PassWrapper>(pass) {} + MapOperationOnTilePass(const MapOperationOnTilePass &pass) + : PassWrapper>(pass) {} Option mappingStrategy{ *this, "mapping-strategy", llvm::cl::desc("Mapping strategy to use for mapping operations to the " @@ -104,10 +104,10 @@ struct MapToAcceleratorPass } if (mapping_mode_str == attr::val::kSpatialOnly || mapping_mode_str == attr::val::kSpatialTemporal) { - llvm::errs() << "[MapToAcceleratorPass] Using Mapping Mode: " + llvm::errs() << "[MapOperationOnTilePass] Using Mapping Mode: " << mapping_mode_str << "\n"; } else { - llvm::errs() << "[MapToAcceleratorPass] Unsupported mapping mode: " + llvm::errs() << "[MapOperationOnTilePass] Unsupported mapping mode: " << mapping_mode_str << "\n"; return false; } @@ -144,29 +144,29 @@ struct MapToAcceleratorPass mapping_strategy = std::make_unique(max_loc, max_depth); llvm::errs() - << "[MapToAcceleratorPass] Use custom backtrack parameters: " + << "[MapOperationOnTilePass] Use custom backtrack parameters: " << "max_location_to_try=" << max_loc << ", max_backtrack_depth=" << max_depth << "\n"; } else { - llvm::errs() << "[MapToAcceleratorPass] Illegal customized " + llvm::errs() << 
"[MapOperationOnTilePass] Illegal customized " "parameters format: " << backtrack_str << "\n"; return false; } } else { - llvm::errs() << "[MapToAcceleratorPass] Illegal customized " + llvm::errs() << "[MapOperationOnTilePass] Illegal customized " "parameters format: " << backtrack_str << "\n"; return false; } } else { - llvm::errs() << "[MapToAcceleratorPass] Unsupported backtrack config: " + llvm::errs() << "[MapOperationOnTilePass] Unsupported backtrack config: " << backtrack_str << "\n"; return false; } resolved_mapping_strategy = mapping_strategy_str.str(); } else { - llvm::errs() << "[MapToAcceleratorPass] Unsupported mapping strategy: " + llvm::errs() << "[MapOperationOnTilePass] Unsupported mapping strategy: " << mapping_strategy_str << "\n"; return false; } @@ -184,12 +184,12 @@ struct MapToAcceleratorPass for (Operation *op : sorted_ops) { op->setAttr(attr::kDfgId, IntegerAttr::get(IntegerType::get(ctx, 32), next_id)); - llvm::errs() << "[MapToAcceleratorPass] Assigned dfg_id=" << next_id + llvm::errs() << "[MapOperationOnTilePass] Assigned dfg_id=" << next_id << " to " << *op << "\n"; next_id++; } - llvm::errs() << "[MapToAcceleratorPass] Assigned " << next_id + llvm::errs() << "[MapOperationOnTilePass] Assigned " << next_id << " dfg_id(s) in total\n"; } @@ -231,7 +231,7 @@ struct MapToAcceleratorPass } if (longest) { - llvm::outs() << "[MapToAcceleratorPass] Longest recurrence cycle (length " + llvm::outs() << "[MapOperationOnTilePass] Longest recurrence cycle (length " << longest->length << "):\n"; for (Operation *op : longest->operations) { op->print(llvm::outs()), llvm::outs() << "\n"; @@ -241,7 +241,7 @@ struct MapToAcceleratorPass rec_mii = 1; // No recurrence cycles found, set MII to 1. 
} - llvm::errs() << "[MapToAcceleratorPass] Calculated Recurrence MII: " + llvm::errs() << "[MapOperationOnTilePass] Calculated Recurrence MII: " << rec_mii << "\n"; int res_mii = calculateResMii(region, architecture); @@ -265,7 +265,7 @@ struct MapToAcceleratorPass if (parent_op && parent_op->getName().getStringRef().contains(attr::val::kOpFused)) { // Skips operations inside a fused_op region. - llvm::outs() << "[MapToAcceleratorPass] Skipping op inside fused_op: " + llvm::outs() << "[MapOperationOnTilePass] Skipping op inside fused_op: " << *op << "\n"; skipped_count++; continue; @@ -275,19 +275,19 @@ struct MapToAcceleratorPass topologically_sorted_ops = std::move(filtered_ops); if (skipped_count > 0) { - llvm::errs() << "[MapToAcceleratorPass] Filtered out " << skipped_count + llvm::errs() << "[MapOperationOnTilePass] Filtered out " << skipped_count << " operations inside fused_op regions\n"; } for (Operation *op : topologically_sorted_ops) { - llvm::outs() << "[MapToAcceleratorPass] Topologically sorted op: " << *op + llvm::outs() << "[MapOperationOnTilePass] Topologically sorted op: " << *op << "\n"; } std::vector> level_buckets = getOpsInAlapLevels(topologically_sorted_ops, critical_ops); for (int level = 0; level < static_cast(level_buckets.size()); ++level) { - llvm::outs() << "[MapToAcceleratorPass] ALAP Bucket Level " << level + llvm::outs() << "[MapOperationOnTilePass] ALAP Bucket Level " << level << ": " << level_buckets[level].size() << " ops\n"; for (Operation *op : level_buckets[level]) { llvm::outs() << " " << *op << "\n"; @@ -296,12 +296,12 @@ struct MapToAcceleratorPass std::vector> sorted_ops_with_alap_levels = flatten_level_buckets(level_buckets, critical_ops); for (const auto &[op, level] : sorted_ops_with_alap_levels) { - llvm::outs() << "[MapToAcceleratorPass] ALAP sorted op: " << *op + llvm::outs() << "[MapOperationOnTilePass] ALAP sorted op: " << *op << " (ALAP level: " << level << ")\n"; } // assert(false); for (int ii = possible_min_ii; 
ii <= max_ii; ++ii) { - llvm::errs() << "[MapToAcceleratorPass] Start mapping with target II of " + llvm::errs() << "[MapOperationOnTilePass] Start mapping with target II of " << ii << "\n"; // Creates a mapping state for the current II. MappingState mapping_state(architecture, ii, is_spatial_only); @@ -349,18 +349,18 @@ struct MapToAcceleratorPass op->setAttr(attr::kMappingInfo, mapping_info); return true; } - llvm::errs() << "[MapToAcceleratorPass] Mapping failed for target II of " + llvm::errs() << "[MapOperationOnTilePass] Mapping failed for target II of " << ii << "\n"; mapping_state.dumpOpToLocs(); } llvm::errs() - << "[MapToAcceleratorPass] Mapping failed for all target II values.\n"; + << "[MapOperationOnTilePass] Mapping failed for all target II values.\n"; return false; } void runOnOperation() override { ModuleOp module = getOperation(); - llvm::errs() << "[MapToAcceleratorPass] Starting mapping pass...\n"; + llvm::errs() << "[MapOperationOnTilePass] Starting mapping pass...\n"; std::unique_ptr mapping_strategy; std::string resolved_mapping_mode; std::string resolved_mapping_strategy; @@ -414,7 +414,7 @@ struct MapToAcceleratorPass custom_arch = global_arch.cloneWithNewDimensions( y_tiles.getValue(), x_tiles.getValue(), additional_overrides); target_arch = custom_arch.get(); - llvm::errs() << "[MapToAcceleratorPass] Overriding architecture dimensions to " + llvm::errs() << "[MapOperationOnTilePass] Overriding architecture dimensions to " << y_tiles.getValue() << "x" << x_tiles.getValue() << " tiles.\n"; } @@ -432,7 +432,7 @@ struct MapToAcceleratorPass if (!mapRegion(kernel_op, kernel_region, architecture, mapping_strategy.get(), is_spatial_only, resolved_mapping_mode, resolved_mapping_strategy)) { - llvm::errs() << "[MapToAcceleratorPass] Mapping failed for kernel.\n"; + llvm::errs() << "[MapOperationOnTilePass] Mapping failed for kernel.\n"; signalPassFailure(); } }); @@ -450,7 +450,7 @@ struct MapToAcceleratorPass if (!mapRegion(func_op, func_region, 
architecture, mapping_strategy.get(), is_spatial_only, resolved_mapping_mode, resolved_mapping_strategy)) { - llvm::errs() << "[MapToAcceleratorPass] Failed to map function.\n"; + llvm::errs() << "[MapOperationOnTilePass] Failed to map function.\n"; signalPassFailure(); } }); @@ -461,9 +461,9 @@ struct MapToAcceleratorPass namespace mlir::neura { -std::unique_ptr createMapToAcceleratorPass( - const MapToAcceleratorOptions &options) { - return std::make_unique(options); +std::unique_ptr createMapOperationOnTilePass( + const MapOperationOnTileOptions &options) { + return std::make_unique(options); } } // namespace mlir::neura diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index 5a7e0d63..755cbb08 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -344,7 +344,7 @@ class TaskDependencyGraph { // Public wrapper for profileTask: used by UtilizationFuser to re-profile // fused tasks with the real downstream Neura pipeline. // When skip_mapper=true, only ResMII/RecMII analytical estimates are used - // (no MapToAcceleratorPass). This is safe for speculative balance checks + // (no MapOperationOnTilePass). This is safe for speculative balance checks // where the mapper may backtrack indefinitely on larger tile arrays. void profileTaskPublic(TaskGraphNode *node, TaskflowTaskOp task, bool skip_mapper = false) { @@ -363,7 +363,7 @@ class TaskDependencyGraph { } // Profiles a single TaskflowTaskOp: clones the task, wraps the kernel in a - // standalone func, and runs InsertDataMov + MapToAcceleratorPass to obtain + // standalone func, and runs InsertDataMov + MapOperationOnTilePass to obtain // ii. skip_mapper: use only ResMII/RecMII analytical estimates. 
void profileTask(TaskGraphNode *node, TaskflowTaskOp task, bool skip_mapper = false) { @@ -456,7 +456,7 @@ class TaskDependencyGraph { // InsertDataMov + mapper, and returns compiled_ii / cp_depth. // x_tiles/y_tiles: multi-CGRA tile grid dimensions. // valid_tiles: explicit tile list for non-rectangular shapes (empty = full). - // skip_mapper: skip MapToAcceleratorPass, use ResMII/RecMII only. + // skip_mapper: skip MapOperationOnTilePass, use ResMII/RecMII only. LogicalResult runNeuraPipelineOnKernel(MLIRContext *ctx, neura::KernelOp kernel, ModuleOp dst_module, @@ -563,7 +563,7 @@ class TaskDependencyGraph { }); } - // Optionally run MapToAcceleratorPass to get the true compiled_ii. + // Optionally run MapOperationOnTilePass to get the true compiled_ii. // // Guards: // 1. skip_mapper=true: caller explicitly requests analytical-only (e.g. @@ -609,19 +609,19 @@ class TaskDependencyGraph { << " limit=" << kMapperOpLimit << "\n"; if (all_data_movs_ok && total_mapped_ops <= kMapperOpLimit) { - // Runs MapToAcceleratorPass in a fresh pass manager on the already-lowered + // Runs MapOperationOnTilePass in a fresh pass manager on the already-lowered // dst_module (pre-mapper pipeline already ran above). // Passes the correct tile dimensions so the mapper uses the right array. PassManager pm2(ctx); pm2.enableVerifier(false); if (x_tiles > 0 && y_tiles > 0) { - neura::MapToAcceleratorOptions map_options; + neura::MapOperationOnTileOptions map_options; map_options.x_tiles = x_tiles; map_options.y_tiles = y_tiles; map_options.valid_tiles = valid_tiles; - pm2.addPass(neura::createMapToAcceleratorPass(map_options)); + pm2.addPass(neura::createMapOperationOnTilePass(map_options)); } else { - pm2.addPass(neura::createMapToAcceleratorPass()); + pm2.addPass(neura::createMapOperationOnTilePass()); } if (succeeded(pm2.run(dst_module))) { @@ -641,7 +641,7 @@ class TaskDependencyGraph { return success(); } // Mapper failed for all II values — keep ResMII/RecMII from above. 
- llvm::errs() << "[profileTask] WARNING: MapToAcceleratorPass failed, " + llvm::errs() << "[profileTask] WARNING: MapOperationOnTilePass failed, " << "keeping analytical fallback compiled_ii=" << compiled_ii << "\n"; } else { @@ -1612,7 +1612,7 @@ struct ResourceAwareTaskOptimizationPass // Estimation mode for profiling task II / steps. // "compiled" (default): runs the full Neura lowering + mapping pipeline - // to obtain accurate compiled_ii and steps from MapToAcceleratorPass. + // to obtain accurate compiled_ii and steps from MapOperationOnTilePass. // "analytical": uses only ResMII / RecMII analytical estimates without // running the mapper. Much faster but less accurate — useful for // rapid design-space exploration or when the mapper is unavailable. diff --git a/test/arch_spec/README.md b/test/arch_spec/README.md index 9741f984..af7ede8e 100644 --- a/test/arch_spec/README.md +++ b/test/arch_spec/README.md @@ -45,7 +45,7 @@ To use this architecture specification in your tests, add the following option t mlir-neura-opt input.mlir \ --assign-accelerator \ --lower-llvm-to-neura \ - --map-to-accelerator="mapping-strategy=heuristic" \ + --map-operation-on-tile="mapping-strategy=heuristic" \ --architecture-spec=arch_spec/architecture.yaml \ --generate-code ``` diff --git a/test/c2llvm2mlir/nested_loop/test.mlir b/test/c2llvm2mlir/nested_loop/test.mlir index 3bf536ff..9e64619c 100644 --- a/test/c2llvm2mlir/nested_loop/test.mlir +++ b/test/c2llvm2mlir/nested_loop/test.mlir @@ -21,7 +21,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=simple" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=simple" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-LLVM2NEURA-MAP // CHECK-LLVM2NEURA: accelerator = "neura" diff --git 
a/test/c2llvm2mlir/simple_loop/test.mlir b/test/c2llvm2mlir/simple_loop/test.mlir index 2af2d0c6..e5440152 100644 --- a/test/c2llvm2mlir/simple_loop/test.mlir +++ b/test/c2llvm2mlir/simple_loop/test.mlir @@ -45,7 +45,7 @@ // RUN: --view-op-graph \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized=5,3 dump-mapping-table=true" %t-kernel.mlir -o %t-kernel-mapped.mlir 2>&1 | tee %t-kernel-mapping-output.txt +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized=5,3 dump-mapping-table=true" %t-kernel.mlir -o %t-kernel-mapped.mlir 2>&1 | tee %t-kernel-mapping-output.txt // RUN: FileCheck %s --check-prefix=CHECK-MAPPING-TABLE < %t-kernel-mapping-output.txt // RUN: FileCheck %s --check-prefix=CHECK-LLVM2NEURA-MAP < %t-kernel-mapped.mlir diff --git a/test/code_gen/test_code_generate.mlir b/test/code_gen/test_code_generate.mlir index a9671b86..ae9c6cc8 100644 --- a/test/code_gen/test_code_generate.mlir +++ b/test/code_gen/test_code_generate.mlir @@ -6,7 +6,7 @@ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING diff --git a/test/controflow_fuse/perfect_nested/perfect_nested.mlir b/test/controflow_fuse/perfect_nested/perfect_nested.mlir index bbc5877e..4f0ff7e0 100644 --- a/test/controflow_fuse/perfect_nested/perfect_nested.mlir +++ b/test/controflow_fuse/perfect_nested/perfect_nested.mlir @@ -45,7 +45,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic 
backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: | FileCheck %s -check-prefix=MAPPING diff --git a/test/controflow_fuse/simple_loop/simple_loop.mlir b/test/controflow_fuse/simple_loop/simple_loop.mlir index e9c04f7c..320edeb8 100644 --- a/test/controflow_fuse/simple_loop/simple_loop.mlir +++ b/test/controflow_fuse/simple_loop/simple_loop.mlir @@ -70,7 +70,7 @@ // RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: -o %t-fused-mapped.mlir // RUN: FileCheck %s --input-file=%t-fused-mapped.mlir --check-prefix=FUSE-MAPPING diff --git a/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir b/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir index ace0dd26..e3335754 100644 --- a/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir +++ b/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir @@ -71,7 +71,7 @@ // RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml | FileCheck %s -check-prefix=FUSE-MAPPING module attributes {} { diff --git a/test/e2e/axpy/axpy_kernel.mlir b/test/e2e/axpy/axpy_kernel.mlir index 8d3e9fba..441f3a7f 100644 --- a/test/e2e/axpy/axpy_kernel.mlir +++ b/test/e2e/axpy/axpy_kernel.mlir @@ -15,7 +15,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ 
-// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir --check-prefix=MAPPING diff --git a/test/e2e/bicg/bicg_int_kernel.mlir b/test/e2e/bicg/bicg_int_kernel.mlir index f9aa4d3d..ac4d308a 100644 --- a/test/e2e/bicg/bicg_int_kernel.mlir +++ b/test/e2e/bicg/bicg_int_kernel.mlir @@ -19,7 +19,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir diff --git a/test/e2e/bicg/bicg_kernel.mlir b/test/e2e/bicg/bicg_kernel.mlir index d353ec1f..c016d053 100644 --- a/test/e2e/bicg/bicg_kernel.mlir +++ b/test/e2e/bicg/bicg_kernel.mlir @@ -38,7 +38,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING diff --git a/test/e2e/fft/fft_kernel.mlir b/test/e2e/fft/fft_kernel.mlir index 1f42fb7a..7df8b22b 100644 --- a/test/e2e/fft/fft_kernel.mlir +++ b/test/e2e/fft/fft_kernel.mlir @@ -15,7 +15,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s 
--input-file=%t-mapping.mlir --check-prefix=MAPPING diff --git a/test/e2e/fir/fir_kernel.mlir b/test/e2e/fir/fir_kernel.mlir index f7049b62..2991a59a 100644 --- a/test/e2e/fir/fir_kernel.mlir +++ b/test/e2e/fir/fir_kernel.mlir @@ -16,7 +16,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: cp %t.dir/tmp-generated-instructions.yaml %t-generated-instructions.yaml diff --git a/test/e2e/fir/fir_kernel_vec.mlir b/test/e2e/fir/fir_kernel_vec.mlir index 366feba8..a4b39d61 100644 --- a/test/e2e/fir/fir_kernel_vec.mlir +++ b/test/e2e/fir/fir_kernel_vec.mlir @@ -14,7 +14,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING diff --git a/test/e2e/gemm/gemm_kernel.mlir b/test/e2e/gemm/gemm_kernel.mlir index 3376fe0a..bbf3cbcb 100644 --- a/test/e2e/gemm/gemm_kernel.mlir +++ b/test/e2e/gemm/gemm_kernel.mlir @@ -15,7 +15,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir --check-prefix=MAPPING diff --git a/test/e2e/gemv/gemv_kernel.mlir b/test/e2e/gemv/gemv_kernel.mlir index 9f8f1317..4779714e 100644 --- 
a/test/e2e/gemv/gemv_kernel.mlir +++ b/test/e2e/gemv/gemv_kernel.mlir @@ -15,7 +15,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir --check-prefix=MAPPING diff --git a/test/e2e/histogram/histogram_kernel.mlir b/test/e2e/histogram/histogram_kernel.mlir index 9f2d6f23..43c2fab6 100644 --- a/test/e2e/histogram/histogram_kernel.mlir +++ b/test/e2e/histogram/histogram_kernel.mlir @@ -16,7 +16,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING diff --git a/test/e2e/relu/relu_kernel.mlir b/test/e2e/relu/relu_kernel.mlir index a6588a54..a73d5f61 100644 --- a/test/e2e/relu/relu_kernel.mlir +++ b/test/e2e/relu/relu_kernel.mlir @@ -23,7 +23,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING diff --git a/test/e2e/spmv/spmv_kernel.mlir b/test/e2e/spmv/spmv_kernel.mlir index 32a50da9..103e5511 100644 --- a/test/e2e/spmv/spmv_kernel.mlir +++ b/test/e2e/spmv/spmv_kernel.mlir @@ -15,7 +15,7 @@ // RUN: 
--transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir --check-prefix=MAPPING diff --git a/test/honor_arch/fir_removed_tiles_test.mlir b/test/honor_arch/fir_removed_tiles_test.mlir index 23e4009d..f207b3b0 100644 --- a/test/honor_arch/fir_removed_tiles_test.mlir +++ b/test/honor_arch/fir_removed_tiles_test.mlir @@ -16,7 +16,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../arch_spec/custom_arch_with_removed_tiles.yaml \ // RUN: -o %t-after-mapping.mlir diff --git a/test/mapping_quality/branch_for.mlir b/test/mapping_quality/branch_for.mlir index f78a1be1..05374d2f 100644 --- a/test/mapping_quality/branch_for.mlir +++ b/test/mapping_quality/branch_for.mlir @@ -54,7 +54,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=../arch_spec/architecture.yaml \ // RUN: | FileCheck %s -check-prefix=MAPPING @@ -67,7 +67,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=../arch_spec/architecture.yaml \ // RUN: --generate-code // RUN: FileCheck %s 
--input-file=tmp-generated-instructions.yaml -check-prefix=YAML diff --git a/test/mapping_quality/tiny_loop.mlir b/test/mapping_quality/tiny_loop.mlir index 1b23c2bf..20382890 100644 --- a/test/mapping_quality/tiny_loop.mlir +++ b/test/mapping_quality/tiny_loop.mlir @@ -25,7 +25,7 @@ // RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic mapping-mode=spatial-only backtrack-config=customized=4,3" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic mapping-mode=spatial-only backtrack-config=customized=4,3" \ // RUN: --architecture-spec=../arch_spec/architecture.yaml \ // RUN: | FileCheck %s -check-prefix=SPATIAL @@ -45,7 +45,7 @@ // RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic mapping-mode=spatial-temporal backtrack-config=customized=4,4" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic mapping-mode=spatial-temporal backtrack-config=customized=4,4" \ // RUN: --architecture-spec=../arch_spec/architecture.yaml \ // RUN: | FileCheck %s -check-prefix=SPATIAL-TEMPORAL diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index f70d99ca..9a49b645 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -68,7 +68,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \ // RUN: -o %t.mapped.mlir // RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED diff --git a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir index 
1802e538..669a19a2 100644 --- a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir +++ b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir @@ -47,7 +47,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../../arch_spec/architecture.yaml \ // RUN: -o %t.mapped.mlir // RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED diff --git a/test/multi-cgra/kernel_mapping/relu/relu.mlir b/test/multi-cgra/kernel_mapping/relu/relu.mlir index 309c8512..664e1a2d 100644 --- a/test/multi-cgra/kernel_mapping/relu/relu.mlir +++ b/test/multi-cgra/kernel_mapping/relu/relu.mlir @@ -68,7 +68,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \ // RUN: -o %t.mapped.mlir // RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index 0a7d6031..f170fea7 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -60,7 +60,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=simple" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=simple" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: | FileCheck %s -check-prefix=MAPPING @@ -75,7 +75,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic 
backtrack-config=simple" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=simple" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: --generate-code // RUN: FileCheck %s --input-file=tmp-generated-instructions.yaml -check-prefix=YAML diff --git a/test/neura/for_loop/relu_test.mlir b/test/neura/for_loop/relu_test.mlir index a34e4fd7..7c90141d 100644 --- a/test/neura/for_loop/relu_test.mlir +++ b/test/neura/for_loop/relu_test.mlir @@ -30,7 +30,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: | FileCheck %s --check-prefix=MAPPING // CHECK: func.func @_Z6kernelPiS_ diff --git a/test/neura/fusion/test.mlir b/test/neura/fusion/test.mlir index 63881151..ce3afe60 100644 --- a/test/neura/fusion/test.mlir +++ b/test/neura/fusion/test.mlir @@ -25,7 +25,7 @@ // RUN: --fold-constant \ // RUN: --fuse-pattern \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-MAPPING +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-MAPPING // CHECK-FUSED: func.func @_Z6kernelPA1024_iPiS1_S1_S1_ // CHECK-FUSED: accelerator = "neura" @@ -110,7 +110,7 @@ // RUN: --fold-constant \ // RUN: --iter-merge-pattern="min-support=3 max-iter=4" \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=simple" %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-ITER-MERGE-PATTERN-MAPPING +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=simple" %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-ITER-MERGE-PATTERN-MAPPING // 
CHECK-ITER-MERGE-PATTERN-MAPPING: mapping_info = {compiled_ii = 12 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 8 : i32, res_mii = 3 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32} diff --git a/test/neura/steer_ctrl/loop_with_return_value.mlir b/test/neura/steer_ctrl/loop_with_return_value.mlir index 1104a7a7..c35ab82c 100644 --- a/test/neura/steer_ctrl/loop_with_return_value.mlir +++ b/test/neura/steer_ctrl/loop_with_return_value.mlir @@ -30,7 +30,7 @@ // RUN: --transform-to-steer-control \ // RUN: --remove-predicated-type \ // RUN: --insert-data-mov -// RU: --map-to-accelerator="mapping-strategy=heuristic mapping-mode=spatial-only backtrack-config=customized" +// RU: --map-operation-on-tile="mapping-strategy=heuristic mapping-mode=spatial-only backtrack-config=customized" // RU: | FileCheck %s -check-prefix=MAPPING module { From 2d00d5a5da985bb13aaaa17cb21d74649eadce5e Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Sat, 7 Mar 2026 12:15:59 +0800 Subject: [PATCH 07/10] refactor: rename MapTaskOnCgraPass to AllocateCgraToTaskPass --- include/TaskflowDialect/TaskflowPasses.h | 4 ++-- include/TaskflowDialect/TaskflowPasses.td | 4 ++-- ...graPass.cpp => AllocateCgraToTaskPass.cpp} | 20 +++++++++---------- lib/TaskflowDialect/Transforms/CMakeLists.txt | 2 +- .../ResourceAwareTaskOptimizationPass.cpp | 6 +++--- .../irregular-loop/irregular-loop.mlir | 2 +- .../taskflow/multi-nested/multi-nested.mlir | 2 +- .../parallel-nested/parallel-nested.mlir | 2 +- 8 files changed, 21 insertions(+), 21 deletions(-) rename lib/TaskflowDialect/Transforms/{MapTaskOnCgraPass.cpp => AllocateCgraToTaskPass.cpp} (97%) diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index 866365eb..cd48c4a2 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -22,11 +22,11 @@ void registerTosaToAffineConversionPassPipeline(); #include 
"TaskflowDialect/TaskflowPasses.h.inc" std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); -std::unique_ptr createMapTaskOnCgraPass(); +std::unique_ptr createAllocateCgraToTaskPass(); // Runs the CGRA task placement logic directly on a function. // grid_rows/grid_cols default to 4x4 (kCgraGridRows/kCgraGridCols). -void runMapTaskOnCgra(mlir::func::FuncOp func, +void runAllocateCgraToTask(mlir::func::FuncOp func, int grid_rows = 4, int grid_cols = 4); //=========================================================// diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 8d765498..0869c01a 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -61,7 +61,7 @@ def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let constructor = "taskflow::createClassifyCountersPass()"; } -def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> { +def AllocateCgraToTask : Pass<"allocate-cgra-to-task", "func::FuncOp"> { let summary = "Maps Taskflow tasks onto a 2D CGRA grid array"; let description = [{ This pass maps Taskflow tasks onto a 2D CGRA grid array. @@ -70,7 +70,7 @@ def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> { Uses a default 3x3 CGRA grid. 
}]; - let constructor = "taskflow::createMapTaskOnCgraPass()"; + let constructor = "taskflow::createAllocateCgraToTaskPass()"; } def MemoryAccessStreamingFusion : Pass<"memory-access-streaming-fusion", "func::FuncOp"> { diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp similarity index 97% rename from lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp rename to lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp index 029404a8..174ed003 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp @@ -1,4 +1,4 @@ -//===- MapTaskOnCgraPass.cpp - Task to CGRA Mapping Pass ----------------===// +//===- AllocateCgraToTaskPass.cpp - Task to CGRA Mapping Pass ----------------===// // // This pass maps Taskflow tasks onto a 2D CGRA grid array: // 1. Places tasks with SSA dependencies on adjacent CGRAs. @@ -273,7 +273,7 @@ class TaskMapper { // (i.e. reject the extra CGRA and keep previous allocation). 
if (placement.cgra_positions.empty() && cgra_count > 1) { int fallback = cgra_count - 1; - llvm::errs() << "[MapTaskOnCgra] Cannot place " + llvm::errs() << "[AllocateCgraToTask] Cannot place " << task_node->op.getTaskName() << " with cgra_count=" << cgra_count << ", falling back to " << fallback << "\n"; @@ -716,13 +716,13 @@ class TaskMapper { //===----------------------------------------------------------------------===// // Pass Definition //===----------------------------------------------------------------------===// -struct MapTaskOnCgraPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapTaskOnCgraPass) +struct AllocateCgraToTaskPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AllocateCgraToTaskPass) - MapTaskOnCgraPass() = default; + AllocateCgraToTaskPass() = default; - StringRef getArgument() const override { return "map-task-on-cgra"; } + StringRef getArgument() const override { return "allocate-cgra-to-task"; } StringRef getDescription() const override { return "Maps Taskflow tasks onto a 2D CGRA grid with adjacency " @@ -743,11 +743,11 @@ struct MapTaskOnCgraPass namespace mlir { namespace taskflow { -std::unique_ptr createMapTaskOnCgraPass() { - return std::make_unique(); +std::unique_ptr createAllocateCgraToTaskPass() { + return std::make_unique(); } -void runMapTaskOnCgra(func::FuncOp func, int grid_rows, int grid_cols) { +void runAllocateCgraToTask(func::FuncOp func, int grid_rows, int grid_cols) { TaskMapper mapper(grid_rows, grid_cols); mapper.place(func); } diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index 5dcb6736..23b01a33 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -3,7 +3,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp ClassifyCountersPass.cpp - 
MapTaskOnCgraPass.cpp + AllocateCgraToTaskPass.cpp DEPENDS MLIRTaskflowTransformsIncGen diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index 755cbb08..5cedf3a5 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -848,7 +848,7 @@ class PipelineBalancer { // Check if incrementing cgra_count is feasible on the 4×4 grid. // TODO: This currently only checks the capacity (total CGRA count). Ideally, - // we should invoke a global placement pass (aka MapTaskOnCgraPass) here to + // we should invoke a global placement pass (aka AllocateCgraToTaskPass) here to // verify if the speculatively increased CGRA count and its proposed shape // actually fit on the 4x4 grid alongside other previously allocated tasks. // @@ -1765,11 +1765,11 @@ struct ResourceAwareTaskOptimizationPass node->op->setAttr("tile_shape", b.getStringAttr(shape_str)); } - // Runs MapTaskOnCgraPass to produce global placement (task_mapping_info) + // Runs AllocateCgraToTaskPass to produce global placement (task_mapping_info) // with multi-CGRA support. The pass reads cgra_count and tile_shape // from each task and places them on the 4x4 grid, validating that // shapes physically fit and don't overlap. 
- taskflow::runMapTaskOnCgra(func, kCgraGridRows, kCgraGridCols); + taskflow::runAllocateCgraToTask(func, kCgraGridRows, kCgraGridCols); break; } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 13c56ab1..80417c2b 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -29,7 +29,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index c4e7b76c..84d431ed 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -61,7 +61,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index 881d81ec..abd6a950 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -42,7 +42,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s 
--input-file=%t.placement.mlir --check-prefixes=PLACEMENT From 9112303d4ec5afe291c156acbb7fd7d2ae0fd219 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Sat, 7 Mar 2026 12:41:42 +0800 Subject: [PATCH 08/10] fix: resolve merge conflicts with main (FuseTaskPass, CMakeLists) --- include/TaskflowDialect/TaskflowPasses.h | 2 ++ lib/TaskflowDialect/CMakeLists.txt | 3 +-- lib/TaskflowDialect/Transforms/CMakeLists.txt | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index cd48c4a2..6d7f97fc 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -23,6 +23,8 @@ void registerTosaToAffineConversionPassPipeline(); std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); std::unique_ptr createAllocateCgraToTaskPass(); +std::unique_ptr createFuseTaskPass(); + // Runs the CGRA task placement logic directly on a function. // grid_rows/grid_cols default to 4x4 (kCgraGridRows/kCgraGridCols). 
diff --git a/lib/TaskflowDialect/CMakeLists.txt b/lib/TaskflowDialect/CMakeLists.txt index 49d60c57..d8e5d7ff 100644 --- a/lib/TaskflowDialect/CMakeLists.txt +++ b/lib/TaskflowDialect/CMakeLists.txt @@ -13,5 +13,4 @@ add_mlir_dialect_library(MLIRTaskflow MLIRInferTypeOpInterface ) -add_subdirectory(Transforms) -add_subdirectory(Transforms/Optimizations) \ No newline at end of file +add_subdirectory(Transforms) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index 210e071f..984b7407 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp ClassifyCountersPass.cpp AllocateCgraToTaskPass.cpp + FuseTaskPass.cpp DEPENDS MLIRTaskflowTransformsIncGen From 27b7bcf8ab78d43298f6620f8add8fc0fd1e9a49 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Sat, 7 Mar 2026 13:40:44 +0800 Subject: [PATCH 09/10] refactor: rename CGRAPosition to CgraPosition for consistency --- .../Transforms/AllocateCgraToTaskPass.cpp | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp index 174ed003..d99d0051 100644 --- a/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp @@ -37,25 +37,25 @@ namespace { // CGRA Grid Position //===----------------------------------------------------------------------===// // Represents a position on the 2D CGRA grid. 
-struct CGRAPosition { +struct CgraPosition { int row; int col; - bool operator==(const CGRAPosition &other) const { + bool operator==(const CgraPosition &other) const { return row == other.row && col == other.col; } - bool operator!=(const CGRAPosition &other) const { + bool operator!=(const CgraPosition &other) const { return !(*this == other); } // Computes Manhattan distance to another position. - int manhattanDistance(const CGRAPosition &other) const { + int manhattanDistance(const CgraPosition &other) const { return std::abs(row - other.row) + std::abs(col - other.col); } // Checks if adjacent (Manhattan distance = 1). - bool isAdjacent(const CGRAPosition &other) const { + bool isAdjacent(const CgraPosition &other) const { return manhattanDistance(other) == 1; } }; @@ -65,11 +65,11 @@ struct CGRAPosition { //===----------------------------------------------------------------------===// // Stores placement info for a task: can span multiple combined CGRAs. struct TaskPlacement { - SmallVector cgra_positions; // CGRAs assigned to this task. + SmallVector cgra_positions; // CGRAs assigned to this task. // Returns the primary (first) position. - CGRAPosition primary() const { - return cgra_positions.empty() ? CGRAPosition{-1, -1} : cgra_positions[0]; + CgraPosition primary() const { + return cgra_positions.empty() ? CgraPosition{-1, -1} : cgra_positions[0]; } // Returns the number of CGRAs assigned. @@ -107,7 +107,7 @@ struct TaskNode { SmallVector ssa_operands; // Placement result - SmallVector placement; + SmallVector placement; TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {} }; @@ -121,7 +121,7 @@ struct MemoryNode { SmallVector writers; // Mapping result. - std::optional assigned_sram_pos; + std::optional assigned_sram_pos; MemoryNode(Value memref) : memref(memref) {} }; @@ -401,12 +401,12 @@ class TaskMapper { } } - std::optional new_sram_pos; + std::optional new_sram_pos; if (count > 0) { // Rounds to the nearest integer. 
int avg_row = (total_row + count / 2) / count; int avg_col = (total_col + count / 2) / count; - new_sram_pos = CGRAPosition{avg_row, avg_col}; + new_sram_pos = CgraPosition{avg_row, avg_col}; } if (mem_node->assigned_sram_pos != new_sram_pos) { @@ -525,8 +525,8 @@ class TaskMapper { int best_score = INT_MIN; TaskPlacement best_placement; - std::function &, uint64_t)> search = - [&](SmallVector ¤t, uint64_t mask) { + std::function &, uint64_t)> search = + [&](SmallVector ¤t, uint64_t mask) { if ((int)current.size() == k) { if (visited_masks.insert(mask).second) { TaskPlacement candidate; @@ -562,7 +562,7 @@ class TaskMapper { for (int r = 0; r < grid_rows_; ++r) { for (int c = 0; c < grid_cols_; ++c) { if (!occupied_[r][c]) { - SmallVector start = {{r, c}}; + SmallVector start = {{r, c}}; search(start, 1ULL << (r * grid_cols_ + c)); } } @@ -614,7 +614,7 @@ class TaskMapper { // Helper: minimum Manhattan distance between any position in this // placement and any position in another task's placement. - auto minDistToPlacement = [&](const SmallVector &other) -> int { + auto minDistToPlacement = [&](const SmallVector &other) -> int { int min_dist = INT_MAX; for (const auto &pos : placement.cgra_positions) { for (const auto &opos : other) { @@ -626,7 +626,7 @@ class TaskMapper { // Helper: minimum Manhattan distance from any position in this placement // to a single target position. 
- auto minDistToTarget = [&](const CGRAPosition &target) -> int { + auto minDistToTarget = [&](const CgraPosition &target) -> int { int min_dist = INT_MAX; for (const auto &pos : placement.cgra_positions) { min_dist = std::min(min_dist, pos.manhattanDistance(target)); From ea0a0b666b5aeb7205379d251b917b95f887e11b Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Tue, 10 Mar 2026 10:04:38 +0800 Subject: [PATCH 10/10] resolve TODOs, add post-placement re-profiling --- .../ResourceAwareTaskOptimizationPass.cpp | 469 +++++++++++++++--- .../resource-heavy/resource-heavy.mlir | 10 +- 2 files changed, 407 insertions(+), 72 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index 03707eb9..9909938a 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -158,49 +158,233 @@ static SmallVector getNonRectangularShapes(int cgra_count) { // identical bounding box area, we prefer more square-like bounds over long // rectangles. // -// TODO: This function only picks a localized shape for an idealized single task -// mapping. Global placement and conflict resolution across multiple tasks is -// legitimately deferred to downstream map-on-cgra pass, as speculative -// profiling assumes unconstrained placement. +// This function picks a localized shape for speculative per-task +// profiling (assumes unconstrained placement). Global placement conflict +// resolution across multiple tasks is handled by canAllTasksFitOnGrid() +// during the balance phase and by the downstream AllocateCgraToTaskPass. 
static CgraShape pickBestShape(int cgra_count) { - // For cgra_count == 3, the 2x2 L-shape has a smaller maximum physical routing - // distance (dist=2) compared to a 1x3 rectangle (dist=3), despite having a - // larger bounding box. We explicitly prefer the more compact L-shape here for - // better speculative latency. - if (cgra_count == 3) { - auto non_rect_shapes = getNonRectangularShapes(3); - if (!non_rect_shapes.empty()) { - return non_rect_shapes.front(); - } - } SmallVector candidates = getRectangularShapes(cgra_count); for (const auto &s : getNonRectangularShapes(cgra_count)) { candidates.push_back(s); } - if (!candidates.empty()) { - return *std::min_element(candidates.begin(), candidates.end(), - [](const CgraShape &a, const CgraShape &b) { - int area_a = a.area(); - int area_b = b.area(); - if (area_a != area_b) - return area_a < area_b; - return std::abs(a.rows - a.cols) < - std::abs(b.rows - b.cols); - }); + // Selects the shape with smallest bounding-box area first; + // among equal areas, prefers the most square-like shape. + assert(!candidates.empty() && + "No valid shapes for cgra_count in [1..kMaxCgrasPerTask]"); + return *std::min_element(candidates.begin(), candidates.end(), + [](const CgraShape &a, const CgraShape &b) { + int area_a = a.area(); + int area_b = b.area(); + if (area_a != area_b){ + return area_a < area_b; + } + return std::abs(a.rows - a.cols) < + std::abs(b.rows - b.cols); + }); +} + +//===----------------------------------------------------------------------===// +// Global Placement Feasibility Check +//===----------------------------------------------------------------------===// + +// Generates all placement-candidate shapes for `cgra_count` CGRAs, including +// rotations. Rectangular shapes include both orientations (rows×cols and +// cols×rows, deduplicated for squares). Non-rectangular shapes include all +// four 90° rotations. +// +// Ordering (tried first to last): +// 1. Rectangular shapes, sorted by squareness (e.g. 
2×2 before 1×4), +// with smaller bounding-box area as tiebreaker. +// 2. Non-rectangular shapes (L, T, etc.) in all unique rotations. +static SmallVector getAllPlacementShapes(int cgra_count) { + SmallVector shapes; + + // 1. Rectangular shapes with both orientations, deduplicated. + { + llvm::DenseSet seen_keys; // encodes (rows<<16)|cols + for (int row_dim = 1; row_dim <= kCgraGridRows; ++row_dim) { + for (int col_dim = 1; col_dim <= kCgraGridCols; ++col_dim) { + if (row_dim * col_dim == cgra_count) { + int64_t key = ((int64_t)row_dim << 16) | col_dim; + if (seen_keys.insert(key).second) { + shapes.push_back({row_dim, col_dim, true, {}}); + // Adds the rotated orientation if different (e.g. 1×4 -> 4×1). + if (row_dim != col_dim) { + int64_t rotated_key = ((int64_t)col_dim << 16) | row_dim; + if (seen_keys.insert(rotated_key).second) + shapes.push_back({col_dim, row_dim, true, {}}); + } + } + } + } + } + // Sorts rectangles: prefer more square-like (smaller |rows-cols|), then + // smaller bounding-box area as tiebreaker. + llvm::sort(shapes, [](const CgraShape &lhs, const CgraShape &rhs) { + int squareness_lhs = std::abs(lhs.rows - lhs.cols); + int squareness_rhs = std::abs(rhs.rows - rhs.cols); + if (squareness_lhs != squareness_rhs) + return squareness_lhs < squareness_rhs; + return lhs.area() < rhs.area(); + }); } - // Fallback: smallest bounding box (should not be reached for 1..4 CGRAs). - CgraShape best = {kCgraGridRows, kCgraGridCols, false, {}}; - for (int r = 1; r <= kCgraGridRows; ++r) { - for (int c = 1; c <= kCgraGridCols; ++c) { - if (r * c >= cgra_count && r * c < best.area()) { - best = {r, c, false, {}}; + // 2. Non-rectangular shapes with all four 90° rotations. + auto base_non_rect = getNonRectangularShapes(cgra_count); + for (const auto &base : base_non_rect) { + // Generates 4 rotations of the cgra_positions list. + // Rotation by 90° CW: (col, row) -> (row, -col). + // Each rotation is normalised so that offsets start from (0, 0). 
+ SmallVector>, 4> rotation_variants; + rotation_variants.push_back( + SmallVector>(base.cgra_positions)); + + // Rotates 3 more times (90°, 180°, 270°). + auto prev_positions = base.cgra_positions; + for (int rotation_idx = 0; rotation_idx < 3; ++rotation_idx) { + SmallVector> rotated_positions; + for (auto &[col_off, row_off] : prev_positions) + rotated_positions.push_back( + {row_off, -col_off}); // 90° CW in (col, row) space + + // Normalises to non-negative offsets starting from (0, 0). + int min_col = INT_MAX, min_row = INT_MAX; + for (auto &[col_off, row_off] : rotated_positions) { + min_col = std::min(min_col, col_off); + min_row = std::min(min_row, row_off); + } + for (auto &[col_off, row_off] : rotated_positions) { + col_off -= min_col; + row_off -= min_row; + } + rotation_variants.push_back(rotated_positions); + prev_positions = rotated_positions; + } + + // Deduplicates rotations that produce the same position set. + llvm::DenseSet seen_hashes; + for (auto &positions : rotation_variants) { + // Sorts positions for canonical comparison. + auto sorted_positions = positions; + llvm::sort(sorted_positions, + [](const std::pair &lhs, + const std::pair &rhs) { return lhs < rhs; }); + // Simple hash of sorted positions. + int64_t hash = 0; + for (auto &[col_off, row_off] : sorted_positions) + hash = hash * 131 + col_off * 17 + row_off; + if (!seen_hashes.insert(hash).second) + continue; + + // Computes bounding box for this rotation. + int max_col = 0, max_row = 0; + for (auto &[col_off, row_off] : positions) { + max_col = std::max(max_col, col_off); + max_row = std::max(max_row, row_off); + } + shapes.push_back( + {max_row + 1, max_col + 1, false, std::move(positions)}); + } + } + + return shapes; +} + +// Simulates greedy placement of all tasks' shapes on the kCgraGridRows × +// kCgraGridCols grid to verify that they physically fit without overlap. +// +// For each task, all valid shapes (including rotations) are tried. 
Rectangular +// shapes prefer square-like orientations (e.g. 2×2 over 1×4). Non-rectangular +// shapes are tried in all four 90° rotations. +// +// `task_cgra_counts` contains the cgra_count for every task in the graph +// (including the speculatively modified one). +// +// Returns true if all tasks can be placed without overlap. +static bool canAllTasksFitOnGrid(ArrayRef task_cgra_counts) { + // Quick capacity check: total CGRAs must not exceed grid size. + int total_cgras = 0; + for (int count : task_cgra_counts) + total_cgras += count; + if (total_cgras > kTotalCGRAs) + return false; + + // Simulates placement on a grid. + bool occupied[kCgraGridRows][kCgraGridCols] = {}; + + // Sorts tasks by descending cgra_count for better packing (largest-first + // decreasing, a standard bin-packing heuristic). Each task may have a + // different cgra_count because the balance phase only increments one + // bottleneck at a time; this array reflects the heterogeneous allocation + // across all tasks in the current trial configuration. + SmallVector sorted_counts(task_cgra_counts.begin(), + task_cgra_counts.end()); + llvm::sort(sorted_counts, [](int lhs, int rhs) { return lhs > rhs; }); + + for (int cgra_count : sorted_counts) { + SmallVector candidates = getAllPlacementShapes(cgra_count); + bool placed = false; + + for (const auto &shape : candidates) { + if (placed) + break; + + if (shape.is_rectangular) { + // Rectangular: tries every origin where the rows×cols bbox fits. 
+ for (int origin_row = 0; + origin_row <= kCgraGridRows - shape.rows && !placed; + ++origin_row) { + for (int origin_col = 0; + origin_col <= kCgraGridCols - shape.cols && !placed; + ++origin_col) { + bool fits = true; + for (int delta_row = 0; delta_row < shape.rows && fits; + ++delta_row) + for (int delta_col = 0; delta_col < shape.cols && fits; + ++delta_col) + if (occupied[origin_row + delta_row][origin_col + delta_col]) + fits = false; + if (fits) { + for (int delta_row = 0; delta_row < shape.rows; ++delta_row) + for (int delta_col = 0; delta_col < shape.cols; ++delta_col) + occupied[origin_row + delta_row][origin_col + delta_col] = + true; + placed = true; + } + } + } + } else { + // Non-rectangular: cgra_positions stores (col, row) offsets. + for (int origin_row = 0; origin_row < kCgraGridRows && !placed; + ++origin_row) { + for (int origin_col = 0; origin_col < kCgraGridCols && !placed; + ++origin_col) { + bool fits = true; + for (auto &[col_off, row_off] : shape.cgra_positions) { + int abs_row = origin_row + row_off; + int abs_col = origin_col + col_off; + if (abs_row < 0 || abs_row >= kCgraGridRows || abs_col < 0 || + abs_col >= kCgraGridCols || occupied[abs_row][abs_col]) { + fits = false; + break; + } + } + if (fits) { + for (auto &[col_off, row_off] : shape.cgra_positions) + occupied[origin_row + row_off][origin_col + col_off] = true; + placed = true; + } + } + } } } + + if (!placed) + return false; } - return best; + return true; } //===----------------------------------------------------------------------===// @@ -857,17 +1041,34 @@ class PipelineBalancer { int old_cgra_count = bottleneck->cgra_count; int new_cgra_count = old_cgra_count + 1; - // Check if incrementing cgra_count is feasible on the 4×4 grid. - // TODO: This currently only checks the capacity (total CGRA count). 
Ideally, - // we should invoke a global placement pass (aka AllocateCgraToTaskPass) here to - // verify if the speculatively increased CGRA count and its proposed shape - // actually fit on the 4x4 grid alongside other previously allocated tasks. - // + // Check 1: Per-task CGRA limit. if (!canFitOnGrid(new_cgra_count)) { saturated_nodes.insert(bottleneck); continue; } + // Check 2: Global placement feasibility — simulates placing all tasks' + // shapes (with the speculatively increased cgra_count for the bottleneck) + // on the physical kCgraGridRows × kCgraGridCols grid to verify they + // fit without overlap. + { + SmallVector trial_counts; + for (auto &node : graph.nodes) { + if (node.get() == bottleneck) + trial_counts.push_back(new_cgra_count); + else + trial_counts.push_back(node->cgra_count); + } + if (!canAllTasksFitOnGrid(trial_counts)) { + llvm::errs() << " Balance: global placement infeasible for Task " + << bottleneck->id << " (" + << bottleneck->op.getTaskName().str() + << ") with cgra_count=" << new_cgra_count << "\n"; + saturated_nodes.insert(bottleneck); + continue; + } + } + // Saves state for potential rollback. int64_t old_latency = bottleneck->estimatedLatency(); int64_t old_ii = bottleneck->ii; @@ -1782,14 +1983,8 @@ struct ResourceAwareTaskOptimizationPass // intermediate iterations; ii, steps, and trip_count live only in the // graph node and must be persisted here. // - // Note: no re-profiling is done here. When balance-skip-mapper=true - // (the default), the balance phase uses analytical estimates; those - // are the values written to the final IR. When - // balance-skip-mapper=false, the balance phase already ran the real - // mapper for each speculative probe, so the graph already contains - // accurate compiled_ii / steps values. Either way, the converged - // graph state is authoritative and written directly to IR. - + // Phase A: Write speculative attributes so AllocateCgraToTask can + // read cgra_count and tile_shape from the IR. 
for (auto &node : graph.nodes) { OpBuilder b(node->op); node->shape = pickBestShape(node->cgra_count); @@ -1799,18 +1994,109 @@ struct ResourceAwareTaskOptimizationPass node->op->setAttr("steps", b.getI32IntegerAttr(node->steps)); node->op->setAttr("trip_count", b.getI32IntegerAttr(node->trip_count)); - // Writes tile_shape attribute: simple "NxM" bounding-box string. - // The detailed occupancy diagram is printed in the summary below. std::string shape_str = node->shape.irAttr(); node->op->setAttr("tile_shape", b.getStringAttr(shape_str)); } - // Runs AllocateCgraToTaskPass to produce global placement (task_mapping_info) - // with multi-CGRA support. The pass reads cgra_count and tile_shape - // from each task and places them on the 4x4 grid, validating that - // shapes physically fit and don't overlap. + // Phase B: Run global placement. AllocateCgraToTask reads + // cgra_count / tile_shape from the IR and produces + // task_mapping_info with the actual cgra_positions on the 4×4 grid. taskflow::runAllocateCgraToTask(func, kCgraGridRows, kCgraGridCols); + // Phase C: Post-placement reconciliation. + // Reads back the actual placed shape from task_mapping_info + // and re-profiles tasks whose placed shape + // differs from the speculative pickBestShape. + for (auto &node : graph.nodes) { + auto mapping_attr = + node->op->getAttrOfType("task_mapping_info"); + if (!mapping_attr) + continue; + auto positions_attr = + mapping_attr.getAs("cgra_positions"); + if (!positions_attr || positions_attr.empty()) + continue; + + // Extracts (col, row) pairs from the placement result. + SmallVector> placed_positions; + for (Attribute pos_attr : positions_attr) { + auto coord = cast(pos_attr); + int row = cast(coord.get("row")).getInt(); + int col = cast(coord.get("col")).getInt(); + placed_positions.emplace_back(col, row); + } + + int actual_cgra_count = static_cast(placed_positions.size()); + + // Computes bounding box of the actual placement. 
+ int min_row = INT_MAX, max_row = INT_MIN; + int min_col = INT_MAX, max_col = INT_MIN; + for (auto &[col, row] : placed_positions) { + min_row = std::min(min_row, row); + max_row = std::max(max_row, row); + min_col = std::min(min_col, col); + max_col = std::max(max_col, col); + } + int bbox_rows = max_row - min_row + 1; + int bbox_cols = max_col - min_col + 1; + bool is_rect = (bbox_rows * bbox_cols == actual_cgra_count); + + // Builds the actual CgraShape. + CgraShape actual_shape; + actual_shape.rows = bbox_rows; + actual_shape.cols = bbox_cols; + actual_shape.is_rectangular = is_rect; + if (!is_rect) { + // Normalizes positions to (0,0) origin for the shape. + for (auto &[col, row] : placed_positions) + actual_shape.cgra_positions.emplace_back(col - min_col, + row - min_row); + } + + // Checks whether the placed shape differs from the speculative + // shape used during balance profiling. + bool shape_changed = + (actual_cgra_count != node->cgra_count) || + (actual_shape.rows != node->shape.rows) || + (actual_shape.cols != node->shape.cols) || + (actual_shape.is_rectangular != node->shape.is_rectangular); + + if (shape_changed) { + llvm::errs() + << "[ResourceAware] Post-placement shape mismatch for " + << node->op.getTaskName() + << ": speculative=" << node->shape.describe(node->cgra_count) + << ", actual=" << actual_shape.describe(actual_cgra_count) + << " — re-profiling\n"; + + // Updates the node to reflect the actual placement. + node->cgra_count = actual_cgra_count; + node->shape = actual_shape; + + // Re-profiles with the actual shape. + graph.profileTaskPublic(node.get(), node->op, + /*skip_mapper=*/use_analytical); + + // Writes updated attributes back to IR. 
+ OpBuilder b(node->op); + node->op->setAttr("cgra_count", + b.getI32IntegerAttr(node->cgra_count)); + node->op->setAttr("compiled_ii", + b.getI32IntegerAttr(node->ii)); + node->op->setAttr("steps", + b.getI32IntegerAttr(node->steps)); + std::string actual_shape_str = node->shape.irAttr(); + node->op->setAttr("tile_shape", + b.getStringAttr(actual_shape_str)); + + llvm::errs() + << "[ResourceAware] Post-placement re-profiled " + << node->op.getTaskName() + << ": compiled_ii=" << node->ii + << ", steps=" << node->steps << "\n"; + } + } + break; } } @@ -1828,15 +2114,55 @@ struct ResourceAwareTaskOptimizationPass std::vector> combined_grid( kCgraGridRows, std::vector(kCgraGridCols, -1)); - // Packs tasks onto the grid left-to-right, top-to-bottom. - int next_col = 0, next_row = 0; + // Packs tasks onto the grid using actual placement results. + int next_col = 0, next_row = 0; // Fallback for tasks without placement. int task_idx = 0; llvm::errs() << "\n=== Tile Occupation Summary (4x" << kCgraGridCols << " CGRA Grid) ===\n"; for (auto &node : final_graph.nodes) { - auto shape = pickBestShape(node->cgra_count); + // Reads the actual placed shape from task_mapping_info instead of + // re-computing with pickBestShape, so the summary is consistent + // with the real placement result. 
+ CgraShape shape = pickBestShape(node->cgra_count); // fallback + SmallVector> actual_grid_positions; + + if (auto mapping_attr = + node->op->getAttrOfType("task_mapping_info")) { + if (auto positions_attr = + mapping_attr.getAs("cgra_positions")) { + if (!positions_attr.empty()) { + actual_grid_positions.clear(); + int min_row = INT_MAX, max_row = INT_MIN; + int min_col = INT_MAX, max_col = INT_MIN; + for (Attribute pos_attr : positions_attr) { + auto coord = cast(pos_attr); + int row = cast(coord.get("row")).getInt(); + int col = cast(coord.get("col")).getInt(); + actual_grid_positions.emplace_back(col, row); + min_row = std::min(min_row, row); + max_row = std::max(max_row, row); + min_col = std::min(min_col, col); + max_col = std::max(max_col, col); + } + int bbox_rows = max_row - min_row + 1; + int bbox_cols = max_col - min_col + 1; + int placed_count = + static_cast(actual_grid_positions.size()); + bool is_rect = (bbox_rows * bbox_cols == placed_count); + shape.rows = bbox_rows; + shape.cols = bbox_cols; + shape.is_rectangular = is_rect; + shape.cgra_positions.clear(); + if (!is_rect) { + for (auto &[c, r] : actual_grid_positions) + shape.cgra_positions.emplace_back(c - min_col, r - min_row); + } + } + } + } + int tile_rows = shape.rows * neura::getArchitecture().getPerCgraRows(); int tile_cols = shape.cols * neura::getArchitecture().getPerCgraColumns(); @@ -1873,20 +2199,29 @@ struct ResourceAwareTaskOptimizationPass llvm::errs() << "\n"; } - // Places onto combined grid (pack sequentially). - int placed = 0; - for (int r = next_row; r < kCgraGridRows && placed < node->cgra_count; - ++r) { - for (int c = (r == next_row ? next_col : 0); - c < kCgraGridCols && placed < node->cgra_count; ++c) { - combined_grid[r][c] = task_idx; - next_row = r; - next_col = c + 1; - if (next_col >= kCgraGridCols) { - next_col = 0; - next_row = r + 1; + // Places onto combined grid using actual placement positions when + // available, falling back to sequential packing. 
+ if (!actual_grid_positions.empty()) { + for (auto &[col, row] : actual_grid_positions) { + if (row >= 0 && row < kCgraGridRows && col >= 0 && + col < kCgraGridCols) + combined_grid[row][col] = task_idx; + } + } else { + int placed = 0; + for (int r = next_row; + r < kCgraGridRows && placed < node->cgra_count; ++r) { + for (int c = (r == next_row ? next_col : 0); + c < kCgraGridCols && placed < node->cgra_count; ++c) { + combined_grid[r][c] = task_idx; + next_row = r; + next_col = c + 1; + if (next_col >= kCgraGridCols) { + next_col = 0; + next_row = r + 1; + } + ++placed; } - ++placed; } } ++task_idx; diff --git a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir index 8d6971db..96e4e3d1 100644 --- a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir +++ b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir @@ -193,16 +193,16 @@ module { // RESOPT: taskflow.task @Task_0_Task_1_utilfused // RESOPT-SAME: {cgra_count = 3 : i32, compiled_ii = 1 : i32, steps = 10 : i32 -// RESOPT-SAME: tile_shape = "2x2[(0,0)(1,0)(0,1)]", trip_count = 64 : i32} +// RESOPT-SAME: tile_shape = "1x3", trip_count = 64 : i32} // RESOPT: return // CGRA Tile Occupation after RESOPT (4x4 grid, col x row): // +---+---+---+---+ -// | 0 | 0 | . | . | row=0: Task_0_Task_1_utilfused occupies 3 CGRAs -// +---+---+---+---+ in a 2x2 non-rectangular layout: -// | 0 | . | . | . | (0,0), (1,0), (0,1) +// | 0 | 0 | 0 | . | row=0: Task_0_Task_1_utilfused occupies 3 CGRAs +// +---+---+---+---+ in a 1x3 rectangular layout: +// | . | . | . | . | (0,0), (1,0), (2,0) // +---+---+---+---+ -// | . | . | . | . | Total tile array: 8x8 (3 CGRAs × 16 tiles = 48 tiles) +// | . | . | . | . | Total tile array: 4x12 (3 CGRAs × 16 tiles = 48 tiles) // +---+---+---+---+ // | . | . | . | . | res_mii=3 (16 tiles) → 2 (32 tiles) → 1 (48 tiles) // +---+---+---+---+