From 87cb9a0c72d8e27fcded4a98e7b3f885ba0016f0 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Thu, 5 Mar 2026 12:51:04 +0800 Subject: [PATCH 01/10] feat(taskflow): integrate MapTaskOnCgraPass for multi-CGRA placements --- include/TaskflowDialect/TaskflowPasses.h | 6 + .../Transforms/MapTaskOnCgraPass.cpp | 183 +++++++++++++----- .../ResourceAwareTaskOptimizationPass.cpp | 7 + .../irregular-loop/irregular-loop.mlir | 6 +- .../taskflow/multi-nested/multi-nested.mlir | 19 +- .../parallel-nested/parallel-nested.mlir | 3 +- .../taskflow/resnet/simple_resnet_tosa.mlir | 18 +- .../resource-heavy/resource-heavy.mlir | 3 +- 8 files changed, 183 insertions(+), 62 deletions(-) diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index a23c5b02..866365eb 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -5,6 +5,7 @@ #include "TaskflowDialect/TaskflowDialect.h" #include "TaskflowDialect/TaskflowOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" @@ -23,6 +24,11 @@ std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); std::unique_ptr createMapTaskOnCgraPass(); +// Runs the CGRA task placement logic directly on a function. +// grid_rows/grid_cols default to 4x4 (kCgraGridRows/kCgraGridCols). 
+void runMapTaskOnCgra(mlir::func::FuncOp func, + int grid_rows = 4, int grid_cols = 4); + //=========================================================// // Optimization Passes //=========================================================// diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index c04df0b7..3ed417c4 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -407,25 +407,97 @@ class TaskMapper { } - /// Finds best placement for a task. - /// TODO: Currently defaults to single-CGRA placement. Multi-CGRA binding logic - /// (cgra_count > 1) is experimental/placeholder and should ideally be handled - /// by an upstream resource binding pass. + // Parses a tile_shape string like "2x2" or "2x2[(0,0)(1,0)(0,1)]". + // Returns a list of (col, row) offsets relative to the placement origin. + // For rectangular shapes "NxM", generates all NxM positions. + // For non-rectangular shapes with explicit positions, uses the listed coords. + SmallVector> parseTileShapeOffsets( + StringRef tile_shape, int cgra_count) { + SmallVector> offsets; + + if (tile_shape.empty() || cgra_count <= 1) { + offsets.push_back({0, 0}); + return offsets; + } + + // Checks for explicit position list: "NxM[(c0,r0)(c1,r1)...]" + size_t bracket_pos = tile_shape.find('['); + if (bracket_pos != StringRef::npos) { + StringRef positions_str = tile_shape.substr(bracket_pos); + // Parses each (c,r) pair. 
+ size_t pos = 0; + while (pos < positions_str.size()) { + size_t open = positions_str.find('(', pos); + if (open == StringRef::npos) break; + size_t close = positions_str.find(')', open); + if (close == StringRef::npos) break; + StringRef pair_str = positions_str.slice(open + 1, close); + auto [col_str, row_str] = pair_str.split(','); + int col_off = 0, row_off = 0; + col_str.getAsInteger(10, col_off); + row_str.getAsInteger(10, row_off); + offsets.push_back({col_off, row_off}); + pos = close + 1; + } + } else { + // Rectangular shape: "NxM" — parse rows × cols. + auto [rows_str, cols_str] = tile_shape.split('x'); + int rows = 1, cols = 1; + rows_str.getAsInteger(10, rows); + cols_str.getAsInteger(10, cols); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + offsets.push_back({c, r}); + } + } + } + + // Sanity: if parsing failed, at least return a single cell. + if (offsets.empty()) { + offsets.push_back({0, 0}); + } + return offsets; + } + + // Finds best placement for a task on the CGRA grid. + // For cgra_count > 1, reads the tile_shape attribute to determine the + // physical layout (rectangular or L/T-shape) and validates that all + // required positions fit within the grid boundary and are unoccupied. TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, TaskMemoryGraph &graph) { int best_score = INT_MIN; TaskPlacement best_placement; - // Baseline: For cgra_count=1, finds single best position. + // Reads tile_shape attribute if present. + StringRef tile_shape; + if (auto attr = task_node->op->getAttrOfType("tile_shape")) { + tile_shape = attr.getValue(); + } + + // Parses shape offsets from tile_shape string. + SmallVector> shape_offsets = + parseTileShapeOffsets(tile_shape, cgra_count); + + // Tries every valid placement origin on the grid. for (int r = 0; r < grid_rows_; ++r) { for (int c = 0; c < grid_cols_; ++c) { - if (occupied_[r][c]) { + // Checks if ALL positions in the shape fit within bounds and are free. 
+ bool valid = true; + TaskPlacement candidate; + for (auto &[col_off, row_off] : shape_offsets) { + int pr = r + row_off; + int pc = c + col_off; + if (pr < 0 || pr >= grid_rows_ || pc < 0 || pc >= grid_cols_ || + occupied_[pr][pc]) { + valid = false; + break; + } + candidate.cgra_positions.push_back({pr, pc}); + } + if (!valid) { continue; } - TaskPlacement candidate; - candidate.cgra_positions.push_back({r, c}); - int score = computeScore(task_node, candidate, graph); if (score > best_score) { best_score = score; @@ -436,21 +508,35 @@ class TaskMapper { // Error handling: No available position found (grid over-subscribed). if (best_placement.cgra_positions.empty()) { - assert(false && "No available CGRA position found (grid over-subscribed)."); + llvm::errs() << "[MapTaskOnCgra] WARNING: No valid placement for task " + << task_node->op.getTaskName() + << " with cgra_count=" << cgra_count + << " tile_shape=" << tile_shape << "\n"; + // Fallback: place on any single free cell. + for (int r = 0; r < grid_rows_ && best_placement.cgra_positions.empty(); ++r) { + for (int c = 0; c < grid_cols_ && best_placement.cgra_positions.empty(); ++c) { + if (!occupied_[r][c]) { + best_placement.cgra_positions.push_back({r, c}); + } + } + } + if (best_placement.cgra_positions.empty()) { + assert(false && "No available CGRA position found (grid over-subscribed)."); + } } return best_placement; } - /// Computes placement score based on Task-Memory Graph. - /// TODO: Introduce explicit 'direct_wires' attributes in the IR for - /// downstream hardware generators to configure fast bypass paths between - /// adjacent PEs with dependencies. - /// - /// Score = α·SSA_Dist + β·Mem_Dist. - /// - /// SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands). - /// Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs. + // Computes placement score based on Task-Memory Graph. 
+ // For multi-CGRA placements, uses the minimum distance from any position + // in the placement to the target, since adjacent CGRAs can communicate + // via fast bypass paths. + // + // Score = α·SSA_Dist + β·Mem_Dist. + // + // SSA_Dist: Minimize distance to placed SSA predecessors (ssa_operands). + // Mem_Dist: Minimize distance to assigned SRAMs for read/write memrefs. int computeScore(TaskNode *task_node, const TaskPlacement &placement, TaskMemoryGraph &graph) { // Weight constants (tunable). @@ -459,40 +545,44 @@ class TaskMapper { int ssa_score = 0; int mem_score = 0; - - CGRAPosition current_pos = placement.primary(); + + // Helper: minimum Manhattan distance from any position in this placement + // to a target position. + auto minDistToTarget = [&](const CGRAPosition &target) -> int { + int min_dist = INT_MAX; + for (const auto &pos : placement.cgra_positions) { + int d = pos.manhattanDistance(target); + min_dist = std::min(min_dist, d); + } + return min_dist; + }; // 1. SSA proximity (predecessors & successors). for (TaskNode *producer : task_node->ssa_operands) { - if (!producer->placement.empty()) { - int dist = current_pos.manhattanDistance(producer->placement[0]); - // Uses negative distance to penalize far-away placements. - ssa_score -= dist; - } + if (!producer->placement.empty()) { + int dist = minDistToTarget(producer->placement[0]); + ssa_score -= dist; + } } for (TaskNode *consumer : task_node->ssa_users) { - if (!consumer->placement.empty()) { - int dist = current_pos.manhattanDistance(consumer->placement[0]); - ssa_score -= dist; - } + if (!consumer->placement.empty()) { + int dist = minDistToTarget(consumer->placement[0]); + ssa_score -= dist; + } } // 2. Memory proximity. - // For read memrefs. 
for (MemoryNode *mem : task_node->read_memrefs) { - if (mem->assigned_sram_pos) { - int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); - mem_score -= dist; - } + if (mem->assigned_sram_pos) { + int dist = minDistToTarget(*mem->assigned_sram_pos); + mem_score -= dist; + } } - // For write memrefs. - // If we write to a memory that is already assigned (e.g. read by previous task), - // we want to be close to it too. for (MemoryNode *mem : task_node->write_memrefs) { - if (mem->assigned_sram_pos) { - int dist = current_pos.manhattanDistance(*mem->assigned_sram_pos); - mem_score -= dist; - } + if (mem->assigned_sram_pos) { + int dist = minDistToTarget(*mem->assigned_sram_pos); + mem_score -= dist; + } } return kAlpha * ssa_score + kBeta * mem_score; @@ -564,8 +654,8 @@ struct MapTaskOnCgraPass void runOnOperation() override { func::FuncOp func = getOperation(); - constexpr int kDefaultGridRows = 3; - constexpr int kDefaultGridCols = 3; + constexpr int kDefaultGridRows = 4; + constexpr int kDefaultGridCols = 4; TaskMapper mapper(kDefaultGridRows, kDefaultGridCols); mapper.place(func); } @@ -580,5 +670,10 @@ std::unique_ptr createMapTaskOnCgraPass() { return std::make_unique(); } +void runMapTaskOnCgra(func::FuncOp func, int grid_rows, int grid_cols) { + TaskMapper mapper(grid_rows, grid_cols); + mapper.place(func); +} + } // namespace taskflow } // namespace mlir diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index c5052b83..2ce2c18e 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -1768,6 +1768,13 @@ struct ResourceAwareTaskOptimizationPass std::string shape_str = node->shape.irAttr(); node->op->setAttr("tile_shape", b.getStringAttr(shape_str)); } + + // Runs MapTaskOnCgraPass to 
produce global placement (task_mapping_info) + // with multi-CGRA support. The pass reads cgra_count and tile_shape + // from each task and places them on the 4x4 grid, validating that + // shapes physically fit and don't overlap. + taskflow::runMapTaskOnCgra(func, kCgraGridRows, kCgraGridCols); + break; } } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 364bcadc..13c56ab1 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -395,6 +395,8 @@ module attributes {} { // 0=Task_0_Task_1_utilfused, 1=Task_2; 2/16 CGRAs used // RESOPT: taskflow.task @Task_0_Task_1_utilfused -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 3 : i32, steps = 5 : i32, tile_shape = "1x1", trip_count = 32 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 3 : i32, steps = 5 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 32 : i32} // RESOPT: taskflow.task @Task_2 -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32, tile_shape = "1x1", trip_count = 32 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 32 : i32} diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index 42f99361..c4e7b76c 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -523,22 +523,25 @@ module attributes {} { // HYPERBLOCK-NEXT:} // PLACEMENT: taskflow.task @Task_0 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 0 : i32}], 
read_sram_locations = [{col = 0 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 0 : i32}]} // PLACEMENT: taskflow.task @Task_1 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 0 : i32}]} // PLACEMENT: taskflow.task @Task_2 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}, {col = 0 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 3 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}, {col = 2 : i32, row = 0 : i32}, {col = 3 : i32, row = 0 : i32}], write_sram_locations = [{col = 3 : i32, row = 0 : i32}]} // PLACEMENT: taskflow.task @Task_3 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 2 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 2 : i32, row = 0 : i32}], read_sram_locations = [{col = 2 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} // PLACEMENT: taskflow.task @Task_4 -// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]} +// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 
: i32}], read_sram_locations = [{col = 0 : i32, row = 1 : i32}, {col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]} // RESOPT: taskflow.task @Task_1 -// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 160 : i32 +// RESOPT-SAME: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 160 : i32 // RESOPT: taskflow.task @Task_0_Task_2_fused_Task_3_utilfused -// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 5 : i32, tile_shape = "1x1", trip_count = 192 : i32 +// RESOPT-SAME: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 5 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 192 : i32 // RESOPT: taskflow.task @Task_4 -// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 36 : i32 +// RESOPT-SAME: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 36 : i32 // RESOPT: return // CGRA Tile Occupation after RESOPT (4x4 grid, col x row): diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index 3d63f767..881d81ec 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -161,7 +161,8 @@ module { // PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 0 : i32}]} // RESOPT: taskflow.task @Task_0_Task_1_utilfused -// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 64 : i32 +// RESOPT-SAME: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 64 : i32 
// RESOPT: return // CGRA Tile Occupation after RESOPT (4x4 grid, col x row): diff --git a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir index f1741b0a..f6974c26 100644 --- a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir +++ b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir @@ -704,17 +704,23 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // RESOPT: taskflow.task @Task_1_Task_0_Task_2_utilfused_utilfused -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 4 : i32, steps = 3 : i32, tile_shape = "1x1", trip_count = 6400 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 4 : i32, steps = 3 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 6400 : i32} // RESOPT: taskflow.task @Task_3 -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32, tile_shape = "1x1", trip_count = 2359296 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 2359296 : i32} // RESOPT: taskflow.task @Task_4_Task_5_fused_Task_7_utilfused -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32, tile_shape = "1x1", trip_count = 6400 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 6400 : i32} // RESOPT: taskflow.task @Task_6_Task_8_utilfused -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 3 : i32, tile_shape = "1x1", trip_count = 4096 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 3 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 4096 : i32} // RESOPT: taskflow.task @Task_9 -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32, tile_shape = "1x1", trip_count = 2359296 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32 +// RESOPT-SAME: 
tile_shape = "1x1", trip_count = 2359296 : i32} // RESOPT: taskflow.task @Task_10_Task_11_Task_12_fused_fused -// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 8 : i32, tile_shape = "1x1", trip_count = 4096 : i32} +// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 8 : i32 +// RESOPT-SAME: tile_shape = "1x1", trip_count = 4096 : i32} // RESOPT: return diff --git a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir index ffc37f2d..3e253b8c 100644 --- a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir +++ b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir @@ -192,7 +192,8 @@ module { // TASKFLOW: return // RESOPT: taskflow.task @Task_0_Task_1_utilfused -// RESOPT-SAME: {cgra_count = 3 : i32, compiled_ii = 1 : i32, steps = 10 : i32, tile_shape = "2x2[(0,0)(1,0)(0,1)]", trip_count = 64 : i32} +// RESOPT-SAME: {cgra_count = 3 : i32, compiled_ii = 1 : i32, steps = 10 : i32 +// RESOPT-SAME: tile_shape = "2x2[(0,0)(1,0)(0,1)]", trip_count = 64 : i32} // RESOPT: return // CGRA Tile Occupation after RESOPT (4x4 grid, col x row): From 273610ac91ec32639ab6c99033641f4798eab747 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Thu, 5 Mar 2026 13:33:27 +0800 Subject: [PATCH 02/10] Refined fallback logic --- .../Transforms/MapTaskOnCgraPass.cpp | 110 +++++++++++++++++- 1 file changed, 106 insertions(+), 4 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index 3ed417c4..c321545f 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -22,7 +22,9 @@ #include #include #include +#include #include +#include #include #include @@ -453,6 +455,8 @@ class TaskMapper { } // Sanity: if parsing failed, at least return a single cell. + // USER COMMENT: Assert here. 
+ assert(!offsets.empty() && "tile_shape parsing yielded empty offsets"); if (offsets.empty()) { offsets.push_back({0, 0}); } @@ -513,13 +517,111 @@ class TaskMapper { << " with cgra_count=" << cgra_count << " tile_shape=" << tile_shape << "\n"; // Fallback: place on any single free cell. - for (int r = 0; r < grid_rows_ && best_placement.cgra_positions.empty(); ++r) { - for (int c = 0; c < grid_cols_ && best_placement.cgra_positions.empty(); ++c) { - if (!occupied_[r][c]) { - best_placement.cgra_positions.push_back({r, c}); + // USER COMMENT: The logic should be: 1.tires rectangular shape 2. If fails, try other shapes 3. If all fail, fallback to current cell -1 numbers of cells. + for (int k = cgra_count; k >= 1 && best_placement.cgra_positions.empty(); --k) { + + // 1. Try rectangular shapes of size k + SmallVector>> rect_shapes; + for (int r = 1; r <= k; ++r) { + if (k % r == 0) { + int c = k / r; + SmallVector> shape; + for (int i = 0; i < r; ++i) { + for (int j = 0; j < c; ++j) { + shape.push_back({j, i}); + } + } + rect_shapes.push_back(shape); + } + } + + int current_best_score = INT_MIN; + TaskPlacement current_best_placement; + + for (const auto &shape : rect_shapes) { + for (int r = 0; r < grid_rows_; ++r) { + for (int c = 0; c < grid_cols_; ++c) { + bool valid = true; + TaskPlacement candidate; + for (auto &[col_off, row_off] : shape) { + int pr = r + row_off; + int pc = c + col_off; + if (pr < 0 || pr >= grid_rows_ || pc < 0 || pc >= grid_cols_ || + occupied_[pr][pc]) { + valid = false; + break; + } + candidate.cgra_positions.push_back({pr, pc}); + } + if (valid) { + int score = computeScore(task_node, candidate, graph); + if (score > current_best_score) { + current_best_score = score; + current_best_placement = candidate; + } + } + } } } + + if (!current_best_placement.cgra_positions.empty()) { + best_placement = current_best_placement; + break; // Found valid rectangular placement + } + + // 2. 
Try other (non-rectangular) connected shapes of size k + std::set visited_masks; + int other_best_score = INT_MIN; + TaskPlacement other_best_placement; + + std::function&, uint64_t)> searchShapes = + [&](SmallVector& current, uint64_t mask) { + if (current.size() == (size_t)k) { + if (visited_masks.insert(mask).second) { + TaskPlacement candidate; + candidate.cgra_positions = current; + int score = computeScore(task_node, candidate, graph); + if (score > other_best_score) { + other_best_score = score; + other_best_placement = candidate; + } + } + return; + } + for (size_t i = 0; i < current.size(); ++i) { + auto pos = current[i]; + const int dr[] = {-1, 1, 0, 0}; + const int dc[] = {0, 0, -1, 1}; + for (int d = 0; d < 4; ++d) { + int nr = pos.row + dr[d]; + int nc = pos.col + dc[d]; + if (nr >= 0 && nr < grid_rows_ && nc >= 0 && nc < grid_cols_ && !occupied_[nr][nc]) { + uint64_t bit = 1ULL << (nr * grid_cols_ + nc); + if ((mask & bit) == 0) { + current.push_back({nr, nc}); + searchShapes(current, mask | bit); + current.pop_back(); + } + } + } + } + }; + + for (int r = 0; r < grid_rows_; ++r) { + for (int c = 0; c < grid_cols_; ++c) { + if (!occupied_[r][c]) { + SmallVector start = {{r, c}}; + searchShapes(start, 1ULL << (r * grid_cols_ + c)); + } + } + } + + if (!other_best_placement.cgra_positions.empty()) { + best_placement = other_best_placement; + break; // Found valid non-rectangular connected placement + } } + if (best_placement.cgra_positions.empty()) { assert(false && "No available CGRA position found (grid over-subscribed)."); } From 798fe841bcd8c601d87bf85007c2c27f11d4798f Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Mar 2026 09:01:05 +0800 Subject: [PATCH 03/10] refactor(taskflow): cascading shape search in MapTaskOnCgraPass placement - findBestPlacement now tries rectangular shapes first, then non-rectangular connected shapes, then falls back to k-1 CGRAs (down to 1). 
- Removed outdated TODO comment about MapTaskOnCgraPass not supporting multi-CGRA placement. - Added assert for empty tile_shape offsets. - Cleaned up USER COMMENT annotations. --- .../Transforms/MapTaskOnCgraPass.cpp | 200 ++++++++---------- .../ResourceAwareTaskOptimizationPass.cpp | 4 - 2 files changed, 88 insertions(+), 116 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index c321545f..081a3417 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -454,38 +454,21 @@ class TaskMapper { } } - // Sanity: if parsing failed, at least return a single cell. - // USER COMMENT: Assert here. + // Sanity: if parsing failed, assert. assert(!offsets.empty() && "tile_shape parsing yielded empty offsets"); - if (offsets.empty()) { - offsets.push_back({0, 0}); - } return offsets; } - // Finds best placement for a task on the CGRA grid. - // For cgra_count > 1, reads the tile_shape attribute to determine the - // physical layout (rectangular or L/T-shape) and validates that all - // required positions fit within the grid boundary and are unoccupied. - TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, - TaskMemoryGraph &graph) { + // Tries placing a shape (given as col/row offsets) at every grid origin. + // Returns the best-scoring valid placement, or empty if none fits. + TaskPlacement tryPlaceShape( + TaskNode *task_node, + const SmallVector> &shape_offsets, + TaskMemoryGraph &graph) { int best_score = INT_MIN; TaskPlacement best_placement; - - // Reads tile_shape attribute if present. - StringRef tile_shape; - if (auto attr = task_node->op->getAttrOfType("tile_shape")) { - tile_shape = attr.getValue(); - } - - // Parses shape offsets from tile_shape string. - SmallVector> shape_offsets = - parseTileShapeOffsets(tile_shape, cgra_count); - - // Tries every valid placement origin on the grid. 
for (int r = 0; r < grid_rows_; ++r) { for (int c = 0; c < grid_cols_; ++c) { - // Checks if ALL positions in the shape fit within bounds and are free. bool valid = true; TaskPlacement candidate; for (auto &[col_off, row_off] : shape_offsets) { @@ -498,10 +481,7 @@ class TaskMapper { } candidate.cgra_positions.push_back({pr, pc}); } - if (!valid) { - continue; - } - + if (!valid) continue; int score = computeScore(task_node, candidate, graph); if (score > best_score) { best_score = score; @@ -509,97 +489,60 @@ class TaskMapper { } } } + return best_placement; + } - // Error handling: No available position found (grid over-subscribed). - if (best_placement.cgra_positions.empty()) { - llvm::errs() << "[MapTaskOnCgra] WARNING: No valid placement for task " - << task_node->op.getTaskName() - << " with cgra_count=" << cgra_count - << " tile_shape=" << tile_shape << "\n"; - // Fallback: place on any single free cell. - // USER COMMENT: The logic should be: 1.tires rectangular shape 2. If fails, try other shapes 3. If all fail, fallback to current cell -1 numbers of cells. - for (int k = cgra_count; k >= 1 && best_placement.cgra_positions.empty(); --k) { - - // 1. 
Try rectangular shapes of size k - SmallVector>> rect_shapes; - for (int r = 1; r <= k; ++r) { - if (k % r == 0) { - int c = k / r; - SmallVector> shape; - for (int i = 0; i < r; ++i) { - for (int j = 0; j < c; ++j) { - shape.push_back({j, i}); - } - } - rect_shapes.push_back(shape); - } - } - - int current_best_score = INT_MIN; - TaskPlacement current_best_placement; - - for (const auto &shape : rect_shapes) { - for (int r = 0; r < grid_rows_; ++r) { - for (int c = 0; c < grid_cols_; ++c) { - bool valid = true; - TaskPlacement candidate; - for (auto &[col_off, row_off] : shape) { - int pr = r + row_off; - int pc = c + col_off; - if (pr < 0 || pr >= grid_rows_ || pc < 0 || pc >= grid_cols_ || - occupied_[pr][pc]) { - valid = false; - break; - } - candidate.cgra_positions.push_back({pr, pc}); - } - if (valid) { - int score = computeScore(task_node, candidate, graph); - if (score > current_best_score) { - current_best_score = score; - current_best_placement = candidate; - } - } - } - } - } - - if (!current_best_placement.cgra_positions.empty()) { - best_placement = current_best_placement; - break; // Found valid rectangular placement - } + // Generates all rectangular shapes (as col/row offset lists) of size k. + // E.g. k=4 → 1×4, 2×2, 4×1. + SmallVector>> getRectShapes(int k) { + SmallVector>> shapes; + for (int rows = 1; rows <= k; ++rows) { + if (k % rows != 0) continue; + int cols = k / rows; + SmallVector> offsets; + for (int r = 0; r < rows; ++r) + for (int c = 0; c < cols; ++c) + offsets.push_back({c, r}); // {col_off, row_off} + shapes.push_back(offsets); + } + return shapes; + } - // 2. Try other (non-rectangular) connected shapes of size k - std::set visited_masks; - int other_best_score = INT_MIN; - TaskPlacement other_best_placement; + // Searches all connected non-rectangular shapes of size k on the grid + // and returns the best-scoring valid placement, or empty if none found. 
+ TaskPlacement tryNonRectShapes(TaskNode *task_node, int k, + TaskMemoryGraph &graph) { + std::set visited_masks; + int best_score = INT_MIN; + TaskPlacement best_placement; - std::function&, uint64_t)> searchShapes = - [&](SmallVector& current, uint64_t mask) { - if (current.size() == (size_t)k) { + std::function &, uint64_t)> search = + [&](SmallVector ¤t, uint64_t mask) { + if ((int)current.size() == k) { if (visited_masks.insert(mask).second) { TaskPlacement candidate; candidate.cgra_positions = current; int score = computeScore(task_node, candidate, graph); - if (score > other_best_score) { - other_best_score = score; - other_best_placement = candidate; + if (score > best_score) { + best_score = score; + best_placement = candidate; } } return; } + constexpr int dr[] = {-1, 1, 0, 0}; + constexpr int dc[] = {0, 0, -1, 1}; for (size_t i = 0; i < current.size(); ++i) { auto pos = current[i]; - const int dr[] = {-1, 1, 0, 0}; - const int dc[] = {0, 0, -1, 1}; for (int d = 0; d < 4; ++d) { int nr = pos.row + dr[d]; int nc = pos.col + dc[d]; - if (nr >= 0 && nr < grid_rows_ && nc >= 0 && nc < grid_cols_ && !occupied_[nr][nc]) { + if (nr >= 0 && nr < grid_rows_ && nc >= 0 && nc < grid_cols_ && + !occupied_[nr][nc]) { uint64_t bit = 1ULL << (nr * grid_cols_ + nc); if ((mask & bit) == 0) { current.push_back({nr, nc}); - searchShapes(current, mask | bit); + search(current, mask | bit); current.pop_back(); } } @@ -607,26 +550,59 @@ class TaskMapper { } }; - for (int r = 0; r < grid_rows_; ++r) { - for (int c = 0; c < grid_cols_; ++c) { - if (!occupied_[r][c]) { - SmallVector start = {{r, c}}; - searchShapes(start, 1ULL << (r * grid_cols_ + c)); - } - } + for (int r = 0; r < grid_rows_; ++r) { + for (int c = 0; c < grid_cols_; ++c) { + if (!occupied_[r][c]) { + SmallVector start = {{r, c}}; + search(start, 1ULL << (r * grid_cols_ + c)); } + } + } + return best_placement; + } + + // Finds best placement for a task on the CGRA grid. 
+ // + // Search order (for k = cgra_count down to 1): + // 1. Try all rectangular shapes of size k (1×k, 2×(k/2), …, k×1). + // 2. If none fits, try all connected non-rectangular shapes of size k. + // 3. If still nothing, decrement k and repeat. + // This guarantees the task gets the largest possible contiguous CGRA + // allocation that physically fits on the current grid. + TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, + TaskMemoryGraph &graph) { + TaskPlacement best_placement; - if (!other_best_placement.cgra_positions.empty()) { - best_placement = other_best_placement; - break; // Found valid non-rectangular connected placement + for (int k = cgra_count; k >= 1; --k) { + // 1. Rectangular shapes of size k. + for (auto &shape : getRectShapes(k)) { + best_placement = tryPlaceShape(task_node, shape, graph); + if (!best_placement.cgra_positions.empty()) { + if (k < cgra_count) { + llvm::errs() << "[MapTaskOnCgra] Fallback: placed " + << task_node->op.getTaskName() + << " on " << k << " CGRAs (requested " + << cgra_count << ")\n"; + } + return best_placement; } } - if (best_placement.cgra_positions.empty()) { - assert(false && "No available CGRA position found (grid over-subscribed)."); + // 2. Non-rectangular connected shapes of size k. + best_placement = tryNonRectShapes(task_node, k, graph); + if (!best_placement.cgra_positions.empty()) { + if (k < cgra_count) { + llvm::errs() << "[MapTaskOnCgra] Fallback (non-rect): placed " + << task_node->op.getTaskName() + << " on " << k << " CGRAs (requested " + << cgra_count << ")\n"; + } + return best_placement; } } + // Should never reach here on a valid grid. 
+ assert(false && "No available CGRA position found (grid over-subscribed)."); return best_placement; } diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index 2ce2c18e..5a7e0d63 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -852,10 +852,6 @@ class PipelineBalancer { // verify if the speculatively increased CGRA count and its proposed shape // actually fit on the 4x4 grid alongside other previously allocated tasks. // - // Currently, MapTaskOnCgraPass does not support multi-CGRA task placement. - // Once it does, we should call it here; if global placement fails for the - // "best" shape, we should backtrack and try alternative shapes before - // saturating the node. if (!canFitOnGrid(new_cgra_count)) { saturated_nodes.insert(bottleneck); continue; From 1b41e8dd32226db022db36d7475538fb4af2142d Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Mar 2026 09:08:39 +0800 Subject: [PATCH 04/10] refactor(taskflow): cascading placement search with fallback, clean up comments - findBestPlacement tries rect then non-rect shapes for requested cgra_count. - If placement fails, caller falls back to cgra_count-1 (reject extra CGRA). - Normalize /// to // comment style throughout MapTaskOnCgraPass. - Remove outdated TODO comments. 
--- .../Transforms/MapTaskOnCgraPass.cpp | 118 ++++++++---------- 1 file changed, 53 insertions(+), 65 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index 081a3417..1c37c39b 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -36,7 +36,7 @@ namespace { //===----------------------------------------------------------------------===// // CGRA Grid Position //===----------------------------------------------------------------------===// -/// Represents a position on the 2D CGRA grid. +// Represents a position on the 2D CGRA grid. struct CGRAPosition { int row; int col; @@ -49,12 +49,12 @@ struct CGRAPosition { return !(*this == other); } - /// Computes Manhattan distance to another position. + // Computes Manhattan distance to another position. int manhattanDistance(const CGRAPosition &other) const { return std::abs(row - other.row) + std::abs(col - other.col); } - /// Checks if adjacent (Manhattan distance = 1). + // Checks if adjacent (Manhattan distance = 1). bool isAdjacent(const CGRAPosition &other) const { return manhattanDistance(other) == 1; } @@ -63,19 +63,19 @@ struct CGRAPosition { //===----------------------------------------------------------------------===// // Task Placement Info //===----------------------------------------------------------------------===// -/// Stores placement info for a task: can span multiple combined CGRAs. +// Stores placement info for a task: can span multiple combined CGRAs. struct TaskPlacement { SmallVector cgra_positions; // CGRAs assigned to this task. - /// Returns the primary (first) position. + // Returns the primary (first) position. CGRAPosition primary() const { return cgra_positions.empty() ? CGRAPosition{-1, -1} : cgra_positions[0]; } - /// Returns the number of CGRAs assigned. + // Returns the number of CGRAs assigned. 
size_t cgraCount() const { return cgra_positions.size(); } - /// Checks if any CGRA in this task is adjacent to any in other task. + // Checks if any CGRA in this task is adjacent to any in other task. bool hasAdjacentCGRA(const TaskPlacement &other) const { for (const auto &pos : cgra_positions) { for (const auto &other_pos : other.cgra_positions) { @@ -94,7 +94,7 @@ struct TaskPlacement { struct MemoryNode; -/// Represents a Task node in the graph. +// Represents a Task node in the graph. struct TaskNode { size_t id; TaskflowTaskOp op; @@ -112,7 +112,7 @@ struct TaskNode { TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {} }; -/// Represents a Memory node (MemRef) in the graph. +// Represents a Memory node (MemRef) in the graph. struct MemoryNode { Value memref; @@ -126,7 +126,7 @@ struct MemoryNode { MemoryNode(Value memref) : memref(memref) {} }; -/// The Task-Memory Dependency Graph. +// The Task-Memory Dependency Graph. class TaskMemoryGraph { public: SmallVector> task_nodes; @@ -196,7 +196,7 @@ class TaskMemoryGraph { //===----------------------------------------------------------------------===// // Task Mapper //===----------------------------------------------------------------------===// -/// Maps a task-memory graph onto a 2D CGRA grid. +// Maps a task-memory graph onto a 2D CGRA grid. class TaskMapper { public: @@ -208,7 +208,7 @@ class TaskMapper { } } - /// Places all tasks and performs memory mapping. + // Places all tasks and performs memory mapping. void place(func::FuncOp func) { SmallVector tasks; func.walk([&](TaskflowTaskOp task) { tasks.push_back(task); }); @@ -268,11 +268,20 @@ class TaskMapper { // Finds best placement using SRAM positions from previous iter (or -1/default). TaskPlacement placement = findBestPlacement(task_node, cgra_count, graph); - + + // If the requested cgra_count doesn't fit, fall back to cgra_count-1 + // (i.e. reject the extra CGRA and keep previous allocation). 
+ if (placement.cgra_positions.empty() && cgra_count > 1) { + int fallback = cgra_count - 1; + llvm::errs() << "[MapTaskOnCgra] Cannot place " + << task_node->op.getTaskName() + << " with cgra_count=" << cgra_count + << ", falling back to " << fallback << "\n"; + placement = findBestPlacement(task_node, fallback, graph); + } + // Commits Placement. task_node->placement.push_back(placement.primary()); - // Handles mapping one task on multi-CGRAs. - // TODO: Introduce explicit multi-CGRA binding logic. for (size_t i = 1; i < placement.cgra_positions.size(); ++i) { task_node->placement.push_back(placement.cgra_positions[i]); } @@ -359,7 +368,7 @@ class TaskMapper { } private: - /// Clears task placement and occupied grid. + // Clears task placement and occupied grid. void resetTaskPlacements(TaskMemoryGraph &graph) { for (auto &task : graph.task_nodes) { task->placement.clear(); @@ -370,8 +379,8 @@ class TaskMapper { } } - /// Assigns all memory nodes to SRAMs based on centroid of accessing tasks. - /// Returns true if any SRAM assignment changed. + // Assigns all memory nodes to SRAMs based on centroid of accessing tasks. + // Returns true if any SRAM assignment changed. bool assignAllSRAMs(TaskMemoryGraph &graph) { bool changed = false; for (auto &mem_node : graph.memory_nodes) { @@ -563,47 +572,26 @@ class TaskMapper { // Finds best placement for a task on the CGRA grid. // - // Search order (for k = cgra_count down to 1): - // 1. Try all rectangular shapes of size k (1×k, 2×(k/2), …, k×1). - // 2. If none fits, try all connected non-rectangular shapes of size k. - // 3. If still nothing, decrement k and repeat. - // This guarantees the task gets the largest possible contiguous CGRA - // allocation that physically fits on the current grid. + // Search order: + // 1. Try all rectangular shapes of size cgra_count. + // 2. If none fits, try all connected non-rectangular shapes of size cgra_count. + // 3. 
If still nothing, return empty (caller handles fallback to cgra_count-1). TaskPlacement findBestPlacement(TaskNode *task_node, int cgra_count, TaskMemoryGraph &graph) { - TaskPlacement best_placement; - - for (int k = cgra_count; k >= 1; --k) { - // 1. Rectangular shapes of size k. - for (auto &shape : getRectShapes(k)) { - best_placement = tryPlaceShape(task_node, shape, graph); - if (!best_placement.cgra_positions.empty()) { - if (k < cgra_count) { - llvm::errs() << "[MapTaskOnCgra] Fallback: placed " - << task_node->op.getTaskName() - << " on " << k << " CGRAs (requested " - << cgra_count << ")\n"; - } - return best_placement; - } - } + // 1. Rectangular shapes. + for (auto &shape : getRectShapes(cgra_count)) { + TaskPlacement p = tryPlaceShape(task_node, shape, graph); + if (!p.cgra_positions.empty()) return p; + } - // 2. Non-rectangular connected shapes of size k. - best_placement = tryNonRectShapes(task_node, k, graph); - if (!best_placement.cgra_positions.empty()) { - if (k < cgra_count) { - llvm::errs() << "[MapTaskOnCgra] Fallback (non-rect): placed " - << task_node->op.getTaskName() - << " on " << k << " CGRAs (requested " - << cgra_count << ")\n"; - } - return best_placement; - } + // 2. Non-rectangular connected shapes. + if (cgra_count > 1) { + TaskPlacement p = tryNonRectShapes(task_node, cgra_count, graph); + if (!p.cgra_positions.empty()) return p; } - // Should never reach here on a valid grid. - assert(false && "No available CGRA position found (grid over-subscribed)."); - return best_placement; + // Nothing fits — return empty so caller can decide. + return {}; } // Computes placement score based on Task-Memory Graph. @@ -666,16 +654,16 @@ class TaskMapper { return kAlpha * ssa_score + kBeta * mem_score; } - /// Computes dependency depth for all tasks in the graph. - /// - /// Dependency depth = longest path from this node to any sink node in the - /// dependency graph (via SSA or memory edges). 
- /// - /// Tasks with higher dependency depth have longer chains of dependent tasks - /// after them. By placing these tasks first: - /// 1. They get priority access to good grid positions. - /// 2. Their dependent tasks can then be positioned adjacent to them, - ///    minimizing inter-task communication distance. + // Computes dependency depth for all tasks in the graph. + // + // Dependency depth = longest path from this node to any sink node in the + // dependency graph (via SSA or memory edges). + // + // Tasks with higher dependency depth have longer chains of dependent tasks + // after them. By placing these tasks first: + // 1. They get priority access to good grid positions. + // 2. Their dependent tasks can then be positioned adjacent to them, + //    minimizing inter-task communication distance. void computeDependencyDepth(TaskMemoryGraph &graph) { DenseMap<TaskNode *, int> depth_cache; for (auto &node : graph.task_nodes) { @@ -683,7 +671,7 @@ class TaskMapper { } } -  /// Recursively calculates dependency depth for a single task. +  // Recursively calculates dependency depth for a single task. int calculateDepth(TaskNode *node, DenseMap<TaskNode *, int> &depth_cache) { if (depth_cache.count(node)) { return depth_cache[node]; From 8f37f89254c68c349daec89624a3378f7c34b63d Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Fri, 6 Mar 2026 09:30:03 +0800 Subject: [PATCH 05/10] fix(taskflow): multi-CGRA aware SRAM centroid and SSA scoring - SRAM centroid now includes ALL CGRA positions of multi-CGRA tasks, not just placement[0]. - SSA proximity scoring uses min distance between two multi-CGRA placements (minDistToPlacement) instead of only comparing to the other task's primary position.
--- .../Transforms/MapTaskOnCgraPass.cpp | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp index 1c37c39b..029404a8 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp @@ -387,16 +387,16 @@ class TaskMapper { // Computes centroid of all tasks that access this memory. int total_row = 0, total_col = 0, count = 0; for (TaskNode *reader : mem_node->readers) { -        if (!reader->placement.empty()) { -          total_row += reader->placement[0].row; -          total_col += reader->placement[0].col; +        for (const auto &pos : reader->placement) { +          total_row += pos.row; +          total_col += pos.col; count++; } } for (TaskNode *writer : mem_node->writers) { -        if (!writer->placement.empty()) { -          total_row += writer->placement[0].row; -          total_col += writer->placement[0].col; +        for (const auto &pos : writer->placement) { +          total_row += pos.row; +          total_col += pos.col; count++; } } @@ -612,13 +612,24 @@ class TaskMapper { int ssa_score = 0; int mem_score = 0; +    // Helper: minimum Manhattan distance between any position in this +    // placement and any position in another task's placement. +    auto minDistToPlacement = [&](const SmallVector<CGRAPosition> &other) -> int { +      int min_dist = INT_MAX; +      for (const auto &pos : placement.cgra_positions) { +        for (const auto &opos : other) { +          min_dist = std::min(min_dist, pos.manhattanDistance(opos)); +        } +      } +      return min_dist; +    }; + // Helper: minimum Manhattan distance from any position in this placement -    // to a target position. +    // to a single target position.
auto minDistToTarget = [&](const CGRAPosition &target) -> int { int min_dist = INT_MAX; for (const auto &pos : placement.cgra_positions) { - int d = pos.manhattanDistance(target); - min_dist = std::min(min_dist, d); + min_dist = std::min(min_dist, pos.manhattanDistance(target)); } return min_dist; }; @@ -626,13 +637,13 @@ class TaskMapper { // 1. SSA proximity (predecessors & successors). for (TaskNode *producer : task_node->ssa_operands) { if (!producer->placement.empty()) { - int dist = minDistToTarget(producer->placement[0]); + int dist = minDistToPlacement(producer->placement); ssa_score -= dist; } } for (TaskNode *consumer : task_node->ssa_users) { if (!consumer->placement.empty()) { - int dist = minDistToTarget(consumer->placement[0]); + int dist = minDistToPlacement(consumer->placement); ssa_score -= dist; } } From c17d7fcfa08913a989e88067d1baa949b998a0b6 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Sat, 7 Mar 2026 12:06:51 +0800 Subject: [PATCH 06/10] refactor: rename MapToAcceleratorPass to MapOperationOnTilePass --- .../NeuraDialect/Architecture/Architecture.h | 2 +- include/NeuraDialect/NeuraPasses.h | 6 +- include/NeuraDialect/NeuraPasses.td | 10 +-- lib/NeuraDialect/NeuraPasses.cpp | 2 +- lib/NeuraDialect/Transforms/CMakeLists.txt | 2 +- ...torPass.cpp => MapOperationOnTilePass.cpp} | 68 +++++++++---------- .../ResourceAwareTaskOptimizationPass.cpp | 20 +++--- test/arch_spec/README.md | 2 +- test/c2llvm2mlir/nested_loop/test.mlir | 2 +- test/c2llvm2mlir/simple_loop/test.mlir | 2 +- test/code_gen/test_code_generate.mlir | 2 +- .../perfect_nested/perfect_nested.mlir | 2 +- .../simple_loop/simple_loop.mlir | 2 +- .../simple_loop_reduction.mlir | 2 +- test/e2e/axpy/axpy_kernel.mlir | 2 +- test/e2e/bicg/bicg_int_kernel.mlir | 2 +- test/e2e/bicg/bicg_kernel.mlir | 2 +- test/e2e/fft/fft_kernel.mlir | 2 +- test/e2e/fir/fir_kernel.mlir | 2 +- test/e2e/fir/fir_kernel_vec.mlir | 2 +- test/e2e/gemm/gemm_kernel.mlir | 2 +- test/e2e/gemv/gemv_kernel.mlir | 2 +- 
test/e2e/histogram/histogram_kernel.mlir | 2 +- test/e2e/relu/relu_kernel.mlir | 2 +- test/e2e/spmv/spmv_kernel.mlir | 2 +- test/honor_arch/fir_removed_tiles_test.mlir | 2 +- test/mapping_quality/branch_for.mlir | 4 +- test/mapping_quality/tiny_loop.mlir | 4 +- test/multi-cgra/kernel_mapping/fir/fir.mlir | 2 +- .../loop-in-kernel/loop-in-kernel.mlir | 2 +- test/multi-cgra/kernel_mapping/relu/relu.mlir | 2 +- test/neura/ctrl/branch_for.mlir | 4 +- test/neura/for_loop/relu_test.mlir | 2 +- test/neura/fusion/test.mlir | 4 +- .../steer_ctrl/loop_with_return_value.mlir | 2 +- 35 files changed, 87 insertions(+), 87 deletions(-) rename lib/NeuraDialect/Transforms/{MapToAcceleratorPass.cpp => MapOperationOnTilePass.cpp} (87%) diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h index 4f8e5cc2..cef36626 100644 --- a/include/NeuraDialect/Architecture/Architecture.h +++ b/include/NeuraDialect/Architecture/Architecture.h @@ -505,7 +505,7 @@ class Architecture { // specific tiles are valid: // std::vector overrides; // // First mark all tiles as non-existent, then mark valid ones existent. - // // (see MapToAcceleratorPass for the full valid_tiles parsing logic) + // // (see MapOperationOnTilePass for the full valid_tiles parsing logic) // auto arch_T = getArchitecture().cloneWithNewDimensions(8, 12, overrides); std::unique_ptr cloneWithNewDimensions( int new_per_cgra_rows, int new_per_cgra_columns, diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 56a9e785..f06568a1 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -23,10 +23,10 @@ std::unique_ptr createInsertCtrlMovPass(); std::unique_ptr createAssignAcceleratorPass(); std::unique_ptr createTransformCtrlToDataFlowPass(); std::unique_ptr createLeveragePredicatedValuePass(); -// Creates the MapToAccelerator pass. Tile dimensions default to 0 (use +// Creates the MapOperationOnTile pass. 
Tile dimensions default to 0 (use // architecture singleton) when not specified via options. -std::unique_ptr createMapToAcceleratorPass( - const MapToAcceleratorOptions &options = MapToAcceleratorOptions{}); +std::unique_ptr createMapOperationOnTilePass( + const MapOperationOnTileOptions &options = MapOperationOnTileOptions{}); std::unique_ptr createGenerateCodePass(); std::unique_ptr createCanonicalizeReturnPass(); std::unique_ptr createCanonicalizeLiveInPass(); diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index f7fc06a3..965aaa2e 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -50,7 +50,7 @@ def LeveragePredicatedValue : Pass<"leverage-predicated-value", "ModuleOp"> { let constructor = "neura::createLeveragePredicatedValuePass()"; } -def MapToAccelerator : Pass<"map-to-accelerator", "ModuleOp"> { +def MapOperationOnTile : Pass<"map-operation-on-tile", "ModuleOp"> { let summary = "Map Neura operations onto a given accelerator"; let description = [{ This pass performs mapping from Neura operations to accelerator. @@ -65,11 +65,11 @@ def MapToAccelerator : Pass<"map-to-accelerator", "ModuleOp"> { Examples: Single CGRA (default): - --map-to-accelerator + --map-operation-on-tile 1×3 rectangular (3 CGRAs in a row): - --map-to-accelerator x-tiles=12 y-tiles=4 + --map-operation-on-tile x-tiles=12 y-tiles=4 T-shape (4 CGRAs: top row of 3 + centre below): - --map-to-accelerator x-tiles=12 y-tiles=8 \ + --map-operation-on-tile x-tiles=12 y-tiles=8 \ valid-tiles="0_0,1_0,2_0,3_0,4_0,5_0,6_0,7_0,8_0,9_0,10_0,11_0,\ 4_1,5_1,6_1,7_1,4_4,5_4,6_4,7_4,4_5,5_5,6_5,7_5" }]; @@ -89,7 +89,7 @@ def MapToAccelerator : Pass<"map-to-accelerator", "ModuleOp"> { "x-tiles x y-tiles rectangle are valid. 
" "Example: 0_0,1_0,0_1 selects three tiles forming an L-shape."> ]; - let constructor = "neura::createMapToAcceleratorPass()"; + let constructor = "neura::createMapOperationOnTilePass()"; } def GenerateCode : Pass<"generate-code", "ModuleOp"> { diff --git a/lib/NeuraDialect/NeuraPasses.cpp b/lib/NeuraDialect/NeuraPasses.cpp index 80b6a6f1..296626b1 100644 --- a/lib/NeuraDialect/NeuraPasses.cpp +++ b/lib/NeuraDialect/NeuraPasses.cpp @@ -43,7 +43,7 @@ void mlir::neura::registerNeuraConversionPassPipeline() { pm.addPass(mlir::neura::createInsertDataMovPass()); pm.addPass(mlir::createPrintOpGraphPass(os)); - pm.addPass(mlir::neura::createMapToAcceleratorPass()); + pm.addPass(mlir::neura::createMapOperationOnTilePass()); pm.addPass(mlir::neura::createGenerateCodePass()); }); } diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt index 010fc3c7..703f3360 100644 --- a/lib/NeuraDialect/Transforms/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/CMakeLists.txt @@ -8,7 +8,7 @@ add_mlir_library( AssignAcceleratorPass.cpp TransformCtrlToDataFlowPass.cpp LeveragePredicatedValuePass.cpp - MapToAcceleratorPass.cpp + MapOperationOnTilePass.cpp GenerateCodePass.cpp CanonicalizeReturnPass.cpp CanonicalizeLiveInPass.cpp diff --git a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/MapOperationOnTilePass.cpp similarity index 87% rename from lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp rename to lib/NeuraDialect/Transforms/MapOperationOnTilePass.cpp index f6166968..ae55ff73 100644 --- a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp +++ b/lib/NeuraDialect/Transforms/MapOperationOnTilePass.cpp @@ -32,11 +32,11 @@ using namespace mlir::neura::yamlkeys; #include "NeuraDialect/NeuraPasses.h.inc" namespace { -struct MapToAcceleratorPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapToAcceleratorPass) +struct MapOperationOnTilePass + : public PassWrapper> { + 
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapOperationOnTilePass) - StringRef getArgument() const override { return "map-to-accelerator"; } + StringRef getArgument() const override { return "map-operation-on-tile"; } StringRef getDescription() const override { return "Maps IR to the target accelerator."; } @@ -45,14 +45,14 @@ struct MapToAcceleratorPass registry.insert(); } - MapToAcceleratorPass() = default; - MapToAcceleratorPass(const MapToAcceleratorOptions &options) : MapToAcceleratorPass() { + MapOperationOnTilePass() = default; + MapOperationOnTilePass(const MapOperationOnTileOptions &options) : MapOperationOnTilePass() { this->x_tiles = options.x_tiles; this->y_tiles = options.y_tiles; this->valid_tiles = options.valid_tiles; } - MapToAcceleratorPass(const MapToAcceleratorPass &pass) - : PassWrapper>(pass) {} + MapOperationOnTilePass(const MapOperationOnTilePass &pass) + : PassWrapper>(pass) {} Option mappingStrategy{ *this, "mapping-strategy", llvm::cl::desc("Mapping strategy to use for mapping operations to the " @@ -104,10 +104,10 @@ struct MapToAcceleratorPass } if (mapping_mode_str == attr::val::kSpatialOnly || mapping_mode_str == attr::val::kSpatialTemporal) { - llvm::errs() << "[MapToAcceleratorPass] Using Mapping Mode: " + llvm::errs() << "[MapOperationOnTilePass] Using Mapping Mode: " << mapping_mode_str << "\n"; } else { - llvm::errs() << "[MapToAcceleratorPass] Unsupported mapping mode: " + llvm::errs() << "[MapOperationOnTilePass] Unsupported mapping mode: " << mapping_mode_str << "\n"; return false; } @@ -144,29 +144,29 @@ struct MapToAcceleratorPass mapping_strategy = std::make_unique(max_loc, max_depth); llvm::errs() - << "[MapToAcceleratorPass] Use custom backtrack parameters: " + << "[MapOperationOnTilePass] Use custom backtrack parameters: " << "max_location_to_try=" << max_loc << ", max_backtrack_depth=" << max_depth << "\n"; } else { - llvm::errs() << "[MapToAcceleratorPass] Illegal customized " + llvm::errs() << 
"[MapOperationOnTilePass] Illegal customized " "parameters format: " << backtrack_str << "\n"; return false; } } else { - llvm::errs() << "[MapToAcceleratorPass] Illegal customized " + llvm::errs() << "[MapOperationOnTilePass] Illegal customized " "parameters format: " << backtrack_str << "\n"; return false; } } else { - llvm::errs() << "[MapToAcceleratorPass] Unsupported backtrack config: " + llvm::errs() << "[MapOperationOnTilePass] Unsupported backtrack config: " << backtrack_str << "\n"; return false; } resolved_mapping_strategy = mapping_strategy_str.str(); } else { - llvm::errs() << "[MapToAcceleratorPass] Unsupported mapping strategy: " + llvm::errs() << "[MapOperationOnTilePass] Unsupported mapping strategy: " << mapping_strategy_str << "\n"; return false; } @@ -184,12 +184,12 @@ struct MapToAcceleratorPass for (Operation *op : sorted_ops) { op->setAttr(attr::kDfgId, IntegerAttr::get(IntegerType::get(ctx, 32), next_id)); - llvm::errs() << "[MapToAcceleratorPass] Assigned dfg_id=" << next_id + llvm::errs() << "[MapOperationOnTilePass] Assigned dfg_id=" << next_id << " to " << *op << "\n"; next_id++; } - llvm::errs() << "[MapToAcceleratorPass] Assigned " << next_id + llvm::errs() << "[MapOperationOnTilePass] Assigned " << next_id << " dfg_id(s) in total\n"; } @@ -231,7 +231,7 @@ struct MapToAcceleratorPass } if (longest) { - llvm::outs() << "[MapToAcceleratorPass] Longest recurrence cycle (length " + llvm::outs() << "[MapOperationOnTilePass] Longest recurrence cycle (length " << longest->length << "):\n"; for (Operation *op : longest->operations) { op->print(llvm::outs()), llvm::outs() << "\n"; @@ -241,7 +241,7 @@ struct MapToAcceleratorPass rec_mii = 1; // No recurrence cycles found, set MII to 1. 
} - llvm::errs() << "[MapToAcceleratorPass] Calculated Recurrence MII: " + llvm::errs() << "[MapOperationOnTilePass] Calculated Recurrence MII: " << rec_mii << "\n"; int res_mii = calculateResMii(region, architecture); @@ -265,7 +265,7 @@ struct MapToAcceleratorPass if (parent_op && parent_op->getName().getStringRef().contains(attr::val::kOpFused)) { // Skips operations inside a fused_op region. - llvm::outs() << "[MapToAcceleratorPass] Skipping op inside fused_op: " + llvm::outs() << "[MapOperationOnTilePass] Skipping op inside fused_op: " << *op << "\n"; skipped_count++; continue; @@ -275,19 +275,19 @@ struct MapToAcceleratorPass topologically_sorted_ops = std::move(filtered_ops); if (skipped_count > 0) { - llvm::errs() << "[MapToAcceleratorPass] Filtered out " << skipped_count + llvm::errs() << "[MapOperationOnTilePass] Filtered out " << skipped_count << " operations inside fused_op regions\n"; } for (Operation *op : topologically_sorted_ops) { - llvm::outs() << "[MapToAcceleratorPass] Topologically sorted op: " << *op + llvm::outs() << "[MapOperationOnTilePass] Topologically sorted op: " << *op << "\n"; } std::vector> level_buckets = getOpsInAlapLevels(topologically_sorted_ops, critical_ops); for (int level = 0; level < static_cast(level_buckets.size()); ++level) { - llvm::outs() << "[MapToAcceleratorPass] ALAP Bucket Level " << level + llvm::outs() << "[MapOperationOnTilePass] ALAP Bucket Level " << level << ": " << level_buckets[level].size() << " ops\n"; for (Operation *op : level_buckets[level]) { llvm::outs() << " " << *op << "\n"; @@ -296,12 +296,12 @@ struct MapToAcceleratorPass std::vector> sorted_ops_with_alap_levels = flatten_level_buckets(level_buckets, critical_ops); for (const auto &[op, level] : sorted_ops_with_alap_levels) { - llvm::outs() << "[MapToAcceleratorPass] ALAP sorted op: " << *op + llvm::outs() << "[MapOperationOnTilePass] ALAP sorted op: " << *op << " (ALAP level: " << level << ")\n"; } // assert(false); for (int ii = possible_min_ii; 
ii <= max_ii; ++ii) { - llvm::errs() << "[MapToAcceleratorPass] Start mapping with target II of " + llvm::errs() << "[MapOperationOnTilePass] Start mapping with target II of " << ii << "\n"; // Creates a mapping state for the current II. MappingState mapping_state(architecture, ii, is_spatial_only); @@ -349,18 +349,18 @@ struct MapToAcceleratorPass op->setAttr(attr::kMappingInfo, mapping_info); return true; } - llvm::errs() << "[MapToAcceleratorPass] Mapping failed for target II of " + llvm::errs() << "[MapOperationOnTilePass] Mapping failed for target II of " << ii << "\n"; mapping_state.dumpOpToLocs(); } llvm::errs() - << "[MapToAcceleratorPass] Mapping failed for all target II values.\n"; + << "[MapOperationOnTilePass] Mapping failed for all target II values.\n"; return false; } void runOnOperation() override { ModuleOp module = getOperation(); - llvm::errs() << "[MapToAcceleratorPass] Starting mapping pass...\n"; + llvm::errs() << "[MapOperationOnTilePass] Starting mapping pass...\n"; std::unique_ptr mapping_strategy; std::string resolved_mapping_mode; std::string resolved_mapping_strategy; @@ -414,7 +414,7 @@ struct MapToAcceleratorPass custom_arch = global_arch.cloneWithNewDimensions( y_tiles.getValue(), x_tiles.getValue(), additional_overrides); target_arch = custom_arch.get(); - llvm::errs() << "[MapToAcceleratorPass] Overriding architecture dimensions to " + llvm::errs() << "[MapOperationOnTilePass] Overriding architecture dimensions to " << y_tiles.getValue() << "x" << x_tiles.getValue() << " tiles.\n"; } @@ -432,7 +432,7 @@ struct MapToAcceleratorPass if (!mapRegion(kernel_op, kernel_region, architecture, mapping_strategy.get(), is_spatial_only, resolved_mapping_mode, resolved_mapping_strategy)) { - llvm::errs() << "[MapToAcceleratorPass] Mapping failed for kernel.\n"; + llvm::errs() << "[MapOperationOnTilePass] Mapping failed for kernel.\n"; signalPassFailure(); } }); @@ -450,7 +450,7 @@ struct MapToAcceleratorPass if (!mapRegion(func_op, func_region, 
architecture, mapping_strategy.get(), is_spatial_only, resolved_mapping_mode, resolved_mapping_strategy)) { - llvm::errs() << "[MapToAcceleratorPass] Failed to map function.\n"; + llvm::errs() << "[MapOperationOnTilePass] Failed to map function.\n"; signalPassFailure(); } }); @@ -461,9 +461,9 @@ struct MapToAcceleratorPass namespace mlir::neura { -std::unique_ptr createMapToAcceleratorPass( - const MapToAcceleratorOptions &options) { - return std::make_unique(options); +std::unique_ptr createMapOperationOnTilePass( + const MapOperationOnTileOptions &options) { + return std::make_unique(options); } } // namespace mlir::neura diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index 5a7e0d63..755cbb08 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -344,7 +344,7 @@ class TaskDependencyGraph { // Public wrapper for profileTask: used by UtilizationFuser to re-profile // fused tasks with the real downstream Neura pipeline. // When skip_mapper=true, only ResMII/RecMII analytical estimates are used - // (no MapToAcceleratorPass). This is safe for speculative balance checks + // (no MapOperationOnTilePass). This is safe for speculative balance checks // where the mapper may backtrack indefinitely on larger tile arrays. void profileTaskPublic(TaskGraphNode *node, TaskflowTaskOp task, bool skip_mapper = false) { @@ -363,7 +363,7 @@ class TaskDependencyGraph { } // Profiles a single TaskflowTaskOp: clones the task, wraps the kernel in a - // standalone func, and runs InsertDataMov + MapToAcceleratorPass to obtain + // standalone func, and runs InsertDataMov + MapOperationOnTilePass to obtain // ii. skip_mapper: use only ResMII/RecMII analytical estimates. 
void profileTask(TaskGraphNode *node, TaskflowTaskOp task, bool skip_mapper = false) { @@ -456,7 +456,7 @@ class TaskDependencyGraph { // InsertDataMov + mapper, and returns compiled_ii / cp_depth. // x_tiles/y_tiles: multi-CGRA tile grid dimensions. // valid_tiles: explicit tile list for non-rectangular shapes (empty = full). - // skip_mapper: skip MapToAcceleratorPass, use ResMII/RecMII only. + // skip_mapper: skip MapOperationOnTilePass, use ResMII/RecMII only. LogicalResult runNeuraPipelineOnKernel(MLIRContext *ctx, neura::KernelOp kernel, ModuleOp dst_module, @@ -563,7 +563,7 @@ class TaskDependencyGraph { }); } - // Optionally run MapToAcceleratorPass to get the true compiled_ii. + // Optionally run MapOperationOnTilePass to get the true compiled_ii. // // Guards: // 1. skip_mapper=true: caller explicitly requests analytical-only (e.g. @@ -609,19 +609,19 @@ class TaskDependencyGraph { << " limit=" << kMapperOpLimit << "\n"; if (all_data_movs_ok && total_mapped_ops <= kMapperOpLimit) { - // Runs MapToAcceleratorPass in a fresh pass manager on the already-lowered + // Runs MapOperationOnTilePass in a fresh pass manager on the already-lowered // dst_module (pre-mapper pipeline already ran above). // Passes the correct tile dimensions so the mapper uses the right array. PassManager pm2(ctx); pm2.enableVerifier(false); if (x_tiles > 0 && y_tiles > 0) { - neura::MapToAcceleratorOptions map_options; + neura::MapOperationOnTileOptions map_options; map_options.x_tiles = x_tiles; map_options.y_tiles = y_tiles; map_options.valid_tiles = valid_tiles; - pm2.addPass(neura::createMapToAcceleratorPass(map_options)); + pm2.addPass(neura::createMapOperationOnTilePass(map_options)); } else { - pm2.addPass(neura::createMapToAcceleratorPass()); + pm2.addPass(neura::createMapOperationOnTilePass()); } if (succeeded(pm2.run(dst_module))) { @@ -641,7 +641,7 @@ class TaskDependencyGraph { return success(); } // Mapper failed for all II values — keep ResMII/RecMII from above. 
- llvm::errs() << "[profileTask] WARNING: MapToAcceleratorPass failed, " + llvm::errs() << "[profileTask] WARNING: MapOperationOnTilePass failed, " << "keeping analytical fallback compiled_ii=" << compiled_ii << "\n"; } else { @@ -1612,7 +1612,7 @@ struct ResourceAwareTaskOptimizationPass // Estimation mode for profiling task II / steps. // "compiled" (default): runs the full Neura lowering + mapping pipeline - // to obtain accurate compiled_ii and steps from MapToAcceleratorPass. + // to obtain accurate compiled_ii and steps from MapOperationOnTilePass. // "analytical": uses only ResMII / RecMII analytical estimates without // running the mapper. Much faster but less accurate — useful for // rapid design-space exploration or when the mapper is unavailable. diff --git a/test/arch_spec/README.md b/test/arch_spec/README.md index 9741f984..af7ede8e 100644 --- a/test/arch_spec/README.md +++ b/test/arch_spec/README.md @@ -45,7 +45,7 @@ To use this architecture specification in your tests, add the following option t mlir-neura-opt input.mlir \ --assign-accelerator \ --lower-llvm-to-neura \ - --map-to-accelerator="mapping-strategy=heuristic" \ + --map-operation-on-tile="mapping-strategy=heuristic" \ --architecture-spec=arch_spec/architecture.yaml \ --generate-code ``` diff --git a/test/c2llvm2mlir/nested_loop/test.mlir b/test/c2llvm2mlir/nested_loop/test.mlir index 3bf536ff..9e64619c 100644 --- a/test/c2llvm2mlir/nested_loop/test.mlir +++ b/test/c2llvm2mlir/nested_loop/test.mlir @@ -21,7 +21,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=simple" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=simple" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-LLVM2NEURA-MAP // CHECK-LLVM2NEURA: accelerator = "neura" diff --git 
a/test/c2llvm2mlir/simple_loop/test.mlir b/test/c2llvm2mlir/simple_loop/test.mlir index 2af2d0c6..e5440152 100644 --- a/test/c2llvm2mlir/simple_loop/test.mlir +++ b/test/c2llvm2mlir/simple_loop/test.mlir @@ -45,7 +45,7 @@ // RUN: --view-op-graph \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized=5,3 dump-mapping-table=true" %t-kernel.mlir -o %t-kernel-mapped.mlir 2>&1 | tee %t-kernel-mapping-output.txt +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized=5,3 dump-mapping-table=true" %t-kernel.mlir -o %t-kernel-mapped.mlir 2>&1 | tee %t-kernel-mapping-output.txt // RUN: FileCheck %s --check-prefix=CHECK-MAPPING-TABLE < %t-kernel-mapping-output.txt // RUN: FileCheck %s --check-prefix=CHECK-LLVM2NEURA-MAP < %t-kernel-mapped.mlir diff --git a/test/code_gen/test_code_generate.mlir b/test/code_gen/test_code_generate.mlir index a9671b86..ae9c6cc8 100644 --- a/test/code_gen/test_code_generate.mlir +++ b/test/code_gen/test_code_generate.mlir @@ -6,7 +6,7 @@ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING diff --git a/test/controflow_fuse/perfect_nested/perfect_nested.mlir b/test/controflow_fuse/perfect_nested/perfect_nested.mlir index bbc5877e..4f0ff7e0 100644 --- a/test/controflow_fuse/perfect_nested/perfect_nested.mlir +++ b/test/controflow_fuse/perfect_nested/perfect_nested.mlir @@ -45,7 +45,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic 
backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: | FileCheck %s -check-prefix=MAPPING diff --git a/test/controflow_fuse/simple_loop/simple_loop.mlir b/test/controflow_fuse/simple_loop/simple_loop.mlir index e9c04f7c..320edeb8 100644 --- a/test/controflow_fuse/simple_loop/simple_loop.mlir +++ b/test/controflow_fuse/simple_loop/simple_loop.mlir @@ -70,7 +70,7 @@ // RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: -o %t-fused-mapped.mlir // RUN: FileCheck %s --input-file=%t-fused-mapped.mlir --check-prefix=FUSE-MAPPING diff --git a/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir b/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir index ace0dd26..e3335754 100644 --- a/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir +++ b/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir @@ -71,7 +71,7 @@ // RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml | FileCheck %s -check-prefix=FUSE-MAPPING module attributes {} { diff --git a/test/e2e/axpy/axpy_kernel.mlir b/test/e2e/axpy/axpy_kernel.mlir index 8d3e9fba..441f3a7f 100644 --- a/test/e2e/axpy/axpy_kernel.mlir +++ b/test/e2e/axpy/axpy_kernel.mlir @@ -15,7 +15,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ 
-// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir --check-prefix=MAPPING diff --git a/test/e2e/bicg/bicg_int_kernel.mlir b/test/e2e/bicg/bicg_int_kernel.mlir index f9aa4d3d..ac4d308a 100644 --- a/test/e2e/bicg/bicg_int_kernel.mlir +++ b/test/e2e/bicg/bicg_int_kernel.mlir @@ -19,7 +19,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir diff --git a/test/e2e/bicg/bicg_kernel.mlir b/test/e2e/bicg/bicg_kernel.mlir index d353ec1f..c016d053 100644 --- a/test/e2e/bicg/bicg_kernel.mlir +++ b/test/e2e/bicg/bicg_kernel.mlir @@ -38,7 +38,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING diff --git a/test/e2e/fft/fft_kernel.mlir b/test/e2e/fft/fft_kernel.mlir index 1f42fb7a..7df8b22b 100644 --- a/test/e2e/fft/fft_kernel.mlir +++ b/test/e2e/fft/fft_kernel.mlir @@ -15,7 +15,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s 
--input-file=%t-mapping.mlir --check-prefix=MAPPING diff --git a/test/e2e/fir/fir_kernel.mlir b/test/e2e/fir/fir_kernel.mlir index f7049b62..2991a59a 100644 --- a/test/e2e/fir/fir_kernel.mlir +++ b/test/e2e/fir/fir_kernel.mlir @@ -16,7 +16,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: cp %t.dir/tmp-generated-instructions.yaml %t-generated-instructions.yaml diff --git a/test/e2e/fir/fir_kernel_vec.mlir b/test/e2e/fir/fir_kernel_vec.mlir index 366feba8..a4b39d61 100644 --- a/test/e2e/fir/fir_kernel_vec.mlir +++ b/test/e2e/fir/fir_kernel_vec.mlir @@ -14,7 +14,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING diff --git a/test/e2e/gemm/gemm_kernel.mlir b/test/e2e/gemm/gemm_kernel.mlir index 3376fe0a..bbf3cbcb 100644 --- a/test/e2e/gemm/gemm_kernel.mlir +++ b/test/e2e/gemm/gemm_kernel.mlir @@ -15,7 +15,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir --check-prefix=MAPPING diff --git a/test/e2e/gemv/gemv_kernel.mlir b/test/e2e/gemv/gemv_kernel.mlir index 9f8f1317..4779714e 100644 --- 
a/test/e2e/gemv/gemv_kernel.mlir +++ b/test/e2e/gemv/gemv_kernel.mlir @@ -15,7 +15,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir --check-prefix=MAPPING diff --git a/test/e2e/histogram/histogram_kernel.mlir b/test/e2e/histogram/histogram_kernel.mlir index 9f2d6f23..43c2fab6 100644 --- a/test/e2e/histogram/histogram_kernel.mlir +++ b/test/e2e/histogram/histogram_kernel.mlir @@ -16,7 +16,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING diff --git a/test/e2e/relu/relu_kernel.mlir b/test/e2e/relu/relu_kernel.mlir index a6588a54..a73d5f61 100644 --- a/test/e2e/relu/relu_kernel.mlir +++ b/test/e2e/relu/relu_kernel.mlir @@ -23,7 +23,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING diff --git a/test/e2e/spmv/spmv_kernel.mlir b/test/e2e/spmv/spmv_kernel.mlir index 32a50da9..103e5511 100644 --- a/test/e2e/spmv/spmv_kernel.mlir +++ b/test/e2e/spmv/spmv_kernel.mlir @@ -15,7 +15,7 @@ // RUN: 
--transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ // RUN: --generate-code -o %t-mapping.mlir // RUN: FileCheck %s --input-file=%t-mapping.mlir --check-prefix=MAPPING diff --git a/test/honor_arch/fir_removed_tiles_test.mlir b/test/honor_arch/fir_removed_tiles_test.mlir index 23e4009d..f207b3b0 100644 --- a/test/honor_arch/fir_removed_tiles_test.mlir +++ b/test/honor_arch/fir_removed_tiles_test.mlir @@ -16,7 +16,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../arch_spec/custom_arch_with_removed_tiles.yaml \ // RUN: -o %t-after-mapping.mlir diff --git a/test/mapping_quality/branch_for.mlir b/test/mapping_quality/branch_for.mlir index f78a1be1..05374d2f 100644 --- a/test/mapping_quality/branch_for.mlir +++ b/test/mapping_quality/branch_for.mlir @@ -54,7 +54,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=../arch_spec/architecture.yaml \ // RUN: | FileCheck %s -check-prefix=MAPPING @@ -67,7 +67,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: --architecture-spec=../arch_spec/architecture.yaml \ // RUN: --generate-code // RUN: FileCheck %s 
--input-file=tmp-generated-instructions.yaml -check-prefix=YAML diff --git a/test/mapping_quality/tiny_loop.mlir b/test/mapping_quality/tiny_loop.mlir index 1b23c2bf..20382890 100644 --- a/test/mapping_quality/tiny_loop.mlir +++ b/test/mapping_quality/tiny_loop.mlir @@ -25,7 +25,7 @@ // RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic mapping-mode=spatial-only backtrack-config=customized=4,3" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic mapping-mode=spatial-only backtrack-config=customized=4,3" \ // RUN: --architecture-spec=../arch_spec/architecture.yaml \ // RUN: | FileCheck %s -check-prefix=SPATIAL @@ -45,7 +45,7 @@ // RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic mapping-mode=spatial-temporal backtrack-config=customized=4,4" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic mapping-mode=spatial-temporal backtrack-config=customized=4,4" \ // RUN: --architecture-spec=../arch_spec/architecture.yaml \ // RUN: | FileCheck %s -check-prefix=SPATIAL-TEMPORAL diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index f70d99ca..9a49b645 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -68,7 +68,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \ // RUN: -o %t.mapped.mlir // RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED diff --git a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir index 
1802e538..669a19a2 100644 --- a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir +++ b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir @@ -47,7 +47,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../../arch_spec/architecture.yaml \ // RUN: -o %t.mapped.mlir // RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED diff --git a/test/multi-cgra/kernel_mapping/relu/relu.mlir b/test/multi-cgra/kernel_mapping/relu/relu.mlir index 309c8512..664e1a2d 100644 --- a/test/multi-cgra/kernel_mapping/relu/relu.mlir +++ b/test/multi-cgra/kernel_mapping/relu/relu.mlir @@ -68,7 +68,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic" \ // RUN: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \ // RUN: -o %t.mapped.mlir // RUN: FileCheck %s --input-file=%t.mapped.mlir --check-prefixes=MAPPED diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index 0a7d6031..f170fea7 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -60,7 +60,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=simple" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=simple" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: | FileCheck %s -check-prefix=MAPPING @@ -75,7 +75,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic 
backtrack-config=simple" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=simple" \ // RUN: --architecture-spec=../../arch_spec/architecture.yaml \ // RUN: --generate-code // RUN: FileCheck %s --input-file=tmp-generated-instructions.yaml -check-prefix=YAML diff --git a/test/neura/for_loop/relu_test.mlir b/test/neura/for_loop/relu_test.mlir index a34e4fd7..7c90141d 100644 --- a/test/neura/for_loop/relu_test.mlir +++ b/test/neura/for_loop/relu_test.mlir @@ -30,7 +30,7 @@ // RUN: --transform-ctrl-to-data-flow \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \ +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" \ // RUN: | FileCheck %s --check-prefix=MAPPING // CHECK: func.func @_Z6kernelPiS_ diff --git a/test/neura/fusion/test.mlir b/test/neura/fusion/test.mlir index 63881151..ce3afe60 100644 --- a/test/neura/fusion/test.mlir +++ b/test/neura/fusion/test.mlir @@ -25,7 +25,7 @@ // RUN: --fold-constant \ // RUN: --fuse-pattern \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-MAPPING +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=customized" %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-MAPPING // CHECK-FUSED: func.func @_Z6kernelPA1024_iPiS1_S1_S1_ // CHECK-FUSED: accelerator = "neura" @@ -110,7 +110,7 @@ // RUN: --fold-constant \ // RUN: --iter-merge-pattern="min-support=3 max-iter=4" \ // RUN: --insert-data-mov \ -// RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=simple" %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-ITER-MERGE-PATTERN-MAPPING +// RUN: --map-operation-on-tile="mapping-strategy=heuristic backtrack-config=simple" %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-ITER-MERGE-PATTERN-MAPPING // 
CHECK-ITER-MERGE-PATTERN-MAPPING: mapping_info = {compiled_ii = 12 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 8 : i32, res_mii = 3 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32} diff --git a/test/neura/steer_ctrl/loop_with_return_value.mlir b/test/neura/steer_ctrl/loop_with_return_value.mlir index 1104a7a7..c35ab82c 100644 --- a/test/neura/steer_ctrl/loop_with_return_value.mlir +++ b/test/neura/steer_ctrl/loop_with_return_value.mlir @@ -30,7 +30,7 @@ // RUN: --transform-to-steer-control \ // RUN: --remove-predicated-type \ // RUN: --insert-data-mov -// RU: --map-to-accelerator="mapping-strategy=heuristic mapping-mode=spatial-only backtrack-config=customized" +// RU: --map-operation-on-tile="mapping-strategy=heuristic mapping-mode=spatial-only backtrack-config=customized" // RU: | FileCheck %s -check-prefix=MAPPING module { From 2d00d5a5da985bb13aaaa17cb21d74649eadce5e Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Sat, 7 Mar 2026 12:15:59 +0800 Subject: [PATCH 07/10] refactor: rename MapTaskOnCgraPass to AllocateCgraToTaskPass --- include/TaskflowDialect/TaskflowPasses.h | 4 ++-- include/TaskflowDialect/TaskflowPasses.td | 4 ++-- ...graPass.cpp => AllocateCgraToTaskPass.cpp} | 20 +++++++++---------- lib/TaskflowDialect/Transforms/CMakeLists.txt | 2 +- .../ResourceAwareTaskOptimizationPass.cpp | 6 +++--- .../irregular-loop/irregular-loop.mlir | 2 +- .../taskflow/multi-nested/multi-nested.mlir | 2 +- .../parallel-nested/parallel-nested.mlir | 2 +- 8 files changed, 21 insertions(+), 21 deletions(-) rename lib/TaskflowDialect/Transforms/{MapTaskOnCgraPass.cpp => AllocateCgraToTaskPass.cpp} (97%) diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index 866365eb..cd48c4a2 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -22,11 +22,11 @@ void registerTosaToAffineConversionPassPipeline(); #include 
"TaskflowDialect/TaskflowPasses.h.inc" std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); -std::unique_ptr createMapTaskOnCgraPass(); +std::unique_ptr createAllocateCgraToTaskPass(); // Runs the CGRA task placement logic directly on a function. // grid_rows/grid_cols default to 4x4 (kCgraGridRows/kCgraGridCols). -void runMapTaskOnCgra(mlir::func::FuncOp func, +void runAllocateCgraToTask(mlir::func::FuncOp func, int grid_rows = 4, int grid_cols = 4); //=========================================================// diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 8d765498..0869c01a 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -61,7 +61,7 @@ def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let constructor = "taskflow::createClassifyCountersPass()"; } -def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> { +def AllocateCgraToTask : Pass<"allocate-cgra-to-task", "func::FuncOp"> { let summary = "Maps Taskflow tasks onto a 2D CGRA grid array"; let description = [{ This pass maps Taskflow tasks onto a 2D CGRA grid array. @@ -70,7 +70,7 @@ def MapTaskOnCgra : Pass<"map-task-on-cgra", "func::FuncOp"> { Uses a default 3x3 CGRA grid. 
}]; - let constructor = "taskflow::createMapTaskOnCgraPass()"; + let constructor = "taskflow::createAllocateCgraToTaskPass()"; } def MemoryAccessStreamingFusion : Pass<"memory-access-streaming-fusion", "func::FuncOp"> { diff --git a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp similarity index 97% rename from lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp rename to lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp index 029404a8..174ed003 100644 --- a/lib/TaskflowDialect/Transforms/MapTaskOnCgraPass.cpp +++ b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp @@ -1,4 +1,4 @@ -//===- MapTaskOnCgraPass.cpp - Task to CGRA Mapping Pass ----------------===// +//===- AllocateCgraToTaskPass.cpp - Task to CGRA Mapping Pass ----------------===// // // This pass maps Taskflow tasks onto a 2D CGRA grid array: // 1. Places tasks with SSA dependencies on adjacent CGRAs. @@ -273,7 +273,7 @@ class TaskMapper { // (i.e. reject the extra CGRA and keep previous allocation). 
if (placement.cgra_positions.empty() && cgra_count > 1) { int fallback = cgra_count - 1; - llvm::errs() << "[MapTaskOnCgra] Cannot place " + llvm::errs() << "[AllocateCgraToTask] Cannot place " << task_node->op.getTaskName() << " with cgra_count=" << cgra_count << ", falling back to " << fallback << "\n"; @@ -716,13 +716,13 @@ class TaskMapper { //===----------------------------------------------------------------------===// // Pass Definition //===----------------------------------------------------------------------===// -struct MapTaskOnCgraPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MapTaskOnCgraPass) +struct AllocateCgraToTaskPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AllocateCgraToTaskPass) - MapTaskOnCgraPass() = default; + AllocateCgraToTaskPass() = default; - StringRef getArgument() const override { return "map-task-on-cgra"; } + StringRef getArgument() const override { return "allocate-cgra-to-task"; } StringRef getDescription() const override { return "Maps Taskflow tasks onto a 2D CGRA grid with adjacency " @@ -743,11 +743,11 @@ struct MapTaskOnCgraPass namespace mlir { namespace taskflow { -std::unique_ptr createMapTaskOnCgraPass() { - return std::make_unique(); +std::unique_ptr createAllocateCgraToTaskPass() { + return std::make_unique(); } -void runMapTaskOnCgra(func::FuncOp func, int grid_rows, int grid_cols) { +void runAllocateCgraToTask(func::FuncOp func, int grid_rows, int grid_cols) { TaskMapper mapper(grid_rows, grid_cols); mapper.place(func); } diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index 5dcb6736..23b01a33 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -3,7 +3,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp ClassifyCountersPass.cpp - 
MapTaskOnCgraPass.cpp + AllocateCgraToTaskPass.cpp DEPENDS MLIRTaskflowTransformsIncGen diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index 755cbb08..5cedf3a5 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -848,7 +848,7 @@ class PipelineBalancer { // Check if incrementing cgra_count is feasible on the 4×4 grid. // TODO: This currently only checks the capacity (total CGRA count). Ideally, - // we should invoke a global placement pass (aka MapTaskOnCgraPass) here to + // we should invoke a global placement pass (aka AllocateCgraToTaskPass) here to // verify if the speculatively increased CGRA count and its proposed shape // actually fit on the 4x4 grid alongside other previously allocated tasks. // @@ -1765,11 +1765,11 @@ struct ResourceAwareTaskOptimizationPass node->op->setAttr("tile_shape", b.getStringAttr(shape_str)); } - // Runs MapTaskOnCgraPass to produce global placement (task_mapping_info) + // Runs AllocateCgraToTaskPass to produce global placement (task_mapping_info) // with multi-CGRA support. The pass reads cgra_count and tile_shape // from each task and places them on the 4x4 grid, validating that // shapes physically fit and don't overlap. 
- taskflow::runMapTaskOnCgra(func, kCgraGridRows, kCgraGridCols); + taskflow::runAllocateCgraToTask(func, kCgraGridRows, kCgraGridCols); break; } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 13c56ab1..80417c2b 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -29,7 +29,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index c4e7b76c..84d431ed 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -61,7 +61,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s --input-file=%t.placement.mlir --check-prefixes=PLACEMENT diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index 881d81ec..abd6a950 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -42,7 +42,7 @@ // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --map-task-on-cgra \ +// RUN: --allocate-cgra-to-task \ // RUN: -o %t.placement.mlir // RUN: FileCheck %s 
--input-file=%t.placement.mlir --check-prefixes=PLACEMENT From 9112303d4ec5afe291c156acbb7fd7d2ae0fd219 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Sat, 7 Mar 2026 12:41:42 +0800 Subject: [PATCH 08/10] fix: resolve merge conflicts with main (FuseTaskPass, CMakeLists) --- include/TaskflowDialect/TaskflowPasses.h | 2 ++ lib/TaskflowDialect/CMakeLists.txt | 3 +-- lib/TaskflowDialect/Transforms/CMakeLists.txt | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index cd48c4a2..6d7f97fc 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -23,6 +23,8 @@ void registerTosaToAffineConversionPassPipeline(); std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); std::unique_ptr createAllocateCgraToTaskPass(); +std::unique_ptr createFuseTaskPass(); + // Runs the CGRA task placement logic directly on a function. // grid_rows/grid_cols default to 4x4 (kCgraGridRows/kCgraGridCols). 
diff --git a/lib/TaskflowDialect/CMakeLists.txt b/lib/TaskflowDialect/CMakeLists.txt index 49d60c57..d8e5d7ff 100644 --- a/lib/TaskflowDialect/CMakeLists.txt +++ b/lib/TaskflowDialect/CMakeLists.txt @@ -13,5 +13,4 @@ add_mlir_dialect_library(MLIRTaskflow MLIRInferTypeOpInterface ) -add_subdirectory(Transforms) -add_subdirectory(Transforms/Optimizations) \ No newline at end of file +add_subdirectory(Transforms) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index 210e071f..984b7407 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp ClassifyCountersPass.cpp AllocateCgraToTaskPass.cpp + FuseTaskPass.cpp DEPENDS MLIRTaskflowTransformsIncGen From 27b7bcf8ab78d43298f6620f8add8fc0fd1e9a49 Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Sat, 7 Mar 2026 13:40:44 +0800 Subject: [PATCH 09/10] refactor: rename CGRAPosition to CgraPosition for consistency --- .../Transforms/AllocateCgraToTaskPass.cpp | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp index 174ed003..d99d0051 100644 --- a/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/AllocateCgraToTaskPass.cpp @@ -37,25 +37,25 @@ namespace { // CGRA Grid Position //===----------------------------------------------------------------------===// // Represents a position on the 2D CGRA grid. 
-struct CGRAPosition { +struct CgraPosition { int row; int col; - bool operator==(const CGRAPosition &other) const { + bool operator==(const CgraPosition &other) const { return row == other.row && col == other.col; } - bool operator!=(const CGRAPosition &other) const { + bool operator!=(const CgraPosition &other) const { return !(*this == other); } // Computes Manhattan distance to another position. - int manhattanDistance(const CGRAPosition &other) const { + int manhattanDistance(const CgraPosition &other) const { return std::abs(row - other.row) + std::abs(col - other.col); } // Checks if adjacent (Manhattan distance = 1). - bool isAdjacent(const CGRAPosition &other) const { + bool isAdjacent(const CgraPosition &other) const { return manhattanDistance(other) == 1; } }; @@ -65,11 +65,11 @@ struct CGRAPosition { //===----------------------------------------------------------------------===// // Stores placement info for a task: can span multiple combined CGRAs. struct TaskPlacement { - SmallVector cgra_positions; // CGRAs assigned to this task. + SmallVector cgra_positions; // CGRAs assigned to this task. // Returns the primary (first) position. - CGRAPosition primary() const { - return cgra_positions.empty() ? CGRAPosition{-1, -1} : cgra_positions[0]; + CgraPosition primary() const { + return cgra_positions.empty() ? CgraPosition{-1, -1} : cgra_positions[0]; } // Returns the number of CGRAs assigned. @@ -107,7 +107,7 @@ struct TaskNode { SmallVector ssa_operands; // Placement result - SmallVector placement; + SmallVector placement; TaskNode(size_t id, TaskflowTaskOp op) : id(id), op(op) {} }; @@ -121,7 +121,7 @@ struct MemoryNode { SmallVector writers; // Mapping result. - std::optional assigned_sram_pos; + std::optional assigned_sram_pos; MemoryNode(Value memref) : memref(memref) {} }; @@ -401,12 +401,12 @@ class TaskMapper { } } - std::optional new_sram_pos; + std::optional new_sram_pos; if (count > 0) { // Rounds to the nearest integer. 
int avg_row = (total_row + count / 2) / count; int avg_col = (total_col + count / 2) / count; - new_sram_pos = CGRAPosition{avg_row, avg_col}; + new_sram_pos = CgraPosition{avg_row, avg_col}; } if (mem_node->assigned_sram_pos != new_sram_pos) { @@ -525,8 +525,8 @@ class TaskMapper { int best_score = INT_MIN; TaskPlacement best_placement; - std::function &, uint64_t)> search = - [&](SmallVector ¤t, uint64_t mask) { + std::function &, uint64_t)> search = + [&](SmallVector ¤t, uint64_t mask) { if ((int)current.size() == k) { if (visited_masks.insert(mask).second) { TaskPlacement candidate; @@ -562,7 +562,7 @@ class TaskMapper { for (int r = 0; r < grid_rows_; ++r) { for (int c = 0; c < grid_cols_; ++c) { if (!occupied_[r][c]) { - SmallVector start = {{r, c}}; + SmallVector start = {{r, c}}; search(start, 1ULL << (r * grid_cols_ + c)); } } @@ -614,7 +614,7 @@ class TaskMapper { // Helper: minimum Manhattan distance between any position in this // placement and any position in another task's placement. - auto minDistToPlacement = [&](const SmallVector &other) -> int { + auto minDistToPlacement = [&](const SmallVector &other) -> int { int min_dist = INT_MAX; for (const auto &pos : placement.cgra_positions) { for (const auto &opos : other) { @@ -626,7 +626,7 @@ class TaskMapper { // Helper: minimum Manhattan distance from any position in this placement // to a single target position. 
- auto minDistToTarget = [&](const CGRAPosition &target) -> int { + auto minDistToTarget = [&](const CgraPosition &target) -> int { int min_dist = INT_MAX; for (const auto &pos : placement.cgra_positions) { min_dist = std::min(min_dist, pos.manhattanDistance(target)); From ea0a0b666b5aeb7205379d251b917b95f887e11b Mon Sep 17 00:00:00 2001 From: Shiran Guo Date: Tue, 10 Mar 2026 10:04:38 +0800 Subject: [PATCH 10/10] resolve TODOs, add post-placement re-profiling --- .../ResourceAwareTaskOptimizationPass.cpp | 469 +++++++++++++++--- .../resource-heavy/resource-heavy.mlir | 10 +- 2 files changed, 407 insertions(+), 72 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index 03707eb9..9909938a 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -158,49 +158,233 @@ static SmallVector getNonRectangularShapes(int cgra_count) { // identical bounding box area, we prefer more square-like bounds over long // rectangles. // -// TODO: This function only picks a localized shape for an idealized single task -// mapping. Global placement and conflict resolution across multiple tasks is -// legitimately deferred to downstream map-on-cgra pass, as speculative -// profiling assumes unconstrained placement. +// This function picks a localized shape for speculative per-task +// profiling (assumes unconstrained placement). Global placement conflict +// resolution across multiple tasks is handled by canAllTasksFitOnGrid() +// during the balance phase and by the downstream AllocateCgraToTaskPass. 
static CgraShape pickBestShape(int cgra_count) { - // For cgra_count == 3, the 2x2 L-shape has a smaller maximum physical routing - // distance (dist=2) compared to a 1x3 rectangle (dist=3), despite having a - // larger bounding box. We explicitly prefer the more compact L-shape here for - // better speculative latency. - if (cgra_count == 3) { - auto non_rect_shapes = getNonRectangularShapes(3); - if (!non_rect_shapes.empty()) { - return non_rect_shapes.front(); - } - } SmallVector candidates = getRectangularShapes(cgra_count); for (const auto &s : getNonRectangularShapes(cgra_count)) { candidates.push_back(s); } - if (!candidates.empty()) { - return *std::min_element(candidates.begin(), candidates.end(), - [](const CgraShape &a, const CgraShape &b) { - int area_a = a.area(); - int area_b = b.area(); - if (area_a != area_b) - return area_a < area_b; - return std::abs(a.rows - a.cols) < - std::abs(b.rows - b.cols); - }); + // Selects the shape with smallest bounding-box area first; + // among equal areas, prefers the most square-like shape. + assert(!candidates.empty() && + "No valid shapes for cgra_count in [1..kMaxCgrasPerTask]"); + return *std::min_element(candidates.begin(), candidates.end(), + [](const CgraShape &a, const CgraShape &b) { + int area_a = a.area(); + int area_b = b.area(); + if (area_a != area_b){ + return area_a < area_b; + } + return std::abs(a.rows - a.cols) < + std::abs(b.rows - b.cols); + }); +} + +//===----------------------------------------------------------------------===// +// Global Placement Feasibility Check +//===----------------------------------------------------------------------===// + +// Generates all placement-candidate shapes for `cgra_count` CGRAs, including +// rotations. Rectangular shapes include both orientations (rows×cols and +// cols×rows, deduplicated for squares). Non-rectangular shapes include all +// four 90° rotations. +// +// Ordering (tried first to last): +// 1. Rectangular shapes, sorted by squareness (e.g. 
2×2 before 1×4), +// with smaller bounding-box area as tiebreaker. +// 2. Non-rectangular shapes (L, T, etc.) in all unique rotations. +static SmallVector getAllPlacementShapes(int cgra_count) { + SmallVector shapes; + + // 1. Rectangular shapes with both orientations, deduplicated. + { + llvm::DenseSet seen_keys; // encodes (rows<<16)|cols + for (int row_dim = 1; row_dim <= kCgraGridRows; ++row_dim) { + for (int col_dim = 1; col_dim <= kCgraGridCols; ++col_dim) { + if (row_dim * col_dim == cgra_count) { + int64_t key = ((int64_t)row_dim << 16) | col_dim; + if (seen_keys.insert(key).second) { + shapes.push_back({row_dim, col_dim, true, {}}); + // Adds the rotated orientation if different (e.g. 1×4 -> 4×1). + if (row_dim != col_dim) { + int64_t rotated_key = ((int64_t)col_dim << 16) | row_dim; + if (seen_keys.insert(rotated_key).second) + shapes.push_back({col_dim, row_dim, true, {}}); + } + } + } + } + } + // Sorts rectangles: prefer more square-like (smaller |rows-cols|), then + // smaller bounding-box area as tiebreaker. + llvm::sort(shapes, [](const CgraShape &lhs, const CgraShape &rhs) { + int squareness_lhs = std::abs(lhs.rows - lhs.cols); + int squareness_rhs = std::abs(rhs.rows - rhs.cols); + if (squareness_lhs != squareness_rhs) + return squareness_lhs < squareness_rhs; + return lhs.area() < rhs.area(); + }); } - // Fallback: smallest bounding box (should not be reached for 1..4 CGRAs). - CgraShape best = {kCgraGridRows, kCgraGridCols, false, {}}; - for (int r = 1; r <= kCgraGridRows; ++r) { - for (int c = 1; c <= kCgraGridCols; ++c) { - if (r * c >= cgra_count && r * c < best.area()) { - best = {r, c, false, {}}; + // 2. Non-rectangular shapes with all four 90° rotations. + auto base_non_rect = getNonRectangularShapes(cgra_count); + for (const auto &base : base_non_rect) { + // Generates 4 rotations of the cgra_positions list. + // Rotation by 90° CW: (col, row) -> (row, -col). + // Each rotation is normalised so that offsets start from (0, 0). 
+ SmallVector>, 4> rotation_variants; + rotation_variants.push_back( + SmallVector>(base.cgra_positions)); + + // Rotates 3 more times (90°, 180°, 270°). + auto prev_positions = base.cgra_positions; + for (int rotation_idx = 0; rotation_idx < 3; ++rotation_idx) { + SmallVector> rotated_positions; + for (auto &[col_off, row_off] : prev_positions) + rotated_positions.push_back( + {row_off, -col_off}); // 90° CW in (col, row) space + + // Normalises to non-negative offsets starting from (0, 0). + int min_col = INT_MAX, min_row = INT_MAX; + for (auto &[col_off, row_off] : rotated_positions) { + min_col = std::min(min_col, col_off); + min_row = std::min(min_row, row_off); + } + for (auto &[col_off, row_off] : rotated_positions) { + col_off -= min_col; + row_off -= min_row; + } + rotation_variants.push_back(rotated_positions); + prev_positions = rotated_positions; + } + + // Deduplicates rotations that produce the same position set. + llvm::DenseSet seen_hashes; + for (auto &positions : rotation_variants) { + // Sorts positions for canonical comparison. + auto sorted_positions = positions; + llvm::sort(sorted_positions, + [](const std::pair &lhs, + const std::pair &rhs) { return lhs < rhs; }); + // Simple hash of sorted positions. + int64_t hash = 0; + for (auto &[col_off, row_off] : sorted_positions) + hash = hash * 131 + col_off * 17 + row_off; + if (!seen_hashes.insert(hash).second) + continue; + + // Computes bounding box for this rotation. + int max_col = 0, max_row = 0; + for (auto &[col_off, row_off] : positions) { + max_col = std::max(max_col, col_off); + max_row = std::max(max_row, row_off); + } + shapes.push_back( + {max_row + 1, max_col + 1, false, std::move(positions)}); + } + } + + return shapes; +} + +// Simulates greedy placement of all tasks' shapes on the kCgraGridRows × +// kCgraGridCols grid to verify that they physically fit without overlap. +// +// For each task, all valid shapes (including rotations) are tried. 
Rectangular +// shapes prefer square-like orientations (e.g. 2×2 over 1×4). Non-rectangular +// shapes are tried in all four 90° rotations. +// +// `task_cgra_counts` contains the cgra_count for every task in the graph +// (including the speculatively modified one). +// +// Returns true if all tasks can be placed without overlap. +static bool canAllTasksFitOnGrid(ArrayRef task_cgra_counts) { + // Quick capacity check: total CGRAs must not exceed grid size. + int total_cgras = 0; + for (int count : task_cgra_counts) + total_cgras += count; + if (total_cgras > kTotalCGRAs) + return false; + + // Simulates placement on a grid. + bool occupied[kCgraGridRows][kCgraGridCols] = {}; + + // Sorts tasks by descending cgra_count for better packing (largest-first + // decreasing, a standard bin-packing heuristic). Each task may have a + // different cgra_count because the balance phase only increments one + // bottleneck at a time; this array reflects the heterogeneous allocation + // across all tasks in the current trial configuration. + SmallVector sorted_counts(task_cgra_counts.begin(), + task_cgra_counts.end()); + llvm::sort(sorted_counts, [](int lhs, int rhs) { return lhs > rhs; }); + + for (int cgra_count : sorted_counts) { + SmallVector candidates = getAllPlacementShapes(cgra_count); + bool placed = false; + + for (const auto &shape : candidates) { + if (placed) + break; + + if (shape.is_rectangular) { + // Rectangular: tries every origin where the rows×cols bbox fits. 
+ for (int origin_row = 0; + origin_row <= kCgraGridRows - shape.rows && !placed; + ++origin_row) { + for (int origin_col = 0; + origin_col <= kCgraGridCols - shape.cols && !placed; + ++origin_col) { + bool fits = true; + for (int delta_row = 0; delta_row < shape.rows && fits; + ++delta_row) + for (int delta_col = 0; delta_col < shape.cols && fits; + ++delta_col) + if (occupied[origin_row + delta_row][origin_col + delta_col]) + fits = false; + if (fits) { + for (int delta_row = 0; delta_row < shape.rows; ++delta_row) + for (int delta_col = 0; delta_col < shape.cols; ++delta_col) + occupied[origin_row + delta_row][origin_col + delta_col] = + true; + placed = true; + } + } + } + } else { + // Non-rectangular: cgra_positions stores (col, row) offsets. + for (int origin_row = 0; origin_row < kCgraGridRows && !placed; + ++origin_row) { + for (int origin_col = 0; origin_col < kCgraGridCols && !placed; + ++origin_col) { + bool fits = true; + for (auto &[col_off, row_off] : shape.cgra_positions) { + int abs_row = origin_row + row_off; + int abs_col = origin_col + col_off; + if (abs_row < 0 || abs_row >= kCgraGridRows || abs_col < 0 || + abs_col >= kCgraGridCols || occupied[abs_row][abs_col]) { + fits = false; + break; + } + } + if (fits) { + for (auto &[col_off, row_off] : shape.cgra_positions) + occupied[origin_row + row_off][origin_col + col_off] = true; + placed = true; + } + } + } } } + + if (!placed) + return false; } - return best; + return true; } //===----------------------------------------------------------------------===// @@ -857,17 +1041,34 @@ class PipelineBalancer { int old_cgra_count = bottleneck->cgra_count; int new_cgra_count = old_cgra_count + 1; - // Check if incrementing cgra_count is feasible on the 4×4 grid. - // TODO: This currently only checks the capacity (total CGRA count). 
Ideally, - // we should invoke a global placement pass (aka AllocateCgraToTaskPass) here to - // verify if the speculatively increased CGRA count and its proposed shape - // actually fit on the 4x4 grid alongside other previously allocated tasks. - // + // Check 1: Per-task CGRA limit. if (!canFitOnGrid(new_cgra_count)) { saturated_nodes.insert(bottleneck); continue; } + // Check 2: Global placement feasibility — simulates placing all tasks' + // shapes (with the speculatively increased cgra_count for the bottleneck) + // on the physical kCgraGridRows × kCgraGridCols grid to verify they + // fit without overlap. + { + SmallVector trial_counts; + for (auto &node : graph.nodes) { + if (node.get() == bottleneck) + trial_counts.push_back(new_cgra_count); + else + trial_counts.push_back(node->cgra_count); + } + if (!canAllTasksFitOnGrid(trial_counts)) { + llvm::errs() << " Balance: global placement infeasible for Task " + << bottleneck->id << " (" + << bottleneck->op.getTaskName().str() + << ") with cgra_count=" << new_cgra_count << "\n"; + saturated_nodes.insert(bottleneck); + continue; + } + } + // Saves state for potential rollback. int64_t old_latency = bottleneck->estimatedLatency(); int64_t old_ii = bottleneck->ii; @@ -1782,14 +1983,8 @@ struct ResourceAwareTaskOptimizationPass // intermediate iterations; ii, steps, and trip_count live only in the // graph node and must be persisted here. // - // Note: no re-profiling is done here. When balance-skip-mapper=true - // (the default), the balance phase uses analytical estimates; those - // are the values written to the final IR. When - // balance-skip-mapper=false, the balance phase already ran the real - // mapper for each speculative probe, so the graph already contains - // accurate compiled_ii / steps values. Either way, the converged - // graph state is authoritative and written directly to IR. - + // Phase A: Write speculative attributes so AllocateCgraToTask can + // read cgra_count and tile_shape from the IR. 
for (auto &node : graph.nodes) { OpBuilder b(node->op); node->shape = pickBestShape(node->cgra_count); @@ -1799,18 +1994,109 @@ struct ResourceAwareTaskOptimizationPass node->op->setAttr("steps", b.getI32IntegerAttr(node->steps)); node->op->setAttr("trip_count", b.getI32IntegerAttr(node->trip_count)); - // Writes tile_shape attribute: simple "NxM" bounding-box string. - // The detailed occupancy diagram is printed in the summary below. std::string shape_str = node->shape.irAttr(); node->op->setAttr("tile_shape", b.getStringAttr(shape_str)); } - // Runs AllocateCgraToTaskPass to produce global placement (task_mapping_info) - // with multi-CGRA support. The pass reads cgra_count and tile_shape - // from each task and places them on the 4x4 grid, validating that - // shapes physically fit and don't overlap. + // Phase B: Run global placement. AllocateCgraToTask reads + // cgra_count / tile_shape from the IR and produces + // task_mapping_info with the actual cgra_positions on the 4×4 grid. taskflow::runAllocateCgraToTask(func, kCgraGridRows, kCgraGridCols); + // Phase C: Post-placement reconciliation. + // Reads back the actual placed shape from task_mapping_info + // and re-profiles tasks whose placed shape + // differs from the speculative pickBestShape. + for (auto &node : graph.nodes) { + auto mapping_attr = + node->op->getAttrOfType("task_mapping_info"); + if (!mapping_attr) + continue; + auto positions_attr = + mapping_attr.getAs("cgra_positions"); + if (!positions_attr || positions_attr.empty()) + continue; + + // Extracts (col, row) pairs from the placement result. + SmallVector> placed_positions; + for (Attribute pos_attr : positions_attr) { + auto coord = cast(pos_attr); + int row = cast(coord.get("row")).getInt(); + int col = cast(coord.get("col")).getInt(); + placed_positions.emplace_back(col, row); + } + + int actual_cgra_count = static_cast(placed_positions.size()); + + // Computes bounding box of the actual placement. 
+ int min_row = INT_MAX, max_row = INT_MIN; + int min_col = INT_MAX, max_col = INT_MIN; + for (auto &[col, row] : placed_positions) { + min_row = std::min(min_row, row); + max_row = std::max(max_row, row); + min_col = std::min(min_col, col); + max_col = std::max(max_col, col); + } + int bbox_rows = max_row - min_row + 1; + int bbox_cols = max_col - min_col + 1; + bool is_rect = (bbox_rows * bbox_cols == actual_cgra_count); + + // Builds the actual CgraShape. + CgraShape actual_shape; + actual_shape.rows = bbox_rows; + actual_shape.cols = bbox_cols; + actual_shape.is_rectangular = is_rect; + if (!is_rect) { + // Normalizes positions to (0,0) origin for the shape. + for (auto &[col, row] : placed_positions) + actual_shape.cgra_positions.emplace_back(col - min_col, + row - min_row); + } + + // Checks whether the placed shape differs from the speculative + // shape used during balance profiling. + bool shape_changed = + (actual_cgra_count != node->cgra_count) || + (actual_shape.rows != node->shape.rows) || + (actual_shape.cols != node->shape.cols) || + (actual_shape.is_rectangular != node->shape.is_rectangular); + + if (shape_changed) { + llvm::errs() + << "[ResourceAware] Post-placement shape mismatch for " + << node->op.getTaskName() + << ": speculative=" << node->shape.describe(node->cgra_count) + << ", actual=" << actual_shape.describe(actual_cgra_count) + << " — re-profiling\n"; + + // Updates the node to reflect the actual placement. + node->cgra_count = actual_cgra_count; + node->shape = actual_shape; + + // Re-profiles with the actual shape. + graph.profileTaskPublic(node.get(), node->op, + /*skip_mapper=*/use_analytical); + + // Writes updated attributes back to IR. 
+ OpBuilder b(node->op); + node->op->setAttr("cgra_count", + b.getI32IntegerAttr(node->cgra_count)); + node->op->setAttr("compiled_ii", + b.getI32IntegerAttr(node->ii)); + node->op->setAttr("steps", + b.getI32IntegerAttr(node->steps)); + std::string actual_shape_str = node->shape.irAttr(); + node->op->setAttr("tile_shape", + b.getStringAttr(actual_shape_str)); + + llvm::errs() + << "[ResourceAware] Post-placement re-profiled " + << node->op.getTaskName() + << ": compiled_ii=" << node->ii + << ", steps=" << node->steps << "\n"; + } + } + break; } } @@ -1828,15 +2114,55 @@ struct ResourceAwareTaskOptimizationPass std::vector> combined_grid( kCgraGridRows, std::vector(kCgraGridCols, -1)); - // Packs tasks onto the grid left-to-right, top-to-bottom. - int next_col = 0, next_row = 0; + // Packs tasks onto the grid using actual placement results. + int next_col = 0, next_row = 0; // Fallback for tasks without placement. int task_idx = 0; llvm::errs() << "\n=== Tile Occupation Summary (4x" << kCgraGridCols << " CGRA Grid) ===\n"; for (auto &node : final_graph.nodes) { - auto shape = pickBestShape(node->cgra_count); + // Reads the actual placed shape from task_mapping_info instead of + // re-computing with pickBestShape, so the summary is consistent + // with the real placement result. 
+ CgraShape shape = pickBestShape(node->cgra_count); // fallback + SmallVector> actual_grid_positions; + + if (auto mapping_attr = + node->op->getAttrOfType("task_mapping_info")) { + if (auto positions_attr = + mapping_attr.getAs("cgra_positions")) { + if (!positions_attr.empty()) { + actual_grid_positions.clear(); + int min_row = INT_MAX, max_row = INT_MIN; + int min_col = INT_MAX, max_col = INT_MIN; + for (Attribute pos_attr : positions_attr) { + auto coord = cast(pos_attr); + int row = cast(coord.get("row")).getInt(); + int col = cast(coord.get("col")).getInt(); + actual_grid_positions.emplace_back(col, row); + min_row = std::min(min_row, row); + max_row = std::max(max_row, row); + min_col = std::min(min_col, col); + max_col = std::max(max_col, col); + } + int bbox_rows = max_row - min_row + 1; + int bbox_cols = max_col - min_col + 1; + int placed_count = + static_cast(actual_grid_positions.size()); + bool is_rect = (bbox_rows * bbox_cols == placed_count); + shape.rows = bbox_rows; + shape.cols = bbox_cols; + shape.is_rectangular = is_rect; + shape.cgra_positions.clear(); + if (!is_rect) { + for (auto &[c, r] : actual_grid_positions) + shape.cgra_positions.emplace_back(c - min_col, r - min_row); + } + } + } + } + int tile_rows = shape.rows * neura::getArchitecture().getPerCgraRows(); int tile_cols = shape.cols * neura::getArchitecture().getPerCgraColumns(); @@ -1873,20 +2199,29 @@ struct ResourceAwareTaskOptimizationPass llvm::errs() << "\n"; } - // Places onto combined grid (pack sequentially). - int placed = 0; - for (int r = next_row; r < kCgraGridRows && placed < node->cgra_count; - ++r) { - for (int c = (r == next_row ? next_col : 0); - c < kCgraGridCols && placed < node->cgra_count; ++c) { - combined_grid[r][c] = task_idx; - next_row = r; - next_col = c + 1; - if (next_col >= kCgraGridCols) { - next_col = 0; - next_row = r + 1; + // Places onto combined grid using actual placement positions when + // available, falling back to sequential packing. 
+ if (!actual_grid_positions.empty()) { + for (auto &[col, row] : actual_grid_positions) { + if (row >= 0 && row < kCgraGridRows && col >= 0 && + col < kCgraGridCols) + combined_grid[row][col] = task_idx; + } + } else { + int placed = 0; + for (int r = next_row; + r < kCgraGridRows && placed < node->cgra_count; ++r) { + for (int c = (r == next_row ? next_col : 0); + c < kCgraGridCols && placed < node->cgra_count; ++c) { + combined_grid[r][c] = task_idx; + next_row = r; + next_col = c + 1; + if (next_col >= kCgraGridCols) { + next_col = 0; + next_row = r + 1; + } + ++placed; } - ++placed; } } ++task_idx; diff --git a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir index 8d6971db..96e4e3d1 100644 --- a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir +++ b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir @@ -193,16 +193,16 @@ module { // RESOPT: taskflow.task @Task_0_Task_1_utilfused // RESOPT-SAME: {cgra_count = 3 : i32, compiled_ii = 1 : i32, steps = 10 : i32 -// RESOPT-SAME: tile_shape = "2x2[(0,0)(1,0)(0,1)]", trip_count = 64 : i32} +// RESOPT-SAME: tile_shape = "1x3", trip_count = 64 : i32} // RESOPT: return // CGRA Tile Occupation after RESOPT (4x4 grid, col x row): // +---+---+---+---+ -// | 0 | 0 | . | . | row=0: Task_0_Task_1_utilfused occupies 3 CGRAs -// +---+---+---+---+ in a 2x2 non-rectangular layout: -// | 0 | . | . | . | (0,0), (1,0), (0,1) +// | 0 | 0 | 0 | . | row=0: Task_0_Task_1_utilfused occupies 3 CGRAs +// +---+---+---+---+ in a 1x3 rectangular layout: +// | . | . | . | . | (0,0), (1,0), (2,0) // +---+---+---+---+ -// | . | . | . | . | Total tile array: 8x8 (3 CGRAs × 16 tiles = 48 tiles) +// | . | . | . | . | Total tile array: 4x12 (3 CGRAs × 16 tiles = 48 tiles) // +---+---+---+---+ // | . | . | . | . | res_mii=3 (16 tiles) → 2 (32 tiles) → 1 (48 tiles) // +---+---+---+---+