From b538839d5e390be6d46e4bbb159ecf84c4253511 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 31 Jan 2026 22:59:27 +0800 Subject: [PATCH 1/9] detecting loop-epilogue code when constructing hyperblock --- include/TaskflowDialect/TaskflowPasses.h | 1 + include/TaskflowDialect/TaskflowPasses.td | 33 +- lib/TaskflowDialect/Transforms/CMakeLists.txt | 4 +- .../ConstructHyperblockFromTaskPass.cpp | 103 +++- .../AffineLoopTreeSerializationPass.cpp | 439 ++++++++++++++++++ .../Transforms/Optimizations/CMakeLists.txt | 18 + 6 files changed, 581 insertions(+), 17 deletions(-) create mode 100644 lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp create mode 100644 lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index c0007ce1..69c2a37e 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -15,6 +15,7 @@ namespace taskflow { // Passes defined in TaskflowPasses.td #define GEN_PASS_DECL #include "TaskflowDialect/TaskflowPasses.h.inc" +std::unique_ptr createAffineLoopTreeSerializationPass(); std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createCanonicalizeTaskPass(); std::unique_ptr createClassifyCountersPass(); diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 4fc2137f..6aef5870 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -8,28 +8,35 @@ include "mlir/Pass/PassBase.td" //=========================================================// // Passes for the Taskflow dialect //=========================================================// -def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { - let summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; +def AffineLoopTreeSerialization : 
Pass<"affine-loop-tree-serialization", "ModuleOp">{ + let summary = "Serializes top-level affine.for loops into minimized task operations"; let description = [{ - This pass constructs hyperblocks and counter chain from Taskflow tasks. + This pass converts top-level affine.for loops in a function into + minimized and canonicalized task operations. }]; - let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; + let constructor = "taskflow::createAffineLoopTreeSerializationPass()"; + let dependentDialects = [ + "mlir::taskflow::TaskflowDialect", + "mlir::affine::AffineDialect", + "mlir::func::FuncDialect"]; } -def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{ - let summary = "Canonicalizes tasks by splitting each hyperblock into a separate atomic task"; +def CanonicalizeTask : Pass<"canonicalize-task", "func::FuncOp"> { + let summary = "Canonicalizes Taskflow tasks"; let description = [{ - This pass splits tasks so that each task contains exactly one hyperblock. - This creates atomic task units that can be analyzed and optimized independently. - - Input: Task with N hyperblocks - Output: N atomic tasks, each containing one hyperblock - - This is a prerequisite pass before fusion optimizations. + This pass canonicalizes Taskflow tasks. }]; let constructor = "taskflow::createCanonicalizeTaskPass()"; } +def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { + let summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; + let description = [{ + This pass constructs hyperblocks and counter chain from Taskflow tasks. 
+ }]; + let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; +} + def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let summary = "Classifies counters as root/relay/leaf"; let description = [{ diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index e44401d8..ff12e671 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -16,4 +16,6 @@ add_mlir_library(MLIRTaskflowTransforms MLIRTaskflow ${dialect_libs} LLVMSupport -) \ No newline at end of file +) + +add_subdirectory(Optimizations) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index 6955e29c..792412ff 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -58,6 +58,9 @@ struct HyperblockInfo { // The corresponding loop. affine::AffineForOp loop_op = nullptr; + + // Marks if this hyperblock follows the LEC pattern. + bool is_lec_pattern = false; }; //---------------------------------------------------------------------------- @@ -176,12 +179,64 @@ getTopLevelLoopsInfo(SmallVector &loops_info) { return top_level_loops_info; } +//---------------------------------------------------------------------------- +// Loop-Epilogue Code (LEC) Pattern Detection +//---------------------------------------------------------------------------- +// Loop-Epilogue Code means code that appears after an inner loop. +// Example: +// for %i (outer loop) { +// for %j (nested loop) { +// +// } +// ← Loop-Epilogue Code +// } +// For this pattern, we need to wrap the inner loop and the epilogue code into +// a hyperblock. Only by doing this can we maintain the hyperblock as a pure +// data-driven code block. 
+struct LECPattern { + affine::AffineForOp outer_loop; + affine::AffineForOp inner_loop; + + SmallVector prologue_code; + SmallVector epilogue_code; + + bool has_lec_pattern = false; +}; + +// Detects Loop-Epilogue Code pattern in the task. +static LECPattern detectLECPattern(affine::AffineForOp outer_loop) { + LECPattern pattern; + pattern.outer_loop = outer_loop; + + Block &body = outer_loop.getRegion().front(); + bool found_nested_loop = false; + + for (Operation &op : body.getOperations()) { + if (auto nested_for = dyn_cast(&op)) { + found_nested_loop = true; + if (!pattern.inner_loop) { + pattern.inner_loop = nested_for; + } + } else if (!(isa(&op) && op.getOperands().empty())) { + if (!found_nested_loop) { + pattern.prologue_code.push_back(&op); + pattern.has_lec_pattern = true; + } else { + pattern.epilogue_code.push_back(&op); + pattern.has_lec_pattern = true; + } + } + } + + return pattern; +} + //---------------------------------------------------------------------------- // Hyperblock Creation //---------------------------------------------------------------------------- // Recursively extracts hyperblocks from a region. -// Key insight: Operations in a loop body that are used by nested loops should -// be inlined into the nested loop's hyperblock. +// Key insight: Operations in a loop body that are used by nested loops +// should be inlined into the nested loop's hyperblock. static void extractHyperblocksInfoFromRegion( Region ®ion, const DenseMap &loop_info_map, @@ -196,15 +251,57 @@ static void extractHyperblocksInfoFromRegion( for (Operation &op : block.getOperations()) { if (auto for_op = dyn_cast(&op)) { + + LECPattern lec_pattern = detectLECPattern(for_op); + // Gets the loop info. LoopInfo *loop_info = loop_info_map.lookup(for_op); assert(loop_info && "Loop not found in loop_info_map"); - // Builds trigger indices fro this loop (parent indices + this loop's + // Builds trigger indices for this loop (parent indices + this loop's // index). 
SmallVector loop_indices = parent_indices; loop_indices.push_back(loop_info->counter_index); + // Handles the LEC pattern. + if (lec_pattern.has_lec_pattern) { + // 1. Emits any accumulated operations as a hyperblock. + if (!current_block_ops.empty()) { + HyperblockInfo info; + info.operations = current_block_ops; + info.trigger_indices = parent_indices; + info.is_loop_body = !parent_indices.empty(); + info.loop_op = enclosing_loop; + hyperblocks_info.push_back(info); + current_block_ops.clear(); + } + + // 2. Creates a hyperblock for the prologue + inner loop + epilogue. + HyperblockInfo info; + if (!lec_pattern.prologue_code.empty()) { + info.operations.append(lec_pattern.prologue_code.begin(), + lec_pattern.prologue_code.end()); + } + + info.operations.push_back(lec_pattern.inner_loop); + + if (!lec_pattern.epilogue_code.empty()) { + info.operations.append(lec_pattern.epilogue_code.begin(), + lec_pattern.epilogue_code.end()); + } + + info.trigger_indices = loop_indices; + info.is_loop_body = true; + info.loop_op = for_op; + info.is_lec_pattern = true; + hyperblocks_info.push_back(info); + + // No need for further processing of this loop. Since we have already + // handled the whole for_op. + current_block_ops.clear(); + continue; + } + // Analyzes which of the current_ops are used by this loop. 
DenseSet values_used_in_loop; for_op.walk([&](Operation *nested_op) { diff --git a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp new file mode 100644 index 00000000..da5cc7fa --- /dev/null +++ b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp @@ -0,0 +1,439 @@ +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" +#include "TaskflowDialect/TaskflowTypes.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +//============================================================================== +// Static Affine Loop Tree (SALT) Node. +//============================================================================== +struct SALTNode { + affine::AffineForOp loop_op; + int64_t lower_bound; + int64_t upper_bound; + int64_t step; + + SALTNode *parent = nullptr; + SmallVector children; + + // Operations that are NOT nested loops (the actual computation at this + // level). 
+ SmallVector body_operations; + + bool isLeaf() const { return children.empty(); } + bool isRoot() const { return parent == nullptr; } +}; + +//============================================================================== +// Loop Chain - Path from Root to Leaf. +//============================================================================== +struct LoopChain { + SmallVector nodes; // Ordered from root to leaf. + + SALTNode *getRoot() const { return nodes.front(); } + SALTNode *getLeaf() const { return nodes.back(); } +}; + +//============================================================================== +// SALT Builder. +//============================================================================== +class SALTBuilder { +public: + SmallVector build(func::FuncOp func_op) { + SmallVector roots; + + for (Block &block : func_op.getBlocks()) { + for (Operation &op : block) { + if (affine::AffineForOp for_op = dyn_cast(&op)) { + if (for_op.hasConstantLowerBound() && + for_op.hasConstantUpperBound()) { + SALTNode *root = buildNodeRecursively(for_op, nullptr); + if (root) { + roots.push_back(root); + } + } + } + } + } + + return roots; + } + + const SmallVector> &getAllNodes() const { + return all_nodes; + } + +private: + SmallVector> all_nodes; + + SALTNode *buildNodeRecursively(affine::AffineForOp for_op, SALTNode *parent) { + auto node = std::make_unique(); + node->loop_op = for_op; + node->lower_bound = for_op.getConstantLowerBound(); + node->upper_bound = for_op.getConstantUpperBound(); + node->step = for_op.getStepAsInt(); + node->parent = parent; + + SALTNode *node_ptr = node.get(); + all_nodes.push_back(std::move(node)); + + Block &body = for_op.getRegion().front(); + for (Operation &op : body) { + if (auto nested_for = dyn_cast(&op)) { + if (nested_for.hasConstantLowerBound() && + nested_for.hasConstantUpperBound()) { + SALTNode *child = buildNodeRecursively(nested_for, node_ptr); + if (child) { + node_ptr->children.push_back(child); + } + } else { + 
node_ptr->body_operations.push_back(&op); + } + } else if (!isa(&op)) { + node_ptr->body_operations.push_back(&op); + } + } + + return node_ptr; + } +}; + +//============================================================================== +// Loop Chain Extractor (DFS). +//============================================================================== +class LoopChainExtractor { +public: + SmallVector extract(const SmallVector &roots) { + SmallVector chains; + + for (SALTNode *root : roots) { + SmallVector current_path; + dfs(root, current_path, chains); + } + + return chains; + } + +private: + void dfs(SALTNode *node, SmallVector ¤t_path, + SmallVector &chains) { + current_path.push_back(node); + + if (node->isLeaf()) { + LoopChain chain; + chain.nodes = current_path; + chains.push_back(chain); + } else { + for (SALTNode *child : node->children) { + dfs(child, current_path, chains); + } + } + + current_path.pop_back(); + } +}; + +//============================================================================== +// MCT Builder - Builds nested affine.for loops for the entire chain. +//============================================================================== +class MCTBuilder { +public: + MCTBuilder(OpBuilder &builder, Location loc) : builder(builder), loc(loc) {} + + // Builds the loop chain and returns the outermost loop. + // The built loops will be inserted at the builder's current insertion point. + affine::AffineForOp build(const LoopChain &chain) { + // Mapping from old values to new values. + IRMapping mapping; + + affine::AffineForOp outer_loop = nullptr; + Block *current_insert_block = nullptr; + SmallVector created_loops; + + // Iterate from root to leaf to build the nested loops. 
+ for (size_t i = 0; i < chain.nodes.size(); ++i) { + SALTNode *node = chain.nodes[i]; + bool is_first = (i == 0); + + OpBuilder loop_builder(builder.getContext()); + if (is_first) { + loop_builder = builder; + } else { + // We want to insert the nested loop at the end of the current block. + // If the block has a terminator (e.g. yield we just added/cloned?), + // we should insert before it? + // Actually, we remove default yield immediately after creation. + // So the block usually doesn't have a terminator when we are filling + // it, UNLESS we cloned a yield from body ops? SALT excludes Yields from + // body_operations. So current_insert_block should be terminator-free + // (or we removed it). + loop_builder = OpBuilder::atBlockEnd(current_insert_block); + } + + // Prepare iter_args for the new loop. + SmallVector iter_args_init_values; + if (node->loop_op.getNumIterOperands() > 0) { + for (Value init : node->loop_op.getInits()) { + iter_args_init_values.push_back(mapping.lookupOrDefault(init)); + } + } + + // Create new loop with same bounds and iter_args. + auto new_loop = loop_builder.create( + loc, node->lower_bound, node->upper_bound, node->step, + iter_args_init_values); + + created_loops.push_back(new_loop); + + // Map the old induction variable to the new one. + mapping.map(node->loop_op.getInductionVar(), new_loop.getInductionVar()); + + // Map the old iter_args (block args) to the new iter_args (block args). + if (node->loop_op.getNumRegionIterArgs() > 0) { + for (auto [old_arg, new_arg] : + llvm::zip(node->loop_op.getRegionIterArgs(), + new_loop.getRegionIterArgs())) { + mapping.map(old_arg, new_arg); + } + } + + if (is_first) { + outer_loop = new_loop; + } + + // Update current insertion block to the body of the new loop. + current_insert_block = new_loop.getBody(); + + // Remove the default yield created by create. 
+ if (!current_insert_block->empty() && + isa(current_insert_block->back())) + current_insert_block->back().erase(); + + // Clone body operations for THIS node. + OpBuilder body_builder = OpBuilder::atBlockEnd(current_insert_block); + for (Operation *op : node->body_operations) { + Operation *new_op = body_builder.clone(*op, mapping); + // Update mapping with results of the new op. + for (auto [old_res, new_res] : + llvm::zip(op->getResults(), new_op->getResults())) { + mapping.map(old_res, new_res); + } + } + } + + // Fix up yields for non-leaf loops (bottom-up). + for (int i = created_loops.size() - 2; i >= 0; --i) { + affine::AffineForOp parent = created_loops[i]; + affine::AffineForOp child = created_loops[i + 1]; + + OpBuilder yield_builder = OpBuilder::atBlockEnd(parent.getBody()); + + if (child.getNumResults() > 0) { + yield_builder.create(loc, child.getResults()); + } else { + yield_builder.create(loc); + } + } + + // For the LEAF loop, we cloned body operations (which excludes Yields). + // So the leaf loop likely has NO yield now. + // We must add a yield to the leaf loop that yields the results of the + // operations that produced results (mapped from original yield). + // SALTNode loop_op is the original loop. + // The original loop body had a yield. + // We need to find what the original yield yielded, map it, and yield it + // here. + + // Wait, if SALT excludes Yield from body_operations, then we NEVER cloned + // the yield. So the leaf loop has no terminator. We must reconstruct the + // yield for the leaf loop. + + if (!created_loops.empty()) { + affine::AffineForOp new_leaf = created_loops.back(); + SALTNode *leaf_node = chain.getLeaf(); // or chain.nodes.back() + + // Find the yield op in the original leaf node. 
+ Operation *original_yield = nullptr; + for (Operation &op : leaf_node->loop_op.getBody()->getOperations()) { + if (isa(&op)) { + original_yield = &op; + break; + } + } + + if (original_yield) { + OpBuilder leaf_yield_builder = + OpBuilder::atBlockEnd(new_leaf.getBody()); + SmallVector yielded_values; + for (Value operand : original_yield->getOperands()) { + yielded_values.push_back(mapping.lookupOrDefault(operand)); + } + leaf_yield_builder.create(loc, yielded_values); + } else { + // Should not happen for valid AffineForOp + OpBuilder leaf_yield_builder = + OpBuilder::atBlockEnd(new_leaf.getBody()); + leaf_yield_builder.create(loc); + } + } + + return outer_loop; + } + +private: + OpBuilder &builder; + Location loc; +}; + +//============================================================================== +// Pass Implementation +//============================================================================== +struct AffineLoopTreeSerializationPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AffineLoopTreeSerializationPass) + + StringRef getArgument() const final { + return "affine-loop-tree-serialization"; + } + + StringRef getDescription() const final { + return "Serialize Affine loop trees into a linear sequence of loop nests " + "for MCT construction."; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + ModuleOp module = getOperation(); + + WalkResult result = module.walk([&](func::FuncOp func_op) { + if (failed(convertFunction(func_op))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) { + signalPassFailure(); + } + } + +private: + LogicalResult convertFunction(func::FuncOp func_op) { + Location loc = func_op.getLoc(); + + // Builds static affine loop tree. 
+ SALTBuilder salt_builder; + SmallVector roots = salt_builder.build(func_op); + + if (roots.empty()) { + return success(); + } + + llvm::errs() << "=== SALT Structure ===\n"; + for (SALTNode *root : roots) { + printSALT(root, 0); + } + + // Extracts loop chains. + LoopChainExtractor extractor; + SmallVector chains = extractor.extract(roots); + + llvm::errs() << "=== Extracted " << chains.size() << " MCT(s) ===\n"; + for (size_t i = 0; i < chains.size(); ++i) { + llvm::errs() << "MCT " << i << ": "; + for (SALTNode *node : chains[i].nodes) { + llvm::errs() << "[" << node->lower_bound << "," << node->upper_bound + << ") "; + } + llvm::errs() << "\n"; + } + + // LoopChainExtractor iterates roots in order of SALTBuilder (order + // of appearance). So we can iterate through roots, and for each root, build + // its chains, replace root with chains. + + for (SALTNode *root : roots) { + OpBuilder builder(root->loop_op); + + // Finds chains originating from this root. + SmallVector root_chains; + for (const auto &chain : chains) { + if (chain.getRoot() == root) { + root_chains.push_back(chain); + } + } + + // Builds new chains. + for (const LoopChain &chain : root_chains) { + MCTBuilder mct_builder(builder, loc); + affine::AffineForOp new_loop = mct_builder.build(chain); + + // If the original root loop had results (iter_args), and the new loop + // has matching results, we must replace the uses of the original + // results with the new ones. NOTE: This assumes that for a loop + // defining values, there is a corresponding single chain that produces + // all the values (or at least the one we process). If a root with + // results is split into multiple chains, this simple logic might loop + // over them. However, for a reduction loop that is a single chain, this + // works. 
+ if (root->loop_op.getNumResults() > 0 && new_loop && + new_loop.getNumResults() == root->loop_op.getNumResults()) { + root->loop_op.replaceAllUsesWith(new_loop.getResults()); + } + } + + // Erases the original root loop. + root->loop_op.erase(); + } + + return success(); + } + + void printSALT(SALTNode *node, int depth) { + for (int i = 0; i < depth; ++i) { + llvm::errs() << " "; + } + llvm::errs() << "Loop [" << node->lower_bound << "," << node->upper_bound + << ") step=" << node->step + << " | body_ops=" << node->body_operations.size() + << " | children=" << node->children.size() << "\n"; + for (SALTNode *child : node->children) { + printSALT(child, depth + 1); + } + } +}; + +} // namespace + +std::unique_ptr mlir::taskflow::createAffineLoopTreeSerializationPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt b/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt new file mode 100644 index 00000000..3e1ce5cd --- /dev/null +++ b/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt @@ -0,0 +1,18 @@ +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +add_mlir_conversion_library(MLIRTaskflowOptimization + AffineLoopTreeSerializationPass.cpp + + DEPENDS + MLIRTaskflowTransformsIncGen + + LINK_LIBS PUBLIC + MLIRTaskflow + MLIRArithDialect + MLIRFuncDialect + MLIRLinalgDialect + MLIRIR + MLIRPass + MLIRTransforms + MLIRSupport +) \ No newline at end of file From 11010dc0bc5b35c688e295b9fa70f41452ba986f Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sun, 1 Feb 2026 16:58:29 +0800 Subject: [PATCH 2/9] introducing original memrefs in task --- include/TaskflowDialect/TaskflowOps.td | 40 +- include/TaskflowDialect/TaskflowPasses.h | 1 - include/TaskflowDialect/TaskflowPasses.td | 8 - .../AffineToTaskflow/AffineToTaskflowPass.cpp | 109 +++- lib/TaskflowDialect/TaskflowOps.cpp | 281 +++++++++ lib/TaskflowDialect/Transforms/CMakeLists.txt | 1 - 
.../Transforms/CanonicalizeTaskPass.cpp | 560 ------------------ 7 files changed, 381 insertions(+), 619 deletions(-) delete mode 100644 lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td index a7ee4a6c..d4d7c326 100644 --- a/include/TaskflowDialect/TaskflowOps.td +++ b/include/TaskflowDialect/TaskflowOps.td @@ -39,11 +39,17 @@ def TaskflowTaskOp : TaskflowOpBase<"task", [ 1. Memory dependencies: memrefs that are read or written by the task 2. Value dependencies: SSA values from producer tasks + The `read_memrefs` and `write_memrefs` attributes record the actural + original memrefs that this task accesses, + enabling data placement analysis for multi-CGRA mapping. + Example: - // Memory input: %mem, Value input: %val + // Memory inputs: %mem, Value inputs: %val $out_mem, %out_val = taskflow.task "Task_0" - memory_inputs(%mem : memref<4xi32>) - value_inputs(%val : i32) { + read_inputs(%mem : memref<4xi32>) + value_inputs(%val : i32) + original_read_memrefs(%arg0 : memref) + original_write_memrefs(%arg5 : memref) { ^bb0(%a0: memref<4xi32>, %a1: i32): affine.for %i = 0 to 4 { %v = affine.load %a0[%i] : memref<4xi32> @@ -55,28 +61,22 @@ def TaskflowTaskOp : TaskflowOpBase<"task", [ }]; let arguments = (ins - Variadic:$memory_inputs, + Variadic:$read_inputs, + Variadic:$write_inputs, Variadic:$value_inputs, - StrAttr:$task_name + StrAttr:$task_name, + Variadic:$original_read_memrefs, + Variadic:$original_write_memrefs ); let results = (outs - Variadic:$memory_outputs, + Variadic:$write_outputs, Variadic:$value_outputs ); let regions = (region SizedRegion<1>:$body); - // let hasCustomAssemblyFormat = 1; - - // let assemblyFormat = [{ - // (`memory_inputs` `(` $memory_inputs^ `:` type($memory_inputs) `)`)? - // (`value_inputs` `(` $value_inputs^ `:` type($value_inputs) `)`)? 
- // attr-dict-with-keyword - // $body - // `->` `(` type($memory_outputs) `,` type($value_outputs) `)` - // }]; - + let hasCustomAssemblyFormat = 1; } // Defines the yield operation to terminate a Taskflow task. @@ -97,13 +97,7 @@ def TaskflowYieldOp : TaskflowOpBase<"yield", [Terminator, Pure, ReturnLike, Att Variadic:$memory_results, Variadic:$value_results); - // let assemblyFormat = [{ - // (`memory_outputs` `(` $memory_results^ `:` type($memory_results) `)`)? - // (`value_outputs` `(` $value_results^ `:` type($value_results) `)`)? - // attr-dict - // }]; - - // let hasCustomAssemblyFormat = 1; + let hasCustomAssemblyFormat = 1; let builders = [ // Default builder for empty yield. diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index 69c2a37e..09a28aee 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -17,7 +17,6 @@ namespace taskflow { #include "TaskflowDialect/TaskflowPasses.h.inc" std::unique_ptr createAffineLoopTreeSerializationPass(); std::unique_ptr createConstructHyperblockFromTaskPass(); -std::unique_ptr createCanonicalizeTaskPass(); std::unique_ptr createClassifyCountersPass(); #define GEN_PASS_REGISTRATION diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 6aef5870..1e3c55c2 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -21,14 +21,6 @@ def AffineLoopTreeSerialization : Pass<"affine-loop-tree-serialization", "Module "mlir::func::FuncDialect"]; } -def CanonicalizeTask : Pass<"canonicalize-task", "func::FuncOp"> { - let summary = "Canonicalizes Taskflow tasks"; - let description = [{ - This pass canonicalizes Taskflow tasks. 
- }]; - let constructor = "taskflow::createCanonicalizeTaskPass()"; -} - def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { let summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; let description = [{ diff --git a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp index 111dec0c..318c530d 100644 --- a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp +++ b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp @@ -15,6 +15,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" +#include "mlir/IR/ValueRange.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/ArrayRef.h" @@ -30,7 +31,6 @@ namespace { //------------------------------------------------------------------------------ // Helper Functions. //------------------------------------------------------------------------------ - // Collects memrefs that are loaded (read) within a given operation scope. static void collectReadMemrefs(Operation *op, SetVector &read_memrefs) { op->walk([&](Operation *nested_op) { @@ -104,15 +104,41 @@ updateOperationOperands(Operation *op, } } +//------------------------------------------------------------------------------ +// Analyzes all the original memory access info before conversion. 
+//------------------------------------------------------------------------------ +struct MemrefAccessInfo { + SetVector read_memrefs; + SetVector write_memrefs; +}; + +static DenseMap +analyzeMemrefAccesses(func::FuncOp func_op) { + DenseMap loop_to_memref_info; + + func_op.walk([&](affine::AffineForOp for_op) { + llvm::errs() << "\nAnalyzing memref accesses for loop:\n" << for_op << "\n"; + MemrefAccessInfo access_info; + + collectReadMemrefs(for_op.getOperation(), access_info.read_memrefs); + collectWrittenMemrefs(for_op.getOperation(), access_info.write_memrefs); + + loop_to_memref_info[for_op] = access_info; + }); + + return loop_to_memref_info; +} + //------------------------------------------------------------------------------ // Task Conversion //------------------------------------------------------------------------------ // Converts a top-level affine.for to a taskflow.task operation. -static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, - affine::AffineForOp for_op, - DenseMap &value_mapping, - int task_id) { +static TaskflowTaskOp convertLoopToTask( + OpBuilder &builder, affine::AffineForOp for_op, + DenseMap &value_mapping, + const DenseMap &loop_to_original_memref_info, + int task_id) { Location loc = for_op.getLoc(); std::string task_name = "Task_" + std::to_string(task_id); @@ -125,9 +151,9 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, // Step 1: Collects read and written memrefs. 
//------------------------------------------------------------------- SetVector read_memrefs; - SetVector written_memrefs; + SetVector write_memrefs; collectReadMemrefs(for_op.getOperation(), read_memrefs); - collectWrittenMemrefs(for_op.getOperation(), written_memrefs); + collectWrittenMemrefs(for_op.getOperation(), write_memrefs); llvm::errs() << "Read memrefs for loop:\n" << for_op << "\n"; for (Value memref : read_memrefs) { @@ -135,23 +161,25 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, } llvm::errs() << "Written memrefs for loop:\n" << for_op << "\n"; - for (Value memref : written_memrefs) { + for (Value memref : write_memrefs) { llvm::errs() << memref << "\n"; } + // Collects original memref access info. + auto it = loop_to_original_memref_info.find(for_op.getOperation()); + assert(it != loop_to_original_memref_info.end() && + "Original memref access info not found for the loop"); + const MemrefAccessInfo &original_memref_info = it->second; + SetVector original_read_memrefs = original_memref_info.read_memrefs; + SetVector original_write_memrefs = original_memref_info.write_memrefs; + //------------------------------------------------------------------- // Step 2: Determines memory inputs and outputs. //------------------------------------------------------------------- - // Memory inputs: ALL memrefs that are accessed (read OR written). - // This ensures WAR and WAW dependencies are respected. - SetVector accessed_memrefs; - accessed_memrefs.insert(read_memrefs.begin(), read_memrefs.end()); - accessed_memrefs.insert(written_memrefs.begin(), written_memrefs.end()); - // Memory outputs: ONLY memrefs that are written. // This ensures RAW and WAW dependencies are respected. 
SetVector output_memrefs; - output_memrefs.insert(written_memrefs.begin(), written_memrefs.end()); + output_memrefs.insert(write_memrefs.begin(), write_memrefs.end()); //------------------------------------------------------------------- // Step 3: Collects external SSA values (non-memref). @@ -167,17 +195,28 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, //------------------------------------------------------------------- // Step 4: Resolves inputs through value mapping. //------------------------------------------------------------------- - SmallVector memory_inputs; + SmallVector read_inputs; + SmallVector write_inputs; SmallVector value_inputs; IRMapping mapping; - // Resolves memory inputs. - for (Value memref : accessed_memrefs) { + // Resolves read inputs. + for (Value memref : read_memrefs) { + Value resolved_memref = value_mapping.lookup(memref); + if (!resolved_memref) { + resolved_memref = memref; + } + read_inputs.push_back(resolved_memref); + mapping.map(memref, resolved_memref); + } + + // Resolves write inputs. + for (Value memref : write_memrefs) { Value resolved_memref = value_mapping.lookup(memref); if (!resolved_memref) { resolved_memref = memref; } - memory_inputs.push_back(resolved_memref); + write_inputs.push_back(resolved_memref); mapping.map(memref, resolved_memref); } @@ -211,9 +250,12 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, loc, /*memory_outputs=*/memory_output_types, /*value_outputs=*/value_output_types, - /*memory_inputs=*/memory_inputs, + /*read_inputs=*/read_inputs, + /*write_inputs=*/write_inputs, /*value_inputs=*/value_inputs, - /*task_name=*/builder.getStringAttr(task_name)); + /*task_name=*/builder.getStringAttr(task_name), + /*original_read_memrefs=*/original_read_memrefs.getArrayRef(), + /*original_write_memrefs=*/original_write_memrefs.getArrayRef()); //------------------------------------------------------------------- // Step 7: Builds the task body. 
@@ -223,8 +265,15 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, // Adds block arguments (memory inputs first, then value inputs). DenseMap input_to_block_arg; - // Memory input arguments. - for (Value memref : accessed_memrefs) { + // Memory read input arguments. + for (Value memref : read_memrefs) { + BlockArgument arg = task_body->addArgument(memref.getType(), loc); + mapping.map(memref, arg); + input_to_block_arg[memref] = arg; + } + + // Memory write input arguments. + for (Value memref : write_memrefs) { BlockArgument arg = task_body->addArgument(memref.getType(), loc); mapping.map(memref, arg); input_to_block_arg[memref] = arg; @@ -270,7 +319,7 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, //------------------------------------------------------------------- // Memory outputs. for (auto [memref, task_output] : - llvm::zip(output_memrefs, task_op.getMemoryOutputs())) { + llvm::zip(output_memrefs, task_op.getWriteOutputs())) { value_mapping[memref] = task_output; } @@ -285,6 +334,8 @@ static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) { llvm::errs() << "\n===Converting function: " << func_op.getName() << "===\n"; + DenseMap loop_to_original_memref_info = + analyzeMemrefAccesses(func_op); OpBuilder builder(func_op.getContext()); SmallVector loops_to_erase; DenseMap value_mapping; @@ -298,13 +349,19 @@ static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) { ops_to_process.push_back(&op); } + llvm::errs() << "ops_to_process:\n"; + for (Operation *op : ops_to_process) { + llvm::errs() << *op << "\n"; + } + // Processes each operation in order (top to bottom). for (Operation *op : ops_to_process) { if (auto for_op = dyn_cast(op)) { // Converts affine.for to taskflow.task. 
OpBuilder builder(for_op); - TaskflowTaskOp task_op = convertLoopToTask( - builder, for_op, value_mapping, task_id_counter++); + TaskflowTaskOp task_op = + convertLoopToTask(builder, for_op, value_mapping, + loop_to_original_memref_info, task_id_counter++); // Replaces uses of loop results with task value outputs. for (auto [loop_result, task_value_output] : diff --git a/lib/TaskflowDialect/TaskflowOps.cpp b/lib/TaskflowDialect/TaskflowOps.cpp index e69de29b..44aa255a 100644 --- a/lib/TaskflowDialect/TaskflowOps.cpp +++ b/lib/TaskflowDialect/TaskflowOps.cpp @@ -0,0 +1,281 @@ +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/OpImplementation.h" +#include + +using namespace mlir; +using namespace mlir::taskflow; + +//===----------------------------------------------------------------------===// +// TaskflowTaskOp +//===----------------------------------------------------------------------===// + +ParseResult TaskflowTaskOp::parse(OpAsmParser &parser, OperationState &result) { + // Parses task name: @Task_0. + StringAttr task_name; + if (parser.parseSymbolName(task_name)) + return failure(); + result.addAttribute("task_name", task_name); + + // Parses read_inputs: read_inputs(%arg0, %arg1 : memref, + // memref). + SmallVector read_operands; + SmallVector read_types; + if (succeeded(parser.parseOptionalKeyword("read_inputs"))) { + if (parser.parseLParen() || parser.parseOperandList(read_operands) || + parser.parseColonTypeList(read_types) || parser.parseRParen()) + return failure(); + } + + // Parses write_inputs: write_inputs(%arg5 : memref). 
+ SmallVector write_operands; + SmallVector write_types; + if (succeeded(parser.parseOptionalKeyword("write_inputs"))) { + if (parser.parseLParen() || parser.parseOperandList(write_operands) || + parser.parseColonTypeList(write_types) || parser.parseRParen()) + return failure(); + } + + // Parses value_inputs: value_inputs(%scalar : i32). + SmallVector value_operands; + SmallVector value_types; + if (succeeded(parser.parseOptionalKeyword("value_inputs"))) { + if (parser.parseLParen() || parser.parseOperandList(value_operands) || + parser.parseColonTypeList(value_types) || parser.parseRParen()) + return failure(); + } + + // Parses original memrefs: [original_read_memrefs(%arg0), + // original_write_memrefs(%arg5)]. + SmallVector original_read_operands; + SmallVector original_read_types; + SmallVector original_write_operands; + SmallVector original_write_types; + + if (succeeded(parser.parseOptionalLSquare())) { + // original_reads. + if (succeeded(parser.parseOptionalKeyword("original_read_memrefs"))) { + if (parser.parseLParen() || + parser.parseOperandList(original_read_operands) || + parser.parseRParen()) + return failure(); + } + + // optional comma. + (void)parser.parseOptionalComma(); + + // original_writes. + if (succeeded(parser.parseOptionalKeyword("original_write_memrefs"))) { + if (parser.parseLParen() || + parser.parseOperandList(original_write_operands) || + parser.parseRParen()) + return failure(); + } + + if (parser.parseRSquare()) + return failure(); + } + + // Resolves operands. + if (parser.resolveOperands(read_operands, read_types, + parser.getCurrentLocation(), result.operands) || + parser.resolveOperands(write_operands, write_types, + parser.getCurrentLocation(), result.operands) || + parser.resolveOperands(value_operands, value_types, + parser.getCurrentLocation(), result.operands)) + return failure(); + + // Resolves original memrefs (infer types from read/write memrefs). 
+ for (size_t i = 0; i < original_read_operands.size(); ++i) { + original_read_types.push_back(read_types.empty() ? write_types[0] + : read_types[0]); + } + for (size_t i = 0; i < original_write_operands.size(); ++i) { + original_write_types.push_back(write_types.empty() ? read_types[0] + : write_types[0]); + } + + if (parser.resolveOperands(original_read_operands, original_read_types, + parser.getCurrentLocation(), result.operands) || + parser.resolveOperands(original_write_operands, original_write_types, + parser.getCurrentLocation(), result.operands)) + return failure(); + + // Parses optional attributes. + if (parser.parseOptionalAttrDict(result.attributes)) + return failure(); + + // Parses function type: : (...) -> (...). + FunctionType func_type; + if (parser.parseColon() || parser.parseType(func_type)) + return failure(); + + // Adds result types. + result.addTypes(func_type.getResults()); + + // Parses region. + Region *body = result.addRegion(); + if (parser.parseRegion(*body, /*args=*/{}, /*argTypes=*/{})) + return failure(); + + // Adds operand segment sizes. + result.addAttribute( + "operandSegmentSizes", + parser.getBuilder().getDenseI32ArrayAttr( + {static_cast(read_operands.size()), + static_cast(write_operands.size()), + static_cast(value_operands.size()), + static_cast(original_read_operands.size()), + static_cast(original_write_operands.size())})); + + // Adds result segment sizes. + size_t num_write_outputs = 0; + size_t num_value_outputs = 0; + for (Type t : func_type.getResults()) { + if (isa(t)) + num_write_outputs++; + else + num_value_outputs++; + } + result.addAttribute("resultSegmentSizes", + parser.getBuilder().getDenseI32ArrayAttr( + {static_cast(num_write_outputs), + static_cast(num_value_outputs)})); + + return success(); +} + +void TaskflowTaskOp::print(OpAsmPrinter &printer) { + // Prints task name. + printer << " @" << getTaskName(); + + // Prints read_inputs. 
+ if (!getReadInputs().empty()) { + printer << " read_inputs("; + llvm::interleaveComma(getReadInputs(), printer); + printer << " : "; + llvm::interleaveComma(getReadInputs().getTypes(), printer); + printer << ")"; + } + + // Prints write_inputs. + if (!getWriteInputs().empty()) { + printer << " write_inputs("; + llvm::interleaveComma(getWriteInputs(), printer); + printer << " : "; + llvm::interleaveComma(getWriteInputs().getTypes(), printer); + printer << ")"; + } + + // Prints value_inputs. + if (!getValueInputs().empty()) { + printer << " value_inputs("; + llvm::interleaveComma(getValueInputs(), printer); + printer << " : "; + llvm::interleaveComma(getValueInputs().getTypes(), printer); + printer << ")"; + } + + // Prints original memrefs. + if (!getOriginalReadMemrefs().empty() || !getOriginalWriteMemrefs().empty()) { + printer << " ["; + + if (!getOriginalReadMemrefs().empty()) { + printer << "original_read_memrefs("; + llvm::interleaveComma(getOriginalReadMemrefs(), printer); + printer << ")"; + } + + if (!getOriginalReadMemrefs().empty() && !getOriginalWriteMemrefs().empty()) + printer << ", "; + + if (!getOriginalWriteMemrefs().empty()) { + printer << "original_write_memrefs("; + llvm::interleaveComma(getOriginalWriteMemrefs(), printer); + printer << ")"; + } + + printer << "]"; + } + + // Prints attributes (skip operandSegmentSizes, resultSegmentSizes, + // task_name). + SmallVector elidedAttrs = {"operandSegmentSizes", + "resultSegmentSizes", "task_name"}; + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + + // Prints function type. + printer << " : ("; + llvm::interleaveComma(llvm::concat(getReadInputs().getTypes(), + getWriteInputs().getTypes(), + getValueInputs().getTypes()), + printer); + printer << ") -> ("; + llvm::interleaveComma(llvm::concat(getWriteOutputs().getTypes(), + getValueOutputs().getTypes()), + printer); + printer << ")"; + + // Prints region. 
+ printer << " "; + printer.printRegion(getBody(), /*printEntryBlockArgs=*/true); +} + +//===----------------------------------------------------------------------===// +// TaskflowYieldOp +//===----------------------------------------------------------------------===// + +ParseResult TaskflowYieldOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector write_operands; + SmallVector write_types; + SmallVector value_operands; + SmallVector value_types; + + // Parses writes. + if (succeeded(parser.parseOptionalKeyword("writes"))) { + if (parser.parseLParen() || parser.parseOperandList(write_operands) || + parser.parseColonTypeList(write_types) || parser.parseRParen()) + return failure(); + } + + // Parses values. + if (succeeded(parser.parseOptionalKeyword("values"))) { + if (parser.parseLParen() || parser.parseOperandList(value_operands) || + parser.parseColonTypeList(value_types) || parser.parseRParen()) + return failure(); + } + + if (parser.resolveOperands(write_operands, write_types, + parser.getCurrentLocation(), result.operands) || + parser.resolveOperands(value_operands, value_types, + parser.getCurrentLocation(), result.operands)) + return failure(); + + result.addAttribute("operandSegmentSizes", + parser.getBuilder().getDenseI32ArrayAttr( + {static_cast(write_operands.size()), + static_cast(value_operands.size())})); + + return success(); +} + +void TaskflowYieldOp::print(OpAsmPrinter &printer) { + if (!getMemoryResults().empty()) { + printer << " writes("; + llvm::interleaveComma(getMemoryResults(), printer); + printer << " : "; + llvm::interleaveComma(getMemoryResults().getTypes(), printer); + printer << ")"; + } + + if (!getValueResults().empty()) { + printer << " values("; + llvm::interleaveComma(getValueResults(), printer); + printer << " : "; + llvm::interleaveComma(getValueResults().getTypes(), printer); + printer << ")"; + } +} \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt 
b/lib/TaskflowDialect/Transforms/CMakeLists.txt index ff12e671..a5443158 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -2,7 +2,6 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp - CanonicalizeTaskPass.cpp ClassifyCountersPass.cpp DEPENDS diff --git a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp deleted file mode 100644 index 4281fae2..00000000 --- a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp +++ /dev/null @@ -1,560 +0,0 @@ -#include "TaskflowDialect/TaskflowDialect.h" -#include "TaskflowDialect/TaskflowOps.h" -#include "TaskflowDialect/TaskflowPasses.h" - -#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/IRMapping.h" -#include "mlir/IR/Unit.h" -#include "mlir/IR/Value.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LLVM.h" -#include "mlir/Transforms/DialectConversion.h" -#include "llvm/ADT/SmallVector.h" - -using namespace mlir; -using namespace mlir::taskflow; - -namespace { -//---------------------------------------------------------------------- -// Memory and Value Access Info. -//---------------------------------------------------------------------- -// This struct analyzes accesses information within a hyperblock. -struct AccessInfo { - // Set of read memrefs. - SetVector memref_reads; - // Set of written memrefs. - SetVector memref_writes; - // Set of read values. 
- SetVector value_reads; - - void analyze(TaskflowHyperblockOp hyperblock, Block *task_body) { - DenseSet task_block_args; - for (Value arg : task_body->getArguments()) { - task_block_args.insert(arg); - } - - hyperblock.walk([&](Operation *op) { - if (auto load = dyn_cast(op)) { - this->memref_reads.insert(load.getMemRef()); - } else if (auto store = dyn_cast(op)) { - this->memref_writes.insert(store.getMemRef()); - } - - for (Value operand : op->getOperands()) { - if (task_block_args.contains(operand)) { - this->value_reads.insert(operand); - } - } - }); - } - - SetVector getAllMemRefs() const { - SetVector all; - all.insert(this->memref_reads.begin(), this->memref_reads.end()); - all.insert(this->memref_writes.begin(), this->memref_writes.end()); - return all; - } - - SetVector getAllValues() const { return this->value_reads; } -}; - -//---------------------------------------------------------------------- -// Counter Collector. -//---------------------------------------------------------------------- -// This class is used to collects all counters needed by a hyperblock. -class CounterCollector { -public: - void collect(TaskflowHyperblockOp hyperblock) { - for (Value idx : hyperblock.getIndices()) { - collectRecursively(idx); - } - } - - // Gets the collected counters sorted by their depth. - SmallVector getSortedCounters() const { - SmallVector result(this->counters.begin(), - this->counters.end()); - llvm::sort(result, [this](TaskflowCounterOp a, TaskflowCounterOp b) { - return getDepth(a) < getDepth(b); - }); - return result; - } - -private: - // Collects counters recursively. - void collectRecursively(Value idx) { - TaskflowCounterOp counter = idx.getDefiningOp(); - if (!counter) { - return; - } - this->counters.insert(counter); - if (Value parent = counter.getParentIndex()) { - collectRecursively(parent); - } - } - - // Gets the depth of a counter. 
- size_t getDepth(TaskflowCounterOp counter) const { - size_t depth = 0; - Value parent = counter.getParentIndex(); - while (parent) { - depth++; - if (TaskflowCounterOp p = parent.getDefiningOp()) { - parent = p.getParentIndex(); - } else { - break; - } - } - return depth; - } - - SetVector counters; -}; - -//---------------------------------------------------------------------- -// Block Argument Resolver. -//---------------------------------------------------------------------- -// This class resolves the input arguments of a task block to their source -// values. -// For example: -// taskflow.task(%buf_input, %val_input) { -// ^bb0(%arg0: memref, %arg1: i32): // ← block arguments -// // %arg0 corresponds to %buf_input -// // %arg1 corresponds to %val_input -// } -// resolveToSource(%arg0) -> %buf_input -class BlockArgResolver { -public: - explicit BlockArgResolver(TaskflowTaskOp task) { - Block *body = &task.getBody().front(); - - // Resolves memory inputs. - auto mem_inputs = task.getMemoryInputs(); - auto mem_args = body->getArguments().take_front(mem_inputs.size()); - for (auto [input, arg] : llvm::zip(mem_inputs, mem_args)) { - this->block_arg_to_source[arg] = input; - this->source_to_block_arg[input] = arg; - } - - // Resolves value inputs. - auto val_inputs = task.getValueInputs(); - auto val_args = body->getArguments().drop_front(mem_inputs.size()); - for (auto [input, arg] : llvm::zip(val_inputs, val_args)) { - this->block_arg_to_source[arg] = input; - this->source_to_block_arg[input] = arg; - } - } - - // Gets the source value for a given block argument. - Value resolveToSource(Value val) const { - auto it = this->block_arg_to_source.find(val); - return it != this->block_arg_to_source.end() ? it->second : val; - } - - // Gets the block argument for a given source value. - Value getBlockArg(Value source) const { - auto it = this->source_to_block_arg.find(source); - return it != this->source_to_block_arg.end() ? 
it->second : Value(); - } - -private: - // Maps block argument to its source value. - DenseMap block_arg_to_source; - // Maps source value to its block argument. - DenseMap source_to_block_arg; -}; - -//---------------------------------------------------------------------- -// Atomic Task Builder. -//---------------------------------------------------------------------- -// This class builds an atomic task from a hyperblock. -class AtomicTaskBuilder { -public: - AtomicTaskBuilder(OpBuilder &builder, Location loc, unsigned global_task_idx, - DenseMap &memref_to_latest_version, - DenseMap &value_to_latest_version) - : builder(builder), loc(loc), global_task_idx(global_task_idx), - memref_to_latest_version(memref_to_latest_version), - value_to_latest_version(value_to_latest_version) {} - - TaskflowTaskOp build(TaskflowHyperblockOp hyperblock, - TaskflowTaskOp original_task) { - AccessInfo access_info; - access_info.analyze(hyperblock, &original_task.getBody().front()); - - BlockArgResolver resolver(original_task); - - // Determines memref inputs. - SmallVector memref_inputs; - DenseMap source_to_memref_input_idx; - - for (Value memref : access_info.getAllMemRefs()) { - Value source = resolver.resolveToSource(memref); - Value input_memref = getLatestMemrefVersion(source); - - if (!source_to_memref_input_idx.count(source)) { - source_to_memref_input_idx[source] = memref_inputs.size(); - memref_inputs.push_back(input_memref); - } - } - - // Determines value inputs. - SmallVector value_inputs; - DenseMap source_to_value_input_idx; - - for (Value val : access_info.getAllValues()) { - Value source = resolver.resolveToSource(val); - Value input_val = getLatestValueVersion(source); - - if (!source_to_value_input_idx.count(source)) { - source_to_value_input_idx[source] = value_inputs.size(); - value_inputs.push_back(input_val); - } - } - - // Determines memref outputs. - SmallVector memref_output_types; - // The source memrefs of the written memrefs. 
- SmallVector written_memref_sources; - - for (Value memref : access_info.memref_writes) { - Value source = resolver.resolveToSource(memref); - memref_output_types.push_back(source.getType()); - written_memref_sources.push_back(source); - } - - // Determines value outputs. - SmallVector value_output_types; - SmallVector yielded_value_sources; - - if (!hyperblock.getOutputs().empty()) { - for (Value output : hyperblock.getOutputs()) { - value_output_types.push_back(output.getType()); - // For value outputs, they are source themselves. - yielded_value_sources.push_back(output); - } - } - - // Creates a new task. - std::string task_name = "Task_" + std::to_string(this->global_task_idx); - auto new_task = builder.create( - this->loc, memref_output_types, value_output_types, memref_inputs, - value_inputs, builder.getStringAttr(task_name)); - - // Creates the task body. - Block *task_body = new Block(); - new_task.getBody().push_back(task_body); - - // Adds memref input arguments. - for (Value input : memref_inputs) { - task_body->addArgument(input.getType(), this->loc); - } - // Adds value input arguments. - for (Value input : value_inputs) { - task_body->addArgument(input.getType(), this->loc); - } - - // Builds value mapping. - IRMapping mapping; - - // Maps memref inputs. - for (auto [source, idx] : source_to_memref_input_idx) { - BlockArgument new_arg = task_body->getArgument(idx); - mapping.map(source, new_arg); - - if (Value orig_arg = resolver.getBlockArg(source)) { - mapping.map(orig_arg, new_arg); - } - } - - // Maps value inputs. - size_t value_arg_offset = memref_inputs.size(); - for (auto [source, idx] : source_to_value_input_idx) { - BlockArgument new_arg = task_body->getArgument(value_arg_offset + idx); - mapping.map(source, new_arg); - - if (Value orig_arg = resolver.getBlockArg(source)) { - mapping.map(orig_arg, new_arg); - } - } - - // Clones counters and hyperblock. 
- OpBuilder task_builder(task_body, task_body->begin()); - cloneCounters(task_builder, hyperblock, mapping); - cloneHyperblock(task_builder, hyperblock, mapping); - - // Creates yield. - SmallVector memref_yield_operands; - for (Value memref : access_info.memref_writes) { - memref_yield_operands.push_back(mapping.lookupOrDefault(memref)); - } - - SmallVector value_yield_operands; - // If this hyperblock has value outputs, we need to yield them from the - // mapped hyperblock. - if (!hyperblock.getOutputs().empty()) { - // Finds the cloned hyperblock op. - TaskflowHyperblockOp cloned_hb = nullptr; - for (Operation &op : task_body->getOperations()) { - if (auto hb = dyn_cast(op)) { - cloned_hb = hb; - break; - } - if (cloned_hb) { - for (Value output : cloned_hb.getOutputs()) { - value_yield_operands.push_back(output); - } - } - } - } - - task_builder.setInsertionPointToEnd(task_body); - task_builder.create(this->loc, memref_yield_operands, - value_yield_operands); - - // Updates latest versions. - auto memref_outputs = new_task.getMemoryOutputs(); - for (auto [source, output] : - llvm::zip(written_memref_sources, memref_outputs)) { - this->memref_to_latest_version[source] = output; - } - - auto value_outputs = new_task.getValueOutputs(); - for (auto [source, output] : - llvm::zip(yielded_value_sources, value_outputs)) { - this->value_to_latest_version[source] = output; - } - - return new_task; - } - -private: - Value getLatestMemrefVersion(Value source) { - auto it = this->memref_to_latest_version.find(source); - return it != this->memref_to_latest_version.end() ? it->second : source; - } - - Value getLatestValueVersion(Value source) { - auto it = this->value_to_latest_version.find(source); - return it != this->value_to_latest_version.end() ? 
it->second : source; - } - - void cloneCounters(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, - IRMapping &mapping) { - CounterCollector collector; - collector.collect(hyperblock); - - for (TaskflowCounterOp counter : collector.getSortedCounters()) { - task_builder.clone(*counter.getOperation(), mapping); - } - } - - void cloneHyperblock(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, - IRMapping &mapping) { - SmallVector mapped_indices; - for (Value idx : hyperblock.getIndices()) { - mapped_indices.push_back(mapping.lookupOrDefault(idx)); - } - - SmallVector mapped_iter_args; - for (Value arg : hyperblock.getIterArgs()) { - mapped_iter_args.push_back(mapping.lookupOrDefault(arg)); - } - - SmallVector output_types(hyperblock.getOutputs().getTypes()); - auto newHB = task_builder.create( - this->loc, output_types, mapped_indices, mapped_iter_args); - - Block *new_body = new Block(); - newHB.getBody().push_back(new_body); - - for (Value idx : mapped_indices) { - new_body->addArgument(idx.getType(), this->loc); - } - - for (Value arg : mapped_iter_args) { - new_body->addArgument(arg.getType(), this->loc); - } - - Block *old_body = &hyperblock.getBody().front(); - for (auto [old_arg, new_arg] : - llvm::zip(old_body->getArguments(), new_body->getArguments())) { - mapping.map(old_arg, new_arg); - } - - OpBuilder hb_builder(new_body, new_body->begin()); - for (Operation &op : old_body->without_terminator()) { - hb_builder.clone(op, mapping); - } - - if (auto yield = - dyn_cast(old_body->getTerminator())) { - SmallVector yield_results; - SmallVector yield_iter_args_next; - for (Value v : yield.getResults()) { - yield_results.push_back(mapping.lookupOrDefault(v)); - } - for (Value v : yield.getIterArgsNext()) { - yield_iter_args_next.push_back(mapping.lookupOrDefault(v)); - } - hb_builder.create(this->loc, yield_results, - yield_iter_args_next); - } else { - hb_builder.create(this->loc); - } - } - - OpBuilder &builder; - Location loc; - unsigned 
global_task_idx; - DenseMap &memref_to_latest_version; - DenseMap &value_to_latest_version; -}; - -//---------------------------------------------------------------------- -// Pass Implementation. -//---------------------------------------------------------------------- - -struct CanonicalizeTaskPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeTaskPass) - - StringRef getArgument() const final { return "canonicalize-task"; } - - StringRef getDescription() const final { - return "Canonicalizes tasks by splitting each hyperblock into a separate " - "atomic task (one hyperblock per task)"; - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry - .insert(); - } - - void runOnOperation() override { - func::FuncOp func_op = getOperation(); - - SmallVector tasks_to_process; - func_op.walk( - [&](TaskflowTaskOp task_op) { tasks_to_process.push_back(task_op); }); - - unsigned global_task_idx = 0; - - for (TaskflowTaskOp original_task : tasks_to_process) { - OpBuilder builder(original_task); - // Collects hyperblocks within the original task. - SmallVector hyperblocks; - original_task.walk( - [&](TaskflowHyperblockOp hb) { hyperblocks.push_back(hb); }); - - assert(!hyperblocks.empty() && - "Expected at least one hyperblock in the task"); - - // If there's only one hyperblock, it is already canonical. - if (hyperblocks.size() == 1) { - std::string task_name = "Task_" + std::to_string(global_task_idx++); - original_task.setTaskNameAttr(builder.getStringAttr(task_name)); - continue; - } - - //---------------------------------------------------------------- - // Step 1: Builds mapping from original task's memory outputs to their - // corresponding source memrefs (the original inputs). - //---------------------------------------------------------------- - // Gets the yield operation to find which memrefs are yielded. 
- auto yield_op = cast( - original_task.getBody().front().getTerminator()); - auto original_mem_outputs = original_task.getMemoryOutputs(); - auto original_val_outputs = original_task.getValueOutputs(); - auto yielded_memrefs = yield_op.getMemoryResults(); - auto yielded_values = yield_op.getValueResults(); - - // Map: yielded -> original task output. - DenseMap yielded_to_output; - for (auto [yielded, output] : - llvm::zip(yielded_memrefs, original_mem_outputs)) { - yielded_to_output[yielded] = output; - } - for (auto [yielded, output] : - llvm::zip(yielded_values, original_val_outputs)) { - yielded_to_output[yielded] = output; - } - - // Map: original input memref -> original task output (if it's yielded). - // This tells us which original outputs correspond to which input memrefs. - Block *orig_body = &original_task.getBody().front(); - auto orig_mem_inputs = original_task.getMemoryInputs(); - auto orig_val_inputs = original_task.getValueInputs(); - - DenseMap source_to_original_output; - - // Maps memref inputs. - for (auto [input, arg] : llvm::zip( - orig_mem_inputs, - orig_body->getArguments().take_front(orig_mem_inputs.size()))) { - if (yielded_to_output.count(arg)) { - source_to_original_output[input] = yielded_to_output[arg]; - } - } - - // Maps value inputs. - for (auto [input, arg] : llvm::zip( - orig_val_inputs, - orig_body->getArguments().drop_front(orig_mem_inputs.size()))) { - if (yielded_to_output.count(arg)) { - source_to_original_output[input] = yielded_to_output[arg]; - } - } - - //---------------------------------------------------------------- - // Step 2: Creates atomic tasks for each hyperblock. - //---------------------------------------------------------------- - // Records the mapping from source memref to the latest version after - // executing each atomic task. 
- DenseMap memref_to_latest_version; - DenseMap value_to_latest_version; - - for (size_t i = 0; i < hyperblocks.size(); ++i) { - AtomicTaskBuilder task_builder( - builder, original_task.getLoc(), global_task_idx++, - memref_to_latest_version, value_to_latest_version); - task_builder.build(hyperblocks[i], original_task); - } - - //---------------------------------------------------------------- - // Step 3: Replaces uses of original task outputs with the latest - // versions. - //---------------------------------------------------------------- - for (auto [source, original_output] : source_to_original_output) { - Value latest = nullptr; - if (memref_to_latest_version.count(source)) { - latest = memref_to_latest_version[source]; - } else if (value_to_latest_version.count(source)) { - latest = value_to_latest_version[source]; - } - - if (latest) { - original_output.replaceAllUsesWith(latest); - } - } - - //---------------------------------------------------------------- - // Step 4: Erase the original task. 
- //---------------------------------------------------------------- - original_task.erase(); - } - } -}; - -} // namespace - -std::unique_ptr mlir::taskflow::createCanonicalizeTaskPass() { - return std::make_unique(); -} \ No newline at end of file From ac60077f98c69010b1c6b004c3438aae5799b28c Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sun, 1 Feb 2026 17:30:03 +0800 Subject: [PATCH 3/9] [fix] fix bug in hyperblock construction --- .../ConstructHyperblockFromTaskPass.cpp | 2 +- .../TosaToTaskflow/affine-to-taskflow.mlir | 27 +++++++++------- .../TosaToTaskflow/tosa-to-taskflow.mlir | 30 ++++++++++-------- test/e2e/tosa_e2e.mlir | 31 ++++++++++--------- test/multi-cgra/kernel_mapping/fir/fir.mlir | 7 ++--- 5 files changed, 53 insertions(+), 44 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index 792412ff..c1e6ddff 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -206,6 +206,7 @@ struct LECPattern { // Detects Loop-Epilogue Code pattern in the task. 
static LECPattern detectLECPattern(affine::AffineForOp outer_loop) { LECPattern pattern; + pattern.has_lec_pattern = false; pattern.outer_loop = outer_loop; Block &body = outer_loop.getRegion().front(); @@ -220,7 +221,6 @@ static LECPattern detectLECPattern(affine::AffineForOp outer_loop) { } else if (!(isa(&op) && op.getOperands().empty())) { if (!found_nested_loop) { pattern.prologue_code.push_back(&op); - pattern.has_lec_pattern = true; } else { pattern.epilogue_code.push_back(&op); pattern.has_lec_pattern = true; diff --git a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir index 3f07f91d..38bb3ca2 100644 --- a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir @@ -1,4 +1,6 @@ -// RUN: mlir-neura-opt --convert-affine-to-taskflow %s 2>/dev/null | FileCheck %s +// RUN: mlir-neura-opt --convert-affine-to-taskflow %s \ +// RUN: -o %t-taskflow.mlir +// RUN: FileCheck %s --input-file=%t-taskflow.mlir // Test Affine to Taskflow conversion module { @@ -13,15 +15,16 @@ module { } } -// CHECK-LABEL: func.func @simple_add -// CHECK-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %arg2) -// CHECK-SAME: task_name = "Task_0" -// CHECK-NEXT: ^bb0(%arg3: memref<16xf32>, %arg4: memref<16xf32>, %arg5: memref<16xf32>): -// CHECK-NEXT: affine.for %arg6 = 0 to 16 { -// CHECK-NEXT: %0 = affine.load %arg3[%arg6] : memref<16xf32> -// CHECK-NEXT: %1 = affine.load %arg4[%arg6] : memref<16xf32> -// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 -// CHECK-NEXT: affine.store %2, %arg5[%arg6] : memref<16xf32> +// CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>, %arg2: memref<16xf32>) { +// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%arg2 : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg2)] : (memref<16xf32>, memref<16xf32>, 
memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: ^bb0(%arg3: memref<16xf32>, %arg4: memref<16xf32>, %arg5: memref<16xf32>): +// CHECK-NEXT: affine.for %arg6 = 0 to 16 { +// CHECK-NEXT: %0 = affine.load %arg3[%arg6] : memref<16xf32> +// CHECK-NEXT: %1 = affine.load %arg4[%arg6] : memref<16xf32> +// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 +// CHECK-NEXT: affine.store %2, %arg5[%arg6] : memref<16xf32> +// CHECK-NEXT: } +// CHECK-NEXT: taskflow.yield writes(%arg5 : memref<16xf32>) +// CHECK-NEXT: } +// CHECK-NEXT: return // CHECK-NEXT: } -// CHECK-NEXT: "taskflow.yield"(%arg5) -// CHECK: return diff --git a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir index 7c2356cf..dd7083ba 100644 --- a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir @@ -1,4 +1,7 @@ -// RUN: mlir-neura-opt --pass-pipeline='builtin.module(func.func(tosa-infer-shapes,tosa-make-broadcastable,tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith,tosa-to-tensor,linalg-fuse-elementwise-ops),one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map},func.func(convert-linalg-to-affine-loops),convert-affine-to-taskflow)' %s 2>&1 | FileCheck %s +// RUN: mlir-neura-opt --pass-pipeline='builtin.module(func.func(tosa-infer-shapes,tosa-make-broadcastable,tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith,tosa-to-tensor,linalg-fuse-elementwise-ops),one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map},func.func(convert-linalg-to-affine-loops),convert-affine-to-taskflow)' %s \ +// RUN: -o %t-taskflow.mlir +// RUN: FileCheck %s --input-file=%t-taskflow.mlir + // Simple TOSA add lowering test func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf32> { @@ -6,16 +9,17 @@ func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16 return %0 : 
tensor<16xf32> } -// CHECK-LABEL: func.func @simple_add -// CHECK: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK: %[[RES:.*]] = "taskflow.task"(%arg0, %arg1, %alloc) -// CHECK-SAME: task_name = "Task_0" -// CHECK-NEXT: ^bb0(%[[BA1:.*]]: memref<16xf32>, %[[BA2:.*]]: memref<16xf32>, %[[BA3:.*]]: memref<16xf32>): -// CHECK-NEXT: affine.for %[[IV:.*]] = 0 to 16 { -// CHECK-NEXT: %0 = affine.load %[[BA1]][%[[IV]]] : memref<16xf32> -// CHECK-NEXT: %1 = affine.load %[[BA2]][%[[IV]]] : memref<16xf32> -// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 -// CHECK-NEXT: affine.store %2, %[[BA3]][%[[IV]]] : memref<16xf32> +// CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { +// CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> +// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): +// CHECK-NEXT: affine.for %arg5 = 0 to 16 { +// CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> +// CHECK-NEXT: %1 = affine.load %arg3[%arg5] : memref<16xf32> +// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 +// CHECK-NEXT: affine.store %2, %arg4[%arg5] : memref<16xf32> +// CHECK-NEXT: } +// CHECK-NEXT: taskflow.yield writes(%arg4 : memref<16xf32>) // CHECK-NEXT: } -// CHECK-NEXT: "taskflow.yield"(%[[BA3]]) -// CHECK: return %[[RES]] : memref<16xf32> +// CHECK-NEXT: return %write_outputs : memref<16xf32> +// CHECK-NEXT: } diff --git a/test/e2e/tosa_e2e.mlir b/test/e2e/tosa_e2e.mlir index 19a75576..f291ffd7 100644 --- a/test/e2e/tosa_e2e.mlir +++ b/test/e2e/tosa_e2e.mlir @@ -1,4 +1,6 @@ -// RUN: mlir-neura-opt %s 
--pass-pipeline='builtin.module(func.func(tosa-infer-shapes,tosa-make-broadcastable,tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith,tosa-to-tensor,linalg-fuse-elementwise-ops),one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map},func.func(convert-linalg-to-affine-loops),convert-affine-to-taskflow)' | FileCheck %s +// RUN: mlir-neura-opt %s --pass-pipeline='builtin.module(func.func(tosa-infer-shapes,tosa-make-broadcastable,tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith,tosa-to-tensor,linalg-fuse-elementwise-ops),one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map},func.func(convert-linalg-to-affine-loops),convert-affine-to-taskflow)' \ +// RUN: -o %t-taskflow.mlir +// RUN: FileCheck %s --input-file=%t-taskflow.mlir // Verifies the end-to-end lowering from TOSA to Taskflow. func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf32> { @@ -7,17 +9,18 @@ func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf return %1 : tensor<16xf32> } -// CHECK-LABEL: func.func @test_e2e -// CHECK: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK: %[[RES:.*]] = "taskflow.task"(%arg0, %arg1, %alloc) -// CHECK-SAME: task_name = "Task_0" -// CHECK-NEXT: ^bb0(%[[BA1:.*]]: memref<16xf32>, %[[BA2:.*]]: memref<16xf32>, %[[BA3:.*]]: memref<16xf32>): -// CHECK-NEXT: affine.for %[[IV:.*]] = 0 to 16 { -// CHECK-NEXT: %0 = affine.load %[[BA1]][%[[IV]]] : memref<16xf32> -// CHECK-NEXT: %1 = affine.load %[[BA2]][%[[IV]]] : memref<16xf32> -// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 -// CHECK-NEXT: %3 = arith.mulf %2, %2 : f32 -// CHECK-NEXT: affine.store %3, %[[BA3]][%[[IV]]] : memref<16xf32> +// CHECK: func.func @test_e2e(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { +// CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> +// CHECK-NEXT: %write_outputs = taskflow.task 
@Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): +// CHECK-NEXT: affine.for %arg5 = 0 to 16 { +// CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> +// CHECK-NEXT: %1 = affine.load %arg3[%arg5] : memref<16xf32> +// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 +// CHECK-NEXT: %3 = arith.mulf %2, %2 : f32 +// CHECK-NEXT: affine.store %3, %arg4[%arg5] : memref<16xf32> +// CHECK-NEXT: } +// CHECK-NEXT: taskflow.yield writes(%arg4 : memref<16xf32>) // CHECK-NEXT: } -// CHECK-NEXT: "taskflow.yield"(%[[BA3]]) -// CHECK: return %[[RES]] : memref<16xf32> +// CHECK-NEXT: return %write_outputs : memref<16xf32> +// CHECK-NEXT: } diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index cc2bf924..d8facaa3 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -4,7 +4,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: -o %t.canonicalized.mlir // RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE @@ -100,7 +99,7 @@ module attributes {} { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// TASKFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: %value_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, 
%arg2)] : (memref, memref, i32) -> (i32) { // TASKFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // TASKFLOW-NEXT: %0 = affine.for %arg6 = 0 to 32 iter_args(%arg7 = %arg5) -> (i32) { // TASKFLOW-NEXT: %1 = affine.load %arg3[%arg6] : memref @@ -109,8 +108,8 @@ module attributes {} { // TASKFLOW-NEXT: %4 = arith.addi %arg7, %3 : i32 // TASKFLOW-NEXT: affine.yield %4 : i32 // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () -// TASKFLOW-NEXT: }) : (memref, memref, i32) -> i32 +// TASKFLOW-NEXT: taskflow.yield values(%0 : i32) +// TASKFLOW-NEXT: } // TASKFLOW-NEXT: return %value_outputs : i32 // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } From 02faa878f4b95a2f22ad0fedcd0648da59b72fe9 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sun, 1 Feb 2026 21:28:39 +0800 Subject: [PATCH 4/9] modify the ple pattern --- include/TaskflowDialect/TaskflowOps.td | 4 +- include/TaskflowDialect/TaskflowPasses.h | 1 + include/TaskflowDialect/TaskflowPasses.td | 14 + lib/TaskflowDialect/TaskflowOps.cpp | 32 +- lib/TaskflowDialect/Transforms/CMakeLists.txt | 1 + .../Transforms/CanonicalizeTaskPass.cpp | 676 ++++++++++++++++++ .../ConstructHyperblockFromTaskPass.cpp | 58 +- .../TosaToTaskflow/affine-to-taskflow.mlir | 2 +- .../TosaToTaskflow/tosa-to-taskflow.mlir | 2 +- test/e2e/tosa_e2e.mlir | 2 +- 10 files changed, 744 insertions(+), 48 deletions(-) create mode 100644 lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td index d4d7c326..8359b0cc 100644 --- a/include/TaskflowDialect/TaskflowOps.td +++ b/include/TaskflowDialect/TaskflowOps.td @@ -61,8 +61,8 @@ def TaskflowTaskOp : TaskflowOpBase<"task", [ }]; let arguments = (ins - Variadic:$read_inputs, - Variadic:$write_inputs, + Variadic:$read_memrefs, + Variadic:$write_memrefs, Variadic:$value_inputs, StrAttr:$task_name, Variadic:$original_read_memrefs, diff --git 
a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index 09a28aee..c4c73b6b 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -18,6 +18,7 @@ namespace taskflow { std::unique_ptr createAffineLoopTreeSerializationPass(); std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); +std::unique_ptr createCanonicalizeTaskPass(); #define GEN_PASS_REGISTRATION #include "TaskflowDialect/TaskflowPasses.h.inc" diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 1e3c55c2..7f2e78b6 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -29,6 +29,20 @@ def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func:: let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; } +def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{ + let summary = "Canonicalizes tasks by splitting each hyperblock into a separate atomic task"; + let description = [{ + This pass splits tasks so that each task contains exactly one hyperblock. + This creates atomic task units that can be analyzed and optimized independently. + + Input: Task with N hyperblocks + Output: N atomic tasks, each containing one hyperblock + + This is a prerequisite pass before fusion optimizations. 
+ }]; + let constructor = "taskflow::createCanonicalizeTaskPass()"; +} + def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let summary = "Classifies counters as root/relay/leaf"; let description = [{ diff --git a/lib/TaskflowDialect/TaskflowOps.cpp b/lib/TaskflowDialect/TaskflowOps.cpp index 44aa255a..06fa3c49 100644 --- a/lib/TaskflowDialect/TaskflowOps.cpp +++ b/lib/TaskflowDialect/TaskflowOps.cpp @@ -19,20 +19,20 @@ ParseResult TaskflowTaskOp::parse(OpAsmParser &parser, OperationState &result) { return failure(); result.addAttribute("task_name", task_name); - // Parses read_inputs: read_inputs(%arg0, %arg1 : memref, + // Parses read_memrefs: read_memrefs(%arg0, %arg1 : memref, // memref). SmallVector read_operands; SmallVector read_types; - if (succeeded(parser.parseOptionalKeyword("read_inputs"))) { + if (succeeded(parser.parseOptionalKeyword("read_memrefs"))) { if (parser.parseLParen() || parser.parseOperandList(read_operands) || parser.parseColonTypeList(read_types) || parser.parseRParen()) return failure(); } - // Parses write_inputs: write_inputs(%arg5 : memref). + // Parses write_memrefs: write_memrefs(%arg5 : memref). SmallVector write_operands; SmallVector write_types; - if (succeeded(parser.parseOptionalKeyword("write_inputs"))) { + if (succeeded(parser.parseOptionalKeyword("write_memrefs"))) { if (parser.parseLParen() || parser.parseOperandList(write_operands) || parser.parseColonTypeList(write_types) || parser.parseRParen()) return failure(); @@ -151,21 +151,21 @@ void TaskflowTaskOp::print(OpAsmPrinter &printer) { // Prints task name. printer << " @" << getTaskName(); - // Prints read_inputs. - if (!getReadInputs().empty()) { - printer << " read_inputs("; - llvm::interleaveComma(getReadInputs(), printer); + // Prints read_memrefs. 
+ if (!getReadMemrefs().empty()) { + printer << " read_memrefs("; + llvm::interleaveComma(getReadMemrefs(), printer); printer << " : "; - llvm::interleaveComma(getReadInputs().getTypes(), printer); + llvm::interleaveComma(getReadMemrefs().getTypes(), printer); printer << ")"; } - // Prints write_inputs. - if (!getWriteInputs().empty()) { - printer << " write_inputs("; - llvm::interleaveComma(getWriteInputs(), printer); + // Prints write_memrefs. + if (!getWriteMemrefs().empty()) { + printer << " write_memrefs("; + llvm::interleaveComma(getWriteMemrefs(), printer); printer << " : "; - llvm::interleaveComma(getWriteInputs().getTypes(), printer); + llvm::interleaveComma(getWriteMemrefs().getTypes(), printer); printer << ")"; } @@ -208,8 +208,8 @@ void TaskflowTaskOp::print(OpAsmPrinter &printer) { // Prints function type. printer << " : ("; - llvm::interleaveComma(llvm::concat(getReadInputs().getTypes(), - getWriteInputs().getTypes(), + llvm::interleaveComma(llvm::concat(getReadMemrefs().getTypes(), + getWriteMemrefs().getTypes(), getValueInputs().getTypes()), printer); printer << ") -> ("; diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index a5443158..ff12e671 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp + CanonicalizeTaskPass.cpp ClassifyCountersPass.cpp DEPENDS diff --git a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp new file mode 100644 index 00000000..636e02b9 --- /dev/null +++ b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp @@ -0,0 +1,676 @@ +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" + +#include 
"mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/Unit.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/SmallVector.h" + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { +//---------------------------------------------------------------------- +// Memory and Value Access Info. +//---------------------------------------------------------------------- +// This struct analyzes accesses information within a hyperblock. +struct AccessInfo { + // Set of read memrefs. + SetVector memref_reads; + // Set of written memrefs. + SetVector memref_writes; + // Set of read values. + SetVector value_reads; + + void analyze(TaskflowHyperblockOp hyperblock, Block *task_body) { + DenseSet task_block_args; + for (Value arg : task_body->getArguments()) { + task_block_args.insert(arg); + } + + hyperblock.walk([&](Operation *op) { + if (auto load = dyn_cast(op)) { + this->memref_reads.insert(load.getMemRef()); + } else if (auto store = dyn_cast(op)) { + this->memref_writes.insert(store.getMemRef()); + } + + for (Value operand : op->getOperands()) { + if (task_block_args.contains(operand)) { + this->value_reads.insert(operand); + } + } + }); + } + + SetVector getReadMemRefs() const { + SetVector all; + all.insert(this->memref_reads.begin(), this->memref_reads.end()); + return all; + } + + SetVector getWriteMemRefs() const { + SetVector all; + all.insert(this->memref_writes.begin(), this->memref_writes.end()); + return all; + } + + SetVector getAllValues() const { return this->value_reads; } +}; + +//---------------------------------------------------------------------- 
+// Counter Collector. +//---------------------------------------------------------------------- +// This class is used to collects all counters needed by a hyperblock. +class CounterCollector { +public: + void collect(TaskflowHyperblockOp hyperblock) { + for (Value idx : hyperblock.getIndices()) { + collectRecursively(idx); + } + } + + // Gets the collected counters sorted by their depth. + SmallVector getSortedCounters() const { + SmallVector result(this->counters.begin(), + this->counters.end()); + llvm::sort(result, [this](TaskflowCounterOp a, TaskflowCounterOp b) { + return getDepth(a) < getDepth(b); + }); + return result; + } + +private: + // Collects counters recursively. + void collectRecursively(Value idx) { + TaskflowCounterOp counter = idx.getDefiningOp(); + if (!counter) { + return; + } + this->counters.insert(counter); + if (Value parent = counter.getParentIndex()) { + collectRecursively(parent); + } + } + + // Gets the depth of a counter. + size_t getDepth(TaskflowCounterOp counter) const { + size_t depth = 0; + Value parent = counter.getParentIndex(); + while (parent) { + depth++; + if (TaskflowCounterOp p = parent.getDefiningOp()) { + parent = p.getParentIndex(); + } else { + break; + } + } + return depth; + } + + SetVector counters; +}; + +//---------------------------------------------------------------------- +// Block Argument Resolver. +//---------------------------------------------------------------------- +// This class resolves the input arguments of a task block to their source +// values. +// For example: +// taskflow.task(%buf_input, %val_input) { +// ^bb0(%arg0: memref, %arg1: i32): // ← block arguments +// // %arg0 corresponds to %buf_input +// // %arg1 corresponds to %val_input +// } +// resolveToSource(%arg0) -> %buf_input +class BlockArgResolver { +public: + explicit BlockArgResolver(TaskflowTaskOp task) { + Block *body = &task.getBody().front(); + + // Resolves memory inputs. 
+ auto read_memrefs = task.getReadMemrefs(); + auto read_args = body->getArguments().take_front(read_memrefs.size()); + for (auto [input, arg] : llvm::zip(read_memrefs, read_args)) { + this->block_arg_to_source[arg] = input; + this->source_to_block_arg[input] = arg; + } + + // Resolves memory inputs. + auto write_memrefs = task.getWriteMemrefs(); + auto mem_args = body->getArguments().take_front(write_memrefs.size()); + for (auto [input, arg] : llvm::zip(write_memrefs, mem_args)) { + this->block_arg_to_source[arg] = input; + this->source_to_block_arg[input] = arg; + } + + // Resolves value inputs. + auto val_inputs = task.getValueInputs(); + auto val_args = body->getArguments().drop_front(read_memrefs.size() + + write_memrefs.size()); + for (auto [input, arg] : llvm::zip(val_inputs, val_args)) { + this->block_arg_to_source[arg] = input; + this->source_to_block_arg[input] = arg; + } + } + + // Gets the source value for a given block argument. + Value resolveToSource(Value val) const { + auto it = this->block_arg_to_source.find(val); + return it != this->block_arg_to_source.end() ? it->second : val; + } + + // Gets the block argument for a given source value. + Value getBlockArg(Value source) const { + auto it = this->source_to_block_arg.find(source); + return it != this->source_to_block_arg.end() ? it->second : Value(); + } + +private: + // Maps block argument to its source value. + DenseMap block_arg_to_source; + // Maps source value to its block argument. + DenseMap source_to_block_arg; +}; + +//---------------------------------------------------------------------- +// Atomic Task Builder. +//---------------------------------------------------------------------- +// This class builds an atomic task from a hyperblock. 
+class AtomicTaskBuilder { +public: + AtomicTaskBuilder(OpBuilder &builder, Location loc, unsigned global_task_idx, + DenseMap &memref_to_latest_version, + DenseMap &value_to_latest_version) + : builder(builder), loc(loc), global_task_idx(global_task_idx), + memref_to_latest_version(memref_to_latest_version), + value_to_latest_version(value_to_latest_version) {} + + TaskflowTaskOp build(TaskflowHyperblockOp hyperblock, + TaskflowTaskOp original_task) { + AccessInfo access_info; + access_info.analyze(hyperblock, &original_task.getBody().front()); + + BlockArgResolver resolver(original_task); + + //------------------------------------------------------ + // Step1: Determines read/write memresfs and value inputs. + //------------------------------------------------------ + SmallVector read_memrefs; + SmallVector write_memrefs; + SmallVector value_inputs; + + DenseMap source_to_read_memref_idx; + DenseMap source_to_write_memref_idx; + DenseMap source_to_value_input_idx; + + // Classifies memrefs into read and write sets. 
+ for (Value memref : access_info.getReadMemRefs()) { + Value source = resolver.resolveToSource(memref); + Value input_memref = getLatestMemrefVersion(source); + + if (!source_to_read_memref_idx.count(source)) { + source_to_read_memref_idx[source] = read_memrefs.size(); + read_memrefs.push_back(input_memref); + } + } + + for (Value memref : access_info.getWriteMemRefs()) { + Value source = resolver.resolveToSource(memref); + Value input_memref = getLatestMemrefVersion(source); + + if (!source_to_write_memref_idx.count(source)) { + source_to_write_memref_idx[source] = write_memrefs.size(); + write_memrefs.push_back(input_memref); + } + } + + for (Value val : access_info.getAllValues()) { + Value source = resolver.resolveToSource(val); + Value input_val = getLatestValueVersion(source); + + if (!source_to_value_input_idx.count(source)) { + source_to_value_input_idx[source] = value_inputs.size(); + value_inputs.push_back(input_val); + } + } + + //------------------------------------------------------ + // Step 2: Determines output types. + //------------------------------------------------------ + // Determines memref outputs. + SmallVector memref_output_types; + // The source memrefs of the written memrefs. + SmallVector written_memref_sources; + + for (Value memref : access_info.memref_writes) { + Value source = resolver.resolveToSource(memref); + memref_output_types.push_back(source.getType()); + written_memref_sources.push_back(source); + } + + // Determines value outputs. + SmallVector value_output_types; + SmallVector yielded_value_sources; + + if (!hyperblock.getOutputs().empty()) { + for (Value output : hyperblock.getOutputs()) { + value_output_types.push_back(output.getType()); + // For value outputs, they are source themselves. + yielded_value_sources.push_back(output); + } + } + + //------------------------------------------------------ + // Step 3: Resolves original_read_memrefs and original_write_memrefs. 
+ //------------------------------------------------------ + // Map: block arg -> original memref (from original task). + DenseMap arg_to_original_read; + DenseMap arg_to_original_write; + + Block *orig_body = &original_task.getBody().front(); + auto orig_read_memrefs = original_task.getOriginalReadMemrefs(); + auto orig_write_memrefs = original_task.getOriginalWriteMemrefs(); + + size_t read_arg_count = original_task.getReadMemrefs().size(); + size_t write_arg_count = original_task.getWriteMemrefs().size(); + + // Maps read args to original read memrefs. + for (auto [orig_memref, arg] : + llvm::zip(orig_read_memrefs, + orig_body->getArguments().take_front(read_arg_count))) { + arg_to_original_read[arg] = orig_memref; + } + + // Maps write args to original write memrefs. + for (auto [orig_memref, arg] : + llvm::zip(orig_write_memrefs, orig_body->getArguments().slice( + read_arg_count, write_arg_count))) { + arg_to_original_write[arg] = orig_memref; + } + + // Collects original memrefs for this new task. + SmallVector new_original_read_memrefs; + SmallVector new_original_write_memrefs; + + for (Value memref : access_info.memref_reads) { + if (arg_to_original_read.count(memref)) { + new_original_read_memrefs.push_back(arg_to_original_read[memref]); + } else if (arg_to_original_write.count(memref)) { + // If reading from a write memref, add to original read. + new_original_read_memrefs.push_back(arg_to_original_write[memref]); + } + } + + for (Value memref : access_info.memref_writes) { + if (arg_to_original_write.count(memref)) { + new_original_write_memrefs.push_back(arg_to_original_write[memref]); + } else if (arg_to_original_read.count(memref)) { + // If writing to a read memref, add to original write. + new_original_write_memrefs.push_back(arg_to_original_read[memref]); + } + } + + //------------------------------------------------------ + // Step 4: Creates the new taskflow.task operation. 
+ //------------------------------------------------------ + std::string task_name = "Task_" + std::to_string(this->global_task_idx); + auto new_task = builder.create( + this->loc, memref_output_types, value_output_types, read_memrefs, + write_memrefs, value_inputs, builder.getStringAttr(task_name), + new_original_read_memrefs, new_original_write_memrefs); + + //------------------------------------------------------ + // Step 5: Builds the task body. + //------------------------------------------------------ + Block *task_body = new Block(); + new_task.getBody().push_back(task_body); + + // Adds block arguments: [read_memrefs, write_memrefs, value_inputs]. + for (Value input : read_memrefs) { + task_body->addArgument(input.getType(), this->loc); + } + for (Value input : write_memrefs) { + task_body->addArgument(input.getType(), this->loc); + } + for (Value input : value_inputs) { + task_body->addArgument(input.getType(), this->loc); + } + + // Builds value mapping. + IRMapping mapping; + + // Maps read memrefs. + for (auto [source, idx] : source_to_read_memref_idx) { + BlockArgument new_arg = task_body->getArgument(idx); + mapping.map(source, new_arg); + + if (Value orig_arg = resolver.getBlockArg(source)) { + mapping.map(orig_arg, new_arg); + } + } + + // Maps write memrefs. + size_t write_arg_offset = read_memrefs.size(); + for (auto [source, idx] : source_to_write_memref_idx) { + BlockArgument new_arg = task_body->getArgument(write_arg_offset + idx); + mapping.map(source, new_arg); + + if (Value orig_arg = resolver.getBlockArg(source)) { + mapping.map(orig_arg, new_arg); + } + } + + // Maps value inputs. 
+ size_t value_arg_offset = read_memrefs.size() + write_memrefs.size(); + for (auto [source, idx] : source_to_value_input_idx) { + BlockArgument new_arg = task_body->getArgument(value_arg_offset + idx); + mapping.map(source, new_arg); + + if (Value orig_arg = resolver.getBlockArg(source)) { + mapping.map(orig_arg, new_arg); + } + } + + // Clones counters and hyperblock. + OpBuilder task_builder(task_body, task_body->begin()); + cloneCounters(task_builder, hyperblock, mapping); + cloneHyperblock(task_builder, hyperblock, mapping); + + // Creates yield. + SmallVector memref_yield_operands; + for (Value memref : access_info.memref_writes) { + memref_yield_operands.push_back(mapping.lookupOrDefault(memref)); + } + + SmallVector value_yield_operands; + // If this hyperblock has value outputs, we need to yield them from the + // mapped hyperblock. + if (!hyperblock.getOutputs().empty()) { + // Finds the cloned hyperblock op. + TaskflowHyperblockOp cloned_hb = nullptr; + for (Operation &op : task_body->getOperations()) { + if (auto hb = dyn_cast(op)) { + cloned_hb = hb; + break; + } + if (cloned_hb) { + for (Value output : cloned_hb.getOutputs()) { + value_yield_operands.push_back(output); + } + } + } + } + + task_builder.setInsertionPointToEnd(task_body); + task_builder.create(this->loc, memref_yield_operands, + value_yield_operands); + + //------------------------------------------------------ + // Step 6: Updates latest versions. + //------------------------------------------------------ + // Updates latest versions. 
+ auto memref_outputs = new_task.getWriteOutputs(); + for (auto [source, output] : + llvm::zip(written_memref_sources, memref_outputs)) { + this->memref_to_latest_version[source] = output; + } + + auto value_outputs = new_task.getValueOutputs(); + for (auto [source, output] : + llvm::zip(yielded_value_sources, value_outputs)) { + this->value_to_latest_version[source] = output; + } + + return new_task; + } + +private: + Value getLatestMemrefVersion(Value source) { + auto it = this->memref_to_latest_version.find(source); + return it != this->memref_to_latest_version.end() ? it->second : source; + } + + Value getLatestValueVersion(Value source) { + auto it = this->value_to_latest_version.find(source); + return it != this->value_to_latest_version.end() ? it->second : source; + } + + void cloneCounters(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, + IRMapping &mapping) { + CounterCollector collector; + collector.collect(hyperblock); + + for (TaskflowCounterOp counter : collector.getSortedCounters()) { + task_builder.clone(*counter.getOperation(), mapping); + } + } + + void cloneHyperblock(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, + IRMapping &mapping) { + SmallVector mapped_indices; + for (Value idx : hyperblock.getIndices()) { + mapped_indices.push_back(mapping.lookupOrDefault(idx)); + } + + SmallVector mapped_iter_args; + for (Value arg : hyperblock.getIterArgs()) { + mapped_iter_args.push_back(mapping.lookupOrDefault(arg)); + } + + SmallVector output_types(hyperblock.getOutputs().getTypes()); + auto newHB = task_builder.create( + this->loc, output_types, mapped_indices, mapped_iter_args); + + Block *new_body = new Block(); + newHB.getBody().push_back(new_body); + + for (Value idx : mapped_indices) { + new_body->addArgument(idx.getType(), this->loc); + } + + for (Value arg : mapped_iter_args) { + new_body->addArgument(arg.getType(), this->loc); + } + + Block *old_body = &hyperblock.getBody().front(); + for (auto [old_arg, new_arg] : + 
llvm::zip(old_body->getArguments(), new_body->getArguments())) { + mapping.map(old_arg, new_arg); + } + + OpBuilder hb_builder(new_body, new_body->begin()); + for (Operation &op : old_body->without_terminator()) { + hb_builder.clone(op, mapping); + } + + if (auto yield = + dyn_cast(old_body->getTerminator())) { + SmallVector yield_results; + SmallVector yield_iter_args_next; + for (Value v : yield.getResults()) { + yield_results.push_back(mapping.lookupOrDefault(v)); + } + for (Value v : yield.getIterArgsNext()) { + yield_iter_args_next.push_back(mapping.lookupOrDefault(v)); + } + hb_builder.create(this->loc, yield_results, + yield_iter_args_next); + } else { + hb_builder.create(this->loc); + } + } + + OpBuilder &builder; + Location loc; + unsigned global_task_idx; + DenseMap &memref_to_latest_version; + DenseMap &value_to_latest_version; +}; + +//---------------------------------------------------------------------- +// Pass Implementation. +//---------------------------------------------------------------------- + +struct CanonicalizeTaskPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeTaskPass) + + StringRef getArgument() const final { return "canonicalize-task"; } + + StringRef getDescription() const final { + return "Canonicalizes tasks by splitting each hyperblock into a separate " + "atomic task (one hyperblock per task)"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry + .insert(); + } + + void runOnOperation() override { + func::FuncOp func_op = getOperation(); + + SmallVector tasks_to_process; + func_op.walk( + [&](TaskflowTaskOp task_op) { tasks_to_process.push_back(task_op); }); + + unsigned global_task_idx = 0; + + for (TaskflowTaskOp original_task : tasks_to_process) { + OpBuilder builder(original_task); + // Collects hyperblocks within the original task. 
+ SmallVector hyperblocks; + original_task.walk( + [&](TaskflowHyperblockOp hb) { hyperblocks.push_back(hb); }); + + assert(!hyperblocks.empty() && + "Expected at least one hyperblock in the task"); + + // If there's only one hyperblock, it is already canonical. + if (hyperblocks.size() == 1) { + std::string task_name = "Task_" + std::to_string(global_task_idx++); + original_task.setTaskNameAttr(builder.getStringAttr(task_name)); + continue; + } + + //---------------------------------------------------------------- + // Step 1: Builds mapping from original task's memory outputs to their + // corresponding source memrefs (the original inputs). + //---------------------------------------------------------------- + // Gets the yield operation to find which memrefs are yielded. + auto yield_op = cast( + original_task.getBody().front().getTerminator()); + + auto original_write_outputs = original_task.getWriteOutputs(); + auto original_val_outputs = original_task.getValueOutputs(); + auto yielded_memrefs = yield_op.getMemoryResults(); + auto yielded_values = yield_op.getValueResults(); + + // Map: yielded -> original task output. + DenseMap yielded_to_output; + for (auto [yielded, output] : + llvm::zip(yielded_memrefs, original_write_outputs)) { + yielded_to_output[yielded] = output; + } + for (auto [yielded, output] : + llvm::zip(yielded_values, original_val_outputs)) { + yielded_to_output[yielded] = output; + } + + // Map: original input memref -> original task output (if it's yielded). + // This tells us which original outputs correspond to which input memrefs. + Block *orig_body = &original_task.getBody().front(); + auto orig_read_memrefs = original_task.getReadMemrefs(); + auto orig_write_memrefs = original_task.getWriteMemrefs(); + auto orig_val_inputs = original_task.getValueInputs(); + + DenseMap source_to_original_output; + + // Maps read memrefs. 
+ for (auto [input, arg] : + llvm::zip(orig_read_memrefs, orig_body->getArguments().take_front( + orig_read_memrefs.size()))) { + if (yielded_to_output.count(arg)) { + source_to_original_output[input] = yielded_to_output[arg]; + } + } + + // Maps write memrefs. + size_t write_offset = orig_read_memrefs.size(); + for (auto [input, arg] : + llvm::zip(orig_write_memrefs, + orig_body->getArguments().slice( + write_offset, orig_write_memrefs.size()))) { + if (yielded_to_output.count(arg)) { + source_to_original_output[input] = yielded_to_output[arg]; + } + } + + // Maps value inputs. + for (auto [input, arg] : + llvm::zip(orig_val_inputs, + orig_body->getArguments().drop_front( + write_offset + orig_write_memrefs.size()))) { + if (yielded_to_output.count(arg)) { + source_to_original_output[input] = yielded_to_output[arg]; + } + } + + //---------------------------------------------------------------- + // Step 2: Creates atomic tasks for each hyperblock. + //---------------------------------------------------------------- + // Records the mapping from source memref to the latest version after + // executing each atomic task. + DenseMap memref_to_latest_version; + DenseMap value_to_latest_version; + + for (size_t i = 0; i < hyperblocks.size(); ++i) { + AtomicTaskBuilder task_builder( + builder, original_task.getLoc(), global_task_idx++, + memref_to_latest_version, value_to_latest_version); + task_builder.build(hyperblocks[i], original_task); + } + + //---------------------------------------------------------------- + // Step 3: Replaces uses of original task outputs with the latest + // versions. 
+ //---------------------------------------------------------------- + for (auto [source, original_output] : source_to_original_output) { + Value latest = nullptr; + if (memref_to_latest_version.count(source)) { + latest = memref_to_latest_version[source]; + } else if (value_to_latest_version.count(source)) { + latest = value_to_latest_version[source]; + } + + if (latest) { + original_output.replaceAllUsesWith(latest); + } + } + + //---------------------------------------------------------------- + // Step 4: Erase the original task. + //---------------------------------------------------------------- + original_task.erase(); + } + } +}; + +} // namespace + +std::unique_ptr mlir::taskflow::createCanonicalizeTaskPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index c1e6ddff..e8a9927c 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -59,8 +59,8 @@ struct HyperblockInfo { // The corresponding loop. affine::AffineForOp loop_op = nullptr; - // Marks if this hyperblock follows the LEC pattern. - bool is_lec_pattern = false; + // Marks if this hyperblock follows the PLE pattern. + bool is_ple_pattern = false; }; //---------------------------------------------------------------------------- @@ -180,33 +180,33 @@ getTopLevelLoopsInfo(SmallVector &loops_info) { } //---------------------------------------------------------------------------- -// Loop-Epilogue Code (LEC) Pattern Detection +// Prologue-Loop-Epilogue Code (PLE) Pattern Detection //---------------------------------------------------------------------------- -// Loop-Epilogue Code means code that appears after an inner loop. 
-// Example: -// for %i (outer loop) { +// Prologue-Loop-Epilogue Code means code that appears before and after an inner +// loop. Example: for %i (outer loop) { +// // for %j (nested loop) { // // } // ← Loop-Epilogue Code // } -// For this pattern, we need to wrap the inner loop and the epilogue code into -// a hyperblock. Only by doing this can we maintain the hyperblock as a pure -// data-driven code block. -struct LECPattern { +// For this pattern, we need to wrap the inner loop and the prologue-epilogue +// code into a hyperblock. Only by doing this can we maintain the hyperblock as +// a pure data-driven code block. +struct PLEPattern { affine::AffineForOp outer_loop; affine::AffineForOp inner_loop; SmallVector prologue_code; SmallVector epilogue_code; - bool has_lec_pattern = false; + bool has_ple_pattern = false; }; -// Detects Loop-Epilogue Code pattern in the task. -static LECPattern detectLECPattern(affine::AffineForOp outer_loop) { - LECPattern pattern; - pattern.has_lec_pattern = false; +// Detects Prologue-Loop-Epilogue Code pattern in the task. +static PLEPattern detectPLEPattern(affine::AffineForOp outer_loop) { + PLEPattern pattern; + pattern.has_ple_pattern = false; pattern.outer_loop = outer_loop; Block &body = outer_loop.getRegion().front(); @@ -223,11 +223,15 @@ static LECPattern detectLECPattern(affine::AffineForOp outer_loop) { pattern.prologue_code.push_back(&op); } else { pattern.epilogue_code.push_back(&op); - pattern.has_lec_pattern = true; + pattern.has_ple_pattern = true; } } } + if (found_nested_loop && (!pattern.prologue_code.empty())) { + pattern.has_ple_pattern = true; + } + return pattern; } @@ -252,7 +256,7 @@ static void extractHyperblocksInfoFromRegion( for (Operation &op : block.getOperations()) { if (auto for_op = dyn_cast(&op)) { - LECPattern lec_pattern = detectLECPattern(for_op); + PLEPattern ple_pattern = detectPLEPattern(for_op); // Gets the loop info. 
LoopInfo *loop_info = loop_info_map.lookup(for_op); @@ -263,8 +267,8 @@ static void extractHyperblocksInfoFromRegion( SmallVector loop_indices = parent_indices; loop_indices.push_back(loop_info->counter_index); - // Handles the LEC pattern. - if (lec_pattern.has_lec_pattern) { + // Handles the PLE pattern. + if (ple_pattern.has_ple_pattern) { // 1. Emits any accumulated operations as a hyperblock. if (!current_block_ops.empty()) { HyperblockInfo info; @@ -278,22 +282,22 @@ static void extractHyperblocksInfoFromRegion( // 2. Creates a hyperblock for the prologue + inner loop + epilogue. HyperblockInfo info; - if (!lec_pattern.prologue_code.empty()) { - info.operations.append(lec_pattern.prologue_code.begin(), - lec_pattern.prologue_code.end()); + if (!ple_pattern.prologue_code.empty()) { + info.operations.append(ple_pattern.prologue_code.begin(), + ple_pattern.prologue_code.end()); } - info.operations.push_back(lec_pattern.inner_loop); + info.operations.push_back(ple_pattern.inner_loop); - if (!lec_pattern.epilogue_code.empty()) { - info.operations.append(lec_pattern.epilogue_code.begin(), - lec_pattern.epilogue_code.end()); + if (!ple_pattern.epilogue_code.empty()) { + info.operations.append(ple_pattern.epilogue_code.begin(), + ple_pattern.epilogue_code.end()); } info.trigger_indices = loop_indices; info.is_loop_body = true; info.loop_op = for_op; - info.is_lec_pattern = true; + info.is_ple_pattern = true; hyperblocks_info.push_back(info); // No need for further processing of this loop. 
Since we have already diff --git a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir index 38bb3ca2..658e3062 100644 --- a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir @@ -16,7 +16,7 @@ module { } // CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>, %arg2: memref<16xf32>) { -// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%arg2 : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg2)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%arg2 : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg2)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg3: memref<16xf32>, %arg4: memref<16xf32>, %arg5: memref<16xf32>): // CHECK-NEXT: affine.for %arg6 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg3[%arg6] : memref<16xf32> diff --git a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir index dd7083ba..32931f5f 100644 --- a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir @@ -11,7 +11,7 @@ func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16 // CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { // CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : 
(memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): // CHECK-NEXT: affine.for %arg5 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> diff --git a/test/e2e/tosa_e2e.mlir b/test/e2e/tosa_e2e.mlir index f291ffd7..a8474588 100644 --- a/test/e2e/tosa_e2e.mlir +++ b/test/e2e/tosa_e2e.mlir @@ -11,7 +11,7 @@ func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf // CHECK: func.func @test_e2e(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { // CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): // CHECK-NEXT: affine.for %arg5 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> From 074615b53b045df332c77542a67474cb1e9f7f56 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sun, 1 Feb 2026 23:15:14 +0800 Subject: [PATCH 5/9] enable atomic canonical task creation --- 
include/TaskflowDialect/TaskflowPasses.h | 7 +- include/TaskflowDialect/TaskflowPasses.td | 19 +- .../TaskflowToNeura/TaskflowToNeuraPass.cpp | 6 + lib/TaskflowDialect/Transforms/CMakeLists.txt | 1 - .../Transforms/CanonicalizeTaskPass.cpp | 676 ------------------ .../ConstructHyperblockFromTaskPass.cpp | 8 +- test/multi-cgra/kernel_mapping/fir/fir.mlir | 72 +- .../loop-in-kernel/loop-in-kernel.mlir | 94 +-- test/multi-cgra/kernel_mapping/relu/relu.mlir | 187 +++-- .../irregular-loop/irregular-loop.mlir | 284 ++++---- .../taskflow/multi-nested/multi-nested.mlir | 363 +++++----- .../parallel-nested/parallel-nested.mlir | 113 ++- 12 files changed, 574 insertions(+), 1256 deletions(-) delete mode 100644 lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index c4c73b6b..88c9d5bb 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -15,10 +15,13 @@ namespace taskflow { // Passes defined in TaskflowPasses.td #define GEN_PASS_DECL #include "TaskflowDialect/TaskflowPasses.h.inc" -std::unique_ptr createAffineLoopTreeSerializationPass(); std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); -std::unique_ptr createCanonicalizeTaskPass(); + +//=========================================================// +// Optimization Passes +//=========================================================// +std::unique_ptr createAffineLoopTreeSerializationPass(); #define GEN_PASS_REGISTRATION #include "TaskflowDialect/TaskflowPasses.h.inc" diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 7f2e78b6..7c6b5a17 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -6,7 +6,7 @@ include "mlir/Pass/PassBase.td" //=========================================================// -// Passes for the Taskflow 
dialect +// Passes for Task Level Optimizations //=========================================================// def AffineLoopTreeSerialization : Pass<"affine-loop-tree-serialization", "ModuleOp">{ let summary = "Serializes top-level affine.for loops into minimized task operations"; @@ -21,6 +21,9 @@ def AffineLoopTreeSerialization : Pass<"affine-loop-tree-serialization", "Module "mlir::func::FuncDialect"]; } +//=========================================================// +// Passes for the Taskflow dialect +//=========================================================// def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { let summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; let description = [{ @@ -29,20 +32,6 @@ def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func:: let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; } -def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{ - let summary = "Canonicalizes tasks by splitting each hyperblock into a separate atomic task"; - let description = [{ - This pass splits tasks so that each task contains exactly one hyperblock. - This creates atomic task units that can be analyzed and optimized independently. - - Input: Task with N hyperblocks - Output: N atomic tasks, each containing one hyperblock - - This is a prerequisite pass before fusion optimizations. 
- }]; - let constructor = "taskflow::createCanonicalizeTaskPass()"; -} - def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let summary = "Classifies counters as root/relay/leaf"; let description = [{ diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp index fc34a545..f0eb7cb0 100644 --- a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp +++ b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp @@ -54,6 +54,12 @@ struct HyperblockToKernelPattern return failure(); } + // Asserts that each task contains only one hyperblock. + int hyperblock_count = 0; + task_op.walk([&](TaskflowHyperblockOp op) { hyperblock_count++; }); + assert(hyperblock_count == 1 && + "Each taskflow.task should contain only one hyperblock"); + Block &hb_block = hyperblock_op.getBody().front(); Block &task_block = task_op.getBody().front(); diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index ff12e671..a5443158 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -2,7 +2,6 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp - CanonicalizeTaskPass.cpp ClassifyCountersPass.cpp DEPENDS diff --git a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp deleted file mode 100644 index 636e02b9..00000000 --- a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp +++ /dev/null @@ -1,676 +0,0 @@ -#include "TaskflowDialect/TaskflowDialect.h" -#include "TaskflowDialect/TaskflowOps.h" -#include "TaskflowDialect/TaskflowPasses.h" - -#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include 
"mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/IRMapping.h" -#include "mlir/IR/Unit.h" -#include "mlir/IR/Value.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LLVM.h" -#include "mlir/Transforms/DialectConversion.h" -#include "llvm/ADT/SmallVector.h" - -using namespace mlir; -using namespace mlir::taskflow; - -namespace { -//---------------------------------------------------------------------- -// Memory and Value Access Info. -//---------------------------------------------------------------------- -// This struct analyzes accesses information within a hyperblock. -struct AccessInfo { - // Set of read memrefs. - SetVector memref_reads; - // Set of written memrefs. - SetVector memref_writes; - // Set of read values. - SetVector value_reads; - - void analyze(TaskflowHyperblockOp hyperblock, Block *task_body) { - DenseSet task_block_args; - for (Value arg : task_body->getArguments()) { - task_block_args.insert(arg); - } - - hyperblock.walk([&](Operation *op) { - if (auto load = dyn_cast(op)) { - this->memref_reads.insert(load.getMemRef()); - } else if (auto store = dyn_cast(op)) { - this->memref_writes.insert(store.getMemRef()); - } - - for (Value operand : op->getOperands()) { - if (task_block_args.contains(operand)) { - this->value_reads.insert(operand); - } - } - }); - } - - SetVector getReadMemRefs() const { - SetVector all; - all.insert(this->memref_reads.begin(), this->memref_reads.end()); - return all; - } - - SetVector getWriteMemRefs() const { - SetVector all; - all.insert(this->memref_writes.begin(), this->memref_writes.end()); - return all; - } - - SetVector getAllValues() const { return this->value_reads; } -}; - -//---------------------------------------------------------------------- -// Counter Collector. -//---------------------------------------------------------------------- -// This class is used to collects all counters needed by a hyperblock. 
-class CounterCollector { -public: - void collect(TaskflowHyperblockOp hyperblock) { - for (Value idx : hyperblock.getIndices()) { - collectRecursively(idx); - } - } - - // Gets the collected counters sorted by their depth. - SmallVector getSortedCounters() const { - SmallVector result(this->counters.begin(), - this->counters.end()); - llvm::sort(result, [this](TaskflowCounterOp a, TaskflowCounterOp b) { - return getDepth(a) < getDepth(b); - }); - return result; - } - -private: - // Collects counters recursively. - void collectRecursively(Value idx) { - TaskflowCounterOp counter = idx.getDefiningOp(); - if (!counter) { - return; - } - this->counters.insert(counter); - if (Value parent = counter.getParentIndex()) { - collectRecursively(parent); - } - } - - // Gets the depth of a counter. - size_t getDepth(TaskflowCounterOp counter) const { - size_t depth = 0; - Value parent = counter.getParentIndex(); - while (parent) { - depth++; - if (TaskflowCounterOp p = parent.getDefiningOp()) { - parent = p.getParentIndex(); - } else { - break; - } - } - return depth; - } - - SetVector counters; -}; - -//---------------------------------------------------------------------- -// Block Argument Resolver. -//---------------------------------------------------------------------- -// This class resolves the input arguments of a task block to their source -// values. -// For example: -// taskflow.task(%buf_input, %val_input) { -// ^bb0(%arg0: memref, %arg1: i32): // ← block arguments -// // %arg0 corresponds to %buf_input -// // %arg1 corresponds to %val_input -// } -// resolveToSource(%arg0) -> %buf_input -class BlockArgResolver { -public: - explicit BlockArgResolver(TaskflowTaskOp task) { - Block *body = &task.getBody().front(); - - // Resolves memory inputs. 
- auto read_memrefs = task.getReadMemrefs(); - auto read_args = body->getArguments().take_front(read_memrefs.size()); - for (auto [input, arg] : llvm::zip(read_memrefs, read_args)) { - this->block_arg_to_source[arg] = input; - this->source_to_block_arg[input] = arg; - } - - // Resolves memory inputs. - auto write_memrefs = task.getWriteMemrefs(); - auto mem_args = body->getArguments().take_front(write_memrefs.size()); - for (auto [input, arg] : llvm::zip(write_memrefs, mem_args)) { - this->block_arg_to_source[arg] = input; - this->source_to_block_arg[input] = arg; - } - - // Resolves value inputs. - auto val_inputs = task.getValueInputs(); - auto val_args = body->getArguments().drop_front(read_memrefs.size() + - write_memrefs.size()); - for (auto [input, arg] : llvm::zip(val_inputs, val_args)) { - this->block_arg_to_source[arg] = input; - this->source_to_block_arg[input] = arg; - } - } - - // Gets the source value for a given block argument. - Value resolveToSource(Value val) const { - auto it = this->block_arg_to_source.find(val); - return it != this->block_arg_to_source.end() ? it->second : val; - } - - // Gets the block argument for a given source value. - Value getBlockArg(Value source) const { - auto it = this->source_to_block_arg.find(source); - return it != this->source_to_block_arg.end() ? it->second : Value(); - } - -private: - // Maps block argument to its source value. - DenseMap block_arg_to_source; - // Maps source value to its block argument. - DenseMap source_to_block_arg; -}; - -//---------------------------------------------------------------------- -// Atomic Task Builder. -//---------------------------------------------------------------------- -// This class builds an atomic task from a hyperblock. 
-class AtomicTaskBuilder { -public: - AtomicTaskBuilder(OpBuilder &builder, Location loc, unsigned global_task_idx, - DenseMap &memref_to_latest_version, - DenseMap &value_to_latest_version) - : builder(builder), loc(loc), global_task_idx(global_task_idx), - memref_to_latest_version(memref_to_latest_version), - value_to_latest_version(value_to_latest_version) {} - - TaskflowTaskOp build(TaskflowHyperblockOp hyperblock, - TaskflowTaskOp original_task) { - AccessInfo access_info; - access_info.analyze(hyperblock, &original_task.getBody().front()); - - BlockArgResolver resolver(original_task); - - //------------------------------------------------------ - // Step1: Determines read/write memresfs and value inputs. - //------------------------------------------------------ - SmallVector read_memrefs; - SmallVector write_memrefs; - SmallVector value_inputs; - - DenseMap source_to_read_memref_idx; - DenseMap source_to_write_memref_idx; - DenseMap source_to_value_input_idx; - - // Classifies memrefs into read and write sets. 
- for (Value memref : access_info.getReadMemRefs()) { - Value source = resolver.resolveToSource(memref); - Value input_memref = getLatestMemrefVersion(source); - - if (!source_to_read_memref_idx.count(source)) { - source_to_read_memref_idx[source] = read_memrefs.size(); - read_memrefs.push_back(input_memref); - } - } - - for (Value memref : access_info.getWriteMemRefs()) { - Value source = resolver.resolveToSource(memref); - Value input_memref = getLatestMemrefVersion(source); - - if (!source_to_write_memref_idx.count(source)) { - source_to_write_memref_idx[source] = write_memrefs.size(); - write_memrefs.push_back(input_memref); - } - } - - for (Value val : access_info.getAllValues()) { - Value source = resolver.resolveToSource(val); - Value input_val = getLatestValueVersion(source); - - if (!source_to_value_input_idx.count(source)) { - source_to_value_input_idx[source] = value_inputs.size(); - value_inputs.push_back(input_val); - } - } - - //------------------------------------------------------ - // Step 2: Determines output types. - //------------------------------------------------------ - // Determines memref outputs. - SmallVector memref_output_types; - // The source memrefs of the written memrefs. - SmallVector written_memref_sources; - - for (Value memref : access_info.memref_writes) { - Value source = resolver.resolveToSource(memref); - memref_output_types.push_back(source.getType()); - written_memref_sources.push_back(source); - } - - // Determines value outputs. - SmallVector value_output_types; - SmallVector yielded_value_sources; - - if (!hyperblock.getOutputs().empty()) { - for (Value output : hyperblock.getOutputs()) { - value_output_types.push_back(output.getType()); - // For value outputs, they are source themselves. - yielded_value_sources.push_back(output); - } - } - - //------------------------------------------------------ - // Step 3: Resolves original_read_memrefs and original_write_memrefs. 
- //------------------------------------------------------ - // Map: block arg -> original memref (from original task). - DenseMap arg_to_original_read; - DenseMap arg_to_original_write; - - Block *orig_body = &original_task.getBody().front(); - auto orig_read_memrefs = original_task.getOriginalReadMemrefs(); - auto orig_write_memrefs = original_task.getOriginalWriteMemrefs(); - - size_t read_arg_count = original_task.getReadMemrefs().size(); - size_t write_arg_count = original_task.getWriteMemrefs().size(); - - // Maps read args to original read memrefs. - for (auto [orig_memref, arg] : - llvm::zip(orig_read_memrefs, - orig_body->getArguments().take_front(read_arg_count))) { - arg_to_original_read[arg] = orig_memref; - } - - // Maps write args to original write memrefs. - for (auto [orig_memref, arg] : - llvm::zip(orig_write_memrefs, orig_body->getArguments().slice( - read_arg_count, write_arg_count))) { - arg_to_original_write[arg] = orig_memref; - } - - // Collects original memrefs for this new task. - SmallVector new_original_read_memrefs; - SmallVector new_original_write_memrefs; - - for (Value memref : access_info.memref_reads) { - if (arg_to_original_read.count(memref)) { - new_original_read_memrefs.push_back(arg_to_original_read[memref]); - } else if (arg_to_original_write.count(memref)) { - // If reading from a write memref, add to original read. - new_original_read_memrefs.push_back(arg_to_original_write[memref]); - } - } - - for (Value memref : access_info.memref_writes) { - if (arg_to_original_write.count(memref)) { - new_original_write_memrefs.push_back(arg_to_original_write[memref]); - } else if (arg_to_original_read.count(memref)) { - // If writing to a read memref, add to original write. - new_original_write_memrefs.push_back(arg_to_original_read[memref]); - } - } - - //------------------------------------------------------ - // Step 4: Creates the new taskflow.task operation. 
- //------------------------------------------------------ - std::string task_name = "Task_" + std::to_string(this->global_task_idx); - auto new_task = builder.create( - this->loc, memref_output_types, value_output_types, read_memrefs, - write_memrefs, value_inputs, builder.getStringAttr(task_name), - new_original_read_memrefs, new_original_write_memrefs); - - //------------------------------------------------------ - // Step 5: Builds the task body. - //------------------------------------------------------ - Block *task_body = new Block(); - new_task.getBody().push_back(task_body); - - // Adds block arguments: [read_memrefs, write_memrefs, value_inputs]. - for (Value input : read_memrefs) { - task_body->addArgument(input.getType(), this->loc); - } - for (Value input : write_memrefs) { - task_body->addArgument(input.getType(), this->loc); - } - for (Value input : value_inputs) { - task_body->addArgument(input.getType(), this->loc); - } - - // Builds value mapping. - IRMapping mapping; - - // Maps read memrefs. - for (auto [source, idx] : source_to_read_memref_idx) { - BlockArgument new_arg = task_body->getArgument(idx); - mapping.map(source, new_arg); - - if (Value orig_arg = resolver.getBlockArg(source)) { - mapping.map(orig_arg, new_arg); - } - } - - // Maps write memrefs. - size_t write_arg_offset = read_memrefs.size(); - for (auto [source, idx] : source_to_write_memref_idx) { - BlockArgument new_arg = task_body->getArgument(write_arg_offset + idx); - mapping.map(source, new_arg); - - if (Value orig_arg = resolver.getBlockArg(source)) { - mapping.map(orig_arg, new_arg); - } - } - - // Maps value inputs. 
- size_t value_arg_offset = read_memrefs.size() + write_memrefs.size(); - for (auto [source, idx] : source_to_value_input_idx) { - BlockArgument new_arg = task_body->getArgument(value_arg_offset + idx); - mapping.map(source, new_arg); - - if (Value orig_arg = resolver.getBlockArg(source)) { - mapping.map(orig_arg, new_arg); - } - } - - // Clones counters and hyperblock. - OpBuilder task_builder(task_body, task_body->begin()); - cloneCounters(task_builder, hyperblock, mapping); - cloneHyperblock(task_builder, hyperblock, mapping); - - // Creates yield. - SmallVector memref_yield_operands; - for (Value memref : access_info.memref_writes) { - memref_yield_operands.push_back(mapping.lookupOrDefault(memref)); - } - - SmallVector value_yield_operands; - // If this hyperblock has value outputs, we need to yield them from the - // mapped hyperblock. - if (!hyperblock.getOutputs().empty()) { - // Finds the cloned hyperblock op. - TaskflowHyperblockOp cloned_hb = nullptr; - for (Operation &op : task_body->getOperations()) { - if (auto hb = dyn_cast(op)) { - cloned_hb = hb; - break; - } - if (cloned_hb) { - for (Value output : cloned_hb.getOutputs()) { - value_yield_operands.push_back(output); - } - } - } - } - - task_builder.setInsertionPointToEnd(task_body); - task_builder.create(this->loc, memref_yield_operands, - value_yield_operands); - - //------------------------------------------------------ - // Step 6: Updates latest versions. - //------------------------------------------------------ - // Updates latest versions. 
- auto memref_outputs = new_task.getWriteOutputs(); - for (auto [source, output] : - llvm::zip(written_memref_sources, memref_outputs)) { - this->memref_to_latest_version[source] = output; - } - - auto value_outputs = new_task.getValueOutputs(); - for (auto [source, output] : - llvm::zip(yielded_value_sources, value_outputs)) { - this->value_to_latest_version[source] = output; - } - - return new_task; - } - -private: - Value getLatestMemrefVersion(Value source) { - auto it = this->memref_to_latest_version.find(source); - return it != this->memref_to_latest_version.end() ? it->second : source; - } - - Value getLatestValueVersion(Value source) { - auto it = this->value_to_latest_version.find(source); - return it != this->value_to_latest_version.end() ? it->second : source; - } - - void cloneCounters(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, - IRMapping &mapping) { - CounterCollector collector; - collector.collect(hyperblock); - - for (TaskflowCounterOp counter : collector.getSortedCounters()) { - task_builder.clone(*counter.getOperation(), mapping); - } - } - - void cloneHyperblock(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, - IRMapping &mapping) { - SmallVector mapped_indices; - for (Value idx : hyperblock.getIndices()) { - mapped_indices.push_back(mapping.lookupOrDefault(idx)); - } - - SmallVector mapped_iter_args; - for (Value arg : hyperblock.getIterArgs()) { - mapped_iter_args.push_back(mapping.lookupOrDefault(arg)); - } - - SmallVector output_types(hyperblock.getOutputs().getTypes()); - auto newHB = task_builder.create( - this->loc, output_types, mapped_indices, mapped_iter_args); - - Block *new_body = new Block(); - newHB.getBody().push_back(new_body); - - for (Value idx : mapped_indices) { - new_body->addArgument(idx.getType(), this->loc); - } - - for (Value arg : mapped_iter_args) { - new_body->addArgument(arg.getType(), this->loc); - } - - Block *old_body = &hyperblock.getBody().front(); - for (auto [old_arg, new_arg] : - 
llvm::zip(old_body->getArguments(), new_body->getArguments())) { - mapping.map(old_arg, new_arg); - } - - OpBuilder hb_builder(new_body, new_body->begin()); - for (Operation &op : old_body->without_terminator()) { - hb_builder.clone(op, mapping); - } - - if (auto yield = - dyn_cast(old_body->getTerminator())) { - SmallVector yield_results; - SmallVector yield_iter_args_next; - for (Value v : yield.getResults()) { - yield_results.push_back(mapping.lookupOrDefault(v)); - } - for (Value v : yield.getIterArgsNext()) { - yield_iter_args_next.push_back(mapping.lookupOrDefault(v)); - } - hb_builder.create(this->loc, yield_results, - yield_iter_args_next); - } else { - hb_builder.create(this->loc); - } - } - - OpBuilder &builder; - Location loc; - unsigned global_task_idx; - DenseMap &memref_to_latest_version; - DenseMap &value_to_latest_version; -}; - -//---------------------------------------------------------------------- -// Pass Implementation. -//---------------------------------------------------------------------- - -struct CanonicalizeTaskPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeTaskPass) - - StringRef getArgument() const final { return "canonicalize-task"; } - - StringRef getDescription() const final { - return "Canonicalizes tasks by splitting each hyperblock into a separate " - "atomic task (one hyperblock per task)"; - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry - .insert(); - } - - void runOnOperation() override { - func::FuncOp func_op = getOperation(); - - SmallVector tasks_to_process; - func_op.walk( - [&](TaskflowTaskOp task_op) { tasks_to_process.push_back(task_op); }); - - unsigned global_task_idx = 0; - - for (TaskflowTaskOp original_task : tasks_to_process) { - OpBuilder builder(original_task); - // Collects hyperblocks within the original task. 
- SmallVector hyperblocks; - original_task.walk( - [&](TaskflowHyperblockOp hb) { hyperblocks.push_back(hb); }); - - assert(!hyperblocks.empty() && - "Expected at least one hyperblock in the task"); - - // If there's only one hyperblock, it is already canonical. - if (hyperblocks.size() == 1) { - std::string task_name = "Task_" + std::to_string(global_task_idx++); - original_task.setTaskNameAttr(builder.getStringAttr(task_name)); - continue; - } - - //---------------------------------------------------------------- - // Step 1: Builds mapping from original task's memory outputs to their - // corresponding source memrefs (the original inputs). - //---------------------------------------------------------------- - // Gets the yield operation to find which memrefs are yielded. - auto yield_op = cast( - original_task.getBody().front().getTerminator()); - - auto original_write_outputs = original_task.getWriteOutputs(); - auto original_val_outputs = original_task.getValueOutputs(); - auto yielded_memrefs = yield_op.getMemoryResults(); - auto yielded_values = yield_op.getValueResults(); - - // Map: yielded -> original task output. - DenseMap yielded_to_output; - for (auto [yielded, output] : - llvm::zip(yielded_memrefs, original_write_outputs)) { - yielded_to_output[yielded] = output; - } - for (auto [yielded, output] : - llvm::zip(yielded_values, original_val_outputs)) { - yielded_to_output[yielded] = output; - } - - // Map: original input memref -> original task output (if it's yielded). - // This tells us which original outputs correspond to which input memrefs. - Block *orig_body = &original_task.getBody().front(); - auto orig_read_memrefs = original_task.getReadMemrefs(); - auto orig_write_memrefs = original_task.getWriteMemrefs(); - auto orig_val_inputs = original_task.getValueInputs(); - - DenseMap source_to_original_output; - - // Maps read memrefs. 
- for (auto [input, arg] : - llvm::zip(orig_read_memrefs, orig_body->getArguments().take_front( - orig_read_memrefs.size()))) { - if (yielded_to_output.count(arg)) { - source_to_original_output[input] = yielded_to_output[arg]; - } - } - - // Maps write memrefs. - size_t write_offset = orig_read_memrefs.size(); - for (auto [input, arg] : - llvm::zip(orig_write_memrefs, - orig_body->getArguments().slice( - write_offset, orig_write_memrefs.size()))) { - if (yielded_to_output.count(arg)) { - source_to_original_output[input] = yielded_to_output[arg]; - } - } - - // Maps value inputs. - for (auto [input, arg] : - llvm::zip(orig_val_inputs, - orig_body->getArguments().drop_front( - write_offset + orig_write_memrefs.size()))) { - if (yielded_to_output.count(arg)) { - source_to_original_output[input] = yielded_to_output[arg]; - } - } - - //---------------------------------------------------------------- - // Step 2: Creates atomic tasks for each hyperblock. - //---------------------------------------------------------------- - // Records the mapping from source memref to the latest version after - // executing each atomic task. - DenseMap memref_to_latest_version; - DenseMap value_to_latest_version; - - for (size_t i = 0; i < hyperblocks.size(); ++i) { - AtomicTaskBuilder task_builder( - builder, original_task.getLoc(), global_task_idx++, - memref_to_latest_version, value_to_latest_version); - task_builder.build(hyperblocks[i], original_task); - } - - //---------------------------------------------------------------- - // Step 3: Replaces uses of original task outputs with the latest - // versions. 
- //---------------------------------------------------------------- - for (auto [source, original_output] : source_to_original_output) { - Value latest = nullptr; - if (memref_to_latest_version.count(source)) { - latest = memref_to_latest_version[source]; - } else if (value_to_latest_version.count(source)) { - latest = value_to_latest_version[source]; - } - - if (latest) { - original_output.replaceAllUsesWith(latest); - } - } - - //---------------------------------------------------------------- - // Step 4: Erase the original task. - //---------------------------------------------------------------- - original_task.erase(); - } - } -}; - -} // namespace - -std::unique_ptr mlir::taskflow::createCanonicalizeTaskPass() { - return std::make_unique(); -} \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index e8a9927c..bb503c5d 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -25,7 +25,7 @@ using namespace mlir::taskflow; namespace { //--------------------------------------------------------------------------- -// Loop Info Structure +// Loop Info Structure. //---------------------------------------------------------------------------- struct LoopInfo { affine::AffineForOp for_op; @@ -42,7 +42,7 @@ struct LoopInfo { }; //--------------------------------------------------------------------------- -// Hyperblock Info Structure +// Hyperblock Info Structure. //---------------------------------------------------------------------------- // Represents a code block that should become a hyperblock. struct HyperblockInfo { @@ -64,7 +64,7 @@ struct HyperblockInfo { }; //---------------------------------------------------------------------------- -// Helper Functions +// Helper Functions. 
//---------------------------------------------------------------------------- // Extracts loop parameters from affine.for operation. static std::optional extractLoopBound(affine::AffineForOp for_op) { @@ -123,7 +123,7 @@ static SmallVector collectLoopInfo(TaskflowTaskOp task_op) { } //---------------------------------------------------------------------------- -// Counter Chain Creation +// Counter Chain Creation. //---------------------------------------------------------------------------- // Recursively creates counter chain for each top-level loop. static void createCounterChainRecursivly(OpBuilder &builder, Location loc, diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index d8facaa3..8927cbf6 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -4,12 +4,11 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: -o %t.canonicalized.mlir -// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE +// RUN: -o %t.hyperblock.mlir +// RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: -o %t.kernel.mlir @@ -17,7 +16,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -33,7 +31,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -56,7 +53,6 @@ // RUN: mlir-neura-opt %s 
--convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -99,7 +95,7 @@ module attributes {} { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// TASKFLOW-NEXT: %value_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { +// TASKFLOW-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { // TASKFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // TASKFLOW-NEXT: %0 = affine.for %arg6 = 0 to 32 iter_args(%arg7 = %arg5) -> (i32) { // TASKFLOW-NEXT: %1 = affine.load %arg3[%arg6] : memref @@ -114,30 +110,30 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// CANONICALIZE: module { -// CANONICALIZE-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// CANONICALIZE-NEXT: %c0_i32 = arith.constant 0 : i32 -// CANONICALIZE-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): -// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// CANONICALIZE-NEXT: %1 = "taskflow.hyperblock"(%0, %arg5) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg6: index, %arg7: i32): -// CANONICALIZE-NEXT: %2 = memref.load %arg3[%arg6] : memref -// CANONICALIZE-NEXT: %3 = 
memref.load %arg4[%arg6] : memref -// CANONICALIZE-NEXT: %4 = arith.muli %2, %3 : i32 -// CANONICALIZE-NEXT: %5 = arith.addi %arg7, %4 : i32 -// CANONICALIZE-NEXT: taskflow.hyperblock.yield iter_args_next(%5 : i32) results(%5 : i32) -// CANONICALIZE-NEXT: }) : (index, i32) -> i32 -// CANONICALIZE-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, i32) -> i32 -// CANONICALIZE-NEXT: return %value_outputs : i32 -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 +// HYPERBLOCK-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { +// HYPERBLOCK-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// HYPERBLOCK-NEXT: %1 = "taskflow.hyperblock"(%0, %arg5) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg6: index, %arg7: i32): +// HYPERBLOCK-NEXT: %2 = memref.load %arg3[%arg6] : memref +// HYPERBLOCK-NEXT: %3 = memref.load %arg4[%arg6] : memref +// HYPERBLOCK-NEXT: %4 = arith.muli %2, %3 : i32 +// HYPERBLOCK-NEXT: %5 = arith.addi %arg7, %4 : i32 +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%5 : i32) results(%5 : i32) +// HYPERBLOCK-NEXT: }) : (index, i32) -> i32 +// HYPERBLOCK-NEXT: taskflow.yield values(%1 : i32) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: return %value_outputs : i32 +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } // KERNEL: module { // KERNEL-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // 
KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 -// KERNEL-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// KERNEL-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { // KERNEL-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // KERNEL-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) { @@ -149,8 +145,8 @@ module attributes {} { // KERNEL-NEXT: %6 = arith.addi %arg8, %5 : i32 // KERNEL-NEXT: neura.yield iter_args_next(%6 : i32) results(%6 : i32) // KERNEL-NEXT: } : i32 -// KERNEL-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// KERNEL-NEXT: }) : (memref, memref, i32) -> i32 +// KERNEL-NEXT: taskflow.yield values(%1 : i32) +// KERNEL-NEXT: } // KERNEL-NEXT: return %value_outputs : i32 // KERNEL-NEXT: } // KERNEL-NEXT: } @@ -158,7 +154,7 @@ module attributes {} { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// NEURA-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { // NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, 
step = 1 : index, upper_bound = 32 : index} : index // NEURA-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura"} { @@ -170,8 +166,8 @@ module attributes {} { // NEURA-NEXT: %6 = "neura.add"(%arg8, %5) : (i32, i32) -> i32 // NEURA-NEXT: neura.yield iter_args_next(%6 : i32) results(%6 : i32) // NEURA-NEXT: } : i32 -// NEURA-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// NEURA-NEXT: }) : (memref, memref, i32) -> i32 +// NEURA-NEXT: taskflow.yield values(%1 : i32) +// NEURA-NEXT: } // NEURA-NEXT: return %value_outputs : i32 // NEURA-NEXT: } // NEURA-NEXT: } @@ -179,7 +175,7 @@ module attributes {} { // DATAFLOW: module { // DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// DATAFLOW-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { // DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // DATAFLOW-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { @@ -199,8 +195,8 @@ module attributes {} { // DATAFLOW-NEXT: neura.return_value %12 : !neura.data // DATAFLOW-NEXT: neura.yield // DATAFLOW-NEXT: } : i32 -// DATAFLOW-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// DATAFLOW-NEXT: }) : (memref, memref, i32) -> i32 +// 
DATAFLOW-NEXT: taskflow.yield values(%1 : i32) +// DATAFLOW-NEXT: } // DATAFLOW-NEXT: return %value_outputs : i32 // DATAFLOW-NEXT: } // DATAFLOW-NEXT: } @@ -208,7 +204,7 @@ module attributes {} { // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// MAPPED-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { // MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // MAPPED-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 2 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { @@ -240,8 +236,8 @@ module attributes {} { // MAPPED-NEXT: neura.return_value %24 : !neura.data {dfg_id = 25 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 2 : i32}]} // MAPPED-NEXT: neura.yield {dfg_id = 3 : i32} // MAPPED-NEXT: } : i32 -// MAPPED-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// MAPPED-NEXT: }) : (memref, memref, i32) -> i32 +// MAPPED-NEXT: taskflow.yield values(%1 : i32) +// MAPPED-NEXT: } // MAPPED-NEXT: return %value_outputs : i32 // MAPPED-NEXT: } // MAPPED-NEXT: } \ No newline at end of file 
diff --git a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir index f926d548..1802e538 100644 --- a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir +++ b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir @@ -55,7 +55,7 @@ module { func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { %c0_i32 = arith.constant 0 : i32 - %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ + %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): %1 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) { ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -68,8 +68,8 @@ module { } neura.yield results(%0 : i32) } : i32 - "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () - }) : (memref, memref, i32) -> i32 + taskflow.yield values(%1 : i32) + } return %value_outputs : i32 } } @@ -77,7 +77,7 @@ module { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// NEURA-NEXT: %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { // NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura"} { // NEURA-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: 
i32): @@ -101,56 +101,56 @@ module { // NEURA-NEXT: ^bb3: // pred: ^bb1 // NEURA-NEXT: neura.yield results(%6 : i32) // NEURA-NEXT: } : i32 -// NEURA-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () -// NEURA-NEXT: }) : (memref, memref, i32) -> i32 +// NEURA-NEXT: taskflow.yield values(%0 : i32) +// NEURA-NEXT: } // NEURA-NEXT: return %value_outputs : i32 // NEURA-NEXT: } // NEURA-NEXT: } -// DATAFLOW: module { -// DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): -// DATAFLOW-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { -// DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): -// DATAFLOW-NEXT: %1 = "neura.grant_once"() <{constant_value = "%input2"}> : () -> !neura.data -// DATAFLOW-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> !neura.data -// DATAFLOW-NEXT: %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %4 = "neura.grant_once"(%3) : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %5 = neura.reserve : !neura.data -// DATAFLOW-NEXT: %6 = neura.phi_start %1, %5 : !neura.data, !neura.data -> !neura.data -// DATAFLOW-NEXT: %7 = neura.reserve : !neura.data -// DATAFLOW-NEXT: %8 = neura.phi_start %4, %7 : !neura.data, !neura.data -> !neura.data -// DATAFLOW-NEXT: %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %10 = "neura.icmp"(%9) <{cmpType = "slt"}> {rhs_value = 32 : index} : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %11 = neura.grant_predicate %9, %10 
: !neura.data, !neura.data -> !neura.data -// DATAFLOW-NEXT: %12 = neura.grant_predicate %6, %10 : !neura.data, !neura.data -> !neura.data -// DATAFLOW-NEXT: %13 = "neura.not"(%10) : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %14 = neura.grant_predicate %6, %13 : !neura.data, !neura.data -> !neura.data -// DATAFLOW-NEXT: neura.return_value %14 : !neura.data -// DATAFLOW-NEXT: %15 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input0"} : !neura.data -// DATAFLOW-NEXT: %16 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input1"} : !neura.data -// DATAFLOW-NEXT: %17 = "neura.mul"(%15, %16) : (!neura.data, !neura.data) -> !neura.data -// DATAFLOW-NEXT: %18 = "neura.add"(%12, %17) : (!neura.data, !neura.data) -> !neura.data -// DATAFLOW-NEXT: %19 = "neura.add"(%11) {rhs_value = 1 : index} : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %20 = "neura.cast"(%19) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: neura.ctrl_mov %20 -> %7 : !neura.data !neura.data -// DATAFLOW-NEXT: neura.ctrl_mov %18 -> %5 : !neura.data !neura.data -// DATAFLOW-NEXT: neura.yield -// DATAFLOW-NEXT: } : i32 -// DATAFLOW-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () -// DATAFLOW-NEXT: }) : (memref, memref, i32) -> i32 -// DATAFLOW-NEXT: return %value_outputs : i32 -// DATAFLOW-NEXT: } -// DATAFLOW-NEXT:} +// DATAFLOW: module { +// DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// DATAFLOW-NEXT: %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { +// DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// DATAFLOW-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { +// DATAFLOW-NEXT: 
^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// DATAFLOW-NEXT: %1 = "neura.grant_once"() <{constant_value = "%input2"}> : () -> !neura.data +// DATAFLOW-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> !neura.data +// DATAFLOW-NEXT: %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %4 = "neura.grant_once"(%3) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %5 = neura.reserve : !neura.data +// DATAFLOW-NEXT: %6 = neura.phi_start %1, %5 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %7 = neura.reserve : !neura.data +// DATAFLOW-NEXT: %8 = neura.phi_start %4, %7 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %10 = "neura.icmp"(%9) <{cmpType = "slt"}> {rhs_value = 32 : index} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %11 = neura.grant_predicate %9, %10 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %12 = neura.grant_predicate %6, %10 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %13 = "neura.not"(%10) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %14 = neura.grant_predicate %6, %13 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: neura.return_value %14 : !neura.data +// DATAFLOW-NEXT: %15 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %16 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input1"} : !neura.data +// DATAFLOW-NEXT: %17 = "neura.mul"(%15, %16) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %18 = "neura.add"(%12, %17) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %19 = "neura.add"(%11) {rhs_value = 1 : index} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %20 = "neura.cast"(%19) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: neura.ctrl_mov %20 -> %7 : !neura.data !neura.data +// 
DATAFLOW-NEXT: neura.ctrl_mov %18 -> %5 : !neura.data !neura.data +// DATAFLOW-NEXT: neura.yield +// DATAFLOW-NEXT: } : i32 +// DATAFLOW-NEXT: taskflow.yield values(%0 : i32) +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT: return %value_outputs : i32 +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT: } // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// MAPPED-NEXT: %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { // MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 4 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { // MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -193,11 +193,13 @@ module { // MAPPED-NEXT: neura.ctrl_mov %32 -> %3 {dfg_id = 37 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}]} : !neura.data !neura.data // MAPPED-NEXT: neura.yield {dfg_id = 4 : i32} // MAPPED-NEXT: } : i32 -// MAPPED-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () -// MAPPED-NEXT: }) : (memref, memref, i32) -> i32 +// MAPPED-NEXT: taskflow.yield values(%0 : i32) +// MAPPED-NEXT: } // MAPPED-NEXT: return %value_outputs : i32 // MAPPED-NEXT: } // MAPPED-NEXT: } + + diff --git a/test/multi-cgra/kernel_mapping/relu/relu.mlir b/test/multi-cgra/kernel_mapping/relu/relu.mlir index 
ebede17a..e5727ded 100644 --- a/test/multi-cgra/kernel_mapping/relu/relu.mlir +++ b/test/multi-cgra/kernel_mapping/relu/relu.mlir @@ -4,13 +4,11 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ -// RUN: -o %t.canonicalized.mlir -// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE +// RUN: -o %t.hyperblock.mlir +// RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: -o %t.kernel.mlir @@ -18,7 +16,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -34,7 +31,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -57,7 +53,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -101,81 +96,81 @@ module attributes {} { } } -// TASKFLOW: module { -// TASKFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { -// TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// TASKFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// TASKFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): -// TASKFLOW-NEXT: affine.for %arg5 = 0 to 32 { -// TASKFLOW-NEXT: 
%0 = affine.load %arg2[%arg5] : memref -// TASKFLOW-NEXT: %1 = arith.cmpi sgt, %0, %arg4 : i32 -// TASKFLOW-NEXT: scf.if %1 { -// TASKFLOW-NEXT: %2 = affine.load %arg2[%arg5] : memref -// TASKFLOW-NEXT: %3 = affine.load %arg3[%arg5] : memref -// TASKFLOW-NEXT: %4 = arith.addi %3, %2 : i32 -// TASKFLOW-NEXT: affine.store %4, %arg3[%arg5] : memref -// TASKFLOW-NEXT: } else { -// TASKFLOW-NEXT: %2 = affine.load %arg3[%arg5] : memref -// TASKFLOW-NEXT: affine.store %2, %arg3[%arg5] : memref -// TASKFLOW-NEXT: } -// TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// TASKFLOW-NEXT: }) : (memref, memref, i32) -> memref -// TASKFLOW-NEXT: return -// TASKFLOW-NEXT: } -// TASKFLOW-NEXT: } +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : (memref, memref, memref, i32) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): +// TASKFLOW-NEXT: affine.for %arg6 = 0 to 32 { +// TASKFLOW-NEXT: %0 = affine.load %arg2[%arg6] : memref +// TASKFLOW-NEXT: %1 = arith.cmpi sgt, %0, %arg5 : i32 +// TASKFLOW-NEXT: scf.if %1 { +// TASKFLOW-NEXT: %2 = affine.load %arg2[%arg6] : memref +// TASKFLOW-NEXT: %3 = affine.load %arg4[%arg6] : memref +// TASKFLOW-NEXT: %4 = arith.addi %3, %2 : i32 +// TASKFLOW-NEXT: affine.store %4, %arg4[%arg6] : memref +// TASKFLOW-NEXT: } else { +// TASKFLOW-NEXT: %2 = affine.load %arg4[%arg6] : memref +// TASKFLOW-NEXT: affine.store %2, %arg4[%arg6] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg4 : memref) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: 
return +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT:} -// CANONICALIZE: module { -// CANONICALIZE-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { -// CANONICALIZE-NEXT: %c0_i32 = arith.constant 0 : i32 -// CANONICALIZE-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): -// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg5: index): -// CANONICALIZE-NEXT: %1 = memref.load %arg2[%arg5] : memref -// CANONICALIZE-NEXT: %2 = arith.cmpi sgt, %1, %arg4 : i32 -// CANONICALIZE-NEXT: scf.if %2 { -// CANONICALIZE-NEXT: %3 = memref.load %arg2[%arg5] : memref -// CANONICALIZE-NEXT: %4 = memref.load %arg3[%arg5] : memref -// CANONICALIZE-NEXT: %5 = arith.addi %4, %3 : i32 -// CANONICALIZE-NEXT: memref.store %5, %arg3[%arg5] : memref -// CANONICALIZE-NEXT: } else { -// CANONICALIZE-NEXT: %3 = memref.load %arg3[%arg5] : memref -// CANONICALIZE-NEXT: memref.store %3, %arg3[%arg5] : memref -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, i32) -> memref -// CANONICALIZE-NEXT: return -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 +// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) 
value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : (memref, memref, memref, i32) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): +// HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg6: index): +// HYPERBLOCK-NEXT: %1 = memref.load %arg2[%arg6] : memref +// HYPERBLOCK-NEXT: %2 = arith.cmpi sgt, %1, %arg5 : i32 +// HYPERBLOCK-NEXT: scf.if %2 { +// HYPERBLOCK-NEXT: %3 = memref.load %arg2[%arg6] : memref +// HYPERBLOCK-NEXT: %4 = memref.load %arg4[%arg6] : memref +// HYPERBLOCK-NEXT: %5 = arith.addi %4, %3 : i32 +// HYPERBLOCK-NEXT: memref.store %5, %arg4[%arg6] : memref +// HYPERBLOCK-NEXT: } else { +// HYPERBLOCK-NEXT: %3 = memref.load %arg4[%arg6] : memref +// HYPERBLOCK-NEXT: memref.store %3, %arg4[%arg6] : memref +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg4 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: return +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } // KERNEL: module { // KERNEL-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 -// KERNEL-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// KERNEL-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// KERNEL-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : (memref, memref, memref, i32) -> (memref) { +// KERNEL-NEXT: ^bb0(%arg2: memref, 
%arg3: memref, %arg4: memref, %arg5: i32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) { -// KERNEL-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) { +// KERNEL-NEXT: ^bb0(%arg6: memref, %arg7: i32, %arg8: memref): // KERNEL-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// KERNEL-NEXT: %2 = memref.load %arg5[%1] : memref -// KERNEL-NEXT: %3 = arith.cmpi sgt, %2, %arg6 : i32 +// KERNEL-NEXT: %2 = memref.load %arg6[%1] : memref +// KERNEL-NEXT: %3 = arith.cmpi sgt, %2, %arg7 : i32 // KERNEL-NEXT: scf.if %3 { -// KERNEL-NEXT: %4 = memref.load %arg5[%1] : memref -// KERNEL-NEXT: %5 = memref.load %arg7[%1] : memref +// KERNEL-NEXT: %4 = memref.load %arg6[%1] : memref +// KERNEL-NEXT: %5 = memref.load %arg8[%1] : memref // KERNEL-NEXT: %6 = arith.addi %5, %4 : i32 -// KERNEL-NEXT: memref.store %6, %arg7[%1] : memref +// KERNEL-NEXT: memref.store %6, %arg8[%1] : memref // KERNEL-NEXT: } else { -// KERNEL-NEXT: %4 = memref.load %arg7[%1] : memref -// KERNEL-NEXT: memref.store %4, %arg7[%1] : memref +// KERNEL-NEXT: %4 = memref.load %arg8[%1] : memref +// KERNEL-NEXT: memref.store %4, %arg8[%1] : memref // KERNEL-NEXT: } // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// KERNEL-NEXT: }) : (memref, memref, i32) -> memref +// KERNEL-NEXT: taskflow.yield writes(%arg4 : memref) +// KERNEL-NEXT: } // KERNEL-NEXT: return // KERNEL-NEXT: } // KERNEL-NEXT: } @@ -183,30 +178,30 @@ module attributes {} { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes 
{llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// NEURA-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// NEURA-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : (memref, memref, memref, i32) -> (memref) { +// NEURA-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// NEURA-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura"} { -// NEURA-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// NEURA-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) attributes {accelerator = "neura"} { +// NEURA-NEXT: ^bb0(%arg6: memref, %arg7: i32, %arg8: memref): // NEURA-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// NEURA-NEXT: %2 = neura.load_indexed %arg5[%1 : index] memref : i32 -// NEURA-NEXT: %3 = "neura.icmp"(%2, %arg6) <{cmpType = "sgt"}> : (i32, i32) -> i1 +// NEURA-NEXT: %2 = neura.load_indexed %arg6[%1 : index] memref : i32 +// NEURA-NEXT: %3 = "neura.icmp"(%2, %arg7) <{cmpType = "sgt"}> : (i32, i32) -> i1 // NEURA-NEXT: neura.cond_br %3 : i1 then to ^bb1 else to ^bb2 // NEURA-NEXT: ^bb1: // pred: ^bb0 -// NEURA-NEXT: %4 = neura.load_indexed %arg5[%1 : index] memref : i32 -// NEURA-NEXT: %5 = neura.load_indexed %arg7[%1 : index] memref : i32 +// NEURA-NEXT: %4 = neura.load_indexed %arg6[%1 : index] memref : i32 +// NEURA-NEXT: %5 = 
neura.load_indexed %arg8[%1 : index] memref : i32 // NEURA-NEXT: %6 = "neura.add"(%5, %4) : (i32, i32) -> i32 -// NEURA-NEXT: neura.store_indexed %6 to %arg7[%1 : index] memref : i32 +// NEURA-NEXT: neura.store_indexed %6 to %arg8[%1 : index] memref : i32 // NEURA-NEXT: neura.br to ^bb3 // NEURA-NEXT: ^bb2: // pred: ^bb0 -// NEURA-NEXT: %7 = neura.load_indexed %arg7[%1 : index] memref : i32 -// NEURA-NEXT: neura.store_indexed %7 to %arg7[%1 : index] memref : i32 +// NEURA-NEXT: %7 = neura.load_indexed %arg8[%1 : index] memref : i32 +// NEURA-NEXT: neura.store_indexed %7 to %arg8[%1 : index] memref : i32 // NEURA-NEXT: neura.br to ^bb3 // NEURA-NEXT: ^bb3: // 2 preds: ^bb1, ^bb2 // NEURA-NEXT: neura.yield // NEURA-NEXT: } -// NEURA-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// NEURA-NEXT: }) : (memref, memref, i32) -> memref +// NEURA-NEXT: taskflow.yield writes(%arg4 : memref) +// NEURA-NEXT: } // NEURA-NEXT: return // NEURA-NEXT: } // NEURA-NEXT: } @@ -214,11 +209,11 @@ module attributes {} { // DATAFLOW: module { // DATAFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// DATAFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// DATAFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : (memref, memref, memref, i32) -> (memref) { +// DATAFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// 
DATAFLOW-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate"} { -// DATAFLOW-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// DATAFLOW-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate"} { +// DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: i32, %arg8: memref): // DATAFLOW-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : !neura.data // DATAFLOW-NEXT: %2 = neura.load_indexed [%1 : !neura.data] {lhs_value = "%input0"} : !neura.data // DATAFLOW-NEXT: %3 = "neura.icmp"(%2) <{cmpType = "sgt"}> {rhs_value = "%input1"} : (!neura.data) -> !neura.data @@ -233,8 +228,8 @@ module attributes {} { // DATAFLOW-NEXT: neura.store_indexed %10 to [%4 : !neura.data] {rhs_value = "%input2"} : !neura.data // DATAFLOW-NEXT: neura.yield {yield_type = "void"} // DATAFLOW-NEXT: } -// DATAFLOW-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// DATAFLOW-NEXT: }) : (memref, memref, i32) -> memref +// DATAFLOW-NEXT: taskflow.yield writes(%arg4 : memref) +// DATAFLOW-NEXT: } // DATAFLOW-NEXT: return // DATAFLOW-NEXT: } // DATAFLOW-NEXT: } @@ -242,11 +237,11 @@ module attributes {} { // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// MAPPED-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// MAPPED-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : 
(memref, memref, memref, i32) -> (memref) { +// MAPPED-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// MAPPED-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 1 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { -// MAPPED-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// MAPPED-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 1 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: i32, %arg8: memref): // MAPPED-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", dfg_id = 0 : i32, lower_bound = 0 : index, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 1 : i32}], step = 1 : index, upper_bound = 32 : index} : !neura.data // MAPPED-NEXT: %2 = "neura.data_mov"(%1) {dfg_id = 2 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data // MAPPED-NEXT: %3 = neura.load_indexed [%2 : !neura.data] {dfg_id = 5 : i32, lhs_value = "%input0", mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data @@ -277,10 +272,8 @@ module attributes {} { // 
MAPPED-NEXT: neura.store_indexed %25 to [%26 : !neura.data] {dfg_id = 28 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 1 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 1 : i32}], rhs_value = "%input2"} : !neura.data // MAPPED-NEXT: neura.yield {dfg_id = 1 : i32, yield_type = "void"} // MAPPED-NEXT: } -// MAPPED-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// MAPPED-NEXT: }) : (memref, memref, i32) -> memref +// MAPPED-NEXT: taskflow.yield writes(%arg4 : memref) +// MAPPED-NEXT: } // MAPPED-NEXT: return // MAPPED-NEXT: } -// MAPPED-NEXT: } - - +// MAPPED-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 9d1e6f46..906bc267 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -1,18 +1,18 @@ -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: -o %t.serialized.mlir +// RUN: FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED + +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: -o %t.taskflow.mlir // RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: -o %t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ -// RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ -// RUN: -o %t.canonicalized.mlir -// RUN: FileCheck %s --input-file=%t.canonicalized.mlir 
--check-prefixes=CANONICALIZE - #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)> module attributes {} { func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { @@ -49,6 +49,45 @@ module attributes {} { } } +// SERIALIZED: module { +// SERIALIZED-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { +// SERIALIZED-NEXT: %c2_i32 = arith.constant 2 : i32 +// SERIALIZED-NEXT: %c8_i32 = arith.constant 8 : i32 +// SERIALIZED-NEXT: %c0_i32 = arith.constant 0 : i32 +// SERIALIZED-NEXT: %alloca = memref.alloca() : memref +// SERIALIZED-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> +// SERIALIZED-NEXT: %0 = affine.for %arg0 = 0 to 5 iter_args(%arg1 = %c0_i32) -> (i32) { +// SERIALIZED-NEXT: %2 = arith.index_cast %arg0 : index to i32 +// SERIALIZED-NEXT: %3 = arith.addi %arg1, %2 : i32 +// SERIALIZED-NEXT: affine.yield %3 : i32 +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg0 = 0 to 4 { +// SERIALIZED-NEXT: %2 = arith.index_cast %arg0 : index to i32 +// SERIALIZED-NEXT: %3 = arith.muli %2, %c8_i32 : i32 +// SERIALIZED-NEXT: affine.for %arg1 = 0 to 8 { +// SERIALIZED-NEXT: %4 = arith.index_cast %arg1 : index to i32 +// SERIALIZED-NEXT: %5 = arith.addi %3, %4 : i32 +// SERIALIZED-NEXT: affine.store %5, %alloca_0[%arg0, %arg1] : memref<4x8xi32> +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg0 = 0 to 4 { +// SERIALIZED-NEXT: %2 = arith.index_cast %arg0 : index to i32 +// SERIALIZED-NEXT: %3 = arith.muli %2, %c8_i32 : i32 +// SERIALIZED-NEXT: affine.for %arg1 = 0 to 8 { +// SERIALIZED-NEXT: %4 = affine.load %alloca_0[%arg0, %arg1] : memref<4x8xi32> +// SERIALIZED-NEXT: %5 = arith.addi %4, %0 : i32 +// SERIALIZED-NEXT: affine.if #set(%arg0, %arg1) { +// SERIALIZED-NEXT: affine.store %5, %alloca[] : memref +// SERIALIZED-NEXT: %6 = arith.muli %5, %c2_i32 : i32 +// SERIALIZED-NEXT: affine.store %6, %alloca[] : memref +// SERIALIZED-NEXT: } +// 
SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: %1 = affine.load %alloca[] : memref +// SERIALIZED-NEXT: return %1 : i32 +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } + // TASKFLOW: #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)> // TASKFLOW-NEXT: module { // TASKFLOW-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { @@ -57,26 +96,34 @@ module attributes {} { // TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 // TASKFLOW-NEXT: %alloca = memref.alloca() : memref // TASKFLOW-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> -// TASKFLOW-NEXT: %value_outputs = "taskflow.task"(%c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: %value_outputs = taskflow.task @Task_0 value_inputs(%c0_i32 : i32) : (i32) -> (i32) { // TASKFLOW-NEXT: ^bb0(%arg0: i32): // TASKFLOW-NEXT: %1 = affine.for %arg1 = 0 to 5 iter_args(%arg2 = %arg0) -> (i32) { // TASKFLOW-NEXT: %2 = arith.index_cast %arg1 : index to i32 // TASKFLOW-NEXT: %3 = arith.addi %arg2, %2 : i32 // TASKFLOW-NEXT: affine.yield %3 : i32 // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// TASKFLOW-NEXT: }) : (i32) -> i32 -// TASKFLOW-NEXT: %memory_outputs:2 = "taskflow.task"(%alloca_0, %alloca, %c8_i32, %value_outputs, %c2_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// TASKFLOW-NEXT: taskflow.yield values(%1 : i32) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { +// TASKFLOW-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): +// TASKFLOW-NEXT: affine.for %arg2 = 0 to 4 { +// TASKFLOW-NEXT: %1 = arith.index_cast %arg2 : index to i32 +// TASKFLOW-NEXT: %2 = arith.muli %1, %arg1 : i32 +// TASKFLOW-NEXT: affine.for 
%arg3 = 0 to 8 { +// TASKFLOW-NEXT: %3 = arith.index_cast %arg3 : index to i32 +// TASKFLOW-NEXT: %4 = arith.addi %2, %3 : i32 +// TASKFLOW-NEXT: affine.store %4, %arg0[%arg2, %arg3] : memref<4x8xi32> +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0), original_write_memrefs(%alloca)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { // TASKFLOW-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): // TASKFLOW-NEXT: affine.for %arg5 = 0 to 4 { // TASKFLOW-NEXT: %1 = arith.index_cast %arg5 : index to i32 // TASKFLOW-NEXT: %2 = arith.muli %1, %arg2 : i32 // TASKFLOW-NEXT: affine.for %arg6 = 0 to 8 { -// TASKFLOW-NEXT: %3 = arith.index_cast %arg6 : index to i32 -// TASKFLOW-NEXT: %4 = arith.addi %2, %3 : i32 -// TASKFLOW-NEXT: affine.store %4, %arg0[%arg5, %arg6] : memref<4x8xi32> -// TASKFLOW-NEXT: } -// TASKFLOW-NEXT: affine.for %arg6 = 0 to 8 { // TASKFLOW-NEXT: %3 = affine.load %arg0[%arg5, %arg6] : memref<4x8xi32> // TASKFLOW-NEXT: %4 = arith.addi %3, %arg3 : i32 // TASKFLOW-NEXT: affine.if #set(%arg5, %arg6) { @@ -86,133 +133,84 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%arg0, %arg1) <{operandSegmentSizes = array}> : (memref<4x8xi32>, memref) -> () -// TASKFLOW-NEXT: }) : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) -// TASKFLOW-NEXT: %0 = affine.load %memory_outputs#1[] : memref +// TASKFLOW-NEXT: taskflow.yield writes(%arg1 : memref) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %0 = affine.load %write_outputs_1[] : memref // TASKFLOW-NEXT: return %0 : i32 // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// HYPERBLOCK: 
module { -// HYPERBLOCK-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { -// HYPERBLOCK-NEXT: %c2_i32 = arith.constant 2 : i32 -// HYPERBLOCK-NEXT: %c8_i32 = arith.constant 8 : i32 -// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 -// HYPERBLOCK-NEXT: %alloca = memref.alloca() : memref -// HYPERBLOCK-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> -// HYPERBLOCK-NEXT: %value_outputs = "taskflow.task"(%c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg0: i32): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index -// HYPERBLOCK-NEXT: %2 = "taskflow.hyperblock"(%1, %arg0) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg1: index, %arg2: i32): -// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg1 : index to i32 -// HYPERBLOCK-NEXT: %4 = arith.addi %arg2, %3 : i32 -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) -// HYPERBLOCK-NEXT: }) : (index, i32) -> i32 -// HYPERBLOCK-NEXT: "taskflow.yield"(%2) <{operandSegmentSizes = array}> : (i32) -> () -// HYPERBLOCK-NEXT: }) : (i32) -> i32 -// HYPERBLOCK-NEXT: %memory_outputs:2 = "taskflow.task"(%alloca_0, %alloca, %c8_i32, %value_outputs, %c2_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// 
HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg5: index, %arg6: index): -// HYPERBLOCK-NEXT: %4 = arith.index_cast %arg5 : index to i32 -// HYPERBLOCK-NEXT: %5 = arith.muli %4, %arg2 : i32 -// HYPERBLOCK-NEXT: %6 = arith.index_cast %arg6 : index to i32 -// HYPERBLOCK-NEXT: %7 = arith.addi %5, %6 : i32 -// HYPERBLOCK-NEXT: memref.store %7, %arg0[%arg5, %arg6] : memref<4x8xi32> -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %3) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg5: index, %arg6: index): -// HYPERBLOCK-NEXT: %4 = memref.load %arg0[%arg5, %arg6] : memref<4x8xi32> -// HYPERBLOCK-NEXT: %5 = arith.addi %4, %arg3 : i32 -// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index -// HYPERBLOCK-NEXT: %c-3 = arith.constant -3 : index -// HYPERBLOCK-NEXT: %6 = arith.addi %arg5, %c-3 : index -// HYPERBLOCK-NEXT: %7 = arith.cmpi eq, %6, %c0 : index -// HYPERBLOCK-NEXT: %c-7 = arith.constant -7 : index -// HYPERBLOCK-NEXT: %8 = arith.addi %arg6, %c-7 : index -// HYPERBLOCK-NEXT: %9 = arith.cmpi eq, %8, %c0 : index -// HYPERBLOCK-NEXT: %10 = arith.andi %7, %9 : i1 -// HYPERBLOCK-NEXT: scf.if %10 { -// HYPERBLOCK-NEXT: memref.store %5, %arg1[] : memref -// HYPERBLOCK-NEXT: %11 = arith.muli %5, %arg4 : i32 -// HYPERBLOCK-NEXT: memref.store %11, %arg1[] : memref -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.yield"(%arg0, %arg1) <{operandSegmentSizes = array}> : (memref<4x8xi32>, memref) -> () -// HYPERBLOCK-NEXT: }) : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) -// HYPERBLOCK-NEXT: %0 = affine.load %memory_outputs#1[] : memref -// HYPERBLOCK-NEXT: return %0 : i32 -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: } - - - -// CANONICALIZE: module { -// CANONICALIZE-NEXT: func.func 
@_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { -// CANONICALIZE-NEXT: %c2_i32 = arith.constant 2 : i32 -// CANONICALIZE-NEXT: %c8_i32 = arith.constant 8 : i32 -// CANONICALIZE-NEXT: %c0_i32 = arith.constant 0 : i32 -// CANONICALIZE-NEXT: %alloca = memref.alloca() : memref -// CANONICALIZE-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> -// CANONICALIZE-NEXT: %value_outputs = "taskflow.task"(%c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg0: i32): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index -// CANONICALIZE-NEXT: %2 = "taskflow.hyperblock"(%1, %arg0) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg1: index, %arg2: i32): -// CANONICALIZE-NEXT: %3 = arith.index_cast %arg1 : index to i32 -// CANONICALIZE-NEXT: %4 = arith.addi %arg2, %3 : i32 -// CANONICALIZE-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) -// CANONICALIZE-NEXT: }) : (index, i32) -> i32 -// CANONICALIZE-NEXT: "taskflow.yield"(%2) <{operandSegmentSizes = array}> : (i32) -> () -// CANONICALIZE-NEXT: }) : (i32) -> i32 -// CANONICALIZE-NEXT: %memory_outputs = "taskflow.task"(%alloca_0, %c8_i32, %alloca_0) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32, %arg2: memref<4x8xi32>): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg3: index, %arg4: index): -// CANONICALIZE-NEXT: %3 = arith.index_cast %arg3 : index to 
i32 -// CANONICALIZE-NEXT: %4 = arith.muli %3, %arg1 : i32 -// CANONICALIZE-NEXT: %5 = arith.index_cast %arg4 : index to i32 -// CANONICALIZE-NEXT: %6 = arith.addi %4, %5 : i32 -// CANONICALIZE-NEXT: memref.store %6, %arg2[%arg3, %arg4] : memref<4x8xi32> -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg2) <{operandSegmentSizes = array}> : (memref<4x8xi32>) -> () -// CANONICALIZE-NEXT: }) : (memref<4x8xi32>, i32, memref<4x8xi32>) -> memref<4x8xi32> -// CANONICALIZE-NEXT: %memory_outputs_1 = "taskflow.task"(%memory_outputs, %alloca, %alloca_0, %value_outputs, %alloca, %c2_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_2"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: memref<4x8xi32>, %arg3: i32, %arg4: memref, %arg5: i32): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg6: index, %arg7: index): -// CANONICALIZE-NEXT: %3 = memref.load %arg2[%arg6, %arg7] : memref<4x8xi32> -// CANONICALIZE-NEXT: %4 = arith.addi %3, %arg3 : i32 -// CANONICALIZE-NEXT: %c0 = arith.constant 0 : index -// CANONICALIZE-NEXT: %c-3 = arith.constant -3 : index -// CANONICALIZE-NEXT: %5 = arith.addi %arg6, %c-3 : index -// CANONICALIZE-NEXT: %6 = arith.cmpi eq, %5, %c0 : index -// CANONICALIZE-NEXT: %c-7 = arith.constant -7 : index -// CANONICALIZE-NEXT: %7 = arith.addi %arg7, %c-7 : index -// CANONICALIZE-NEXT: %8 = arith.cmpi eq, %7, %c0 : index -// CANONICALIZE-NEXT: %9 = arith.andi %6, %8 : i1 -// CANONICALIZE-NEXT: scf.if %9 { -// CANONICALIZE-NEXT: memref.store %4, %arg4[] : memref 
-// CANONICALIZE-NEXT: %10 = arith.muli %4, %arg5 : i32 -// CANONICALIZE-NEXT: memref.store %10, %arg4[] : memref -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg4) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref<4x8xi32>, memref, memref<4x8xi32>, i32, memref, i32) -> memref -// CANONICALIZE-NEXT: %0 = affine.load %memory_outputs_1[] : memref -// CANONICALIZE-NEXT: return %0 : i32 -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } - +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %c2_i32 = arith.constant 2 : i32 +// HYPERBLOCK-NEXT: %c8_i32 = arith.constant 8 : i32 +// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 +// HYPERBLOCK-NEXT: %alloca = memref.alloca() : memref +// HYPERBLOCK-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> +// HYPERBLOCK-NEXT: %value_outputs = taskflow.task @Task_0 value_inputs(%c0_i32 : i32) : (i32) -> (i32) { +// HYPERBLOCK-NEXT: ^bb0(%arg0: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index +// HYPERBLOCK-NEXT: %2 = "taskflow.hyperblock"(%1, %arg0) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg1: index, %arg2: i32): +// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg1 : index to i32 +// HYPERBLOCK-NEXT: %4 = arith.addi %arg2, %3 : i32 +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) +// HYPERBLOCK-NEXT: }) : (index, i32) -> i32 +// HYPERBLOCK-NEXT: taskflow.yield values(%2 : i32) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { +// 
HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg2: index): +// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg2 : index to i32 +// HYPERBLOCK-NEXT: %4 = arith.muli %3, %arg1 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index +// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index +// HYPERBLOCK-NEXT: scf.for %arg3 = %c0 to %c8 step %c1 { +// HYPERBLOCK-NEXT: %5 = arith.index_cast %arg3 : index to i32 +// HYPERBLOCK-NEXT: %6 = arith.addi %4, %5 : i32 +// HYPERBLOCK-NEXT: memref.store %6, %arg0[%arg2, %arg3] : memref<4x8xi32> +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0), original_write_memrefs(%alloca)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ +// 
HYPERBLOCK-NEXT: ^bb0(%arg5: index): +// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg5 : index to i32 +// HYPERBLOCK-NEXT: %4 = arith.muli %3, %arg2 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index +// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index +// HYPERBLOCK-NEXT: scf.for %arg6 = %c0 to %c8 step %c1 { +// HYPERBLOCK-NEXT: %5 = memref.load %arg0[%arg5, %arg6] : memref<4x8xi32> +// HYPERBLOCK-NEXT: %6 = arith.addi %5, %arg3 : i32 +// HYPERBLOCK-NEXT: %c0_2 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c-3 = arith.constant -3 : index +// HYPERBLOCK-NEXT: %7 = arith.addi %arg5, %c-3 : index +// HYPERBLOCK-NEXT: %8 = arith.cmpi eq, %7, %c0_2 : index +// HYPERBLOCK-NEXT: %c-7 = arith.constant -7 : index +// HYPERBLOCK-NEXT: %9 = arith.addi %arg6, %c-7 : index +// HYPERBLOCK-NEXT: %10 = arith.cmpi eq, %9, %c0_2 : index +// HYPERBLOCK-NEXT: %11 = arith.andi %8, %10 : i1 +// HYPERBLOCK-NEXT: scf.if %11 { +// HYPERBLOCK-NEXT: memref.store %6, %arg1[] : memref +// HYPERBLOCK-NEXT: %12 = arith.muli %6, %arg4 : i32 +// HYPERBLOCK-NEXT: memref.store %12, %arg1[] : memref +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg1 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[] : memref +// HYPERBLOCK-NEXT: return %0 : i32 +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT:} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index c5f75f28..509614a1 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -1,18 +1,18 @@ -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: -o %t.serialized.mlir +// RUN: 
FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED + +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: -o %t.taskflow.mlir // RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: -o %t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ -// RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ -// RUN: -o %t.canonicalized.mlir -// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE - module attributes {} { func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { affine.for %arg10 = 0 to 4 { @@ -52,183 +52,204 @@ module attributes {} { } } +// SERIALIZED: module { +// SERIALIZED-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// SERIALIZED-NEXT: affine.for %arg10 = 0 to 4 { +// SERIALIZED-NEXT: affine.for %arg11 = 0 to 8 { +// SERIALIZED-NEXT: affine.for %arg12 = 0 to 6 { +// SERIALIZED-NEXT: %1 = affine.load %arg0[%arg10, %arg11, %arg12] : memref +// SERIALIZED-NEXT: affine.store %1, %arg5[%arg12] : memref +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg10 = 0 to 4 { +// SERIALIZED-NEXT: affine.for 
%arg11 = 0 to 8 { +// SERIALIZED-NEXT: affine.for %arg12 = 0 to 5 { +// SERIALIZED-NEXT: %1 = affine.load %arg1[%arg10, %arg11, %arg12] : memref +// SERIALIZED-NEXT: %2 = affine.load %arg2[%arg10, %arg11, %arg12] : memref +// SERIALIZED-NEXT: %3 = arith.addi %1, %2 : i32 +// SERIALIZED-NEXT: affine.store %3, %arg6[%arg12] : memref +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg10 = 0 to 4 { +// SERIALIZED-NEXT: affine.for %arg11 = 0 to 8 { +// SERIALIZED-NEXT: affine.for %arg12 = 0 to 6 { +// SERIALIZED-NEXT: %1 = affine.load %arg5[%arg12] : memref +// SERIALIZED-NEXT: %2 = affine.load %arg6[%arg12] : memref +// SERIALIZED-NEXT: %3 = arith.addi %1, %2 : i32 +// SERIALIZED-NEXT: %4 = affine.load %arg9[0] : memref +// SERIALIZED-NEXT: %5 = arith.addi %4, %3 : i32 +// SERIALIZED-NEXT: affine.store %5, %arg9[0] : memref +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg10 = 0 to 4 { +// SERIALIZED-NEXT: affine.for %arg11 = 0 to 7 { +// SERIALIZED-NEXT: %1 = affine.load %arg3[%arg10, %arg11] : memref +// SERIALIZED-NEXT: affine.store %1, %arg7[%arg11] : memref +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg10 = 0 to 4 { +// SERIALIZED-NEXT: affine.for %arg11 = 0 to 9 { +// SERIALIZED-NEXT: %1 = affine.load %arg4[%arg10, %arg11] : memref +// SERIALIZED-NEXT: %2 = affine.load %arg7[%arg11] : memref +// SERIALIZED-NEXT: %3 = arith.addi %1, %2 : i32 +// SERIALIZED-NEXT: affine.store %3, %arg8[%arg11] : memref +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: %0 = affine.load %arg9[0] : memref +// SERIALIZED-NEXT: return %0 : i32 +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } + // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, 
%arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// TASKFLOW-NEXT: %memory_outputs:5 = "taskflow.task"(%arg0, %arg1, %arg2, %arg5, %arg6, %arg9, %arg3, %arg4, %arg7, %arg8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref, %arg16: memref, %arg17: memref, %arg18: memref, %arg19: memref): -// TASKFLOW-NEXT: affine.for %arg20 = 0 to 4 { -// TASKFLOW-NEXT: affine.for %arg21 = 0 to 8 { -// TASKFLOW-NEXT: affine.for %arg22 = 0 to 6 { -// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg20, %arg21, %arg22] : memref -// TASKFLOW-NEXT: affine.store %1, %arg13[%arg22] : memref +// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0), original_write_memrefs(%arg5)] : (memref, memref) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref): +// TASKFLOW-NEXT: affine.for %arg12 = 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg13 = 0 to 8 { +// TASKFLOW-NEXT: affine.for %arg14 = 0 to 6 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg12, %arg13, %arg14] : memref +// TASKFLOW-NEXT: affine.store %1, %arg11[%arg14] : memref // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: affine.for %arg22 = 0 to 5 { -// TASKFLOW-NEXT: %1 = affine.load %arg11[%arg20, %arg21, %arg22] : memref -// TASKFLOW-NEXT: %2 = affine.load %arg12[%arg20, %arg21, %arg22] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg11 : memref) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2), original_write_memrefs(%arg6)] : (memref, memref, memref) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): +// TASKFLOW-NEXT: affine.for %arg13 
= 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg14 = 0 to 8 { +// TASKFLOW-NEXT: affine.for %arg15 = 0 to 5 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg13, %arg14, %arg15] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg11[%arg13, %arg14, %arg15] : memref // TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 -// TASKFLOW-NEXT: affine.store %3, %arg14[%arg22] : memref +// TASKFLOW-NEXT: affine.store %3, %arg12[%arg15] : memref // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: affine.for %arg22 = 0 to 6 { -// TASKFLOW-NEXT: %1 = affine.load %arg13[%arg22] : memref -// TASKFLOW-NEXT: %2 = affine.load %arg14[%arg22] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg12 : memref) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9), original_write_memrefs(%arg9)] : (memref, memref, memref, memref) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): +// TASKFLOW-NEXT: affine.for %arg14 = 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg15 = 0 to 8 { +// TASKFLOW-NEXT: affine.for %arg16 = 0 to 6 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg16] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg11[%arg16] : memref // TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 -// TASKFLOW-NEXT: %4 = affine.load %arg15[0] : memref +// TASKFLOW-NEXT: %4 = affine.load %arg13[0] : memref // TASKFLOW-NEXT: %5 = arith.addi %4, %3 : i32 -// TASKFLOW-NEXT: affine.store %5, %arg15[0] : memref +// TASKFLOW-NEXT: affine.store %5, %arg13[0] : memref // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: affine.for %arg21 = 0 to 7 { -// TASKFLOW-NEXT: %1 = affine.load %arg16[%arg20, %arg21] : memref -// TASKFLOW-NEXT: affine.store %1, %arg18[%arg21] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg13 : memref) +// 
TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_2 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3), original_write_memrefs(%arg7)] : (memref, memref) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref): +// TASKFLOW-NEXT: affine.for %arg12 = 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg13 = 0 to 7 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg12, %arg13] : memref +// TASKFLOW-NEXT: affine.store %1, %arg11[%arg13] : memref // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: affine.for %arg21 = 0 to 9 { -// TASKFLOW-NEXT: %1 = affine.load %arg17[%arg20, %arg21] : memref -// TASKFLOW-NEXT: %2 = affine.load %arg18[%arg21] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg11 : memref) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_3 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_2 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7), original_write_memrefs(%arg8)] : (memref, memref, memref) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): +// TASKFLOW-NEXT: affine.for %arg13 = 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg14 = 0 to 9 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg13, %arg14] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg11[%arg14] : memref // TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 -// TASKFLOW-NEXT: affine.store %3, %arg19[%arg21] : memref +// TASKFLOW-NEXT: affine.store %3, %arg12[%arg14] : memref // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%arg13, %arg14, %arg15, %arg18, %arg19) <{operandSegmentSizes = array}> : (memref, memref, memref, memref, memref) -> () -// TASKFLOW-NEXT: }) : (memref, memref, memref, memref, memref, memref, memref, memref, memref, memref) -> (memref, memref, memref, memref, memref) -// TASKFLOW-NEXT: %0 = affine.load %memory_outputs#2[0] : memref +// TASKFLOW-NEXT: taskflow.yield writes(%arg12 : memref) 
+// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %0 = affine.load %write_outputs_1[0] : memref // TASKFLOW-NEXT: return %0 : i32 // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// HYPERBLOCK: module { -// HYPERBLOCK-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// HYPERBLOCK-NEXT: %memory_outputs:5 = "taskflow.task"(%arg0, %arg1, %arg2, %arg5, %arg6, %arg9, %arg3, %arg4, %arg7, %arg8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref, %arg16: memref, %arg17: memref, %arg18: memref, %arg19: memref): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index -// HYPERBLOCK-NEXT: %4 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index -// HYPERBLOCK-NEXT: %5 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index -// HYPERBLOCK-NEXT: %6 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index -// HYPERBLOCK-NEXT: %7 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2, %3) <{operandSegmentSizes = 
array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index, %arg22: index): -// HYPERBLOCK-NEXT: %8 = memref.load %arg10[%arg20, %arg21, %arg22] : memref -// HYPERBLOCK-NEXT: memref.store %8, %arg13[%arg22] : memref -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2, %4) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index, %arg22: index): -// HYPERBLOCK-NEXT: %8 = memref.load %arg11[%arg20, %arg21, %arg22] : memref -// HYPERBLOCK-NEXT: %9 = memref.load %arg12[%arg20, %arg21, %arg22] : memref -// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 -// HYPERBLOCK-NEXT: memref.store %10, %arg14[%arg22] : memref -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%5) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg20: index): -// HYPERBLOCK-NEXT: %8 = memref.load %arg13[%arg20] : memref -// HYPERBLOCK-NEXT: %9 = memref.load %arg14[%arg20] : memref -// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 -// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index -// HYPERBLOCK-NEXT: %11 = memref.load %arg15[%c0] : memref -// HYPERBLOCK-NEXT: %12 = arith.addi %11, %10 : i32 -// HYPERBLOCK-NEXT: %c0_0 = arith.constant 0 : index -// HYPERBLOCK-NEXT: memref.store %12, %arg15[%c0_0] : memref -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %6) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index): -// HYPERBLOCK-NEXT: %8 = memref.load %arg16[%arg20, %arg21] : memref -// HYPERBLOCK-NEXT: memref.store %8, %arg18[%arg21] : memref -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %7) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: 
^bb0(%arg20: index, %arg21: index): -// HYPERBLOCK-NEXT: %8 = memref.load %arg17[%arg20, %arg21] : memref -// HYPERBLOCK-NEXT: %9 = memref.load %arg18[%arg21] : memref -// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 -// HYPERBLOCK-NEXT: memref.store %10, %arg19[%arg21] : memref -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.yield"(%arg13, %arg14, %arg15, %arg18, %arg19) <{operandSegmentSizes = array}> : (memref, memref, memref, memref, memref) -> () -// HYPERBLOCK-NEXT: }) : (memref, memref, memref, memref, memref, memref, memref, memref, memref, memref) -> (memref, memref, memref, memref, memref) -// HYPERBLOCK-NEXT: %0 = affine.load %memory_outputs#2[0] : memref -// HYPERBLOCK-NEXT: return %0 : i32 -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: } - -// CANONICALIZE: module { -// CANONICALIZE-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// CANONICALIZE-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg5, %arg0, %arg5) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2, %3) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg14: 
index, %arg15: index, %arg16: index): -// CANONICALIZE-NEXT: %4 = memref.load %arg12[%arg14, %arg15, %arg16] : memref -// CANONICALIZE-NEXT: memref.store %4, %arg13[%arg16] : memref -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg13) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, memref, memref) -> memref -// CANONICALIZE-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg6, %arg1, %arg2, %arg6) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2, %3) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg16: index, %arg17: index, %arg18: index): -// CANONICALIZE-NEXT: %4 = memref.load %arg13[%arg16, %arg17, %arg18] : memref -// CANONICALIZE-NEXT: %5 = memref.load %arg14[%arg16, %arg17, %arg18] : memref -// CANONICALIZE-NEXT: %6 = arith.addi %4, %5 : i32 -// CANONICALIZE-NEXT: memref.store %6, %arg15[%arg18] : memref -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg15) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, memref, memref, memref, memref) -> memref -// CANONICALIZE-NEXT: %memory_outputs_1 = 
"taskflow.task"(%memory_outputs, %memory_outputs_0, %arg9, %arg5, %arg6, %arg9) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_2"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%3) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg16: index): -// CANONICALIZE-NEXT: %4 = memref.load %arg13[%arg16] : memref -// CANONICALIZE-NEXT: %5 = memref.load %arg14[%arg16] : memref -// CANONICALIZE-NEXT: %6 = arith.addi %4, %5 : i32 -// CANONICALIZE-NEXT: %c0 = arith.constant 0 : index -// CANONICALIZE-NEXT: %7 = memref.load %arg15[%c0] : memref -// CANONICALIZE-NEXT: %8 = arith.addi %7, %6 : i32 -// CANONICALIZE-NEXT: %c0_4 = arith.constant 0 : index -// CANONICALIZE-NEXT: memref.store %8, %arg15[%c0_4] : memref -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg15) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, memref, memref, memref, memref) -> memref -// CANONICALIZE-NEXT: %memory_outputs_2 = "taskflow.task"(%arg3, %arg7, %arg3, %arg7) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_3"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// 
CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg14: index, %arg15: index): -// CANONICALIZE-NEXT: %3 = memref.load %arg12[%arg14, %arg15] : memref -// CANONICALIZE-NEXT: memref.store %3, %arg13[%arg15] : memref -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg13) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, memref, memref) -> memref -// CANONICALIZE-NEXT: %memory_outputs_3 = "taskflow.task"(%arg4, %memory_outputs_2, %arg8, %arg4, %arg7, %arg8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_4"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg16: index, %arg17: index): -// CANONICALIZE-NEXT: %3 = memref.load %arg13[%arg16, %arg17] : memref -// CANONICALIZE-NEXT: %4 = memref.load %arg14[%arg17] : memref -// CANONICALIZE-NEXT: %5 = arith.addi %3, %4 : i32 -// CANONICALIZE-NEXT: memref.store %5, %arg15[%arg17] : memref -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg15) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, memref, memref, memref, memref) -> memref -// 
CANONICALIZE-NEXT: %0 = affine.load %memory_outputs_1[0] : memref -// CANONICALIZE-NEXT: return %0 : i32 -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } \ No newline at end of file +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0), original_write_memrefs(%arg5)] : (memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2, %3) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg12: index, %arg13: index, %arg14: index): +// HYPERBLOCK-NEXT: %4 = memref.load %arg10[%arg12, %arg13, %arg14] : memref +// HYPERBLOCK-NEXT: memref.store %4, %arg11[%arg14] : memref +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index, index, index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg11 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2), original_write_memrefs(%arg6)] : (memref, memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: 
memref, %arg12: memref): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2, %3) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg13: index, %arg14: index, %arg15: index): +// HYPERBLOCK-NEXT: %4 = memref.load %arg10[%arg13, %arg14, %arg15] : memref +// HYPERBLOCK-NEXT: %5 = memref.load %arg11[%arg13, %arg14, %arg15] : memref +// HYPERBLOCK-NEXT: %6 = arith.addi %4, %5 : i32 +// HYPERBLOCK-NEXT: memref.store %6, %arg12[%arg15] : memref +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index, index, index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg12 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9), original_write_memrefs(%arg9)] : (memref, memref, memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%3) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: 
^bb0(%arg14: index): +// HYPERBLOCK-NEXT: %4 = memref.load %arg10[%arg14] : memref +// HYPERBLOCK-NEXT: %5 = memref.load %arg11[%arg14] : memref +// HYPERBLOCK-NEXT: %6 = arith.addi %4, %5 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %7 = memref.load %arg13[%c0] : memref +// HYPERBLOCK-NEXT: %8 = arith.addi %7, %6 : i32 +// HYPERBLOCK-NEXT: %c0_4 = arith.constant 0 : index +// HYPERBLOCK-NEXT: memref.store %8, %arg13[%c0_4] : memref +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg13 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_2 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3), original_write_memrefs(%arg7)] : (memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg12: index, %arg13: index): +// HYPERBLOCK-NEXT: %3 = memref.load %arg10[%arg12, %arg13] : memref +// HYPERBLOCK-NEXT: memref.store %3, %arg11[%arg13] : memref +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index, index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg11 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_3 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_2 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7), original_write_memrefs(%arg8)] : (memref, memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): +// 
HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg13: index, %arg14: index): +// HYPERBLOCK-NEXT: %3 = memref.load %arg10[%arg13, %arg14] : memref +// HYPERBLOCK-NEXT: %4 = memref.load %arg11[%arg14] : memref +// HYPERBLOCK-NEXT: %5 = arith.addi %3, %4 : i32 +// HYPERBLOCK-NEXT: memref.store %5, %arg12[%arg14] : memref +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index, index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg12 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[0] : memref +// HYPERBLOCK-NEXT: return %0 : i32 +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT:} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index ee37c831..dee9c268 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -1,18 +1,18 @@ -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: -o %t.serialized.mlir +// RUN: FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED + +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: -o %t.taskflow.mlir // RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: -o 
%t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ -// RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ -// RUN: -o %t.canonicalized.mlir -// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE - module { // Example: Parallel nested loops scenario // Task 0: Single-level loop (vector scaling) @@ -44,18 +44,37 @@ module { } } +// SERIALIZED: module { +// SERIALIZED-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { +// SERIALIZED-NEXT: affine.for %arg5 = 0 to 16 { +// SERIALIZED-NEXT: %0 = affine.load %arg0[%arg5] : memref<16xf32> +// SERIALIZED-NEXT: %1 = arith.mulf %0, %arg4 : f32 +// SERIALIZED-NEXT: affine.store %1, %arg0[%arg5] : memref<16xf32> +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg5 = 0 to 8 { +// SERIALIZED-NEXT: affine.for %arg6 = 0 to 8 { +// SERIALIZED-NEXT: %0 = affine.load %arg1[%arg5, %arg6] : memref<8x8xf32> +// SERIALIZED-NEXT: %1 = affine.load %arg2[%arg5, %arg6] : memref<8x8xf32> +// SERIALIZED-NEXT: %2 = arith.mulf %0, %1 : f32 +// SERIALIZED-NEXT: affine.store %2, %arg3[%arg5, %arg6] : memref<8x8xf32> +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: return +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } + // TASKFLOW: module { // TASKFLOW-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { -// TASKFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// TASKFLOW-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: f32): -// TASKFLOW-NEXT: affine.for %arg7 = 0 to 16 { -// TASKFLOW-NEXT: %0 = affine.load %arg5[%arg7] : memref<16xf32> -// TASKFLOW-NEXT: %1 = arith.mulf %0, %arg6 : f32 
-// TASKFLOW-NEXT: affine.store %1, %arg5[%arg7] : memref<16xf32> +// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0), original_write_memrefs(%arg0)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>) { +// TASKFLOW-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: memref<16xf32>, %arg7: f32): +// TASKFLOW-NEXT: affine.for %arg8 = 0 to 16 { +// TASKFLOW-NEXT: %0 = affine.load %arg6[%arg8] : memref<16xf32> +// TASKFLOW-NEXT: %1 = arith.mulf %0, %arg7 : f32 +// TASKFLOW-NEXT: affine.store %1, %arg6[%arg8] : memref<16xf32> // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%arg5) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () -// TASKFLOW-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> -// TASKFLOW-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// TASKFLOW-NEXT: taskflow.yield writes(%arg6 : memref<16xf32>) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2), original_write_memrefs(%arg3)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>) { // TASKFLOW-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): // TASKFLOW-NEXT: affine.for %arg8 = 0 to 8 { // TASKFLOW-NEXT: affine.for %arg9 = 0 to 8 { @@ -65,27 +84,27 @@ module { // TASKFLOW-NEXT: affine.store %2, %arg7[%arg8, %arg9] : memref<8x8xf32> // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>) -> () -// TASKFLOW-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> +// TASKFLOW-NEXT: taskflow.yield writes(%arg7 : memref<8x8xf32>) +// 
TASKFLOW-NEXT: } // TASKFLOW-NEXT: return // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { -// HYPERBLOCK-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: f32): +// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0), original_write_memrefs(%arg0)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>) { +// HYPERBLOCK-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: memref<16xf32>, %arg7: f32): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index // HYPERBLOCK-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg7: index): -// HYPERBLOCK-NEXT: %1 = memref.load %arg5[%arg7] : memref<16xf32> -// HYPERBLOCK-NEXT: %2 = arith.mulf %1, %arg6 : f32 -// HYPERBLOCK-NEXT: memref.store %2, %arg5[%arg7] : memref<16xf32> +// HYPERBLOCK-NEXT: ^bb0(%arg8: index): +// HYPERBLOCK-NEXT: %1 = memref.load %arg6[%arg8] : memref<16xf32> +// HYPERBLOCK-NEXT: %2 = arith.mulf %1, %arg7 : f32 +// HYPERBLOCK-NEXT: memref.store %2, %arg6[%arg8] : memref<16xf32> // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: "taskflow.yield"(%arg5) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () -// HYPERBLOCK-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> -// HYPERBLOCK-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg6 : 
memref<16xf32>) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2), original_write_memrefs(%arg3)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>) { // HYPERBLOCK-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index // HYPERBLOCK-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -97,40 +116,8 @@ module { // HYPERBLOCK-NEXT: memref.store %4, %arg7[%arg8, %arg9] : memref<8x8xf32> // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.yield"(%arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>) -> () -// HYPERBLOCK-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg7 : memref<8x8xf32>) +// HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: return // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: } - -// CANONICALIZE: module { -// CANONICALIZE-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { -// CANONICALIZE-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: f32): -// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg7: index): -// CANONICALIZE-NEXT: %1 = 
memref.load %arg5[%arg7] : memref<16xf32> -// CANONICALIZE-NEXT: %2 = arith.mulf %1, %arg6 : f32 -// CANONICALIZE-NEXT: memref.store %2, %arg5[%arg7] : memref<16xf32> -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg5) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () -// CANONICALIZE-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> -// CANONICALIZE-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): -// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%0, %1) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg8: index, %arg9: index): -// CANONICALIZE-NEXT: %2 = memref.load %arg5[%arg8, %arg9] : memref<8x8xf32> -// CANONICALIZE-NEXT: %3 = memref.load %arg6[%arg8, %arg9] : memref<8x8xf32> -// CANONICALIZE-NEXT: %4 = arith.mulf %2, %3 : f32 -// CANONICALIZE-NEXT: memref.store %4, %arg7[%arg8, %arg9] : memref<8x8xf32> -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>) -> () -// CANONICALIZE-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> -// CANONICALIZE-NEXT: return -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } \ No newline at end of file +// HYPERBLOCK-NEXT: } \ No newline at end of file From a5817e8aefe023a9a2dd4737fb77e9171d395f4e Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sun, 1 Feb 2026 23:42:17 +0800 
Subject: [PATCH 6/9] update submodule --- test/e2e/bicg/bicg_int_kernel.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/bicg/bicg_int_kernel.mlir b/test/e2e/bicg/bicg_int_kernel.mlir index 32f17705..f9aa4d3d 100644 --- a/test/e2e/bicg/bicg_int_kernel.mlir +++ b/test/e2e/bicg/bicg_int_kernel.mlir @@ -11,7 +11,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ From bb64e5e6a526f297431506783f7d9e5cec8fdbfb Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Mon, 2 Feb 2026 23:20:04 +0800 Subject: [PATCH 7/9] enhance with sibling loops detection --- .../ConstructHyperblockFromTaskPass.cpp | 131 +++++++++++++++--- 1 file changed, 108 insertions(+), 23 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index bb503c5d..41743149 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -182,53 +182,94 @@ getTopLevelLoopsInfo(SmallVector &loops_info) { //---------------------------------------------------------------------------- // Prologue-Loop-Epilogue Code (PLE) Pattern Detection //---------------------------------------------------------------------------- -// Prologue-Loop-Epilogue Code means code that appears before and after an inner -// loop. Example: for %i (outer loop) { -// -// for %j (nested loop) { -// +// Extended PLE Pattern: Also handles sibling loops without prologue/epilogue. 
+// Pattern 1: Prologue/Epilogue exists +// for %i { +// +// for %j { } +// +// } +// +// Pattern 2: Sibling loops (no prologue/epilogue) +// for %i { +// for %j { } +// for %k { } ← Sibling loop // } -// ← Loop-Epilogue Code -// } // For this pattern, we need to wrap the inner loop and the prologue-epilogue // code into a hyperblock. Only by doing this can we maintain the hyperblock as // a pure data-driven code block. struct PLEPattern { affine::AffineForOp outer_loop; - affine::AffineForOp inner_loop; + // Supports multiple sibling loops. + SmallVector inner_loops; SmallVector prologue_code; SmallVector epilogue_code; + // Code between sibling loops. + SmallVector> inter_loop_code; bool has_ple_pattern = false; + bool has_sibling_loops = false; }; // Detects Prologue-Loop-Epilogue Code pattern in the task. static PLEPattern detectPLEPattern(affine::AffineForOp outer_loop) { PLEPattern pattern; pattern.has_ple_pattern = false; + pattern.has_sibling_loops = false; pattern.outer_loop = outer_loop; Block &body = outer_loop.getRegion().front(); - bool found_nested_loop = false; + SmallVector current_segment; for (Operation &op : body.getOperations()) { if (auto nested_for = dyn_cast(&op)) { - found_nested_loop = true; - if (!pattern.inner_loop) { - pattern.inner_loop = nested_for; - } - } else if (!(isa(&op) && op.getOperands().empty())) { - if (!found_nested_loop) { - pattern.prologue_code.push_back(&op); + // Finds a nested loop. + if (pattern.inner_loops.empty()) { + // First nested loop - everything before is prologue. + if (!current_segment.empty()) { + pattern.prologue_code = current_segment; + pattern.has_ple_pattern = true; + current_segment.clear(); + } } else { - pattern.epilogue_code.push_back(&op); + // Second or later nested loop - everything before is inter-loop code. 
+ pattern.has_sibling_loops = true; pattern.has_ple_pattern = true; + + if (!current_segment.empty()) { + pattern.inter_loop_code.push_back(current_segment); + current_segment.clear(); + } else { + // No operations between loops, add empty segment + pattern.inter_loop_code.push_back({}); + } } + + pattern.inner_loops.push_back(nested_for); + + } else if (!(isa(&op) && op.getOperands().empty())) { + // Regular operation - add to current segment. + current_segment.push_back(&op); } } - if (found_nested_loop && (!pattern.prologue_code.empty())) { + // Any remaining operations after all loops are epilogue. + if (!current_segment.empty() && !pattern.inner_loops.empty()) { + pattern.epilogue_code = current_segment; + pattern.has_ple_pattern = true; + } + + // If we have sibling loops (even without prologue/epilogue), + // it's still a PLE pattern that needs special handling. + if (pattern.inner_loops.size() > 1) { + pattern.has_ple_pattern = true; + pattern.has_sibling_loops = true; + } + + // If we have prologue/epilogue with at least one loop, it's a PLE pattern. + if (!pattern.inner_loops.empty() && + (!pattern.prologue_code.empty() || !pattern.epilogue_code.empty())) { pattern.has_ple_pattern = true; } @@ -280,15 +321,30 @@ static void extractHyperblocksInfoFromRegion( current_block_ops.clear(); } - // 2. Creates a hyperblock for the prologue + inner loop + epilogue. + // 2. Creates a hyperblock for: + // // prologue + loop1 + inter_code1 + loop2 + inter_code2 + ... + + // epilogue. HyperblockInfo info; + + // Adds prologue code. if (!ple_pattern.prologue_code.empty()) { info.operations.append(ple_pattern.prologue_code.begin(), ple_pattern.prologue_code.end()); } - info.operations.push_back(ple_pattern.inner_loop); + // Adds loops and inter-loop code. + for (size_t i = 0; i < ple_pattern.inner_loops.size(); ++i) { + info.operations.push_back(ple_pattern.inner_loops[i]); + // Adds inter-loop code if exists. 
+ if (i < ple_pattern.inter_loop_code.size() && + !ple_pattern.inter_loop_code[i].empty()) { + info.operations.append(ple_pattern.inter_loop_code[i].begin(), + ple_pattern.inter_loop_code[i].end()); + } + } + + // Adds epilogue code. if (!ple_pattern.epilogue_code.empty()) { info.operations.append(ple_pattern.epilogue_code.begin(), ple_pattern.epilogue_code.end()); @@ -608,6 +664,9 @@ static LogicalResult transformTask(TaskflowTaskOp task_op) { // Step 4: Creates taskflow.hyperblock operations for each hyperblock. builder.setInsertionPoint(first_loop_op); + // Stores mappings from loop results to hyperblock outputs. + DenseMap loop_result_to_hyperblock_output; + // Creates hyperblock ops. for (auto &info : hyperblocks_info) { TaskflowHyperblockOp hyperblock_op = @@ -622,25 +681,51 @@ static LogicalResult transformTask(TaskflowTaskOp task_op) { for (auto [loop_result, hb_result] : llvm::zip(loop_results, hyperblock_results)) { - loop_result.replaceAllUsesWith(hb_result); + loop_result_to_hyperblock_output[loop_result] = hb_result; } } } + // Step 5: Replaces loop results with hyperblock outputs BEFORE erasing loops. + for (auto [loop_result, hb_output] : loop_result_to_hyperblock_output) { + loop_result.replaceAllUsesWith(hb_output); + } + // Step 6: Collects and erases original loop operations. // Collects all operations to erase. SmallVector ops_to_erase; + // First pass: collects all affine.for operations recursively. + // We need to erase them in reverse order (inner loops first). + SmallVector loops_to_erase; + task_op.walk( + [&](affine::AffineForOp loop) { loops_to_erase.push_back(loop); }); + + // Reverses the order so we erase innermost loops first. + std::reverse(loops_to_erase.begin(), loops_to_erase.end()); + + // Second pass: collects non-loop, non-taskflow operations. for (Operation &op : llvm::make_early_inc_range(task_body->getOperations())) { - if (!isa(&op)) { + if (!isa(&op)) { ops_to_erase.push_back(&op); } } - // Erases original operations. 
+ // Step 7: Erases operations. + // First erases non-loop operations. for (Operation *op : ops_to_erase) { + // Makes sure all uses are replaced before erasing. + assert(op->use_empty() && "Operation still has uses before erasing"); op->erase(); } + // Then erases loops (innermost first). + for (affine::AffineForOp loop : loops_to_erase) { + // Makes sure the loop results have been replaced before erasing. + assert(loop->use_empty() && "Loop still has uses before erasing"); + loop->erase(); + } + return success(); } From ddde493732db8901be0e32e66723a2abbe64600c Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Tue, 3 Feb 2026 01:43:40 +0800 Subject: [PATCH 8/9] simplify the construct-hyperblock-from-task logic --- include/TaskflowDialect/TaskflowPasses.td | 6 +- .../ConstructHyperblockFromTaskPass.cpp | 904 ++++++------ .../irregular-loop/irregular-loop.mlir | 147 ++- 3 files changed, 360 insertions(+), 697 deletions(-) diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 7c6b5a17..ccc2a711 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -24,10 +24,10 @@ def AffineLoopTreeSerialization : Pass<"affine-loop-tree-serialization", "Module //=========================================================// // Passes for the Taskflow dialect //=========================================================// -def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { - let summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; +def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp">{ + let summary = "Constructs hyperblocks from Taskflow tasks by detecting perfect nested loop bands"; let description = [{ - This pass constructs hyperblocks and counter chain from Taskflow tasks. + This pass constructs hyperblocks from Taskflow tasks by detecting perfect nested loop bands. 
}]; let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; } diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index 41743149..acc58fa2 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -8,720 +8,385 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/IRMapping.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/Value.h" #include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/LogicalResult.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include - +#include using namespace mlir; using namespace mlir::taskflow; namespace { -//--------------------------------------------------------------------------- -// Loop Info Structure. -//---------------------------------------------------------------------------- -struct LoopInfo { - affine::AffineForOp for_op; - int lower_bound; - int upper_bound; - int step; - - // For nested loops. - LoopInfo *parent_loop_info = nullptr; - SmallVector child_loops; - - // Generated counter index. - Value counter_index; -}; - -//--------------------------------------------------------------------------- -// Hyperblock Info Structure. -//---------------------------------------------------------------------------- -// Represents a code block that should become a hyperblock. -struct HyperblockInfo { - // The operations that belong to this hyperblock. - SmallVector operations; - // The counter indices that trigger this hyperblock (empty for top-level - // operations before any loops). 
- SmallVector trigger_indices; +//============================================================================== +// Perfect Loop Band Detection. +//============================================================================== - // Whether this hyperblock is nested within loops. - bool is_loop_body = false; +// A perfect loop band is a sequence of perfectly nested loops where each loop +// (except the innermost) has exactly one child loop and no other operations +// (no prologue/epilogue). +struct PerfectLoopBand { + // Outer to inner loop order. + SmallVector loops; - // The corresponding loop. - affine::AffineForOp loop_op = nullptr; - - // Marks if this hyperblock follows the PLE pattern. - bool is_ple_pattern = false; + bool isEmpty() const { return loops.empty(); } + size_t getDepth() const { return loops.size(); } }; -//---------------------------------------------------------------------------- -// Helper Functions. -//---------------------------------------------------------------------------- -// Extracts loop parameters from affine.for operation. -static std::optional extractLoopBound(affine::AffineForOp for_op) { - LoopInfo loop_info; - loop_info.for_op = for_op; - - // Gets lower bound. - if (for_op.hasConstantLowerBound()) { - loop_info.lower_bound = for_op.getConstantLowerBound(); - } else { - return std::nullopt; - } - - // Gets upper bound. - if (for_op.hasConstantUpperBound()) { - loop_info.upper_bound = for_op.getConstantUpperBound(); - } else { - return std::nullopt; - } - - // Gets step. - loop_info.step = for_op.getStepAsInt(); - - return loop_info; -} - -// Collects all affine.for loops and builds loop hierarchy. -static SmallVector collectLoopInfo(TaskflowTaskOp task_op) { - SmallVector loops_info; - DenseMap op_to_loopinfo; - - // Step 1: Collects all loops with its parameter. 
- task_op.walk([&](affine::AffineForOp for_op) { - auto info = extractLoopBound(for_op); - if (!info) { - assert(false && "Non-constant loop bounds are not supported."); - } - - loops_info.push_back(*info); - op_to_loopinfo[for_op.getOperation()] = &loops_info.back(); - }); - - // Step 2: Builds parent-child relationships among loops. - for (auto &loop_info : loops_info) { - Operation *parent_op = loop_info.for_op->getParentOp(); - if (auto parent_for = dyn_cast(parent_op)) { - if (op_to_loopinfo.count(parent_for.getOperation())) { - LoopInfo *parent_loop_info = op_to_loopinfo[parent_for.getOperation()]; - loop_info.parent_loop_info = parent_loop_info; - parent_loop_info->child_loops.push_back(&loop_info); +// Detects the maximal perfect loop band starting from the given loop. +// Returns the sequence of perfectly nested loops. +static PerfectLoopBand detectPerfectLoopBand(affine::AffineForOp start_loop) { + PerfectLoopBand band; + affine::AffineForOp current_loop = start_loop; + + while (current_loop) { + band.loops.push_back(current_loop); + + // Checks the body of current loop. + Block &body = current_loop.getRegion().front(); + + // Counts non-trivial operations (excluding yield). + affine::AffineForOp nested_loop = nullptr; + size_t num_loops = 0; + size_t num_other_ops = 0; + + for (Operation &op : body) { + if (auto for_op = dyn_cast(&op)) { + nested_loop = for_op; + num_loops++; + } else if (!(isa(&op) && + op.getNumOperands() == 0)) { + num_other_ops++; } } - } - - return loops_info; -} -//---------------------------------------------------------------------------- -// Counter Chain Creation. -//---------------------------------------------------------------------------- -// Recursively creates counter chain for each top-level loop. -static void createCounterChainRecursivly(OpBuilder &builder, Location loc, - LoopInfo *loop_info, - Value parent_counter) { - // Creates counter for this loop. 
- Value counter_index; - if (parent_counter) { - // Nested counter. - auto counter_op = builder.create( - loc, builder.getIndexType(), parent_counter, - builder.getIndexAttr(loop_info->lower_bound), - builder.getIndexAttr(loop_info->upper_bound), - builder.getIndexAttr(loop_info->step), - /*Counter Type*/ nullptr, /*Counter ID*/ nullptr); - counter_index = counter_op.getCounterIndex(); - } else { - // Top-level counter. - auto counter_op = builder.create( - loc, builder.getIndexType(), /*parent_index=*/nullptr, - builder.getIndexAttr(loop_info->lower_bound), - builder.getIndexAttr(loop_info->upper_bound), - builder.getIndexAttr(loop_info->step), - /*Counter Type*/ nullptr, /*Counter ID*/ nullptr); - counter_index = counter_op.getCounterIndex(); - } - - loop_info->counter_index = counter_index; - - // Recursively creates counters for child loops. - for (LoopInfo *child : loop_info->child_loops) { - createCounterChainRecursivly(builder, loc, child, counter_index); + // Perfect nesting condition: exactly 1 nested loop, no other operations. + if (num_loops == 1 && num_other_ops == 0) { + // Continues to next level. + current_loop = nested_loop; + } else { + break; // Not perfect anymore. + } } -} -// Creates counter chain for all top-level loops. -static void createCounterChain(OpBuilder &builder, Location loc, - SmallVector &top_level_loops_info) { - for (LoopInfo *loop_info : top_level_loops_info) { - createCounterChainRecursivly(builder, loc, loop_info, nullptr); - } + return band; } -// Gets top-level loops' info (loops without parents). -static SmallVector -getTopLevelLoopsInfo(SmallVector &loops_info) { - SmallVector top_level_loops_info; - for (auto &loop_info : loops_info) { - if (!loop_info.parent_loop_info) { - top_level_loops_info.push_back(&loop_info); - } - } - return top_level_loops_info; -} +//============================================================================== +// Counter Creation. 
+//============================================================================== -//---------------------------------------------------------------------------- -// Prologue-Loop-Epilogue Code (PLE) Pattern Detection -//---------------------------------------------------------------------------- -// Extended PLE Pattern: Also handles sibling loops without prologue/epilogue. -// Pattern 1: Prologue/Epilogue exists -// for %i { -// -// for %j { } -// -// } -// -// Pattern 2: Sibling loops (no prologue/epilogue) -// for %i { -// for %j { } -// for %k { } ← Sibling loop -// } -// For this pattern, we need to wrap the inner loop and the prologue-epilogue -// code into a hyperblock. Only by doing this can we maintain the hyperblock as -// a pure data-driven code block. -struct PLEPattern { - affine::AffineForOp outer_loop; - // Supports multiple sibling loops. - SmallVector inner_loops; - - SmallVector prologue_code; - SmallVector epilogue_code; - // Code between sibling loops. - SmallVector> inter_loop_code; - - bool has_ple_pattern = false; - bool has_sibling_loops = false; +struct CounterInfo { + affine::AffineForOp loop; + // The index value from taskflow.counter + Value counter_index; }; -// Detects Prologue-Loop-Epilogue Code pattern in the task. -static PLEPattern detectPLEPattern(affine::AffineForOp outer_loop) { - PLEPattern pattern; - pattern.has_ple_pattern = false; - pattern.has_sibling_loops = false; - pattern.outer_loop = outer_loop; - - Block &body = outer_loop.getRegion().front(); - SmallVector current_segment; - - for (Operation &op : body.getOperations()) { - if (auto nested_for = dyn_cast(&op)) { - // Finds a nested loop. - if (pattern.inner_loops.empty()) { - // First nested loop - everything before is prologue. - if (!current_segment.empty()) { - pattern.prologue_code = current_segment; - pattern.has_ple_pattern = true; - current_segment.clear(); - } - } else { - // Second or later nested loop - everything before is inter-loop code. 
-        pattern.has_sibling_loops = true;
-        pattern.has_ple_pattern = true;
-
-        if (!current_segment.empty()) {
-          pattern.inter_loop_code.push_back(current_segment);
-          current_segment.clear();
-        } else {
-          // No operations between loops, add empty segment
-          pattern.inter_loop_code.push_back({});
-        }
-      }
-
-      pattern.inner_loops.push_back(nested_for);
-
-    } else if (!(isa(&op) && op.getOperands().empty())) {
-      // Regular operation - add to current segment.
-      current_segment.push_back(&op);
+// Creates a chain of taskflow.counter operations for a perfect loop band,
+// one counter per loop level, each parented on the previous level's index.
+// The result is aligned 1:1 with band.loops; callers index both in lockstep.
+static SmallVector
+createCounterChain(OpBuilder &builder, Location loc,
+                   const PerfectLoopBand &band) {
+  SmallVector counters;
+  Value parent_counter = nullptr;
+  for (affine::AffineForOp loop : band.loops) {
+    CounterInfo info;
+    info.loop = loop;
+    // Gets constant loop bounds; the counter op attributes require constants.
+    int32_t lb = 0, ub = 0, step = 0;
+    if (loop.hasConstantLowerBound() && loop.hasConstantUpperBound()) {
+      lb = loop.getConstantLowerBound();
+      ub = loop.getConstantUpperBound();
+      step = loop.getStepAsInt();
+    } else {
+      // Hard failure instead of `continue`: silently skipping a level would
+      // desynchronize `counters` from `band.loops` and break the indexing.
+      assert(false && "Non-constant loop bounds are not supported.");
+    }
-  }
-
-  // Any remaining operations after all loops are epilogue.
-  if (!current_segment.empty() && !pattern.inner_loops.empty()) {
-    pattern.epilogue_code = current_segment;
-    pattern.has_ple_pattern = true;
-  }
-
-  // If we have sibling loops (even without prologue/epilogue),
-  // it's still a PLE pattern that needs special handling.
-  if (pattern.inner_loops.size() > 1) {
-    pattern.has_ple_pattern = true;
-    pattern.has_sibling_loops = true;
-  }
-
-  // If we have prologue/epilogue with at least one loop, it's a PLE pattern. 
- if (!pattern.inner_loops.empty() && - (!pattern.prologue_code.empty() || !pattern.epilogue_code.empty())) { - pattern.has_ple_pattern = true; - } - - return pattern; -} - -//---------------------------------------------------------------------------- -// Hyperblock Creation -//---------------------------------------------------------------------------- -// Recursively extracts hyperblocks from a region. -// Key insight: Operations in a loop body that are used by nested loops -// should be inlined into the nested loop's hyperblock. -static void extractHyperblocksInfoFromRegion( - Region ®ion, - const DenseMap &loop_info_map, - SmallVector parent_indices, - SmallVector &hyperblocks_info, - affine::AffineForOp enclosing_loop = nullptr, - SmallVector inherited_ops = {}) { - Block &block = region.front(); - SmallVector current_block_ops; - - current_block_ops.append(inherited_ops.begin(), inherited_ops.end()); - - for (Operation &op : block.getOperations()) { - if (auto for_op = dyn_cast(&op)) { - - PLEPattern ple_pattern = detectPLEPattern(for_op); - - // Gets the loop info. - LoopInfo *loop_info = loop_info_map.lookup(for_op); - assert(loop_info && "Loop not found in loop_info_map"); - - // Builds trigger indices for this loop (parent indices + this loop's - // index). - SmallVector loop_indices = parent_indices; - loop_indices.push_back(loop_info->counter_index); - - // Handles the PLE pattern. - if (ple_pattern.has_ple_pattern) { - // 1. Emits any accumulated operations as a hyperblock. - if (!current_block_ops.empty()) { - HyperblockInfo info; - info.operations = current_block_ops; - info.trigger_indices = parent_indices; - info.is_loop_body = !parent_indices.empty(); - info.loop_op = enclosing_loop; - hyperblocks_info.push_back(info); - current_block_ops.clear(); - } - - // 2. Creates a hyperblock for: - // // prologue + loop1 + inter_code1 + loop2 + inter_code2 + ... + - // epilogue. - HyperblockInfo info; - // Adds prologue code. 
- if (!ple_pattern.prologue_code.empty()) { - info.operations.append(ple_pattern.prologue_code.begin(), - ple_pattern.prologue_code.end()); - } - - // Adds loops and inter-loop code. - for (size_t i = 0; i < ple_pattern.inner_loops.size(); ++i) { - info.operations.push_back(ple_pattern.inner_loops[i]); - - // Adds inter-loop code if exists. - if (i < ple_pattern.inter_loop_code.size() && - !ple_pattern.inter_loop_code[i].empty()) { - info.operations.append(ple_pattern.inter_loop_code[i].begin(), - ple_pattern.inter_loop_code[i].end()); - } - } - - // Adds epilogue code. - if (!ple_pattern.epilogue_code.empty()) { - info.operations.append(ple_pattern.epilogue_code.begin(), - ple_pattern.epilogue_code.end()); - } - - info.trigger_indices = loop_indices; - info.is_loop_body = true; - info.loop_op = for_op; - info.is_ple_pattern = true; - hyperblocks_info.push_back(info); - - // No need for further processing of this loop. Since we have already - // handled the whole for_op. - current_block_ops.clear(); - continue; - } - - // Analyzes which of the current_ops are used by this loop. - DenseSet values_used_in_loop; - for_op.walk([&](Operation *nested_op) { - for (Value operand : nested_op->getOperands()) { - values_used_in_loop.insert(operand); - } - }); - - SmallVector ops_for_nested_loop; - SmallVector ops_not_used; - bool used_by_loop = false; - for (Operation *current_op : current_block_ops) { - for (Value result : current_op->getResults()) { - if (values_used_in_loop.contains(result)) { - used_by_loop = true; - break; - } - } - } - if (used_by_loop) { - ops_for_nested_loop.append(current_block_ops.begin(), - current_block_ops.end()); - } else { - ops_not_used.append(current_block_ops.begin(), current_block_ops.end()); - } - - // Before processing the loop, emits any accumulated operations as a - // hyperblock. 
- if (!ops_not_used.empty()) { - HyperblockInfo info; - info.operations = ops_not_used; - info.trigger_indices = parent_indices; - info.is_loop_body = !parent_indices.empty(); - info.loop_op = enclosing_loop; - hyperblocks_info.push_back(info); - } - - // Recursively extracts hyperblocks from the loop body. - extractHyperblocksInfoFromRegion(for_op.getRegion(), loop_info_map, - loop_indices, hyperblocks_info, for_op, - ops_for_nested_loop); - current_block_ops.clear(); - } else if (isa(&op) || - (isa(&op) && op.getOperands().empty())) { - // Skips TaskflowYieldOp, TaskflowCounterOp, and empty affine.yield. - continue; + // Creates counter. + if (parent_counter) { + // Creates nested counter with parent. + TaskflowCounterOp counter_op = builder.create( + loc, + /*counter_index*/ builder.getIndexType(), + /*parent_index*/ parent_counter, + /*lower_bound*/ builder.getIndexAttr(lb), + /*upper_bound*/ builder.getIndexAttr(ub), + /*step*/ builder.getIndexAttr(step), + /*counter_type*/ nullptr, + /*counter_id*/ nullptr); + info.counter_index = counter_op.getCounterIndex(); } else { - // Regular operation, accumulates it. - current_block_ops.push_back(&op); + // Creates the top-level counter (no parent). + TaskflowCounterOp counter_op = builder.create( + loc, + /*counter_index*/ builder.getIndexType(), + /*parent_index*/ nullptr, + /*lower_bound*/ builder.getIndexAttr(lb), + /*upper_bound*/ builder.getIndexAttr(ub), + /*step*/ builder.getIndexAttr(step), + /*counter_type*/ nullptr, + /*counter_id*/ nullptr); + info.counter_index = counter_op.getCounterIndex(); } - } - // Emits any remaining operations as a hyperblock. 
- if (!current_block_ops.empty()) { - HyperblockInfo info; - info.operations = current_block_ops; - info.trigger_indices = parent_indices; - info.is_loop_body = !parent_indices.empty(); - info.loop_op = enclosing_loop; - hyperblocks_info.push_back(info); - current_block_ops.clear(); + parent_counter = info.counter_index; + counters.push_back(info); } -} -// Extracts all hyperblocks from a task. -static SmallVector extractHyperblocksInfo( - TaskflowTaskOp task_op, - const DenseMap &loop_info_map) { - SmallVector hyperblocks_info; - // No parent indices for top-level hyperblocks (Not nested in a loop). - SmallVector empty_indices; + return counters; +} - extractHyperblocksInfoFromRegion(task_op.getBody(), loop_info_map, - empty_indices, hyperblocks_info); +//============================================================================== +// Hyperblock Creation. +//============================================================================== - return hyperblocks_info; -} +// Analyzes which loop induction variables are actually used in the loop body. +// Returns indices of loops whose induction variables are used. +static SmallVector analyzeUsedLoopIndices(const PerfectLoopBand &band) { + SmallVector used_indices; -// Collects all indices that are actually used by operations in the hyperblock. -static SmallVector collectUsedIndices( - const SmallVector &operations, - const SmallVector &candidate_indices, - const DenseMap &loop_info_map) { - // Builds reverse mapping: counter -> induction variable. - DenseMap counter_to_indvar; - for (auto [loop_op, loop_info] : loop_info_map) { - counter_to_indvar[loop_info->counter_index] = loop_op.getInductionVar(); - } + // Gets the deepest perfect loop's body. + affine::AffineForOp deepest_loop = band.loops.back(); + Block &body = deepest_loop.getRegion().front(); - // Collects all values used by operations. - SetVector used_indvars_set; - for (Operation *op : operations) { + // Collects all values used in the body. 
+ DenseSet used_values; + body.walk([&](Operation *op) { for (Value operand : op->getOperands()) { - used_indvars_set.insert(operand); + used_values.insert(operand); } - } + }); - // Returns in the same order as candidate_indices to maintain parent->child - // order. - SmallVector used_counters; - for (Value counter : candidate_indices) { - if (counter_to_indvar.count(counter)) { - Value indvar = counter_to_indvar[counter]; - if (used_indvars_set.contains(indvar)) { - used_counters.push_back(counter); - } + // Checks which loop induction variables are used. + for (size_t i = 0; i < band.loops.size(); ++i) { + affine::AffineForOp loop = band.loops[i]; + Value induction_var = loop.getInductionVar(); + if (used_values.contains(induction_var)) { + used_indices.push_back(i); } } - return used_counters; + return used_indices; } -// Determines output types for the hyperblock based on operations. -static SmallVector -determineHyperblockOutputTypes(const SmallVector &operations) { - SmallVector output_types = {}; - - // Checks if there's an affine.yield operation. - for (Operation *op : operations) { - if (auto affine_yield = dyn_cast(op)) { - // Uses the operand types of affine.yield as output types. - for (Value operand : affine_yield.getOperands()) { - output_types.push_back(operand.getType()); - } - return output_types; +// Clones the body of the deepest perfect loop in the perfect band into a +// hyperblock. Handles iter_args (reduction variables) by: +// 1. Adding iter_args initial values as hyperblock inputs +// 2. Mapping iter_args to hyperblock block arguments +// 3. Returning reduction results as hyperblock outputs +static TaskflowHyperblockOp +createHyperblockFromLoopBody(OpBuilder &builder, Location loc, + const PerfectLoopBand &band, + const SmallVector &counters) { + // Gets the deepest perfect loop in the perfect nested band. 
+ affine::AffineForOp deepest_perfect_loop = band.loops.back(); + Block &loop_body = deepest_perfect_loop.getRegion().front(); + + // Analyzes which loop indices are actually used. + SmallVector used_loop_indices = analyzeUsedLoopIndices(band); + + // Checks if the deepest loop has iter_args (reduction variables). + bool has_iter_args = deepest_perfect_loop.getNumIterOperands() > 0; + SmallVector iter_args_init_values = {}; + SmallVector iter_args_types = {}; + + if (has_iter_args) { + for (Value init_val : deepest_perfect_loop.getInits()) { + iter_args_init_values.push_back(init_val); + iter_args_types.push_back(init_val.getType()); } } - // No affine.yield found, no output types needed. - return output_types; -} - -// Creates a taskflow.hyperblock operation from HyperblockInfo. -static TaskflowHyperblockOp createHyperblock( - OpBuilder &builder, Location loc, HyperblockInfo &info, Block *task_body, - const DenseMap &loop_info_map) { - // Collects only the indices that are actually used in the hyperblock. - SmallVector used_indices = - collectUsedIndices(info.operations, info.trigger_indices, loop_info_map); - - // Determines output types for the hyperblock based on operations. - SmallVector output_types = - determineHyperblockOutputTypes(info.operations); - - // Checks if there is a reduction in the hyperblock (with iter_args). - SmallVector iter_args_init_values; - bool is_reduction = false; - if (info.loop_op && info.loop_op.getNumIterOperands() > 0) { - is_reduction = true; - for (Value init : info.loop_op.getInits()) { - iter_args_init_values.push_back(init); - } + // Builds trigger values (only for USED counter indices) + SmallVector trigger_values; + for (size_t idx : used_loop_indices) { + trigger_values.push_back(counters[idx].counter_index); } - // Creates the hyperblock operation. 
- TaskflowHyperblockOp hyperblock_op; - if (is_reduction) { - hyperblock_op = builder.create( - loc, output_types, used_indices, iter_args_init_values); - } else { - hyperblock_op = builder.create( - loc, output_types, used_indices, /*iter_args=*/ValueRange{}); + + // Determines hyperblock result types (from iter_args if present). + SmallVector result_types = {}; + if (has_iter_args) { + result_types = iter_args_types; } - Block *hyperblock_body = new Block(); - hyperblock_op.getBody().push_back(hyperblock_body); + // Creates hyperblock operation with iter_args as inputs. + auto hyperblock_op = builder.create( + loc, result_types, trigger_values, iter_args_init_values); - // Adds block arguments for the used indices. - for (Value idx : used_indices) { - hyperblock_body->addArgument(idx.getType(), loc); + // Builds block arguments: + // 1. Counter indices (only for USED loop levels). + // 2. Iter args values (passed through hyperblock invocation). + SmallVector arg_types; + SmallVector arg_locs; + + // Adds counter index arguments (only for used indices). + for (size_t i = 0; i < used_loop_indices.size(); ++i) { + arg_types.push_back(builder.getIndexType()); + arg_locs.push_back(loc); } - SmallVector iter_args_block_args; - if (is_reduction) { - for (Value init : iter_args_init_values) { - BlockArgument arg = hyperblock_body->addArgument(init.getType(), loc); - iter_args_block_args.push_back(arg); + // Adds iter_args as hyperblock block arguments. + if (has_iter_args) { + for (Type ty : iter_args_types) { + arg_types.push_back(ty); + arg_locs.push_back(loc); } } - // Clone operations into the hyperblock body. 
- OpBuilder hyperblock_builder(hyperblock_body, hyperblock_body->begin()); - IRMapping mapping; + Block *hyperblock_body = &hyperblock_op.getBody().emplaceBlock(); + hyperblock_body->addArguments(arg_types, arg_locs); - // Maps used indices to block arguments - for (auto [idx, arg] : - llvm::zip(used_indices, hyperblock_body->getArguments())) { - mapping.map(idx, arg); - } + OpBuilder body_builder = OpBuilder::atBlockBegin(hyperblock_body); + IRMapping mapper; - // Creates a mapping from loop counters to loop induction variables. - DenseMap counter_to_indvar; - for (auto [loop_op, loop_info] : loop_info_map) { - counter_to_indvar[loop_info->counter_index] = loop_op.getInductionVar(); + // Maps USED loop induction variables to hyperblock arguments. + for (size_t i = 0; i < used_loop_indices.size(); ++i) { + size_t loop_idx = used_loop_indices[i]; + affine::AffineForOp loop = band.loops[loop_idx]; + mapper.map(loop.getInductionVar(), hyperblock_body->getArgument(i)); } - // Maps loop induction variables to hyperblock block arguments. - for (auto [idx, arg] : - llvm::zip(used_indices, hyperblock_body->getArguments())) { - if (counter_to_indvar.count(idx)) { - Value indvar = counter_to_indvar[idx]; - mapping.map(indvar, arg); + // Maps iter_args to hyperblock block arguments (after counter indices). + if (has_iter_args) { + for (size_t i = 0; i < iter_args_types.size(); ++i) { + size_t arg_idx = used_loop_indices.size() + i; + mapper.map(deepest_perfect_loop.getRegionIterArgs()[i], + hyperblock_body->getArgument(arg_idx)); } } - // If this hyperblock comes from a loop with iter_args, maps them. - if (is_reduction) { - Block &loop_body = info.loop_op.getRegion().front(); - auto loop_iter_args = loop_body.getArguments().drop_front(1); + // Clones all operations from the deepest perfect loop's body. 
+ SmallVector yield_operands; - for (auto [loop_iter_arg, hb_iter_arg] : - llvm::zip(loop_iter_args, iter_args_block_args)) { - mapping.map(loop_iter_arg, hb_iter_arg); + for (Operation &op : loop_body) { + // Handles affine.yield with operands (reduction results). + if (auto yield_op = dyn_cast(&op)) { + if (yield_op.getNumOperands() > 0) { + // Maps the yielded values for hyperblock's return. + for (Value yielded : yield_op.getOperands()) { + Value mapped = mapper.lookupOrDefault(yielded); + yield_operands.push_back(mapped); + } + } + continue; // Skips the yield itself. } - } - // Clones all operations and handle terminators. - bool has_terminator = false; - for (Operation *op : info.operations) { - // Handles affine.yield specially - convert to hyperblock.yield. - if (auto affine_yield = dyn_cast(op)) { - // Maps the yield operands through the IRMapping. - SmallVector yield_operands; - for (Value operand : affine_yield.getOperands()) { - Value mapped_operand = mapping.lookupOrDefault(operand); - yield_operands.push_back(mapped_operand); - } + // Clones operation (including nested affine.for with iter_args). + Operation *cloned = body_builder.clone(op, mapper); - // Creates hyperblock.yield with the mapped operands. - hyperblock_builder.create(loc, yield_operands, - yield_operands); - has_terminator = true; - continue; + // Updates mapper with cloned operation results. + for (size_t i = 0; i < op.getNumResults(); ++i) { + mapper.map(op.getResult(i), cloned->getResult(i)); } - - // Clones regular operations. - hyperblock_builder.clone(*op, mapping); } - // Adds terminator if the last operation wasn't already a yield. - if (!has_terminator) { - hyperblock_builder.setInsertionPointToEnd(hyperblock_body); - hyperblock_builder.create(loc); + // Adds terminator with reduction results (if any). 
+ if (has_iter_args) { + body_builder.create( + loc, + /*iter_args_next=*/yield_operands, // No iter_args_next for final + // iteration + /*results=*/yield_operands); // Reduction results + } else { + body_builder.create(loc); } + // Converts affine operations to standard/scf operations. MLIRContext *context = hyperblock_op.getContext(); RewritePatternSet patterns(context); populateAffineToStdConversionPatterns(patterns); + ConversionTarget target(*context); target.addLegalDialect(); + func::FuncDialect, TaskflowDialect, scf::SCFDialect>(); target.addIllegalOp(); + affine::AffineForOp, affine::AffineIfOp, + affine::AffineYieldOp>(); + if (failed( applyPartialConversion(hyperblock_op, target, std::move(patterns)))) { - assert(false && "Affine to Standard conversion failed."); + llvm::errs() + << "Error: Failed to convert affine operations to standard/scf\n"; + return nullptr; } return hyperblock_op; } -//---------------------------------------------------------------------------- -// Task Transformation -//---------------------------------------------------------------------------- -// The main transformation function for TaskflowTaskOp. +//============================================================================ +// Task Transformation. +//=========================================================================== static LogicalResult transformTask(TaskflowTaskOp task_op) { Location loc = task_op.getLoc(); + Block &task_body = task_op.getBody().front(); - // Step 1: Collects loop information. - DenseMap loop_info_map; - SmallVector loops_info = collectLoopInfo(task_op); - for (auto &loop_info : loops_info) { - loop_info_map[loop_info.for_op] = &loop_info; - } - - // Gets the body block of the task. - Block *task_body = &task_op.getBody().front(); - - // Finds the first loop in the task body. - affine::AffineForOp first_loop_op = nullptr; - for (Operation &op : task_body->getOperations()) { + // Finds all top-level loops in the task. 
+ SmallVector top_level_loops; + for (Operation &op : task_body) { if (auto for_op = dyn_cast(&op)) { - first_loop_op = for_op; - break; + top_level_loops.push_back(for_op); } } - assert(first_loop_op && "No loops found in the task body."); + if (top_level_loops.empty()) { + llvm::errs() << "No loops found in task " << task_op.getTaskName() << "\n"; + return success(); + } + + assert(top_level_loops.size() == 1 && + "Expected exactly one top-level loop in each task."); + + OpBuilder builder(&task_body, task_body.begin()); - // Step 2: Creates counter chain before the first loop. - OpBuilder builder(first_loop_op); - SmallVector top_level_loops_info = - getTopLevelLoopsInfo(loops_info); - createCounterChain(builder, loc, top_level_loops_info); + // Stores mapping from loop results to hyperblock results. + DenseMap loop_result_to_hyperblock_result; - // Step 3: Extracts hyperblocks from task. - SmallVector hyperblocks_info = - extractHyperblocksInfo(task_op, loop_info_map); + // Processes each top-level loop. + for (affine::AffineForOp top_loop : top_level_loops) { + llvm::errs() << "\n[ConstructHyperblock] Processing top-level loop\n"; - // Step 4: Creates taskflow.hyperblock operations for each hyperblock. - builder.setInsertionPoint(first_loop_op); + // Step 1: Detects maximal perfect loop band. + PerfectLoopBand band = detectPerfectLoopBand(top_loop); + llvm::errs() << " Detected perfect loop band of depth " << band.getDepth() + << "\n"; - // Stores mappings from loop results to hyperblock outputs. - DenseMap loop_result_to_hyperblock_output; + // Step 2: Creates counter chain for the perfect band. + builder.setInsertionPoint(top_loop); + SmallVector counters = createCounterChain(builder, loc, band); + llvm::errs() << " Created " << counters.size() << " counters\n"; - // Creates hyperblock ops. - for (auto &info : hyperblocks_info) { + // Step 3: Creates hyperblock from deepest loop's body. 
TaskflowHyperblockOp hyperblock_op = - createHyperblock(builder, loc, info, task_body, loop_info_map); - - // If this hyperblock has outputs and belongs to a loop with iter_args, - // replace the loop results with the hyperblock outputs. - if (info.loop_op && info.loop_op.getNumResults() > 0 && - (hyperblock_op.getNumResults() == info.loop_op.getNumResults())) { - auto loop_results = info.loop_op.getResults(); - auto hyperblock_results = hyperblock_op.getOutputs(); - - for (auto [loop_result, hb_result] : - llvm::zip(loop_results, hyperblock_results)) { - loop_result_to_hyperblock_output[loop_result] = hb_result; - } - } - } + createHyperblockFromLoopBody(builder, loc, band, counters); + llvm::errs() << " Created hyperblock with " + << hyperblock_op.getBody().front().getOperations().size() + << " operations\n"; - // Step 5: Replaces loop results with hyperblock outputs BEFORE erasing loops. - for (auto [loop_result, hb_output] : loop_result_to_hyperblock_output) { - loop_result.replaceAllUsesWith(hb_output); - } + assert(hyperblock_op && "Hyperblock creation failed"); - // Step 6: Collects and erases original loop operations. - // Collects all operations to erase. - SmallVector ops_to_erase; - // First pass: collects all affine.for operations recursively. - // We need to erase them in reverse order (inner loops first). - SmallVector loops_to_erase; - task_op.walk( - [&](affine::AffineForOp loop) { loops_to_erase.push_back(loop); }); - - // Reverses the order so we erase innermost loops first. - std::reverse(loops_to_erase.begin(), loops_to_erase.end()); - - // Second pass: collects non-loop, non-taskflow operations. - for (Operation &op : llvm::make_early_inc_range(task_body->getOperations())) { - if (!isa(&op)) { - ops_to_erase.push_back(&op); + // If the loop has results (iter_args), map them to hyperblock results. 
+ if (top_loop.getNumResults() > 0) { + llvm::errs() << " Mapping " << top_loop.getNumResults() + << " loop results to hyperblock outputs\n"; + + for (size_t i = 0; i < top_loop.getNumResults(); ++i) { + loop_result_to_hyperblock_result[top_loop.getResult(i)] = + hyperblock_op.getResult(i); + } } } - // Step 7: Erases operations. - // First erases non-loop operations. - for (Operation *op : ops_to_erase) { - // Makes sure all uses are replaced before erasing. - assert(op->use_empty() && "Operation still has uses before erasing"); - op->erase(); + // Replaces loop results with hyperblock results BEFORE erasing loops. + for (auto [loop_result, hb_result] : loop_result_to_hyperblock_result) { + loop_result.replaceAllUsesWith(hb_result); } - // Then erases loops (innermost first). - for (affine::AffineForOp loop : loops_to_erase) { - // Makes sure the loop results have been replaced before erasing. + // Step 4: Erases all original loops. + for (affine::AffineForOp loop : top_level_loops) { + // Ensures no uses remain. assert(loop->use_empty() && "Loop still has uses before erasing"); loop->erase(); } @@ -739,7 +404,8 @@ struct ConstructHyperblockFromTaskPass } StringRef getDescription() const final { - return "Constructs hyperblocks and counter chains from Taskflow tasks."; + return "Constructs hyperblocks from taskflow tasks by detecting perfect " + "nested loop bands."; } void getDependentDialects(DialectRegistry ®istry) const override { @@ -750,17 +416,15 @@ struct ConstructHyperblockFromTaskPass void runOnOperation() override { func::FuncOp func_op = getOperation(); - // Collects all tasks. - SmallVector tasks; - func_op.walk([&](TaskflowTaskOp task_op) { tasks.push_back(task_op); }); - // Transforms each task. - for (TaskflowTaskOp task_op : tasks) { + // Walks through all TaskflowTaskOp in the function. 
+ func_op.walk([&](TaskflowTaskOp task_op) { if (failed(transformTask(task_op))) { signalPassFailure(); - return; + return WalkResult::interrupt(); } - } + return WalkResult::advance(); + }); } }; } // namespace diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 906bc267..15070981 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -140,77 +140,76 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// HYPERBLOCK: module { -// HYPERBLOCK-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { -// HYPERBLOCK-NEXT: %c2_i32 = arith.constant 2 : i32 -// HYPERBLOCK-NEXT: %c8_i32 = arith.constant 8 : i32 -// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 -// HYPERBLOCK-NEXT: %alloca = memref.alloca() : memref -// HYPERBLOCK-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> -// HYPERBLOCK-NEXT: %value_outputs = taskflow.task @Task_0 value_inputs(%c0_i32 : i32) : (i32) -> (i32) { -// HYPERBLOCK-NEXT: ^bb0(%arg0: i32): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index -// HYPERBLOCK-NEXT: %2 = "taskflow.hyperblock"(%1, %arg0) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg1: index, %arg2: i32): -// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg1 : index to i32 -// HYPERBLOCK-NEXT: %4 = arith.addi %arg2, %3 : i32 -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) -// HYPERBLOCK-NEXT: }) : (index, i32) -> i32 -// HYPERBLOCK-NEXT: taskflow.yield values(%2 : i32) -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { -// 
HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg2: index): -// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg2 : index to i32 -// HYPERBLOCK-NEXT: %4 = arith.muli %3, %arg1 : i32 -// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index -// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index -// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index -// HYPERBLOCK-NEXT: scf.for %arg3 = %c0 to %c8 step %c1 { -// HYPERBLOCK-NEXT: %5 = arith.index_cast %arg3 : index to i32 -// HYPERBLOCK-NEXT: %6 = arith.addi %4, %5 : i32 -// HYPERBLOCK-NEXT: memref.store %6, %arg0[%arg2, %arg3] : memref<4x8xi32> -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0), original_write_memrefs(%alloca)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { -// HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ -// 
HYPERBLOCK-NEXT: ^bb0(%arg5: index): -// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg5 : index to i32 -// HYPERBLOCK-NEXT: %4 = arith.muli %3, %arg2 : i32 -// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index -// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index -// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index -// HYPERBLOCK-NEXT: scf.for %arg6 = %c0 to %c8 step %c1 { -// HYPERBLOCK-NEXT: %5 = memref.load %arg0[%arg5, %arg6] : memref<4x8xi32> -// HYPERBLOCK-NEXT: %6 = arith.addi %5, %arg3 : i32 -// HYPERBLOCK-NEXT: %c0_2 = arith.constant 0 : index -// HYPERBLOCK-NEXT: %c-3 = arith.constant -3 : index -// HYPERBLOCK-NEXT: %7 = arith.addi %arg5, %c-3 : index -// HYPERBLOCK-NEXT: %8 = arith.cmpi eq, %7, %c0_2 : index -// HYPERBLOCK-NEXT: %c-7 = arith.constant -7 : index -// HYPERBLOCK-NEXT: %9 = arith.addi %arg6, %c-7 : index -// HYPERBLOCK-NEXT: %10 = arith.cmpi eq, %9, %c0_2 : index -// HYPERBLOCK-NEXT: %11 = arith.andi %8, %10 : i1 -// HYPERBLOCK-NEXT: scf.if %11 { -// HYPERBLOCK-NEXT: memref.store %6, %arg1[] : memref -// HYPERBLOCK-NEXT: %12 = arith.muli %6, %arg4 : i32 -// HYPERBLOCK-NEXT: memref.store %12, %arg1[] : memref -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg1 : memref) -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[] : memref -// HYPERBLOCK-NEXT: return %0 : i32 -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT:} \ No newline at end of file +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %c2_i32 = arith.constant 2 : i32 +// HYPERBLOCK-NEXT: %c8_i32 = arith.constant 8 : i32 +// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 +// HYPERBLOCK-NEXT: %alloca = memref.alloca() : memref +// HYPERBLOCK-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> +// HYPERBLOCK-NEXT: %value_outputs = 
taskflow.task @Task_0 value_inputs(%c0_i32 : i32) : (i32) -> (i32) { +// HYPERBLOCK-NEXT: ^bb0(%arg0: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index +// HYPERBLOCK-NEXT: %2 = "taskflow.hyperblock"(%1, %arg0) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg1: index, %arg2: i32): +// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg1 : index to i32 +// HYPERBLOCK-NEXT: %4 = arith.addi %arg2, %3 : i32 +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) +// HYPERBLOCK-NEXT: }) : (index, i32) -> i32 +// HYPERBLOCK-NEXT: taskflow.yield values(%2 : i32) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { +// HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg2: index): +// HYPERBLOCK-NEXT: %2 = arith.index_cast %arg2 : index to i32 +// HYPERBLOCK-NEXT: %3 = arith.muli %2, %arg1 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index +// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index +// HYPERBLOCK-NEXT: scf.for %arg3 = %c0 to %c8 step %c1 { +// HYPERBLOCK-NEXT: %4 = arith.index_cast %arg3 : index to i32 +// HYPERBLOCK-NEXT: %5 = arith.addi %3, %4 : i32 +// HYPERBLOCK-NEXT: memref.store %5, %arg0[%arg2, %arg3] : memref<4x8xi32> +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: 
%write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0), original_write_memrefs(%alloca)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg5: index): +// HYPERBLOCK-NEXT: %2 = arith.index_cast %arg5 : index to i32 +// HYPERBLOCK-NEXT: %3 = arith.muli %2, %arg2 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index +// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index +// HYPERBLOCK-NEXT: scf.for %arg6 = %c0 to %c8 step %c1 { +// HYPERBLOCK-NEXT: %4 = memref.load %arg0[%arg5, %arg6] : memref<4x8xi32> +// HYPERBLOCK-NEXT: %5 = arith.addi %4, %arg3 : i32 +// HYPERBLOCK-NEXT: %c0_2 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c-3 = arith.constant -3 : index +// HYPERBLOCK-NEXT: %6 = arith.addi %arg5, %c-3 : index +// HYPERBLOCK-NEXT: %7 = arith.cmpi eq, %6, %c0_2 : index +// HYPERBLOCK-NEXT: %c-7 = arith.constant -7 : index +// HYPERBLOCK-NEXT: %8 = arith.addi %arg6, %c-7 : index +// HYPERBLOCK-NEXT: %9 = arith.cmpi eq, %8, %c0_2 : index +// HYPERBLOCK-NEXT: %10 = arith.andi %7, %9 : i1 +// HYPERBLOCK-NEXT: scf.if %10 { +// HYPERBLOCK-NEXT: memref.store %5, %arg1[] : memref +// HYPERBLOCK-NEXT: %11 = arith.muli %5, %arg4 : i32 +// HYPERBLOCK-NEXT: memref.store %11, %arg1[] : memref +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg1 : memref) +// HYPERBLOCK-NEXT: } +// 
HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[] : memref +// HYPERBLOCK-NEXT: return %0 : i32 +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } + From 7a11d4c08841269f01c20a11be5cd70bfdd7a01d Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Tue, 3 Feb 2026 12:42:55 +0800 Subject: [PATCH 9/9] [clean] simplify logic in loop tree serialization --- .../AffineLoopTreeSerializationPass.cpp | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp index da5cc7fa..9a8e45fa 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp @@ -194,7 +194,7 @@ class MCTBuilder { loop_builder = OpBuilder::atBlockEnd(current_insert_block); } - // Prepare iter_args for the new loop. + // Prepares iter_args for the new loop. SmallVector iter_args_init_values; if (node->loop_op.getNumIterOperands() > 0) { for (Value init : node->loop_op.getInits()) { @@ -202,17 +202,17 @@ class MCTBuilder { } } - // Create new loop with same bounds and iter_args. + // Creates new loop with same bounds and iter_args. auto new_loop = loop_builder.create( loc, node->lower_bound, node->upper_bound, node->step, iter_args_init_values); created_loops.push_back(new_loop); - // Map the old induction variable to the new one. + // Maps the old induction variable to the new one. mapping.map(node->loop_op.getInductionVar(), new_loop.getInductionVar()); - // Map the old iter_args (block args) to the new iter_args (block args). + // Maps the old iter_args (block args) to the new iter_args (block args). 
if (node->loop_op.getNumRegionIterArgs() > 0) { for (auto [old_arg, new_arg] : llvm::zip(node->loop_op.getRegionIterArgs(), @@ -225,19 +225,19 @@ class MCTBuilder { outer_loop = new_loop; } - // Update current insertion block to the body of the new loop. + // Updates current insertion block to the body of the new loop. current_insert_block = new_loop.getBody(); - // Remove the default yield created by create. + // Removes the default yield created by create. if (!current_insert_block->empty() && isa(current_insert_block->back())) current_insert_block->back().erase(); - // Clone body operations for THIS node. + // Clones body operations for THIS node. OpBuilder body_builder = OpBuilder::atBlockEnd(current_insert_block); for (Operation *op : node->body_operations) { Operation *new_op = body_builder.clone(*op, mapping); - // Update mapping with results of the new op. + // Updates mapping with results of the new op. for (auto [old_res, new_res] : llvm::zip(op->getResults(), new_op->getResults())) { mapping.map(old_res, new_res); @@ -245,7 +245,7 @@ class MCTBuilder { } } - // Fix up yields for non-leaf loops (bottom-up). + // Fixes up yields for non-leaf loops (bottom-up). for (int i = created_loops.size() - 2; i >= 0; --i) { affine::AffineForOp parent = created_loops[i]; affine::AffineForOp child = created_loops[i + 1]; @@ -268,7 +268,7 @@ class MCTBuilder { // We need to find what the original yield yielded, map it, and yield it // here. - // Wait, if SALT excludes Yield from body_operations, then we NEVER cloned + // If SALT excludes Yield from body_operations, then we NEVER cloned // the yield. So the leaf loop has no terminator. We must reconstruct the // yield for the leaf loop. @@ -276,7 +276,7 @@ class MCTBuilder { affine::AffineForOp new_leaf = created_loops.back(); SALTNode *leaf_node = chain.getLeaf(); // or chain.nodes.back() - // Find the yield op in the original leaf node. + // Finds the yield op in the original leaf node. 
Operation *original_yield = nullptr; for (Operation &op : leaf_node->loop_op.getBody()->getOperations()) { if (isa(&op)) { @@ -294,10 +294,8 @@ class MCTBuilder { } leaf_yield_builder.create(loc, yielded_values); } else { - // Should not happen for valid AffineForOp - OpBuilder leaf_yield_builder = - OpBuilder::atBlockEnd(new_leaf.getBody()); - leaf_yield_builder.create(loc); + assert(false && + "Original leaf loop must have a yield operation in its body."); } }