diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h index 14e27a03..a8380d07 100644 --- a/include/Conversion/ConversionPasses.h +++ b/include/Conversion/ConversionPasses.h @@ -21,6 +21,7 @@ std::unique_ptr<mlir::Pass> createLowerBuiltinToNeuraPass(); std::unique_ptr<mlir::Pass> createLowerAffineToNeuraPass(); // TaskFlow Conversion Passes. +std::unique_ptr<mlir::Pass> createAssignTaskTargetPass(); std::unique_ptr<mlir::Pass> createConvertAffineToTaskflowPass(); std::unique_ptr<mlir::Pass> createConvertTaskflowToNeuraPass(); #define GEN_PASS_REGISTRATION diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td index e2d727d2..af6f1984 100644 --- a/include/Conversion/ConversionPasses.td +++ b/include/Conversion/ConversionPasses.td @@ -48,6 +48,29 @@ def LowerAffineToNeura : Pass<"lower-affine-to-neura", "func::FuncOp">{ // TaskFlow Conversion Passes. //=========================================================// +def AssignTaskTarget : Pass<"assign-task-target", "ModuleOp">{ + let summary = "Assign hardware targets to compute tasks (functions)"; + let description = [{ + This pass assigns hardware target attributes (target.device) to functions + based on their names. It enables heterogeneous workload partitioning across + different hardware units such as CPU, CGRA, and DOE. + + The pass applies simple pattern matching rules: + - Functions containing "ray_sampler" or "sampler" -> CPU + - Functions containing "hash_encoder" or "encoder" -> DOE + - Functions containing "nerf_mlp" or "mlp" -> CGRA + - Top-level orchestrator functions (e.g., "nerf_forward") -> CPU + - Default -> CPU + + Example output: + func.func @ray_sampler_func(...) attributes {target.device = "cpu"} { ... } + func.func @hash_encoder_func(...) attributes {target.device = "doe"} { ... } + func.func @nerf_mlp_func(...) attributes {target.device = "cgra"} { ... 
} + }]; + let constructor = "mlir::createAssignTaskTargetPass()"; + let dependentDialects = ["mlir::func::FuncDialect"]; +} + def ConvertAffineToTaskflow : Pass<"convert-affine-to-taskflow", "ModuleOp">{ let summary = "Convert top-level affine.for operations to Taskflow dialect"; let description = [{ diff --git a/lib/Conversion/AssignTaskTarget/AssignTaskTargetPass.cpp b/lib/Conversion/AssignTaskTarget/AssignTaskTargetPass.cpp new file mode 100644 index 00000000..66ebfc06 --- /dev/null +++ b/lib/Conversion/AssignTaskTarget/AssignTaskTargetPass.cpp @@ -0,0 +1,133 @@ +//===- AssignTaskTargetPass.cpp - Assign hardware targets to tasks --------===// +// +// This pass assigns hardware target attributes to compute tasks (functions) +// based on task names. It helps partition the workload across different +// hardware units (CPU, CGRA, DOE, etc.) in heterogeneous computing systems. +// +//===----------------------------------------------------------------------===// + +#include "Conversion/ConversionPasses.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; + +namespace { + +//===----------------------------------------------------------------------===// +// Helper Functions +//===----------------------------------------------------------------------===// + +/// Determines the hardware target for a given function based on its name. 
+/// This function implements a simple pattern-matching strategy: +/// - ray_sampler* -> CPU +/// - hash_encoder* -> DOE +/// - nerf_mlp* -> CGRA +/// - nerf_forward (top-level) -> CPU +/// - default -> CPU +static StringRef matchHardwareTarget(StringRef funcName) { + // Top-level function: runs on CPU as coordinator + if (funcName == "nerf_forward") { + return "cpu"; + } + + // Pattern matching for compute tasks + if (funcName.contains("ray_sampler") || funcName.contains("sampler")) { + return "cpu"; + } + + if (funcName.contains("hash_encoder") || funcName.contains("encoder")) { + return "doe"; + } + + if (funcName.contains("nerf_mlp") || funcName.contains("mlp")) { + return "cgra"; + } + + // Default target + return "cpu"; +} + +//===----------------------------------------------------------------------===// +// AssignTaskTarget Pass +//===----------------------------------------------------------------------===// + +struct AssignTaskTargetPass + : public PassWrapper<AssignTaskTargetPass, OperationPass<ModuleOp>> { + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AssignTaskTargetPass) + + StringRef getArgument() const final { return "assign-task-target"; } + + StringRef getDescription() const final { + return "Assign hardware targets to compute tasks (functions) based on " + "task names"; + } + + void runOnOperation() override { + ModuleOp module = getOperation(); + OpBuilder builder(&getContext()); + + // Statistics + unsigned totalFuncs = 0; + unsigned assignedFuncs = 0; + llvm::DenseMap<StringRef, unsigned> targetStats; + + llvm::errs() << "\n"; + llvm::errs() << "========================================\n"; + llvm::errs() << "AssignTaskTarget Pass\n"; + llvm::errs() << "========================================\n\n"; + + // Walk through all functions in the module + module.walk([&](func::FuncOp funcOp) { + totalFuncs++; + StringRef funcName = funcOp.getName(); + + // Determine hardware target based on function name + StringRef target = matchHardwareTarget(funcName); + + // Set the target.device attribute + 
funcOp->setAttr("target.device", builder.getStringAttr(target)); + + assignedFuncs++; + targetStats[target]++; + + llvm::errs() << "  [ASSIGN] " << funcName << " -> " << target << "\n"; + }); + + // Print summary + llvm::errs() << "\n"; + llvm::errs() << "========================================\n"; + llvm::errs() << "Summary\n"; + llvm::errs() << "========================================\n"; + llvm::errs() << "Total functions: " << totalFuncs << "\n"; + llvm::errs() << "Assigned functions: " << assignedFuncs << "\n"; + + if (!targetStats.empty()) { + llvm::errs() << "\nTarget distribution:\n"; + for (auto &entry : targetStats) { + llvm::errs() << "  " << entry.first << ": " << entry.second + << " function(s)\n"; + } + } + + llvm::errs() << "========================================\n\n"; + } +}; + +} // namespace + +//===----------------------------------------------------------------------===// +// Pass Registration +//===----------------------------------------------------------------------===// + +namespace mlir { + +std::unique_ptr<Pass> createAssignTaskTargetPass() { + return std::make_unique<AssignTaskTargetPass>(); +} + +} // namespace mlir diff --git a/lib/Conversion/AssignTaskTarget/CMakeLists.txt b/lib/Conversion/AssignTaskTarget/CMakeLists.txt new file mode 100644 index 00000000..44c02c8c --- /dev/null +++ b/lib/Conversion/AssignTaskTarget/CMakeLists.txt @@ -0,0 +1,14 @@ +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +add_mlir_conversion_library(MLIRAssignTaskTargetPass + AssignTaskTargetPass.cpp + + DEPENDS + MLIRConversionIncGen + + LINK_LIBS PUBLIC + MLIRFuncDialect + MLIRIR + MLIRPass + MLIRSupport +) diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index 690dae25..3bf336e1 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -5,6 +5,7 @@ add_subdirectory(AffineToNeura) add_subdirectory(LlvmToNeura) add_subdirectory(MemRefToNeura) add_subdirectory(BuiltinToNeura) +add_subdirectory(AssignTaskTarget) 
add_subdirectory(AffineToTaskflow) add_subdirectory(TaskflowToNeura) @@ -23,6 +24,7 @@ target_link_libraries(MLIRConversion INTERFACE MLIRNeuraLlvmToNeuraPass MLIRNeuraMemRefToNeuraPass MLIRNeuraBuiltinToNeuraPass + MLIRAssignTaskTargetPass MLIRAffineToTaskflowPass MLIRTaskflowToNeuraPass ${dialect_libs} diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index e44401d8..afe4c8eb 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -3,6 +3,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp CanonicalizeTaskPass.cpp + PartitionTaskByTarget.cpp ClassifyCountersPass.cpp DEPENDS diff --git a/lib/TaskflowDialect/Transforms/PartitionTaskByTarget.cpp b/lib/TaskflowDialect/Transforms/PartitionTaskByTarget.cpp new file mode 100644 index 00000000..0603303a --- /dev/null +++ b/lib/TaskflowDialect/Transforms/PartitionTaskByTarget.cpp @@ -0,0 +1,208 @@ +//===- PartitionTaskByTarget.cpp - Partition tasks by hardware target ---===// +// +// This pass analyzes taskflow.channel operations and annotates cross-boundary +// channels (channels connecting tasks on different hardware targets).
+// +//===----------------------------------------------------------------------===// + +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +//===----------------------------------------------------------------------===// +// Helper Functions +//===----------------------------------------------------------------------===// + +/// Get the target hardware of a task operation +static StringRef getTaskTarget(TaskflowTaskOp taskOp) { + if (auto targetAttr = taskOp->getAttrOfType<StringAttr>("target")) { + return targetAttr.getValue(); + } + return "CPU"; // Default target — NOTE(review): uppercase "CPU" here, while AssignTaskTarget emits lowercase "cpu"; confirm the intended casing so target comparisons don't spuriously flag cross-boundary channels +} + +/// Check if a value is produced by a TaskflowTaskOp +static TaskflowTaskOp getProducerTask(Value value) { + if (auto taskOp = value.getDefiningOp<TaskflowTaskOp>()) { + return taskOp; + } + // Handle block arguments (function parameters) + return nullptr; +} + +/// Get all consumer tasks of a value +static void getConsumerTasks(Value value, + SmallVectorImpl<TaskflowTaskOp> &consumers) { + for (OpOperand &use : value.getUses()) { + Operation *owner = use.getOwner(); + + // Direct consumer + if (auto taskOp = dyn_cast<TaskflowTaskOp>(owner)) { + consumers.push_back(taskOp); + } + // Through channel + else if (auto channelOp = dyn_cast<TaskflowChannelOp>(owner)) { + getConsumerTasks(channelOp.getTarget(), consumers); + } + } +} + +//===----------------------------------------------------------------------===// +// PartitionTaskByTarget Pass +//===----------------------------------------------------------------------===// + +struct PartitionTaskByTargetPass + : public PassWrapper<PartitionTaskByTargetPass, OperationPass<func::FuncOp>> { + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PartitionTaskByTargetPass) + + StringRef getArgument() const final { return 
"partition-taskflow-by-target"; } + + StringRef getDescription() const final { + return "Annotate cross-boundary channels in taskflow graph"; + } + + void runOnOperation() override { + func::FuncOp func = getOperation(); + OpBuilder builder(&getContext()); + + // Statistics + unsigned totalChannels = 0; + unsigned crossBoundaryChannels = 0; + DenseMap<std::pair<StringRef, StringRef>, unsigned> transferStats; + + llvm::errs() << "\n"; + llvm::errs() << "========================================\n"; + llvm::errs() << "PartitionTaskByTarget Pass\n"; + llvm::errs() << "========================================\n"; + llvm::errs() << "Function: " << func.getName() << "\n\n"; + + // Step 1: Collect all tasks and their targets + SmallVector<TaskflowTaskOp> tasks; + func.walk([&](TaskflowTaskOp taskOp) { + tasks.push_back(taskOp); + StringRef target = getTaskTarget(taskOp); + llvm::errs() << "  Task: " << taskOp.getTaskName() + << " -> " << target << "\n"; + }); + + llvm::errs() << "\nTotal tasks: " << tasks.size() << "\n\n"; + + // Step 2: Process all channels + llvm::errs() << "Analyzing channels:\n"; + llvm::errs() << "----------------------------------------\n"; + + func.walk([&](TaskflowChannelOp channelOp) { + totalChannels++; + + Value source = channelOp.getSource(); + + // Get producer task + TaskflowTaskOp producerTask = getProducerTask(source); + if (!producerTask) { + llvm::errs() << "  Channel #" << totalChannels + << ": skipped (no producer task)\n"; + return; + } + + StringRef producerTarget = getTaskTarget(producerTask); + + // Get consumer tasks + SmallVector<TaskflowTaskOp> consumerTasks; + getConsumerTasks(channelOp.getTarget(), consumerTasks); + + if (consumerTasks.empty()) { + llvm::errs() << "  Channel #" << totalChannels + << ": " << producerTarget + << " -> (no consumers)\n"; + return; + } + + // Check all consumers + bool isCrossBoundary = false; + StringRef consumerTarget; + + for (auto consumerTask : consumerTasks) { + consumerTarget = getTaskTarget(consumerTask); + + if (producerTarget != consumerTarget) { + 
isCrossBoundary = true; + + // Annotate the channel + channelOp->setAttr("cross_boundary", + builder.getUnitAttr()); + channelOp->setAttr("from", + builder.getStringAttr(producerTarget)); + channelOp->setAttr("to", + builder.getStringAttr(consumerTarget)); + + crossBoundaryChannels++; + transferStats[{producerTarget, consumerTarget}]++; + + llvm::errs() << "  Channel #" << totalChannels << ": " + << producerTask.getTaskName() << " (" << producerTarget + << ") -> " + << consumerTask.getTaskName() << " (" << consumerTarget + << ") [CROSS-BOUNDARY]\n"; + + break; // Only need to annotate once + } + } + + if (!isCrossBoundary) { + llvm::errs() << "  Channel #" << totalChannels << ": " + << producerTarget << " -> " << producerTarget + << " [same target]\n"; + } + }); + + // Step 3: Print summary + llvm::errs() << "\n"; + llvm::errs() << "========================================\n"; + llvm::errs() << "Summary\n"; + llvm::errs() << "========================================\n"; + llvm::errs() << "Total channels: " << totalChannels << "\n"; + llvm::errs() << "Cross-boundary channels: " << crossBoundaryChannels << "\n"; + llvm::errs() << "Same-target channels: " + << (totalChannels - crossBoundaryChannels) << "\n"; + + if (!transferStats.empty()) { + llvm::errs() << "\nCross-boundary transfer breakdown:\n"; + for (auto &entry : transferStats) { + llvm::errs() << "  " << entry.first.first << " -> " + << entry.first.second << ": " + << entry.second << " transfer(s)\n"; + } + } + + llvm::errs() << "========================================\n\n"; + } +}; + +} // namespace + +//===----------------------------------------------------------------------===// +// Pass Registration +//===----------------------------------------------------------------------===// + +namespace mlir { +namespace taskflow { + +std::unique_ptr<Pass> createPartitionTaskByTargetPass() { + return std::make_unique<PartitionTaskByTargetPass>(); +} + +} // namespace taskflow +} // namespace mlir diff --git 
a/test/Conversion/AssignTaskTarget/assign-task-target.mlir b/test/Conversion/AssignTaskTarget/assign-task-target.mlir new file mode 100644 index 00000000..32d2e848 --- /dev/null +++ b/test/Conversion/AssignTaskTarget/assign-task-target.mlir @@ -0,0 +1,73 @@ +// RUN: mlir-neura-opt %s --assign-task-target -o %S/Output/assign-task-target.mlir.tmp +// RUN: mlir-neura-opt %s --assign-task-target | FileCheck %s + +// Test the AssignTaskTarget pass with NeRF modular functions + +module { + // CHECK-LABEL: func.func @ray_sampler_func + // CHECK-SAME: attributes {target.device = "cpu"} + func.func @ray_sampler_func(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>) + -> tensor<2x16x3xf32> { + %0 = arith.constant 0.0 : f32 + %1 = tensor.empty() : tensor<2x16x3xf32> + return %1 : tensor<2x16x3xf32> + } + + // CHECK-LABEL: func.func @hash_encoder_func + // CHECK-SAME: attributes {target.device = "doe"} + func.func @hash_encoder_func(%arg0: tensor<2x16x3xf32>) + -> tensor<2x16x4xf32> { + %0 = tensor.empty() : tensor<2x16x4xf32> + return %0 : tensor<2x16x4xf32> + } + + // CHECK-LABEL: func.func @nerf_mlp_func + // CHECK-SAME: attributes {target.device = "cgra"} + func.func @nerf_mlp_func(%arg0: tensor<2x16x4xf32>, %arg1: tensor<2x3xf32>) + -> (tensor<2x16x1xf32>, tensor<2x16x3xf32>) { + %0 = tensor.empty() : tensor<2x16x1xf32> + %1 = tensor.empty() : tensor<2x16x3xf32> + return %0, %1 : tensor<2x16x1xf32>, tensor<2x16x3xf32> + } + + // CHECK-LABEL: func.func @nerf_forward + // CHECK-SAME: attributes {target.device = "cpu"} + func.func @nerf_forward(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>) + -> (tensor<2x16x1xf32>, tensor<2x16x3xf32>) { + %positions = func.call @ray_sampler_func(%arg0, %arg1) + : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x16x3xf32> + + %encoded = func.call @hash_encoder_func(%positions) + : (tensor<2x16x3xf32>) -> tensor<2x16x4xf32> + + %density, %rgb = func.call @nerf_mlp_func(%encoded, %arg1) + : (tensor<2x16x4xf32>, tensor<2x3xf32>) + -> 
(tensor<2x16x1xf32>, tensor<2x16x3xf32>) + + return %density, %rgb : tensor<2x16x1xf32>, tensor<2x16x3xf32> + } + + // CHECK-LABEL: func.func @generic_sampler + // CHECK-SAME: attributes {target.device = "cpu"} + func.func @generic_sampler() { + return + } + + // CHECK-LABEL: func.func @custom_encoder + // CHECK-SAME: attributes {target.device = "doe"} + func.func @custom_encoder() { + return + } + + // CHECK-LABEL: func.func @some_mlp + // CHECK-SAME: attributes {target.device = "cgra"} + func.func @some_mlp() { + return + } + + // CHECK-LABEL: func.func @unknown_function + // CHECK-SAME: attributes {target.device = "cpu"} + func.func @unknown_function() { + return + } +} diff --git a/test/Conversion/AssignTaskTarget/run.log b/test/Conversion/AssignTaskTarget/run.log new file mode 100644 index 00000000..df32c269 --- /dev/null +++ b/test/Conversion/AssignTaskTarget/run.log @@ -0,0 +1,74 @@ +-- Testing: 1 tests, 1 workers -- +PASS: Neura Dialect Tests :: Conversion/AssignTaskTarget/assign-task-target.mlir (1 of 1) +Exit Code: 0 + +Command Output (stderr): +-- +RUN: at line 1: /workspace/dataflow/build/tools/mlir-neura-opt/mlir-neura-opt /workspace/dataflow/test/Conversion/AssignTaskTarget/assign-task-target.mlir --assign-task-target -o /workspace/dataflow/test/Conversion/AssignTaskTarget/Output/assign-task-target.mlir.tmp ++ /workspace/dataflow/build/tools/mlir-neura-opt/mlir-neura-opt /workspace/dataflow/test/Conversion/AssignTaskTarget/assign-task-target.mlir --assign-task-target -o /workspace/dataflow/test/Conversion/AssignTaskTarget/Output/assign-task-target.mlir.tmp +[mlir-neura-opt] No architecture specification file provided, using default configuration + +======================================== +AssignTaskTarget Pass +======================================== + + [ASSIGN] ray_sampler_func -> cpu + [ASSIGN] hash_encoder_func -> doe + [ASSIGN] nerf_mlp_func -> cgra + [ASSIGN] nerf_forward -> cpu + [ASSIGN] generic_sampler -> cpu + [ASSIGN] custom_encoder -> 
doe + [ASSIGN] some_mlp -> cgra + [ASSIGN] unknown_function -> cpu + +======================================== +Summary +======================================== +Total functions: 8 +Assigned functions: 8 + +Target distribution: + doe: 2 function(s) + cpu: 4 function(s) + cgra: 2 function(s) +======================================== + +RUN: at line 2: /workspace/dataflow/build/tools/mlir-neura-opt/mlir-neura-opt /workspace/dataflow/test/Conversion/AssignTaskTarget/assign-task-target.mlir --assign-task-target | /workspace/llvm-project/build/./bin/FileCheck /workspace/dataflow/test/Conversion/AssignTaskTarget/assign-task-target.mlir ++ /workspace/dataflow/build/tools/mlir-neura-opt/mlir-neura-opt /workspace/dataflow/test/Conversion/AssignTaskTarget/assign-task-target.mlir --assign-task-target ++ /workspace/llvm-project/build/./bin/FileCheck /workspace/dataflow/test/Conversion/AssignTaskTarget/assign-task-target.mlir +[mlir-neura-opt] No architecture specification file provided, using default configuration + +======================================== +AssignTaskTarget Pass +======================================== + + [ASSIGN] ray_sampler_func -> cpu + [ASSIGN] hash_encoder_func -> doe + [ASSIGN] nerf_mlp_func -> cgra + [ASSIGN] nerf_forward -> cpu + [ASSIGN] generic_sampler -> cpu + [ASSIGN] custom_encoder -> doe + [ASSIGN] some_mlp -> cgra + [ASSIGN] unknown_function -> cpu + +======================================== +Summary +======================================== +Total functions: 8 +Assigned functions: 8 + +Target distribution: + doe: 2 function(s) + cpu: 4 function(s) + cgra: 2 function(s) +======================================== + + +-- + +******************** + +Testing Time: 0.22s + +Total Discovered Tests: 1 + Passed: 1 (100.00%) diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench index ccc0f9f1..2beecc59 160000 --- a/test/benchmark/CGRA-Bench +++ b/test/benchmark/CGRA-Bench @@ -1 +1 @@ -Subproject commit 
ccc0f9f100462a83942b8bf06247cca032fb817e +Subproject commit 2beecc599bd268f8665344ba2271f48c97db7aa0 diff --git a/test/e2e/bicg/bicg_int_kernel.mlir b/test/e2e/bicg/bicg_int_kernel.mlir new file mode 100644 index 00000000..32f17705 --- /dev/null +++ b/test/e2e/bicg/bicg_int_kernel.mlir @@ -0,0 +1,359 @@ +// Compile the int BiCG kernel to LLVM IR. +// Use -I %S so local headers are visible if needed. +// RUN: clang -S -emit-llvm -O3 -fno-vectorize -fno-unroll-loops -std=c11 \ +// RUN: -I %S/../../benchmark/CGRA-Bench/kernels/bicg -DSMALL_DATASET \ +// RUN: -o %t-kernel-full.ll %S/../../benchmark/CGRA-Bench/kernels/bicg/bicg_int.c + +// RUN: llvm-extract --rfunc=".*kernel.*" %t-kernel-full.ll -o %t-kernel-only.ll +// RUN: mlir-translate --import-llvm %t-kernel-only.ll -o %t-kernel.mlir + +// Lower and map to the Neura accelerator, then generate code. +// RUN: mlir-neura-opt %t-kernel.mlir \ +// RUN: --assign-accelerator \ +// RUN: --lower-llvm-to-neura \ +// RUN: --promote-func-arg-to-const \ +// RUN: --fold-constant \ +// RUN: --canonicalize-return \ +// RUN: --canonicalize-live-in \ +// RUN: --leverage-predicated-value \ +// RUN: --transform-ctrl-to-data-flow \ +// RUN: --fold-constant \ +// RUN: --insert-data-mov \ +// RUN: --map-to-accelerator="mapping-strategy=heuristic" \ +// RUN: --architecture-spec=%S/../../arch_spec/architecture.yaml \ +// RUN: --generate-code -o %t-mapping.mlir + +// RUN: FileCheck %s --input-file=%t-mapping.mlir --check-prefix=MAPPING +// RUN: FileCheck %s --input-file=tmp-generated-instructions.yaml --check-prefix=YAML +// RUN: FileCheck %s --input-file=tmp-generated-instructions.asm --check-prefix=ASM + +// MAPPING: func.func @kernel_bicg_int(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg3: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg4: !llvm.ptr {llvm.nocapture, llvm.noundef}) -> 
!llvm.void attributes {CConv = #llvm.cconv, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage, mapping_info = {compiled_ii = 10 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 9 : i32, res_mii = 3 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}, memory_effects = #llvm.memory_effects, no_unwind, passthrough = ["nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} { +// MAPPING-NEXT: %0 = "neura.constant"() <{value = "%arg3"}> {dfg_id = 0 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 8 : i32, x = 0 : i32, y = 3 : i32}]} : () -> !neura.data +// MAPPING-NEXT: %1 = "neura.constant"() <{value = 0 : i8}> {dfg_id = 1 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 8 : i32, x = 1 : i32, y = 1 : i32}]} : () -> !neura.data +// MAPPING-NEXT: %2 = "neura.constant"() <{value = 32 : i64}> {dfg_id = 2 : i32, mapping_locs = [{id = 14 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 8 : i32, x = 2 : i32, y = 3 : i32}]} : () -> !neura.data +// MAPPING-NEXT: %3 = "neura.grant_once"() <{constant_value = 0 : i64}> {dfg_id = 3 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 0 : i32, y = 0 : i32}]} : () -> !neura.data +// MAPPING-NEXT: %4 = "neura.data_mov"(%0) {dfg_id = 10 : i32, mapping_locs = [{id = 38 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 8 : i32}, {id = 416 : i32, index_per_ii = 9 
: i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %5 = "neura.data_mov"(%1) {dfg_id = 11 : i32, mapping_locs = [{id = 16 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 8 : i32}, {id = 30 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 9 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %6 = "neura.data_mov"(%2) {dfg_id = 12 : i32, mapping_locs = [{id = 43 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 8 : i32}, {id = 417 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: "neura.memset"(%4, %5, %6) <{is_volatile = false}> {dfg_id = 15 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 10 : i32, x = 1 : i32, y = 3 : i32}]} : (!neura.data, !neura.data, !neura.data) -> () +// MAPPING-NEXT: %7 = neura.reserve {dfg_id = 4 : i32} : !neura.data +// MAPPING-NEXT: %8 = "neura.data_mov"(%3) {dfg_id = 13 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %9 = neura.phi_start %8, %7 {dfg_id = 16 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: %10 = neura.reserve {dfg_id = 5 : i32} : !neura.data +// MAPPING-NEXT: %11 = "neura.data_mov"(%3) {dfg_id = 14 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}, {id = 
0 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %12 = neura.phi_start %11, %10 {dfg_id = 17 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: %13 = "neura.data_mov"(%12) {dfg_id = 23 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 2 : i32}, {id = 2 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 3 : i32}, {id = 2 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 4 : i32}, {id = 2 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %14 = "neura.gep"(%13) <{operandSegmentSizes = array}> {dfg_id = 29 : i32, lhs_value = "%arg4", mapping_locs = [{id = 0 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %15 = "neura.data_mov"(%14) {dfg_id = 42 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}, {id = 0 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 7 : i32}, {id = 32 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: "neura.store"(%15) {dfg_id = 49 
: i32, lhs_value = 0 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 9 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data) -> () +// MAPPING-NEXT: %16 = "neura.data_mov"(%12) {dfg_id = 22 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 32 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 32 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %17 = "neura.gep"(%16) <{operandSegmentSizes = array}> {dfg_id = 28 : i32, lhs_value = "%arg2", mapping_locs = [{id = 1 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 5 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %18 = "neura.data_mov"(%12) {dfg_id = 21 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %19 = "neura.shl"(%18) {dfg_id = 27 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 0 : i32}], rhs_value = 5 : i64} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %20 = "neura.data_mov"(%19) {dfg_id = 38 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %21 = "neura.gep"(%20) <{operandSegmentSizes = array}> {dfg_id = 46 : i32, lhs_value = "%arg0", mapping_locs = [{id = 0 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : 
i32, resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %22 = neura.reserve {dfg_id = 6 : i32} : !neura.data +// MAPPING-NEXT: %23 = "neura.data_mov"(%9) {dfg_id = 19 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 257 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 2 : i32}, {id = 257 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 3 : i32}, {id = 257 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}, {id = 257 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 5 : i32}, {id = 257 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %24 = neura.phi_start %23, %22 {dfg_id = 25 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 7 : i32, x = 0 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: %25 = neura.reserve {dfg_id = 7 : i32} : !neura.data +// MAPPING-NEXT: %26 = "neura.data_mov"(%12) {dfg_id = 20 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 128 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %27 = neura.phi_start %26, %25 {dfg_id = 26 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, 
resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: %28 = neura.reserve {dfg_id = 8 : i32} : !neura.data +// MAPPING-NEXT: %29 = "neura.data_mov"(%9) {dfg_id = 18 : i32, mapping_locs = [{id = 128 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %30 = neura.phi_start %29, %28 {dfg_id = 24 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: %31 = "neura.data_mov"(%21) {dfg_id = 55 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %32 = "neura.data_mov"(%30) {dfg_id = 33 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 1 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 3 : i32}, {id = 1 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %33 = "neura.gep"(%31, %32) <{operandSegmentSizes = array}> {dfg_id = 61 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPING-NEXT: %34 = "neura.data_mov"(%33) {dfg_id = 67 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> 
!neura.data +// MAPPING-NEXT: %35 = "neura.load"(%34) {dfg_id = 71 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 6 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %36 = "neura.data_mov"(%17) {dfg_id = 39 : i32, mapping_locs = [{id = 3 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %37 = "neura.load"(%36) {dfg_id = 47 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 6 : i32, x = 2 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %38 = "neura.data_mov"(%37) {dfg_id = 56 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %39 = "neura.data_mov"(%35) {dfg_id = 78 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %40 = "neura.mul"(%38, %39) {dfg_id = 84 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPING-NEXT: %41 = "neura.data_mov"(%30) {dfg_id = 32 : i32, mapping_locs = [{id = 129 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 2 : i32}, {id = 129 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 3 : i32}, {id = 129 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : 
i32}, {id = 129 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %42 = "neura.gep"(%41) <{operandSegmentSizes = array}> {dfg_id = 45 : i32, lhs_value = "%arg3", mapping_locs = [{id = 4 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %43 = "neura.data_mov"(%42) {dfg_id = 54 : i32, mapping_locs = [{id = 128 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}, {id = 128 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %44 = "neura.load"(%43) {dfg_id = 60 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 8 : i32, x = 0 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %45 = "neura.data_mov"(%44) {dfg_id = 66 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %46 = "neura.data_mov"(%40) {dfg_id = 91 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 7 : i32}, {id = 160 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %47 = "neura.add"(%45, %46) {dfg_id = 94 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 9 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data, !neura.data) -> !neura.data 
+// MAPPING-NEXT: %48 = "neura.data_mov"(%47) {dfg_id = 99 : i32, mapping_locs = [{id = 162 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 9 : i32}, {id = 162 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %49 = "neura.data_mov"(%42) {dfg_id = 53 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 6 : i32}, {id = 163 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 3 : i32, resource = "register", time_step = 7 : i32}, {id = 163 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 3 : i32, resource = "register", time_step = 8 : i32}, {id = 163 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 3 : i32, resource = "register", time_step = 9 : i32}, {id = 163 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 3 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: "neura.store"(%48, %49) {dfg_id = 103 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 11 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data, !neura.data) -> () +// MAPPING-NEXT: %50 = "neura.data_mov"(%30) {dfg_id = 31 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 256 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 256 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> 
!neura.data +// MAPPING-NEXT: %51 = "neura.gep"(%50) <{operandSegmentSizes = array}> {dfg_id = 44 : i32, lhs_value = "%arg1", mapping_locs = [{id = 8 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %52 = "neura.data_mov"(%51) {dfg_id = 52 : i32, mapping_locs = [{id = 256 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %53 = "neura.load"(%52) {dfg_id = 59 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %54 = "neura.data_mov"(%53) {dfg_id = 65 : i32, mapping_locs = [{id = 25 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 6 : i32}, {id = 11 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 7 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %55 = "neura.data_mov"(%35) {dfg_id = 77 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 6 : i32}, {id = 0 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %56 = "neura.mul"(%54, %55) {dfg_id = 83 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 8 : i32, x = 0 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPING-NEXT: %57 = "neura.data_mov"(%14) {dfg_id = 41 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 6 : i32}, {id = 32 : i32, 
index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %58 = "neura.load"(%57) {dfg_id = 48 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 8 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %59 = "neura.data_mov"(%58) {dfg_id = 57 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %60 = "neura.data_mov"(%56) {dfg_id = 90 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %61 = "neura.add"(%59, %60) {dfg_id = 93 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 9 : i32, x = 0 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPING-NEXT: %62 = "neura.data_mov"(%61) {dfg_id = 98 : i32, mapping_locs = [{id = 1 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}, {id = 1 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %63 = "neura.data_mov"(%14) {dfg_id = 40 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 6 : i32}, {id = 2 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 7 : i32}, {id = 2 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 
: i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 8 : i32}, {id = 2 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 9 : i32}, {id = 2 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: "neura.store"(%62, %63) {dfg_id = 102 : i32, mapping_locs = [{id = 0 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 11 : i32, x = 0 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> () +// MAPPING-NEXT: %64 = "neura.data_mov"(%30) {dfg_id = 30 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %65 = "neura.add"(%64) {dfg_id = 43 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 1 : i32}], rhs_value = 1 : i64} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %66 = "neura.data_mov"(%65) {dfg_id = 51 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %67 = "neura.icmp"(%66) <{cmpType = "eq"}> {dfg_id = 58 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 1 : i32}], rhs_value = 8 : i64} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %68 = "neura.data_mov"(%67) {dfg_id = 64 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %69 = 
"neura.not"(%68) {dfg_id = 70 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 5 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %70 = "neura.data_mov"(%65) {dfg_id = 50 : i32, mapping_locs = [{id = 161 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 3 : i32}, {id = 161 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}, {id = 161 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 5 : i32}, {id = 161 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %71 = "neura.data_mov"(%69) {dfg_id = 76 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 160 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %72 = neura.grant_predicate %70, %71 {dfg_id = 82 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: neura.ctrl_mov %72 -> %28 {dfg_id = 89 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 7 : i32}, {id = 129 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 8 : i32}, {id = 129 : i32, index_per_ii = 9 : i32, 
invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}, {id = 129 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 10 : i32}, {id = 129 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 11 : i32}]} : !neura.data !neura.data +// MAPPING-NEXT: %73 = "neura.data_mov"(%27) {dfg_id = 37 : i32, mapping_locs = [{id = 130 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 4 : i32}, {id = 130 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 5 : i32}, {id = 130 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 6 : i32}, {id = 130 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 7 : i32}, {id = 130 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %74 = "neura.data_mov"(%69) {dfg_id = 75 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 5 : i32}, {id = 131 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 3 : i32, resource = "register", time_step = 6 : i32}, {id = 131 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 3 : i32, resource = "register", time_step = 7 : i32}, {id = 131 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 3 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %75 = 
neura.grant_predicate %73, %74 {dfg_id = 81 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 9 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: neura.ctrl_mov %75 -> %25 {dfg_id = 88 : i32, mapping_locs = [{id = 130 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 9 : i32}, {id = 130 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 10 : i32}, {id = 130 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 11 : i32}, {id = 130 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 12 : i32}, {id = 130 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 13 : i32}]} : !neura.data !neura.data +// MAPPING-NEXT: %76 = "neura.data_mov"(%24) {dfg_id = 35 : i32, mapping_locs = [{id = 256 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}, {id = 24 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %77 = "neura.data_mov"(%69) {dfg_id = 74 : i32, mapping_locs = [{id = 16 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 5 : i32}, {id = 289 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 6 : i32}, {id = 289 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 7 : i32}, {id = 289 : i32, index_per_ii = 8 : i32, 
invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %78 = neura.grant_predicate %76, %77 {dfg_id = 80 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 9 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: neura.ctrl_mov %78 -> %22 {dfg_id = 87 : i32, mapping_locs = [{id = 27 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 9 : i32}, {id = 258 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 10 : i32}, {id = 258 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 11 : i32}, {id = 258 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 12 : i32}, {id = 258 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 13 : i32}, {id = 258 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 14 : i32}, {id = 258 : i32, index_per_ii = 5 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 15 : i32}, {id = 258 : i32, index_per_ii = 6 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 16 : i32}]} : !neura.data !neura.data +// MAPPING-NEXT: %79 = "neura.data_mov"(%27) {dfg_id = 36 : i32, mapping_locs = [{id = 128 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %80 = "neura.data_mov"(%67) {dfg_id = 63 : i32, 
mapping_locs = [{id = 13 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %81 = neura.grant_predicate %79, %80 {dfg_id = 69 : i32, mapping_locs = [{id = 4 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 5 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: %82 = "neura.data_mov"(%24) {dfg_id = 34 : i32, mapping_locs = [{id = 24 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 7 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %83 = "neura.data_mov"(%67) {dfg_id = 62 : i32, mapping_locs = [{id = 16 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 288 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 288 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}, {id = 288 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %84 = neura.grant_predicate %82, %83 {dfg_id = 68 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 8 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: %85 = "neura.data_mov"(%81) {dfg_id = 73 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 5 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %86 = "neura.add"(%85) {dfg_id = 79 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 6 : 
i32, x = 1 : i32, y = 1 : i32}], rhs_value = 1 : i64} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %87 = "neura.data_mov"(%86) {dfg_id = 86 : i32, mapping_locs = [{id = 14 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %88 = "neura.icmp"(%87) <{cmpType = "eq"}> {dfg_id = 92 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 7 : i32, x = 2 : i32, y = 1 : i32}], rhs_value = 8 : i64} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %89 = "neura.data_mov"(%88) {dfg_id = 97 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %90 = "neura.not"(%89) {dfg_id = 101 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 8 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %91 = "neura.data_mov"(%86) {dfg_id = 85 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 6 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 6 : i32}, {id = 3 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 7 : i32}, {id = 64 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %92 = "neura.data_mov"(%90) {dfg_id = 106 : i32, mapping_locs = [{id = 19 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %93 = neura.grant_predicate %91, %92 {dfg_id = 109 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, resource = "tile", 
time_step = 9 : i32, x = 2 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: neura.ctrl_mov %93 -> %10 {dfg_id = 111 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 9 : i32}, {id = 2 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 10 : i32}, {id = 1 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 11 : i32}]} : !neura.data !neura.data +// MAPPING-NEXT: %94 = "neura.data_mov"(%84) {dfg_id = 72 : i32, mapping_locs = [{id = 29 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 8 : i32}, {id = 160 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %95 = "neura.data_mov"(%90) {dfg_id = 105 : i32, mapping_locs = [{id = 17 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 8 : i32}, {id = 161 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %96 = neura.grant_predicate %94, %95 {dfg_id = 108 : i32, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 10 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: neura.ctrl_mov %96 -> %7 {dfg_id = 110 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 10 : i32}]} : !neura.data !neura.data +// MAPPING-NEXT: %97 = "neura.data_mov"(%88) {dfg_id = 95 : i32, mapping_locs = [{id = 193 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource 
= "register", time_step = 7 : i32}, {id = 193 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %98 = "neura.data_mov"(%88) {dfg_id = 96 : i32, mapping_locs = [{id = 194 : i32, index_per_ii = 7 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 7 : i32}, {id = 194 : i32, index_per_ii = 8 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %99 = neura.grant_predicate %97, %98 {dfg_id = 100 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 9 : i32, x = 2 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: %100 = "neura.data_mov"(%99) {dfg_id = 104 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 9 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: neura.return_void %100 : !neura.data {dfg_id = 107 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 10 : i32, x = 2 : i32, y = 1 : i32}]} +// MAPPING-NEXT: neura.yield {dfg_id = 9 : i32} +// MAPPING-NEXT: } +// MAPPING-NEXT: } + + +// YAML: - column: 0 +// YAML-NEXT: row: 0 +// YAML-NEXT: core_id: "0" +// YAML-NEXT: entries: +// YAML-NEXT: - entry_id: "entry0" +// YAML-NEXT: instructions: +// YAML-NEXT: - index_per_ii: 0 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "GRANT_ONCE" +// YAML-NEXT: id: 3 +// YAML-NEXT: time_step: 0 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "#0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "NORTH" +// YAML-NEXT: 
color: "RED" +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 1 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "STORE" +// YAML-NEXT: id: 102 +// YAML-NEXT: time_step: 11 +// YAML-NEXT: invalid_iterations: 1 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "$1" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "$2" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - opcode: "CTRL_MOV" +// YAML-NEXT: id: 111 +// YAML-NEXT: time_step: 11 +// YAML-NEXT: invalid_iterations: 1 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "EAST" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "$1" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 2 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "PHI_START" +// YAML-NEXT: id: 17 +// YAML-NEXT: time_step: 2 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "$1" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "$2" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "EAST" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "NORTH" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 3 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "SHL" +// YAML-NEXT: id: 27 +// YAML-NEXT: time_step: 3 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "#5" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - opcode: "DATA_MOV" +// YAML-NEXT: id: 33 +// YAML-NEXT: time_step: 3 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "NORTH" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "$1" +// 
YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 4 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "GEP" +// YAML-NEXT: id: 46 +// YAML-NEXT: time_step: 4 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 5 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "GEP" +// YAML-NEXT: id: 61 +// YAML-NEXT: time_step: 5 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "$1" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "EAST" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 6 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "GEP" +// YAML-NEXT: id: 29 +// YAML-NEXT: time_step: 6 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "$2" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "EAST" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "$2" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 7 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "DATA_MOV" +// YAML-NEXT: id: 420000 +// YAML-NEXT: time_step: 7 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "EAST" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - opcode: "DATA_MOV" +// YAML-NEXT: id: 77 +// YAML-NEXT: time_step: 7 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "EAST" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 8 +// YAML-NEXT: operations: +// 
YAML-NEXT: - opcode: "MUL" +// YAML-NEXT: id: 83 +// YAML-NEXT: time_step: 8 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "NORTH" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - index_per_ii: 9 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "ADD" +// YAML-NEXT: id: 93 +// YAML-NEXT: time_step: 9 +// YAML-NEXT: invalid_iterations: 0 +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "EAST" +// YAML-NEXT: color: "RED" +// YAML-NEXT: - operand: "$0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: +// YAML-NEXT: - operand: "$1" +// YAML-NEXT: color: "RED" + +// ASM: PE(0,0): +// ASM-NEXT: { +// ASM-NEXT: GRANT_ONCE, [#0] -> [NORTH, RED], [$0] (t=0, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=0) +// ASM-NEXT: { +// ASM-NEXT: STORE, [$1], [$2] (t=11, inv_iters=1) +// ASM-NEXT: CTRL_MOV, [EAST, RED] -> [$1] (t=11, inv_iters=1) +// ASM-NEXT: } (idx_per_ii=1) +// ASM-NEXT: { +// ASM-NEXT: PHI_START, [$0], [$1] -> [$2], [EAST, RED], [$0], [NORTH, RED] (t=2, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=2) +// ASM-NEXT: { +// ASM-NEXT: SHL, [$0], [#5] -> [$0] (t=3, inv_iters=0) +// ASM-NEXT: DATA_MOV, [NORTH, RED] -> [$1] (t=3, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=3) +// ASM-NEXT: { +// ASM-NEXT: GEP, [$0] -> [$0] (t=4, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=4) +// ASM-NEXT: { +// ASM-NEXT: GEP, [$0], [$1] -> [EAST, RED] (t=5, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=5) +// ASM-NEXT: { +// ASM-NEXT: GEP, [$2] -> [$0], [EAST, RED], [$2] (t=6, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=6) +// ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [$0] -> [EAST, RED] (t=7, inv_iters=0) +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [$0] (t=7, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=7) +// ASM-NEXT: { +// ASM-NEXT: MUL, [NORTH, RED], [$0] -> [$0] (t=8, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=8) +// 
ASM-NEXT: { +// ASM-NEXT: ADD, [EAST, RED], [$0] -> [$1] (t=9, inv_iters=0) +// ASM-NEXT: } (idx_per_ii=9) + diff --git a/test/e2e/relu/relu_kernel.mlir b/test/e2e/relu/relu_kernel.mlir index b5c46f98..a6588a54 100644 --- a/test/e2e/relu/relu_kernel.mlir +++ b/test/e2e/relu/relu_kernel.mlir @@ -33,51 +33,52 @@ // Check the mapped MLIR contains key operations with full statements. // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING // MAPPING: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> {dfg_id = 0 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data -// MAPPING: %1 = neura.reserve {dfg_id = 1 : i32} : !neura.data -// MAPPING: %2 = "neura.data_mov"(%0) {dfg_id = 3 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %3 = neura.phi_start %2, %1 {dfg_id = 4 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING: %4 = "neura.data_mov"(%3) {dfg_id = 7 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %5 = "neura.gep"(%4) <{operandSegmentSizes = array}> {dfg_id = 9 : i32, lhs_value = "%arg0", mapping_locs = [{id = 11 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %6 = "neura.data_mov"(%5) {dfg_id = 12 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : 
(!neura.data) -> !neura.data -// MAPPING: %7 = "neura.load"(%6) {dfg_id = 14 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 3 : i32, y = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %8 = "neura.data_mov"(%7) {dfg_id = 19 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {dfg_id = 22 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 0 : i32} : (!neura.data) -> !neura.data -// MAPPING: %10 = "neura.data_mov"(%3) {dfg_id = 6 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 18 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 224 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 224 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %11 = "neura.data_mov"(%9) {dfg_id = 26 : i32, mapping_locs = [{id = 36 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %12 = neura.grant_predicate %10, %11 {dfg_id = 30 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 3 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING: %13 = "neura.data_mov"(%7) {dfg_id = 18 : i32, mapping_locs = [{id = 37 
: i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}, {id = 480 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}, {id = 480 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 480 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %14 = "neura.data_mov"(%9) {dfg_id = 25 : i32, mapping_locs = [{id = 37 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 481 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 5 : i32}, {id = 481 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %15 = neura.grant_predicate %13, %14 {dfg_id = 29 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 3 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING: %16 = "neura.data_mov"(%12) {dfg_id = 33 : i32, mapping_locs = [{id = 224 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %17 = "neura.gep"(%16) <{operandSegmentSizes = array}> {dfg_id = 34 : i32, lhs_value = "%arg1", mapping_locs = [{id = 7 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 3 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %18 = "neura.data_mov"(%17) {dfg_id = 36 : i32, mapping_locs = [{id = 21 : 
i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %19 = "neura.load"(%18) {dfg_id = 37 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %20 = "neura.data_mov"(%19) {dfg_id = 38 : i32, mapping_locs = [{id = 20 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}, {id = 34 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %21 = "neura.data_mov"(%15) {dfg_id = 32 : i32, mapping_locs = [{id = 46 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}, {id = 448 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %22 = "neura.add"(%20, %21) {dfg_id = 39 : i32, mapping_locs = [{id = 14 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 2 : i32, y = 3 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING: %23 = "neura.data_mov"(%22) {dfg_id = 40 : i32, mapping_locs = [{id = 448 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %24 = "neura.data_mov"(%17) {dfg_id = 35 : i32, mapping_locs = [{id = 23 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}, {id = 37 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}, {id = 46 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}, 
{id = 449 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: "neura.store"(%23, %24) {dfg_id = 41 : i32, mapping_locs = [{id = 14 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 10 : i32, x = 2 : i32, y = 3 : i32}]} : (!neura.data, !neura.data) -> () -// MAPPING: %25 = "neura.data_mov"(%3) {dfg_id = 5 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %26 = "neura.add"(%25) {dfg_id = 8 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 2 : i32, y = 2 : i32}], rhs_value = 1 : i64} : (!neura.data) -> !neura.data -// MAPPING: %27 = "neura.data_mov"(%26) {dfg_id = 11 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {dfg_id = 13 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}], rhs_value = 32 : i64} : (!neura.data) -> !neura.data -// MAPPING: %29 = "neura.data_mov"(%28) {dfg_id = 17 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %30 = "neura.not"(%29) {dfg_id = 21 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %31 = 
"neura.data_mov"(%26) {dfg_id = 10 : i32, mapping_locs = [{id = 321 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 2 : i32}, {id = 321 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 3 : i32}, {id = 321 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %32 = "neura.data_mov"(%30) {dfg_id = 24 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %33 = neura.grant_predicate %31, %32 {dfg_id = 28 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING: neura.ctrl_mov %33 -> %1 {dfg_id = 31 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : !neura.data !neura.data -// MAPPING: %34 = "neura.data_mov"(%28) {dfg_id = 15 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}, {id = 192 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}, {id = 192 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 192 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}, {id = 192 : i32, index_per_ii = 2 : i32, invalid_iterations = 
1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %35 = "neura.data_mov"(%28) {dfg_id = 16 : i32, mapping_locs = [{id = 322 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 3 : i32}, {id = 33 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 193 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 5 : i32}, {id = 193 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 6 : i32}, {id = 193 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: %36 = neura.grant_predicate %34, %35 {dfg_id = 20 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 8 : i32, x = 2 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING: %37 = "neura.data_mov"(%36) {dfg_id = 23 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data) -> !neura.data -// MAPPING: neura.return_void %37 : !neura.data {dfg_id = 27 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 2 : i32, y = 1 : i32}]} -// MAPPING: neura.yield {dfg_id = 2 : i32} +// MAPPING: %1 = "neura.grant_once"() <{constant_value = 0 : i32}> {dfg_id = 1 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 3 : i32}]} : () -> !neura.data +// MAPPING: %2 = 
neura.reserve {dfg_id = 2 : i32} : !neura.data +// MAPPING: %3 = "neura.data_mov"(%1) {dfg_id = 6 : i32, mapping_locs = [{id = 39 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %4 = neura.phi_start %3, %2 {dfg_id = 8 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING: %5 = neura.reserve {dfg_id = 3 : i32} : !neura.data +// MAPPING: %6 = "neura.data_mov"(%0) {dfg_id = 5 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %7 = neura.phi_start %6, %5 {dfg_id = 7 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING: %8 = "neura.data_mov"(%7) {dfg_id = 11 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %9 = "neura.gep"(%8) <{operandSegmentSizes = array}> {dfg_id = 16 : i32, lhs_value = "%arg0", mapping_locs = [{id = 11 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %10 = "neura.data_mov"(%9) {dfg_id = 20 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %11 = "neura.load"(%10) {dfg_id = 22 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 
: i32, x = 3 : i32, y = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %12 = "neura.data_mov"(%11) {dfg_id = 27 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %13 = "neura.icmp"(%12) <{cmpType = "sge"}> {dfg_id = 30 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 0 : i32} : (!neura.data) -> !neura.data +// MAPPING: %14 = "neura.data_mov"(%13) {dfg_id = 34 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 31 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %15 = "neura.data_mov"(%11) {dfg_id = 26 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}, {id = 31 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 288 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %16 = "neura.data_mov"(%4) {dfg_id = 13 : i32, mapping_locs = [{id = 24 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 289 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %17 = "neura.sel"(%14, %15, %16) {dfg_id = 38 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 1 : i32, y = 2 : 
i32}]} : (!neura.data, !neura.data, !neura.data) -> !neura.data +// MAPPING: %18 = "neura.data_mov"(%7) {dfg_id = 10 : i32, mapping_locs = [{id = 34 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 448 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}, {id = 448 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 448 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %19 = "neura.gep"(%18) <{operandSegmentSizes = array}> {dfg_id = 15 : i32, lhs_value = "%arg1", mapping_locs = [{id = 14 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %20 = "neura.data_mov"(%17) {dfg_id = 41 : i32, mapping_locs = [{id = 30 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %21 = "neura.data_mov"(%19) {dfg_id = 19 : i32, mapping_locs = [{id = 448 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 43 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: "neura.store"(%20, %21) {dfg_id = 42 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 3 : i32}]} : (!neura.data, !neura.data) -> () +// MAPPING: %22 = "neura.data_mov"(%7) {dfg_id = 9 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : 
i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %23 = "neura.add"(%22) {dfg_id = 14 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 2 : i32, y = 2 : i32}], rhs_value = 1 : i64} : (!neura.data) -> !neura.data +// MAPPING: %24 = "neura.data_mov"(%23) {dfg_id = 18 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %25 = "neura.icmp"(%24) <{cmpType = "eq"}> {dfg_id = 21 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}], rhs_value = 32 : i64} : (!neura.data) -> !neura.data +// MAPPING: %26 = "neura.data_mov"(%25) {dfg_id = 25 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %27 = "neura.not"(%26) {dfg_id = 29 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %28 = "neura.data_mov"(%23) {dfg_id = 17 : i32, mapping_locs = [{id = 321 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 2 : i32}, {id = 321 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 3 : i32}, {id = 321 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %29 = 
"neura.data_mov"(%27) {dfg_id = 33 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %30 = neura.grant_predicate %28, %29 {dfg_id = 37 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING: neura.ctrl_mov %30 -> %5 {dfg_id = 40 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : !neura.data !neura.data +// MAPPING: %31 = "neura.data_mov"(%4) {dfg_id = 12 : i32, mapping_locs = [{id = 256 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}, {id = 24 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}, {id = 288 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %32 = "neura.data_mov"(%27) {dfg_id = 32 : i32, mapping_locs = [{id = 34 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 43 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}, {id = 42 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %33 = neura.grant_predicate %31, %32 {dfg_id = 36 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING: 
neura.ctrl_mov %33 -> %2 {dfg_id = 39 : i32, mapping_locs = [{id = 27 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}, {id = 256 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : !neura.data !neura.data +// MAPPING: %34 = "neura.data_mov"(%25) {dfg_id = 23 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}, {id = 192 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %35 = "neura.data_mov"(%25) {dfg_id = 24 : i32, mapping_locs = [{id = 322 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 3 : i32}, {id = 33 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: %36 = neura.grant_predicate %34, %35 {dfg_id = 28 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING: %37 = "neura.data_mov"(%36) {dfg_id = 31 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING: neura.return_void %37 : !neura.data {dfg_id = 35 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 2 : i32, y = 1 : i32}]} +// MAPPING: neura.yield {dfg_id = 4 : i32} // MAPPING: } // MAPPING: } -// YAML: array_config: +// YAML: array_config: // YAML-NEXT: columns: 4 // YAML-NEXT: rows: 4 // 
YAML-NEXT: compiled_ii: 5 @@ -90,56 +91,31 @@ // YAML-NEXT: instructions: // YAML-NEXT: - index_per_ii: 0 // YAML-NEXT: operations: -// YAML-NEXT: - opcode: "DATA_MOV" -// YAML-NEXT: id: 16 +// YAML-NEXT: - opcode: "GRANT_PREDICATE" +// YAML-NEXT: id: 28 // YAML-NEXT: time_step: 5 // YAML-NEXT: invalid_iterations: 1 // YAML-NEXT: src_operands: -// YAML-NEXT: - operand: "NORTH" -// YAML-NEXT: color: "RED" -// YAML-NEXT: dst_operands: -// YAML-NEXT: - operand: "$1" +// YAML-NEXT: - operand: "$0" // YAML-NEXT: color: "RED" -// YAML-NEXT: - index_per_ii: 2 -// YAML-NEXT: operations: -// YAML-NEXT: - opcode: "DATA_MOV" -// YAML-NEXT: id: 60001 -// YAML-NEXT: time_step: 2 -// YAML-NEXT: invalid_iterations: 0 -// YAML-NEXT: src_operands: // YAML-NEXT: - operand: "NORTH" // YAML-NEXT: color: "RED" // YAML-NEXT: dst_operands: -// YAML-NEXT: - operand: "EAST" -// YAML-NEXT: color: "RED" -// YAML-NEXT: - opcode: "LOAD" -// YAML-NEXT: id: 37 -// YAML-NEXT: time_step: 7 -// YAML-NEXT: invalid_iterations: 1 -// YAML-NEXT: src_operands: -// YAML-NEXT: - operand: "EAST" -// YAML-NEXT: color: "RED" -// YAML-NEXT: dst_operands: -// YAML-NEXT: - operand: "NORTH" +// YAML-NEXT: - operand: "$0" // YAML-NEXT: color: "RED" -// YAML-NEXT: - index_per_ii: 3 +// YAML-NEXT: - index_per_ii: 1 // YAML-NEXT: operations: -// YAML-NEXT: - opcode: "GRANT_PREDICATE" -// YAML-NEXT: id: 20 -// YAML-NEXT: time_step: 8 +// YAML-NEXT: - opcode: "RETURN_VOID" +// YAML-NEXT: id: 35 +// YAML-NEXT: time_step: 6 // YAML-NEXT: invalid_iterations: 1 // YAML-NEXT: src_operands: // YAML-NEXT: - operand: "$0" // YAML-NEXT: color: "RED" -// YAML-NEXT: - operand: "$1" -// YAML-NEXT: color: "RED" -// YAML-NEXT: dst_operands: -// YAML-NEXT: - operand: "$0" -// YAML-NEXT: color: "RED" // YAML-NEXT: - index_per_ii: 4 // YAML-NEXT: operations: // YAML-NEXT: - opcode: "DATA_MOV" -// YAML-NEXT: id: 15 +// YAML-NEXT: id: 23 // YAML-NEXT: time_step: 4 // YAML-NEXT: invalid_iterations: 0 // YAML-NEXT: src_operands: @@ 
-148,26 +124,15 @@ // YAML-NEXT: dst_operands: // YAML-NEXT: - operand: "$0" // YAML-NEXT: color: "RED" -// YAML-NEXT: - opcode: "RETURN_VOID" -// YAML-NEXT: id: 27 -// YAML-NEXT: time_step: 9 -// YAML-NEXT: invalid_iterations: 1 -// YAML-NEXT: src_operands: -// YAML-NEXT: - operand: "$0" -// YAML-NEXT: color: "RED" // ASM: # Compiled II: 5 -// ASM: PE(3,2): +// ASM: PE(2,1): // ASM-NEXT: { -// ASM-NEXT: GRANT_ONCE, [#0] -> [WEST, RED] (t=0, inv_iters=0) +// ASM-NEXT: GRANT_PREDICATE, [$0], [NORTH, RED] -> [$0] (t=5, inv_iters=1) // ASM-NEXT: } (idx_per_ii=0) // ASM-NEXT: { -// ASM-NEXT: GEP, [WEST, RED] -> [$0] (t=2, inv_iters=0) -// ASM-NEXT: DATA_MOV, [SOUTH, RED] -> [NORTH, RED] (t=7, inv_iters=1) -// ASM-NEXT: } (idx_per_ii=2) -// ASM-NEXT: { -// ASM-NEXT: LOAD, [$0] -> [$0], [NORTH, RED] (t=3, inv_iters=0) -// ASM-NEXT: } (idx_per_ii=3) +// ASM-NEXT: RETURN_VOID, [$0] (t=6, inv_iters=1) +// ASM-NEXT: } (idx_per_ii=1) // ASM-NEXT: { -// ASM-NEXT: ICMP_SGT, [$0], [#0] -> [SOUTH, RED], [NORTH, RED] (t=4, inv_iters=0) +// ASM-NEXT: DATA_MOV, [NORTH, RED] -> [$0] (t=4, inv_iters=0) // ASM-NEXT: } (idx_per_ii=4) diff --git a/test/multi-cgra/taskflow/nerf_hash_grid/build_modular_mlir.py b/test/multi-cgra/taskflow/nerf_hash_grid/build_modular_mlir.py new file mode 100755 index 00000000..6b79e5ca --- /dev/null +++ b/test/multi-cgra/taskflow/nerf_hash_grid/build_modular_mlir.py @@ -0,0 +1,616 @@ +#!/cluster/home/tangyz/.conda/envs/torch-mlir-env/bin/python +"""Build modular MLIR from NeRF PyTorch components. + +This script compiles individual NeRF components (ray sampler, hash encoder, +MLP) into separate MLIR modules and merges them into a single modular MLIR +file with a top-level orchestrator function. 
+ +Features: + - Automatic function signature extraction + - Signature-based top-level function generation + - MLIR verification with mlir-opt + - Command-line output path specification +""" + +import argparse +import os +import re +import subprocess +import sys + +import torch +import torch_mlir + +from nerf_components import HashGridEncoder +from nerf_components import NeRFMLP +from nerf_components import RaySampler + + +def compile_single_module(module, inputs, module_name): + """Compiles a single PyTorch module to Linalg MLIR. + + Args: + module: PyTorch module to compile. + inputs: Tuple of input tensors for tracing. + module_name: Name for the module (used in debug output). + + Returns: + MLIR string representation, or None if compilation fails. + """ + print(f'\nCompiling module: {module_name}') + print('-' * 70) + print(f' Input shapes: {[x.shape for x in inputs]}') + + try: + mlir_module = torch_mlir.compile( + module, + inputs, + output_type=torch_mlir.OutputType.LINALG_ON_TENSORS, + use_tracing=True) + + mlir_str = str(mlir_module) + + # Save debug file. + debug_file = f'{module_name}_module.mlir' + with open(debug_file, 'w') as f: + f.write(mlir_str) + + print(f' ✓ Compilation successful: {debug_file}') + print(f' Size: {len(mlir_str):,} characters') + + return mlir_str + + except Exception as e: + print(f' ✗ Compilation failed: {e}') + import traceback + traceback.print_exc() + return None + + +def extract_function_signature(mlir_str): + """Extracts function signature from MLIR. + + Args: + mlir_str: MLIR string containing a @forward function. + + Returns: + Tuple of (input_types, output_types, full_signature_string). + Returns (None, None, None) if extraction fails. + """ + # Match function signature: + # func.func @forward(%arg0: type0, ...) -> (type_out0, ...) 
+ pattern = r'func\.func @forward\((.*?)\)\s*->\s*\(([^)]+)\)' + match = re.search(pattern, mlir_str, re.DOTALL) + + if not match: + # Try single return value: -> type + pattern = r'func\.func @forward\((.*?)\)\s*->\s*([^\s{]+)' + match = re.search(pattern, mlir_str, re.DOTALL) + if not match: + print(' ⚠ Cannot extract function signature') + return None, None, None + + inputs_str = match.group(1).strip() + outputs_str = match.group(2).strip() + output_types = [outputs_str] + else: + inputs_str = match.group(1).strip() + outputs_str = match.group(2).strip() + output_types = [t.strip() for t in outputs_str.split(',') if t.strip()] + + # Extract input types. + input_types = [] + for param in inputs_str.split(','): + if ':' in param: + type_part = param.split(':', 1)[1].strip() + input_types.append(type_part) + + full_signature = f"({inputs_str}) -> ({', '.join(output_types)})" + + return input_types, output_types, full_signature + + +def extract_and_rename_function(mlir_str, new_name): + """Extracts @forward function and renames it. + + Args: + mlir_str: MLIR string containing the function. + new_name: New name for the function. + + Returns: + Renamed function as string, or None if extraction fails. + """ + lines = mlir_str.split('\n') + func_lines = [] + brace_count = 0 + in_function = False + + for line in lines: + if 'func.func @forward(' in line: + in_function = True + # Rename function + line = line.replace('func.func @forward', + f'func.func @{new_name}') + + if in_function: + func_lines.append(line) + brace_count += line.count('{') + brace_count -= line.count('}') + + if brace_count == 0 and len(func_lines) > 1: + break + + return '\n'.join(func_lines) if func_lines else None + + +def collect_map_definitions(mlir_str): + """Collects all affine_map definitions from MLIR. + + Args: + mlir_str: MLIR string. + + Returns: + List of tuples (map_name, map_definition) where map_name is like 'map' + or 'map1' and map_definition is the full affine_map expression. 
+ """ + maps = [] + for line in mlir_str.split('\n'): + if line.startswith('#map'): + # Parse: #map = affine_map<...> + # or: #map1 = affine_map<...> + match = re.match(r'#(map\d*)\s*=\s*(.+)', line) + if match: + map_name = match.group(1) + map_def = match.group(2).strip() + maps.append((map_name, map_def)) + return maps + + +def build_global_map_definitions(maps_list1, maps_list2, maps_list3): + """Builds global map definitions and renaming mappings for each module. + + Args: + maps_list1: List of (map_name, map_def) tuples from module 1. + maps_list2: List of (map_name, map_def) tuples from module 2. + maps_list3: List of (map_name, map_def) tuples from module 3. + + Returns: + Tuple of (global_map_lines, rename_map1, rename_map2, rename_map3) where: + - global_map_lines: List of global map definition strings. + - rename_mapX: Dict mapping old map name to new global map name for module X. + """ + # Track unique map definitions and assign global names. + unique_maps = {} # map_def -> global_name + global_map_lines = [] + global_counter = 0 + + # Process all maps from all modules. + all_module_maps = [ + ('module1', maps_list1), + ('module2', maps_list2), + ('module3', maps_list3), + ] + + rename_maps = [{}, {}, {}] # One dict per module. + + for module_idx, (module_name, maps_list) in enumerate(all_module_maps): + for old_name, map_def in maps_list: + if map_def not in unique_maps: + # New unique map definition - assign global name. + if global_counter == 0: + global_name = 'map' + else: + global_name = f'map{global_counter}' + global_counter += 1 + + unique_maps[map_def] = global_name + global_map_lines.append(f'#{global_name} = {map_def}') + + # Record the renaming: old_name -> global_name. + global_name = unique_maps[map_def] + rename_maps[module_idx][old_name] = global_name + + return global_map_lines, rename_maps[0], rename_maps[1], rename_maps[2] + + +def rename_maps_in_function(func_str, rename_map): + """Renames map references in a function body. 
+ + Args: + func_str: Function definition as string. + rename_map: Dict mapping old map names to new map names. + + Returns: + Function string with renamed map references. + """ + # Use a callback function for atomic replacements to avoid chaining + def replace_callback(match): + map_name = match.group(1) # Capture the map name without '#' + return '#' + rename_map.get(map_name, map_name) + + # Build pattern that matches any of the old map names + # Sort by length (descending) to match longer names first (e.g., map10 before map1) + sorted_names = sorted(rename_map.keys(), key=len, reverse=True) + if not sorted_names: + return func_str + + # Create pattern: #(map10|map1|map|...)(?=\W|$) + pattern = r'#(' + '|'.join(re.escape(name) for name in sorted_names) + r')(?=\W|$)' + + # Replace all matches in a single pass (atomic operation) + result = re.sub(pattern, replace_callback, func_str) + + return result + + +def build_wrapper_function(sig1, sig2, sig3): + """Generates top-level orchestrator function based on signatures. + + Args: + sig1: Ray sampler signature (input_types, output_types, full_sig). + sig2: Hash encoder signature. + sig3: NeRF MLP signature. + + Returns: + Top-level function as string. + """ + in1, out1, _ = sig1 + in2, out2, _ = sig2 + in3, out3, _ = sig3 + + # Validate type compatibility. + print('\nValidating type compatibility:') + print(f' ray_sampler output: {out1}') + print(f' hash_encoder input: {in2}') + print(f' hash_encoder output: {out2}') + print(f' nerf_mlp input: {in3}') + print(f' nerf_mlp output: {out3}') + + if len(out1) != 1 or len(in2) != 1: + print(' ⚠ Warning: ray_sampler → hash_encoder type mismatch') + if len(out2) != 1 or len(in3) < 1: + print(' ⚠ Warning: hash_encoder → nerf_mlp type mismatch') + + # Generate top-level function. + # Inputs: Same as ray_sampler. + # Outputs: Same as nerf_mlp. 
+ wrapper_inputs = ', '.join([f'%arg{i}: {t}' for i, t in enumerate(in1)]) + wrapper_outputs = ', '.join(out3) + + wrapper = f''' func.func @nerf_forward({wrapper_inputs}) + -> ({wrapper_outputs}) {{ + // ================================================ + // Task 1: Ray Sampling + // ================================================ + %positions = func.call @ray_sampler_func({', '.join([f'%arg{i}' for i in range(len(in1))])}) + : ({', '.join(in1)}) -> {out1[0]} + + // ================================================ + // Task 2: Hash Encoding + // ================================================ + %encoded = func.call @hash_encoder_func(%positions) + : ({out1[0]}) -> {out2[0]} + + // ================================================ + // Task 3: MLP Inference + // ================================================ +''' + + # Handle MLP's multiple inputs (encoded + view_dirs). + if len(in3) == 2: + wrapper += f''' %density, %rgb = func.call @nerf_mlp_func(%encoded, %arg{len(in1)-1}) + : ({out2[0]}, {in1[-1]}) -> ({', '.join(out3)}) + + return %density, %rgb : {', '.join(out3)} + }} +''' + else: + wrapper += f''' %result = func.call @nerf_mlp_func(%encoded) + : ({out2[0]}) -> ({', '.join(out3)}) + + return %result : {', '.join(out3)} + }} +''' + + return wrapper + + +def merge_mlir_modules(mlir1, mlir2, mlir3): + """Merges three MLIR modules into a single modular MLIR file. + + Args: + mlir1: MLIR string for ray sampler. + mlir2: MLIR string for hash encoder. + mlir3: MLIR string for NeRF MLP. + + Returns: + Merged MLIR string, or None if merging fails. + """ + print('\n' + '=' * 70) + print('Merging Modules') + print('=' * 70) + + # Extract signatures. 
+ print('\nExtracting function signatures...') + sig1 = extract_function_signature(mlir1) + sig2 = extract_function_signature(mlir2) + sig3 = extract_function_signature(mlir3) + + if None in [sig1[0], sig2[0], sig3[0]]: + print('✗ Failed to extract function signatures') + return None + + print(' ✓ Signature extraction successful') + + # Extract function definitions. + print('\nExtracting function definitions...') + func1 = extract_and_rename_function(mlir1, 'ray_sampler_func') + func2 = extract_and_rename_function(mlir2, 'hash_encoder_func') + func3 = extract_and_rename_function(mlir3, 'nerf_mlp_func') + + if not all([func1, func2, func3]): + print('✗ Failed to extract function definitions') + return None + + print(' ✓ Function extraction successful') + + # Collect and rename all map definitions. + print('\nCollecting affine_map definitions...') + maps1 = collect_map_definitions(mlir1) + maps2 = collect_map_definitions(mlir2) + maps3 = collect_map_definitions(mlir3) + + print(f' Module 1: {len(maps1)} maps') + print(f' Module 2: {len(maps2)} maps') + print(f' Module 3: {len(maps3)} maps') + + # Build global map definitions and rename mappings. + print('\nBuilding global map definitions with renaming...') + global_map_lines, rename_map1, rename_map2, rename_map3 = \ + build_global_map_definitions(maps1, maps2, maps3) + + print(f' ✓ Created {len(global_map_lines)} unique global map definitions') + + # Rename map references in each function. + print('\nRenaming map references in functions...') + func1 = rename_maps_in_function(func1, rename_map1) + func2 = rename_maps_in_function(func2, rename_map2) + func3 = rename_maps_in_function(func3, rename_map3) + print(' ✓ Map references renamed successfully') + + # Generate top-level function. + print('\nGenerating top-level function...') + wrapper = build_wrapper_function(sig1, sig2, sig3) + print(' ✓ Top-level function generation successful') + + # Assemble final MLIR. 
+ merged = '\n'.join(global_map_lines) + '\n' if global_map_lines else '' + merged += 'module {\n' + merged += (' ml_program.global private mutable @global_seed' + '(dense<0> : tensor) : tensor\n\n') + merged += ' // ============================================\n' + merged += ' // Module 1: Ray Sampler\n' + merged += ' // ============================================\n' + merged += indent_mlir(func1, 2) + '\n\n' + merged += ' // ============================================\n' + merged += ' // Module 2: Hash Grid Encoder\n' + merged += ' // ============================================\n' + merged += indent_mlir(func2, 2) + '\n\n' + merged += ' // ============================================\n' + merged += ' // Module 3: NeRF MLP\n' + merged += ' // ============================================\n' + merged += indent_mlir(func3, 2) + '\n\n' + merged += ' // ============================================\n' + merged += ' // Top-level Function (Auto-generated)\n' + merged += ' // ============================================\n' + merged += wrapper + '\n' + merged += '}\n' + + return merged + + +def fix_tensor_expand_shape_syntax(mlir_str): + """Fixes tensor.expand_shape syntax for LLVM 20+ compatibility. + + Converts old syntax: + %x = tensor.expand_shape %y [[0, 1]] : tensor<16xf32> into tensor<1x16xf32> + + To new syntax: + %x = tensor.expand_shape %y [[0, 1]] output_shape [1, 16] : tensor<16xf32> into tensor<1x16xf32> + + Args: + mlir_str: MLIR string to fix. + + Returns: + Fixed MLIR string. 
+ """ + lines = mlir_str.split('\n') + fixed_lines = [] + + for line in lines: + # Match tensor.expand_shape pattern + # Pattern: tensor.expand_shape %var [[...]] : tensor<...> into tensor + match = re.search( + r'(.*tensor\.expand_shape\s+%\S+\s+\[\[.*?\]\])\s*:\s*(tensor<[^>]+>)\s+into\s+tensor<([^>]+)>', + line + ) + + if match: + prefix = match.group(1) # Everything before ':' + input_type = match.group(2) # tensor<16xf32> + output_shape = match.group(3) # 1x16xf32 + + # Extract shape dimensions from output_shape + # Remove type suffix (e.g., 'xf32', 'xi64') + shape_str = re.sub(r'x[a-z]\w+$', '', output_shape) + # Split by 'x' to get dimensions + dims = shape_str.split('x') + + # Build output_shape attribute + output_shape_attr = f"output_shape [{', '.join(dims)}]" + + # Reconstruct the line with output_shape attribute + fixed_line = f"{prefix} {output_shape_attr} : {input_type} into tensor<{output_shape}>" + + # Preserve any trailing content (like comments) + trailing = line[match.end():] + fixed_line += trailing + + fixed_lines.append(fixed_line) + else: + # No match, keep original line + fixed_lines.append(line) + + return '\n'.join(fixed_lines) + + +def indent_mlir(mlir_str, spaces): + """Adds indentation to MLIR string. + + Args: + mlir_str: MLIR string to indent. + spaces: Number of spaces for indentation. + + Returns: + Indented MLIR string. + """ + lines = mlir_str.split('\n') + indent = ' ' * spaces + return '\n'.join(indent + line if line.strip() else line for line in lines) + + +def verify_mlir(mlir_file): + """Verifies MLIR file using mlir-opt. + + Args: + mlir_file: Path to MLIR file to verify. + + Returns: + True if verification succeeds, False otherwise. 
+ """ + print('\nVerifying MLIR file...') + + mlir_opt = '../../../../../build/tools/mlir-neura-opt/mlir-neura-opt' + + if not os.path.exists(mlir_opt): + print(' ⚠ mlir-neura-opt not found, skipping verification') + return True + + result = subprocess.run( + [mlir_opt, mlir_file, '--verify-each=true', '-o', '/dev/null'], + capture_output=True, + text=True) + + if result.returncode == 0: + print(' ✅ MLIR verification passed!') + return True + else: + print(' ✗ MLIR verification failed:') + print(result.stderr) + return False + + +def main(): + """Main workflow.""" + # Parse command-line arguments. + parser = argparse.ArgumentParser( + description='Build modular MLIR from NeRF components') + parser.add_argument( + '--output', + '-o', + default='nerf_modular_3funcs.mlir', + help='Output file path (default: nerf_modular_3funcs.mlir)') + args = parser.parse_args() + + print('=' * 70) + print('Build Modular MLIR (Auto Signature Extraction)') + print('=' * 70) + print(f'Output file: {args.output}') + print('=' * 70) + + device = torch.device('cpu') + + # Compile 3 modules. + sampler = RaySampler(num_samples=16) + sampler.eval() + mlir1 = compile_single_module(sampler, + (torch.randn(2, 3), torch.randn(2, 3)), + 'ray_sampler') + + encoder = HashGridEncoder( + num_levels=2, features_per_level=2, log2_hashmap_size=8) + encoder.eval() + mlir2 = compile_single_module(encoder, (torch.randn(2, 16, 3),), + 'hash_encoder') + + mlp = NeRFMLP(input_dim=4, hidden_dim=32, num_layers=2) + mlp.eval() + mlir3 = compile_single_module( + mlp, (torch.randn(2, 16, 4), torch.randn(2, 3)), 'nerf_mlp') + + if not all([mlir1, mlir2, mlir3]): + print('\n✗ Some modules failed to compile') + return 1 + + # Merge modules. + merged = merge_mlir_modules(mlir1, mlir2, mlir3) + + if not merged: + print('\n✗ Module merging failed') + return 1 + + # Fix tensor.expand_shape syntax for LLVM 20+ compatibility. 
+ print('\nApplying syntax fixes for LLVM 20+ compatibility...') + merged = fix_tensor_expand_shape_syntax(merged) + + if 'output_shape [' in merged: + print(' ✓ Fixed tensor.expand_shape syntax') + else: + print(' ℹ No tensor.expand_shape operations found') + + # Save output. + output_file = args.output + + # Ensure output directory exists. + output_dir = os.path.dirname(output_file) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + + with open(output_file, 'w') as f: + f.write(merged) + + print('\n' + '=' * 70) + print('✓ Modular MLIR generated successfully!') + print('=' * 70) + print(f' File: {output_file}') + print(f' Size: {len(merged):,} characters') + + # Statistics. + num_funcs = merged.count('func.func') + num_calls = merged.count('func.call') + + print('\nStructure:') + print(f' Function definitions: {num_funcs} (3 modules + 1 top-level)') + print(f' Function calls: {num_calls} (top-level calls 3 modules)') + + # Verification. + if verify_mlir(output_file): + print('\n' + '=' * 70) + print('Next Step: Compile to Taskflow') + print('=' * 70) + print(f'\nmlir-neura-opt {output_file} \\') + print(' --one-shot-bufferize \\') + print(' --pass-pipeline=\'func.func(convert-linalg-to-affine-loops)\' \\') + print(' --convert-affine-to-taskflow \\') + print(' -o nerf_taskflow_3tasks.mlir') + print('\nExpected: Generate 3 taskflow.task operations') + + return 0 + else: + print('\n⚠ MLIR verification failed, but file was generated') + print(f' You can try manual inspection: {output_file}') + return 1 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/test/multi-cgra/taskflow/nerf_hash_grid/nerf_components.py b/test/multi-cgra/taskflow/nerf_hash_grid/nerf_components.py new file mode 100644 index 00000000..309e0cca --- /dev/null +++ b/test/multi-cgra/taskflow/nerf_hash_grid/nerf_components.py @@ -0,0 +1,369 @@ +"""NeRF components for modular MLIR compilation. 
+ +This module contains PyTorch implementations of NeRF components: + - RaySampler: Samples 3D positions along rays + - HashGridEncoder: Multi-resolution hash encoding (Instant-NGP style) + - NeRFMLP: Neural network for density and color prediction + - HashGridNeRF: Complete NeRF pipeline + +These components are designed to be compiled individually to MLIR and then +combined into a modular heterogeneous computing system. +""" + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class RaySampler(nn.Module): + """Samples 3D positions along rays for volume rendering.""" + + def __init__(self, num_samples=64, near=2.0, far=6.0): + """Initializes ray sampler. + + Args: + num_samples: Number of samples per ray. + near: Near plane distance. + far: Far plane distance. + """ + super().__init__() + self.num_samples = num_samples + # Register constants as buffers to avoid torch.constant issues. + self.register_buffer('near', torch.tensor(near, dtype=torch.float32)) + self.register_buffer('far', torch.tensor(far, dtype=torch.float32)) + + def forward(self, rays_o, rays_d): + """Samples positions along rays. + + Args: + rays_o: Ray origins [batch_size, 3]. + rays_d: Ray directions [batch_size, 3]. + + Returns: + Sampled 3D positions [batch_size, num_samples, 3]. + """ + batch_size = rays_o.shape[0] + + # Manually implement linspace for compatibility. + # Original: t_vals = torch.linspace(self.near, self.far, ...) + # Compatible: Use arange + scaling. 
+ indices = torch.arange( + self.num_samples, device=rays_o.device, dtype=rays_o.dtype) + step = (self.far - self.near) / (self.num_samples - 1) + t_vals = self.near + indices * step # [num_samples] + + t_vals = t_vals.unsqueeze(0).expand(batch_size, -1) # [B, N] + + # positions = rays_o + t * rays_d + positions = (rays_o.unsqueeze(1) + + t_vals.unsqueeze(2) * rays_d.unsqueeze(1)) + + return positions # [batch_size, num_samples, 3] + + +class HashGridEncoder(nn.Module): + """Multi-resolution hash grid encoding (Instant-NGP style).""" + + def __init__(self, + num_levels=16, + features_per_level=2, + log2_hashmap_size=19, + base_resolution=16, + finest_resolution=512): + """Initializes hash grid encoder. + + Args: + num_levels: Number of resolution levels. + features_per_level: Feature dimension per level. + log2_hashmap_size: Log2 of hash table size. + base_resolution: Coarsest grid resolution. + finest_resolution: Finest grid resolution. + """ + super().__init__() + self.num_levels = num_levels + self.features_per_level = features_per_level + self.log2_hashmap_size = log2_hashmap_size + self.hashmap_size = 2**log2_hashmap_size + self.base_resolution = base_resolution + self.finest_resolution = finest_resolution + + # Compute resolution growth factor per level. + self.b = np.exp( + (np.log(finest_resolution) - np.log(base_resolution)) / + (num_levels - 1)) + + # Hash tables for each level (learnable parameters). + self.hash_tables = nn.ParameterList([ + nn.Parameter( + torch.randn(self.hashmap_size, features_per_level) * 0.01) + for _ in range(num_levels) + ]) + + def hash_function(self, coords, level): + """Hashes 3D integer coordinates to hash table indices. + + Uses modulo operation instead of bitwise operations for compatibility. + Converts to int32 for better compatibility with downstream operations. + + Args: + coords: Integer coordinates [batch_size, 3]. + level: Resolution level index. + + Returns: + Hash indices [batch_size]. 
+ """ + # Convert to int32 for compatibility. + x = coords[:, 0].int() + y = coords[:, 1].int() + z = coords[:, 2].int() + + # Spatial hash using prime numbers (avoid int32 overflow). + hashed = x * 1 + y * 73856093 + z * 19349663 + + # Use modulo instead of bitwise AND. + return hashed % self.hashmap_size + + def grid_sample_3d(self, positions, level): + """Samples features from hash grid at given level using trilinear + interpolation. + + Args: + positions: Normalized positions [batch_size, num_samples, 3] in [0, 1]. + level: Resolution level index. + + Returns: + Interpolated features [batch_size, num_samples, features_per_level]. + """ + batch_size, num_samples, _ = positions.shape + resolution = int(np.floor(self.base_resolution * (self.b**level))) + + # Scale positions to grid resolution. + scaled_pos = positions * (resolution - 1) # [B, N, 3] + + # Get integer grid coordinates (8 corners of cube). + base_coords = torch.floor(scaled_pos).int() # [B, N, 3] - int32 + + # Trilinear interpolation weights. + frac = scaled_pos - base_coords.float() # [B, N, 3] + + # Flatten batch and samples for processing. + base_coords_flat = base_coords.view(-1, 3) # [B*N, 3] + frac_flat = frac.view(-1, 3) # [B*N, 3] + + # Sample from 8 corners and compute trilinear interpolation. + features_list = [] + for dx in [0, 1]: + for dy in [0, 1]: + for dz in [0, 1]: + # Compute offset coordinates. + offset_x = base_coords_flat[:, 0] + dx + offset_y = base_coords_flat[:, 1] + dy + offset_z = base_coords_flat[:, 2] + dz + + # Stack into coordinates. + corner_coords = torch.stack([offset_x, offset_y, offset_z], dim=1) + + # Hash coordinates to table indices. + indices = self.hash_function(corner_coords, level) # [B*N] + + # Convert to long for tensor indexing. + indices = indices.long() + + # Lookup features from hash table. + corner_features = self.hash_tables[level][indices] # [B*N, F] + + # Compute trilinear weight. 
+ weight = 1.0 + weight *= (1 - frac_flat[:, 0]) if dx == 0 else frac_flat[:, 0] + weight *= (1 - frac_flat[:, 1]) if dy == 0 else frac_flat[:, 1] + weight *= (1 - frac_flat[:, 2]) if dz == 0 else frac_flat[:, 2] + + features_list.append(corner_features * weight.unsqueeze(1)) + + # Sum contributions from all corners. + interpolated_features = sum(features_list) # [B*N, F] + + # Reshape back. + interpolated_features = interpolated_features.view( + batch_size, num_samples, self.features_per_level) + + return interpolated_features + + def forward(self, positions): + """Encodes 3D positions with multi-resolution hash encoding. + + Args: + positions: 3D positions [batch_size, num_samples, 3] in range [-1, 1]. + + Returns: + Encoded features [batch_size, num_samples, num_levels * + features_per_level]. + """ + # Normalize positions to [0, 1]. + positions_normalized = (positions + 1.0) / 2.0 + + # Encode at all levels. + encoded_features = [] + for level in range(self.num_levels): + level_features = self.grid_sample_3d(positions_normalized, level) + encoded_features.append(level_features) + + # Concatenate features from all levels. + encoded = torch.cat(encoded_features, dim=-1) # [B, N, L*F] + + return encoded + + +class NeRFMLP(nn.Module): + """MLP for NeRF: predicts density and color from encoded features.""" + + def __init__(self, input_dim=32, hidden_dim=64, num_layers=3): + """Initializes NeRF MLP. + + Args: + input_dim: Input feature dimension. + hidden_dim: Hidden layer dimension. + num_layers: Number of hidden layers. + """ + super().__init__() + + # Density network. + self.density_net = nn.ModuleList([nn.Linear(input_dim, hidden_dim)]) + for _ in range(num_layers - 1): + self.density_net.append(nn.Linear(hidden_dim, hidden_dim)) + self.density_out = nn.Linear(hidden_dim, 1) + + # Color network (conditioned on view direction). 
+ self.color_net = nn.ModuleList( + [nn.Linear(hidden_dim + 3, hidden_dim)]) # +3 for view direction + for _ in range(num_layers - 2): + self.color_net.append(nn.Linear(hidden_dim, hidden_dim)) + self.color_out = nn.Linear(hidden_dim, 3) + + def forward(self, encoded_features, view_dirs): + """Predicts density and color from encoded features. + + Args: + encoded_features: Encoded position features [batch_size, num_samples, + input_dim]. + view_dirs: View directions [batch_size, 3]. + + Returns: + Tuple of: + density: Volume density [batch_size, num_samples, 1]. + rgb: RGB color [batch_size, num_samples, 3]. + """ + batch_size, num_samples, _ = encoded_features.shape + + # Density prediction. + x = encoded_features + for layer in self.density_net: + x = torch.relu(layer(x)) + density = self.density_out(x) # [B, N, 1] + + # Get features for color prediction. + density_features = x # [B, N, hidden_dim] + + # Expand view directions. + view_dirs_expanded = view_dirs.unsqueeze(1).expand( + -1, num_samples, -1) # [B, N, 3] + + # Concatenate density features with view directions. + color_input = torch.cat([density_features, view_dirs_expanded], dim=-1) + + # Color prediction. + x = color_input + for layer in self.color_net: + x = torch.relu(layer(x)) + rgb = torch.sigmoid(self.color_out(x)) # [B, N, 3] + + return density, rgb + + +class HashGridNeRF(nn.Module): + """Complete NeRF pipeline with hash grid encoding.""" + + def __init__(self, + num_samples=64, + num_levels=16, + features_per_level=2, + hidden_dim=64): + """Initializes complete NeRF model. + + Args: + num_samples: Number of samples per ray. + num_levels: Number of hash grid levels. + features_per_level: Features per hash grid level. + hidden_dim: MLP hidden dimension. 
+ """ + super().__init__() + self.ray_sampler = RaySampler(num_samples=num_samples) + self.hash_encoder = HashGridEncoder( + num_levels=num_levels, features_per_level=features_per_level) + self.nerf_mlp = NeRFMLP( + input_dim=num_levels * features_per_level, hidden_dim=hidden_dim) + + def forward(self, rays_o, rays_d): + """Full NeRF forward pass. + + Args: + rays_o: Ray origins [batch_size, 3]. + rays_d: Ray directions [batch_size, 3]. + + Returns: + Tuple of: + density: Volume density [batch_size, num_samples, 1]. + rgb: RGB color [batch_size, num_samples, 3]. + """ + # 1. Sample positions along rays. + positions = self.ray_sampler(rays_o, rays_d) # [B, N, 3] + + # 2. Hash encoding. + encoded = self.hash_encoder(positions) # [B, N, L*F] + + # 3. MLP prediction. + density, rgb = self.nerf_mlp(encoded, rays_d) + + return density, rgb + + +if __name__ == '__main__': + print('=' * 70) + print('NeRF Components Test') + print('=' * 70) + + # Test RaySampler. + print('\n1. Testing RaySampler...') + sampler = RaySampler(num_samples=16) + rays_o = torch.randn(2, 3) + rays_d = torch.randn(2, 3) + positions = sampler(rays_o, rays_d) + print(f'✓ RaySampler output shape: {positions.shape}') + + # Test HashGridEncoder. + print('\n2. Testing HashGridEncoder...') + encoder = HashGridEncoder( + num_levels=2, features_per_level=2, log2_hashmap_size=8) + encoded = encoder(positions) + print(f'✓ HashGridEncoder output shape: {encoded.shape}') + + # Test NeRFMLP. + print('\n3. Testing NeRFMLP...') + mlp = NeRFMLP(input_dim=4, hidden_dim=32) + density, rgb = mlp(encoded, rays_d) + print(f'✓ NeRFMLP density shape: {density.shape}') + print(f'✓ NeRFMLP rgb shape: {rgb.shape}') + + # Test full model. + print('\n4. 
Testing Complete Model...') + model = HashGridNeRF( + num_samples=16, num_levels=2, features_per_level=2, hidden_dim=32) + density, rgb = model(rays_o, rays_d) + print('✓ Complete model works!') + print(f' Density shape: {density.shape}') + print(f' RGB shape: {rgb.shape}') + + print('\n' + '=' * 70) + print('All tests passed!') + print('=' * 70) diff --git a/test/multi-cgra/taskflow/nerf_hash_grid/test_assign_target.mlir b/test/multi-cgra/taskflow/nerf_hash_grid/test_assign_target.mlir new file mode 100644 index 00000000..dac103be --- /dev/null +++ b/test/multi-cgra/taskflow/nerf_hash_grid/test_assign_target.mlir @@ -0,0 +1,26 @@ +// RUN: mkdir -p %S/Output +// RUN: cd %S && python build_modular_mlir.py --output %S/Output/nerf_modular_3funcs.mlir +// RUN: mlir-neura-opt %S/Output/nerf_modular_3funcs.mlir --assign-task-target -o %S/Output/nerf_with_target.mlir +// RUN: mlir-neura-opt %S/Output/nerf_modular_3funcs.mlir --assign-task-target | FileCheck %s + +// Test AssignTaskTarget pass on NeRF modular functions +// This test verifies the complete workflow: +// 1. Generate modular MLIR from PyTorch NeRF components using build_modular_mlir.py +// 2. Run AssignTaskTarget pass to assign hardware targets to functions +// 3. Verify that targets are correctly assigned based on function names: +// - ray_sampler_func -> CPU (sampling operations) +// - hash_encoder_func -> DOE (encoding operations) +// - nerf_mlp_func -> CGRA (neural network inference) +// - nerf_forward -> CPU (top-level coordinator) + +// CHECK-LABEL: func.func @ray_sampler_func +// CHECK-SAME: attributes {target.device = "cpu"} + +// CHECK-LABEL: func.func @hash_encoder_func +// CHECK-SAME: attributes {target.device = "doe"} + +// CHECK-LABEL: func.func @nerf_mlp_func +// CHECK-SAME: attributes {target.device = "cgra"} + +// CHECK-LABEL: func.func @nerf_forward +// CHECK-SAME: attributes {target.device = "cpu"}