From b538839d5e390be6d46e4bbb159ecf84c4253511 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 31 Jan 2026 22:59:27 +0800 Subject: [PATCH 1/9] detecting loop-epilogue code when constructing hyperblock --- include/TaskflowDialect/TaskflowPasses.h | 1 + include/TaskflowDialect/TaskflowPasses.td | 33 +- lib/TaskflowDialect/Transforms/CMakeLists.txt | 4 +- .../ConstructHyperblockFromTaskPass.cpp | 103 +++- .../AffineLoopTreeSerializationPass.cpp | 439 ++++++++++++++++++ .../Transforms/Optimizations/CMakeLists.txt | 18 + 6 files changed, 581 insertions(+), 17 deletions(-) create mode 100644 lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp create mode 100644 lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index c0007ce1..69c2a37e 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -15,6 +15,7 @@ namespace taskflow { // Passes defined in TaskflowPasses.td #define GEN_PASS_DECL #include "TaskflowDialect/TaskflowPasses.h.inc" +std::unique_ptr createAffineLoopTreeSerializationPass(); std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createCanonicalizeTaskPass(); std::unique_ptr createClassifyCountersPass(); diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 4fc2137f..6aef5870 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -8,28 +8,35 @@ include "mlir/Pass/PassBase.td" //=========================================================// // Passes for the Taskflow dialect //=========================================================// -def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { - let summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; +def AffineLoopTreeSerialization : 
Pass<"affine-loop-tree-serialization", "ModuleOp">{ + let summary = "Serializes top-level affine.for loops into minimized task operations"; let description = [{ - This pass constructs hyperblocks and counter chain from Taskflow tasks. + This pass converts top-level affine.for loops in a function into + minimized and canonicalized task operations. }]; - let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; + let constructor = "taskflow::createAffineLoopTreeSerializationPass()"; + let dependentDialects = [ + "mlir::taskflow::TaskflowDialect", + "mlir::affine::AffineDialect", + "mlir::func::FuncDialect"]; } -def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{ - let summary = "Canonicalizes tasks by splitting each hyperblock into a separate atomic task"; +def CanonicalizeTask : Pass<"canonicalize-task", "func::FuncOp"> { + let summary = "Canonicalizes Taskflow tasks"; let description = [{ - This pass splits tasks so that each task contains exactly one hyperblock. - This creates atomic task units that can be analyzed and optimized independently. - - Input: Task with N hyperblocks - Output: N atomic tasks, each containing one hyperblock - - This is a prerequisite pass before fusion optimizations. + This pass canonicalizes Taskflow tasks. }]; let constructor = "taskflow::createCanonicalizeTaskPass()"; } +def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { + let summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; + let description = [{ + This pass constructs hyperblocks and counter chain from Taskflow tasks. 
+ }]; + let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; +} + def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let summary = "Classifies counters as root/relay/leaf"; let description = [{ diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index e44401d8..ff12e671 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -16,4 +16,6 @@ add_mlir_library(MLIRTaskflowTransforms MLIRTaskflow ${dialect_libs} LLVMSupport -) \ No newline at end of file +) + +add_subdirectory(Optimizations) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index 6955e29c..792412ff 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -58,6 +58,9 @@ struct HyperblockInfo { // The corresponding loop. affine::AffineForOp loop_op = nullptr; + + // Marks if this hyperblock follows the LEC pattern. + bool is_lec_pattern = false; }; //---------------------------------------------------------------------------- @@ -176,12 +179,64 @@ getTopLevelLoopsInfo(SmallVector &loops_info) { return top_level_loops_info; } +//---------------------------------------------------------------------------- +// Loop-Epilogue Code (LEC) Pattern Detection +//---------------------------------------------------------------------------- +// Loop-Epilogue Code means code that appears after an inner loop. +// Example: +// for %i (outer loop) { +// for %j (nested loop) { +// +// } +// ← Loop-Epilogue Code +// } +// For this pattern, we need to wrap the inner loop and the epilogue code into +// a hyperblock. Only by doing this can we maintain the hyperblock as a pure +// data-driven code block. 
+struct LECPattern { + affine::AffineForOp outer_loop; + affine::AffineForOp inner_loop; + + SmallVector prologue_code; + SmallVector epilogue_code; + + bool has_lec_pattern = false; +}; + +// Detects Loop-Epilogue Code pattern in the task. +static LECPattern detectLECPattern(affine::AffineForOp outer_loop) { + LECPattern pattern; + pattern.outer_loop = outer_loop; + + Block &body = outer_loop.getRegion().front(); + bool found_nested_loop = false; + + for (Operation &op : body.getOperations()) { + if (auto nested_for = dyn_cast(&op)) { + found_nested_loop = true; + if (!pattern.inner_loop) { + pattern.inner_loop = nested_for; + } + } else if (!(isa(&op) && op.getOperands().empty())) { + if (!found_nested_loop) { + pattern.prologue_code.push_back(&op); + pattern.has_lec_pattern = true; + } else { + pattern.epilogue_code.push_back(&op); + pattern.has_lec_pattern = true; + } + } + } + + return pattern; +} + //---------------------------------------------------------------------------- // Hyperblock Creation //---------------------------------------------------------------------------- // Recursively extracts hyperblocks from a region. -// Key insight: Operations in a loop body that are used by nested loops should -// be inlined into the nested loop's hyperblock. +// Key insight: Operations in a loop body that are used by nested loops +// should be inlined into the nested loop's hyperblock. static void extractHyperblocksInfoFromRegion( Region ®ion, const DenseMap &loop_info_map, @@ -196,15 +251,57 @@ static void extractHyperblocksInfoFromRegion( for (Operation &op : block.getOperations()) { if (auto for_op = dyn_cast(&op)) { + + LECPattern lec_pattern = detectLECPattern(for_op); + // Gets the loop info. LoopInfo *loop_info = loop_info_map.lookup(for_op); assert(loop_info && "Loop not found in loop_info_map"); - // Builds trigger indices fro this loop (parent indices + this loop's + // Builds trigger indices for this loop (parent indices + this loop's // index). 
SmallVector loop_indices = parent_indices; loop_indices.push_back(loop_info->counter_index); + // Handles the LEC pattern. + if (lec_pattern.has_lec_pattern) { + // 1. Emits any accumulated operations as a hyperblock. + if (!current_block_ops.empty()) { + HyperblockInfo info; + info.operations = current_block_ops; + info.trigger_indices = parent_indices; + info.is_loop_body = !parent_indices.empty(); + info.loop_op = enclosing_loop; + hyperblocks_info.push_back(info); + current_block_ops.clear(); + } + + // 2. Creates a hyperblock for the prologue + inner loop + epilogue. + HyperblockInfo info; + if (!lec_pattern.prologue_code.empty()) { + info.operations.append(lec_pattern.prologue_code.begin(), + lec_pattern.prologue_code.end()); + } + + info.operations.push_back(lec_pattern.inner_loop); + + if (!lec_pattern.epilogue_code.empty()) { + info.operations.append(lec_pattern.epilogue_code.begin(), + lec_pattern.epilogue_code.end()); + } + + info.trigger_indices = loop_indices; + info.is_loop_body = true; + info.loop_op = for_op; + info.is_lec_pattern = true; + hyperblocks_info.push_back(info); + + // No need for further processing of this loop. Since we have already + // handled the whole for_op. + current_block_ops.clear(); + continue; + } + // Analyzes which of the current_ops are used by this loop. 
DenseSet values_used_in_loop; for_op.walk([&](Operation *nested_op) { diff --git a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp new file mode 100644 index 00000000..da5cc7fa --- /dev/null +++ b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp @@ -0,0 +1,439 @@ +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" +#include "TaskflowDialect/TaskflowTypes.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +//============================================================================== +// Static Affine Loop Tree (SALT) Node. +//============================================================================== +struct SALTNode { + affine::AffineForOp loop_op; + int64_t lower_bound; + int64_t upper_bound; + int64_t step; + + SALTNode *parent = nullptr; + SmallVector children; + + // Operations that are NOT nested loops (the actual computation at this + // level). 
+ SmallVector body_operations; + + bool isLeaf() const { return children.empty(); } + bool isRoot() const { return parent == nullptr; } +}; + +//============================================================================== +// Loop Chain - Path from Root to Leaf. +//============================================================================== +struct LoopChain { + SmallVector nodes; // Ordered from root to leaf. + + SALTNode *getRoot() const { return nodes.front(); } + SALTNode *getLeaf() const { return nodes.back(); } +}; + +//============================================================================== +// SALT Builder. +//============================================================================== +class SALTBuilder { +public: + SmallVector build(func::FuncOp func_op) { + SmallVector roots; + + for (Block &block : func_op.getBlocks()) { + for (Operation &op : block) { + if (affine::AffineForOp for_op = dyn_cast(&op)) { + if (for_op.hasConstantLowerBound() && + for_op.hasConstantUpperBound()) { + SALTNode *root = buildNodeRecursively(for_op, nullptr); + if (root) { + roots.push_back(root); + } + } + } + } + } + + return roots; + } + + const SmallVector> &getAllNodes() const { + return all_nodes; + } + +private: + SmallVector> all_nodes; + + SALTNode *buildNodeRecursively(affine::AffineForOp for_op, SALTNode *parent) { + auto node = std::make_unique(); + node->loop_op = for_op; + node->lower_bound = for_op.getConstantLowerBound(); + node->upper_bound = for_op.getConstantUpperBound(); + node->step = for_op.getStepAsInt(); + node->parent = parent; + + SALTNode *node_ptr = node.get(); + all_nodes.push_back(std::move(node)); + + Block &body = for_op.getRegion().front(); + for (Operation &op : body) { + if (auto nested_for = dyn_cast(&op)) { + if (nested_for.hasConstantLowerBound() && + nested_for.hasConstantUpperBound()) { + SALTNode *child = buildNodeRecursively(nested_for, node_ptr); + if (child) { + node_ptr->children.push_back(child); + } + } else { + 
node_ptr->body_operations.push_back(&op); + } + } else if (!isa(&op)) { + node_ptr->body_operations.push_back(&op); + } + } + + return node_ptr; + } +}; + +//============================================================================== +// Loop Chain Extractor (DFS). +//============================================================================== +class LoopChainExtractor { +public: + SmallVector extract(const SmallVector &roots) { + SmallVector chains; + + for (SALTNode *root : roots) { + SmallVector current_path; + dfs(root, current_path, chains); + } + + return chains; + } + +private: + void dfs(SALTNode *node, SmallVector ¤t_path, + SmallVector &chains) { + current_path.push_back(node); + + if (node->isLeaf()) { + LoopChain chain; + chain.nodes = current_path; + chains.push_back(chain); + } else { + for (SALTNode *child : node->children) { + dfs(child, current_path, chains); + } + } + + current_path.pop_back(); + } +}; + +//============================================================================== +// MCT Builder - Builds nested affine.for loops for the entire chain. +//============================================================================== +class MCTBuilder { +public: + MCTBuilder(OpBuilder &builder, Location loc) : builder(builder), loc(loc) {} + + // Builds the loop chain and returns the outermost loop. + // The built loops will be inserted at the builder's current insertion point. + affine::AffineForOp build(const LoopChain &chain) { + // Mapping from old values to new values. + IRMapping mapping; + + affine::AffineForOp outer_loop = nullptr; + Block *current_insert_block = nullptr; + SmallVector created_loops; + + // Iterate from root to leaf to build the nested loops. 
+ for (size_t i = 0; i < chain.nodes.size(); ++i) { + SALTNode *node = chain.nodes[i]; + bool is_first = (i == 0); + + OpBuilder loop_builder(builder.getContext()); + if (is_first) { + loop_builder = builder; + } else { + // We want to insert the nested loop at the end of the current block. + // If the block has a terminator (e.g. yield we just added/cloned?), + // we should insert before it? + // Actually, we remove default yield immediately after creation. + // So the block usually doesn't have a terminator when we are filling + // it, UNLESS we cloned a yield from body ops? SALT excludes Yields from + // body_operations. So current_insert_block should be terminator-free + // (or we removed it). + loop_builder = OpBuilder::atBlockEnd(current_insert_block); + } + + // Prepare iter_args for the new loop. + SmallVector iter_args_init_values; + if (node->loop_op.getNumIterOperands() > 0) { + for (Value init : node->loop_op.getInits()) { + iter_args_init_values.push_back(mapping.lookupOrDefault(init)); + } + } + + // Create new loop with same bounds and iter_args. + auto new_loop = loop_builder.create( + loc, node->lower_bound, node->upper_bound, node->step, + iter_args_init_values); + + created_loops.push_back(new_loop); + + // Map the old induction variable to the new one. + mapping.map(node->loop_op.getInductionVar(), new_loop.getInductionVar()); + + // Map the old iter_args (block args) to the new iter_args (block args). + if (node->loop_op.getNumRegionIterArgs() > 0) { + for (auto [old_arg, new_arg] : + llvm::zip(node->loop_op.getRegionIterArgs(), + new_loop.getRegionIterArgs())) { + mapping.map(old_arg, new_arg); + } + } + + if (is_first) { + outer_loop = new_loop; + } + + // Update current insertion block to the body of the new loop. + current_insert_block = new_loop.getBody(); + + // Remove the default yield created by create. 
+ if (!current_insert_block->empty() && + isa(current_insert_block->back())) + current_insert_block->back().erase(); + + // Clone body operations for THIS node. + OpBuilder body_builder = OpBuilder::atBlockEnd(current_insert_block); + for (Operation *op : node->body_operations) { + Operation *new_op = body_builder.clone(*op, mapping); + // Update mapping with results of the new op. + for (auto [old_res, new_res] : + llvm::zip(op->getResults(), new_op->getResults())) { + mapping.map(old_res, new_res); + } + } + } + + // Fix up yields for non-leaf loops (bottom-up). + for (int i = created_loops.size() - 2; i >= 0; --i) { + affine::AffineForOp parent = created_loops[i]; + affine::AffineForOp child = created_loops[i + 1]; + + OpBuilder yield_builder = OpBuilder::atBlockEnd(parent.getBody()); + + if (child.getNumResults() > 0) { + yield_builder.create(loc, child.getResults()); + } else { + yield_builder.create(loc); + } + } + + // For the LEAF loop, we cloned body operations (which excludes Yields). + // So the leaf loop likely has NO yield now. + // We must add a yield to the leaf loop that yields the results of the + // operations that produced results (mapped from original yield). + // SALTNode loop_op is the original loop. + // The original loop body had a yield. + // We need to find what the original yield yielded, map it, and yield it + // here. + + // Wait, if SALT excludes Yield from body_operations, then we NEVER cloned + // the yield. So the leaf loop has no terminator. We must reconstruct the + // yield for the leaf loop. + + if (!created_loops.empty()) { + affine::AffineForOp new_leaf = created_loops.back(); + SALTNode *leaf_node = chain.getLeaf(); // or chain.nodes.back() + + // Find the yield op in the original leaf node. 
+ Operation *original_yield = nullptr; + for (Operation &op : leaf_node->loop_op.getBody()->getOperations()) { + if (isa(&op)) { + original_yield = &op; + break; + } + } + + if (original_yield) { + OpBuilder leaf_yield_builder = + OpBuilder::atBlockEnd(new_leaf.getBody()); + SmallVector yielded_values; + for (Value operand : original_yield->getOperands()) { + yielded_values.push_back(mapping.lookupOrDefault(operand)); + } + leaf_yield_builder.create(loc, yielded_values); + } else { + // Should not happen for valid AffineForOp + OpBuilder leaf_yield_builder = + OpBuilder::atBlockEnd(new_leaf.getBody()); + leaf_yield_builder.create(loc); + } + } + + return outer_loop; + } + +private: + OpBuilder &builder; + Location loc; +}; + +//============================================================================== +// Pass Implementation +//============================================================================== +struct AffineLoopTreeSerializationPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AffineLoopTreeSerializationPass) + + StringRef getArgument() const final { + return "affine-loop-tree-serialization"; + } + + StringRef getDescription() const final { + return "Serialize Affine loop trees into a linear sequence of loop nests " + "for MCT construction."; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + ModuleOp module = getOperation(); + + WalkResult result = module.walk([&](func::FuncOp func_op) { + if (failed(convertFunction(func_op))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) { + signalPassFailure(); + } + } + +private: + LogicalResult convertFunction(func::FuncOp func_op) { + Location loc = func_op.getLoc(); + + // Builds static affine loop tree. 
+ SALTBuilder salt_builder; + SmallVector roots = salt_builder.build(func_op); + + if (roots.empty()) { + return success(); + } + + llvm::errs() << "=== SALT Structure ===\n"; + for (SALTNode *root : roots) { + printSALT(root, 0); + } + + // Extracts loop chains. + LoopChainExtractor extractor; + SmallVector chains = extractor.extract(roots); + + llvm::errs() << "=== Extracted " << chains.size() << " MCT(s) ===\n"; + for (size_t i = 0; i < chains.size(); ++i) { + llvm::errs() << "MCT " << i << ": "; + for (SALTNode *node : chains[i].nodes) { + llvm::errs() << "[" << node->lower_bound << "," << node->upper_bound + << ") "; + } + llvm::errs() << "\n"; + } + + // LoopChainExtractor iterates roots in order of SALTBuilder (order + // of appearance). So we can iterate through roots, and for each root, build + // its chains, replace root with chains. + + for (SALTNode *root : roots) { + OpBuilder builder(root->loop_op); + + // Finds chains originating from this root. + SmallVector root_chains; + for (const auto &chain : chains) { + if (chain.getRoot() == root) { + root_chains.push_back(chain); + } + } + + // Builds new chains. + for (const LoopChain &chain : root_chains) { + MCTBuilder mct_builder(builder, loc); + affine::AffineForOp new_loop = mct_builder.build(chain); + + // If the original root loop had results (iter_args), and the new loop + // has matching results, we must replace the uses of the original + // results with the new ones. NOTE: This assumes that for a loop + // defining values, there is a corresponding single chain that produces + // all the values (or at least the one we process). If a root with + // results is split into multiple chains, this simple logic might loop + // over them. However, for a reduction loop that is a single chain, this + // works. 
+ if (root->loop_op.getNumResults() > 0 && new_loop && + new_loop.getNumResults() == root->loop_op.getNumResults()) { + root->loop_op.replaceAllUsesWith(new_loop.getResults()); + } + } + + // Erases the original root loop. + root->loop_op.erase(); + } + + return success(); + } + + void printSALT(SALTNode *node, int depth) { + for (int i = 0; i < depth; ++i) { + llvm::errs() << " "; + } + llvm::errs() << "Loop [" << node->lower_bound << "," << node->upper_bound + << ") step=" << node->step + << " | body_ops=" << node->body_operations.size() + << " | children=" << node->children.size() << "\n"; + for (SALTNode *child : node->children) { + printSALT(child, depth + 1); + } + } +}; + +} // namespace + +std::unique_ptr mlir::taskflow::createAffineLoopTreeSerializationPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt b/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt new file mode 100644 index 00000000..3e1ce5cd --- /dev/null +++ b/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt @@ -0,0 +1,18 @@ +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +add_mlir_conversion_library(MLIRTaskflowOptimization + AffineLoopTreeSerializationPass.cpp + + DEPENDS + MLIRTaskflowTransformsIncGen + + LINK_LIBS PUBLIC + MLIRTaskflow + MLIRArithDialect + MLIRFuncDialect + MLIRLinalgDialect + MLIRIR + MLIRPass + MLIRTransforms + MLIRSupport +) \ No newline at end of file From 11010dc0bc5b35c688e295b9fa70f41452ba986f Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sun, 1 Feb 2026 16:58:29 +0800 Subject: [PATCH 2/9] introducing original memrefs in task --- include/TaskflowDialect/TaskflowOps.td | 40 +- include/TaskflowDialect/TaskflowPasses.h | 1 - include/TaskflowDialect/TaskflowPasses.td | 8 - .../AffineToTaskflow/AffineToTaskflowPass.cpp | 109 +++- lib/TaskflowDialect/TaskflowOps.cpp | 281 +++++++++ lib/TaskflowDialect/Transforms/CMakeLists.txt | 1 - 
.../Transforms/CanonicalizeTaskPass.cpp | 560 ------------------ 7 files changed, 381 insertions(+), 619 deletions(-) delete mode 100644 lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td index a7ee4a6c..d4d7c326 100644 --- a/include/TaskflowDialect/TaskflowOps.td +++ b/include/TaskflowDialect/TaskflowOps.td @@ -39,11 +39,17 @@ def TaskflowTaskOp : TaskflowOpBase<"task", [ 1. Memory dependencies: memrefs that are read or written by the task 2. Value dependencies: SSA values from producer tasks + The `read_memrefs` and `write_memrefs` attributes record the actural + original memrefs that this task accesses, + enabling data placement analysis for multi-CGRA mapping. + Example: - // Memory input: %mem, Value input: %val + // Memory inputs: %mem, Value inputs: %val $out_mem, %out_val = taskflow.task "Task_0" - memory_inputs(%mem : memref<4xi32>) - value_inputs(%val : i32) { + read_inputs(%mem : memref<4xi32>) + value_inputs(%val : i32) + original_read_memrefs(%arg0 : memref) + original_write_memrefs(%arg5 : memref) { ^bb0(%a0: memref<4xi32>, %a1: i32): affine.for %i = 0 to 4 { %v = affine.load %a0[%i] : memref<4xi32> @@ -55,28 +61,22 @@ def TaskflowTaskOp : TaskflowOpBase<"task", [ }]; let arguments = (ins - Variadic:$memory_inputs, + Variadic:$read_inputs, + Variadic:$write_inputs, Variadic:$value_inputs, - StrAttr:$task_name + StrAttr:$task_name, + Variadic:$original_read_memrefs, + Variadic:$original_write_memrefs ); let results = (outs - Variadic:$memory_outputs, + Variadic:$write_outputs, Variadic:$value_outputs ); let regions = (region SizedRegion<1>:$body); - // let hasCustomAssemblyFormat = 1; - - // let assemblyFormat = [{ - // (`memory_inputs` `(` $memory_inputs^ `:` type($memory_inputs) `)`)? - // (`value_inputs` `(` $value_inputs^ `:` type($value_inputs) `)`)? 
- // attr-dict-with-keyword - // $body - // `->` `(` type($memory_outputs) `,` type($value_outputs) `)` - // }]; - + let hasCustomAssemblyFormat = 1; } // Defines the yield operation to terminate a Taskflow task. @@ -97,13 +97,7 @@ def TaskflowYieldOp : TaskflowOpBase<"yield", [Terminator, Pure, ReturnLike, Att Variadic:$memory_results, Variadic:$value_results); - // let assemblyFormat = [{ - // (`memory_outputs` `(` $memory_results^ `:` type($memory_results) `)`)? - // (`value_outputs` `(` $value_results^ `:` type($value_results) `)`)? - // attr-dict - // }]; - - // let hasCustomAssemblyFormat = 1; + let hasCustomAssemblyFormat = 1; let builders = [ // Default builder for empty yield. diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index 69c2a37e..09a28aee 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -17,7 +17,6 @@ namespace taskflow { #include "TaskflowDialect/TaskflowPasses.h.inc" std::unique_ptr createAffineLoopTreeSerializationPass(); std::unique_ptr createConstructHyperblockFromTaskPass(); -std::unique_ptr createCanonicalizeTaskPass(); std::unique_ptr createClassifyCountersPass(); #define GEN_PASS_REGISTRATION diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 6aef5870..1e3c55c2 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -21,14 +21,6 @@ def AffineLoopTreeSerialization : Pass<"affine-loop-tree-serialization", "Module "mlir::func::FuncDialect"]; } -def CanonicalizeTask : Pass<"canonicalize-task", "func::FuncOp"> { - let summary = "Canonicalizes Taskflow tasks"; - let description = [{ - This pass canonicalizes Taskflow tasks. 
- }]; - let constructor = "taskflow::createCanonicalizeTaskPass()"; -} - def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { let summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; let description = [{ diff --git a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp index 111dec0c..318c530d 100644 --- a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp +++ b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp @@ -15,6 +15,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" +#include "mlir/IR/ValueRange.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/ArrayRef.h" @@ -30,7 +31,6 @@ namespace { //------------------------------------------------------------------------------ // Helper Functions. //------------------------------------------------------------------------------ - // Collects memrefs that are loaded (read) within a given operation scope. static void collectReadMemrefs(Operation *op, SetVector &read_memrefs) { op->walk([&](Operation *nested_op) { @@ -104,15 +104,41 @@ updateOperationOperands(Operation *op, } } +//------------------------------------------------------------------------------ +// Analyzes all the original memory access info before conversion. 
+//------------------------------------------------------------------------------ +struct MemrefAccessInfo { + SetVector read_memrefs; + SetVector write_memrefs; +}; + +static DenseMap +analyzeMemrefAccesses(func::FuncOp func_op) { + DenseMap loop_to_memref_info; + + func_op.walk([&](affine::AffineForOp for_op) { + llvm::errs() << "\nAnalyzing memref accesses for loop:\n" << for_op << "\n"; + MemrefAccessInfo access_info; + + collectReadMemrefs(for_op.getOperation(), access_info.read_memrefs); + collectWrittenMemrefs(for_op.getOperation(), access_info.write_memrefs); + + loop_to_memref_info[for_op] = access_info; + }); + + return loop_to_memref_info; +} + //------------------------------------------------------------------------------ // Task Conversion //------------------------------------------------------------------------------ // Converts a top-level affine.for to a taskflow.task operation. -static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, - affine::AffineForOp for_op, - DenseMap &value_mapping, - int task_id) { +static TaskflowTaskOp convertLoopToTask( + OpBuilder &builder, affine::AffineForOp for_op, + DenseMap &value_mapping, + const DenseMap &loop_to_original_memref_info, + int task_id) { Location loc = for_op.getLoc(); std::string task_name = "Task_" + std::to_string(task_id); @@ -125,9 +151,9 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, // Step 1: Collects read and written memrefs. 
//------------------------------------------------------------------- SetVector read_memrefs; - SetVector written_memrefs; + SetVector write_memrefs; collectReadMemrefs(for_op.getOperation(), read_memrefs); - collectWrittenMemrefs(for_op.getOperation(), written_memrefs); + collectWrittenMemrefs(for_op.getOperation(), write_memrefs); llvm::errs() << "Read memrefs for loop:\n" << for_op << "\n"; for (Value memref : read_memrefs) { @@ -135,23 +161,25 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, } llvm::errs() << "Written memrefs for loop:\n" << for_op << "\n"; - for (Value memref : written_memrefs) { + for (Value memref : write_memrefs) { llvm::errs() << memref << "\n"; } + // Collects original memref access info. + auto it = loop_to_original_memref_info.find(for_op.getOperation()); + assert(it != loop_to_original_memref_info.end() && + "Original memref access info not found for the loop"); + const MemrefAccessInfo &original_memref_info = it->second; + SetVector original_read_memrefs = original_memref_info.read_memrefs; + SetVector original_write_memrefs = original_memref_info.write_memrefs; + //------------------------------------------------------------------- // Step 2: Determines memory inputs and outputs. //------------------------------------------------------------------- - // Memory inputs: ALL memrefs that are accessed (read OR written). - // This ensures WAR and WAW dependencies are respected. - SetVector accessed_memrefs; - accessed_memrefs.insert(read_memrefs.begin(), read_memrefs.end()); - accessed_memrefs.insert(written_memrefs.begin(), written_memrefs.end()); - // Memory outputs: ONLY memrefs that are written. // This ensures RAW and WAW dependencies are respected. 
SetVector output_memrefs; - output_memrefs.insert(written_memrefs.begin(), written_memrefs.end()); + output_memrefs.insert(write_memrefs.begin(), write_memrefs.end()); //------------------------------------------------------------------- // Step 3: Collects external SSA values (non-memref). @@ -167,17 +195,28 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, //------------------------------------------------------------------- // Step 4: Resolves inputs through value mapping. //------------------------------------------------------------------- - SmallVector memory_inputs; + SmallVector read_inputs; + SmallVector write_inputs; SmallVector value_inputs; IRMapping mapping; - // Resolves memory inputs. - for (Value memref : accessed_memrefs) { + // Resolves read inputs. + for (Value memref : read_memrefs) { + Value resolved_memref = value_mapping.lookup(memref); + if (!resolved_memref) { + resolved_memref = memref; + } + read_inputs.push_back(resolved_memref); + mapping.map(memref, resolved_memref); + } + + // Resolves write inputs. + for (Value memref : write_memrefs) { Value resolved_memref = value_mapping.lookup(memref); if (!resolved_memref) { resolved_memref = memref; } - memory_inputs.push_back(resolved_memref); + write_inputs.push_back(resolved_memref); mapping.map(memref, resolved_memref); } @@ -211,9 +250,12 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, loc, /*memory_outputs=*/memory_output_types, /*value_outputs=*/value_output_types, - /*memory_inputs=*/memory_inputs, + /*read_inputs=*/read_inputs, + /*write_inputs=*/write_inputs, /*value_inputs=*/value_inputs, - /*task_name=*/builder.getStringAttr(task_name)); + /*task_name=*/builder.getStringAttr(task_name), + /*original_read_memrefs=*/original_read_memrefs.getArrayRef(), + /*original_write_memrefs=*/original_write_memrefs.getArrayRef()); //------------------------------------------------------------------- // Step 7: Builds the task body. 
@@ -223,8 +265,15 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, // Adds block arguments (memory inputs first, then value inputs). DenseMap input_to_block_arg; - // Memory input arguments. - for (Value memref : accessed_memrefs) { + // Memory read input arguments. + for (Value memref : read_memrefs) { + BlockArgument arg = task_body->addArgument(memref.getType(), loc); + mapping.map(memref, arg); + input_to_block_arg[memref] = arg; + } + + // Memory write input arguments. + for (Value memref : write_memrefs) { BlockArgument arg = task_body->addArgument(memref.getType(), loc); mapping.map(memref, arg); input_to_block_arg[memref] = arg; @@ -270,7 +319,7 @@ static TaskflowTaskOp convertLoopToTask(OpBuilder &builder, //------------------------------------------------------------------- // Memory outputs. for (auto [memref, task_output] : - llvm::zip(output_memrefs, task_op.getMemoryOutputs())) { + llvm::zip(output_memrefs, task_op.getWriteOutputs())) { value_mapping[memref] = task_output; } @@ -285,6 +334,8 @@ static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) { llvm::errs() << "\n===Converting function: " << func_op.getName() << "===\n"; + DenseMap loop_to_original_memref_info = + analyzeMemrefAccesses(func_op); OpBuilder builder(func_op.getContext()); SmallVector loops_to_erase; DenseMap value_mapping; @@ -298,13 +349,19 @@ static LogicalResult convertFuncToTaskflow(func::FuncOp func_op) { ops_to_process.push_back(&op); } + llvm::errs() << "ops_to_process:\n"; + for (Operation *op : ops_to_process) { + llvm::errs() << *op << "\n"; + } + // Processes each operation in order (top to bottom). for (Operation *op : ops_to_process) { if (auto for_op = dyn_cast(op)) { // Converts affine.for to taskflow.task. 
OpBuilder builder(for_op); - TaskflowTaskOp task_op = convertLoopToTask( - builder, for_op, value_mapping, task_id_counter++); + TaskflowTaskOp task_op = + convertLoopToTask(builder, for_op, value_mapping, + loop_to_original_memref_info, task_id_counter++); // Replaces uses of loop results with task value outputs. for (auto [loop_result, task_value_output] : diff --git a/lib/TaskflowDialect/TaskflowOps.cpp b/lib/TaskflowDialect/TaskflowOps.cpp index e69de29b..44aa255a 100644 --- a/lib/TaskflowDialect/TaskflowOps.cpp +++ b/lib/TaskflowDialect/TaskflowOps.cpp @@ -0,0 +1,281 @@ +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/OpImplementation.h" +#include + +using namespace mlir; +using namespace mlir::taskflow; + +//===----------------------------------------------------------------------===// +// TaskflowTaskOp +//===----------------------------------------------------------------------===// + +ParseResult TaskflowTaskOp::parse(OpAsmParser &parser, OperationState &result) { + // Parses task name: @Task_0. + StringAttr task_name; + if (parser.parseSymbolName(task_name)) + return failure(); + result.addAttribute("task_name", task_name); + + // Parses read_inputs: read_inputs(%arg0, %arg1 : memref, + // memref). + SmallVector read_operands; + SmallVector read_types; + if (succeeded(parser.parseOptionalKeyword("read_inputs"))) { + if (parser.parseLParen() || parser.parseOperandList(read_operands) || + parser.parseColonTypeList(read_types) || parser.parseRParen()) + return failure(); + } + + // Parses write_inputs: write_inputs(%arg5 : memref). 
+ SmallVector write_operands; + SmallVector write_types; + if (succeeded(parser.parseOptionalKeyword("write_inputs"))) { + if (parser.parseLParen() || parser.parseOperandList(write_operands) || + parser.parseColonTypeList(write_types) || parser.parseRParen()) + return failure(); + } + + // Parses value_inputs: value_inputs(%scalar : i32). + SmallVector value_operands; + SmallVector value_types; + if (succeeded(parser.parseOptionalKeyword("value_inputs"))) { + if (parser.parseLParen() || parser.parseOperandList(value_operands) || + parser.parseColonTypeList(value_types) || parser.parseRParen()) + return failure(); + } + + // Parses original memrefs: [original_read_memrefs(%arg0), + // original_write_memrefs(%arg5)]. + SmallVector original_read_operands; + SmallVector original_read_types; + SmallVector original_write_operands; + SmallVector original_write_types; + + if (succeeded(parser.parseOptionalLSquare())) { + // original_reads. + if (succeeded(parser.parseOptionalKeyword("original_read_memrefs"))) { + if (parser.parseLParen() || + parser.parseOperandList(original_read_operands) || + parser.parseRParen()) + return failure(); + } + + // optional comma. + (void)parser.parseOptionalComma(); + + // original_writes. + if (succeeded(parser.parseOptionalKeyword("original_write_memrefs"))) { + if (parser.parseLParen() || + parser.parseOperandList(original_write_operands) || + parser.parseRParen()) + return failure(); + } + + if (parser.parseRSquare()) + return failure(); + } + + // Resolves operands. + if (parser.resolveOperands(read_operands, read_types, + parser.getCurrentLocation(), result.operands) || + parser.resolveOperands(write_operands, write_types, + parser.getCurrentLocation(), result.operands) || + parser.resolveOperands(value_operands, value_types, + parser.getCurrentLocation(), result.operands)) + return failure(); + + // Resolves original memrefs (infer types from read/write memrefs). 
+ for (size_t i = 0; i < original_read_operands.size(); ++i) { + original_read_types.push_back(read_types.empty() ? write_types[0] + : read_types[0]); + } + for (size_t i = 0; i < original_write_operands.size(); ++i) { + original_write_types.push_back(write_types.empty() ? read_types[0] + : write_types[0]); + } + + if (parser.resolveOperands(original_read_operands, original_read_types, + parser.getCurrentLocation(), result.operands) || + parser.resolveOperands(original_write_operands, original_write_types, + parser.getCurrentLocation(), result.operands)) + return failure(); + + // Parses optional attributes. + if (parser.parseOptionalAttrDict(result.attributes)) + return failure(); + + // Parses function type: : (...) -> (...). + FunctionType func_type; + if (parser.parseColon() || parser.parseType(func_type)) + return failure(); + + // Adds result types. + result.addTypes(func_type.getResults()); + + // Parses region. + Region *body = result.addRegion(); + if (parser.parseRegion(*body, /*args=*/{}, /*argTypes=*/{})) + return failure(); + + // Adds operand segment sizes. + result.addAttribute( + "operandSegmentSizes", + parser.getBuilder().getDenseI32ArrayAttr( + {static_cast(read_operands.size()), + static_cast(write_operands.size()), + static_cast(value_operands.size()), + static_cast(original_read_operands.size()), + static_cast(original_write_operands.size())})); + + // Adds result segment sizes. + size_t num_write_outputs = 0; + size_t num_value_outputs = 0; + for (Type t : func_type.getResults()) { + if (isa(t)) + num_write_outputs++; + else + num_value_outputs++; + } + result.addAttribute("resultSegmentSizes", + parser.getBuilder().getDenseI32ArrayAttr( + {static_cast(num_write_outputs), + static_cast(num_value_outputs)})); + + return success(); +} + +void TaskflowTaskOp::print(OpAsmPrinter &printer) { + // Prints task name. + printer << " @" << getTaskName(); + + // Prints read_inputs. 
+ if (!getReadInputs().empty()) { + printer << " read_inputs("; + llvm::interleaveComma(getReadInputs(), printer); + printer << " : "; + llvm::interleaveComma(getReadInputs().getTypes(), printer); + printer << ")"; + } + + // Prints write_inputs. + if (!getWriteInputs().empty()) { + printer << " write_inputs("; + llvm::interleaveComma(getWriteInputs(), printer); + printer << " : "; + llvm::interleaveComma(getWriteInputs().getTypes(), printer); + printer << ")"; + } + + // Prints value_inputs. + if (!getValueInputs().empty()) { + printer << " value_inputs("; + llvm::interleaveComma(getValueInputs(), printer); + printer << " : "; + llvm::interleaveComma(getValueInputs().getTypes(), printer); + printer << ")"; + } + + // Prints original memrefs. + if (!getOriginalReadMemrefs().empty() || !getOriginalWriteMemrefs().empty()) { + printer << " ["; + + if (!getOriginalReadMemrefs().empty()) { + printer << "original_read_memrefs("; + llvm::interleaveComma(getOriginalReadMemrefs(), printer); + printer << ")"; + } + + if (!getOriginalReadMemrefs().empty() && !getOriginalWriteMemrefs().empty()) + printer << ", "; + + if (!getOriginalWriteMemrefs().empty()) { + printer << "original_write_memrefs("; + llvm::interleaveComma(getOriginalWriteMemrefs(), printer); + printer << ")"; + } + + printer << "]"; + } + + // Prints attributes (skip operandSegmentSizes, resultSegmentSizes, + // task_name). + SmallVector elidedAttrs = {"operandSegmentSizes", + "resultSegmentSizes", "task_name"}; + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + + // Prints function type. + printer << " : ("; + llvm::interleaveComma(llvm::concat(getReadInputs().getTypes(), + getWriteInputs().getTypes(), + getValueInputs().getTypes()), + printer); + printer << ") -> ("; + llvm::interleaveComma(llvm::concat(getWriteOutputs().getTypes(), + getValueOutputs().getTypes()), + printer); + printer << ")"; + + // Prints region. 
+ printer << " "; + printer.printRegion(getBody(), /*printEntryBlockArgs=*/true); +} + +//===----------------------------------------------------------------------===// +// TaskflowYieldOp +//===----------------------------------------------------------------------===// + +ParseResult TaskflowYieldOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector write_operands; + SmallVector write_types; + SmallVector value_operands; + SmallVector value_types; + + // Parses writes. + if (succeeded(parser.parseOptionalKeyword("writes"))) { + if (parser.parseLParen() || parser.parseOperandList(write_operands) || + parser.parseColonTypeList(write_types) || parser.parseRParen()) + return failure(); + } + + // Parses values. + if (succeeded(parser.parseOptionalKeyword("values"))) { + if (parser.parseLParen() || parser.parseOperandList(value_operands) || + parser.parseColonTypeList(value_types) || parser.parseRParen()) + return failure(); + } + + if (parser.resolveOperands(write_operands, write_types, + parser.getCurrentLocation(), result.operands) || + parser.resolveOperands(value_operands, value_types, + parser.getCurrentLocation(), result.operands)) + return failure(); + + result.addAttribute("operandSegmentSizes", + parser.getBuilder().getDenseI32ArrayAttr( + {static_cast(write_operands.size()), + static_cast(value_operands.size())})); + + return success(); +} + +void TaskflowYieldOp::print(OpAsmPrinter &printer) { + if (!getMemoryResults().empty()) { + printer << " writes("; + llvm::interleaveComma(getMemoryResults(), printer); + printer << " : "; + llvm::interleaveComma(getMemoryResults().getTypes(), printer); + printer << ")"; + } + + if (!getValueResults().empty()) { + printer << " values("; + llvm::interleaveComma(getValueResults(), printer); + printer << " : "; + llvm::interleaveComma(getValueResults().getTypes(), printer); + printer << ")"; + } +} \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt 
b/lib/TaskflowDialect/Transforms/CMakeLists.txt index ff12e671..a5443158 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -2,7 +2,6 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp - CanonicalizeTaskPass.cpp ClassifyCountersPass.cpp DEPENDS diff --git a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp deleted file mode 100644 index 4281fae2..00000000 --- a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp +++ /dev/null @@ -1,560 +0,0 @@ -#include "TaskflowDialect/TaskflowDialect.h" -#include "TaskflowDialect/TaskflowOps.h" -#include "TaskflowDialect/TaskflowPasses.h" - -#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/IRMapping.h" -#include "mlir/IR/Unit.h" -#include "mlir/IR/Value.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LLVM.h" -#include "mlir/Transforms/DialectConversion.h" -#include "llvm/ADT/SmallVector.h" - -using namespace mlir; -using namespace mlir::taskflow; - -namespace { -//---------------------------------------------------------------------- -// Memory and Value Access Info. -//---------------------------------------------------------------------- -// This struct analyzes accesses information within a hyperblock. -struct AccessInfo { - // Set of read memrefs. - SetVector memref_reads; - // Set of written memrefs. - SetVector memref_writes; - // Set of read values. 
- SetVector value_reads; - - void analyze(TaskflowHyperblockOp hyperblock, Block *task_body) { - DenseSet task_block_args; - for (Value arg : task_body->getArguments()) { - task_block_args.insert(arg); - } - - hyperblock.walk([&](Operation *op) { - if (auto load = dyn_cast(op)) { - this->memref_reads.insert(load.getMemRef()); - } else if (auto store = dyn_cast(op)) { - this->memref_writes.insert(store.getMemRef()); - } - - for (Value operand : op->getOperands()) { - if (task_block_args.contains(operand)) { - this->value_reads.insert(operand); - } - } - }); - } - - SetVector getAllMemRefs() const { - SetVector all; - all.insert(this->memref_reads.begin(), this->memref_reads.end()); - all.insert(this->memref_writes.begin(), this->memref_writes.end()); - return all; - } - - SetVector getAllValues() const { return this->value_reads; } -}; - -//---------------------------------------------------------------------- -// Counter Collector. -//---------------------------------------------------------------------- -// This class is used to collects all counters needed by a hyperblock. -class CounterCollector { -public: - void collect(TaskflowHyperblockOp hyperblock) { - for (Value idx : hyperblock.getIndices()) { - collectRecursively(idx); - } - } - - // Gets the collected counters sorted by their depth. - SmallVector getSortedCounters() const { - SmallVector result(this->counters.begin(), - this->counters.end()); - llvm::sort(result, [this](TaskflowCounterOp a, TaskflowCounterOp b) { - return getDepth(a) < getDepth(b); - }); - return result; - } - -private: - // Collects counters recursively. - void collectRecursively(Value idx) { - TaskflowCounterOp counter = idx.getDefiningOp(); - if (!counter) { - return; - } - this->counters.insert(counter); - if (Value parent = counter.getParentIndex()) { - collectRecursively(parent); - } - } - - // Gets the depth of a counter. 
- size_t getDepth(TaskflowCounterOp counter) const { - size_t depth = 0; - Value parent = counter.getParentIndex(); - while (parent) { - depth++; - if (TaskflowCounterOp p = parent.getDefiningOp()) { - parent = p.getParentIndex(); - } else { - break; - } - } - return depth; - } - - SetVector counters; -}; - -//---------------------------------------------------------------------- -// Block Argument Resolver. -//---------------------------------------------------------------------- -// This class resolves the input arguments of a task block to their source -// values. -// For example: -// taskflow.task(%buf_input, %val_input) { -// ^bb0(%arg0: memref, %arg1: i32): // ← block arguments -// // %arg0 corresponds to %buf_input -// // %arg1 corresponds to %val_input -// } -// resolveToSource(%arg0) -> %buf_input -class BlockArgResolver { -public: - explicit BlockArgResolver(TaskflowTaskOp task) { - Block *body = &task.getBody().front(); - - // Resolves memory inputs. - auto mem_inputs = task.getMemoryInputs(); - auto mem_args = body->getArguments().take_front(mem_inputs.size()); - for (auto [input, arg] : llvm::zip(mem_inputs, mem_args)) { - this->block_arg_to_source[arg] = input; - this->source_to_block_arg[input] = arg; - } - - // Resolves value inputs. - auto val_inputs = task.getValueInputs(); - auto val_args = body->getArguments().drop_front(mem_inputs.size()); - for (auto [input, arg] : llvm::zip(val_inputs, val_args)) { - this->block_arg_to_source[arg] = input; - this->source_to_block_arg[input] = arg; - } - } - - // Gets the source value for a given block argument. - Value resolveToSource(Value val) const { - auto it = this->block_arg_to_source.find(val); - return it != this->block_arg_to_source.end() ? it->second : val; - } - - // Gets the block argument for a given source value. - Value getBlockArg(Value source) const { - auto it = this->source_to_block_arg.find(source); - return it != this->source_to_block_arg.end() ? 
it->second : Value(); - } - -private: - // Maps block argument to its source value. - DenseMap block_arg_to_source; - // Maps source value to its block argument. - DenseMap source_to_block_arg; -}; - -//---------------------------------------------------------------------- -// Atomic Task Builder. -//---------------------------------------------------------------------- -// This class builds an atomic task from a hyperblock. -class AtomicTaskBuilder { -public: - AtomicTaskBuilder(OpBuilder &builder, Location loc, unsigned global_task_idx, - DenseMap &memref_to_latest_version, - DenseMap &value_to_latest_version) - : builder(builder), loc(loc), global_task_idx(global_task_idx), - memref_to_latest_version(memref_to_latest_version), - value_to_latest_version(value_to_latest_version) {} - - TaskflowTaskOp build(TaskflowHyperblockOp hyperblock, - TaskflowTaskOp original_task) { - AccessInfo access_info; - access_info.analyze(hyperblock, &original_task.getBody().front()); - - BlockArgResolver resolver(original_task); - - // Determines memref inputs. - SmallVector memref_inputs; - DenseMap source_to_memref_input_idx; - - for (Value memref : access_info.getAllMemRefs()) { - Value source = resolver.resolveToSource(memref); - Value input_memref = getLatestMemrefVersion(source); - - if (!source_to_memref_input_idx.count(source)) { - source_to_memref_input_idx[source] = memref_inputs.size(); - memref_inputs.push_back(input_memref); - } - } - - // Determines value inputs. - SmallVector value_inputs; - DenseMap source_to_value_input_idx; - - for (Value val : access_info.getAllValues()) { - Value source = resolver.resolveToSource(val); - Value input_val = getLatestValueVersion(source); - - if (!source_to_value_input_idx.count(source)) { - source_to_value_input_idx[source] = value_inputs.size(); - value_inputs.push_back(input_val); - } - } - - // Determines memref outputs. - SmallVector memref_output_types; - // The source memrefs of the written memrefs. 
- SmallVector written_memref_sources; - - for (Value memref : access_info.memref_writes) { - Value source = resolver.resolveToSource(memref); - memref_output_types.push_back(source.getType()); - written_memref_sources.push_back(source); - } - - // Determines value outputs. - SmallVector value_output_types; - SmallVector yielded_value_sources; - - if (!hyperblock.getOutputs().empty()) { - for (Value output : hyperblock.getOutputs()) { - value_output_types.push_back(output.getType()); - // For value outputs, they are source themselves. - yielded_value_sources.push_back(output); - } - } - - // Creates a new task. - std::string task_name = "Task_" + std::to_string(this->global_task_idx); - auto new_task = builder.create( - this->loc, memref_output_types, value_output_types, memref_inputs, - value_inputs, builder.getStringAttr(task_name)); - - // Creates the task body. - Block *task_body = new Block(); - new_task.getBody().push_back(task_body); - - // Adds memref input arguments. - for (Value input : memref_inputs) { - task_body->addArgument(input.getType(), this->loc); - } - // Adds value input arguments. - for (Value input : value_inputs) { - task_body->addArgument(input.getType(), this->loc); - } - - // Builds value mapping. - IRMapping mapping; - - // Maps memref inputs. - for (auto [source, idx] : source_to_memref_input_idx) { - BlockArgument new_arg = task_body->getArgument(idx); - mapping.map(source, new_arg); - - if (Value orig_arg = resolver.getBlockArg(source)) { - mapping.map(orig_arg, new_arg); - } - } - - // Maps value inputs. - size_t value_arg_offset = memref_inputs.size(); - for (auto [source, idx] : source_to_value_input_idx) { - BlockArgument new_arg = task_body->getArgument(value_arg_offset + idx); - mapping.map(source, new_arg); - - if (Value orig_arg = resolver.getBlockArg(source)) { - mapping.map(orig_arg, new_arg); - } - } - - // Clones counters and hyperblock. 
- OpBuilder task_builder(task_body, task_body->begin()); - cloneCounters(task_builder, hyperblock, mapping); - cloneHyperblock(task_builder, hyperblock, mapping); - - // Creates yield. - SmallVector memref_yield_operands; - for (Value memref : access_info.memref_writes) { - memref_yield_operands.push_back(mapping.lookupOrDefault(memref)); - } - - SmallVector value_yield_operands; - // If this hyperblock has value outputs, we need to yield them from the - // mapped hyperblock. - if (!hyperblock.getOutputs().empty()) { - // Finds the cloned hyperblock op. - TaskflowHyperblockOp cloned_hb = nullptr; - for (Operation &op : task_body->getOperations()) { - if (auto hb = dyn_cast(op)) { - cloned_hb = hb; - break; - } - if (cloned_hb) { - for (Value output : cloned_hb.getOutputs()) { - value_yield_operands.push_back(output); - } - } - } - } - - task_builder.setInsertionPointToEnd(task_body); - task_builder.create(this->loc, memref_yield_operands, - value_yield_operands); - - // Updates latest versions. - auto memref_outputs = new_task.getMemoryOutputs(); - for (auto [source, output] : - llvm::zip(written_memref_sources, memref_outputs)) { - this->memref_to_latest_version[source] = output; - } - - auto value_outputs = new_task.getValueOutputs(); - for (auto [source, output] : - llvm::zip(yielded_value_sources, value_outputs)) { - this->value_to_latest_version[source] = output; - } - - return new_task; - } - -private: - Value getLatestMemrefVersion(Value source) { - auto it = this->memref_to_latest_version.find(source); - return it != this->memref_to_latest_version.end() ? it->second : source; - } - - Value getLatestValueVersion(Value source) { - auto it = this->value_to_latest_version.find(source); - return it != this->value_to_latest_version.end() ? 
it->second : source; - } - - void cloneCounters(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, - IRMapping &mapping) { - CounterCollector collector; - collector.collect(hyperblock); - - for (TaskflowCounterOp counter : collector.getSortedCounters()) { - task_builder.clone(*counter.getOperation(), mapping); - } - } - - void cloneHyperblock(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, - IRMapping &mapping) { - SmallVector mapped_indices; - for (Value idx : hyperblock.getIndices()) { - mapped_indices.push_back(mapping.lookupOrDefault(idx)); - } - - SmallVector mapped_iter_args; - for (Value arg : hyperblock.getIterArgs()) { - mapped_iter_args.push_back(mapping.lookupOrDefault(arg)); - } - - SmallVector output_types(hyperblock.getOutputs().getTypes()); - auto newHB = task_builder.create( - this->loc, output_types, mapped_indices, mapped_iter_args); - - Block *new_body = new Block(); - newHB.getBody().push_back(new_body); - - for (Value idx : mapped_indices) { - new_body->addArgument(idx.getType(), this->loc); - } - - for (Value arg : mapped_iter_args) { - new_body->addArgument(arg.getType(), this->loc); - } - - Block *old_body = &hyperblock.getBody().front(); - for (auto [old_arg, new_arg] : - llvm::zip(old_body->getArguments(), new_body->getArguments())) { - mapping.map(old_arg, new_arg); - } - - OpBuilder hb_builder(new_body, new_body->begin()); - for (Operation &op : old_body->without_terminator()) { - hb_builder.clone(op, mapping); - } - - if (auto yield = - dyn_cast(old_body->getTerminator())) { - SmallVector yield_results; - SmallVector yield_iter_args_next; - for (Value v : yield.getResults()) { - yield_results.push_back(mapping.lookupOrDefault(v)); - } - for (Value v : yield.getIterArgsNext()) { - yield_iter_args_next.push_back(mapping.lookupOrDefault(v)); - } - hb_builder.create(this->loc, yield_results, - yield_iter_args_next); - } else { - hb_builder.create(this->loc); - } - } - - OpBuilder &builder; - Location loc; - unsigned 
global_task_idx; - DenseMap &memref_to_latest_version; - DenseMap &value_to_latest_version; -}; - -//---------------------------------------------------------------------- -// Pass Implementation. -//---------------------------------------------------------------------- - -struct CanonicalizeTaskPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeTaskPass) - - StringRef getArgument() const final { return "canonicalize-task"; } - - StringRef getDescription() const final { - return "Canonicalizes tasks by splitting each hyperblock into a separate " - "atomic task (one hyperblock per task)"; - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry - .insert(); - } - - void runOnOperation() override { - func::FuncOp func_op = getOperation(); - - SmallVector tasks_to_process; - func_op.walk( - [&](TaskflowTaskOp task_op) { tasks_to_process.push_back(task_op); }); - - unsigned global_task_idx = 0; - - for (TaskflowTaskOp original_task : tasks_to_process) { - OpBuilder builder(original_task); - // Collects hyperblocks within the original task. - SmallVector hyperblocks; - original_task.walk( - [&](TaskflowHyperblockOp hb) { hyperblocks.push_back(hb); }); - - assert(!hyperblocks.empty() && - "Expected at least one hyperblock in the task"); - - // If there's only one hyperblock, it is already canonical. - if (hyperblocks.size() == 1) { - std::string task_name = "Task_" + std::to_string(global_task_idx++); - original_task.setTaskNameAttr(builder.getStringAttr(task_name)); - continue; - } - - //---------------------------------------------------------------- - // Step 1: Builds mapping from original task's memory outputs to their - // corresponding source memrefs (the original inputs). - //---------------------------------------------------------------- - // Gets the yield operation to find which memrefs are yielded. 
- auto yield_op = cast( - original_task.getBody().front().getTerminator()); - auto original_mem_outputs = original_task.getMemoryOutputs(); - auto original_val_outputs = original_task.getValueOutputs(); - auto yielded_memrefs = yield_op.getMemoryResults(); - auto yielded_values = yield_op.getValueResults(); - - // Map: yielded -> original task output. - DenseMap yielded_to_output; - for (auto [yielded, output] : - llvm::zip(yielded_memrefs, original_mem_outputs)) { - yielded_to_output[yielded] = output; - } - for (auto [yielded, output] : - llvm::zip(yielded_values, original_val_outputs)) { - yielded_to_output[yielded] = output; - } - - // Map: original input memref -> original task output (if it's yielded). - // This tells us which original outputs correspond to which input memrefs. - Block *orig_body = &original_task.getBody().front(); - auto orig_mem_inputs = original_task.getMemoryInputs(); - auto orig_val_inputs = original_task.getValueInputs(); - - DenseMap source_to_original_output; - - // Maps memref inputs. - for (auto [input, arg] : llvm::zip( - orig_mem_inputs, - orig_body->getArguments().take_front(orig_mem_inputs.size()))) { - if (yielded_to_output.count(arg)) { - source_to_original_output[input] = yielded_to_output[arg]; - } - } - - // Maps value inputs. - for (auto [input, arg] : llvm::zip( - orig_val_inputs, - orig_body->getArguments().drop_front(orig_mem_inputs.size()))) { - if (yielded_to_output.count(arg)) { - source_to_original_output[input] = yielded_to_output[arg]; - } - } - - //---------------------------------------------------------------- - // Step 2: Creates atomic tasks for each hyperblock. - //---------------------------------------------------------------- - // Records the mapping from source memref to the latest version after - // executing each atomic task. 
- DenseMap memref_to_latest_version; - DenseMap value_to_latest_version; - - for (size_t i = 0; i < hyperblocks.size(); ++i) { - AtomicTaskBuilder task_builder( - builder, original_task.getLoc(), global_task_idx++, - memref_to_latest_version, value_to_latest_version); - task_builder.build(hyperblocks[i], original_task); - } - - //---------------------------------------------------------------- - // Step 3: Replaces uses of original task outputs with the latest - // versions. - //---------------------------------------------------------------- - for (auto [source, original_output] : source_to_original_output) { - Value latest = nullptr; - if (memref_to_latest_version.count(source)) { - latest = memref_to_latest_version[source]; - } else if (value_to_latest_version.count(source)) { - latest = value_to_latest_version[source]; - } - - if (latest) { - original_output.replaceAllUsesWith(latest); - } - } - - //---------------------------------------------------------------- - // Step 4: Erase the original task. 
- //---------------------------------------------------------------- - original_task.erase(); - } - } -}; - -} // namespace - -std::unique_ptr mlir::taskflow::createCanonicalizeTaskPass() { - return std::make_unique(); -} \ No newline at end of file From ac60077f98c69010b1c6b004c3438aae5799b28c Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sun, 1 Feb 2026 17:30:03 +0800 Subject: [PATCH 3/9] [fix] fix bug in hyperblock construction --- .../ConstructHyperblockFromTaskPass.cpp | 2 +- .../TosaToTaskflow/affine-to-taskflow.mlir | 27 +++++++++------- .../TosaToTaskflow/tosa-to-taskflow.mlir | 30 ++++++++++-------- test/e2e/tosa_e2e.mlir | 31 ++++++++++--------- test/multi-cgra/kernel_mapping/fir/fir.mlir | 7 ++--- 5 files changed, 53 insertions(+), 44 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index 792412ff..c1e6ddff 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -206,6 +206,7 @@ struct LECPattern { // Detects Loop-Epilogue Code pattern in the task. 
static LECPattern detectLECPattern(affine::AffineForOp outer_loop) { LECPattern pattern; + pattern.has_lec_pattern = false; pattern.outer_loop = outer_loop; Block &body = outer_loop.getRegion().front(); @@ -220,7 +221,6 @@ static LECPattern detectLECPattern(affine::AffineForOp outer_loop) { } else if (!(isa(&op) && op.getOperands().empty())) { if (!found_nested_loop) { pattern.prologue_code.push_back(&op); - pattern.has_lec_pattern = true; } else { pattern.epilogue_code.push_back(&op); pattern.has_lec_pattern = true; diff --git a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir index 3f07f91d..38bb3ca2 100644 --- a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir @@ -1,4 +1,6 @@ -// RUN: mlir-neura-opt --convert-affine-to-taskflow %s 2>/dev/null | FileCheck %s +// RUN: mlir-neura-opt --convert-affine-to-taskflow %s \ +// RUN: -o %t-taskflow.mlir +// RUN: FileCheck %s --input-file=%t-taskflow.mlir // Test Affine to Taskflow conversion module { @@ -13,15 +15,16 @@ module { } } -// CHECK-LABEL: func.func @simple_add -// CHECK-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %arg2) -// CHECK-SAME: task_name = "Task_0" -// CHECK-NEXT: ^bb0(%arg3: memref<16xf32>, %arg4: memref<16xf32>, %arg5: memref<16xf32>): -// CHECK-NEXT: affine.for %arg6 = 0 to 16 { -// CHECK-NEXT: %0 = affine.load %arg3[%arg6] : memref<16xf32> -// CHECK-NEXT: %1 = affine.load %arg4[%arg6] : memref<16xf32> -// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 -// CHECK-NEXT: affine.store %2, %arg5[%arg6] : memref<16xf32> +// CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>, %arg2: memref<16xf32>) { +// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%arg2 : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg2)] : (memref<16xf32>, memref<16xf32>, 
memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: ^bb0(%arg3: memref<16xf32>, %arg4: memref<16xf32>, %arg5: memref<16xf32>): +// CHECK-NEXT: affine.for %arg6 = 0 to 16 { +// CHECK-NEXT: %0 = affine.load %arg3[%arg6] : memref<16xf32> +// CHECK-NEXT: %1 = affine.load %arg4[%arg6] : memref<16xf32> +// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 +// CHECK-NEXT: affine.store %2, %arg5[%arg6] : memref<16xf32> +// CHECK-NEXT: } +// CHECK-NEXT: taskflow.yield writes(%arg5 : memref<16xf32>) +// CHECK-NEXT: } +// CHECK-NEXT: return // CHECK-NEXT: } -// CHECK-NEXT: "taskflow.yield"(%arg5) -// CHECK: return diff --git a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir index 7c2356cf..dd7083ba 100644 --- a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir @@ -1,4 +1,7 @@ -// RUN: mlir-neura-opt --pass-pipeline='builtin.module(func.func(tosa-infer-shapes,tosa-make-broadcastable,tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith,tosa-to-tensor,linalg-fuse-elementwise-ops),one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map},func.func(convert-linalg-to-affine-loops),convert-affine-to-taskflow)' %s 2>&1 | FileCheck %s +// RUN: mlir-neura-opt --pass-pipeline='builtin.module(func.func(tosa-infer-shapes,tosa-make-broadcastable,tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith,tosa-to-tensor,linalg-fuse-elementwise-ops),one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map},func.func(convert-linalg-to-affine-loops),convert-affine-to-taskflow)' %s \ +// RUN: -o %t-taskflow.mlir +// RUN: FileCheck %s --input-file=%t-taskflow.mlir + // Simple TOSA add lowering test func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf32> { @@ -6,16 +9,17 @@ func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16 return %0 : 
tensor<16xf32> } -// CHECK-LABEL: func.func @simple_add -// CHECK: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK: %[[RES:.*]] = "taskflow.task"(%arg0, %arg1, %alloc) -// CHECK-SAME: task_name = "Task_0" -// CHECK-NEXT: ^bb0(%[[BA1:.*]]: memref<16xf32>, %[[BA2:.*]]: memref<16xf32>, %[[BA3:.*]]: memref<16xf32>): -// CHECK-NEXT: affine.for %[[IV:.*]] = 0 to 16 { -// CHECK-NEXT: %0 = affine.load %[[BA1]][%[[IV]]] : memref<16xf32> -// CHECK-NEXT: %1 = affine.load %[[BA2]][%[[IV]]] : memref<16xf32> -// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 -// CHECK-NEXT: affine.store %2, %[[BA3]][%[[IV]]] : memref<16xf32> +// CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { +// CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> +// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): +// CHECK-NEXT: affine.for %arg5 = 0 to 16 { +// CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> +// CHECK-NEXT: %1 = affine.load %arg3[%arg5] : memref<16xf32> +// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 +// CHECK-NEXT: affine.store %2, %arg4[%arg5] : memref<16xf32> +// CHECK-NEXT: } +// CHECK-NEXT: taskflow.yield writes(%arg4 : memref<16xf32>) // CHECK-NEXT: } -// CHECK-NEXT: "taskflow.yield"(%[[BA3]]) -// CHECK: return %[[RES]] : memref<16xf32> +// CHECK-NEXT: return %write_outputs : memref<16xf32> +// CHECK-NEXT: } diff --git a/test/e2e/tosa_e2e.mlir b/test/e2e/tosa_e2e.mlir index 19a75576..f291ffd7 100644 --- a/test/e2e/tosa_e2e.mlir +++ b/test/e2e/tosa_e2e.mlir @@ -1,4 +1,6 @@ -// RUN: mlir-neura-opt %s 
--pass-pipeline='builtin.module(func.func(tosa-infer-shapes,tosa-make-broadcastable,tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith,tosa-to-tensor,linalg-fuse-elementwise-ops),one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map},func.func(convert-linalg-to-affine-loops),convert-affine-to-taskflow)' | FileCheck %s +// RUN: mlir-neura-opt %s --pass-pipeline='builtin.module(func.func(tosa-infer-shapes,tosa-make-broadcastable,tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith,tosa-to-tensor,linalg-fuse-elementwise-ops),one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map},func.func(convert-linalg-to-affine-loops),convert-affine-to-taskflow)' \ +// RUN: -o %t-taskflow.mlir +// RUN: FileCheck %s --input-file=%t-taskflow.mlir // Verifies the end-to-end lowering from TOSA to Taskflow. func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf32> { @@ -7,17 +9,18 @@ func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf return %1 : tensor<16xf32> } -// CHECK-LABEL: func.func @test_e2e -// CHECK: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK: %[[RES:.*]] = "taskflow.task"(%arg0, %arg1, %alloc) -// CHECK-SAME: task_name = "Task_0" -// CHECK-NEXT: ^bb0(%[[BA1:.*]]: memref<16xf32>, %[[BA2:.*]]: memref<16xf32>, %[[BA3:.*]]: memref<16xf32>): -// CHECK-NEXT: affine.for %[[IV:.*]] = 0 to 16 { -// CHECK-NEXT: %0 = affine.load %[[BA1]][%[[IV]]] : memref<16xf32> -// CHECK-NEXT: %1 = affine.load %[[BA2]][%[[IV]]] : memref<16xf32> -// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 -// CHECK-NEXT: %3 = arith.mulf %2, %2 : f32 -// CHECK-NEXT: affine.store %3, %[[BA3]][%[[IV]]] : memref<16xf32> +// CHECK: func.func @test_e2e(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { +// CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> +// CHECK-NEXT: %write_outputs = taskflow.task 
@Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): +// CHECK-NEXT: affine.for %arg5 = 0 to 16 { +// CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> +// CHECK-NEXT: %1 = affine.load %arg3[%arg5] : memref<16xf32> +// CHECK-NEXT: %2 = arith.addf %0, %1 : f32 +// CHECK-NEXT: %3 = arith.mulf %2, %2 : f32 +// CHECK-NEXT: affine.store %3, %arg4[%arg5] : memref<16xf32> +// CHECK-NEXT: } +// CHECK-NEXT: taskflow.yield writes(%arg4 : memref<16xf32>) // CHECK-NEXT: } -// CHECK-NEXT: "taskflow.yield"(%[[BA3]]) -// CHECK: return %[[RES]] : memref<16xf32> +// CHECK-NEXT: return %write_outputs : memref<16xf32> +// CHECK-NEXT: } diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index cc2bf924..d8facaa3 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -4,7 +4,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: -o %t.canonicalized.mlir // RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE @@ -100,7 +99,7 @@ module attributes {} { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// TASKFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: %value_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, 
%arg2)] : (memref, memref, i32) -> (i32) { // TASKFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // TASKFLOW-NEXT: %0 = affine.for %arg6 = 0 to 32 iter_args(%arg7 = %arg5) -> (i32) { // TASKFLOW-NEXT: %1 = affine.load %arg3[%arg6] : memref @@ -109,8 +108,8 @@ module attributes {} { // TASKFLOW-NEXT: %4 = arith.addi %arg7, %3 : i32 // TASKFLOW-NEXT: affine.yield %4 : i32 // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () -// TASKFLOW-NEXT: }) : (memref, memref, i32) -> i32 +// TASKFLOW-NEXT: taskflow.yield values(%0 : i32) +// TASKFLOW-NEXT: } // TASKFLOW-NEXT: return %value_outputs : i32 // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } From 02faa878f4b95a2f22ad0fedcd0648da59b72fe9 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sun, 1 Feb 2026 21:28:39 +0800 Subject: [PATCH 4/9] modify the ple pattern --- include/TaskflowDialect/TaskflowOps.td | 4 +- include/TaskflowDialect/TaskflowPasses.h | 1 + include/TaskflowDialect/TaskflowPasses.td | 14 + lib/TaskflowDialect/TaskflowOps.cpp | 32 +- lib/TaskflowDialect/Transforms/CMakeLists.txt | 1 + .../Transforms/CanonicalizeTaskPass.cpp | 676 ++++++++++++++++++ .../ConstructHyperblockFromTaskPass.cpp | 58 +- .../TosaToTaskflow/affine-to-taskflow.mlir | 2 +- .../TosaToTaskflow/tosa-to-taskflow.mlir | 2 +- test/e2e/tosa_e2e.mlir | 2 +- 10 files changed, 744 insertions(+), 48 deletions(-) create mode 100644 lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td index d4d7c326..8359b0cc 100644 --- a/include/TaskflowDialect/TaskflowOps.td +++ b/include/TaskflowDialect/TaskflowOps.td @@ -61,8 +61,8 @@ def TaskflowTaskOp : TaskflowOpBase<"task", [ }]; let arguments = (ins - Variadic:$read_inputs, - Variadic:$write_inputs, + Variadic:$read_memrefs, + Variadic:$write_memrefs, Variadic:$value_inputs, StrAttr:$task_name, Variadic:$original_read_memrefs, diff --git 
a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index 09a28aee..c4c73b6b 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -18,6 +18,7 @@ namespace taskflow { std::unique_ptr createAffineLoopTreeSerializationPass(); std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); +std::unique_ptr createCanonicalizeTaskPass(); #define GEN_PASS_REGISTRATION #include "TaskflowDialect/TaskflowPasses.h.inc" diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 1e3c55c2..7f2e78b6 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -29,6 +29,20 @@ def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func:: let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; } +def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{ + let summary = "Canonicalizes tasks by splitting each hyperblock into a separate atomic task"; + let description = [{ + This pass splits tasks so that each task contains exactly one hyperblock. + This creates atomic task units that can be analyzed and optimized independently. + + Input: Task with N hyperblocks + Output: N atomic tasks, each containing one hyperblock + + This is a prerequisite pass before fusion optimizations. 
+ }]; + let constructor = "taskflow::createCanonicalizeTaskPass()"; +} + def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let summary = "Classifies counters as root/relay/leaf"; let description = [{ diff --git a/lib/TaskflowDialect/TaskflowOps.cpp b/lib/TaskflowDialect/TaskflowOps.cpp index 44aa255a..06fa3c49 100644 --- a/lib/TaskflowDialect/TaskflowOps.cpp +++ b/lib/TaskflowDialect/TaskflowOps.cpp @@ -19,20 +19,20 @@ ParseResult TaskflowTaskOp::parse(OpAsmParser &parser, OperationState &result) { return failure(); result.addAttribute("task_name", task_name); - // Parses read_inputs: read_inputs(%arg0, %arg1 : memref, + // Parses read_memrefs: read_memrefs(%arg0, %arg1 : memref, // memref). SmallVector read_operands; SmallVector read_types; - if (succeeded(parser.parseOptionalKeyword("read_inputs"))) { + if (succeeded(parser.parseOptionalKeyword("read_memrefs"))) { if (parser.parseLParen() || parser.parseOperandList(read_operands) || parser.parseColonTypeList(read_types) || parser.parseRParen()) return failure(); } - // Parses write_inputs: write_inputs(%arg5 : memref). + // Parses write_memrefs: write_memrefs(%arg5 : memref). SmallVector write_operands; SmallVector write_types; - if (succeeded(parser.parseOptionalKeyword("write_inputs"))) { + if (succeeded(parser.parseOptionalKeyword("write_memrefs"))) { if (parser.parseLParen() || parser.parseOperandList(write_operands) || parser.parseColonTypeList(write_types) || parser.parseRParen()) return failure(); @@ -151,21 +151,21 @@ void TaskflowTaskOp::print(OpAsmPrinter &printer) { // Prints task name. printer << " @" << getTaskName(); - // Prints read_inputs. - if (!getReadInputs().empty()) { - printer << " read_inputs("; - llvm::interleaveComma(getReadInputs(), printer); + // Prints read_memrefs. 
+ if (!getReadMemrefs().empty()) { + printer << " read_memrefs("; + llvm::interleaveComma(getReadMemrefs(), printer); printer << " : "; - llvm::interleaveComma(getReadInputs().getTypes(), printer); + llvm::interleaveComma(getReadMemrefs().getTypes(), printer); printer << ")"; } - // Prints write_inputs. - if (!getWriteInputs().empty()) { - printer << " write_inputs("; - llvm::interleaveComma(getWriteInputs(), printer); + // Prints write_memrefs. + if (!getWriteMemrefs().empty()) { + printer << " write_memrefs("; + llvm::interleaveComma(getWriteMemrefs(), printer); printer << " : "; - llvm::interleaveComma(getWriteInputs().getTypes(), printer); + llvm::interleaveComma(getWriteMemrefs().getTypes(), printer); printer << ")"; } @@ -208,8 +208,8 @@ void TaskflowTaskOp::print(OpAsmPrinter &printer) { // Prints function type. printer << " : ("; - llvm::interleaveComma(llvm::concat(getReadInputs().getTypes(), - getWriteInputs().getTypes(), + llvm::interleaveComma(llvm::concat(getReadMemrefs().getTypes(), + getWriteMemrefs().getTypes(), getValueInputs().getTypes()), printer); printer << ") -> ("; diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index a5443158..ff12e671 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp + CanonicalizeTaskPass.cpp ClassifyCountersPass.cpp DEPENDS diff --git a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp new file mode 100644 index 00000000..636e02b9 --- /dev/null +++ b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp @@ -0,0 +1,676 @@ +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" + +#include 
"mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/Unit.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/SmallVector.h" + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { +//---------------------------------------------------------------------- +// Memory and Value Access Info. +//---------------------------------------------------------------------- +// This struct analyzes accesses information within a hyperblock. +struct AccessInfo { + // Set of read memrefs. + SetVector memref_reads; + // Set of written memrefs. + SetVector memref_writes; + // Set of read values. + SetVector value_reads; + + void analyze(TaskflowHyperblockOp hyperblock, Block *task_body) { + DenseSet task_block_args; + for (Value arg : task_body->getArguments()) { + task_block_args.insert(arg); + } + + hyperblock.walk([&](Operation *op) { + if (auto load = dyn_cast(op)) { + this->memref_reads.insert(load.getMemRef()); + } else if (auto store = dyn_cast(op)) { + this->memref_writes.insert(store.getMemRef()); + } + + for (Value operand : op->getOperands()) { + if (task_block_args.contains(operand)) { + this->value_reads.insert(operand); + } + } + }); + } + + SetVector getReadMemRefs() const { + SetVector all; + all.insert(this->memref_reads.begin(), this->memref_reads.end()); + return all; + } + + SetVector getWriteMemRefs() const { + SetVector all; + all.insert(this->memref_writes.begin(), this->memref_writes.end()); + return all; + } + + SetVector getAllValues() const { return this->value_reads; } +}; + +//---------------------------------------------------------------------- 
+// Counter Collector. +//---------------------------------------------------------------------- +// This class is used to collects all counters needed by a hyperblock. +class CounterCollector { +public: + void collect(TaskflowHyperblockOp hyperblock) { + for (Value idx : hyperblock.getIndices()) { + collectRecursively(idx); + } + } + + // Gets the collected counters sorted by their depth. + SmallVector getSortedCounters() const { + SmallVector result(this->counters.begin(), + this->counters.end()); + llvm::sort(result, [this](TaskflowCounterOp a, TaskflowCounterOp b) { + return getDepth(a) < getDepth(b); + }); + return result; + } + +private: + // Collects counters recursively. + void collectRecursively(Value idx) { + TaskflowCounterOp counter = idx.getDefiningOp(); + if (!counter) { + return; + } + this->counters.insert(counter); + if (Value parent = counter.getParentIndex()) { + collectRecursively(parent); + } + } + + // Gets the depth of a counter. + size_t getDepth(TaskflowCounterOp counter) const { + size_t depth = 0; + Value parent = counter.getParentIndex(); + while (parent) { + depth++; + if (TaskflowCounterOp p = parent.getDefiningOp()) { + parent = p.getParentIndex(); + } else { + break; + } + } + return depth; + } + + SetVector counters; +}; + +//---------------------------------------------------------------------- +// Block Argument Resolver. +//---------------------------------------------------------------------- +// This class resolves the input arguments of a task block to their source +// values. +// For example: +// taskflow.task(%buf_input, %val_input) { +// ^bb0(%arg0: memref, %arg1: i32): // ← block arguments +// // %arg0 corresponds to %buf_input +// // %arg1 corresponds to %val_input +// } +// resolveToSource(%arg0) -> %buf_input +class BlockArgResolver { +public: + explicit BlockArgResolver(TaskflowTaskOp task) { + Block *body = &task.getBody().front(); + + // Resolves memory inputs. 
+ auto read_memrefs = task.getReadMemrefs(); + auto read_args = body->getArguments().take_front(read_memrefs.size()); + for (auto [input, arg] : llvm::zip(read_memrefs, read_args)) { + this->block_arg_to_source[arg] = input; + this->source_to_block_arg[input] = arg; + } + + // Resolves memory inputs. + auto write_memrefs = task.getWriteMemrefs(); + auto mem_args = body->getArguments().take_front(write_memrefs.size()); + for (auto [input, arg] : llvm::zip(write_memrefs, mem_args)) { + this->block_arg_to_source[arg] = input; + this->source_to_block_arg[input] = arg; + } + + // Resolves value inputs. + auto val_inputs = task.getValueInputs(); + auto val_args = body->getArguments().drop_front(read_memrefs.size() + + write_memrefs.size()); + for (auto [input, arg] : llvm::zip(val_inputs, val_args)) { + this->block_arg_to_source[arg] = input; + this->source_to_block_arg[input] = arg; + } + } + + // Gets the source value for a given block argument. + Value resolveToSource(Value val) const { + auto it = this->block_arg_to_source.find(val); + return it != this->block_arg_to_source.end() ? it->second : val; + } + + // Gets the block argument for a given source value. + Value getBlockArg(Value source) const { + auto it = this->source_to_block_arg.find(source); + return it != this->source_to_block_arg.end() ? it->second : Value(); + } + +private: + // Maps block argument to its source value. + DenseMap block_arg_to_source; + // Maps source value to its block argument. + DenseMap source_to_block_arg; +}; + +//---------------------------------------------------------------------- +// Atomic Task Builder. +//---------------------------------------------------------------------- +// This class builds an atomic task from a hyperblock. 
+class AtomicTaskBuilder { +public: + AtomicTaskBuilder(OpBuilder &builder, Location loc, unsigned global_task_idx, + DenseMap &memref_to_latest_version, + DenseMap &value_to_latest_version) + : builder(builder), loc(loc), global_task_idx(global_task_idx), + memref_to_latest_version(memref_to_latest_version), + value_to_latest_version(value_to_latest_version) {} + + TaskflowTaskOp build(TaskflowHyperblockOp hyperblock, + TaskflowTaskOp original_task) { + AccessInfo access_info; + access_info.analyze(hyperblock, &original_task.getBody().front()); + + BlockArgResolver resolver(original_task); + + //------------------------------------------------------ + // Step1: Determines read/write memresfs and value inputs. + //------------------------------------------------------ + SmallVector read_memrefs; + SmallVector write_memrefs; + SmallVector value_inputs; + + DenseMap source_to_read_memref_idx; + DenseMap source_to_write_memref_idx; + DenseMap source_to_value_input_idx; + + // Classifies memrefs into read and write sets. 
+ for (Value memref : access_info.getReadMemRefs()) { + Value source = resolver.resolveToSource(memref); + Value input_memref = getLatestMemrefVersion(source); + + if (!source_to_read_memref_idx.count(source)) { + source_to_read_memref_idx[source] = read_memrefs.size(); + read_memrefs.push_back(input_memref); + } + } + + for (Value memref : access_info.getWriteMemRefs()) { + Value source = resolver.resolveToSource(memref); + Value input_memref = getLatestMemrefVersion(source); + + if (!source_to_write_memref_idx.count(source)) { + source_to_write_memref_idx[source] = write_memrefs.size(); + write_memrefs.push_back(input_memref); + } + } + + for (Value val : access_info.getAllValues()) { + Value source = resolver.resolveToSource(val); + Value input_val = getLatestValueVersion(source); + + if (!source_to_value_input_idx.count(source)) { + source_to_value_input_idx[source] = value_inputs.size(); + value_inputs.push_back(input_val); + } + } + + //------------------------------------------------------ + // Step 2: Determines output types. + //------------------------------------------------------ + // Determines memref outputs. + SmallVector memref_output_types; + // The source memrefs of the written memrefs. + SmallVector written_memref_sources; + + for (Value memref : access_info.memref_writes) { + Value source = resolver.resolveToSource(memref); + memref_output_types.push_back(source.getType()); + written_memref_sources.push_back(source); + } + + // Determines value outputs. + SmallVector value_output_types; + SmallVector yielded_value_sources; + + if (!hyperblock.getOutputs().empty()) { + for (Value output : hyperblock.getOutputs()) { + value_output_types.push_back(output.getType()); + // For value outputs, they are source themselves. + yielded_value_sources.push_back(output); + } + } + + //------------------------------------------------------ + // Step 3: Resolves original_read_memrefs and original_write_memrefs. 
+ //------------------------------------------------------ + // Map: block arg -> original memref (from original task). + DenseMap arg_to_original_read; + DenseMap arg_to_original_write; + + Block *orig_body = &original_task.getBody().front(); + auto orig_read_memrefs = original_task.getOriginalReadMemrefs(); + auto orig_write_memrefs = original_task.getOriginalWriteMemrefs(); + + size_t read_arg_count = original_task.getReadMemrefs().size(); + size_t write_arg_count = original_task.getWriteMemrefs().size(); + + // Maps read args to original read memrefs. + for (auto [orig_memref, arg] : + llvm::zip(orig_read_memrefs, + orig_body->getArguments().take_front(read_arg_count))) { + arg_to_original_read[arg] = orig_memref; + } + + // Maps write args to original write memrefs. + for (auto [orig_memref, arg] : + llvm::zip(orig_write_memrefs, orig_body->getArguments().slice( + read_arg_count, write_arg_count))) { + arg_to_original_write[arg] = orig_memref; + } + + // Collects original memrefs for this new task. + SmallVector new_original_read_memrefs; + SmallVector new_original_write_memrefs; + + for (Value memref : access_info.memref_reads) { + if (arg_to_original_read.count(memref)) { + new_original_read_memrefs.push_back(arg_to_original_read[memref]); + } else if (arg_to_original_write.count(memref)) { + // If reading from a write memref, add to original read. + new_original_read_memrefs.push_back(arg_to_original_write[memref]); + } + } + + for (Value memref : access_info.memref_writes) { + if (arg_to_original_write.count(memref)) { + new_original_write_memrefs.push_back(arg_to_original_write[memref]); + } else if (arg_to_original_read.count(memref)) { + // If writing to a read memref, add to original write. + new_original_write_memrefs.push_back(arg_to_original_read[memref]); + } + } + + //------------------------------------------------------ + // Step 4: Creates the new taskflow.task operation. 
+ //------------------------------------------------------ + std::string task_name = "Task_" + std::to_string(this->global_task_idx); + auto new_task = builder.create( + this->loc, memref_output_types, value_output_types, read_memrefs, + write_memrefs, value_inputs, builder.getStringAttr(task_name), + new_original_read_memrefs, new_original_write_memrefs); + + //------------------------------------------------------ + // Step 5: Builds the task body. + //------------------------------------------------------ + Block *task_body = new Block(); + new_task.getBody().push_back(task_body); + + // Adds block arguments: [read_memrefs, write_memrefs, value_inputs]. + for (Value input : read_memrefs) { + task_body->addArgument(input.getType(), this->loc); + } + for (Value input : write_memrefs) { + task_body->addArgument(input.getType(), this->loc); + } + for (Value input : value_inputs) { + task_body->addArgument(input.getType(), this->loc); + } + + // Builds value mapping. + IRMapping mapping; + + // Maps read memrefs. + for (auto [source, idx] : source_to_read_memref_idx) { + BlockArgument new_arg = task_body->getArgument(idx); + mapping.map(source, new_arg); + + if (Value orig_arg = resolver.getBlockArg(source)) { + mapping.map(orig_arg, new_arg); + } + } + + // Maps write memrefs. + size_t write_arg_offset = read_memrefs.size(); + for (auto [source, idx] : source_to_write_memref_idx) { + BlockArgument new_arg = task_body->getArgument(write_arg_offset + idx); + mapping.map(source, new_arg); + + if (Value orig_arg = resolver.getBlockArg(source)) { + mapping.map(orig_arg, new_arg); + } + } + + // Maps value inputs. 
+ size_t value_arg_offset = read_memrefs.size() + write_memrefs.size(); + for (auto [source, idx] : source_to_value_input_idx) { + BlockArgument new_arg = task_body->getArgument(value_arg_offset + idx); + mapping.map(source, new_arg); + + if (Value orig_arg = resolver.getBlockArg(source)) { + mapping.map(orig_arg, new_arg); + } + } + + // Clones counters and hyperblock. + OpBuilder task_builder(task_body, task_body->begin()); + cloneCounters(task_builder, hyperblock, mapping); + cloneHyperblock(task_builder, hyperblock, mapping); + + // Creates yield. + SmallVector memref_yield_operands; + for (Value memref : access_info.memref_writes) { + memref_yield_operands.push_back(mapping.lookupOrDefault(memref)); + } + + SmallVector value_yield_operands; + // If this hyperblock has value outputs, we need to yield them from the + // mapped hyperblock. + if (!hyperblock.getOutputs().empty()) { + // Finds the cloned hyperblock op. + TaskflowHyperblockOp cloned_hb = nullptr; + for (Operation &op : task_body->getOperations()) { + if (auto hb = dyn_cast(op)) { + cloned_hb = hb; + break; + } + if (cloned_hb) { + for (Value output : cloned_hb.getOutputs()) { + value_yield_operands.push_back(output); + } + } + } + } + + task_builder.setInsertionPointToEnd(task_body); + task_builder.create(this->loc, memref_yield_operands, + value_yield_operands); + + //------------------------------------------------------ + // Step 6: Updates latest versions. + //------------------------------------------------------ + // Updates latest versions. 
+ auto memref_outputs = new_task.getWriteOutputs(); + for (auto [source, output] : + llvm::zip(written_memref_sources, memref_outputs)) { + this->memref_to_latest_version[source] = output; + } + + auto value_outputs = new_task.getValueOutputs(); + for (auto [source, output] : + llvm::zip(yielded_value_sources, value_outputs)) { + this->value_to_latest_version[source] = output; + } + + return new_task; + } + +private: + Value getLatestMemrefVersion(Value source) { + auto it = this->memref_to_latest_version.find(source); + return it != this->memref_to_latest_version.end() ? it->second : source; + } + + Value getLatestValueVersion(Value source) { + auto it = this->value_to_latest_version.find(source); + return it != this->value_to_latest_version.end() ? it->second : source; + } + + void cloneCounters(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, + IRMapping &mapping) { + CounterCollector collector; + collector.collect(hyperblock); + + for (TaskflowCounterOp counter : collector.getSortedCounters()) { + task_builder.clone(*counter.getOperation(), mapping); + } + } + + void cloneHyperblock(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, + IRMapping &mapping) { + SmallVector mapped_indices; + for (Value idx : hyperblock.getIndices()) { + mapped_indices.push_back(mapping.lookupOrDefault(idx)); + } + + SmallVector mapped_iter_args; + for (Value arg : hyperblock.getIterArgs()) { + mapped_iter_args.push_back(mapping.lookupOrDefault(arg)); + } + + SmallVector output_types(hyperblock.getOutputs().getTypes()); + auto newHB = task_builder.create( + this->loc, output_types, mapped_indices, mapped_iter_args); + + Block *new_body = new Block(); + newHB.getBody().push_back(new_body); + + for (Value idx : mapped_indices) { + new_body->addArgument(idx.getType(), this->loc); + } + + for (Value arg : mapped_iter_args) { + new_body->addArgument(arg.getType(), this->loc); + } + + Block *old_body = &hyperblock.getBody().front(); + for (auto [old_arg, new_arg] : + 
llvm::zip(old_body->getArguments(), new_body->getArguments())) { + mapping.map(old_arg, new_arg); + } + + OpBuilder hb_builder(new_body, new_body->begin()); + for (Operation &op : old_body->without_terminator()) { + hb_builder.clone(op, mapping); + } + + if (auto yield = + dyn_cast(old_body->getTerminator())) { + SmallVector yield_results; + SmallVector yield_iter_args_next; + for (Value v : yield.getResults()) { + yield_results.push_back(mapping.lookupOrDefault(v)); + } + for (Value v : yield.getIterArgsNext()) { + yield_iter_args_next.push_back(mapping.lookupOrDefault(v)); + } + hb_builder.create(this->loc, yield_results, + yield_iter_args_next); + } else { + hb_builder.create(this->loc); + } + } + + OpBuilder &builder; + Location loc; + unsigned global_task_idx; + DenseMap &memref_to_latest_version; + DenseMap &value_to_latest_version; +}; + +//---------------------------------------------------------------------- +// Pass Implementation. +//---------------------------------------------------------------------- + +struct CanonicalizeTaskPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeTaskPass) + + StringRef getArgument() const final { return "canonicalize-task"; } + + StringRef getDescription() const final { + return "Canonicalizes tasks by splitting each hyperblock into a separate " + "atomic task (one hyperblock per task)"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry + .insert(); + } + + void runOnOperation() override { + func::FuncOp func_op = getOperation(); + + SmallVector tasks_to_process; + func_op.walk( + [&](TaskflowTaskOp task_op) { tasks_to_process.push_back(task_op); }); + + unsigned global_task_idx = 0; + + for (TaskflowTaskOp original_task : tasks_to_process) { + OpBuilder builder(original_task); + // Collects hyperblocks within the original task. 
+ SmallVector hyperblocks; + original_task.walk( + [&](TaskflowHyperblockOp hb) { hyperblocks.push_back(hb); }); + + assert(!hyperblocks.empty() && + "Expected at least one hyperblock in the task"); + + // If there's only one hyperblock, it is already canonical. + if (hyperblocks.size() == 1) { + std::string task_name = "Task_" + std::to_string(global_task_idx++); + original_task.setTaskNameAttr(builder.getStringAttr(task_name)); + continue; + } + + //---------------------------------------------------------------- + // Step 1: Builds mapping from original task's memory outputs to their + // corresponding source memrefs (the original inputs). + //---------------------------------------------------------------- + // Gets the yield operation to find which memrefs are yielded. + auto yield_op = cast( + original_task.getBody().front().getTerminator()); + + auto original_write_outputs = original_task.getWriteOutputs(); + auto original_val_outputs = original_task.getValueOutputs(); + auto yielded_memrefs = yield_op.getMemoryResults(); + auto yielded_values = yield_op.getValueResults(); + + // Map: yielded -> original task output. + DenseMap yielded_to_output; + for (auto [yielded, output] : + llvm::zip(yielded_memrefs, original_write_outputs)) { + yielded_to_output[yielded] = output; + } + for (auto [yielded, output] : + llvm::zip(yielded_values, original_val_outputs)) { + yielded_to_output[yielded] = output; + } + + // Map: original input memref -> original task output (if it's yielded). + // This tells us which original outputs correspond to which input memrefs. + Block *orig_body = &original_task.getBody().front(); + auto orig_read_memrefs = original_task.getReadMemrefs(); + auto orig_write_memrefs = original_task.getWriteMemrefs(); + auto orig_val_inputs = original_task.getValueInputs(); + + DenseMap source_to_original_output; + + // Maps read memrefs. 
+ for (auto [input, arg] : + llvm::zip(orig_read_memrefs, orig_body->getArguments().take_front( + orig_read_memrefs.size()))) { + if (yielded_to_output.count(arg)) { + source_to_original_output[input] = yielded_to_output[arg]; + } + } + + // Maps write memrefs. + size_t write_offset = orig_read_memrefs.size(); + for (auto [input, arg] : + llvm::zip(orig_write_memrefs, + orig_body->getArguments().slice( + write_offset, orig_write_memrefs.size()))) { + if (yielded_to_output.count(arg)) { + source_to_original_output[input] = yielded_to_output[arg]; + } + } + + // Maps value inputs. + for (auto [input, arg] : + llvm::zip(orig_val_inputs, + orig_body->getArguments().drop_front( + write_offset + orig_write_memrefs.size()))) { + if (yielded_to_output.count(arg)) { + source_to_original_output[input] = yielded_to_output[arg]; + } + } + + //---------------------------------------------------------------- + // Step 2: Creates atomic tasks for each hyperblock. + //---------------------------------------------------------------- + // Records the mapping from source memref to the latest version after + // executing each atomic task. + DenseMap memref_to_latest_version; + DenseMap value_to_latest_version; + + for (size_t i = 0; i < hyperblocks.size(); ++i) { + AtomicTaskBuilder task_builder( + builder, original_task.getLoc(), global_task_idx++, + memref_to_latest_version, value_to_latest_version); + task_builder.build(hyperblocks[i], original_task); + } + + //---------------------------------------------------------------- + // Step 3: Replaces uses of original task outputs with the latest + // versions. 
+ //---------------------------------------------------------------- + for (auto [source, original_output] : source_to_original_output) { + Value latest = nullptr; + if (memref_to_latest_version.count(source)) { + latest = memref_to_latest_version[source]; + } else if (value_to_latest_version.count(source)) { + latest = value_to_latest_version[source]; + } + + if (latest) { + original_output.replaceAllUsesWith(latest); + } + } + + //---------------------------------------------------------------- + // Step 4: Erase the original task. + //---------------------------------------------------------------- + original_task.erase(); + } + } +}; + +} // namespace + +std::unique_ptr mlir::taskflow::createCanonicalizeTaskPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index c1e6ddff..e8a9927c 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -59,8 +59,8 @@ struct HyperblockInfo { // The corresponding loop. affine::AffineForOp loop_op = nullptr; - // Marks if this hyperblock follows the LEC pattern. - bool is_lec_pattern = false; + // Marks if this hyperblock follows the PLE pattern. + bool is_ple_pattern = false; }; //---------------------------------------------------------------------------- @@ -180,33 +180,33 @@ getTopLevelLoopsInfo(SmallVector &loops_info) { } //---------------------------------------------------------------------------- -// Loop-Epilogue Code (LEC) Pattern Detection +// Prologue-Loop-Epilogue Code (PLE) Pattern Detection //---------------------------------------------------------------------------- -// Loop-Epilogue Code means code that appears after an inner loop. 
-// Example: -// for %i (outer loop) { +// Prologue-Loop-Epilogue Code means code that appears before and after an inner +// loop. Example: for %i (outer loop) { +// // for %j (nested loop) { // // } // ← Loop-Epilogue Code // } -// For this pattern, we need to wrap the inner loop and the epilogue code into -// a hyperblock. Only by doing this can we maintain the hyperblock as a pure -// data-driven code block. -struct LECPattern { +// For this pattern, we need to wrap the inner loop and the prologue-epilogue +// code into a hyperblock. Only by doing this can we maintain the hyperblock as +// a pure data-driven code block. +struct PLEPattern { affine::AffineForOp outer_loop; affine::AffineForOp inner_loop; SmallVector prologue_code; SmallVector epilogue_code; - bool has_lec_pattern = false; + bool has_ple_pattern = false; }; -// Detects Loop-Epilogue Code pattern in the task. -static LECPattern detectLECPattern(affine::AffineForOp outer_loop) { - LECPattern pattern; - pattern.has_lec_pattern = false; +// Detects Prologue-Loop-Epilogue Code pattern in the task. +static PLEPattern detectPLEPattern(affine::AffineForOp outer_loop) { + PLEPattern pattern; + pattern.has_ple_pattern = false; pattern.outer_loop = outer_loop; Block &body = outer_loop.getRegion().front(); @@ -223,11 +223,15 @@ static LECPattern detectLECPattern(affine::AffineForOp outer_loop) { pattern.prologue_code.push_back(&op); } else { pattern.epilogue_code.push_back(&op); - pattern.has_lec_pattern = true; + pattern.has_ple_pattern = true; } } } + if (found_nested_loop && (!pattern.prologue_code.empty())) { + pattern.has_ple_pattern = true; + } + return pattern; } @@ -252,7 +256,7 @@ static void extractHyperblocksInfoFromRegion( for (Operation &op : block.getOperations()) { if (auto for_op = dyn_cast(&op)) { - LECPattern lec_pattern = detectLECPattern(for_op); + PLEPattern ple_pattern = detectPLEPattern(for_op); // Gets the loop info. 
LoopInfo *loop_info = loop_info_map.lookup(for_op); @@ -263,8 +267,8 @@ static void extractHyperblocksInfoFromRegion( SmallVector loop_indices = parent_indices; loop_indices.push_back(loop_info->counter_index); - // Handles the LEC pattern. - if (lec_pattern.has_lec_pattern) { + // Handles the PLE pattern. + if (ple_pattern.has_ple_pattern) { // 1. Emits any accumulated operations as a hyperblock. if (!current_block_ops.empty()) { HyperblockInfo info; @@ -278,22 +282,22 @@ static void extractHyperblocksInfoFromRegion( // 2. Creates a hyperblock for the prologue + inner loop + epilogue. HyperblockInfo info; - if (!lec_pattern.prologue_code.empty()) { - info.operations.append(lec_pattern.prologue_code.begin(), - lec_pattern.prologue_code.end()); + if (!ple_pattern.prologue_code.empty()) { + info.operations.append(ple_pattern.prologue_code.begin(), + ple_pattern.prologue_code.end()); } - info.operations.push_back(lec_pattern.inner_loop); + info.operations.push_back(ple_pattern.inner_loop); - if (!lec_pattern.epilogue_code.empty()) { - info.operations.append(lec_pattern.epilogue_code.begin(), - lec_pattern.epilogue_code.end()); + if (!ple_pattern.epilogue_code.empty()) { + info.operations.append(ple_pattern.epilogue_code.begin(), + ple_pattern.epilogue_code.end()); } info.trigger_indices = loop_indices; info.is_loop_body = true; info.loop_op = for_op; - info.is_lec_pattern = true; + info.is_ple_pattern = true; hyperblocks_info.push_back(info); // No need for further processing of this loop. 
Since we have already diff --git a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir index 38bb3ca2..658e3062 100644 --- a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir @@ -16,7 +16,7 @@ module { } // CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>, %arg2: memref<16xf32>) { -// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%arg2 : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg2)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%arg2 : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg2)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg3: memref<16xf32>, %arg4: memref<16xf32>, %arg5: memref<16xf32>): // CHECK-NEXT: affine.for %arg6 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg3[%arg6] : memref<16xf32> diff --git a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir index dd7083ba..32931f5f 100644 --- a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir @@ -11,7 +11,7 @@ func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16 // CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { // CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : 
(memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): // CHECK-NEXT: affine.for %arg5 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> diff --git a/test/e2e/tosa_e2e.mlir b/test/e2e/tosa_e2e.mlir index f291ffd7..a8474588 100644 --- a/test/e2e/tosa_e2e.mlir +++ b/test/e2e/tosa_e2e.mlir @@ -11,7 +11,7 @@ func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf // CHECK: func.func @test_e2e(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { // CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_inputs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%alloc)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): // CHECK-NEXT: affine.for %arg5 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> From 074615b53b045df332c77542a67474cb1e9f7f56 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sun, 1 Feb 2026 23:15:14 +0800 Subject: [PATCH 5/9] enable atomic canonical task creation --- 
include/TaskflowDialect/TaskflowPasses.h | 7 +- include/TaskflowDialect/TaskflowPasses.td | 19 +- .../TaskflowToNeura/TaskflowToNeuraPass.cpp | 6 + lib/TaskflowDialect/Transforms/CMakeLists.txt | 1 - .../Transforms/CanonicalizeTaskPass.cpp | 676 ------------------ .../ConstructHyperblockFromTaskPass.cpp | 8 +- test/multi-cgra/kernel_mapping/fir/fir.mlir | 72 +- .../loop-in-kernel/loop-in-kernel.mlir | 94 +-- test/multi-cgra/kernel_mapping/relu/relu.mlir | 187 +++-- .../irregular-loop/irregular-loop.mlir | 284 ++++---- .../taskflow/multi-nested/multi-nested.mlir | 363 +++++----- .../parallel-nested/parallel-nested.mlir | 113 ++- 12 files changed, 574 insertions(+), 1256 deletions(-) delete mode 100644 lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index c4c73b6b..88c9d5bb 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -15,10 +15,13 @@ namespace taskflow { // Passes defined in TaskflowPasses.td #define GEN_PASS_DECL #include "TaskflowDialect/TaskflowPasses.h.inc" -std::unique_ptr createAffineLoopTreeSerializationPass(); std::unique_ptr createConstructHyperblockFromTaskPass(); std::unique_ptr createClassifyCountersPass(); -std::unique_ptr createCanonicalizeTaskPass(); + +//=========================================================// +// Optimization Passes +//=========================================================// +std::unique_ptr createAffineLoopTreeSerializationPass(); #define GEN_PASS_REGISTRATION #include "TaskflowDialect/TaskflowPasses.h.inc" diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 7f2e78b6..7c6b5a17 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -6,7 +6,7 @@ include "mlir/Pass/PassBase.td" //=========================================================// -// Passes for the Taskflow 
dialect +// Passes for Task Level Optimizations //=========================================================// def AffineLoopTreeSerialization : Pass<"affine-loop-tree-serialization", "ModuleOp">{ let summary = "Serializes top-level affine.for loops into minimized task operations"; @@ -21,6 +21,9 @@ def AffineLoopTreeSerialization : Pass<"affine-loop-tree-serialization", "Module "mlir::func::FuncDialect"]; } +//=========================================================// +// Passes for the Taskflow dialect +//=========================================================// def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { let summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; let description = [{ @@ -29,20 +32,6 @@ def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func:: let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; } -def CanonicalizeTask: Pass<"canonicalize-task", "func::FuncOp">{ - let summary = "Canonicalizes tasks by splitting each hyperblock into a separate atomic task"; - let description = [{ - This pass splits tasks so that each task contains exactly one hyperblock. - This creates atomic task units that can be analyzed and optimized independently. - - Input: Task with N hyperblocks - Output: N atomic tasks, each containing one hyperblock - - This is a prerequisite pass before fusion optimizations. 
- }]; - let constructor = "taskflow::createCanonicalizeTaskPass()"; -} - def ClassifyCounters : Pass<"classify-counters", "ModuleOp">{ let summary = "Classifies counters as root/relay/leaf"; let description = [{ diff --git a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp index fc34a545..f0eb7cb0 100644 --- a/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp +++ b/lib/Conversion/TaskflowToNeura/TaskflowToNeuraPass.cpp @@ -54,6 +54,12 @@ struct HyperblockToKernelPattern return failure(); } + // Asserts that each task contains only one hyperblock. + int hyperblock_count = 0; + task_op.walk([&](TaskflowHyperblockOp op) { hyperblock_count++; }); + assert(hyperblock_count == 1 && + "Each taskflow.task should contain only one hyperblock"); + Block &hb_block = hyperblock_op.getBody().front(); Block &task_block = task_op.getBody().front(); diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index ff12e671..a5443158 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -2,7 +2,6 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp - CanonicalizeTaskPass.cpp ClassifyCountersPass.cpp DEPENDS diff --git a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp b/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp deleted file mode 100644 index 636e02b9..00000000 --- a/lib/TaskflowDialect/Transforms/CanonicalizeTaskPass.cpp +++ /dev/null @@ -1,676 +0,0 @@ -#include "TaskflowDialect/TaskflowDialect.h" -#include "TaskflowDialect/TaskflowOps.h" -#include "TaskflowDialect/TaskflowPasses.h" - -#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include 
"mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/IRMapping.h" -#include "mlir/IR/Unit.h" -#include "mlir/IR/Value.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LLVM.h" -#include "mlir/Transforms/DialectConversion.h" -#include "llvm/ADT/SmallVector.h" - -using namespace mlir; -using namespace mlir::taskflow; - -namespace { -//---------------------------------------------------------------------- -// Memory and Value Access Info. -//---------------------------------------------------------------------- -// This struct analyzes accesses information within a hyperblock. -struct AccessInfo { - // Set of read memrefs. - SetVector memref_reads; - // Set of written memrefs. - SetVector memref_writes; - // Set of read values. - SetVector value_reads; - - void analyze(TaskflowHyperblockOp hyperblock, Block *task_body) { - DenseSet task_block_args; - for (Value arg : task_body->getArguments()) { - task_block_args.insert(arg); - } - - hyperblock.walk([&](Operation *op) { - if (auto load = dyn_cast(op)) { - this->memref_reads.insert(load.getMemRef()); - } else if (auto store = dyn_cast(op)) { - this->memref_writes.insert(store.getMemRef()); - } - - for (Value operand : op->getOperands()) { - if (task_block_args.contains(operand)) { - this->value_reads.insert(operand); - } - } - }); - } - - SetVector getReadMemRefs() const { - SetVector all; - all.insert(this->memref_reads.begin(), this->memref_reads.end()); - return all; - } - - SetVector getWriteMemRefs() const { - SetVector all; - all.insert(this->memref_writes.begin(), this->memref_writes.end()); - return all; - } - - SetVector getAllValues() const { return this->value_reads; } -}; - -//---------------------------------------------------------------------- -// Counter Collector. -//---------------------------------------------------------------------- -// This class is used to collects all counters needed by a hyperblock. 
-class CounterCollector { -public: - void collect(TaskflowHyperblockOp hyperblock) { - for (Value idx : hyperblock.getIndices()) { - collectRecursively(idx); - } - } - - // Gets the collected counters sorted by their depth. - SmallVector getSortedCounters() const { - SmallVector result(this->counters.begin(), - this->counters.end()); - llvm::sort(result, [this](TaskflowCounterOp a, TaskflowCounterOp b) { - return getDepth(a) < getDepth(b); - }); - return result; - } - -private: - // Collects counters recursively. - void collectRecursively(Value idx) { - TaskflowCounterOp counter = idx.getDefiningOp(); - if (!counter) { - return; - } - this->counters.insert(counter); - if (Value parent = counter.getParentIndex()) { - collectRecursively(parent); - } - } - - // Gets the depth of a counter. - size_t getDepth(TaskflowCounterOp counter) const { - size_t depth = 0; - Value parent = counter.getParentIndex(); - while (parent) { - depth++; - if (TaskflowCounterOp p = parent.getDefiningOp()) { - parent = p.getParentIndex(); - } else { - break; - } - } - return depth; - } - - SetVector counters; -}; - -//---------------------------------------------------------------------- -// Block Argument Resolver. -//---------------------------------------------------------------------- -// This class resolves the input arguments of a task block to their source -// values. -// For example: -// taskflow.task(%buf_input, %val_input) { -// ^bb0(%arg0: memref, %arg1: i32): // ← block arguments -// // %arg0 corresponds to %buf_input -// // %arg1 corresponds to %val_input -// } -// resolveToSource(%arg0) -> %buf_input -class BlockArgResolver { -public: - explicit BlockArgResolver(TaskflowTaskOp task) { - Block *body = &task.getBody().front(); - - // Resolves memory inputs. 
- auto read_memrefs = task.getReadMemrefs(); - auto read_args = body->getArguments().take_front(read_memrefs.size()); - for (auto [input, arg] : llvm::zip(read_memrefs, read_args)) { - this->block_arg_to_source[arg] = input; - this->source_to_block_arg[input] = arg; - } - - // Resolves memory inputs. - auto write_memrefs = task.getWriteMemrefs(); - auto mem_args = body->getArguments().take_front(write_memrefs.size()); - for (auto [input, arg] : llvm::zip(write_memrefs, mem_args)) { - this->block_arg_to_source[arg] = input; - this->source_to_block_arg[input] = arg; - } - - // Resolves value inputs. - auto val_inputs = task.getValueInputs(); - auto val_args = body->getArguments().drop_front(read_memrefs.size() + - write_memrefs.size()); - for (auto [input, arg] : llvm::zip(val_inputs, val_args)) { - this->block_arg_to_source[arg] = input; - this->source_to_block_arg[input] = arg; - } - } - - // Gets the source value for a given block argument. - Value resolveToSource(Value val) const { - auto it = this->block_arg_to_source.find(val); - return it != this->block_arg_to_source.end() ? it->second : val; - } - - // Gets the block argument for a given source value. - Value getBlockArg(Value source) const { - auto it = this->source_to_block_arg.find(source); - return it != this->source_to_block_arg.end() ? it->second : Value(); - } - -private: - // Maps block argument to its source value. - DenseMap block_arg_to_source; - // Maps source value to its block argument. - DenseMap source_to_block_arg; -}; - -//---------------------------------------------------------------------- -// Atomic Task Builder. -//---------------------------------------------------------------------- -// This class builds an atomic task from a hyperblock. 
-class AtomicTaskBuilder { -public: - AtomicTaskBuilder(OpBuilder &builder, Location loc, unsigned global_task_idx, - DenseMap &memref_to_latest_version, - DenseMap &value_to_latest_version) - : builder(builder), loc(loc), global_task_idx(global_task_idx), - memref_to_latest_version(memref_to_latest_version), - value_to_latest_version(value_to_latest_version) {} - - TaskflowTaskOp build(TaskflowHyperblockOp hyperblock, - TaskflowTaskOp original_task) { - AccessInfo access_info; - access_info.analyze(hyperblock, &original_task.getBody().front()); - - BlockArgResolver resolver(original_task); - - //------------------------------------------------------ - // Step1: Determines read/write memresfs and value inputs. - //------------------------------------------------------ - SmallVector read_memrefs; - SmallVector write_memrefs; - SmallVector value_inputs; - - DenseMap source_to_read_memref_idx; - DenseMap source_to_write_memref_idx; - DenseMap source_to_value_input_idx; - - // Classifies memrefs into read and write sets. 
- for (Value memref : access_info.getReadMemRefs()) { - Value source = resolver.resolveToSource(memref); - Value input_memref = getLatestMemrefVersion(source); - - if (!source_to_read_memref_idx.count(source)) { - source_to_read_memref_idx[source] = read_memrefs.size(); - read_memrefs.push_back(input_memref); - } - } - - for (Value memref : access_info.getWriteMemRefs()) { - Value source = resolver.resolveToSource(memref); - Value input_memref = getLatestMemrefVersion(source); - - if (!source_to_write_memref_idx.count(source)) { - source_to_write_memref_idx[source] = write_memrefs.size(); - write_memrefs.push_back(input_memref); - } - } - - for (Value val : access_info.getAllValues()) { - Value source = resolver.resolveToSource(val); - Value input_val = getLatestValueVersion(source); - - if (!source_to_value_input_idx.count(source)) { - source_to_value_input_idx[source] = value_inputs.size(); - value_inputs.push_back(input_val); - } - } - - //------------------------------------------------------ - // Step 2: Determines output types. - //------------------------------------------------------ - // Determines memref outputs. - SmallVector memref_output_types; - // The source memrefs of the written memrefs. - SmallVector written_memref_sources; - - for (Value memref : access_info.memref_writes) { - Value source = resolver.resolveToSource(memref); - memref_output_types.push_back(source.getType()); - written_memref_sources.push_back(source); - } - - // Determines value outputs. - SmallVector value_output_types; - SmallVector yielded_value_sources; - - if (!hyperblock.getOutputs().empty()) { - for (Value output : hyperblock.getOutputs()) { - value_output_types.push_back(output.getType()); - // For value outputs, they are source themselves. - yielded_value_sources.push_back(output); - } - } - - //------------------------------------------------------ - // Step 3: Resolves original_read_memrefs and original_write_memrefs. 
- //------------------------------------------------------ - // Map: block arg -> original memref (from original task). - DenseMap arg_to_original_read; - DenseMap arg_to_original_write; - - Block *orig_body = &original_task.getBody().front(); - auto orig_read_memrefs = original_task.getOriginalReadMemrefs(); - auto orig_write_memrefs = original_task.getOriginalWriteMemrefs(); - - size_t read_arg_count = original_task.getReadMemrefs().size(); - size_t write_arg_count = original_task.getWriteMemrefs().size(); - - // Maps read args to original read memrefs. - for (auto [orig_memref, arg] : - llvm::zip(orig_read_memrefs, - orig_body->getArguments().take_front(read_arg_count))) { - arg_to_original_read[arg] = orig_memref; - } - - // Maps write args to original write memrefs. - for (auto [orig_memref, arg] : - llvm::zip(orig_write_memrefs, orig_body->getArguments().slice( - read_arg_count, write_arg_count))) { - arg_to_original_write[arg] = orig_memref; - } - - // Collects original memrefs for this new task. - SmallVector new_original_read_memrefs; - SmallVector new_original_write_memrefs; - - for (Value memref : access_info.memref_reads) { - if (arg_to_original_read.count(memref)) { - new_original_read_memrefs.push_back(arg_to_original_read[memref]); - } else if (arg_to_original_write.count(memref)) { - // If reading from a write memref, add to original read. - new_original_read_memrefs.push_back(arg_to_original_write[memref]); - } - } - - for (Value memref : access_info.memref_writes) { - if (arg_to_original_write.count(memref)) { - new_original_write_memrefs.push_back(arg_to_original_write[memref]); - } else if (arg_to_original_read.count(memref)) { - // If writing to a read memref, add to original write. - new_original_write_memrefs.push_back(arg_to_original_read[memref]); - } - } - - //------------------------------------------------------ - // Step 4: Creates the new taskflow.task operation. 
- //------------------------------------------------------ - std::string task_name = "Task_" + std::to_string(this->global_task_idx); - auto new_task = builder.create( - this->loc, memref_output_types, value_output_types, read_memrefs, - write_memrefs, value_inputs, builder.getStringAttr(task_name), - new_original_read_memrefs, new_original_write_memrefs); - - //------------------------------------------------------ - // Step 5: Builds the task body. - //------------------------------------------------------ - Block *task_body = new Block(); - new_task.getBody().push_back(task_body); - - // Adds block arguments: [read_memrefs, write_memrefs, value_inputs]. - for (Value input : read_memrefs) { - task_body->addArgument(input.getType(), this->loc); - } - for (Value input : write_memrefs) { - task_body->addArgument(input.getType(), this->loc); - } - for (Value input : value_inputs) { - task_body->addArgument(input.getType(), this->loc); - } - - // Builds value mapping. - IRMapping mapping; - - // Maps read memrefs. - for (auto [source, idx] : source_to_read_memref_idx) { - BlockArgument new_arg = task_body->getArgument(idx); - mapping.map(source, new_arg); - - if (Value orig_arg = resolver.getBlockArg(source)) { - mapping.map(orig_arg, new_arg); - } - } - - // Maps write memrefs. - size_t write_arg_offset = read_memrefs.size(); - for (auto [source, idx] : source_to_write_memref_idx) { - BlockArgument new_arg = task_body->getArgument(write_arg_offset + idx); - mapping.map(source, new_arg); - - if (Value orig_arg = resolver.getBlockArg(source)) { - mapping.map(orig_arg, new_arg); - } - } - - // Maps value inputs. 
- size_t value_arg_offset = read_memrefs.size() + write_memrefs.size(); - for (auto [source, idx] : source_to_value_input_idx) { - BlockArgument new_arg = task_body->getArgument(value_arg_offset + idx); - mapping.map(source, new_arg); - - if (Value orig_arg = resolver.getBlockArg(source)) { - mapping.map(orig_arg, new_arg); - } - } - - // Clones counters and hyperblock. - OpBuilder task_builder(task_body, task_body->begin()); - cloneCounters(task_builder, hyperblock, mapping); - cloneHyperblock(task_builder, hyperblock, mapping); - - // Creates yield. - SmallVector memref_yield_operands; - for (Value memref : access_info.memref_writes) { - memref_yield_operands.push_back(mapping.lookupOrDefault(memref)); - } - - SmallVector value_yield_operands; - // If this hyperblock has value outputs, we need to yield them from the - // mapped hyperblock. - if (!hyperblock.getOutputs().empty()) { - // Finds the cloned hyperblock op. - TaskflowHyperblockOp cloned_hb = nullptr; - for (Operation &op : task_body->getOperations()) { - if (auto hb = dyn_cast(op)) { - cloned_hb = hb; - break; - } - if (cloned_hb) { - for (Value output : cloned_hb.getOutputs()) { - value_yield_operands.push_back(output); - } - } - } - } - - task_builder.setInsertionPointToEnd(task_body); - task_builder.create(this->loc, memref_yield_operands, - value_yield_operands); - - //------------------------------------------------------ - // Step 6: Updates latest versions. - //------------------------------------------------------ - // Updates latest versions. 
- auto memref_outputs = new_task.getWriteOutputs(); - for (auto [source, output] : - llvm::zip(written_memref_sources, memref_outputs)) { - this->memref_to_latest_version[source] = output; - } - - auto value_outputs = new_task.getValueOutputs(); - for (auto [source, output] : - llvm::zip(yielded_value_sources, value_outputs)) { - this->value_to_latest_version[source] = output; - } - - return new_task; - } - -private: - Value getLatestMemrefVersion(Value source) { - auto it = this->memref_to_latest_version.find(source); - return it != this->memref_to_latest_version.end() ? it->second : source; - } - - Value getLatestValueVersion(Value source) { - auto it = this->value_to_latest_version.find(source); - return it != this->value_to_latest_version.end() ? it->second : source; - } - - void cloneCounters(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, - IRMapping &mapping) { - CounterCollector collector; - collector.collect(hyperblock); - - for (TaskflowCounterOp counter : collector.getSortedCounters()) { - task_builder.clone(*counter.getOperation(), mapping); - } - } - - void cloneHyperblock(OpBuilder &task_builder, TaskflowHyperblockOp hyperblock, - IRMapping &mapping) { - SmallVector mapped_indices; - for (Value idx : hyperblock.getIndices()) { - mapped_indices.push_back(mapping.lookupOrDefault(idx)); - } - - SmallVector mapped_iter_args; - for (Value arg : hyperblock.getIterArgs()) { - mapped_iter_args.push_back(mapping.lookupOrDefault(arg)); - } - - SmallVector output_types(hyperblock.getOutputs().getTypes()); - auto newHB = task_builder.create( - this->loc, output_types, mapped_indices, mapped_iter_args); - - Block *new_body = new Block(); - newHB.getBody().push_back(new_body); - - for (Value idx : mapped_indices) { - new_body->addArgument(idx.getType(), this->loc); - } - - for (Value arg : mapped_iter_args) { - new_body->addArgument(arg.getType(), this->loc); - } - - Block *old_body = &hyperblock.getBody().front(); - for (auto [old_arg, new_arg] : - 
llvm::zip(old_body->getArguments(), new_body->getArguments())) { - mapping.map(old_arg, new_arg); - } - - OpBuilder hb_builder(new_body, new_body->begin()); - for (Operation &op : old_body->without_terminator()) { - hb_builder.clone(op, mapping); - } - - if (auto yield = - dyn_cast(old_body->getTerminator())) { - SmallVector yield_results; - SmallVector yield_iter_args_next; - for (Value v : yield.getResults()) { - yield_results.push_back(mapping.lookupOrDefault(v)); - } - for (Value v : yield.getIterArgsNext()) { - yield_iter_args_next.push_back(mapping.lookupOrDefault(v)); - } - hb_builder.create(this->loc, yield_results, - yield_iter_args_next); - } else { - hb_builder.create(this->loc); - } - } - - OpBuilder &builder; - Location loc; - unsigned global_task_idx; - DenseMap &memref_to_latest_version; - DenseMap &value_to_latest_version; -}; - -//---------------------------------------------------------------------- -// Pass Implementation. -//---------------------------------------------------------------------- - -struct CanonicalizeTaskPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CanonicalizeTaskPass) - - StringRef getArgument() const final { return "canonicalize-task"; } - - StringRef getDescription() const final { - return "Canonicalizes tasks by splitting each hyperblock into a separate " - "atomic task (one hyperblock per task)"; - } - - void getDependentDialects(DialectRegistry ®istry) const override { - registry - .insert(); - } - - void runOnOperation() override { - func::FuncOp func_op = getOperation(); - - SmallVector tasks_to_process; - func_op.walk( - [&](TaskflowTaskOp task_op) { tasks_to_process.push_back(task_op); }); - - unsigned global_task_idx = 0; - - for (TaskflowTaskOp original_task : tasks_to_process) { - OpBuilder builder(original_task); - // Collects hyperblocks within the original task. 
- SmallVector hyperblocks; - original_task.walk( - [&](TaskflowHyperblockOp hb) { hyperblocks.push_back(hb); }); - - assert(!hyperblocks.empty() && - "Expected at least one hyperblock in the task"); - - // If there's only one hyperblock, it is already canonical. - if (hyperblocks.size() == 1) { - std::string task_name = "Task_" + std::to_string(global_task_idx++); - original_task.setTaskNameAttr(builder.getStringAttr(task_name)); - continue; - } - - //---------------------------------------------------------------- - // Step 1: Builds mapping from original task's memory outputs to their - // corresponding source memrefs (the original inputs). - //---------------------------------------------------------------- - // Gets the yield operation to find which memrefs are yielded. - auto yield_op = cast( - original_task.getBody().front().getTerminator()); - - auto original_write_outputs = original_task.getWriteOutputs(); - auto original_val_outputs = original_task.getValueOutputs(); - auto yielded_memrefs = yield_op.getMemoryResults(); - auto yielded_values = yield_op.getValueResults(); - - // Map: yielded -> original task output. - DenseMap yielded_to_output; - for (auto [yielded, output] : - llvm::zip(yielded_memrefs, original_write_outputs)) { - yielded_to_output[yielded] = output; - } - for (auto [yielded, output] : - llvm::zip(yielded_values, original_val_outputs)) { - yielded_to_output[yielded] = output; - } - - // Map: original input memref -> original task output (if it's yielded). - // This tells us which original outputs correspond to which input memrefs. - Block *orig_body = &original_task.getBody().front(); - auto orig_read_memrefs = original_task.getReadMemrefs(); - auto orig_write_memrefs = original_task.getWriteMemrefs(); - auto orig_val_inputs = original_task.getValueInputs(); - - DenseMap source_to_original_output; - - // Maps read memrefs. 
- for (auto [input, arg] : - llvm::zip(orig_read_memrefs, orig_body->getArguments().take_front( - orig_read_memrefs.size()))) { - if (yielded_to_output.count(arg)) { - source_to_original_output[input] = yielded_to_output[arg]; - } - } - - // Maps write memrefs. - size_t write_offset = orig_read_memrefs.size(); - for (auto [input, arg] : - llvm::zip(orig_write_memrefs, - orig_body->getArguments().slice( - write_offset, orig_write_memrefs.size()))) { - if (yielded_to_output.count(arg)) { - source_to_original_output[input] = yielded_to_output[arg]; - } - } - - // Maps value inputs. - for (auto [input, arg] : - llvm::zip(orig_val_inputs, - orig_body->getArguments().drop_front( - write_offset + orig_write_memrefs.size()))) { - if (yielded_to_output.count(arg)) { - source_to_original_output[input] = yielded_to_output[arg]; - } - } - - //---------------------------------------------------------------- - // Step 2: Creates atomic tasks for each hyperblock. - //---------------------------------------------------------------- - // Records the mapping from source memref to the latest version after - // executing each atomic task. - DenseMap memref_to_latest_version; - DenseMap value_to_latest_version; - - for (size_t i = 0; i < hyperblocks.size(); ++i) { - AtomicTaskBuilder task_builder( - builder, original_task.getLoc(), global_task_idx++, - memref_to_latest_version, value_to_latest_version); - task_builder.build(hyperblocks[i], original_task); - } - - //---------------------------------------------------------------- - // Step 3: Replaces uses of original task outputs with the latest - // versions. 
- //---------------------------------------------------------------- - for (auto [source, original_output] : source_to_original_output) { - Value latest = nullptr; - if (memref_to_latest_version.count(source)) { - latest = memref_to_latest_version[source]; - } else if (value_to_latest_version.count(source)) { - latest = value_to_latest_version[source]; - } - - if (latest) { - original_output.replaceAllUsesWith(latest); - } - } - - //---------------------------------------------------------------- - // Step 4: Erase the original task. - //---------------------------------------------------------------- - original_task.erase(); - } - } -}; - -} // namespace - -std::unique_ptr mlir::taskflow::createCanonicalizeTaskPass() { - return std::make_unique(); -} \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index e8a9927c..bb503c5d 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -25,7 +25,7 @@ using namespace mlir::taskflow; namespace { //--------------------------------------------------------------------------- -// Loop Info Structure +// Loop Info Structure. //---------------------------------------------------------------------------- struct LoopInfo { affine::AffineForOp for_op; @@ -42,7 +42,7 @@ struct LoopInfo { }; //--------------------------------------------------------------------------- -// Hyperblock Info Structure +// Hyperblock Info Structure. //---------------------------------------------------------------------------- // Represents a code block that should become a hyperblock. struct HyperblockInfo { @@ -64,7 +64,7 @@ struct HyperblockInfo { }; //---------------------------------------------------------------------------- -// Helper Functions +// Helper Functions. 
//---------------------------------------------------------------------------- // Extracts loop parameters from affine.for operation. static std::optional extractLoopBound(affine::AffineForOp for_op) { @@ -123,7 +123,7 @@ static SmallVector collectLoopInfo(TaskflowTaskOp task_op) { } //---------------------------------------------------------------------------- -// Counter Chain Creation +// Counter Chain Creation. //---------------------------------------------------------------------------- // Recursively creates counter chain for each top-level loop. static void createCounterChainRecursivly(OpBuilder &builder, Location loc, diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index d8facaa3..8927cbf6 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -4,12 +4,11 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: -o %t.canonicalized.mlir -// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE +// RUN: -o %t.hyperblock.mlir +// RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: -o %t.kernel.mlir @@ -17,7 +16,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -33,7 +31,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -56,7 +53,6 @@ // RUN: mlir-neura-opt %s 
--convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -99,7 +95,7 @@ module attributes {} { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// TASKFLOW-NEXT: %value_outputs = taskflow.task @Task_0 read_inputs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { +// TASKFLOW-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { // TASKFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // TASKFLOW-NEXT: %0 = affine.for %arg6 = 0 to 32 iter_args(%arg7 = %arg5) -> (i32) { // TASKFLOW-NEXT: %1 = affine.load %arg3[%arg6] : memref @@ -114,30 +110,30 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// CANONICALIZE: module { -// CANONICALIZE-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// CANONICALIZE-NEXT: %c0_i32 = arith.constant 0 : i32 -// CANONICALIZE-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): -// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// CANONICALIZE-NEXT: %1 = "taskflow.hyperblock"(%0, %arg5) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg6: index, %arg7: i32): -// CANONICALIZE-NEXT: %2 = memref.load %arg3[%arg6] : memref -// CANONICALIZE-NEXT: %3 = 
memref.load %arg4[%arg6] : memref -// CANONICALIZE-NEXT: %4 = arith.muli %2, %3 : i32 -// CANONICALIZE-NEXT: %5 = arith.addi %arg7, %4 : i32 -// CANONICALIZE-NEXT: taskflow.hyperblock.yield iter_args_next(%5 : i32) results(%5 : i32) -// CANONICALIZE-NEXT: }) : (index, i32) -> i32 -// CANONICALIZE-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, i32) -> i32 -// CANONICALIZE-NEXT: return %value_outputs : i32 -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 +// HYPERBLOCK-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { +// HYPERBLOCK-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// HYPERBLOCK-NEXT: %1 = "taskflow.hyperblock"(%0, %arg5) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg6: index, %arg7: i32): +// HYPERBLOCK-NEXT: %2 = memref.load %arg3[%arg6] : memref +// HYPERBLOCK-NEXT: %3 = memref.load %arg4[%arg6] : memref +// HYPERBLOCK-NEXT: %4 = arith.muli %2, %3 : i32 +// HYPERBLOCK-NEXT: %5 = arith.addi %arg7, %4 : i32 +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%5 : i32) results(%5 : i32) +// HYPERBLOCK-NEXT: }) : (index, i32) -> i32 +// HYPERBLOCK-NEXT: taskflow.yield values(%1 : i32) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: return %value_outputs : i32 +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } // KERNEL: module { // KERNEL-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // 
KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 -// KERNEL-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// KERNEL-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { // KERNEL-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // KERNEL-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) { @@ -149,8 +145,8 @@ module attributes {} { // KERNEL-NEXT: %6 = arith.addi %arg8, %5 : i32 // KERNEL-NEXT: neura.yield iter_args_next(%6 : i32) results(%6 : i32) // KERNEL-NEXT: } : i32 -// KERNEL-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// KERNEL-NEXT: }) : (memref, memref, i32) -> i32 +// KERNEL-NEXT: taskflow.yield values(%1 : i32) +// KERNEL-NEXT: } // KERNEL-NEXT: return %value_outputs : i32 // KERNEL-NEXT: } // KERNEL-NEXT: } @@ -158,7 +154,7 @@ module attributes {} { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// NEURA-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { // NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, 
step = 1 : index, upper_bound = 32 : index} : index // NEURA-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura"} { @@ -170,8 +166,8 @@ module attributes {} { // NEURA-NEXT: %6 = "neura.add"(%arg8, %5) : (i32, i32) -> i32 // NEURA-NEXT: neura.yield iter_args_next(%6 : i32) results(%6 : i32) // NEURA-NEXT: } : i32 -// NEURA-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// NEURA-NEXT: }) : (memref, memref, i32) -> i32 +// NEURA-NEXT: taskflow.yield values(%1 : i32) +// NEURA-NEXT: } // NEURA-NEXT: return %value_outputs : i32 // NEURA-NEXT: } // NEURA-NEXT: } @@ -179,7 +175,7 @@ module attributes {} { // DATAFLOW: module { // DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// DATAFLOW-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { // DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // DATAFLOW-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { @@ -199,8 +195,8 @@ module attributes {} { // DATAFLOW-NEXT: neura.return_value %12 : !neura.data // DATAFLOW-NEXT: neura.yield // DATAFLOW-NEXT: } : i32 -// DATAFLOW-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// DATAFLOW-NEXT: }) : (memref, memref, i32) -> i32 +// 
DATAFLOW-NEXT: taskflow.yield values(%1 : i32) +// DATAFLOW-NEXT: } // DATAFLOW-NEXT: return %value_outputs : i32 // DATAFLOW-NEXT: } // DATAFLOW-NEXT: } @@ -208,7 +204,7 @@ module attributes {} { // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// MAPPED-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2)] : (memref, memref, i32) -> (i32) { // MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // MAPPED-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 2 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { @@ -240,8 +236,8 @@ module attributes {} { // MAPPED-NEXT: neura.return_value %24 : !neura.data {dfg_id = 25 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 0 : i32, y = 2 : i32}]} // MAPPED-NEXT: neura.yield {dfg_id = 3 : i32} // MAPPED-NEXT: } : i32 -// MAPPED-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// MAPPED-NEXT: }) : (memref, memref, i32) -> i32 +// MAPPED-NEXT: taskflow.yield values(%1 : i32) +// MAPPED-NEXT: } // MAPPED-NEXT: return %value_outputs : i32 // MAPPED-NEXT: } // MAPPED-NEXT: } \ No newline at end of file 
diff --git a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir index f926d548..1802e538 100644 --- a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir +++ b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir @@ -55,7 +55,7 @@ module { func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { %c0_i32 = arith.constant 0 : i32 - %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ + %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): %1 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) { ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -68,8 +68,8 @@ module { } neura.yield results(%0 : i32) } : i32 - "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () - }) : (memref, memref, i32) -> i32 + taskflow.yield values(%1 : i32) + } return %value_outputs : i32 } } @@ -77,7 +77,7 @@ module { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// NEURA-NEXT: %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { // NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura"} { // NEURA-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: 
i32): @@ -101,56 +101,56 @@ module { // NEURA-NEXT: ^bb3: // pred: ^bb1 // NEURA-NEXT: neura.yield results(%6 : i32) // NEURA-NEXT: } : i32 -// NEURA-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () -// NEURA-NEXT: }) : (memref, memref, i32) -> i32 +// NEURA-NEXT: taskflow.yield values(%0 : i32) +// NEURA-NEXT: } // NEURA-NEXT: return %value_outputs : i32 // NEURA-NEXT: } // NEURA-NEXT: } -// DATAFLOW: module { -// DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): -// DATAFLOW-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { -// DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): -// DATAFLOW-NEXT: %1 = "neura.grant_once"() <{constant_value = "%input2"}> : () -> !neura.data -// DATAFLOW-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> !neura.data -// DATAFLOW-NEXT: %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %4 = "neura.grant_once"(%3) : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %5 = neura.reserve : !neura.data -// DATAFLOW-NEXT: %6 = neura.phi_start %1, %5 : !neura.data, !neura.data -> !neura.data -// DATAFLOW-NEXT: %7 = neura.reserve : !neura.data -// DATAFLOW-NEXT: %8 = neura.phi_start %4, %7 : !neura.data, !neura.data -> !neura.data -// DATAFLOW-NEXT: %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %10 = "neura.icmp"(%9) <{cmpType = "slt"}> {rhs_value = 32 : index} : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %11 = neura.grant_predicate %9, %10 
: !neura.data, !neura.data -> !neura.data -// DATAFLOW-NEXT: %12 = neura.grant_predicate %6, %10 : !neura.data, !neura.data -> !neura.data -// DATAFLOW-NEXT: %13 = "neura.not"(%10) : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %14 = neura.grant_predicate %6, %13 : !neura.data, !neura.data -> !neura.data -// DATAFLOW-NEXT: neura.return_value %14 : !neura.data -// DATAFLOW-NEXT: %15 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input0"} : !neura.data -// DATAFLOW-NEXT: %16 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input1"} : !neura.data -// DATAFLOW-NEXT: %17 = "neura.mul"(%15, %16) : (!neura.data, !neura.data) -> !neura.data -// DATAFLOW-NEXT: %18 = "neura.add"(%12, %17) : (!neura.data, !neura.data) -> !neura.data -// DATAFLOW-NEXT: %19 = "neura.add"(%11) {rhs_value = 1 : index} : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: %20 = "neura.cast"(%19) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data -// DATAFLOW-NEXT: neura.ctrl_mov %20 -> %7 : !neura.data !neura.data -// DATAFLOW-NEXT: neura.ctrl_mov %18 -> %5 : !neura.data !neura.data -// DATAFLOW-NEXT: neura.yield -// DATAFLOW-NEXT: } : i32 -// DATAFLOW-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () -// DATAFLOW-NEXT: }) : (memref, memref, i32) -> i32 -// DATAFLOW-NEXT: return %value_outputs : i32 -// DATAFLOW-NEXT: } -// DATAFLOW-NEXT:} +// DATAFLOW: module { +// DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// DATAFLOW-NEXT: %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { +// DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): +// DATAFLOW-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { +// DATAFLOW-NEXT: 
^bb0(%arg6: memref, %arg7: memref, %arg8: i32): +// DATAFLOW-NEXT: %1 = "neura.grant_once"() <{constant_value = "%input2"}> : () -> !neura.data +// DATAFLOW-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> !neura.data +// DATAFLOW-NEXT: %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %4 = "neura.grant_once"(%3) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %5 = neura.reserve : !neura.data +// DATAFLOW-NEXT: %6 = neura.phi_start %1, %5 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %7 = neura.reserve : !neura.data +// DATAFLOW-NEXT: %8 = neura.phi_start %4, %7 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %10 = "neura.icmp"(%9) <{cmpType = "slt"}> {rhs_value = 32 : index} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %11 = neura.grant_predicate %9, %10 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %12 = neura.grant_predicate %6, %10 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: %13 = "neura.not"(%10) : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %14 = neura.grant_predicate %6, %13 : !neura.data, !neura.data -> !neura.data +// DATAFLOW-NEXT: neura.return_value %14 : !neura.data +// DATAFLOW-NEXT: %15 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input0"} : !neura.data +// DATAFLOW-NEXT: %16 = neura.load_indexed [%11 : !neura.data] {lhs_value = "%input1"} : !neura.data +// DATAFLOW-NEXT: %17 = "neura.mul"(%15, %16) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %18 = "neura.add"(%12, %17) : (!neura.data, !neura.data) -> !neura.data +// DATAFLOW-NEXT: %19 = "neura.add"(%11) {rhs_value = 1 : index} : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: %20 = "neura.cast"(%19) <{cast_type = "index_to_int"}> : (!neura.data) -> !neura.data +// DATAFLOW-NEXT: neura.ctrl_mov %20 -> %7 : !neura.data !neura.data +// 
DATAFLOW-NEXT: neura.ctrl_mov %18 -> %5 : !neura.data !neura.data +// DATAFLOW-NEXT: neura.yield +// DATAFLOW-NEXT: } : i32 +// DATAFLOW-NEXT: taskflow.yield values(%0 : i32) +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT: return %value_outputs : i32 +// DATAFLOW-NEXT: } +// DATAFLOW-NEXT: } // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %value_outputs = "taskflow.task"(%arg0, %arg2, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// MAPPED-NEXT: %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { // MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 4 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { // MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -193,11 +193,13 @@ module { // MAPPED-NEXT: neura.ctrl_mov %32 -> %3 {dfg_id = 37 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}]} : !neura.data !neura.data // MAPPED-NEXT: neura.yield {dfg_id = 4 : i32} // MAPPED-NEXT: } : i32 -// MAPPED-NEXT: "taskflow.yield"(%0) <{operandSegmentSizes = array}> : (i32) -> () -// MAPPED-NEXT: }) : (memref, memref, i32) -> i32 +// MAPPED-NEXT: taskflow.yield values(%0 : i32) +// MAPPED-NEXT: } // MAPPED-NEXT: return %value_outputs : i32 // MAPPED-NEXT: } // MAPPED-NEXT: } + + diff --git a/test/multi-cgra/kernel_mapping/relu/relu.mlir b/test/multi-cgra/kernel_mapping/relu/relu.mlir index 
ebede17a..e5727ded 100644 --- a/test/multi-cgra/kernel_mapping/relu/relu.mlir +++ b/test/multi-cgra/kernel_mapping/relu/relu.mlir @@ -4,13 +4,11 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ -// RUN: -o %t.canonicalized.mlir -// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE +// RUN: -o %t.hyperblock.mlir +// RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: -o %t.kernel.mlir @@ -18,7 +16,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -34,7 +31,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -57,7 +53,6 @@ // RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ // RUN: --classify-counters \ // RUN: --convert-taskflow-to-neura \ // RUN: --lower-affine \ @@ -101,81 +96,81 @@ module attributes {} { } } -// TASKFLOW: module { -// TASKFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { -// TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// TASKFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// TASKFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): -// TASKFLOW-NEXT: affine.for %arg5 = 0 to 32 { -// TASKFLOW-NEXT: 
%0 = affine.load %arg2[%arg5] : memref -// TASKFLOW-NEXT: %1 = arith.cmpi sgt, %0, %arg4 : i32 -// TASKFLOW-NEXT: scf.if %1 { -// TASKFLOW-NEXT: %2 = affine.load %arg2[%arg5] : memref -// TASKFLOW-NEXT: %3 = affine.load %arg3[%arg5] : memref -// TASKFLOW-NEXT: %4 = arith.addi %3, %2 : i32 -// TASKFLOW-NEXT: affine.store %4, %arg3[%arg5] : memref -// TASKFLOW-NEXT: } else { -// TASKFLOW-NEXT: %2 = affine.load %arg3[%arg5] : memref -// TASKFLOW-NEXT: affine.store %2, %arg3[%arg5] : memref -// TASKFLOW-NEXT: } -// TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// TASKFLOW-NEXT: }) : (memref, memref, i32) -> memref -// TASKFLOW-NEXT: return -// TASKFLOW-NEXT: } -// TASKFLOW-NEXT: } +// TASKFLOW: module { +// TASKFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 +// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : (memref, memref, memref, i32) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): +// TASKFLOW-NEXT: affine.for %arg6 = 0 to 32 { +// TASKFLOW-NEXT: %0 = affine.load %arg2[%arg6] : memref +// TASKFLOW-NEXT: %1 = arith.cmpi sgt, %0, %arg5 : i32 +// TASKFLOW-NEXT: scf.if %1 { +// TASKFLOW-NEXT: %2 = affine.load %arg2[%arg6] : memref +// TASKFLOW-NEXT: %3 = affine.load %arg4[%arg6] : memref +// TASKFLOW-NEXT: %4 = arith.addi %3, %2 : i32 +// TASKFLOW-NEXT: affine.store %4, %arg4[%arg6] : memref +// TASKFLOW-NEXT: } else { +// TASKFLOW-NEXT: %2 = affine.load %arg4[%arg6] : memref +// TASKFLOW-NEXT: affine.store %2, %arg4[%arg6] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg4 : memref) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: 
return +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT:} -// CANONICALIZE: module { -// CANONICALIZE-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { -// CANONICALIZE-NEXT: %c0_i32 = arith.constant 0 : i32 -// CANONICALIZE-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): -// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg5: index): -// CANONICALIZE-NEXT: %1 = memref.load %arg2[%arg5] : memref -// CANONICALIZE-NEXT: %2 = arith.cmpi sgt, %1, %arg4 : i32 -// CANONICALIZE-NEXT: scf.if %2 { -// CANONICALIZE-NEXT: %3 = memref.load %arg2[%arg5] : memref -// CANONICALIZE-NEXT: %4 = memref.load %arg3[%arg5] : memref -// CANONICALIZE-NEXT: %5 = arith.addi %4, %3 : i32 -// CANONICALIZE-NEXT: memref.store %5, %arg3[%arg5] : memref -// CANONICALIZE-NEXT: } else { -// CANONICALIZE-NEXT: %3 = memref.load %arg3[%arg5] : memref -// CANONICALIZE-NEXT: memref.store %3, %arg3[%arg5] : memref -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, i32) -> memref -// CANONICALIZE-NEXT: return -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 +// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) 
value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : (memref, memref, memref, i32) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): +// HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg6: index): +// HYPERBLOCK-NEXT: %1 = memref.load %arg2[%arg6] : memref +// HYPERBLOCK-NEXT: %2 = arith.cmpi sgt, %1, %arg5 : i32 +// HYPERBLOCK-NEXT: scf.if %2 { +// HYPERBLOCK-NEXT: %3 = memref.load %arg2[%arg6] : memref +// HYPERBLOCK-NEXT: %4 = memref.load %arg4[%arg6] : memref +// HYPERBLOCK-NEXT: %5 = arith.addi %4, %3 : i32 +// HYPERBLOCK-NEXT: memref.store %5, %arg4[%arg6] : memref +// HYPERBLOCK-NEXT: } else { +// HYPERBLOCK-NEXT: %3 = memref.load %arg4[%arg6] : memref +// HYPERBLOCK-NEXT: memref.store %3, %arg4[%arg6] : memref +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg4 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: return +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } // KERNEL: module { // KERNEL-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 -// KERNEL-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// KERNEL-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// KERNEL-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : (memref, memref, memref, i32) -> (memref) { +// KERNEL-NEXT: ^bb0(%arg2: memref, 
%arg3: memref, %arg4: memref, %arg5: i32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) { -// KERNEL-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// KERNEL-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) { +// KERNEL-NEXT: ^bb0(%arg6: memref, %arg7: i32, %arg8: memref): // KERNEL-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// KERNEL-NEXT: %2 = memref.load %arg5[%1] : memref -// KERNEL-NEXT: %3 = arith.cmpi sgt, %2, %arg6 : i32 +// KERNEL-NEXT: %2 = memref.load %arg6[%1] : memref +// KERNEL-NEXT: %3 = arith.cmpi sgt, %2, %arg7 : i32 // KERNEL-NEXT: scf.if %3 { -// KERNEL-NEXT: %4 = memref.load %arg5[%1] : memref -// KERNEL-NEXT: %5 = memref.load %arg7[%1] : memref +// KERNEL-NEXT: %4 = memref.load %arg6[%1] : memref +// KERNEL-NEXT: %5 = memref.load %arg8[%1] : memref // KERNEL-NEXT: %6 = arith.addi %5, %4 : i32 -// KERNEL-NEXT: memref.store %6, %arg7[%1] : memref +// KERNEL-NEXT: memref.store %6, %arg8[%1] : memref // KERNEL-NEXT: } else { -// KERNEL-NEXT: %4 = memref.load %arg7[%1] : memref -// KERNEL-NEXT: memref.store %4, %arg7[%1] : memref +// KERNEL-NEXT: %4 = memref.load %arg8[%1] : memref +// KERNEL-NEXT: memref.store %4, %arg8[%1] : memref // KERNEL-NEXT: } // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// KERNEL-NEXT: }) : (memref, memref, i32) -> memref +// KERNEL-NEXT: taskflow.yield writes(%arg4 : memref) +// KERNEL-NEXT: } // KERNEL-NEXT: return // KERNEL-NEXT: } // KERNEL-NEXT: } @@ -183,30 +178,30 @@ module attributes {} { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes 
{llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// NEURA-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// NEURA-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : (memref, memref, memref, i32) -> (memref) { +// NEURA-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// NEURA-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura"} { -// NEURA-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// NEURA-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) attributes {accelerator = "neura"} { +// NEURA-NEXT: ^bb0(%arg6: memref, %arg7: i32, %arg8: memref): // NEURA-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// NEURA-NEXT: %2 = neura.load_indexed %arg5[%1 : index] memref : i32 -// NEURA-NEXT: %3 = "neura.icmp"(%2, %arg6) <{cmpType = "sgt"}> : (i32, i32) -> i1 +// NEURA-NEXT: %2 = neura.load_indexed %arg6[%1 : index] memref : i32 +// NEURA-NEXT: %3 = "neura.icmp"(%2, %arg7) <{cmpType = "sgt"}> : (i32, i32) -> i1 // NEURA-NEXT: neura.cond_br %3 : i1 then to ^bb1 else to ^bb2 // NEURA-NEXT: ^bb1: // pred: ^bb0 -// NEURA-NEXT: %4 = neura.load_indexed %arg5[%1 : index] memref : i32 -// NEURA-NEXT: %5 = neura.load_indexed %arg7[%1 : index] memref : i32 +// NEURA-NEXT: %4 = neura.load_indexed %arg6[%1 : index] memref : i32 +// NEURA-NEXT: %5 = 
neura.load_indexed %arg8[%1 : index] memref : i32 // NEURA-NEXT: %6 = "neura.add"(%5, %4) : (i32, i32) -> i32 -// NEURA-NEXT: neura.store_indexed %6 to %arg7[%1 : index] memref : i32 +// NEURA-NEXT: neura.store_indexed %6 to %arg8[%1 : index] memref : i32 // NEURA-NEXT: neura.br to ^bb3 // NEURA-NEXT: ^bb2: // pred: ^bb0 -// NEURA-NEXT: %7 = neura.load_indexed %arg7[%1 : index] memref : i32 -// NEURA-NEXT: neura.store_indexed %7 to %arg7[%1 : index] memref : i32 +// NEURA-NEXT: %7 = neura.load_indexed %arg8[%1 : index] memref : i32 +// NEURA-NEXT: neura.store_indexed %7 to %arg8[%1 : index] memref : i32 // NEURA-NEXT: neura.br to ^bb3 // NEURA-NEXT: ^bb3: // 2 preds: ^bb1, ^bb2 // NEURA-NEXT: neura.yield // NEURA-NEXT: } -// NEURA-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// NEURA-NEXT: }) : (memref, memref, i32) -> memref +// NEURA-NEXT: taskflow.yield writes(%arg4 : memref) +// NEURA-NEXT: } // NEURA-NEXT: return // NEURA-NEXT: } // NEURA-NEXT: } @@ -214,11 +209,11 @@ module attributes {} { // DATAFLOW: module { // DATAFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// DATAFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// DATAFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : (memref, memref, memref, i32) -> (memref) { +// DATAFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// 
DATAFLOW-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate"} { -// DATAFLOW-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// DATAFLOW-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate"} { +// DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: i32, %arg8: memref): // DATAFLOW-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : !neura.data // DATAFLOW-NEXT: %2 = neura.load_indexed [%1 : !neura.data] {lhs_value = "%input0"} : !neura.data // DATAFLOW-NEXT: %3 = "neura.icmp"(%2) <{cmpType = "sgt"}> {rhs_value = "%input1"} : (!neura.data) -> !neura.data @@ -233,8 +228,8 @@ module attributes {} { // DATAFLOW-NEXT: neura.store_indexed %10 to [%4 : !neura.data] {rhs_value = "%input2"} : !neura.data // DATAFLOW-NEXT: neura.yield {yield_type = "void"} // DATAFLOW-NEXT: } -// DATAFLOW-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// DATAFLOW-NEXT: }) : (memref, memref, i32) -> memref +// DATAFLOW-NEXT: taskflow.yield writes(%arg4 : memref) +// DATAFLOW-NEXT: } // DATAFLOW-NEXT: return // DATAFLOW-NEXT: } // DATAFLOW-NEXT: } @@ -242,11 +237,11 @@ module attributes {} { // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg1, %c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// MAPPED-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: i32): +// MAPPED-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1), original_write_memrefs(%arg1)] : 
(memref, memref, memref, i32) -> (memref) { +// MAPPED-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index -// MAPPED-NEXT: neura.kernel inputs(%arg2, %arg4, %arg3 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 1 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { -// MAPPED-NEXT: ^bb0(%arg5: memref, %arg6: i32, %arg7: memref): +// MAPPED-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 1 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: i32, %arg8: memref): // MAPPED-NEXT: %1 = neura.counter {counter_id = 0 : i32, counter_type = "leaf", dfg_id = 0 : i32, lower_bound = 0 : index, mapping_locs = [{id = 5 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 1 : i32}], step = 1 : index, upper_bound = 32 : index} : !neura.data // MAPPED-NEXT: %2 = "neura.data_mov"(%1) {dfg_id = 2 : i32, mapping_locs = [{id = 160 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data // MAPPED-NEXT: %3 = neura.load_indexed [%2 : !neura.data] {dfg_id = 5 : i32, lhs_value = "%input0", mapping_locs = [{id = 5 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data @@ -277,10 +272,8 @@ module attributes {} { // 
MAPPED-NEXT: neura.store_indexed %25 to [%26 : !neura.data] {dfg_id = 28 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 1 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 1 : i32}], rhs_value = "%input2"} : !neura.data // MAPPED-NEXT: neura.yield {dfg_id = 1 : i32, yield_type = "void"} // MAPPED-NEXT: } -// MAPPED-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref) -> () -// MAPPED-NEXT: }) : (memref, memref, i32) -> memref +// MAPPED-NEXT: taskflow.yield writes(%arg4 : memref) +// MAPPED-NEXT: } // MAPPED-NEXT: return // MAPPED-NEXT: } -// MAPPED-NEXT: } - - +// MAPPED-NEXT: } \ No newline at end of file diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 9d1e6f46..906bc267 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -1,18 +1,18 @@ -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: -o %t.serialized.mlir +// RUN: FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED + +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: -o %t.taskflow.mlir // RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: -o %t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ -// RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ -// RUN: -o %t.canonicalized.mlir -// RUN: FileCheck %s --input-file=%t.canonicalized.mlir 
--check-prefixes=CANONICALIZE - #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)> module attributes {} { func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { @@ -49,6 +49,45 @@ module attributes {} { } } +// SERIALIZED: module { +// SERIALIZED-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { +// SERIALIZED-NEXT: %c2_i32 = arith.constant 2 : i32 +// SERIALIZED-NEXT: %c8_i32 = arith.constant 8 : i32 +// SERIALIZED-NEXT: %c0_i32 = arith.constant 0 : i32 +// SERIALIZED-NEXT: %alloca = memref.alloca() : memref +// SERIALIZED-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> +// SERIALIZED-NEXT: %0 = affine.for %arg0 = 0 to 5 iter_args(%arg1 = %c0_i32) -> (i32) { +// SERIALIZED-NEXT: %2 = arith.index_cast %arg0 : index to i32 +// SERIALIZED-NEXT: %3 = arith.addi %arg1, %2 : i32 +// SERIALIZED-NEXT: affine.yield %3 : i32 +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg0 = 0 to 4 { +// SERIALIZED-NEXT: %2 = arith.index_cast %arg0 : index to i32 +// SERIALIZED-NEXT: %3 = arith.muli %2, %c8_i32 : i32 +// SERIALIZED-NEXT: affine.for %arg1 = 0 to 8 { +// SERIALIZED-NEXT: %4 = arith.index_cast %arg1 : index to i32 +// SERIALIZED-NEXT: %5 = arith.addi %3, %4 : i32 +// SERIALIZED-NEXT: affine.store %5, %alloca_0[%arg0, %arg1] : memref<4x8xi32> +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg0 = 0 to 4 { +// SERIALIZED-NEXT: %2 = arith.index_cast %arg0 : index to i32 +// SERIALIZED-NEXT: %3 = arith.muli %2, %c8_i32 : i32 +// SERIALIZED-NEXT: affine.for %arg1 = 0 to 8 { +// SERIALIZED-NEXT: %4 = affine.load %alloca_0[%arg0, %arg1] : memref<4x8xi32> +// SERIALIZED-NEXT: %5 = arith.addi %4, %0 : i32 +// SERIALIZED-NEXT: affine.if #set(%arg0, %arg1) { +// SERIALIZED-NEXT: affine.store %5, %alloca[] : memref +// SERIALIZED-NEXT: %6 = arith.muli %5, %c2_i32 : i32 +// SERIALIZED-NEXT: affine.store %6, %alloca[] : memref +// SERIALIZED-NEXT: } +// 
SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: %1 = affine.load %alloca[] : memref +// SERIALIZED-NEXT: return %1 : i32 +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } + // TASKFLOW: #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)> // TASKFLOW-NEXT: module { // TASKFLOW-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { @@ -57,26 +96,34 @@ module attributes {} { // TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 // TASKFLOW-NEXT: %alloca = memref.alloca() : memref // TASKFLOW-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> -// TASKFLOW-NEXT: %value_outputs = "taskflow.task"(%c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// TASKFLOW-NEXT: %value_outputs = taskflow.task @Task_0 value_inputs(%c0_i32 : i32) : (i32) -> (i32) { // TASKFLOW-NEXT: ^bb0(%arg0: i32): // TASKFLOW-NEXT: %1 = affine.for %arg1 = 0 to 5 iter_args(%arg2 = %arg0) -> (i32) { // TASKFLOW-NEXT: %2 = arith.index_cast %arg1 : index to i32 // TASKFLOW-NEXT: %3 = arith.addi %arg2, %2 : i32 // TASKFLOW-NEXT: affine.yield %3 : i32 // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%1) <{operandSegmentSizes = array}> : (i32) -> () -// TASKFLOW-NEXT: }) : (i32) -> i32 -// TASKFLOW-NEXT: %memory_outputs:2 = "taskflow.task"(%alloca_0, %alloca, %c8_i32, %value_outputs, %c2_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// TASKFLOW-NEXT: taskflow.yield values(%1 : i32) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { +// TASKFLOW-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): +// TASKFLOW-NEXT: affine.for %arg2 = 0 to 4 { +// TASKFLOW-NEXT: %1 = arith.index_cast %arg2 : index to i32 +// TASKFLOW-NEXT: %2 = arith.muli %1, %arg1 : i32 +// TASKFLOW-NEXT: affine.for 
%arg3 = 0 to 8 { +// TASKFLOW-NEXT: %3 = arith.index_cast %arg3 : index to i32 +// TASKFLOW-NEXT: %4 = arith.addi %2, %3 : i32 +// TASKFLOW-NEXT: affine.store %4, %arg0[%arg2, %arg3] : memref<4x8xi32> +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0), original_write_memrefs(%alloca)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { // TASKFLOW-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): // TASKFLOW-NEXT: affine.for %arg5 = 0 to 4 { // TASKFLOW-NEXT: %1 = arith.index_cast %arg5 : index to i32 // TASKFLOW-NEXT: %2 = arith.muli %1, %arg2 : i32 // TASKFLOW-NEXT: affine.for %arg6 = 0 to 8 { -// TASKFLOW-NEXT: %3 = arith.index_cast %arg6 : index to i32 -// TASKFLOW-NEXT: %4 = arith.addi %2, %3 : i32 -// TASKFLOW-NEXT: affine.store %4, %arg0[%arg5, %arg6] : memref<4x8xi32> -// TASKFLOW-NEXT: } -// TASKFLOW-NEXT: affine.for %arg6 = 0 to 8 { // TASKFLOW-NEXT: %3 = affine.load %arg0[%arg5, %arg6] : memref<4x8xi32> // TASKFLOW-NEXT: %4 = arith.addi %3, %arg3 : i32 // TASKFLOW-NEXT: affine.if #set(%arg5, %arg6) { @@ -86,133 +133,84 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%arg0, %arg1) <{operandSegmentSizes = array}> : (memref<4x8xi32>, memref) -> () -// TASKFLOW-NEXT: }) : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) -// TASKFLOW-NEXT: %0 = affine.load %memory_outputs#1[] : memref +// TASKFLOW-NEXT: taskflow.yield writes(%arg1 : memref) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %0 = affine.load %write_outputs_1[] : memref // TASKFLOW-NEXT: return %0 : i32 // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// HYPERBLOCK: 
module { -// HYPERBLOCK-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { -// HYPERBLOCK-NEXT: %c2_i32 = arith.constant 2 : i32 -// HYPERBLOCK-NEXT: %c8_i32 = arith.constant 8 : i32 -// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 -// HYPERBLOCK-NEXT: %alloca = memref.alloca() : memref -// HYPERBLOCK-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> -// HYPERBLOCK-NEXT: %value_outputs = "taskflow.task"(%c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg0: i32): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index -// HYPERBLOCK-NEXT: %2 = "taskflow.hyperblock"(%1, %arg0) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg1: index, %arg2: i32): -// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg1 : index to i32 -// HYPERBLOCK-NEXT: %4 = arith.addi %arg2, %3 : i32 -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) -// HYPERBLOCK-NEXT: }) : (index, i32) -> i32 -// HYPERBLOCK-NEXT: "taskflow.yield"(%2) <{operandSegmentSizes = array}> : (i32) -> () -// HYPERBLOCK-NEXT: }) : (i32) -> i32 -// HYPERBLOCK-NEXT: %memory_outputs:2 = "taskflow.task"(%alloca_0, %alloca, %c8_i32, %value_outputs, %c2_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// 
HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg5: index, %arg6: index): -// HYPERBLOCK-NEXT: %4 = arith.index_cast %arg5 : index to i32 -// HYPERBLOCK-NEXT: %5 = arith.muli %4, %arg2 : i32 -// HYPERBLOCK-NEXT: %6 = arith.index_cast %arg6 : index to i32 -// HYPERBLOCK-NEXT: %7 = arith.addi %5, %6 : i32 -// HYPERBLOCK-NEXT: memref.store %7, %arg0[%arg5, %arg6] : memref<4x8xi32> -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %3) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg5: index, %arg6: index): -// HYPERBLOCK-NEXT: %4 = memref.load %arg0[%arg5, %arg6] : memref<4x8xi32> -// HYPERBLOCK-NEXT: %5 = arith.addi %4, %arg3 : i32 -// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index -// HYPERBLOCK-NEXT: %c-3 = arith.constant -3 : index -// HYPERBLOCK-NEXT: %6 = arith.addi %arg5, %c-3 : index -// HYPERBLOCK-NEXT: %7 = arith.cmpi eq, %6, %c0 : index -// HYPERBLOCK-NEXT: %c-7 = arith.constant -7 : index -// HYPERBLOCK-NEXT: %8 = arith.addi %arg6, %c-7 : index -// HYPERBLOCK-NEXT: %9 = arith.cmpi eq, %8, %c0 : index -// HYPERBLOCK-NEXT: %10 = arith.andi %7, %9 : i1 -// HYPERBLOCK-NEXT: scf.if %10 { -// HYPERBLOCK-NEXT: memref.store %5, %arg1[] : memref -// HYPERBLOCK-NEXT: %11 = arith.muli %5, %arg4 : i32 -// HYPERBLOCK-NEXT: memref.store %11, %arg1[] : memref -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.yield"(%arg0, %arg1) <{operandSegmentSizes = array}> : (memref<4x8xi32>, memref) -> () -// HYPERBLOCK-NEXT: }) : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) -// HYPERBLOCK-NEXT: %0 = affine.load %memory_outputs#1[] : memref -// HYPERBLOCK-NEXT: return %0 : i32 -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: } - - - -// CANONICALIZE: module { -// CANONICALIZE-NEXT: func.func 
@_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { -// CANONICALIZE-NEXT: %c2_i32 = arith.constant 2 : i32 -// CANONICALIZE-NEXT: %c8_i32 = arith.constant 8 : i32 -// CANONICALIZE-NEXT: %c0_i32 = arith.constant 0 : i32 -// CANONICALIZE-NEXT: %alloca = memref.alloca() : memref -// CANONICALIZE-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> -// CANONICALIZE-NEXT: %value_outputs = "taskflow.task"(%c0_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg0: i32): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index -// CANONICALIZE-NEXT: %2 = "taskflow.hyperblock"(%1, %arg0) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg1: index, %arg2: i32): -// CANONICALIZE-NEXT: %3 = arith.index_cast %arg1 : index to i32 -// CANONICALIZE-NEXT: %4 = arith.addi %arg2, %3 : i32 -// CANONICALIZE-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) -// CANONICALIZE-NEXT: }) : (index, i32) -> i32 -// CANONICALIZE-NEXT: "taskflow.yield"(%2) <{operandSegmentSizes = array}> : (i32) -> () -// CANONICALIZE-NEXT: }) : (i32) -> i32 -// CANONICALIZE-NEXT: %memory_outputs = "taskflow.task"(%alloca_0, %c8_i32, %alloca_0) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32, %arg2: memref<4x8xi32>): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg3: index, %arg4: index): -// CANONICALIZE-NEXT: %3 = arith.index_cast %arg3 : index to 
i32 -// CANONICALIZE-NEXT: %4 = arith.muli %3, %arg1 : i32 -// CANONICALIZE-NEXT: %5 = arith.index_cast %arg4 : index to i32 -// CANONICALIZE-NEXT: %6 = arith.addi %4, %5 : i32 -// CANONICALIZE-NEXT: memref.store %6, %arg2[%arg3, %arg4] : memref<4x8xi32> -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg2) <{operandSegmentSizes = array}> : (memref<4x8xi32>) -> () -// CANONICALIZE-NEXT: }) : (memref<4x8xi32>, i32, memref<4x8xi32>) -> memref<4x8xi32> -// CANONICALIZE-NEXT: %memory_outputs_1 = "taskflow.task"(%memory_outputs, %alloca, %alloca_0, %value_outputs, %alloca, %c2_i32) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_2"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: memref<4x8xi32>, %arg3: i32, %arg4: memref, %arg5: i32): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg6: index, %arg7: index): -// CANONICALIZE-NEXT: %3 = memref.load %arg2[%arg6, %arg7] : memref<4x8xi32> -// CANONICALIZE-NEXT: %4 = arith.addi %3, %arg3 : i32 -// CANONICALIZE-NEXT: %c0 = arith.constant 0 : index -// CANONICALIZE-NEXT: %c-3 = arith.constant -3 : index -// CANONICALIZE-NEXT: %5 = arith.addi %arg6, %c-3 : index -// CANONICALIZE-NEXT: %6 = arith.cmpi eq, %5, %c0 : index -// CANONICALIZE-NEXT: %c-7 = arith.constant -7 : index -// CANONICALIZE-NEXT: %7 = arith.addi %arg7, %c-7 : index -// CANONICALIZE-NEXT: %8 = arith.cmpi eq, %7, %c0 : index -// CANONICALIZE-NEXT: %9 = arith.andi %6, %8 : i1 -// CANONICALIZE-NEXT: scf.if %9 { -// CANONICALIZE-NEXT: memref.store %4, %arg4[] : memref 
-// CANONICALIZE-NEXT: %10 = arith.muli %4, %arg5 : i32 -// CANONICALIZE-NEXT: memref.store %10, %arg4[] : memref -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg4) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref<4x8xi32>, memref, memref<4x8xi32>, i32, memref, i32) -> memref -// CANONICALIZE-NEXT: %0 = affine.load %memory_outputs_1[] : memref -// CANONICALIZE-NEXT: return %0 : i32 -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } - +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %c2_i32 = arith.constant 2 : i32 +// HYPERBLOCK-NEXT: %c8_i32 = arith.constant 8 : i32 +// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 +// HYPERBLOCK-NEXT: %alloca = memref.alloca() : memref +// HYPERBLOCK-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> +// HYPERBLOCK-NEXT: %value_outputs = taskflow.task @Task_0 value_inputs(%c0_i32 : i32) : (i32) -> (i32) { +// HYPERBLOCK-NEXT: ^bb0(%arg0: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index +// HYPERBLOCK-NEXT: %2 = "taskflow.hyperblock"(%1, %arg0) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg1: index, %arg2: i32): +// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg1 : index to i32 +// HYPERBLOCK-NEXT: %4 = arith.addi %arg2, %3 : i32 +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) +// HYPERBLOCK-NEXT: }) : (index, i32) -> i32 +// HYPERBLOCK-NEXT: taskflow.yield values(%2 : i32) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { +// 
HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg2: index): +// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg2 : index to i32 +// HYPERBLOCK-NEXT: %4 = arith.muli %3, %arg1 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index +// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index +// HYPERBLOCK-NEXT: scf.for %arg3 = %c0 to %c8 step %c1 { +// HYPERBLOCK-NEXT: %5 = arith.index_cast %arg3 : index to i32 +// HYPERBLOCK-NEXT: %6 = arith.addi %4, %5 : i32 +// HYPERBLOCK-NEXT: memref.store %6, %arg0[%arg2, %arg3] : memref<4x8xi32> +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0), original_write_memrefs(%alloca)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ +// 
HYPERBLOCK-NEXT: ^bb0(%arg5: index): +// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg5 : index to i32 +// HYPERBLOCK-NEXT: %4 = arith.muli %3, %arg2 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index +// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index +// HYPERBLOCK-NEXT: scf.for %arg6 = %c0 to %c8 step %c1 { +// HYPERBLOCK-NEXT: %5 = memref.load %arg0[%arg5, %arg6] : memref<4x8xi32> +// HYPERBLOCK-NEXT: %6 = arith.addi %5, %arg3 : i32 +// HYPERBLOCK-NEXT: %c0_2 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c-3 = arith.constant -3 : index +// HYPERBLOCK-NEXT: %7 = arith.addi %arg5, %c-3 : index +// HYPERBLOCK-NEXT: %8 = arith.cmpi eq, %7, %c0_2 : index +// HYPERBLOCK-NEXT: %c-7 = arith.constant -7 : index +// HYPERBLOCK-NEXT: %9 = arith.addi %arg6, %c-7 : index +// HYPERBLOCK-NEXT: %10 = arith.cmpi eq, %9, %c0_2 : index +// HYPERBLOCK-NEXT: %11 = arith.andi %8, %10 : i1 +// HYPERBLOCK-NEXT: scf.if %11 { +// HYPERBLOCK-NEXT: memref.store %6, %arg1[] : memref +// HYPERBLOCK-NEXT: %12 = arith.muli %6, %arg4 : i32 +// HYPERBLOCK-NEXT: memref.store %12, %arg1[] : memref +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg1 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[] : memref +// HYPERBLOCK-NEXT: return %0 : i32 +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT:} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index c5f75f28..509614a1 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -1,18 +1,18 @@ -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: -o %t.serialized.mlir +// RUN: 
FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED + +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: -o %t.taskflow.mlir // RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: -o %t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ -// RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ -// RUN: -o %t.canonicalized.mlir -// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE - module attributes {} { func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { affine.for %arg10 = 0 to 4 { @@ -52,183 +52,204 @@ module attributes {} { } } +// SERIALIZED: module { +// SERIALIZED-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// SERIALIZED-NEXT: affine.for %arg10 = 0 to 4 { +// SERIALIZED-NEXT: affine.for %arg11 = 0 to 8 { +// SERIALIZED-NEXT: affine.for %arg12 = 0 to 6 { +// SERIALIZED-NEXT: %1 = affine.load %arg0[%arg10, %arg11, %arg12] : memref +// SERIALIZED-NEXT: affine.store %1, %arg5[%arg12] : memref +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg10 = 0 to 4 { +// SERIALIZED-NEXT: affine.for 
%arg11 = 0 to 8 { +// SERIALIZED-NEXT: affine.for %arg12 = 0 to 5 { +// SERIALIZED-NEXT: %1 = affine.load %arg1[%arg10, %arg11, %arg12] : memref +// SERIALIZED-NEXT: %2 = affine.load %arg2[%arg10, %arg11, %arg12] : memref +// SERIALIZED-NEXT: %3 = arith.addi %1, %2 : i32 +// SERIALIZED-NEXT: affine.store %3, %arg6[%arg12] : memref +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg10 = 0 to 4 { +// SERIALIZED-NEXT: affine.for %arg11 = 0 to 8 { +// SERIALIZED-NEXT: affine.for %arg12 = 0 to 6 { +// SERIALIZED-NEXT: %1 = affine.load %arg5[%arg12] : memref +// SERIALIZED-NEXT: %2 = affine.load %arg6[%arg12] : memref +// SERIALIZED-NEXT: %3 = arith.addi %1, %2 : i32 +// SERIALIZED-NEXT: %4 = affine.load %arg9[0] : memref +// SERIALIZED-NEXT: %5 = arith.addi %4, %3 : i32 +// SERIALIZED-NEXT: affine.store %5, %arg9[0] : memref +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg10 = 0 to 4 { +// SERIALIZED-NEXT: affine.for %arg11 = 0 to 7 { +// SERIALIZED-NEXT: %1 = affine.load %arg3[%arg10, %arg11] : memref +// SERIALIZED-NEXT: affine.store %1, %arg7[%arg11] : memref +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg10 = 0 to 4 { +// SERIALIZED-NEXT: affine.for %arg11 = 0 to 9 { +// SERIALIZED-NEXT: %1 = affine.load %arg4[%arg10, %arg11] : memref +// SERIALIZED-NEXT: %2 = affine.load %arg7[%arg11] : memref +// SERIALIZED-NEXT: %3 = arith.addi %1, %2 : i32 +// SERIALIZED-NEXT: affine.store %3, %arg8[%arg11] : memref +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: %0 = affine.load %arg9[0] : memref +// SERIALIZED-NEXT: return %0 : i32 +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } + // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, 
%arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// TASKFLOW-NEXT: %memory_outputs:5 = "taskflow.task"(%arg0, %arg1, %arg2, %arg5, %arg6, %arg9, %arg3, %arg4, %arg7, %arg8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref, %arg16: memref, %arg17: memref, %arg18: memref, %arg19: memref): -// TASKFLOW-NEXT: affine.for %arg20 = 0 to 4 { -// TASKFLOW-NEXT: affine.for %arg21 = 0 to 8 { -// TASKFLOW-NEXT: affine.for %arg22 = 0 to 6 { -// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg20, %arg21, %arg22] : memref -// TASKFLOW-NEXT: affine.store %1, %arg13[%arg22] : memref +// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0), original_write_memrefs(%arg5)] : (memref, memref) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref): +// TASKFLOW-NEXT: affine.for %arg12 = 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg13 = 0 to 8 { +// TASKFLOW-NEXT: affine.for %arg14 = 0 to 6 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg12, %arg13, %arg14] : memref +// TASKFLOW-NEXT: affine.store %1, %arg11[%arg14] : memref // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: affine.for %arg22 = 0 to 5 { -// TASKFLOW-NEXT: %1 = affine.load %arg11[%arg20, %arg21, %arg22] : memref -// TASKFLOW-NEXT: %2 = affine.load %arg12[%arg20, %arg21, %arg22] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg11 : memref) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2), original_write_memrefs(%arg6)] : (memref, memref, memref) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): +// TASKFLOW-NEXT: affine.for %arg13 
= 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg14 = 0 to 8 { +// TASKFLOW-NEXT: affine.for %arg15 = 0 to 5 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg13, %arg14, %arg15] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg11[%arg13, %arg14, %arg15] : memref // TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 -// TASKFLOW-NEXT: affine.store %3, %arg14[%arg22] : memref +// TASKFLOW-NEXT: affine.store %3, %arg12[%arg15] : memref // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: affine.for %arg22 = 0 to 6 { -// TASKFLOW-NEXT: %1 = affine.load %arg13[%arg22] : memref -// TASKFLOW-NEXT: %2 = affine.load %arg14[%arg22] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg12 : memref) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9), original_write_memrefs(%arg9)] : (memref, memref, memref, memref) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): +// TASKFLOW-NEXT: affine.for %arg14 = 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg15 = 0 to 8 { +// TASKFLOW-NEXT: affine.for %arg16 = 0 to 6 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg16] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg11[%arg16] : memref // TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 -// TASKFLOW-NEXT: %4 = affine.load %arg15[0] : memref +// TASKFLOW-NEXT: %4 = affine.load %arg13[0] : memref // TASKFLOW-NEXT: %5 = arith.addi %4, %3 : i32 -// TASKFLOW-NEXT: affine.store %5, %arg15[0] : memref +// TASKFLOW-NEXT: affine.store %5, %arg13[0] : memref // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: affine.for %arg21 = 0 to 7 { -// TASKFLOW-NEXT: %1 = affine.load %arg16[%arg20, %arg21] : memref -// TASKFLOW-NEXT: affine.store %1, %arg18[%arg21] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg13 : memref) +// 
TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_2 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3), original_write_memrefs(%arg7)] : (memref, memref) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref): +// TASKFLOW-NEXT: affine.for %arg12 = 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg13 = 0 to 7 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg12, %arg13] : memref +// TASKFLOW-NEXT: affine.store %1, %arg11[%arg13] : memref // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: affine.for %arg21 = 0 to 9 { -// TASKFLOW-NEXT: %1 = affine.load %arg17[%arg20, %arg21] : memref -// TASKFLOW-NEXT: %2 = affine.load %arg18[%arg21] : memref +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: taskflow.yield writes(%arg11 : memref) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_3 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_2 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7), original_write_memrefs(%arg8)] : (memref, memref, memref) -> (memref) { +// TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): +// TASKFLOW-NEXT: affine.for %arg13 = 0 to 4 { +// TASKFLOW-NEXT: affine.for %arg14 = 0 to 9 { +// TASKFLOW-NEXT: %1 = affine.load %arg10[%arg13, %arg14] : memref +// TASKFLOW-NEXT: %2 = affine.load %arg11[%arg14] : memref // TASKFLOW-NEXT: %3 = arith.addi %1, %2 : i32 -// TASKFLOW-NEXT: affine.store %3, %arg19[%arg21] : memref +// TASKFLOW-NEXT: affine.store %3, %arg12[%arg14] : memref // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%arg13, %arg14, %arg15, %arg18, %arg19) <{operandSegmentSizes = array}> : (memref, memref, memref, memref, memref) -> () -// TASKFLOW-NEXT: }) : (memref, memref, memref, memref, memref, memref, memref, memref, memref, memref) -> (memref, memref, memref, memref, memref) -// TASKFLOW-NEXT: %0 = affine.load %memory_outputs#2[0] : memref +// TASKFLOW-NEXT: taskflow.yield writes(%arg12 : memref) 
+// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %0 = affine.load %write_outputs_1[0] : memref // TASKFLOW-NEXT: return %0 : i32 // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// HYPERBLOCK: module { -// HYPERBLOCK-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// HYPERBLOCK-NEXT: %memory_outputs:5 = "taskflow.task"(%arg0, %arg1, %arg2, %arg5, %arg6, %arg9, %arg3, %arg4, %arg7, %arg8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref, %arg16: memref, %arg17: memref, %arg18: memref, %arg19: memref): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index -// HYPERBLOCK-NEXT: %4 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index -// HYPERBLOCK-NEXT: %5 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index -// HYPERBLOCK-NEXT: %6 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index -// HYPERBLOCK-NEXT: %7 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2, %3) <{operandSegmentSizes = 
array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index, %arg22: index): -// HYPERBLOCK-NEXT: %8 = memref.load %arg10[%arg20, %arg21, %arg22] : memref -// HYPERBLOCK-NEXT: memref.store %8, %arg13[%arg22] : memref -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2, %4) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index, %arg22: index): -// HYPERBLOCK-NEXT: %8 = memref.load %arg11[%arg20, %arg21, %arg22] : memref -// HYPERBLOCK-NEXT: %9 = memref.load %arg12[%arg20, %arg21, %arg22] : memref -// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 -// HYPERBLOCK-NEXT: memref.store %10, %arg14[%arg22] : memref -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%5) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg20: index): -// HYPERBLOCK-NEXT: %8 = memref.load %arg13[%arg20] : memref -// HYPERBLOCK-NEXT: %9 = memref.load %arg14[%arg20] : memref -// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 -// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index -// HYPERBLOCK-NEXT: %11 = memref.load %arg15[%c0] : memref -// HYPERBLOCK-NEXT: %12 = arith.addi %11, %10 : i32 -// HYPERBLOCK-NEXT: %c0_0 = arith.constant 0 : index -// HYPERBLOCK-NEXT: memref.store %12, %arg15[%c0_0] : memref -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %6) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg20: index, %arg21: index): -// HYPERBLOCK-NEXT: %8 = memref.load %arg16[%arg20, %arg21] : memref -// HYPERBLOCK-NEXT: memref.store %8, %arg18[%arg21] : memref -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %7) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: 
^bb0(%arg20: index, %arg21: index): -// HYPERBLOCK-NEXT: %8 = memref.load %arg17[%arg20, %arg21] : memref -// HYPERBLOCK-NEXT: %9 = memref.load %arg18[%arg21] : memref -// HYPERBLOCK-NEXT: %10 = arith.addi %8, %9 : i32 -// HYPERBLOCK-NEXT: memref.store %10, %arg19[%arg21] : memref -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.yield"(%arg13, %arg14, %arg15, %arg18, %arg19) <{operandSegmentSizes = array}> : (memref, memref, memref, memref, memref) -> () -// HYPERBLOCK-NEXT: }) : (memref, memref, memref, memref, memref, memref, memref, memref, memref, memref) -> (memref, memref, memref, memref, memref) -// HYPERBLOCK-NEXT: %0 = affine.load %memory_outputs#2[0] : memref -// HYPERBLOCK-NEXT: return %0 : i32 -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: } - -// CANONICALIZE: module { -// CANONICALIZE-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// CANONICALIZE-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg5, %arg0, %arg5) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2, %3) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg14: 
index, %arg15: index, %arg16: index): -// CANONICALIZE-NEXT: %4 = memref.load %arg12[%arg14, %arg15, %arg16] : memref -// CANONICALIZE-NEXT: memref.store %4, %arg13[%arg16] : memref -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg13) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, memref, memref) -> memref -// CANONICALIZE-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg6, %arg1, %arg2, %arg6) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2, %3) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg16: index, %arg17: index, %arg18: index): -// CANONICALIZE-NEXT: %4 = memref.load %arg13[%arg16, %arg17, %arg18] : memref -// CANONICALIZE-NEXT: %5 = memref.load %arg14[%arg16, %arg17, %arg18] : memref -// CANONICALIZE-NEXT: %6 = arith.addi %4, %5 : i32 -// CANONICALIZE-NEXT: memref.store %6, %arg15[%arg18] : memref -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg15) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, memref, memref, memref, memref) -> memref -// CANONICALIZE-NEXT: %memory_outputs_1 = 
"taskflow.task"(%memory_outputs, %memory_outputs_0, %arg9, %arg5, %arg6, %arg9) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_2"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%3) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg16: index): -// CANONICALIZE-NEXT: %4 = memref.load %arg13[%arg16] : memref -// CANONICALIZE-NEXT: %5 = memref.load %arg14[%arg16] : memref -// CANONICALIZE-NEXT: %6 = arith.addi %4, %5 : i32 -// CANONICALIZE-NEXT: %c0 = arith.constant 0 : index -// CANONICALIZE-NEXT: %7 = memref.load %arg15[%c0] : memref -// CANONICALIZE-NEXT: %8 = arith.addi %7, %6 : i32 -// CANONICALIZE-NEXT: %c0_4 = arith.constant 0 : index -// CANONICALIZE-NEXT: memref.store %8, %arg15[%c0_4] : memref -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg15) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, memref, memref, memref, memref) -> memref -// CANONICALIZE-NEXT: %memory_outputs_2 = "taskflow.task"(%arg3, %arg7, %arg3, %arg7) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_3"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// 
CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg14: index, %arg15: index): -// CANONICALIZE-NEXT: %3 = memref.load %arg12[%arg14, %arg15] : memref -// CANONICALIZE-NEXT: memref.store %3, %arg13[%arg15] : memref -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg13) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, memref, memref) -> memref -// CANONICALIZE-NEXT: %memory_outputs_3 = "taskflow.task"(%arg4, %memory_outputs_2, %arg8, %arg4, %arg7, %arg8) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_4"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref, %arg14: memref, %arg15: memref): -// CANONICALIZE-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// CANONICALIZE-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg16: index, %arg17: index): -// CANONICALIZE-NEXT: %3 = memref.load %arg13[%arg16, %arg17] : memref -// CANONICALIZE-NEXT: %4 = memref.load %arg14[%arg17] : memref -// CANONICALIZE-NEXT: %5 = arith.addi %3, %4 : i32 -// CANONICALIZE-NEXT: memref.store %5, %arg15[%arg17] : memref -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg15) <{operandSegmentSizes = array}> : (memref) -> () -// CANONICALIZE-NEXT: }) : (memref, memref, memref, memref, memref, memref) -> memref -// 
CANONICALIZE-NEXT: %0 = affine.load %memory_outputs_1[0] : memref -// CANONICALIZE-NEXT: return %0 : i32 -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } \ No newline at end of file +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0), original_write_memrefs(%arg5)] : (memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2, %3) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg12: index, %arg13: index, %arg14: index): +// HYPERBLOCK-NEXT: %4 = memref.load %arg10[%arg12, %arg13, %arg14] : memref +// HYPERBLOCK-NEXT: memref.store %4, %arg11[%arg14] : memref +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index, index, index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg11 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2), original_write_memrefs(%arg6)] : (memref, memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: 
memref, %arg12: memref): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2, %3) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg13: index, %arg14: index, %arg15: index): +// HYPERBLOCK-NEXT: %4 = memref.load %arg10[%arg13, %arg14, %arg15] : memref +// HYPERBLOCK-NEXT: %5 = memref.load %arg11[%arg13, %arg14, %arg15] : memref +// HYPERBLOCK-NEXT: %6 = arith.addi %4, %5 : i32 +// HYPERBLOCK-NEXT: memref.store %6, %arg12[%arg15] : memref +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index, index, index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg12 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9), original_write_memrefs(%arg9)] : (memref, memref, memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// HYPERBLOCK-NEXT: %3 = taskflow.counter parent(%2 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 6 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%3) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: 
^bb0(%arg14: index): +// HYPERBLOCK-NEXT: %4 = memref.load %arg10[%arg14] : memref +// HYPERBLOCK-NEXT: %5 = memref.load %arg11[%arg14] : memref +// HYPERBLOCK-NEXT: %6 = arith.addi %4, %5 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %7 = memref.load %arg13[%c0] : memref +// HYPERBLOCK-NEXT: %8 = arith.addi %7, %6 : i32 +// HYPERBLOCK-NEXT: %c0_4 = arith.constant 0 : index +// HYPERBLOCK-NEXT: memref.store %8, %arg13[%c0_4] : memref +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg13 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_2 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3), original_write_memrefs(%arg7)] : (memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg12: index, %arg13: index): +// HYPERBLOCK-NEXT: %3 = memref.load %arg10[%arg12, %arg13] : memref +// HYPERBLOCK-NEXT: memref.store %3, %arg11[%arg13] : memref +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index, index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg11 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_3 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_2 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7), original_write_memrefs(%arg8)] : (memref, memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): +// 
HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1, %2) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg13: index, %arg14: index): +// HYPERBLOCK-NEXT: %3 = memref.load %arg10[%arg13, %arg14] : memref +// HYPERBLOCK-NEXT: %4 = memref.load %arg11[%arg14] : memref +// HYPERBLOCK-NEXT: %5 = arith.addi %3, %4 : i32 +// HYPERBLOCK-NEXT: memref.store %5, %arg12[%arg14] : memref +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index, index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg12 : memref) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[0] : memref +// HYPERBLOCK-NEXT: return %0 : i32 +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT:} \ No newline at end of file diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index ee37c831..dee9c268 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -1,18 +1,18 @@ -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: -o %t.serialized.mlir +// RUN: FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED + +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: -o %t.taskflow.mlir // RUN: FileCheck %s --input-file=%t.taskflow.mlir --check-prefixes=TASKFLOW -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --convert-affine-to-taskflow \ // RUN: --construct-hyperblock-from-task \ // RUN: -o 
%t.hyperblock.mlir // RUN: FileCheck %s --input-file=%t.hyperblock.mlir --check-prefixes=HYPERBLOCK -// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ -// RUN: --construct-hyperblock-from-task \ -// RUN: --canonicalize-task \ -// RUN: -o %t.canonicalized.mlir -// RUN: FileCheck %s --input-file=%t.canonicalized.mlir --check-prefixes=CANONICALIZE - module { // Example: Parallel nested loops scenario // Task 0: Single-level loop (vector scaling) @@ -44,18 +44,37 @@ module { } } +// SERIALIZED: module { +// SERIALIZED-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { +// SERIALIZED-NEXT: affine.for %arg5 = 0 to 16 { +// SERIALIZED-NEXT: %0 = affine.load %arg0[%arg5] : memref<16xf32> +// SERIALIZED-NEXT: %1 = arith.mulf %0, %arg4 : f32 +// SERIALIZED-NEXT: affine.store %1, %arg0[%arg5] : memref<16xf32> +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: affine.for %arg5 = 0 to 8 { +// SERIALIZED-NEXT: affine.for %arg6 = 0 to 8 { +// SERIALIZED-NEXT: %0 = affine.load %arg1[%arg5, %arg6] : memref<8x8xf32> +// SERIALIZED-NEXT: %1 = affine.load %arg2[%arg5, %arg6] : memref<8x8xf32> +// SERIALIZED-NEXT: %2 = arith.mulf %0, %1 : f32 +// SERIALIZED-NEXT: affine.store %2, %arg3[%arg5, %arg6] : memref<8x8xf32> +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: return +// SERIALIZED-NEXT: } +// SERIALIZED-NEXT: } + // TASKFLOW: module { // TASKFLOW-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { -// TASKFLOW-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// TASKFLOW-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: f32): -// TASKFLOW-NEXT: affine.for %arg7 = 0 to 16 { -// TASKFLOW-NEXT: %0 = affine.load %arg5[%arg7] : memref<16xf32> -// TASKFLOW-NEXT: %1 = arith.mulf %0, %arg6 : f32 
-// TASKFLOW-NEXT: affine.store %1, %arg5[%arg7] : memref<16xf32> +// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0), original_write_memrefs(%arg0)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>) { +// TASKFLOW-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: memref<16xf32>, %arg7: f32): +// TASKFLOW-NEXT: affine.for %arg8 = 0 to 16 { +// TASKFLOW-NEXT: %0 = affine.load %arg6[%arg8] : memref<16xf32> +// TASKFLOW-NEXT: %1 = arith.mulf %0, %arg7 : f32 +// TASKFLOW-NEXT: affine.store %1, %arg6[%arg8] : memref<16xf32> // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%arg5) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () -// TASKFLOW-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> -// TASKFLOW-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// TASKFLOW-NEXT: taskflow.yield writes(%arg6 : memref<16xf32>) +// TASKFLOW-NEXT: } +// TASKFLOW-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2), original_write_memrefs(%arg3)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>) { // TASKFLOW-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): // TASKFLOW-NEXT: affine.for %arg8 = 0 to 8 { // TASKFLOW-NEXT: affine.for %arg9 = 0 to 8 { @@ -65,27 +84,27 @@ module { // TASKFLOW-NEXT: affine.store %2, %arg7[%arg8, %arg9] : memref<8x8xf32> // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: "taskflow.yield"(%arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>) -> () -// TASKFLOW-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> +// TASKFLOW-NEXT: taskflow.yield writes(%arg7 : memref<8x8xf32>) +// 
TASKFLOW-NEXT: } // TASKFLOW-NEXT: return // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { -// HYPERBLOCK-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: f32): +// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0), original_write_memrefs(%arg0)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>) { +// HYPERBLOCK-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: memref<16xf32>, %arg7: f32): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index // HYPERBLOCK-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg7: index): -// HYPERBLOCK-NEXT: %1 = memref.load %arg5[%arg7] : memref<16xf32> -// HYPERBLOCK-NEXT: %2 = arith.mulf %1, %arg6 : f32 -// HYPERBLOCK-NEXT: memref.store %2, %arg5[%arg7] : memref<16xf32> +// HYPERBLOCK-NEXT: ^bb0(%arg8: index): +// HYPERBLOCK-NEXT: %1 = memref.load %arg6[%arg8] : memref<16xf32> +// HYPERBLOCK-NEXT: %2 = arith.mulf %1, %arg7 : f32 +// HYPERBLOCK-NEXT: memref.store %2, %arg6[%arg8] : memref<16xf32> // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: "taskflow.yield"(%arg5) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () -// HYPERBLOCK-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> -// HYPERBLOCK-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg6 : 
memref<16xf32>) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2), original_write_memrefs(%arg3)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>) { // HYPERBLOCK-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index // HYPERBLOCK-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -97,40 +116,8 @@ module { // HYPERBLOCK-NEXT: memref.store %4, %arg7[%arg8, %arg9] : memref<8x8xf32> // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: "taskflow.yield"(%arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>) -> () -// HYPERBLOCK-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg7 : memref<8x8xf32>) +// HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: return // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: } - -// CANONICALIZE: module { -// CANONICALIZE-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { -// CANONICALIZE-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg4) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: f32): -// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg7: index): -// CANONICALIZE-NEXT: %1 = 
memref.load %arg5[%arg7] : memref<16xf32> -// CANONICALIZE-NEXT: %2 = arith.mulf %1, %arg6 : f32 -// CANONICALIZE-NEXT: memref.store %2, %arg5[%arg7] : memref<16xf32> -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg5) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () -// CANONICALIZE-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> -// CANONICALIZE-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): -// CANONICALIZE-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// CANONICALIZE-NEXT: "taskflow.hyperblock"(%0, %1) <{operandSegmentSizes = array}> ({ -// CANONICALIZE-NEXT: ^bb0(%arg8: index, %arg9: index): -// CANONICALIZE-NEXT: %2 = memref.load %arg5[%arg8, %arg9] : memref<8x8xf32> -// CANONICALIZE-NEXT: %3 = memref.load %arg6[%arg8, %arg9] : memref<8x8xf32> -// CANONICALIZE-NEXT: %4 = arith.mulf %2, %3 : f32 -// CANONICALIZE-NEXT: memref.store %4, %arg7[%arg8, %arg9] : memref<8x8xf32> -// CANONICALIZE-NEXT: taskflow.hyperblock.yield -// CANONICALIZE-NEXT: }) : (index, index) -> () -// CANONICALIZE-NEXT: "taskflow.yield"(%arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>) -> () -// CANONICALIZE-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> memref<8x8xf32> -// CANONICALIZE-NEXT: return -// CANONICALIZE-NEXT: } -// CANONICALIZE-NEXT: } \ No newline at end of file +// HYPERBLOCK-NEXT: } \ No newline at end of file From a5817e8aefe023a9a2dd4737fb77e9171d395f4e Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sun, 1 Feb 2026 23:42:17 +0800 
Subject: [PATCH 6/9] update submodule --- test/e2e/bicg/bicg_int_kernel.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/bicg/bicg_int_kernel.mlir b/test/e2e/bicg/bicg_int_kernel.mlir index 32f17705..f9aa4d3d 100644 --- a/test/e2e/bicg/bicg_int_kernel.mlir +++ b/test/e2e/bicg/bicg_int_kernel.mlir @@ -11,7 +11,7 @@ // RUN: mlir-neura-opt %t-kernel.mlir \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ -// RUN: --promote-func-arg-to-const \ +// RUN: --promote-input-arg-to-const \ // RUN: --fold-constant \ // RUN: --canonicalize-return \ // RUN: --canonicalize-live-in \ From bb64e5e6a526f297431506783f7d9e5cec8fdbfb Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Mon, 2 Feb 2026 23:20:04 +0800 Subject: [PATCH 7/9] enhance with sibling loops detection --- .../ConstructHyperblockFromTaskPass.cpp | 131 +++++++++++++++--- 1 file changed, 108 insertions(+), 23 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index bb503c5d..41743149 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -182,53 +182,94 @@ getTopLevelLoopsInfo(SmallVector &loops_info) { //---------------------------------------------------------------------------- // Prologue-Loop-Epilogue Code (PLE) Pattern Detection //---------------------------------------------------------------------------- -// Prologue-Loop-Epilogue Code means code that appears before and after an inner -// loop. Example: for %i (outer loop) { -// -// for %j (nested loop) { -// +// Extended PLE Pattern: Also handles sibling loops without prologue/epilogue. 
+// Pattern 1: Prologue/Epilogue exists +// for %i { +// +// for %j { } +// +// } +// +// Pattern 2: Sibling loops (no prologue/epilogue) +// for %i { +// for %j { } +// for %k { } ← Sibling loop // } -// ← Loop-Epilogue Code -// } // For this pattern, we need to wrap the inner loop and the prologue-epilogue // code into a hyperblock. Only by doing this can we maintain the hyperblock as // a pure data-driven code block. struct PLEPattern { affine::AffineForOp outer_loop; - affine::AffineForOp inner_loop; + // Supports multiple sibling loops. + SmallVector inner_loops; SmallVector prologue_code; SmallVector epilogue_code; + // Code between sibling loops. + SmallVector> inter_loop_code; bool has_ple_pattern = false; + bool has_sibling_loops = false; }; // Detects Prologue-Loop-Epilogue Code pattern in the task. static PLEPattern detectPLEPattern(affine::AffineForOp outer_loop) { PLEPattern pattern; pattern.has_ple_pattern = false; + pattern.has_sibling_loops = false; pattern.outer_loop = outer_loop; Block &body = outer_loop.getRegion().front(); - bool found_nested_loop = false; + SmallVector current_segment; for (Operation &op : body.getOperations()) { if (auto nested_for = dyn_cast(&op)) { - found_nested_loop = true; - if (!pattern.inner_loop) { - pattern.inner_loop = nested_for; - } - } else if (!(isa(&op) && op.getOperands().empty())) { - if (!found_nested_loop) { - pattern.prologue_code.push_back(&op); + // Finds a nested loop. + if (pattern.inner_loops.empty()) { + // First nested loop - everything before is prologue. + if (!current_segment.empty()) { + pattern.prologue_code = current_segment; + pattern.has_ple_pattern = true; + current_segment.clear(); + } } else { - pattern.epilogue_code.push_back(&op); + // Second or later nested loop - everything before is inter-loop code. 
+ pattern.has_sibling_loops = true; pattern.has_ple_pattern = true; + + if (!current_segment.empty()) { + pattern.inter_loop_code.push_back(current_segment); + current_segment.clear(); + } else { + // No operations between loops, add empty segment + pattern.inter_loop_code.push_back({}); + } } + + pattern.inner_loops.push_back(nested_for); + + } else if (!(isa(&op) && op.getOperands().empty())) { + // Regular operation - add to current segment. + current_segment.push_back(&op); } } - if (found_nested_loop && (!pattern.prologue_code.empty())) { + // Any remaining operations after all loops are epilogue. + if (!current_segment.empty() && !pattern.inner_loops.empty()) { + pattern.epilogue_code = current_segment; + pattern.has_ple_pattern = true; + } + + // If we have sibling loops (even without prologue/epilogue), + // it's still a PLE pattern that needs special handling. + if (pattern.inner_loops.size() > 1) { + pattern.has_ple_pattern = true; + pattern.has_sibling_loops = true; + } + + // If we have prologue/epilogue with at least one loop, it's a PLE pattern. + if (!pattern.inner_loops.empty() && + (!pattern.prologue_code.empty() || !pattern.epilogue_code.empty())) { pattern.has_ple_pattern = true; } @@ -280,15 +321,30 @@ static void extractHyperblocksInfoFromRegion( current_block_ops.clear(); } - // 2. Creates a hyperblock for the prologue + inner loop + epilogue. + // 2. Creates a hyperblock for: + // // prologue + loop1 + inter_code1 + loop2 + inter_code2 + ... + + // epilogue. HyperblockInfo info; + + // Adds prologue code. if (!ple_pattern.prologue_code.empty()) { info.operations.append(ple_pattern.prologue_code.begin(), ple_pattern.prologue_code.end()); } - info.operations.push_back(ple_pattern.inner_loop); + // Adds loops and inter-loop code. + for (size_t i = 0; i < ple_pattern.inner_loops.size(); ++i) { + info.operations.push_back(ple_pattern.inner_loops[i]); + // Adds inter-loop code if exists. 
+ if (i < ple_pattern.inter_loop_code.size() && + !ple_pattern.inter_loop_code[i].empty()) { + info.operations.append(ple_pattern.inter_loop_code[i].begin(), + ple_pattern.inter_loop_code[i].end()); + } + } + + // Adds epilogue code. if (!ple_pattern.epilogue_code.empty()) { info.operations.append(ple_pattern.epilogue_code.begin(), ple_pattern.epilogue_code.end()); @@ -608,6 +664,9 @@ static LogicalResult transformTask(TaskflowTaskOp task_op) { // Step 4: Creates taskflow.hyperblock operations for each hyperblock. builder.setInsertionPoint(first_loop_op); + // Stores mappings from loop results to hyperblock outputs. + DenseMap loop_result_to_hyperblock_output; + // Creates hyperblock ops. for (auto &info : hyperblocks_info) { TaskflowHyperblockOp hyperblock_op = @@ -622,25 +681,51 @@ static LogicalResult transformTask(TaskflowTaskOp task_op) { for (auto [loop_result, hb_result] : llvm::zip(loop_results, hyperblock_results)) { - loop_result.replaceAllUsesWith(hb_result); + loop_result_to_hyperblock_output[loop_result] = hb_result; } } } + // Step 5: Replaces loop results with hyperblock outputs BEFORE erasing loops. + for (auto [loop_result, hb_output] : loop_result_to_hyperblock_output) { + loop_result.replaceAllUsesWith(hb_output); + } + // Step 6: Collects and erases original loop operations. // Collects all operations to erase. SmallVector ops_to_erase; + // First pass: collects all affine.for operations recursively. + // We need to erase them in reverse order (inner loops first). + SmallVector loops_to_erase; + task_op.walk( + [&](affine::AffineForOp loop) { loops_to_erase.push_back(loop); }); + + // Reverses the order so we erase innermost loops first. + std::reverse(loops_to_erase.begin(), loops_to_erase.end()); + + // Second pass: collects non-loop, non-taskflow operations. for (Operation &op : llvm::make_early_inc_range(task_body->getOperations())) { - if (!isa(&op)) { + if (!isa(&op)) { ops_to_erase.push_back(&op); } } - // Erases original operations. 
+ // Step 7: Erases operations. + // First erases non-loop operations. for (Operation *op : ops_to_erase) { + // Makes sure all uses are replaced before erasing. + assert(op->use_empty() && "Operation still has uses before erasing"); op->erase(); } + // Then erases loops (innermost first). + for (affine::AffineForOp loop : loops_to_erase) { + // Makes sure the loop results have been replaced before erasing. + assert(loop->use_empty() && "Loop still has uses before erasing"); + loop->erase(); + } + return success(); } From ddde493732db8901be0e32e66723a2abbe64600c Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Tue, 3 Feb 2026 01:43:40 +0800 Subject: [PATCH 8/9] simplify the construct-hyperblock-from-task logic --- include/TaskflowDialect/TaskflowPasses.td | 6 +- .../ConstructHyperblockFromTaskPass.cpp | 904 ++++++------ .../irregular-loop/irregular-loop.mlir | 147 ++- 3 files changed, 360 insertions(+), 697 deletions(-) diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 7c6b5a17..ccc2a711 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -24,10 +24,10 @@ def AffineLoopTreeSerialization : Pass<"affine-loop-tree-serialization", "Module //=========================================================// // Passes for the Taskflow dialect //=========================================================// -def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp"> { - let summary = "Constructs hyperblocks and counter chain from Taskflow tasks"; +def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::FuncOp">{ + let summary = "Constructs hyperblocks from Taskflow tasks by detecting perfect nested loop bands"; let description = [{ - This pass constructs hyperblocks and counter chain from Taskflow tasks. + This pass constructs hyperblocks from Taskflow tasks by detecting perfect nested loop bands. 
}]; let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; } diff --git a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp index 41743149..acc58fa2 100644 --- a/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/ConstructHyperblockFromTaskPass.cpp @@ -8,720 +8,385 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/IRMapping.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/Value.h" #include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/LogicalResult.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include - +#include using namespace mlir; using namespace mlir::taskflow; namespace { -//--------------------------------------------------------------------------- -// Loop Info Structure. -//---------------------------------------------------------------------------- -struct LoopInfo { - affine::AffineForOp for_op; - int lower_bound; - int upper_bound; - int step; - - // For nested loops. - LoopInfo *parent_loop_info = nullptr; - SmallVector child_loops; - - // Generated counter index. - Value counter_index; -}; - -//--------------------------------------------------------------------------- -// Hyperblock Info Structure. -//---------------------------------------------------------------------------- -// Represents a code block that should become a hyperblock. -struct HyperblockInfo { - // The operations that belong to this hyperblock. - SmallVector operations; - // The counter indices that trigger this hyperblock (empty for top-level - // operations before any loops). 
- SmallVector trigger_indices; +//============================================================================== +// Perfect Loop Band Detection. +//============================================================================== - // Whether this hyperblock is nested within loops. - bool is_loop_body = false; +// A perfect loop band is a sequence of perfectly nested loops where each loop +// (except the innermost) has exactly one child loop and no other operations +// (no prologue/epilogue). +struct PerfectLoopBand { + // Outer to inner loop order. + SmallVector loops; - // The corresponding loop. - affine::AffineForOp loop_op = nullptr; - - // Marks if this hyperblock follows the PLE pattern. - bool is_ple_pattern = false; + bool isEmpty() const { return loops.empty(); } + size_t getDepth() const { return loops.size(); } }; -//---------------------------------------------------------------------------- -// Helper Functions. -//---------------------------------------------------------------------------- -// Extracts loop parameters from affine.for operation. -static std::optional extractLoopBound(affine::AffineForOp for_op) { - LoopInfo loop_info; - loop_info.for_op = for_op; - - // Gets lower bound. - if (for_op.hasConstantLowerBound()) { - loop_info.lower_bound = for_op.getConstantLowerBound(); - } else { - return std::nullopt; - } - - // Gets upper bound. - if (for_op.hasConstantUpperBound()) { - loop_info.upper_bound = for_op.getConstantUpperBound(); - } else { - return std::nullopt; - } - - // Gets step. - loop_info.step = for_op.getStepAsInt(); - - return loop_info; -} - -// Collects all affine.for loops and builds loop hierarchy. -static SmallVector collectLoopInfo(TaskflowTaskOp task_op) { - SmallVector loops_info; - DenseMap op_to_loopinfo; - - // Step 1: Collects all loops with its parameter. 
- task_op.walk([&](affine::AffineForOp for_op) { - auto info = extractLoopBound(for_op); - if (!info) { - assert(false && "Non-constant loop bounds are not supported."); - } - - loops_info.push_back(*info); - op_to_loopinfo[for_op.getOperation()] = &loops_info.back(); - }); - - // Step 2: Builds parent-child relationships among loops. - for (auto &loop_info : loops_info) { - Operation *parent_op = loop_info.for_op->getParentOp(); - if (auto parent_for = dyn_cast(parent_op)) { - if (op_to_loopinfo.count(parent_for.getOperation())) { - LoopInfo *parent_loop_info = op_to_loopinfo[parent_for.getOperation()]; - loop_info.parent_loop_info = parent_loop_info; - parent_loop_info->child_loops.push_back(&loop_info); +// Detects the maximal perfect loop band starting from the given loop. +// Returns the sequence of perfectly nested loops. +static PerfectLoopBand detectPerfectLoopBand(affine::AffineForOp start_loop) { + PerfectLoopBand band; + affine::AffineForOp current_loop = start_loop; + + while (current_loop) { + band.loops.push_back(current_loop); + + // Checks the body of current loop. + Block &body = current_loop.getRegion().front(); + + // Counts non-trivial operations (excluding yield). + affine::AffineForOp nested_loop = nullptr; + size_t num_loops = 0; + size_t num_other_ops = 0; + + for (Operation &op : body) { + if (auto for_op = dyn_cast(&op)) { + nested_loop = for_op; + num_loops++; + } else if (!(isa(&op) && + op.getNumOperands() == 0)) { + num_other_ops++; } } - } - - return loops_info; -} -//---------------------------------------------------------------------------- -// Counter Chain Creation. -//---------------------------------------------------------------------------- -// Recursively creates counter chain for each top-level loop. -static void createCounterChainRecursivly(OpBuilder &builder, Location loc, - LoopInfo *loop_info, - Value parent_counter) { - // Creates counter for this loop. 
- Value counter_index; - if (parent_counter) { - // Nested counter. - auto counter_op = builder.create( - loc, builder.getIndexType(), parent_counter, - builder.getIndexAttr(loop_info->lower_bound), - builder.getIndexAttr(loop_info->upper_bound), - builder.getIndexAttr(loop_info->step), - /*Counter Type*/ nullptr, /*Counter ID*/ nullptr); - counter_index = counter_op.getCounterIndex(); - } else { - // Top-level counter. - auto counter_op = builder.create( - loc, builder.getIndexType(), /*parent_index=*/nullptr, - builder.getIndexAttr(loop_info->lower_bound), - builder.getIndexAttr(loop_info->upper_bound), - builder.getIndexAttr(loop_info->step), - /*Counter Type*/ nullptr, /*Counter ID*/ nullptr); - counter_index = counter_op.getCounterIndex(); - } - - loop_info->counter_index = counter_index; - - // Recursively creates counters for child loops. - for (LoopInfo *child : loop_info->child_loops) { - createCounterChainRecursivly(builder, loc, child, counter_index); + // Perfect nesting condition: exactly 1 nested loop, no other operations. + if (num_loops == 1 && num_other_ops == 0) { + // Continues to next level. + current_loop = nested_loop; + } else { + break; // Not perfect anymore. + } } -} -// Creates counter chain for all top-level loops. -static void createCounterChain(OpBuilder &builder, Location loc, - SmallVector &top_level_loops_info) { - for (LoopInfo *loop_info : top_level_loops_info) { - createCounterChainRecursivly(builder, loc, loop_info, nullptr); - } + return band; } -// Gets top-level loops' info (loops without parents). -static SmallVector -getTopLevelLoopsInfo(SmallVector &loops_info) { - SmallVector top_level_loops_info; - for (auto &loop_info : loops_info) { - if (!loop_info.parent_loop_info) { - top_level_loops_info.push_back(&loop_info); - } - } - return top_level_loops_info; -} +//============================================================================== +// Counter Creation. 
+//============================================================================== -//---------------------------------------------------------------------------- -// Prologue-Loop-Epilogue Code (PLE) Pattern Detection -//---------------------------------------------------------------------------- -// Extended PLE Pattern: Also handles sibling loops without prologue/epilogue. -// Pattern 1: Prologue/Epilogue exists -// for %i { -// -// for %j { } -// -// } -// -// Pattern 2: Sibling loops (no prologue/epilogue) -// for %i { -// for %j { } -// for %k { } ← Sibling loop -// } -// For this pattern, we need to wrap the inner loop and the prologue-epilogue -// code into a hyperblock. Only by doing this can we maintain the hyperblock as -// a pure data-driven code block. -struct PLEPattern { - affine::AffineForOp outer_loop; - // Supports multiple sibling loops. - SmallVector inner_loops; - - SmallVector prologue_code; - SmallVector epilogue_code; - // Code between sibling loops. - SmallVector> inter_loop_code; - - bool has_ple_pattern = false; - bool has_sibling_loops = false; +struct CounterInfo { + affine::AffineForOp loop; + // The index value from taskflow.counter + Value counter_index; }; -// Detects Prologue-Loop-Epilogue Code pattern in the task. -static PLEPattern detectPLEPattern(affine::AffineForOp outer_loop) { - PLEPattern pattern; - pattern.has_ple_pattern = false; - pattern.has_sibling_loops = false; - pattern.outer_loop = outer_loop; - - Block &body = outer_loop.getRegion().front(); - SmallVector current_segment; - - for (Operation &op : body.getOperations()) { - if (auto nested_for = dyn_cast(&op)) { - // Finds a nested loop. - if (pattern.inner_loops.empty()) { - // First nested loop - everything before is prologue. - if (!current_segment.empty()) { - pattern.prologue_code = current_segment; - pattern.has_ple_pattern = true; - current_segment.clear(); - } - } else { - // Second or later nested loop - everything before is inter-loop code. 
-        pattern.has_sibling_loops = true;
-        pattern.has_ple_pattern = true;
-
-        if (!current_segment.empty()) {
-          pattern.inter_loop_code.push_back(current_segment);
-          current_segment.clear();
-        } else {
-          // No operations between loops, add empty segment
-          pattern.inter_loop_code.push_back({});
-        }
-      }
-
-      pattern.inner_loops.push_back(nested_for);
-
-    } else if (!(isa(&op) && op.getOperands().empty())) {
-      // Regular operation - add to current segment.
-      current_segment.push_back(&op);
+// Creates a chain of taskflow.counter operations for a perfect loop band,
+// one counter per loop level, each parented on the previous level's index.
+// The result is aligned 1:1 with band.loops; callers index both in lockstep.
+static SmallVector
+createCounterChain(OpBuilder &builder, Location loc,
+                   const PerfectLoopBand &band) {
+  SmallVector counters;
+  Value parent_counter = nullptr;
+  for (affine::AffineForOp loop : band.loops) {
+    CounterInfo info;
+    info.loop = loop;
+    // Gets constant loop bounds; the counter op attributes require constants.
+    int32_t lb = 0, ub = 0, step = 0;
+    if (loop.hasConstantLowerBound() && loop.hasConstantUpperBound()) {
+      lb = loop.getConstantLowerBound();
+      ub = loop.getConstantUpperBound();
+      step = loop.getStepAsInt();
+    } else {
+      // Hard failure instead of `continue`: silently skipping a level would
+      // desynchronize `counters` from `band.loops` and break the indexing.
+      assert(false && "Non-constant loop bounds are not supported.");
+    }
-  }
-
-  // Any remaining operations after all loops are epilogue.
-  if (!current_segment.empty() && !pattern.inner_loops.empty()) {
-    pattern.epilogue_code = current_segment;
-    pattern.has_ple_pattern = true;
-  }
-
-  // If we have sibling loops (even without prologue/epilogue),
-  // it's still a PLE pattern that needs special handling.
-  if (pattern.inner_loops.size() > 1) {
-    pattern.has_ple_pattern = true;
-    pattern.has_sibling_loops = true;
-  }
-
-  // If we have prologue/epilogue with at least one loop, it's a PLE pattern. 
- if (!pattern.inner_loops.empty() && - (!pattern.prologue_code.empty() || !pattern.epilogue_code.empty())) { - pattern.has_ple_pattern = true; - } - - return pattern; -} - -//---------------------------------------------------------------------------- -// Hyperblock Creation -//---------------------------------------------------------------------------- -// Recursively extracts hyperblocks from a region. -// Key insight: Operations in a loop body that are used by nested loops -// should be inlined into the nested loop's hyperblock. -static void extractHyperblocksInfoFromRegion( - Region ®ion, - const DenseMap &loop_info_map, - SmallVector parent_indices, - SmallVector &hyperblocks_info, - affine::AffineForOp enclosing_loop = nullptr, - SmallVector inherited_ops = {}) { - Block &block = region.front(); - SmallVector current_block_ops; - - current_block_ops.append(inherited_ops.begin(), inherited_ops.end()); - - for (Operation &op : block.getOperations()) { - if (auto for_op = dyn_cast(&op)) { - - PLEPattern ple_pattern = detectPLEPattern(for_op); - - // Gets the loop info. - LoopInfo *loop_info = loop_info_map.lookup(for_op); - assert(loop_info && "Loop not found in loop_info_map"); - - // Builds trigger indices for this loop (parent indices + this loop's - // index). - SmallVector loop_indices = parent_indices; - loop_indices.push_back(loop_info->counter_index); - - // Handles the PLE pattern. - if (ple_pattern.has_ple_pattern) { - // 1. Emits any accumulated operations as a hyperblock. - if (!current_block_ops.empty()) { - HyperblockInfo info; - info.operations = current_block_ops; - info.trigger_indices = parent_indices; - info.is_loop_body = !parent_indices.empty(); - info.loop_op = enclosing_loop; - hyperblocks_info.push_back(info); - current_block_ops.clear(); - } - - // 2. Creates a hyperblock for: - // // prologue + loop1 + inter_code1 + loop2 + inter_code2 + ... + - // epilogue. - HyperblockInfo info; - // Adds prologue code. 
- if (!ple_pattern.prologue_code.empty()) { - info.operations.append(ple_pattern.prologue_code.begin(), - ple_pattern.prologue_code.end()); - } - - // Adds loops and inter-loop code. - for (size_t i = 0; i < ple_pattern.inner_loops.size(); ++i) { - info.operations.push_back(ple_pattern.inner_loops[i]); - - // Adds inter-loop code if exists. - if (i < ple_pattern.inter_loop_code.size() && - !ple_pattern.inter_loop_code[i].empty()) { - info.operations.append(ple_pattern.inter_loop_code[i].begin(), - ple_pattern.inter_loop_code[i].end()); - } - } - - // Adds epilogue code. - if (!ple_pattern.epilogue_code.empty()) { - info.operations.append(ple_pattern.epilogue_code.begin(), - ple_pattern.epilogue_code.end()); - } - - info.trigger_indices = loop_indices; - info.is_loop_body = true; - info.loop_op = for_op; - info.is_ple_pattern = true; - hyperblocks_info.push_back(info); - - // No need for further processing of this loop. Since we have already - // handled the whole for_op. - current_block_ops.clear(); - continue; - } - - // Analyzes which of the current_ops are used by this loop. - DenseSet values_used_in_loop; - for_op.walk([&](Operation *nested_op) { - for (Value operand : nested_op->getOperands()) { - values_used_in_loop.insert(operand); - } - }); - - SmallVector ops_for_nested_loop; - SmallVector ops_not_used; - bool used_by_loop = false; - for (Operation *current_op : current_block_ops) { - for (Value result : current_op->getResults()) { - if (values_used_in_loop.contains(result)) { - used_by_loop = true; - break; - } - } - } - if (used_by_loop) { - ops_for_nested_loop.append(current_block_ops.begin(), - current_block_ops.end()); - } else { - ops_not_used.append(current_block_ops.begin(), current_block_ops.end()); - } - - // Before processing the loop, emits any accumulated operations as a - // hyperblock. 
- if (!ops_not_used.empty()) { - HyperblockInfo info; - info.operations = ops_not_used; - info.trigger_indices = parent_indices; - info.is_loop_body = !parent_indices.empty(); - info.loop_op = enclosing_loop; - hyperblocks_info.push_back(info); - } - - // Recursively extracts hyperblocks from the loop body. - extractHyperblocksInfoFromRegion(for_op.getRegion(), loop_info_map, - loop_indices, hyperblocks_info, for_op, - ops_for_nested_loop); - current_block_ops.clear(); - } else if (isa(&op) || - (isa(&op) && op.getOperands().empty())) { - // Skips TaskflowYieldOp, TaskflowCounterOp, and empty affine.yield. - continue; + // Creates counter. + if (parent_counter) { + // Creates nested counter with parent. + TaskflowCounterOp counter_op = builder.create( + loc, + /*counter_index*/ builder.getIndexType(), + /*parent_index*/ parent_counter, + /*lower_bound*/ builder.getIndexAttr(lb), + /*upper_bound*/ builder.getIndexAttr(ub), + /*step*/ builder.getIndexAttr(step), + /*counter_type*/ nullptr, + /*counter_id*/ nullptr); + info.counter_index = counter_op.getCounterIndex(); } else { - // Regular operation, accumulates it. - current_block_ops.push_back(&op); + // Creates the top-level counter (no parent). + TaskflowCounterOp counter_op = builder.create( + loc, + /*counter_index*/ builder.getIndexType(), + /*parent_index*/ nullptr, + /*lower_bound*/ builder.getIndexAttr(lb), + /*upper_bound*/ builder.getIndexAttr(ub), + /*step*/ builder.getIndexAttr(step), + /*counter_type*/ nullptr, + /*counter_id*/ nullptr); + info.counter_index = counter_op.getCounterIndex(); } - } - // Emits any remaining operations as a hyperblock. 
- if (!current_block_ops.empty()) { - HyperblockInfo info; - info.operations = current_block_ops; - info.trigger_indices = parent_indices; - info.is_loop_body = !parent_indices.empty(); - info.loop_op = enclosing_loop; - hyperblocks_info.push_back(info); - current_block_ops.clear(); + parent_counter = info.counter_index; + counters.push_back(info); } -} -// Extracts all hyperblocks from a task. -static SmallVector extractHyperblocksInfo( - TaskflowTaskOp task_op, - const DenseMap &loop_info_map) { - SmallVector hyperblocks_info; - // No parent indices for top-level hyperblocks (Not nested in a loop). - SmallVector empty_indices; + return counters; +} - extractHyperblocksInfoFromRegion(task_op.getBody(), loop_info_map, - empty_indices, hyperblocks_info); +//============================================================================== +// Hyperblock Creation. +//============================================================================== - return hyperblocks_info; -} +// Analyzes which loop induction variables are actually used in the loop body. +// Returns indices of loops whose induction variables are used. +static SmallVector analyzeUsedLoopIndices(const PerfectLoopBand &band) { + SmallVector used_indices; -// Collects all indices that are actually used by operations in the hyperblock. -static SmallVector collectUsedIndices( - const SmallVector &operations, - const SmallVector &candidate_indices, - const DenseMap &loop_info_map) { - // Builds reverse mapping: counter -> induction variable. - DenseMap counter_to_indvar; - for (auto [loop_op, loop_info] : loop_info_map) { - counter_to_indvar[loop_info->counter_index] = loop_op.getInductionVar(); - } + // Gets the deepest perfect loop's body. + affine::AffineForOp deepest_loop = band.loops.back(); + Block &body = deepest_loop.getRegion().front(); - // Collects all values used by operations. - SetVector used_indvars_set; - for (Operation *op : operations) { + // Collects all values used in the body. 
+ DenseSet used_values; + body.walk([&](Operation *op) { for (Value operand : op->getOperands()) { - used_indvars_set.insert(operand); + used_values.insert(operand); } - } + }); - // Returns in the same order as candidate_indices to maintain parent->child - // order. - SmallVector used_counters; - for (Value counter : candidate_indices) { - if (counter_to_indvar.count(counter)) { - Value indvar = counter_to_indvar[counter]; - if (used_indvars_set.contains(indvar)) { - used_counters.push_back(counter); - } + // Checks which loop induction variables are used. + for (size_t i = 0; i < band.loops.size(); ++i) { + affine::AffineForOp loop = band.loops[i]; + Value induction_var = loop.getInductionVar(); + if (used_values.contains(induction_var)) { + used_indices.push_back(i); } } - return used_counters; + return used_indices; } -// Determines output types for the hyperblock based on operations. -static SmallVector -determineHyperblockOutputTypes(const SmallVector &operations) { - SmallVector output_types = {}; - - // Checks if there's an affine.yield operation. - for (Operation *op : operations) { - if (auto affine_yield = dyn_cast(op)) { - // Uses the operand types of affine.yield as output types. - for (Value operand : affine_yield.getOperands()) { - output_types.push_back(operand.getType()); - } - return output_types; +// Clones the body of the deepest perfect loop in the perfect band into a +// hyperblock. Handles iter_args (reduction variables) by: +// 1. Adding iter_args initial values as hyperblock inputs +// 2. Mapping iter_args to hyperblock block arguments +// 3. Returning reduction results as hyperblock outputs +static TaskflowHyperblockOp +createHyperblockFromLoopBody(OpBuilder &builder, Location loc, + const PerfectLoopBand &band, + const SmallVector &counters) { + // Gets the deepest perfect loop in the perfect nested band. 
+ affine::AffineForOp deepest_perfect_loop = band.loops.back(); + Block &loop_body = deepest_perfect_loop.getRegion().front(); + + // Analyzes which loop indices are actually used. + SmallVector used_loop_indices = analyzeUsedLoopIndices(band); + + // Checks if the deepest loop has iter_args (reduction variables). + bool has_iter_args = deepest_perfect_loop.getNumIterOperands() > 0; + SmallVector iter_args_init_values = {}; + SmallVector iter_args_types = {}; + + if (has_iter_args) { + for (Value init_val : deepest_perfect_loop.getInits()) { + iter_args_init_values.push_back(init_val); + iter_args_types.push_back(init_val.getType()); } } - // No affine.yield found, no output types needed. - return output_types; -} - -// Creates a taskflow.hyperblock operation from HyperblockInfo. -static TaskflowHyperblockOp createHyperblock( - OpBuilder &builder, Location loc, HyperblockInfo &info, Block *task_body, - const DenseMap &loop_info_map) { - // Collects only the indices that are actually used in the hyperblock. - SmallVector used_indices = - collectUsedIndices(info.operations, info.trigger_indices, loop_info_map); - - // Determines output types for the hyperblock based on operations. - SmallVector output_types = - determineHyperblockOutputTypes(info.operations); - - // Checks if there is a reduction in the hyperblock (with iter_args). - SmallVector iter_args_init_values; - bool is_reduction = false; - if (info.loop_op && info.loop_op.getNumIterOperands() > 0) { - is_reduction = true; - for (Value init : info.loop_op.getInits()) { - iter_args_init_values.push_back(init); - } + // Builds trigger values (only for USED counter indices) + SmallVector trigger_values; + for (size_t idx : used_loop_indices) { + trigger_values.push_back(counters[idx].counter_index); } - // Creates the hyperblock operation. 
- TaskflowHyperblockOp hyperblock_op; - if (is_reduction) { - hyperblock_op = builder.create( - loc, output_types, used_indices, iter_args_init_values); - } else { - hyperblock_op = builder.create( - loc, output_types, used_indices, /*iter_args=*/ValueRange{}); + + // Determines hyperblock result types (from iter_args if present). + SmallVector result_types = {}; + if (has_iter_args) { + result_types = iter_args_types; } - Block *hyperblock_body = new Block(); - hyperblock_op.getBody().push_back(hyperblock_body); + // Creates hyperblock operation with iter_args as inputs. + auto hyperblock_op = builder.create( + loc, result_types, trigger_values, iter_args_init_values); - // Adds block arguments for the used indices. - for (Value idx : used_indices) { - hyperblock_body->addArgument(idx.getType(), loc); + // Builds block arguments: + // 1. Counter indices (only for USED loop levels). + // 2. Iter args values (passed through hyperblock invocation). + SmallVector arg_types; + SmallVector arg_locs; + + // Adds counter index arguments (only for used indices). + for (size_t i = 0; i < used_loop_indices.size(); ++i) { + arg_types.push_back(builder.getIndexType()); + arg_locs.push_back(loc); } - SmallVector iter_args_block_args; - if (is_reduction) { - for (Value init : iter_args_init_values) { - BlockArgument arg = hyperblock_body->addArgument(init.getType(), loc); - iter_args_block_args.push_back(arg); + // Adds iter_args as hyperblock block arguments. + if (has_iter_args) { + for (Type ty : iter_args_types) { + arg_types.push_back(ty); + arg_locs.push_back(loc); } } - // Clone operations into the hyperblock body. 
- OpBuilder hyperblock_builder(hyperblock_body, hyperblock_body->begin()); - IRMapping mapping; + Block *hyperblock_body = &hyperblock_op.getBody().emplaceBlock(); + hyperblock_body->addArguments(arg_types, arg_locs); - // Maps used indices to block arguments - for (auto [idx, arg] : - llvm::zip(used_indices, hyperblock_body->getArguments())) { - mapping.map(idx, arg); - } + OpBuilder body_builder = OpBuilder::atBlockBegin(hyperblock_body); + IRMapping mapper; - // Creates a mapping from loop counters to loop induction variables. - DenseMap counter_to_indvar; - for (auto [loop_op, loop_info] : loop_info_map) { - counter_to_indvar[loop_info->counter_index] = loop_op.getInductionVar(); + // Maps USED loop induction variables to hyperblock arguments. + for (size_t i = 0; i < used_loop_indices.size(); ++i) { + size_t loop_idx = used_loop_indices[i]; + affine::AffineForOp loop = band.loops[loop_idx]; + mapper.map(loop.getInductionVar(), hyperblock_body->getArgument(i)); } - // Maps loop induction variables to hyperblock block arguments. - for (auto [idx, arg] : - llvm::zip(used_indices, hyperblock_body->getArguments())) { - if (counter_to_indvar.count(idx)) { - Value indvar = counter_to_indvar[idx]; - mapping.map(indvar, arg); + // Maps iter_args to hyperblock block arguments (after counter indices). + if (has_iter_args) { + for (size_t i = 0; i < iter_args_types.size(); ++i) { + size_t arg_idx = used_loop_indices.size() + i; + mapper.map(deepest_perfect_loop.getRegionIterArgs()[i], + hyperblock_body->getArgument(arg_idx)); } } - // If this hyperblock comes from a loop with iter_args, maps them. - if (is_reduction) { - Block &loop_body = info.loop_op.getRegion().front(); - auto loop_iter_args = loop_body.getArguments().drop_front(1); + // Clones all operations from the deepest perfect loop's body. 
+ SmallVector yield_operands; - for (auto [loop_iter_arg, hb_iter_arg] : - llvm::zip(loop_iter_args, iter_args_block_args)) { - mapping.map(loop_iter_arg, hb_iter_arg); + for (Operation &op : loop_body) { + // Handles affine.yield with operands (reduction results). + if (auto yield_op = dyn_cast(&op)) { + if (yield_op.getNumOperands() > 0) { + // Maps the yielded values for hyperblock's return. + for (Value yielded : yield_op.getOperands()) { + Value mapped = mapper.lookupOrDefault(yielded); + yield_operands.push_back(mapped); + } + } + continue; // Skips the yield itself. } - } - // Clones all operations and handle terminators. - bool has_terminator = false; - for (Operation *op : info.operations) { - // Handles affine.yield specially - convert to hyperblock.yield. - if (auto affine_yield = dyn_cast(op)) { - // Maps the yield operands through the IRMapping. - SmallVector yield_operands; - for (Value operand : affine_yield.getOperands()) { - Value mapped_operand = mapping.lookupOrDefault(operand); - yield_operands.push_back(mapped_operand); - } + // Clones operation (including nested affine.for with iter_args). + Operation *cloned = body_builder.clone(op, mapper); - // Creates hyperblock.yield with the mapped operands. - hyperblock_builder.create(loc, yield_operands, - yield_operands); - has_terminator = true; - continue; + // Updates mapper with cloned operation results. + for (size_t i = 0; i < op.getNumResults(); ++i) { + mapper.map(op.getResult(i), cloned->getResult(i)); } - - // Clones regular operations. - hyperblock_builder.clone(*op, mapping); } - // Adds terminator if the last operation wasn't already a yield. - if (!has_terminator) { - hyperblock_builder.setInsertionPointToEnd(hyperblock_body); - hyperblock_builder.create(loc); + // Adds terminator with reduction results (if any). 
+ if (has_iter_args) { + body_builder.create( + loc, + /*iter_args_next=*/yield_operands, // No iter_args_next for final + // iteration + /*results=*/yield_operands); // Reduction results + } else { + body_builder.create(loc); } + // Converts affine operations to standard/scf operations. MLIRContext *context = hyperblock_op.getContext(); RewritePatternSet patterns(context); populateAffineToStdConversionPatterns(patterns); + ConversionTarget target(*context); target.addLegalDialect(); + func::FuncDialect, TaskflowDialect, scf::SCFDialect>(); target.addIllegalOp(); + affine::AffineForOp, affine::AffineIfOp, + affine::AffineYieldOp>(); + if (failed( applyPartialConversion(hyperblock_op, target, std::move(patterns)))) { - assert(false && "Affine to Standard conversion failed."); + llvm::errs() + << "Error: Failed to convert affine operations to standard/scf\n"; + return nullptr; } return hyperblock_op; } -//---------------------------------------------------------------------------- -// Task Transformation -//---------------------------------------------------------------------------- -// The main transformation function for TaskflowTaskOp. +//============================================================================ +// Task Transformation. +//=========================================================================== static LogicalResult transformTask(TaskflowTaskOp task_op) { Location loc = task_op.getLoc(); + Block &task_body = task_op.getBody().front(); - // Step 1: Collects loop information. - DenseMap loop_info_map; - SmallVector loops_info = collectLoopInfo(task_op); - for (auto &loop_info : loops_info) { - loop_info_map[loop_info.for_op] = &loop_info; - } - - // Gets the body block of the task. - Block *task_body = &task_op.getBody().front(); - - // Finds the first loop in the task body. - affine::AffineForOp first_loop_op = nullptr; - for (Operation &op : task_body->getOperations()) { + // Finds all top-level loops in the task. 
+ SmallVector top_level_loops; + for (Operation &op : task_body) { if (auto for_op = dyn_cast(&op)) { - first_loop_op = for_op; - break; + top_level_loops.push_back(for_op); } } - assert(first_loop_op && "No loops found in the task body."); + if (top_level_loops.empty()) { + llvm::errs() << "No loops found in task " << task_op.getTaskName() << "\n"; + return success(); + } + + assert(top_level_loops.size() == 1 && + "Expected exactly one top-level loop in each task."); + + OpBuilder builder(&task_body, task_body.begin()); - // Step 2: Creates counter chain before the first loop. - OpBuilder builder(first_loop_op); - SmallVector top_level_loops_info = - getTopLevelLoopsInfo(loops_info); - createCounterChain(builder, loc, top_level_loops_info); + // Stores mapping from loop results to hyperblock results. + DenseMap loop_result_to_hyperblock_result; - // Step 3: Extracts hyperblocks from task. - SmallVector hyperblocks_info = - extractHyperblocksInfo(task_op, loop_info_map); + // Processes each top-level loop. + for (affine::AffineForOp top_loop : top_level_loops) { + llvm::errs() << "\n[ConstructHyperblock] Processing top-level loop\n"; - // Step 4: Creates taskflow.hyperblock operations for each hyperblock. - builder.setInsertionPoint(first_loop_op); + // Step 1: Detects maximal perfect loop band. + PerfectLoopBand band = detectPerfectLoopBand(top_loop); + llvm::errs() << " Detected perfect loop band of depth " << band.getDepth() + << "\n"; - // Stores mappings from loop results to hyperblock outputs. - DenseMap loop_result_to_hyperblock_output; + // Step 2: Creates counter chain for the perfect band. + builder.setInsertionPoint(top_loop); + SmallVector counters = createCounterChain(builder, loc, band); + llvm::errs() << " Created " << counters.size() << " counters\n"; - // Creates hyperblock ops. - for (auto &info : hyperblocks_info) { + // Step 3: Creates hyperblock from deepest loop's body. 
TaskflowHyperblockOp hyperblock_op = - createHyperblock(builder, loc, info, task_body, loop_info_map); - - // If this hyperblock has outputs and belongs to a loop with iter_args, - // replace the loop results with the hyperblock outputs. - if (info.loop_op && info.loop_op.getNumResults() > 0 && - (hyperblock_op.getNumResults() == info.loop_op.getNumResults())) { - auto loop_results = info.loop_op.getResults(); - auto hyperblock_results = hyperblock_op.getOutputs(); - - for (auto [loop_result, hb_result] : - llvm::zip(loop_results, hyperblock_results)) { - loop_result_to_hyperblock_output[loop_result] = hb_result; - } - } - } + createHyperblockFromLoopBody(builder, loc, band, counters); + llvm::errs() << " Created hyperblock with " + << hyperblock_op.getBody().front().getOperations().size() + << " operations\n"; - // Step 5: Replaces loop results with hyperblock outputs BEFORE erasing loops. - for (auto [loop_result, hb_output] : loop_result_to_hyperblock_output) { - loop_result.replaceAllUsesWith(hb_output); - } + assert(hyperblock_op && "Hyperblock creation failed"); - // Step 6: Collects and erases original loop operations. - // Collects all operations to erase. - SmallVector ops_to_erase; - // First pass: collects all affine.for operations recursively. - // We need to erase them in reverse order (inner loops first). - SmallVector loops_to_erase; - task_op.walk( - [&](affine::AffineForOp loop) { loops_to_erase.push_back(loop); }); - - // Reverses the order so we erase innermost loops first. - std::reverse(loops_to_erase.begin(), loops_to_erase.end()); - - // Second pass: collects non-loop, non-taskflow operations. - for (Operation &op : llvm::make_early_inc_range(task_body->getOperations())) { - if (!isa(&op)) { - ops_to_erase.push_back(&op); + // If the loop has results (iter_args), map them to hyperblock results. 
+ if (top_loop.getNumResults() > 0) { + llvm::errs() << " Mapping " << top_loop.getNumResults() + << " loop results to hyperblock outputs\n"; + + for (size_t i = 0; i < top_loop.getNumResults(); ++i) { + loop_result_to_hyperblock_result[top_loop.getResult(i)] = + hyperblock_op.getResult(i); + } } } - // Step 7: Erases operations. - // First erases non-loop operations. - for (Operation *op : ops_to_erase) { - // Makes sure all uses are replaced before erasing. - assert(op->use_empty() && "Operation still has uses before erasing"); - op->erase(); + // Replaces loop results with hyperblock results BEFORE erasing loops. + for (auto [loop_result, hb_result] : loop_result_to_hyperblock_result) { + loop_result.replaceAllUsesWith(hb_result); } - // Then erases loops (innermost first). - for (affine::AffineForOp loop : loops_to_erase) { - // Makes sure the loop results have been replaced before erasing. + // Step 4: Erases all original loops. + for (affine::AffineForOp loop : top_level_loops) { + // Ensures no uses remain. assert(loop->use_empty() && "Loop still has uses before erasing"); loop->erase(); } @@ -739,7 +404,8 @@ struct ConstructHyperblockFromTaskPass } StringRef getDescription() const final { - return "Constructs hyperblocks and counter chains from Taskflow tasks."; + return "Constructs hyperblocks from taskflow tasks by detecting perfect " + "nested loop bands."; } void getDependentDialects(DialectRegistry ®istry) const override { @@ -750,17 +416,15 @@ struct ConstructHyperblockFromTaskPass void runOnOperation() override { func::FuncOp func_op = getOperation(); - // Collects all tasks. - SmallVector tasks; - func_op.walk([&](TaskflowTaskOp task_op) { tasks.push_back(task_op); }); - // Transforms each task. - for (TaskflowTaskOp task_op : tasks) { + // Walks through all TaskflowTaskOp in the function. 
+ func_op.walk([&](TaskflowTaskOp task_op) { if (failed(transformTask(task_op))) { signalPassFailure(); - return; + return WalkResult::interrupt(); } - } + return WalkResult::advance(); + }); } }; } // namespace diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 906bc267..15070981 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -140,77 +140,76 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// HYPERBLOCK: module { -// HYPERBLOCK-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { -// HYPERBLOCK-NEXT: %c2_i32 = arith.constant 2 : i32 -// HYPERBLOCK-NEXT: %c8_i32 = arith.constant 8 : i32 -// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 -// HYPERBLOCK-NEXT: %alloca = memref.alloca() : memref -// HYPERBLOCK-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> -// HYPERBLOCK-NEXT: %value_outputs = taskflow.task @Task_0 value_inputs(%c0_i32 : i32) : (i32) -> (i32) { -// HYPERBLOCK-NEXT: ^bb0(%arg0: i32): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index -// HYPERBLOCK-NEXT: %2 = "taskflow.hyperblock"(%1, %arg0) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg1: index, %arg2: i32): -// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg1 : index to i32 -// HYPERBLOCK-NEXT: %4 = arith.addi %arg2, %3 : i32 -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) -// HYPERBLOCK-NEXT: }) : (index, i32) -> i32 -// HYPERBLOCK-NEXT: taskflow.yield values(%2 : i32) -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { -// 
HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ -// HYPERBLOCK-NEXT: ^bb0(%arg2: index): -// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg2 : index to i32 -// HYPERBLOCK-NEXT: %4 = arith.muli %3, %arg1 : i32 -// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index -// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index -// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index -// HYPERBLOCK-NEXT: scf.for %arg3 = %c0 to %c8 step %c1 { -// HYPERBLOCK-NEXT: %5 = arith.index_cast %arg3 : index to i32 -// HYPERBLOCK-NEXT: %6 = arith.addi %4, %5 : i32 -// HYPERBLOCK-NEXT: memref.store %6, %arg0[%arg2, %arg3] : memref<4x8xi32> -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0), original_write_memrefs(%alloca)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { -// HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): -// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index -// HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index -// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ -// 
HYPERBLOCK-NEXT: ^bb0(%arg5: index): -// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg5 : index to i32 -// HYPERBLOCK-NEXT: %4 = arith.muli %3, %arg2 : i32 -// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index -// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index -// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index -// HYPERBLOCK-NEXT: scf.for %arg6 = %c0 to %c8 step %c1 { -// HYPERBLOCK-NEXT: %5 = memref.load %arg0[%arg5, %arg6] : memref<4x8xi32> -// HYPERBLOCK-NEXT: %6 = arith.addi %5, %arg3 : i32 -// HYPERBLOCK-NEXT: %c0_2 = arith.constant 0 : index -// HYPERBLOCK-NEXT: %c-3 = arith.constant -3 : index -// HYPERBLOCK-NEXT: %7 = arith.addi %arg5, %c-3 : index -// HYPERBLOCK-NEXT: %8 = arith.cmpi eq, %7, %c0_2 : index -// HYPERBLOCK-NEXT: %c-7 = arith.constant -7 : index -// HYPERBLOCK-NEXT: %9 = arith.addi %arg6, %c-7 : index -// HYPERBLOCK-NEXT: %10 = arith.cmpi eq, %9, %c0_2 : index -// HYPERBLOCK-NEXT: %11 = arith.andi %8, %10 : i1 -// HYPERBLOCK-NEXT: scf.if %11 { -// HYPERBLOCK-NEXT: memref.store %6, %arg1[] : memref -// HYPERBLOCK-NEXT: %12 = arith.muli %6, %arg4 : i32 -// HYPERBLOCK-NEXT: memref.store %12, %arg1[] : memref -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: taskflow.hyperblock.yield -// HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg1 : memref) -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[] : memref -// HYPERBLOCK-NEXT: return %0 : i32 -// HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT:} \ No newline at end of file +// HYPERBLOCK: module { +// HYPERBLOCK-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { +// HYPERBLOCK-NEXT: %c2_i32 = arith.constant 2 : i32 +// HYPERBLOCK-NEXT: %c8_i32 = arith.constant 8 : i32 +// HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 +// HYPERBLOCK-NEXT: %alloca = memref.alloca() : memref +// HYPERBLOCK-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> +// HYPERBLOCK-NEXT: %value_outputs = 
taskflow.task @Task_0 value_inputs(%c0_i32 : i32) : (i32) -> (i32) { +// HYPERBLOCK-NEXT: ^bb0(%arg0: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 5 : index} : index +// HYPERBLOCK-NEXT: %2 = "taskflow.hyperblock"(%1, %arg0) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg1: index, %arg2: i32): +// HYPERBLOCK-NEXT: %3 = arith.index_cast %arg1 : index to i32 +// HYPERBLOCK-NEXT: %4 = arith.addi %arg2, %3 : i32 +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%4 : i32) results(%4 : i32) +// HYPERBLOCK-NEXT: }) : (index, i32) -> i32 +// HYPERBLOCK-NEXT: taskflow.yield values(%2 : i32) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { +// HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg2: index): +// HYPERBLOCK-NEXT: %2 = arith.index_cast %arg2 : index to i32 +// HYPERBLOCK-NEXT: %3 = arith.muli %2, %arg1 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index +// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index +// HYPERBLOCK-NEXT: scf.for %arg3 = %c0 to %c8 step %c1 { +// HYPERBLOCK-NEXT: %4 = arith.index_cast %arg3 : index to i32 +// HYPERBLOCK-NEXT: %5 = arith.addi %3, %4 : i32 +// HYPERBLOCK-NEXT: memref.store %5, %arg0[%arg2, %arg3] : memref<4x8xi32> +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: 
%write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0), original_write_memrefs(%alloca)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { +// HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): +// HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index +// HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ +// HYPERBLOCK-NEXT: ^bb0(%arg5: index): +// HYPERBLOCK-NEXT: %2 = arith.index_cast %arg5 : index to i32 +// HYPERBLOCK-NEXT: %3 = arith.muli %2, %arg2 : i32 +// HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c8 = arith.constant 8 : index +// HYPERBLOCK-NEXT: %c1 = arith.constant 1 : index +// HYPERBLOCK-NEXT: scf.for %arg6 = %c0 to %c8 step %c1 { +// HYPERBLOCK-NEXT: %4 = memref.load %arg0[%arg5, %arg6] : memref<4x8xi32> +// HYPERBLOCK-NEXT: %5 = arith.addi %4, %arg3 : i32 +// HYPERBLOCK-NEXT: %c0_2 = arith.constant 0 : index +// HYPERBLOCK-NEXT: %c-3 = arith.constant -3 : index +// HYPERBLOCK-NEXT: %6 = arith.addi %arg5, %c-3 : index +// HYPERBLOCK-NEXT: %7 = arith.cmpi eq, %6, %c0_2 : index +// HYPERBLOCK-NEXT: %c-7 = arith.constant -7 : index +// HYPERBLOCK-NEXT: %8 = arith.addi %arg6, %c-7 : index +// HYPERBLOCK-NEXT: %9 = arith.cmpi eq, %8, %c0_2 : index +// HYPERBLOCK-NEXT: %10 = arith.andi %7, %9 : i1 +// HYPERBLOCK-NEXT: scf.if %10 { +// HYPERBLOCK-NEXT: memref.store %5, %arg1[] : memref +// HYPERBLOCK-NEXT: %11 = arith.muli %5, %arg4 : i32 +// HYPERBLOCK-NEXT: memref.store %11, %arg1[] : memref +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: taskflow.hyperblock.yield +// HYPERBLOCK-NEXT: }) : (index) -> () +// HYPERBLOCK-NEXT: taskflow.yield writes(%arg1 : memref) +// HYPERBLOCK-NEXT: } +// 
HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[] : memref +// HYPERBLOCK-NEXT: return %0 : i32 +// HYPERBLOCK-NEXT: } +// HYPERBLOCK-NEXT: } + From 7a11d4c08841269f01c20a11be5cd70bfdd7a01d Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Tue, 3 Feb 2026 12:42:55 +0800 Subject: [PATCH 9/9] [clean] simplify logic in loop tree serialization --- .../AffineLoopTreeSerializationPass.cpp | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp index da5cc7fa..9a8e45fa 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopTreeSerializationPass.cpp @@ -194,7 +194,7 @@ class MCTBuilder { loop_builder = OpBuilder::atBlockEnd(current_insert_block); } - // Prepare iter_args for the new loop. + // Prepares iter_args for the new loop. SmallVector iter_args_init_values; if (node->loop_op.getNumIterOperands() > 0) { for (Value init : node->loop_op.getInits()) { @@ -202,17 +202,17 @@ class MCTBuilder { } } - // Create new loop with same bounds and iter_args. + // Creates new loop with same bounds and iter_args. auto new_loop = loop_builder.create( loc, node->lower_bound, node->upper_bound, node->step, iter_args_init_values); created_loops.push_back(new_loop); - // Map the old induction variable to the new one. + // Maps the old induction variable to the new one. mapping.map(node->loop_op.getInductionVar(), new_loop.getInductionVar()); - // Map the old iter_args (block args) to the new iter_args (block args). + // Maps the old iter_args (block args) to the new iter_args (block args). 
if (node->loop_op.getNumRegionIterArgs() > 0) { for (auto [old_arg, new_arg] : llvm::zip(node->loop_op.getRegionIterArgs(), @@ -225,19 +225,19 @@ class MCTBuilder { outer_loop = new_loop; } - // Update current insertion block to the body of the new loop. + // Updates current insertion block to the body of the new loop. current_insert_block = new_loop.getBody(); - // Remove the default yield created by create. + // Removes the default yield created by create. if (!current_insert_block->empty() && isa(current_insert_block->back())) current_insert_block->back().erase(); - // Clone body operations for THIS node. + // Clones body operations for THIS node. OpBuilder body_builder = OpBuilder::atBlockEnd(current_insert_block); for (Operation *op : node->body_operations) { Operation *new_op = body_builder.clone(*op, mapping); - // Update mapping with results of the new op. + // Updates mapping with results of the new op. for (auto [old_res, new_res] : llvm::zip(op->getResults(), new_op->getResults())) { mapping.map(old_res, new_res); @@ -245,7 +245,7 @@ class MCTBuilder { } } - // Fix up yields for non-leaf loops (bottom-up). + // Fixes up yields for non-leaf loops (bottom-up). for (int i = created_loops.size() - 2; i >= 0; --i) { affine::AffineForOp parent = created_loops[i]; affine::AffineForOp child = created_loops[i + 1]; @@ -268,7 +268,7 @@ class MCTBuilder { // We need to find what the original yield yielded, map it, and yield it // here. - // Wait, if SALT excludes Yield from body_operations, then we NEVER cloned + // If SALT excludes Yield from body_operations, then we NEVER cloned // the yield. So the leaf loop has no terminator. We must reconstruct the // yield for the leaf loop. @@ -276,7 +276,7 @@ class MCTBuilder { affine::AffineForOp new_leaf = created_loops.back(); SALTNode *leaf_node = chain.getLeaf(); // or chain.nodes.back() - // Find the yield op in the original leaf node. + // Finds the yield op in the original leaf node. 
Operation *original_yield = nullptr; for (Operation &op : leaf_node->loop_op.getBody()->getOperations()) { if (isa(&op)) { @@ -294,10 +294,8 @@ class MCTBuilder { } leaf_yield_builder.create(loc, yielded_values); } else { - // Should not happen for valid AffineForOp - OpBuilder leaf_yield_builder = - OpBuilder::atBlockEnd(new_leaf.getBody()); - leaf_yield_builder.create(loc); + assert(false && + "Original leaf loop must have a yield operation in its body."); } }