From 4c8682d78a4f67a8dd1cb6fbfee43f5d37fcd8ea Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Mon, 23 Feb 2026 12:25:22 -0800 Subject: [PATCH 1/8] PR feedback from SingleAccretion Reorder invariant nodes in simple scenarios in stackifier Jitdump when moving nodes in stackifier When regallocwasm creates a new store node, lower it Apply regallocwasm fix from andy Checkpoint Checkpoint Add comment Speculatively implement the dstOnStack optimization (code that hits it doesn't compile yet) --- src/coreclr/jit/codegenwasm.cpp | 99 +++++++++++++++++++++++++++++++- src/coreclr/jit/compiler.h | 4 ++ src/coreclr/jit/gentree.cpp | 2 +- src/coreclr/jit/lowerwasm.cpp | 13 +++++ src/coreclr/jit/regallocwasm.cpp | 27 +++++++++ src/coreclr/jit/regallocwasm.h | 1 + 6 files changed, 143 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp index 566c7ca5bd57bb..3bc45805e007e5 100644 --- a/src/coreclr/jit/codegenwasm.cpp +++ b/src/coreclr/jit/codegenwasm.cpp @@ -13,6 +13,8 @@ static const int LINEAR_MEMORY_INDEX = 0; #ifdef TARGET_64BIT +static const instruction INS_I_load = INS_i64_load; +static const instruction INS_I_store = INS_i64_store; static const instruction INS_I_const = INS_i64_const; static const instruction INS_I_add = INS_i64_add; static const instruction INS_I_mul = INS_i64_mul; @@ -21,6 +23,8 @@ static const instruction INS_I_le_u = INS_i64_le_u; static const instruction INS_I_ge_u = INS_i64_ge_u; static const instruction INS_I_gt_u = INS_i64_gt_u; #else // !TARGET_64BIT +static const instruction INS_I_load = INS_i32_load; +static const instruction INS_I_store = INS_i32_store; static const instruction INS_I_const = INS_i32_const; static const instruction INS_I_add = INS_i32_add; static const instruction INS_I_mul = INS_i32_mul; @@ -427,7 +431,9 @@ void CodeGen::WasmProduceReg(GenTree* node) // // If the operand is a candidate, we use that candidate's current register. // Otherwise it must have been allocated into a temporary register initialized -// in 'WasmProduceReg'. +// in 'WasmProduceReg'. To do this, set the LIR::Flags::MultiplyUsed flag during +// lowering or other pre-regalloc phases, and ensure that regalloc is updated to +// call CollectReferences on the node(s) that need to be used multiple times. // // Arguments: // operand - The operand node @@ -2420,9 +2426,98 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp) } } +//------------------------------------------------------------------------ +// genCodeForCpObj: Produce code for a GT_STORE_BLK node that represents a cpobj operation. +// +// Arguments: +// cpObjNode - the node +// void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) { - NYI_WASM("genCodeForCpObj"); + GenTree* dstAddr = cpObjNode->Addr(); + GenTree* source = cpObjNode->Data(); + var_types srcAddrType = TYP_BYREF; + + assert(source->isContained()); + if (source->OperIs(GT_IND)) + { + source = source->gtGetOp1(); + assert(!source->isContained()); + srcAddrType = source->TypeGet(); + } + + noway_assert(source->IsLocal()); + noway_assert(dstAddr->IsLocal()); + + // If the destination is on the stack we don't need the write barrier. + bool dstOnStack = cpObjNode->IsAddressNotOnHeap(m_compiler); + +#ifdef DEBUG + assert(!dstAddr->isContained()); + + // This GenTree node has data about GC pointers, this means we're dealing + // with CpObj. + assert(cpObjNode->GetLayout()->HasGCPtr()); +#endif // DEBUG + + genConsumeRegs(cpObjNode); + + ClassLayout* layout = cpObjNode->GetLayout(); + unsigned slots = layout->GetSlotCount(); + + regNumber srcReg = GetMultiUseOperandReg(source); + regNumber dstReg = GetMultiUseOperandReg(dstAddr); + + if (cpObjNode->IsVolatile()) + { + // TODO-WASM: Memory barrier + } + + emitter* emit = GetEmitter(); + + emitAttr attrSrcAddr = emitActualTypeSize(srcAddrType); + emitAttr attrDstAddr = emitActualTypeSize(dstAddr->TypeGet()); + + unsigned gcPtrCount = cpObjNode->GetLayout()->GetGCPtrCount(); + + unsigned i = 0, offset = 0; + while (i < slots) + { + // Copy the pointer-sized non-gc-pointer slots one at a time (and GC pointer slots if the destination is stack) + // using regular I-sized load/store pairs. + if (dstOnStack || !layout->IsGCPtr(i)) + { + // Do a pointer-sized load+store pair at the appropriate offset relative to dest and source + emit->emitIns_I(INS_local_get, attrDstAddr, WasmRegToIndex(dstReg)); + emit->emitIns_I(INS_local_get, attrSrcAddr, WasmRegToIndex(srcReg)); + emit->emitIns_I(INS_I_load, EA_PTRSIZE, offset); + emit->emitIns_I(INS_I_store, EA_PTRSIZE, offset); + } + else + { + // Compute the actual dest/src of the slot being copied to pass to the helper. + emit->emitIns_I(INS_local_get, attrDstAddr, WasmRegToIndex(dstReg)); + emit->emitIns_I(INS_i32_const, attrDstAddr, offset); + emit->emitIns(INS_i32_add); + emit->emitIns_I(INS_local_get, attrSrcAddr, WasmRegToIndex(srcReg)); + emit->emitIns_I(INS_i32_const, attrSrcAddr, offset); + emit->emitIns(INS_i32_add); + // Call the byref assign helper. On other targets this updates the dst/src regs but here it won't, + // so we have to do the local.get+i32.const+i32.add dance every time. + genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE); + gcPtrCount--; + } + ++i; + offset += TARGET_POINTER_SIZE; + } + assert(gcPtrCount == 0); + + if (cpObjNode->IsVolatile()) + { + // TODO-WASM: Memory barrier + } + + WasmProduceReg(cpObjNode); } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 89e5f28d1bd447..c89c7a874d8a65 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -2566,6 +2566,10 @@ class Compiler friend class ReplaceVisitor; friend class FlowGraphNaturalLoop; +#ifdef TARGET_WASM + friend class WasmRegAlloc; // For m_pLowering +#endif + #ifdef FEATURE_HW_INTRINSICS friend struct GenTreeHWIntrinsic; friend struct HWIntrinsicInfo; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index d44e0bf76a1a91..c2b1dfc80d2c5f 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -12844,7 +12844,7 @@ void Compiler::gtDispTree(GenTree* tree, #ifdef TARGET_WASM case GenTreeBlk::BlkOpKindNativeOpcode: - printf(" (memory.copy|fill)"); + printf(" (memory.%s)", tree->OperIsCopyBlkOp() ? "copy" : "fill"); break; #endif diff --git a/src/coreclr/jit/lowerwasm.cpp b/src/coreclr/jit/lowerwasm.cpp index e241e9a3a02700..50fec8b9dab716 100644 --- a/src/coreclr/jit/lowerwasm.cpp +++ b/src/coreclr/jit/lowerwasm.cpp @@ -254,6 +254,11 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) } blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindCpObjUnroll; + dstAddr->gtLIRFlags |= LIR::Flags::MultiplyUsed; + if (src->OperIs(GT_IND)) + src->gtGetOp1()->gtLIRFlags |= LIR::Flags::MultiplyUsed; + else + src->gtLIRFlags |= LIR::Flags::MultiplyUsed; } else { @@ -512,6 +517,14 @@ void Lowering::AfterLowerBlock() // instead be ifdef-ed out for WASM. m_anyChanges = true; + if (node->IsInvariant()) + { + JITDUMP("Stackifier moving invariant node [%06u] after [%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev)); + m_lower->BlockRange().Remove(node); + m_lower->BlockRange().InsertAfter(prev, node); + break; + } + JITDUMP("node==[%06u] prev==[%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev)); NYI_WASM("IR not in a stackified form"); } diff --git a/src/coreclr/jit/regallocwasm.cpp b/src/coreclr/jit/regallocwasm.cpp index 88efaee43d7284..25345f1a045e6a 100644 --- a/src/coreclr/jit/regallocwasm.cpp +++ b/src/coreclr/jit/regallocwasm.cpp @@ -8,6 +8,8 @@ #include "regallocwasm.h" +#include "lower.h" // for LowerRange() + RegAllocInterface* GetRegisterAllocator(Compiler* compiler) { return new (compiler->getAllocator(CMK_LSRA)) WasmRegAlloc(compiler); @@ -328,6 +330,9 @@ void WasmRegAlloc::CollectReferencesForNode(GenTree* node) case GT_SUB: case GT_MUL: CollectReferencesForBinop(node->AsOp()); + + case GT_STORE_BLK: + CollectReferencesForBlockStore(node->AsBlk()); break; default: @@ -417,6 +422,21 @@ void WasmRegAlloc::CollectReferencesForBinop(GenTreeOp* binopNode) ConsumeTemporaryRegForOperand(binopNode->gtGetOp1() DEBUGARG("binop overflow check")); } +// CollectReferencesForBlockStore: Collect virtual register references for a block store. +// +// Arguments: +// node - The GT_STORE_BLK node +// +void WasmRegAlloc::CollectReferencesForBlockStore(GenTreeBlk* node) +{ + GenTree* src = node->Data(); + if (src->OperIs(GT_IND)) + src = src->gtGetOp1(); + + ConsumeTemporaryRegForOperand(src DEBUGARG("block store source")); + ConsumeTemporaryRegForOperand(node->Addr() DEBUGARG("block store destination")); +} + //------------------------------------------------------------------------ // CollectReferencesForLclVar: Collect virtual register references for a LCL_VAR. // @@ -476,6 +496,9 @@ void WasmRegAlloc::RewriteLocalStackStore(GenTreeLclVarCommon* lclNode) CurrentRange().InsertAfter(lclNode, store); CurrentRange().Remove(lclNode); CurrentRange().InsertBefore(insertionPoint, lclNode); + + auto tempRange = LIR::ReadOnlyRange(store, store); + m_compiler->m_pLowering->LowerRange(m_currentBlock, tempRange); } //------------------------------------------------------------------------ @@ -539,6 +562,8 @@ void WasmRegAlloc::RequestTemporaryRegisterForMultiplyUsedNode(GenTree* node) // Note how due to the fact we're processing nodes in stack order, // we don't need to maintain free/busy sets, only a simple stack. regNumber reg = AllocateTemporaryRegister(genActualType(node)); + // If the node already has a regnum, trying to assign it a second one is no good. + assert(node->GetRegNum() == REG_NA); node->SetRegNum(reg); } @@ -561,6 +586,7 @@ void WasmRegAlloc::ConsumeTemporaryRegForOperand(GenTree* operand DEBUGARG(const } regNumber reg = ReleaseTemporaryRegister(genActualType(operand)); + // If this assert fails you likely called ConsumeTemporaryRegForOperand on your operands in the wrong order. assert(reg == operand->GetRegNum()); CollectReference(operand); @@ -605,6 +631,7 @@ void WasmRegAlloc::ResolveReferences() { TemporaryRegStack& temporaryRegs = m_temporaryRegs[static_cast(type)]; TemporaryRegBank& allocatedTemporaryRegs = temporaryRegMap[static_cast(type)]; + // If temporaryRegs.Count != 0 that means CollectReferences failed to CollectReference one or more multiply-used nodes. assert(temporaryRegs.Count == 0); allocatedTemporaryRegs.Count = temporaryRegs.MaxCount; diff --git a/src/coreclr/jit/regallocwasm.h b/src/coreclr/jit/regallocwasm.h index a24f07df1fa73d..0d87b92d76f451 100644 --- a/src/coreclr/jit/regallocwasm.h +++ b/src/coreclr/jit/regallocwasm.h @@ -125,6 +125,7 @@ class WasmRegAlloc : public RegAllocInterface void CollectReferencesForCast(GenTreeOp* castNode); void CollectReferencesForBinop(GenTreeOp* binOpNode); void CollectReferencesForLclVar(GenTreeLclVar* lclVar); + void CollectReferencesForBlockStore(GenTreeBlk* node); void RewriteLocalStackStore(GenTreeLclVarCommon* node); void CollectReference(GenTree* node); void RequestTemporaryRegisterForMultiplyUsedNode(GenTree* node); From 76e30d019ed52577b354630d459ae1edf5218747 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Tue, 24 Feb 2026 22:06:05 -0800 Subject: [PATCH 2/8] Address copilot feedback --- src/coreclr/jit/codegenwasm.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp index 3bc45805e007e5..fbee0ab1b57cca 100644 --- a/src/coreclr/jit/codegenwasm.cpp +++ b/src/coreclr/jit/codegenwasm.cpp @@ -2460,7 +2460,7 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) assert(cpObjNode->GetLayout()->HasGCPtr()); #endif // DEBUG - genConsumeRegs(cpObjNode); + genConsumeOperands(cpObjNode); ClassLayout* layout = cpObjNode->GetLayout(); unsigned slots = layout->GetSlotCount(); @@ -2497,11 +2497,11 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) { // Compute the actual dest/src of the slot being copied to pass to the helper. emit->emitIns_I(INS_local_get, attrDstAddr, WasmRegToIndex(dstReg)); - emit->emitIns_I(INS_i32_const, attrDstAddr, offset); - emit->emitIns(INS_i32_add); + emit->emitIns_I(INS_I_const, attrDstAddr, offset); + emit->emitIns(INS_I_add); emit->emitIns_I(INS_local_get, attrSrcAddr, WasmRegToIndex(srcReg)); - emit->emitIns_I(INS_i32_const, attrSrcAddr, offset); - emit->emitIns(INS_i32_add); + emit->emitIns_I(INS_I_const, attrSrcAddr, offset); + emit->emitIns(INS_I_add); // Call the byref assign helper. On other targets this updates the dst/src regs but here it won't, // so we have to do the local.get+i32.const+i32.add dance every time. genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE); @@ -2510,7 +2510,8 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) ++i; offset += TARGET_POINTER_SIZE; } - assert(gcPtrCount == 0); + + assert(dstOnStack || (gcPtrCount == 0)); if (cpObjNode->IsVolatile()) { From 90f2757ef1f73748216f5309de3284510f661296 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Wed, 25 Feb 2026 07:23:33 -0800 Subject: [PATCH 3/8] Repair merge damage --- src/coreclr/jit/regallocwasm.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/jit/regallocwasm.cpp b/src/coreclr/jit/regallocwasm.cpp index 25345f1a045e6a..0d48ac98575895 100644 --- a/src/coreclr/jit/regallocwasm.cpp +++ b/src/coreclr/jit/regallocwasm.cpp @@ -330,6 +330,7 @@ void WasmRegAlloc::CollectReferencesForNode(GenTree* node) case GT_SUB: case GT_MUL: CollectReferencesForBinop(node->AsOp()); + break; case GT_STORE_BLK: CollectReferencesForBlockStore(node->AsBlk()); From 8335ca17785f2ec2b5c21c35dd6bc4f76198a9f6 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Wed, 25 Feb 2026 07:30:35 -0800 Subject: [PATCH 4/8] jit-format --- src/coreclr/jit/codegenwasm.cpp | 6 +++--- src/coreclr/jit/lowerwasm.cpp | 3 ++- src/coreclr/jit/regallocwasm.cpp | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp index fbee0ab1b57cca..9f666b4103338a 100644 --- a/src/coreclr/jit/codegenwasm.cpp +++ b/src/coreclr/jit/codegenwasm.cpp @@ -2434,9 +2434,9 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp) // void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) { - GenTree* dstAddr = cpObjNode->Addr(); - GenTree* source = cpObjNode->Data(); - var_types srcAddrType = TYP_BYREF; + GenTree* dstAddr = cpObjNode->Addr(); + GenTree* source = cpObjNode->Data(); + var_types srcAddrType = TYP_BYREF; assert(source->isContained()); if (source->OperIs(GT_IND)) diff --git a/src/coreclr/jit/lowerwasm.cpp b/src/coreclr/jit/lowerwasm.cpp index 50fec8b9dab716..941b3154a2fbb6 100644 --- a/src/coreclr/jit/lowerwasm.cpp +++ b/src/coreclr/jit/lowerwasm.cpp @@ -519,7 +519,8 @@ void Lowering::AfterLowerBlock() if (node->IsInvariant()) { - JITDUMP("Stackifier moving invariant node [%06u] after [%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev)); + JITDUMP("Stackifier moving invariant node [%06u] after [%06u]\n", Compiler::dspTreeID(node), + Compiler::dspTreeID(prev)); m_lower->BlockRange().Remove(node); m_lower->BlockRange().InsertAfter(prev, node); break; diff --git a/src/coreclr/jit/regallocwasm.cpp b/src/coreclr/jit/regallocwasm.cpp index 0d48ac98575895..6da22ae240d6af 100644 --- a/src/coreclr/jit/regallocwasm.cpp +++ b/src/coreclr/jit/regallocwasm.cpp @@ -632,7 +632,8 @@ void WasmRegAlloc::ResolveReferences() { TemporaryRegStack& temporaryRegs = m_temporaryRegs[static_cast(type)]; TemporaryRegBank& allocatedTemporaryRegs = temporaryRegMap[static_cast(type)]; - // If temporaryRegs.Count != 0 that means CollectReferences failed to CollectReference one or more multiply-used nodes. + // If temporaryRegs.Count != 0 that means CollectReferences failed to CollectReference one or more multiply-used + // nodes. assert(temporaryRegs.Count == 0); allocatedTemporaryRegs.Count = temporaryRegs.MaxCount; From 702969807a359181ff42f400c3c4ed9b4e543d96 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Wed, 25 Feb 2026 10:04:12 -0800 Subject: [PATCH 5/8] loadStructWithRefs compiles now --- src/coreclr/jit/codegenwasm.cpp | 1 - src/coreclr/jit/lowerwasm.cpp | 48 +++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp index 9f666b4103338a..51e33531f98808 100644 --- a/src/coreclr/jit/codegenwasm.cpp +++ b/src/coreclr/jit/codegenwasm.cpp @@ -2447,7 +2447,6 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) } noway_assert(source->IsLocal()); - noway_assert(dstAddr->IsLocal()); // If the destination is on the stack we don't need the write barrier. bool dstOnStack = cpObjNode->IsAddressNotOnHeap(m_compiler); diff --git a/src/coreclr/jit/lowerwasm.cpp b/src/coreclr/jit/lowerwasm.cpp index 941b3154a2fbb6..706b1db8ff2bd6 100644 --- a/src/coreclr/jit/lowerwasm.cpp +++ b/src/coreclr/jit/lowerwasm.cpp @@ -517,15 +517,59 @@ void Lowering::AfterLowerBlock() // instead be ifdef-ed out for WASM. m_anyChanges = true; - if (node->IsInvariant()) + // Invariant nodes can be safely moved by the stackifier with no side effects. + // For other nodes, the side effects would require us to turn them into a temporary local, but this + // is not possible for contained nodes like an IND inside a STORE_BLK. However, the few types of + // contained nodes we have in Wasm should be safe to move freely since the lack of 'dup' or persistent + // registers in Wasm means that the actual codegen will trigger the side effect(s) and store the result + // into a Wasm local for any later uses during the containing node's execution, i.e. cpobj where the + // src and dest get stashed at the start and then used as add operands repeatedly. + // Locals can also be safely moved as long as they aren't address-exposed due to local var nodes being + // implicitly pseudo-contained. + // TODO-WASM: Verify that it is actually safe to do this for all contained nodes. + if ( + node->IsInvariant() || + node->isContained() || + (node->OperIs(GT_LCL_VAR) && !m_lower->m_compiler->lvaGetDesc(node->AsLclVarCommon())->IsAddressExposed()) + ) { - JITDUMP("Stackifier moving invariant node [%06u] after [%06u]\n", Compiler::dspTreeID(node), + JITDUMP("Stackifier moving node [%06u] after [%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev)); m_lower->BlockRange().Remove(node); m_lower->BlockRange().InsertAfter(prev, node); break; } + /* + else + { + // To resolve this scenario we have two options: + // 1. We try moving the whole tree rooted at `node`. + // To avoid quadratic behavior, we first stackify it and collect all the side effects + // from it. Then we check for interference of those side effects with nodes between + // 'node' and 'prev'. + // 2. Failing that, we insert a temporary ('ReplaceWithLclVar') for 'node'. + // To avoid explosion of temporaries, we maintain a busy/free set of them. + // For now, for simplicity we are implementing #2 only. + + LIR::Use nodeUse; + // FIXME-WASM: TryGetUse is inefficient here, replace it with something more optimal + if (!m_lower->BlockRange().TryGetUse(node, &nodeUse)) + { + JITDUMP("node==[%06u] prev==[%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev)); + NYI_WASM("Could not get a LIR::Use for the node to be moved by the stackifier"); + } + + unsigned lclNum = nodeUse.ReplaceWithLclVar(m_lower->m_compiler); + GenTree* newNode = nodeUse.Def(); + JITDUMP("Stackifier replaced node [%06u] with lcl var %u\n", Compiler::dspTreeID(node), lclNum); + m_lower->BlockRange().Remove(newNode); + m_lower->BlockRange().InsertAfter(prev, newNode); + JITDUMP("Stackifier moved new node [%06u] after [%06u]\n", Compiler::dspTreeID(newNode), Compiler::dspTreeID(prev)); + break; + } + */ + JITDUMP("node==[%06u] prev==[%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev)); NYI_WASM("IR not in a stackified form"); } From 3c5e30b6abd5b427f149fccc7e119bc601e868b0 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Wed, 25 Feb 2026 10:21:36 -0800 Subject: [PATCH 6/8] When doing a cpobj to the stack just generate a memcpy instead --- src/coreclr/jit/codegenwasm.cpp | 10 +++++++--- src/coreclr/jit/lowerwasm.cpp | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp index 51e33531f98808..b267048f038576 100644 --- a/src/coreclr/jit/codegenwasm.cpp +++ b/src/coreclr/jit/codegenwasm.cpp @@ -2450,6 +2450,8 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) // If the destination is on the stack we don't need the write barrier. bool dstOnStack = cpObjNode->IsAddressNotOnHeap(m_compiler); + // We should have generated a memory.copy for this scenario in lowering. + assert(!dstOnStack); #ifdef DEBUG assert(!dstAddr->isContained()); @@ -2461,9 +2463,6 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) genConsumeOperands(cpObjNode); - ClassLayout* layout = cpObjNode->GetLayout(); - unsigned slots = layout->GetSlotCount(); - regNumber srcReg = GetMultiUseOperandReg(source); regNumber dstReg = GetMultiUseOperandReg(dstAddr); @@ -2472,8 +2471,13 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) // TODO-WASM: Memory barrier } + // TODO: Do we need an implicit null check here? + emitter* emit = GetEmitter(); + ClassLayout* layout = cpObjNode->GetLayout(); + unsigned slots = layout->GetSlotCount(); + emitAttr attrSrcAddr = emitActualTypeSize(srcAddrType); emitAttr attrDstAddr = emitActualTypeSize(dstAddr->TypeGet()); diff --git a/src/coreclr/jit/lowerwasm.cpp b/src/coreclr/jit/lowerwasm.cpp index 706b1db8ff2bd6..58e8e1bc97441e 100644 --- a/src/coreclr/jit/lowerwasm.cpp +++ b/src/coreclr/jit/lowerwasm.cpp @@ -244,6 +244,11 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) ClassLayout* layout = blkNode->GetLayout(); bool doCpObj = layout->HasGCPtr(); + // If copying to the stack instead of the heap, we should treat it as a raw memcpy for + // smaller generated code and potentially better performance. + if (blkNode->IsAddressNotOnHeap(m_compiler)) + doCpObj = false; + // CopyObj or CopyBlk if (doCpObj) { From 98c92df56d181e428899e89470dd3a88402f2160 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Wed, 25 Feb 2026 10:36:42 -0800 Subject: [PATCH 7/8] jit-format --- src/coreclr/jit/lowerwasm.cpp | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/coreclr/jit/lowerwasm.cpp b/src/coreclr/jit/lowerwasm.cpp index 58e8e1bc97441e..5327f411100960 100644 --- a/src/coreclr/jit/lowerwasm.cpp +++ b/src/coreclr/jit/lowerwasm.cpp @@ -525,18 +525,18 @@ void Lowering::AfterLowerBlock() // Invariant nodes can be safely moved by the stackifier with no side effects. // For other nodes, the side effects would require us to turn them into a temporary local, but this // is not possible for contained nodes like an IND inside a STORE_BLK. However, the few types of - // contained nodes we have in Wasm should be safe to move freely since the lack of 'dup' or persistent - // registers in Wasm means that the actual codegen will trigger the side effect(s) and store the result - // into a Wasm local for any later uses during the containing node's execution, i.e. cpobj where the - // src and dest get stashed at the start and then used as add operands repeatedly. - // Locals can also be safely moved as long as they aren't address-exposed due to local var nodes being + // contained nodes we have in Wasm should be safe to move freely since the lack of 'dup' or + // persistent registers in Wasm means that the actual codegen will trigger the side effect(s) and + // store the result into a Wasm local for any later uses during the containing node's execution, + // i.e. cpobj where the src and dest get stashed at the start and then used as add operands + // repeatedly. + // Locals can also be safely moved as long as they aren't address-exposed due to local var nodes + // being // implicitly pseudo-contained. // TODO-WASM: Verify that it is actually safe to do this for all contained nodes. - if ( - node->IsInvariant() || - node->isContained() || - (node->OperIs(GT_LCL_VAR) && !m_lower->m_compiler->lvaGetDesc(node->AsLclVarCommon())->IsAddressExposed()) - ) + if (node->IsInvariant() || node->isContained() || + (node->OperIs(GT_LCL_VAR) && + !m_lower->m_compiler->lvaGetDesc(node->AsLclVarCommon())->IsAddressExposed())) { JITDUMP("Stackifier moving node [%06u] after [%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev)); @@ -561,8 +561,9 @@ void Lowering::AfterLowerBlock() // FIXME-WASM: TryGetUse is inefficient here, replace it with something more optimal if (!m_lower->BlockRange().TryGetUse(node, &nodeUse)) { - JITDUMP("node==[%06u] prev==[%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev)); - NYI_WASM("Could not get a LIR::Use for the node to be moved by the stackifier"); + JITDUMP("node==[%06u] prev==[%06u]\n", Compiler::dspTreeID(node), + Compiler::dspTreeID(prev)); NYI_WASM("Could not get a LIR::Use for the node to be moved by the + stackifier"); } unsigned lclNum = nodeUse.ReplaceWithLclVar(m_lower->m_compiler); @@ -570,8 +571,8 @@ void Lowering::AfterLowerBlock() JITDUMP("Stackifier replaced node [%06u] with lcl var %u\n", Compiler::dspTreeID(node), lclNum); m_lower->BlockRange().Remove(newNode); m_lower->BlockRange().InsertAfter(prev, newNode); - JITDUMP("Stackifier moved new node [%06u] after [%06u]\n", Compiler::dspTreeID(newNode), Compiler::dspTreeID(prev)); - break; + JITDUMP("Stackifier moved new node [%06u] after [%06u]\n", Compiler::dspTreeID(newNode), + Compiler::dspTreeID(prev)); break; } */ From 18e652026b2374c6e55713b57ddefedc7a4a99af Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Wed, 25 Feb 2026 10:56:44 -0800 Subject: [PATCH 8/8] Address copilot feedback --- src/coreclr/jit/codegenwasm.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp index b267048f038576..4c7b0f27abc072 100644 --- a/src/coreclr/jit/codegenwasm.cpp +++ b/src/coreclr/jit/codegenwasm.cpp @@ -2486,9 +2486,9 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) unsigned i = 0, offset = 0; while (i < slots) { - // Copy the pointer-sized non-gc-pointer slots one at a time (and GC pointer slots if the destination is stack) - // using regular I-sized load/store pairs. - if (dstOnStack || !layout->IsGCPtr(i)) + // Copy the pointer-sized non-gc-pointer slots one at a time using regular I-sized load/store pairs, + // and gc-pointer slots using a write barrier. + if (!layout->IsGCPtr(i)) { // Do a pointer-sized load+store pair at the appropriate offset relative to dest and source emit->emitIns_I(INS_local_get, attrDstAddr, WasmRegToIndex(dstReg)); @@ -2514,7 +2514,7 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) offset += TARGET_POINTER_SIZE; } - assert(dstOnStack || (gcPtrCount == 0)); + assert(gcPtrCount == 0); if (cpObjNode->IsVolatile()) {