diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp
index 566c7ca5bd57bb..4c7b0f27abc072 100644
--- a/src/coreclr/jit/codegenwasm.cpp
+++ b/src/coreclr/jit/codegenwasm.cpp
@@ -13,6 +13,8 @@ static const int LINEAR_MEMORY_INDEX = 0;
 
 #ifdef TARGET_64BIT
+static const instruction INS_I_load  = INS_i64_load;
+static const instruction INS_I_store = INS_i64_store;
 static const instruction INS_I_const = INS_i64_const;
 static const instruction INS_I_add   = INS_i64_add;
 static const instruction INS_I_mul   = INS_i64_mul;
@@ -21,6 +23,8 @@ static const instruction INS_I_le_u = INS_i64_le_u;
 static const instruction INS_I_ge_u  = INS_i64_ge_u;
 static const instruction INS_I_gt_u  = INS_i64_gt_u;
 #else // !TARGET_64BIT
+static const instruction INS_I_load  = INS_i32_load;
+static const instruction INS_I_store = INS_i32_store;
 static const instruction INS_I_const = INS_i32_const;
 static const instruction INS_I_add   = INS_i32_add;
 static const instruction INS_I_mul   = INS_i32_mul;
@@ -427,7 +431,9 @@ void CodeGen::WasmProduceReg(GenTree* node)
 //
 // If the operand is a candidate, we use that candidate's current register.
 // Otherwise it must have been allocated into a temporary register initialized
-// in 'WasmProduceReg'.
+// in 'WasmProduceReg'. To do this, set the LIR::Flags::MultiplyUsed flag during
+// lowering or another pre-regalloc phase, and ensure that regalloc is updated to
+// call CollectReferences on the node(s) that need to be used multiple times.
 //
 // Arguments:
 //    operand - The operand node
@@ -2420,9 +2426,102 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
     }
 }
 
+//------------------------------------------------------------------------
+// genCodeForCpObj: Produce code for a GT_STORE_BLK node that represents a cpobj operation.
+//
+// Arguments:
+//    cpObjNode - the node
+//
 void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)
 {
-    NYI_WASM("genCodeForCpObj");
+    GenTree*  dstAddr     = cpObjNode->Addr();
+    GenTree*  source      = cpObjNode->Data();
+    var_types srcAddrType = TYP_BYREF;
+
+    assert(source->isContained());
+    if (source->OperIs(GT_IND))
+    {
+        source = source->gtGetOp1();
+        assert(!source->isContained());
+        srcAddrType = source->TypeGet();
+    }
+
+    noway_assert(source->IsLocal());
+
+    // If the destination is on the stack we don't need the write barrier.
+    bool dstOnStack = cpObjNode->IsAddressNotOnHeap(m_compiler);
+    // We should have generated a memory.copy for this scenario in lowering.
+    assert(!dstOnStack);
+
+#ifdef DEBUG
+    assert(!dstAddr->isContained());
+
+    // This GenTree node has data about GC pointers; this means we're dealing
+    // with CpObj.
+    assert(cpObjNode->GetLayout()->HasGCPtr());
+#endif // DEBUG
+
+    genConsumeOperands(cpObjNode);
+
+    regNumber srcReg = GetMultiUseOperandReg(source);
+    regNumber dstReg = GetMultiUseOperandReg(dstAddr);
+
+    if (cpObjNode->IsVolatile())
+    {
+        // TODO-WASM: Memory barrier
+    }
+
+    // TODO-WASM: Do we need an implicit null check here?
+
+    emitter* emit = GetEmitter();
+
+    ClassLayout* layout = cpObjNode->GetLayout();
+    unsigned     slots  = layout->GetSlotCount();
+
+    emitAttr attrSrcAddr = emitActualTypeSize(srcAddrType);
+    emitAttr attrDstAddr = emitActualTypeSize(dstAddr->TypeGet());
+
+    unsigned gcPtrCount = layout->GetGCPtrCount();
+
+    unsigned i = 0, offset = 0;
+    while (i < slots)
+    {
+        // Copy pointer-sized non-gc-pointer slots one at a time using regular I-sized load/store pairs,
+        // and gc-pointer slots using the write barrier helper.
+        if (!layout->IsGCPtr(i))
+        {
+            // Do a pointer-sized load+store pair at the appropriate offset relative to dest and source.
+            emit->emitIns_I(INS_local_get, attrDstAddr, WasmRegToIndex(dstReg));
+            emit->emitIns_I(INS_local_get, attrSrcAddr, WasmRegToIndex(srcReg));
+            emit->emitIns_I(INS_I_load, EA_PTRSIZE, offset);
+            emit->emitIns_I(INS_I_store, EA_PTRSIZE, offset);
+        }
+        else
+        {
+            // Compute the actual dest/src of the slot being copied to pass to the helper.
+            emit->emitIns_I(INS_local_get, attrDstAddr, WasmRegToIndex(dstReg));
+            emit->emitIns_I(INS_I_const, attrDstAddr, offset);
+            emit->emitIns(INS_I_add);
+            emit->emitIns_I(INS_local_get, attrSrcAddr, WasmRegToIndex(srcReg));
+            emit->emitIns_I(INS_I_const, attrSrcAddr, offset);
+            emit->emitIns(INS_I_add);
+            // Call the byref assign helper. On other targets this updates the dst/src registers, but
+            // here it won't, so we have to repeat the local.get + I_const + I_add sequence for every
+            // gc-pointer slot.
+            genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE);
+            gcPtrCount--;
+        }
+        ++i;
+        offset += TARGET_POINTER_SIZE;
+    }
+
+    assert(gcPtrCount == 0);
+
+    if (cpObjNode->IsVolatile())
+    {
+        // TODO-WASM: Memory barrier
+    }
+
+    WasmProduceReg(cpObjNode);
 }
 
 //------------------------------------------------------------------------
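For orientation, here is roughly the Wasm text the sequence above yields for a two-slot struct (non-gc slot at offset 0, gc slot at offset 4) on a 32-bit target. This is a hand-written sketch: the local names, the import module/name, and the helper signature are assumptions for illustration, not actual emitter output.

(module
  ;; assumed import for the write barrier helper; the real name and signature may differ
  (import "env" "CORINFO_HELP_ASSIGN_BYREF" (func $assign_byref (param i32 i32)))
  (memory 1)
  (func $cpobj_two_slots (param $dst i32) (param $src i32)
    ;; non-gc slot at offset 0: plain pointer-sized load/store pair
    local.get $dst
    local.get $src
    i32.load offset=0
    i32.store offset=0
    ;; gc slot at offset 4: compute dst+4 and src+4, then call the write barrier helper;
    ;; the helper does not hand the updated addresses back, hence the recomputation per slot
    local.get $dst
    i32.const 4
    i32.add
    local.get $src
    i32.const 4
    i32.add
    call $assign_byref))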
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 89e5f28d1bd447..c89c7a874d8a65 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -2566,6 +2566,10 @@ class Compiler
     friend class ReplaceVisitor;
     friend class FlowGraphNaturalLoop;
 
+#ifdef TARGET_WASM
+    friend class WasmRegAlloc; // For m_pLowering
+#endif
+
 #ifdef FEATURE_HW_INTRINSICS
     friend struct GenTreeHWIntrinsic;
     friend struct HWIntrinsicInfo;
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index d44e0bf76a1a91..c2b1dfc80d2c5f 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -12844,7 +12844,7 @@ void Compiler::gtDispTree(GenTree* tree,
 
 #ifdef TARGET_WASM
             case GenTreeBlk::BlkOpKindNativeOpcode:
-                printf(" (memory.copy|fill)");
+                printf(" (memory.%s)", tree->OperIsCopyBlkOp() ? "copy" : "fill");
                 break;
 #endif
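To make the two BlkOpKindNativeOpcode flavors concrete: a block copy prints as memory.copy and a block init as memory.fill. Below is a minimal sketch of the raw-copy path that the lowering change further down selects when the destination is known not to be on the heap; the 12-byte size and the names are invented for the example.

(module
  (memory 1)
  (func $copy_stack_struct (param $dst i32) (param $src i32)
    ;; destination is on the shadow stack, so no write barriers are needed:
    ;; the whole 12-byte layout goes through one bulk-memory instruction
    local.get $dst
    local.get $src
    i32.const 12
    memory.copy))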
diff --git a/src/coreclr/jit/lowerwasm.cpp b/src/coreclr/jit/lowerwasm.cpp
index e241e9a3a02700..5327f411100960 100644
--- a/src/coreclr/jit/lowerwasm.cpp
+++ b/src/coreclr/jit/lowerwasm.cpp
@@ -244,6 +244,11 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
     ClassLayout* layout  = blkNode->GetLayout();
     bool         doCpObj = layout->HasGCPtr();
 
+    // If copying to the stack instead of the heap, we should treat it as a raw memcpy, for
+    // smaller generated code and potentially better performance.
+    if (blkNode->IsAddressNotOnHeap(m_compiler))
+        doCpObj = false;
+
     // CopyObj or CopyBlk
     if (doCpObj)
     {
@@ -254,6 +259,11 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         }
 
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindCpObjUnroll;
+        dstAddr->gtLIRFlags |= LIR::Flags::MultiplyUsed;
+        if (src->OperIs(GT_IND))
+            src->gtGetOp1()->gtLIRFlags |= LIR::Flags::MultiplyUsed;
+        else
+            src->gtLIRFlags |= LIR::Flags::MultiplyUsed;
     }
     else
     {
@@ -512,6 +522,60 @@ void Lowering::AfterLowerBlock()
         // instead be ifdef-ed out for WASM.
         m_anyChanges = true;
 
+        // Invariant nodes can be safely moved by the stackifier with no side effects.
+        // For other nodes, the side effects would require us to turn them into a temporary local, but this
+        // is not possible for contained nodes like an IND inside a STORE_BLK. However, the few kinds of
+        // contained nodes we have in Wasm should be safe to move freely: since Wasm has no 'dup' and no
+        // persistent registers, codegen will trigger the side effect(s) once and store the result into a
+        // Wasm local for any later uses during the containing node's execution, e.g. cpobj, where the src
+        // and dest get stashed at the start and are then used as add operands repeatedly.
+        // Locals can also be safely moved as long as they aren't address-exposed, due to local var nodes
+        // being implicitly pseudo-contained.
+        // TODO-WASM: Verify that it is actually safe to do this for all contained nodes.
+        if (node->IsInvariant() || node->isContained() ||
+            (node->OperIs(GT_LCL_VAR) &&
+             !m_lower->m_compiler->lvaGetDesc(node->AsLclVarCommon())->IsAddressExposed()))
+        {
+            JITDUMP("Stackifier moving node [%06u] after [%06u]\n", Compiler::dspTreeID(node),
+                    Compiler::dspTreeID(prev));
+            m_lower->BlockRange().Remove(node);
+            m_lower->BlockRange().InsertAfter(prev, node);
+            break;
+        }
+
+        /*
+        else
+        {
+            // To resolve this scenario we have two options:
+            // 1. We try moving the whole tree rooted at `node`.
+            //    To avoid quadratic behavior, we first stackify it and collect all the side effects
+            //    from it. Then we check for interference of those side effects with nodes between
+            //    'node' and 'prev'.
+            // 2. Failing that, we insert a temporary ('ReplaceWithLclVar') for 'node'.
+            //    To avoid explosion of temporaries, we maintain a busy/free set of them.
+            // For now, for simplicity we are implementing #2 only.
+
+            LIR::Use nodeUse;
+            // FIXME-WASM: TryGetUse is inefficient here, replace it with something more optimal
+            if (!m_lower->BlockRange().TryGetUse(node, &nodeUse))
+            {
+                JITDUMP("node==[%06u] prev==[%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev));
+                NYI_WASM("Could not get a LIR::Use for the node to be moved by the stackifier");
+            }
+
+            unsigned lclNum  = nodeUse.ReplaceWithLclVar(m_lower->m_compiler);
+            GenTree* newNode = nodeUse.Def();
+            JITDUMP("Stackifier replaced node [%06u] with lcl var %u\n", Compiler::dspTreeID(node), lclNum);
+            m_lower->BlockRange().Remove(newNode);
+            m_lower->BlockRange().InsertAfter(prev, newNode);
+            JITDUMP("Stackifier moved new node [%06u] after [%06u]\n", Compiler::dspTreeID(newNode),
+                    Compiler::dspTreeID(prev));
+            break;
+        }
+        */
+
         JITDUMP("node==[%06u] prev==[%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev));
         NYI_WASM("IR not in a stackified form");
     }
diff --git a/src/coreclr/jit/regallocwasm.cpp b/src/coreclr/jit/regallocwasm.cpp
index 88efaee43d7284..6da22ae240d6af 100644
--- a/src/coreclr/jit/regallocwasm.cpp
+++ b/src/coreclr/jit/regallocwasm.cpp
@@ -8,6 +8,8 @@
 
 #include "regallocwasm.h"
 
+#include "lower.h" // for LowerRange()
+
 RegAllocInterface* GetRegisterAllocator(Compiler* compiler)
 {
     return new (compiler->getAllocator(CMK_LSRA)) WasmRegAlloc(compiler);
@@ -330,6 +332,10 @@ void WasmRegAlloc::CollectReferencesForNode(GenTree* node)
             CollectReferencesForBinop(node->AsOp());
             break;
 
+        case GT_STORE_BLK:
+            CollectReferencesForBlockStore(node->AsBlk());
+            break;
+
         default:
             assert(!node->OperIsLocalStore());
             break;
@@ -417,6 +423,21 @@ void WasmRegAlloc::CollectReferencesForBinop(GenTreeOp* binopNode)
         ConsumeTemporaryRegForOperand(binopNode->gtGetOp1() DEBUGARG("binop overflow check"));
 }
 
+//------------------------------------------------------------------------
+// CollectReferencesForBlockStore: Collect virtual register references for a block store.
+//
+// Arguments:
+//    node - The GT_STORE_BLK node
+//
+void WasmRegAlloc::CollectReferencesForBlockStore(GenTreeBlk* node)
+{
+    GenTree* src = node->Data();
+    if (src->OperIs(GT_IND))
+        src = src->gtGetOp1();
+
+    ConsumeTemporaryRegForOperand(src DEBUGARG("block store source"));
+    ConsumeTemporaryRegForOperand(node->Addr() DEBUGARG("block store destination"));
+}
+
 //------------------------------------------------------------------------
 // CollectReferencesForLclVar: Collect virtual register references for a LCL_VAR.
 //
@@ -476,6 +497,9 @@ void WasmRegAlloc::RewriteLocalStackStore(GenTreeLclVarCommon* lclNode)
     CurrentRange().InsertAfter(lclNode, store);
     CurrentRange().Remove(lclNode);
     CurrentRange().InsertBefore(insertionPoint, lclNode);
+
+    auto tempRange = LIR::ReadOnlyRange(store, store);
+    m_compiler->m_pLowering->LowerRange(m_currentBlock, tempRange);
 }
 
 //------------------------------------------------------------------------
@@ -539,6 +563,8 @@ void WasmRegAlloc::RequestTemporaryRegisterForMultiplyUsedNode(GenTree* node)
     // Note how due to the fact we're processing nodes in stack order,
     // we don't need to maintain free/busy sets, only a simple stack.
     regNumber reg = AllocateTemporaryRegister(genActualType(node));
+    // If the node already has a register number, trying to assign it a second one is no good.
+    assert(node->GetRegNum() == REG_NA);
     node->SetRegNum(reg);
 }
 
@@ -561,6 +587,7 @@ void WasmRegAlloc::ConsumeTemporaryRegForOperand(GenTree* operand DEBUGARG(const
     }
 
     regNumber reg = ReleaseTemporaryRegister(genActualType(operand));
+    // If this assert fails, you likely called ConsumeTemporaryRegForOperand on your operands in the wrong order.
     assert(reg == operand->GetRegNum());
 
     CollectReference(operand);
@@ -605,6 +632,8 @@ void WasmRegAlloc::ResolveReferences()
     {
         TemporaryRegStack& temporaryRegs          = m_temporaryRegs[static_cast<int>(type)];
         TemporaryRegBank&  allocatedTemporaryRegs = temporaryRegMap[static_cast<int>(type)];
+        // If temporaryRegs.Count != 0, CollectReferences failed to call CollectReference on one or more
+        // multiply-used nodes.
         assert(temporaryRegs.Count == 0);
 
         allocatedTemporaryRegs.Count = temporaryRegs.MaxCount;
diff --git a/src/coreclr/jit/regallocwasm.h b/src/coreclr/jit/regallocwasm.h
index a24f07df1fa73d..0d87b92d76f451 100644
--- a/src/coreclr/jit/regallocwasm.h
+++ b/src/coreclr/jit/regallocwasm.h
@@ -125,6 +125,7 @@ class WasmRegAlloc : public RegAllocInterface
     void CollectReferencesForCast(GenTreeOp* castNode);
     void CollectReferencesForBinop(GenTreeOp* binOpNode);
    void CollectReferencesForLclVar(GenTreeLclVar* lclVar);
+    void CollectReferencesForBlockStore(GenTreeBlk* node);
     void RewriteLocalStackStore(GenTreeLclVarCommon* node);
     void CollectReference(GenTree* node);
     void RequestTemporaryRegisterForMultiplyUsedNode(GenTree* node);
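The temporary registers managed above map one-to-one onto Wasm locals: a multiply-used node is evaluated once and stashed at its definition (where RequestTemporaryRegisterForMultiplyUsedNode assigns the register), and every later use becomes a local.get (matched LIFO by ConsumeTemporaryRegForOperand, which is why operand order matters for the new assert). A sketch of the resulting code shape, with invented names and field offsets:

(module
  (memory 1)
  (func $multi_use_addr (param $obj i32) (result i32)
    (local $temp0 i32) ;; the Wasm local backing the temporary register
    ;; the multiply-used address is computed once and stashed at its definition
    local.get $obj
    i32.const 8
    i32.add
    local.set $temp0
    ;; first use of the stashed address
    local.get $temp0
    i32.load offset=0
    ;; second use: re-fetch the same temporary instead of recomputing
    local.get $temp0
    i32.load offset=4
    i32.add))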