-
Notifications
You must be signed in to change notification settings - Fork 5.4k
[Wasm RyuJIT] Block Stores pt. 2 #124846
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[Wasm RyuJIT] Block Stores pt. 2 #124846
Changes from all commits
4c8682d
76e30d0
90f2757
8335ca1
7029698
3c5e30b
98c92df
18e6520
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,8 @@ | |
| static const int LINEAR_MEMORY_INDEX = 0; | ||
|
|
||
| #ifdef TARGET_64BIT | ||
| static const instruction INS_I_load = INS_i64_load; | ||
| static const instruction INS_I_store = INS_i64_store; | ||
| static const instruction INS_I_const = INS_i64_const; | ||
| static const instruction INS_I_add = INS_i64_add; | ||
| static const instruction INS_I_mul = INS_i64_mul; | ||
|
|
@@ -21,6 +23,8 @@ static const instruction INS_I_le_u = INS_i64_le_u; | |
| static const instruction INS_I_ge_u = INS_i64_ge_u; | ||
| static const instruction INS_I_gt_u = INS_i64_gt_u; | ||
| #else // !TARGET_64BIT | ||
| static const instruction INS_I_load = INS_i32_load; | ||
| static const instruction INS_I_store = INS_i32_store; | ||
| static const instruction INS_I_const = INS_i32_const; | ||
| static const instruction INS_I_add = INS_i32_add; | ||
| static const instruction INS_I_mul = INS_i32_mul; | ||
|
|
@@ -427,7 +431,9 @@ void CodeGen::WasmProduceReg(GenTree* node) | |
| // | ||
| // If the operand is a candidate, we use that candidate's current register. | ||
| // Otherwise it must have been allocated into a temporary register initialized | ||
| // in 'WasmProduceReg'. | ||
| // in 'WasmProduceReg'. To do this, set the LIR::Flags::MultiplyUsed flag during | ||
| // lowering or other pre-regalloc phases, and ensure that regalloc is updated to | ||
| // call CollectReferences on the node(s) that need to be used multiple times. | ||
| // | ||
| // Arguments: | ||
| // operand - The operand node | ||
|
|
@@ -2420,9 +2426,102 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp) | |
| } | ||
| } | ||
|
|
||
| //------------------------------------------------------------------------ | ||
| // genCodeForCpObj: Produce code for a GT_STORE_BLK node that represents a cpobj operation. | ||
| // | ||
| // Arguments: | ||
| // cpObjNode - the node | ||
| // | ||
| void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) | ||
| { | ||
| NYI_WASM("genCodeForCpObj"); | ||
| GenTree* dstAddr = cpObjNode->Addr(); | ||
| GenTree* source = cpObjNode->Data(); | ||
| var_types srcAddrType = TYP_BYREF; | ||
|
|
||
| assert(source->isContained()); | ||
| if (source->OperIs(GT_IND)) | ||
| { | ||
|
Comment on lines
+2429
to
+2443
|
||
| source = source->gtGetOp1(); | ||
| assert(!source->isContained()); | ||
| srcAddrType = source->TypeGet(); | ||
| } | ||
|
|
||
| noway_assert(source->IsLocal()); | ||
|
|
||
| // If the destination is on the stack we don't need the write barrier. | ||
| bool dstOnStack = cpObjNode->IsAddressNotOnHeap(m_compiler); | ||
| // We should have generated a memory.copy for this scenario in lowering. | ||
| assert(!dstOnStack); | ||
|
|
||
| #ifdef DEBUG | ||
| assert(!dstAddr->isContained()); | ||
|
|
||
| // This GenTree node has data about GC pointers, this means we're dealing | ||
| // with CpObj. | ||
| assert(cpObjNode->GetLayout()->HasGCPtr()); | ||
| #endif // DEBUG | ||
|
|
||
| genConsumeOperands(cpObjNode); | ||
|
|
||
| regNumber srcReg = GetMultiUseOperandReg(source); | ||
| regNumber dstReg = GetMultiUseOperandReg(dstAddr); | ||
|
|
||
| if (cpObjNode->IsVolatile()) | ||
| { | ||
| // TODO-WASM: Memory barrier | ||
| } | ||
|
|
||
| // TODO: Do we need an implicit null check here? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes |
||
|
|
||
| emitter* emit = GetEmitter(); | ||
|
|
||
| ClassLayout* layout = cpObjNode->GetLayout(); | ||
| unsigned slots = layout->GetSlotCount(); | ||
|
|
||
| emitAttr attrSrcAddr = emitActualTypeSize(srcAddrType); | ||
| emitAttr attrDstAddr = emitActualTypeSize(dstAddr->TypeGet()); | ||
|
|
||
| unsigned gcPtrCount = cpObjNode->GetLayout()->GetGCPtrCount(); | ||
|
|
||
| unsigned i = 0, offset = 0; | ||
| while (i < slots) | ||
| { | ||
| // Copy the pointer-sized non-gc-pointer slots one at a time using regular I-sized load/store pairs, | ||
| // and gc-pointer slots using a write barrier. | ||
| if (!layout->IsGCPtr(i)) | ||
| { | ||
| // Do a pointer-sized load+store pair at the appropriate offset relative to dest and source | ||
| emit->emitIns_I(INS_local_get, attrDstAddr, WasmRegToIndex(dstReg)); | ||
| emit->emitIns_I(INS_local_get, attrSrcAddr, WasmRegToIndex(srcReg)); | ||
| emit->emitIns_I(INS_I_load, EA_PTRSIZE, offset); | ||
| emit->emitIns_I(INS_I_store, EA_PTRSIZE, offset); | ||
| } | ||
| else | ||
| { | ||
| // Compute the actual dest/src of the slot being copied to pass to the helper. | ||
| emit->emitIns_I(INS_local_get, attrDstAddr, WasmRegToIndex(dstReg)); | ||
| emit->emitIns_I(INS_I_const, attrDstAddr, offset); | ||
| emit->emitIns(INS_I_add); | ||
| emit->emitIns_I(INS_local_get, attrSrcAddr, WasmRegToIndex(srcReg)); | ||
| emit->emitIns_I(INS_I_const, attrSrcAddr, offset); | ||
| emit->emitIns(INS_I_add); | ||
| // Call the byref assign helper. On other targets this updates the dst/src regs but here it won't, | ||
| // so we have to do the local.get+i32.const+i32.add dance every time. | ||
| genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE); | ||
| gcPtrCount--; | ||
|
Comment on lines
+2502
to
+2511
|
||
| } | ||
| ++i; | ||
| offset += TARGET_POINTER_SIZE; | ||
| } | ||
|
|
||
| assert(gcPtrCount == 0); | ||
|
|
||
| if (cpObjNode->IsVolatile()) | ||
| { | ||
| // TODO-WASM: Memory barrier | ||
| } | ||
|
|
||
| WasmProduceReg(cpObjNode); | ||
| } | ||
|
|
||
| //------------------------------------------------------------------------ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -244,6 +244,11 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) | |
| ClassLayout* layout = blkNode->GetLayout(); | ||
| bool doCpObj = layout->HasGCPtr(); | ||
|
|
||
| // If copying to the stack instead of the heap, we should treat it as a raw memcpy for | ||
| // smaller generated code and potentially better performance. | ||
| if (blkNode->IsAddressNotOnHeap(m_compiler)) | ||
| doCpObj = false; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. other targets do this only under size check + they set
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We're trying to always use the native wasm memcpy/memset opcodes because the VMs unroll them for us, which should provide both optimal code size and perf. So I think omitting the size check is okay. Great point on gc unsafe.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah it's fine.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. well, we can do what other targets do and then just ignore it in the emitter till we implement threads, interruptible gc or whatever?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It feels like
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We won't need it for those things either. We will only need it if a WASM proposal comes around which makes stack walking in the traditional sense possible, which is exceedingly unlikely. |
||
|
|
||
| // CopyObj or CopyBlk | ||
| if (doCpObj) | ||
| { | ||
|
|
@@ -254,6 +259,11 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) | |
| } | ||
|
|
||
| blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindCpObjUnroll; | ||
| dstAddr->gtLIRFlags |= LIR::Flags::MultiplyUsed; | ||
| if (src->OperIs(GT_IND)) | ||
| src->gtGetOp1()->gtLIRFlags |= LIR::Flags::MultiplyUsed; | ||
| else | ||
| src->gtLIRFlags |= LIR::Flags::MultiplyUsed; | ||
| } | ||
| else | ||
| { | ||
|
|
@@ -512,6 +522,60 @@ void Lowering::AfterLowerBlock() | |
| // instead be ifdef-ed out for WASM. | ||
| m_anyChanges = true; | ||
|
|
||
| // Invariant nodes can be safely moved by the stackifier with no side effects. | ||
| // For other nodes, the side effects would require us to turn them into a temporary local, but this | ||
| // is not possible for contained nodes like an IND inside a STORE_BLK. However, the few types of | ||
| // contained nodes we have in Wasm should be safe to move freely since the lack of 'dup' or | ||
| // persistent registers in Wasm means that the actual codegen will trigger the side effect(s) and | ||
| // store the result into a Wasm local for any later uses during the containing node's execution, | ||
| // i.e. cpobj where the src and dest get stashed at the start and then used as add operands | ||
| // repeatedly. | ||
| // Locals can also be safely moved as long as they aren't address-exposed due to local var nodes | ||
| // being | ||
| // implicitly pseudo-contained. | ||
| // TODO-WASM: Verify that it is actually safe to do this for all contained nodes. | ||
| if (node->IsInvariant() || node->isContained() || | ||
| (node->OperIs(GT_LCL_VAR) && | ||
| !m_lower->m_compiler->lvaGetDesc(node->AsLclVarCommon())->IsAddressExposed())) | ||
| { | ||
| JITDUMP("Stackifier moving node [%06u] after [%06u]\n", Compiler::dspTreeID(node), | ||
| Compiler::dspTreeID(prev)); | ||
| m_lower->BlockRange().Remove(node); | ||
| m_lower->BlockRange().InsertAfter(prev, node); | ||
| break; | ||
| } | ||
|
|
||
| /* | ||
| else | ||
| { | ||
| // To resolve this scenario we have two options: | ||
| // 1. We try moving the whole tree rooted at `node`. | ||
| // To avoid quadratic behavior, we first stackify it and collect all the side effects | ||
| // from it. Then we check for interference of those side effects with nodes between | ||
| // 'node' and 'prev'. | ||
| // 2. Failing that, we insert a temporary ('ReplaceWithLclVar') for 'node'. | ||
| // To avoid explosion of temporaries, we maintain a busy/free set of them. | ||
| // For now, for simplicity we are implementing #2 only. | ||
|
|
||
| LIR::Use nodeUse; | ||
| // FIXME-WASM: TryGetUse is inefficient here, replace it with something more optimal | ||
| if (!m_lower->BlockRange().TryGetUse(node, &nodeUse)) | ||
| { | ||
| JITDUMP("node==[%06u] prev==[%06u]\n", Compiler::dspTreeID(node), | ||
| Compiler::dspTreeID(prev)); NYI_WASM("Could not get a LIR::Use for the node to be moved by the | ||
| stackifier"); | ||
| } | ||
|
|
||
| unsigned lclNum = nodeUse.ReplaceWithLclVar(m_lower->m_compiler); | ||
| GenTree* newNode = nodeUse.Def(); | ||
| JITDUMP("Stackifier replaced node [%06u] with lcl var %u\n", Compiler::dspTreeID(node), lclNum); | ||
| m_lower->BlockRange().Remove(newNode); | ||
| m_lower->BlockRange().InsertAfter(prev, newNode); | ||
| JITDUMP("Stackifier moved new node [%06u] after [%06u]\n", Compiler::dspTreeID(newNode), | ||
| Compiler::dspTreeID(prev)); break; | ||
| } | ||
| */ | ||
|
Comment on lines
+548
to
+577
|
||
|
|
||
| JITDUMP("node==[%06u] prev==[%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev)); | ||
| NYI_WASM("IR not in a stackified form"); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We probably can never get here if we extend
TryLowerBlockStoreAsGcBulkCopyCall to never bail. So all cpobj will go through the batch helper
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is that bad?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mean for simplicity. The bulk helper is likely a bit less efficient for small object copies with just 1 gc ref where we can emit just one write-barrier, otherwise it's better. It also is simdified.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh I misunderstood. So for a struct that has i.e. 4 gc ptrs at the head and then some random ints and floats at the tail, TryLowerBlockStoreAsGcBulkCopyCall will just lower this away entirely in theory. I've been testing with interleaved ptrs where a bulk one isn't possible.
I was definitely thinking we'd want to use bulk copies if possible.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TryLowerBlockStoreAsGcBulkCopyCall can handle any layout, even if there are no gc refs at all (although, in that case we should never be in cpobj, because we rely on the fact that the size is divisble by pointer size).