Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions lib/Transforms/Scalar/GVN.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,17 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
StoredVal->getType()->isArrayTy())
return false;

// HLSL Change Begin - Reject types where padded and primitive sizes differ.
// Coercion would create bitcasts between mismatched sizes.
Type *StoredValTy = StoredVal->getType();
uint64_t StoredPrimBits = StoredValTy->getPrimitiveSizeInBits();
uint64_t LoadPrimBits = LoadTy->getPrimitiveSizeInBits();
if (StoredPrimBits && DL.getTypeSizeInBits(StoredValTy) != StoredPrimBits)
return false;
if (LoadPrimBits && DL.getTypeSizeInBits(LoadTy) != LoadPrimBits)
return false;
// HLSL Change End

// The store has to be at least as big as the load.
if (DL.getTypeSizeInBits(StoredVal->getType()) <
DL.getTypeSizeInBits(LoadTy))
Expand Down Expand Up @@ -1942,6 +1953,16 @@ bool GVN::processLoad(LoadInst *L) {
if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
Value *StoredVal = DepSI->getValueOperand();

// HLSL Change Begin - Defense-in-depth: skip cross-type forwarding for
// padded types (e.g., min precision vectors).
if (StoredVal->getType() != L->getType()) {
Type *StoredTy = StoredVal->getType();
uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits();
if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits)
return false;
}
// HLSL Change End

// The store and load are to a must-aliased pointer, but they may not
// actually have the same type. See if we know how to reuse the stored
// value (depending on its type).
Expand Down
14 changes: 10 additions & 4 deletions lib/Transforms/Scalar/SROA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1671,7 +1671,10 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
// extremely poorly defined currently. The long-term goal is to remove GEPing
// over a vector from the IR completely.
if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
// HLSL Change: Use alloc size for element stride to account for padded
// types.
unsigned ElementSizeInBits =
DL.getTypeAllocSizeInBits(VecTy->getScalarType());
if (ElementSizeInBits % 8 != 0) {
// GEPs over non-multiple of 8 size vector elements are invalid.
return nullptr;
Expand Down Expand Up @@ -2134,7 +2137,8 @@ static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P,

// Try each vector type, and return the one which works.
auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
// HLSL Change: Use alloc size to match GEP offset stride for padded types.
uint64_t ElementSize = DL.getTypeAllocSizeInBits(VTy->getElementType());

// While the definition of LLVM vectors is bitpacked, we don't support sizes
// that aren't byte sized.
Expand Down Expand Up @@ -2492,12 +2496,14 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
: nullptr),
VecTy(PromotableVecTy),
ElementTy(VecTy ? VecTy->getElementType() : nullptr),
ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
// HLSL Change: Use alloc size to match GEP offset stride for padded
// types.
ElementSize(VecTy ? DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0),
BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
IRB(NewAI.getContext(), ConstantFolder()) {
if (VecTy) {
assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
assert((DL.getTypeAllocSizeInBits(ElementTy) % 8) == 0 && // HLSL Change
"Only multiple-of-8 sized vector elements are viable");
++NumVectorized;
}
Expand Down
154 changes: 154 additions & 0 deletions test/Transforms/GVN/min-precision-padding.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
; RUN: opt < %s -basicaa -gvn -S | FileCheck %s

; Regression test for min precision vector GVN miscompilation.
; DXC's data layout pads i16 to 32 bits (i16:32). GVN must not:
; 1. Coerce padded vector types via bitcast (CanCoerceMustAliasedValueToLoad)
; 2. Forward a zeroinitializer store past partial element stores (processLoad)
;
; Without the fix, GVN would forward the zeroinitializer vector load, producing
; incorrect all-zero results for elements that were individually written.

target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
target triple = "dxil-ms-dx"

; Test 1: GVN must not forward zeroinitializer past element store for <3 x i16>.
; The store of zeroinitializer to %dst is followed by an element store to
; %dst[0], then a vector load of %dst. GVN must not replace the vector load
; with the zeroinitializer.

; CHECK-LABEL: @test_no_forward_i16_vec3
; CHECK: store <3 x i16> zeroinitializer
; CHECK: store i16 %val
; The vector load must survive — GVN must not replace it with zeroinitializer.
; CHECK: %result = load <3 x i16>
; CHECK: ret <3 x i16> %result
define <3 x i16> @test_no_forward_i16_vec3(i16 %val) {
entry:
; Under the i16:32 datalayout above each lane occupies 4 bytes in memory.
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%elem0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
; Partial overwrite of lane 0 after the zero fill; GVN must not forward the
; earlier zeroinitializer to the full-vector load below.
store i16 %val, i16* %elem0, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 2: Same pattern with <3 x half> (f16:32 padding).

; CHECK-LABEL: @test_no_forward_f16_vec3
; CHECK: store <3 x half> zeroinitializer
; CHECK: store half %val
; CHECK: %result = load <3 x half>
; CHECK: ret <3 x half> %result
define <3 x half> @test_no_forward_f16_vec3(half %val) {
entry:
; Same shape as test 1 but with half lanes, which f16:32 also pads to 4 bytes.
%dst = alloca <3 x half>, align 4
store <3 x half> zeroinitializer, <3 x half>* %dst, align 4
%elem0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0
; Lane-0 overwrite that the zeroinitializer forwarding would incorrectly drop.
store half %val, half* %elem0, align 4
%result = load <3 x half>, <3 x half>* %dst, align 4
ret <3 x half> %result
}

; Test 3: Multiple element stores — all must survive.
; Stores to elements 0, 1, 2 of a <3 x i16> vector after zeroinitializer.

; CHECK-LABEL: @test_no_forward_i16_vec3_all_elems
; CHECK: store <3 x i16> zeroinitializer
; CHECK: store i16 %v0
; CHECK: store i16 %v1
; CHECK: store i16 %v2
; CHECK: %result = load <3 x i16>
; CHECK: ret <3 x i16> %result
define <3 x i16> @test_no_forward_i16_vec3_all_elems(i16 %v0, i16 %v1, i16 %v2) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
; Every lane is rewritten after the zero fill; all three scalar stores must
; survive and the vector load must not be folded to zeroinitializer.
%e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 4: Coercion rejection — store a <3 x i16> vector, load as different type.
; GVN must not attempt bitcast coercion on padded types.
; If coercion happened, the load would be eliminated and replaced with a bitcast.

; CHECK-LABEL: @test_no_coerce_i16_vec3
; CHECK: store <3 x i16>
; CHECK: load i96
; CHECK-NOT: bitcast
; CHECK: ret
define i96 @test_no_coerce_i16_vec3(<3 x i16> %v) {
entry:
%ptr = alloca <3 x i16>, align 4
store <3 x i16> %v, <3 x i16>* %ptr, align 4
; Reinterpret the slot at the vector's primitive width (3 x 16 = 96 bits).
; With i16:32 padding the stored size differs, so GVN must keep this load
; rather than coerce the stored vector through a bitcast.
%iptr = bitcast <3 x i16>* %ptr to i96*
%result = load i96, i96* %iptr, align 4
ret i96 %result
}

; Test 5: Long vector variant — <5 x i16> (exceeds 4-element native size).

; CHECK-LABEL: @test_no_forward_i16_vec5
; CHECK: store <5 x i16> zeroinitializer
; CHECK: store i16 %val
; CHECK: %result = load <5 x i16>
; CHECK: ret <5 x i16> %result
define <5 x i16> @test_no_forward_i16_vec5(i16 %val) {
entry:
; Long-vector variant: 5 lanes, beyond the 4-element native size.
%dst = alloca <5 x i16>, align 4
store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4
%elem0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0
; Lane-0 overwrite; the zeroinitializer must not be forwarded past it.
store i16 %val, i16* %elem0, align 4
%result = load <5 x i16>, <5 x i16>* %dst, align 4
ret <5 x i16> %result
}

; Test 6: Long vector variant — <8 x half>.

; CHECK-LABEL: @test_no_forward_f16_vec8
; CHECK: store <8 x half> zeroinitializer
; CHECK: store half %val
; CHECK: %result = load <8 x half>
; CHECK: ret <8 x half> %result
define <8 x half> @test_no_forward_f16_vec8(half %val) {
entry:
; Long-vector half variant: 8 lanes, each padded to 4 bytes by f16:32.
%dst = alloca <8 x half>, align 4
store <8 x half> zeroinitializer, <8 x half>* %dst, align 4
%elem0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0
; Lane-0 overwrite; the zeroinitializer must not be forwarded past it.
store half %val, half* %elem0, align 4
%result = load <8 x half>, <8 x half>* %dst, align 4
ret <8 x half> %result
}

; Test 7: Same-type store-to-load forwarding must still work for padded types.
; GVN should forward %v directly — no intervening writes, same type.

; CHECK-LABEL: @test_same_type_forward_i16_vec3
; The load should be eliminated and %v returned directly.
; CHECK-NOT: load
; CHECK: ret <3 x i16> %v
define <3 x i16> @test_same_type_forward_i16_vec3(<3 x i16> %v) {
entry:
; Positive case: same type, no intervening writes — GVN should still forward
; %v and eliminate the load even though the element type is padded.
%ptr = alloca <3 x i16>, align 4
store <3 x i16> %v, <3 x i16>* %ptr, align 4
%result = load <3 x i16>, <3 x i16>* %ptr, align 4
ret <3 x i16> %result
}

; Test 8: Same-type forwarding for <3 x half>.

; CHECK-LABEL: @test_same_type_forward_f16_vec3
; CHECK-NOT: load
; CHECK: ret <3 x half> %v
define <3 x half> @test_same_type_forward_f16_vec3(<3 x half> %v) {
entry:
; Positive case for half: same-type forwarding must remain intact.
%ptr = alloca <3 x half>, align 4
store <3 x half> %v, <3 x half>* %ptr, align 4
%result = load <3 x half>, <3 x half>* %ptr, align 4
ret <3 x half> %result
}
149 changes: 149 additions & 0 deletions test/Transforms/SROA/min-precision-padding.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
; RUN: opt < %s -sroa -S | FileCheck %s

; Regression test for SROA miscompilation of min precision vector element access.
; DXC's data layout pads i16/f16 to 32 bits (i16:32, f16:32), so GEP offsets
; between vector elements are 4 bytes apart. SROA must use alloc size (not
; primitive size) for element stride, otherwise element stores get misplaced.

target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
target triple = "dxil-ms-dx"

; Test 1: Element-wise write to <3 x i16> vector.
; SROA must map GEP byte offsets to correct element indices using alloc size
; (4 bytes per i16), not primitive size (2 bytes). All stores must survive
; with correct indices, and the final vector load must be preserved.

; CHECK-LABEL: @test_sroa_i16_vec3
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 0
; CHECK: store i16 %v0
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %v1
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2
; CHECK: store i16 %v2
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
define <3 x i16> @test_sroa_i16_vec3(i16 %v0, i16 %v1, i16 %v2) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
; GEP element offsets are 4 bytes apart under i16:32; SROA must divide byte
; offsets by the alloc size (4), not the primitive size (2), to recover the
; lane index for each store.
%e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 2: Same pattern with <3 x half> (f16:32 padding).

; CHECK-LABEL: @test_sroa_f16_vec3
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 0
; CHECK: store half %v0
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 1
; CHECK: store half %v1
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 2
; CHECK: store half %v2
; CHECK: load <3 x half>
; CHECK: ret <3 x half>
define <3 x half> @test_sroa_f16_vec3(half %v0, half %v1, half %v2) {
entry:
; Same lane-indexing check as test 1, with half lanes padded by f16:32.
%dst = alloca <3 x half>, align 4
store <3 x half> zeroinitializer, <3 x half>* %dst, align 4
%e0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0
store half %v0, half* %e0, align 4
%e1 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 1
store half %v1, half* %e1, align 4
%e2 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 2
store half %v2, half* %e2, align 4
%result = load <3 x half>, <3 x half>* %dst, align 4
ret <3 x half> %result
}

; Test 3: Partial write — only element 1 is stored. SROA must index it correctly.

; CHECK-LABEL: @test_sroa_i16_vec3_elem1
; Element 1 store must be correctly placed at GEP index 1, not index 2.
; Without the fix, byte offset 4 / prim_size 2 = index 2 (wrong).
; With the fix, byte offset 4 / alloc_size 4 = index 1 (correct).
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %val
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
define <3 x i16> @test_sroa_i16_vec3_elem1(i16 %val) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
; Lane 1 sits at byte offset 4; with the primitive-size stride (2 bytes) SROA
; would misplace this store at lane 2.
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %val, i16* %e1, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 4: Element 2 store — verifies highest index is correct.

; CHECK-LABEL: @test_sroa_i16_vec3_elem2
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2
; CHECK: store i16 %val
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
define <3 x i16> @test_sroa_i16_vec3_elem2(i16 %val) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
; Highest lane (byte offset 8) — verifies the top of the index range maps
; back to lane 2 under the 4-byte alloc-size stride.
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %val, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 5: Long vector — <5 x i16> (exceeds 4-element native size).

; CHECK-LABEL: @test_sroa_i16_vec5
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 0
; CHECK: store i16 %v0
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %v1
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 4
; CHECK: store i16 %v4
; CHECK: load <5 x i16>
; CHECK: ret <5 x i16>
define <5 x i16> @test_sroa_i16_vec5(i16 %v0, i16 %v1, i16 %v2, i16 %v3, i16 %v4) {
entry:
; Long-vector variant: 5 lanes, each at a 4-byte stride; all five stores must
; keep their original lane indices after SROA.
%dst = alloca <5 x i16>, align 4
store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4
%e0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%e3 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 3
store i16 %v3, i16* %e3, align 4
%e4 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 4
store i16 %v4, i16* %e4, align 4
%result = load <5 x i16>, <5 x i16>* %dst, align 4
ret <5 x i16> %result
}

; Test 6: Long vector — <8 x half>.

; CHECK-LABEL: @test_sroa_f16_vec8_partial
; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 0
; CHECK: store half %v0
; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 7
; CHECK: store half %v7
; CHECK: load <8 x half>
; CHECK: ret <8 x half>
define <8 x half> @test_sroa_f16_vec8_partial(half %v0, half %v7) {
entry:
; Partial writes at both ends of an 8-lane half vector: lane 0 and lane 7
; (byte offset 28 under the 4-byte stride) must both land on the right index.
%dst = alloca <8 x half>, align 4
store <8 x half> zeroinitializer, <8 x half>* %dst, align 4
%e0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0
store half %v0, half* %e0, align 4
%e7 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 7
store half %v7, half* %e7, align 4
%result = load <8 x half>, <8 x half>* %dst, align 4
ret <8 x half> %result
}
Loading