Skip to content

Commit 2c6ad34

Browse files
author
CKI KWF Bot
committed
Merge: arm64: Add BBM Level 2 cpu feature
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-10/-/merge_requests/1662 JIRA: https://issues.redhat.com/browse/RHEL-122955 ## Summary of Changes The following series adds the Break-Before-Make (BBM) Level 2 CPU feature to the Linux kernel and enables it on AmpereOne CPUs. Depends: !1400 Depends: !1565 Signed-off-by: Marcin Juszkiewicz <mjuszkiewicz@redhat.com> Approved-by: Mark Salter <msalter@redhat.com> Approved-by: Jerry Snitselaar <jsnitsel@redhat.com> Approved-by: Eder Zulian <ezulian@redhat.com> Approved-by: Rafael Aquini <raquini@redhat.com> Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com> Merged-by: CKI GitLab Kmaint Pipeline Bot <26919896-cki-kmaint-pipeline-bot@users.noreply.gitlab.com>
2 parents a48df9e + db23c0a commit 2c6ad34

File tree

6 files changed

+151
-1
lines changed

6 files changed

+151
-1
lines changed

arch/arm64/include/asm/cputype.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@
129129

130130
#define NVIDIA_CPU_PART_DENVER 0x003
131131
#define NVIDIA_CPU_PART_CARMEL 0x004
132+
#define NVIDIA_CPU_PART_OLYMPUS 0x010
132133

133134
#define FUJITSU_CPU_PART_A64FX 0x001
134135

@@ -219,6 +220,7 @@
219220

220221
#define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER)
221222
#define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL)
223+
#define MIDR_NVIDIA_OLYMPUS MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_OLYMPUS)
222224
#define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX)
223225
#define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110)
224226
#define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09)

arch/arm64/kernel/cpufeature.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2235,6 +2235,9 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco
22352235
static const struct midr_range supports_bbml2_noabort_list[] = {
22362236
MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf),
22372237
MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf),
2238+
MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS),
2239+
MIDR_ALL_VERSIONS(MIDR_AMPERE1),
2240+
MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
22382241
{}
22392242
};
22402243

arch/arm64/mm/contpte.c

Lines changed: 138 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,144 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr,
6868
pte = pte_mkyoung(pte);
6969
}
7070

71-
__flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
71+
/*
72+
* On eliding the __flush_tlb_range() under BBML2+noabort:
73+
*
74+
* NOTE: Instead of using N=16 as the contiguous block length, we use
75+
* N=4 for clarity.
76+
*
77+
* NOTE: 'n' and 'c' are used to denote the "contiguous bit" being
78+
* unset and set, respectively.
79+
*
80+
* We worry about two cases where contiguous bit is used:
81+
* - When folding N smaller non-contiguous ptes as 1 contiguous block.
82+
* - When unfolding a contiguous block into N smaller non-contiguous ptes.
83+
*
84+
* Currently, the BBML0 folding case looks as follows:
85+
*
86+
* 0) Initial page-table layout:
87+
*
88+
* +----+----+----+----+
89+
* |RO,n|RO,n|RO,n|RW,n| <--- last page being set as RO
90+
* +----+----+----+----+
91+
*
92+
* 1) Aggregate AF + dirty flags using __ptep_get_and_clear():
93+
*
94+
* +----+----+----+----+
95+
* | 0 | 0 | 0 | 0 |
96+
* +----+----+----+----+
97+
*
98+
* 2) __flush_tlb_range():
99+
*
100+
* |____ tlbi + dsb ____|
101+
*
102+
* 3) __set_ptes() to repaint contiguous block:
103+
*
104+
* +----+----+----+----+
105+
* |RO,c|RO,c|RO,c|RO,c|
106+
* +----+----+----+----+
107+
*
108+
* 4) The kernel will eventually __flush_tlb() for changed page:
109+
*
110+
* |____| <--- tlbi + dsb
111+
*
112+
* As expected, the intermediate tlbi+dsb ensures that other PEs
113+
* only ever see an invalid (0) entry, or the new contiguous TLB entry.
114+
* The final tlbi+dsb will always throw away the newly installed
115+
* contiguous TLB entry, which is a micro-optimisation opportunity,
116+
* but does not affect correctness.
117+
*
118+
* In the BBML2 case, the change is avoiding the intermediate tlbi+dsb.
119+
* This means a few things, but notably other PEs will still "see" any
120+
* stale cached TLB entries. This could lead to a "contiguous bit
121+
* misprogramming" issue until the final tlbi+dsb of the changed page,
122+
* which would clear out both the stale (RW,n) entry and the new (RO,c)
123+
* contiguous entry installed in its place.
124+
*
125+
* In other words, the situation is as follows:
126+
*
127+
* +----+----+----+----+
128+
* |RO,n|RO,n|RO,n|RW,n| <--- old page tables, all non-contiguous
129+
* +----+----+----+----+
130+
*
131+
* +----+----+----+----+
132+
* |RO,c|RO,c|RO,c|RO,c| <--- new page tables, all contiguous
133+
* +----+----+----+----+
134+
* /\
135+
* ||
136+
*
137+
* If both the old single (RW,n) and new contiguous (RO,c) TLB entries
138+
* are present, and a write is made to this address, do we fault or
139+
* is the write permitted (via amalgamation)?
140+
*
141+
* The relevant Arm ARM DDI 0487L.a requirements are RNGLXZ and RJQQTC,
142+
* and together state that when BBML1 or BBML2 are implemented, either
143+
* a TLB conflict abort is raised (which we expressly forbid), or will
144+
* "produce an OA, access permissions, and memory attributes that are
145+
* consistent with any of the programmed translation table values".
146+
*
147+
* That is to say, will either raise a TLB conflict, or produce one of
148+
* the cached TLB entries, but never amalgamate.
149+
*
150+
* Thus, as the page tables are only considered "consistent" after
151+
* the final tlbi+dsb (which evicts both the single stale (RW,n) TLB
152+
* entry as well as the new contiguous (RO,c) TLB entry), omitting the
153+
* initial tlbi+dsb is correct.
154+
*
155+
* It is also important to note that at the end of the BBML2 folding
156+
* case, we are still left with potentially all N TLB entries still
157+
* cached (the N-1 non-contiguous ptes, and the single contiguous
158+
* block). However, over time, natural TLB pressure will cause the
159+
* non-contiguous pte TLB entries to be flushed, leaving only the
160+
* contiguous block TLB entry. This means that omitting the tlbi+dsb is
161+
* not only correct, but also keeps our eventual performance benefits.
162+
*
163+
* For the unfolding case, BBML0 looks as follows:
164+
*
165+
* 0) Initial page-table layout:
166+
*
167+
* +----+----+----+----+
168+
* |RW,c|RW,c|RW,c|RW,c| <--- last page being set as RO
169+
* +----+----+----+----+
170+
*
171+
* 1) Aggregate AF + dirty flags using __ptep_get_and_clear():
172+
*
173+
* +----+----+----+----+
174+
* | 0 | 0 | 0 | 0 |
175+
* +----+----+----+----+
176+
*
177+
* 2) __flush_tlb_range():
178+
*
179+
* |____ tlbi + dsb ____|
180+
*
181+
* 3) __set_ptes() to repaint as non-contiguous:
182+
*
183+
* +----+----+----+----+
184+
* |RW,n|RW,n|RW,n|RW,n|
185+
* +----+----+----+----+
186+
*
187+
* 4) Update changed page permissions:
188+
*
189+
* +----+----+----+----+
190+
* |RW,n|RW,n|RW,n|RO,n| <--- last page permissions set
191+
* +----+----+----+----+
192+
*
193+
* 5) The kernel will eventually __flush_tlb() for changed page:
194+
*
195+
* |____| <--- tlbi + dsb
196+
*
197+
* For BBML2, we again remove the intermediate tlbi+dsb. Here, there
198+
* are no issues, as the final tlbi+dsb covering the changed page is
199+
* guaranteed to remove the original large contiguous (RW,c) TLB entry,
200+
* as well as the intermediate (RW,n) TLB entry; the next access will
201+
* install the new (RO,n) TLB entry and the page tables are only
202+
* considered "consistent" after the final tlbi+dsb, so software must
203+
* be prepared for this inconsistency prior to finishing the mm dance
204+
* regardless.
205+
*/
206+
207+
if (!system_supports_bbml2_noabort())
208+
__flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
72209

73210
__set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
74211
}

drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,9 @@ bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
222222
feat_mask |= ARM_SMMU_FEAT_VAX;
223223
}
224224

225+
if (system_supports_bbml2_noabort())
226+
feat_mask |= ARM_SMMU_FEAT_BBML2;
227+
225228
if ((smmu->features & feat_mask) != feat_mask)
226229
return false;
227230

drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4443,6 +4443,9 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
44434443
if (FIELD_GET(IDR3_FWB, reg))
44444444
smmu->features |= ARM_SMMU_FEAT_S2FWB;
44454445

4446+
if (FIELD_GET(IDR3_BBM, reg) == 2)
4447+
smmu->features |= ARM_SMMU_FEAT_BBML2;
4448+
44464449
/* IDR5 */
44474450
reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);
44484451

drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ struct arm_smmu_device;
6060
#define ARM_SMMU_IDR3 0xc
6161
#define IDR3_FWB (1 << 8)
6262
#define IDR3_RIL (1 << 10)
63+
#define IDR3_BBM GENMASK(12, 11)
6364

6465
#define ARM_SMMU_IDR5 0x14
6566
#define IDR5_STALL_MAX GENMASK(31, 16)
@@ -755,6 +756,7 @@ struct arm_smmu_device {
755756
#define ARM_SMMU_FEAT_HA (1 << 21)
756757
#define ARM_SMMU_FEAT_HD (1 << 22)
757758
#define ARM_SMMU_FEAT_S2FWB (1 << 23)
759+
#define ARM_SMMU_FEAT_BBML2 (1 << 24)
758760
u32 features;
759761

760762
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)

0 commit comments

Comments
 (0)