Re: [RFC PATCH 2/3] x86/mm: make sure LAM is up-to-date during context switching

From: Kirill A. Shutemov
Date: Fri Mar 08 2024 - 09:05:48 EST


On Fri, Mar 08, 2024 at 01:47:39AM +0000, Yosry Ahmed wrote:
> I like it very much. The problem now is, as I told Dave, I realized I
> cannot do any testing beyond compilation due to lack of hardware. I am
> happy to send a next version if this is acceptable or if someone else
> can test.

I have non-upstreamable QEMU patch that adds LAM emulation, if it helps:

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 93b1ca810bf4..fe887a86a156 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1295,6 +1295,19 @@ void tlb_set_page(CPUState *cpu, vaddr addr,
prot, mmu_idx, size);
}

+
+static vaddr clean_addr(CPUState *cpu, vaddr addr)
+{
+ CPUClass *cc = CPU_GET_CLASS(cpu);
+
+ if (cc->tcg_ops->do_clean_addr) {
+ addr = cc->tcg_ops->do_clean_addr(cpu, addr);
+ }
+
+ return addr;
+}
+
+
/*
* Note: tlb_fill() can trigger a resize of the TLB. This means that all of the
* caller's prior references to the TLB table (e.g. CPUTLBEntry pointers) must
@@ -1867,9 +1880,10 @@ static bool mmu_lookup(CPUState *cpu, vaddr addr, MemOpIdx oi,
* Probe for an atomic operation. Do not allow unaligned operations,
* or io operations to proceed. Return the host address.
*/
-static void *atomic_mmu_lookup(CPUState *cpu, vaddr addr, MemOpIdx oi,
+static void *atomic_mmu_lookup(CPUState *cpu, vaddr address, MemOpIdx oi,
int size, uintptr_t retaddr)
{
+ vaddr addr = clean_addr(cpu, address);
uintptr_t mmu_idx = get_mmuidx(oi);
MemOp mop = get_memop(oi);
int a_bits = get_alignment_bits(mop);
@@ -2002,10 +2016,11 @@ static void *atomic_mmu_lookup(CPUState *cpu, vaddr addr, MemOpIdx oi,
* The bytes are concatenated in big-endian order with @ret_be.
*/
static uint64_t int_ld_mmio_beN(CPUState *cpu, CPUTLBEntryFull *full,
- uint64_t ret_be, vaddr addr, int size,
+ uint64_t ret_be, vaddr address, int size,
int mmu_idx, MMUAccessType type, uintptr_t ra,
MemoryRegion *mr, hwaddr mr_offset)
{
+ vaddr addr = clean_addr(cpu, address);
do {
MemOp this_mop;
unsigned this_size;
@@ -2543,10 +2558,11 @@ static Int128 do_ld16_mmu(CPUState *cpu, vaddr addr,
* return the bytes of @val_le beyond @p->size that have not been stored.
*/
static uint64_t int_st_mmio_leN(CPUState *cpu, CPUTLBEntryFull *full,
- uint64_t val_le, vaddr addr, int size,
+ uint64_t val_le, vaddr address, int size,
int mmu_idx, uintptr_t ra,
MemoryRegion *mr, hwaddr mr_offset)
{
+ vaddr addr = clean_addr(cpu, address);
do {
MemOp this_mop;
unsigned this_size;
diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
index bf8ff8e3eec1..eaa8e09a6226 100644
--- a/include/hw/core/tcg-cpu-ops.h
+++ b/include/hw/core/tcg-cpu-ops.h
@@ -140,6 +140,12 @@ struct TCGCPUOps {
MMUAccessType access_type,
int mmu_idx, uintptr_t retaddr);

+
+ /**
+ * @do_clean_addr: Callback for clearing metadata/tags from the address.
+ */
+ vaddr (*do_clean_addr)(CPUState *cpu, vaddr addr);
+
/**
* @adjust_watchpoint_address: hack for cpu_check_watchpoint used by ARM
*/
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 2666ef380891..1bbfd31042b2 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -739,7 +739,7 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
#define TCG_7_0_EDX_FEATURES (CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_KERNEL_FEATURES)

#define TCG_7_1_EAX_FEATURES (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | \
- CPUID_7_1_EAX_FSRC | CPUID_7_1_EAX_CMPCCXADD)
+ CPUID_7_1_EAX_FSRC | CPUID_7_1_EAX_CMPCCXADD | CPUID_7_1_EAX_LAM)
#define TCG_7_1_EDX_FEATURES 0
#define TCG_7_2_EDX_FEATURES 0
#define TCG_APM_FEATURES 0
@@ -968,7 +968,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
"fsrc", NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
NULL, "amx-fp16", NULL, "avx-ifma",
- NULL, NULL, NULL, NULL,
+ NULL, NULL, "lam", NULL,
NULL, NULL, NULL, NULL,
},
.cpuid = {
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 952174bb6f52..6ef9afd443b7 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -238,6 +238,9 @@ typedef enum X86Seg {
#define CR0_CD_MASK (1U << 30)
#define CR0_PG_MASK (1U << 31)

+#define CR3_LAM_U57 (1ULL << 61)
+#define CR3_LAM_U48 (1ULL << 62)
+
#define CR4_VME_MASK (1U << 0)
#define CR4_PVI_MASK (1U << 1)
#define CR4_TSD_MASK (1U << 2)
@@ -261,6 +264,7 @@ typedef enum X86Seg {
#define CR4_SMAP_MASK (1U << 21)
#define CR4_PKE_MASK (1U << 22)
#define CR4_PKS_MASK (1U << 24)
+#define CR4_LAM_SUP (1U << 28)

#define CR4_RESERVED_MASK \
(~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \
@@ -269,7 +273,8 @@ typedef enum X86Seg {
| CR4_OSFXSR_MASK | CR4_OSXMMEXCPT_MASK | CR4_UMIP_MASK \
| CR4_LA57_MASK \
| CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \
- | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK))
+ | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK \
+ | CR4_LAM_SUP ))

#define DR6_BD (1 << 13)
#define DR6_BS (1 << 14)
@@ -932,6 +937,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
#define CPUID_7_1_EAX_AMX_FP16 (1U << 21)
/* Support for VPMADD52[H,L]UQ */
#define CPUID_7_1_EAX_AVX_IFMA (1U << 23)
+/* Linear Address Masking */
+#define CPUID_7_1_EAX_LAM (1U << 26)

/* Support for VPDPB[SU,UU,SS]D[,S] */
#define CPUID_7_1_EDX_AVX_VNNI_INT8 (1U << 4)
@@ -2525,6 +2532,24 @@ static inline bool hyperv_feat_enabled(X86CPU *cpu, int feat)
return !!(cpu->hyperv_features & BIT(feat));
}

+static inline uint64_t cr3_reserved_bits(CPUX86State *env)
+{
+ uint64_t reserved_bits;
+
+ if (!(env->efer & MSR_EFER_LMA)) {
+ return 0;
+ }
+
+ reserved_bits = (~0ULL) << env_archcpu(env)->phys_bits;
+
+ if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM) {
+ reserved_bits &= ~(CR3_LAM_U48 | CR3_LAM_U57);
+ }
+
+ return reserved_bits;
+}
+
+
static inline uint64_t cr4_reserved_bits(CPUX86State *env)
{
uint64_t reserved_bits = CR4_RESERVED_MASK;
diff --git a/target/i386/helper.c b/target/i386/helper.c
index 2070dd0dda1f..4901c9c17b1e 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -262,7 +262,7 @@ hwaddr x86_cpu_get_phys_page_attrs_debug(CPUState *cs, vaddr addr,
}

if (la57) {
- pml5e_addr = ((env->cr[3] & ~0xfff) +
+ pml5e_addr = ((env->cr[3] & PG_ADDRESS_MASK) +
(((addr >> 48) & 0x1ff) << 3)) & a20_mask;
pml5e = x86_ldq_phys(cs, pml5e_addr);
if (!(pml5e & PG_PRESENT_MASK)) {
diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h
index effc2c1c9842..11f75ea475e3 100644
--- a/target/i386/tcg/helper-tcg.h
+++ b/target/i386/tcg/helper-tcg.h
@@ -84,6 +84,7 @@ bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
MMUAccessType access_type,
int mmu_idx, uintptr_t retaddr);
+vaddr x86_cpu_clean_addr(CPUState *cpu, vaddr addr);
#endif

/* cc_helper.c */
diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c
index 8f7011d96631..1bc71170e6a3 100644
--- a/target/i386/tcg/sysemu/excp_helper.c
+++ b/target/i386/tcg/sysemu/excp_helper.c
@@ -163,7 +163,7 @@ static bool mmu_translate(CPUX86State *env, const TranslateParams *in,
/*
* Page table level 5
*/
- pte_addr = (in->cr3 & ~0xfff) + (((addr >> 48) & 0x1ff) << 3);
+ pte_addr = (in->cr3 & PG_ADDRESS_MASK) + (((addr >> 48) & 0x1ff) << 3);
if (!ptw_translate(&pte_trans, pte_addr)) {
return false;
}
@@ -638,3 +638,30 @@ G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
X86CPU *cpu = X86_CPU(cs);
handle_unaligned_access(&cpu->env, vaddr, access_type, retaddr);
}
+
+
+static inline int64_t sign_extend64(uint64_t value, int index)
+{
+ int shift = 63 - index;
+ return (int64_t)(value << shift) >> shift;
+}
+
+vaddr x86_cpu_clean_addr(CPUState *cs, vaddr addr)
+{
+ CPUX86State *env = &X86_CPU(cs)->env;
+ bool la57 = env->cr[4] & CR4_LA57_MASK;
+
+ if (addr >> 63) {
+ if (env->cr[4] & CR4_LAM_SUP) {
+ return sign_extend64(addr, la57 ? 56 : 47);
+ }
+ } else {
+ if (env->cr[3] & CR3_LAM_U57) {
+ return sign_extend64(addr, 56);
+ } else if (env->cr[3] & CR3_LAM_U48) {
+ return sign_extend64(addr, 47);
+ }
+ }
+
+ return addr;
+}
diff --git a/target/i386/tcg/sysemu/misc_helper.c b/target/i386/tcg/sysemu/misc_helper.c
index edb7c3d89408..aecb523e777d 100644
--- a/target/i386/tcg/sysemu/misc_helper.c
+++ b/target/i386/tcg/sysemu/misc_helper.c
@@ -98,8 +98,7 @@ void helper_write_crN(CPUX86State *env, int reg, target_ulong t0)
cpu_x86_update_cr0(env, t0);
break;
case 3:
- if ((env->efer & MSR_EFER_LMA) &&
- (t0 & ((~0ULL) << env_archcpu(env)->phys_bits))) {
+ if (t0 & cr3_reserved_bits(env)) {
cpu_vmexit(env, SVM_EXIT_ERR, 0, GETPC());
}
if (!(env->efer & MSR_EFER_LMA)) {
diff --git a/target/i386/tcg/sysemu/svm_helper.c b/target/i386/tcg/sysemu/svm_helper.c
index 5d6de2294fa1..e981b124d975 100644
--- a/target/i386/tcg/sysemu/svm_helper.c
+++ b/target/i386/tcg/sysemu/svm_helper.c
@@ -305,8 +305,7 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend)
cpu_vmexit(env, SVM_EXIT_ERR, 0, GETPC());
}
new_cr3 = x86_ldq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, save.cr3));
- if ((env->efer & MSR_EFER_LMA) &&
- (new_cr3 & ((~0ULL) << cpu->phys_bits))) {
+ if (new_cr3 & cr3_reserved_bits(env)) {
cpu_vmexit(env, SVM_EXIT_ERR, 0, GETPC());
}
new_cr4 = x86_ldq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, save.cr4));
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index cca19cd40e81..8ceeb954364e 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -118,6 +118,7 @@ static const TCGCPUOps x86_tcg_ops = {
.record_sigbus = x86_cpu_record_sigbus,
#else
.tlb_fill = x86_cpu_tlb_fill,
+ .do_clean_addr = x86_cpu_clean_addr,
.do_interrupt = x86_cpu_do_interrupt,
.cpu_exec_halt = x86_cpu_exec_halt,
.cpu_exec_interrupt = x86_cpu_exec_interrupt,
--
Kiryl Shutsemau / Kirill A. Shutemov