Re: [PATCH v13 057/113] KVM: TDX: MTRR: implement get_mt_mask() for TDX

From: Isaku Yamahata
Date: Fri Mar 24 2023 - 21:12:11 EST


On Thu, Mar 16, 2023 at 10:38:02AM +0000,
"Huang, Kai" <kai.huang@xxxxxxxxx> wrote:

> On Sun, 2023-03-12 at 10:56 -0700, isaku.yamahata@xxxxxxxxx wrote:
> > From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> >
> > Because TDX virtualizes cpuid[0x1].EDX[MTRR: bit 12] as fixed to 1, the
> > guest TD thinks MTRR is supported. Although TDX supports only WB for
> > private GPA, it's desirable to support MTRR for shared GPA. As guest
> > accesses to MTRR MSRs cause #VE and KVM/x86 tracks the values of MTRR
> > MSRs, the remaining part is to implement the get_mt_mask method for TDX
> > for shared GPA.
> >
> > Pass the shared bit from the kvm fault handler to the get_mt_mask method
> > so that it can determine whether the gfn is shared or private.
> >
>
> I think we have an Xarray to query whether a given GFN is shared or private?
> Can we use that?
>
> > Implement get_mt_mask() following the vmx case for shared GPA and return
> > WB for private GPA. The existing vmx_get_mt_mask() can't be directly used
> > as the CPU state (CR0.CD) is protected. The GFN passed to
> > kvm_mtrr_check_gfn_range_consistency() should include the shared bit.
> >
> > Suggested-by: Kai Huang <kai.huang@xxxxxxxxx>
>
> I am not sure what I suggested here?
>
> I thought what I suggested was to have a dedicated patch handling MTRR for
> TDX, putting all the related things together.

Sure. After looking at the specs, my conclusion is that a guest TD can't
update MTRRs reliably, because an MTRR update requires disabling the cache
(CR0.CD=1), a cache flush, and a TLB flush (SDM 3a 12.11.7: MTRR maintenance
programming interface). Linux implements its MTRR update logic accordingly.
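
For reference, a minimal sketch of that update sequence as a guest kernel
would execute it (simplified from the SDM; IRQ disabling, the no-fill-mode
details and MP synchronization are omitted, and read_cr0()/write_cr0(),
wbinvd(), __flush_tlb_all() and wrmsrl() are the usual kernel-internal
helpers). In a TD, the sequence breaks at the very first step:

	/* Simplified MTRR maintenance sequence per SDM 3a 12.11.7. */
	static void mtrr_update_sketch(u32 msr, u64 val)
	{
		unsigned long cr0 = read_cr0();

		write_cr0(cr0 | X86_CR0_CD);	/* disable cache: #GP in a TD */
		wbinvd();			/* flush caches */
		__flush_tlb_all();		/* flush TLBs */
		wrmsrl(msr, val);		/* update the MTRR MSR */
		__flush_tlb_all();		/* flush TLBs again */
		write_cr0(cr0);			/* restore CR0, re-enable cache */
	}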

TDX enforces CR0.CD=0, so trying to set CR0.CD=1 results in #GP. At the same
time, TDX reports via CPUID that MTRR is available. So I came up with the
following:

- MTRRcap (RO): report no features available
  SMRR=0: SMRR interface unsupported
  WC=0: write-combining unsupported
  FIX=0: fixed-range registers unsupported
  VCNT=0: number of variable-range registers = 0

- MTRRdefType (R/W): only writeback, even in the reset state
  E=1: enable MTRR (E=0 means all memory is UC)
  FE=0: disable fixed-range MTRRs
  type: default memory type = writeback
  Writes of this value are accepted; any other value results in #GP.

- Fixed-range MTRRs
  #GP, as fixed-range MTRRs are reported as unsupported

- Variable-range MTRRs
  #GP, as the number of variable-range MTRRs is reported as zero
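
To illustrate, the guest-observable behavior under that policy would be as
follows (a hypothetical sketch; rdmsr()/wrmsr() here are simplified
single-value accessors for illustration, not the real two-register
interface):

	cap = rdmsr(MSR_MTRRcap);	/* 0: SMRR=0, WC=0, FIX=0, VCNT=0 */
	def = rdmsr(MSR_MTRRdefType);	/* 0x806 = (1 << 11) | MTRR_TYPE_WRBACK */

	wrmsr(MSR_MTRRdefType, 0x806);		/* accepted: E=1, FE=0, type=WB */
	wrmsr(MSR_MTRRdefType, 0x0);		/* #GP: E=0 (all UC) not allowed */
	wrmsr(MSR_MTRRfix64K_00000, 0x0);	/* #GP: fixed range unsupported */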


> > +u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
> > +{
> > +	/* TDX private GPA is always WB. */
> > +	if (!(gfn & kvm_gfn_shared_mask(vcpu->kvm))) {
> > +		/* MMIO is only for shared GPA. */
> > +		WARN_ON_ONCE(is_mmio);
> > +		return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
> > +	}
> > +
> > +	/* Drop shared bit as MTRR doesn't know about shared bit. */
> > +	gfn = kvm_gfn_to_private(vcpu->kvm, gfn);
> > +
> > +	/* As TDX enforces CR0.CD to 0, pass check_cr0_cd = false. */
> > +	return __vmx_get_mt_mask(vcpu, gfn, is_mmio, false);
> > +}
>
>
> Do you know whether there's any use case of non-coherent device assignment
> to a TDX guest?
>
> IMHO, we should just disallow non-coherent device assignment for TDX
> guests, so that we can just return WB for both private and shared.
>
> If we support non-coherent device assignment, then if the guest sets
> private memory to a non-WB memory type, it believes the memory type is
> non-WB, but in fact TDX always maps private memory as WB.
>
> Will this be a problem, i.e. if an assigned device can DMA to private
> memory directly in the future?

MTRR is a legacy feature that PAT has replaced. We can rely on the guest to
use PAT.
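
For example, a guest that needs a non-WB mapping already goes through PAT
rather than MTRRs (a sketch using existing guest kernel APIs; bar_phys,
bar_size, buf and nr_pages are placeholders, and the right API depends on
the use case):

	void __iomem *mmio = ioremap(bar_phys, bar_size);	/* UC via PAT */
	int ret = set_memory_wc((unsigned long)buf, nr_pages);	/* WC via PAT */
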
Here is the new patch for MTRR.

--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -229,6 +229,14 @@ static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 	vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
 }
 
+static u8 vt_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+	if (is_td_vcpu(vcpu))
+		return tdx_get_mt_mask(vcpu, gfn, is_mmio);
+
+	return vmx_get_mt_mask(vcpu, gfn, is_mmio);
+}
+
 static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 {
 	if (!is_td(kvm))
@@ -349,7 +357,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.set_identity_map_addr = vmx_set_identity_map_addr,
-	.get_mt_mask = vmx_get_mt_mask,
+	.get_mt_mask = vt_get_mt_mask,
 
 	.get_exit_info = vmx_get_exit_info,
diff -u b/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
--- b/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -347,6 +347,25 @@
 	return 0;
 }
 
+u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+	/* TDX private GPA is always WB. */
+	if (!(gfn & kvm_gfn_shared_mask(vcpu->kvm))) {
+		/* MMIO is only for shared GPA. */
+		WARN_ON_ONCE(is_mmio);
+		return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
+	}
+
+	if (is_mmio)
+		return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+
+	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+		return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+
+	/* TDX enforces CR0.CD = 0 and KVM MTRR emulation enforces writeback. */
+	return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
+}
+
 int tdx_vcpu_create(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -1256,6 +1275,45 @@
 	return ret;
 }
 
+static int tdx_vcpu_init_mtrr(struct kvm_vcpu *vcpu)
+{
+	struct msr_data msr;
+	int ret;
+	int i;
+
+	/*
+	 * To avoid confusion with reporting VCNT = 0, explicitly disable the
+	 * variable-range registers.
+	 */
+	for (i = 0; i < KVM_NR_VAR_MTRR; i++) {
+		/* MTRRphysMaskN */
+		msr = (struct msr_data) {
+			.host_initiated = true,
+			.index = 0x200 + 2 * i + 1,
+			.data = 0,	/* valid = 0 to disable. */
+		};
+		ret = kvm_set_msr_common(vcpu, &msr);
+		if (ret)
+			return -EINVAL;
+	}
+
+	/* Set MTRR to use writeback on reset. */
+	msr = (struct msr_data) {
+		.host_initiated = true,
+		.index = MSR_MTRRdefType,
+		/*
+		 * Set E (enable MTRR) = 1, FE (enable fixed-range MTRR) = 0,
+		 * default type = writeback on reset to avoid UC.  Note E=0
+		 * means all memory is UC.
+		 */
+		.data = (1 << 11) | MTRR_TYPE_WRBACK,
+	};
+	ret = kvm_set_msr_common(vcpu, &msr);
+	if (ret)
+		return -EINVAL;
+	return 0;
+}
+
 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
 {
 	struct msr_data apic_base_msr;
@@ -1293,6 +1351,10 @@
 	if (kvm_set_apic_base(vcpu, &apic_base_msr))
 		return -EINVAL;
 
+	ret = tdx_vcpu_init_mtrr(vcpu);
+	if (ret)
+		return ret;
+
 	ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data);
 	if (ret)
 		return ret;
@@ -1676,7 +1738,9 @@
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_ARCH_CAPABILITIES:
 	case MSR_IA32_POWER_CTL:
+	case MSR_MTRRcap:
 	case MSR_IA32_CR_PAT:
+	case MSR_MTRRdefType:
 	case MSR_IA32_TSC_DEADLINE:
 	case MSR_IA32_MISC_ENABLE:
 	case MSR_PLATFORM_INFO:
@@ -1718,16 +1782,47 @@
 
 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
-	if (tdx_has_emulated_msr(msr->index, false))
-		return kvm_get_msr_common(vcpu, msr);
-	return 1;
+	switch (msr->index) {
+	case MSR_MTRRcap:
+		/*
+		 * Override kvm_mtrr_get_msr() which hardcodes the value.
+		 * Report SMRR = 0, WC = 0, FIX = 0, VCNT = 0 to disable MTRR
+		 * effectively.
+		 */
+		msr->data = 0;
+		return 0;
+	default:
+		if (tdx_has_emulated_msr(msr->index, false))
+			return kvm_get_msr_common(vcpu, msr);
+		return 1;
+	}
 }
 
 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
-	if (tdx_has_emulated_msr(msr->index, true))
+	switch (msr->index) {
+	case MSR_MTRRdefType:
+		/*
+		 * Allow only writeback for all memory.
+		 * Because fixed-range MTRRs are reported as unsupported and
+		 * VCNT = 0, enforce MTRRdefType.FE = 0 and ignore the
+		 * variable-range MTRRs.  Only the default memory type matters.
+		 *
+		 * bit 11 E: MTRR enable/disable
+		 * bit 12 FE: fixed-range MTRRs enable/disable
+		 * (E, FE) = (1, 1): enable MTRRs and fixed-range MTRRs
+		 * (E, FE) = (1, 0): enable MTRRs, disable fixed-range MTRRs
+		 * (E, FE) = (0, *): disable all MTRRs; all physical memory
+		 *		     is UC
+		 */
+		if (msr->data != ((1 << 11) | MTRR_TYPE_WRBACK))
+			return 1;
 		return kvm_set_msr_common(vcpu, msr);
-	return 1;
+	default:
+		if (tdx_has_emulated_msr(msr->index, true))
+			return kvm_set_msr_common(vcpu, msr);
+		return 1;
+	}
 }
 
 int tdx_dev_ioctl(void __user *argp)
unchanged:
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -152,6 +152,7 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
 int tdx_vcpu_create(struct kvm_vcpu *vcpu);
 void tdx_vcpu_free(struct kvm_vcpu *vcpu);
 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 
 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
 
@@ -176,6 +177,7 @@ static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOP
 static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
 static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
 static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
+static inline u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; }
 
 static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }




--
Isaku Yamahata <isaku.yamahata@xxxxxxxxx>