Re: [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC

From: Wincy Van
Date: Tue Aug 09 2016 - 06:19:43 EST


On Tue, Aug 9, 2016 at 5:32 PM, Yang Zhang <yang.zhang.wz@xxxxxxxxx> wrote:
> On 2016/8/9 2:16, Radim KrÄmÃÅ wrote:
>>
>> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
>> accesses. In some configurations of VMX controls, the guest can even
>> directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED
>> APIC ACCESSES.
>>
>> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
>> To do so, L1 would first trick KVM to disable all possible interceptions
>> by enabling APICv features and then would turn those features off;
>> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
>> not intercept previously enabled MSRs even though they were not safe
>> with the new configuration.
>>
>> Correctly re-enabling interceptions is not enough as a second bug would
>> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
>> VMCSs, so L1 could trigger a race to get the desired combination of msr
>> bitmap and VMX controls.
>>
>> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
>> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
>> have to intercept everything anyway.
>>
>> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
>> Reported-by: Jim Mattson <jmattson@xxxxxxxxxx>
>> Suggested-by: Wincy Van <fanwenyi0529@xxxxxxxxx>
>> Signed-off-by: Radim KrÄmÃÅ <rkrcmar@xxxxxxxxxx>
>> ---
>> arch/x86/kvm/vmx.c | 107
>> ++++++++++++++++++++++-------------------------------
>> 1 file changed, 44 insertions(+), 63 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index a45d8580f91e..c66ac2c70d22 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -435,6 +435,8 @@ struct nested_vmx {
>> bool pi_pending;
>> u16 posted_intr_nv;
>>
>> + unsigned long *msr_bitmap;
>> +
>> struct hrtimer preemption_timer;
>> bool preemption_timer_expired;
>>
>> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
>> static unsigned long *vmx_msr_bitmap_longmode;
>> static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>> static unsigned long *vmx_msr_bitmap_longmode_x2apic;
>> -static unsigned long *vmx_msr_bitmap_nested;
>> static unsigned long *vmx_vmread_bitmap;
>> static unsigned long *vmx_vmwrite_bitmap;
>>
>> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu
>> *vcpu)
>> unsigned long *msr_bitmap;
>>
>> if (is_guest_mode(vcpu))
>> - msr_bitmap = vmx_msr_bitmap_nested;
>> + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
>> else if (cpu_has_secondary_exec_ctrls() &&
>> (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
>> SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
>> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
>> if (!vmx_msr_bitmap_longmode_x2apic)
>> goto out4;
>>
>> - if (nested) {
>> - vmx_msr_bitmap_nested =
>> - (unsigned long *)__get_free_page(GFP_KERNEL);
>> - if (!vmx_msr_bitmap_nested)
>> - goto out5;
>> - }
>> -
>> vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
>> if (!vmx_vmread_bitmap)
>> goto out6;
>> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
>>
>> memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
>> memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
>> - if (nested)
>> - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>>
>> if (setup_vmcs_config(&vmcs_config) < 0) {
>> r = -EIO;
>> @@ -6529,9 +6521,6 @@ out8:
>> out7:
>> free_page((unsigned long)vmx_vmread_bitmap);
>> out6:
>> - if (nested)
>> - free_page((unsigned long)vmx_msr_bitmap_nested);
>> -out5:
>> free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>> out4:
>> free_page((unsigned long)vmx_msr_bitmap_longmode);
>> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
>> free_page((unsigned long)vmx_io_bitmap_a);
>> free_page((unsigned long)vmx_vmwrite_bitmap);
>> free_page((unsigned long)vmx_vmread_bitmap);
>> - if (nested)
>> - free_page((unsigned long)vmx_msr_bitmap_nested);
>>
>> free_kvm_area();
>> }
>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>> return 1;
>> }
>>
>> + if (cpu_has_vmx_msr_bitmap()) {
>> + vmx->nested.msr_bitmap =
>> + (unsigned long
>> *)__get_free_page(GFP_KERNEL);
>> + if (!vmx->nested.msr_bitmap)
>> + goto out_msr_bitmap;
>> + }
>> +
>
>
> We export msr_bitmap to guest even it is not supported by hardware. So we
> need to allocate msr_bitmap for L1 unconditionally.
>
>

Nice to see you, Yang :-)

I think vmx->nested.msr_bitmap is filled by L0 and L1's settings, and
used by hardware.
L1's msr_bitmap is managed by itself, and L1 will write MSR_BITMAP field to
vmcs12->msr_bitmap, then L0 can get L1's settings to emulate MSR_BITMAP.

Am I missed something?

Thanks,
Wincy

>> vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
>> if (!vmx->nested.cached_vmcs12)
>> - return -ENOMEM;
>> + goto out_cached_vmcs12;
>>
>> if (enable_shadow_vmcs) {
>> shadow_vmcs = alloc_vmcs();
>> - if (!shadow_vmcs) {
>> - kfree(vmx->nested.cached_vmcs12);
>> - return -ENOMEM;
>> - }
>> + if (!shadow_vmcs)
>> + goto out_shadow_vmcs;
>> /* mark vmcs as shadow */
>> shadow_vmcs->revision_id |= (1u << 31);
>> /* init shadow vmcs */
>> @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>> skip_emulated_instruction(vcpu);
>> nested_vmx_succeed(vcpu);
>> return 1;
>> +
>> +out_shadow_vmcs:
>> + kfree(vmx->nested.cached_vmcs12);
>> +
>> +out_cached_vmcs12:
>> + free_page((unsigned long)vmx->nested.msr_bitmap);
>> +
>> +out_msr_bitmap:
>> + return -ENOMEM;
>> }
>>
>> /*
>> @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx)
>> vmx->nested.vmxon = false;
>> free_vpid(vmx->nested.vpid02);
>> nested_release_vmcs12(vmx);
>> + if (vmx->nested.msr_bitmap) {
>> + free_page((unsigned long)vmx->nested.msr_bitmap);
>> + vmx->nested.msr_bitmap = NULL;
>> + }
>> if (enable_shadow_vmcs)
>> free_vmcs(vmx->nested.current_shadow_vmcs);
>> kfree(vmx->nested.cached_vmcs12);
>> @@ -9472,8 +9477,10 @@ static inline bool
>> nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>> {
>> int msr;
>> struct page *page;
>> - unsigned long *msr_bitmap;
>> + unsigned long *msr_bitmap_l1;
>> + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
>>
>> + /* This shortcut is ok because we support only x2APIC MSRs so far.
>> */
>> if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
>> return false;
>>
>> @@ -9482,63 +9489,37 @@ static inline bool
>> nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>> WARN_ON(1);
>> return false;
>> }
>> - msr_bitmap = (unsigned long *)kmap(page);
>> - if (!msr_bitmap) {
>> + msr_bitmap_l1 = (unsigned long *)kmap(page);
>> + if (!msr_bitmap_l1) {
>> nested_release_page_clean(page);
>> WARN_ON(1);
>> return false;
>> }
>>
>> + memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
>> +
>> if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
>> if (nested_cpu_has_apic_reg_virt(vmcs12))
>> for (msr = 0x800; msr <= 0x8ff; msr++)
>> nested_vmx_disable_intercept_for_msr(
>> - msr_bitmap,
>> - vmx_msr_bitmap_nested,
>> + msr_bitmap_l1, msr_bitmap_l0,
>> msr, MSR_TYPE_R);
>> - /* TPR is allowed */
>> - nested_vmx_disable_intercept_for_msr(msr_bitmap,
>> - vmx_msr_bitmap_nested,
>> +
>> + nested_vmx_disable_intercept_for_msr(
>> + msr_bitmap_l1, msr_bitmap_l0,
>> APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>> MSR_TYPE_R | MSR_TYPE_W);
>> +
>> if (nested_cpu_has_vid(vmcs12)) {
>> - /* EOI and self-IPI are allowed */
>> nested_vmx_disable_intercept_for_msr(
>> - msr_bitmap,
>> - vmx_msr_bitmap_nested,
>> + msr_bitmap_l1, msr_bitmap_l0,
>> APIC_BASE_MSR + (APIC_EOI >> 4),
>> MSR_TYPE_W);
>> nested_vmx_disable_intercept_for_msr(
>> - msr_bitmap,
>> - vmx_msr_bitmap_nested,
>> + msr_bitmap_l1, msr_bitmap_l0,
>> APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
>> MSR_TYPE_W);
>> }
>> - } else {
>> - /*
>> - * Enable reading intercept of all the x2apic
>> - * MSRs. We should not rely on vmcs12 to do any
>> - * optimizations here, it may have been modified
>> - * by L1.
>> - */
>> - for (msr = 0x800; msr <= 0x8ff; msr++)
>> - __vmx_enable_intercept_for_msr(
>> - vmx_msr_bitmap_nested,
>> - msr,
>> - MSR_TYPE_R);
>> -
>> - __vmx_enable_intercept_for_msr(
>> - vmx_msr_bitmap_nested,
>> - APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>> - MSR_TYPE_W);
>> - __vmx_enable_intercept_for_msr(
>> - vmx_msr_bitmap_nested,
>> - APIC_BASE_MSR + (APIC_EOI >> 4),
>> - MSR_TYPE_W);
>> - __vmx_enable_intercept_for_msr(
>> - vmx_msr_bitmap_nested,
>> - APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
>> - MSR_TYPE_W);
>> }
>> kunmap(page);
>> nested_release_page_clean(page);
>> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
>> struct vmcs12 *vmcs12)
>> }
>>
>> if (cpu_has_vmx_msr_bitmap() &&
>> - exec_control & CPU_BASED_USE_MSR_BITMAPS) {
>> - nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
>> - /* MSR_BITMAP will be set by following vmx_set_efer. */
>> - } else
>> + exec_control & CPU_BASED_USE_MSR_BITMAPS &&
>> + nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
>> + ; /* MSR_BITMAP will be set by following vmx_set_efer. */
>> + else
>> exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
>>
>> /*
>>
>
>
> --
> Yang
> Alibaba Cloud Computing