Re: [RFC PATCH] KVM: x86: Fix APIC page invalidation race

From: Eiichi Tsukata
Date: Sat Jun 06 2020 - 01:01:41 EST


Hello

The race window I mentioned in the commit message is pretty small, so it's difficult to reproduce.
But with the following "delay" patch, it becomes very easy to reproduce.

```
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c17e6eb9ad43..b6728bf80a7d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,6 +55,7 @@
#include <linux/sched/stat.h>
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
+#include <linux/delay.h>

#include <trace/events/kvm.h>

@@ -8161,8 +8162,10 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
* Update it when it becomes invalid.
*/
apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (start <= apic_address && apic_address < end)
+ if (start <= apic_address && apic_address < end) {
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+ mdelay(1000);
+ }

return 0;
}
```

Steps to Reproduce:
- start a Windows VM (e.g. Windows Server 2016) and watch a YouTube video to stimulate VM_ENTER/EXIT
- run "stress --vm X --vm-bytes Y" to make the APIC page swapped out
- Windows OS will crash with BugCheck 0x109

Thanks,

Eiichi

> On Jun 6, 2020, at 13:26, Eiichi Tsukata <eiichi.tsukata@xxxxxxxxxxx> wrote:
>
> Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried to
> fix inappropriate APIC page invalidation by re-introducing arch specific
> kvm_arch_mmu_notifier_invalidate_range() and calling it from
> kvm_mmu_notifier_invalidate_range_start. But there could be the
> following race because VMCS APIC address cache can be updated
> *before* it is unmapped.
>
> Race:
> (Invalidator) kvm_mmu_notifier_invalidate_range_start()
> (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
> (KVM VCPU) vcpu_enter_guest()
> (KVM VCPU) kvm_vcpu_reload_apic_access_page()
> (Invalidator) actually unmap page
>
> Symptom:
> The above race can make Guest OS see already freed page and Guest OS
> will see broken APIC register values. Especially, Windows OS checks
> LAPIC modification so it can cause BSOD crash with BugCheck
> CRITICAL_STRUCTURE_CORRUPTION (109). These symptoms are the same as we
> previously saw in https://bugzilla.kernel.org/show_bug.cgi?id=197951 and
> we are currently seeing in
> https://bugzilla.redhat.com/show_bug.cgi?id=1751017 .
>
> To prevent Guest OS from accessing already freed page, this patch calls
> kvm_arch_mmu_notifier_invalidate_range() from
> kvm_mmu_notifier_invalidate_range() instead of ..._range_start().
>
> Fixes: b1394e745b94 ("KVM: x86: fix APIC page invalidation")
> Signed-off-by: Eiichi Tsukata <eiichi.tsukata@xxxxxxxxxxx>
> ---
> arch/x86/kvm/x86.c | 7 ++-----
> include/linux/kvm_host.h | 4 ++--
> virt/kvm/kvm_main.c | 26 ++++++++++++++++----------
> 3 files changed, 20 insertions(+), 17 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c17e6eb9ad43..1700aade39d1 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -8150,9 +8150,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
> kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
> }
>
> -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> - unsigned long start, unsigned long end,
> - bool blockable)
> +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> + unsigned long start, unsigned long end)
> {
> unsigned long apic_address;
>
> @@ -8163,8 +8162,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
> if (start <= apic_address && apic_address < end)
> kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
> -
> - return 0;
> }
>
> void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 131cc1527d68..92efa39ea3d7 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1406,8 +1406,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
> }
> #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
>
> -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> - unsigned long start, unsigned long end, bool blockable);
> +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> + unsigned long start, unsigned long end);
>
> #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
> int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 731c1e517716..77aa91fb08d2 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -155,10 +155,9 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
> static unsigned long long kvm_createvm_count;
> static unsigned long long kvm_active_vms;
>
> -__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> - unsigned long start, unsigned long end, bool blockable)
> +__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> + unsigned long start, unsigned long end)
> {
> - return 0;
> }
>
> bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
> @@ -384,6 +383,18 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> return container_of(mn, struct kvm, mmu_notifier);
> }
>
> +static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
> + struct mm_struct *mm,
> + unsigned long start, unsigned long end)
> +{
> + struct kvm *kvm = mmu_notifier_to_kvm(mn);
> + int idx;
> +
> + idx = srcu_read_lock(&kvm->srcu);
> + kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
> + srcu_read_unlock(&kvm->srcu, idx);
> +}
> +
> static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
> struct mm_struct *mm,
> unsigned long address,
> @@ -408,7 +419,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> {
> struct kvm *kvm = mmu_notifier_to_kvm(mn);
> int need_tlb_flush = 0, idx;
> - int ret;
>
> idx = srcu_read_lock(&kvm->srcu);
> spin_lock(&kvm->mmu_lock);
> @@ -425,14 +435,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> kvm_flush_remote_tlbs(kvm);
>
> spin_unlock(&kvm->mmu_lock);
> -
> - ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
> - range->end,
> - mmu_notifier_range_blockable(range));
> -
> srcu_read_unlock(&kvm->srcu, idx);
>
> - return ret;
> + return 0;
> }
>
> static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> @@ -538,6 +543,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .invalidate_range = kvm_mmu_notifier_invalidate_range,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
> --
> 2.21.3
>