Re: [PATCH v6 17/38] KVM: x86: hyper-v: L2 TLB flush

From: Maxim Levitsky
Date: Tue Jun 07 2022 - 05:47:19 EST


On Mon, 2022-06-06 at 10:36 +0200, Vitaly Kuznetsov wrote:
> Handle L2 TLB flush requests by going through all vCPUs and checking
> whether there are vCPUs running the same VM_ID with a VP_ID specified
> in the requests. Perform synthetic exit to L2 upon finish.
>
> Note, while checking VM_ID/VP_ID of running vCPUs seem to be a bit
> racy, we count on the fact that KVM flushes the whole L2 VPID upon
> transition. Also, KVM_REQ_HV_TLB_FLUSH request needs to be done upon
> transition between L1 and L2 to make sure all pending requests are
> always processed.
>
> For the reference, Hyper-V TLFS refers to the feature as "Direct
> Virtual Flush".
>
> Note, nVMX/nSVM code does not handle VMCALL/VMMCALL from L2 yet.
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
> ---
>  arch/x86/kvm/hyperv.c | 84 +++++++++++++++++++++++++++++++++++--------
>  arch/x86/kvm/hyperv.h | 14 ++++----
>  arch/x86/kvm/trace.h  | 21 ++++++-----
>  arch/x86/kvm/x86.c    |  4 +--
>  4 files changed, 91 insertions(+), 32 deletions(-)
>
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index 3075e9661696..740190917c1c 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -34,6 +34,7 @@
>  #include <linux/eventfd.h>
>  
>  #include <asm/apicdef.h>
> +#include <asm/mshyperv.h>
>  #include <trace/events/kvm.h>
>  
>  #include "trace.h"
> @@ -1835,9 +1836,10 @@ static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc
>                                   entries, consumed_xmm_halves, offset);
>  }
>  
> -static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, u64 *entries, int count)
> +static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
> +                                struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo,
> +                                u64 *entries, int count)
>  {
> -       struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
>         struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
>         u64 entry = KVM_HV_TLB_FLUSHALL_ENTRY;
>         unsigned long flags;
> @@ -1845,9 +1847,6 @@ static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, u64 *entries, int count)
>         if (!hv_vcpu)
>                 return;
>  
> -       /* kvm_hv_flush_tlb() is not ready to handle requests for L2s yet */
> -       tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo[HV_L1_TLB_FLUSH_FIFO];
> -
>         spin_lock_irqsave(&tlb_flush_fifo->write_lock, flags);
>  
>         /*
> @@ -1883,7 +1882,7 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
>                 return;
>         }
>  
> -       tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu);
> +       tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
>  
>         count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE);
>  
> @@ -1916,6 +1915,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>         struct hv_tlb_flush_ex flush_ex;
>         struct hv_tlb_flush flush;
>         DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
> +       struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
>         /*
>          * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
>          * entries on the TLB flush fifo. The last entry, however, needs to be
> @@ -1959,7 +1959,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>                 }
>  
>                 trace_kvm_hv_flush_tlb(flush.processor_mask,
> -                                      flush.address_space, flush.flags);
> +                                      flush.address_space, flush.flags,
> +                                      is_guest_mode(vcpu));
>  
>                 valid_bank_mask = BIT_ULL(0);
>                 sparse_banks[0] = flush.processor_mask;
> @@ -1990,7 +1991,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>                 trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
>                                           flush_ex.hv_vp_set.format,
>                                           flush_ex.address_space,
> -                                         flush_ex.flags);
> +                                         flush_ex.flags, is_guest_mode(vcpu));
>  
>                 valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
>                 all_cpus = flush_ex.hv_vp_set.format !=
> @@ -2028,19 +2029,57 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>          * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
>          * analyze it here, flush TLB regardless of the specified address space.
>          */
> -       if (all_cpus) {
> -               kvm_for_each_vcpu(i, v, kvm)
> -                       hv_tlb_flush_enqueue(v, tlb_flush_entries, hc->rep_cnt);
> +       if (all_cpus && !is_guest_mode(vcpu)) {
> +               kvm_for_each_vcpu(i, v, kvm) {
> +                       tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
> +                       hv_tlb_flush_enqueue(v, tlb_flush_fifo,
> +                                            tlb_flush_entries, hc->rep_cnt);
> +               }
>  
>                 kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
> -       } else {
> +       } else if (!is_guest_mode(vcpu)) {
>                 sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
>  
>                 for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
>                         v = kvm_get_vcpu(kvm, i);
>                         if (!v)
>                                 continue;
> -                       hv_tlb_flush_enqueue(v, tlb_flush_entries, hc->rep_cnt);
> +                       tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
> +                       hv_tlb_flush_enqueue(v, tlb_flush_fifo,
> +                                            tlb_flush_entries, hc->rep_cnt);
> +               }
> +
> +               kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
> +       } else {
> +               struct kvm_vcpu_hv *hv_v;
> +
> +               bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
> +
> +               kvm_for_each_vcpu(i, v, kvm) {
> +                       hv_v = to_hv_vcpu(v);
> +
> +                       /*
> +                        * The following check races with nested vCPUs entering/exiting
> +                        * and/or migrating between L1's vCPUs, however the only case when
> +                        * KVM *must* flush the TLB is when the target L2 vCPU keeps
> +                        * running on the same L1 vCPU from the moment of the request until
> +                        * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other
> +                        * cases, e.g. when the target L2 vCPU migrates to a different L1
> +                        * vCPU or when the corresponding L1 vCPU temporary switches to a
> +                        * different L2 vCPU while the request is being processed.
Looks great!

> +                        */
> +                       if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id)
> +                               continue;
> +
> +                       if (!all_cpus &&
> +                           !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask,
> +                                                   sparse_banks))
> +                               continue;
> +
> +                       __set_bit(i, vcpu_mask);
> +                       tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true);
> +                       hv_tlb_flush_enqueue(v, tlb_flush_fifo,
> +                                            tlb_flush_entries, hc->rep_cnt);
>                 }
>  
>                 kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
> @@ -2228,10 +2267,27 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
>  
>  static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
>  {
> +       int ret;
> +
>         trace_kvm_hv_hypercall_done(result);
>         kvm_hv_hypercall_set_result(vcpu, result);
>         ++vcpu->stat.hypercalls;
> -       return kvm_skip_emulated_instruction(vcpu);
> +       ret = kvm_skip_emulated_instruction(vcpu);
> +
> +       if (unlikely(hv_result_success(result) && is_guest_mode(vcpu)
> +                    && kvm_hv_is_tlb_flush_hcall(vcpu))) {
> +               struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> +               u32 tlb_lock_count;
> +
> +               if (unlikely(kvm_read_guest(vcpu->kvm, hv_vcpu->nested.pa_page_gpa,
> +                                           &tlb_lock_count, sizeof(tlb_lock_count))))
> +                       kvm_inject_gp(vcpu, 0);
> +
> +               if (tlb_lock_count)
> +                       kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu);
> +       }
> +
> +       return ret;
>  }
>  
>  static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
> index dc46c5ed5d18..7778b3a5913c 100644
> --- a/arch/x86/kvm/hyperv.h
> +++ b/arch/x86/kvm/hyperv.h
> @@ -148,26 +148,24 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
>  int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
>                      struct kvm_cpuid_entry2 __user *entries);
>  
> -static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu)
> +static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu,
> +                                                                          bool is_guest_mode)
>  {
>         struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> -       int i = !is_guest_mode(vcpu) ? HV_L1_TLB_FLUSH_FIFO :
> -                                      HV_L2_TLB_FLUSH_FIFO;
> -
> -       /* KVM does not handle L2 TLB flush requests yet */
> -       WARN_ON_ONCE(i != HV_L1_TLB_FLUSH_FIFO);
> +       int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO :
> +                               HV_L1_TLB_FLUSH_FIFO;
>  
>         return &hv_vcpu->tlb_flush_fifo[i];
>  }
>  
> -static inline void kvm_hv_vcpu_empty_flush_tlb(struct kvm_vcpu *vcpu)
> +static inline void kvm_hv_vcpu_empty_flush_tlb(struct kvm_vcpu *vcpu, bool is_guest_mode)
>  {
>         struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
>  
>         if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
>                 return;
>  
> -       tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu);
> +       tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode);
>  
>         kfifo_reset_out(&tlb_flush_fifo->entries);
>  }
> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> index fd28dd40b813..f5e5b8f0342c 100644
> --- a/arch/x86/kvm/trace.h
> +++ b/arch/x86/kvm/trace.h
> @@ -1510,38 +1510,41 @@ TRACE_EVENT(kvm_hv_timer_state,
>   * Tracepoint for kvm_hv_flush_tlb.
>   */
>  TRACE_EVENT(kvm_hv_flush_tlb,
> -       TP_PROTO(u64 processor_mask, u64 address_space, u64 flags),
> -       TP_ARGS(processor_mask, address_space, flags),
> +       TP_PROTO(u64 processor_mask, u64 address_space, u64 flags, bool guest_mode),
> +       TP_ARGS(processor_mask, address_space, flags, guest_mode),
>  
>         TP_STRUCT__entry(
>                 __field(u64, processor_mask)
>                 __field(u64, address_space)
>                 __field(u64, flags)
> +               __field(bool, guest_mode)
>         ),
>  
>         TP_fast_assign(
>                 __entry->processor_mask = processor_mask;
>                 __entry->address_space = address_space;
>                 __entry->flags = flags;
> +               __entry->guest_mode = guest_mode;
>         ),
>  
> -       TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx",
> +       TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx %s",
>                   __entry->processor_mask, __entry->address_space,
> -                 __entry->flags)
> +                 __entry->flags, __entry->guest_mode ? "(L2)" : "")
>  );
>  
>  /*
>   * Tracepoint for kvm_hv_flush_tlb_ex.
>   */
>  TRACE_EVENT(kvm_hv_flush_tlb_ex,
> -       TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags),
> -       TP_ARGS(valid_bank_mask, format, address_space, flags),
> +       TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags, bool guest_mode),
> +       TP_ARGS(valid_bank_mask, format, address_space, flags, guest_mode),
>  
>         TP_STRUCT__entry(
>                 __field(u64, valid_bank_mask)
>                 __field(u64, format)
>                 __field(u64, address_space)
>                 __field(u64, flags)
> +               __field(bool, guest_mode)
>         ),
>  
>         TP_fast_assign(
> @@ -1549,12 +1552,14 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex,
>                 __entry->format = format;
>                 __entry->address_space = address_space;
>                 __entry->flags = flags;
> +               __entry->guest_mode = guest_mode;
>         ),
>  
>         TP_printk("valid_bank_mask 0x%llx format 0x%llx "
> -                 "address_space 0x%llx flags 0x%llx",
> +                 "address_space 0x%llx flags 0x%llx %s",
>                   __entry->valid_bank_mask, __entry->format,
> -                 __entry->address_space, __entry->flags)
> +                 __entry->address_space, __entry->flags,
> +                 __entry->guest_mode ? "(L2)" : "")
>  );
>  
>  /*
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 805db43c2829..8e945500ef50 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3355,12 +3355,12 @@ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
>  {
>         if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) {
>                 kvm_vcpu_flush_tlb_current(vcpu);
> -               kvm_hv_vcpu_empty_flush_tlb(vcpu);
> +               kvm_hv_vcpu_empty_flush_tlb(vcpu, is_guest_mode(vcpu));
>         }
>  
>         if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) {
>                 kvm_vcpu_flush_tlb_guest(vcpu);
> -               kvm_hv_vcpu_empty_flush_tlb(vcpu);
> +               kvm_hv_vcpu_empty_flush_tlb(vcpu, is_guest_mode(vcpu));
>         } else if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) {
>                 kvm_hv_vcpu_flush_tlb(vcpu);
>         }

Looks good,

Reviewed-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>

Best regadrds,
Maxim Levitsky