Re: [PATCH v6 6/7] KVM: x86: hyperv: optimize kvm_hv_flush_tlb() for vp_index == vcpu_idx case

From: Roman Kagan
Date: Thu Sep 27 2018 - 05:42:38 EST


On Wed, Sep 26, 2018 at 07:02:58PM +0200, Vitaly Kuznetsov wrote:
> VP index almost always matches the vCPU index, and when it does it's faster
> to walk the sparse set instead of all vcpus.
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
> ---
> arch/x86/kvm/hyperv.c | 96 +++++++++++++++++++++++--------------------
> 1 file changed, 52 insertions(+), 44 deletions(-)
>
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index eeb12eacd525..cc0535a078f7 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1277,32 +1277,37 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> return kvm_hv_get_msr(vcpu, msr, pdata, host);
> }
>
> -static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
> +static __always_inline bool hv_vcpu_in_sparse_set(struct kvm_vcpu_hv *hv_vcpu,
> + u64 sparse_banks[],
> + u64 valid_bank_mask)
> {
> - int i = 0, j;
> + int bank = hv_vcpu->vp_index / 64, sbank;
>
> - if (!(valid_bank_mask & BIT_ULL(bank_no)))
> - return -1;
> + if (bank >= 64)
> + return false;
>
> - for (j = 0; j < bank_no; j++)
> - if (valid_bank_mask & BIT_ULL(j))
> - i++;
> + if (!(valid_bank_mask & BIT_ULL(bank)))
> + return false;
>
> - return i;
> + /* Sparse bank number equals the number of set bits before it */
> + sbank = bitmap_weight((unsigned long *)&valid_bank_mask, bank);
> +
> + return !!(sparse_banks[sbank] & BIT_ULL(hv_vcpu->vp_index % 64));
> }
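
An aside for readers less familiar with the ex hypercall layout: the sparse
encoding packs one 64-bit CPU mask per bank that actually has CPUs in it, so
the index into sparse_banks[] is simply the number of set bits in
valid_bank_mask below the bank in question, which is what the bitmap_weight()
call computes. A stand-alone userspace sketch with made-up values (not
kernel code):

	#include <stdint.h>
	#include <stdio.h>

	/* same computation as the bitmap_weight() call above */
	static int sparse_bank_index(uint64_t valid_bank_mask, int bank)
	{
		return __builtin_popcountll(valid_bank_mask &
					    ((1ULL << bank) - 1));
	}

	int main(void)
	{
		/* banks 0, 2 and 5 present -> three entries in sparse_banks[] */
		uint64_t mask = (1ULL << 0) | (1ULL << 2) | (1ULL << 5);

		printf("bank 0 -> sparse_banks[%d]\n", sparse_bank_index(mask, 0)); /* 0 */
		printf("bank 2 -> sparse_banks[%d]\n", sparse_bank_index(mask, 2)); /* 1 */
		printf("bank 5 -> sparse_banks[%d]\n", sparse_bank_index(mask, 5)); /* 2 */
		return 0;
	}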
>
> static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
> u16 rep_cnt, bool ex)
> {
> struct kvm *kvm = current_vcpu->kvm;
> - struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv;
> + struct kvm_hv *hv = &kvm->arch.hyperv;
> + struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
> struct hv_tlb_flush_ex flush_ex;
> struct hv_tlb_flush flush;
> struct kvm_vcpu *vcpu;
> unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
> - u64 valid_bank_mask = 0;
> + u64 valid_bank_mask;
> u64 sparse_banks[64];
> - int sparse_banks_len, i;
> + int sparse_banks_len, i, bank, sbank;
> bool all_cpus;
>
> if (!ex) {
> @@ -1312,6 +1317,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
> trace_kvm_hv_flush_tlb(flush.processor_mask,
> flush.address_space, flush.flags);
>
> + valid_bank_mask = BIT_ULL(0);
> sparse_banks[0] = flush.processor_mask;
> all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
> } else {
> @@ -1344,52 +1350,54 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
> return HV_STATUS_INVALID_HYPERCALL_INPUT;
> }
>
> - cpumask_clear(&hv_current->tlb_lush);
> + /*
> + * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
> + * analyze it here, flush TLB regardless of the specified address space.
> + */
> + cpumask_clear(&hv_vcpu->tlb_lush);

Maybe squash this hv_current -> hv_vcpu renaming into patch 3?
(And yes, the "tlb_lush" misspelling is funny, too ;)

>
> if (all_cpus) {
> kvm_make_vcpus_request_mask(kvm,
> KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
> - NULL, &hv_current->tlb_lush);
> + NULL, &hv_vcpu->tlb_lush);
> goto ret_success;
> }
>
> - kvm_for_each_vcpu(i, vcpu, kvm) {
> - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
> - int bank = hv->vp_index / 64, sbank = 0;
> -
> - /* Banks >64 can't be represented */
> - if (bank >= 64)
> - continue;
> -
> - /* Non-ex hypercalls can only address first 64 vCPUs */
> - if (!ex && bank)
> - continue;
> -
> - if (ex) {
> - /*
> - * Check is the bank of this vCPU is in sparse
> - * set and get the sparse bank number.
> - */
> - sbank = get_sparse_bank_no(valid_bank_mask, bank);
> -
> - if (sbank < 0)
> - continue;
> + if (atomic_read(&hv->num_mismatched_vp_indexes)) {
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (hv_vcpu_in_sparse_set(&vcpu->arch.hyperv,
> + sparse_banks,
> + valid_bank_mask))
> + __set_bit(i, vcpu_bitmap);
> }
> + goto flush_request;
> + }
>
> - if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
> - continue;
> -
> - /*
> - * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
> - * can't analyze it here, flush TLB regardless of the specified
> - * address space.
> - */
> - __set_bit(i, vcpu_bitmap);
> + /*
> + * num_mismatched_vp_indexes is zero so every vcpu has
> + * vp_index == vcpu_idx.
> + */
> + sbank = 0;
> + for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
> + BITS_PER_LONG) {

s/BITS_PER_LONG/64/: valid_bank_mask is a u64, so the scan should always
cover 64 bits; on a 32-bit build BITS_PER_LONG is 32 and the upper half of
the mask would never be looked at.

> + for_each_set_bit(i,
> + (unsigned long *)&sparse_banks[sbank],
> + BITS_PER_LONG) {

ditto

> + u32 vp_index = bank * 64 + i;
> +
> + /* A non-existent vCPU was specified */
> + if (vp_index >= KVM_MAX_VCPUS)
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +
> + __set_bit(vp_index, vcpu_bitmap);
> + }
> + sbank++;
> }

I wonder if copying the bank as a whole would make it easier to follow
(and somewhat more efficient):

	sbank = 0;
	for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, 64)
		((u64 *)vcpu_bitmap)[bank] = sparse_banks[sbank++];
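
A quick stand-alone check (userspace C, made-up mask values, assuming 64-bit
longs so that bit vp_index = bank * 64 + i lands in word 'bank' of the
bitmap) that the whole-bank copy builds the same bitmap as the per-bit loop
in the patch:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* pretend KVM_MAX_VCPUS == 256, i.e. four 64-bit words */
		uint64_t valid_bank_mask = (1ULL << 0) | (1ULL << 3);
		uint64_t sparse_banks[] = { 0xff00ff00ULL, 0x1ULL };
		uint64_t bitwise[4] = { 0 }, bulk[4] = { 0 };
		int bank, sbank = 0, i;

		for (bank = 0; bank < 64; bank++) {
			if (!(valid_bank_mask & (1ULL << bank)))
				continue;
			for (i = 0; i < 64; i++)	/* per-bit, as in the patch */
				if (sparse_banks[sbank] & (1ULL << i))
					bitwise[bank] |= 1ULL << i;
			bulk[bank] = sparse_banks[sbank];	/* whole-bank copy */
			sbank++;
		}

		assert(!memcmp(bitwise, bulk, sizeof(bitwise)));
		printf("equivalent\n");
		return 0;
	}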

Also it seems equally efficient, but slightly easier to read, if a vp_index
bitmap is filled first regardless of ->num_mismatched_vp_indexes, and then
either passed directly to kvm_make_vcpus_request_mask when
num_mismatched_vp_indexes == 0, or converted into the real vcpu mask with
the regular test_bit otherwise.

So eventually it all would look like

	...
	u64 vp_bitmap[KVM_MAX_VCPUS / 64] = {0};
	DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS) = {0};
	unsigned long *vcpu_mask;
	...
	if (all_cpus) {
		vcpu_mask = NULL;
		goto flush_request;
	}

	sbank = 0;
	for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, 64)
		vp_bitmap[bank] = sparse_banks[sbank++];

	if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) {
		/* for all vcpus vp_index == vcpu_idx */
		vcpu_mask = (unsigned long *)vp_bitmap;
		goto flush_request;
	}

	kvm_for_each_vcpu(i, vcpu, kvm)
		if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index,
			     (unsigned long *)vp_bitmap))
			__set_bit(i, vcpu_bitmap);
	vcpu_mask = vcpu_bitmap;
	...

>
> +flush_request:
> kvm_make_vcpus_request_mask(kvm,
> KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
> - vcpu_bitmap, &hv_current->tlb_lush);
> + vcpu_bitmap, &hv_vcpu->tlb_lush);
>
> ret_success:
> /* We always do full TLB flush, set rep_done = rep_cnt. */

Roman.