Re: [PATCHv2 4/4] arm64: add host pv-vcpu-state support

From: Marc Zyngier
Date: Mon Jul 12 2021 - 12:24:47 EST


On Fri, 09 Jul 2021 05:37:13 +0100,
Sergey Senozhatsky <senozhatsky@xxxxxxxxxxxx> wrote:
>
> Add PV-vcpu-state support bits to the host. Host uses the guest
> PV-state per-CPU pointers to update the VCPU state each time
> it kvm_arch_vcpu_load() or kvm_arch_vcpu_put() the VCPU, so
> that guest scheduler can become aware of the fact that not
> all VCPUs are always available. Currently guest scheduler
> on amr64 always assumes that all CPUs are available because

arm64

> vcpu_is_preempted() is not implemented on arm64.
>
> - schbench -t 3 -m 3 -p 4096
>
> Latency percentiles (usec)
> BASE
> ================================================
> 50.0th: 1 (3556427 samples)
> 75.0th: 13 (879210 samples)
> 90.0th: 15 (893311 samples)
> 95.0th: 18 (159594 samples)
> *99.0th: 118 (224187 samples)
> 99.5th: 691 (28555 samples)
> 99.9th: 7384 (23076 samples)
> min=1, max=104218
> avg worker transfer: 25192.00 ops/sec 98.41MB/s
>
> PATCHED
> ================================================
> 50.0th: 1 (3507010 samples)
> 75.0th: 13 (1635775 samples)
> 90.0th: 16 (901271 samples)
> 95.0th: 24 (281051 samples)
> *99.0th: 114 (255581 samples)
> 99.5th: 382 (33051 samples)
> 99.9th: 6392 (26592 samples)
> min=1, max=83877
> avg worker transfer: 28613.39 ops/sec 111.77MB/s
>
> - perf bench sched all
>
> ops/sec
> BASE PATCHED
> ================================================
> 33452 36485
> 33541 39405
> 33365 36858
> 33455 38047
> 33449 37866
> 33616 34922
> 33479 34388
> 33594 37203
> 33458 35363
> 33704 35180
>
> Student's T-test
>
> N Min Max Median Avg Stddev
> base 10 33365 33704 33479 33511.3 100.92467
> patched 10 34388 39405 36858 36571.7 1607.454
> Difference at 95.0% confidence
> 3060.4 +/- 1070.09
> 9.13244% +/- 3.19321%
> (Student's t, pooled s = 1138.88)
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@xxxxxxxxxxxx>
> ---
> arch/arm64/include/asm/kvm_host.h | 23 +++++++++++
> arch/arm64/kvm/Makefile | 3 +-
> arch/arm64/kvm/arm.c | 3 ++
> arch/arm64/kvm/hypercalls.c | 11 ++++++
> arch/arm64/kvm/pv-vcpu-state.c | 64 +++++++++++++++++++++++++++++++
> 5 files changed, 103 insertions(+), 1 deletion(-)
> create mode 100644 arch/arm64/kvm/pv-vcpu-state.c
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 41911585ae0c..e782f4d0c916 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -381,6 +381,12 @@ struct kvm_vcpu_arch {
> u64 last_steal;
> gpa_t base;
> } steal;
> +
> + /* PV state of the VCPU */
> + struct {
> + gpa_t base;
> + struct gfn_to_hva_cache ghc;

Please stay consistent with what we use for steal time accounting
(i.e. don't use gfn_to_hva_cache, but directly use a gpa with
kvm_put_guest()). If you can demonstrate that there is an unacceptable
overhead in doing so, then both steal time and preemption accounting
can be switched over to the cached approach at the same time.


> + } vcpu_state;
> };
>
> /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
> @@ -695,6 +701,23 @@ static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
> return (vcpu_arch->steal.base != GPA_INVALID);
> }
>
> +int kvm_init_vcpu_state(struct kvm_vcpu *vcpu, gfn_t addr);
> +int kvm_release_vcpu_state(struct kvm_vcpu *vcpu);
> +
> +static inline void kvm_arm_vcpu_state_init(struct kvm_vcpu_arch *vcpu_arch)
> +{
> + vcpu_arch->vcpu_state.base = GPA_INVALID;
> + memset(&vcpu_arch->vcpu_state.ghc, 0, sizeof(struct gfn_to_hva_cache));

Does it really need to be inlined?

> +}
> +
> +static inline bool
> +kvm_arm_is_vcpu_state_enabled(struct kvm_vcpu_arch *vcpu_arch)
> +{
> + return (vcpu_arch->vcpu_state.base != GPA_INVALID);
> +}
> +
> +void kvm_update_vcpu_preempted(struct kvm_vcpu *vcpu, bool preempted);
> +
> void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
>
> struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index 989bb5dad2c8..2a3ee82c6d90 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -12,7 +12,8 @@ obj-$(CONFIG_KVM) += hyp/
>
> kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
> $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
> - arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
> + arm.o mmu.o mmio.o psci.o perf.o hypercalls.o \
> + pvtime.o pv-vcpu-state.o \
> inject_fault.o va_layout.o handle_exit.o \
> guest.o debug.o reset.o sys_regs.o \
> vgic-sys-reg-v3.o fpsimd.o pmu.o \
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index e9a2b8f27792..43e995c9fddb 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -332,6 +332,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> kvm_arm_reset_debug_ptr(vcpu);
>
> kvm_arm_pvtime_vcpu_init(&vcpu->arch);
> + kvm_arm_vcpu_state_init(&vcpu->arch);
>
> vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
>
> @@ -429,10 +430,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> if (vcpu_has_ptrauth(vcpu))
> vcpu_ptrauth_disable(vcpu);
> kvm_arch_vcpu_load_debug_state_flags(vcpu);
> + kvm_update_vcpu_preempted(vcpu, false);
> }
>
> void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> {
> + kvm_update_vcpu_preempted(vcpu, true);

This doesn't look right. With this, you are now telling the guest that
a vcpu that is blocked on WFI is preempted. This really isn't the
case, as it has voluntarily entered a low-power mode while waiting for
an interrupt. Indeed, the vcpu isn't running. A physical CPU wouldn't
be running either.

> kvm_arch_vcpu_put_debug_state_flags(vcpu);
> kvm_arch_vcpu_put_fp(vcpu);
> if (has_vhe())
> diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
> index 30da78f72b3b..95bcf86e0b6f 100644
> --- a/arch/arm64/kvm/hypercalls.c
> +++ b/arch/arm64/kvm/hypercalls.c
> @@ -110,6 +110,9 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
> case ARM_SMCCC_HV_PV_TIME_FEATURES:
> val[0] = SMCCC_RET_SUCCESS;
> break;
> + case ARM_SMCCC_HV_PV_VCPU_STATE_FEATURES:
> + val[0] = SMCCC_RET_SUCCESS;
> + break;
> }
> break;
> case ARM_SMCCC_HV_PV_TIME_FEATURES:
> @@ -139,6 +142,14 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
> case ARM_SMCCC_TRNG_RND32:
> case ARM_SMCCC_TRNG_RND64:
> return kvm_trng_call(vcpu);
> + case ARM_SMCCC_HV_PV_VCPU_STATE_INIT:
> + if (kvm_init_vcpu_state(vcpu, smccc_get_arg1(vcpu)) == 0)
> + val[0] = SMCCC_RET_SUCCESS;
> + break;
> + case ARM_SMCCC_HV_PV_VCPU_STATE_RELEASE:
> + if (kvm_release_vcpu_state(vcpu) == 0)
> + val[0] = SMCCC_RET_SUCCESS;
> + break;
> default:
> return kvm_psci_call(vcpu);
> }
> diff --git a/arch/arm64/kvm/pv-vcpu-state.c b/arch/arm64/kvm/pv-vcpu-state.c
> new file mode 100644
> index 000000000000..8496bb2a5966
> --- /dev/null
> +++ b/arch/arm64/kvm/pv-vcpu-state.c
> @@ -0,0 +1,64 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +#include <linux/arm-smccc.h>
> +#include <linux/kvm_host.h>
> +
> +#include <asm/kvm_mmu.h>
> +#include <asm/paravirt.h>
> +
> +#include <kvm/arm_psci.h>

Why this include?

> +
> +int kvm_init_vcpu_state(struct kvm_vcpu *vcpu, gpa_t addr)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + int ret;
> + u64 idx;
> +
> + if (kvm_arm_is_vcpu_state_enabled(&vcpu->arch))
> + return 0;
> +
> + idx = srcu_read_lock(&kvm->srcu);
> + ret = kvm_gfn_to_hva_cache_init(vcpu->kvm,
> + &vcpu->arch.vcpu_state.ghc,
> + addr,
> + sizeof(struct vcpu_state));
> + srcu_read_unlock(&kvm->srcu, idx);
> +
> + if (!ret)
> + vcpu->arch.vcpu_state.base = addr;
> + return ret;
> +}
> +
> +int kvm_release_vcpu_state(struct kvm_vcpu *vcpu)
> +{
> + if (!kvm_arm_is_vcpu_state_enabled(&vcpu->arch))
> + return 0;
> +
> + kvm_arm_vcpu_state_init(&vcpu->arch);
> + return 0;
> +}
> +
> +void kvm_update_vcpu_preempted(struct kvm_vcpu *vcpu, bool preempted)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + u64 idx;
> +
> + if (!kvm_arm_is_vcpu_state_enabled(&vcpu->arch))
> + return;
> +
> + /*
> + * This function is called from atomic context, so we need to
> + * disable page faults. kvm_write_guest_cached() will call
> + * might_fault().
> + */
> + pagefault_disable();
> + /*
> + * Need to take the SRCU lock because kvm_write_guest_offset_cached()
> + * calls kvm_memslots();
> + */
> + idx = srcu_read_lock(&kvm->srcu);
> + kvm_write_guest_cached(kvm, &vcpu->arch.vcpu_state.ghc,
> + &preempted, sizeof(bool));
> + srcu_read_unlock(&kvm->srcu, idx);
> + pagefault_enable();
> +}

Before rushing into an implementation, this really could do with some
specification:

- where is the memory allocated?

- what is the lifetime of the memory?

- what is its expected memory type?

- what is the coherency model (what if the guest runs with caching
disabled, for example)?

- is the functionality preserved across a VM reset?

- what is the strategy w.r.t live migration?

- can userspace detect that the feature is implemented?

- can userspace prevent the feature from being used?

- where is the documentation?

Thanks,

M.

--
Without deviation from the norm, progress is not possible.