Re: [RFC PATCH v2 2/5] kvm: Implement the paravirt sched framework for kvm

From: Vineeth Remanan Pillai
Date: Mon Apr 08 2024 - 09:59:49 EST


Adding sched_ext folks

On Wed, Apr 3, 2024 at 10:01 AM Vineeth Pillai (Google)
<vineeth@xxxxxxxxxxxxxxx> wrote:
>
> KVM uses the kernel's paravirt sched framework to assign an available
> pvsched driver to a guest. Guest vCPUs register with the pvsched
> driver and call into the driver callbacks to notify it of the events
> the driver is interested in.
>
> This PoC doesn't do the callback on interrupt injection yet; it
> will be implemented in subsequent iterations.
>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@xxxxxxxxxxxxxxx>
> Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
> ---
> arch/x86/kvm/Kconfig | 13 ++++
> arch/x86/kvm/x86.c | 3 +
> include/linux/kvm_host.h | 32 +++++++++
> virt/kvm/kvm_main.c | 148 +++++++++++++++++++++++++++++++++++++++
> 4 files changed, 196 insertions(+)
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 65ed14b6540b..c1776cdb5b65 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -189,4 +189,17 @@ config KVM_MAX_NR_VCPUS
> the memory footprint of each KVM guest, regardless of how many vCPUs are
> created for a given VM.
>
> +config PARAVIRT_SCHED_KVM
> + bool "Enable paravirt scheduling capability for kvm"
> + depends on KVM
> + default n
> + help
> + Paravirtualized scheduling facilitates the exchange of scheduling
> + related information between the host and guest through shared memory,
> + enhancing the efficiency of vCPU thread scheduling by the hypervisor.
> + An illustrative use case involves dynamically boosting the priority of
> + a vCPU thread when the guest is executing a latency-sensitive workload
> + on that specific vCPU.
> + This config enables paravirt scheduling in the kvm hypervisor.
> +
> endif # VIRTUALIZATION
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index ffe580169c93..d0abc2c64d47 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10896,6 +10896,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>
> preempt_disable();
>
> + kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_VMENTER);
> +
> static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
>
> /*
> @@ -11059,6 +11061,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> guest_timing_exit_irqoff();
>
> local_irq_enable();
> + kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_VMEXIT);
> preempt_enable();
>
> kvm_vcpu_srcu_read_lock(vcpu);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 179df96b20f8..6381569f3de8 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -45,6 +45,8 @@
> #include <asm/kvm_host.h>
> #include <linux/kvm_dirty_ring.h>
>
> +#include <linux/pvsched.h>
> +
> #ifndef KVM_MAX_VCPU_IDS
> #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
> #endif
> @@ -832,6 +834,11 @@ struct kvm {
> bool vm_bugged;
> bool vm_dead;
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> + spinlock_t pvsched_ops_lock;
> + struct pvsched_vcpu_ops __rcu *pvsched_ops;
> +#endif
> +
> #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
> struct notifier_block pm_notifier;
> #endif
> @@ -2413,4 +2420,29 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
> }
> #endif /* CONFIG_KVM_PRIVATE_MEM */
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> +int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events);
> +int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu);
> +void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu);
> +
> +int kvm_replace_pvsched_ops(struct kvm *kvm, char *name);
> +#else
> +static inline int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events)
> +{
> + return 0;
> +}
> +static inline int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu)
> +{
> + return 0;
> +}
> +static inline void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu)
> +{
> +}
> +
> +static inline int kvm_replace_pvsched_ops(struct kvm *kvm, char *name)
> +{
> + return 0;
> +}
> +#endif
> +
> #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 0f50960b0e3a..0546814e4db7 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -170,6 +170,142 @@ bool kvm_is_zone_device_page(struct page *page)
> return is_zone_device_page(page);
> }
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> +typedef enum {
> + PVSCHED_CB_REGISTER = 1,
> + PVSCHED_CB_UNREGISTER = 2,
> + PVSCHED_CB_NOTIFY = 3
> +} pvsched_vcpu_callback_t;
> +
> +/*
> + * Helper function to invoke the pvsched driver callback.
> + */
> +static int __vcpu_pvsched_callback(struct kvm_vcpu *vcpu, u32 events,
> + pvsched_vcpu_callback_t action)
> +{
> + int ret = 0;
> + struct pid *pid;
> + struct pvsched_vcpu_ops *ops;
> +
> + rcu_read_lock();
> + ops = rcu_dereference(vcpu->kvm->pvsched_ops);
> + if (!ops) {
> + ret = -ENOENT;
> + goto out;
> + }
> +
> + pid = rcu_dereference(vcpu->pid);
> + if (WARN_ON_ONCE(!pid)) {
> + ret = -EINVAL;
> + goto out;
> + }
> + get_pid(pid);
> + switch(action) {
> + case PVSCHED_CB_REGISTER:
> + ops->pvsched_vcpu_register(pid);
> + break;
> + case PVSCHED_CB_UNREGISTER:
> + ops->pvsched_vcpu_unregister(pid);
> + break;
> + case PVSCHED_CB_NOTIFY:
> + if (ops->events & events) {
> + ops->pvsched_vcpu_notify_event(
> + NULL, /* TODO: Pass guest allocated sharedmem addr */
> + pid,
> + ops->events & events);
> + }
> + break;
> + default:
> + WARN_ON_ONCE(1);
> + }
> + put_pid(pid);
> +
> +out:
> + rcu_read_unlock();
> + return ret;
> +}
> +
> +int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events)
> +{
> + return __vcpu_pvsched_callback(vcpu, events, PVSCHED_CB_NOTIFY);
> +}
> +
> +int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu)
> +{
> + return __vcpu_pvsched_callback(vcpu, 0, PVSCHED_CB_REGISTER);
> + /*
> + * TODO: Action if the registration fails?
> + */
> +}
> +
> +void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu)
> +{
> + __vcpu_pvsched_callback(vcpu, 0, PVSCHED_CB_UNREGISTER);
> +}
> +
> +/*
> + * Replaces the VM's current pvsched driver.
> + * If name is NULL or an empty string, unassign the
> + * current driver.
> + */
> +int kvm_replace_pvsched_ops(struct kvm *kvm, char *name)
> +{
> + int ret = 0;
> + unsigned long i;
> + struct kvm_vcpu *vcpu = NULL;
> + struct pvsched_vcpu_ops *ops = NULL, *prev_ops;
> +
> +
> + spin_lock(&kvm->pvsched_ops_lock);
> +
> + prev_ops = rcu_dereference(kvm->pvsched_ops);
> +
> + /*
> + * Unassign the current driver if the passed-in
> + * value is NULL or an empty string.
> + */
> + if (name && *name) {
> + ops = pvsched_get_vcpu_ops(name);
> + if (!ops) {
> + ret = -EINVAL;
> + goto out;
> + }
> + }
> +
> + if (prev_ops) {
> + /*
> + * Unregister current pvsched driver.
> + */
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + kvm_vcpu_pvsched_unregister(vcpu);
> + }
> +
> + pvsched_put_vcpu_ops(prev_ops);
> + }
> +
> +
> + rcu_assign_pointer(kvm->pvsched_ops, ops);
> + if (ops) {
> + /*
> + * Register new pvsched driver.
> + */
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + WARN_ON_ONCE(kvm_vcpu_pvsched_register(vcpu));
> + }
> + }
> +
> +out:
> + spin_unlock(&kvm->pvsched_ops_lock);
> +
> + if (ret)
> + return ret;
> +
> + synchronize_rcu();
> +
> + return 0;
> +}
> +#endif
> +
> /*
> * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
> * page, NULL otherwise. Note, the list of refcounted PG_reserved page types
> @@ -508,6 +644,8 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
> kvm_arch_vcpu_destroy(vcpu);
> kvm_dirty_ring_free(&vcpu->dirty_ring);
>
> + kvm_vcpu_pvsched_unregister(vcpu);
> +
> /*
> * No need for rcu_read_lock as VCPU_RUN is the only place that changes
> * the vcpu->pid pointer, and at destruction time all file descriptors
> @@ -1221,6 +1359,10 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
>
> BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> + spin_lock_init(&kvm->pvsched_ops_lock);
> +#endif
> +
> /*
> * Force subsequent debugfs file creations to fail if the VM directory
> * is not created (by kvm_create_vm_debugfs()).
> @@ -1343,6 +1485,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
> int i;
> struct mm_struct *mm = kvm->mm;
>
> + kvm_replace_pvsched_ops(kvm, NULL);
> +
> kvm_destroy_pm_notifier(kvm);
> kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
> kvm_destroy_vm_debugfs(kvm);
> @@ -3779,6 +3923,8 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
> if (kvm_vcpu_check_block(vcpu) < 0)
> break;
>
> + kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_HALT);
> +
> waited = true;
> schedule();
> }
> @@ -4434,6 +4580,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
> /* The thread running this VCPU changed. */
> struct pid *newpid;
>
> + kvm_vcpu_pvsched_unregister(vcpu);
> r = kvm_arch_vcpu_run_pid_change(vcpu);
> if (r)
> break;
> @@ -4442,6 +4589,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
> rcu_assign_pointer(vcpu->pid, newpid);
> if (oldpid)
> synchronize_rcu();
> + kvm_vcpu_pvsched_register(vcpu);
> put_pid(oldpid);
> }
> r = kvm_arch_vcpu_ioctl_run(vcpu);
> --
> 2.40.1
>