[RFC PATCH v2 2/5] kvm: Implement the paravirt sched framework for kvm
From: Vineeth Pillai (Google)
Date: Wed Apr 03 2024 - 10:02:03 EST
KVM uses the kernel's paravirt sched framework to assign an available
pvsched driver to a guest. Guest vCPUs register with the pvsched
driver and call into the driver's callbacks to notify it of the events
that the driver is interested in.
This PoC doesn't do the callback on interrupt injection yet; that will be
implemented in subsequent iterations.
Signed-off-by: Vineeth Pillai (Google) <vineeth@xxxxxxxxxxxxxxx>
Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
---
arch/x86/kvm/Kconfig | 13 ++++
arch/x86/kvm/x86.c | 3 +
include/linux/kvm_host.h | 32 +++++++++
virt/kvm/kvm_main.c | 148 +++++++++++++++++++++++++++++++++++++++
4 files changed, 196 insertions(+)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 65ed14b6540b..c1776cdb5b65 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -189,4 +189,17 @@ config KVM_MAX_NR_VCPUS
the memory footprint of each KVM guest, regardless of how many vCPUs are
created for a given VM.
+config PARAVIRT_SCHED_KVM
+ bool "Enable paravirt scheduling capability for kvm"
+ depends on KVM
+ default n
+ help
+ Paravirtualized scheduling facilitates the exchange of scheduling
+ related information between the host and guest through shared memory,
+ enhancing the efficiency of vCPU thread scheduling by the hypervisor.
+ An illustrative use case involves dynamically boosting the priority of
+ a vCPU thread when the guest is executing a latency-sensitive workload
+ on that specific vCPU.
+ This config enables paravirt scheduling in the kvm hypervisor.
+
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ffe580169c93..d0abc2c64d47 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10896,6 +10896,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
+ kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_VMENTER);
+
static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
/*
@@ -11059,6 +11061,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
guest_timing_exit_irqoff();
local_irq_enable();
+ kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_VMEXIT);
preempt_enable();
kvm_vcpu_srcu_read_lock(vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 179df96b20f8..6381569f3de8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -45,6 +45,8 @@
#include <asm/kvm_host.h>
#include <linux/kvm_dirty_ring.h>
+#include <linux/pvsched.h>
+
#ifndef KVM_MAX_VCPU_IDS
#define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
#endif
@@ -832,6 +834,11 @@ struct kvm {
bool vm_bugged;
bool vm_dead;
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+ spinlock_t pvsched_ops_lock;
+ struct pvsched_vcpu_ops __rcu *pvsched_ops;
+#endif
+
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
struct notifier_block pm_notifier;
#endif
@@ -2413,4 +2420,29 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
}
#endif /* CONFIG_KVM_PRIVATE_MEM */
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events);
+int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu);
+void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu);
+
+int kvm_replace_pvsched_ops(struct kvm *kvm, char *name);
+#else
+/* !CONFIG_PARAVIRT_SCHED_KVM: nothing to notify; always reports success. */
+static inline int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events)
+{
+	return 0;
+}
+/* !CONFIG_PARAVIRT_SCHED_KVM: nothing to register; always reports success. */
+static inline int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+/* !CONFIG_PARAVIRT_SCHED_KVM: nothing to unregister; intentionally empty. */
+static inline void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu)
+{
+}
+
+/* !CONFIG_PARAVIRT_SCHED_KVM: no driver to replace; always reports success. */
+static inline int kvm_replace_pvsched_ops(struct kvm *kvm, char *name)
+{
+	return 0;
+}
+#endif
+
#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0f50960b0e3a..0546814e4db7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -170,6 +170,142 @@ bool kvm_is_zone_device_page(struct page *page)
return is_zone_device_page(page);
}
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+/* Selects which pvsched driver callback __vcpu_pvsched_callback() invokes. */
+typedef enum {
+	PVSCHED_CB_REGISTER = 1,	/* ops->pvsched_vcpu_register() */
+	PVSCHED_CB_UNREGISTER,		/* = 2: ops->pvsched_vcpu_unregister() */
+	PVSCHED_CB_NOTIFY		/* = 3: ops->pvsched_vcpu_notify_event() */
+} pvsched_vcpu_callback_t;
+
+/*
+ * Invoke one callback of the VM's pvsched driver on behalf of @vcpu.
+ *
+ * @vcpu:   vCPU whose thread pid is handed to the driver.
+ * @events: event bits to report; only consulted for PVSCHED_CB_NOTIFY.
+ * @action: which driver callback to invoke.
+ *
+ * The driver ops and the vCPU pid are both looked up inside a single RCU
+ * read-side section, and a pid reference is held across the callback.
+ *
+ * Returns 0 once the action has been dispatched, -ENOENT if no pvsched
+ * driver is assigned to the VM, or -EINVAL if the vCPU has no pid yet.
+ * Whatever the driver callbacks return is not propagated.
+ */
+static int __vcpu_pvsched_callback(struct kvm_vcpu *vcpu, u32 events,
+		pvsched_vcpu_callback_t action)
+{
+	struct pvsched_vcpu_ops *drv;
+	struct pid *vcpu_pid;
+	int err;
+
+	rcu_read_lock();
+
+	err = -ENOENT;
+	drv = rcu_dereference(vcpu->kvm->pvsched_ops);
+	if (!drv)
+		goto unlock;
+
+	err = -EINVAL;
+	vcpu_pid = rcu_dereference(vcpu->pid);
+	if (WARN_ON_ONCE(!vcpu_pid))
+		goto unlock;
+
+	err = 0;
+	get_pid(vcpu_pid);
+
+	if (action == PVSCHED_CB_REGISTER) {
+		drv->pvsched_vcpu_register(vcpu_pid);
+	} else if (action == PVSCHED_CB_UNREGISTER) {
+		drv->pvsched_vcpu_unregister(vcpu_pid);
+	} else if (action == PVSCHED_CB_NOTIFY) {
+		/* Forward only the events the driver subscribed to. */
+		u32 wanted = drv->events & events;
+
+		if (wanted) {
+			/* TODO: Pass guest allocated sharedmem addr */
+			drv->pvsched_vcpu_notify_event(NULL, vcpu_pid, wanted);
+		}
+	} else {
+		/* Unknown action: warn once, but still report success. */
+		WARN_ON_ONCE(1);
+	}
+
+	put_pid(vcpu_pid);
+
+unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+/*
+ * Report @events for @vcpu to the VM's pvsched driver.
+ * Returns the result of __vcpu_pvsched_callback().
+ */
+int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events)
+{
+	int ret = __vcpu_pvsched_callback(vcpu, events, PVSCHED_CB_NOTIFY);
+
+	return ret;
+}
+
+/*
+ * Register @vcpu with the VM's pvsched driver.
+ * Returns the result of __vcpu_pvsched_callback().
+ */
+int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * TODO: Action if the registration fails?
+	 */
+	int ret = __vcpu_pvsched_callback(vcpu, 0, PVSCHED_CB_REGISTER);
+
+	return ret;
+}
+
+/*
+ * Unregister @vcpu from the VM's pvsched driver. The helper's result is
+ * deliberately discarded.
+ */
+void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu)
+{
+	(void)__vcpu_pvsched_callback(vcpu, 0, PVSCHED_CB_UNREGISTER);
+}
+
+/*
+ * Replaces the VM's current pvsched driver.
+ * If name is NULL or an empty string, unassign the current driver.
+ *
+ * Every vCPU is unregistered from the previous driver (if any) and
+ * registered with the new one (if any) while pvsched_ops_lock is held.
+ *
+ * Returns 0 on success, or -EINVAL if no driver is found for @name; in
+ * that case the current driver is left in place.
+ */
+int kvm_replace_pvsched_ops(struct kvm *kvm, char *name)
+{
+	int ret = 0;
+	unsigned long i;
+	struct kvm_vcpu *vcpu = NULL;
+	struct pvsched_vcpu_ops *ops = NULL, *prev_ops;
+
+	spin_lock(&kvm->pvsched_ops_lock);
+
+	/*
+	 * This is the update side: we hold pvsched_ops_lock rather than an
+	 * RCU read-side lock, so use the _protected accessor to keep
+	 * lockdep-RCU checking accurate.
+	 */
+	prev_ops = rcu_dereference_protected(kvm->pvsched_ops,
+			lockdep_is_held(&kvm->pvsched_ops_lock));
+
+	/*
+	 * Unassign operation if the passed in value is
+	 * NULL or an empty string.
+	 */
+	if (name && *name) {
+		ops = pvsched_get_vcpu_ops(name);
+		if (!ops) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	if (prev_ops) {
+		/*
+		 * Unregister current pvsched driver.
+		 */
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			kvm_vcpu_pvsched_unregister(vcpu);
+		}
+	}
+
+	rcu_assign_pointer(kvm->pvsched_ops, ops);
+	if (ops) {
+		/*
+		 * Register new pvsched driver.
+		 */
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			WARN_ON_ONCE(kvm_vcpu_pvsched_register(vcpu));
+		}
+	}
+
+out:
+	spin_unlock(&kvm->pvsched_ops_lock);
+
+	if (ret)
+		return ret;
+
+	/* Wait for readers that may still be using the previous ops. */
+	synchronize_rcu();
+
+	/*
+	 * Release the previous driver only after the grace period: RCU
+	 * readers may have been calling into it until now. (Presumably this
+	 * drops the reference taken by pvsched_get_vcpu_ops() - confirm
+	 * against the pvsched core.)
+	 */
+	if (prev_ops)
+		pvsched_put_vcpu_ops(prev_ops);
+
+	return 0;
+}
+#endif
+
/*
* Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
* page, NULL otherwise. Note, the list of refcounted PG_reserved page types
@@ -508,6 +644,8 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_destroy(vcpu);
kvm_dirty_ring_free(&vcpu->dirty_ring);
+ kvm_vcpu_pvsched_unregister(vcpu);
+
/*
* No need for rcu_read_lock as VCPU_RUN is the only place that changes
* the vcpu->pid pointer, and at destruction time all file descriptors
@@ -1221,6 +1359,10 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+ spin_lock_init(&kvm->pvsched_ops_lock);
+#endif
+
/*
* Force subsequent debugfs file creations to fail if the VM directory
* is not created (by kvm_create_vm_debugfs()).
@@ -1343,6 +1485,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
int i;
struct mm_struct *mm = kvm->mm;
+ kvm_replace_pvsched_ops(kvm, NULL);
+
kvm_destroy_pm_notifier(kvm);
kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
kvm_destroy_vm_debugfs(kvm);
@@ -3779,6 +3923,8 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
if (kvm_vcpu_check_block(vcpu) < 0)
break;
+ kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_HALT);
+
waited = true;
schedule();
}
@@ -4434,6 +4580,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
/* The thread running this VCPU changed. */
struct pid *newpid;
+ kvm_vcpu_pvsched_unregister(vcpu);
r = kvm_arch_vcpu_run_pid_change(vcpu);
if (r)
break;
@@ -4442,6 +4589,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
rcu_assign_pointer(vcpu->pid, newpid);
if (oldpid)
synchronize_rcu();
+ kvm_vcpu_pvsched_register(vcpu);
put_pid(oldpid);
}
r = kvm_arch_vcpu_ioctl_run(vcpu);
--
2.40.1