[PATCH v2] KVM: arm64: Add KVM_CAP to control WFx trapping
From: Colton Lewis
Date: Tue Mar 19 2024 - 12:44:33 EST
Add a KVM_CAP to control WFx (WFI or WFE) trapping based on scheduler
runqueue depth. This is so they can be passed through if the runqueue
is shallow or the CPU has support for direct interrupt injection. They
can always be trapped by setting this value to 0. Technically this
means traps will be cleared when the runqueue depth is 0, but that
implies nothing is running anyway so there is no reason to care. The
default value is 1 to preserve previous behavior before adding this
option.
Think about this option as a threshold. The instruction will be trapped
if the runqueue depth is higher than the threshold.
Signed-off-by: Colton Lewis <coltonlewis@xxxxxxxxxx>
---
v2:
The last version was exclusively a flag to enable unconditional wfx
passthrough but there was feedback to make passthrough/trapping depend
on runqueue depth. I asked the last thread if there were any
preferences for the interface to accomplish this but I figured it's
easier to show code than wait for people telling me what to do.
v1:
https://lore.kernel.org/kvmarm/20240129213918.3124494-1-coltonlewis@xxxxxxxxxx/
arch/arm64/include/asm/kvm_host.h | 1 +
arch/arm64/kvm/arm.c | 7 ++++++-
include/linux/sched/stat.h | 1 +
include/uapi/linux/kvm.h | 2 +-
kernel/sched/core.c | 15 +++++++++++++--
5 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 21c57b812569..79f461efaa6c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -317,6 +317,7 @@ struct kvm_arch {
* the associated pKVM instance in the hypervisor.
*/
struct kvm_protected_vm pkvm;
+ u64 wfx_trap_runqueue_depth;
};
struct kvm_vcpu_fault_info {
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index a25265aca432..419eed6e1814 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -116,6 +116,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
}
mutex_unlock(&kvm->slots_lock);
break;
+ case KVM_CAP_ARM_WFX_TRAP_RUNQUEUE_DEPTH:
+ kvm->arch.wfx_trap_runqueue_depth = cap->args[0];
+ break;
default:
r = -EINVAL;
break;
@@ -176,6 +179,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);
+ kvm->arch.wfx_trap_runqueue_depth = 1;
return 0;
err_free_cpumask:
@@ -240,6 +244,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_SYSTEM_SUSPEND:
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_COUNTER_OFFSET:
+ case KVM_CAP_ARM_WFX_TRAP_RUNQUEUE_DEPTH:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -456,7 +461,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
- if (single_task_running())
+ if (nr_running_this_cpu() <= vcpu->kvm->arch.wfx_trap_runqueue_depth)
vcpu_clear_wfx_traps(vcpu);
else
vcpu_set_wfx_traps(vcpu);
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 0108a38bb64d..dc1541fcec56 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -18,6 +18,7 @@ extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
extern unsigned int nr_running(void);
+extern unsigned int nr_running_this_cpu(void);
extern bool single_task_running(void);
extern unsigned int nr_iowait(void);
extern unsigned int nr_iowait_cpu(int cpu);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index c3308536482b..4c0ebf514c03 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1155,6 +1155,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_MEMORY_ATTRIBUTES 233
#define KVM_CAP_GUEST_MEMFD 234
#define KVM_CAP_VM_TYPES 235
+#define KVM_CAP_ARM_WFX_TRAP_RUNQUEUE_DEPTH 236
#ifdef KVM_CAP_IRQ_ROUTING
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..b18f29964648 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5420,7 +5420,7 @@ unsigned int nr_running(void)
}
/*
- * Check if only the current task is running on the CPU.
+ * Return number of tasks running on this CPU.
*
* Caution: this function does not check that the caller has disabled
* preemption, thus the result might have a time-of-check-to-time-of-use
@@ -5432,9 +5432,20 @@ unsigned int nr_running(void)
*
* - in a loop with very short iterations (e.g. a polling loop)
*/
+unsigned int nr_running_this_cpu(void)
+{
+ return raw_rq()->nr_running;
+}
+EXPORT_SYMBOL(nr_running_this_cpu);
+
+/*
+ * Check if only the current task is running on the CPU.
+ *
+ * Caution: see warning for nr_running_this_cpu
+ */
bool single_task_running(void)
{
- return raw_rq()->nr_running == 1;
+ return nr_running_this_cpu() == 1;
}
EXPORT_SYMBOL(single_task_running);
--
2.44.0.291.gc1ea87d7ee-goog