Re: [RFC PATCH] KVM: race-free exit from KVM_RUN without POSIX signals
From: Christian Borntraeger
Date: Wed Feb 08 2017 - 08:18:26 EST
On 02/08/2017 12:10 PM, Paolo Bonzini wrote:
> The purpose of the KVM_SET_SIGNAL_MASK API is to let userspace "kick"
> a VCPU out of KVM_RUN through a POSIX signal. A signal is attached
> to a dummy signal handler; by blocking the signal outside KVM_RUN and
> unblocking it inside, this possible race is closed:
>
> VCPU thread service thread
> --------------------------------------------------------------
> check flag
> set flag
> raise signal
> (signal handler does nothing)
> KVM_RUN
>
> However, one issue with KVM_SET_SIGNAL_MASK is that it has to take
> tsk->sighand->siglock on every KVM_RUN. This lock is often on a
> remote NUMA node, because it is on the node of a thread's creator.
> Taking this lock can be very expensive if there are many userspace
> exits (as is the case for SMP Windows VMs without Hyper-V reference
> time counter).
>
> As an alternative, we can put the flag directly in kvm_run so that
> KVM can see it:
>
> VCPU thread service thread
> --------------------------------------------------------------
> raise signal
> signal handler
> set run->immediate_exit
> KVM_RUN
> check run->immediate_exit
So the idea is to have both, a signal and this flag and you want userspace
to set this flag in its signal handler? So we no longer block this signal
in QEMU then. Makes sense.
Do you have the QEMU patch ready, to do a better review of the whole idea?
>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> ---
> Documentation/virtual/kvm/api.txt | 13 ++++++++++++-
> include/uapi/linux/kvm.h | 1 +
> virt/kvm/kvm_main.c | 14 +++++++++++---
> 3 files changed, 24 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index e4f2cdcf78eb..925b1b6be073 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3389,7 +3389,18 @@ struct kvm_run {
> Request that KVM_RUN return when it becomes possible to inject external
> interrupts into the guest. Useful in conjunction with KVM_INTERRUPT.
>
> - __u8 padding1[7];
> + __u8 immediate_exit;
> +
> +This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN
> +clears the field and exits immediately, returning -EINTR. In the common
> +scenario where a signal is used to "kick" a VCPU out of KVM_RUN, this
> +field can be used as an alternative to KVM_SET_SIGNAL_MASK: instead of
> +blocking the signal outside KVM_RUN, the signal handler should set
> +run->immediate_exit to a nonzero value.
> +
> +This field is only available if KVM_CAP_IMMEDIATE_EXIT is.
> +
> + __u8 padding1[6];
>
> /* out */
> __u32 exit_reason;
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 7964b970b9ad..0aa43b1e5b1d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -881,6 +881,7 @@ struct kvm_ppc_resize_hpt {
> #define KVM_CAP_SPAPR_RESIZE_HPT 133
> #define KVM_CAP_PPC_MMU_RADIX 134
> #define KVM_CAP_PPC_MMU_HASH_V3 135
> +#define KVM_CAP_IMMEDIATE_EXIT 136
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 482612b4e496..5abb8df00db1 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2550,7 +2550,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
> if (r)
> return r;
> switch (ioctl) {
> - case KVM_RUN:
> + case KVM_RUN: {
> + struct kvm_run *run;
> r = -EINVAL;
> if (arg)
> goto out;
> @@ -2564,9 +2565,15 @@ static long kvm_vcpu_ioctl(struct file *filp,
> synchronize_rcu();
> put_pid(oldpid);
> }
> - r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
> - trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
> + run = vcpu->run;
> + if (run->immediate_exit) {
> + WRITE_ONCE(run->immediate_exit, 0);
> + return -EINTR;
> + }
> + r = kvm_arch_vcpu_ioctl_run(vcpu, run);
> + trace_kvm_userspace_exit(run->exit_reason, r);
> break;
> + }
> case KVM_GET_REGS: {
> struct kvm_regs *kvm_regs;
>
> @@ -2927,6 +2934,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
> #endif
> case KVM_CAP_IOEVENTFD_ANY_LENGTH:
> case KVM_CAP_CHECK_EXTENSION_VM:
> + case KVM_CAP_IMMEDIATE_EXIT:
> return 1;
> #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
> case KVM_CAP_IRQ_ROUTING:
>