Re: [patch V4 part 1 13/36] x86/kvm: Restrict ASYNC_PF to user space

From: Paolo Bonzini
Date: Wed May 06 2020 - 03:00:25 EST


On 05/05/20 15:16, Thomas Gleixner wrote:
> The async page fault injection into kernel space creates more problems than
> it solves. The host has absolutely no knowledge about the state of the
> guest if the fault happens in CPL0. The only restriction for the host is
> interrupt disabled state. If interrupts are enabled in the guest then the
> exception can hit arbitrary code. The HALT based wait in non-preemotible
> code is a hacky replacement for a proper hypercall.
>
> For the ongoing work to restrict instrumentation and make the RCU idle
> interaction well defined the required extra work for supporting async
> pagefault in CPL0 is just not justified and creates complexity for a
> dubious benefit.
>
> The CPL3 injection is well defined and does not cause any issues as it is
> more or less the same as a regular page fault from CPL3.
>
> Suggested-by: Andy Lutomirski <luto@xxxxxxxxxx>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> ---
> arch/x86/kernel/kvm.c | 100 +++-----------------------------------------------
> 1 file changed, 7 insertions(+), 93 deletions(-)
>
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -75,7 +75,6 @@ struct kvm_task_sleep_node {
> struct swait_queue_head wq;
> u32 token;
> int cpu;
> - bool use_halt;
> };
>
> static struct kvm_task_sleep_head {
> @@ -98,8 +97,7 @@ static struct kvm_task_sleep_node *_find
> return NULL;
> }
>
> -static bool kvm_async_pf_queue_task(u32 token, bool use_halt,
> - struct kvm_task_sleep_node *n)
> +static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
> {
> u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
> struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
> @@ -117,7 +115,6 @@ static bool kvm_async_pf_queue_task(u32
>
> n->token = token;
> n->cpu = smp_processor_id();
> - n->use_halt = use_halt;
> init_swait_queue_head(&n->wq);
> hlist_add_head(&n->link, &b->list);
> raw_spin_unlock(&b->lock);
> @@ -138,7 +135,7 @@ void kvm_async_pf_task_wait_schedule(u32
>
> lockdep_assert_irqs_disabled();
>
> - if (!kvm_async_pf_queue_task(token, false, &n))
> + if (!kvm_async_pf_queue_task(token, &n))
> return;
>
> for (;;) {
> @@ -154,91 +151,10 @@ void kvm_async_pf_task_wait_schedule(u32
> }
> EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
>
> -/*
> - * Invoked from the async page fault handler.
> - */
> -static void kvm_async_pf_task_wait_halt(u32 token)
> -{
> - struct kvm_task_sleep_node n;
> -
> - if (!kvm_async_pf_queue_task(token, true, &n))
> - return;
> -
> - for (;;) {
> - if (hlist_unhashed(&n.link))
> - break;
> - /*
> - * No point in doing anything about RCU here. Any RCU read
> - * side critical section or RCU watching section can be
> - * interrupted by VMEXITs and the host is free to keep the
> - * vCPU scheduled out as long as it sees fit. This is not
> - * any different just because of the halt induced voluntary
> - * VMEXIT.
> - *
> - * Also the async page fault could have interrupted any RCU
> - * watching context, so invoking rcu_irq_exit()/enter()
> - * around this is not gaining anything.
> - */
> - native_safe_halt();
> - local_irq_disable();
> - }
> -}
> -
> -/* Invoked from the async page fault handler */
> -static void kvm_async_pf_task_wait(u32 token, bool usermode)
> -{
> - bool can_schedule;
> -
> - /*
> - * No need to check whether interrupts were disabled because the
> - * host will (hopefully) only inject an async page fault into
> - * interrupt enabled regions.
> - *
> - * If CONFIG_PREEMPTION is enabled then check whether the code
> - * which triggered the page fault is preemptible. This covers user
> - * mode as well because preempt_count() is obviously 0 there.
> - *
> - * The check for rcu_preempt_depth() is also required because
> - * voluntary scheduling inside a rcu read locked section is not
> - * allowed.
> - *
> - * The idle task is already covered by this because idle always
> - * has a preempt count > 0.
> - *
> - * If CONFIG_PREEMPTION is disabled only allow scheduling when
> - * coming from user mode as there is no indication whether the
> - * context which triggered the page fault could schedule or not.
> - */
> - if (IS_ENABLED(CONFIG_PREEMPTION))
> - can_schedule = preempt_count() + rcu_preempt_depth() == 0;
> - else
> - can_schedule = usermode;
> -
> - /*
> - * If the kernel context is allowed to schedule then RCU is
> - * watching because no preemptible code in the kernel is inside RCU
> - * idle state. So it can be treated like user mode. User mode is
> - * safe because the #PF entry invoked enter_from_user_mode().
> - *
> - * For the non schedulable case invoke rcu_irq_enter() for
> - * now. This will be moved out to the pagefault entry code later
> - * and only invoked when really needed.
> - */
> - if (can_schedule) {
> - kvm_async_pf_task_wait_schedule(token);
> - } else {
> - rcu_irq_enter();
> - kvm_async_pf_task_wait_halt(token);
> - rcu_irq_exit();
> - }
> -}
> -
> static void apf_task_wake_one(struct kvm_task_sleep_node *n)
> {
> hlist_del_init(&n->link);
> - if (n->use_halt)
> - smp_send_reschedule(n->cpu);
> - else if (swq_has_sleeper(&n->wq))
> + if (swq_has_sleeper(&n->wq))
> swake_up_one(&n->wq);
> }
>
> @@ -337,8 +253,10 @@ bool __kvm_handle_async_pf(struct pt_reg
> panic("Host injected async #PF in interrupt disabled region\n");
>
> if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
> - /* page is swapped out by the host. */
> - kvm_async_pf_task_wait(token, user_mode(regs));
> + if (unlikely(!(user_mode(regs))))
> + panic("Host injected async #PF in kernel mode\n");
> + /* Page is swapped out by the host. */
> + kvm_async_pf_task_wait_schedule(token);
> } else {
> rcu_irq_enter();
> kvm_async_pf_task_wake(token);
> @@ -397,10 +315,6 @@ static void kvm_guest_cpu_init(void)
> WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
>
> pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
> -
> -#ifdef CONFIG_PREEMPTION
> - pa |= KVM_ASYNC_PF_SEND_ALWAYS;
> -#endif
> pa |= KVM_ASYNC_PF_ENABLED;
>
> if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
>

Acked-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>