Re: [PATCH v7 08/12] Handle async PF in a guest.

From: Jan Kiszka
Date: Wed Oct 20 2010 - 07:51:33 EST


Am 20.10.2010 13:48, Jan Kiszka wrote:
> Am 14.10.2010 11:22, Gleb Natapov wrote:
>> When async PF capability is detected hook up special page fault handler
>> that will handle async page fault events and bypass other page faults to
>> regular page fault handler. Also add async PF handling to nested SVM
>> emulation. Async PF always generates exit to L1 where vcpu thread will
>> be scheduled out until page is available.
>>
>> Acked-by: Rik van Riel <riel@xxxxxxxxxx>
>> Signed-off-by: Gleb Natapov <gleb@xxxxxxxxxx>
>> ---
>> arch/x86/include/asm/kvm_para.h | 12 +++
>> arch/x86/include/asm/traps.h | 1 +
>> arch/x86/kernel/entry_32.S | 10 ++
>> arch/x86/kernel/entry_64.S | 3 +
>> arch/x86/kernel/kvm.c | 181 +++++++++++++++++++++++++++++++++++++++
>> arch/x86/kvm/svm.c | 45 ++++++++--
>> 6 files changed, 243 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
>> index 2315398..fbfd367 100644
>> --- a/arch/x86/include/asm/kvm_para.h
>> +++ b/arch/x86/include/asm/kvm_para.h
>> @@ -65,6 +65,9 @@ struct kvm_mmu_op_release_pt {
>> __u64 pt_phys;
>> };
>>
>> +#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
>> +#define KVM_PV_REASON_PAGE_READY 2
>> +
>> struct kvm_vcpu_pv_apf_data {
>> __u32 reason;
>> __u8 pad[60];
>> @@ -171,8 +174,17 @@ static inline unsigned int kvm_arch_para_features(void)
>>
>> #ifdef CONFIG_KVM_GUEST
>> void __init kvm_guest_init(void);
>> +void kvm_async_pf_task_wait(u32 token);
>> +void kvm_async_pf_task_wake(u32 token);
>> +u32 kvm_read_and_reset_pf_reason(void);
>> #else
>> #define kvm_guest_init() do { } while (0)
>> +#define kvm_async_pf_task_wait(T) do {} while(0)
>> +#define kvm_async_pf_task_wake(T) do {} while(0)
>> +static u32 kvm_read_and_reset_pf_reason(void)
>> +{
>> + return 0;
>> +}
>> #endif
>>
>> #endif /* __KERNEL__ */
>> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
>> index f66cda5..0310da6 100644
>> --- a/arch/x86/include/asm/traps.h
>> +++ b/arch/x86/include/asm/traps.h
>> @@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
>> asmlinkage void stack_segment(void);
>> asmlinkage void general_protection(void);
>> asmlinkage void page_fault(void);
>> +asmlinkage void async_page_fault(void);
>> asmlinkage void spurious_interrupt_bug(void);
>> asmlinkage void coprocessor_error(void);
>> asmlinkage void alignment_check(void);
>> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
>> index 227d009..e6e7273 100644
>> --- a/arch/x86/kernel/entry_32.S
>> +++ b/arch/x86/kernel/entry_32.S
>> @@ -1496,6 +1496,16 @@ ENTRY(general_protection)
>> CFI_ENDPROC
>> END(general_protection)
>>
>> +#ifdef CONFIG_KVM_GUEST
>> +ENTRY(async_page_fault)
>> + RING0_EC_FRAME
>> + pushl $do_async_page_fault
>> + CFI_ADJUST_CFA_OFFSET 4
>> + jmp error_code
>> + CFI_ENDPROC
>> +END(apf_page_fault)
>> +#endif
>> +
>> /*
>> * End of kprobes section
>> */
>> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
>> index 17be5ec..def98c3 100644
>> --- a/arch/x86/kernel/entry_64.S
>> +++ b/arch/x86/kernel/entry_64.S
>> @@ -1349,6 +1349,9 @@ errorentry xen_stack_segment do_stack_segment
>> #endif
>> errorentry general_protection do_general_protection
>> errorentry page_fault do_page_fault
>> +#ifdef CONFIG_KVM_GUEST
>> +errorentry async_page_fault do_async_page_fault
>> +#endif
>> #ifdef CONFIG_X86_MCE
>> paranoidzeroentry machine_check *machine_check_vector(%rip)
>> #endif
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index 032d03b..d564063 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -29,8 +29,14 @@
>> #include <linux/hardirq.h>
>> #include <linux/notifier.h>
>> #include <linux/reboot.h>
>> +#include <linux/hash.h>
>> +#include <linux/sched.h>
>> +#include <linux/slab.h>
>> +#include <linux/kprobes.h>
>> #include <asm/timer.h>
>> #include <asm/cpu.h>
>> +#include <asm/traps.h>
>> +#include <asm/desc.h>
>>
>> #define MMU_QUEUE_SIZE 1024
>>
>> @@ -64,6 +70,168 @@ static void kvm_io_delay(void)
>> {
>> }
>>
>> +#define KVM_TASK_SLEEP_HASHBITS 8
>> +#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
>> +
>> +struct kvm_task_sleep_node {
>> + struct hlist_node link;
>> + wait_queue_head_t wq;
>> + u32 token;
>> + int cpu;
>> +};
>> +
>> +static struct kvm_task_sleep_head {
>> + spinlock_t lock;
>> + struct hlist_head list;
>> +} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
>> +
>> +static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
>> + u32 token)
>> +{
>> + struct hlist_node *p;
>> +
>> + hlist_for_each(p, &b->list) {
>> + struct kvm_task_sleep_node *n =
>> + hlist_entry(p, typeof(*n), link);
>> + if (n->token == token)
>> + return n;
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +void kvm_async_pf_task_wait(u32 token)
>> +{
>> + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
>> + struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
>> + struct kvm_task_sleep_node n, *e;
>> + DEFINE_WAIT(wait);
>> +
>> + spin_lock(&b->lock);
>> + e = _find_apf_task(b, token);
>> + if (e) {
>> + /* dummy entry exist -> wake up was delivered ahead of PF */
>> + hlist_del(&e->link);
>> + kfree(e);
>> + spin_unlock(&b->lock);
>> + return;
>> + }
>> +
>> + n.token = token;
>> + n.cpu = smp_processor_id();
>> + init_waitqueue_head(&n.wq);
>> + hlist_add_head(&n.link, &b->list);
>> + spin_unlock(&b->lock);
>> +
>> + for (;;) {
>> + prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
>> + if (hlist_unhashed(&n.link))
>> + break;
>> + local_irq_enable();
>> + schedule();
>> + local_irq_disable();
>> + }
>> + finish_wait(&n.wq, &wait);
>> +
>> + return;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
>> +
>> +static void apf_task_wake_one(struct kvm_task_sleep_node *n)
>> +{
>> + hlist_del_init(&n->link);
>> + if (waitqueue_active(&n->wq))
>> + wake_up(&n->wq);
>> +}
>> +
>> +static void apf_task_wake_all(void)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
>> + struct hlist_node *p, *next;
>> + struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
>> + spin_lock(&b->lock);
>> + hlist_for_each_safe(p, next, &b->list) {
>> + struct kvm_task_sleep_node *n =
>> + hlist_entry(p, typeof(*n), link);
>> + if (n->cpu == smp_processor_id())
>> + apf_task_wake_one(n);
>> + }
>> + spin_unlock(&b->lock);
>> + }
>> +}
>> +
>> +void kvm_async_pf_task_wake(u32 token)
>> +{
>> + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
>> + struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
>> + struct kvm_task_sleep_node *n;
>> +
>> + if (token == ~0) {
>> + apf_task_wake_all();
>> + return;
>> + }
>> +
>> +again:
>> + spin_lock(&b->lock);
>> + n = _find_apf_task(b, token);
>> + if (!n) {
>> + /*
>> + * async PF was not yet handled.
>> + * Add dummy entry for the token.
>> + */
>> + n = kmalloc(sizeof(*n), GFP_ATOMIC);
>> + if (!n) {
>> + /*
>> + * Allocation failed! Busy wait while other cpu
>> + * handles async PF.
>> + */
>> + spin_unlock(&b->lock);
>> + cpu_relax();
>> + goto again;
>> + }
>> + n->token = token;
>> + n->cpu = smp_processor_id();
>> + init_waitqueue_head(&n->wq);
>> + hlist_add_head(&n->link, &b->list);
>> + } else
>> + apf_task_wake_one(n);
>> + spin_unlock(&b->lock);
>> + return;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
>> +
>> +u32 kvm_read_and_reset_pf_reason(void)
>> +{
>> + u32 reason = 0;
>> +
>> + if (__get_cpu_var(apf_reason).enabled) {
>> + reason = __get_cpu_var(apf_reason).reason;
>> + __get_cpu_var(apf_reason).reason = 0;
>> + }
>> +
>> + return reason;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
>> +
>> +dotraplinkage void __kprobes
>> +do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
>> +{
>> + switch (kvm_read_and_reset_pf_reason()) {
>> + default:
>> + do_page_fault(regs, error_code);
>> + break;
>> + case KVM_PV_REASON_PAGE_NOT_PRESENT:
>> + /* page is swapped out by the host. */
>> + kvm_async_pf_task_wait((u32)read_cr2());
>> + break;
>> + case KVM_PV_REASON_PAGE_READY:
>> + kvm_async_pf_task_wake((u32)read_cr2());
>> + break;
>> + }
>> +}
>> +
>> static void kvm_mmu_op(void *buffer, unsigned len)
>> {
>> int r;
>> @@ -300,6 +468,7 @@ static void kvm_guest_cpu_online(void *dummy)
>> static void kvm_guest_cpu_offline(void *dummy)
>> {
>> kvm_pv_disable_apf(NULL);
>> + apf_task_wake_all();
>> }
>>
>> static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
>> @@ -327,13 +496,25 @@ static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
>> };
>> #endif
>>
>> +static void __init kvm_apf_trap_init(void)
>> +{
>> + set_intr_gate(14, &async_page_fault);
>> +}
>> +
>> void __init kvm_guest_init(void)
>> {
>> + int i;
>> +
>> if (!kvm_para_available())
>> return;
>>
>> paravirt_ops_setup();
>> register_reboot_notifier(&kvm_pv_reboot_nb);
>> + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
>> + spin_lock_init(&async_pf_sleepers[i].lock);
>> + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
>> + x86_init.irqs.trap_init = kvm_apf_trap_init;
>> +
>> #ifdef CONFIG_SMP
>> smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
>> register_cpu_notifier(&kvm_cpu_notifier);
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index 9a92224..9fa27a5 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -31,6 +31,7 @@
>>
>> #include <asm/tlbflush.h>
>> #include <asm/desc.h>
>> +#include <asm/kvm_para.h>
>>
>> #include <asm/virtext.h>
>> #include "trace.h"
>> @@ -133,6 +134,7 @@ struct vcpu_svm {
>>
>> unsigned int3_injected;
>> unsigned long int3_rip;
>> + u32 apf_reason;
>> };
>>
>> #define MSR_INVALID 0xffffffffU
>> @@ -1383,16 +1385,33 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
>>
>> static int pf_interception(struct vcpu_svm *svm)
>> {
>> - u64 fault_address;
>> + u64 fault_address = svm->vmcb->control.exit_info_2;
>> u32 error_code;
>> + int r = 1;
>>
>> - fault_address = svm->vmcb->control.exit_info_2;
>> - error_code = svm->vmcb->control.exit_info_1;
>> + switch (svm->apf_reason) {
>> + default:
>> + error_code = svm->vmcb->control.exit_info_1;
>>
>> - trace_kvm_page_fault(fault_address, error_code);
>> - if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
>> - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
>> - return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
>> + trace_kvm_page_fault(fault_address, error_code);
>> + if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
>> + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
>> + r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
>> + break;
>> + case KVM_PV_REASON_PAGE_NOT_PRESENT:
>> + svm->apf_reason = 0;
>> + local_irq_disable();
>> + kvm_async_pf_task_wait(fault_address);
>> + local_irq_enable();
>> + break;
>> + case KVM_PV_REASON_PAGE_READY:
>> + svm->apf_reason = 0;
>> + local_irq_disable();
>> + kvm_async_pf_task_wake(fault_address);
>> + local_irq_enable();
>> + break;
>
> That's only available if CONFIG_KVM_GUEST is set, no? Is there anything
> I miss that resolves this dependency automatically? Otherwise, some more
> #ifdef CONFIG_KVM_GUEST might be needed.

Err, found it. Sorry for the noise.

Jan

--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/