Re: [PATCH v2 14/16] KVM: MMU: fast path of handling guest page fault

From: Marcelo Tosatti
Date: Tue Apr 17 2012 - 21:52:58 EST


On Fri, Apr 13, 2012 at 06:16:33PM +0800, Xiao Guangrong wrote:
> If the present bit of the page fault error code is set, it indicates
> that the shadow page table is populated on all levels, which means
> all we need to do is modify the access bits, and that can be done
> out of mmu-lock.
>
> Currently, in order to simplify the code, only page faults caused
> by write-protect are fixed on the fast path.
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@xxxxxxxxxxxxxxxxxx>
> ---
> arch/x86/kvm/mmu.c | 205 ++++++++++++++++++++++++++++++++++++++++----
> arch/x86/kvm/paging_tmpl.h | 3 +
> 2 files changed, 192 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index efa5d59..fc91667 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -446,6 +446,13 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
> }
> #endif
>
> +static bool spte_wp_by_dirty_log(u64 spte)
> +{
> + WARN_ON(is_writable_pte(spte));
> +
> + return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
> +}
> +
> static bool spte_has_volatile_bits(u64 spte)
> {
> if (!shadow_accessed_mask)
> @@ -454,9 +461,18 @@ static bool spte_has_volatile_bits(u64 spte)
> if (!is_shadow_present_pte(spte))
> return false;
>
> - if ((spte & shadow_accessed_mask) &&
> - (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
> - return false;
> + if (spte & shadow_accessed_mask) {
> + if (is_writable_pte(spte))
> + return !(spte & shadow_dirty_mask);
> +
> + /*
> + * If the spte is write-protected by dirty-log, it may
> + * be marked writable on the fast page fault path, and then
> + * the CPU can set the Dirty bit.
> + */
> + if (!spte_wp_by_dirty_log(spte))
> + return false;
> + }
>
> return true;
> }
> @@ -1109,26 +1125,18 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
> rmap_remove(kvm, sptep);
> }
>
> -static bool spte_wp_by_dirty_log(u64 spte)
> -{
> - WARN_ON(is_writable_pte(spte));
> -
> - return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
> -}
> -
> static void spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
> bool *flush, bool page_table_protect)
> {
> u64 spte = *sptep;
>
> if (is_writable_pte(spte)) {
> - *flush |= true;
> -
> if (large) {
> pgprintk("rmap_write_protect(large): spte %p %llx\n",
> spte, *spte);
> BUG_ON(!is_large_pte(spte));
>
> + *flush |= true;
> drop_spte(kvm, sptep);
> --kvm->stat.lpages;
> return;
> @@ -1137,6 +1145,9 @@ static void spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
> goto reset_spte;
> }
>
> + /*
> + * We need to flush TLBs in this case: the fast page fault
> + * path can mark the spte writable after we read the sptep.
> + */
> if (page_table_protect && spte_wp_by_dirty_log(spte))
> goto reset_spte;
>
> @@ -1144,6 +1155,8 @@ static void spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
>
> reset_spte:
> rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
> +
> + *flush |= true;
> spte = spte & ~PT_WRITABLE_MASK;
> if (page_table_protect)
> spte |= SPTE_WRITE_PROTECT;
> @@ -2767,18 +2780,172 @@ exit:
> return ret;
> }
>
> +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
> + u32 error_code)
> +{
> + unsigned long *rmap;
> +
> + /*
> + * A #PF can be fast only if the shadow page table is present
> + * and the fault is caused by write-protect; that means we just
> + * need to change the W bit of the spte, which can be done out
> + * of mmu-lock.
> + */
> + if (!(error_code & PFERR_PRESENT_MASK) ||
> + !(error_code & PFERR_WRITE_MASK))
> + return false;
> +
> + rmap = gfn_to_rmap(vcpu->kvm, gfn, PT_PAGE_TABLE_LEVEL);
> +
> + /* Quickly check whether the page can be made writable. */
> + if (test_bit(PTE_LIST_WP_BIT, ACCESS_ONCE(rmap)))
> + return false;
> +
> + return true;
> +}
> +
> +static bool
> +fast_pf_fix_indirect_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> + u64 *sptep, u64 spte, gfn_t gfn)
> +{
> + pfn_t pfn;
> + bool ret = false;
> +
> + /*
> + * For an indirect spte it is hard to get a stable gfn: we only
> + * use a cmpxchg to avoid races, which is not enough to avoid
> + * the ABA problem, since the host can arbitrarily change the
> + * spte and the mapping from gfn to pfn.
> + *
> + * What we do is call gfn_to_pfn_atomic to bind the gfn to the
> + * pfn, because after the call:
> + * - we hold a refcount on the pfn, which means the pfn cannot
> + * be freed and reused for another gfn;
> + * - the pfn is writable, which means it cannot be shared by
> + * different gfns.
> + */
> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);

Please document what can happen in parallel whenever you manipulate
sptes without mmu_lock held, convincing the reader that this is safe.
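
For example, something along these lines at the point where the spte
is actually updated (a sketch only; fast_pf_fix_spte and the exact
race list are illustrative, not meant as a drop-in):

	/*
	 * Writers we can race with while mmu_lock is not held:
	 *
	 * - spte_write_protect() under mmu_lock: if its update lands
	 *   before our cmpxchg, the cmpxchg fails and we fall back to
	 *   the slow path; if it lands after, it overwrites our W bit,
	 *   which is why the write-protect side must flush TLBs
	 *   unconditionally (see the spte_write_protect() hunk above);
	 * - the spte being zapped: the cmpxchg fails because the spte
	 *   is no longer present;
	 * - another vcpu taking the same fast fault: only one cmpxchg
	 *   succeeds, the loser simply re-executes the faulting access.
	 */
	static bool fast_pf_fix_spte(u64 *sptep, u64 spte)
	{
		return cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte;
	}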

> +
> +/*
> + * Return value:
> + * - true: let the vcpu access the same address again.
> + * - false: let the real page fault path fix it.
> + */
> +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
> + int level, u32 error_code)
> +{
> + struct kvm_shadow_walk_iterator iterator;
> + struct kvm_mmu_page *sp;
> + bool ret = false;
> + u64 spte = 0ull;
> +
> + if (!page_fault_can_be_fast(vcpu, gfn, error_code))
> + return false;

What prevents kvm_mmu_commit_zap_page from nuking the "shadow_page"
here, again? At this point the faulting spte could be zero (!present),
and you have not yet increased reader_counter.

Same with current users of walk_shadow_page_lockless_begin.
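
To be concrete: the walk has to be visible to a concurrent zap before
any spte is dereferenced, i.e. bracketed like the existing lockless
walkers are (sketch only, relying on what
walk_shadow_page_lockless_begin/_end do today):

	/*
	 * rcu_read_lock() + reader_counter++ + smp_mb, so that a
	 * concurrent kvm_mmu_commit_zap_page() sees us and defers
	 * freeing the shadow pages until we are done.
	 */
	walk_shadow_page_lockless_begin(vcpu);

	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
		if (!is_shadow_present_pte(spte) || iterator.level < level)
			break;

	/* ... validate and cmpxchg the spte here ... */

	walk_shadow_page_lockless_end(vcpu);

Any spte read before the reader_counter increment falls exactly in the
window where the shadow pages can be freed under you.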

I agree with Takuya: please reduce the patchset to only what is
strictly necessary; it appears many of the first patches are not.
