Re: [PATCH v3 08/15] KVM: MMU: allow unmap invalid rmap out of mmu-lock

From: Gleb Natapov
Date: Thu Apr 18 2013 - 07:00:51 EST


On Tue, Apr 16, 2013 at 02:32:46PM +0800, Xiao Guangrong wrote:
> pte_list_clear_concurrently allows us to reset pte-desc entry
> out of mmu-lock. We can reset spte out of mmu-lock if we can protect the
> lifecycle of sp, we use this way to achieve the goal:
>
> unmap_memslot_rmap_nolock():
> for-each-rmap-in-slot:
> preempt_disable
> kvm->arch.being_unmapped_rmap = rmapp
> clear spte and reset rmap entry
> kvm->arch.being_unmapped_rmap = NULL
> preempt_enable
>
> Other paths like zap-sp and mmu-notify which are protected
> by mmu-lock:
> clear spte and reset rmap entry
> retry:
> if (kvm->arch.being_unmapped_rmap == rmap)
> goto retry
> (the wait is very rare and clearing one rmap is very fast, so it
> is not bad even if a wait is needed)
>
I do not understand how this achieves the goal. Suppose that rmap
== X and kvm->arch.being_unmapped_rmap == NULL, so "goto retry" is skipped,
but a moment later unmap_memslot_rmap_nolock() does
kvm->arch.being_unmapped_rmap = X.

> Then, we can be sure the spte is always available when we do
> unmap_memslot_rmap_nolock
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@xxxxxxxxxxxxxxxxxx>
> ---
> arch/x86/include/asm/kvm_host.h | 2 +
> arch/x86/kvm/mmu.c | 114 ++++++++++++++++++++++++++++++++++++---
> arch/x86/kvm/mmu.h | 2 +-
> 3 files changed, 110 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 5fd6ed1..1ad9a34 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -536,6 +536,8 @@ struct kvm_arch {
> * Hash table of struct kvm_mmu_page.
> */
> struct list_head active_mmu_pages;
> + unsigned long *being_unmapped_rmap;
> +
> struct list_head assigned_dev_head;
> struct iommu_domain *iommu_domain;
> int iommu_flags;
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 2a7a5d0..e6414d2 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1104,10 +1104,10 @@ static int slot_rmap_add(struct kvm_memory_slot *slot,
> return slot->arch.ops->rmap_add(vcpu, spte, rmapp);
> }
>
> -static void slot_rmap_remove(struct kvm_memory_slot *slot,
> +static void slot_rmap_remove(struct kvm_memory_slot *slot, struct kvm *kvm,
> unsigned long *rmapp, u64 *spte)
> {
> - slot->arch.ops->rmap_remove(spte, rmapp);
> + slot->arch.ops->rmap_remove(kvm, spte, rmapp);
> }
>
> static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> @@ -1132,7 +1132,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
> sp = page_header(__pa(spte));
> gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
> rmapp = gfn_to_rmap(kvm, &slot, gfn, sp->role.level);
> - slot_rmap_remove(slot, rmapp, spte);
> + slot_rmap_remove(slot, kvm, rmapp, spte);
> }
>
> /*
> @@ -1589,9 +1589,14 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
> return kvm_handle_hva(kvm, hva, 0, slot_rmap_test_age);
> }
>
> +static void rmap_remove_spte(struct kvm *kvm, u64 *spte, unsigned long *rmapp)
> +{
> + pte_list_remove(spte, rmapp);
> +}
> +
> static struct rmap_operations normal_rmap_ops = {
> .rmap_add = pte_list_add,
> - .rmap_remove = pte_list_remove,
> + .rmap_remove = rmap_remove_spte,
>
> .rmap_write_protect = __rmap_write_protect,
>
> @@ -1613,9 +1618,27 @@ static int invalid_rmap_add(struct kvm_vcpu *vcpu, u64 *spte,
> return 0;
> }
>
> -static void invalid_rmap_remove(u64 *spte, unsigned long *rmapp)
> +static void sync_being_unmapped_rmap(struct kvm *kvm, unsigned long *rmapp)
> +{
> + /*
> + * Ensure all the sptes on the rmap have been zapped and
> + * the rmap's entries have been reset so that
> + * unmap_invalid_rmap_nolock can not get any spte from the
> + * rmap after calling sync_being_unmapped_rmap().
> + */
> + smp_mb();
> +retry:
> + if (unlikely(ACCESS_ONCE(kvm->arch.being_unmapped_rmap) == rmapp)) {
> + cpu_relax();
> + goto retry;
> + }
> +}
> +
> +static void
> +invalid_rmap_remove(struct kvm *kvm, u64 *spte, unsigned long *rmapp)
> {
> pte_list_clear_concurrently(spte, rmapp);
> + sync_being_unmapped_rmap(kvm, rmapp);
> }
>
> static bool invalid_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
> @@ -1635,7 +1658,11 @@ static int __kvm_unmap_invalid_rmapp(unsigned long *rmapp)
> if (sptep == PTE_LIST_SPTE_SKIP)
> continue;
>
> - /* Do not call .rmap_remove(). */
> + /*
> + * Do not call .rmap_remove() since we do not want to wait
> + * on sync_being_unmapped_rmap() when all sptes should be
> + * removed from the rmap.
> + */
> if (mmu_spte_clear_track_bits(sptep))
> pte_list_clear_concurrently(sptep, rmapp);
> }
> @@ -1645,7 +1672,10 @@ static int __kvm_unmap_invalid_rmapp(unsigned long *rmapp)
>
> static int kvm_unmap_invalid_rmapp(struct kvm *kvm, unsigned long *rmapp)
> {
> - return __kvm_unmap_invalid_rmapp(rmapp);
> + int ret = __kvm_unmap_invalid_rmapp(rmapp);
> +
> + sync_being_unmapped_rmap(kvm, rmapp);
> + return ret;
> }
>
> static int invalid_rmap_set_pte(struct kvm *kvm, unsigned long *rmapp,
> @@ -1686,6 +1716,76 @@ static struct rmap_operations invalid_rmap_ops = {
> .rmap_unmap = kvm_unmap_invalid_rmapp
> };
>
> +typedef void (*handle_rmap_fun)(unsigned long *rmapp, void *data);
> +static void walk_memslot_rmap_nolock(struct kvm_memory_slot *slot,
> + handle_rmap_fun fun, void *data)
> +{
> + int level;
> +
> + for (level = PT_PAGE_TABLE_LEVEL;
> + level < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++level) {
> + unsigned long idx, *rmapp;
> +
> + rmapp = slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL];
> + idx = gfn_to_index(slot->base_gfn + slot->npages - 1,
> + slot->base_gfn, level) + 1;
> + /*
> + * Walk rmap from the high index to low index to reduce
> + * possible wait in sync_being_unmapped_rmap().
> + */
> + while (idx--)
> + fun(rmapp + idx, data);
> + }
> +}
> +
> +static void unmap_rmap_no_lock_begin(struct kvm *kvm, unsigned long *rmapp)
> +{
> + preempt_disable();
> + kvm->arch.being_unmapped_rmap = rmapp;
> +
> + /*
> + * Setting being_unmapped_rmap must happen before reading/writing
> + * any sptes on the rmaps.
> + * See the comment in sync_being_unmapped_rmap().
> + */
> + smp_mb();
> +}
> +
> +static void unmap_rmap_no_lock_end(struct kvm *kvm)
> +{
> + /*
> + * Ensure clearing spte and resetting rmap's entries has
> + * been finished.
> + * See the comment in sync_being_unmapped_rmap().
> + */
> + smp_mb();
> + kvm->arch.being_unmapped_rmap = NULL;
> + preempt_enable();
> +}
> +
> +static void unmap_invalid_rmap_nolock(unsigned long *rmapp, void *data)
> +{
> + struct kvm *kvm = (struct kvm *)data;
> +
> + if (!ACCESS_ONCE(*rmapp))
> + return;
> +
> + unmap_rmap_no_lock_begin(kvm, rmapp);
> + __kvm_unmap_invalid_rmapp(rmapp);
> + unmap_rmap_no_lock_end(kvm);
> +}
> +
> +static void
> +unmap_memslot_rmap_nolock(struct kvm *kvm, struct kvm_memory_slot *slot)
> +{
> + /* Only invalid rmaps can be unmapped out of mmu-lock. */
> + WARN_ON(slot->arch.ops != &invalid_rmap_ops);
> + /* Use slots_lock to protect kvm->arch.being_unmapped_rmap. */
> + WARN_ON(!mutex_is_locked(&kvm->slots_lock));
> +
> + walk_memslot_rmap_nolock(slot, unmap_invalid_rmap_nolock, kvm);
> +}
> +
> #ifdef MMU_DEBUG
> static int is_empty_shadow_page(u64 *spt)
> {
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index bb2b22e..d6aa31a 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -117,7 +117,7 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
> struct rmap_operations {
> int (*rmap_add)(struct kvm_vcpu *vcpu, u64 *spte,
> unsigned long *rmap);
> - void (*rmap_remove)(u64 *spte, unsigned long *rmap);
> + void (*rmap_remove)(struct kvm *kvm, u64 *spte, unsigned long *rmap);
>
> bool (*rmap_write_protect)(struct kvm *kvm, unsigned long *rmap,
> bool pt_protect);
> --
> 1.7.7.6

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/