Re: [PATCH 1/2] KVM: fix cache stale memslot info with correct mmio generation number

From: Xiao Guangrong
Date: Mon Aug 18 2014 - 15:56:35 EST



On Aug 19, 2014, at 2:47 AM, Paolo Bonzini <pbonzini@xxxxxxxxxx> wrote:

>
>> I think this patch is auditable, page-fault is always called by holding
>> srcu-lock so that a page fault can’t go across synchronize_srcu_expedited.
>> Only these cases can happen:
>>
>> 1) page fault occurs before synchronize_srcu_expedited.
>> In this case, vcpu will generate mmio-exit for the memslot being registered
>> by the ioctl. That’s ok since the ioctl have not finished.
>>
>> 2) page fault occurs after synchronize_srcu_expedited and during
>> increasing generation-number.
>> In this case, userspace may get wrong mmio-exit (that happen if handing
>> page-fault is slower that the ioctl), that’s ok too since userspace need do
>> the check anyway like i said above.
>>
>> 3) page fault occurs after generation-number update
>> that’s definitely correct. :)
>>
>>> Another alternative could be to use the low bit to mark an in-progress
>>> change, and skip the caching if the low bit is set. Similar to a
>>> seqcount (except if read_seqcount_retry fails, we just punt and not
>>> retry anything), you could use it even though the memory barriers
>>> provided by write_seqcount_begin/end are not too useful in this case.
>>
>> I do not know how the bit works, page fault will cache the memslot before
>> the bit set and cache the generation-number after the bit set.
>>
>> Maybe i missed your idea, could you please detail it?
>
> Something like this:
>
> - update_memslots(slots, new, kvm->memslots->generation);
> + /* ensure generation number is always increased. */
> + slots->generation = old_memslots->generation + 1;
> + update_memslots(slots, new);
> rcu_assign_pointer(kvm->memslots, slots);
> synchronize_srcu_expedited(&kvm->srcu);
> + slots->generation++;
>
> Then case 1 and 2 will just have a cache miss.

So, case 2 is what you concerned? :) I still think it is ok but i do not have
strong opinion on that. How about simplify it like this:

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9314678..9fabf6a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -234,16 +234,22 @@ static unsigned int get_mmio_spte_generation(u64 spte)
return gen;
}

-static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+static unsigned int __kvm_current_mmio_generation(struct kvm_memslots *slots)
{
+
/*
* Init kvm generation close to MMIO_MAX_GEN to easily test the
* code of handling generation number wrap-around.
*/
- return (kvm_memslots(kvm)->generation +
+ return (slots->generation +
MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
}

+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+ return __kvm_current_mmio_generation(kvm_memslots(kvm));
+}
+
static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
unsigned access)
{
@@ -287,9 +293,15 @@ static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,

static bool check_mmio_spte(struct kvm *kvm, u64 spte)
{
+ struct kvm_memslots *slots = kvm_memslots(kvm);
unsigned int kvm_gen, spte_gen;

- kvm_gen = kvm_current_mmio_generation(kvm);
+ if (slots->updated)
+ return false;
+
+ smp_rmb();
+
+ kvm_gen = __kvm_current_mmio_generation(slots);
spte_gen = get_mmio_spte_generation(spte);

trace_check_mmio_spte(spte, kvm_gen, spte_gen);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4b6c01b..1d4e78f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -96,7 +96,7 @@ static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
static void update_memslots(struct kvm_memslots *slots,
- struct kvm_memory_slot *new, u64 last_generation);
+ struct kvm_memory_slot *new);

static void kvm_release_pfn_dirty(pfn_t pfn);
static void mark_page_dirty_in_slot(struct kvm *kvm,
@@ -685,8 +685,7 @@ static void sort_memslots(struct kvm_memslots *slots)
}

static void update_memslots(struct kvm_memslots *slots,
- struct kvm_memory_slot *new,
- u64 last_generation)
+ struct kvm_memory_slot *new)
{
if (new) {
int id = new->id;
@@ -697,8 +696,6 @@ static void update_memslots(struct kvm_memslots *slots,
if (new->npages != npages)
sort_memslots(slots);
}
-
- slots->generation = last_generation + 1;
}

static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
@@ -720,10 +717,17 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
{
struct kvm_memslots *old_memslots = kvm->memslots;

- update_memslots(slots, new, kvm->memslots->generation);
+ /* ensure generation number is always increased. */
+ slots->updated = true;
+ slots->generation = old_memslots->generation;
+ update_memslots(slots, new);
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);

+ slots->generation++;
+ smp_wmb();
+ slots->updated = false;
+
kvm_arch_memslots_updated(kvm);

return old_memslots;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/