Re: [PATCH v3 3/7] KVM: Add basic bitmap support into kvm_mmu_notifier_test/clear_young

From: David Matlack
Date: Fri Apr 12 2024 - 16:28:55 EST


On 2024-04-01 11:29 PM, James Houghton wrote:
> Add kvm_arch_prepare_bitmap_age() for architectures to indicate that
> they support bitmap-based aging in kvm_mmu_notifier_test_clear_young()
> and that they do not need KVM to grab the MMU lock for writing. This
> function allows architectures to do other locking or other preparatory
> work that it needs.

There's a lot going on here. I know it's extra work but I think the
series would be easier to understand and simplify if you introduced the
KVM support for lockless test/clear_young() first, and then introduce
support for the bitmap-based look-around.

Specifically:

1. Make all test/clear_young() notifiers lockless. i.e. Move the
mmu_lock into the architecture-specific code (kvm_age_gfn() and
kvm_test_age_gfn()).

2. Convert KVM/x86's kvm_{test_,}age_gfn() to be lockless for the TDP
MMU.

3. Convert KVM/arm64's kvm_{test_,}age_gfn() to hold the mmu_lock in
read-mode.

4. Add bitmap-based look-around support to KVM/x86 and KVM/arm64
(probably 2-3 patches).

>
> If an architecture does not implement kvm_arch_prepare_bitmap_age() or
> is unable to do bitmap-based aging at runtime (and marks the bitmap as
> unreliable):
> 1. If a bitmap was provided, we inform the caller that the bitmap is
> unreliable (MMU_NOTIFIER_YOUNG_BITMAP_UNRELIABLE).
> 2. If a bitmap was not provided, fall back to the old logic.
>
> Also add logic for architectures to easily use the provided bitmap if
> they are able. The expectation is that the architecture's implementation
> of kvm_gfn_test_age() will use kvm_gfn_record_young(), and
> kvm_gfn_age() will use kvm_gfn_should_age().
>
> Suggested-by: Yu Zhao <yuzhao@xxxxxxxxxx>
> Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
> ---
> include/linux/kvm_host.h | 60 ++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 92 +++++++++++++++++++++++++++++-----------
> 2 files changed, 127 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1800d03a06a9..5862fd7b5f9b 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1992,6 +1992,26 @@ extern const struct _kvm_stats_desc kvm_vm_stats_desc[];
> extern const struct kvm_stats_header kvm_vcpu_stats_header;
> extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];
>
> +/*
> + * Architectures that support using bitmaps for kvm_age_gfn() and
> + * kvm_test_age_gfn should return true for kvm_arch_prepare_bitmap_age()
> + * and do any work they need to prepare. The subsequent walk will not
> + * automatically grab the KVM MMU lock, so some architectures may opt
> + * to grab it.
> + *
> + * If true is returned, a subsequent call to kvm_arch_finish_bitmap_age() is
> + * guaranteed.
> + */
> +#ifndef kvm_arch_prepare_bitmap_age
> +static inline bool kvm_arch_prepare_bitmap_age(struct mmu_notifier *mn)

I find the names of these architecture callbacks misleading/confusing.
The lockless path is used even when a bitmap is not provided. i.e.
bitmap can be NULL in between kvm_arch_prepare/finish_bitmap_age().

> +{
> + return false;
> +}
> +#endif
> +#ifndef kvm_arch_finish_bitmap_age
> +static inline void kvm_arch_finish_bitmap_age(struct mmu_notifier *mn) {}
> +#endif

kvm_arch_finish_bitmap_age() seems unnecessary. I think the KVM/arm64
code could acquire/release the mmu_lock in read-mode in
kvm_test_age_gfn() and kvm_age_gfn(), right?

> +
> #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
> static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> {
> @@ -2076,9 +2096,16 @@ static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm,
> return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq;
> }
>
> +struct test_clear_young_metadata {
> + unsigned long *bitmap;
> + unsigned long bitmap_offset_end;

bitmap_offset_end is unused.

> + unsigned long end;
> + bool unreliable;
> +};
> union kvm_mmu_notifier_arg {
> pte_t pte;
> unsigned long attributes;
> + struct test_clear_young_metadata *metadata;

nit: Maybe s/metadata/test_clear_young/ ?

> };
>
> struct kvm_gfn_range {
> @@ -2087,11 +2114,44 @@ struct kvm_gfn_range {
> gfn_t end;
> union kvm_mmu_notifier_arg arg;
> bool may_block;
> + bool lockless;

Please document this as it's somewhat subtle. A reader might think this
implies the entire operation runs without taking the mmu_lock.

> };
> bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> +
> +static inline void kvm_age_set_unreliable(struct kvm_gfn_range *range)
> +{
> + struct test_clear_young_metadata *args = range->arg.metadata;
> +
> + args->unreliable = true;
> +}
> +static inline unsigned long kvm_young_bitmap_offset(struct kvm_gfn_range *range,
> + gfn_t gfn)
> +{
> + struct test_clear_young_metadata *args = range->arg.metadata;
> +
> + return hva_to_gfn_memslot(args->end - 1, range->slot) - gfn;
> +}
> +static inline void kvm_gfn_record_young(struct kvm_gfn_range *range, gfn_t gfn)
> +{
> + struct test_clear_young_metadata *args = range->arg.metadata;
> +
> + WARN_ON_ONCE(gfn < range->start || gfn >= range->end);
> + if (args->bitmap)
> + __set_bit(kvm_young_bitmap_offset(range, gfn), args->bitmap);
> +}
> +static inline bool kvm_gfn_should_age(struct kvm_gfn_range *range, gfn_t gfn)
> +{
> + struct test_clear_young_metadata *args = range->arg.metadata;
> +
> + WARN_ON_ONCE(gfn < range->start || gfn >= range->end);
> + if (args->bitmap)
> + return test_bit(kvm_young_bitmap_offset(range, gfn),
> + args->bitmap);
> + return true;
> +}
> #endif
>
> #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d0545d88c802..7d80321e2ece 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -550,6 +550,7 @@ struct kvm_mmu_notifier_range {
> on_lock_fn_t on_lock;
> bool flush_on_ret;
> bool may_block;
> + bool lockless;
> };
>
> /*
> @@ -598,6 +599,8 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
> struct kvm_memslots *slots;
> int i, idx;
>
> + BUILD_BUG_ON(sizeof(gfn_range.arg) != sizeof(gfn_range.arg.pte));
> +
> if (WARN_ON_ONCE(range->end <= range->start))
> return r;
>
> @@ -637,15 +640,18 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
> gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
> gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
> gfn_range.slot = slot;
> + gfn_range.lockless = range->lockless;
>
> if (!r.found_memslot) {
> r.found_memslot = true;
> - KVM_MMU_LOCK(kvm);
> - if (!IS_KVM_NULL_FN(range->on_lock))
> - range->on_lock(kvm);
> -
> - if (IS_KVM_NULL_FN(range->handler))
> - break;
> + if (!range->lockless) {
> + KVM_MMU_LOCK(kvm);
> + if (!IS_KVM_NULL_FN(range->on_lock))
> + range->on_lock(kvm);
> +
> + if (IS_KVM_NULL_FN(range->handler))
> + break;
> + }
> }
> r.ret |= range->handler(kvm, &gfn_range);
> }
> @@ -654,7 +660,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
> if (range->flush_on_ret && r.ret)
> kvm_flush_remote_tlbs(kvm);
>
> - if (r.found_memslot)
> + if (r.found_memslot && !range->lockless)
> KVM_MMU_UNLOCK(kvm);
>
> srcu_read_unlock(&kvm->srcu, idx);
> @@ -682,19 +688,24 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
> return __kvm_handle_hva_range(kvm, &range).ret;
> }
>
> -static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
> - unsigned long start,
> - unsigned long end,
> - gfn_handler_t handler)
> +static __always_inline int kvm_handle_hva_range_no_flush(
> + struct mmu_notifier *mn,
> + unsigned long start,
> + unsigned long end,
> + gfn_handler_t handler,
> + union kvm_mmu_notifier_arg arg,
> + bool lockless)
> {
> struct kvm *kvm = mmu_notifier_to_kvm(mn);
> const struct kvm_mmu_notifier_range range = {
> .start = start,
> .end = end,
> .handler = handler,
> + .arg = arg,
> .on_lock = (void *)kvm_null_fn,
> .flush_on_ret = false,
> .may_block = false,
> + .lockless = lockless,
> };
>
> return __kvm_handle_hva_range(kvm, &range).ret;
> @@ -909,15 +920,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
> kvm_age_gfn);
> }
>
> -static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
> - struct mm_struct *mm,
> - unsigned long start,
> - unsigned long end,
> - unsigned long *bitmap)
> +static int kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn,
> + struct mm_struct *mm,
> + unsigned long start,
> + unsigned long end,
> + unsigned long *bitmap,
> + bool clear)

Perhaps pass in the callback (kvm_test_age_gfn/kvm_age_gfn) instead of
true/false to avoid the naked booleans at the callsites?

> {
> - trace_kvm_age_hva(start, end);
> + if (kvm_arch_prepare_bitmap_age(mn)) {
> + struct test_clear_young_metadata args = {
> + .bitmap = bitmap,
> + .end = end,
> + .unreliable = false,
> + };
> + union kvm_mmu_notifier_arg arg = {
> + .metadata = &args
> + };
> + bool young;
> +
> + young = kvm_handle_hva_range_no_flush(
> + mn, start, end,
> + clear ? kvm_age_gfn : kvm_test_age_gfn,
> + arg, true);

I suspect the end result will be cleaner if we make all architectures
lockless. i.e. Move the mmu_lock acquire/release into the
architecture-specific code.

This could result in more acquire/release calls (one per memslot that
overlaps the provided range) but that should be a single memslot in the
majority of cases I think?

Then unconditionally pass in the metadata structure.

Then you don't need any special casing for the fast path / bitmap path.
The only thing needed is to figure out whether to return
MMU_NOTIFIER_YOUNG vs MMU_NOTIFIER_YOUNG_LOOK_AROUND and that can be
plumbed via test_clear_young_metadata or by changing gfn_handler_t to
return an int instead of a bool.

> +
> + kvm_arch_finish_bitmap_age(mn);
>
> - /* We don't support bitmaps. Don't test or clear anything. */
> + if (!args.unreliable)
> + return young ? MMU_NOTIFIER_YOUNG_FAST : 0;
> + }
> +
> + /* A bitmap was passed but the architecture doesn't support bitmaps */
> if (bitmap)
> return MMU_NOTIFIER_YOUNG_BITMAP_UNRELIABLE;
>
> @@ -934,7 +966,21 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
> * cadence. If we find this inaccurate, we might come up with a
> * more sophisticated heuristic later.
> */
> - return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
> + return kvm_handle_hva_range_no_flush(
> + mn, start, end, clear ? kvm_age_gfn : kvm_test_age_gfn,
> + KVM_MMU_NOTIFIER_NO_ARG, false);

Should this return MMU_NOTIFIER_YOUNG explicitly? This code is assuming
MMU_NOTIFIER_YOUNG == (int)true.

> +}
> +
> +static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
> + struct mm_struct *mm,
> + unsigned long start,
> + unsigned long end,
> + unsigned long *bitmap)
> +{
> + trace_kvm_age_hva(start, end);
> +
> + return kvm_mmu_notifier_test_clear_young(mn, mm, start, end, bitmap,
> + true);
> }
>
> static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
> @@ -945,12 +991,8 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
> {
> trace_kvm_test_age_hva(start, end);
>
> - /* We don't support bitmaps. Don't test or clear anything. */
> - if (bitmap)
> - return MMU_NOTIFIER_YOUNG_BITMAP_UNRELIABLE;
> -
> - return kvm_handle_hva_range_no_flush(mn, start, end,
> - kvm_test_age_gfn);
> + return kvm_mmu_notifier_test_clear_young(mn, mm, start, end, bitmap,
> + false);
> }
>
> static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> --
> 2.44.0.478.gd926399ef9-goog
>