Re: [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support

From: Ackerley Tng

Date: Mon Jun 22 2026 - 20:00:02 EST


Tarun Sahu <tarunsahu@xxxxxxxxxx> writes:

> Introduce core infrastructure to support VM preservation with LUO.
>
> First two changes are just refactoring, no functional change, third
> change introduces a new member in struct kvm.
> - Move ITOA_MAX_LEN to kvm_mm.h for reuse by upcoming kvm_luo code.
> - Add a public kvm_create_vm_file() helper wrapping kvm_create_vm()
> and anon_inode_getfile() to provide a unified VM file creation API.
> - Track a weak reference to the backing file in struct kvm under
> CONFIG_LIVEUPDATE_GUEST_MEMFD to enable reverse file resolution
> without circular lifetime dependencies.
>

Given the above, I think this should be separate patches.

> Signed-off-by: Tarun Sahu <tarunsahu@xxxxxxxxxx>
> ---
> include/linux/kvm_host.h | 14 +++++++
> virt/kvm/kvm_main.c | 79 +++++++++++++++++++++++++++++-----------
> virt/kvm/kvm_mm.h | 3 ++
> 3 files changed, 75 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4c14aee1fb06..9111a28637af 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -874,6 +874,18 @@ struct kvm {
> #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> /* Protected by slots_lock (for writes) and RCU (for reads) */
> struct xarray mem_attr_array;
> +#endif
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> + /*
> + * Weak reference to the VFS file backing this KVM instance. Stored
> + * without incrementing the file refcount to prevent a circular lifetime
> + * dependency (since file->private_data already pins this struct kvm).
> + * Used exclusively to resolve the file pointer back from struct kvm.
> + *
> + * Written/cleared via rcu_assign_pointer() and read locklessly under
> + * RCU (e.g. via get_file_active() to prevent ABA races).
> + */
> + struct file *vm_file;
> #endif

We didn't really talk about this during the calls, but it seems weird to
preserve a vm_file with pretty much nothing other than the vm type. The
entire VM is re-created, which means it could potentially be a
completely different VM?

In some sense it's more flexible since the guest_memfd can be restored
with some completely different VM, but it seems like it could introduce
other issues.

I think other KVM folks would probably have more thoughts here.

> char stats_id[KVM_STATS_NAME_SIZE];
> };
> @@ -1074,7 +1086,9 @@ void kvm_get_kvm(struct kvm *kvm);
> bool kvm_get_kvm_safe(struct kvm *kvm);
> void kvm_put_kvm(struct kvm *kvm);
> bool file_is_kvm(struct file *file);
> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname);
> void kvm_put_kvm_no_destroy(struct kvm *kvm);
> +void kvm_uevent_notify_vm_create(struct kvm *kvm);
>
> static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
> {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 89489996fbc1..65f0c5fb353e 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -67,9 +67,6 @@
> #include <linux/kvm_dirty_ring.h>
>
>
> -/* Worst case buffer size needed for holding an integer. */
> -#define ITOA_MAX_LEN 12
> -
> MODULE_AUTHOR("Qumranet");
> MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
> MODULE_LICENSE("GPL");
> @@ -1349,6 +1346,19 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
> {
> struct kvm *kvm = filp->private_data;
>
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> + /*
> + * Clear the weak reference of the vm file.
> + * In case vm file is closed by userspace, but kvm still has
> + * other users like vCPUs, clearing this pointer ensures
> + * that we don't have a dangling pointer to a closed file.
> + *
> + * Cleared via rcu_assign_pointer() to ensure proper memory visibility
> + * for concurrent lockless readers under RCU.
> + */
> + rcu_assign_pointer(kvm->vm_file, NULL);
> +#endif
> +
> kvm_irqfd_release(kvm);
>
> kvm_put_kvm(kvm);
> @@ -5476,11 +5486,47 @@ bool file_is_kvm(struct file *file)
> }
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
>
> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname)
> +{
> + struct kvm *kvm = kvm_create_vm(type, fdname);
> + struct file *file;
> +
> + if (IS_ERR(kvm))
> + return ERR_CAST(kvm);
> +
> + file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> + if (IS_ERR(file)) {
> + kvm_put_kvm(kvm);
> + return file;
> + }
> +
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> + /*
> + * Weak reference to the file (without get_file()) to prevent a circular
> + * dependency. Safe because the file's release path clears this pointer
> + * and drops its reference to the VM.
> + *
> + * Written via rcu_assign_pointer() because the pointer can be read
> + * locklessly under RCU (e.g., in kvm_gmem_luo_preserve() via
> + * get_file_active() to prevent lockless ABA races).
> + */
> + rcu_assign_pointer(kvm->vm_file, file);
> +#endif
> +
> + /*
> + * Don't call kvm_put_kvm anymore at this point; file->f_op is
> + * already set, with ->release() being kvm_vm_release(). In error
> + * cases it will be called by the final fput(file) and will take
> + * care of doing kvm_put_kvm(kvm).
> + */
> +
> + return file;
> +}
> +
> static int kvm_dev_ioctl_create_vm(unsigned long type)
> {
> char fdname[ITOA_MAX_LEN + 1];
> int r, fd;
> - struct kvm *kvm;
> struct file *file;
>
> fd = get_unused_fd_flags(O_CLOEXEC);
> @@ -5489,31 +5535,17 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
>
> snprintf(fdname, sizeof(fdname), "%d", fd);
>
> - kvm = kvm_create_vm(type, fdname);
> - if (IS_ERR(kvm)) {
> - r = PTR_ERR(kvm);
> - goto put_fd;
> - }
> -
> - file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> + file = kvm_create_vm_file(type, fdname);
> if (IS_ERR(file)) {
> r = PTR_ERR(file);
> - goto put_kvm;
> + goto put_fd;
> }
>
> - /*
> - * Don't call kvm_put_kvm anymore at this point; file->f_op is
> - * already set, with ->release() being kvm_vm_release(). In error
> - * cases it will be called by the final fput(file) and will take
> - * care of doing kvm_put_kvm(kvm).
> - */
> - kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
> + kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, file->private_data);

Notifying with file->private_data threw me off... I would rather inline
the rcu_assign_pointer() in this function and have this line read
notify(..., kvm) like before.

>
> fd_install(fd, file);
> return fd;
>
> -put_kvm:
> - kvm_put_kvm(kvm);
> put_fd:
> put_unused_fd(fd);
> return r;
> @@ -6341,6 +6373,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
> kfree(env);
> }
>
> +void kvm_uevent_notify_vm_create(struct kvm *kvm)
> +{
> + kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
> +}
> +
> static void kvm_init_debug(void)
> {
> const struct file_operations *fops;
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index 9fcc5d5b7f8d..7aa1d65c3d46 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -3,6 +3,9 @@
> #ifndef __KVM_MM_H__
> #define __KVM_MM_H__ 1
>
> +/* Worst case buffer size needed for holding an integer as a string. */
> +#define ITOA_MAX_LEN 12
> +
> /*
> * Architectures can choose whether to use an rwlock or spinlock
> * for the mmu_lock. These macros, for use in common code
> --
> 2.54.0.1032.g2f8565e1d1-goog