Re: [PATCH v2 1/1] virt: acrn: Fix irqfd use-after-free during eventfd shutdown

From: Fei Li

Date: Tue May 19 2026 - 21:21:18 EST


On 2026-05-19 at 19:20:18 +0800, Sicong Huang wrote:
> acrn_irqfd_deassign() and the eventfd EPOLLHUP wakeup can race and free
> the same struct hsm_irqfd:
>
> CPU0 CPU1
> ---- ----
> eventfd_release()
> wake_up_poll(EPOLLHUP)
> hsm_irqfd_wakeup()
> queue_work(&irqfd->shutdown)
> acrn_irqfd_deassign()
> hsm_irqfd_shutdown()
> list_del_init()
> eventfd_ctx_remove_wait_queue()
> eventfd_ctx_put()
> kfree(irqfd)
> hsm_irqfd_shutdown_work()
> container_of(work, ..., shutdown)
> irqfd->vm <-- use-after-free
>
> The deassign path freed the irqfd while a shutdown work item was
> already queued by EPOLLHUP (or vice versa), so the work item could
> resurrect a dangling pointer through container_of().
>
> Switch to the lifetime model used by KVM irqfds:
>
> - Deassign/deinit only deactivate the irqfd: remove it from vm->irqfds
> under irqfds_lock and queue the cleanup work.
> - hsm_irqfd_shutdown_work() becomes the sole owner that unhooks the
> eventfd waitqueue entry, drops the eventfd reference and frees the
> irqfd.
> - A new HSM_IRQFD_FLAG_SHUTDOWN bit guarded by test_and_set_bit()
> ensures the cleanup work is queued at most once, no matter how many
> of {EPOLLHUP, deassign, deinit} fire concurrently. This is safe to
> call from the waitqueue callback, which runs with wqh->lock held and
> IRQs disabled and therefore cannot take irqfds_lock.
> - acrn_irqfd_deassign() flushes vm->irqfd_wq before returning so the
> eventfd is fully detached on return. acrn_irqfd_deinit() deactivates
> every irqfd, flushes the workqueue and only then destroys it, so no
> path can queue_work() onto a torn-down workqueue.
> - acrn_irqfd_assign() now installs the eventfd waitqueue entry and
> publishes the irqfd to vm->irqfds under irqfds_lock, so the irqfd is
> never visible to deassign/deinit before its waitqueue entry is in
> place, and any EPOLLHUP that fires in the assign window queues
> cleanup work that blocks on irqfds_lock until publication is done.
>
> Signed-off-by: Sicong Huang <congei42@xxxxxxx>

Hi Sicong,

Thanks, v2 looks much better to me.

Reviewed-by: Fei Li <fei1.li@xxxxxxxxx>

> ---
> drivers/virt/acrn/irqfd.c | 71 ++++++++++++++++++++++++---------------
> 1 file changed, 44 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c
> index acf8cd5f8f8c..feeba7eda494 100644
> --- a/drivers/virt/acrn/irqfd.c
> +++ b/drivers/virt/acrn/irqfd.c
> @@ -16,6 +16,9 @@
>
> #include "acrn_drv.h"
>
> +/* Cleanup work has been queued; set via test_and_set_bit(). */
> +#define HSM_IRQFD_FLAG_SHUTDOWN 0
> +
> /**
> * struct hsm_irqfd - Properties of HSM irqfd
> * @vm: Associated VM pointer
> @@ -25,6 +28,7 @@
> * @list: Entry within &acrn_vm.irqfds of irqfds of a VM
> * @pt: Structure for select/poll on the associated eventfd
> * @msi: MSI data
> + * @flags: Internal lifecycle flags (HSM_IRQFD_FLAG_*)
> */
> struct hsm_irqfd {
> struct acrn_vm *vm;
> @@ -34,6 +38,7 @@ struct hsm_irqfd {
> struct list_head list;
> poll_table pt;
> struct acrn_msi_entry msi;
> + unsigned long flags;
> };
>
> static void acrn_irqfd_inject(struct hsm_irqfd *irqfd)
> @@ -44,30 +49,29 @@ static void acrn_irqfd_inject(struct hsm_irqfd *irqfd)
> irqfd->msi.msi_data);
> }
>
> -static void hsm_irqfd_shutdown(struct hsm_irqfd *irqfd)
> +/* Queue the cleanup work at most once. Safe from atomic context. */
> +static void hsm_irqfd_queue_shutdown(struct hsm_irqfd *irqfd)
> {
> - u64 cnt;
> -
> - lockdep_assert_held(&irqfd->vm->irqfds_lock);
> -
> - /* remove from wait queue */
> - list_del_init(&irqfd->list);
> - eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
> - eventfd_ctx_put(irqfd->eventfd);
> - kfree(irqfd);
> + if (!test_and_set_bit(HSM_IRQFD_FLAG_SHUTDOWN, &irqfd->flags))
> + queue_work(irqfd->vm->irqfd_wq, &irqfd->shutdown);
> }
>
> +/* Sole owner of @irqfd: unhook waitqueue, drop eventfd ref, free. */
> static void hsm_irqfd_shutdown_work(struct work_struct *work)
> {
> - struct hsm_irqfd *irqfd;
> - struct acrn_vm *vm;
> + struct hsm_irqfd *irqfd = container_of(work, struct hsm_irqfd,
> + shutdown);
> + struct acrn_vm *vm = irqfd->vm;
> + u64 cnt;
>
> - irqfd = container_of(work, struct hsm_irqfd, shutdown);
> - vm = irqfd->vm;
> mutex_lock(&vm->irqfds_lock);
> if (!list_empty(&irqfd->list))
> - hsm_irqfd_shutdown(irqfd);
> + list_del_init(&irqfd->list);
> mutex_unlock(&vm->irqfds_lock);
> +
> + eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
> + eventfd_ctx_put(irqfd->eventfd);
> + kfree(irqfd);
> }
>
> /* Called with wqh->lock held and interrupts disabled */
> @@ -76,17 +80,16 @@ static int hsm_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
> {
> unsigned long poll_bits = (unsigned long)key;
> struct hsm_irqfd *irqfd;
> - struct acrn_vm *vm;
>
> irqfd = container_of(wait, struct hsm_irqfd, wait);
> - vm = irqfd->vm;
> +
> if (poll_bits & POLLIN)
> /* An event has been signaled, inject an interrupt */
> acrn_irqfd_inject(irqfd);
>
> if (poll_bits & POLLHUP)
> - /* Do shutdown work in thread to hold wqh->lock */
> - queue_work(vm->irqfd_wq, &irqfd->shutdown);
> + /* Defer teardown to the cleanup work; can't sleep here. */
> + hsm_irqfd_queue_shutdown(irqfd);
>
> return 0;
> }
> @@ -142,6 +145,12 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
> init_waitqueue_func_entry(&irqfd->wait, hsm_irqfd_wakeup);
> init_poll_funcptr(&irqfd->pt, hsm_irqfd_poll_func);
>
> + /*
> + * Hold irqfds_lock across waitqueue install and list_add so the
> + * irqfd is not visible to deassign/deinit before its waitqueue
> + * entry is in place, and any racing EPOLLHUP cleanup work blocks
> + * on irqfds_lock until publication completes.
> + */
> mutex_lock(&vm->irqfds_lock);
> list_for_each_entry(tmp, &vm->irqfds, list) {
> if (irqfd->eventfd != tmp->eventfd)
> @@ -150,14 +159,12 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
> mutex_unlock(&vm->irqfds_lock);
> goto fail;
> }
> - list_add_tail(&irqfd->list, &vm->irqfds);
> - mutex_unlock(&vm->irqfds_lock);
>
> - /* Check the pending event in this stage */
> events = vfs_poll(fd_file(f), &irqfd->pt);
> -
> + list_add_tail(&irqfd->list, &vm->irqfds);
> if (events & EPOLLIN)
> acrn_irqfd_inject(irqfd);
> + mutex_unlock(&vm->irqfds_lock);
>
> return 0;
> fail:
> @@ -180,13 +187,17 @@ static int acrn_irqfd_deassign(struct acrn_vm *vm,
> mutex_lock(&vm->irqfds_lock);
> list_for_each_entry_safe(irqfd, tmp, &vm->irqfds, list) {
> if (irqfd->eventfd == eventfd) {
> - hsm_irqfd_shutdown(irqfd);
> + list_del_init(&irqfd->list);
> + hsm_irqfd_queue_shutdown(irqfd);
> break;
> }
> }
> mutex_unlock(&vm->irqfds_lock);
> eventfd_ctx_put(eventfd);
>
> + /* Wait for cleanup work to finish so the eventfd is fully detached. */
> + flush_workqueue(vm->irqfd_wq);
> +
> return 0;
> }
>
> @@ -219,9 +230,15 @@ void acrn_irqfd_deinit(struct acrn_vm *vm)
> struct hsm_irqfd *irqfd, *next;
>
> dev_dbg(acrn_dev.this_device, "VM %u irqfd deinit.\n", vm->vmid);
> - destroy_workqueue(vm->irqfd_wq);
> +
> mutex_lock(&vm->irqfds_lock);
> - list_for_each_entry_safe(irqfd, next, &vm->irqfds, list)
> - hsm_irqfd_shutdown(irqfd);
> + list_for_each_entry_safe(irqfd, next, &vm->irqfds, list) {
> + list_del_init(&irqfd->list);
> + hsm_irqfd_queue_shutdown(irqfd);
> + }
> mutex_unlock(&vm->irqfds_lock);
> +
> + /* Drain all cleanup work before tearing the workqueue down. */
> + flush_workqueue(vm->irqfd_wq);
> + destroy_workqueue(vm->irqfd_wq);
> }
> --
> 2.34.1
>