Re: [PATCH] KVM: x86/xen: Fix sleeping lock in hard IRQ context on PREEMPT_RT

From: Sean Christopherson

Date: Wed Apr 01 2026 - 11:47:45 EST

On Mon, Mar 30, 2026, David Woodhouse wrote:
> On Mon, 2026-03-30 at 10:18 -0400, Steven Rostedt wrote:
> >
> > > +static void xen_timer_inject_irqwork(struct irq_work *work)
> > > +{
> > > +     struct kvm_vcpu_xen *xen = container_of(work, struct kvm_vcpu_xen,
> > > +                                             timer_inject_irqwork);
> > > +     struct kvm_vcpu *vcpu = container_of(xen, struct kvm_vcpu, arch.xen);
> > > +     struct kvm_xen_evtchn e;
> > > +     int rc;
> > > +
> > > +     e.vcpu_id = vcpu->vcpu_id;
> > > +     e.vcpu_idx = vcpu->vcpu_idx;
> > > +     e.port = vcpu->arch.xen.timer_virq;
> > > +     e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
> > > +
> > > +     rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
> > > +     if (rc != -EWOULDBLOCK)
> > > +             vcpu->arch.xen.timer_expires = 0;
> > > +}
> >
> > Why duplicate this code and not simply make a static inline helper
> > function that is used in both places?
>
> It's already duplicating the functionality; the original
> xen_timer_callback() will already fall back to injecting the IRQ in
> process context when it needs to (by setting
> vcpu->arch.xen.timer_pending and then setting KVM_REQ_UNBLOCK).
>
> All you had to do was make kvm_xen_set_evtchn_fast() return
> -EWOULDBLOCK in the in_hardirq() case in order to use the existing
> fallback, surely?
>
> Better still, can't kvm_xen_set_evtchn_fast() just use read_trylock()
> instead?

Re-reading through the thread[*] where you proposed using trylock, and through
commit bbe17c625d68 ("KVM: x86/xen: Fix potential deadlock in kvm_xen_update_runstate_guest()"),
I think I agree with using trylock for "fast" paths.

Though I would prefer to make it unconditional for the "fast" helper instead
of conditional based on in_interrupt(). And before we start doing surgery to
"fix" a setup no one uses, and also before we use gpcs more broadly, I think we
should try to up-level the gpc APIs to reduce the amount of duplicate, boilerplate
code. kvm_xen_update_runstate_guest() and maybe kvm_xen_set_evtchn() will likely
need to open code some amount of logic, but the other users should be able to
use common acquire/release helpers.

Side topic, looks like kvm_xen_shared_info_init() is buggy in that it fails to
mark the slot as dirty.

E.g. sans the API implementations, I think we can and should end up with code
like this:

---
arch/x86/kvm/x86.c | 14 ++---
arch/x86/kvm/xen.c | 127 ++++++++++++---------------------------------
2 files changed, 37 insertions(+), 104 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b5d48e75b65..65bad25fd9d4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3274,15 +3274,8 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,

memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));

- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
- return;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ if (kvm_gpc_acquire(gpc))
+ return;

guest_hv_clock = (void *)(gpc->khva + offset);

@@ -3305,8 +3298,7 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,

guest_hv_clock->version = ++hv_clock.version;

- kvm_gpc_mark_dirty_in_slot(gpc);
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_dirty(gpc);

trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 91fd3673c09a..a97fd88ee99c 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -42,19 +42,12 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
u32 *wc_sec_hi;
u32 wc_version;
u64 wall_nsec;
- int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
+ int ret;

- read_lock_irq(&gpc->lock);
- while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
- read_unlock_irq(&gpc->lock);
-
- ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
- if (ret)
- goto out;
-
- read_lock_irq(&gpc->lock);
- }
+ ret = kvm_gpc_acquire(gpc);
+ if (ret)
+ goto out;

/*
* This code mirrors kvm_write_wall_clock() except that it writes
@@ -96,7 +89,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
smp_wmb();

wc->version = wc_version + 1;
- read_unlock_irq(&gpc->lock);
+ kvm_gpc_release_dirty(gpc);

kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);

@@ -155,22 +148,14 @@ static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu,
struct gfn_to_pfn_cache *gpc,
unsigned int offset)
{
- unsigned long flags;
int r;

- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock));
- if (r)
- return r;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ r = kvm_gpc_acquire(gpc);
+ if (r)
+ return r;

memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock));
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_clean(gpc);

/*
* Sanity check TSC shift+multiplier to verify the guest's view of time
@@ -420,27 +405,8 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* Attempt to obtain the GPC lock on *both* (if there are two)
* gfn_to_pfn caches that cover the region.
*/
- if (atomic) {
- local_irq_save(flags);
- if (!read_trylock(&gpc1->lock)) {
- local_irq_restore(flags);
- return;
- }
- } else {
- read_lock_irqsave(&gpc1->lock, flags);
- }
- while (!kvm_gpc_check(gpc1, user_len1)) {
- read_unlock_irqrestore(&gpc1->lock, flags);
-
- /* When invoked from kvm_sched_out() we cannot sleep */
- if (atomic)
- return;
-
- if (kvm_gpc_refresh(gpc1, user_len1))
- return;
-
- read_lock_irqsave(&gpc1->lock, flags);
- }
+ if (__kvm_gpc_acquire(gpc, atomic))
+ return;

if (likely(!user_len2)) {
/*
@@ -465,6 +431,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* gpc1 lock to make lockdep shut up about it.
*/
lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+
if (atomic) {
if (!read_trylock(&gpc2->lock)) {
read_unlock_irqrestore(&gpc1->lock, flags);
@@ -575,13 +542,10 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
smp_wmb();
}

- if (user_len2) {
- kvm_gpc_mark_dirty_in_slot(gpc2);
- read_unlock(&gpc2->lock);
- }
+ if (user_len2)
+ kvm_gpc_release_dirty(gpc2);

- kvm_gpc_mark_dirty_in_slot(gpc1);
- read_unlock_irqrestore(&gpc1->lock, flags);
+ kvm_gpc_release_dirty(gpc1);
}

void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
@@ -645,20 +609,8 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
if (!evtchn_pending_sel)
return;

- /*
- * Yes, this is an open-coded loop. But that's just what put_user()
- * does anyway. Page it in and retry the instruction. We're just a
- * little more honest about it.
- */
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
- return;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ if (kvm_gpc_acquire(gpc))
+ return;

/* Now gpc->khva is a valid kernel address for the vcpu_info */
if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
@@ -686,8 +638,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
}

- kvm_gpc_mark_dirty_in_slot(gpc);
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_dirty(gpc);

/* For the per-vCPU lapic vector, deliver it as MSI. */
if (v->arch.xen.upcall_vector)
@@ -697,8 +648,8 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
- unsigned long flags;
u8 rc = 0;
+ int r;

/*
* If the global upcall vector (HVMIRQ_callback_vector) is set and
@@ -713,33 +664,23 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
BUILD_BUG_ON(sizeof(rc) !=
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));

- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- /*
- * This function gets called from kvm_vcpu_block() after setting the
- * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
- * from a HLT. So we really mustn't sleep. If the page ended up absent
- * at that point, just return 1 in order to trigger an immediate wake,
- * and we'll end up getting called again from a context where we *can*
- * fault in the page and wait for it.
- */
- if (in_atomic() || !task_is_running(current))
- return 1;
-
- if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
- /*
- * If this failed, userspace has screwed up the
- * vcpu_info mapping. No interrupts for you.
- */
- return 0;
- }
- read_lock_irqsave(&gpc->lock, flags);
- }
+ /*
+ * This function gets called from kvm_vcpu_block() after setting the
+ * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+ * from a HLT. So we really mustn't sleep. If the page ended up absent
+ * at that point, just return 1 in order to trigger an immediate wake,
+ * and we'll end up getting called again from a context where we *can*
+ * fault in the page and wait for it.
+ *
+ * If acquiring the cache fails completely, then userspace has screwed
+ * up the vcpu_info mapping. No interrupts for you.
+ */
+ r = __kvm_gpc_acquire(gpc, in_atomic() || !task_is_running(current));
+ if (r)
+ return r == -EWOULDBLOCK ? 1 : 0;

rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_clean(gpc);
return rc;
}

base-commit: 3d6cdcc8883b5726513d245eef0e91cabfc397f7
--

[*] https://lore.kernel.org/all/76c61e1cb86e04df892d74c10976597700fe4cb5.camel@xxxxxxxxxxxxx