[RFC PATCH 1/2] sched/wait: Add add_wait_queue_priority()

From: David Woodhouse
Date: Mon Oct 26 2020 - 13:53:43 EST


From: David Woodhouse <dwmw@xxxxxxxxxxxx>

This allows an exclusive wait_queue_entry to be added at the head of the
queue, instead of the tail as normal. Thus, it gets to consume events
first.

The problem I'm trying to solve here is interrupt remapping invalidation
vs. MSI interrupts from VFIO. I'd really like KVM IRQFD to be able to
consume events before (and indeed instead of) userspace.

When the remapped MSI target in the KVM routing table is invalidated,
the VMM needs to *deassociate* the IRQFD and fall back to handling the
next IRQ in userspace, so it can be retranslated and a fault reported
if appropriate.

It's possible to do that by constantly registering and deregistering the
fd in the userspace poll loop, but it gets ugly especially because the
fallback handler isn't really local to the core MSI handling.

It's much nicer if the userspace handler can just remain registered all
the time, and it just doesn't get any events when KVM steals them first.
Which is precisely what happens with posted interrupts, and this makes
it consistent. (Unless I'm missing something that prevents posted
interrupts from working when there's another listener on the eventfd?)

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
include/linux/wait.h | 12 +++++++++++-
kernel/sched/wait.c | 11 +++++++++++
2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 27fb99cfeb02..fe10e8570a52 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -22,6 +22,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
#define WQ_FLAG_BOOKMARK 0x04
#define WQ_FLAG_CUSTOM 0x08
#define WQ_FLAG_DONE 0x10
+#define WQ_FLAG_PRIORITY 0x20

/*
* A single wait-queue entry structure:
@@ -164,11 +165,20 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)

extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);

static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
- list_add(&wq_entry->entry, &wq_head->head);
+ struct list_head *head = &wq_head->head;
+ struct wait_queue_entry *wq;
+
+ list_for_each_entry(wq, &wq_head->head, entry) {
+ if (!(wq->flags & WQ_FLAG_PRIORITY))
+ break;
+ head = &wq->entry;
+ }
+ list_add(&wq_entry->entry, head);
}

/*
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 01f5d3020589..d2a84c8e88bf 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -37,6 +37,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue
}
EXPORT_SYMBOL(add_wait_queue_exclusive);

+void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
--
2.26.2