[RFC] eventfd: add EFD_AUTORESET flag

From: Stefan Hajnoczi
Date: Wed Jan 29 2020 - 12:20:49 EST


Some applications simply use eventfd for inter-thread notifications
without requiring counter or semaphore semantics. They wait for the
eventfd to become readable using poll(2)/select(2) and then call read(2)
to reset the counter.

This patch adds the EFD_AUTORESET flag to reset the counter when
f_ops->poll() finds the eventfd is readable, eliminating the need to
call read(2) to reset the counter.

This results in a small but measurable 1% performance improvement with
QEMU virtio-blk emulation. According to perf, each read(2) takes 1
microsecond of execution time in the event loop.

Signed-off-by: Stefan Hajnoczi <stefanha@xxxxxxxxxx>
---
Does this look like a reasonable thing to do? I'm not very familiar
with f_ops->poll() or the eventfd internals, so maybe I'm overlooking a
design flaw.

I've tested this with QEMU and it works fine:
https://github.com/stefanha/qemu/commits/eventfd-autoreset
---
fs/eventfd.c | 99 +++++++++++++++++++++++++----------------
include/linux/eventfd.h | 3 +-
2 files changed, 62 insertions(+), 40 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8aa0ea8c55e8..208f6b9e2234 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -116,45 +116,62 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)

poll_wait(file, &ctx->wqh, wait);

- /*
- * All writes to ctx->count occur within ctx->wqh.lock. This read
- * can be done outside ctx->wqh.lock because we know that poll_wait
- * takes that lock (through add_wait_queue) if our caller will sleep.
- *
- * The read _can_ therefore seep into add_wait_queue's critical
- * section, but cannot move above it! add_wait_queue's spin_lock acts
- * as an acquire barrier and ensures that the read be ordered properly
- * against the writes. The following CAN happen and is safe:
- *
- * poll write
- * ----------------- ------------
- * lock ctx->wqh.lock (in poll_wait)
- * count = ctx->count
- * __add_wait_queue
- * unlock ctx->wqh.lock
- * lock ctx->qwh.lock
- * ctx->count += n
- * if (waitqueue_active)
- * wake_up_locked_poll
- * unlock ctx->qwh.lock
- * eventfd_poll returns 0
- *
- * but the following, which would miss a wakeup, cannot happen:
- *
- * poll write
- * ----------------- ------------
- * count = ctx->count (INVALID!)
- * lock ctx->qwh.lock
- * ctx->count += n
- * **waitqueue_active is false**
- * **no wake_up_locked_poll!**
- * unlock ctx->qwh.lock
- * lock ctx->wqh.lock (in poll_wait)
- * __add_wait_queue
- * unlock ctx->wqh.lock
- * eventfd_poll returns 0
- */
- count = READ_ONCE(ctx->count);
+ if (ctx->flags & EFD_AUTORESET) {
+ unsigned long flags;
+ __poll_t requested = poll_requested_events(wait);
+
+ spin_lock_irqsave(&ctx->wqh.lock, flags);
+ count = ctx->count;
+
+ /* Reset counter if caller is polling for read */
+ if (count != 0 && (requested & EPOLLIN)) {
+ ctx->count = 0;
+ events |= EPOLLOUT;
+ /* TODO: is an EPOLLOUT wakeup necessary here? */
+ }
+
+ spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+ } else {
+ /*
+ * All writes to ctx->count occur within ctx->wqh.lock. This read
+ * can be done outside ctx->wqh.lock because we know that poll_wait
+ * takes that lock (through add_wait_queue) if our caller will sleep.
+ *
+ * The read _can_ therefore seep into add_wait_queue's critical
+ * section, but cannot move above it! add_wait_queue's spin_lock acts
+ * as an acquire barrier and ensures that the read be ordered properly
+ * against the writes. The following CAN happen and is safe:
+ *
+ * poll write
+ * ----------------- ------------
+ * lock ctx->wqh.lock (in poll_wait)
+ * count = ctx->count
+ * __add_wait_queue
+ * unlock ctx->wqh.lock
+ * lock ctx->wqh.lock
+ * ctx->count += n
+ * if (waitqueue_active)
+ * wake_up_locked_poll
+ * unlock ctx->wqh.lock
+ * eventfd_poll returns 0
+ *
+ * but the following, which would miss a wakeup, cannot happen:
+ *
+ * poll write
+ * ----------------- ------------
+ * count = ctx->count (INVALID!)
+ * lock ctx->wqh.lock
+ * ctx->count += n
+ * **waitqueue_active is false**
+ * **no wake_up_locked_poll!**
+ * unlock ctx->wqh.lock
+ * lock ctx->wqh.lock (in poll_wait)
+ * __add_wait_queue
+ * unlock ctx->wqh.lock
+ * eventfd_poll returns 0
+ */
+ count = READ_ONCE(ctx->count);
+ }

if (count > 0)
events |= EPOLLIN;
@@ -400,6 +417,10 @@ static int do_eventfd(unsigned int count, int flags)
if (flags & ~EFD_FLAGS_SET)
return -EINVAL;

+ /* Semaphore semantics don't make sense when autoreset is enabled */
+ if ((flags & EFD_SEMAPHORE) && (flags & EFD_AUTORESET))
+ return -EINVAL;
+
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index ffcc7724ca21..27577fafc553 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -21,11 +21,12 @@
* shared O_* flags.
*/
#define EFD_SEMAPHORE (1 << 0)
+#define EFD_AUTORESET (1 << 6) /* aliases O_CREAT */
#define EFD_CLOEXEC O_CLOEXEC
#define EFD_NONBLOCK O_NONBLOCK

#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
-#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
+#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE | EFD_AUTORESET)

struct eventfd_ctx;
struct file;
--
2.24.1