[PATCH 6/7] eventpoll: add support for min-wait

From: Jens Axboe
Date: Thu Dec 01 2022 - 13:12:37 EST


This adds the necessary infrastructure to support a minimum wait for
reaping events. The API for setting or applying a minimum wait will come
in the following patches.

For medium workload efficiencies, some production workloads inject
artificial timers or sleeps before calling epoll_wait() to get
better batching and higher efficiencies. While this does help, it's
not as efficient as it could be. By adding support for this directly
in epoll_wait(), we can avoid extra context switches as well as
scheduler and timer overhead.

As an example, running an AB test on an identical workload at
~370K reqs/second, without this change and with the sleep hack
mentioned above (using 200 usec as the timeout), we're doing 310K-340K
non-voluntary context switches per second. Idle CPU on the host is 27-34%.
With the sleep hack removed and epoll set to the same 200 usec
value, we're handling the exact same load but at 292K-315K non-voluntary
context switches and idle CPU of 33-41%, a substantial win.

Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
---
fs/eventpoll.c | 84 ++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 71 insertions(+), 13 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 962d897bbfc6..daa9885d9c2b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -117,6 +117,9 @@ struct eppoll_entry {
/* The "base" pointer is set to the container "struct epitem" */
struct epitem *base;

+ /* min wait time, enabled if (min_wait_ts & 1) != 0 */
+ ktime_t min_wait_ts;
+
/*
* Wait queue item that will be linked to the target file wait
* queue head.
@@ -217,6 +220,9 @@ struct eventpoll {
u64 gen;
struct hlist_head refs;

+ /* min wait for epoll_wait(), in usec; 0 means no min wait */
+ unsigned int min_wait_ts;
+
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
@@ -1747,6 +1753,32 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
return to;
}

+/*
+ * Wait state used by ep_poll() while sleeping for events: the wait queue
+ * entry, the timer, and the bookkeeping that decides when to wake up.
+ */
+struct epoll_wq {
+ /* entry queued on ep->wq; woken via ep_autoremove_wake_function() */
+ wait_queue_entry_t wait;
+ /* timer whose expiry is handled by ep_timer() */
+ struct hrtimer timer;
+ /* overall timeout; only assigned when ep_poll() gets a non-zero timeout */
+ ktime_t timeout_ts;
+ /* end of the min wait period; bit 0 is set while min wait is enabled */
+ ktime_t min_wait_ts;
+ /* epoll instance, checked by ep_timer() for available events */
+ struct eventpoll *ep;
+ /* set once the wait has timed out (or when no waiting was requested) */
+ bool timed_out;
+ /* maxevents passed to ep_poll(); min wait stops once wakeups reaches it */
+ int maxevents;
+ /* wakeups counted by ep_should_min_wait() while min wait is enabled */
+ int wakeups;
+};
+
+/*
+ * Decide whether a wakeup should be held back because the min wait period
+ * is still in effect. Returns true to keep waiting. Whenever false is
+ * returned, the min-wait state bit (bit 0 of min_wait_ts) has been cleared.
+ */
+static bool ep_should_min_wait(struct epoll_wq *ewq)
+{
+	const bool min_wait_armed = ewq->min_wait_ts & 1;
+
+	/*
+	 * Just an approximation: stop min-waiting once the number of
+	 * wakeups seen reaches maxevents. The counter is only bumped while
+	 * min wait is armed.
+	 */
+	if (min_wait_armed && ++ewq->wakeups < ewq->maxevents &&
+	    ktime_before(ktime_get_ns(), ewq->min_wait_ts))
+		return true;
+
+	ewq->min_wait_ts &= ~(u64) 1;
+	return false;
+}
+
/*
* autoremove_wake_function, but remove even on failure to wake up, because we
* know that default_wake_function/ttwu will only fail if the thread is already
@@ -1756,27 +1788,37 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
unsigned int mode, int sync, void *key)
{
- int ret = default_wake_function(wq_entry, mode, sync, key);
+ struct epoll_wq *ewq = container_of(wq_entry, struct epoll_wq, wait);
+ int ret;

+ /*
+ * If the min wait time hasn't been satisfied yet, keep waiting:
+ * return 0 without waking the task and leave the entry queued.
+ */
+ if (ep_should_min_wait(ewq))
+ return 0;
+
+ /* Wake the task and always unlink the entry, even if the wakeup failed */
+ ret = default_wake_function(wq_entry, mode, sync, key);
list_del_init(&wq_entry->entry);
return ret;
}

-struct epoll_wq {
- wait_queue_entry_t wait;
- struct hrtimer timer;
- ktime_t timeout_ts;
- bool timed_out;
-};
-
+/*
+ * Timer expiry handler for a struct epoll_wq. Bit 0 of min_wait_ts tells
+ * whether this expiry marks the end of the min wait period or not.
+ */
static enum hrtimer_restart ep_timer(struct hrtimer *timer)
{
struct epoll_wq *ewq = container_of(timer, struct epoll_wq, timer);
struct task_struct *task = ewq->wait.private;
+ const bool is_min_wait = ewq->min_wait_ts & 1;
+
+ /*
+ * Wake the task if min wait is not enabled, or if it is and events
+ * are available. Only the former is recorded as a timeout. Either
+ * way the min-wait bit is cleared and the timer is not restarted.
+ */
+ if (!is_min_wait || ep_events_available(ewq->ep)) {
+ if (!is_min_wait)
+ ewq->timed_out = true;
+ ewq->min_wait_ts &= ~(u64) 1;
+ wake_up_process(task);
+ return HRTIMER_NORESTART;
+ }

- ewq->timed_out = true;
- wake_up_process(task);
- return HRTIMER_NORESTART;
+ /*
+ * Min wait expired with no events available: clear the min-wait bit
+ * and re-arm the timer to expire at timeout_ts.
+ * NOTE(review): ep_poll() only assigns timeout_ts when a non-zero
+ * timeout is passed - confirm what gets armed here when the caller
+ * waits without a timeout.
+ */
+ ewq->min_wait_ts &= ~(u64) 1;
+ hrtimer_set_expires_range_ns(&ewq->timer, ewq->timeout_ts, 0);
+ return HRTIMER_RESTART;
}

static void ep_schedule(struct eventpoll *ep, struct epoll_wq *ewq, ktime_t *to,
@@ -1831,12 +1873,16 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,

lockdep_assert_irqs_enabled();

+ ewq.min_wait_ts = 0;
+ ewq.ep = ep;
+ ewq.maxevents = maxevents;
ewq.timed_out = false;
+ ewq.wakeups = 0;

if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
slack = select_estimate_accuracy(timeout);
+ ewq.timeout_ts = timespec64_to_ktime(*timeout);
to = &ewq.timeout_ts;
- *to = timespec64_to_ktime(*timeout);
} else if (timeout) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
@@ -1845,6 +1891,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
ewq.timed_out = true;
}

+ /*
+ * If min_wait is set for this epoll instance, note the min_wait
+ * time. Ensure the lowest bit is set in ewq.min_wait_ts, that's
+ * the state bit for whether or not min_wait is enabled.
+ */
+ if (!ewq.timed_out && ep->min_wait_ts) {
+ ewq.min_wait_ts = ktime_add_us(ktime_get_ns(),
+ ep->min_wait_ts);
+ ewq.min_wait_ts |= (u64) 1;
+ to = &ewq.min_wait_ts;
+ }
+
/*
* This call is racy: We may or may not see events that are being added
* to the ready list under the lock (e.g., in IRQ callbacks). For cases
@@ -1913,7 +1971,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* important.
*/
eavail = ep_events_available(ep);
- if (!eavail) {
+ if (!eavail || ewq.min_wait_ts & 1) {
__add_wait_queue_exclusive(&ep->wq, &ewq.wait);
write_unlock_irq(&ep->lock);
ep_schedule(ep, &ewq, to, slack);
--
2.35.1