[PATCH v2 2/2] eventpoll: Fix epoll_wait() reporting a false negative

From: Nam Cao

Date: Sat May 30 2026 - 05:39:52 EST


ep_events_available() checks for available events by looking at ep->rdllist
and ep->ovflist. However, this is done without a lock and can report a false
negative if rdllist and ovflist are modified concurrently by another task in
ep_start_scan() or ep_done_scan(). For example:
____________________________________________________________________________________
                                           |ep_start_scan()
                                           |  list_splice_init(&ep->rdllist, txlist)
ep_events_available()                      |
  !list_empty_careful(&ep->rdllist) ||     |
  READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR|
                                           |  WRITE_ONCE(ep->ovflist, NULL)
___________________________________________|________________________________________

Another example:
____________________________________________________________________________________
ep_events_available()                      |
                                           |ep_start_scan()
                                           |  list_splice_init(&ep->rdllist, txlist);
                                           |  WRITE_ONCE(ep->ovflist, NULL);
  !list_empty_careful(&ep->rdllist) ||     |
                                           |ep_done_scan()
                                           |  WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
                                           |  list_splice(txlist, &ep->rdllist);
  READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR|
___________________________________________|________________________________________

In the above examples, ep_events_available() sees no events in either
rdllist or ovflist despite events being available.

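Introduce a sequence counter (seqcount_spinlock_t, associated with ep->lock)
to resolve this issue. ep_start_scan() and ep_done_scan() wrap their updates
of rdllist and ovflist in a write section, and ep_events_available()
conservatively reports events as available if the sequence count changed
while it was checking the two lists.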

Measuring the time consumption of 10 million loop iterations doing
epoll_wait(), the following performance drop is observed:

timeout  #event  before    after     diff
0ms      0       3727ms    3974ms    +6.6%
0ms      1       8099ms    9134ms    +13%
1ms      1       13525ms   13586ms   +0.45%

Considering the typical use of epoll_wait() (wait for events, do something
with the events, repeat), the call itself should only account for a small
portion of the user's CPU consumption. Therefore this performance drop is
not alarming.

Fixes: c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()")
Suggested-by: Mateusz Guzik <mjguzik@xxxxxxxxx>
Signed-off-by: Nam Cao <namcao@xxxxxxxxxxxxx>
Cc: stable@xxxxxxxxxxxxxxx
---
fs/eventpoll.c | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a3090b446af1..58248862e5ee 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
+#include <linux/seqlock.h>
#include <net/busy_poll.h>

/*
@@ -190,6 +191,9 @@ struct eventpoll {
/* Lock which protects rdllist and ovflist */
spinlock_t lock;

+ /* Protect switching between rdllist and ovflist */
+ seqcount_spinlock_t seq;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;

@@ -382,8 +386,11 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
*/
static inline int ep_events_available(struct eventpoll *ep)
{
+ unsigned int seq = read_seqcount_begin(&ep->seq);
+
return !list_empty_careful(&ep->rdllist) ||
- READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
+ READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR ||
+ read_seqcount_retry(&ep->seq, seq);
}

#ifdef CONFIG_NET_RX_BUSY_POLL
@@ -735,8 +742,12 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
*/
lockdep_assert_irqs_enabled();
spin_lock_irq(&ep->lock);
+ write_seqcount_begin(&ep->seq);
+
list_splice_init(&ep->rdllist, txlist);
WRITE_ONCE(ep->ovflist, NULL);
+
+ write_seqcount_end(&ep->seq);
spin_unlock_irq(&ep->lock);
}

@@ -768,6 +779,9 @@ static void ep_done_scan(struct eventpoll *ep,
ep_pm_stay_awake(epi);
}
}
+
+ write_seqcount_begin(&ep->seq);
+
/*
* We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
* releasing the lock, events will be queued in the normal way inside
@@ -779,6 +793,9 @@ static void ep_done_scan(struct eventpoll *ep,
* Quickly re-inject items left on "txlist".
*/
list_splice(txlist, &ep->rdllist);
+
+ write_seqcount_end(&ep->seq);
+
__pm_relax(ep->ws);

if (!list_empty(&ep->rdllist)) {
@@ -1155,6 +1172,7 @@ static int ep_alloc(struct eventpoll **pep)

mutex_init(&ep->mtx);
spin_lock_init(&ep->lock);
+ seqcount_spinlock_init(&ep->seq, &ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
--
2.47.3