[PATCH] fs/epoll: make nesting accounting safe for -rt kernel

From: Jason Baron
Date: Fri Jan 17 2020 - 14:22:09 EST


Davidlohr Bueso pointed out that when CONFIG_DEBUG_LOCK_ALLOC is set
ep_poll_safewake() can take several non-raw spinlocks after disabling
pre-emption which is no no for the -rt kernel. So let's re-work how we
determine the nesting level such that it plays nicely with -rt kernel.

Let's introduce a 'nests' field in struct eventpoll that records the
current nesting level during ep_poll_callback(). Then, if we nest again we
can find the previous struct eventpoll that we were called from and
increase our count by 1. The 'nests' field is protected by
ep->poll_wait.lock.

I've also moved napi_id field into a hole in struct eventpoll as part of
introduing the nests field. This change reduces the struct eventpoll from
184 bytes to 176 bytes on x86_64 for the !CONFIG_DEBUG_LOCK_ALLOC
production config.

Reported-by: Davidlohr Bueso <dbueso@xxxxxxx>
Signed-off-by: Jason Baron <jbaron@xxxxxxxxxx>
---
fs/eventpoll.c | 61 ++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 40 insertions(+), 21 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 67a39503..8ee356c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -219,12 +219,18 @@ struct eventpoll {

/* used to optimize loop detection check */
int visited;
- struct list_head visited_list_link;

#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif
+
+ struct list_head visited_list_link;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /* used to track wakeup nests for lockdep validation */
+ u8 nests;
+#endif
};

/* Wait structure used by the poll hooks */
@@ -551,30 +557,43 @@ static int ep_call_nested(struct nested_calls *ncalls,
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC

-static DEFINE_PER_CPU(int, wakeup_nest);
-
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
+ struct eventpoll *ep_src;
unsigned long flags;
- int subclass;
+ u8 nests = 0;

- local_irq_save(flags);
- preempt_disable();
- subclass = __this_cpu_read(wakeup_nest);
- spin_lock_nested(&wq->lock, subclass + 1);
- __this_cpu_inc(wakeup_nest);
- wake_up_locked_poll(wq, POLLIN);
- __this_cpu_dec(wakeup_nest);
- spin_unlock(&wq->lock);
- local_irq_restore(flags);
- preempt_enable();
+ /*
+ * If we are not being call from ep_poll_callback(), epi is
+ * NULL and we are at the first level of nesting, 0. Otherwise,
+ * we are being called from ep_poll_callback() and if a previous
+ * wakeup source is not an epoll file itself, we are at depth
+ * 1 since the wakeup source is depth 0. If the wakeup source
+ * is a previous epoll file in the wakeup chain then we use its
+ * nests value and record ours as nests + 1. The previous epoll
+ * file nests value is stable since its already holding its
+ * own poll_wait.lock.
+ */
+ if (epi) {
+ if ((is_file_epoll(epi->ffd.file))) {
+ ep_src = epi->ffd.file->private_data;
+ nests = ep_src->nests;
+ } else {
+ nests = 1;
+ }
+ }
+ spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
+ ep->nests = nests + 1;
+ wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+ ep->nests = 0;
+ spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}

#else

-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
- wake_up_poll(wq, EPOLLIN);
+ wake_up_poll(&ep->poll_wait, EPOLLIN);
}

#endif
@@ -795,7 +814,7 @@ static void ep_free(struct eventpoll *ep)

/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);

/*
* We need to lock this because we could be hit by
@@ -1264,7 +1283,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v

/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, epi);

if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
@@ -1568,7 +1587,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,

/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);

return 0;

@@ -1672,7 +1691,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,

/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);

return 0;
}
--
2.7.4