Re: [PATCH] epoll: Add a flag, EPOLLWAKEUP, to prevent suspend while epoll events are ready

From: NeilBrown
Date: Tue May 01 2012 - 02:28:30 EST


On Mon, 30 Apr 2012 22:33:48 -0700 Arve Hjønnevåg <arve@xxxxxxxxxxx> wrote:

> When an epoll_event that has the EPOLLWAKEUP flag set is ready, a
> wakeup_source will be active to prevent suspend. This can be used to
> handle wakeup events from a driver that supports poll, e.g. input, if
> that driver wakes up the waitqueue passed to epoll before allowing
> suspend.
>
> Signed-off-by: Arve Hjønnevåg <arve@xxxxxxxxxxx>
> Signed-off-by: Rafael J. Wysocki <rjw@xxxxxxx>

Thanks.
Reviewed-by: NeilBrown <neilb@xxxxxxx>

However:
1/ I think all references to "automatic system suspend" can be replaced with
"system suspend", as an active wakeup_source disables any suspend, no matter
its source.
2/ I reserve the right to submit for discussion a later patch which removes
the ep->ws in favour of some other exclusion mechanism :-)

NeilBrown



> ---
> fs/eventpoll.c | 90 ++++++++++++++++++++++++++++++++++++++++++-
> include/linux/capability.h | 5 ++-
> include/linux/eventpoll.h | 12 ++++++
> 3 files changed, 103 insertions(+), 4 deletions(-)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 739b098..1abed50 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -33,6 +33,7 @@
> #include <linux/bitops.h>
> #include <linux/mutex.h>
> #include <linux/anon_inodes.h>
> +#include <linux/device.h>
> #include <asm/uaccess.h>
> #include <asm/io.h>
> #include <asm/mman.h>
> @@ -87,7 +88,7 @@
> */
>
> /* Epoll private bits inside the event mask */
> -#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
> +#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
>
> /* Maximum number of nesting allowed inside epoll sets */
> #define EP_MAX_NESTS 4
> @@ -154,6 +155,9 @@ struct epitem {
> /* List header used to link this item to the "struct file" items list */
> struct list_head fllink;
>
> + /* wakeup_source used when EPOLLWAKEUP is set */
> + struct wakeup_source *ws;
> +
> /* The structure that describe the interested events and the source fd */
> struct epoll_event event;
> };
> @@ -194,6 +198,9 @@ struct eventpoll {
> */
> struct epitem *ovflist;
>
> + /* wakeup_source used when ep_scan_ready_list is running */
> + struct wakeup_source *ws;
> +
> /* The user that created the eventpoll descriptor */
> struct user_struct *user;
>
> @@ -588,8 +595,10 @@ static int ep_scan_ready_list(struct eventpoll *ep,
> * queued into ->ovflist but the "txlist" might already
> * contain them, and the list_splice() below takes care of them.
> */
> - if (!ep_is_linked(&epi->rdllink))
> + if (!ep_is_linked(&epi->rdllink)) {
> list_add_tail(&epi->rdllink, &ep->rdllist);
> + __pm_stay_awake(epi->ws);
> + }
> }
> /*
> * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
> @@ -602,6 +611,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
> * Quickly re-inject items left on "txlist".
> */
> list_splice(&txlist, &ep->rdllist);
> + __pm_relax(ep->ws);
>
> if (!list_empty(&ep->rdllist)) {
> /*
> @@ -656,6 +666,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
> list_del_init(&epi->rdllink);
> spin_unlock_irqrestore(&ep->lock, flags);
>
> + wakeup_source_unregister(epi->ws);
> +
> /* At this point it is safe to free the eventpoll item */
> kmem_cache_free(epi_cache, epi);
>
> @@ -706,6 +718,7 @@ static void ep_free(struct eventpoll *ep)
> mutex_unlock(&epmutex);
> mutex_destroy(&ep->mtx);
> free_uid(ep->user);
> + wakeup_source_unregister(ep->ws);
> kfree(ep);
> }
>
> @@ -737,6 +750,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
> * callback, but it's not actually ready, as far as
> * caller requested events goes. We can remove it here.
> */
> + __pm_relax(epi->ws);
> list_del_init(&epi->rdllink);
> }
> }
> @@ -927,13 +941,23 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
> if (epi->next == EP_UNACTIVE_PTR) {
> epi->next = ep->ovflist;
> ep->ovflist = epi;
> + if (epi->ws) {
> + /*
> + * Activate ep->ws since epi->ws may get
> + * deactivated at any time.
> + */
> + __pm_stay_awake(ep->ws);
> + }
> +
> }
> goto out_unlock;
> }
>
> /* If this file is already in the ready list we exit soon */
> - if (!ep_is_linked(&epi->rdllink))
> + if (!ep_is_linked(&epi->rdllink)) {
> list_add_tail(&epi->rdllink, &ep->rdllist);
> + __pm_stay_awake(epi->ws);
> + }
>
> /*
> * Wake up ( if active ) both the eventpoll wait list and the ->poll()
> @@ -1091,6 +1115,30 @@ static int reverse_path_check(void)
> return error;
> }
>
> +static int ep_create_wakeup_source(struct epitem *epi)
> +{
> + const char *name;
> +
> + if (!epi->ep->ws) {
> + epi->ep->ws = wakeup_source_register("eventpoll");
> + if (!epi->ep->ws)
> + return -ENOMEM;
> + }
> +
> + name = epi->ffd.file->f_path.dentry->d_name.name;
> + epi->ws = wakeup_source_register(name);
> + if (!epi->ws)
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +static void ep_destroy_wakeup_source(struct epitem *epi)
> +{
> + wakeup_source_unregister(epi->ws);
> + epi->ws = NULL;
> +}
> +
> /*
> * Must be called with "mtx" held.
> */
> @@ -1118,6 +1166,13 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
> epi->event = *event;
> epi->nwait = 0;
> epi->next = EP_UNACTIVE_PTR;
> + if (epi->event.events & EPOLLWAKEUP) {
> + error = ep_create_wakeup_source(epi);
> + if (error)
> + goto error_create_wakeup_source;
> + } else {
> + epi->ws = NULL;
> + }
>
> /* Initialize the poll table using the queue callback */
> epq.epi = epi;
> @@ -1164,6 +1219,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
> /* If the file is already "ready" we drop it inside the ready list */
> if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
> list_add_tail(&epi->rdllink, &ep->rdllist);
> + __pm_stay_awake(epi->ws);
>
> /* Notify waiting tasks that events are available */
> if (waitqueue_active(&ep->wq))
> @@ -1204,6 +1260,9 @@ error_unregister:
> list_del_init(&epi->rdllink);
> spin_unlock_irqrestore(&ep->lock, flags);
>
> + wakeup_source_unregister(epi->ws);
> +
> +error_create_wakeup_source:
> kmem_cache_free(epi_cache, epi);
>
> return error;
> @@ -1229,6 +1288,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
> epi->event.events = event->events;
> pt._key = event->events;
> epi->event.data = event->data; /* protected by mtx */
> + if (epi->event.events & EPOLLWAKEUP) {
> + if (!epi->ws)
> + ep_create_wakeup_source(epi);
> + } else if (epi->ws) {
> + ep_destroy_wakeup_source(epi);
> + }
>
> /*
> * Get current event bits. We can safely use the file* here because
> @@ -1244,6 +1309,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
> spin_lock_irq(&ep->lock);
> if (!ep_is_linked(&epi->rdllink)) {
> list_add_tail(&epi->rdllink, &ep->rdllist);
> + __pm_stay_awake(epi->ws);
>
> /* Notify waiting tasks that events are available */
> if (waitqueue_active(&ep->wq))
> @@ -1282,6 +1348,18 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
> !list_empty(head) && eventcnt < esed->maxevents;) {
> epi = list_first_entry(head, struct epitem, rdllink);
>
> + /*
> + * Activate ep->ws before deactivating epi->ws to prevent
> + * triggering auto-suspend here (in case we reactivate epi->ws
> + * below).
> + *
> + * This could be rearranged to delay the deactivation of epi->ws
> + * instead, but then epi->ws would temporarily be out of sync
> + * with ep_is_linked().
> + */
> + if (epi->ws && epi->ws->active)
> + __pm_stay_awake(ep->ws);
> + __pm_relax(epi->ws);
> list_del_init(&epi->rdllink);
>
> pt._key = epi->event.events;
> @@ -1298,6 +1376,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
> if (__put_user(revents, &uevent->events) ||
> __put_user(epi->event.data, &uevent->data)) {
> list_add(&epi->rdllink, head);
> + __pm_stay_awake(epi->ws);
> return eventcnt ? eventcnt : -EFAULT;
> }
> eventcnt++;
> @@ -1317,6 +1396,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
> * poll callback will queue them in ep->ovflist.
> */
> list_add_tail(&epi->rdllink, &ep->rdllist);
> + __pm_stay_awake(epi->ws);
> }
> }
> }
> @@ -1629,6 +1709,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
> if (!tfile->f_op || !tfile->f_op->poll)
> goto error_tgt_fput;
>
> + /* Check if EPOLLWAKEUP is allowed */
> + if ((epds.events & EPOLLWAKEUP) && !capable(CAP_EPOLLWAKEUP))
> + goto error_tgt_fput;
> +
> /*
> * We have to check that the file structure underneath the file descriptor
> * the user passed to us _is_ an eventpoll file. And also we do not permit
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index 12d52de..222974a 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -360,8 +360,11 @@ struct cpu_vfs_cap_data {
>
> #define CAP_WAKE_ALARM 35
>
> +/* Allow preventing automatic system suspends while epoll events are pending */
>
> -#define CAP_LAST_CAP CAP_WAKE_ALARM
> +#define CAP_EPOLLWAKEUP 36
> +
> +#define CAP_LAST_CAP CAP_EPOLLWAKEUP
>
> #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
>
> diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
> index 657ab55..5b591fb 100644
> --- a/include/linux/eventpoll.h
> +++ b/include/linux/eventpoll.h
> @@ -26,6 +26,18 @@
> #define EPOLL_CTL_DEL 2
> #define EPOLL_CTL_MOD 3
>
> +/*
> + * Request the handling of system wakeup events so as to prevent automatic
> + * system suspends from happening while those events are being processed.
> + *
> + * Assuming neither EPOLLET nor EPOLLONESHOT is set, automatic system suspends
> + * will not be re-allowed until epoll_wait is called again after consuming the
> + * wakeup event(s).
> + *
> + * Requires CAP_EPOLLWAKEUP
> + */
> +#define EPOLLWAKEUP (1 << 29)
> +
> /* Set the One Shot behaviour for the target file descriptor */
> #define EPOLLONESHOT (1 << 30)
>
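
For reference, here is a minimal userspace sketch of the intended usage.
This is only an illustration: the add_wakeup_fd() helper is made up, and it
assumes EPOLLWAKEUP and CAP_EPOLLWAKEUP land as defined in the patch above.

/*
 * Mark an input fd so that a pending epoll event keeps the system awake
 * until it has been fetched with epoll_wait() and consumed.
 * Assumes EPOLLWAKEUP == (1 << 29) as in include/linux/eventpoll.h above.
 */
#include <sys/epoll.h>

#ifndef EPOLLWAKEUP
#define EPOLLWAKEUP (1 << 29)
#endif

static int add_wakeup_fd(int epfd, int fd)
{
	struct epoll_event ev = {
		.events  = EPOLLIN | EPOLLWAKEUP,
		.data.fd = fd,
	};

	/*
	 * With this patch, epoll_ctl() is expected to fail (EPERM) if
	 * EPOLLWAKEUP is requested without CAP_EPOLLWAKEUP.
	 */
	return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
}

Per the comment in eventpoll.h above, without EPOLLET or EPOLLONESHOT,
suspend is not re-allowed until epoll_wait() is called again after the
wakeup event(s) have been consumed.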
