Re: [PATCH RFC v2 1/4] cgroup: implement eventfd-based generic API for notifications

From: Kirill A. Shutemov
Date: Wed Dec 16 2009 - 00:46:52 EST


On Wed, Dec 16, 2009 at 3:44 AM, Li Zefan <lizf@xxxxxxxxxxxxxx> wrote:
> Kirill A. Shutemov wrote:
>> This patch introduces write-only file "cgroup.event_control" in every
>> cgroup.
>>
>> To register new notification handler you need:
>> - create an eventfd;
>> - open a control file to be monitored. Callbacks register_event() and
>>   unregister_event() must be defined for the control file;
>> - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
>>   Interpretation of args is defined by control file implementation;
>>
>> eventfd will be woken up by control file implementation or when the
>> cgroup is removed.
>>
>> To unregister notification handler just close eventfd.
>>
>> If you need notification functionality for a control file you have to
>> implement callbacks register_event() and unregister_event() in the
>> struct cftype.
>>
>> Signed-off-by: Kirill A. Shutemov <kirill@xxxxxxxxxxxxx>
>> ---
>>  include/linux/cgroup.h |   20 +++++
>>  kernel/cgroup.c        |  215 +++++++++++++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 234 insertions(+), 1 deletions(-)
>>
>> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
>> index 0008dee..7ad3078 100644
>> --- a/include/linux/cgroup.h
>> +++ b/include/linux/cgroup.h
>> @@ -220,6 +220,10 @@ struct cgroup {
>>
>> Â Â Â /* For RCU-protected deletion */
>> Â Â Â struct rcu_head rcu_head;
>> +
>> + Â Â /* List of events which userspace want to recieve */
>> + Â Â struct list_head event_list;
>> + Â Â struct mutex event_list_mutex;
>> Â};
>>
>> Â/*
>> @@ -362,6 +366,22 @@ struct cftype {
>> Â Â Â int (*trigger)(struct cgroup *cgrp, unsigned int event);
>>
>> Â Â Â int (*release)(struct inode *inode, struct file *file);
>> +
>> + Â Â /*
>> + Â Â Â* register_event() callback will be used to add new userspace
>> + Â Â Â* waiter for changes related to the cftype. Implement it if
>> + Â Â Â* you want to provide this functionality. Use eventfd_signal()
>> + Â Â Â* on eventfd to send notification to userspace.
>> + Â Â Â*/
>> + Â Â int (*register_event)(struct cgroup *cgrp, struct cftype *cft,
>> + Â Â Â Â Â Â Â Â Â Â struct eventfd_ctx *eventfd, const char *args);
>> + Â Â /*
>> + Â Â Â* unregister_event() callback will be called when userspace
>> + Â Â Â* close the eventfd. This callback must be implemented, if you
>> + Â Â Â* provide register_event().
>> + Â Â Â*/
>> + Â Â int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft,
>> + Â Â Â Â Â Â Â Â Â Â struct eventfd_ctx *eventfd);
>> Â};
>>
>> Âstruct cgroup_scanner {
>> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
>> index 0249f4b..f7ec3ca 100644
>> --- a/kernel/cgroup.c
>> +++ b/kernel/cgroup.c
>> @@ -4,6 +4,10 @@
>> Â * ÂBased originally on the cpuset system, extracted by Paul Menage
>> Â * ÂCopyright (C) 2006 Google, Inc
>> Â *
>> + * ÂNotifiactions support
>
> s/Notifiactions/Notifications

Thanks.

>> + * ÂCopyright (C) 2009 Nokia Corporation
>> + * ÂAuthor: Kirill A. Shutemov
>> + *
>> Â * ÂCopyright notices from the original cpuset code:
>> Â * Â--------------------------------------------------
>> Â * ÂCopyright (C) 2003 BULL SA.
>> @@ -51,6 +55,8 @@
>> Â#include <linux/pid_namespace.h>
>> Â#include <linux/idr.h>
>> Â#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
>> +#include <linux/eventfd.h>
>> +#include <linux/poll.h>
>>
>> Â#include <asm/atomic.h>
>>
>> @@ -146,6 +152,36 @@ struct css_id {
>> Â Â Â unsigned short stack[0]; /* Array of Length (depth+1) */
>> Â};
>>
>> +/*
>> + * cgroup_event represents event which userspace want to recieve.
>
> s/event/events ?

Thanks.

>
>> + */
>> +struct cgroup_event {
>> + Â Â /*
>> + Â Â Â* Cgroup which the event belongs to.
>> + Â Â Â*/
>> + Â Â struct cgroup *cgrp;
>> + Â Â /*
>> + Â Â Â* Control file which the event associated.
>> + Â Â Â*/
>> + Â Â struct cftype *cft;
>> + Â Â /*
>> + Â Â Â* eventfd to signal userspace about the event.
>> + Â Â Â*/
>> + Â Â struct eventfd_ctx *eventfd;
>> + Â Â /*
>> + Â Â Â* Each of these stored in a list by the cgroup.
>> + Â Â Â*/
>> + Â Â struct list_head list;
>> + Â Â /*
>> + Â Â Â* All fields below needed to unregister event when
>> + Â Â Â* userspace closes eventfd.
>> + Â Â Â*/
>> + Â Â poll_table pt;
>> + Â Â wait_queue_head_t *wqh;
>> + Â Â wait_queue_t wait;
>> + Â Â struct work_struct remove;
>> +};
>
> Please add a blank line here.

Ok.

>> +static void cgroup_event_remove(struct cgroup_event *event);
>>
>> Â/* The list of hierarchy roots */
>>
>> @@ -734,14 +770,29 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
>> Âstatic int cgroup_call_pre_destroy(struct cgroup *cgrp)
>> Â{
>> Â Â Â struct cgroup_subsys *ss;
>> + Â Â struct cgroup_event *event, *tmp;
>> Â Â Â int ret = 0;
>>
>> Â Â Â for_each_subsys(cgrp->root, ss)
>> Â Â Â Â Â Â Â if (ss->pre_destroy) {
>> Â Â Â Â Â Â Â Â Â Â Â ret = ss->pre_destroy(ss, cgrp);
>> Â Â Â Â Â Â Â Â Â Â Â if (ret)
>> - Â Â Â Â Â Â Â Â Â Â Â Â Â Â break;
>> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â goto out;
>> Â Â Â Â Â Â Â }
>> +
>> +     /*
>> +      * Unregister events and notify userspace.
>> +      * FIXME: How to avoid race with cgroup_event_remove_work()
>> +      *        which runs from workqueue?
>> +      */
>> + Â Â mutex_lock(&cgrp->event_list_mutex);
>> + Â Â list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
>> + Â Â Â Â Â Â cgroup_event_remove(event);
>> + Â Â Â Â Â Â eventfd_signal(event->eventfd, 1);
>
> How can you access event after you kfree()ed it in cgroup_event_remove()?

Nice catch. Thank you.

>> + Â Â }
>> + Â Â mutex_unlock(&cgrp->event_list_mutex);
>> +
>> +out:
>> Â Â Â return ret;
>> Â}
>>
>> @@ -1136,6 +1187,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
>> Â Â Â INIT_LIST_HEAD(&cgrp->release_list);
>> Â Â Â INIT_LIST_HEAD(&cgrp->pidlists);
>> Â Â Â mutex_init(&cgrp->pidlist_mutex);
>> + Â Â INIT_LIST_HEAD(&cgrp->event_list);
>> + Â Â mutex_init(&cgrp->event_list_mutex);
>> Â}
>>
>> Âstatic void init_cgroup_root(struct cgroupfs_root *root)
>> @@ -1935,6 +1988,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
>> Â Â Â .rename = cgroup_rename,
>> Â};
>>
>> +/*
>> + * Check if a file is a control file
>> + */
>> +static inline struct cftype *__file_cft(struct file *file)
>> +{
>> + Â Â if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
>> + Â Â Â Â Â Â return ERR_PTR(-EINVAL);
>
> I don't think this check is needed.
>
>> + Â Â return __d_cft(file->f_dentry);
>> +}
>> +
>> Âstatic int cgroup_create_file(struct dentry *dentry, mode_t mode,
>> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â struct super_block *sb)
>> Â{
>> @@ -2789,6 +2852,151 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
>> Â Â Â return 0;
>> Â}
>>
>> +static inline void cgroup_event_remove(struct cgroup_event *event)
>> +{
>> + Â Â struct cgroup *cgrp = event->cgrp;
>> +
>> + Â Â BUG_ON(event->cft->unregister_event(cgrp, event->cft, event->eventfd));
>> + Â Â eventfd_ctx_put(event->eventfd);
>> + Â Â remove_wait_queue(event->wqh, &event->wait);
>> + Â Â list_del(&event->list);
>> + Â Â kfree(event);
>> +}
>> +
>> +static void cgroup_event_remove_work(struct work_struct *work)
>> +{
>> + Â Â struct cgroup_event *event = container_of(work, struct cgroup_event,
>> + Â Â Â Â Â Â Â Â Â Â remove);
>> + Â Â struct cgroup *cgrp = event->cgrp;
>> +
>> + Â Â mutex_lock(&cgrp->event_list_mutex);
>> + Â Â cgroup_event_remove(event);
>> + Â Â mutex_unlock(&cgrp->event_list_mutex);
>> +}
>> +
>> +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
>> + Â Â Â Â Â Â int sync, void *key)
>> +{
>> + Â Â struct cgroup_event *event = container_of(wait,
>> + Â Â Â Â Â Â Â Â Â Â struct cgroup_event, wait);
>> + Â Â unsigned long flags = (unsigned long)key;
>> +
>> + Â Â if (flags & POLLHUP)
>> + Â Â Â Â Â Â /*
>> + Â Â Â Â Â Â Â* This function called with spinlock taken, but
>
> s/called/is called/ ?

Ok.

>> + Â Â Â Â Â Â Â* cgroup_event_remove() may sleep, so we have
>> + Â Â Â Â Â Â Â* to run it in a workqueue.
>> + Â Â Â Â Â Â Â*/
>> + Â Â Â Â Â Â schedule_work(&event->remove);
>
> Please use:
>
>         if (...) {
>                 ...
>         }

Ok.

>> +
>> + Â Â return 0;
>> +}
>> +
>> +static void cgroup_event_ptable_queue_proc(struct file *file,
>> + Â Â Â Â Â Â wait_queue_head_t *wqh, poll_table *pt)
>> +{
>> + Â Â struct cgroup_event *event = container_of(pt,
>> + Â Â Â Â Â Â Â Â Â Â struct cgroup_event, pt);
>> +
>> + Â Â event->wqh = wqh;
>> + Â Â add_wait_queue(wqh, &event->wait);
>> +}
>> +
>> +static int cgroup_write_event_control(struct cgroup *cont, struct cftype *cft,
>
> Please consistently use "cgrp"

Ok.

>> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â const char *buffer)
>> +{
>> + Â Â struct cgroup_event *event = NULL;
>> + Â Â unsigned int efd, cfd;
>> + Â Â struct file *efile = NULL;
>> + Â Â struct file *cfile = NULL;
>> + Â Â char *endp;
>> + Â Â int ret;
>> +
>> + Â Â efd = simple_strtoul(buffer, &endp, 10);
>> + Â Â if (*endp != ' ')
>> + Â Â Â Â Â Â return -EINVAL;
>> + Â Â buffer = endp + 1;
>> +
>> + Â Â cfd = simple_strtoul(buffer, &endp, 10);
>> + Â Â if ((*endp != ' ') && (*endp != '\0'))
>> + Â Â Â Â Â Â return -EINVAL;
>> + Â Â buffer = endp + 1;
>> +
>> + Â Â event = kzalloc(sizeof(*event), GFP_KERNEL);
>> + Â Â if (!event)
>> + Â Â Â Â Â Â return -ENOMEM;
>> + Â Â event->cgrp = cont;
>> + Â Â INIT_LIST_HEAD(&event->list);
>> + Â Â init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
>> + Â Â init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
>> + Â Â INIT_WORK(&event->remove, cgroup_event_remove_work);
>> +
>> + Â Â efile = eventfd_fget(efd);
>> + Â Â if (IS_ERR(efile)) {
>> + Â Â Â Â Â Â ret = PTR_ERR(efile);
>> + Â Â Â Â Â Â goto fail;
>> + Â Â }
>> +
>> + Â Â event->eventfd = eventfd_ctx_fileget(efile);
>> + Â Â if (IS_ERR(event->eventfd)) {
>> + Â Â Â Â Â Â ret = PTR_ERR(event->eventfd);
>> + Â Â Â Â Â Â goto fail;
>> + Â Â }
>> +
>> + Â Â cfile = fget(cfd);
>> + Â Â if (!cfile) {
>> + Â Â Â Â Â Â ret = -EBADF;
>> + Â Â Â Â Â Â goto fail;
>> + Â Â }
>> +
>> + Â Â /* the process need read permission on control file */
>> + Â Â ret = file_permission(cfile, MAY_READ);
>> + Â Â if (ret < 0)
>> + Â Â Â Â Â Â goto fail;
>> +
>> + Â Â event->cft = __file_cft(cfile);
>> + Â Â if (IS_ERR(event->cft)) {
>> + Â Â Â Â Â Â ret = PTR_ERR(event->cft);
>> + Â Â Â Â Â Â goto fail;
>> + Â Â }
>> +
>> + Â Â if (!event->cft->register_event || !event->cft->unregister_event) {
>> + Â Â Â Â Â Â ret = -EINVAL;
>> + Â Â Â Â Â Â goto fail;
>> + Â Â }
>> +
>> + Â Â ret = event->cft->register_event(cont, event->cft,
>> + Â Â Â Â Â Â Â Â Â Â event->eventfd, buffer);
>> + Â Â if (ret)
>> + Â Â Â Â Â Â goto fail;
>> +
>> + Â Â efile->f_op->poll(efile, &event->pt);
>> +
>> + Â Â mutex_lock(&cont->event_list_mutex);
>> + Â Â list_add(&event->list, &cont->event_list);
>> + Â Â mutex_unlock(&cont->event_list_mutex);
>> +
>> + Â Â fput(cfile);
>> + Â Â fput(efile);
>> +
>> + Â Â return 0;
>> +
>> +fail:
>> + Â Â if (!IS_ERR(cfile))
>> + Â Â Â Â Â Â fput(cfile);
>> +
>> + Â Â if (event && event->eventfd && !IS_ERR(event->eventfd))
>> + Â Â Â Â Â Â eventfd_ctx_put(event->eventfd);
>> +
>> + Â Â if (!IS_ERR(efile))
>> + Â Â Â Â Â Â fput(efile);
>> +
>> + Â Â if (event)
>> + Â Â Â Â Â Â kfree(event);
>
> kfree(NULL) is ok

Ok.

>> +
>> + Â Â return ret;
>> +}
>> +
>> Â/*
>> Â * for the common functions, 'private' gives the type of file
>> Â */
>> @@ -2814,6 +3022,11 @@ static struct cftype files[] = {
>> Â Â Â Â Â Â Â .read_u64 = cgroup_read_notify_on_release,
>> Â Â Â Â Â Â Â .write_u64 = cgroup_write_notify_on_release,
>> Â Â Â },
>> + Â Â {
>> + Â Â Â Â Â Â .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
>> + Â Â Â Â Â Â .write_string = cgroup_write_event_control,
>> + Â Â Â Â Â Â .mode = S_IWUGO,
>
> We want this file to be writable to everyone ?

Yes. We check permissions on the control file to be tracked instead: the
process needs read permission on that file, not on cgroup.event_control.

>> + Â Â },
>> Â};
>>
>> Âstatic struct cftype cft_release_agent = {
>

Thank you.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/