Re: [PATCH v2] inotify: Convert to using per-namespace limits

From: Jan Kara
Date: Mon Oct 24 2016 - 03:48:26 EST


On Tue 11-10-16 10:36:22, Nikolay Borisov wrote:
> This patchset converts inotify to using the newly introduced
> per-userns sysctl infrastructure.
>
> Currently the inotify instances/watches are being accounted in the
> user_struct structure. This means that in setups where multiple
> users in unprivileged containers map to the same underlying
> real user (i.e. pointing to the same user_struct) the inotify limits
> are going to be shared as well, allowing one user(or application) to exhaust
> all others limits.
>
> Fix this by switching the inotify sysctls to using the
> per-namespace/per-user limits. This will allow the server admin to
> set sensible global limits, which can further be tuned inside every
> individual user namespace. Additionally, in order to preserve the
> sysctl ABI make the existing inotify instances/watches sysctls
> modify the values of the initial user namespace.
>
> Signed-off-by: Nikolay Borisov <kernel@xxxxxxxx>

I cannot really comment on the user-namespaces sanity of this but from fs
notification framework POV the patch looks fine so feel free to add:

Acked-by: Jan Kara <jack@xxxxxxx>

Honza

> ---
>
> So here is a revised version which retains the existing sysctls,
> and hooks them to the init_user_ns values.
>
> fs/notify/inotify/inotify.h | 17 +++++++++++++++++
> fs/notify/inotify/inotify_fsnotify.c | 6 ++----
> fs/notify/inotify/inotify_user.c | 34 +++++++++++++++++-----------------
> include/linux/fsnotify_backend.h | 3 ++-
> include/linux/sched.h | 4 ----
> include/linux/user_namespace.h | 4 ++++
> kernel/ucount.c | 6 +++++-
> 7 files changed, 47 insertions(+), 27 deletions(-)
>
> diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
> index ed855ef6f077..b5536f8ad3e0 100644
> --- a/fs/notify/inotify/inotify.h
> +++ b/fs/notify/inotify/inotify.h
> @@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group,
> const unsigned char *file_name, u32 cookie);
>
> extern const struct fsnotify_ops inotify_fsnotify_ops;
> +
> +#ifdef CONFIG_INOTIFY_USER
> +static void dec_inotify_instances(struct ucounts *ucounts)
> +{
> + dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
> +}
> +
> +static struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
> +{
> + return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
> +}
> +
> +static void dec_inotify_watches(struct ucounts *ucounts)
> +{
> + dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
> +}
> +#endif
> diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
> index 2cd900c2c737..1e6b3cfc2cfd 100644
> --- a/fs/notify/inotify/inotify_fsnotify.c
> +++ b/fs/notify/inotify/inotify_fsnotify.c
> @@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
> /* ideally the idr is empty and we won't hit the BUG in the callback */
> idr_for_each(&group->inotify_data.idr, idr_callback, group);
> idr_destroy(&group->inotify_data.idr);
> - if (group->inotify_data.user) {
> - atomic_dec(&group->inotify_data.user->inotify_devs);
> - free_uid(group->inotify_data.user);
> - }
> + if (group->inotify_data.ucounts)
> + dec_inotify_instances(group->inotify_data.ucounts);
> }
>
> static void inotify_free_event(struct fsnotify_event *fsn_event)
> diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
> index b8d08d0d0a4d..7d769a824742 100644
> --- a/fs/notify/inotify/inotify_user.c
> +++ b/fs/notify/inotify/inotify_user.c
> @@ -44,10 +44,8 @@
>
> #include <asm/ioctls.h>
>
> -/* these are configurable via /proc/sys/fs/inotify/ */
> -static int inotify_max_user_instances __read_mostly;
> +/* configurable via /proc/sys/fs/inotify/ */
> static int inotify_max_queued_events __read_mostly;
> -static int inotify_max_user_watches __read_mostly;
>
> static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
>
> @@ -60,7 +58,7 @@ static int zero;
> struct ctl_table inotify_table[] = {
> {
> .procname = "max_user_instances",
> - .data = &inotify_max_user_instances,
> + .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
> .maxlen = sizeof(int),
> .mode = 0644,
> .proc_handler = proc_dointvec_minmax,
> @@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = {
> },
> {
> .procname = "max_user_watches",
> - .data = &inotify_max_user_watches,
> + .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
> .maxlen = sizeof(int),
> .mode = 0644,
> .proc_handler = proc_dointvec_minmax,
> @@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
> /* remove this mark from the idr */
> inotify_remove_from_idr(group, i_mark);
>
> - atomic_dec(&group->inotify_data.user->inotify_watches);
> + dec_inotify_watches(group->inotify_data.ucounts);
> }
>
> /* ding dong the mark is dead */
> @@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
> tmp_i_mark->fsn_mark.mask = mask;
> tmp_i_mark->wd = -1;
>
> - ret = -ENOSPC;
> - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
> - goto out_err;
> -
> ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
> if (ret)
> goto out_err;
>
> + /* increment the number of watches the user has */
> + if (!inc_inotify_watches(group->inotify_data.ucounts)) {
> + inotify_remove_from_idr(group, tmp_i_mark);
> + ret = -ENOSPC;
> + goto out_err;
> + }
> +
> /* we are on the idr, now get on the inode */
> ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
> NULL, 0);
> @@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
> goto out_err;
> }
>
> - /* increment the number of watches the user has */
> - atomic_inc(&group->inotify_data.user->inotify_watches);
>
> /* return the watch descriptor for this new mark */
> ret = tmp_i_mark->wd;
> @@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
>
> spin_lock_init(&group->inotify_data.idr_lock);
> idr_init(&group->inotify_data.idr);
> - group->inotify_data.user = get_current_user();
> + group->inotify_data.ucounts = inc_ucount(current_user_ns(),
> + current_euid(),
> + UCOUNT_INOTIFY_INSTANCES);
>
> - if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
> - inotify_max_user_instances) {
> + if (!group->inotify_data.ucounts) {
> fsnotify_destroy_group(group);
> return ERR_PTR(-EMFILE);
> }
> @@ -819,8 +819,8 @@ static int __init inotify_user_setup(void)
> inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
>
> inotify_max_queued_events = 16384;
> - inotify_max_user_instances = 128;
> - inotify_max_user_watches = 8192;
> + init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
> + init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;
>
> return 0;
> }
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 58205f33af02..892959de1162 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -16,6 +16,7 @@
> #include <linux/spinlock.h>
> #include <linux/types.h>
> #include <linux/atomic.h>
> +#include <linux/user_namespace.h>
>
> /*
> * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
> @@ -169,7 +170,7 @@ struct fsnotify_group {
> struct inotify_group_private_data {
> spinlock_t idr_lock;
> struct idr idr;
> - struct user_struct *user;
> + struct ucounts *ucounts;
> } inotify_data;
> #endif
> #ifdef CONFIG_FANOTIFY
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 62c68e513e39..35230a2c73ac 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -840,10 +840,6 @@ struct user_struct {
> atomic_t __count; /* reference count */
> atomic_t processes; /* How many processes does this user have? */
> atomic_t sigpending; /* How many pending signals does this user have? */
> -#ifdef CONFIG_INOTIFY_USER
> - atomic_t inotify_watches; /* How many inotify watches does this user have? */
> - atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
> -#endif
> #ifdef CONFIG_FANOTIFY
> atomic_t fanotify_listeners;
> #endif
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index eb209d4523f5..cbcac7a5ec41 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -32,6 +32,10 @@ enum ucount_type {
> UCOUNT_NET_NAMESPACES,
> UCOUNT_MNT_NAMESPACES,
> UCOUNT_CGROUP_NAMESPACES,
> +#ifdef CONFIG_INOTIFY_USER
> + UCOUNT_INOTIFY_INSTANCES,
> + UCOUNT_INOTIFY_WATCHES,
> +#endif
> UCOUNT_COUNTS,
> };
>
> diff --git a/kernel/ucount.c b/kernel/ucount.c
> index 9d20d5dd298a..a6bcea47306b 100644
> --- a/kernel/ucount.c
> +++ b/kernel/ucount.c
> @@ -57,7 +57,7 @@ static struct ctl_table_root set_root = {
>
> static int zero = 0;
> static int int_max = INT_MAX;
> -#define UCOUNT_ENTRY(name) \
> +#define UCOUNT_ENTRY(name) \
> { \
> .procname = name, \
> .maxlen = sizeof(int), \
> @@ -74,6 +74,10 @@ static struct ctl_table user_table[] = {
> UCOUNT_ENTRY("max_net_namespaces"),
> UCOUNT_ENTRY("max_mnt_namespaces"),
> UCOUNT_ENTRY("max_cgroup_namespaces"),
> +#ifdef CONFIG_INOTIFY_USER_
> + UCOUNT_ENTRY("max_inotify_instances"),
> + UCOUNT_ENTRY("max_inotify_watches"),
> +#endif
> { }
> };
> #endif /* CONFIG_SYSCTL */
> --
> 2.5.0
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR