[PATCH] inotify: Convert to using per-namespace limits

From: Nikolay Borisov
Date: Fri Oct 07 2016 - 06:46:27 EST


This patchset converts inotify to using the newly introduced
per-userns sysctl infrastructure.

Currently the inotify instances/watches are being accounted in the
user_struct structure. This means that in setups where multiple
users in unprivileged containers map to the same underlying
real user (i.e. pointing to the same user_struct) the inotify limits
are going to be shared as well, allowing one user(or application) to exhaust
all others limits.

Fix this by switching the inotify sysctls to using the
per-namespace/per-user limits. This will allow the server admin to
set sensible global limits, which can further be tuned inside every
individual user namespace.

Signed-off-by: Nikolay Borisov <kernel@xxxxxxxx>
---
Hello Eric,

I saw you've finally sent your pull request for 4.9 and it
includes your implementatino of the ucount infrastructure. So
here is my respin of the inotify patches using that.

fs/notify/inotify/inotify.h | 17 +++++++++++++
fs/notify/inotify/inotify_fsnotify.c | 6 ++---
fs/notify/inotify/inotify_user.c | 46 ++++++++++++------------------------
include/linux/fsnotify_backend.h | 3 ++-
include/linux/sched.h | 4 ----
include/linux/user_namespace.h | 4 ++++
kernel/ucount.c | 6 ++++-
7 files changed, 45 insertions(+), 41 deletions(-)

diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index ed855ef6f077..b5536f8ad3e0 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group,
const unsigned char *file_name, u32 cookie);

extern const struct fsnotify_ops inotify_fsnotify_ops;
+
+#ifdef CONFIG_INOTIFY_USER
+static void dec_inotify_instances(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
+}
+
+static struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
+{
+ return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
+}
+
+static void dec_inotify_watches(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
+}
+#endif
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 2cd900c2c737..1e6b3cfc2cfd 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
/* ideally the idr is empty and we won't hit the BUG in the callback */
idr_for_each(&group->inotify_data.idr, idr_callback, group);
idr_destroy(&group->inotify_data.idr);
- if (group->inotify_data.user) {
- atomic_dec(&group->inotify_data.user->inotify_devs);
- free_uid(group->inotify_data.user);
- }
+ if (group->inotify_data.ucounts)
+ dec_inotify_instances(group->inotify_data.ucounts);
}

static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index b8d08d0d0a4d..cfe48bb511c6 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -44,10 +44,8 @@

#include <asm/ioctls.h>

-/* these are configurable via /proc/sys/fs/inotify/ */
-static int inotify_max_user_instances __read_mostly;
+/* configurable via /proc/sys/fs/inotify/ */
static int inotify_max_queued_events __read_mostly;
-static int inotify_max_user_watches __read_mostly;

static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;

@@ -59,22 +57,6 @@ static int zero;

struct ctl_table inotify_table[] = {
{
- .procname = "max_user_instances",
- .data = &inotify_max_user_instances,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- },
- {
- .procname = "max_user_watches",
- .data = &inotify_max_user_watches,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- },
- {
.procname = "max_queued_events",
.data = &inotify_max_queued_events,
.maxlen = sizeof(int),
@@ -500,7 +482,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
/* remove this mark from the idr */
inotify_remove_from_idr(group, i_mark);

- atomic_dec(&group->inotify_data.user->inotify_watches);
+ dec_inotify_watches(group->inotify_data.ucounts);
}

/* ding dong the mark is dead */
@@ -584,14 +566,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
tmp_i_mark->fsn_mark.mask = mask;
tmp_i_mark->wd = -1;

- ret = -ENOSPC;
- if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
- goto out_err;
-
ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
if (ret)
goto out_err;

+ /* increment the number of watches the user has */
+ if (!inc_inotify_watches(group->inotify_data.ucounts)) {
+ inotify_remove_from_idr(group, tmp_i_mark);
+ ret = -ENOSPC;
+ goto out_err;
+ }
+
/* we are on the idr, now get on the inode */
ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
NULL, 0);
@@ -601,8 +586,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
goto out_err;
}

- /* increment the number of watches the user has */
- atomic_inc(&group->inotify_data.user->inotify_watches);

/* return the watch descriptor for this new mark */
ret = tmp_i_mark->wd;
@@ -653,10 +636,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)

spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
- group->inotify_data.user = get_current_user();
+ group->inotify_data.ucounts = inc_ucount(current_user_ns(),
+ current_euid(),
+ UCOUNT_INOTIFY_INSTANCES);

- if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
- inotify_max_user_instances) {
+ if (!group->inotify_data.ucounts) {
fsnotify_destroy_group(group);
return ERR_PTR(-EMFILE);
}
@@ -819,8 +803,8 @@ static int __init inotify_user_setup(void)
inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);

inotify_max_queued_events = 16384;
- inotify_max_user_instances = 128;
- inotify_max_user_watches = 8192;
+ init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
+ init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;

return 0;
}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 58205f33af02..892959de1162 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -16,6 +16,7 @@
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
+#include <linux/user_namespace.h>

/*
* IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
@@ -169,7 +170,7 @@ struct fsnotify_group {
struct inotify_group_private_data {
spinlock_t idr_lock;
struct idr idr;
- struct user_struct *user;
+ struct ucounts *ucounts;
} inotify_data;
#endif
#ifdef CONFIG_FANOTIFY
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62c68e513e39..35230a2c73ac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -840,10 +840,6 @@ struct user_struct {
atomic_t __count; /* reference count */
atomic_t processes; /* How many processes does this user have? */
atomic_t sigpending; /* How many pending signals does this user have? */
-#ifdef CONFIG_INOTIFY_USER
- atomic_t inotify_watches; /* How many inotify watches does this user have? */
- atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
-#endif
#ifdef CONFIG_FANOTIFY
atomic_t fanotify_listeners;
#endif
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index eb209d4523f5..cbcac7a5ec41 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,10 @@ enum ucount_type {
UCOUNT_NET_NAMESPACES,
UCOUNT_MNT_NAMESPACES,
UCOUNT_CGROUP_NAMESPACES,
+#ifdef CONFIG_INOTIFY_USER
+ UCOUNT_INOTIFY_INSTANCES,
+ UCOUNT_INOTIFY_WATCHES,
+#endif
UCOUNT_COUNTS,
};

diff --git a/kernel/ucount.c b/kernel/ucount.c
index 9d20d5dd298a..a6bcea47306b 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -57,7 +57,7 @@ static struct ctl_table_root set_root = {

static int zero = 0;
static int int_max = INT_MAX;
-#define UCOUNT_ENTRY(name) \
+#define UCOUNT_ENTRY(name) \
{ \
.procname = name, \
.maxlen = sizeof(int), \
@@ -74,6 +74,10 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_net_namespaces"),
UCOUNT_ENTRY("max_mnt_namespaces"),
UCOUNT_ENTRY("max_cgroup_namespaces"),
+#ifdef CONFIG_INOTIFY_USER_
+ UCOUNT_ENTRY("max_inotify_instances"),
+ UCOUNT_ENTRY("max_inotify_watches"),
+#endif
{ }
};
#endif /* CONFIG_SYSCTL */
--
2.5.0