[PATCH] namespaces: add transparent user namespaces

From: Jann Horn
Date: Thu Jun 23 2016 - 07:12:46 EST


This allows the admin of a user namespace to mark the namespace as
transparent. All other namespaces, by default, are opaque.

While the current behavior of user namespaces is appropriate for use in
containers, there are many programs that only use user namespaces because
doing so enables them to do other things (e.g. unsharing the mount or
network namespace) that require namespaced capabilities. For them, the
inability to see the real UIDs and GIDs of things from inside the user
namespace can be very annoying.

In a transparent namespace, all UIDs and GIDs that are mapped into its
first opaque ancestor are visible and are not remapped. This means that if
a process e.g. stat()s the real root directory in a namespace, it will
still see it as owned by UID 0.

Traditionally, any UID or GID that was visible in a user namespace was also
mapped into the namespace, giving the namespace admin full access to it.
This patch introduces a distinction: In a transparent namespace, UIDs and
GIDs can be visible without being mapped. Non-mapped, visible UIDs can be
passed from the kernel to userspace, but userspace can't send them back to
the kernel. In order to be able to fully use specific UIDs/GIDs and gain
privileges over them, mappings need to be set up in the usual way -
however, to avoid aliasing problems, only identity mappings are permitted.

I have gone through all callers of from_kuid() and from_kgid(), and as far
as I can tell, kuid_has_mapping() and kgid_has_mapping() were the only
functions that used them for privilege checks. (The keys subsystem uses
them in an insecure way, and that issue has been known for a while, but my
patch doesn't make that any more vulnerable than it already is.)

Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
---
fs/proc/base.c | 28 ++++++--
include/linux/uidgid.h | 16 ++++-
include/linux/user_namespace.h | 4 ++
kernel/user.c | 1 +
kernel/user_namespace.c | 152 +++++++++++++++++++++++++++++++++++++++--
5 files changed, 191 insertions(+), 10 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index a11eb71..c521c51 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2744,7 +2744,8 @@ static const struct file_operations proc_projid_map_operations = {
.release = proc_id_map_release,
};

-static int proc_setgroups_open(struct inode *inode, struct file *file)
+static int proc_nsadmin_open(struct inode *inode, struct file *file,
+ int (*show)(struct seq_file *, void *))
{
struct user_namespace *ns = NULL;
struct task_struct *task;
@@ -2767,7 +2768,7 @@ static int proc_setgroups_open(struct inode *inode, struct file *file)
goto err_put_ns;
}

- ret = single_open(file, &proc_setgroups_show, ns);
+ ret = single_open(file, show, ns);
if (ret)
goto err_put_ns;

@@ -2778,7 +2779,7 @@ err:
return ret;
}

-static int proc_setgroups_release(struct inode *inode, struct file *file)
+static int proc_nsadmin_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
struct user_namespace *ns = seq->private;
@@ -2787,12 +2788,30 @@ static int proc_setgroups_release(struct inode *inode, struct file *file)
return ret;
}

+static int proc_setgroups_open(struct inode *inode, struct file *file)
+{
+ return proc_nsadmin_open(inode, file, &proc_setgroups_show);
+}
+
static const struct file_operations proc_setgroups_operations = {
.open = proc_setgroups_open,
.write = proc_setgroups_write,
.read = seq_read,
.llseek = seq_lseek,
- .release = proc_setgroups_release,
+ .release = proc_nsadmin_release,
+};
+
+static int proc_transparent_open(struct inode *inode, struct file *file)
+{
+ return proc_nsadmin_open(inode, file, &proc_transparent_show);
+}
+
+static const struct file_operations proc_transparent_operations = {
+ .open = proc_transparent_open,
+ .write = proc_transparent_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_nsadmin_release,
};
#endif /* CONFIG_USER_NS */

@@ -2901,6 +2920,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
+ REG("transparent", S_IRUGO|S_IWUSR, proc_transparent_operations),
#endif
#ifdef CONFIG_CHECKPOINT_RESTORE
REG("timers", S_IRUGO, proc_timers_operations),
diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index 0383552..2908d40 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -124,17 +124,19 @@ extern kgid_t make_kgid(struct user_namespace *from, gid_t gid);

extern uid_t from_kuid(struct user_namespace *to, kuid_t uid);
extern gid_t from_kgid(struct user_namespace *to, kgid_t gid);
+extern uid_t from_kuid_opaque(struct user_namespace *to, kuid_t uid);
+extern gid_t from_kgid_opaque(struct user_namespace *to, kgid_t gid);
extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid);
extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid);

static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
{
- return from_kuid(ns, uid) != (uid_t) -1;
+ return from_kuid_opaque(ns, uid) != (uid_t) -1;
}

static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
{
- return from_kgid(ns, gid) != (gid_t) -1;
+ return from_kgid_opaque(ns, gid) != (gid_t) -1;
}

#else
@@ -159,6 +161,16 @@ static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid)
return __kgid_val(kgid);
}

+static inline uid_t from_kuid_opaque(struct user_namespace *to, kuid_t kuid)
+{
+ return __kuid_val(kuid);
+}
+
+static inline gid_t from_kgid_opaque(struct user_namespace *to, kgid_t kgid)
+{
+ return __kgid_val(kgid);
+}
+
static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid)
{
uid_t uid = from_kuid(to, kuid);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 8297e5b..18291ac 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -28,6 +28,8 @@ struct user_namespace {
struct uid_gid_map projid_map;
atomic_t count;
struct user_namespace *parent;
+ /* self for normal ns; first opaque parent for transparent ns */
+ struct user_namespace *opaque;
int level;
kuid_t owner;
kgid_t group;
@@ -71,6 +73,8 @@ extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, lo
extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
extern int proc_setgroups_show(struct seq_file *m, void *v);
+extern ssize_t proc_transparent_write(struct file *, const char __user *, size_t, loff_t *);
+extern int proc_transparent_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
#else

diff --git a/kernel/user.c b/kernel/user.c
index b069ccb..e1fd9e5 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -48,6 +48,7 @@ struct user_namespace init_user_ns = {
},
},
.count = ATOMIC_INIT(3),
+ .opaque = &init_user_ns,
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
.ns.inum = PROC_USER_INIT_INO,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc21..da329a1 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -98,6 +98,7 @@ int create_user_ns(struct cred *new)
atomic_set(&ns->count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
+ ns->opaque = ns;
ns->level = parent_ns->level + 1;
ns->owner = owner;
ns->group = group;
@@ -251,18 +252,46 @@ EXPORT_SYMBOL(make_kuid);
* Map @kuid into the user-namespace specified by @targ and
* return the resulting uid.
*
+ * This function is *not* appropriate for security checks because
+ * if @targ is transparent, the mappings of an ancestor namespace
+ * are used. If @targ isn't &init_user_ns and you intend to do
+ * anything with the result apart from returning it to a process
+ * in @targ, you might want to use from_kuid_opaque() instead.
+ *
* There is always a mapping into the initial user_namespace.
*
- * If @kuid has no mapping in @targ (uid_t)-1 is returned.
+ * If @kuid is not visible in @targ (uid_t)-1 is returned.
*/
uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
{
/* Map the uid from a global kernel uid */
- return map_id_up(&targ->uid_map, __kuid_val(kuid));
+ struct user_namespace *opaque = READ_ONCE(targ->opaque);
+
+ return map_id_up(&opaque->uid_map, __kuid_val(kuid));
}
EXPORT_SYMBOL(from_kuid);

/**
+ * from_kuid_opaque - Create a uid from a kuid user-namespace pair.
+ * @targ: The user namespace we want a uid in.
+ * @kuid: The kernel internal uid to start with.
+ *
+ * Map @kuid into the user-namespace specified by @targ and
+ * return the resulting uid. This ignores transparent user
+ * namespaces and is therefore appropriate for security checks.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kuid has no mapping in @targ (uid_t)-1 is returned.
+ */
+uid_t from_kuid_opaque(struct user_namespace *targ, kuid_t kuid)
+{
+ /* Map the uid from a global kernel uid */
+ return map_id_up(&targ->uid_map, __kuid_val(kuid));
+}
+EXPORT_SYMBOL(from_kuid_opaque);
+
+/**
* from_kuid_munged - Create a uid from a kuid user-namespace pair.
* @targ: The user namespace we want a uid in.
* @kuid: The kernel internal uid to start with.
@@ -319,18 +348,46 @@ EXPORT_SYMBOL(make_kgid);
* Map @kgid into the user-namespace specified by @targ and
* return the resulting gid.
*
+ * This function is *not* appropriate for security checks because
+ * if @targ is transparent, the mappings of an ancestor namespace
+ * are used. If @targ isn't &init_user_ns and you intend to do
+ * anything with the result apart from returning it to a process
+ * in @targ, you might want to use from_kgid_opaque() instead.
+ *
* There is always a mapping into the initial user_namespace.
*
- * If @kgid has no mapping in @targ (gid_t)-1 is returned.
+ * If @kgid is not visible in @targ (gid_t)-1 is returned.
*/
gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
{
/* Map the gid from a global kernel gid */
- return map_id_up(&targ->gid_map, __kgid_val(kgid));
+ struct user_namespace *opaque = READ_ONCE(targ->opaque);
+
+ return map_id_up(&opaque->gid_map, __kgid_val(kgid));
}
EXPORT_SYMBOL(from_kgid);

/**
+ * from_kgid_opaque - Create a gid from a kgid user-namespace pair.
+ * @targ: The user namespace we want a gid in.
+ * @kgid: The kernel internal gid to start with.
+ *
+ * Map @kgid into the user-namespace specified by @targ and
+ * return the resulting gid. This ignores transparent user
+ * namespaces and is therefore appropriate for security checks.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kgid has no mapping in @targ (gid_t)-1 is returned.
+ */
+gid_t from_kgid_opaque(struct user_namespace *targ, kgid_t kgid)
+{
+ /* Map the gid from a global kernel gid */
+ return map_id_up(&targ->gid_map, __kgid_val(kgid));
+}
+EXPORT_SYMBOL(from_kgid_opaque);
+
+/**
* from_kgid_munged - Create a gid from a kgid user-namespace pair.
* @targ: The user namespace we want a gid in.
* @kgid: The kernel internal gid to start with.
@@ -811,6 +868,18 @@ static bool new_idmap_permitted(const struct file *file,
struct uid_gid_map *new_map)
{
const struct cred *cred = file->f_cred;
+ unsigned int idx;
+
+ /* Don't allow non-identity mappings in transparent namespaces. */
+ if (ns != ns->opaque) {
+ for (idx = 0; idx < new_map->nr_extents; idx++) {
+ struct uid_gid_extent *ext = &new_map->extent[idx];
+
+ if (ext->first != ext->lower_first)
+ return false;
+ }
+ }
+
/* Don't allow mappings that would allow anything that wouldn't
* be allowed without the establishment of unprivileged mappings.
*/
@@ -922,6 +991,81 @@ out_unlock:
goto out;
}

+int proc_transparent_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ struct user_namespace *opaque = READ_ONCE(ns->opaque);
+
+ seq_printf(seq, "%d\n", (ns == opaque) ? 0 : 1);
+ return 0;
+}
+
+ssize_t proc_transparent_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ char kbuf[8], *pos;
+ bool transparent;
+ ssize_t ret;
+
+ /* Only allow a very narrow range of strings to be written */
+ ret = -EINVAL;
+ if ((*ppos != 0) || (count >= sizeof(kbuf)))
+ goto out;
+
+ /* What was written? */
+ ret = -EFAULT;
+ if (copy_from_user(kbuf, buf, count))
+ goto out;
+ kbuf[count] = '\0';
+ pos = kbuf;
+
+ /* What is being requested? */
+ ret = -EINVAL;
+ if (pos[0] == '1') {
+ pos += 1;
+ transparent = true;
+ } else if (pos[0] == '0') {
+ pos += 1;
+ transparent = false;
+ } else
+ goto out;
+
+ /* Verify there is not trailing junk on the line */
+ pos = skip_spaces(pos);
+ if (*pos != '\0')
+ goto out;
+
+ ret = -EPERM;
+ mutex_lock(&userns_state_mutex);
+ /* Is the requested state different from the current one? */
+ if (transparent != (ns->opaque != ns)) {
+ /* You can't turn off transparent mode. */
+ if (!transparent)
+ goto out_unlock;
+ /* If there are existing mappings, they might be
+ * non-identity mappings. Therefore, block transparent
+ * mode. This also prevents making the init namespace
+ * transparent (which wouldn't work).
+ */
+ if (ns->uid_map.nr_extents != 0 || ns->gid_map.nr_extents != 0)
+ goto out_unlock;
+ /* Okay! Make the namespace transparent. */
+ ns->opaque = ns->parent->opaque;
+ }
+ mutex_unlock(&userns_state_mutex);
+
+ /* Report a successful write */
+ *ppos = count;
+ ret = count;
+out:
+ return ret;
+out_unlock:
+ mutex_unlock(&userns_state_mutex);
+ goto out;
+}
+
bool userns_may_setgroups(const struct user_namespace *ns)
{
bool allowed;
--
2.8.0.rc3.226.g39d4020