Re: [PATCH] namespaces: add transparent user namespaces

From: Eric W. Biederman
Date: Thu Jun 23 2016 - 15:04:29 EST


Jann Horn <jannh@xxxxxxxxxx> writes:

> This allows the admin of a user namespace to mark the namespace as
> transparent. All other namespaces, by default, are opaque.
>
> While the current behavior of user namespaces is appropriate for use in
> containers, there are many programs that only use user namespaces because
> doing so enables them to do other things (e.g. unsharing the mount or
> network namespace) that require namespaced capabilities. For them, the
> inability to see the real UIDs and GIDs of things from inside the user
> namespace can be very annoying.
>
> In a transparent namespace, all UIDs and GIDs that are mapped into its
> first opaque ancestor are visible and are not remapped. This means that if
> a process e.g. stat()s the real root directory in a namespace, it will
> still see it as owned by UID 0.
>
> Traditionally, any UID or GID that was visible in a user namespace was also
> mapped into the namespace, giving the namespace admin full access to it.
> This patch introduces a distinction: In a transparent namespace, UIDs and
> GIDs can be visible without being mapped. Non-mapped, visible UIDs can be
> passed from the kernel to userspace, but userspace can't send them back to
> the kernel. In order to be able to fully use specific UIDs/GIDs and gain
> privileges over them, mappings need to be set up in the usual way -
> however, to avoid aliasing problems, only identity mappings are permitted.
>
> I have gone through all callers of from_kuid() and from_kgid(), and as far
> as I can tell, kuid_has_mapping() and kgid_has_mapping() were the only
> functions that used them for privilege checks. (The keys subsystem uses
> them in an insecure way, and that issue has been known for a while, but my
> patch doesn't make that any more vulnerable than it already is.

Perhaps it has been known for a while but no one has stopped and
mentioned it to me. What questionable thing is the keys subsystem
doing?

>)

This is a bigish change in semantics and I am going to have to digest
this before I can give this an ok.

Quite frankly at the base it scares me.


If this is just about presentation and allowing some information from
the parent user namespace I would be much happier if it was not
from_kuid but that you modified, but if you instead you had a function
say from_kuid_transparent, that performed the transformation you need
and was only used in those places it is safe.

I think I could reason about that.

As your patchset sits I can not reason about the change in semantics,
because without a large grep of the source I don't know what you are
changing.

And you are dramatically changing the semantics the semantics of
from_kuid to the point I do believe we need to inspepect all of the call
sites. As such I really don't think it makes sense to reuse the
existing name for your new semantics.

Eric

> Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
> ---
> fs/proc/base.c | 28 ++++++--
> include/linux/uidgid.h | 16 ++++-
> include/linux/user_namespace.h | 4 ++
> kernel/user.c | 1 +
> kernel/user_namespace.c | 152 +++++++++++++++++++++++++++++++++++++++--
> 5 files changed, 191 insertions(+), 10 deletions(-)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index a11eb71..c521c51 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2744,7 +2744,8 @@ static const struct file_operations proc_projid_map_operations = {
> .release = proc_id_map_release,
> };
>
> -static int proc_setgroups_open(struct inode *inode, struct file *file)
> +static int proc_nsadmin_open(struct inode *inode, struct file *file,
> + int (*show)(struct seq_file *, void *))
> {
> struct user_namespace *ns = NULL;
> struct task_struct *task;
> @@ -2767,7 +2768,7 @@ static int proc_setgroups_open(struct inode *inode, struct file *file)
> goto err_put_ns;
> }
>
> - ret = single_open(file, &proc_setgroups_show, ns);
> + ret = single_open(file, show, ns);
> if (ret)
> goto err_put_ns;
>
> @@ -2778,7 +2779,7 @@ err:
> return ret;
> }
>
> -static int proc_setgroups_release(struct inode *inode, struct file *file)
> +static int proc_nsadmin_release(struct inode *inode, struct file *file)
> {
> struct seq_file *seq = file->private_data;
> struct user_namespace *ns = seq->private;
> @@ -2787,12 +2788,30 @@ static int proc_setgroups_release(struct inode *inode, struct file *file)
> return ret;
> }
>
> +static int proc_setgroups_open(struct inode *inode, struct file *file)
> +{
> + return proc_nsadmin_open(inode, file, &proc_setgroups_show);
> +}
> +
> static const struct file_operations proc_setgroups_operations = {
> .open = proc_setgroups_open,
> .write = proc_setgroups_write,
> .read = seq_read,
> .llseek = seq_lseek,
> - .release = proc_setgroups_release,
> + .release = proc_nsadmin_release,
> +};
> +
> +static int proc_transparent_open(struct inode *inode, struct file *file)
> +{
> + return proc_nsadmin_open(inode, file, &proc_transparent_show);
> +}
> +
> +static const struct file_operations proc_transparent_operations = {
> + .open = proc_transparent_open,
> + .write = proc_transparent_write,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = proc_nsadmin_release,
> };
> #endif /* CONFIG_USER_NS */
>
> @@ -2901,6 +2920,7 @@ static const struct pid_entry tgid_base_stuff[] = {
> REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
> REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
> REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
> + REG("transparent", S_IRUGO|S_IWUSR, proc_transparent_operations),
> #endif
> #ifdef CONFIG_CHECKPOINT_RESTORE
> REG("timers", S_IRUGO, proc_timers_operations),
> diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
> index 0383552..2908d40 100644
> --- a/include/linux/uidgid.h
> +++ b/include/linux/uidgid.h
> @@ -124,17 +124,19 @@ extern kgid_t make_kgid(struct user_namespace *from, gid_t gid);
>
> extern uid_t from_kuid(struct user_namespace *to, kuid_t uid);
> extern gid_t from_kgid(struct user_namespace *to, kgid_t gid);
> +extern uid_t from_kuid_opaque(struct user_namespace *to, kuid_t uid);
> +extern gid_t from_kgid_opaque(struct user_namespace *to, kgid_t gid);
> extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid);
> extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid);
>
> static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
> {
> - return from_kuid(ns, uid) != (uid_t) -1;
> + return from_kuid_opaque(ns, uid) != (uid_t) -1;
> }
>
> static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
> {
> - return from_kgid(ns, gid) != (gid_t) -1;
> + return from_kgid_opaque(ns, gid) != (gid_t) -1;
> }
>
> #else
> @@ -159,6 +161,16 @@ static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid)
> return __kgid_val(kgid);
> }
>
> +static inline uid_t from_kuid_opaque(struct user_namespace *to, kuid_t kuid)
> +{
> + return __kuid_val(kuid);
> +}
> +
> +static inline gid_t from_kgid_opaque(struct user_namespace *to, kgid_t kgid)
> +{
> + return __kgid_val(kgid);
> +}
> +
> static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid)
> {
> uid_t uid = from_kuid(to, kuid);
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 8297e5b..18291ac 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -28,6 +28,8 @@ struct user_namespace {
> struct uid_gid_map projid_map;
> atomic_t count;
> struct user_namespace *parent;
> + /* self for normal ns; first opaque parent for transparent ns */
> + struct user_namespace *opaque;
> int level;
> kuid_t owner;
> kgid_t group;
> @@ -71,6 +73,8 @@ extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, lo
> extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
> extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
> extern int proc_setgroups_show(struct seq_file *m, void *v);
> +extern ssize_t proc_transparent_write(struct file *, const char __user *, size_t, loff_t *);
> +extern int proc_transparent_show(struct seq_file *m, void *v);
> extern bool userns_may_setgroups(const struct user_namespace *ns);
> #else
>
> diff --git a/kernel/user.c b/kernel/user.c
> index b069ccb..e1fd9e5 100644
> --- a/kernel/user.c
> +++ b/kernel/user.c
> @@ -48,6 +48,7 @@ struct user_namespace init_user_ns = {
> },
> },
> .count = ATOMIC_INIT(3),
> + .opaque = &init_user_ns,
> .owner = GLOBAL_ROOT_UID,
> .group = GLOBAL_ROOT_GID,
> .ns.inum = PROC_USER_INIT_INO,
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index 9bafc21..da329a1 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -98,6 +98,7 @@ int create_user_ns(struct cred *new)
> atomic_set(&ns->count, 1);
> /* Leave the new->user_ns reference with the new user namespace. */
> ns->parent = parent_ns;
> + ns->opaque = ns;
> ns->level = parent_ns->level + 1;
> ns->owner = owner;
> ns->group = group;
> @@ -251,18 +252,46 @@ EXPORT_SYMBOL(make_kuid);
> * Map @kuid into the user-namespace specified by @targ and
> * return the resulting uid.
> *
> + * This function is *not* appropriate for security checks because
> + * if @targ is transparent, the mappings of an ancestor namespace
> + * are used. If @targ isn't &init_user_ns and you intend to do
> + * anything with the result apart from returning it to a process
> + * in @targ, you might want to use from_kuid_opaque() instead.
> + *
> * There is always a mapping into the initial user_namespace.
> *
> - * If @kuid has no mapping in @targ (uid_t)-1 is returned.
> + * If @kuid is not visible in @targ (uid_t)-1 is returned.
> */
> uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
> {
> /* Map the uid from a global kernel uid */
> - return map_id_up(&targ->uid_map, __kuid_val(kuid));
> + struct user_namespace *opaque = READ_ONCE(targ->opaque);
> +
> + return map_id_up(&opaque->uid_map, __kuid_val(kuid));
> }
> EXPORT_SYMBOL(from_kuid);
>
> /**
> + * from_kuid_opaque - Create a uid from a kuid user-namespace pair.
> + * @targ: The user namespace we want a uid in.
> + * @kuid: The kernel internal uid to start with.
> + *
> + * Map @kuid into the user-namespace specified by @targ and
> + * return the resulting uid. This ignores transparent user
> + * namespaces and is therefore appropriate for security checks.
> + *
> + * There is always a mapping into the initial user_namespace.
> + *
> + * If @kuid has no mapping in @targ (uid_t)-1 is returned.
> + */
> +uid_t from_kuid_opaque(struct user_namespace *targ, kuid_t kuid)
> +{
> + /* Map the uid from a global kernel uid */
> + return map_id_up(&targ->uid_map, __kuid_val(kuid));
> +}
> +EXPORT_SYMBOL(from_kuid_opaque);
> +
> +/**
> * from_kuid_munged - Create a uid from a kuid user-namespace pair.
> * @targ: The user namespace we want a uid in.
> * @kuid: The kernel internal uid to start with.
> @@ -319,18 +348,46 @@ EXPORT_SYMBOL(make_kgid);
> * Map @kgid into the user-namespace specified by @targ and
> * return the resulting gid.
> *
> + * This function is *not* appropriate for security checks because
> + * if @targ is transparent, the mappings of an ancestor namespace
> + * are used. If @targ isn't &init_user_ns and you intend to do
> + * anything with the result apart from returning it to a process
> + * in @targ, you might want to use from_kgid_opaque() instead.
> + *
> * There is always a mapping into the initial user_namespace.
> *
> - * If @kgid has no mapping in @targ (gid_t)-1 is returned.
> + * If @kgid is not visible in @targ (gid_t)-1 is returned.
> */
> gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
> {
> /* Map the gid from a global kernel gid */
> - return map_id_up(&targ->gid_map, __kgid_val(kgid));
> + struct user_namespace *opaque = READ_ONCE(targ->opaque);
> +
> + return map_id_up(&opaque->gid_map, __kgid_val(kgid));
> }
> EXPORT_SYMBOL(from_kgid);
>
> /**
> + * from_kgid_opaque - Create a gid from a kgid user-namespace pair.
> + * @targ: The user namespace we want a gid in.
> + * @kgid: The kernel internal gid to start with.
> + *
> + * Map @kgid into the user-namespace specified by @targ and
> + * return the resulting gid. This ignores transparent user
> + * namespaces and is therefore appropriate for security checks.
> + *
> + * There is always a mapping into the initial user_namespace.
> + *
> + * If @kgid has no mapping in @targ (gid_t)-1 is returned.
> + */
> +gid_t from_kgid_opaque(struct user_namespace *targ, kgid_t kgid)
> +{
> + /* Map the gid from a global kernel gid */
> + return map_id_up(&targ->gid_map, __kgid_val(kgid));
> +}
> +EXPORT_SYMBOL(from_kgid_opaque);
> +
> +/**
> * from_kgid_munged - Create a gid from a kgid user-namespace pair.
> * @targ: The user namespace we want a gid in.
> * @kgid: The kernel internal gid to start with.
> @@ -811,6 +868,18 @@ static bool new_idmap_permitted(const struct file *file,
> struct uid_gid_map *new_map)
> {
> const struct cred *cred = file->f_cred;
> + unsigned int idx;
> +
> + /* Don't allow non-identity mappings in transparent namespaces. */
> + if (ns != ns->opaque) {
> + for (idx = 0; idx < new_map->nr_extents; idx++) {
> + struct uid_gid_extent *ext = &new_map->extent[idx];
> +
> + if (ext->first != ext->lower_first)
> + return false;
> + }
> + }
> +
> /* Don't allow mappings that would allow anything that wouldn't
> * be allowed without the establishment of unprivileged mappings.
> */
> @@ -922,6 +991,81 @@ out_unlock:
> goto out;
> }
>
> +int proc_transparent_show(struct seq_file *seq, void *v)
> +{
> + struct user_namespace *ns = seq->private;
> + struct user_namespace *opaque = READ_ONCE(ns->opaque);
> +
> + seq_printf(seq, "%d\n", (ns == opaque) ? 0 : 1);
> + return 0;
> +}
> +
> +ssize_t proc_transparent_write(struct file *file, const char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + struct seq_file *seq = file->private_data;
> + struct user_namespace *ns = seq->private;
> + char kbuf[8], *pos;
> + bool transparent;
> + ssize_t ret;
> +
> + /* Only allow a very narrow range of strings to be written */
> + ret = -EINVAL;
> + if ((*ppos != 0) || (count >= sizeof(kbuf)))
> + goto out;
> +
> + /* What was written? */
> + ret = -EFAULT;
> + if (copy_from_user(kbuf, buf, count))
> + goto out;
> + kbuf[count] = '\0';
> + pos = kbuf;
> +
> + /* What is being requested? */
> + ret = -EINVAL;
> + if (pos[0] == '1') {
> + pos += 1;
> + transparent = true;
> + } else if (pos[0] == '0') {
> + pos += 1;
> + transparent = false;
> + } else
> + goto out;
> +
> + /* Verify there is not trailing junk on the line */
> + pos = skip_spaces(pos);
> + if (*pos != '\0')
> + goto out;
> +
> + ret = -EPERM;
> + mutex_lock(&userns_state_mutex);
> + /* Is the requested state different from the current one? */
> + if (transparent != (ns->opaque != ns)) {
> + /* You can't turn off transparent mode. */
> + if (!transparent)
> + goto out_unlock;
> + /* If there are existing mappings, they might be
> + * non-identity mappings. Therefore, block transparent
> + * mode. This also prevents making the init namespace
> + * transparent (which wouldn't work).
> + */
> + if (ns->uid_map.nr_extents != 0 || ns->gid_map.nr_extents != 0)
> + goto out_unlock;
> + /* Okay! Make the namespace transparent. */
> + ns->opaque = ns->parent->opaque;
> + }
> + mutex_unlock(&userns_state_mutex);
> +
> + /* Report a successful write */
> + *ppos = count;
> + ret = count;
> +out:
> + return ret;
> +out_unlock:
> + mutex_unlock(&userns_state_mutex);
> + goto out;
> +}
> +
> bool userns_may_setgroups(const struct user_namespace *ns)
> {
> bool allowed;