Re: [PATCH v2] ipc: Store mqueue sysctls in the ipc namespace
From: Eric W. Biederman
Date: Mon Jan 10 2022 - 11:26:42 EST
Alexey Gladkov <legion@xxxxxxxxxx> writes:
> Right now, the mqueue sysctls take ipc namespaces into account in a
> rather hacky way. This works in most cases, but does not respect the
> user namespace.
>
> Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/*
> parametres. This poses a problem in the rootless containers.
>
> To solve this I changed the implementation of the mqueue sysctls just
> like some other sysctls.
>
> Before this change:
>
> $ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
> tee: /proc/sys/fs/mqueue/msg_max: Permission denied
> 5
>
> After this change:
>
> $ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
> 5
>
> v2:
> * Fixed compilation problem if CONFIG_POSIX_MQUEUE_SYSCTL is not
> specified.
Can you split this in two patches?
The first that converts mq_sysctl.c and ipc_sysctl.c to live in
a per ipc namespace sysctl set. That will just be a bug-fix/cleanup
patch.
The second that modifies the permissions to allow root in the ipc
namespace to change the parameters. With that second patch
we can have the discussion about when it is valid to allow
the user namespace root that created the ipc namespace to be able
to set the sysctls.
A few more comments below.
> Reported-by: kernel test robot <lkp@xxxxxxxxx>
> Signed-off-by: Alexey Gladkov <legion@xxxxxxxxxx>
> ---
> include/linux/ipc_namespace.h | 18 ++++-
> ipc/mq_sysctl.c | 137 ++++++++++++++++++++--------------
> ipc/mqueue.c | 10 +--
> ipc/namespace.c | 6 ++
> 4 files changed, 106 insertions(+), 65 deletions(-)
>
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index b75395ec8d52..bcedc86a6f1d 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -10,6 +10,7 @@
> #include <linux/ns_common.h>
> #include <linux/refcount.h>
> #include <linux/rhashtable-types.h>
> +#include <linux/sysctl.h>
>
> struct user_namespace;
>
> @@ -63,6 +64,11 @@ struct ipc_namespace {
> unsigned int mq_msg_default;
> unsigned int mq_msgsize_default;
>
> +#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
> + struct ctl_table_set set;
> + struct ctl_table_header *sysctls;
> +#endif
> +
Updating the code to handle all of the ipc sysctls should
remove the need for the #ifdef here.
> /* user_ns which owns the ipc ns */
> struct user_namespace *user_ns;
> struct ucounts *ucounts;
> @@ -169,14 +175,18 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
>
> #ifdef CONFIG_POSIX_MQUEUE_SYSCTL
>
> -struct ctl_table_header;
> -extern struct ctl_table_header *mq_register_sysctl_table(void);
> +void retire_mq_sysctls(struct ipc_namespace *ns);
> +bool setup_mq_sysctls(struct ipc_namespace *ns);
>
> #else /* CONFIG_POSIX_MQUEUE_SYSCTL */
>
> -static inline struct ctl_table_header *mq_register_sysctl_table(void)
> +static inline void retire_mq_sysctls(struct ipc_namespace *ns)
> {
> - return NULL;
> +}
> +
> +static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
> +{
> + return true;
> }
>
> #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
> diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
> index 72a92a08c848..56afb0503296 100644
> --- a/ipc/mq_sysctl.c
> +++ b/ipc/mq_sysctl.c
> @@ -9,39 +9,9 @@
> #include <linux/ipc_namespace.h>
> #include <linux/sysctl.h>
>
> -#ifdef CONFIG_PROC_SYSCTL
> -static void *get_mq(struct ctl_table *table)
> -{
> - char *which = table->data;
> - struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
> - which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
> - return which;
> -}
> -
> -static int proc_mq_dointvec(struct ctl_table *table, int write,
> - void *buffer, size_t *lenp, loff_t *ppos)
> -{
> - struct ctl_table mq_table;
> - memcpy(&mq_table, table, sizeof(mq_table));
> - mq_table.data = get_mq(table);
> -
> - return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
> -}
> -
> -static int proc_mq_dointvec_minmax(struct ctl_table *table, int write,
> - void *buffer, size_t *lenp, loff_t *ppos)
> -{
> - struct ctl_table mq_table;
> - memcpy(&mq_table, table, sizeof(mq_table));
> - mq_table.data = get_mq(table);
> -
> - return proc_dointvec_minmax(&mq_table, write, buffer,
> - lenp, ppos);
> -}
> -#else
> -#define proc_mq_dointvec NULL
> -#define proc_mq_dointvec_minmax NULL
> -#endif
> +#include <linux/stat.h>
> +#include <linux/capability.h>
> +#include <linux/slab.h>
>
> static int msg_max_limit_min = MIN_MSGMAX;
> static int msg_max_limit_max = HARD_MSGMAX;
> @@ -55,14 +25,14 @@ static struct ctl_table mq_sysctls[] = {
> .data = &init_ipc_ns.mq_queues_max,
> .maxlen = sizeof(int),
> .mode = 0644,
> - .proc_handler = proc_mq_dointvec,
> + .proc_handler = proc_dointvec,
> },
> {
> .procname = "msg_max",
> .data = &init_ipc_ns.mq_msg_max,
> .maxlen = sizeof(int),
> .mode = 0644,
> - .proc_handler = proc_mq_dointvec_minmax,
> + .proc_handler = proc_dointvec_minmax,
> .extra1 = &msg_max_limit_min,
> .extra2 = &msg_max_limit_max,
> },
> @@ -71,7 +41,7 @@ static struct ctl_table mq_sysctls[] = {
> .data = &init_ipc_ns.mq_msgsize_max,
> .maxlen = sizeof(int),
> .mode = 0644,
> - .proc_handler = proc_mq_dointvec_minmax,
> + .proc_handler = proc_dointvec_minmax,
> .extra1 = &msg_maxsize_limit_min,
> .extra2 = &msg_maxsize_limit_max,
> },
> @@ -80,7 +50,7 @@ static struct ctl_table mq_sysctls[] = {
> .data = &init_ipc_ns.mq_msg_default,
> .maxlen = sizeof(int),
> .mode = 0644,
> - .proc_handler = proc_mq_dointvec_minmax,
> + .proc_handler = proc_dointvec_minmax,
> .extra1 = &msg_max_limit_min,
> .extra2 = &msg_max_limit_max,
> },
> @@ -89,32 +59,89 @@ static struct ctl_table mq_sysctls[] = {
> .data = &init_ipc_ns.mq_msgsize_default,
> .maxlen = sizeof(int),
> .mode = 0644,
> - .proc_handler = proc_mq_dointvec_minmax,
> + .proc_handler = proc_dointvec_minmax,
> .extra1 = &msg_maxsize_limit_min,
> .extra2 = &msg_maxsize_limit_max,
> },
> {}
> };
>
> -static struct ctl_table mq_sysctl_dir[] = {
> - {
> - .procname = "mqueue",
> - .mode = 0555,
> - .child = mq_sysctls,
> - },
> - {}
> -};
> +static struct ctl_table_set *
> +set_lookup(struct ctl_table_root *root)
> +{
> + return ¤t->nsproxy->ipc_ns->set;
> +}
>
> -static struct ctl_table mq_sysctl_root[] = {
> - {
> - .procname = "fs",
> - .mode = 0555,
> - .child = mq_sysctl_dir,
> - },
> - {}
> +static int set_is_seen(struct ctl_table_set *set)
> +{
> + return ¤t->nsproxy->ipc_ns->set == set;
> +}
> +
> +static int set_permissions(struct ctl_table_header *head, struct ctl_table *table)
> +{
> + struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, set);
> + int mode;
> +
> + /* Allow users with CAP_SYS_RESOURCE unrestrained access */
> + if (ns_capable(ns->user_ns, CAP_SYS_RESOURCE))
> + mode = (table->mode & S_IRWXU) >> 6;
> + else
> + /* Allow all others at most read-only access */
> + mode = table->mode & S_IROTH;
> + return (mode << 6) | (mode << 3) | mode;
> +}
As a cleanup/bug-fix please don't implemenet set_permissions. If you
don't the default permissions that we use today will apply.
> +static struct ctl_table_root set_root = {
> + .lookup = set_lookup,
> + .permissions = set_permissions,
> };
>
> -struct ctl_table_header *mq_register_sysctl_table(void)
> +bool setup_mq_sysctls(struct ipc_namespace *ns)
> {
> - return register_sysctl_table(mq_sysctl_root);
> + struct ctl_table *tbl;
> +
> + setup_sysctl_set(&ns->set, &set_root, set_is_seen);
> +
> + tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
> + if (tbl) {
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) {
> + if (tbl[i].data == &init_ipc_ns.mq_queues_max)
> + tbl[i].data = &ns->mq_queues_max;
> +
> + else if (tbl[i].data == &init_ipc_ns.mq_msg_max)
> + tbl[i].data = &ns->mq_msg_max;
> +
> + else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max)
> + tbl[i].data = &ns->mq_msgsize_max;
> +
> + else if (tbl[i].data == &init_ipc_ns.mq_msg_default)
> + tbl[i].data = &ns->mq_msg_default;
> +
> + else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default)
> + tbl[i].data = &ns->mq_msgsize_default;
> + else
> + tbl[i].data = NULL;
> + }
> +
> + ns->sysctls = __register_sysctl_table(&ns->set, "fs/mqueue", tbl);
> + }
> + if (!ns->sysctls) {
> + kfree(tbl);
> + retire_sysctl_set(&ns->set);
> + return false;
> + }
> +
> + return true;
> +}
> +
> +void retire_mq_sysctls(struct ipc_namespace *ns)
> +{
> + struct ctl_table *tbl;
> +
> + tbl = ns->sysctls->ctl_table_arg;
> + unregister_sysctl_table(ns->sysctls);
> + retire_sysctl_set(&ns->set);
> + kfree(tbl);
> }
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index 5becca9be867..1b4a3be71636 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -163,8 +163,6 @@ static void remove_notification(struct mqueue_inode_info *info);
>
> static struct kmem_cache *mqueue_inode_cachep;
>
> -static struct ctl_table_header *mq_sysctl_table;
> -
> static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
> {
> return container_of(inode, struct mqueue_inode_info, vfs_inode);
> @@ -1713,8 +1711,10 @@ static int __init init_mqueue_fs(void)
> if (mqueue_inode_cachep == NULL)
> return -ENOMEM;
>
> - /* ignore failures - they are not fatal */
> - mq_sysctl_table = mq_register_sysctl_table();
> + if (!setup_mq_sysctls(&init_ipc_ns)) {
> + pr_warn("sysctl registration failed\n");
> + return -ENOMEM;
> + }
>
> error = register_filesystem(&mqueue_fs_type);
> if (error)
> @@ -1731,8 +1731,6 @@ static int __init init_mqueue_fs(void)
> out_filesystem:
> unregister_filesystem(&mqueue_fs_type);
> out_sysctl:
> - if (mq_sysctl_table)
> - unregister_sysctl_table(mq_sysctl_table);
> kmem_cache_destroy(mqueue_inode_cachep);
> return error;
> }
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index ae83f0f2651b..f760243ca685 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -59,6 +59,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
> if (err)
> goto fail_put;
>
> + err = -ENOMEM;
> + if (!setup_mq_sysctls(ns))
> + goto fail_put;
> +
Please make this handle all ipc sysctls not just the mq sysctls.
> sem_init_ns(ns);
> msg_init_ns(ns);
> shm_init_ns(ns);
> @@ -125,6 +129,8 @@ static void free_ipc_ns(struct ipc_namespace *ns)
> msg_exit_ns(ns);
> shm_exit_ns(ns);
>
> + retire_mq_sysctls(ns);
> +
> dec_ipc_namespaces(ns->ucounts);
> put_user_ns(ns->user_ns);
> ns_free_inum(&ns->ns);
Eric