Re: [PATCH] linux-cr: nested pid namespaces (v3)

From: Louis Rilling
Date: Tue Mar 23 2010 - 03:14:42 EST


On Tue, Mar 23, 2010 at 12:18:39AM -0500, Serge E. Hallyn wrote:
> Support checkpoint and restart of tasks in nested pid namespaces.
> We keep the original single pids_array to minimize memory
> allocations. The pids array entries are augmented with a pidns
> depth (relative to the container init's pidns), and an "rpid" which
> is the pid in the checkpointer's pidns (or 0 if no valid pid exists).
> The rpid will be used by userspace to gather more information (like
> /proc/$$/mountinfo) after the kernel sys_checkpoint. If any tasks
> are in nested pid namespace, another single array will hold all of
> the vpids. At restart those are used by userspace to determine how
> to call eclone(). Kernel ignores them.
>
> This patch also adds 'rpid' to struct ckpt_hdr_pids, which is not
> needed for nested pid_ns support, but will be needed for the
> userspace checkpointer to gather additional information (i.e.
> /proc/pid/mountinfo) after sys_checkpoint() completes.
>
> Changelog:
> Mar 22:
> Use Louis Rilling's smarter ckpt_vpids algorithm
> verbatim, to handle pid_ns depths > CKPT_HDR_PIDS_CHUNK,
> as well as fix an apparent bug in my original code.
>
> As Louis suggested, use task_active_pid_ns() rather than
> task->nsproxy->pid_ns. In fact it's a must, bc the
> checkpointed task may be dead and have NULL
> task->nsproxy->pid_ns.

Hm, if task can be dead, then there is a much bigger issue:
task->nsproxy is NULL. Or did I miss something?

To me the real reason is to anticipate pid namespace unsharing. And this,
together with setns(), will require reconsidering much of the namespace C/R
logic imho. For instance, checkpoint could be done from a foreign task
having entered the container, leak detection should take such foreign
tasks into account (see example below), etc.

>
> Oren: Add spinlock for nsproxy->pidns; use
> ckpt_read_consume() to consume vpids; and use __s32 instead
> of ckpt_vpid struct
>
> Signed-off-by: Serge E. Hallyn <serue@xxxxxxxxxx>
> ---
> checkpoint/checkpoint.c | 123 ++++++++++++++++++++++++++++++++++----
> checkpoint/process.c | 22 ++++++-
> checkpoint/restart.c | 43 ++++++++++++-
> checkpoint/sys.c | 2 +
> include/linux/checkpoint.h | 2 +-
> include/linux/checkpoint_hdr.h | 14 ++++
> include/linux/checkpoint_types.h | 3 +
> kernel/nsproxy.c | 9 ++-
> 8 files changed, 195 insertions(+), 23 deletions(-)
>
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index f27af41..fd61a80 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -27,6 +27,7 @@
> #include <linux/deferqueue.h>
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
> +#include <linux/pid_namespace.h>
>
> /* unique checkpoint identifier (FIXME: should be per-container ?) */
> static atomic_t ctx_count = ATOMIC_INIT(0);
> @@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> struct task_struct *root = ctx->root_task;
> struct nsproxy *nsproxy;
> int ret = 0;
> + struct pid_namespace *pidns;
>
> ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
>
> @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> _ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
> ret = -EPERM;
> }
> - /* no support for >1 private pidns */
> - if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> - _ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> - ret = -EPERM;
> + /* pidns must be descendent of root_nsproxy */
> + pidns = nsproxy->pid_ns;

In case of an unshared pid namespace, task_active_pid_ns(t) should be checked
instead of t->nsproxy->pid_ns: we can't checkpoint a foreign task.

Thanks,

Louis

> + while (pidns != ctx->root_nsproxy->pid_ns) {
> + if (pidns == &init_pid_ns) {
> + ret = -EPERM;
> + _ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> + break;
> + }
> + pidns = pidns->parent;
> }
> rcu_read_unlock();
>
> @@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>
> #define CKPT_HDR_PIDS_CHUNK 256
>
> +/*
> + * Write the pids in ctx->root_nsproxy->pidns. This info is
> + * needed at restart to unambiguously dereference tasks.
> + */
> static int checkpoint_pids(struct ckpt_ctx *ctx)
> {
> struct ckpt_pids *h;
> - struct pid_namespace *ns;
> + struct pid_namespace *root_pidns;
> struct task_struct *task;
> struct task_struct **tasks_arr;
> int nr_tasks, n, pos = 0, ret = 0;
>
> - ns = ctx->root_nsproxy->pid_ns;
> + root_pidns = ctx->root_nsproxy->pid_ns;
> tasks_arr = ctx->tasks_arr;
> nr_tasks = ctx->nr_tasks;
> BUG_ON(nr_tasks <= 0);
> @@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
> do {
> rcu_read_lock();
> for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
> + struct pid_namespace *task_pidns;
> task = tasks_arr[pos];
>
> - h[n].vpid = task_pid_nr_ns(task, ns);
> - h[n].vtgid = task_tgid_nr_ns(task, ns);
> - h[n].vpgid = task_pgrp_nr_ns(task, ns);
> - h[n].vsid = task_session_nr_ns(task, ns);
> - h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
> + h[n].vpid = task_pid_nr_ns(task, root_pidns);
> + h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
> + h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
> + h[n].vsid = task_session_nr_ns(task, root_pidns);
> + h[n].vppid = task_tgid_nr_ns(task->real_parent,
> + root_pidns);
> + task_pidns = task_active_pid_ns(task);
> + h[n].rpid = task_pid_vnr(task);
> + h[n].depth = task_pidns->level - root_pidns->level;
> ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
> pos, h[n].vpid, h[n].vtgid, h[n].vppid);
> + ctx->nr_vpids += h[n].depth;
> pos++;
> }
> rcu_read_unlock();
> @@ -356,6 +373,71 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
> return ret;
> }
>
> +static int checkpoint_vpids(struct ckpt_ctx *ctx)
> +{
> + __s32 *h; /* vpid array */
> + struct pid_namespace *root_pidns, *task_pidns = NULL, *active_pidns;
> + struct task_struct *task;
> + int ret, nr_tasks = ctx->nr_tasks;
> + int tidx = 0, /* index into task array */
> + hidx = 0, /* pids written into current __s32 chunk */
> + vidx = 0; /* vpid index for current task */
> +
> + root_pidns = ctx->root_nsproxy->pid_ns;
> + nr_tasks = ctx->nr_tasks;
> +
> + ret = ckpt_write_obj_type(ctx, NULL,
> + sizeof(*h) * ctx->nr_vpids,
> + CKPT_HDR_BUFFER);
> + if (ret < 0)
> + return ret;
> +
> + h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> + if (!h)
> + return -ENOMEM;
> +
> + do {
> + rcu_read_lock();
> + while (tidx < nr_tasks && hidx < CKPT_HDR_PIDS_CHUNK) {
> + int nsdelta;
> +
> + task = ctx->tasks_arr[tidx];
> + active_pidns = task_active_pid_ns(task);
> + nsdelta = active_pidns->level - root_pidns->level;
> + if (hidx + nsdelta - vidx > CKPT_HDR_PIDS_CHUNK)
> + /*
> + * We will release rcu before recording the
> + * remaining vpids, but neither task nor its
> + * pid can disappear.
> + */
> + nsdelta = CKPT_HDR_PIDS_CHUNK - hidx + vidx;
> +
> + if (vidx == 0)
> + task_pidns = active_pidns;
> + for (; vidx < nsdelta; vidx++) {
> + h[hidx] = task_pid_nr_ns(task, task_pidns);
> + hidx++;
> + task_pidns = task_pidns->parent;
> + }
> +
> + if (task_pidns == root_pidns) {
> + tidx++;
> + vidx = 0;
> + }
> + }
> + rcu_read_unlock();
> +
> + ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
> + if (ret < 0)
> + break;
> +
> + hidx = 0;
> + } while (tidx < nr_tasks);
> +
> + _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> + return ret;
> +}
> +
> static int collect_objects(struct ckpt_ctx *ctx)
> {
> int n, ret = 0;
> @@ -466,6 +548,7 @@ static int build_tree(struct ckpt_ctx *ctx)
> static int checkpoint_tree(struct ckpt_ctx *ctx)
> {
> struct ckpt_hdr_tree *h;
> + struct ckpt_hdr_vpids *hvpids;
> int ret;
>
> h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TREE);
> @@ -480,7 +563,23 @@ static int checkpoint_tree(struct ckpt_ctx *ctx)
> return ret;
>
> ret = checkpoint_pids(ctx);
> - return ret;
> + if (ret < 0)
> + return ret;
> +
> + hvpids = ckpt_hdr_get_type(ctx, sizeof(*hvpids), CKPT_HDR_VPIDS);
> + if (!hvpids)
> + return -ENOMEM;
> +
> + hvpids->nr_vpids = ctx->nr_vpids;
> +
> + ret = ckpt_write_obj(ctx, &hvpids->h);
> + ckpt_hdr_put(ctx, hvpids);
> + if (ret < 0)
> + return ret;
> + if (ctx->nr_vpids == 0)
> + return 0;
> +
> + return checkpoint_vpids(ctx);
> }
>
> static struct task_struct *get_freezer_task(struct task_struct *root_task)
> diff --git a/checkpoint/process.c b/checkpoint/process.c
> index f917112..602ba9f 100644
> --- a/checkpoint/process.c
> +++ b/checkpoint/process.c
> @@ -22,7 +22,7 @@
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
> #include <linux/syscalls.h>
> -
> +#include <linux/pid_namespace.h>
>
> pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid)
> {
> @@ -51,7 +51,7 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
> * Find the owner process of this pgid (it must exist
> * if pgrp exists). It must be a thread group leader.
> */
> - pgrp = find_vpid(pgid);
> + pgrp = find_pid_ns(pgid, ctx->root_nsproxy->pid_ns);
> p = pid_task(pgrp, PIDTYPE_PID);
> if (!p || !thread_group_leader(p))
> return NULL;
> @@ -561,6 +561,13 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
> return ret;
> }
>
> +/*
> + * restart is currently serialized, but if/when that changes we want
> + * to make sure that setting nsproxy->pidns in restore_task_ns() is only
> + * done once. That's what checkpoint_nslock is for
> + */
> +DEFINE_SPINLOCK(checkpoint_nslock);
> +
> static int restore_task_ns(struct ckpt_ctx *ctx)
> {
> struct ckpt_hdr_task_ns *h;
> @@ -578,6 +585,10 @@ static int restore_task_ns(struct ckpt_ctx *ctx)
> }
>
> if (nsproxy != task_nsproxy(current)) {
> + spin_lock(&checkpoint_nslock);
> + if (!nsproxy->pid_ns)
> + nsproxy->pid_ns = get_pid_ns(current->nsproxy->pid_ns);
> + spin_unlock(&checkpoint_nslock);
> get_nsproxy(nsproxy);
> switch_task_namespaces(current, nsproxy);
> }
> @@ -829,8 +840,8 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
>
> pgid = ctx->pids_arr[ctx->active_pid].vpgid;
>
> - if (pgid == task_pgrp_vnr(task)) /* nothing to do */
> - return 0;
> + if (pgid == task_pgrp_nr_ns(task, ctx->root_nsproxy->pid_ns))
> + return 0; /* nothing to do */
>
> if (task->signal->leader) /* (2) */
> return -EINVAL;
> @@ -850,6 +861,9 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
> if (ctx->uflags & RESTART_TASKSELF)
> ret = 0;
>
> + if (ret < 0)
> + ckpt_err(ctx, ret, "setting pgid\n");
> +
> return ret;
> }
>
> diff --git a/checkpoint/restart.c b/checkpoint/restart.c
> index 6a9644d..c25ce88 100644
> --- a/checkpoint/restart.c
> +++ b/checkpoint/restart.c
> @@ -23,6 +23,7 @@
> #include <asm/syscall.h>
> #include <linux/elf.h>
> #include <linux/deferqueue.h>
> +#include <linux/pid_namespace.h>
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
>
> @@ -760,6 +761,33 @@ static int restore_read_tree(struct ckpt_ctx *ctx)
> return ret;
> }
>
> +/*
> + * read all the vpids - we don't actually care about them,
> + * userspace did
> + */
> +static int restore_slurp_vpids(struct ckpt_ctx *ctx)
> +{
> + struct ckpt_hdr_vpids *h;
> + int size, ret;
> +
> + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VPIDS);
> + if (IS_ERR(h))
> + return PTR_ERR(h);
> + ctx->nr_vpids = h->nr_vpids;
> + ckpt_hdr_put(ctx, h);
> +
> + if (!ctx->nr_vpids)
> + return 0;
> +
> + size = sizeof(__s32) * ctx->nr_vpids;
> + if (size < 0) /* overflow ? */
> + return -EINVAL;
> +
> + ret = ckpt_read_consume(ctx, size + sizeof(struct ckpt_hdr),
> + CKPT_HDR_BUFFER);
> + return ret;
> +}
> +
> static inline int all_tasks_activated(struct ckpt_ctx *ctx)
> {
> return (ctx->active_pid == ctx->nr_pids);
> @@ -848,7 +876,8 @@ static int restore_activate_next(struct ckpt_ctx *ctx)
> pid = get_active_pid(ctx);
>
> rcu_read_lock();
> - task = find_task_by_pid_ns(pid, ctx->root_nsproxy->pid_ns);
> + task = find_task_by_pid_ns(pid,
> + task_active_pid_ns(ctx->root_task));
> /* target task must have same restart context */
> if (task && task->checkpoint_ctx == ctx)
> wake_up_process(task);
> @@ -870,7 +899,7 @@ static int restore_activate_next(struct ckpt_ctx *ctx)
>
> static int wait_task_active(struct ckpt_ctx *ctx)
> {
> - pid_t pid = task_pid_vnr(current);
> + pid_t pid = task_pid_nr_ns(current, task_active_pid_ns(ctx->root_task));
> int ret;
>
> ckpt_debug("pid %d waiting\n", pid);
> @@ -886,7 +915,8 @@ static int wait_task_active(struct ckpt_ctx *ctx)
>
> static int wait_task_sync(struct ckpt_ctx *ctx)
> {
> - ckpt_debug("pid %d syncing\n", task_pid_vnr(current));
> + ckpt_debug("pid %d syncing\n",
> + task_pid_nr_ns(current, task_active_pid_ns(ctx->root_task)));
> wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx));
> ckpt_debug("task sync done (errno %d)\n", ctx->errno);
> if (ckpt_test_error(ctx))
> @@ -1160,7 +1190,7 @@ static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid)
>
> read_lock(&tasklist_lock);
> list_for_each_entry(task, &current->children, sibling) {
> - if (task_pid_vnr(task) == pid) {
> + if (task_pid_nr_ns(task, ctx->coord_pidns) == pid) {
> get_task_struct(task);
> ctx->root_task = task;
> ctx->root_pid = pid;
> @@ -1237,6 +1267,11 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid)
> if (ret < 0)
> return ret;
>
> + ret = restore_slurp_vpids(ctx);
> + ckpt_debug("read vpids %d\n", ret);
> + if (ret < 0)
> + return ret;
> +
> if ((ctx->uflags & RESTART_TASKSELF) && ctx->nr_pids != 1)
> return -EINVAL;
>
> diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> index 9e9df9b..45d3e7a 100644
> --- a/checkpoint/sys.c
> +++ b/checkpoint/sys.c
> @@ -22,6 +22,7 @@
> #include <linux/capability.h>
> #include <linux/checkpoint.h>
> #include <linux/deferqueue.h>
> +#include <linux/pid_namespace.h>
>
> /*
> * ckpt_unpriv_allowed - sysctl controlled.
> @@ -276,6 +277,7 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
> ctx->uflags = uflags;
> ctx->kflags = kflags;
> ctx->ktime_begin = ktime_get();
> + ctx->coord_pidns = get_pid_ns(current->nsproxy->pid_ns);
>
> atomic_set(&ctx->refcount, 0);
> INIT_LIST_HEAD(&ctx->pgarr_list);
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 792b523..e860bf5 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -10,7 +10,7 @@
> * distribution for more details.
> */
>
> -#define CHECKPOINT_VERSION 5
> +#define CHECKPOINT_VERSION 6
>
> /* checkpoint user flags */
> #define CHECKPOINT_SUBTREE 0x1
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 41412d1..20c90b3 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -117,6 +117,8 @@ enum {
> #define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
> CKPT_HDR_TASK_CREDS,
> #define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
> + CKPT_HDR_VPIDS,
> +#define CKPT_HDR_VPIDS CKPT_HDR_VPIDS
>
> /* 201-299: reserved for arch-dependent */
>
> @@ -327,11 +329,23 @@ struct ckpt_hdr_tree {
> } __attribute__((aligned(8)));
>
> struct ckpt_pids {
> + /* These pids are in the root_nsproxy's pid ns */
> __s32 vpid;
> __s32 vppid;
> __s32 vtgid;
> __s32 vpgid;
> __s32 vsid;
> + /* rpid is the real pid, in checkpointer's pidns. This is
> + * so checkpointer in userspace can get more info about the
> + * task (i.e. /proc/pid/mountinfo) */
> + __s32 rpid;
> + __s32 depth; /* pid namespace depth relative to container init */
> +} __attribute__((aligned(8)));
> +
> +/* number of vpids */
> +struct ckpt_hdr_vpids {
> + struct ckpt_hdr h;
> + __s32 nr_vpids;
> } __attribute__((aligned(8)));
>
> /* pids */
> diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
> index ecd3e91..2fb79cf 100644
> --- a/include/linux/checkpoint_types.h
> +++ b/include/linux/checkpoint_types.h
> @@ -72,6 +72,9 @@ struct ckpt_ctx {
> struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
> int nr_tasks; /* size of tasks array */
>
> + int nr_vpids;
> + struct pid_namespace *coord_pidns; /* coordinator pid_ns */
> +
> /* [multi-process restart] */
> struct ckpt_pids *pids_arr; /* array of all pids [restart] */
> int nr_pids; /* size of pids array */
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 0da0d83..6d86240 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
> get_net(net_ns);
> nsproxy->net_ns = net_ns;
>
> - get_pid_ns(current->nsproxy->pid_ns);
> - nsproxy->pid_ns = current->nsproxy->pid_ns;
> + /*
> + * The pid_ns will get assigned the first time that we
> + * assign the nsproxy to a task. The task had unshared
> + * its pid_ns in userspace before calling restart, and
> + * we want to keep using that pid_ns.
> + */
> + nsproxy->pid_ns = NULL;
> }
> out:
> if (ret < 0)
> --
> 1.6.1
>
> _______________________________________________
> Containers mailing list
> Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
> https://lists.linux-foundation.org/mailman/listinfo/containers

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes

Attachment: signature.asc
Description: Digital signature