Re: proc_flush_task oops

From: Tetsuo Handa
Date: Tue Dec 19 2017 - 05:50:04 EST

On 2017/12/19 12:39, Dave Jones wrote:
> On Mon, Dec 18, 2017 at 03:50:52PM -0800, Linus Torvalds wrote:
> > But I don't see what would have changed in this area recently.
> >
> > Do you end up saving the seeds that cause crashes? Is this
> > reproducible? (Other than seeing it twoce, of course)
> Only clue so far, is every time I'm able to trigger it, the last thing
> the child process that triggers it did, was an execveat.
> Telling it to just fuzz execveat doesn't instantly trigger it, so it
> must be a combination of some other syscall. I'll leave a script running
> overnight to see if I can binary search the other syscalls in
> combination with it.
> One other thing: I said this was rc4, but it was actually rc4 + all the
> x86 stuff from today. There's enough creepy stuff in that pile, that
> I'll try with just plain rc4 tomorrow too.
> Dave

When hitting an oops at finalization function using fuzzing tool, checking for
corresponding initialization function and previous error messages might help.

It seems to me that there are error paths which allow nsproxy to be replaced
without assigning ->pid_ns_for_children->proc_mnt.

static __latent_entropy struct task_struct *copy_process(
unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
int trace,
unsigned long tls,
int node)
retval = copy_namespaces(clone_flags, p); // Allocates p->nsproxy->pid_ns_for_children
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces; // p->nsproxy->pid_ns_for_children->proc_mnt == NULL.
retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
if (retval)
goto bad_fork_cleanup_io; // p->nsproxy->pid_ns_for_children->proc_mnt == NULL.

if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children); // Initializes p->nsproxy->pid_ns_for_children->proc_mnt upon success.
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread; // p->nsproxy->pid_ns_for_children->proc_mnt == NULL.

int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;

if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
return 0;

if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;

* CLONE_NEWIPC must detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
* namespace are unreachable. In clone parlance, CLONE_SYSVSEM
* means share undolist with parent, so we must forbid using
* it along with CLONE_NEWIPC.
if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
return -EINVAL;

new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); // Allocates new nsproxy.
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);

tsk->nsproxy = new_ns; // p->nsproxy is updated with ->pid_ns_for_children->proc_mnt == NULL.
return 0;

static struct nsproxy *create_new_namespaces(unsigned long flags,
struct task_struct *tsk, struct user_namespace *user_ns,
struct fs_struct *new_fs)
struct nsproxy *new_nsp;
int err;

new_nsp = create_nsproxy();
if (!new_nsp)
return ERR_PTR(-ENOMEM);

new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;

new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
if (IS_ERR(new_nsp->uts_ns)) {
err = PTR_ERR(new_nsp->uts_ns);
goto out_uts;

new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
if (IS_ERR(new_nsp->ipc_ns)) {
err = PTR_ERR(new_nsp->ipc_ns);
goto out_ipc;

new_nsp->pid_ns_for_children =
copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children); // Returns with ->proc_mnt == NULL.
if (IS_ERR(new_nsp->pid_ns_for_children)) {
err = PTR_ERR(new_nsp->pid_ns_for_children);
goto out_pid;

new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
if (IS_ERR(new_nsp->cgroup_ns)) {
err = PTR_ERR(new_nsp->cgroup_ns);
goto out_cgroup;

new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
goto out_net;

return new_nsp; // Returns with ->pid_ns_for_children->proc_mnt == NULL.

if (new_nsp->pid_ns_for_children)
if (new_nsp->ipc_ns)
if (new_nsp->uts_ns)
if (new_nsp->mnt_ns)
kmem_cache_free(nsproxy_cachep, new_nsp);
return ERR_PTR(err);

struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *old_ns)
if (!(flags & CLONE_NEWPID))
return get_pid_ns(old_ns);
if (task_active_pid_ns(current) != old_ns)
return ERR_PTR(-EINVAL);
return create_pid_namespace(user_ns, old_ns); // Returns with ->proc_mnt == NULL.

static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
struct ucounts *ucounts;
int err;

err = -EINVAL;
if (!in_userns(parent_pid_ns->user_ns, user_ns))
goto out;

err = -ENOSPC;
if (level > MAX_PID_NS_LEVEL)
goto out;
ucounts = inc_pid_namespaces(user_ns);
if (!ucounts)
goto out;

err = -ENOMEM;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); // Initializes ->proc_mnt with NULL.
if (ns == NULL)
goto out_dec;


ns->pid_cachep = create_pid_cachep(level + 1);
if (ns->pid_cachep == NULL)
goto out_free_idr;

err = ns_alloc_inum(&ns->ns);
if (err)
goto out_free_idr;
ns->ns.ops = &pidns_operations;

ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);

return ns; // Returns with ->proc_mnt == NULL.

kmem_cache_free(pid_ns_cachep, ns);
return ERR_PTR(err);

struct pid *alloc_pid(struct pid_namespace *ns)
if (unlikely(is_child_reaper(pid))) {
if (pid_ns_prepare_proc(ns)) {
goto out_free;

int pid_ns_prepare_proc(struct pid_namespace *ns)
struct vfsmount *mnt;

mnt = kern_mount_data(&proc_fs_type, ns);
if (IS_ERR(mnt))
return PTR_ERR(mnt);

ns->proc_mnt = mnt;
return 0;

If one of copy_io(), copy_thread_tls() or alloc_pid() returns an error, creation
of a child fails with p->nsproxy->pid_ns_for_children->proc_mnt == NULL.

Then, when the child exits, the parent waiting at wait() calls
proc_flush_task() which assumes that everything was set up properly.

void proc_flush_task(struct task_struct *task)
int i;
struct pid *pid, *tgid;
struct upid *upid;

pid = task_pid(task);
tgid = task_tgid(task);

for (i = 0; i <= pid->level; i++) {
upid = &pid->numbers[i];
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,