[PATCH 1/2] Run dump pipe in container's namespace

From: Zhao Lei
Date: Wed Mar 16 2016 - 05:25:34 EST


In the current system, when core_pattern is set to a pipe, both the pipe
program and the program's output are in the host's filesystem.

For example, when we set the following core_pattern:
# echo "|/my_dump_pipe %s %c %p %u %g %t e" >/proc/sys/kernel/core_pattern
and trigger a segmentation fault in a container, my_dump_pipe is looked up
in the host's filesystem, and it writes the coredump into the host's
filesystem too.

In a privileged container, a user can crash the host system with the
following commands:
# # In a container
# echo "|/bin/dd of=/boot/vmlinuz" >/proc/sys/kernel/core_pattern
# make_dump

In fact, no operation in a container should change the host's
environment; the container should use core_pattern as its private setting.
In detail, the core dump action should:
1: Search for the pipe program in the container's fs namespace.
2: Run the pipe program in the container's fs namespace to write the
   coredump to it.

This patch fixes the above problem by running the pipe program in the
user process's context instead of a kthread.

Test:
# ################
# # In host's system
# ################
#
# ulimit -c 1024000
# echo "|/dump_pipe" >/proc/sys/kernel/core_pattern
# cat /dump_pipe
#!/bin/sh
cat >/tmp/host_dump_$1_$2_$3_$4_$5_$6
# rm -f /tmp/*dump*
# ./make_dump
Segmentation fault (core dumped)
# ls -l /tmp/*dump*
-rw-r--r-- 1 root root 331776 Mar 16 16:57 /tmp/host_dump______
#
# lxc-start -n vm01
#
# ################
# # In guest's system:
# ################
#
# cat /proc/sys/kernel/core_pattern
|/dump_pipe
# cat /dump_pipe
#!/bin/sh
cat >/tmp/guest_dump_$1_$2_$3_$4_$5_$6
# rm -f /tmp/*dump*
# ./make_dump
Segmentation fault (core dumped)
# ls -l /tmp/*dump*
-rw-r--r-- 1 root root 331776 Mar 16 09:02 /tmp/guest_dump______
#

Signed-off-by: Zhao Lei <zhaolei@xxxxxxxxxxxxxx>
---
arch/x86/kernel/process_32.c | 5 +--
arch/x86/kernel/process_64.c | 5 +--
fs/coredump.c | 76 +++++++++++++++++++++++++++-----------------
include/linux/sched.h | 5 +--
kernel/fork.c | 24 ++++++++------
5 files changed, 69 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 9f95091..2b1862e 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -130,7 +130,8 @@ void release_thread(struct task_struct *dead_task)
}

int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p, unsigned long tls)
+ unsigned long arg, struct task_struct *p, unsigned long tls,
+ int return_to_kernel)
{
struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *tsk;
@@ -140,7 +141,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
p->thread.sp0 = (unsigned long) (childregs+1);
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & PF_KTHREAD) || return_to_kernel) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
p->thread.ip = (unsigned long) ret_from_kernel_thread;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b9d99e0..de05bc0 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -153,7 +153,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
}

int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p, unsigned long tls)
+ unsigned long arg, struct task_struct *p, unsigned long tls,
+ int return_to_kernel)
{
int err;
struct pt_regs *childregs;
@@ -173,7 +174,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
savesegment(ds, p->thread.ds);
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & PF_KTHREAD) || return_to_kernel) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
childregs->sp = (unsigned long)childregs;
diff --git a/fs/coredump.c b/fs/coredump.c
index 9ea87e9..6287f00 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -496,33 +496,50 @@ static void wait_for_dump_helpers(struct file *file)
pipe_unlock(pipe);
}

-/*
- * umh_pipe_setup
- * helper function to customize the process used
- * to collect the core in userspace. Specifically
- * it sets up a pipe and installs it as fd 0 (stdin)
- * for the process. Returns 0 on success, or
- * PTR_ERR on failure.
- * Note that it also sets the core limit to 1. This
- * is a special value that we use to trap recursive
- * core dumps
- */
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+struct pipeprg_data {
+ char **argv;
+ struct coredump_params *cp;
+};
+
+static int fork_callback(void *data)
{
+ struct pipeprg_data *ppd = (struct pipeprg_data *)data;
struct file *files[2];
- struct coredump_params *cp = (struct coredump_params *)info->data;
- int err = create_pipe_files(files, 0);
- if (err)
- return err;
+ int ret;
+
+ /*
+ * Sets up a pipe and installs it as fd 0 (stdin)
+ * for the process.
+ */
+ ret = create_pipe_files(files, 0);
+ if (ret)
+ do_exit(0);

- cp->file = files[1];
+ ppd->cp->file = files[1];

- err = replace_fd(0, files[0], 0);
+ ret = replace_fd(0, files[0], 0);
fput(files[0]);
- /* and disallow core files too */
+ if (ret < 0)
+ do_exit(0);
+
+ /*
+ * Sets the core limit to 1. This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};

- return err;
+ set_fs(KERNEL_DS);
+ ret = do_execve(getname_kernel(ppd->argv[0]),
+ (const char __user *const __user *)ppd->argv,
+ (const char __user *const __user *)NULL);
+ if (ret) {
+ printk(KERN_WARNING "execute pipe program failed: %s ret=%d\n",
+ ppd->argv[0], ret);
+ do_exit(0);
+ }
+
+ return ret;
}

void do_coredump(const siginfo_t *siginfo)
@@ -551,6 +568,8 @@ void do_coredump(const siginfo_t *siginfo)
*/
.mm_flags = mm->flags,
};
+ struct pipeprg_data ppd;
+ pid_t pid;

audit_core_dumps(siginfo->si_signo);

@@ -586,7 +605,6 @@ void do_coredump(const siginfo_t *siginfo)
if (ispipe) {
int dump_count;
char **helper_argv;
- struct subprocess_info *sub_info;

if (ispipe < 0) {
printk(KERN_WARNING "format_corename failed\n");
@@ -633,19 +651,17 @@ void do_coredump(const siginfo_t *siginfo)
goto fail_dropcount;
}

- retval = -ENOMEM;
- sub_info = call_usermodehelper_setup(helper_argv[0],
- helper_argv, NULL, GFP_KERNEL,
- umh_pipe_setup, NULL, &cprm);
- if (sub_info)
- retval = call_usermodehelper_exec(sub_info,
- UMH_WAIT_EXEC);
+ ppd.argv = helper_argv;
+ ppd.cp = &cprm;

+ pid = _do_fork(CLONE_VFORK, (unsigned long)fork_callback,
+ (unsigned long)&ppd, NULL, NULL, 0, 1);
argv_free(helper_argv);
- if (retval) {
+ if (pid < 0) {
printk(KERN_INFO "Core dump to |%s pipe failed\n",
cn.corename);
- goto close_fail;
+ retval = pid;
+ goto fail_dropcount;
}
} else {
struct inode *inode;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a..1647319 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2612,7 +2612,7 @@ extern void mm_release(struct task_struct *, struct mm_struct *);

#ifdef CONFIG_HAVE_COPY_THREAD_TLS
extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
- struct task_struct *, unsigned long);
+ struct task_struct *, unsigned long, int);
#else
extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *);
@@ -2644,7 +2644,8 @@ extern int do_execveat(int, struct filename *,
const char __user * const __user *,
const char __user * const __user *,
int);
-extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
+extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *,
+ int __user *, unsigned long, int);
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
struct task_struct *fork_idle(int);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/kernel/fork.c b/kernel/fork.c
index 2e391c7..643a09b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1245,7 +1245,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
int __user *child_tidptr,
struct pid *pid,
int trace,
- unsigned long tls)
+ unsigned long tls,
+ int return_to_kernel)
{
int retval;
struct task_struct *p;
@@ -1451,7 +1452,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+ retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls,
+ return_to_kernel);
if (retval)
goto bad_fork_cleanup_io;

@@ -1673,7 +1675,7 @@ static inline void init_idle_pids(struct pid_link *links)
struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
+ task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, 0);
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
init_idle(task, cpu);
@@ -1693,7 +1695,8 @@ long _do_fork(unsigned long clone_flags,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
- unsigned long tls)
+ unsigned long tls,
+ int return_to_kernel)
{
struct task_struct *p;
int trace = 0;
@@ -1718,7 +1721,7 @@ long _do_fork(unsigned long clone_flags,
}

p = copy_process(clone_flags, stack_start, stack_size,
- child_tidptr, NULL, trace, tls);
+ child_tidptr, NULL, trace, tls, return_to_kernel);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
@@ -1769,7 +1772,7 @@ long do_fork(unsigned long clone_flags,
int __user *child_tidptr)
{
return _do_fork(clone_flags, stack_start, stack_size,
- parent_tidptr, child_tidptr, 0);
+ parent_tidptr, child_tidptr, 0, 0);
}
#endif

@@ -1779,14 +1782,14 @@ long do_fork(unsigned long clone_flags,
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
- (unsigned long)arg, NULL, NULL, 0);
+ (unsigned long)arg, NULL, NULL, 0, 0);
}

#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
- return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+ return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0, 0);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -1798,7 +1801,7 @@ SYSCALL_DEFINE0(fork)
SYSCALL_DEFINE0(vfork)
{
return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
- 0, NULL, NULL, 0);
+ 0, NULL, NULL, 0, 0);
}
#endif

@@ -1826,7 +1829,8 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
unsigned long, tls)
#endif
{
- return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+ return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
+ tls, 0);
}
#endif

--
1.8.5.1