[PATCH v2 2/3] Run dump pipe in container's namespace

From: Zhao Lei
Date: Fri Mar 18 2016 - 08:50:57 EST


In the current system, when core_pattern is set to a pipe, both the pipe
program and the program's output are in the host's filesystem.

For example, when we set the following core_pattern:
# echo "|/my_dump_pipe %s %c %p %u %g %t e" >/proc/sys/kernel/core_pattern
and trigger a segmentation fault in a container, my_dump_pipe is looked up
in the host's filesystem, and it writes the coredump into the host's
filesystem too.

In a privileged container, a user can destroy the host system with the
following commands:
# # In a container
# echo "|/bin/dd of=/boot/vmlinuz" >/proc/sys/kernel/core_pattern
# make_dump

Operations in a container should not change the host's environment, so
the container should use core_pattern as its private setting.
In detail, the core dump action should:
1: Search for the pipe program in the container's fs namespace.
2: Run the pipe program in the container's fs namespace and write the
   coredump to it.

This patch fixes the above problem by running the pipe program in the
user process's context instead of in a kthread.

Test:
# ################
# # In host's system
# ################
#
# ulimit -c 1024000
# echo "|/dump_pipe" >/proc/sys/kernel/core_pattern
# cat /dump_pipe
#!/bin/sh
cat >/tmp/host_dump_$1_$2_$3_$4_$5_$6
# rm -f /tmp/*dump*
# ./make_dump
Segmentation fault (core dumped)
# ls -l /tmp/*dump*
-rw-r--r-- 1 root root 331776 Mar 16 16:57 /tmp/host_dump______
#
# lxc-start -n vm01
#
# ################
# # In guest's system:
# ################
#
# cat /proc/sys/kernel/core_pattern
|/dump_pipe
# cat /dump_pipe
#!/bin/sh
cat >/tmp/guest_dump_$1_$2_$3_$4_$5_$6
# rm -f /tmp/*dump*
# ./make_dump
Segmentation fault (core dumped)
# ls -l /tmp/*dump*
-rw-r--r-- 1 root root 331776 Mar 16 09:02 /tmp/guest_dump______
#

Signed-off-by: Zhao Lei <zhaolei@xxxxxxxxxxxxxx>
---
fs/coredump.c | 76 +++++++++++++++++++++++++++++++--------------------
include/linux/sched.h | 1 +
kernel/fork.c | 6 ++++
3 files changed, 53 insertions(+), 30 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 9ea87e9..863c23a 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -496,33 +496,50 @@ static void wait_for_dump_helpers(struct file *file)
pipe_unlock(pipe);
}

-/*
- * umh_pipe_setup
- * helper function to customize the process used
- * to collect the core in userspace. Specifically
- * it sets up a pipe and installs it as fd 0 (stdin)
- * for the process. Returns 0 on success, or
- * PTR_ERR on failure.
- * Note that it also sets the core limit to 1. This
- * is a special value that we use to trap recursive
- * core dumps
- */
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+struct pipeprg_data {
+ char **argv;
+ struct coredump_params *cp;
+};
+
+static int fork_callback(void *data)
{
+ struct pipeprg_data *ppd = (struct pipeprg_data *)data;
struct file *files[2];
- struct coredump_params *cp = (struct coredump_params *)info->data;
- int err = create_pipe_files(files, 0);
- if (err)
- return err;
+ int ret;
+
+ /*
+ * Sets up a pipe and installs it as fd 0 (stdin)
+ * for the process.
+ */
+ ret = create_pipe_files(files, 0);
+ if (ret)
+ do_exit(0);

- cp->file = files[1];
+ ppd->cp->file = files[1];

- err = replace_fd(0, files[0], 0);
+ ret = replace_fd(0, files[0], 0);
fput(files[0]);
- /* and disallow core files too */
+ if (ret < 0)
+ do_exit(0);
+
+ /*
+ * Sets the core limit to 1. This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};

- return err;
+ set_fs(KERNEL_DS);
+ ret = do_execve(getname_kernel(ppd->argv[0]),
+ (const char __user *const __user *)ppd->argv,
+ (const char __user *const __user *)NULL);
+ if (ret) {
+ printk(KERN_WARNING "execute pipe program failed: %s ret=%d\n",
+ ppd->argv[0], ret);
+ do_exit(0);
+ }
+
+ return ret;
}

void do_coredump(const siginfo_t *siginfo)
@@ -586,7 +603,8 @@ void do_coredump(const siginfo_t *siginfo)
if (ispipe) {
int dump_count;
char **helper_argv;
- struct subprocess_info *sub_info;
+ struct pipeprg_data ppd;
+ pid_t pid;

if (ispipe < 0) {
printk(KERN_WARNING "format_corename failed\n");
@@ -633,19 +651,17 @@ void do_coredump(const siginfo_t *siginfo)
goto fail_dropcount;
}

- retval = -ENOMEM;
- sub_info = call_usermodehelper_setup(helper_argv[0],
- helper_argv, NULL, GFP_KERNEL,
- umh_pipe_setup, NULL, &cprm);
- if (sub_info)
- retval = call_usermodehelper_exec(sub_info,
- UMH_WAIT_EXEC);
+ ppd.argv = helper_argv;
+ ppd.cp = &cprm;

+ pid = user_thread(fork_callback, &ppd,
+ CLONE_VFORK | CLONE_UNTRACED);
argv_free(helper_argv);
- if (retval) {
+ if (pid < 0) {
printk(KERN_INFO "Core dump to |%s pipe failed\n",
cn.corename);
- goto close_fail;
+ retval = pid;
+ goto fail_dropcount;
}
} else {
struct inode *inode;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 56401e4..a1893f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2649,6 +2649,7 @@ extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *,
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
struct task_struct *fork_idle(int);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+extern pid_t user_thread(int (*fn)(void *), void *arg, unsigned long flags);

extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
static inline void set_task_comm(struct task_struct *tsk, const char *from)
diff --git a/kernel/fork.c b/kernel/fork.c
index 643a09b..71b3339 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1785,6 +1785,12 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
(unsigned long)arg, NULL, NULL, 0, 0);
}

+pid_t user_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ return _do_fork(flags, (unsigned long)fn,
+ (unsigned long)arg, NULL, NULL, 0, 1);
+}
+
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
--
1.8.5.1