[RFC] catching sys_reboot syscall

From: Daniel Lezcano
Date: Mon Aug 08 2011 - 17:14:15 EST



Hi all,

I am trying to solve one of the problems the linux container community
is facing :

How to catch when a process called the 'reboot' syscall in a container ?

In the case of a VPS, when we shutdown/halt/reboot the container, the
reboot utility will invoke the sys_reboot syscall which has the bad
effect to reboot the host. The way to fix that is to drop the
CAP_SYS_REBOOT capability in the container.

In this case, the container shutdowns correctly but, at the end, the
init process is waiting indefinitely and we have the containers stuck
with one process (the init process).

In order to fix that, we used a hypervisor process, parent of the
container's init process, watching for the container's utmp file and
detecting when the runlevel changes. When this runlevel change is
detected we wait for the container to have one process left and then we
kill the container's init.

That works well if we modify the distro configuration files, we make
/var/run to not be a tmpfs and we remove all the files inside this
directory when the container boots. *But* as soon as we upgrade the
container distro, all the tweaks are lost. So this method works but at
the cost of tweaking the containers configuration files again and again,
each time there is an update, which is not tolerable in a production
environment.

This problem is easy to solve with a small hack in the kernel:

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..61f3d02 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
#define CLD_TRAPPED (__SI_CHLD|4) /* traced child has trapped */
#define CLD_STOPPED (__SI_CHLD|5) /* child has stopped */
#define CLD_CONTINUED (__SI_CHLD|6) /* stopped child has continued */
-#define NSIGCHLD 6
+#define CLD_REBOOTED (__SI_CHLD|7) /* process was killed by a
reboot */
+#define NSIGCHLD 7

/*
* SIGPOLL si_codes
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 67f5688..41e0889 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2103,6 +2103,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int
priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
extern int kill_proc_info(int, struct siginfo *, pid_t);
extern int do_notify_parent(struct task_struct *, int);
+extern void do_notify_parent_cldreboot(struct task_struct *, int, char *);
extern void __wake_up_parent(struct task_struct *p, struct task_struct
*parent);
extern void force_sig(int, struct task_struct *);
extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..09fc254 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1522,6 +1522,46 @@ int do_notify_parent(struct task_struct *tsk, int
sig)
return ret;
}

+void do_notify_parent_cldreboot(struct task_struct *tsk, int why, char
*buffer)
+{
+ struct siginfo info = { };
+ struct task_struct *parent;
+ struct sighand_struct *sighand;
+ unsigned long flags;
+
+ if (task_ptrace(tsk))
+ parent = tsk->parent;
+ else {
+ tsk = tsk->group_leader;
+ parent = tsk->real_parent;
+ }
+
+ info.si_signo = SIGCHLD;
+ info.si_errno = 0;
+ info.si_status = why;
+
+ rcu_read_lock();
+ info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
+ info.si_uid = __task_cred(tsk)->uid;
+ rcu_read_unlock();
+
+ info.si_utime = cputime_to_clock_t(tsk->utime);
+ info.si_stime = cputime_to_clock_t(tsk->stime);
+
+ info.si_code = CLD_REBOOTED;
+
+ sighand = parent->sighand;
+ spin_lock_irqsave(&sighand->siglock, flags);
+ if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
+ sighand->action[SIGCHLD-1].sa.sa_flags & SA_CLDREBOOT)
+ __group_send_sig_info(SIGCHLD, &info, parent);
+ /*
+ * Even if SIGCHLD is not generated, we must wake up wait4 calls.
+ */
+ __wake_up_parent(tsk, parent);
+ spin_unlock_irqrestore(&sighand->siglock, flags);
+}
+
static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
{
struct siginfo info;
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..b50449c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -361,6 +361,13 @@ void kernel_power_off(void)
}
EXPORT_SYMBOL_GPL(kernel_power_off);

+static void pid_namespace_reboot(struct pid_namespace *pid_ns,
+ int cmd, char *buffer)
+{
+ struct task_struct *tsk = pid_ns->child_reaper;
+ do_notify_parent_cldreboot(tsk, cmd, buffer);
+}
+
static DEFINE_MUTEX(reboot_mutex);

/*
@@ -374,13 +381,10 @@ static DEFINE_MUTEX(reboot_mutex);
SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
void __user *, arg)
{
- char buffer[256];
+ struct pid_namespace *pid_ns = current->nsproxy->pid_ns;
+ char buffer[256] = { 0 };
int ret = 0;

- /* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT))
- return -EPERM;
-
/* For safety, we require "magic" arguments. */
if (magic1 != LINUX_REBOOT_MAGIC1 ||
(magic2 != LINUX_REBOOT_MAGIC2 &&
@@ -395,12 +399,45 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2,
unsigned int, cmd,
if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
cmd = LINUX_REBOOT_CMD_HALT;

+ /* check the cmd parameter */
+ if (cmd != LINUX_REBOOT_CMD_RESTART &&
+#ifdef CONFIG_KEXEC
+ cmd != LINUX_REBOOT_CMD_KEXEC &&
+#endif
+#ifdef CONFIG_HIBERNATION
+ cmd != LINUX_REBOOT_CMD_SW_SUSPEND &&
+#endif
+ cmd != LINUX_REBOOT_CMD_CAD_ON &&
+ cmd != LINUX_REBOOT_CMD_CAD_OFF &&
+ cmd != LINUX_REBOOT_CMD_HALT &&
+ cmd != LINUX_REBOOT_CMD_POWER_OFF &&
+ cmd != LINUX_REBOOT_CMD_RESTART2)
+ return -EINVAL;
+
+ if (cmd == LINUX_REBOOT_CMD_RESTART2)
+ if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0)
+ return -EFAULT;
+
+ /* If we are not in the initial pid namespace, we send a signal
+ * to the parent of this init pid namespace, notifying a shutdown
+ * occured */
+ if (pid_ns != &init_pid_ns)
+ pid_namespace_reboot(pid_ns, cmd, buffer);
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
mutex_lock(&reboot_mutex);
switch (cmd) {
case LINUX_REBOOT_CMD_RESTART:
kernel_restart(NULL);
break;

+ case LINUX_REBOOT_CMD_RESTART2:
+ kernel_restart(buffer);
+ break;
+
case LINUX_REBOOT_CMD_CAD_ON:
C_A_D = 1;
break;
@@ -419,16 +456,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2,
unsigned int, cmd,
do_exit(0);
break;

- case LINUX_REBOOT_CMD_RESTART2:
- if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
- ret = -EFAULT;
- break;
- }
- buffer[sizeof(buffer) - 1] = '\0';
-
- kernel_restart(buffer);
- break;
-
#ifdef CONFIG_KEXEC
case LINUX_REBOOT_CMD_KEXEC:
ret = kernel_kexec();
@@ -440,10 +467,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2,
unsigned int, cmd,
ret = hibernate();
break;
#endif
-
- default:
- ret = -EINVAL;
- break;
}
mutex_unlock(&reboot_mutex);
return ret;


With this patch, the container's init parent will receive a SIGCHLD,
with ssi_code set with the sys_reboot parameter value, when one of the
process in the container invoke sys_reboot. This solution works very
well and solves the problem but is it acceptable ? I don't see any
alternative usable for other use cases. It is probable nobody cares
about sys_reboot was called because there is nothing to do as the system
will halt a few microseconds after :)

Does anyone have an idea ?

Thanks in advance
-- Daniel


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/