Re: [RFC] catching sys_reboot syscall

From: Bruno PrÃmont
Date: Wed Aug 10 2011 - 16:17:32 EST


Hi Daniel,

[I'm adding containers ml as we had a discussion there some time ago
for this feature]

One comment below:

On Mon, 08 August 2011 Daniel Lezcano <daniel.lezcano@xxxxxxx> wrote:
> Hi all,
>
> I am trying to solve one of the problems the linux container community
> is facing :
>
> How to catch when a process called the 'reboot' syscall in a container ?
>
> In the case of a VPS, when we shutdown/halt/reboot the container, the
> reboot utility will invoke the sys_reboot syscall which has the bad
> effect to reboot the host. The way to fix that is to drop the
> CAP_SYS_REBOOT capability in the container.
>
> In this case, the container shutdowns correctly but, at the end, the
> init process is waiting indefinitely and we have the containers stuck
> with one process (the init process).
>
> In order to fix that, we used a hypervisor process, parent of the
> container's init process, watching for the container's utmp file and
> detecting when the runlevel changes. When this runlevel change is
> detected we wait for the container to have one process left and then we
> kill the container's init.
>
> That works well if we modify the distro configuration files, we make
> /var/run to not be a tmpfs and we remove all the files inside this
> directory when the container boots. *But* as soon as we upgrade the
> container distro, all the tweaks are lost. So this method works but at
> the cost of tweaking the containers configuration files again and again,
> each time there is an update, which is not tolerable in a production
> environment.
>
> This problem is easy to solve with a small hack in the kernel:
>
> diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
> index 942d30b..61f3d02 100644
> --- a/include/asm-generic/siginfo.h
> +++ b/include/asm-generic/siginfo.h
> @@ -218,7 +218,8 @@ typedef struct siginfo {
> #define CLD_TRAPPED (__SI_CHLD|4) /* traced child has trapped */
> #define CLD_STOPPED (__SI_CHLD|5) /* child has stopped */
> #define CLD_CONTINUED (__SI_CHLD|6) /* stopped child has continued */
> -#define NSIGCHLD 6
> +#define CLD_REBOOTED (__SI_CHLD|7) /* process was killed by a
> reboot */
> +#define NSIGCHLD 7
>
> /*
> * SIGPOLL si_codes
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 67f5688..41e0889 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2103,6 +2103,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int
> priv);
> extern int kill_pid(struct pid *pid, int sig, int priv);
> extern int kill_proc_info(int, struct siginfo *, pid_t);
> extern int do_notify_parent(struct task_struct *, int);
> +extern void do_notify_parent_cldreboot(struct task_struct *, int, char *);
> extern void __wake_up_parent(struct task_struct *p, struct task_struct
> *parent);
> extern void force_sig(int, struct task_struct *);
> extern int send_sig(int, struct task_struct *, int);
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 4e3cff1..09fc254 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -1522,6 +1522,46 @@ int do_notify_parent(struct task_struct *tsk, int
> sig)
> return ret;
> }
>
> +void do_notify_parent_cldreboot(struct task_struct *tsk, int why, char
> *buffer)
> +{
> + struct siginfo info = { };
> + struct task_struct *parent;
> + struct sighand_struct *sighand;
> + unsigned long flags;
> +
> + if (task_ptrace(tsk))
> + parent = tsk->parent;
> + else {
> + tsk = tsk->group_leader;
> + parent = tsk->real_parent;
> + }
> +
> + info.si_signo = SIGCHLD;
> + info.si_errno = 0;
> + info.si_status = why;
> +
> + rcu_read_lock();
> + info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
> + info.si_uid = __task_cred(tsk)->uid;
> + rcu_read_unlock();
> +
> + info.si_utime = cputime_to_clock_t(tsk->utime);
> + info.si_stime = cputime_to_clock_t(tsk->stime);
> +
> + info.si_code = CLD_REBOOTED;
> +
> + sighand = parent->sighand;
> + spin_lock_irqsave(&sighand->siglock, flags);
> + if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
> + sighand->action[SIGCHLD-1].sa.sa_flags & SA_CLDREBOOT)
> + __group_send_sig_info(SIGCHLD, &info, parent);
> + /*
> + * Even if SIGCHLD is not generated, we must wake up wait4 calls.
> + */
> + __wake_up_parent(tsk, parent);
> + spin_unlock_irqrestore(&sighand->siglock, flags);
> +}
> +
> static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
> {
> struct siginfo info;
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 18da702..b50449c 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -361,6 +361,13 @@ void kernel_power_off(void)
> }
> EXPORT_SYMBOL_GPL(kernel_power_off);
>
> +static void pid_namespace_reboot(struct pid_namespace *pid_ns,
> + int cmd, char *buffer)
> +{
> + struct task_struct *tsk = pid_ns->child_reaper;
> + do_notify_parent_cldreboot(tsk, cmd, buffer);
> +}
> +
> static DEFINE_MUTEX(reboot_mutex);
>
> /*
> @@ -374,13 +381,10 @@ static DEFINE_MUTEX(reboot_mutex);
> SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
> void __user *, arg)
> {
> - char buffer[256];
> + struct pid_namespace *pid_ns = current->nsproxy->pid_ns;
> + char buffer[256] = { 0 };
> int ret = 0;
>
> - /* We only trust the superuser with rebooting the system. */
> - if (!capable(CAP_SYS_BOOT))
> - return -EPERM;
> -
> /* For safety, we require "magic" arguments. */
> if (magic1 != LINUX_REBOOT_MAGIC1 ||
> (magic2 != LINUX_REBOOT_MAGIC2 &&
> @@ -395,12 +399,45 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2,
> unsigned int, cmd,
> if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
> cmd = LINUX_REBOOT_CMD_HALT;
>
> + /* check the cmd parameter */
> + if (cmd != LINUX_REBOOT_CMD_RESTART &&
> +#ifdef CONFIG_KEXEC
> + cmd != LINUX_REBOOT_CMD_KEXEC &&
> +#endif
> +#ifdef CONFIG_HIBERNATION
> + cmd != LINUX_REBOOT_CMD_SW_SUSPEND &&
> +#endif
> + cmd != LINUX_REBOOT_CMD_CAD_ON &&
> + cmd != LINUX_REBOOT_CMD_CAD_OFF &&
> + cmd != LINUX_REBOOT_CMD_HALT &&
> + cmd != LINUX_REBOOT_CMD_POWER_OFF &&
> + cmd != LINUX_REBOOT_CMD_RESTART2)
> + return -EINVAL;
> +
> + if (cmd == LINUX_REBOOT_CMD_RESTART2)
> + if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0)
> + return -EFAULT;
> +
> + /* If we are not in the initial pid namespace, we send a signal
> + * to the parent of this init pid namespace, notifying a shutdown
> + * occured */
> + if (pid_ns != &init_pid_ns)
> + pid_namespace_reboot(pid_ns, cmd, buffer);

Should there be a return here?
Or does pid_namespace_reboot() never return by submitting signal to
parent?

Thanks,
Bruno



> +
> + /* We only trust the superuser with rebooting the system. */
> + if (!capable(CAP_SYS_BOOT))
> + return -EPERM;
> +
> mutex_lock(&reboot_mutex);
> switch (cmd) {
> case LINUX_REBOOT_CMD_RESTART:
> kernel_restart(NULL);
> break;
>
> + case LINUX_REBOOT_CMD_RESTART2:
> + kernel_restart(buffer);
> + break;
> +
> case LINUX_REBOOT_CMD_CAD_ON:
> C_A_D = 1;
> break;
> @@ -419,16 +456,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2,
> unsigned int, cmd,
> do_exit(0);
> break;
>
> - case LINUX_REBOOT_CMD_RESTART2:
> - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
> - ret = -EFAULT;
> - break;
> - }
> - buffer[sizeof(buffer) - 1] = '\0';
> -
> - kernel_restart(buffer);
> - break;
> -
> #ifdef CONFIG_KEXEC
> case LINUX_REBOOT_CMD_KEXEC:
> ret = kernel_kexec();
> @@ -440,10 +467,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2,
> unsigned int, cmd,
> ret = hibernate();
> break;
> #endif
> -
> - default:
> - ret = -EINVAL;
> - break;
> }
> mutex_unlock(&reboot_mutex);
> return ret;
>
>
> With this patch, the container's init parent will receive a SIGCHLD,
> with ssi_code set with the sys_reboot parameter value, when one of the
> process in the container invoke sys_reboot. This solution works very
> well and solves the problem but is it acceptable ? I don't see any
> alternative usable for other use cases. It is probable nobody cares
> about sys_reboot was called because there is nothing to do as the system
> will halt a few microseconds after :)
>
> Does anyone have an idea ?
>
> Thanks in advance
> -- Daniel
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/