[PATCH v3] kernel/signal: Signal-based pre-coredump notification

From: Enke Chen
Date: Wed Oct 24 2018 - 01:44:28 EST


For simplicity and consistency, this patch provides an implementation
for signal-based fault notification prior to the coredump of a child
process. A new prctl command, PR_SET_PREDUMP_SIG, is defined that can
be used by an application to express its interest and to specify the
signal for such a notification. A new signal code CLD_PREDUMP is also
defined for SIGCHLD.

Changes to prctl(2):

PR_SET_PREDUMP_SIG (since Linux 4.20.x)
Set the child pre-coredump signal of the calling process to
arg2 (either a signal value in the range 1..maxsig, or 0 to
clear). This is the signal that the calling process will get
prior to the coredump of a child process. This value is
cleared across execve(2), or for the child of a fork(2).

When SIGCHLD is specified, the signal code of such a SIGCHLD
signal will be set to CLD_PREDUMP.

PR_GET_PREDUMP_SIG (since Linux 4.20.x)
Return the current value of the child pre-coredump signal,
in the location pointed to by (int *) arg2.

Background:

As the coredump of a process may take time, in certain time-sensitive
applications it is necessary for a parent process (e.g., a process
manager) to be notified of a child's imminent death before the coredump
so that the parent process can act sooner, such as re-spawning an
application process, or initiating a control-plane fail-over.

Currently there are two ways for a parent process to be notified of a
child process's state change. One is to use a POSIX signal, and the
other is to use the kernel connector module. The specific events and
actions are summarized as follows:

Process Event    POSIX Signal                Connector-based
----------------------------------------------------------------------
ptrace_attach()  do_notify_parent_cldstop()  proc_ptrace_connector()
                 SIGCHLD / CLD_STOPPED

ptrace_detach()  do_notify_parent_cldstop()  proc_ptrace_connector()
                 SIGCHLD / CLD_CONTINUED

pre_coredump/    N/A                         proc_coredump_connector()
get_signal()

post_coredump/   do_notify_parent()          proc_exit_connector()
do_exit()        SIGCHLD / exit_signal
----------------------------------------------------------------------

As shown in the table, the signal-based pre-coredump notification is not
currently available. In some cases using a connector-based notification
can be quite complicated (e.g., when a process manager is written in shell
scripts and thus is subject to certain inherent limitations), and a
signal-based notification would be simpler and better suited.

Signed-off-by: Enke Chen <enkechen@xxxxxxxxx>
---
v2 -> v3:

Addressed review comments from Oleg Nesterov, including:
o remove the restriction on signal for PR_SET_PREDUMP_SIG.
o code simplification

arch/x86/kernel/signal_compat.c | 2 +-
fs/coredump.c | 6 +
fs/exec.c | 3 +
include/linux/sched/signal.h | 4 +
include/uapi/asm-generic/siginfo.h | 3 +-
include/uapi/linux/prctl.h | 4 +
kernel/fork.c | 3 +
kernel/signal.c | 31 +++++
kernel/sys.c | 13 ++
tools/testing/selftests/prctl/Makefile | 2 +-
tools/testing/selftests/prctl/predump-sig-test.c | 169 +++++++++++++++++++++++
11 files changed, 237 insertions(+), 3 deletions(-)
create mode 100644 tools/testing/selftests/prctl/predump-sig-test.c

diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 9ccbf05..a3deba8 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -30,7 +30,7 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(NSIGSEGV != 7);
BUILD_BUG_ON(NSIGBUS != 5);
BUILD_BUG_ON(NSIGTRAP != 5);
- BUILD_BUG_ON(NSIGCHLD != 6);
+ BUILD_BUG_ON(NSIGCHLD != 7);
BUILD_BUG_ON(NSIGSYS != 1);

/* This is part of the ABI and can never change in size: */
diff --git a/fs/coredump.c b/fs/coredump.c
index e42e17e..d6ca1a3 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -590,6 +590,12 @@ void do_coredump(const kernel_siginfo_t *siginfo)
if (retval < 0)
goto fail_creds;

+ /*
+ * Send the pre-coredump signal to the parent if requested.
+ */
+ do_notify_parent_predump();
+ cond_resched();
+
old_cred = override_creds(cred);

ispipe = format_corename(&cn, &cprm);
diff --git a/fs/exec.c b/fs/exec.c
index fc281b7..7714da7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1181,6 +1181,9 @@ static int de_thread(struct task_struct *tsk)
/* we have changed execution domain */
tsk->exit_signal = SIGCHLD;

+ /* Clear the pre-coredump signal before loading a new binary */
+ sig->predump_signal = 0;
+
#ifdef CONFIG_POSIX_TIMERS
exit_itimers(sig);
flush_itimer_signals();
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 13789d1..132ce08 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -112,6 +112,9 @@ struct signal_struct {
int group_stop_count;
unsigned int flags; /* see SIGNAL_* flags below */

+ /* The signal sent prior to a child's coredump */
+ int predump_signal;
+
/*
* PR_SET_CHILD_SUBREAPER marks a process, like a service
* manager, to re-parent orphan (double-forking) child processes
@@ -332,6 +335,7 @@ extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *,
extern int kill_pgrp(struct pid *pid, int sig, int priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
extern __must_check bool do_notify_parent(struct task_struct *, int);
+extern void do_notify_parent_predump(void);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int, struct task_struct *);
extern int send_sig(int, struct task_struct *, int);
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index cb3d6c2..1a47cef 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -267,7 +267,8 @@ struct { \
#define CLD_TRAPPED 4 /* traced child has trapped */
#define CLD_STOPPED 5 /* child has stopped */
#define CLD_CONTINUED 6 /* stopped child has continued */
-#define NSIGCHLD 6
+#define CLD_PREDUMP 7 /* child is about to dump core */
+#define NSIGCHLD 7

/*
* SIGPOLL (or any other signal without signal specific si_codes) si_codes
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index c0d7ea0..79f0a8a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -219,4 +219,8 @@ struct prctl_mm_map {
# define PR_SPEC_DISABLE (1UL << 2)
# define PR_SPEC_FORCE_DISABLE (1UL << 3)

+/* Signal to receive prior to a child's coredump (0 clears it) */
+#define PR_SET_PREDUMP_SIG 54
+#define PR_GET_PREDUMP_SIG 55
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cddff..8e30a00 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1553,6 +1553,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);

+ /* Clear the pre-coredump signal for the child */
+ sig->predump_signal = 0;
+
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;

diff --git a/kernel/signal.c b/kernel/signal.c
index 9a32bc2..904ad8a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1855,6 +1855,37 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
return autoreap;
}

+/*
+ * Send the parent its PR_SET_PREDUMP_SIG signal (if set) just before the
+ * current task dumps core; do_notify_parent() only reports afterwards.
+ * si_pid and si_uid are translated into the parent's pid/user namespaces.
+ */
+void do_notify_parent_predump(void)
+{
+	struct kernel_siginfo info;
+	struct task_struct *parent;
+	unsigned long flags;
+	int sig;
+
+	read_lock(&tasklist_lock);
+	parent = current->parent;
+	sig = parent->signal->predump_signal;
+	if (sig != 0) {
+		clear_siginfo(&info);
+		info.si_signo = sig;
+		info.si_code = (sig == SIGCHLD) ? CLD_PREDUMP : SI_USER;
+		rcu_read_lock();
+		info.si_pid = task_tgid_nr_ns(current,
+					      task_active_pid_ns(parent));
+		info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns),
+					       task_uid(current));
+		rcu_read_unlock();
+		spin_lock_irqsave(&parent->sighand->siglock, flags);
+		__group_send_sig_info(sig, &info, parent);
+		spin_unlock_irqrestore(&parent->sighand->siglock, flags);
+	}
+	read_unlock(&tasklist_lock);
+}
+
/**
* do_notify_parent_cldstop - notify parent of stopped/continued state change
* @tsk: task reporting the state change
diff --git a/kernel/sys.c b/kernel/sys.c
index 123bd73..39aa3b8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2476,6 +2476,19 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
return -EINVAL;
error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
break;
+	case PR_SET_PREDUMP_SIG:
+		/* Validate arg2 at full width: casting it to int first would */
+		/* silently accept values that have stray upper bits set. */
+		if (arg3 || arg4 || arg5 || !valid_signal(arg2))
+			return -EINVAL;
+		me->signal->predump_signal = (int)arg2;
+		break;
+	case PR_GET_PREDUMP_SIG:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = put_user(me->signal->predump_signal,
+				 (int __user *)arg2);
+		break;
default:
error = -EINVAL;
break;
diff --git a/tools/testing/selftests/prctl/Makefile b/tools/testing/selftests/prctl/Makefile
index c7923b2..f8d60d5 100644
--- a/tools/testing/selftests/prctl/Makefile
+++ b/tools/testing/selftests/prctl/Makefile
@@ -5,7 +5,7 @@ ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)

ifeq ($(ARCH),x86)
TEST_PROGS := disable-tsc-ctxt-sw-stress-test disable-tsc-on-off-stress-test \
- disable-tsc-test
+ disable-tsc-test predump-sig-test
all: $(TEST_PROGS)

include ../lib.mk
diff --git a/tools/testing/selftests/prctl/predump-sig-test.c b/tools/testing/selftests/prctl/predump-sig-test.c
new file mode 100644
index 0000000..1b93521
--- /dev/null
+++ b/tools/testing/selftests/prctl/predump-sig-test.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018, Enke Chen, Cisco Systems, Inc.
+ *
+ * Tests for prctl(PR_SET_PREDUMP_SIG, ...) / prctl(PR_GET_PREDUMP_SIG, ...)
+ *
+ * When set with prctl(), the specified signal is sent to the parent process
+ * prior to the coredump of a child process.
+ *
+ * Usage: ./predump-sig-test {SIGUSR1 | SIGCHLD}
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/prctl.h>
+#include <signal.h>
+#include <sys/signalfd.h>
+#include <errno.h>
+
+#ifndef PR_SET_PREDUMP_SIG
+#define PR_SET_PREDUMP_SIG 54
+#define PR_GET_PREDUMP_SIG 55
+#endif
+
+#ifndef CLD_PREDUMP
+#define CLD_PREDUMP 7 /* child is about to dump core */
+#endif
+
+#define handle_error(msg) \
+ do { perror(msg); exit(EXIT_FAILURE); } while (0)
+
+/* Set the predump signal to sig and read it back; 0 on a match, else -1. */
+static int test_prctl(int sig)
+{
+	int sig2, rc;
+
+	/* prctl() is variadic; the kernel reads unsigned long arguments */
+	rc = prctl(PR_SET_PREDUMP_SIG, (unsigned long)sig, 0UL, 0UL, 0UL);
+	if (rc < 0)
+		handle_error("prctl: setting");
+	rc = prctl(PR_GET_PREDUMP_SIG, &sig2, 0UL, 0UL, 0UL);
+	if (rc < 0)
+		handle_error("prctl: getting");
+	if (sig2 != sig) {
+		printf("prctl: sig %d, post %d\n", sig, sig2);
+		return -1;
+	}
+	return 0;
+}
+
+static int sigfd;
+static int predump_signal;
+
+/*
+ * Block SIGCHLD and the predump signal, and return a signalfd for them.
+ */
+static int init_signalfd(void)
+{
+ sigset_t mask;
+ int sfd;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGCHLD);
+ if (predump_signal && (predump_signal != SIGCHLD))
+ sigaddset(&mask, predump_signal);
+
+ /* Blocked so that the default dispositions do not apply. */
+ if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1)
+ handle_error("sigprocmask");
+
+ sfd = signalfd(-1, &mask, SFD_CLOEXEC);
+ if (sfd == -1)
+ handle_error("signalfd");
+
+ return sfd;
+}
+
+/* Parent: log signals until the child's exit SIGCHLD; exactly two must arrive. */
+static void parent_fn(pid_t child_pid)
+{
+	struct signalfd_siginfo si;
+	int count = 0;
+	ssize_t s;
+
+	for (;;) {
+		s = read(sigfd, &si, sizeof(struct signalfd_siginfo));
+		if (s != sizeof(struct signalfd_siginfo))
+			handle_error("read");
+		count++;
+		printf("\nReceived signal: ssi_pid %u, ssi_signo %u\n",
+		       si.ssi_pid, si.ssi_signo);
+		printf("siginfo: ssi_errno %d, ssi_code %d, ssi_status %d\n",
+		       si.ssi_errno, si.ssi_code, si.ssi_status);
+		if (si.ssi_signo == SIGCHLD) {
+			if (si.ssi_code == CLD_PREDUMP)
+				printf("predump signal\n");
+			else
+				break;
+		} else if (si.ssi_signo == predump_signal)
+			printf("predump signal\n");
+	}
+	printf("Test result: %s\n", (count == 2) ? "PASS" : "FAIL");
+	fflush(stdout);
+	if (count != 2)
+		exit(EXIT_FAILURE);
+}
+
+/* Child: check that no predump signal was inherited, then dump core. */
+static void child_fn(void)
+{
+	int rc, sig;
+
+	printf("\nChild pid: %ld\n", (long)getpid());
+	/* Test: Child should not inherit the predump_signal */
+	rc = prctl(PR_GET_PREDUMP_SIG, &sig, 0UL, 0UL, 0UL);
+	if (rc < 0)
+		handle_error("prctl: child");
+	printf("child: predump_signal %d %s\n",
+	       sig, (sig == 0) ? "PASS" : "FAIL");
+
+	/* Force coredump here */
+	printf("child: calling abort()\n");
+	fflush(stdout);
+	abort();
+}
+
+/* Usage: ./predump-sig-test {SIGUSR1 | SIGCHLD}; exits non-zero on failure. */
+int main(int argc, char *argv[])
+{
+	pid_t child_pid;
+	int rc, failed;
+
+	if (argc != 2) {
+		printf("invalid number of arguments\n");
+		exit(EXIT_FAILURE);
+	}
+	if (strcmp(argv[1], "SIGUSR1") == 0)
+		predump_signal = SIGUSR1;
+	else if (strcmp(argv[1], "SIGCHLD") == 0)
+		predump_signal = SIGCHLD;
+	else {
+		printf("invalid argument for signal\n");
+		fflush(stdout);
+		exit(EXIT_FAILURE);
+	}
+	/* Test: prctl() setting */
+	rc = test_prctl(0);
+	printf("prctl: sig %d %s\n", 0, (rc == 0) ? "PASS" : "FAIL");
+	failed = rc;
+	rc = test_prctl(predump_signal);
+	printf("prctl: sig %d %s\n",
+	       predump_signal, (rc == 0) ? "PASS" : "FAIL");
+	failed |= rc;
+	/* Init signalfd */
+	sigfd = init_signalfd();
+	/* Flush now, or the child would emit the buffered lines again */
+	fflush(stdout);
+	child_pid = fork();
+	if (child_pid == -1)
+		handle_error("fork");
+	if (child_pid == 0) { /* Code executed by child */
+		child_fn();
+	} else { /* Code executed by parent */
+		parent_fn(child_pid);
+		exit(failed ? EXIT_FAILURE : EXIT_SUCCESS);
+	}
+}
--
1.8.3.1