[RFC PATCH] x86/vdso/32: Add AT_SYSINFO cancellation helpers

From: Andy Lutomirski
Date: Tue Mar 08 2016 - 20:25:11 EST


musl implements system call cancellation in an unusual but clever way.
When a thread issues a cancellable syscall, musl issues the syscall
through a special thunk that looks roughly like this:

cancellable_syscall:
test whether a cancel is queued
jnz cancel_me
int $0x80
end_cancellable_syscall:

If a pthread cancellation signal hits with
cancellable_syscall <= EIP < end_cancellable_syscall, then the
signal interrupted a cancellation point before the syscall in
question started. If so, it rewrites the calling context to skip
the syscall and simulate a -EINTR return. The caller will detect
this simulated -EINTR or an actual -EINTR and handle a possible
cancellation event.

This technique doesn't work if int $0x80 is replaced by a call to
AT_SYSINFO: the signal handler can no longer tell whether it's
interrupting a call to AT_SYSINFO or, if it is, where AT_SYSINFO was
called from.

Add minimal helpers so that musl's signal handler can learn the
status of a possible pending AT_SYSINFO invocation and, if it hasn't
entered the kernel yet, abort it without needing to parse the vdso
DWARF unwind data.

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
---

musl people-

Does this solve your AT_SYSINFO cancellation problem? I'd like to
make sure it survives an actual implementation before I commit to the ABI.

x86 people-

Are you okay with this idea?


arch/x86/entry/vdso/Makefile | 3 +-
arch/x86/entry/vdso/vdso32/cancellation_helpers.c | 116 ++++++++++++++++++++++
arch/x86/entry/vdso/vdso32/vdso32.lds.S | 2 +
tools/testing/selftests/x86/unwind_vdso.c | 57 +++++++++--
4 files changed, 171 insertions(+), 7 deletions(-)
create mode 100644 arch/x86/entry/vdso/vdso32/cancellation_helpers.c

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index b88846471247..465052b49603 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -130,7 +130,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/

targets += vdso32/vdso32.lds
targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
-targets += vdso32/vclock_gettime.o
+targets += vdso32/vclock_gettime.o vdso32/cancellation_helpers.o

KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
@@ -150,6 +150,7 @@ $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(obj)/vdso32.so.dbg: FORCE \
$(obj)/vdso32/vdso32.lds \
$(obj)/vdso32/vclock_gettime.o \
+ $(obj)/vdso32/cancellation_helpers.o \
$(obj)/vdso32/note.o \
$(obj)/vdso32/system_call.o \
$(obj)/vdso32/sigreturn.o
diff --git a/arch/x86/entry/vdso/vdso32/cancellation_helpers.c b/arch/x86/entry/vdso/vdso32/cancellation_helpers.c
new file mode 100644
index 000000000000..3cb2e88baec6
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/cancellation_helpers.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016 Andrew Lutomirski
+ * Subject to the GNU Public License, v.2
+ *
+ * This provides helpers to enable libc implementations to cancel
+ * interrupted AT_SYSINFO invocations without needing to parse the
+ * DWARF unwinding instructions.
+ */
+
+#include <asm/signal.h>
+#include <asm/sigframe.h>
+
+extern char __kernel_vsyscall[] __attribute__((visibility("hidden")));
+extern char int80_landing_pad[] __attribute__((visibility("hidden")));
+
+static unsigned long *pending_syscall_retaddr_ptr(const void *context)
+{
+ const struct ucontext_ia32 *uc = context;
+ unsigned long ctx_eip = uc->uc_mcontext.ip;
+ unsigned long offset_into_vsyscall;
+ unsigned long *retaddr;
+
+ /*
+ * An AT_SYSINFO system call is pending if and only if we're in
+ * __kernel_vsyscall before int80_landing_pad. If we're at
+ * int80_landing_pad or beyond, we've finished the system call
+ * and are on our way out.
+ *
+ * If we're at int80_landing_pad-2, then either we're using the
+ * int $0x80 slow path because we have no fast system call
+ * support or we are restarting a fast system call. Either way,
+ * the system call is still pending.
+ */
+
+ if (ctx_eip < (unsigned long)__kernel_vsyscall ||
+ ctx_eip >= (unsigned long)int80_landing_pad)
+ return NULL;
+
+ /*
+ * The first three instructions of __kernel_vsyscall are one-byte
+ * pushes.
+ */
+ offset_into_vsyscall = (ctx_eip - (unsigned long)__kernel_vsyscall);
+ retaddr = (unsigned long *)uc->uc_mcontext.sp;
+ if (offset_into_vsyscall < 3)
+ retaddr += offset_into_vsyscall;
+ else
+ retaddr += 3;
+
+ /*
+ * GCC (correctly) fails to deduce that retaddr can't be NULL
+ * in the success path. Helping it out reduces code size.
+ */
+ if (!retaddr)
+ __builtin_unreachable();
+
+ return retaddr;
+}
+
+/*
+ * If context is a sigcontext for a pending AT_SYSINFO syscall, returns
+ * the return address of that syscall. Otherwise returns -1UL.
+ */
+unsigned long __vdso_pending_syscall_return_address(const void *context)
+{
+ unsigned long *retaddr = pending_syscall_retaddr_ptr(context);
+ return retaddr ? *retaddr : -1UL;
+}
+
+/*
+ * If context is a sigcontext for a pending AT_SYSINFO syscall, then
+ * this will pop off the call frame and point the context to
+ * AT_SYSINFO's return address. ESP will contain whatever value it had
+ * immediately prior to the call instruction (i.e. ESP acts as though
+ * the system call returned normally). EAX will be set to -EINTR. All
+ * other GPRs will be clobbered. __vdso_abort_pending_syscall will
+ * return 0.
+ *
+ * If context is a valid sigcontext that does not represent a pending
+ * AT_SYSINFO syscall, then __vdso_abort_pending_syscall returns
+ * -EINVAL.
+ *
+ * If context is not a valid sigcontext at all, behavior is undefined.
+ */
+long __vdso_abort_pending_syscall(void *context)
+{
+ struct ucontext_ia32 *uc = context;
+ unsigned long *retaddr = pending_syscall_retaddr_ptr(context);
+
+ if (!retaddr)
+ return -EINVAL;
+
+ uc->uc_mcontext.ip = *retaddr;
+ uc->uc_mcontext.sp = (unsigned long)(retaddr + 1);
+
+ /*
+ * Clobber GPRs -- we don't want to implement full unwinding, and we
+ * don't want userspace to start expecting anything about the final
+ * state of the GPRs.
+ *
+ * (There really are subtleties here. EAX can be clobbered by
+ * syscall restart, and register limitations mean that the
+ * saved context has at least one of the argument registers
+ * used for a different purpose by the calling sequence just
+ * prior to kernel entry. In the current implementation, that
+ * register is EBP, but it could change.)
+ */
+ uc->uc_mcontext.ax = -EINTR;
+ uc->uc_mcontext.bx = 0xFFFFFFFF;
+ uc->uc_mcontext.cx = 0xFFFFFFFF;
+ uc->uc_mcontext.dx = 0xFFFFFFFF;
+ uc->uc_mcontext.si = 0xFFFFFFFF;
+ uc->uc_mcontext.di = 0xFFFFFFFF;
+ uc->uc_mcontext.bp = 0xFFFFFFFF;
+ return 0;
+}
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 31056cf294bf..f04e8bd30755 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,6 +25,8 @@ VERSION
__vdso_clock_gettime;
__vdso_gettimeofday;
__vdso_time;
+ __vdso_pending_syscall_return_address;
+ __vdso_abort_pending_syscall;
};

LINUX_2.5 {
diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c
index 00a26a82fa98..7c649b4b6834 100644
--- a/tools/testing/selftests/x86/unwind_vdso.c
+++ b/tools/testing/selftests/x86/unwind_vdso.c
@@ -35,6 +35,7 @@ int main()
#include <syscall.h>
#include <unistd.h>
#include <string.h>
+#include <errno.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <signal.h>
@@ -88,8 +89,12 @@ static unsigned long sysinfo;
static bool got_sysinfo = false;
static unsigned long return_address;

+static unsigned long (*vdso_pending_syscall_return_address)(
+ const void *context);
+
struct unwind_state {
unsigned long ip; /* trap source */
+ unsigned long ax; /* ax at call site */
int depth; /* -1 until we hit the trap source */
};

@@ -115,7 +120,7 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque)
unsigned long ebp = _Unwind_GetGR(ctx, 5);
unsigned long esi = _Unwind_GetGR(ctx, 6);
unsigned long edi = _Unwind_GetGR(ctx, 7);
- bool ok = (eax == SYS_getpid || eax == getpid()) &&
+ bool ok = (eax == SYS_break || eax == -ENOSYS) &&
ebx == 1 && ecx == 2 && edx == 3 &&
esi == 4 && edi == 5 && ebp == 6;

@@ -125,6 +130,8 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque)
(ok ? "OK" : "FAIL"),
eax, ebx, ecx, edx, esi, edi, ebp);

+ state->ax = eax;
+
return _URC_NORMAL_STOP;
} else {
state->depth++;
@@ -137,6 +144,7 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
ucontext_t *ctx = (ucontext_t *)ctx_void;
struct unwind_state state;
unsigned long ip = ctx->uc_mcontext.gregs[REG_EIP];
+ unsigned long reported_return_address = 0;

if (!got_sysinfo && ip == sysinfo) {
got_sysinfo = true;
@@ -148,8 +156,15 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
ip, return_address);
}

- if (!got_sysinfo)
- return; /* Not there yet */
+ if (!got_sysinfo) {
+ if (vdso_pending_syscall_return_address &&
+ vdso_pending_syscall_return_address(ctx_void) != -1UL) {
+ printf("[FAIL]\t__vdso_pending_syscall_return_address incorrectly detected a pending syscall\n");
+ nerrs++;
+ }
+
+ return; /* We haven't started AT_SYSINFO yet */
+ }

if (ip == return_address) {
ctx->uc_mcontext.gregs[REG_EFL] &= ~X86_EFLAGS_TF;
@@ -157,11 +172,32 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
return;
}

- printf("\tSIGTRAP at 0x%lx\n", ip);
+ if (vdso_pending_syscall_return_address) {
+ reported_return_address =
+ vdso_pending_syscall_return_address(ctx_void);
+ if (reported_return_address != -1UL)
+ printf("\tSIGTRAP at 0x%lx, pending syscall will return to 0x%lx\n",
+ ip, reported_return_address);
+ else
+ printf("\tSIGTRAP at 0x%lx, no syscall pending\n", ip);
+ } else {
+ printf("\tSIGTRAP at 0x%lx\n", ip);
+ }

state.ip = ip;
state.depth = -1;
_Unwind_Backtrace(trace_fn, &state);
+
+ if (vdso_pending_syscall_return_address) {
+ unsigned long expected =
+ (state.ax == SYS_break ? return_address : -1UL);
+ if (reported_return_address != expected) {
+ printf("[FAIL]\t __vdso_pending_syscall_return_address returned 0x%lx; expected 0x%lx\n", reported_return_address, expected);
+ nerrs++;
+ } else {
+ printf("[OK]\t __vdso_pending_syscall_return_address returned the correct value\n");
+ }
+ }
}

int main()
@@ -177,12 +213,21 @@ int main()
info.dli_fname, info.dli_fbase);
}

+ void *vdso = dlopen("linux-gate.so.1", RTLD_NOW);
+ if (vdso)
+ vdso_pending_syscall_return_address = dlsym(vdso, "__vdso_pending_syscall_return_address");
+
sethandler(SIGTRAP, sigtrap, 0);

- syscall(SYS_getpid); /* Force symbol binding without TF set. */
+ syscall(SYS_break); /* Force symbol binding without TF set. */
printf("[RUN]\tSet TF and check a fast syscall\n");
set_eflags(get_eflags() | X86_EFLAGS_TF);
- syscall(SYS_getpid, 1, 2, 3, 4, 5, 6);
+
+ /*
+ * We need a harmless syscall that will never return its own syscall
+ * nr. SYS_break is not implemented and returns -ENOSYS.
+ */
+ syscall(SYS_break, 1, 2, 3, 4, 5, 6);
if (!got_sysinfo) {
set_eflags(get_eflags() & ~X86_EFLAGS_TF);

--
2.5.0