[RFC] x86-64: Allow emulated vsyscalls from user addresses

From: Andy Lutomirski
Date: Fri Aug 05 2011 - 23:04:41 EST


A few dynamic recompilation tools are too clever for their own good.
They trace control flow through the vsyscall page and recompile that
code somewhere else. Then they expect it to work. DynamoRIO
(http://dynamorio.org/) and Pin (http://www.pintool.org/) are
affected. They crash when tracing programs that use vsyscalls.
Valgrind is smart enough not to cause problems. It crashes on the
getcpu vsyscall, but that has nothing to do with emulation.

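This patch makes each of the three vsyscall entries use a different
interrupt vector, so the kernel identifies the vsyscall by the vector
that trapped rather than by the address of the trapping instruction.
The entries can therefore still work when relocated. It assumes that
the code that relocates them is okay with the int instruction acting
like ret. DynamoRIO at least appears to work.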

We print an obnoxious (rate-limited) message to the log when a
vsyscall is emulated from an address outside the vsyscall page.
Hopefully it will inspire the JIT tools to learn not to trace into
kernel address space.

Signed-off-by: Andy Lutomirski <luto@xxxxxxx>
---

This uses vectors 0x40, 0x41, and 0x42 for now. Those byte values
are REX prefixes in 64-bit code, so jumping to the second byte of
one of these int instructions will decode as 'rex.? int3', which
will trap.

arch/x86/kernel/vsyscall_64.c | 75 ++++++++++--------------------------
arch/x86/kernel/vsyscall_emu_64.S | 6 +-
2 files changed, 24 insertions(+), 57 deletions(-)

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index f785f5b..a33ad02 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -105,22 +105,8 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
regs->sp, regs->ax, regs->si, regs->di);
}

-static int addr_to_vsyscall_nr(unsigned long addr)
-{
- int nr;
-
- if ((addr & ~0xC00UL) != VSYSCALL_START)
- return -EINVAL;
-
- nr = (addr & 0xC00UL) >> 10;
- if (nr >= 3)
- return -EINVAL;
-
- return nr;
-}
-
-void emulate_vsyscall(struct pt_regs *regs, int nr,
- long (*vsys)(struct pt_regs *))
+static void emulate_vsyscall(struct pt_regs *regs, int nr,
+ long (*vsys)(struct pt_regs *))
{
struct task_struct *tsk;
unsigned long caller;
@@ -128,6 +114,8 @@ void emulate_vsyscall(struct pt_regs *regs, int nr,

local_irq_enable();

+ trace_emulate_vsyscall(nr);
+
if (!user_64bit_mode(regs)) {
/*
* If we trapped from kernel mode, we might as well OOPS now
@@ -138,50 +126,29 @@ void emulate_vsyscall(struct pt_regs *regs, int nr,

/* Compat mode and non-compat 32-bit CS should both segfault. */
warn_bad_vsyscall(KERN_WARNING, regs,
- "illegal int 0xcc from 32-bit mode");
+ "illegal emulated vsyscall from 32-bit mode");
goto sigsegv;
}

- /*
- * x86-ism here: regs->ip points to the instruction after the int 0xcc,
- * and int 0xcc is two bytes long.
- */
- vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
-
- trace_emulate_vsyscall(vsyscall_nr);
-
- if (vsyscall_nr < 0) {
- warn_bad_vsyscall(KERN_WARNING, regs,
- "illegal int 0xcc (exploit attempt?)");
- goto sigsegv;
- }
+ tsk = current;
+ if (seccomp_mode(&tsk->seccomp))
+ do_exit(SIGKILL);

if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
goto sigsegv;
}

- tsk = current;
- if (seccomp_mode(&tsk->seccomp))
- do_exit(SIGKILL);
+ /*
+ * x86-ism here: regs->ip points to the instruction after the int
+ * instruction, and int with an immediate vector is two bytes long.
+ */
+ if (((regs->ip - 2) & ~0xfff) != VSYSCALL_START)
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "emulated vsyscall from bogus address -- "
+ "fix your code");

- switch (vsyscall_nr) {
- case 0:
- ret = sys_gettimeofday(
- (struct timeval __user *)regs->di,
- (struct timezone __user *)regs->si);
- break;
-
- case 1:
- ret = sys_time((time_t __user *)regs->di);
- break;
-
- case 2:
- ret = sys_getcpu((unsigned __user *)regs->di,
- (unsigned __user *)regs->si,
- 0);
- break;
- }
+ ret = vsys(regs);

if (ret == -EFAULT) {
/*
@@ -223,9 +190,9 @@ static long vsys_gettimeofday(struct pt_regs *regs)
(struct timezone __user *)regs->si);
}

-void dotraplinkage emulate_vsyscall0(struct pt_regs *regs, long error_code)
+void dotraplinkage do_emulate_vsyscall0(struct pt_regs *regs, long error_code)
{
- emulate_vsyscall(regs, vsys_gettimeofday);
+ emulate_vsyscall(regs, 0, vsys_gettimeofday);
}

static long vsys_time(struct pt_regs *regs)
@@ -233,7 +200,7 @@ static long vsys_time(struct pt_regs *regs)
return sys_time((time_t __user *)regs->di);
}

-void dotraplinkage emulate_vsyscall1(struct pt_regs *regs, long error_code)
+void dotraplinkage do_emulate_vsyscall1(struct pt_regs *regs, long error_code)
{
emulate_vsyscall(regs, 1, vsys_time);
}
@@ -245,7 +212,7 @@ static long vsys_getcpu(struct pt_regs *regs)
0);
}

-void dotraplinkage emulate_vsyscall2(struct pt_regs *regs, long error_code)
+void dotraplinkage do_emulate_vsyscall2(struct pt_regs *regs, long error_code)
{
emulate_vsyscall(regs, 2, vsys_getcpu);
}
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
index ffa845e..a4f02a3 100644
--- a/arch/x86/kernel/vsyscall_emu_64.S
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -13,15 +13,15 @@

.section .vsyscall_0, "a"
ENTRY(vsyscall_0)
- int $VSYSCALL_EMU_VECTOR
+ int $VSYSCALL0_EMU_VECTOR
END(vsyscall_0)

.section .vsyscall_1, "a"
ENTRY(vsyscall_1)
- int $VSYSCALL_EMU_VECTOR
+ int $VSYSCALL1_EMU_VECTOR
END(vsyscall_1)

.section .vsyscall_2, "a"
ENTRY(vsyscall_2)
- int $VSYSCALL_EMU_VECTOR
+ int $VSYSCALL2_EMU_VECTOR
END(vsyscall_2)
--
1.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/