[PATCH v21 024/100] c/r: x86-64: checkpoint/restart implementation

From: Oren Laadan
Date: Sat May 01 2010 - 10:33:46 EST


Support for checkpoint and restart for X86_64 architecture.
Partly based on Alexey's work.

Support for 32bit on 64bit and fixes from Serge Hallyn.

Checkpoint   Restart
(app/arch)   (app/arch/program*)
---------------------------------------
64/x86-64 -> 64/x86-64 works
32/x86-64 -> 32/x86-64 works
32/x86-64 -> 32/x86-32 ?
32/x86-32 -> 32/x86-64 ?

32/x86-64 -> 32/x86-32 ?
32/x86-32 -> 32/x86-64 ?

(*) "program" indicates the bit-ness of 'restart' executable.

Changelog[v21]:
- Do not include checkpoint_hdr.h explicitly
Changelog[v19-rc3]:
- Rebase to kernel 2.6.33
- [Serge Hallyn] Changes to fs/gs register handling
- [Serge Hallyn] Allow 32-bit restart of 64-bit and vice versa
- [Serge Hallyn] Only allow 'restart' with same bit-ness as image.

Cc: x86@xxxxxxxxxx
Signed-off-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx>
Signed-off-by: Serge Hallyn <serue@xxxxxxxxxx>
---
arch/x86/Kconfig | 2 +-
arch/x86/include/asm/checkpoint_hdr.h | 6 +
arch/x86/include/asm/unistd_64.h | 4 +
arch/x86/kernel/Makefile | 2 +
arch/x86/kernel/checkpoint.c | 16 +++
arch/x86/kernel/checkpoint_64.c | 240 +++++++++++++++++++++++++++++++++
arch/x86/kernel/entry_64.S | 7 +
include/linux/checkpoint_hdr.h | 2 +
8 files changed, 278 insertions(+), 1 deletions(-)
create mode 100644 arch/x86/kernel/checkpoint_64.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0874484..335a4b3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -95,7 +95,7 @@ config HAVE_LATENCYTOP_SUPPORT

config CHECKPOINT_SUPPORT
bool
- default y if X86_32
+ default y

config MMU
def_bool y
diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
index e6cfc99..6f600dd 100644
--- a/arch/x86/include/asm/checkpoint_hdr.h
+++ b/arch/x86/include/asm/checkpoint_hdr.h
@@ -36,6 +36,10 @@
#include <asm/processor.h>
#endif

+#ifdef CONFIG_X86_64
+#define CKPT_ARCH_ID CKPT_ARCH_X86_64
+#endif
+
#ifdef CONFIG_X86_32
#define CKPT_ARCH_ID CKPT_ARCH_X86_32
#endif
@@ -106,6 +110,8 @@ struct ckpt_hdr_cpu {
#define CKPT_X86_SEG_NULL 0
#define CKPT_X86_SEG_USER32_CS 1
#define CKPT_X86_SEG_USER32_DS 2
+#define CKPT_X86_SEG_USER64_CS 3
+#define CKPT_X86_SEG_USER64_DS 4
#define CKPT_X86_SEG_TLS 0x4000 /* 0100 0000 0000 00xx */
#define CKPT_X86_SEG_LDT 0x8000 /* 100x xxxx xxxx xxxx */

diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 1cd16af..2b162e1 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -665,6 +665,10 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
#define __NR_eclone 300
__SYSCALL(__NR_eclone, stub_eclone)
+#define __NR_checkpoint 301
+__SYSCALL(__NR_checkpoint, stub_checkpoint)
+#define __NR_restart 302
+__SYSCALL(__NR_restart, stub_restart)

#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2078d1b..916a7e1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -138,4 +138,6 @@ ifeq ($(CONFIG_X86_64),y)

obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
+
+ obj-$(CONFIG_CHECKPOINT) += checkpoint_64.o
endif
diff --git a/arch/x86/kernel/checkpoint.c b/arch/x86/kernel/checkpoint.c
index 015401a..3976318 100644
--- a/arch/x86/kernel/checkpoint.c
+++ b/arch/x86/kernel/checkpoint.c
@@ -249,6 +249,22 @@ int restore_thread(struct ckpt_ctx *ctx)
load_TLS(thread, cpu);
put_cpu();

+ {
+ int pre, post;
+ /*
+ * Eventually we'd like to support mixed-bit restart, but for
+ * now don't pretend to.
+ */
+ pre = test_thread_flag(TIF_IA32);
+ post = !!(h->thread_info_flags & _TIF_IA32);
+ if (pre != post) {
+ ret = -EINVAL;
+ ckpt_err(ctx, ret, "%d-bit restarting %d-bit\n",
+ 64 >> pre, 64 >> post);
+ goto out;
+ }
+ }
+
/* TODO: restore TIF flags as necessary (e.g. TIF_NOTSC) */

ret = 0;
diff --git a/arch/x86/kernel/checkpoint_64.c b/arch/x86/kernel/checkpoint_64.c
new file mode 100644
index 0000000..cac52f3
--- /dev/null
+++ b/arch/x86/kernel/checkpoint_64.c
@@ -0,0 +1,240 @@
+/*
+ * Checkpoint/restart - architecture specific support for x86_64
+ *
+ * Copyright (C) 2009 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG CKPT_DSYS
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/elf.h>
+
+#include <linux/checkpoint.h>
+
+/* helpers to encode/decode/validate segments */
+
+int check_segment(__u16 seg)
+{
+	/*
+	 * Accept only the fixed selectors, a TLS slot within the GDT's TLS
+	 * range, or an LDT index of at most 13 bits; return 1 if valid, else 0.
+	 */
+	switch (seg) {
+	case CKPT_X86_SEG_NULL:
+	case CKPT_X86_SEG_USER64_CS:
+	case CKPT_X86_SEG_USER64_DS:
+#ifdef CONFIG_COMPAT
+	case CKPT_X86_SEG_USER32_CS:
+	case CKPT_X86_SEG_USER32_DS:
+#endif
+		return 1;
+	}
+
+	/* the TLS tag is tested first; if it is set the LDT tag is not examined */
+	if (seg & CKPT_X86_SEG_TLS)
+		return (seg & ~CKPT_X86_SEG_TLS) <=
+		       GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN;
+	if (seg & CKPT_X86_SEG_LDT)
+		return (seg & ~CKPT_X86_SEG_LDT) <= 0x1fff;
+	return 0;
+}
+
+/* encode a live selector for the image; BUG()s on one it does not know */
+__u16 encode_segment(unsigned short seg)
+{
+	unsigned short idx = seg >> 3;	/* descriptor-table index */
+
+	if (seg == 0)
+		return CKPT_X86_SEG_NULL;
+	BUG_ON((seg & 3) != 3);		/* both RPL bits must be set */
+	if (seg == __USER_CS)
+		return CKPT_X86_SEG_USER64_CS;
+	if (seg == __USER_DS)
+		return CKPT_X86_SEG_USER64_DS;
+#ifdef CONFIG_COMPAT
+	if (seg == __USER32_CS)
+		return CKPT_X86_SEG_USER32_CS;
+	if (seg == __USER32_DS)
+		return CKPT_X86_SEG_USER32_DS;
+#endif
+	if (seg & 4)			/* TI bit: selector indexes the LDT */
+		return CKPT_X86_SEG_LDT | idx;
+	if (GDT_ENTRY_TLS_MIN <= idx && idx <= GDT_ENTRY_TLS_MAX)
+		return CKPT_X86_SEG_TLS | (idx - GDT_ENTRY_TLS_MIN);
+
+	/* log the selector as passed in, not the shifted index */
+	printk(KERN_ERR "c/r: (encode) bad segment %#hx\n", seg);
+	BUG();
+}
+
+/* inverse of encode_segment(): rebuild a selector from its image encoding */
+unsigned short decode_segment(__u16 seg)
+{
+	switch (seg) {
+	case CKPT_X86_SEG_NULL:
+		return 0;
+	case CKPT_X86_SEG_USER64_CS:
+		return __USER_CS;
+	case CKPT_X86_SEG_USER64_DS:
+		return __USER_DS;
+#ifdef CONFIG_COMPAT
+	case CKPT_X86_SEG_USER32_CS:
+		return __USER32_CS;
+	case CKPT_X86_SEG_USER32_DS:
+		return __USER32_DS;
+#endif
+	}
+
+	/* TLS slot: index is relative to GDT_ENTRY_TLS_MIN, low bits set to 3 */
+	if (seg & CKPT_X86_SEG_TLS)
+		return ((GDT_ENTRY_TLS_MIN + (seg & ~CKPT_X86_SEG_TLS)) << 3) | 3;
+	/* LDT entry: low bits set to 7 (bit 2 marks the LDT) */
+	if (seg & CKPT_X86_SEG_LDT)
+		return ((seg & ~CKPT_X86_SEG_LDT) << 3) | 7;
+	BUG();
+}
+
+void save_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+ struct pt_regs *regs = task_pt_regs(t);
+ unsigned long _ds, _es, _fs, _gs;
+ /* copy @t's pt_regs and its segment selectors/bases into @h */
+ h->r15 = regs->r15;
+ h->r14 = regs->r14;
+ h->r13 = regs->r13;
+ h->r12 = regs->r12;
+ h->r11 = regs->r11;
+ h->r10 = regs->r10;
+ h->r9 = regs->r9;
+ h->r8 = regs->r8;
+
+ h->bp = regs->bp;
+ h->bx = regs->bx;
+ h->ax = regs->ax;
+ h->cx = regs->cx;
+ h->dx = regs->dx;
+ h->si = regs->si;
+ h->di = regs->di;
+ h->orig_ax = regs->orig_ax;
+ h->ip = regs->ip;
+
+ h->flags = regs->flags;
+ h->sp = regs->sp;
+
+ /*
+ * CS/SS come from pt_regs. For checkpoint in process context (from
+ * within a container) DS, ES, FS, GS are read from the hardware;
+ * otherwise they are already saved on the thread structure
+ */
+
+ h->cs = encode_segment(regs->cs);
+ h->ss = encode_segment(regs->ss);
+
+ if (t == current) {
+ savesegment(ds, _ds);
+ savesegment(es, _es);
+ savesegment(fs, _fs);
+ savesegment(gs, _gs);
+ rdmsrl(MSR_FS_BASE, h->fs);
+ rdmsrl(MSR_KERNEL_GS_BASE, h->gs);
+ } else {
+ _ds = t->thread.ds;
+ _es = t->thread.es;
+ _fs = t->thread.fsindex;
+ _gs = t->thread.gsindex;
+ h->fs = t->thread.fs;
+ h->gs = t->thread.gs;
+ }
+ h->ds = encode_segment(_ds);
+ h->es = encode_segment(_es);
+ h->fsindex = encode_segment(_fs);
+ h->gsindex = encode_segment(_gs);
+
+ /* a non-zero selector gets a zero base; see comment in __switch_to() */
+ if (_fs)
+ h->fs = 0;
+ if (_gs)
+ h->gs = 0;
+ /*
+ * for checkpoint in process context (from within a container),
+ * the actual syscall is taking place at this very moment; so we
+ * (optimistically) substitute the future return value (0) of this
+ * syscall into ax, so that upon restart it will succeed (or it
+ * will endlessly retry checkpoint...).
+ * NOTE(review): if h->orig_ax is unsigned the BUG_ON never fires - confirm
+ */
+ if (t == current) {
+ BUG_ON(h->orig_ax < 0);
+ h->ax = 0;
+ }
+}
+
+int load_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+ struct thread_struct *thread = &t->thread;
+ struct pt_regs *regs = task_pt_regs(t);
+ /* restore @t's registers from @h; returns 0, or -EINVAL on a bad selector */
+ if (h->cs == CKPT_X86_SEG_NULL)
+ return -EINVAL;
+ if (!check_segment(h->cs) || !check_segment(h->ds) ||
+ !check_segment(h->es) || !check_segment(h->ss) ||
+ !check_segment(h->fsindex) || !check_segment(h->gsindex))
+ return -EINVAL;
+
+ regs->r15 = h->r15;
+ regs->r14 = h->r14;
+ regs->r13 = h->r13;
+ regs->r12 = h->r12;
+ regs->r11 = h->r11;
+ regs->r10 = h->r10;
+ regs->r9 = h->r9;
+ regs->r8 = h->r8;
+
+ regs->bp = h->bp;
+ regs->bx = h->bx;
+ regs->ax = h->ax;
+ regs->cx = h->cx;
+ regs->dx = h->dx;
+ regs->si = h->si;
+ regs->di = h->di;
+ regs->orig_ax = h->orig_ax;
+ regs->ip = h->ip;
+ /* NOTE(review): h->flags is saved at checkpoint but not restored here - confirm */
+ regs->sp = h->sp;
+ thread->usersp = h->sp;
+
+ preempt_disable();
+ /* every selector passed check_segment(), so decode_segment() cannot BUG() */
+ regs->cs = decode_segment(h->cs);
+ regs->ss = decode_segment(h->ss);
+ thread->ds = decode_segment(h->ds);
+ thread->es = decode_segment(h->es);
+ thread->fsindex = decode_segment(h->fsindex);
+ thread->gsindex = decode_segment(h->gsindex);
+
+ thread->fs = h->fs;
+ thread->gs = h->gs;
+ /* NOTE(review): loadsegment()/wrmsrl() hit this CPU; presumably t == current - confirm */
+ /* XXX - unsure if this is really needed ... */
+ loadsegment(fs, thread->fsindex);
+ if (thread->fs)
+ wrmsrl(MSR_FS_BASE, thread->fs);
+ load_gs_index(thread->gsindex);
+ /*
+ * when we switch to user-space, the MSR_KERNEL_GS_BASE
+ * will be moved back to MSR_GS_BASE.
+ * http://lists.openwall.net/linux-kernel/2008/11/18/340
+ */
+ if (thread->gs)
+ wrmsrl(MSR_KERNEL_GS_BASE, thread->gs);
+
+ preempt_enable();
+
+ return 0;
+}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 216681e..c2ece28 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -699,6 +699,13 @@ END(\label)
PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
PTREGSCALL stub_iopl, sys_iopl, %rsi
PTREGSCALL stub_eclone, sys_eclone, %r8
+#ifdef CONFIG_CHECKPOINT
+ PTREGSCALL stub_checkpoint, sys_checkpoint, %r8
+ PTREGSCALL stub_restart, sys_restart, %r8
+#else
+ PTREGSCALL stub_checkpoint, sys_ni_syscall, %r8
+ PTREGSCALL stub_restart, sys_ni_syscall, %r8
+#endif

ENTRY(ptregscall_common)
DEFAULT_FRAME 1 8 /* offset 8: return address */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index a0189de..ecc4afb 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -90,6 +90,8 @@ enum {
enum {
CKPT_ARCH_X86_32 = 1,
#define CKPT_ARCH_X86_32 CKPT_ARCH_X86_32
+ CKPT_ARCH_X86_64,
+#define CKPT_ARCH_X86_64 CKPT_ARCH_X86_64
};

/* kernel constants */
--
1.6.3.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/