[patch 5/5] elf: Add support for loading ET_CKPT files
From: Cyrill Gorcunov
Date: Fri Oct 14 2011 - 07:05:38 EST
This patch adds the ability to run so-called "checkpoint" files by
extending the ELF file format with
- a new ELF file type, ET_CKPT
- three additional program header types: PT_CKPT_VMA, PT_CKPT_CORE
and PT_CKPT_PAGES.
PT_CKPT_VMA -- holds a 'vma_entry' structure, which describes the
memory area the kernel should map. It may also contain a file descriptor,
in which case the kernel maps the file provided. Usually such a file is
opened by a user-space helper tool which prepares the 'vma_entry' structure
for the kernel.
PT_CKPT_CORE -- holds a 'core_entry' structure (registers, TLS, task-specific
settings). The structure is defined as a 16K container, which should be
enough for most cases. 8K of it is reserved for arch-specific settings.
PT_CKPT_PAGES -- the set of all pages whose contents should be restored.
Apart from the ELF extension, flush_old_exec() has been split into two
functions -- the former flush_old_exec() and flush_exec_keep_thread().
The latter doesn't call de_thread(), which allows the thread
relationship to be kept. Also, an arch_setup_additional_pages_at() helper
is added to set up the vDSO at a predefined address.
At the moment only the pure x86-64 architecture is supported.
Signed-off-by: Cyrill Gorcunov <gorcunov@xxxxxxxxxx>
CC: Andrew Vagin <avagin@xxxxxxxxxxxxx>
CC: Pavel Emelyanov <xemul@xxxxxxxxxxxxx>
CC: James Bottomley <jbottomley@xxxxxxxxxxxxx>
CC: Glauber Costa <glommer@xxxxxxxxxxxxx>
CC: H. Peter Anvin <hpa@xxxxxxxxx>
CC: Ingo Molnar <mingo@xxxxxxx>
CC: Tejun Heo <tj@xxxxxxxxxx>
CC: Dave Hansen <dave@xxxxxxxxxxxxxxxxxx>
CC: Eric W. Biederman <ebiederm@xxxxxxxxxxxx>
CC: Daniel Lezcano <dlezcano@xxxxxxxxxx>
CC: Alexey Dobriyan <adobriyan@xxxxxxxxx>
---
arch/x86/include/asm/elf.h | 3
arch/x86/include/asm/elf_ckpt.h | 80 ++++++++
arch/x86/kernel/Makefile | 2
arch/x86/kernel/elf_ckpt.c | 161 ++++++++++++++++++
arch/x86/vdso/vma.c | 22 ++
fs/Kconfig.binfmt | 11 +
fs/Makefile | 1
fs/binfmt_elf.c | 17 +
fs/binfmt_elf_ckpt.c | 356 ++++++++++++++++++++++++++++++++++++++++
fs/exec.c | 27 +--
include/linux/binfmts.h | 1
include/linux/elf_ckpt.h | 103 +++++++++++
12 files changed, 772 insertions(+), 12 deletions(-)
Index: linux-2.6.git/arch/x86/include/asm/elf.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/elf.h
+++ linux-2.6.git/arch/x86/include/asm/elf.h
@@ -314,7 +314,8 @@ struct linux_binprm;
#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
extern int arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
-
+extern int arch_setup_additional_pages_at(struct linux_binprm *bprm,
+ void *addr, int uses_interp); /* installs the vDSO special mapping at the caller-supplied @addr */
extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
#define compat_arch_setup_additional_pages syscall32_setup_pages
Index: linux-2.6.git/arch/x86/include/asm/elf_ckpt.h
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/include/asm/elf_ckpt.h
@@ -0,0 +1,80 @@
+#ifndef _LINUX_ELF_X86_CHECKPOINT_H
+#define _LINUX_ELF_X86_CHECKPOINT_H
+
+#include <linux/errno.h>
+
+#include <asm/types.h>
+#include <asm/ptrace.h>
+
+#define CKPT_GDT_ENTRY_TLS_ENTRIES 3
+
+struct user_regs_entry { /* checkpoint image of the x86-64 general-purpose registers; load_elf_ckpt_arch() copies every field into a struct user_regs_struct */
+ __u64 r15;
+ __u64 r14;
+ __u64 r13;
+ __u64 r12;
+ __u64 bp;
+ __u64 bx;
+ __u64 r11;
+ __u64 r10;
+ __u64 r9;
+ __u64 r8;
+ __u64 ax;
+ __u64 cx;
+ __u64 dx;
+ __u64 si;
+ __u64 di;
+ __u64 orig_ax;
+ __u64 ip;
+ __u64 cs;
+ __u64 flags;
+ __u64 sp; /* also stored into thread->usersp on restore */
+ __u64 ss;
+ __u64 fs_base; /* when non-zero, applied with do_arch_prctl(ARCH_SET_FS) */
+ __u64 gs_base; /* when non-zero, applied with do_arch_prctl(ARCH_SET_GS) */
+ __u64 ds; /* ds/es/fs/gs are additionally copied into the thread_struct on restore */
+ __u64 es;
+ __u64 fs;
+ __u64 gs;
+} __packed;
+
+struct desc_struct_entry { /* descriptor image stored as two raw 32-bit words */
+ __u32 a; /* same a/b field names that load_elf_ckpt_arch() copies for each TLS slot */
+ __u32 b;
+} __packed;
+
+struct user_fpregs_entry { /* checkpoint image of the FPU state; copied into a struct user_i387_struct only when core_entry->task_flags has PF_USED_MATH */
+ __u16 cwd;
+ __u16 swd;
+ __u16 twd;
+ __u16 fop;
+ __u64 rip;
+ __u64 rdp;
+ __u32 mxcsr;
+ __u32 mxcsr_mask;
+ __u32 st_space[32]; /* copied element by element on restore */
+ __u32 xmm_space[64]; /* copied element by element on restore */
+ __u32 padding[24]; /* not copied by load_elf_ckpt_arch() */
+} __packed;
+
+/*
+ * x86 payload stored in core_entry::arch.
+ *
+ * Every member is one of the fixed-layout *_entry types declared above, so
+ * the image format does not depend on kernel-internal structure layouts.
+ * The TLS slots use struct desc_struct_entry (not struct desc_struct, which
+ * this header neither declares nor includes); load_elf_ckpt_arch() only
+ * reads the .a and .b words of each slot.
+ */
+struct ckpt_arch_entry {
+	struct user_regs_entry		gpregs;
+	struct user_fpregs_entry	fpregs;
+	struct desc_struct_entry	tls_array[CKPT_GDT_ENTRY_TLS_ENTRIES];
+};
+
+struct core_entry;
+
+#ifdef CONFIG_X86_64 /* the real implementation is in arch/x86/kernel/elf_ckpt.c */
+extern int load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs,
+ struct core_entry *core_entry);
+#else /* without CONFIG_X86_64 there is no arch restore code, so the stub rejects the image */
+static inline int
+load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs,
+ struct core_entry *core_entry)
+{
+ return -ENOEXEC;
+}
+#endif
+
+#endif /* _LINUX_ELF_X86_CHECKPOINT_H */
Index: linux-2.6.git/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/Makefile
+++ linux-2.6.git/arch/x86/kernel/Makefile
@@ -99,6 +99,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION)
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_OF) += devicetree.o
+obj-$(CONFIG_BINFMT_ELF_CKPT) += elf_ckpt.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
Index: linux-2.6.git/arch/x86/kernel/elf_ckpt.c
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/kernel/elf_ckpt.c
@@ -0,0 +1,161 @@
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/binfmts.h>
+#include <linux/string.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/personality.h>
+#include <linux/elfcore.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+#include <linux/random.h>
+#include <linux/elf.h>
+#include <linux/utsname.h>
+#include <linux/coredump.h>
+#include <linux/regset.h>
+
+#include <asm/uaccess.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/prctl.h>
+#include <asm/proto.h>
+#include <asm/i387.h>
+
+#include <linux/elf_ckpt.h>
+#include <linux/flex_array.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+#ifdef CONFIG_X86_64
+
+#define cp_reg(d, s, r) d.r = s.r
+
+/*
+ * load_elf_ckpt_arch - restore x86-64 register state from a checkpoint image
+ * @tsk:	task being restored (the body operates on current)
+ * @regs:	receives a copy of current's pt_regs once the general-purpose
+ *		registers have been installed
+ * @core_entry:	core entry read from the PT_CKPT_CORE segment; its arch[]
+ *		area is interpreted as a struct ckpt_arch_entry
+ *
+ * Installs the general-purpose registers via PTRACE_SETREGS, copies the
+ * segment selectors and TLS descriptors into the thread_struct, sets the
+ * FS/GS bases when they are non-zero and, if the checkpointed task had
+ * PF_USED_MATH set, installs the FPU state via PTRACE_SETFPREGS.
+ *
+ * Returns 0 on success, -ENOEXEC when the image header does not name the
+ * x86-64 architecture, or the error reported by arch_ptrace() or
+ * do_arch_prctl().
+ */
+int load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs,
+		       struct core_entry *core_entry)
+{
+	struct ckpt_arch_entry *arch = (struct ckpt_arch_entry *)core_entry->arch;
+	struct thread_struct *thread = &current->thread;
+
+	struct user_regs_struct gpregs;
+	struct user_i387_struct fpregs;
+
+	mm_segment_t old_fs;
+	int i, ret;
+
+	if (core_entry->header.arch != CKPT_HEADER_ARCH_X86_64) {
+		pr_err("elf-ckpt-x86: Unsupported or corrupted header\n");
+		return -ENOEXEC;
+	}
+
+	BUILD_BUG_ON(CKPT_GDT_ENTRY_TLS_ENTRIES != GDT_ENTRY_TLS_ENTRIES);
+	BUILD_BUG_ON(sizeof(struct ckpt_arch_entry) > CKPT_ARCH_SIZE);
+
+	memset(&gpregs, 0, sizeof(gpregs));
+	memset(&fpregs, 0, sizeof(fpregs));
+
+	/*
+	 * General purpose registers
+	 */
+	cp_reg(gpregs, arch->gpregs, r15);
+	cp_reg(gpregs, arch->gpregs, r14);
+	cp_reg(gpregs, arch->gpregs, r13);
+	cp_reg(gpregs, arch->gpregs, r12);
+	cp_reg(gpregs, arch->gpregs, bp);
+	cp_reg(gpregs, arch->gpregs, bx);
+	cp_reg(gpregs, arch->gpregs, r11);
+	cp_reg(gpregs, arch->gpregs, r10);
+	cp_reg(gpregs, arch->gpregs, r9);
+	cp_reg(gpregs, arch->gpregs, r8);
+	cp_reg(gpregs, arch->gpregs, ax);
+	cp_reg(gpregs, arch->gpregs, cx);
+	cp_reg(gpregs, arch->gpregs, dx);
+	cp_reg(gpregs, arch->gpregs, si);
+	cp_reg(gpregs, arch->gpregs, di);
+	cp_reg(gpregs, arch->gpregs, orig_ax);
+	cp_reg(gpregs, arch->gpregs, ip);
+	cp_reg(gpregs, arch->gpregs, cs);
+	cp_reg(gpregs, arch->gpregs, flags);
+	cp_reg(gpregs, arch->gpregs, sp);
+	cp_reg(gpregs, arch->gpregs, ss);
+	cp_reg(gpregs, arch->gpregs, fs_base);
+	cp_reg(gpregs, arch->gpregs, gs_base);
+	cp_reg(gpregs, arch->gpregs, ds);
+	cp_reg(gpregs, arch->gpregs, es);
+	cp_reg(gpregs, arch->gpregs, fs);
+	cp_reg(gpregs, arch->gpregs, gs);
+
+	/*
+	 * arch_ptrace() expects a user-space pointer; lift the address
+	 * limit so that it accepts our on-stack kernel buffer.
+	 */
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = arch_ptrace(current, PTRACE_SETREGS, 0, (unsigned long)&gpregs);
+	set_fs(old_fs);
+	if (ret)
+		goto out;
+
+	/* Hand the freshly installed register frame back to the caller */
+	*regs = *task_pt_regs(current);
+
+	thread->usersp	= arch->gpregs.sp;
+	thread->ds	= arch->gpregs.ds;
+	thread->es	= arch->gpregs.es;
+	thread->fs	= arch->gpregs.fs;
+	thread->gs	= arch->gpregs.gs;
+
+	thread->fsindex	= thread->fs;
+	thread->gsindex	= thread->gs;
+
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
+		thread->tls_array[i].a = arch->tls_array[i].a;
+		thread->tls_array[i].b = arch->tls_array[i].b;
+	}
+
+	if (arch->gpregs.fs_base) {
+		ret = do_arch_prctl(current, ARCH_SET_FS, arch->gpregs.fs_base);
+		if (ret)
+			goto out;
+	}
+
+	if (arch->gpregs.gs_base) {
+		ret = do_arch_prctl(current, ARCH_SET_GS, arch->gpregs.gs_base);
+		if (ret)
+			goto out;
+	}
+
+	/* Restoring FPU */
+	if (core_entry->task_flags & PF_USED_MATH) {
+
+		cp_reg(fpregs, arch->fpregs, cwd);
+		cp_reg(fpregs, arch->fpregs, swd);
+		cp_reg(fpregs, arch->fpregs, twd);
+		cp_reg(fpregs, arch->fpregs, fop);
+		cp_reg(fpregs, arch->fpregs, rip);
+		cp_reg(fpregs, arch->fpregs, rdp);
+		cp_reg(fpregs, arch->fpregs, mxcsr);
+		cp_reg(fpregs, arch->fpregs, mxcsr_mask);
+
+		for (i = 0; i < ARRAY_SIZE(arch->fpregs.st_space); i++)
+			cp_reg(fpregs, arch->fpregs, st_space[i]);
+
+		for (i = 0; i < ARRAY_SIZE(arch->fpregs.xmm_space); i++)
+			cp_reg(fpregs, arch->fpregs, xmm_space[i]);
+
+		/* Same address-limit trick as for the GP registers above */
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		ret = arch_ptrace(current, PTRACE_SETFPREGS, 0, (unsigned long)&fpregs);
+		set_fs(old_fs);
+		if (ret)
+			goto out;
+	}
+
+out:
+	return ret;
+}
+
+#endif /* CONFIG_X86_64 */
Index: linux-2.6.git/arch/x86/vdso/vma.c
===================================================================
--- linux-2.6.git.orig/arch/x86/vdso/vma.c
+++ linux-2.6.git/arch/x86/vdso/vma.c
@@ -137,6 +137,28 @@ up_fail:
return ret;
}
+/*
+ * Install the vDSO special mapping at the caller-supplied address @addr.
+ *
+ * Does nothing and returns 0 when the vDSO is disabled.  Otherwise the
+ * mapping is installed under mmap_sem held for writing; on failure
+ * mm->context.vdso is reset to NULL and the error from
+ * install_special_mapping() is returned.  @bprm and @uses_interp are not
+ * used by the body.
+ */
+int arch_setup_additional_pages_at(struct linux_binprm *bprm, void *addr, int uses_interp)
+{
+	const unsigned long vdso_vm_flags = VM_READ | VM_EXEC |
+					    VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC |
+					    VM_ALWAYSDUMP;
+	struct mm_struct *mm = current->mm;
+	int err;
+
+	if (!vdso_enabled)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+
+	/* Publish the address first; roll it back if the mapping fails */
+	mm->context.vdso = addr;
+	err = install_special_mapping(mm, (unsigned long)addr, vdso_size,
+				      vdso_vm_flags, vdso_pages);
+	if (err)
+		mm->context.vdso = NULL;
+
+	up_write(&mm->mmap_sem);
+
+	return err;
+}
+
static __init int vdso_setup(char *s)
{
vdso_enabled = simple_strtoul(s, NULL, 0);
Index: linux-2.6.git/fs/Kconfig.binfmt
===================================================================
--- linux-2.6.git.orig/fs/Kconfig.binfmt
+++ linux-2.6.git/fs/Kconfig.binfmt
@@ -23,6 +23,17 @@ config BINFMT_ELF
ld.so (check the file <file:Documentation/Changes> for location and
latest version).
+# Must be bool rather than tristate: fs/binfmt_elf.c calls load_elf_ckpt()
+# directly under "#ifdef CONFIG_BINFMT_ELF_CKPT", which is not defined for
+# =m builds, and include/linux/elf_ckpt.h would then supply a static inline
+# stub that clashes with the real definition in fs/binfmt_elf_ckpt.c.
+config BINFMT_ELF_CKPT
+	bool "Kernel support for CKPT ELF binaries"
+	default n
+	depends on BINFMT_ELF && X86_64
+	help
+	  ELF CKPT (checkpoint) is an extension to ELF format to restore
+	  checkpointed processes. It's not confirmed yet and highly
+	  experimental.
+
+	  If unsure, say N.
+
config COMPAT_BINFMT_ELF
bool
depends on COMPAT && BINFMT_ELF
Index: linux-2.6.git/fs/Makefile
===================================================================
--- linux-2.6.git.orig/fs/Makefile
+++ linux-2.6.git/fs/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_BINFMT_MISC) += binfmt_misc
obj-y += binfmt_script.o
obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
+obj-$(CONFIG_BINFMT_ELF_CKPT) += binfmt_elf_ckpt.o
obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
Index: linux-2.6.git/fs/binfmt_elf.c
===================================================================
--- linux-2.6.git.orig/fs/binfmt_elf.c
+++ linux-2.6.git/fs/binfmt_elf.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/random.h>
#include <linux/elf.h>
+#include <linux/elf_ckpt.h>
#include <linux/utsname.h>
#include <linux/coredump.h>
#include <asm/uaccess.h>
@@ -592,7 +593,11 @@ static int load_elf_binary(struct linux_
if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
goto out;
- if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
+ if (loc->elf_ex.e_type != ET_EXEC &&
+#ifdef CONFIG_BINFMT_ELF_CKPT
+ loc->elf_ex.e_type != ET_CKPT &&
+#endif
+ loc->elf_ex.e_type != ET_DYN)
goto out;
if (!elf_check_arch(&loc->elf_ex))
goto out;
@@ -619,6 +624,16 @@ static int load_elf_binary(struct linux_
goto out_free_ph;
}
+#ifdef CONFIG_BINFMT_ELF_CKPT
+ if (loc->elf_ex.e_type == ET_CKPT) {
+ retval = load_elf_ckpt(bprm, regs, &loc->elf_ex,
+ (struct elf_phdr *)elf_phdata);
+ if (!retval)
+ set_binfmt(&elf_format);
+ goto out_free_ph;
+ }
+#endif
+
elf_ppnt = elf_phdata;
elf_bss = 0;
elf_brk = 0;
Index: linux-2.6.git/fs/binfmt_elf_ckpt.c
===================================================================
--- /dev/null
+++ linux-2.6.git/fs/binfmt_elf_ckpt.c
@@ -0,0 +1,356 @@
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/binfmts.h>
+#include <linux/string.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/personality.h>
+#include <linux/elfcore.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+#include <linux/random.h>
+#include <linux/elf.h>
+#include <linux/utsname.h>
+#include <linux/coredump.h>
+#include <linux/regset.h>
+
+#include <asm/uaccess.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/prctl.h>
+#include <asm/proto.h>
+#include <asm/i387.h>
+
+#include <linux/elf_ckpt.h>
+#include <asm/elf_ckpt.h>
+
+#include <linux/flex_array.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+/*
+ * load_elf_ckpt - restore a task from an ET_CKPT ELF file
+ * @bprm:	binary being executed; the checkpoint data is read from bprm->file
+ * @regs:	register frame filled in by load_elf_ckpt_arch()
+ * @elf_ex:	ELF header of the checkpoint file
+ * @elf_phdr:	program header table (elf_ex->e_phnum entries)
+ *
+ * Steps:
+ *  1. locate and validate the PT_CKPT_CORE entry;
+ *  2. flush the old executable image without calling de_thread();
+ *  3. collect every PT_CKPT_VMA entry and remember the PT_CKPT_PAGES header;
+ *  4. map the VMAs marked VMA_AREA_REGULAR at their recorded addresses;
+ *  5. restore the mm layout fields, the task name and the vDSO;
+ *  6. copy the page contents from the PT_CKPT_PAGES segment;
+ *  7. restore the architecture-specific state.
+ *
+ * Returns 0 on success or a negative errno.  Once the old image has been
+ * flushed there is nothing to return to, so any later failure also sends
+ * SIGKILL to the current task.
+ */
+int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
+		  struct elfhdr *elf_ex, struct elf_phdr *elf_phdr)
+{
+	struct elf_phdr *elf_phdr_pages = NULL;
+	struct core_entry *core_entry = NULL;
+	struct flex_array *fa = NULL;
+	struct vma_entry *vma_entry_ptr;
+	struct vma_entry vma_entry;
+	struct file *file = NULL;
+	unsigned long start_stack = -1UL;
+	unsigned long map_addr;
+	int nr_vma_found = 0;
+	int i, ret = -ENOEXEC;
+	loff_t off;
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+	unsigned long vdso = -1UL;
+#endif
+
+	BUILD_BUG_ON(CKPT_TASK_COMM_LEN != TASK_COMM_LEN);
+	BUILD_BUG_ON(CKPT_PAGE_SIZE != PAGE_SIZE);
+	BUILD_BUG_ON(CKPT_CORE_SIZE != sizeof(*core_entry));
+
+	/*
+	 * An early check for header version so if we fail here
+	 * we would not need to use flex array at all.
+	 */
+	for (i = 0; i < elf_ex->e_phnum; i++) {
+		if (elf_phdr[i].p_type != PT_CKPT_CORE)
+			continue;
+
+		core_entry = vmalloc(sizeof(*core_entry));
+		if (!core_entry) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = kernel_read(bprm->file, elf_phdr[i].p_offset,
+				  (char *)core_entry, sizeof(*core_entry));
+		if (ret != sizeof(*core_entry)) {
+			pr_err("elf-ckpt: Can't read core_entry\n");
+			ret = -EIO;
+			goto out;
+		}
+
+		if (core_entry->header.version != CKPT_HEADER_VERSION) {
+			pr_err("elf-ckpt: Unsupported or corrupted header\n");
+			ret = -ENOEXEC;
+			goto out;
+		}
+
+		break;
+	}
+
+	if (i == elf_ex->e_phnum) {
+		pr_err("elf-ckpt: No header found\n");
+		ret = -ENOEXEC;
+		goto out;
+	}
+
+	/*
+	 * One slot per program header is an upper bound for the number of
+	 * VMA entries.  Bail out on *any* allocation failure: the "out"
+	 * path frees a partially set up array.
+	 */
+	fa = flex_array_alloc(sizeof(vma_entry), elf_ex->e_phnum, GFP_KERNEL);
+	if (!fa) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	if (flex_array_prealloc(fa, 0, elf_ex->e_phnum, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = flush_exec_keep_thread(bprm);
+	if (ret)
+		goto out;
+
+	/*
+	 * Point of no return: the previous image is gone, so every
+	 * failure below goes through out_unmap, which kills the task.
+	 */
+	current->flags &= ~PF_FORKNOEXEC;
+	current->mm->def_flags = 0;
+
+	/*
+	 * We don't care about parameters passed (such as argc, argv, env)
+	 * when execute checkpoint file because we're to substitute
+	 * all things anyway.  do_munmap() must run under mmap_sem held
+	 * for writing.
+	 */
+	down_write(&current->mm->mmap_sem);
+	do_munmap(current->mm, 0, TASK_SIZE);
+	up_write(&current->mm->mmap_sem);
+
+	SET_PERSONALITY(*elf_ex);
+
+	for (i = 0; i < elf_ex->e_phnum; i++) {
+
+		switch (elf_phdr[i].p_type) {
+		case PT_CKPT_VMA:
+			ret = kernel_read(bprm->file, elf_phdr[i].p_offset,
+					  (char *)&vma_entry, sizeof(vma_entry));
+			if (ret != sizeof(vma_entry)) {
+				pr_err("elf-ckpt: Can't read vma_entry\n");
+				ret = -EIO;
+				goto out_unmap;
+			}
+
+			/*
+			 * Store entries densely (slot == running count):
+			 * the loops below read them back as 0..nr_vma_found-1.
+			 */
+			if (flex_array_put(fa, nr_vma_found, &vma_entry, GFP_KERNEL)) {
+				ret = -ENOMEM;
+				goto out_unmap;
+			}
+
+			/*
+			 * We need to know if there is executable stack;
+			 * PROT_* bits live in ->prot (->flags holds MAP_*).
+			 */
+			if (vma_entry.status & VMA_AREA_STACK) {
+				if (vma_entry.prot & PROT_EXEC)
+					current->personality |= READ_IMPLIES_EXEC;
+			}
+
+			nr_vma_found++;
+			continue;
+		case PT_CKPT_PAGES:
+			elf_phdr_pages = &elf_phdr[i];
+			continue;
+		default:
+			continue;
+		}
+	}
+
+	/* Be sure it has the file structure we expected to see. */
+	if (!elf_phdr_pages || !nr_vma_found) {
+		ret = -ENOEXEC;
+		goto out_unmap;
+	}
+
+	/*
+	 * VMA randomization still needs to be set (just in case if
+	 * the program we restore will exec() something else later).
+	 */
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+		current->flags |= PF_RANDOMIZE;
+
+	/*
+	 * FIXME: Note it flushes signal handlers as well,
+	 * so we need to dump queued signals and restore
+	 * them here.
+	 */
+	setup_new_exec(bprm);
+
+	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
+
+	for (i = 0; i < nr_vma_found; i++) {
+		unsigned int fd = 0;
+
+		vma_entry_ptr = flex_array_get(fa, i);
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+		if (vma_entry_ptr->status & VMA_AREA_VDSO)
+			vdso = vma_entry_ptr->start;
+#endif
+
+		if (vma_entry_ptr->status & VMA_AREA_STACK) {
+			/* Note if stack is VM_GROWSUP -- it should be reversed */
+			start_stack = vma_entry_ptr->start;
+		}
+
+		/* Anything special should be ignored */
+		if (!(vma_entry_ptr->status & VMA_AREA_REGULAR))
+			continue;
+
+		/* It's a file mmap'ed */
+		if (vma_entry_ptr->fd != -1) {
+			/*
+			 * Keep the descriptor number in a local: it is
+			 * needed again to close the descriptor once the
+			 * mapping holds its own reference to the file.
+			 */
+			fd = (unsigned int)vma_entry_ptr->fd;
+			file = fget(fd);
+			if (!file) {
+				ret = -EBADF;
+				goto out_unmap;
+			}
+		} else
+			file = NULL;
+
+		down_write(&current->mm->mmap_sem);
+		map_addr = do_mmap(file,
+				   vma_entry_ptr->start,
+				   vma_entry_ptr->end - vma_entry_ptr->start,
+				   vma_entry_ptr->prot,
+				   vma_entry_ptr->flags | MAP_FIXED,
+				   vma_entry_ptr->pgoff);
+		up_write(&current->mm->mmap_sem);
+
+		if (file) {
+			fput(file);
+			do_close(fd);
+		}
+
+		if ((unsigned long)(map_addr) >= TASK_SIZE) {
+			ret = IS_ERR((void *)map_addr) ? PTR_ERR((void*)map_addr) : -EINVAL;
+			goto out_unmap;
+		}
+	}
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+	if (vdso == -1UL) {
+		pr_err("elf-ckpt: Can't find VDSO address\n");
+		ret = -ENOEXEC;
+		goto out_unmap;
+	}
+#endif
+
+	if (start_stack == -1UL) {
+		pr_err("elf-ckpt: Can't find stack VMA\n");
+		ret = -ENOEXEC;
+		goto out_unmap;
+	}
+
+	/*
+	 * The name it has before.  The bytes come straight from the file,
+	 * so make sure they are NUL-terminated before handing them over.
+	 */
+	core_entry->task_comm[CKPT_TASK_COMM_LEN - 1] = '\0';
+	set_task_comm(current, (char *)core_entry->task_comm);
+
+	bprm->p = core_entry->mm_start_stack;
+
+	current->mm->start_code		= core_entry->mm_start_code;
+	current->mm->end_code		= core_entry->mm_end_code;
+	current->mm->start_data		= core_entry->mm_start_data;
+	current->mm->end_data		= core_entry->mm_end_data;
+	current->mm->start_stack	= core_entry->mm_start_stack;
+	current->mm->start_brk		= core_entry->mm_start_brk;
+	current->mm->brk		= core_entry->mm_brk;
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+	ret = arch_setup_additional_pages_at(bprm, (void *)vdso, 0);
+	if (ret) {
+		pr_err("elf-ckpt: Can't setup additional pages at %lx with %d\n",
+		       vdso, ret);
+		goto out_unmap;
+	}
+#endif
+
+	/*
+	 * Restore pages.  The segment is a sequence of records, each an
+	 * 8-byte virtual address followed by PAGE_SIZE bytes of data; a
+	 * zero address terminates the sequence.
+	 */
+	off = elf_phdr_pages->p_offset;
+	while (1) {
+		struct vm_area_struct *vma;
+		struct page *page;
+		void *page_data;
+		__u64 va;
+
+		ret = kernel_read(bprm->file, off, (char *)&va, sizeof(va));
+		if (ret != sizeof(va)) {
+			pr_err("elf-ckpt: Can't read page virtual address: "
+			       "ret = %d off = %lx\n", ret, (unsigned long)off);
+			ret = -EIO;
+			goto out_unmap;
+		}
+
+		/* End of pages reached */
+		if (!va)
+			break;
+
+		/* VMA lookup and page pinning both require mmap_sem */
+		down_read(&current->mm->mmap_sem);
+
+		vma = find_vma(current->mm, (unsigned long)va);
+		if (!vma) {
+			up_read(&current->mm->mmap_sem);
+			pr_err("elf-ckpt: No VMA for page: %16lx\n", (unsigned long)va);
+			ret = -ESRCH;
+			goto out_unmap;
+		}
+
+		ret = get_user_pages(current, current->mm, (unsigned long)va,
+				     1, 1, 1, &page, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (ret != 1) {
+			pr_err("elf-ckpt: Can't get user page: %16lx\n", (unsigned long)va);
+			ret = -EFAULT;
+			goto out_unmap;
+		}
+
+		/* The page is pinned, so it can be filled without the lock */
+		page_data = kmap(page);
+		ret = kernel_read(bprm->file, off + sizeof(va), page_data, PAGE_SIZE);
+		kunmap(page);
+		put_page(page);
+
+		if (ret != PAGE_SIZE) {
+			pr_err("elf-ckpt: Can't read data on page: %16lx\n", (unsigned long)va);
+			ret = -EFAULT;
+			goto out_unmap;
+		}
+
+		off += sizeof(va) + PAGE_SIZE;
+	}
+
+	/*
+	 * Architecture specific setup for registers and friends.
+	 * It is done last: if an error happened earlier there is
+	 * not much point in setting up arch-specific things at all.
+	 */
+	ret = load_elf_ckpt_arch(current, regs, core_entry);
+	if (ret)
+		goto out_unmap;
+
+	/* We're done */
+	ret = 0;
+out:
+	if (core_entry)
+		vfree(core_entry);
+
+	if (fa)
+		flex_array_free(fa);
+	return ret;
+
+out_unmap:
+	/*
+	 * Drop whatever regular areas may have been mapped.  Only
+	 * VMA_AREA_REGULAR entries are ever mapped above, so walk all
+	 * collected entries and skip the rest; unmapping a range that
+	 * was never mapped is harmless.
+	 */
+	for (i = 0; i < nr_vma_found; i++) {
+		vma_entry_ptr = flex_array_get(fa, i);
+		if (!(vma_entry_ptr->status & VMA_AREA_REGULAR))
+			continue;
+
+		down_write(&current->mm->mmap_sem);
+		do_munmap(current->mm, vma_entry_ptr->start,
+			  vma_entry_ptr->end - vma_entry_ptr->start);
+		up_write(&current->mm->mmap_sem);
+	}
+
+	send_sig(SIGKILL, current, 0);
+	goto out;
+}
Index: linux-2.6.git/fs/exec.c
===================================================================
--- linux-2.6.git.orig/fs/exec.c
+++ linux-2.6.git/fs/exec.c
@@ -1071,18 +1071,10 @@ void set_task_comm(struct task_struct *t
perf_event_comm(tsk);
}
-int flush_old_exec(struct linux_binprm * bprm)
+int flush_exec_keep_thread(struct linux_binprm * bprm)
{
int retval;
- /*
- * Make sure we have a private signal table and that
- * we are unassociated from the previous thread group.
- */
- retval = de_thread(current);
- if (retval)
- goto out;
-
set_mm_exe_file(bprm->mm, bprm->file);
/*
@@ -1101,10 +1093,25 @@ int flush_old_exec(struct linux_binprm *
current->personality &= ~bprm->per_clear;
return 0;
-
out:
return retval;
}
+EXPORT_SYMBOL(flush_exec_keep_thread);
+
+/*
+ * Classic exec flush: first detach from the previous thread group
+ * (which also gives us a private signal table), then perform the
+ * thread-preserving part of the flush.
+ */
+int flush_old_exec(struct linux_binprm * bprm)
+{
+	int err = de_thread(current);
+
+	if (!err)
+		err = flush_exec_keep_thread(bprm);
+
+	return err;
+}
EXPORT_SYMBOL(flush_old_exec);
void would_dump(struct linux_binprm *bprm, struct file *file)
Index: linux-2.6.git/include/linux/binfmts.h
===================================================================
--- linux-2.6.git.orig/include/linux/binfmts.h
+++ linux-2.6.git/include/linux/binfmts.h
@@ -110,6 +110,7 @@ extern int prepare_binprm(struct linux_b
extern int __must_check remove_arg_zero(struct linux_binprm *);
extern int search_binary_handler(struct linux_binprm *, struct pt_regs *);
extern int flush_old_exec(struct linux_binprm * bprm);
+extern int flush_exec_keep_thread(struct linux_binprm * bprm);
extern void setup_new_exec(struct linux_binprm * bprm);
extern void would_dump(struct linux_binprm *, struct file *);
Index: linux-2.6.git/include/linux/elf_ckpt.h
===================================================================
--- /dev/null
+++ linux-2.6.git/include/linux/elf_ckpt.h
@@ -0,0 +1,103 @@
+#ifndef _LINUX_ELF_CHECKPOINT_H
+#define _LINUX_ELF_CHECKPOINT_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/elf-em.h>
+
+#include <asm/elf.h>
+#include <asm/elf_ckpt.h>
+
+/*
+ * Elf extension includes new Elf file type
+ * and program header types as well.
+ */
+#define ET_CKPT 5
+
+#define PT_CKPT_OFFSET 0x01010101
+
+#define PT_CKPT_VMA (PT_LOOS + PT_CKPT_OFFSET + 1)
+#define PT_CKPT_CORE (PT_LOOS + PT_CKPT_OFFSET + 2)
+#define PT_CKPT_PAGES (PT_LOOS + PT_CKPT_OFFSET + 3)
+
+#define CKPT_PAGE_SIZE 4096
+#define CKPT_TASK_COMM_LEN 16
+
+#define CKPT_HEADER_VERSION 1
+#define CKPT_HEADER_ARCH_X86_64 1
+
+#define VMA_AREA_REGULAR (1 << 0)
+#define VMA_AREA_STACK (1 << 1)
+#define VMA_AREA_VSYSCALL (1 << 2)
+#define VMA_AREA_VDSO (1 << 3)
+#define VMA_FORCE_READ (1 << 4)
+#define VMA_AREA_HEAP (1 << 5)
+#define VMA_FILE_PRIVATE (1 << 6)
+#define VMA_FILE_SHARED (1 << 7)
+#define VMA_ANON_SHARED (1 << 8)
+#define VMA_ANON_PRIVATE (1 << 9)
+#define VMA_FORCE_WRITE (1 << 10)
+
+struct vma_entry { /* payload of one PT_CKPT_VMA program header: a memory area for load_elf_ckpt() to recreate */
+ __u64 start; /* mapping address handed to do_mmap() */
+ __u64 end; /* mapping length is end - start */
+ __u64 pgoff; /* passed as the last (offset) argument of do_mmap() */
+ __u32 prot; /* passed as the prot argument of do_mmap() */
+ __u32 flags; /* passed as the flags argument of do_mmap(), OR-ed with MAP_FIXED */
+ __u32 status; /* from VMA_x above */
+ __u32 pid; /* pid VMA belongs to */
+ __s64 fd; /* -1 when there is no backing file; otherwise a descriptor the loader fget()s for the mapping */
+ __u64 ino; /* ino/dev_maj/dev_min are not read by the loader in this patch */
+ __u32 dev_maj;
+ __u32 dev_min;
+} __packed;
+
+struct page_entry { /* same layout as the records load_elf_ckpt() reads from the PT_CKPT_PAGES segment */
+ __u64 va; /* page virtual address */
+ __u8 data[CKPT_PAGE_SIZE]; /* page contents */
+} __packed;
+
+struct image_header { /* leading header of struct core_entry */
+ __u16 version; /* load_elf_ckpt() requires CKPT_HEADER_VERSION */
+ __u16 arch; /* the x86-64 loader requires CKPT_HEADER_ARCH_X86_64 */
+ __u32 flags; /* not examined by the loader code in this patch */
+} __packed;
+
+#define CKPT_ARCH_SIZE (2 * 4096)
+#define CKPT_CORE_SIZE (4 * 4096)
+
+struct core_entry { /* payload of the PT_CKPT_CORE program header; load_elf_ckpt() reads sizeof(struct core_entry) bytes in one go */
+ union {
+ struct {
+ struct image_header header;
+ __u8 arch[CKPT_ARCH_SIZE]; /* should be enough for all archs */
+ __u32 task_personality; /* not read by the loader code in this patch */
+ __u8 task_comm[CKPT_TASK_COMM_LEN]; /* task name, applied with set_task_comm() */
+ __u32 task_flags; /* PF_USED_MATH here decides whether FPU state is restored */
+ __u64 mm_start_code; /* the mm_* values are copied into the matching current->mm fields */
+ __u64 mm_end_code;
+ __u64 mm_start_data;
+ __u64 mm_end_data;
+ __u64 mm_start_stack; /* also assigned to bprm->p */
+ __u64 mm_start_brk;
+ __u64 mm_brk;
+ };
+ __u8 __core_pad[CKPT_CORE_SIZE]; /* the loader has a BUILD_BUG_ON that sizeof(struct core_entry) == CKPT_CORE_SIZE */
+ };
+} __packed;
+
+#ifdef CONFIG_BINFMT_ELF_CKPT /* the real loader is defined in fs/binfmt_elf_ckpt.c */
+extern int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
+ struct elfhdr *elf_ex, struct elf_phdr *elf_phdr);
+#else /* support not configured: the stub refuses every checkpoint file */
+static inline int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
+ struct elfhdr *elf_ex, struct elf_phdr *elf_phdr)
+{
+ return -ENOEXEC;
+}
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_ELF_CHECKPOINT_H */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/