[PATCH i386] Live Patching Function on 2.6.11.7

From: Takashi Ikebe
Date: Sun Apr 17 2005 - 22:27:32 EST


The patch was over 50k, so I have split it by architecture and am sending each part inline.

This patch adds a function called "Live patching", which is defined in
OSDL's Carrier Grade Linux requirements definition, to the Linux 2.6.11.7 kernel.
Live patching allows a process to be patched on-line (without restarting
the process) on the i386 and x86_64 architectures, by overwriting the entry
point of each function you want to fix with jump code that leads to the
patched function.
Live patching is very common on high-availability systems
such as carrier systems, and this patch makes it available on Linux as well.
(Patch and process restart time is very critical on such high-availability
systems; live patching lets you apply a new patch while stopping the process
for a time on the order of milliseconds.)

The basic sequence is as follows:
1. The live patch command loads the patch modules into the target process's
memory area.
2. The live patch command resolves the patch symbols.
3. The live patch command overwrites the entry point of each function
you want to fix with jump code to the patch module's symbol.

Kernel patch and user mode tools are required, and both of them are
available at http://pannus.sourceforge.net
Please take a look and give us comments!

This patch adds the following system calls and functions.
o mmap3: maps patch to target process's memory area with security check.
o accesspvm: access(read/write) target process's memory area.
o init_pend: initialization of live patch sequence on target process.
o rt_handlereturn: run initialize root of each patch (same as signal
handler).
o check_init: check that the initialization is finished or not.
o munmap3: unmap patch from target process's memory area.


--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : ikebe.takashi@xxxxxxxxxxxxx

diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/Makefile linux-2.6.11.7-pannus-i386/arch/i386/kernel/Makefile
--- linux-2.6.11.7-vanilla/arch/i386/kernel/Makefile 2005-04-08 03:57:22.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/Makefile 2005-04-18 12:32:13.000000000 +0900
@@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.ld
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
- doublefault.o quirks.o
+ doublefault.o quirks.o accesspvm.o exechandle.o

obj-y += cpu/
obj-y += timers/
diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/accesspvm.c linux-2.6.11.7-pannus-i386/arch/i386/kernel/accesspvm.c
--- linux-2.6.11.7-vanilla/arch/i386/kernel/accesspvm.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/accesspvm.c 2005-04-18 12:32:13.000000000 +0900
@@ -0,0 +1,128 @@
+/*
+ * accesspvm.c
+ * Copyright (C) 2004 NTT Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Provide the system call to read/write the specific data in the user process.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*
+ * Provide the system call to read/write the specific data in the user process.
+ * param pid : process ID of the target (must be greater than 1)
+ * param addr : address of target's memory
+ * param datap : address of the user space memory
+ * param len : number of bytes to transfer (must be greater than 0)
+ * param flag : flag which specifies action(read:0, write:1)
+ * return : 0 on success, otherwise error code(no such process:ESRCH,
+ *          parameter or permission error:EPERM, no-memory error:ENOMEM, I/O error:EIO)
+ */
+asmlinkage int sys_accesspvm(long pid, unsigned long addr, long datap, int len, int flag)
+{
+	struct task_struct *tsk;
+	void __user *ubuf = (void __user *)datap;
+	void *kbuf;
+	int ret = -EPERM;	/* parameter and permission errors share this code */
+
+	/* Get the task_struct specified by pid and take a reference on it. */
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+	if (!tsk) {
+		ret = -ESRCH;
+		goto out;
+	}
+
+	/* You may not mess with kernel thread or init. */
+	if (pid <= 1)
+		goto out_tsk;
+
+	/*
+	 * The caller must match every uid and gid of the target or hold
+	 * CAP_SYS_PANNUS.  Leave through out_tsk (not a bare return) so the
+	 * reference taken above is dropped on this path too.
+	 */
+	if ((current->uid != tsk->euid ||
+	     current->uid != tsk->suid ||
+	     current->uid != tsk->uid ||
+	     current->gid != tsk->egid ||
+	     current->gid != tsk->sgid ||
+	     current->gid != tsk->gid) && !capable(CAP_SYS_PANNUS))
+		goto out_tsk;
+
+	/*
+	 * Reject a bad length or direction before allocating: a negative
+	 * len would otherwise be converted to a huge vmalloc() size.
+	 */
+	if (len <= 0 || (flag != 0 && flag != 1))
+		goto out_tsk;
+
+	/* Allocate the temporary buffer used to carry the data across. */
+	kbuf = vmalloc(len);
+	if (!kbuf) {
+		printk("accesspvm: Cannot allocate by vmalloc\n");
+		ret = -ENOMEM;
+		goto out_tsk;
+	}
+
+	/* Every failure from here on is reported as an I/O error. */
+	ret = -EIO;
+	if (flag == 0) {
+		/* Read the data in the specified task ... */
+		if (access_process_vm(tsk, addr, kbuf, len, 0) != len)
+			goto out_free;
+
+		/* ... and copy kernel space data to user space. */
+		if (copy_to_user(ubuf, kbuf, len)) {
+			printk("accesspvm: Copy_to_user error\n");
+			goto out_free;
+		}
+	} else {
+		/* Copy user space data to kernel space ... */
+		if (copy_from_user(kbuf, ubuf, len)) {
+			printk("accesspvm: Copy_from_user error\n");
+			goto out_free;
+		}
+
+		/* ... and change the data of specified task. */
+		if (access_process_vm(tsk, addr, kbuf, len, 1) != len)
+			goto out_free;
+	}
+
+	ret = 0;
+
+	/* Single exit path: free the buffer, then release the task_struct. */
+out_free:
+	vfree(kbuf);
+out_tsk:
+	put_task_struct(tsk);
+out:
+	return ret;
+}
diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/asm-offsets.c linux-2.6.11.7-pannus-i386/arch/i386/kernel/asm-offsets.c
--- linux-2.6.11.7-vanilla/arch/i386/kernel/asm-offsets.c 2005-04-08 03:57:30.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/asm-offsets.c 2005-04-18 12:32:13.000000000 +0900
@@ -52,6 +52,7 @@ void foo(void)
OFFSET(TI_preempt_count, thread_info, preempt_count);
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block);
+ OFFSET(TI_inipending, thread_info, inipending);
BLANK();

OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/entry.S linux-2.6.11.7-pannus-i386/arch/i386/kernel/entry.S
--- linux-2.6.11.7-vanilla/arch/i386/kernel/entry.S 2005-04-08 03:57:26.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/entry.S 2005-04-18 12:32:13.000000000 +0900
@@ -172,8 +172,15 @@ ENTRY(resume_userspace)
andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
# int/exception return?
jne work_pending
+ cmpl $0,TI_inipending(%ebp) #for live patching fook.
+ jne resume_init
jmp restore_all

+resume_init:
+ movl $0,TI_inipending(%ebp)
+ call do_init
+ jmp resume_userspace
+
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
cli
@@ -263,6 +270,9 @@ restore_all:
# perform work that needs to be done immediately before resumption
ALIGN
work_pending:
+ cmpl $0,TI_inipending(%ebp)
+ jne work_init
+work_pending2:
testb $_TIF_NEED_RESCHED, %cl
jz work_notifysig
work_resched:
@@ -297,6 +307,29 @@ work_notifysig_v86:
call do_notify_resume
jmp restore_all

+ # perform live patching
+ ALIGN
+work_init:
+ testl $VM_MASK, EFLAGS(%esp)
+ movl %esp, %eax
+ jne work_init_v86
+
+ movl $0,TI_inipending(%ebp)
+ xorl %edx, %edx
+ call do_init
+ jmp work_pending2
+
+ ALIGN
+work_init_v86:
+ movl $0,TI_inipending(%ebp)
+ pushl %ecx # save ti_flags for do_notify_resume
+ call save_v86_state # %eax contains pt_regs pointer
+ popl %ecx
+ movl %eax, %esp
+ xorl %edx, %edx
+ call do_init_v86
+ jmp work_pending2
+
# perform syscall exit tracing
ALIGN
syscall_trace_entry:
@@ -862,5 +895,11 @@ ENTRY(sys_call_table)
.long sys_add_key
.long sys_request_key
.long sys_keyctl
+ .long sys_mmap3
+ .long sys_accesspvm /* 290 */
+ .long sys_init_pend
+ .long sys_rt_handlereturn
+ .long sys_check_init
+ .long sys_munmap3

syscall_table_size=(.-sys_call_table)
diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/exechandle.c linux-2.6.11.7-pannus-i386/arch/i386/kernel/exechandle.c
--- linux-2.6.11.7-vanilla/arch/i386/kernel/exechandle.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/exechandle.c 2005-04-18 12:32:13.000000000 +0900
@@ -0,0 +1,611 @@
+/*
+ * exechandle.c
+ * Copyright (C) 2004-2005 NTT Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Initialization module.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/ptrace.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/tty.h>
+#include <linux/personality.h>
+#include <linux/compiler.h>
+#include <linux/binfmts.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include <asm/exechandle.h>
+
+//#define DEBUG_INI 1
+#define round_down(x,y) ((x) & ~((y)-1))
+
+void init_fault(struct pt_regs *regs, void *frame, struct task_struct *me, char *where);
+
+/*
+ * Initialization frame.
+ * Built on the user stack by setup_init_frame() before the init handler runs.
+ */
+struct rt_initframe
+{
+ char *pretcode; /* Return address after _init (setup_init_frame stores ka->ia.restorer here) */
+ struct ucontext uc; /* user mode context before execute _init */
+ struct siginfo info; /* not filled in by this file; only its address is passed on, in edx */
+};
+
+/*
+ * Restore the register context that was saved before the init handler ran.
+ * param:regs register struct to be filled in
+ * param:sc user-space sigcontext holding the saved context
+ * param:peax where the saved eax value is stored
+ * return:OR of all __get_user results (0 when every read succeeded)
+ */
+
+static int
+restore_initcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
+{
+ unsigned int err = 0;
+
+ /* Each of the four copy macros below ORs its __get_user result into err. */
+#define COPY(x) err |= __get_user(regs->x, &sc->x)
+
+#define COPY_SEG(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->x##seg = tmp; }
+ /* Like COPY_SEG, but ORs 3 into the value before storing it in regs. */
+#define COPY_SEG_STRICT(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->x##seg = tmp|3; }
+ /* Passes the saved value to loadsegment() instead of storing it in regs. */
+#define GET_SEG(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ loadsegment(seg,tmp); }
+ /* Only these eflags bits are taken from the saved context. */
+#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | X86_EFLAGS_DF | \
+ X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
+ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
+
+ GET_SEG(gs);
+ GET_SEG(fs);
+ COPY_SEG(es);
+ COPY_SEG(ds);
+ COPY(edi);
+ COPY(esi);
+ COPY(ebp);
+ COPY(esp);
+ COPY(ebx);
+ COPY(edx);
+ COPY(ecx);
+ COPY(eip);
+ COPY_SEG_STRICT(cs);
+ COPY_SEG_STRICT(ss);
+
+ {
+ unsigned int tmpflags;
+ err |= __get_user(tmpflags, &sc->eflags);
+ regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->orig_eax = -1; /* disable syscall checks */
+ }
+
+ {
+ struct _fpstate __user * buf;
+ err |= __get_user(buf, &sc->fpstate);
+ /* NOTE(review): buf is read but never used in this function. */
+ }
+
+ err |= __get_user(*peax, &sc->eax);
+ return err;
+
+}
+/* Restore the context saved in the rt_initframe on the user stack and set
+ * inifinish to 0 on success or to -1 (plus a log line) on a bad frame. */
+asmlinkage long sys_rt_handlereturn(unsigned long __unused)
+{
+ struct pt_regs *regs = (struct pt_regs *) &__unused;
+ struct rt_initframe *frame = (struct rt_initframe *)(regs->esp - 4);
+ stack_t st;
+ int eax;
+ struct task_struct *me = current;
+
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn:01\n");
+
+ printk("frame address = %p\n",frame);
+ printk("esp: %lx\n",regs->esp);
+ printk("eip: %lx\n",regs->eip);
+ printk("edx: %lx\n",regs->edx);
+ printk("esi: %lx\n",regs->esi);
+#endif
+ /* Check that the whole frame passes verify_area() for reading */
+ if (verify_area(VERIFY_READ, frame, sizeof(*frame))) {
+ goto badframe;
+ }
+
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn:02\n");
+#endif
+ /* Restore hardware context */
+ if (restore_initcontext(regs, &frame->uc.uc_mcontext, &eax)) {
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn/restore_initcontext:01\n");
+#endif
+ goto badframe;
+ }
+
+#if DEBUG_INI
+ printk("%d sigreturn rip:%lx rsp:%lx frame:%p eax:%d\n",current->pid,regs->eip,regs->esp,frame,eax);
+#endif
+ /* Copy the saved uc_stack; NOTE(review): st is not used afterwards */
+ if (__copy_from_user(&st, &frame->uc.uc_stack, sizeof(st))) {
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn/copy_from_user:01\n");
+#endif
+ goto badframe;
+ }
+
+ /* Clear initialization flag (sys_check_init returns 0 when it reads 0) */
+ me->thread_info->inifinish=0;
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn:03\n");
+ printk("me->thread_info->inifinish = 0\n");
+#endif
+ return eax; /* the eax value read back from the saved context */
+
+ badframe:
+ me->thread_info->inifinish=-1;
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn/badframe\n");
+ printk("me->thread_info->inifinish = -1\n");
+#endif
+ init_fault(regs,frame,me,"handlereturn");
+ return 0;
+}
+
+/*
+ * Save the current hardware context into a sigcontext before initialization.
+ * param:sc sigcontext to fill in (written with __put_user)
+ * param:regs register struct to save
+ * param:mask value stored in sc->oldmask
+ * param:me task whose thread.trap_no and thread.error_code are saved
+ * return:OR of all __put_user results (0 when every write succeeded)
+ */
+
+static inline int
+setup_initcontext(struct sigcontext *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+{
+ int tmp, err = 0;
+ /* gs and fs are read from the live segment registers, the rest from regs. */
+ tmp = 0;
+ __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
+ err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+ __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
+ err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
+
+ err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
+ err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
+ err |= __put_user(regs->edi, &sc->edi);
+ err |= __put_user(regs->esi, &sc->esi);
+ err |= __put_user(regs->ebp, &sc->ebp);
+ err |= __put_user(regs->esp, &sc->esp);
+ err |= __put_user(regs->ebx, &sc->ebx);
+ err |= __put_user(regs->edx, &sc->edx);
+ err |= __put_user(regs->ecx, &sc->ecx);
+ err |= __put_user(regs->eax, &sc->eax);
+ err |= __put_user(me->thread.trap_no, &sc->trapno);
+ err |= __put_user(me->thread.error_code, &sc->err);
+ err |= __put_user(regs->eip, &sc->eip);
+ err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
+ err |= __put_user(regs->eflags, &sc->eflags);
+ err |= __put_user(regs->esp, &sc->esp_at_signal);
+ err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
+
+ /* non-iBCS2 extensions.. */
+ err |= __put_user(mask, &sc->oldmask);
+ err |= __put_user(current->thread.cr2, &sc->cr2);
+
+ return err;
+}
+
+
+/*
+ * Work out where a new frame goes on the user stack.
+ * param:regs register struct (regs->esp is the starting point)
+ * param:frame_size number of bytes to leave room for
+ * return:(esp - frame_size) rounded down to a multiple of 8
+ */
+static inline void __user *
+get_initframe(struct pt_regs * regs, size_t frame_size)
+{
+	/* Default to using normal stack */
+	unsigned long frame_addr = (unsigned long)regs->esp - frame_size;
+
+	/* Clearing the low three bits is the same as masking with -8ul. */
+	frame_addr &= ~7ul;
+	return (void __user *)frame_addr;
+}
+
+
+/*
+ * Set initialization frame and register.
+ * param:ka information for initialization (handler and restorer addresses)
+ * param:regs register struct
+ * param:set signal set (only set->sig[0] is saved, as oldmask)
+ * param:me current task struct
+ * return:none (on failure inifinish is set to -1 and the fault is logged)
+ */
+static void setup_init_frame(struct k_initaction *ka, struct pt_regs * regs,
+ sigset_t *set, struct task_struct *me)
+{
+ struct rt_initframe __user *frame;
+ int err = 0;
+ frame = get_initframe(regs, sizeof(struct rt_initframe)) - 8;
+ /* The frame starts 8 bytes below the aligned address from get_initframe(). */
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ goto give_sigsegv;
+
+ /* Create the ucontext. */
+ err |= __put_user(0, &frame->uc.uc_flags);
+ err |= __put_user(0, &frame->uc.uc_link);
+ err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ err |= __put_user(sas_ss_flags(regs->esp),
+ &frame->uc.uc_stack.ss_flags);
+ err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= setup_initcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
+ //err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ if (err)
+ goto give_sigsegv;
+
+ /* The restorer address is stored in the frame's pretcode slot. */
+ err |= __put_user(ka->ia.restorer, &frame->pretcode);
+
+
+ if (err)
+ goto give_sigsegv;
+
+ /* Set up registers for the init handler */
+ regs->esp = (unsigned long) frame;
+ regs->eip = (unsigned long) ka->ia.inithandler;
+ regs->eax = (unsigned long) 0;
+ regs->edx = (unsigned long) &frame->info;
+ regs->ecx = (unsigned long) &frame->uc;
+
+ set_fs(USER_DS);
+ regs->xds = __USER_DS;
+ regs->xes = __USER_DS;
+ regs->xss = __USER_DS;
+ regs->xcs = __USER_CS;
+
+ /*
+ * Clear TF when entering the init handler.
+ * NOTE(review): nothing in this function notifies a tracer
+ * that was single-stepping the task; confirm whether that
+ * is intended.
+ */
+ regs->eflags &= ~TF_MASK;
+
+#if DEBUG_INI
+ printk("INI deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
+ current->comm, current->pid, frame, regs->eip, frame->pretcode);
+#endif
+
+ return;
+ /* Despite the label name no signal is sent here: the failure is only flagged and logged. */
+give_sigsegv:
+ me->thread_info->inipending=0;
+ me->thread_info->inifinish=-1;
+ init_fault(regs,frame,me,"handle deliver");
+ return;
+}
+
+
+/*
+ * Adjust the eax value of a restartable system call, then invoke setup_init_frame.
+ * param:regs register struct
+ * param:ka information for initialization
+ * param:oldset signal set
+ * return:none
+ */
+void
+handle_init(struct pt_regs *regs, struct k_initaction *ka, sigset_t *oldset)
+{
+
+#if DEBUG_INI
+ printk("INIT_CP:handle_init:01\n");
+#endif
+ /* orig_eax >= 0: presumably we got here from a system call - TODO confirm */
+ if (regs->orig_eax >= 0) {
+ /* Return EINTR for these restart codes. */
+ switch (regs->eax) {
+ case -ERESTART_RESTARTBLOCK:
+ case -ERESTARTNOHAND:
+ regs->eax = -EINTR;
+#if DEBUG_INI
+ printk("ERESTARTNOHAN\n");
+#endif
+ break;
+
+ case -ERESTARTSYS:
+ regs->eax = -EINTR;
+#if DEBUG_INI
+ printk("ERESTARTSYS\n");
+#endif
+ break;
+ /* For ERESTARTNOINTR: put orig_eax back into eax and step eip back by 2. */
+ case -ERESTARTNOINTR:
+ regs->eax = regs->orig_eax;
+ regs->eip -= 2;
+#if DEBUG_INI
+ printk("ERESTARTNOINTR\n");
+#endif
+ break;
+
+ default:
+#if DEBUG_INI
+ printk("regs->eax=%ld\n",regs->eax);
+#endif
+ break;
+ }
+ }
+
+ setup_init_frame(ka, regs, oldset, current);
+
+}
+void do_init_v86(struct pt_regs *regs, sigset_t *oldset)
+{	/* vm86 variant, called from work_init_v86 in entry.S */
+#if DEBUG_INI
+	printk("do_init_v86\n");
+#endif
+	do_init(regs,oldset);	/* must run in every build, not only under DEBUG_INI */
+}
+
+/*
+ * Check the register and invoke handle_init.
+ * param:regs register struct
+ * param:oldset signal set (NULL selects current->blocked)
+ * return:none
+ */
+void do_init(struct pt_regs *regs, sigset_t *oldset)
+{
+
+ struct k_initaction *ka=&current->k_ia;
+
+#if DEBUG_INI
+ printk("INIT_CP:do_init:01\n");
+#endif
+ /* Exit with inifinish = 2 (retry) unless the low two bits of CS are 3. */
+
+ if ((regs->xcs & 3) != 3) {
+#if DEBUG_INI
+ printk("regs->xcs != 3\n");
+ printk("current->thread_info->inifinish = 2\n");
+#endif
+ current->thread_info->inifinish=2;
+ return;
+ }
+
+ /* Fall back to the task's blocked set when no set was passed in. */
+ if (!oldset){
+#if DEBUG_INI
+ printk("!oldset\n");
+#endif
+ oldset = &current->blocked;
+ }
+
+ /* Reload debug register 7 from the saved thread value when it is non-zero. */
+ if (current->thread.debugreg[7]){
+#if DEBUG_INI
+ printk("you have current->thread.debugreg[7]\n");
+#endif
+ asm volatile("movl %0,%%db7" :: "r" (current->thread.debugreg[7]));
+ }
+
+ handle_init(regs,ka,oldset);
+
+ return;
+}
+
+/*
+ * Log a bad-frame report.
+ * param:regs register struct (eip, esp and orig_eax are printed)
+ * param:frame stack frame address to report
+ * param:me task whose comm and pid are printed
+ * param:where output log string naming the failing path
+ * return:none
+ */
+void init_fault(struct pt_regs *regs, void *frame, struct task_struct *me, char *where)
+{
+	long ip = regs->eip;
+	long sp = regs->esp;
+	long oax = regs->orig_eax;
+#if DEBUG_INI
+	printk("INIT_CP:init_fault:01\n");
+#endif
+	printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
+	       me->comm, me->pid, where, frame, ip, sp, oax);
+}
+
+/*
+ * Set inipending flag.
+ * param:pid target PID for setting the flag (must be greater than 1)
+ * param:user_k_ia initialization information (user space pointer)
+ * return:normal:0/illegal:error code (EFAULT, EPERM or ESRCH)
+ */
+asmlinkage int sys_init_pend(pid_t pid, struct k_initaction *user_k_ia)
+{
+	struct k_initaction ka;
+	struct task_struct *tsk;
+	int error;
+
+#if DEBUG_INI
+	printk("sys_init_pend\n");
+#endif
+	/* Copy initialization information from user area to kernel area. */
+	error = -EFAULT;
+	if (copy_from_user(&ka, user_k_ia, sizeof(ka)))
+		goto out;
+
+	/* if pid <= 1, parameter error */
+	error = -EPERM;
+	if (pid <= 1)
+		goto out;
+
+	/* Get task struct from PID and take a reference on it. */
+	error = -ESRCH;
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid((pid_t)pid);
+	if (tsk)
+		get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+	if (!tsk)
+		goto out;
+
+	/* Capability check; an invalid user must still drop the task reference. */
+	error = -EPERM;
+	if ((current->uid != tsk->euid ||
+	     current->uid != tsk->suid ||
+	     current->uid != tsk->uid ||
+	     current->gid != tsk->egid ||
+	     current->gid != tsk->sgid ||
+	     current->gid != tsk->gid) && !capable(CAP_SYS_PANNUS))
+		goto out_tsk;
+
+	/*
+	 * Store k_ia and inifinish first and raise inipending last: the
+	 * return-to-user path calls do_init(), which reads current->k_ia,
+	 * as soon as it finds inipending set, possibly on another CPU.
+	 */
+	tsk->k_ia = ka;
+	tsk->thread_info->inifinish = 1;
+	smp_mb();
+#if DEBUG_INI
+	printk("tsk->thread_info->inipending = 1\n");
+	printk("tsk->thread_info->inifinish = 1\n");
+#endif
+	tsk->thread_info->inipending = 1;
+	smp_mb();
+
+#if DEBUG_INI
+	switch(tsk->state) {
+	case TASK_INTERRUPTIBLE:
+		printk("INIT_CP:task-state: TASK_INTERRUPTIBLE\n");
+		break;
+	case TASK_STOPPED:
+		printk("INIT_CP:task-state: TASK_STOPPED\n");
+		break;
+	case TASK_RUNNING:
+		printk("INIT_CP:task-state: TASK_RUNNING\n");
+		break;
+	case TASK_UNINTERRUPTIBLE:
+		printk("INIT_CP:task-state: TASK_UNINTERRUPTIBLE\n");
+		break;
+	default:
+		printk("INIT_CP:task-state: Others\n");
+	}
+#endif
+	error = 0;
+out_tsk:
+	put_task_struct(tsk);	/* release the lookup reference on every path */
+out:
+	return error;
+}
+
+/*
+ * Check inifinish.
+ * param:pid target pid of flag check (must be greater than 1)
+ * return:0 finished / 1 retry / error code (EPERM, ESRCH, EINVAL, EAGAIN)
+ */
+asmlinkage int sys_check_init(pid_t pid)
+{
+	struct task_struct *tsk;
+	__u32 state;
+	int error;
+
+#if DEBUG_INI
+	printk("sys_check_init,pid=%d\n",pid);
+#endif
+	/* if pid <= 1, parameter error */
+	error = -EPERM;
+	if (pid <= 1) {
+		printk("bad parameter,pid=%d\n",pid);
+		goto out;
+	}
+	/* Get task struct from pid and take a reference on it. */
+	error = -ESRCH;
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+#if DEBUG_INI
+	printk("task=%p\n",tsk);
+#endif
+	if (!tsk) {
+#if DEBUG_INI
+		printk("sys_check_init,can not find task_struct by pid\n");
+#endif
+		goto out;
+	}
+	/* capability check. */
+	error = -EPERM;
+	if ((current->uid != tsk->euid ||
+	     current->uid != tsk->suid ||
+	     current->uid != tsk->uid ||
+	     current->gid != tsk->egid ||
+	     current->gid != tsk->sgid ||
+	     current->gid != tsk->gid) && !capable(CAP_SYS_PANNUS)) {
+#if DEBUG_INI
+		printk("sys_check_init,Invalid user\n");
+#endif
+		goto out_tsk;
+	}
+
+	/* inifinish: 0 finished, -1 bad frame, 2 retry needed, else still pending. */
+	state = tsk->thread_info->inifinish;
+	if (state == 0) {
+		error = 0;
+	} else if (state == (__u32)-1) {
+		error = -EINVAL;
+		printk("inifnich = -1, invalid value\n");
+	} else if (state == 2) {
+		/* Re-arm the target (not the caller) for another attempt. */
+		tsk->thread_info->inifinish = 1;
+		smp_mb();
+		tsk->thread_info->inipending = 1;
+		error = 1;	/* means retry attach/dettach */
+	} else {
+		error = -EAGAIN;
+		printk("try again! error=%d, -EAGAIN=%d\n",error,-EAGAIN);
+	}
+
+out_tsk:
+	put_task_struct(tsk);	/* release the lookup reference on every path */
+out:
+	return error;
+}
diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/sys_i386.c linux-2.6.11.7-pannus-i386/arch/i386/kernel/sys_i386.c
--- linux-2.6.11.7-vanilla/arch/i386/kernel/sys_i386.c 2005-04-08 03:58:31.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/sys_i386.c 2005-04-18 12:32:13.000000000 +0900
@@ -19,6 +19,7 @@
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/utsname.h>
+#include <linux/sched.h>

#include <asm/uaccess.h>
#include <asm/ipc.h>
@@ -44,10 +45,11 @@ asmlinkage int sys_pipe(unsigned long __
static inline long do_mmap2(
unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff)
+ unsigned long fd, unsigned long pgoff, int pid)
{
int error = -EBADF;
struct file * file = NULL;
+ struct task_struct *tsk;

flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
if (!(flags & MAP_ANONYMOUS)) {
@@ -55,10 +57,34 @@ static inline long do_mmap2(
if (!file)
goto out;
}
+ if(pid > 0){
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid((pid_t)pid);
+ if (tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ if (!tsk)
+ goto out;
+ if(((current->uid != tsk->euid) ||
+ (current->uid != tsk->suid) ||
+ (current->uid != tsk->uid) ||
+ (current->gid != tsk->egid) ||
+ (current->gid != tsk->sgid) ||
+ (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
+ // invalid user in sys_accesspvm
+ return -EPERM;
+ }
+
+ down_write(&tsk->mm->mmap_sem);
+ error = do_mmap_pgoff2(file, addr, len, prot, flags, pgoff, tsk);
+ up_write(&tsk->mm->mmap_sem);
+ } else {
+

down_write(&current->mm->mmap_sem);
error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
up_write(&current->mm->mmap_sem);
+ }

if (file)
fput(file);
@@ -70,7 +96,44 @@ asmlinkage long sys_mmap2(unsigned long
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
- return do_mmap2(addr, len, prot, flags, fd, pgoff);
+ return do_mmap2(addr, len, prot, flags, fd, pgoff,0);
+}
+
+/*
+ * Provide the mmap3 system call which maps the file to the specified process's memory.
+ * param arg : user space pointer to the struct which has memory mapping information
+ *             (its pid member selects the target process)
+ * return : normal end : the start address of mapped memory
+ * return : abnormal end : error code(can't read the struct:EFAULT, parameter error of pgoff:EINVAL, parameter error of file:EBADF)
+ */
+asmlinkage long sys_mmap3(struct _mmap3_arg_struct __user *arg)
+{
+	struct _mmap3_arg_struct a;
+	long error;
+	int ret;
+
+	/*
+	 * arg is an unchecked user space pointer, so it is only ever read
+	 * through copy_from_user().  Never dereference it directly (for
+	 * example arg->addr in a printk): a bad pointer would then fault
+	 * in kernel mode.
+	 */
+	error = -EFAULT;
+	ret = copy_from_user(&a, arg, sizeof(a));
+	if (ret) {
+		/* ret is the number of bytes that could not be copied. */
+		printk("mmap3 copy_from_user error.. %d byte left\n",ret);
+		goto out;
+	}
+
+	/*
+	 * do_mmap2 maps into the process selected by a.pid when that is
+	 * greater than 0 and into the caller itself otherwise.
+	 */
+	error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.pgoff, a.pid);
+
+out:
+	return error;
}

/*
@@ -101,7 +164,7 @@ asmlinkage int old_mmap(struct mmap_arg_
if (a.offset & ~PAGE_MASK)
goto out;

- err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT);
+ err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT,0);
out:
return err;
}
diff -urpN linux-2.6.11.7-vanilla/arch/i386/mm/mmap.c linux-2.6.11.7-pannus-i386/arch/i386/mm/mmap.c
--- linux-2.6.11.7-vanilla/arch/i386/mm/mmap.c 2005-04-08 03:57:36.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/mm/mmap.c 2005-04-18 12:32:13.000000000 +0900
@@ -62,10 +62,12 @@ void arch_pick_mmap_layout(struct mm_str
current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->get_unmapped_area2 = arch_get_unmapped_area2;
mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base(mm);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ mm->get_unmapped_area2 = arch_get_unmapped_area_topdown2;
mm->unmap_area = arch_unmap_area_topdown;
}
}
diff -urpN linux-2.6.11.7-vanilla/include/asm-i386/exechandle.h linux-2.6.11.7-pannus-i386/include/asm-i386/exechandle.h
--- linux-2.6.11.7-vanilla/include/asm-i386/exechandle.h 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/asm-i386/exechandle.h 2005-04-18 12:32:13.000000000 +0900
@@ -0,0 +1,21 @@
+#ifndef _ASM_I386_EXECHANDLE_H
+#define _ASM_I386_EXECHANDLE_H
+
+#include <asm/types.h>
+#include <asm/signal.h>
+
+/* User-space entry points supplied for a live patch (see sys_init_pend). */
+struct initaction
+{
+	void (*inithandler)(int);	/* loaded into eip by setup_init_frame */
+	void (*restorer)(void);		/* stored as the frame's return address */
+};
+
+struct k_initaction
+{
+	struct initaction ia;
+};
+struct pt_regs;
+void do_init(struct pt_regs *regs, sigset_t *oldset);
+void do_init_v86(struct pt_regs *regs, sigset_t *oldset);
+#endif /* _ASM_I386_EXECHANDLE_H */
diff -urpN linux-2.6.11.7-vanilla/include/asm-i386/thread_info.h linux-2.6.11.7-pannus-i386/include/asm-i386/thread_info.h
--- linux-2.6.11.7-vanilla/include/asm-i386/thread_info.h 2005-04-08 03:57:14.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/asm-i386/thread_info.h 2005-04-18 12:32:13.000000000 +0900
@@ -44,6 +44,9 @@ struct thread_info {
of nested (IRQ) stacks
*/
__u8 supervisor_stack[0];
+
+ __u32 inipending; /* Pending flags for live patch */
+ __u32 inifinish; /* Finish flags for live patch */
};

#else /* !__ASSEMBLY__ */
diff -urpN linux-2.6.11.7-vanilla/include/asm-i386/unistd.h linux-2.6.11.7-pannus-i386/include/asm-i386/unistd.h
--- linux-2.6.11.7-vanilla/include/asm-i386/unistd.h 2005-04-08 03:57:46.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/asm-i386/unistd.h 2005-04-18 12:32:13.000000000 +0900
@@ -294,8 +294,14 @@
#define __NR_add_key 286
#define __NR_request_key 287
#define __NR_keyctl 288
+#define __NR_mmap3 289
+#define __NR_accesspvm (__NR_mmap3+1)
+#define __NR_init_pend (__NR_mmap3+2)
+#define __NR_rt_handlereturn (__NR_mmap3+3)
+#define __NR_check_init (__NR_mmap3+4)
+#define __NR_munmap3 (__NR_mmap3+5)

-#define NR_syscalls 289
+#define NR_syscalls 295 /* highest syscall number (__NR_munmap3 = 294) plus one */

/*
* user-visible error numbers are in the range -1 - -128: see
diff -urpN linux-2.6.11.7-vanilla/include/linux/capability.h linux-2.6.11.7-pannus-i386/include/linux/capability.h
--- linux-2.6.11.7-vanilla/include/linux/capability.h 2005-04-08 03:57:26.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/linux/capability.h 2005-04-18 12:32:13.000000000 +0900
@@ -288,6 +288,10 @@ typedef __u32 kernel_cap_t;

#define CAP_AUDIT_CONTROL 30

+/* Allow use of memory access system calls for Live Patching */
+
+#define CAP_SYS_PANNUS 31
+
#ifdef __KERNEL__
/*
* Bounding set
diff -urpN linux-2.6.11.7-vanilla/include/linux/mm.h linux-2.6.11.7-pannus-i386/include/linux/mm.h
--- linux-2.6.11.7-vanilla/include/linux/mm.h 2005-04-08 03:57:09.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/linux/mm.h 2005-04-18 12:32:13.000000000 +0900
@@ -614,6 +614,7 @@ extern int install_page(struct mm_struct
extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end);
+extern int make_pages_present2(unsigned long addr, unsigned long end, struct task_struct *tsk);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);

@@ -730,10 +731,16 @@ extern void exit_mmap(struct mm_struct *

extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);

+extern unsigned long get_unmapped_area2(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, struct task_struct *);
+
extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff);

+extern unsigned long do_mmap_pgoff2(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long pgoff, struct task_struct *);
+
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
diff -urpN linux-2.6.11.7-vanilla/include/linux/mman.h linux-2.6.11.7-pannus-i386/include/linux/mman.h
--- linux-2.6.11.7-vanilla/include/linux/mman.h 2005-04-08 03:57:13.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/linux/mman.h 2005-04-18 12:32:13.000000000 +0900
@@ -64,4 +64,17 @@ calc_vm_flag_bits(unsigned long flags)
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
}

+/*
+ * Argument block passed to the mmap3 system call.
+ */
+typedef struct _mmap3_arg_struct {
+ unsigned long addr; /* address at which the file is to be mapped */
+ unsigned long len; /* length of the data to be mapped */
+ unsigned long prot; /* protection of the memory where the file is mapped */
+ unsigned long flags; /* flags of the mapped memory */
+ unsigned long fd; /* file descriptor of the data to be mapped */
+ unsigned long pgoff; /* page offset of the data to be mapped */
+ unsigned long pid; /* ID of the process to map into */
+} mmap3_arg_struct_t;
+
#endif /* _LINUX_MMAN_H */
diff -urpN linux-2.6.11.7-vanilla/include/linux/sched.h linux-2.6.11.7-pannus-i386/include/linux/sched.h
--- linux-2.6.11.7-vanilla/include/linux/sched.h 2005-04-08 03:57:12.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/linux/sched.h 2005-04-18 12:32:13.000000000 +0900
@@ -21,6 +21,7 @@
#include <asm/ptrace.h>
#include <asm/mmu.h>
#include <asm/cputime.h>
+#include <asm/exechandle.h>

#include <linux/smp.h>
#include <linux/sem.h>
@@ -197,9 +198,19 @@ extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long);
extern unsigned long
+arch_get_unmapped_area2(struct file *, unsigned long, unsigned long,
+ unsigned long, unsigned long, struct task_struct *);
+
+extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags);
+
+extern unsigned long
+arch_get_unmapped_area_topdown2(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, struct task_struct *);
+
extern void arch_unmap_area(struct vm_area_struct *area);
extern void arch_unmap_area_topdown(struct vm_area_struct *area);

@@ -211,6 +222,11 @@ struct mm_struct {
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
+ unsigned long (*get_unmapped_area2) (struct file *filp,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags,
+ struct task_struct * tsk);
+
void (*unmap_area) (struct vm_area_struct *area);
unsigned long mmap_base; /* base of mmap area */
unsigned long free_area_cache; /* first hole */
@@ -685,6 +701,7 @@ struct task_struct {
struct mempolicy *mempolicy;
short il_next;
#endif
+ struct k_initaction k_ia; /*Inialization info for live patch */
};

static inline pid_t process_group(struct task_struct *tsk)
@@ -1173,6 +1190,7 @@ static inline void arch_pick_mmap_layout
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->get_unmapped_area2 = arch_get_unmapped_area2;
mm->unmap_area = arch_unmap_area;
}
#endif
diff -urpN linux-2.6.11.7-vanilla/kernel/fork.c linux-2.6.11.7-pannus-i386/kernel/fork.c
--- linux-2.6.11.7-vanilla/kernel/fork.c 2005-04-08 03:57:12.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/kernel/fork.c 2005-04-18 12:32:13.000000000 +0900
@@ -2,6 +2,7 @@
* linux/kernel/fork.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2004-2005 NTT Corporation
*/

/*
@@ -412,6 +413,12 @@ void mm_release(struct task_struct *tsk,
u32 __user * tidptr = tsk->clear_child_tid;
tsk->clear_child_tid = NULL;

+ /* initialize flag and information for live patch */
+ tsk->thread_info->inipending=0;
+ tsk->thread_info->inifinish=0;
+ tsk->k_ia.ia.inithandler=NULL;
+ tsk->k_ia.ia.restorer=NULL;
+
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
diff -urpN linux-2.6.11.7-vanilla/mm/memory.c linux-2.6.11.7-pannus-i386/mm/memory.c
--- linux-2.6.11.7-vanilla/mm/memory.c 2005-04-08 03:57:36.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/mm/memory.c 2005-04-18 12:32:13.000000000 +0900
@@ -2209,6 +2209,27 @@ int make_pages_present(unsigned long add
return ret == len ? 0 : -1;
}

+/* Remote make_pages_present(): fault in [addr, end) of tsk's address space. */
+int make_pages_present2(unsigned long addr, unsigned long end, struct task_struct *tsk)
+{
+	struct vm_area_struct *area = find_vma(tsk->mm, addr);
+	int nr_pages, nr_done, want_write;
+
+	if (!area)
+		return -1;
+	want_write = (area->vm_flags & VM_WRITE) ? 1 : 0;
+	/* An empty range, or one running past the vma found, is a caller bug. */
+	if (addr >= end || end > area->vm_end)
+		BUG();
+
+	nr_pages = (end + PAGE_SIZE - 1) / PAGE_SIZE - addr / PAGE_SIZE;
+	nr_done = get_user_pages(tsk, tsk->mm, addr, nr_pages, want_write,
+				 0, NULL, NULL);
+	if (nr_done < 0)
+		return nr_done;
+	return (nr_done == nr_pages) ? 0 : -1;
+}
+
/*
* Map a vmalloc()-space virtual address to the physical page.
*/
diff -urpN linux-2.6.11.7-vanilla/mm/mmap.c linux-2.6.11.7-pannus-i386/mm/mmap.c
--- linux-2.6.11.7-vanilla/mm/mmap.c 2005-04-08 03:57:45.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/mm/mmap.c 2005-04-18 12:32:13.000000000 +0900
@@ -1143,6 +1143,239 @@ unacct_error:

EXPORT_SYMBOL(do_mmap_pgoff);

+/*
+ * do_mmap_pgoff2 - clone of do_mmap_pgoff() that maps into tsk->mm rather
+ * than current->mm (a NULL @file gives an anonymous mapping).  Expects
+ * mm->mmap_sem write-held: it is dropped and retaken around the populate
+ * step.  Returns the mapped address, or a negative errno as unsigned long.
+ */
+unsigned long do_mmap_pgoff2(struct file * file, unsigned long addr,
+			unsigned long len, unsigned long prot,
+			unsigned long flags, unsigned long pgoff, struct task_struct *tsk)
+{
+	struct mm_struct * mm = tsk->mm;
+	struct vm_area_struct * vma, * prev;
+	struct inode *inode;
+	unsigned int vm_flags;
+	int correct_wcount = 0;	/* set once deny_write_access() has succeeded */
+	int error;
+	struct rb_node ** rb_link, * rb_parent;
+	int accountable = 1;	/* cleared for hugepage files */
+	unsigned long charged = 0;	/* pages passed to security_vm_enough_memory() */
+
+	if (file) {
+		if (is_file_hugepages(file))
+			accountable = 0;
+
+		if (!file->f_op || !file->f_op->mmap)
+			return -ENODEV;
+
+		if ((prot & PROT_EXEC) &&
+		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
+			return -EPERM;
+	}
+
+	if ((prot & PROT_READ) && (tsk->personality & READ_IMPLIES_EXEC))
+		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+			prot |= PROT_EXEC;
+	if (!len)
+		return addr;
+
+	len = PAGE_ALIGN(len);
+	if (!len || len > TASK_SIZE)	/* !len: PAGE_ALIGN() wrapped around */
+		return -EINVAL;
+
+	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)	/* page offset overflow */
+		return -EINVAL;
+
+	if (mm->map_count > sysctl_max_map_count)
+		return -ENOMEM;
+
+	addr = get_unmapped_area2(file, addr, len, pgoff, flags, tsk);
+	if (addr & ~PAGE_MASK)	/* not page aligned: it is an error code */
+		return addr;
+
+	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+	if (flags & MAP_LOCKED) {
+		if (!can_do_mlock())
+			return -EPERM;
+		vm_flags |= VM_LOCKED;
+	}
+	if (vm_flags & VM_LOCKED) {	/* enforce the target's RLIMIT_MEMLOCK */
+		unsigned long locked, lock_limit;
+		locked = mm->locked_vm << PAGE_SHIFT;
+		lock_limit = tsk->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+		locked += len;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			return -EAGAIN;
+	}
+
+	inode = file ? file->f_dentry->d_inode : NULL;
+
+	if (file) {
+		switch (flags & MAP_TYPE) {
+		case MAP_SHARED:
+			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
+				return -EACCES;
+
+			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
+				return -EACCES;
+
+			if (locks_verify_locked(inode))
+				return -EAGAIN;
+
+			vm_flags |= VM_SHARED | VM_MAYSHARE;
+			if (!(file->f_mode & FMODE_WRITE))
+				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
+			/* fall through: a shared file mapping must be readable too */
+		case MAP_PRIVATE:
+			if (!(file->f_mode & FMODE_READ))
+				return -EACCES;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	} else {
+		switch (flags & MAP_TYPE) {
+		case MAP_SHARED:
+			vm_flags |= VM_SHARED | VM_MAYSHARE;
+			break;
+		case MAP_PRIVATE:
+			pgoff = addr >> PAGE_SHIFT;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	error = security_file_mmap(file, prot, flags);
+	if (error)
+		return error;
+
+	error = -ENOMEM;
+munmap_back:
+	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	if (vma && vma->vm_start < addr + len) {	/* range overlaps a mapping */
+		if (do_munmap(mm, addr, len))
+			return -ENOMEM;
+		goto munmap_back;
+	}
+	if ((mm->total_vm << PAGE_SHIFT) + len
+	    > tsk->signal->rlim[RLIMIT_AS].rlim_cur)	/* target's RLIMIT_AS */
+		return -ENOMEM;
+
+	if (accountable && (!(flags & MAP_NORESERVE) ||
+			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
+		if (vm_flags & VM_SHARED) {
+			vm_flags |= VM_ACCOUNT;
+		} else if (vm_flags & VM_WRITE) {
+			charged = len >> PAGE_SHIFT;
+			if (security_vm_enough_memory(charged))
+				return -ENOMEM;
+			vm_flags |= VM_ACCOUNT;
+		}
+	}
+
+	if (!file && !(vm_flags & VM_SHARED) &&
+	    vma_merge(mm, prev, addr, addr + len, vm_flags,
+					NULL, NULL, pgoff, NULL))
+		goto out;	/* private anonymous range merged into a neighbour */
+
+	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!vma) {
+		error = -ENOMEM;
+		goto unacct_error;
+	}
+	memset(vma, 0, sizeof(*vma));
+
+	vma->vm_mm = mm;
+	vma->vm_start = addr;
+	vma->vm_end = addr + len;
+	vma->vm_flags = vm_flags;
+	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
+	vma->vm_pgoff = pgoff;
+
+	if (file) {
+		error = -EINVAL;
+		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+			goto free_vma;
+		if (vm_flags & VM_DENYWRITE) {
+			error = deny_write_access(file);
+			if (error)
+				goto free_vma;
+			correct_wcount = 1;
+		}
+		vma->vm_file = file;
+		get_file(file);
+		error = file->f_op->mmap(file, vma);
+		if (error)
+			goto unmap_and_free_vma;
+	} else if (vm_flags & VM_SHARED) {
+		error = shmem_zero_setup(vma);
+		if (error)
+			goto free_vma;
+	}
+
+	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
+		vma->vm_flags &= ~VM_ACCOUNT;
+
+	addr = vma->vm_start;	/* reload: the vma was handed to ->mmap() above */
+	pgoff = vma->vm_pgoff;
+	vm_flags = vma->vm_flags;
+
+	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
+			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
+		file = vma->vm_file;
+		vma_link(mm, vma, prev, rb_link, rb_parent);
+		if (correct_wcount)
+			atomic_inc(&inode->i_writecount);
+	} else {	/* merged into a neighbour: the new vma is not needed */
+		if (file) {
+			if (correct_wcount)
+				atomic_inc(&inode->i_writecount);
+			fput(file);
+		}
+		mpol_free(vma_policy(vma));
+		kmem_cache_free(vm_area_cachep, vma);
+	}
+
+out:
+	mm->total_vm += len >> PAGE_SHIFT;
+	__vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += len >> PAGE_SHIFT;
+		make_pages_present2(addr, addr + len, tsk);
+	}
+	/* sys_remap_file_pages() acts on current->mm: skip it for a foreign mm. */
+	if ((flags & MAP_POPULATE) && mm == current->mm) {
+		up_write(&mm->mmap_sem);
+		sys_remap_file_pages(addr, len, 0, pgoff, flags & MAP_NONBLOCK);
+		down_write(&mm->mmap_sem);
+	}
+	acct_update_integrals();	/* NOTE(review): takes no task; presumably accounts to current - confirm */
+	update_mem_hiwater();	/* NOTE(review): likewise takes no task - confirm */
+	return addr;
+
+unmap_and_free_vma:
+	if (correct_wcount)
+		atomic_inc(&inode->i_writecount);
+	vma->vm_file = NULL;
+	fput(file);
+	/* Undo any partial mapping set up by the failed file->f_op->mmap(). */
+	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
+free_vma:
+	kmem_cache_free(vm_area_cachep, vma);
+unacct_error:
+	if (charged)
+		vm_unacct_memory(charged);
+	return error;
+}
+EXPORT_SYMBOL(do_mmap_pgoff2);
+
+
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -1199,6 +1432,48 @@ full_search:
addr = vma->vm_end;
}
}
+
+/*
+ * Bottom-up search for a free range of len bytes in tsk's address space;
+ * a copy of arch_get_unmapped_area() that uses tsk->mm, not current->mm.
+ */
+unsigned long
+arch_get_unmapped_area2(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags, struct task_struct *tsk)
+{
+	struct mm_struct *target_mm = tsk->mm;
+	struct vm_area_struct *cur;
+	unsigned long search_start;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (addr) {	/* use the hint when the page-aligned range is free */
+		addr = PAGE_ALIGN(addr);
+		cur = find_vma(target_mm, addr);
+		if (addr <= TASK_SIZE - len &&
+		    (!cur || cur->vm_start >= addr + len))
+			return addr;
+	}
+	search_start = addr = target_mm->free_area_cache;
+	cur = find_vma(target_mm, addr);
+	for (;;) {
+		if (addr > TASK_SIZE - len) {	/* ran past the top of the space */
+			if (search_start == TASK_UNMAPPED_BASE)
+				return -ENOMEM;	/* already scanned from the base */
+			/* Rescan once, this time from the default base. */
+			search_start = addr = TASK_UNMAPPED_BASE;
+			cur = find_vma(target_mm, addr);
+			continue;
+		}
+		if (!cur || cur->vm_start >= addr + len) {	/* hole is big enough */
+			target_mm->free_area_cache = addr + len;
+			return addr;
+		}
+		addr = cur->vm_end;
+		cur = cur->vm_next;
+	}
+}
+
#endif

void arch_unmap_area(struct vm_area_struct *area)
@@ -1300,6 +1575,66 @@ fail:

return addr;
}
+
+/*
+ * Top-down search for a free range of len bytes in the address space of tsk,
+ * falling back to arch_get_unmapped_area2 (clone of arch_get_unmapped_area_topdown).
+ */
+unsigned long
+arch_get_unmapped_area_topdown2(struct file *filp, const unsigned long addr0,
+ const unsigned long len, const unsigned long pgoff,
+ const unsigned long flags, struct task_struct *tsk)
+{
+ struct vm_area_struct *vma, *prev_vma;
+ struct mm_struct *mm = tsk->mm;
+ unsigned long base = mm->mmap_base, addr = addr0;
+ int first_time = 1; /* cleared after the single retry from base */
+
+ if (len > TASK_SIZE) /* request is larger than TASK_SIZE */
+ return -ENOMEM;
+
+ if (mm->free_area_cache > base) /* never start the search above mmap_base */
+ mm->free_area_cache = base;
+
+ if (addr) { /* a hint was given: use it if the page-aligned range is free */
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+
+try_again:
+ if (mm->free_area_cache < len) /* no room for len bytes below the cache */
+ goto fail;
+
+ addr = (mm->free_area_cache - len) & PAGE_MASK;
+ do {
+ if (!(vma = find_vma_prev(mm, addr, &prev_vma))) /* lookup found no vma */
+ return addr;
+
+ if (addr+len <= vma->vm_start &&
+ (!prev_vma || (addr >= prev_vma->vm_end)))
+ return (mm->free_area_cache = addr); /* fits the hole; cache it */
+ else
+ if (mm->free_area_cache == vma->vm_end)
+ mm->free_area_cache = vma->vm_start; /* pull the cache below this vma */
+
+ addr = vma->vm_start-len; /* next candidate: just below this vma */
+ } while (len <= vma->vm_start);
+
+fail:
+ if (first_time) { /* retry once with the cache reset to mmap_base */
+ mm->free_area_cache = base;
+ first_time = 0;
+ goto try_again;
+ }
+ mm->free_area_cache = TASK_UNMAPPED_BASE; /* start point for the bottom-up fallback */
+ addr = arch_get_unmapped_area2(filp, addr0, len, pgoff, flags, tsk);
+ mm->free_area_cache = base; /* restore the top-down cache */
+ return addr;
+}
+
#endif

void arch_unmap_area_topdown(struct vm_area_struct *area)
@@ -1350,6 +1685,35 @@ get_unmapped_area(struct file *file, uns

EXPORT_SYMBOL(get_unmapped_area);

+/*
+ * Pick the address of a new mapping in tsk's address space: a MAP_FIXED
+ * request is only validated (clone of get_unmapped_area).
+ */
+unsigned long
+get_unmapped_area2(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, struct task_struct *tsk)
+{
+	unsigned long hugepage_err;
+
+	if (!(flags & MAP_FIXED))
+		return tsk->mm->get_unmapped_area2(file, addr, len, pgoff, flags, tsk);
+
+	/* The fixed range must end within TASK_SIZE and start on a page boundary. */
+	if (addr > TASK_SIZE - len)
+		return -ENOMEM;
+	if (addr & ~PAGE_MASK)
+		return -EINVAL;
+
+	/* The range check differs for is_file_hugepages() files and all others. */
+	if (file && is_file_hugepages(file))
+		hugepage_err = prepare_hugepage_range(addr, len);
+	else
+		hugepage_err = is_hugepage_only_range(addr, len);
+	return hugepage_err ? -EINVAL : addr;
+}
+
+EXPORT_SYMBOL(get_unmapped_area2);
+
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
@@ -1878,6 +2242,49 @@ static inline void verify_mm_writelocked
#endif
}

+/*
+ * munmap3 system call: unmap [addr, addr + len) from the address space of
+ * process @pid.  The caller's uid/gid must match the target's real,
+ * effective and saved uid/gid unless the caller has CAP_SYS_PANNUS.
+ * Returns the do_munmap() result, -ESRCH if @pid is not found or has no
+ * address space, or -EPERM if the permission check fails.
+ */
+asmlinkage long sys_munmap3(unsigned long addr, size_t len, pid_t pid)
+{
+	int ret;
+	struct mm_struct *mm;
+	struct task_struct *tsk;
+
+	/* Pin the task before dropping tasklist_lock so it cannot be freed. */
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+	if (!tsk)
+		return -ESRCH;
+
+	ret = -EPERM;
+	if (((current->uid != tsk->euid) || (current->uid != tsk->suid) ||
+	     (current->uid != tsk->uid) || (current->gid != tsk->egid) ||
+	     (current->gid != tsk->sgid) || (current->gid != tsk->gid)) &&
+	    !capable(CAP_SYS_PANNUS))
+		goto out;
+
+	/* get_task_mm() pins the mm; NULL means the task has none (e.g. a kernel thread). */
+	ret = -ESRCH;
+	mm = get_task_mm(tsk);
+	if (!mm)
+		goto out;
+	down_write(&mm->mmap_sem);
+	ret = do_munmap(mm, addr, len);
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+out:
+	put_task_struct(tsk);
+	return ret;
+}
+
/*
* this is really a simplified "do_mmap". it only handles
* anonymous maps. eventually we may be able to do some