Re: Handle kernel page faults using task gate

From: eliad lubovsky
Date: Thu Jun 30 2005 - 20:21:35 EST


Attached is a patch; it should make clearer what I have done.
To cause a page fault, the kernel needs to be compiled with 8KB stacks,
and the following module and application have to be used:
the module is a char device, "my_device"; the application opens an fd to
the module and calls ioctl; the module responds by calling a function
with an overloaded stack, which causes a page fault.
The task gate is set dynamically by the ioctl sys_call.
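
For reference, loading and exercising it looks roughly like this (the
module object and test binary names below are placeholders; the major
number is allocated dynamically, so take it from /proc/devices after
loading):

  insmod ./mm_test.ko
  grep my_device /proc/devices    # note the dynamic major number
  mknod /dev/my_device c <major> 0
  ./test_app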


#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <asm/io.h>
#include <linux/vmalloc.h>

#include <asm/irq.h>
#include <asm/uaccess.h>

int simple_open(struct inode *inode, struct file *filp);
int simple_release(struct inode *inode, struct file *filp);
int simple_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
		unsigned long arg);

static int simple_major = 0;
//extern void my_set_task_gate(unsigned int n, unsigned int gdt);
//extern void my_set_page_fault_intr_gate();

struct file_operations simple_nopage_ops = {
	.owner   = THIS_MODULE,
	.open    = simple_open,
	.release = simple_release,
	.ioctl   = simple_ioctl
};

struct file_operations *simple_fops = &simple_nopage_ops;

int simple_open(struct inode *inode, struct file *filp)
{
	filp->f_op = simple_fops;
	filp->private_data = 0;

	return 0;
}

int simple_release(struct inode *inode, struct file *filp)
{
	printk(KERN_INFO "Driver: simple release\n");
	return 0;
}

/* declare local buffers larger than PAGE_SIZE to overload the stack */
static void call_overloaded_stack(void)
{
	char buff[1420];
	char buff1[1420];
	char buff2[1500];

	printk(KERN_INFO "overloaded stack called\n");
	buff[1400] = 1;
	buff1[1400] = 1;
	buff2[1450] = 1;
}

/* cause a page fault by calling the call_overloaded_stack() function */
int simple_ioctl(struct inode *inode, struct file *filp,
		unsigned int cmd, unsigned long arg)
{
	printk(KERN_INFO "Driver: Pre simple ioctl\n");

	/* set task gate -- it is now set by the ioctl sys_call itself */
	/* my_set_task_gate(14, GDT_ENTRY_PAGE_FAULT_TSS); */

	call_overloaded_stack();

	/* return to interrupt gate */
	/* my_set_page_fault_intr_gate(); */
	return 0;
}

static int __init mm_init(void)
{
	int result;

	printk(KERN_INFO "Driver: initialized\n");

	result = register_chrdev(simple_major, "my_device", &simple_nopage_ops);
	if (result < 0) {
		printk(KERN_WARNING "Driver: my_device: unable to get major %d\n",
			simple_major);
		return result;
	}
	if (simple_major == 0)
		simple_major = result;
	return 0;
}

static void __exit mm_exit(void)
{
	printk(KERN_INFO "Driver: exited\n");
	unregister_chrdev(simple_major, "my_device");
}

MODULE_AUTHOR("Eliad Lubovsky");
MODULE_DESCRIPTION("Memory Test");
MODULE_LICENSE("GPL");

module_init(mm_init);
module_exit(mm_exit);


/***********************************************************/
Application: opens the device and calls ioctl, which causes a page fault
via the overloaded stack.

#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>

int main(int argc, char **argv)
{
	int ret;
	unsigned int i = 0;
	int fd = open("/dev/my_device", O_RDWR);

	if (fd < 0) {
		printf("Failed to open my_device\n");
		exit(1);
	}

	ret = ioctl(fd, 0, i);
	if (ret != 0) {
		printf("Ioctl failed\n");
		exit(1);
	}
	close(fd);
	return 0;
}


On Thu, 2005-06-30 at 10:11, Ingo Molnar wrote:
> * eliad lubovsky <eliadl@xxxxxxx> wrote:
>
> > How do I clear the 'busy' bit?
> > I set my TSS descriptor with
> > __set_tss_desc(cpu, GDT_ENTRY_PAGE_FAULT_TSS, &pagefault_tss);
>
> i suspect you have to clear the busy bit in the pagefault handler
> itself. The CPU marks it as busy upon fault. I guess it would be OK to
> just do the above __set_tss_desc() for _every_ pagefault, that too will
> clear the busy bit, but you are probably better off just clearing that
> bit manually:
>
> cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff;
>
> Ingo
diff -urNp linux-2.6.9.orig/arch/i386/kernel/cpu/common.c linux-2.6.9/arch/i386/kernel/cpu/common.c
--- linux-2.6.9.orig/arch/i386/kernel/cpu/common.c 2004-10-18 23:53:07.000000000 +0200
+++ linux-2.6.9/arch/i386/kernel/cpu/common.c 2005-06-30 13:24:06.000000000 +0300
@@ -565,11 +565,13 @@ void __init cpu_init (void)
/* Set up doublefault TSS pointer in the GDT */
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);

+ /* Set up page fault TSS pointer in the GDT */
+ __set_tss_desc(cpu, GDT_ENTRY_PAGE_FAULT_TSS, &pagefault_tss);
+
/* Clear %fs and %gs. */
asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");

/* Clear all 6 debug registers: */
-
#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );

CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
diff -urNp linux-2.6.9.orig/arch/i386/kernel/Makefile linux-2.6.9/arch/i386/kernel/Makefile
--- linux-2.6.9.orig/arch/i386/kernel/Makefile 2004-10-18 23:53:25.000000000 +0200
+++ linux-2.6.9/arch/i386/kernel/Makefile 2005-06-30 13:24:44.000000000 +0300
@@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.ld
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
- doublefault.o
+ doublefault.o pagefault.o

obj-y += cpu/
obj-y += timers/
diff -urNp linux-2.6.9.orig/arch/i386/kernel/pagefault.c linux-2.6.9/arch/i386/kernel/pagefault.c
--- linux-2.6.9.orig/arch/i386/kernel/pagefault.c 1970-01-01 02:00:00.000000000 +0200
+++ linux-2.6.9/arch/i386/kernel/pagefault.c 2005-07-01 03:03:02.000000000 +0300
@@ -0,0 +1,79 @@
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/vmalloc_thread_info.h>
+
+#define PAGEFAULT_STACKSIZE (2048)
+static unsigned long pagefault_stack[PAGEFAULT_STACKSIZE];
+#define STACK_START (unsigned long)(pagefault_stack+PAGEFAULT_STACKSIZE)
+
+#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + 0x1000000)
+
+extern struct vm_struct *find_vm_area(void *addr);
+extern void expand_stack_size(struct vm_struct *area);
+
+static void pagefault_fn(void)
+{
+ unsigned int address, aligned_addr;
+ unsigned int i=0;
+ struct vm_struct *area;
+
+ goto handle_fault;
+
+return_from_fault:
+
+// __set_tss_desc(0, GDT_ENTRY_PAGE_FAULT_TSS, &pagefault_tss);
+ __asm__("iret");
+
+handle_fault:
+
+printk("gdt entry a 0x%x\ngdt entry b 0x%x\n", (unsigned)(cpu_gdt_table[GDT_ENTRY_PAGE_FAULT_TSS].a), (unsigned)(cpu_gdt_table[GDT_ENTRY_PAGE_FAULT_TSS].b));
+
+ /* clear busy bit in the tss descriptor */
+ cpu_gdt_table[GDT_ENTRY_PAGE_FAULT_TSS].b &= 0xfffffdff;
+ //__set_tss_desc(0, GDT_ENTRY_PAGE_FAULT_TSS, &pagefault_tss);
+
+ __asm__("movl %%cr2,%0":"=r" (address));
+
+ aligned_addr = ((address+PAGE_SIZE)&(~(PAGE_SIZE-1)));
+ printk("Page fault address 0x%x, start address 0x%x\n", address, aligned_addr);
+
+ /* search for the vm area */
+ for(i=0; i<THREAD_SIZE/PAGE_SIZE; i++) {
+ area = find_vm_area((void*)(aligned_addr-(i*PAGE_SIZE)));
+ if(area) {
+ printk("vm area 0x%x, addr 0x%x, address 0x%x\n", (unsigned int)area, (unsigned int)(area->addr), aligned_addr-(i*4096));
+ break;
+ }
+ }
+
+ /* allocate a new physical page, expand the stack size */
+ expand_stack_size(area);
+
+ goto return_from_fault;
+}
+
+struct tss_struct pagefault_tss __cacheline_aligned = {
+ .esp0 = STACK_START,
+ .ss0 = __KERNEL_DS,
+ .ldt = 0,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+ .eip = (unsigned long) pagefault_fn,
+ .eflags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */
+ .esp = STACK_START,
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
+
+ .__cr3 = __pa(swapper_pg_dir)
+};
diff -urNp linux-2.6.9.orig/arch/i386/kernel/traps.c linux-2.6.9/arch/i386/kernel/traps.c
--- linux-2.6.9.orig/arch/i386/kernel/traps.c 2004-10-18 23:53:23.000000000 +0200
+++ linux-2.6.9/arch/i386/kernel/traps.c 2005-06-30 13:23:49.000000000 +0300
@@ -1025,6 +1025,20 @@ static void __init set_task_gate(unsigne
_set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
}

+void my_set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+ set_task_gate(n, gdt_entry);
+}
+
+EXPORT_SYMBOL(my_set_task_gate);
+
+void my_set_page_fault_intr_gate()
+{
+ set_intr_gate(14,&page_fault);
+}
+
+EXPORT_SYMBOL(my_set_page_fault_intr_gate);
+

void __init trap_init(void)
{
diff -urNp linux-2.6.9.orig/fs/ioctl.c linux-2.6.9/fs/ioctl.c
--- linux-2.6.9.orig/fs/ioctl.c 2004-10-18 23:53:43.000000000 +0200
+++ linux-2.6.9/fs/ioctl.c 2005-06-30 13:26:21.000000000 +0300
@@ -57,6 +57,9 @@ asmlinkage long sys_ioctl(unsigned int f
unsigned int flag;
int on, error = -EBADF;

+ /* set task gate entry in the idt */
+ my_set_task_gate(14, GDT_ENTRY_PAGE_FAULT_TSS);
+
filp = fget(fd);
if (!filp)
goto out;
@@ -133,6 +136,9 @@ asmlinkage long sys_ioctl(unsigned int f
fput(filp);

out:
+ /* set intr gate entry in the idt */
+ my_set_page_fault_intr_gate();
+
return error;
}

diff -urNp linux-2.6.9.orig/include/asm-i386/processor.h linux-2.6.9/include/asm-i386/processor.h
--- linux-2.6.9.orig/include/asm-i386/processor.h 2004-10-18 23:53:07.000000000 +0200
+++ linux-2.6.9/include/asm-i386/processor.h 2005-06-30 13:22:59.000000000 +0300
@@ -86,6 +86,7 @@ struct cpuinfo_x86 {
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
extern struct tss_struct doublefault_tss;
+extern struct tss_struct pagefault_tss;
DECLARE_PER_CPU(struct tss_struct, init_tss);

#ifdef CONFIG_SMP
diff -urNp linux-2.6.9.orig/include/asm-i386/segment.h linux-2.6.9/include/asm-i386/segment.h
--- linux-2.6.9.orig/include/asm-i386/segment.h 2004-10-18 23:53:44.000000000 +0200
+++ linux-2.6.9/include/asm-i386/segment.h 2005-06-30 13:22:43.000000000 +0300
@@ -42,7 +42,7 @@
* 27 - unused
* 28 - unused
* 29 - unused
- * 30 - unused
+ * 30 - TSS for page fault handler
* 31 - TSS for double fault handler
*/
#define GDT_ENTRY_TLS_ENTRIES 3
@@ -71,6 +71,7 @@
#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)

+#define GDT_ENTRY_PAGE_FAULT_TSS 30
#define GDT_ENTRY_DOUBLEFAULT_TSS 31

/*
diff -urNp linux-2.6.9.orig/include/asm-i386/smp.h linux-2.6.9/include/asm-i386/smp.h
--- linux-2.6.9.orig/include/asm-i386/smp.h 2004-10-18 23:55:36.000000000 +0200
+++ linux-2.6.9/include/asm-i386/smp.h 2005-06-30 13:22:32.000000000 +0300
@@ -50,7 +50,8 @@ extern u8 x86_cpu_to_apicid[];
* from the initial startup. We map APIC_BASE very early in page_setup(),
* so this is correct in the x86 case.
*/
-#define smp_processor_id() (current_thread_info()->cpu)
+//#define smp_processor_id() (current_thread_info()->cpu)
+#define smp_processor_id() 0

extern cpumask_t cpu_callout_map;
#define cpu_possible_map cpu_callout_map
diff -urNp linux-2.6.9.orig/include/asm-i386/thread_info.h linux-2.6.9/include/asm-i386/thread_info.h
--- linux-2.6.9.orig/include/asm-i386/thread_info.h 2004-10-18 23:53:21.000000000 +0200
+++ linux-2.6.9/include/asm-i386/thread_info.h 2005-06-30 13:22:14.000000000 +0300
@@ -55,7 +55,7 @@ struct thread_info {
#ifdef CONFIG_4KSTACKS
#define THREAD_SIZE (4096)
#else
-#define THREAD_SIZE (8192)
+#define THREAD_SIZE (8192*2)
#endif

#define STACK_WARN (THREAD_SIZE/8)
diff -urNp linux-2.6.9.orig/include/asm-i386/vmalloc_thread_info.h linux-2.6.9/include/asm-i386/vmalloc_thread_info.h
--- linux-2.6.9.orig/include/asm-i386/vmalloc_thread_info.h 1970-01-01 02:00:00.000000000 +0200
+++ linux-2.6.9/include/asm-i386/vmalloc_thread_info.h 2005-06-30 13:25:55.000000000 +0300
@@ -0,0 +1,15 @@
+#ifndef _LINUX_VMALLOC_THREAD_INFO_H
+#define _LINUX_VMALLOC_THREAD_INFO_H
+
+#include <linux/spinlock.h>
+#include <asm/page.h> /* pgprot_t */
+
+
+/*
+ * Highlevel APIs for driver use
+ */
+extern void *vmalloc_thread_info(unsigned long size);
+extern void expand_stack_size(struct vm_struct *area);
+
+#endif /* _LINUX_VMALLOC_THREAD_INFO_H */
+
diff -urNp linux-2.6.9.orig/include/linux/gfp.h linux-2.6.9/include/linux/gfp.h
--- linux-2.6.9.orig/include/linux/gfp.h 2004-10-18 23:53:44.000000000 +0200
+++ linux-2.6.9/include/linux/gfp.h 2005-06-30 13:23:23.000000000 +0300
@@ -91,6 +91,19 @@ static inline struct page *alloc_pages_n
NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK));
}

+extern struct page *
+FASTCALL(__alloc_thread_info_pages(unsigned int, unsigned int, struct zonelist *));
+
+static inline struct page *alloc_thread_info_pages_node(int nid, unsigned int gfp_mask,
+ unsigned int order)
+{
+ if (unlikely(order >= MAX_ORDER))
+ return NULL;
+
+ return __alloc_thread_info_pages(gfp_mask, order,
+ NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK));
+}
+
#ifdef CONFIG_NUMA
extern struct page *alloc_pages_current(unsigned gfp_mask, unsigned order);

@@ -107,9 +120,12 @@ extern struct page *alloc_page_vma(unsig
#else
#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)
+#define alloc_thread_info_pages(gfp_mask, order) \
+ alloc_thread_info_pages_node(numa_node_id(), gfp_mask, order)
#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0)
#endif
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
+#define alloc_thread_info_page(gfp_mask) alloc_thread_info_pages(gfp_mask, 0)

extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order));
extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask));
diff -urNp linux-2.6.9.orig/kernel/fork.c linux-2.6.9/kernel/fork.c
--- linux-2.6.9.orig/kernel/fork.c 2004-10-18 23:53:13.000000000 +0200
+++ linux-2.6.9/kernel/fork.c 2005-06-30 13:21:42.000000000 +0300
@@ -79,7 +79,11 @@ static kmem_cache_t *task_struct_cachep;

void free_task(struct task_struct *tsk)
{
+#if 0
free_thread_info(tsk->thread_info);
+#else
+ vfree_thread_info(tsk->thread_info);
+#endif
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -264,7 +268,12 @@ static struct task_struct *dup_task_stru
if (!tsk)
return NULL;

+#if 0
ti = alloc_thread_info(tsk);
+#else
+ ti = vmalloc_thread_info(THREAD_SIZE);
+#endif
+
if (!ti) {
free_task_struct(tsk);
return NULL;
diff -urNp linux-2.6.9.orig/mm/Makefile linux-2.6.9/mm/Makefile
--- linux-2.6.9.orig/mm/Makefile 2004-10-18 23:54:37.000000000 +0200
+++ linux-2.6.9/mm/Makefile 2005-06-30 13:20:45.000000000 +0300
@@ -5,7 +5,7 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o
+ vmalloc.o vmalloc_thread_info.o

obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o prio_tree.o \
diff -urNp linux-2.6.9.orig/mm/page_alloc.c linux-2.6.9/mm/page_alloc.c
--- linux-2.6.9.orig/mm/page_alloc.c 2004-10-18 23:53:11.000000000 +0200
+++ linux-2.6.9/mm/page_alloc.c 2005-06-30 13:20:28.000000000 +0300
@@ -2069,3 +2069,157 @@ void *__init alloc_large_system_hash(con

return table;
}
+
+
+struct page * fastcall
+__alloc_thread_info_pages(unsigned int gfp_mask, unsigned int order,
+ struct zonelist *zonelist)
+{
+ const int wait = gfp_mask & __GFP_WAIT;
+ unsigned long min;
+ struct zone **zones, *z;
+ struct page *page;
+ struct reclaim_state reclaim_state;
+// struct task_struct *p = current;
+ int i;
+ int alloc_type;
+ int do_retry;
+ int can_try_harder;
+
+ might_sleep_if(wait);
+
+ /*
+ * The caller may dip into page reserves a bit more if the caller
+ * cannot run direct reclaim, or is the caller has realtime scheduling
+ * policy
+ */
+// can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+ can_try_harder = 0;
+
+ zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
+
+ if (unlikely(zones[0] == NULL)) {
+ /* Should this ever happen?? */
+ return NULL;
+ }
+
+ alloc_type = zone_idx(zones[0]);
+
+ /* Go through the zonelist once, looking for a zone with enough free */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ min = z->pages_low + (1<<order) + z->protection[alloc_type];
+
+ if (z->free_pages < min)
+ continue;
+
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
+ }
+
+ for (i = 0; (z = zones[i]) != NULL; i++)
+ wakeup_kswapd(z);
+
+ /*
+ * Go through the zonelist again. Let __GFP_HIGH and allocations
+ * coming from realtime tasks to go deeper into reserves
+ */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ min = z->pages_min;
+ if (gfp_mask & __GFP_HIGH)
+ min /= 2;
+ if (can_try_harder)
+ min -= min / 4;
+ min += (1<<order) + z->protection[alloc_type];
+
+ if (z->free_pages < min)
+ continue;
+
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
+ }
+
+#if 0
+ /* This allocation should allow future memory freeing. */
+ if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
+ /* go through the zonelist yet again, ignoring mins */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
+ }
+ goto nopage;
+ }
+#endif
+#if 0
+ /* Atomic allocations - we can't balance anything */
+ if (!wait)
+ goto nopage;
+
+#endif
+rebalance:
+#if 0
+ /* We now go into synchronous reclaim */
+ p->flags |= PF_MEMALLOC;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+
+ try_to_free_pages(zones, gfp_mask, order);
+
+ p->reclaim_state = NULL;
+ p->flags &= ~PF_MEMALLOC;
+#endif
+ /* go through the zonelist yet one more time */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ min = z->pages_min;
+ if (gfp_mask & __GFP_HIGH)
+ min /= 2;
+ if (can_try_harder)
+ min -= min / 4;
+ min += (1<<order) + z->protection[alloc_type];
+
+ if (z->free_pages < min)
+ continue;
+
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
+ }
+
+ /*
+ * Don't let big-order allocations loop unless the caller explicitly
+ * requests that. Wait for some write requests to complete then retry.
+ *
+ * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
+ * <= 3, but that may not be true in other implementations.
+ */
+ do_retry = 0;
+ if (!(gfp_mask & __GFP_NORETRY)) {
+ if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+ do_retry = 1;
+ if (gfp_mask & __GFP_NOFAIL)
+ do_retry = 1;
+ }
+ if (do_retry) {
+ blk_congestion_wait(WRITE, HZ/50);
+ goto rebalance;
+ }
+
+nopage:
+#if 0
+ if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
+ printk(KERN_WARNING "%s: page allocation failure."
+ " order:%d, mode:0x%x\n",
+ p->comm, order, gfp_mask);
+ dump_stack();
+ }
+#endif
+ return NULL;
+got_pg:
+ zone_statistics(zonelist, z);
+ kernel_map_pages(page, 1 << order, 1);
+ return page;
+}
+
+EXPORT_SYMBOL(__alloc_thread_info_pages);
diff -urNp linux-2.6.9.orig/mm/vmalloc_thread_info.c linux-2.6.9/mm/vmalloc_thread_info.c
--- linux-2.6.9.orig/mm/vmalloc_thread_info.c 1970-01-01 02:00:00.000000000 +0200
+++ linux-2.6.9/mm/vmalloc_thread_info.c 2005-07-01 03:45:18.000000000 +0300
@@ -0,0 +1,502 @@
+/*
+ * linux/mm/vmalloc_thread_info.c
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <linux/vmalloc.h>
+
+#include <asm/uaccess.h>
+#include <asm/tlbflush.h>
+
+
+extern rwlock_t vmlist_lock;
+extern struct vm_struct *vmlist;
+
+void __vunmap_thread_info(void *addr, int deallocate_pages)
+{
+ struct vm_struct *area;
+
+ if (!addr)
+ return;
+
+ if ((PAGE_SIZE-1) & (unsigned long)addr) {
+ printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
+ WARN_ON(1);
+ return;
+ }
+
+ area = remove_vm_area(addr);
+ if (unlikely(!area)) {
+ printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+ addr);
+ WARN_ON(1);
+ return;
+ }
+
+ if (deallocate_pages) {
+ int i;
+
+ for (i = 0; i < area->nr_pages; i++) {
+ if (!(area->pages[i]))
+ continue;
+ __free_page(area->pages[i]);
+ }
+ kfree(area->pages);
+ }
+
+ kfree(area);
+ return;
+}
+
+void vfree_thread_info(void *addr)
+{
+ BUG_ON(in_interrupt());
+ __vunmap_thread_info(addr, 1);
+}
+
+EXPORT_SYMBOL(vfree_thread_info);
+
+static int map_area_pte_ti(pte_t *pte, unsigned long address,
+ unsigned long size, pgprot_t prot,
+ struct page ***pages)
+{
+ unsigned long end;
+ struct page *page = **pages;
+
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+
+// do {
+ //struct page *page = **pages;
+ WARN_ON(!pte_none(*pte));
+ if (!page)
+ return -ENOMEM;
+ set_pte(pte, mk_pte(page, prot));
+
+
+// address += PAGE_SIZE;
+ pte += (THREAD_SIZE/PAGE_SIZE-1);
+ (*pages) += (THREAD_SIZE/PAGE_SIZE-1);
+ page = **pages;
+ WARN_ON(!pte_none(*pte));
+ if (!page)
+ return -ENOMEM;
+ set_pte(pte, mk_pte(page, prot));
+
+// address += PAGE_SIZE;
+// pte++;
+// (*pages)++;
+
+
+
+// } while (address < end);
+ return 0;
+}
+
+static int map_area_pmd_ti(pmd_t *pmd, unsigned long address,
+ unsigned long size, pgprot_t prot,
+ struct page ***pages)
+{
+ unsigned long base, end;
+
+ base = address & PGDIR_MASK;
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+
+ do {
+ pte_t * pte = pte_alloc_kernel(&init_mm, pmd, base + address);
+ if (!pte)
+ return -ENOMEM;
+ if (map_area_pte_ti(pte, address, end - address, prot, pages))
+ return -ENOMEM;
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address < end);
+
+ return 0;
+}
+
+int map_vm_area_ti(struct vm_struct *area, pgprot_t prot, struct page ***pages)
+{
+ unsigned long address = (unsigned long) area->addr;
+ unsigned long end = address + (area->size-PAGE_SIZE);
+ pgd_t *dir;
+ int err = 0;
+
+ dir = pgd_offset_k(address);
+ spin_lock(&init_mm.page_table_lock);
+ do {
+ pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
+ if (!pmd) {
+ err = -ENOMEM;
+ break;
+ }
+ if (map_area_pmd_ti(pmd, address, end - address, prot, pages)) {
+ err = -ENOMEM;
+ break;
+ }
+
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ dir++;
+ } while (address && (address < end));
+
+ spin_unlock(&init_mm.page_table_lock);
+ flush_cache_vmap((unsigned long) area->addr, end);
+ return err;
+}
+
+struct vm_struct *__get_vm_area_ti(unsigned long size, unsigned long flags,
+ unsigned long start, unsigned long end)
+{
+ struct vm_struct **p, *tmp, *area;
+ unsigned long align = THREAD_SIZE;
+ unsigned long addr;
+
+ addr = ALIGN(start, align);
+ area = kmalloc(sizeof(*area), GFP_KERNEL);
+ if (unlikely(!area))
+ return NULL;
+
+ /*
+ * We always allocate a guard page.
+ */
+ size += PAGE_SIZE;
+ if (unlikely(!size)) {
+ kfree (area);
+ return NULL;
+ }
+
+ write_lock(&vmlist_lock);
+ for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
+ if ((unsigned long)tmp->addr < addr) {
+ if((unsigned long)tmp->addr + tmp->size >= addr)
+ addr = ALIGN(tmp->size +
+ (unsigned long)tmp->addr, align);
+ continue;
+ }
+ if ((size + addr) < addr)
+ goto out;
+ if (size + addr <= (unsigned long)tmp->addr)
+ goto found;
+ addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
+ if (addr > end - size)
+ goto out;
+ }
+
+found:
+ area->next = *p;
+ *p = area;
+
+ area->flags = flags;
+ area->addr = (void *)addr;
+ area->size = size;
+ area->pages = NULL;
+ area->nr_pages = 0;
+ area->phys_addr = 0;
+ write_unlock(&vmlist_lock);
+ return area;
+
+out:
+ write_unlock(&vmlist_lock);
+ kfree(area);
+ if (printk_ratelimit())
+ printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
+ return NULL;
+}
+
+struct vm_struct *get_vm_area_ti(unsigned long size, unsigned long flags)
+{
+ return __get_vm_area_ti(size, flags, VMALLOC_START, VMALLOC_END);
+}
+
+void *__vmalloc_thread_info(unsigned long size, int gfp_mask, pgprot_t prot)
+{
+ struct vm_struct *area;
+ struct page **pages;
+ unsigned int nr_pages, array_size;
+
+ size = PAGE_ALIGN(size);
+ if (!size || (size >> PAGE_SHIFT) > num_physpages)
+ return NULL;
+
+ area = get_vm_area_ti(size, VM_ALLOC);
+ if (!area)
+ return NULL;
+
+ nr_pages = size >> PAGE_SHIFT;
+ array_size = (nr_pages * sizeof(struct page *));
+
+ area->nr_pages = nr_pages;
+ area->pages = pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
+ if (!area->pages) {
+ remove_vm_area(area->addr);
+ kfree(area);
+ return NULL;
+ }
+ memset(area->pages, 0, array_size);
+
+// for (i = 0; i < area->nr_pages; i++) {
+
+ area->pages[0] = alloc_page(gfp_mask);
+ if (unlikely(!area->pages[0])) {
+ /* Successfully allocated i pages, free them in __vunmap() */
+ area->nr_pages = 4; //try to free all pages
+ goto fail;
+ }
+
+#if 0
+ area->pages[1] = alloc_page(gfp_mask);
+ if (unlikely(!area->pages[1])) {
+ /* Successfully allocated i pages, free them in __vunmap() */
+ area->nr_pages = 1;
+ goto fail;
+ }
+
+ area->pages[2] = alloc_page(gfp_mask);
+ if (unlikely(!area->pages[2])) {
+ /* Successfully allocated i pages, free them in __vunmap() */
+ area->nr_pages = 2;
+ goto fail;
+ }
+#endif
+ area->pages[THREAD_SIZE/PAGE_SIZE-1] = alloc_page(gfp_mask);
+ if (unlikely(!area->pages[THREAD_SIZE/PAGE_SIZE-1])) {
+ /* Successfully allocated i pages, free them in __vunmap() */
+ area->nr_pages = 4;
+ goto fail;
+ }
+// }
+
+ if (map_vm_area_ti(area, prot, &pages))
+ goto fail;
+
+ area->nr_pages = 4;
+ return area->addr;
+
+fail:
+ area->nr_pages = 4;
+ vfree_thread_info(area->addr);
+ return NULL;
+}
+
+void *vmalloc_thread_info(unsigned long size)
+{
+ return __vmalloc_thread_info(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+}
+
+EXPORT_SYMBOL(vmalloc_thread_info);
+
+struct vm_struct *find_vm_area(void *addr)
+{
+ struct vm_struct **p, *tmp;
+
+ write_lock(&vmlist_lock);
+ for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
+ if ((unsigned)(tmp->addr) == (unsigned)addr)
+ goto found;
+ }
+ write_unlock(&vmlist_lock);
+ return NULL;
+
+found:
+ printk("vm_area found 0x%x\n", (unsigned int)(tmp->addr));
+// unmap_vm_area(tmp);
+// *p = tmp->next;
+ write_unlock(&vmlist_lock);
+ return tmp;
+}
+
+EXPORT_SYMBOL(find_vm_area);
+
+void page_table_trace(unsigned start_address)
+{
+ unsigned int curr_address, i;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *ptep;
+
+
+ for(i=0; i<4; i++) {
+ printk("-------------------------------\n");
+ curr_address = start_address+i*PAGE_SIZE;
+ pgd = pgd_offset_k(curr_address);
+ if (pgd_none(*pgd) || pgd_bad(*pgd))
+ printk("bad pgd address 0x%x\n", curr_address);
+#if 0
+ else
+ printk("pgd address 0x%x pgd 0x%x\n", curr_address, (unsigned int)(pgd_val(*pgd)));
+#endif
+
+ pmd = pmd_offset(pgd, curr_address);
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
+ printk("bad pmd address 0x%x\n", curr_address);
+#if 0
+ else
+ printk("pmd address 0x%x pmd 0x%x\n", curr_address, (unsigned int)(pmd_val(*pmd)));
+#endif
+ ptep = pte_offset_kernel(pmd, curr_address);
+ if (!ptep)
+ printk("bad pte address 0x%x\n", curr_address);
+ else
+ printk("pte address 0x%x pte 0x%x\n", curr_address, (unsigned int)(pte_val(*ptep)));
+ }
+
+ printk("-------------------------------\n");
+}
+
+static int map_expand_stack_area_pte(pte_t *pte, unsigned long address,
+ unsigned long size, pgprot_t prot,
+ struct page ***pages)
+{
+ unsigned long end;
+
+ printk(KERN_INFO "Pre map_expand_stack_area_pte address 0x%x, size %d\n", (unsigned)address, (unsigned)size);
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+
+ do {
+ struct page *page = **pages;
+
+ WARN_ON(!pte_none(*pte));
+ if (!page)
+ return -ENOMEM;
+
+ set_pte(pte, mk_pte(page, prot));
+
+ printk(KERN_INFO "map_expand_stack_area_pte pte 0x%x address 0x%x\n", (unsigned int)(pte_val(*pte)), (unsigned)address);
+ address += PAGE_SIZE;
+ pte++;
+ (*pages)++;
+ } while (address < end);
+ return 0;
+}
+
+static int map_expand_stack_area_pmd(pmd_t *pmd, unsigned long address,
+ unsigned long size, pgprot_t prot,
+ struct page ***pages)
+{
+ unsigned long base, end;
+
+ base = address & PGDIR_MASK;
+ address &= ~PGDIR_MASK;
+ end = address + size;
+
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+
+ printk("map_expand_stack_area_pmd: address 0x%x, end 0x%x\n", (unsigned int)address, (unsigned int)end);
+
+ do {
+ pte_t * pte = pte_alloc_kernel(&init_mm, pmd, base + address);
+ if (!pte)
+ return -ENOMEM;
+ if (map_expand_stack_area_pte(pte, address, end - address, prot, pages))
+ return -ENOMEM;
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address < end);
+
+ return 0;
+}
+
+int static map_expand_stack_vm_area(unsigned long address, pgprot_t prot, struct page ***pages)
+{
+// unsigned long address = (unsigned long) area->addr;
+ unsigned long end = address + PAGE_SIZE; // 1 page mappings
+// unsigned long end = address + PAGE_SIZE*2; // 2 pages mappings
+ //unsigned long end = address + (area->size-PAGE_SIZE);
+ pgd_t *dir;
+ int err = 0;
+
+ dir = pgd_offset_k(address);
+ spin_lock(&init_mm.page_table_lock);
+ do {
+ pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
+ if (!pmd) {
+ err = -ENOMEM;
+ break;
+ }
+ if (map_expand_stack_area_pmd(pmd, address, end - address, prot, pages)) {
+ err = -ENOMEM;
+ break;
+ }
+
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ dir++;
+ } while (address && (address < end));
+
+ spin_unlock(&init_mm.page_table_lock);
+ flush_cache_vmap(address, end);
+ //flush_cache_vmap((unsigned long) area->addr, end);
+ return err;
+}
+
+void expand_stack_size(struct vm_struct *area)
+{
+ struct page **pages;
+ unsigned int expand_address = (unsigned int)((area->addr)+PAGE_SIZE*2);
+// unsigned int expand_address = (unsigned int)((area->addr)+PAGE_SIZE);
+ unsigned gfp_mask = GFP_KERNEL | __GFP_HIGHMEM;
+ pgprot_t prot = PAGE_KERNEL;
+
+#if 1
+ if(area){
+ if(area->pages[0])
+ printk("area[0] exist ");
+ if(area->pages[1])
+ printk("area[1] exist ");
+ if(area->pages[2])
+ printk("area[2] exist ");
+ if(area->pages[3])
+ printk("area[3] exist\n");
+ }
+#endif
+
+ area->pages[2] = alloc_thread_info_page(gfp_mask);
+ if (unlikely(!area->pages[2])) {
+ /* Successfully allocated i pages, free them in __vunmap() */
+ area->nr_pages = 2;
+ printk("Alloc page failed\n");
+ goto fail;
+ }
+
+#if 0
+ area->pages[1] = alloc_thread_info_page(gfp_mask);
+ if (unlikely(!area->pages[1])) {
+ /* Successfully allocated i pages, free them in __vunmap() */
+ area->nr_pages = 1;
+ printk("Alloc page failed\n");
+ goto fail;
+ }
+
+#endif
+
+ area->nr_pages = 4;
+ pages = &(area->pages[2]);
+
+ if (map_expand_stack_vm_area(expand_address, prot, &pages))
+ goto fail;
+
+ page_table_trace((unsigned int)(area->addr));
+ return;
+fail:
+
+ printk("failed to expand_stack_size\n");
+// vfree(area->addr);
+}
+
+EXPORT_SYMBOL(expand_stack_size);
+