[rfc] x86: optimise page fault path a little
From: Nick Piggin
Date: Thu Nov 13 2008 - 02:28:39 EST
Hi,
I was just looking around the page fault code for any obvious performance
improvements. I noticed do_page_fault is rather big, uses a lot of stack,
and generates some branch mispredicts.
do_page_fault is only about 1.1% of the profile for the workload I'm looking
at, so the improvement is pretty much in the noise, but I wonder whether
micro optimisations like the following would be welcome?

This patch adds branch hints and moves the error-handling code out of line.
It shrinks do_page_fault from 2410 bytes to 603 bytes, and from 352 to 64
bytes of stack. Total text size does grow by about 500 bytes due to the
additional out-of-line functions.
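For anyone who wants to see the pattern in isolation, here is a minimal
standalone sketch (not taken from fault.c -- the names are made up, and
likely()/unlikely() are redefined here to mirror the kernel macros) of what
"branch hints plus out-of-line error handling" means: the cold path is a
separate noinline function, and the branch that reaches it is annotated as
unlikely, so the hot path stays small and falls straight through.

#include <stdio.h>

/* Mirror the kernel's branch hint macros via __builtin_expect. */
#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

/* Cold path: noinline keeps it from bloating the caller's text. */
static __attribute__((noinline)) void handle_bad_access(unsigned long addr)
{
	fprintf(stderr, "bad access at %#lx\n", addr);
}

/* Hot path: one predicted-not-taken branch, then the common case. */
static int demo_fault(unsigned long addr, unsigned long limit)
{
	if (unlikely(addr >= limit)) {
		handle_bad_access(addr);	/* out-of-line slow path */
		return -1;
	}
	return 0;				/* common case */
}

int main(void)
{
	demo_fault(0x1000, 0x10000);	/* fast path, no output */
	demo_fault(0x20000, 0x10000);	/* takes the unlikely branch */
	return 0;
}

With gcc this compiles as-is; the compiler places the unlikely block (and the
noinline helper) away from the straight-line fast path, which is the same
effect the patch below aims for in do_page_fault.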
---
Index: linux-2.6/arch/x86/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/fault.c 2008-11-13 18:20:33.000000000 +1100
+++ linux-2.6/arch/x86/mm/fault.c 2008-11-13 18:23:26.000000000 +1100
@@ -91,8 +91,8 @@ static inline int notify_page_fault(stru
*
* Opcode checker based on code by Richard Brunner
*/
-static int is_prefetch(struct pt_regs *regs, unsigned long addr,
- unsigned long error_code)
+static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
+ unsigned long addr)
{
unsigned char *instr;
int scan_more = 1;
@@ -409,16 +409,16 @@ static void show_fault_oops(struct pt_re
}
#ifdef CONFIG_X86_64
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
- unsigned long error_code)
+/* TODO: match order of arguments */
+static noinline void pgtable_bad(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
{
unsigned long flags = oops_begin();
- struct task_struct *tsk;
+ struct task_struct *tsk = current;
printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
- current->comm, address);
+ tsk->comm, address);
dump_pagetable(address);
- tsk = current;
tsk->thread.cr2 = address;
tsk->thread.trap_no = 14;
tsk->thread.error_code = error_code;
@@ -428,6 +428,198 @@ static noinline void pgtable_bad(unsigne
}
#endif
+static noinline void no_context(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ struct task_struct *tsk = current;
+#ifdef CONFIG_X86_64
+ unsigned long flags;
+#endif
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs))
+ return;
+
+ /*
+ * X86_32
+ * Valid to do another page fault here, because if this fault
+ * had been triggered by is_prefetch fixup_exception would have
+ * handled it.
+ *
+ * X86_64
+ * Hall of shame of CPU/BIOS bugs.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata93(regs, address))
+ return;
+
+ /*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+#ifdef CONFIG_X86_32
+ bust_spinlocks(1);
+#else
+ flags = oops_begin();
+#endif
+
+ show_fault_oops(regs, error_code, address);
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+
+#ifdef CONFIG_X86_32
+ die("Oops", regs, error_code);
+ bust_spinlocks(0);
+ do_exit(SIGKILL);
+#else
+ if (__die("Oops", regs, error_code))
+ regs = NULL;
+ /* Executive summary in case the body of the oops scrolled away */
+ printk(KERN_EMERG "CR2: %016lx\n", address);
+ oops_end(flags, regs, SIGKILL);
+#endif
+}
+
+static void __bad_area_nosemaphore(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address,
+ int si_code)
+{
+ struct task_struct *tsk = current;
+
+ /* User mode accesses just cause a SIGSEGV */
+ if (error_code & PF_USER) {
+ /*
+ * It's possible to have interrupts off here.
+ */
+ local_irq_enable();
+
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata100(regs, address))
+ return;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk(
+ "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *) regs->ip, (void *) regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+
+ tsk->thread.cr2 = address;
+ /* Kernel addresses are always protection faults */
+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+ tsk->thread.trap_no = 14;
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+ return;
+ }
+
+ if (is_f00f_bug(regs, address))
+ return;
+
+ no_context(regs, error_code, address);
+}
+
+static noinline void bad_area_nosemaphore(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+
+static void __bad_area(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address,
+ int si_code)
+{
+ struct mm_struct *mm = current->mm;
+
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ up_read(&mm->mmap_sem);
+
+ __bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+
+static noinline void bad_area(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ __bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+
+static noinline void bad_area_accerr(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ __bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+/* TODO: fixup for oom handling */
+static void out_of_memory(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ struct task_struct *tsk = current;
+ struct mm_struct *mm = tsk->mm;
+ /*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+ up_read(&mm->mmap_sem);
+ if (is_global_init(tsk)) {
+ yield();
+ return;
+ }
+
+ printk("VM: killing process %s\n", tsk->comm);
+ if (error_code & PF_USER)
+ do_group_exit(SIGKILL);
+ no_context(regs, error_code, address);
+}
+
+static void do_sigbus(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ struct task_struct *tsk = current;
+ struct mm_struct *mm = tsk->mm;
+
+ up_read(&mm->mmap_sem);
+
+ /* Kernel mode? Handle exceptions or die */
+ if (!(error_code & PF_USER)) {
+ no_context(regs, error_code, address);
+ return;
+ }
+#ifdef CONFIG_X86_32
+ /* User space => ok to do another page fault */
+ if (is_prefetch(regs, error_code, address))
+ return;
+#endif
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 14;
+ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+static noinline void mm_fault_error(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address, unsigned int fault)
+{
+ if (fault & VM_FAULT_OOM)
+ out_of_memory(regs, error_code, address);
+ else if (fault & VM_FAULT_SIGBUS)
+ do_sigbus(regs, error_code, address);
+ else
+ BUG();
+}
+
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & PF_WRITE) && !pte_write(*pte))
@@ -447,8 +639,8 @@ static int spurious_fault_check(unsigned
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static int spurious_fault(unsigned long address,
- unsigned long error_code)
+static noinline int spurious_fault(unsigned long error_code,
+ unsigned long address)
{
pgd_t *pgd;
pud_t *pud;
@@ -493,7 +685,7 @@ static int spurious_fault(unsigned long
*
* This assumes no large pages in there.
*/
-static int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
unsigned long pgd_paddr;
@@ -582,15 +774,12 @@ asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
+ unsigned long address;
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct *vma;
- unsigned long address;
int write, si_code;
int fault;
-#ifdef CONFIG_X86_64
- unsigned long flags;
-#endif
tsk = current;
mm = tsk->mm;
@@ -601,7 +790,7 @@ void __kprobes do_page_fault(struct pt_r
si_code = SEGV_MAPERR;
- if (notify_page_fault(regs))
+ if (unlikely(notify_page_fault(regs)))
return;
if (unlikely(kmmio_fault(regs, address)))
return;
@@ -636,7 +825,8 @@ void __kprobes do_page_fault(struct pt_r
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock.
*/
- goto bad_area_nosemaphore;
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
}
@@ -655,17 +845,18 @@ void __kprobes do_page_fault(struct pt_r
#ifdef CONFIG_X86_64
if (unlikely(error_code & PF_RSVD))
- pgtable_bad(address, regs, error_code);
+ pgtable_bad(regs, error_code, address);
#endif
/*
* If we're in an interrupt, have no user context or are running in an
* atomic region then we must not take the fault.
*/
- if (unlikely(in_atomic() || !mm))
- goto bad_area_nosemaphore;
+ if (unlikely(in_atomic() || !mm)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
-again:
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
@@ -682,20 +873,26 @@ again:
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
- !search_exception_tables(regs->ip))
- goto bad_area_nosemaphore;
+ !search_exception_tables(regs->ip)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
down_read(&mm->mmap_sem);
}
vma = find_vma(mm, address);
- if (!vma)
- goto bad_area;
- if (vma->vm_start <= address)
+ if (unlikely(!vma)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+ if (likely(vma->vm_start <= address))
goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
if (error_code & PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
@@ -703,31 +900,34 @@ again:
* and pusha to work. ("enter $65535,$31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
- if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
- goto bad_area;
+ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
}
- if (expand_stack(vma, address))
- goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
+ if (unlikely(expand_stack(vma, address))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
good_area:
si_code = SEGV_ACCERR;
- write = 0;
- switch (error_code & (PF_PROT|PF_WRITE)) {
- default: /* 3: write, present */
- /* fall through */
- case PF_WRITE: /* write, not present */
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- write++;
- break;
- case PF_PROT: /* read, present */
- goto bad_area;
- case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
- goto bad_area;
+ write = error_code & PF_WRITE;
+ if (write) {
+ if (unlikely(!(vma->vm_flags & VM_WRITE))) {
+ bad_area_accerr(regs, error_code, address);
+ return;
+ }
+ } else if (unlikely(error_code & PF_PROT)) {
+ bad_area_accerr(regs, error_code, address);
+ return;
+ } else if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) {
+ bad_area_accerr(regs, error_code, address);
+ return;
}
/*
@@ -737,11 +937,8 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, write);
if (unlikely(fault & VM_FAULT_ERROR)) {
- if (fault & VM_FAULT_OOM)
- goto out_of_memory;
- else if (fault & VM_FAULT_SIGBUS)
- goto do_sigbus;
- BUG();
+ mm_fault_error(regs, error_code, address, fault);
+ return;
}
if (fault & VM_FAULT_MAJOR)
tsk->maj_flt++;
@@ -752,145 +949,13 @@ good_area:
/*
* Did it hit the DOS screen memory VA from vm86 mode?
*/
- if (v8086_mode(regs)) {
+ if (unlikely(v8086_mode(regs))) {
unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
if (bit < 32)
tsk->thread.screen_bitmap |= 1 << bit;
}
#endif
up_read(&mm->mmap_sem);
- return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
- up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
- /* User mode accesses just cause a SIGSEGV */
- if (error_code & PF_USER) {
- /*
- * It's possible to have interrupts off here.
- */
- local_irq_enable();
-
- /*
- * Valid to do another page fault here because this one came
- * from user space.
- */
- if (is_prefetch(regs, address, error_code))
- return;
-
- if (is_errata100(regs, address))
- return;
-
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(
- "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address,
- (void *) regs->ip, (void *) regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
-
- tsk->thread.cr2 = address;
- /* Kernel addresses are always protection faults */
- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
- tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk);
- return;
- }
-
- if (is_f00f_bug(regs, address))
- return;
-
-no_context:
- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs))
- return;
-
- /*
- * X86_32
- * Valid to do another page fault here, because if this fault
- * had been triggered by is_prefetch fixup_exception would have
- * handled it.
- *
- * X86_64
- * Hall of shame of CPU/BIOS bugs.
- */
- if (is_prefetch(regs, address, error_code))
- return;
-
- if (is_errata93(regs, address))
- return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-#ifdef CONFIG_X86_32
- bust_spinlocks(1);
-#else
- flags = oops_begin();
-#endif
-
- show_fault_oops(regs, error_code, address);
-
- tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
- tsk->thread.error_code = error_code;
-
-#ifdef CONFIG_X86_32
- die("Oops", regs, error_code);
- bust_spinlocks(0);
- do_exit(SIGKILL);
-#else
- if (__die("Oops", regs, error_code))
- regs = NULL;
- /* Executive summary in case the body of the oops scrolled away */
- printk(KERN_EMERG "CR2: %016lx\n", address);
- oops_end(flags, regs, SIGKILL);
-#endif
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
- up_read(&mm->mmap_sem);
- if (is_global_init(tsk)) {
- yield();
- /*
- * Re-lookup the vma - in theory the vma tree might
- * have changed:
- */
- goto again;
- }
-
- printk("VM: killing process %s\n", tsk->comm);
- if (error_code & PF_USER)
- do_group_exit(SIGKILL);
- goto no_context;
-
-do_sigbus:
- up_read(&mm->mmap_sem);
-
- /* Kernel mode? Handle exceptions or die */
- if (!(error_code & PF_USER))
- goto no_context;
-#ifdef CONFIG_X86_32
- /* User space => ok to do another page fault */
- if (is_prefetch(regs, address, error_code))
- return;
-#endif
- tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}
DEFINE_SPINLOCK(pgd_lock);