[PATCH -next 1/2][RFC] x86: Saveoops: Switch to real-mode and call BIOS

From: Ahmed S. Darwish
Date: Tue Jan 25 2011 - 08:51:38 EST



We get called here upon panic()s to save the kernel log buffer.

First, switch from 64-bit long mode to 16-bit real mode. Afterwards, save the
log buffer to disk using extended INT 0x13 BIOS services. The user has given
us an absolute LBA disk address to save the log buffer to.

By x86 design, this code is mandated to run on a single identity-mapped page.

- How to initialize the disk hardware to its POST state (thus making the
BIOS code work reliably) while keeping system RAM unmodified?

- Is it guaranteed that '0x80' will always be the boot disk drive number?
If not, we need to be passed the boot drive number from the bootloader.

Signed-off-by: Ahmed S. Darwish <darwish.07@xxxxxxxxx>
---

arch/x86/kernel/saveoops-rmode.S | 483 ++++++++++++++++++++++++++++++++++++++
1 files changed, 483 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/saveoops-rmode.S b/arch/x86/kernel/saveoops-rmode.S
new file mode 100644
index 0000000..6e07112
--- /dev/null
+++ b/arch/x86/kernel/saveoops-rmode.S
@@ -0,0 +1,483 @@
+/* PROTOTYPE - PROTOTYPE - PROTOTYPE - PROTOTYPE - PROTOTYPE - PROTOTYPE */
+
+/*
+ * Saveoops LongMode -> RealMode switch
+ *
+ * Don't come here with any unfinished business at hand, there's no return.
+ * After writing the log buffer to disk, we just halt.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/pgtable_types.h>
+#include <asm/segment.h>
+#include <asm/saveoops.h>
+
+/*
+ * Notes:
+ * - Avoid using relocatable symbols: we run from a different place than
+ * where we're originally linked to. Use absolute addresses
+ * - Run this from an identity page since we disable paging
+ * - Dynamic values are used for all x86 table bases to let this code run
+ * from *any* memory region below 1-Mbyte
+ */
+ .code64
+ENTRY(saveoops_start)
+ /* In: %rdi = run-time address of saveoops_start, %esi = ringbuf_addr,
+ * %edx = rstack_base, %rcx = disk_sector, %r8d = ringbuf_len. No return.
+ * Switch to 32bit-compatibility mode using a L=0 code segment */
+
+ cli
+
+ /* Permanently store passed parameters */
+ movq %rdi, %rbp # %rbp = base for every (label - saveoops_start) access below
+ movl %esi, (ringbuf_addr - saveoops_start)(%ebp)
+ movl %edx, (rstack_base - saveoops_start)(%ebp)
+ movq %rcx, (disk_sector - saveoops_start)(%ebp) # stored directly into the disk address packet
+ movl %r8d, (ringbuf_len - saveoops_start)(%ebp)
+
+ /* Dynamically set the 32bit-compat. GDTR base */
+ leaq (lmode32_gdt - saveoops_start)(%ebp), %rax
+ movq %rax, (lmode32_gdt + 2 - saveoops_start)(%ebp) # 8-byte base field after the limit word
+
+ /* Dynamically set the 32bit farpointer base */
+ leal (compat32 - saveoops_start)(%ebp), %eax
+ movl %eax, (lmode32_farpointer - saveoops_start)(%ebp) # offset part; selector part is static
+
+ lgdt (lmode32_gdt - saveoops_start)(%ebp)
+ ljmpl *(lmode32_farpointer - saveoops_start)(%ebp) # addr32
+
+ .code32
+compat32:
+ /* 32bit-compatibility Long Mode, using a L=0 %cs.
+ * Steps: paging off -> build a 2MB PAE identity map in reused head.S
+ * tables -> clear EFER.LME -> paging on (now plain 32-bit pmode). */
+
+ movw $__KERNEL_DS, %ax # NOTE(review): must equal (lmode32_ds - lmode32_gdt) - confirm
+ movw %ax, %ds
+ movw %ax, %es # %es is the destination segment of the rep stosl below
+ movw %ax, %ss
+
+ /* 'Deactivate' long mode: disable paging */
+ movl %cr0, %eax
+ andl $~X86_CR0_PG, %eax
+ movl %eax, %cr0
+
+ /*
+ * Prepare identity maps for the first 2Mbytes. PAE is already
+ * enabled from the original pmode -> lmode transition.
+ *
+ * Reuse head.S page tables instead of creating new ones. Such
+ * early tables are in fact already reused by the newer direct
+ * mapping tables, but since paging is now disabled (and we're
+ * not returning back), hopefully nothing will blow up.
+ */
+
+ /*
+ * Pick a table for the PAE Page Directory (PD)
+ */
+
+ .equ level2_pae_ident_pgt, (level2_ident_pgt - __START_KERNEL_map) # NOTE(review): assumes link-time physical address - confirm
+ .equ level2_entry_count, 512
+ .equ level2_entry_len, 8
+
+ xorl %eax, %eax
+ movl $level2_pae_ident_pgt, %edi
+ movl $((level2_entry_count * level2_entry_len) / 4), %ecx # table size in dwords
+ rep stosl # zero the whole table; NOTE(review): relies on DF = 0 - confirm
+
+ movl $(0 + __PAGE_KERNEL_IDENT_LARGE_EXEC), level2_pae_ident_pgt # entry 0: physical base 0 plus flags
+
+ /*
+ * Pick a table for the PAE Page Directory Pointer (PDP)
+ */
+
+ .equ level3_pae_ident_pgt, (level2_spare_pgt - __START_KERNEL_map)
+ .equ level3_entry_count, 4
+ .equ level3_entry_len, 8
+
+ xorl %eax, %eax
+ movl $level3_pae_ident_pgt, %edi
+ movl $((level3_entry_count * level3_entry_len) / 4), %ecx # table size in dwords
+ rep stosl
+
+ movl $(level2_pae_ident_pgt + _PAGE_PRESENT), level3_pae_ident_pgt # entry 0 -> the PD above
+
+ movl $level3_pae_ident_pgt, %eax
+ movl %eax, %cr3
+
+ /* 'Disable' long mode: clear the EFER.LME bit */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btrl $_EFER_LME, %eax # reset (not toggle) LME, whatever its prior state
+ wrmsr
+
+ /* Finally, move to 32-bit pmode: re-enabling paging */
+ movl %cr0, %eax
+ orl $X86_CR0_PG, %eax
+ movl %eax, %cr0
+ jmp pmode32 # flush prefetch
+
+pmode32:
+ /*
+ * 32-bit protected mode, using a 2MB identity page.
+ */
+
+ /* Paging was only enabled for the lmode->pmode step */
+ movl %cr0, %eax
+ andl $~X86_CR0_PG, %eax
+ movl %eax, %cr0 # paging no more
+
+ xorl %eax, %eax
+ movl %eax, %cr3 # flush the TLB
+
+ /* Dynamically set the GDTR base value */
+ leal (pmode16_gdt - saveoops_start)(%ebp), %eax
+ movl %eax, (pmode16_gdt + 2 - saveoops_start)(%ebp) # base[00:31]
+
+ /* Dynamically set %cs and %ds bases */
+ leal (pmode16 - saveoops_start)(%ebp), %eax # both segments start at pmode16
+ movw %ax, (pmode16_cs + 2 - saveoops_start)(%ebp) # base[00:15]
+ movw %ax, (pmode16_ds + 2 - saveoops_start)(%ebp) # base[00:15]
+ shrl $16, %eax
+ movb %al, (pmode16_cs + 4 - saveoops_start)(%ebp) # base[16:23]
+ movb %al, (pmode16_ds + 4 - saveoops_start)(%ebp) # base[16:23]
+ # base[24:31] is never written and stays 0; this code runs below 1-Mbyte
+ /* Load the 16-bit code and data segments */
+ lgdt (pmode16_gdt - saveoops_start)(%ebp)
+
+ /* Switch to 16-bit pmode: use the setup 16-bit %cs */
+ ljmp $0x08, $0x0 # 0x08 = pmode16_cs - pmode16_gdt; offset 0 = pmode16 (the %cs base)
+
+ /*
+ * - "Segment base addresses should be 16-byte aligned" --Intel
+ * - We also use this as the rmode code base; the 16-byte align
+ * will make address calculations much easier.
+ */
+ .align 16
+ .globl pmode16
+ .code16
+pmode16:
+ /*
+ * We're now in the 16-bit protected mode. Since PE is still = 1,
+ * we can change a segment cache by loading a GDT selector value.
+ */
+
+ movw $0x10, %ax # 0x10 = pmode16_ds - pmode16_gdt
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+ movw %ax, %ss
+
+ /*
+ * NOTE! Due to the new %cs and %ds bases, dereference addresses
+ * using the form "label - pmode16" from now on.
+ */
+
+ /* Dynamically build an rmode segment and offset */
+ leal (pmode16 - saveoops_start)(%ebp), %eax # absolute value
+ shrl $4, %eax # linear address -> real-mode segment (pmode16 is 16-byte aligned)
+ movw %ax, rmode_farpointer - pmode16 + 2 # 8086 %cs
+ movw $(rmode - pmode16), rmode_farpointer - pmode16 # offset
+
+ /* Restore real-mode BIOS interrupt entries */
+ lidt (rmode_idtr - pmode16) # base 0x0, limit 0x3ff (see rmode_idtr)
+
+ /* Switch to canonical real-mode: clear PE */
+ movl %cr0, %eax
+ andl $~X86_CR0_PE, %eax
+ movl %eax, %cr0
+
+ /* Flush prefetch; use the 8086 code segment */
+ ljmp *(rmode_farpointer - pmode16)
+
+#ifdef SAVEOOPS_DEBUG
+ /* __print(msg) - valid for any real-mode context where a stack exists.
+ * Saves flags and registers, pushes the resume offset (label 1:), then calls
+ * print_string; the message bytes are assembled inline right after the call. */
+#define __print(msg) ;\
+ pushfl ;\
+ pushal ;\
+ pushw $(1f - pmode16) ;\
+ call print_string ;\
+ .ascii "Saveoops: " ;\
+ .ascii msg ;\
+ .asciz " \n\r" ;\
+1: popal ;\
+ popfl
+#else
+#define __print(msg) ;
+#endif
+
+ .align 16
+rmode:
+ /*
+ * REAL Mode, at last!
+ *
+ * For further details on the BIOS interrupts used, check any
+ * version of the "Enhanced Disk Drive Specification".
+ */
+
+ movw %cs, %ax # %cs was loaded from rmode_farpointer (pmode16 >> 4)
+ movw %ax, %ds # so data offsets keep the (label - pmode16) form
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+
+ /* Setup passed stack area */
+ movl (rstack_base - pmode16), %eax
+ shrl $4, %eax # 16byte-aligned
+ movw %ax, %ss
+ movw $RMODE_STACK_LEN, %sp # NOTE(review): RMODE_STACK_LEN is not defined here; presumably from asm/saveoops.h - confirm
+
+ __print ("Entered real mode")
+
+ /*
+ * XXXX: We always use the boot disk drive number '0x80'. Can
+ * this map to a wrong device?
+ *
+ * NOTE! Do not trust the BIOS: assume it clobbered all the
+ * registers (relevant and not) while servicing interrupts.
+ */
+
+ /*
+ * Check Extensions Present (0x41) - Does the BIOS provide
+ * EDD int 0x13 extensions?
+ *
+ * input %bx - 0x55aa
+ * input %dl - drive number
+ * output success - carry = 0 && bx = 0xaa55 && cx bit0 = 1
+ * output failure - carry = 1 || any false condition above
+ */
+ movb $0x41, %ah
+ movw $0x55aa, %bx
+ movb $0x80, %dl
+ xorw %cx, %cx
+ pushw %ds # keep %ds across the BIOS call
+ int $0x13
+ popw %ds
+ __print ("Queried BIOS for EDD services")
+ jc no_edd1
+ cmpw $0xaa55, %bx
+ jne no_edd2
+ shrw $1, %cx # move cx bit0 into carry
+ jnc no_edd3
+
+ /* Store 16byte-aligned ring buffer address in disk packet */
+ movl (ringbuf_addr - pmode16), %eax
+ shrl $4, %eax # linear address -> segment, offset stays 0
+ movw %ax, (buffer_seg - pmode16)
+ xorw %ax, %ax
+ movw %ax, (buffer_offset - pmode16)
+
+ /* Store ringbuf number of 512-byte blocks in disk packet */
+ movl (ringbuf_len - pmode16), %eax
+ movb %al, (sectors_cnt - pmode16) # NOTE(review): low byte of ringbuf_len stored as-is; presumably already a block count in [1 - 127], not bytes - confirm against caller
+
+ __print ("Prepared the Disk Address Packet")
+
+ /*
+ * Reset Hard Disks (0x00)
+ *
+ * input %dl - drive number
+ * output success - carry = 0 && %ah (err code) = 0
+ * output failure - carry = 1 || %ah = error code
+ *
+ * The kernel has just panicked and left the disk controller
+ * in an unknown state. Reset controllers before write.
+ */
+ xorw %ax, %ax # %ah = 0x00: reset function
+ movb $0x80, %dl
+ pushw %ds
+ int $0x13
+ popw %ds
+ __print ("Disk controller reset")
+ jc init_err1
+ cmpb $0x0, %ah
+ jne init_err2
+
+ /*
+ * Extended Write (0x43) - Transfer data from RAM to disk
+ *
+ * input %al - 0 (write with verify off)
+ * input %dl - drive number
+ * input %ds:si - pointer to the Disk Address Packet
+ * output success - carry = 0 && %ah (err code) = 0
+ * output failure - carry = 1 || %ah = error code
+ */
+ movb $0x43, %ah
+ xorb %al, %al
+ movb $0x80, %dl
+ movw $(disk_address_packet - pmode16), %si
+ pushw %ds
+ int $0x13
+ popw %ds
+ __print ("Extended write finished")
+ jc write_err1
+ cmpb $0x0, %ah
+ jne write_err2
+ jmp success
+
+init_err1: # each label below prints its message (debug builds only), then goes to print_errcode
+ __print ("INT 0x13/0x0 init error 1")
+ jmp print_errcode
+init_err2:
+ __print ("INT 0x13/0x0 init error 2")
+ jmp print_errcode
+write_err1:
+ __print ("INT 0x13/0x43 write error 1")
+ jmp print_errcode
+write_err2:
+ __print ("INT 0x13/0x43 write error 2")
+ jmp print_errcode
+no_edd1:
+ __print ("Bios does not support EDD service (err=1)")
+ jmp print_errcode
+no_edd2:
+ __print ("Bios does not support EDD service (err=2)")
+ jmp print_errcode
+no_edd3:
+ __print ("Bios does not support EDD service (err=3)")
+ jmp print_errcode
+success:
+ __print ("Sucess!!!")
+ jmp print_errcode # the success path also ends in print_errcode, then halt
+
+halt: hlt # final state: halt forever, re-halting after any wakeup
+ jmp halt
+
+#ifdef SAVEOOPS_DEBUG
+ /*
+ * Print Null-terminated string pointed by top of the stack
+ */
+ .type print_string, @function
+print_string:
+ popw %si # return address of the call = start of the inline string
+1: xorb %bh, %bh # display page 0
+ movb $0x0e, %ah # int 0x10 function 0x0e, reloaded for every character
+ lodsb # next character from %ds:%si
+ cmpb $0, %al
+ je 2f
+ int $0x10
+ jmp 1b
+2: ret # pops the resume offset that __print pushed before the call
+
+ /*
+ * print %dx value in hexadecimal ascii
+ */
+ .type print_hex, @function
+print_hex:
+ xorb %bh, %bh
+ movw $4, %cx # 2-bytes = 4 hex digits
+print_digit:
+ rolw $4, %dx # highest-order 4 bits in front
+ movw $0x0e0f, %ax # bios function 0x0e
+ andb %dl, %al # %al = current nibble (0x0f mask preloaded above)
+ cmpb $0x0a, %al # transform to ASCII
+ jb digit # unsigned compare: 0-9 need only the 0x30 bias
+ addb $0x07, %al # extra bias so that 10-15 map to A-F
+digit:
+ addb $0x30, %al
+ int $0x10
+ loop print_digit # NOTE(review): relies on the BIOS preserving %cx and %dx - confirm
+ ret
+
+ /*
+ * Print INT13 err code, number of sectors written
+ */
+print_errcode:
+ movzbw %ah, %dx # zero-extend: a stale %dh would print as two garbage digits
+ call print_hex
+ movw (sectors_cnt - pmode16), %dx # sectors_cnt byte plus the following reserved1 byte
+ call print_hex
+ jmp halt
+#else
+print_errcode: # non-debug build: nothing to print, just halt
+ jmp halt
+#endif
+
+
+/*
+ * Virtual data section; "(dyn.)" = A dynamically-set value
+ */
+
+ .align 16
+lmode32_gdt: # the first 16 bytes (GDTR image + padding) overlay GDT slots 0 and 1
+ .word lmode32_gdt_end - lmode32_gdt - 1
+ .quad 0x0000000000000000 # base (dyn.)
+ .word 0, 0, 0 # padding
+lmode32_cs: # at offset 16, i.e. selector 0x10
+ .word 0xffff # limit
+ .word 0x0000 # base
+ .word 0x9a00 # P=1, C=0, type=0xA (r/x)
+ .word 0x00cf # L=0 (compat.), D=1 (32-bit), G=1
+lmode32_ds: # at offset 24, i.e. selector 0x18
+ .word 0xffff # limit
+ .word 0x0000 # base
+ .word 0x9200 # P=1, type=0x2 (r/w)
+ .word 0x00cf # G=1, D=1 (32-bit)
+lmode32_gdt_end:
+
+lmode32_farpointer: # operand of the ljmpl in saveoops_start
+ .long 0x00000000 # offset (dyn.)
+ .word lmode32_cs -lmode32_gdt # %cs selector
+
+ .align 16
+pmode16_gdt: # the first 8 bytes (GDTR image + padding) overlay GDT slot 0
+ .word pmode16_gdt_end - pmode16_gdt - 1
+ .long 0x00000000 # base (dyn.)
+ .word 0x0000 # padding
+pmode16_cs: # at offset 8, i.e. selector 0x08
+ .word 0xffff # limit
+ .word 0x0000 # base (dyn.)
+ .word 0x9a00 # P=1, DPL=00, type=0xA (execute/read); low byte = base[16:23] (dyn.)
+ .word 0x0000 # G=0 (byte), D=0 (16-bit)
+pmode16_ds: # at offset 16, i.e. selector 0x10
+ .word 0xffff # limit
+ .word 0x0000 # base (dyn.)
+ .word 0x9200 # P=1, DPL=00, type=0x2 (read/write); low byte = base[16:23] (dyn.)
+ .word 0x0000 # G=0 (byte), D=0 (16-bit)
+pmode16_gdt_end:
+
+rmode_farpointer: # operand of the ljmp that leaves pmode16
+ .word 0x0000 # offset (dyn.)
+ .word 0x0000 # %cs (dyn.)
+
+rmode_idtr: # IDTR image loaded by lidt in pmode16
+ .equ RIDT_BASE, 0x0 # PC architecture defined
+ .equ RIDT_ENTRY_SIZE, 0x4 # 8086 defined
+ .equ RIDT_ENTRIES, 0x100 # 8086, 286, 386+ defined
+ .word RIDT_ENTRIES * RIDT_ENTRY_SIZE - 1
+ .long RIDT_BASE
+
+ /* Values passed by long-mode C code */
+ringbuf_addr:
+ .long 0x00000000 # 16-byte aligned, < 1-MB (dyn.)
+ringbuf_len:
+ .long 0x00000000 # 512-byte aligned (dyn.)
+rstack_base:
+ .long 0x00000000 # 16-byte aligned, < 1-MB (dyn.)
+
+ .align 16
+disk_address_packet: # for extended INT 0x13 services (dyn.)
+packet_size:
+ .byte 0x10 # in bytes
+reserved0:
+ .byte 0x00 # must be zero
+sectors_cnt:
+ .byte 0x00 # number of blocks to transfer [1 - 127]
+reserved1:
+ .byte 0x00 # must be zero
+buffer_offset:
+ .word 0x0000 # read/write buffer offset
+buffer_seg:
+ .word 0x0000 # read/write buffer segment
+disk_sector:
+ .quad 0x0000000000000000 # logical sector number (LBA)
+
+ENTRY(saveoops_end)
+
+/* PROTOTYPE - PROTOTYPE - PROTOTYPE - PROTOTYPE - PROTOTYPE - PROTOTYPE */

--
Darwish
http://darwish.07.googlepages.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/