Re: [PATCH RFC REPOST 1/2] paravirt: refactor struct paravirt_ops into smaller pv_*_ops

From: Rusty Russell
Date: Mon Oct 15 2007 - 04:17:14 EST


On Saturday 13 October 2007 05:16:04 Jeremy Fitzhardinge wrote:
> Are you otherwise happy with the patch in its current form? And are you
> happy with the lazymode changes?

No, and yes.

This applies on top of your 1/2. Boots here, runs lguest. It also
exports pv_info: lg.ko checks it to avoid lguest-in-lguest.

Cheers,
Rusty.

==
Wean off "struct paravirt_ops" entirely: it's now merely a patching enumeration tool

Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx>

diff -r d8192c55373b arch/i386/kernel/asm-offsets.c
--- a/arch/i386/kernel/asm-offsets.c Mon Oct 15 16:56:40 2007 +1000
+++ b/arch/i386/kernel/asm-offsets.c Mon Oct 15 18:10:52 2007 +1000
@@ -117,11 +117,13 @@ void foo(void)
#ifdef CONFIG_PARAVIRT
BLANK();
OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
- OFFSET(PARAVIRT_irq_disable, paravirt_ops, pv_irq_ops.irq_disable);
- OFFSET(PARAVIRT_irq_enable, paravirt_ops, pv_irq_ops.irq_enable);
- OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, pv_cpu_ops.irq_enable_sysexit);
- OFFSET(PARAVIRT_iret, paravirt_ops, pv_cpu_ops.iret);
- OFFSET(PARAVIRT_read_cr0, paravirt_ops, pv_cpu_ops.read_cr0);
+ OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
+ OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
+ OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
+ OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+ OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+ OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+ OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
#endif

#ifdef CONFIG_XEN
diff -r d8192c55373b arch/i386/kernel/paravirt.c
--- a/arch/i386/kernel/paravirt.c Mon Oct 15 16:56:40 2007 +1000
+++ b/arch/i386/kernel/paravirt.c Mon Oct 15 18:10:52 2007 +1000
@@ -34,8 +34,6 @@
#include <asm/tlbflush.h>
#include <asm/timer.h>

-struct paravirt_ops paravirt_ops;
-
/* nop stub */
void _paravirt_nop(void)
{
@@ -160,10 +158,26 @@ unsigned paravirt_patch_jmp(void *insnbu
return 5;
}

+/* Neat trick to map patch type back to the call within the
+ * corresponding structure. */
+static void *get_call_destination(u8 type)
+{
+ struct paravirt_patch_template tmpl = {
+ .pv_init_ops = pv_init_ops,
+ .pv_misc_ops = pv_misc_ops,
+ .pv_time_ops = pv_time_ops,
+ .pv_cpu_ops = pv_cpu_ops,
+ .pv_irq_ops = pv_irq_ops,
+ .pv_apic_ops = pv_apic_ops,
+ .pv_mmu_ops = pv_mmu_ops,
+ };
+ return *((void **)&tmpl + type);
+}
+
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
unsigned long addr, unsigned len)
{
- void *opfunc = *((void **)&paravirt_ops + type);
+ void *opfunc = get_call_destination(type);
unsigned ret;

if (opfunc == NULL)
@@ -275,160 +289,136 @@ struct pv_info pv_info = {
.shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
};

-struct paravirt_ops paravirt_ops = {
- .pv_init_ops = {
- .patch = native_patch,
- .banner = default_banner,
- .arch_setup = paravirt_nop,
- .memory_setup = machine_specific_memory_setup,
- },
-
- .pv_time_ops = {
- .time_init = hpet_time_init,
- .get_wallclock = native_get_wallclock,
- .set_wallclock = native_set_wallclock,
- .sched_clock = native_sched_clock,
- .get_cpu_khz = native_calculate_cpu_khz,
- },
-
- .pv_irq_ops = {
- .init_IRQ = native_init_IRQ,
- .save_fl = native_save_fl,
- .restore_fl = native_restore_fl,
- .irq_disable = native_irq_disable,
- .irq_enable = native_irq_enable,
- .safe_halt = native_safe_halt,
- .halt = native_halt,
- },
-
- .pv_cpu_ops = {
- .cpuid = native_cpuid,
- .get_debugreg = native_get_debugreg,
- .set_debugreg = native_set_debugreg,
- .clts = native_clts,
- .read_cr0 = native_read_cr0,
- .write_cr0 = native_write_cr0,
- .read_cr4 = native_read_cr4,
- .read_cr4_safe = native_read_cr4_safe,
- .write_cr4 = native_write_cr4,
- .wbinvd = native_wbinvd,
- .read_msr = native_read_msr_safe,
- .write_msr = native_write_msr_safe,
- .read_tsc = native_read_tsc,
- .read_pmc = native_read_pmc,
- .load_tr_desc = native_load_tr_desc,
- .set_ldt = native_set_ldt,
- .load_gdt = native_load_gdt,
- .load_idt = native_load_idt,
- .store_gdt = native_store_gdt,
- .store_idt = native_store_idt,
- .store_tr = native_store_tr,
- .load_tls = native_load_tls,
- .write_ldt_entry = write_dt_entry,
- .write_gdt_entry = write_dt_entry,
- .write_idt_entry = write_dt_entry,
- .load_esp0 = native_load_esp0,
-
- .irq_enable_sysexit = native_irq_enable_sysexit,
- .iret = native_iret,
-
- .set_iopl_mask = native_set_iopl_mask,
- .io_delay = native_io_delay,
- },
-
- .pv_apic_ops = {
+struct pv_init_ops pv_init_ops = {
+ .patch = native_patch,
+ .banner = default_banner,
+ .arch_setup = paravirt_nop,
+ .memory_setup = machine_specific_memory_setup,
+};
+
+struct pv_time_ops pv_time_ops = {
+ .time_init = hpet_time_init,
+ .get_wallclock = native_get_wallclock,
+ .set_wallclock = native_set_wallclock,
+ .sched_clock = native_sched_clock,
+ .get_cpu_khz = native_calculate_cpu_khz,
+};
+
+struct pv_irq_ops pv_irq_ops = {
+ .init_IRQ = native_init_IRQ,
+ .save_fl = native_save_fl,
+ .restore_fl = native_restore_fl,
+ .irq_disable = native_irq_disable,
+ .irq_enable = native_irq_enable,
+ .safe_halt = native_safe_halt,
+ .halt = native_halt,
+};
+
+struct pv_cpu_ops pv_cpu_ops = {
+ .cpuid = native_cpuid,
+ .get_debugreg = native_get_debugreg,
+ .set_debugreg = native_set_debugreg,
+ .clts = native_clts,
+ .read_cr0 = native_read_cr0,
+ .write_cr0 = native_write_cr0,
+ .read_cr4 = native_read_cr4,
+ .read_cr4_safe = native_read_cr4_safe,
+ .write_cr4 = native_write_cr4,
+ .wbinvd = native_wbinvd,
+ .read_msr = native_read_msr_safe,
+ .write_msr = native_write_msr_safe,
+ .read_tsc = native_read_tsc,
+ .read_pmc = native_read_pmc,
+ .load_tr_desc = native_load_tr_desc,
+ .set_ldt = native_set_ldt,
+ .load_gdt = native_load_gdt,
+ .load_idt = native_load_idt,
+ .store_gdt = native_store_gdt,
+ .store_idt = native_store_idt,
+ .store_tr = native_store_tr,
+ .load_tls = native_load_tls,
+ .write_ldt_entry = write_dt_entry,
+ .write_gdt_entry = write_dt_entry,
+ .write_idt_entry = write_dt_entry,
+ .load_esp0 = native_load_esp0,
+
+ .irq_enable_sysexit = native_irq_enable_sysexit,
+ .iret = native_iret,
+
+ .set_iopl_mask = native_set_iopl_mask,
+ .io_delay = native_io_delay,
+};
+
+struct pv_apic_ops pv_apic_ops = {
#ifdef CONFIG_X86_LOCAL_APIC
- .apic_write = native_apic_write,
- .apic_write_atomic = native_apic_write_atomic,
- .apic_read = native_apic_read,
- .setup_boot_clock = setup_boot_APIC_clock,
- .setup_secondary_clock = setup_secondary_APIC_clock,
- .startup_ipi_hook = paravirt_nop,
+ .apic_write = native_apic_write,
+ .apic_write_atomic = native_apic_write_atomic,
+ .apic_read = native_apic_read,
+ .setup_boot_clock = setup_boot_APIC_clock,
+ .setup_secondary_clock = setup_secondary_APIC_clock,
+ .startup_ipi_hook = paravirt_nop,
#endif
- },
-
- .pv_misc_ops = {
- .set_lazy_mode = paravirt_nop,
- },
-
- .pv_mmu_ops = {
- .pagetable_setup_start = native_pagetable_setup_start,
- .pagetable_setup_done = native_pagetable_setup_done,
-
- .read_cr2 = native_read_cr2,
- .write_cr2 = native_write_cr2,
- .read_cr3 = native_read_cr3,
- .write_cr3 = native_write_cr3,
-
- .flush_tlb_user = native_flush_tlb,
- .flush_tlb_kernel = native_flush_tlb_global,
- .flush_tlb_single = native_flush_tlb_single,
- .flush_tlb_others = native_flush_tlb_others,
-
- .alloc_pt = paravirt_nop,
- .alloc_pd = paravirt_nop,
- .alloc_pd_clone = paravirt_nop,
- .release_pt = paravirt_nop,
- .release_pd = paravirt_nop,
-
- .set_pte = native_set_pte,
- .set_pte_at = native_set_pte_at,
- .set_pmd = native_set_pmd,
- .pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
+};
+
+struct pv_misc_ops pv_misc_ops = {
+ .set_lazy_mode = paravirt_nop,
+};
+
+struct pv_mmu_ops pv_mmu_ops = {
+ .pagetable_setup_start = native_pagetable_setup_start,
+ .pagetable_setup_done = native_pagetable_setup_done,
+
+ .read_cr2 = native_read_cr2,
+ .write_cr2 = native_write_cr2,
+ .read_cr3 = native_read_cr3,
+ .write_cr3 = native_write_cr3,
+
+ .flush_tlb_user = native_flush_tlb,
+ .flush_tlb_kernel = native_flush_tlb_global,
+ .flush_tlb_single = native_flush_tlb_single,
+ .flush_tlb_others = native_flush_tlb_others,
+
+ .alloc_pt = paravirt_nop,
+ .alloc_pd = paravirt_nop,
+ .alloc_pd_clone = paravirt_nop,
+ .release_pt = paravirt_nop,
+ .release_pd = paravirt_nop,
+
+ .set_pte = native_set_pte,
+ .set_pte_at = native_set_pte_at,
+ .set_pmd = native_set_pmd,
+ .pte_update = paravirt_nop,
+ .pte_update_defer = paravirt_nop,

#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = kmap_atomic,
+ .kmap_atomic_pte = kmap_atomic,
#endif

#ifdef CONFIG_X86_PAE
- .set_pte_atomic = native_set_pte_atomic,
- .set_pte_present = native_set_pte_present,
- .set_pud = native_set_pud,
- .pte_clear = native_pte_clear,
- .pmd_clear = native_pmd_clear,
-
- .pmd_val = native_pmd_val,
- .make_pmd = native_make_pmd,
+ .set_pte_atomic = native_set_pte_atomic,
+ .set_pte_present = native_set_pte_present,
+ .set_pud = native_set_pud,
+ .pte_clear = native_pte_clear,
+ .pmd_clear = native_pmd_clear,
+
+ .pmd_val = native_pmd_val,
+ .make_pmd = native_make_pmd,
#endif

- .pte_val = native_pte_val,
- .pgd_val = native_pgd_val,
-
- .make_pte = native_make_pte,
- .make_pgd = native_make_pgd,
-
- .dup_mmap = paravirt_nop,
- .exit_mmap = paravirt_nop,
- .activate_mm = paravirt_nop,
- },
-};
-
-static void __init __used pv_aliases(void)
-{
- /*
- * These asm statements need to be wrapped in a function just
- * so we can pass args to them; they are completely constant,
- * and this function is never executed.
- */
-#define substructure(inner) \
- asm volatile(".data; .globl " #inner "; " \
- #inner " = paravirt_ops+%c0; .previous" \
- : : "i" (offsetof(struct paravirt_ops, inner)))
-
- substructure(pv_init_ops);
- substructure(pv_misc_ops);
- substructure(pv_time_ops);
- substructure(pv_cpu_ops);
- substructure(pv_irq_ops);
- substructure(pv_apic_ops);
- substructure(pv_mmu_ops);
-
-#undef substructure
-}
+ .pte_val = native_pte_val,
+ .pgd_val = native_pgd_val,
+
+ .make_pte = native_make_pte,
+ .make_pgd = native_make_pgd,
+
+ .dup_mmap = paravirt_nop,
+ .exit_mmap = paravirt_nop,
+ .activate_mm = paravirt_nop,
+};

EXPORT_SYMBOL_GPL(pv_time_ops);
EXPORT_SYMBOL_GPL(pv_cpu_ops);
EXPORT_SYMBOL_GPL(pv_mmu_ops);
EXPORT_SYMBOL_GPL(pv_apic_ops);
+EXPORT_SYMBOL_GPL(pv_info);
EXPORT_SYMBOL (pv_irq_ops);
diff -r d8192c55373b drivers/char/hvc_lguest.c
--- a/drivers/char/hvc_lguest.c Mon Oct 15 16:56:40 2007 +1000
+++ b/drivers/char/hvc_lguest.c Mon Oct 15 18:10:52 2007 +1000
@@ -115,7 +115,7 @@ static struct hv_ops lguest_cons = {
* (0), and the struct hv_ops containing the put_chars() function. */
static int __init cons_init(void)
{
- if (strcmp(paravirt_ops.name, "lguest") != 0)
+ if (strcmp(pv_info.name, "lguest") != 0)
return 0;

return hvc_instantiate(0, 0, &lguest_cons);
diff -r d8192c55373b drivers/lguest/core.c
--- a/drivers/lguest/core.c Mon Oct 15 16:56:40 2007 +1000
+++ b/drivers/lguest/core.c Mon Oct 15 18:10:52 2007 +1000
@@ -248,8 +248,8 @@ static void unmap_switcher(void)
}

/*H:130 Our Guest is usually so well behaved; it never tries to do things it
- * isn't allowed to. Unfortunately, "struct paravirt_ops" isn't quite
- * complete, because it doesn't contain replacements for the Intel I/O
+ * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't
+ * quite complete, because it doesn't contain replacements for the Intel I/O
* instructions. As a result, the Guest sometimes fumbles across one during
* the boot process as it probes for various things which are usually attached
* to a PC.
diff -r d8192c55373b drivers/lguest/lguest.c
--- a/drivers/lguest/lguest.c Mon Oct 15 16:56:40 2007 +1000
+++ b/drivers/lguest/lguest.c Mon Oct 15 18:10:52 2007 +1000
@@ -23,7 +23,7 @@
*
* So how does the kernel know it's a Guest? The Guest starts at a special
* entry point marked with a magic string, which sets up a few things then
- * calls here. We replace the native functions in "struct paravirt_ops"
+ * calls here. We replace the native functions various "paravirt" structures
* with our Guest versions, then boot like normal. :*/

/*
@@ -331,7 +331,7 @@ static void lguest_load_tls(struct threa
}

/*G:038 That's enough excitement for now, back to ploughing through each of
- * the paravirt_ops (we're about 1/3 of the way through).
+ * the different pv_ops structures (we're about 1/3 of the way through).
*
* This is the Local Descriptor Table, another weird Intel thingy. Linux only
* uses this for some strange applications like Wine. We don't do anything
@@ -558,7 +558,7 @@ static void lguest_set_pte(pte_t *ptep,
lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
}

-/* Unfortunately for Lguest, the paravirt_ops for page tables were based on
+/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
* native page table operations. On native hardware you can set a new page
* table entry whenever you want, but if you want to remove one you have to do
* a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -902,7 +902,7 @@ static __init char *lguest_memory_setup(
/*G:050
* Patching (Powerfully Placating Performance Pedants)
*
- * We have already seen that "struct paravirt_ops" lets us replace simple
+ * We have already seen that pv_ops structures let us replace simple
* native instructions with calls to the appropriate back end all throughout
* the kernel. This allows the same kernel to run as a Guest and as a native
* kernel, but it's slow because of all the indirect branches.
@@ -957,9 +957,9 @@ static unsigned lguest_patch(u8 type, u1
return insn_len;
}

-/*G:030 Once we get to lguest_init(), we know we're a Guest. The paravirt_ops
- * structure in the kernel provides a single point for (almost) every routine
- * we have to override to avoid privileged instructions. */
+/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
+ * structures in the kernel provide points for (almost) every routine we have
+ * to override to avoid privileged instructions. */
__init void lguest_init(void *boot)
{
/* Copy boot parameters first: the Launcher put the physical location
diff -r d8192c55373b include/asm-i386/paravirt.h
--- a/include/asm-i386/paravirt.h Mon Oct 15 16:56:40 2007 +1000
+++ b/include/asm-i386/paravirt.h Mon Oct 15 18:10:52 2007 +1000
@@ -246,7 +246,10 @@ struct pv_mmu_ops {
#endif
};

-struct paravirt_ops
+/* This contains all the paravirt structures: we get a convenient
+ * number for each function using the offset which we use to indicate
+ * what to patch. */
+struct paravirt_patch_template
{
struct pv_init_ops pv_init_ops;
struct pv_misc_ops pv_misc_ops;
@@ -267,7 +270,7 @@ extern struct pv_mmu_ops pv_mmu_ops;
extern struct pv_mmu_ops pv_mmu_ops;

#define PARAVIRT_PATCH(x) \
- (offsetof(struct paravirt_ops, x) / sizeof(void *))
+ (offsetof(struct paravirt_patch_template, x) / sizeof(void *))

#define paravirt_type(op) \
[paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
@@ -311,14 +314,14 @@ int paravirt_disable_iospace(void);
/*
* This generates an indirect call based on the operation type number.
* The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_ops structure, and can therefore be freely
- * converted back into a structure offset.
+ * offset into the paravirt_patch_template structure, and can therefore be
+ * freely converted back into a structure offset.
*/
#define PARAVIRT_CALL "call *%[paravirt_opptr];"

/*
- * These macros are intended to wrap calls into a paravirt_ops
- * operation, so that they can be later identified and patched at
+ * These macros are intended to wrap calls through one of the paravirt
+ * ops structs, so that they can be later identified and patched at
* runtime.
*
* Normally, a call to a pv_op function is a simple indirect call:
@@ -341,7 +344,7 @@ int paravirt_disable_iospace(void);
* The call instruction itself is marked by placing its start address
* and size into the .parainstructions section, so that
* apply_paravirt() in arch/i386/kernel/alternative.c can do the
- * appropriate patching under the control of the backend paravirt_ops
+ * appropriate patching under the control of the backend pv_init_ops
* implementation.
*
* Unfortunately there's no way to get gcc to generate the args setup
@@ -1092,7 +1095,7 @@ static inline unsigned long __raw_local_

#else /* __ASSEMBLY__ */

-#define PARA_PATCH(off) ((off) / 4)
+#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)

#define PARA_SITE(ptype, clobbers, ops) \
771:; \
@@ -1105,29 +1108,29 @@ 772:; \
.short clobbers; \
.popsection

-#define INTERRUPT_RETURN \
- PARA_SITE(PARA_PATCH(PARAVIRT_iret), CLBR_NONE, \
- jmp *%cs:paravirt_ops+PARAVIRT_iret)
+#define INTERRUPT_RETURN \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
+ jmp *%cs:pv_cpu_ops+PV_CPU_iret)

#define DISABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(PARAVIRT_irq_disable), clobbers, \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
pushl %eax; pushl %ecx; pushl %edx; \
- call *%cs:paravirt_ops+PARAVIRT_irq_disable; \
+ call *%cs:pv_irq_ops+PV_IRQ_irq_disable; \
popl %edx; popl %ecx; popl %eax) \

#define ENABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable), clobbers, \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
pushl %eax; pushl %ecx; pushl %edx; \
- call *%cs:paravirt_ops+PARAVIRT_irq_enable; \
+ call *%cs:pv_irq_ops+PV_IRQ_irq_enable; \
popl %edx; popl %ecx; popl %eax)

-#define ENABLE_INTERRUPTS_SYSEXIT \
- PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable_sysexit), CLBR_NONE, \
- jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit)
+#define ENABLE_INTERRUPTS_SYSEXIT \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), CLBR_NONE,\
+ jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_sysexit)

#define GET_CR0_INTO_EAX \
push %ecx; push %edx; \
- call *paravirt_ops+PARAVIRT_read_cr0; \
+ call *pv_cpu_ops+PV_CPU_read_cr0; \
pop %edx; pop %ecx

#endif /* __ASSEMBLY__ */
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/