Re: [PATCH v1 04/12] xen/hvmlite: Bootstrap HVMlite guest
From: Luis R. Rodriguez
Date: Fri Jan 22 2016 - 18:33:17 EST
On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote:
> Start HVMlite guest XEN_ELFNOTE_PHYS32_ENTRY address. Setup hypercall
> page, initialize boot_params, enable early page tables.
>
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
> ---
> arch/x86/xen/Makefile | 1 +
> arch/x86/xen/enlighten.c | 91 +++++++++++++++++++++++++-
> arch/x86/xen/xen-hvmlite.S | 158 ++++++++++++++++++++++++++++++++++++++++++++
> include/xen/xen.h | 6 ++
> 4 files changed, 255 insertions(+), 1 deletions(-)
> create mode 100644 arch/x86/xen/xen-hvmlite.S
>
> diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
> index e47e527..1d913d7 100644
> --- a/arch/x86/xen/Makefile
> +++ b/arch/x86/xen/Makefile
> @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
> obj-$(CONFIG_XEN_DOM0) += vga.o
> obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
> obj-$(CONFIG_XEN_EFI) += efi.o
> +obj-$(CONFIG_XEN_PVHVM) += xen-hvmlite.o
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index 2cf446a..2ed8b2b 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -118,7 +118,8 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
> */
> DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
>
> -enum xen_domain_type xen_domain_type = XEN_NATIVE;
> +enum xen_domain_type xen_domain_type
> + __attribute__((section(".data"))) = XEN_NATIVE;
> EXPORT_SYMBOL_GPL(xen_domain_type);
But why? This is not explained.
>
> unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
> @@ -171,6 +172,17 @@ struct tls_descs {
> */
> static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
>
> +#ifdef CONFIG_XEN_PVHVM
> +/*
> + * HVMlite variables. These need to live in data segment since they are
> + * initialized before startup_{32|64}, which clear .bss, are invoked.
> + */
I guess it is for this reason. Perhaps this would be a bit clearer:
/*
* HVMlite variables. These need to live in data segment since they are
* initialized before startup_{32|64} is invoked, otherwise they would be cleared.
*/
> +int xen_hvmlite __attribute__((section(".data"))) = 0;
> +struct hvm_start_info hvmlite_start_info __attribute__((section(".data")));
> +uint hvmlite_start_info_sz = sizeof(hvmlite_start_info);
> +struct boot_params xen_hvmlite_boot_params __attribute__((section(".data")));
> +#endif
> +
> static void clamp_max_cpus(void)
> {
> #ifdef CONFIG_SMP
> @@ -1736,6 +1748,83 @@ asmlinkage __visible void __init xen_start_kernel(void)
> #endif
> }
>
> +#ifdef CONFIG_XEN_PVHVM
> +static void __init hvmlite_bootparams(void)
> +{
Hrm.
> + struct xen_memory_map memmap;
> + int i;
> +
> + memset(&xen_hvmlite_boot_params, 0, sizeof(xen_hvmlite_boot_params));
> +
> + memmap.nr_entries = ARRAY_SIZE(xen_hvmlite_boot_params.e820_map);
> + set_xen_guest_handle(memmap.buffer, xen_hvmlite_boot_params.e820_map);
> + if (HYPERVISOR_memory_op(XENMEM_memory_map, &memmap)) {
> + xen_raw_console_write("XENMEM_memory_map failed\n");
> + BUG();
> + }
> +
> + xen_hvmlite_boot_params.e820_map[memmap.nr_entries].addr =
> + ISA_START_ADDRESS;
> + xen_hvmlite_boot_params.e820_map[memmap.nr_entries].size =
> + ISA_END_ADDRESS - ISA_START_ADDRESS;
> + xen_hvmlite_boot_params.e820_map[memmap.nr_entries++].type =
> + E820_RESERVED;
> +
> + sanitize_e820_map(xen_hvmlite_boot_params.e820_map,
> + ARRAY_SIZE(xen_hvmlite_boot_params.e820_map),
> + &memmap.nr_entries);
> +
> + xen_hvmlite_boot_params.e820_entries = memmap.nr_entries;
> + for (i = 0; i < xen_hvmlite_boot_params.e820_entries; i++)
> + e820_add_region(xen_hvmlite_boot_params.e820_map[i].addr,
> + xen_hvmlite_boot_params.e820_map[i].size,
> + xen_hvmlite_boot_params.e820_map[i].type);
> +
> + xen_hvmlite_boot_params.hdr.cmd_line_ptr =
> + hvmlite_start_info.cmdline_paddr;
> +
> + /* The first module is always ramdisk */
> + if (hvmlite_start_info.nr_modules) {
> + struct hvm_modlist_entry *modaddr =
> + __va(hvmlite_start_info.modlist_paddr);
> + xen_hvmlite_boot_params.hdr.ramdisk_image = modaddr->paddr;
> + xen_hvmlite_boot_params.hdr.ramdisk_size = modaddr->size;
> + }
> +
> + /*
> + * See Documentation/x86/boot.txt.
> + *
> + * Version 2.12 supports Xen entry point but we will use default x86/PC
> + * environment (i.e. hardware_subarch 0).
> + */
> + xen_hvmlite_boot_params.hdr.version = 0x212;
> + xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */
> +}
I realize PV got away with setting up boot_params in C code, but it is best
to ask now that this new code is being introduced: why can't we just have
the Xen hypervisor fill this in? It'd save us all this code.
The following page shows the difference, contrasting what this looks like
with how lguest sets things up, as an example of a very clean way of doing
it:
http://www.do-not-panic.com/2015/12/xen-and-x86-linux-zero-page.html
> +/*
> + * This routine (and those that it might call) should not use
> + * anything that lives in .bss since that segment will be cleared later
> + */
> +void __init xen_prepare_hvmlite(void)
> +{
> + u32 eax, ecx, edx, msr;
> + u64 pfn;
> +
> + cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx);
> + pfn = __pa(hypercall_page);
> + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +
> + pv_info.name = "Xen HVMlite";
> + xen_domain_type = XEN_HVM_DOMAIN;
> + xen_hvmlite = 1;
> +
> + x86_init.oem.arch_setup = xen_init_kernel;
> + x86_init.oem.banner = xen_banner;
> +
> + hvmlite_bootparams();
> +}
> +#endif
If boot_params.hdr.hardware_subarch_data pointed to a custom
struct, then the first C entry point for Xen could shuffle this and
set this too, while still using fewer asm entry helpers. We'd still
need this to run, but with the linker table I think we could have
a small stub for the hvm run; it would not be a call from asm.
> +
> void __ref xen_hvm_init_shared_info(void)
> {
> int cpu;
> diff --git a/arch/x86/xen/xen-hvmlite.S b/arch/x86/xen/xen-hvmlite.S
> new file mode 100644
> index 0000000..90f03d0
> --- /dev/null
> +++ b/arch/x86/xen/xen-hvmlite.S
> @@ -0,0 +1,158 @@
> +/*
> + * Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> + .code32
> + .text
> +#define _pa(x) ((x) - __START_KERNEL_map)
> +
> +#include <linux/elfnote.h>
> +#include <linux/init.h>
> +#include <linux/linkage.h>
> +#include <asm/segment.h>
> +#include <asm/asm.h>
> +#include <asm/boot.h>
> +#include <asm/processor-flags.h>
> +#include <asm/msr.h>
> +#include <xen/interface/elfnote.h>
> +
> + __HEAD
> + .code32
> +
> +/* Entry point for HVMlite guests */
> +ENTRY(hvmlite_start_xen)
Yet another asm entry point for Linux. Would it perhaps be possible to
share some of this asm entry code in a clean way?
> + cli
> + cld
> +
> + mov $_pa(gdt), %eax
> + lgdt (%eax)
> +
> + movl $(__BOOT_DS),%eax
> + movl %eax,%ds
> + movl %eax,%es
> + movl %eax,%ss
> +
> + /* Stash hvm_start_info */
> + mov $_pa(hvmlite_start_info), %edi
> + mov %ebx, %esi
> + mov $_pa(hvmlite_start_info_sz), %ecx
> + mov (%ecx), %ecx
> + rep
> + movsb
> +
> + movl $_pa(early_stack_end), %eax
> + movl %eax, %esp
> +
> + /* Enable PAE mode */
> + movl %cr4, %eax
> + orl $X86_CR4_PAE, %eax
> + movl %eax, %cr4
> +
> +#ifdef CONFIG_X86_64
> + /* Enable Long mode */
> + movl $MSR_EFER, %ecx
> + rdmsr
> + btsl $_EFER_LME, %eax
> + wrmsr
> +
> + /* Enable pre-constructed page tables */
> + mov $_pa(init_level4_pgt), %eax
> + movl %eax, %cr3
> + movl $(X86_CR0_PG | X86_CR0_PE), %eax
> + movl %eax, %cr0
> +
> + /* Jump to 64-bit mode. */
> + pushl $__KERNEL_CS
> + leal _pa(1f), %eax
> + pushl %eax
> + lret
> +
> + /* 64-bit entry point */
> + .code64
> +1:
> + call xen_prepare_hvmlite
> +
> + /* startup_64 expects boot_params in %rsi */
> + mov $_pa(xen_hvmlite_boot_params), %rsi
> + movq $_pa(startup_64), %rax
Nice! But again why can't the Xen hypervisor just set the boot_params?
All other Linux loaders do it. Why is Xen special?
Luis
> + jmp *%rax
> +
> +#else /* CONFIG_X86_64 */
> +
> + /* Use initial_page table and set level 2 to map 2M pages */
> + movl $_pa(initial_pg_pmd), %edi
> + movl $(_PAGE_PSE | _PAGE_RW | _PAGE_PRESENT), %eax
> + movl $2048, %ecx
> +2:
> + movl %eax, 0(%edi)
> + addl $0x00200000, %eax
> + addl $8, %edi
> + decl %ecx
> + jnz 2b
> +
> + /* Enable the boot paging */
> + movl $_pa(initial_page_table), %eax
> + movl %eax, %cr3
> + movl %cr0, %eax
> + orl $(X86_CR0_PG | X86_CR0_PE), %eax
> + movl %eax, %cr0
> +
> + ljmp $__BOOT_CS,$3f
> +3:
> + call xen_prepare_hvmlite
> + mov $_pa(xen_hvmlite_boot_params), %esi
> +
> + /* startup_32 doesn't expect paging and PAE to be on */
> + ljmp $__BOOT_CS,$_pa(4f)
> +4:
> + movl %cr0, %eax
> + andl $~X86_CR0_PG, %eax
> + movl %eax, %cr0
> + movl %cr4, %eax
> + andl $~X86_CR4_PAE, %eax
> + movl %eax, %cr4
> +
> + /* Restore initial_pg_pmd to its original (zero) state */
> + movl $_pa(initial_pg_pmd), %edi
> + xorl %eax, %eax
> + movl $(PAGE_SIZE/4), %ecx
> + rep stosl
> +
> + ljmp $0x10, $_pa(startup_32)
> +#endif
> +
> + .data
> +gdt:
> + .word gdt_end - gdt
> + .long _pa(gdt)
> + .word 0
> + .quad 0x0000000000000000 /* NULL descriptor */
> +#ifdef CONFIG_X86_64
> + .quad 0x00af9a000000ffff /* __KERNEL_CS */
> +#else
> + .quad 0x00cf9a000000ffff /* __KERNEL_CS */
> +#endif
> + .quad 0x00cf92000000ffff /* __KERNEL_DS */
> +gdt_end:
> +
> + .bss
> + .balign 4
> +early_stack:
> + .fill 16, 1, 0
> +early_stack_end:
> +
> + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
> + _ASM_PTR (hvmlite_start_xen - __START_KERNEL_map))
> diff --git a/include/xen/xen.h b/include/xen/xen.h
> index 0c0e3ef..6a0d3f3 100644
> --- a/include/xen/xen.h
> +++ b/include/xen/xen.h
> @@ -29,6 +29,12 @@ extern enum xen_domain_type xen_domain_type;
> #define xen_initial_domain() (0)
> #endif /* CONFIG_XEN_DOM0 */
>
> +#ifdef CONFIG_XEN_PVHVM
> +extern int xen_hvmlite;
> +#else
> +#define xen_hvmlite (0)
> +#endif
> +
> #ifdef CONFIG_XEN_PVH
> /* This functionality exists only for x86. The XEN_PVHVM support exists
> * only in x86 world - hence on ARM it will be always disabled.
> --
> 1.7.1
>
>
--
Luis Rodriguez, SUSE LINUX GmbH
Maxfeldstrasse 5; D-90409 Nuernberg