Re: [RFC PATCH] KVM: x86: Allow Qemu/KVM to use PVH entry point

From: Andrew Cooper
Date: Tue Nov 28 2017 - 14:51:24 EST


On 28/11/17 19:34, Maran Wilson wrote:
> For certain applications it is desirable to rapidly boot a KVM virtual
> machine. In cases where legacy hardware and software support within the
> guest is not needed, Qemu should be able to boot directly into the
> uncompressed Linux kernel binary without the need to run firmware.
>
> There already exists an ABI to allow this for Xen PVH guests and the ABI is
> supported by Linux and FreeBSD:
>
> https://xenbits.xen.org/docs/unstable/misc/hvmlite.html

Just FYI, this link has recently become stale, following some cleanup.
The document is now:

https://xenbits.xen.org/docs/unstable/misc/pvh.html

~Andrew

>
> This PoC patch enables Qemu to use that same entry point for booting KVM
> guests.
>
> Even though the code is still PoC quality, I'm sending this as an RFC now
> since there are a number of different ways the specific implementation
> details can be handled. I chose a shared code path for Xen and KVM guests
> but could just as easily create a separate code path that is advertised by
> a different ELF note for KVM. There also seems to be some flexibility in
> how the e820 table data is passed and how (or if) it should be identified
> as e820 data. As a starting point, I've chosen the options that seem to
> result in the smallest patch with minimal to no changes required of the
> x86/HVM direct boot ABI.
> ---
> arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------
> 1 file changed, 55 insertions(+), 19 deletions(-)
>
> diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
> index 98ab176..d93f711 100644
> --- a/arch/x86/xen/enlighten_pvh.c
> +++ b/arch/x86/xen/enlighten_pvh.c
> @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void)
> acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
> }
>
> -static void __init init_pvh_bootparams(void)
> +static void __init init_pvh_bootparams(bool xen_guest)
> {
> struct xen_memory_map memmap;
> int rc;
>
> memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
>
> - memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
> - set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
> - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> - if (rc) {
> - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> - BUG();
> + if (xen_guest) {
> + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
> + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
> + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> + if (rc) {
> + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> + BUG();
> + }
> + pvh_bootparams.e820_entries = memmap.nr_entries;
> + } else if (pvh_start_info.nr_modules > 1) {
> + /* The second module should be the e820 data for KVM guests */
> + struct hvm_modlist_entry *modaddr;
> + char e820_sig[] = "e820 data";
> + struct boot_e820_entry *ep;
> + struct e820_table *tp;
> + char *cmdline_str;
> + int idx;
> +
> + modaddr = __va(pvh_start_info.modlist_paddr +
> + sizeof(struct hvm_modlist_entry));
> + cmdline_str = __va(modaddr->cmdline_paddr);
> +
> + if ((modaddr->cmdline_paddr) &&
> + (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) {
> + tp = __va(modaddr->paddr);
> + ep = (struct boot_e820_entry *)tp->entries;
> +
> + pvh_bootparams.e820_entries = tp->nr_entries;
> +
> + for (idx = 0; idx < tp->nr_entries ; idx++, ep++)
> + pvh_bootparams.e820_table[idx] = *ep;
> + }
> }
> - pvh_bootparams.e820_entries = memmap.nr_entries;
>
> if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
> pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr =
> @@ -55,8 +80,9 @@ static void __init init_pvh_bootparams(void)
> pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type =
> E820_TYPE_RESERVED;
> pvh_bootparams.e820_entries++;
> - } else
> + } else if (xen_guest) {
> xen_raw_printk("Warning: Can fit ISA range into e820\n");
> + }
>
> pvh_bootparams.hdr.cmd_line_ptr =
> pvh_start_info.cmdline_paddr;
> @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void)
> * environment (i.e. hardware_subarch 0).
> */
> pvh_bootparams.hdr.version = 0x212;
> - pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
> + pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0;
> }
>
> /*
> @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void)
> */
> void __init xen_prepare_pvh(void)
> {
> - u32 msr;
> +
> + u32 msr = xen_cpuid_base();
> u64 pfn;
> + bool xen_guest = msr ? true : false;
>
> if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
> - xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
> - pvh_start_info.magic);
> + if (xen_guest)
> + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
> + pvh_start_info.magic);
> BUG();
> }
>
> - xen_pvh = 1;
> + if (xen_guest) {
> + xen_pvh = 1;
> +
> + msr = cpuid_ebx(msr + 2);
> + pfn = __pa(hypercall_page);
> + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +
> + } else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) {
> + BUG();
> + }
>
> - msr = cpuid_ebx(xen_cpuid_base() + 2);
> - pfn = __pa(hypercall_page);
> - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> + init_pvh_bootparams(xen_guest);
>
> - init_pvh_bootparams();
> + if (xen_guest)
> + x86_init.oem.arch_setup = xen_pvh_arch_setup;
>
> - x86_init.oem.arch_setup = xen_pvh_arch_setup;
> }