Re: [PATCH] x86: temporarily remove all attempts to provide setup_data
From: Michael S. Tsirkin
Date: Wed Feb 08 2023 - 13:14:35 EST
On Wed, Feb 08, 2023 at 03:08:35PM -0300, Jason A. Donenfeld wrote:
> All attempts at providing setup_data have been made as an iteration on
> whatever was there before, stretching back to the original
> implementation used for DTBs that [mis]used the kernel image itself.
> We've now had a dozen rounds of bugs and hacks, and the result is
> turning into a pile of unmaintainable and increasingly brittle hacks.
>
> Let's just rip out all the madness and start over. We can re-architect
> this based on having a separate standalone setup_data file, which is how
> it should have been done in the first place. This is a larger project
> with a few things to coordinate, but we can't really begin thinking
> about that while trying to play whack-a-mole with the current buggy
> implementation.
>
> So this commit removes the setup_data setting from x86_load_linux(),
> while leaving intact the infrastructure we'll need in the future to try
> again.
>
> Cc: Michael S. Tsirkin <mst@xxxxxxxxxx>
> Cc: Dov Murik <dovmurik@xxxxxxxxxxxxx>
> Cc: Tom Lendacky <thomas.lendacky@xxxxxxx>
> Cc: Gerd Hoffmann <kraxel@xxxxxxxxxx>
> Cc: Daniel P. Berrangé <berrange@xxxxxxxxxx>
> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Cc: Richard Henderson <richard.henderson@xxxxxxxxxx>
> Cc: H. Peter Anvin <hpa@xxxxxxxxx>
> Cc: Philippe Mathieu-Daudé <philmd@xxxxxxxxxx>
> Cc: Nathan Chancellor <nathan@xxxxxxxxxx>
> Cc: Borislav Petkov <bp@xxxxxxxxx>
> Cc: Eric Biggers <ebiggers@xxxxxxxxxx>
> Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx>
I think I'll be happier if this is just a revert of
the relevant commits in reverse order to make life easier
for backporters.
Unless that's too much work as we made other changes around
this code?
Failing that list the affected commits so people at least
know what to revert?
> ---
> hw/i386/microvm.c | 15 ++----
> hw/i386/x86.c | 120 +++++-----------------------------------------
> 2 files changed, 17 insertions(+), 118 deletions(-)
>
> diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
> index 29f30dd6d3..170a331e3f 100644
> --- a/hw/i386/microvm.c
> +++ b/hw/i386/microvm.c
> @@ -378,8 +378,7 @@ static void microvm_fix_kernel_cmdline(MachineState *machine)
> MicrovmMachineState *mms = MICROVM_MACHINE(machine);
> BusState *bus;
> BusChild *kid;
> - char *cmdline, *existing_cmdline;
> - size_t len;
> + char *cmdline;
>
> /*
> * Find MMIO transports with attached devices, and add them to the kernel
> @@ -388,8 +387,7 @@ static void microvm_fix_kernel_cmdline(MachineState *machine)
> * Yes, this is a hack, but one that heavily improves the UX without
> * introducing any significant issues.
> */
> - existing_cmdline = fw_cfg_read_bytes_ptr(x86ms->fw_cfg, FW_CFG_CMDLINE_DATA);
> - cmdline = g_strdup(existing_cmdline);
> + cmdline = g_strdup(machine->kernel_cmdline);
> bus = sysbus_get_default();
> QTAILQ_FOREACH(kid, &bus->children, sibling) {
> DeviceState *dev = kid->child;
> @@ -413,12 +411,9 @@ static void microvm_fix_kernel_cmdline(MachineState *machine)
> }
> }
>
> - len = strlen(cmdline);
> - if (len > VIRTIO_CMDLINE_TOTAL_MAX_LEN + strlen(existing_cmdline)) {
> - fprintf(stderr, "qemu: virtio mmio cmdline too large, skipping\n");
> - } else {
> - memcpy(existing_cmdline, cmdline, len + 1);
> - }
> + fw_cfg_modify_i32(x86ms->fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(cmdline) + 1);
> + fw_cfg_modify_string(x86ms->fw_cfg, FW_CFG_CMDLINE_DATA, cmdline);
> +
> g_free(cmdline);
> }
>
> diff --git a/hw/i386/x86.c b/hw/i386/x86.c
> index eaff4227bd..6cfdca9acd 100644
> --- a/hw/i386/x86.c
> +++ b/hw/i386/x86.c
> @@ -50,7 +50,6 @@
> #include "hw/intc/i8259.h"
> #include "hw/rtc/mc146818rtc.h"
> #include "target/i386/sev.h"
> -#include "hw/i386/microvm.h"
>
> #include "hw/acpi/cpu_hotplug.h"
> #include "hw/irq.h"
> @@ -770,30 +769,6 @@ static bool load_elfboot(const char *kernel_filename,
> return true;
> }
>
> -typedef struct SetupDataFixup {
> - void *pos;
> - hwaddr orig_val, new_val;
> - uint32_t addr;
> -} SetupDataFixup;
> -
> -static void fixup_setup_data(void *opaque)
> -{
> - SetupDataFixup *fixup = opaque;
> - stq_p(fixup->pos, fixup->new_val);
> -}
> -
> -static void reset_setup_data(void *opaque)
> -{
> - SetupDataFixup *fixup = opaque;
> - stq_p(fixup->pos, fixup->orig_val);
> -}
> -
> -static void reset_rng_seed(void *opaque)
> -{
> - SetupData *setup_data = opaque;
> - qemu_guest_getrandom_nofail(setup_data->data, le32_to_cpu(setup_data->len));
> -}
> -
> void x86_load_linux(X86MachineState *x86ms,
> FWCfgState *fw_cfg,
> int acpi_data_size,
> @@ -803,29 +778,20 @@ void x86_load_linux(X86MachineState *x86ms,
> bool linuxboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled;
> uint16_t protocol;
> int setup_size, kernel_size, cmdline_size;
> - int dtb_size, setup_data_offset;
> uint32_t initrd_max;
> uint8_t header[8192], *setup, *kernel;
> - hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0, first_setup_data = 0;
> + hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0;
> FILE *f;
> char *vmode;
> MachineState *machine = MACHINE(x86ms);
> - SetupData *setup_data;
> const char *kernel_filename = machine->kernel_filename;
> const char *initrd_filename = machine->initrd_filename;
> - const char *dtb_filename = machine->dtb;
> - char *kernel_cmdline;
> + const char *kernel_cmdline = machine->kernel_cmdline;
> SevKernelLoaderContext sev_load_ctx = {};
> enum { RNG_SEED_LENGTH = 32 };
>
> - /*
> - * Add the NUL terminator, some padding for the microvm cmdline fiddling
> - * hack, and then align to 16 bytes as a paranoia measure
> - */
> - cmdline_size = (strlen(machine->kernel_cmdline) + 1 +
> - VIRTIO_CMDLINE_TOTAL_MAX_LEN + 16) & ~15;
> - /* Make a copy, since we might append arbitrary bytes to it later. */
> - kernel_cmdline = g_strndup(machine->kernel_cmdline, cmdline_size);
> + /* Align to 16 bytes as a paranoia measure */
> + cmdline_size = (strlen(kernel_cmdline) + 16) & ~15;
>
> /* load the kernel header */
> f = fopen(kernel_filename, "rb");
> @@ -966,6 +932,12 @@ void x86_load_linux(X86MachineState *x86ms,
> initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
> }
>
> + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr);
> + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1);
> + fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
> + sev_load_ctx.cmdline_data = (char *)kernel_cmdline;
> + sev_load_ctx.cmdline_size = strlen(kernel_cmdline) + 1;
> +
> if (protocol >= 0x202) {
> stl_p(header + 0x228, cmdline_addr);
> } else {
> @@ -1078,81 +1050,13 @@ void x86_load_linux(X86MachineState *x86ms,
> }
> fclose(f);
>
> - /* append dtb to kernel */
> - if (dtb_filename) {
> - if (protocol < 0x209) {
> - fprintf(stderr, "qemu: Linux kernel too old to load a dtb\n");
> - exit(1);
> - }
> -
> - dtb_size = get_image_size(dtb_filename);
> - if (dtb_size <= 0) {
> - fprintf(stderr, "qemu: error reading dtb %s: %s\n",
> - dtb_filename, strerror(errno));
> - exit(1);
> - }
> -
> - setup_data_offset = cmdline_size;
> - cmdline_size += sizeof(SetupData) + dtb_size;
> - kernel_cmdline = g_realloc(kernel_cmdline, cmdline_size);
> - setup_data = (void *)kernel_cmdline + setup_data_offset;
> - setup_data->next = cpu_to_le64(first_setup_data);
> - first_setup_data = cmdline_addr + setup_data_offset;
> - setup_data->type = cpu_to_le32(SETUP_DTB);
> - setup_data->len = cpu_to_le32(dtb_size);
> - load_image_size(dtb_filename, setup_data->data, dtb_size);
> - }
> -
> - if (!legacy_no_rng_seed && protocol >= 0x209) {
> - setup_data_offset = cmdline_size;
> - cmdline_size += sizeof(SetupData) + RNG_SEED_LENGTH;
> - kernel_cmdline = g_realloc(kernel_cmdline, cmdline_size);
> - setup_data = (void *)kernel_cmdline + setup_data_offset;
> - setup_data->next = cpu_to_le64(first_setup_data);
> - first_setup_data = cmdline_addr + setup_data_offset;
> - setup_data->type = cpu_to_le32(SETUP_RNG_SEED);
> - setup_data->len = cpu_to_le32(RNG_SEED_LENGTH);
> - qemu_guest_getrandom_nofail(setup_data->data, RNG_SEED_LENGTH);
> - qemu_register_reset_nosnapshotload(reset_rng_seed, setup_data);
> - fw_cfg_add_bytes_callback(fw_cfg, FW_CFG_KERNEL_DATA, reset_rng_seed, NULL,
> - setup_data, kernel, kernel_size, true);
> - } else {
> - fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size);
> - }
> -
> - fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr);
> - fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, cmdline_size);
> - fw_cfg_add_bytes(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline, cmdline_size);
> - sev_load_ctx.cmdline_data = (char *)kernel_cmdline;
> - sev_load_ctx.cmdline_size = cmdline_size;
> -
> + fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size);
> fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr);
> fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size);
> sev_load_ctx.kernel_data = (char *)kernel;
> sev_load_ctx.kernel_size = kernel_size;
>
> - /*
> - * If we're starting an encrypted VM, it will be OVMF based, which uses the
> - * efi stub for booting and doesn't require any values to be placed in the
> - * kernel header. We therefore don't update the header so the hash of the
> - * kernel on the other side of the fw_cfg interface matches the hash of the
> - * file the user passed in.
> - */
> - if (!sev_enabled() && first_setup_data) {
> - SetupDataFixup *fixup = g_malloc(sizeof(*fixup));
> -
> - memcpy(setup, header, MIN(sizeof(header), setup_size));
> - /* Offset 0x250 is a pointer to the first setup_data link. */
> - fixup->pos = setup + 0x250;
> - fixup->orig_val = ldq_p(fixup->pos);
> - fixup->new_val = first_setup_data;
> - fixup->addr = cpu_to_le32(real_addr);
> - fw_cfg_add_bytes_callback(fw_cfg, FW_CFG_SETUP_ADDR, fixup_setup_data, NULL,
> - fixup, &fixup->addr, sizeof(fixup->addr), true);
> - qemu_register_reset(reset_setup_data, fixup);
> - } else {
> - fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr);
> - }
> + fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr);
> fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size);
> fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, setup, setup_size);
> sev_load_ctx.setup_data = (char *)setup;
> --
> 2.39.1