Re: [PATCH RFC 2/5] acpi/ghes: Use HEST table offsets when preparing GHES records

From: Igor Mammedov
Date: Wed Oct 02 2024 - 10:30:36 EST


On Tue, 1 Oct 2024 13:42:47 +0200
Mauro Carvalho Chehab <mchehab+huawei@xxxxxxxxxx> wrote:

> There are two pointers that are needed during error injection:
>
> 1. The start address of the CPER block to be stored;
> 2. The address of the ack, which needs a reset before next error.
>
> Calculate them preferably from the HEST table, as this allows
> checking the source ID, the size of the table and the type of
> HEST error block structures.
>
> Yet, keep the old code, as this is needed for migration purposes.
>
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@xxxxxxxxxx>
> ---
> hw/acpi/ghes.c | 93 ++++++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 83 insertions(+), 10 deletions(-)
>
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index 2c2cf444edeb..313a6e453af6 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -61,6 +61,23 @@
> */
> #define ACPI_GHES_GESB_SIZE 20
>
> +/*
> + * Offsets with regards to the start of the HEST table stored at
> + * ags->hest_addr_le, according with the memory layout map at
> + * docs/specs/acpi_hest_ghes.rst.
> + */
> +
> +/* ACPI 6.2: 18.3.2.8 Generic Hardware Error Source version 2
> + * Table 18-382 Generic Hardware Error Source version 2 (GHESv2) Structure
> + */
> +#define HEST_GHES_V2_TABLE_SIZE 92
> +#define GHES_ACK_OFFSET (64 + GAS_ADDR_OFFSET)
> +
> +/* ACPI 6.2: 18.3.2.7: Generic Hardware Error Source
> + * Table 18-380: 'Error Status Address' field
> + */
> +#define GHES_ERR_ST_ADDR_OFFSET (20 + GAS_ADDR_OFFSET)
> +
> /*
> * Values for error_severity field
> */
> @@ -218,14 +235,6 @@ static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker,
> {
> int i, error_status_block_offset;
>
> - /*
> - * TODO: Current version supports only one source.
> - * A further patch will drop this check, after adding a proper migration
> - * code, as, for the code to work, we need to store a bios pointer to the
> - * HEST table.
> - */
> - assert(num_sources == 1);
> -
> /* Build error_block_address */
> for (i = 0; i < num_sources; i++) {
> build_append_int_noprefix(hardware_errors, 0, sizeof(uint64_t));
> @@ -425,6 +434,65 @@ static void get_ghes_offsets(uint64_t ghes_addr,
> *read_ack_register_addr = ghes_addr + sizeof(uint64_t);
> }
>
> +static void get_hest_offsets(uint16_t source_id, uint64_t hest_addr,
> + uint64_t *cper_addr,
> + uint64_t *read_ack_start_addr,
> + Error **errp)

cper/read_ack are GHES specific only, aren't they?

perhaps s/get_hest_offsets/get_ghes_source_offsets/


> +{
> + uint64_t hest_err_block_addr, hest_read_ack_start_addr;
> + uint64_t err_source_struct, error_block_addr;
> + uint32_t num_sources, i;
> +
> + if (!hest_addr) {
> + return;
> + }
> +
> + cpu_physical_memory_read(hest_addr, &num_sources, sizeof(num_sources));
> +
> + err_source_struct = hest_addr + sizeof(num_sources);
> +
> + /*
> + * Currently, HEST Error source navigates only for GHESv2 tables
> + */
> +
> + for (i = 0; i < num_sources; i++) {

missing le32_to_cpu(num_sources)

> + uint64_t addr = err_source_struct;
> + uint16_t type, src_id;
> +
> + cpu_physical_memory_read(addr, &type, sizeof(type));

ditto for anything larger than 1 byte that you read from guest memory
(all over the patch)

> +
> + /* For now, we only know the size of GHESv2 table */
> + assert(type == ACPI_GHES_SOURCE_GENERIC_ERROR_V2);

Imagine that in qemu-9.3 we add a non-GHES error source, and then try to migrate
such a guest to qemu-9.2. It will explode here.
Of course, we can add some compat property to ged or the machine type to
make sure that the code works the old way in qemu-9.3 for virt-9.2,
at the expense of keeping 9.2 code in 9.3. That adds to the maintenance burden
and is fragile; also, it's a matter of time before we screw it up and release
a non-migratable/broken QEMU.

So I'd like to avoid piling up compat code/knobs on top of each other
and do it in a way where this src id lookup can gracefully skip
error sources that are not implemented yet.
This way we won't need any compat knobs to deal with in the future.

> +
> + /* It is GHES. Compare CPER source address */
> + addr += sizeof(type);
> + cpu_physical_memory_read(addr, &src_id, sizeof(src_id));
> +
> + if (src_id == source_id) {
> + break;
> + }
> +
> + err_source_struct += HEST_GHES_V2_TABLE_SIZE;
> + }
> + if (i == num_sources) {
> + error_setg(errp, "HEST: Source %d not found.", source_id);
> + return;
> + }
> +
> + /* Navigate though table address pointers */
> + hest_err_block_addr = err_source_struct + GHES_ERR_ST_ADDR_OFFSET;
> + hest_read_ack_start_addr = err_source_struct + GHES_ACK_OFFSET;

s/hest_read_ack_start_addr/hest_read_ack_addr/

> +
> + cpu_physical_memory_read(hest_err_block_addr, &error_block_addr,
> + sizeof(error_block_addr));
> +
> + cpu_physical_memory_read(error_block_addr, cper_addr,
> + sizeof(*cper_addr));
> +
> + cpu_physical_memory_read(hest_read_ack_start_addr, read_ack_start_addr,
> + sizeof(*read_ack_start_addr));
> +}
> +
> void ghes_record_cper_errors(const void *cper, size_t len,
> uint16_t source_id, Error **errp)
> {
> @@ -445,8 +513,13 @@ void ghes_record_cper_errors(const void *cper, size_t len,
> }
> ags = &acpi_ged_state->ghes_state;
>
> - get_ghes_offsets(le64_to_cpu(ags->hw_error_le),
> - &cper_addr, &read_ack_register_addr);
> + if (!ags->hest_addr_le) {
> + get_ghes_offsets(le64_to_cpu(ags->hw_error_le),

should it be named get_hw_error_offsets?

> + &cper_addr, &read_ack_register_addr);
> + } else {
> + get_hest_offsets(source_id, le64_to_cpu(ags->hest_addr_le),
> + &cper_addr, &read_ack_register_addr, errp);
> + }
>
> if (!cper_addr) {
> error_setg(errp, "can not find Generic Error Status Block");