[PATCH v1 1/4] xen/acpi: upload power and performance related data from a PVH dom0

From: Penny Zheng
Date: Wed Dec 04 2024 - 03:27:17 EST


From: Roger Pau Monné <roger.pau@xxxxxxxxxx>

When running as a PVH dom0 the ACPI MADT is crafted by Xen in order to
report the correct numbers of vCPUs that dom0 has, so the host MADT is
not provided to dom0. This creates issues when parsing the power and
performance related data from ACPI dynamic tables, as the ACPI
Processor UIDs found in the dynamic tables are likely not to match the
ones crafted by Xen in the dom0 MADT.

Xen would rely on Linux having filled at least the power and
performance related data of the vCPUs on the system, and would clone
that information in order to set up the remaining pCPUs on the system
if dom0 vCPUs < pCPUs. However, when running as a PVH dom0 it's likely
that none of the dom0 CPUs will have the power and performance data
filled, and hence the Xen ACPI Processor driver needs to fetch that
information by itself.

In order to do so correctly, introduce a new helper to fetch the _CST
data without taking into account the system capabilities from the
CPUID output, as the capabilities reported to dom0 in CPUID might be
different from the ones on the host.

Note that the newly introduced code will only fetch the _CST, _PSS,
_PPC and _PCT from a single CPU, and clone that information for all the
other Processors. This won't work on a heterogeneous system with
Processors having different power and performance related data between
them.

Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
Signed-off-by: Jason Andryuk <jason.andryuk@xxxxxxx>
---
drivers/xen/pcpu.c | 3 +-
drivers/xen/xen-acpi-processor.c | 231 ++++++++++++++++++++++++++++---
include/xen/xen.h | 2 +-
3 files changed, 217 insertions(+), 19 deletions(-)

diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c
index c63f317e3df3..dc9f2c14bf62 100644
--- a/drivers/xen/pcpu.c
+++ b/drivers/xen/pcpu.c
@@ -388,7 +388,7 @@ static int __init xen_pcpu_init(void)
arch_initcall(xen_pcpu_init);

#ifdef CONFIG_ACPI
-bool __init xen_processor_present(uint32_t acpi_id)
+bool xen_processor_present(uint32_t acpi_id)
{
const struct pcpu *pcpu;
bool online = false;
@@ -403,6 +403,7 @@ bool __init xen_processor_present(uint32_t acpi_id)

return online;
}
+EXPORT_SYMBOL_GPL(xen_processor_present);

void xen_sanitize_proc_cap_bits(uint32_t *cap)
{
diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c
index 296703939846..74dcc84b9199 100644
--- a/drivers/xen/xen-acpi-processor.c
+++ b/drivers/xen/xen-acpi-processor.c
@@ -48,6 +48,8 @@ static unsigned long *acpi_id_cst_present;
/* Which ACPI P-State dependencies for a enumerated processor */
static struct acpi_psd_package *acpi_psd;

+static bool pr_initialized;
+
static int push_cxx_to_hypervisor(struct acpi_processor *_pr)
{
struct xen_platform_op op = {
@@ -172,8 +174,13 @@ static int xen_copy_psd_data(struct acpi_processor *_pr,

/* 'acpi_processor_preregister_performance' does not parse if the
* num_processors <= 1, but Xen still requires it. Do it manually here.
+ *
+ * Also init the field if not set, as that's possible if the physical
+ * CPUs on the system don't match the data provided in the MADT when
+ * running as a PVH dom0.
*/
- if (pdomain->num_processors <= 1) {
+ if (pdomain->num_processors <= 1 ||
+ dst->shared_type == CPUFREQ_SHARED_TYPE_NONE) {
if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
dst->shared_type = CPUFREQ_SHARED_TYPE_ALL;
else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
@@ -313,6 +320,155 @@ static unsigned int __init get_max_acpi_id(void)
pr_debug("Max ACPI ID: %u\n", max_acpi_id);
return max_acpi_id;
}
+
+/*
+ * Custom version of the native acpi_processor_evaluate_cst() function, to
+ * avoid some sanity checks done based on the CPUID data. When running as a
+ * Xen domain the CPUID data provided to dom0 is not the native one, so C
+ * states cannot be sanity checked. Leave it to the hypervisor which is also
+ * the entity running the driver.
+ */
+static int xen_acpi_processor_evaluate_cst(acpi_handle handle,
+ struct acpi_processor_power *info)
+{
+ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+ union acpi_object *cst;
+ acpi_status status;
+ u64 count;
+ int last_index = 0;
+ int i, ret = 0;
+
+ status = acpi_evaluate_object(handle, "_CST", NULL, &buffer);
+ if (ACPI_FAILURE(status)) {
+ acpi_handle_debug(handle, "No _CST\n");
+ return -ENODEV;
+ }
+
+ cst = buffer.pointer;
+
+ /* There must be at least 2 elements. */
+ if (!cst || cst->type != ACPI_TYPE_PACKAGE || cst->package.count < 2) {
+ acpi_handle_warn(handle, "Invalid _CST output\n");
+ ret = -EFAULT;
+ goto end;
+ }
+
+ count = cst->package.elements[0].integer.value;
+
+ /* Validate the number of C-states. */
+ if (count < 1 || count != cst->package.count - 1) {
+ acpi_handle_warn(handle, "Inconsistent _CST data\n");
+ ret = -EFAULT;
+ goto end;
+ }
+
+ for (i = 1; i <= count; i++) {
+ union acpi_object *element;
+ union acpi_object *obj;
+ struct acpi_power_register *reg;
+ struct acpi_processor_cx cx;
+
+ /*
+ * If there is not enough space for all C-states, skip the
+ * excess ones and log a warning.
+ */
+ if (last_index >= ACPI_PROCESSOR_MAX_POWER - 1) {
+ acpi_handle_warn(handle, "No room for more idle states (limit: %d)\n",
+ ACPI_PROCESSOR_MAX_POWER - 1);
+ break;
+ }
+
+ memset(&cx, 0, sizeof(cx));
+
+ element = &cst->package.elements[i];
+ if (element->type != ACPI_TYPE_PACKAGE) {
+ acpi_handle_info(handle, "_CST C%d type(%x) is not package, skip...\n",
+ i, element->type);
+ continue;
+ }
+
+ if (element->package.count != 4) {
+ acpi_handle_info(handle, "_CST C%d package count(%d) is not 4, skip...\n",
+ i, element->package.count);
+ continue;
+ }
+
+ obj = &element->package.elements[0];
+
+ if (obj->type != ACPI_TYPE_BUFFER) {
+ acpi_handle_info(handle, "_CST C%d package element[0] type(%x) is not buffer, skip...\n",
+ i, obj->type);
+ continue;
+ }
+
+ reg = (struct acpi_power_register *)obj->buffer.pointer;
+
+ obj = &element->package.elements[1];
+ if (obj->type != ACPI_TYPE_INTEGER) {
+ acpi_handle_info(handle, "_CST C[%d] package element[1] type(%x) is not integer, skip...\n",
+ i, obj->type);
+ continue;
+ }
+
+ cx.type = obj->integer.value;
+ /*
+ * There are known cases in which the _CST output does not
+ * contain C1, so if the type of the first state found is not
+ * C1, leave an empty slot for C1 to be filled in later.
+ */
+ if (i == 1 && cx.type != ACPI_STATE_C1)
+ last_index = 1;
+
+ cx.address = reg->address;
+ cx.index = last_index + 1;
+
+ switch (reg->space_id) {
+ case ACPI_ADR_SPACE_FIXED_HARDWARE:
+ cx.entry_method = ACPI_CSTATE_FFH;
+ break;
+
+ case ACPI_ADR_SPACE_SYSTEM_IO:
+ cx.entry_method = ACPI_CSTATE_SYSTEMIO;
+ break;
+
+ default:
+ acpi_handle_info(handle, "_CST C%d space_id(%x) neither FIXED_HARDWARE nor SYSTEM_IO, skip...\n",
+ i, reg->space_id);
+ continue;
+ }
+
+ if (cx.type == ACPI_STATE_C1)
+ cx.valid = 1;
+
+ obj = &element->package.elements[2];
+ if (obj->type != ACPI_TYPE_INTEGER) {
+ acpi_handle_info(handle, "_CST C%d package element[2] type(%x) not integer, skip...\n",
+ i, obj->type);
+ continue;
+ }
+
+ cx.latency = obj->integer.value;
+
+ obj = &element->package.elements[3];
+ if (obj->type != ACPI_TYPE_INTEGER) {
+ acpi_handle_info(handle, "_CST C%d package element[3] type(%x) not integer, skip...\n",
+ i, obj->type);
+ continue;
+ }
+
+ memcpy(&info->states[++last_index], &cx, sizeof(cx));
+ }
+
+ acpi_handle_info(handle, "Found %d idle states\n", last_index);
+
+ info->count = last_index;
+
+end:
+ kfree(buffer.pointer);
+
+ return ret;
+}
+
/*
* The read_acpi_id and check_acpi_ids are there to support the Xen
* oddity of virtual CPUs != physical CPUs in the initial domain.
@@ -331,6 +487,7 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv)
unsigned long long tmp;
union acpi_object object = { 0 };
struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+ struct acpi_buffer cst_buf = { ACPI_ALLOCATE_BUFFER, NULL };
acpi_io_address pblk = 0;

status = acpi_get_type(handle, &acpi_type);
@@ -354,24 +511,45 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv)
default:
return AE_OK;
}
- if (invalid_phys_cpuid(acpi_get_phys_id(handle,
- acpi_type == ACPI_TYPE_DEVICE,
- acpi_id))) {
+
+ if (!xen_processor_present(acpi_id)) {
pr_debug("CPU with ACPI ID %u is unavailable\n", acpi_id);
return AE_OK;
}
- /* There are more ACPI Processor objects than in x2APIC or MADT.
- * This can happen with incorrect ACPI SSDT declerations. */
- if (acpi_id >= nr_acpi_bits) {
- pr_debug("max acpi id %u, trying to set %u\n",
- nr_acpi_bits - 1, acpi_id);
- return AE_OK;
- }
+
/* OK, There is a ACPI Processor object */
__set_bit(acpi_id, acpi_id_present);

pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk);

+ if (!pr_initialized) {
+ struct acpi_processor *pr = context;
+ int rc, rc2;
+
+ /*
+ * There's no CPU on the system that has any performance or
+ * power related data, initialize all the required fields by
+ * fetching that info here.
+ *
+ * Note such information is only fetched once, and then reused
+ * for all pCPUs. This won't work on heterogeneous systems
+ * with different Cx and/or Px states between CPUs.
+ */
+
+ pr->handle = handle;
+
+ rc = acpi_processor_get_performance_info(pr);
+ if (rc)
+ pr_debug("ACPI CPU%u failed to get performance data\n",
+ acpi_id);
+ rc2 = xen_acpi_processor_evaluate_cst(handle, &pr->power);
+ if (rc2)
+ pr_debug("ACPI CPU%u failed to get _CST data\n", acpi_id);
+
+ if (!rc && !rc2)
+ pr_initialized = true;
+ }
+
/* It has P-state dependencies */
if (!acpi_processor_get_psd(handle, &acpi_psd[acpi_id])) {
pr_debug("ACPI CPU%u w/ PST:coord_type = %llu domain = %llu\n",
@@ -379,11 +557,13 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv)
acpi_psd[acpi_id].domain);
}

- status = acpi_evaluate_object(handle, "_CST", NULL, &buffer);
+ status = acpi_evaluate_object(handle, "_CST", NULL, &cst_buf);
if (ACPI_FAILURE(status)) {
if (!pblk)
return AE_OK;
}
+ kfree(cst_buf.pointer);
+
/* .. and it has a C-state */
__set_bit(acpi_id, acpi_id_cst_present);

@@ -392,8 +572,7 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv)
static int check_acpi_ids(struct acpi_processor *pr_backup)
{

- if (!pr_backup)
- return -ENODEV;
+ BUG_ON(!pr_backup);

if (acpi_id_present && acpi_id_cst_present)
/* OK, done this once .. skip to uploading */
@@ -422,8 +601,8 @@ static int check_acpi_ids(struct acpi_processor *pr_backup)

acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
ACPI_UINT32_MAX,
- read_acpi_id, NULL, NULL, NULL);
- acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID, read_acpi_id, NULL, NULL);
+ read_acpi_id, NULL, pr_backup, NULL);
+ acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID, read_acpi_id, pr_backup, NULL);

upload:
if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) {
@@ -464,6 +643,7 @@ static int xen_upload_processor_pm_data(void)
struct acpi_processor *pr_backup = NULL;
int i;
int rc = 0;
+ bool free_perf = false;

pr_info("Uploading Xen processor PM info\n");

@@ -473,12 +653,29 @@ static int xen_upload_processor_pm_data(void)
if (!_pr)
continue;

- if (!pr_backup)
+ if (!pr_backup) {
pr_backup = kmemdup(_pr, sizeof(*_pr), GFP_KERNEL);
+ pr_initialized = true;
+ }
(void)upload_pm_data(_pr);
}

+ if (!pr_backup) {
+ pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
+ if (!pr_backup)
+ return -ENOMEM;
+ pr_backup->performance = kzalloc(sizeof(struct acpi_processor_performance),
+ GFP_KERNEL);
+ if (!pr_backup->performance) {
+ kfree(pr_backup);
+ return -ENOMEM;
+ }
+ free_perf = true;
+ }
+
rc = check_acpi_ids(pr_backup);
+ if (free_perf)
+ kfree(pr_backup->performance);
kfree(pr_backup);

return rc;
diff --git a/include/xen/xen.h b/include/xen/xen.h
index a1e5b3f18d69..6ff3e2f40803 100644
--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -81,7 +81,7 @@ static inline void xen_free_unpopulated_pages(unsigned int nr_pages,
#endif

#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI) && defined(CONFIG_X86)
-bool __init xen_processor_present(uint32_t acpi_id);
+bool xen_processor_present(uint32_t acpi_id);
#else
#include <linux/bug.h>
static inline bool xen_processor_present(uint32_t acpi_id)
--
2.34.1