[PATCH 07/10] platform/x86: ISST: Support partitioned systems

From: Srinivas Pandruvada
Date: Tue Apr 23 2024 - 16:48:20 EST


A partitioned system has two different PCI VSEC devices per package.
A non-partitioned system has only one PCI VSEC device per package.
The current implementation only supports non-partitioned systems.

Each partition maps a set of power domains. Other than reading from
different MMIO regions, there is no change in SST functionality. The
scope of SST control is still per power domain, so user space does not
need to be aware of the existence of partitions.

With partitions, the existing per-package information defined using
struct tpmi_sst_struct is enhanced to store information for both
partitions. A mapping function, map_partition_power_domain_id(), is
introduced, which maps a user-visible power domain ID to the correct
partition and index. This mapping function is called in get_instance()
and isst_if_clos_assoc(), before indexing into
tpmi_sst_struct->power_domain_info[].
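
Every lookup site then follows the same pattern; a condensed sketch of
the get_instance() and isst_if_clos_assoc() hunks below:

    u8 part;
    int pd;

    pd = map_partition_power_domain_id(sst_inst, id, &part);
    if (pd < 0)
            return NULL; /* or -EINVAL, depending on the call site */
    power_domain_info = &sst_inst->power_domain_info[part][pd];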

The TPMI core platform info provides the partition ID and a compute
die ID mask for each partition. Use this information to order power
domains so that compute dies are presented before IO dies, matching
the hardware-defined compute die ID for each CPU.
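
When the information header provides a cdie_mask, the IO die count for
a partition follows directly from it; a condensed sketch of the
derivation in tpmi_sst_dev_add() below:

    /* Compute dies occupy a contiguous ID range described by cdie_mask */
    u8 cdie_range = fls(cdie_mask) - ffs(cdie_mask) + 1;
    u8 io_die_cnt = num_resources - cdie_range;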

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@xxxxxxxxxxxxxxx>
Reviewed-by: Zhang Rui <rui.zhang@xxxxxxxxx>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@xxxxxxxxxxxxxxx>
---
.../intel/speed_select_if/isst_tpmi_core.c | 299 ++++++++++++++++--
1 file changed, 267 insertions(+), 32 deletions(-)

diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
index 49d573fcbd72..b8da6847622b 100644
--- a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
+++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
@@ -23,6 +23,7 @@
#include <linux/fs.h>
#include <linux/io.h>
#include <linux/kernel.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <uapi/linux/isst_if.h>

@@ -263,20 +264,33 @@ struct tpmi_per_power_domain_info {
bool write_blocked;
};

+/* Supported maximum partitions */
+#define SST_MAX_PARTITIONS 2
+
/**
* struct tpmi_sst_struct - Store sst info for a package
* @package_id: Package id for this aux device instance
* @number_of_power_domains: Number of power_domains pointed by power_domain_info pointer
* @power_domain_info: Pointer to power domains information
+ * @cdie_mask: Mask of compute dies present in a partition from hardware.
+ * This mask is not present in the version 1 information header.
+ * @io_dies: Number of IO dies in a partition. This will be 0 for a
+ * TPMI version 1 information header.
+ * @partition_mask: Mask of all partitions.
+ * @partition_mask_current: Current partition mask; some partitions may have been unbound.
*
* This structure is used to store the full SST information for a package.
- * Each package has a unique OOB PCI device, which enumerates TPMI.
- * Each Package will have multiple power_domains.
+ * Each package has one or more OOB PCI devices. Each package can contain
+ * multiple power domains.
*/
struct tpmi_sst_struct {
int package_id;
- int number_of_power_domains;
- struct tpmi_per_power_domain_info *power_domain_info;
+ struct tpmi_per_power_domain_info *power_domain_info[SST_MAX_PARTITIONS];
+ u16 cdie_mask[SST_MAX_PARTITIONS];
+ u8 number_of_power_domains[SST_MAX_PARTITIONS];
+ u8 io_dies[SST_MAX_PARTITIONS];
+ u8 partition_mask;
+ u8 partition_mask_current;
};

/**
@@ -387,6 +401,126 @@ static int sst_main(struct auxiliary_device *auxdev, struct tpmi_per_power_domai
return 0;
}

+static u8 isst_instance_count(struct tpmi_sst_struct *sst_inst)
+{
+ u8 i, max_part, count = 0;
+
+ /* Partition mask starts from bit 0 and contains 1s only */
+ max_part = hweight8(sst_inst->partition_mask);
+ for (i = 0; i < max_part; i++)
+ count += sst_inst->number_of_power_domains[i];
+
+ return count;
+}
+
+/**
+ * map_cdies() - Map user domain ID to compute domain ID
+ * @sst_inst: TPMI Instance
+ * @id: User domain ID
+ * @partition: Resolved partition
+ *
+ * Helper function for map_partition_power_domain_id() to resolve the
+ * compute domain ID and partition. Uses the hardware-provided cdie_mask
+ * for a partition as-is to resolve a compute domain ID.
+ *
+ * Return: %-EINVAL on error, otherwise mapped domain ID >= 0.
+ */
+static int map_cdies(struct tpmi_sst_struct *sst_inst, u8 id, u8 *partition)
+{
+ u8 i, max_part;
+
+ max_part = hweight8(sst_inst->partition_mask);
+ for (i = 0; i < max_part; i++) {
+ if (!(sst_inst->cdie_mask[i] & BIT(id)))
+ continue;
+
+ *partition = i;
+ return id - ffs(sst_inst->cdie_mask[i]) + 1;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * map_partition_power_domain_id() - Map user domain ID to partition domain ID
+ * @sst_inst: TPMI Instance
+ * @id: User domain ID
+ * @partition: Resolved partition
+ *
+ * In a partitioned system a CPU package has two separate MMIO ranges (under
+ * two PCI devices). But the compute die/power domain IDs are unique within a
+ * package. User space can get the compute die/power domain ID for a CPU from
+ * CPUID and MSR 0x54. So those IDs need to be preserved even if they are
+ * spread across two different partitions, each with its own ordering.
+ *
+ * For example, for the ISST_IF_COUNT_TPMI_INSTANCES command, the valid_mask
+ * is 111111b for a system with 4 compute dies and 2 IO dies. In a
+ * non-partitioned system this is presented as provided by the hardware, in
+ * the following order:
+ * I1-I0-C3-C2-C1-C0
+ * Here "C" denotes a compute die and "I" an IO die.
+ * Compute dies are always present first in TPMI instances, as they have
+ * to map to the real power domain/die ID of a system. In a non-partitioned
+ * system there is no way to identify compute and IO die boundaries from
+ * this driver without reading each CPU's mapping.
+ *
+ * The same order needs to be preserved, even if those compute dies are
+ * distributed among multiple partitions. For example:
+ * Partition 0 can contain: I0-C1-C0
+ * Partition 1 can contain: I1-C3-C2
+ *
+ * This requires converting user space IDs to the actual index into the
+ * array of power domains stored for each partition. For the above example
+ * this function will return partition and index as follows:
+ *
+ * ============= ========= ===== ========
+ * User space ID Partition Index Die type
+ * ============= ========= ===== ========
+ * 0 0 0 Compute
+ * 1 0 1 Compute
+ * 2 1 0 Compute
+ * 3 1 1 Compute
+ * 4 0 2 IO
+ * 5 1 2 IO
+ * ============= ========= ===== ========
+ *
+ * Return: %-EINVAL on error, otherwise mapped domain ID >= 0.
+ */
+static int map_partition_power_domain_id(struct tpmi_sst_struct *sst_inst, u8 id, u8 *partition)
+{
+ u8 i, io_start_id, max_part;
+
+ *partition = 0;
+
+ /* If any PCI device for a partition is unbound, treat this as a failure */
+ if (sst_inst->partition_mask != sst_inst->partition_mask_current)
+ return -EINVAL;
+
+ max_part = hweight8(sst_inst->partition_mask);
+
+ /* User space IDs for IO dies begin here */
+ io_start_id = fls(sst_inst->cdie_mask[max_part - 1]);
+
+ if (id < io_start_id)
+ return map_cdies(sst_inst, id, partition);
+
+ for (i = 0; i < max_part; i++) {
+ u8 io_id;
+
+ io_id = id - io_start_id;
+ if (io_id < sst_inst->io_dies[i]) {
+ u8 cdie_range;
+
+ cdie_range = fls(sst_inst->cdie_mask[i]) - ffs(sst_inst->cdie_mask[i]) + 1;
+ *partition = i;
+ return cdie_range + io_id;
+ }
+ io_start_id += sst_inst->io_dies[i];
+ }
+
+ return -EINVAL;
+}
+
/*
* Map a package and power_domain id to SST information structure unique for a power_domain.
* The caller should call under isst_tpmi_dev_lock.
@@ -395,6 +529,7 @@ static struct tpmi_per_power_domain_info *get_instance(int pkg_id, int power_dom
{
struct tpmi_per_power_domain_info *power_domain_info;
struct tpmi_sst_struct *sst_inst;
+ u8 part;

if (pkg_id < 0 || pkg_id > isst_common.max_index ||
pkg_id >= topology_max_packages())
@@ -404,10 +539,11 @@ static struct tpmi_per_power_domain_info *get_instance(int pkg_id, int power_dom
if (!sst_inst)
return NULL;

- if (power_domain_id < 0 || power_domain_id >= sst_inst->number_of_power_domains)
+ power_domain_id = map_partition_power_domain_id(sst_inst, power_domain_id, &part);
+ if (power_domain_id < 0)
return NULL;

- power_domain_info = &sst_inst->power_domain_info[power_domain_id];
+ power_domain_info = &sst_inst->power_domain_info[part][power_domain_id];

if (power_domain_info && !power_domain_info->sst_base)
return NULL;
@@ -579,6 +715,7 @@ static long isst_if_clos_assoc(void __user *argp)
struct tpmi_sst_struct *sst_inst;
int offset, shift, cpu;
u64 val, mask, clos;
+ u8 part;

if (copy_from_user(&clos_assoc, ptr, sizeof(clos_assoc)))
return -EFAULT;
@@ -602,10 +739,11 @@ static long isst_if_clos_assoc(void __user *argp)

sst_inst = isst_common.sst_inst[pkg_id];

- if (clos_assoc.power_domain_id > sst_inst->number_of_power_domains)
+ punit_id = map_partition_power_domain_id(sst_inst, punit_id, &part);
+ if (punit_id < 0)
return -EINVAL;

- power_domain_info = &sst_inst->power_domain_info[punit_id];
+ power_domain_info = &sst_inst->power_domain_info[part][punit_id];

if (assoc_cmds.get_set && power_domain_info->write_blocked)
return -EPERM;
@@ -1134,18 +1272,28 @@ static int isst_if_get_tpmi_instance_count(void __user *argp)
if (tpmi_inst.socket_id >= topology_max_packages())
return -EINVAL;

- tpmi_inst.count = isst_common.sst_inst[tpmi_inst.socket_id]->number_of_power_domains;
-
sst_inst = isst_common.sst_inst[tpmi_inst.socket_id];
+
+ tpmi_inst.count = isst_instance_count(sst_inst);
+
tpmi_inst.valid_mask = 0;
- for (i = 0; i < sst_inst->number_of_power_domains; ++i) {
+ for (i = 0; i < tpmi_inst.count; i++) {
struct tpmi_per_power_domain_info *pd_info;
+ u8 part;
+ int pd;
+
+ pd = map_partition_power_domain_id(sst_inst, i, &part);
+ if (pd < 0)
+ continue;

- pd_info = &sst_inst->power_domain_info[i];
+ pd_info = &sst_inst->power_domain_info[part][pd];
if (pd_info->sst_base)
tpmi_inst.valid_mask |= BIT(i);
}

+ if (!tpmi_inst.valid_mask)
+ tpmi_inst.count = 0;
+
if (copy_to_user(argp, &tpmi_inst, sizeof(tpmi_inst)))
return -EFAULT;

@@ -1276,8 +1424,11 @@ int tpmi_sst_dev_add(struct auxiliary_device *auxdev)
struct intel_tpmi_plat_info *plat_info;
struct device *dev = &auxdev->dev;
struct tpmi_sst_struct *tpmi_sst;
- int i, ret, pkg = 0, inst = 0;
- int num_resources;
+ u8 i, num_resources, io_die_cnt;
+ int ret, pkg = 0, inst = 0;
+ bool first_enum = false;
+ u16 cdie_mask;
+ u8 partition;

ret = tpmi_get_feature_status(auxdev, TPMI_ID_SST, &read_blocked, &write_blocked);
if (ret)
@@ -1300,21 +1451,59 @@ int tpmi_sst_dev_add(struct auxiliary_device *auxdev)
return -EINVAL;
}

- if (isst_common.sst_inst[pkg])
- return -EEXIST;
+ partition = plat_info->partition;
+ if (partition >= SST_MAX_PARTITIONS) {
+ dev_err(&auxdev->dev, "Invalid partition: %x\n", partition);
+ return -EINVAL;
+ }

num_resources = tpmi_get_resource_count(auxdev);

if (!num_resources)
return -EINVAL;

- tpmi_sst = devm_kzalloc(dev, sizeof(*tpmi_sst), GFP_KERNEL);
- if (!tpmi_sst)
- return -ENOMEM;
+ mutex_lock(&isst_tpmi_dev_lock);
+
+ if (isst_common.sst_inst[pkg]) {
+ tpmi_sst = isst_common.sst_inst[pkg];
+ } else {
+ /*
+ * The tpmi_sst instance is per package, so it needs to be
+ * allocated only once for both partitions. devm_*() allocation
+ * can't be used here because each partition is a different
+ * device, which can be unbound independently.
+ */
+ tpmi_sst = kzalloc(sizeof(*tpmi_sst), GFP_KERNEL);
+ if (!tpmi_sst) {
+ ret = -ENOMEM;
+ goto unlock_exit;
+ }
+ first_enum = true;
+ }
+
+ ret = 0;

pd_info = devm_kcalloc(dev, num_resources, sizeof(*pd_info), GFP_KERNEL);
- if (!pd_info)
- return -ENOMEM;
+ if (!pd_info) {
+ ret = -ENOMEM;
+ goto unlock_free;
+ }
+
+ /* Get the IO die count, if cdie_mask is present */
+ if (plat_info->cdie_mask) {
+ u8 cdie_range;
+
+ cdie_mask = plat_info->cdie_mask;
+ cdie_range = fls(cdie_mask) - ffs(cdie_mask) + 1;
+ io_die_cnt = num_resources - cdie_range;
+ } else {
+ /*
+ * This is a synthetic mask; be careful when assuming that
+ * these are compute dies only.
+ */
+ cdie_mask = (1 << num_resources) - 1;
+ io_die_cnt = 0;
+ }

for (i = 0; i < num_resources; ++i) {
struct resource *res;
@@ -1330,11 +1519,20 @@ int tpmi_sst_dev_add(struct auxiliary_device *auxdev)
pd_info[i].auxdev = auxdev;
pd_info[i].write_blocked = write_blocked;
pd_info[i].sst_base = devm_ioremap_resource(dev, res);
- if (IS_ERR(pd_info[i].sst_base))
- return PTR_ERR(pd_info[i].sst_base);
+ if (IS_ERR(pd_info[i].sst_base)) {
+ ret = PTR_ERR(pd_info[i].sst_base);
+ goto unlock_free;
+ }

ret = sst_main(auxdev, &pd_info[i]);
if (ret) {
+ /*
+ * This entry is not valid; hardware can partially populate
+ * dies, in which case the MMIO region reads as all 0xFFs.
+ * It is also possible that some pre-production hardware
+ * has invalid data. Don't fail; continue to use the other
+ * dies with valid data.
+ */
devm_iounmap(dev, pd_info[i].sst_base);
pd_info[i].sst_base = NULL;
continue;
@@ -1343,30 +1541,53 @@ int tpmi_sst_dev_add(struct auxiliary_device *auxdev)
++inst;
}

- if (!inst)
- return -ENODEV;
+ if (!inst) {
+ ret = -ENODEV;
+ goto unlock_free;
+ }

tpmi_sst->package_id = pkg;
- tpmi_sst->power_domain_info = pd_info;
- tpmi_sst->number_of_power_domains = num_resources;
+
+ tpmi_sst->power_domain_info[partition] = pd_info;
+ tpmi_sst->number_of_power_domains[partition] = num_resources;
+ tpmi_sst->cdie_mask[partition] = cdie_mask;
+ tpmi_sst->io_dies[partition] = io_die_cnt;
+ tpmi_sst->partition_mask |= BIT(partition);
+ tpmi_sst->partition_mask_current |= BIT(partition);
+
auxiliary_set_drvdata(auxdev, tpmi_sst);

- mutex_lock(&isst_tpmi_dev_lock);
if (isst_common.max_index < pkg)
isst_common.max_index = pkg;
isst_common.sst_inst[pkg] = tpmi_sst;
+
+unlock_free:
+ if (ret && first_enum)
+ kfree(tpmi_sst);
+unlock_exit:
mutex_unlock(&isst_tpmi_dev_lock);

- return 0;
+ return ret;
}
EXPORT_SYMBOL_NS_GPL(tpmi_sst_dev_add, INTEL_TPMI_SST);

void tpmi_sst_dev_remove(struct auxiliary_device *auxdev)
{
struct tpmi_sst_struct *tpmi_sst = auxiliary_get_drvdata(auxdev);
+ struct intel_tpmi_plat_info *plat_info;
+
+ plat_info = tpmi_get_platform_data(auxdev);
+ if (!plat_info)
+ return;

mutex_lock(&isst_tpmi_dev_lock);
- isst_common.sst_inst[tpmi_sst->package_id] = NULL;
+ tpmi_sst->power_domain_info[plat_info->partition] = NULL;
+ tpmi_sst->partition_mask_current &= ~BIT(plat_info->partition);
+ /* Free the package instance when all partitions are removed */
+ if (!tpmi_sst->partition_mask_current) {
+ kfree(tpmi_sst);
+ isst_common.sst_inst[tpmi_sst->package_id] = NULL;
+ }
mutex_unlock(&isst_tpmi_dev_lock);
}
EXPORT_SYMBOL_NS_GPL(tpmi_sst_dev_remove, INTEL_TPMI_SST);
@@ -1374,9 +1595,16 @@ EXPORT_SYMBOL_NS_GPL(tpmi_sst_dev_remove, INTEL_TPMI_SST);
void tpmi_sst_dev_suspend(struct auxiliary_device *auxdev)
{
struct tpmi_sst_struct *tpmi_sst = auxiliary_get_drvdata(auxdev);
- struct tpmi_per_power_domain_info *power_domain_info = tpmi_sst->power_domain_info;
+ struct tpmi_per_power_domain_info *power_domain_info;
+ struct intel_tpmi_plat_info *plat_info;
void __iomem *cp_base;

+ plat_info = tpmi_get_platform_data(auxdev);
+ if (!plat_info)
+ return;
+
+ power_domain_info = tpmi_sst->power_domain_info[plat_info->partition];
+
cp_base = power_domain_info->sst_base + power_domain_info->sst_header.cp_offset;
power_domain_info->saved_sst_cp_control = readq(cp_base + SST_CP_CONTROL_OFFSET);

@@ -1395,9 +1623,16 @@ EXPORT_SYMBOL_NS_GPL(tpmi_sst_dev_suspend, INTEL_TPMI_SST);
void tpmi_sst_dev_resume(struct auxiliary_device *auxdev)
{
struct tpmi_sst_struct *tpmi_sst = auxiliary_get_drvdata(auxdev);
- struct tpmi_per_power_domain_info *power_domain_info = tpmi_sst->power_domain_info;
+ struct tpmi_per_power_domain_info *power_domain_info;
+ struct intel_tpmi_plat_info *plat_info;
void __iomem *cp_base;

+ plat_info = tpmi_get_platform_data(auxdev);
+ if (!plat_info)
+ return;
+
+ power_domain_info = tpmi_sst->power_domain_info[plat_info->partition];
+
cp_base = power_domain_info->sst_base + power_domain_info->sst_header.cp_offset;
writeq(power_domain_info->saved_sst_cp_control, cp_base + SST_CP_CONTROL_OFFSET);

--
2.40.1