[PATCH] EDAC/{i10nm,skx,skx_common}: Support multiple clumps

From: Kyle Meyer
Date: Thu Dec 05 2024 - 12:28:33 EST


The 3-bit source IDs in PCI configuration space registers are limited to
8 unique IDs, and each ID is local to a clump (UPI/QPI domain).

Source IDs cannot be used to map devices to sockets on systems with
multiple clumps because the same source IDs repeat in every clump.

On systems with multiple clumps, get package IDs instead of source IDs,
and use the package/source ID (instead of the node ID) to name IMC
information structures.

Signed-off-by: Kyle Meyer <kyle.meyer@xxxxxxx>
---
drivers/edac/i10nm_base.c | 21 +++++++++-------
drivers/edac/skx_base.c | 19 ++++++++------
drivers/edac/skx_common.c | 52 +++++++++++++++++++++++++++++++++------
drivers/edac/skx_common.h | 5 ++--
4 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 51556c72a967..59384677d025 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -1010,7 +1010,7 @@ static struct notifier_block i10nm_mce_dec = {

static int __init i10nm_init(void)
{
- u8 mc = 0, src_id = 0, node_id = 0;
+ u8 mc = 0, src_id = 0;
const struct x86_cpu_id *id;
struct res_config *cfg;
const char *owner;
@@ -1018,6 +1018,7 @@ static int __init i10nm_init(void)
int rc, i, off[3] = {0xd0, 0xc8, 0xcc};
u64 tolm, tohm;
int imc_num;
+ int dup_src_ids = 0;

edac_dbg(2, "\n");

@@ -1065,24 +1066,26 @@ static int __init i10nm_init(void)

imc_num = res_cfg->ddr_imc_num + res_cfg->hbm_imc_num;

- list_for_each_entry(d, i10nm_edac_list, list) {
- rc = skx_get_src_id(d, 0xf8, &src_id);
- if (rc < 0)
- goto fail;
+ rc = dup_src_ids = skx_check_dup_src_ids(0xf8);
+ if (rc < 0)
+ goto fail;

- rc = skx_get_node_id(d, &node_id);
+ list_for_each_entry(d, i10nm_edac_list, list) {
+ if (dup_src_ids)
+ rc = skx_get_pkg_id(d, &src_id);
+ else
+ rc = skx_get_src_id(d, 0xf8, &src_id);
if (rc < 0)
goto fail;

- edac_dbg(2, "src_id = %d node_id = %d\n", src_id, node_id);
+ edac_dbg(2, "src_id = %d\n", src_id);
for (i = 0; i < imc_num; i++) {
if (!d->imc[i].mdev)
continue;

d->imc[i].mc = mc++;
d->imc[i].lmc = i;
- d->imc[i].src_id = src_id;
- d->imc[i].node_id = node_id;
+ d->imc[i].src_id = src_id;
if (d->imc[i].hbm_mc) {
d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz;
d->imc[i].num_channels = cfg->hbm_chan_num;
diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index 14cfd394b469..189b8c5a1bda 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -600,8 +600,9 @@ static int __init skx_init(void)
const struct munit *m;
const char *owner;
int rc = 0, i, off[3] = {0xd0, 0xd4, 0xd8};
- u8 mc = 0, src_id, node_id;
+ u8 mc = 0, src_id;
struct skx_dev *d;
+ int dup_src_ids = 0;

edac_dbg(2, "\n");

@@ -646,19 +647,23 @@ static int __init skx_init(void)
}
}

+ rc = dup_src_ids = skx_check_dup_src_ids(0xf0);
+ if (rc < 0)
+ goto fail;
+
list_for_each_entry(d, skx_edac_list, list) {
- rc = skx_get_src_id(d, 0xf0, &src_id);
- if (rc < 0)
- goto fail;
- rc = skx_get_node_id(d, &node_id);
+ if (dup_src_ids)
+ rc = skx_get_pkg_id(d, &src_id);
+ else
+ rc = skx_get_src_id(d, 0xf0, &src_id);
if (rc < 0)
goto fail;
- edac_dbg(2, "src_id=%d node_id=%d\n", src_id, node_id);
+
+ edac_dbg(2, "src_id = %d\n", src_id);
for (i = 0; i < SKX_NUM_IMC; i++) {
d->imc[i].mc = mc++;
d->imc[i].lmc = i;
d->imc[i].src_id = src_id;
- d->imc[i].node_id = node_id;
rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev,
"Skylake Socket", EDAC_MOD_STR,
skx_get_dimm_config, cfg);
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 6cf17af7d911..56fec7310f40 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -235,19 +235,55 @@ int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
}
EXPORT_SYMBOL_GPL(skx_get_src_id);

-int skx_get_node_id(struct skx_dev *d, u8 *id)
+int skx_check_dup_src_ids(int off)
{
- u32 reg;
+ u8 id;
+ struct skx_dev *d;
+ int rc;
+ DECLARE_BITMAP(id_map, 8);

- if (pci_read_config_dword(d->util_all, 0xf4, &reg)) {
- skx_printk(KERN_ERR, "Failed to read node id\n");
- return -ENODEV;
+ bitmap_zero(id_map, 8);
+
+ /*
+ * The 3-bit source IDs in PCI configuration space registers are limited
+ * to 8 unique IDs, and each ID is local to a clump (UPI/QPI domain).
+ */
+ list_for_each_entry(d, &dev_edac_list, list) {
+ rc = skx_get_src_id(d, off, &id);
+ if (rc < 0)
+ return rc;
+
+ if (test_bit(id, id_map))
+ return 1;
+
+ set_bit(id, id_map);
}

- *id = GET_BITFIELD(reg, 0, 2);
return 0;
}
-EXPORT_SYMBOL_GPL(skx_get_node_id);
+EXPORT_SYMBOL_GPL(skx_check_dup_src_ids);
+
+int skx_get_pkg_id(struct skx_dev *d, u8 *id)
+{
+ int node;
+ int cpu;
+
+ node = pcibus_to_node(d->util_all->bus);
+ if (numa_valid_node(node)) {
+ for_each_cpu(cpu, cpumask_of_pcibus(d->util_all->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node) {
+ *id = c->topo.pkg_id;
+ return 0;
+ }
+ }
+ }
+
+ skx_printk(KERN_ERR, "Failed to get package ID from NUMA information\n");
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(skx_get_pkg_id);

static int get_width(u32 mtr)
{
@@ -507,7 +543,7 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev,
pvt->imc = imc;

mci->ctl_name = kasprintf(GFP_KERNEL, "%s#%d IMC#%d", ctl_name,
- imc->node_id, imc->lmc);
+ imc->src_id, imc->lmc);
if (!mci->ctl_name) {
rc = -ENOMEM;
goto fail0;
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 54bba8a62f72..0f06d45c9b3e 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -103,7 +103,7 @@ struct skx_dev {
bool hbm_mc;
u8 mc; /* system wide mc# */
u8 lmc; /* socket relative mc# */
- u8 src_id, node_id;
+ u8 src_id;
struct skx_channel {
struct pci_dev *cdev;
struct pci_dev *edev;
@@ -244,7 +244,8 @@ void skx_set_mem_cfg(bool mem_cfg_2lm);
void skx_set_res_cfg(struct res_config *cfg);

int skx_get_src_id(struct skx_dev *d, int off, u8 *id);
-int skx_get_node_id(struct skx_dev *d, u8 *id);
+int skx_check_dup_src_ids(int off);
+int skx_get_pkg_id(struct skx_dev *d, u8 *id);

int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list);

--
2.47.1