Re: [PATCH] EDAC/{i10nm,skx,skx_common}: Support multiple clumps

From: Luck, Tony
Date: Thu Dec 05 2024 - 17:52:13 EST


On Thu, Dec 05, 2024 at 02:05:36PM -0600, Kyle Meyer wrote:
> On Thu, Dec 05, 2024 at 11:13:23AM -0800, Luck, Tony wrote:
> > On Thu, Dec 05, 2024 at 10:59:54AM -0600, Kyle Meyer wrote:
> > > The 3-bit source IDs in PCI configuration space registers are limited to
> > > 8 unique IDs, and each ID is local to a clump (UPI/QPI domain).
> >
> > Is there any better name than "clump"?
>
> Yes, a UPI/QPI domain.

Ok, let's use that.

> > > Source IDs can not be used to map devices to sockets on systems with
> > > multiple clumps because each clump has identical repeating source IDs.
> > >
> > > Get package IDs instead of source IDs on systems with multiple clumps
> > > and use package/source IDs to name IMC information structures.
> >
> > What would happen if you just assumed the system had clumps and you
> > always used package/source? Would that change EDAC naming for
> > existing systems? That would be less complexity for the driver.
>
> That works if NUMA is enabled. skx_get_pkg_id() uses NUMA information to
> determine the package ID.

I cobbled together this patch on top of yours. I think it is
functionally equivalent. The #ifdef CONFIG_NUMA in the ".c"
file is ugly, but serves to illustrate what I'm trying to do
without too much cleanup noise.

1) Does this work? I tried on a non-clumpy system that is NUMA.

2) Is it better (assuming #fidef factored off into a .h file)?


-Tony

---

diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 0f06d45c9b3e..b0845bdd4516 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -244,8 +244,6 @@ void skx_set_mem_cfg(bool mem_cfg_2lm);
void skx_set_res_cfg(struct res_config *cfg);

int skx_get_src_id(struct skx_dev *d, int off, u8 *id);
-int skx_check_dup_src_ids(int off);
-int skx_get_pkg_id(struct skx_dev *d, u8 *id);

int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list);

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 59384677d025..16d1110c0692 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -1018,7 +1018,6 @@ static int __init i10nm_init(void)
int rc, i, off[3] = {0xd0, 0xc8, 0xcc};
u64 tolm, tohm;
int imc_num;
- int dup_src_ids = 0;

edac_dbg(2, "\n");

@@ -1066,15 +1065,8 @@ static int __init i10nm_init(void)

imc_num = res_cfg->ddr_imc_num + res_cfg->hbm_imc_num;

- rc = dup_src_ids = skx_check_dup_src_ids(0xf8);
- if (rc < 0)
- goto fail;
-
list_for_each_entry(d, i10nm_edac_list, list) {
- if (dup_src_ids)
- rc = skx_get_pkg_id(d, &src_id);
- else
- rc = skx_get_src_id(d, 0xf8, &src_id);
+ rc = skx_get_src_id(d, 0xf8, &src_id);
if (rc < 0)
goto fail;

diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index 189b8c5a1bda..93f7c05faccc 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -602,7 +602,6 @@ static int __init skx_init(void)
int rc = 0, i, off[3] = {0xd0, 0xd4, 0xd8};
u8 mc = 0, src_id;
struct skx_dev *d;
- int dup_src_ids = 0;

edac_dbg(2, "\n");

@@ -647,15 +646,8 @@ static int __init skx_init(void)
}
}

- rc = dup_src_ids = skx_check_dup_src_ids(0xf0);
- if (rc < 0)
- goto fail;
-
list_for_each_entry(d, skx_edac_list, list) {
- if (dup_src_ids)
- rc = skx_get_pkg_id(d, &src_id);
- else
- rc = skx_get_src_id(d, 0xf0, &src_id);
+ rc = skx_get_src_id(d, 0xf0, &src_id);
if (rc < 0)
goto fail;

diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 56fec7310f40..414a1e6cf0f5 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -221,49 +221,7 @@ void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log)
}
EXPORT_SYMBOL_GPL(skx_set_decode);

-int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
-{
- u32 reg;
-
- if (pci_read_config_dword(d->util_all, off, &reg)) {
- skx_printk(KERN_ERR, "Failed to read src id\n");
- return -ENODEV;
- }
-
- *id = GET_BITFIELD(reg, 12, 14);
- return 0;
-}
-EXPORT_SYMBOL_GPL(skx_get_src_id);
-
-int skx_check_dup_src_ids(int off)
-{
- u8 id;
- struct skx_dev *d;
- int rc;
- DECLARE_BITMAP(id_map, 8);
-
- bitmap_zero(id_map, 8);
-
- /*
- * The 3-bit source IDs in PCI configuration space registers are limited
- * to 8 unique IDs, and each ID is local to a clump (UPI/QPI domain).
- */
- list_for_each_entry(d, &dev_edac_list, list) {
- rc = skx_get_src_id(d, off, &id);
- if (rc < 0)
- return rc;
-
- if (test_bit(id, id_map))
- return 1;
-
- set_bit(id, id_map);
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(skx_check_dup_src_ids);
-
-int skx_get_pkg_id(struct skx_dev *d, u8 *id)
+static int skx_get_pkg_id(struct skx_dev *d, u8 *id)
{
int node;
int cpu;
@@ -283,7 +241,24 @@ int skx_get_pkg_id(struct skx_dev *d, u8 *id)
skx_printk(KERN_ERR, "Failed to get package ID from NUMA information\n");
return -ENODEV;
}
-EXPORT_SYMBOL_GPL(skx_get_pkg_id);
+
+int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
+{
+#ifdef CONFIG_NUMA
+ return skx_get_pkg_id(d, id);
+#else
+ u32 reg;
+
+ if (pci_read_config_dword(d->util_all, off, &reg)) {
+ skx_printk(KERN_ERR, "Failed to read src id\n");
+ return -ENODEV;
+ }
+
+ *id = GET_BITFIELD(reg, 12, 14);
+ return 0;
+#endif
+}
+EXPORT_SYMBOL_GPL(skx_get_src_id);

static int get_width(u32 mtr)
{