[PATCH 06/20] x86/mce/amd: Use helper for GPU UMC bank type checks

From: Yazen Ghannam
Date: Sat Nov 18 2023 - 14:33:28 EST


The type of an Scalable MCA bank should be determined solely using the
values in its MCA_IPID register.

Define and use a helper function to determine if a bank represents a GPU
Unified Memory Controller (UMC), and where the exact bank type is not
needed.

Use bitops and rename old mask until removed.

Signed-off-by: Yazen Ghannam <yazen.ghannam@xxxxxxx>
---
arch/x86/include/asm/mce.h | 4 +++-
arch/x86/kernel/cpu/mce/amd.c | 12 +++++++++++-
drivers/edac/amd64_edac.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 ++++-----
4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c43b41677a3e..012caf68dcbb 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -59,8 +59,9 @@
* - TCC bit is present in MCx_STATUS.
*/
#define MCI_CONFIG_MCAX 0x1
-#define MCI_IPID_MCATYPE 0xFFFF0000
+#define MCI_IPID_MCATYPE_OLD 0xFFFF0000
#define MCI_IPID_HWID_OLD 0xFFF
+#define MCI_IPID_MCATYPE GENMASK_ULL(63, 48)
#define MCI_IPID_HWID GENMASK_ULL(43, 32)

/*
@@ -341,6 +342,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);

void mce_amd_feature_init(struct cpuinfo_x86 *c);
enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
+bool smca_gpu_umc_bank_type(u64 ipid);
#else

static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index c8fb6c24170f..6fc35967b11b 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -150,6 +150,16 @@ static bool smca_umc_bank_type(u64 ipid)
return FIELD_GET(MCI_IPID_HWID, ipid) == 0x96;
}

+/* GPU UMCs have MCATYPE=0x1.*/
+bool smca_gpu_umc_bank_type(u64 ipid)
+{
+ if (!smca_umc_bank_type(ipid))
+ return false;
+
+ return FIELD_GET(MCI_IPID_MCATYPE, ipid) == 0x1;
+}
+EXPORT_SYMBOL_GPL(smca_gpu_umc_bank_type);
+
static const struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype } */

@@ -312,7 +322,7 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
}

hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID_OLD,
- (high & MCI_IPID_MCATYPE) >> 16);
+ (high & MCI_IPID_MCATYPE_OLD) >> 16);

for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 9b6642d00871..b593795e1e6b 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1032,7 +1032,7 @@ static int fixup_node_id(int node_id, struct mce *m)
/* MCA_IPID[InstanceIdHi] give the AMD Node ID for the bank. */
u8 nid = (m->ipid >> 44) & 0xF;

- if (smca_get_bank_type(m->extcpu, m->bank) != SMCA_UMC_V2)
+ if (!smca_gpu_umc_bank_type(m->ipid))
return node_id;

/* Nodes below the GPU base node are CPU nodes and don't need a fixup. */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 84e5987b14e0..7235668b3cc2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3279,12 +3279,11 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
uint32_t umc_inst = 0, ch_inst = 0;

/*
- * If the error was generated in UMC_V2, which belongs to GPU UMCs,
- * and error occurred in DramECC (Extended error code = 0) then only
- * process the error, else bail out.
+ * If the error was generated in a GPU UMC and error occurred in
+ * DramECC (Extended error code = 0) then only process the error,
+ * else bail out.
*/
- if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
- (XEC(m->status, 0x3f) == 0x0)))
+ if (!m || !(smca_gpu_umc_bank_type(m->ipid) && (XEC(m->status, 0x3f) == 0x0)))
return NOTIFY_DONE;

/*
--
2.34.1