[PATCH 2/5] x86/MCE: Handle MCA controls in a per_cpu way

From: Ghannam, Yazen
Date: Sun Apr 07 2019 - 19:14:08 EST


From: Yazen Ghannam <yazen.ghannam@xxxxxxx>

Current AMD systems have unique MCA banks per logical CPU, even though
banks of the same type may share the same bank number on every CPU. Each
CPU controls its own set of MCA banks in the hardware, and these are not
shared with other CPUs.

For example, bank 0 may be the Load-Store Unit on every logical CPU, but
each bank 0 is a unique structure in the hardware. In other words, there
isn't a *single* Load-Store Unit at MCA bank 0 that all logical CPUs
share.

This idea extends even to non-core MCA banks. For example, CPU0 and CPU4
may see a Unified Memory Controller at bank 15, but each CPU is actually
seeing a unique hardware structure that is not shared with other CPUs.

Because the MCA banks are all unique hardware structures, it would be
good to control them in a more granular way. For example, if there is a
known issue with the Floating Point Unit on CPU5 and a user wishes to
disable an error type on the Floating Point Unit, then it would be good
to do this only for CPU5 rather than all CPUs.

Also, future AMD systems may have heterogeneous MCA banks, meaning that
a bank number may not necessarily represent the same type on every CPU.
For example, bank 20 visible to CPU0 may be a Unified Memory Controller
and bank 20 visible to CPU4 may be a Coherent Slave. So granular control
will be even more necessary should the user wish to control specific MCA
banks.

Split the device attributes from struct mce_bank leaving only the MCA
bank control fields.

Make the mce_banks[] array per_cpu in order to have more granular
control over individual MCA banks in the hardware.

Allocate the device attributes statically based on the maximum number of
MCA banks supported (MAX_NR_BANKS). The sysfs interface will use as many
as needed per CPU. Currently, the number used is mca_cfg.banks, but this
will be changed to a per_cpu bank count in a future patch.

Allocate the MCA control bits dynamically. Use the maximum number of MCA
banks supported for now. This will be changed to a per_cpu bank count in
a future patch.

Redo the sysfs store/show functions to handle the per_cpu mce_banks[].

Signed-off-by: Yazen Ghannam <yazen.ghannam@xxxxxxx>
---
arch/x86/kernel/cpu/mce/core.c | 77 ++++++++++++++++++++++------------
1 file changed, 51 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 8d0d1e8425db..14583c5c6e12 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -64,16 +64,21 @@ static DEFINE_MUTEX(mce_sysfs_mutex);

DEFINE_PER_CPU(unsigned, mce_exception_count);

+struct mce_bank {
+ u64 ctl; /* subevents to enable */
+ bool init; /* initialise bank? */
+};
+static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank*, mce_banks);
+
#define ATTR_LEN 16
/* One object for each MCE bank, shared by all CPUs */
-struct mce_bank {
- u64 ctl; /* subevents to enable */
- bool init; /* initialise bank? */
+struct mce_bank_dev {
struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
+ u8 bank; /* bank number */
};
+static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];

-static struct mce_bank *mce_banks __read_mostly;
struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = {
@@ -695,7 +700,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m.tsc = rdtsc();

for (i = 0; i < mca_cfg.banks; i++) {
- if (!mce_banks[i].ctl || !test_bit(i, *b))
+ if (!this_cpu_read(mce_banks)[i].ctl || !test_bit(i, *b))
continue;

m.misc = 0;
@@ -1138,7 +1143,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final,
if (!test_bit(i, valid_banks))
continue;

- if (!mce_banks[i].ctl)
+ if (!this_cpu_read(mce_banks)[i].ctl)
continue;

m->misc = 0;
@@ -1475,16 +1480,19 @@ static int __mcheck_cpu_mce_banks_init(void)
{
int i;

- mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL);
- if (!mce_banks)
+ per_cpu(mce_banks, smp_processor_id()) =
+ kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL);
+
+ if (!this_cpu_read(mce_banks))
return -ENOMEM;

for (i = 0; i < MAX_NR_BANKS; i++) {
- struct mce_bank *b = &mce_banks[i];
+ struct mce_bank *b = &this_cpu_read(mce_banks)[i];

b->ctl = -1ULL;
b->init = 1;
}
+
return 0;
}

@@ -1504,7 +1512,7 @@ static int __mcheck_cpu_cap_init(void)

mca_cfg.banks = max(mca_cfg.banks, b);

- if (!mce_banks) {
+ if (!this_cpu_read(mce_banks)) {
int err = __mcheck_cpu_mce_banks_init();
if (err)
return err;
@@ -1547,7 +1555,7 @@ static void __mcheck_cpu_init_clear_banks(void)
int i;

for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
+ struct mce_bank *b = &this_cpu_read(mce_banks)[i];

if (!b->init)
continue;
@@ -1602,7 +1610,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* trips off incorrectly with the IOMMU & 3ware
* & Cerberus:
*/
- clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ clear_bit(10, (unsigned long *)&this_cpu_read(mce_banks)[4].ctl);
}
if (c->x86 < 0x11 && cfg->bootlog < 0) {
/*
@@ -1616,7 +1624,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* by default.
*/
if (c->x86 == 6 && cfg->banks > 0)
- mce_banks[0].ctl = 0;
+ this_cpu_read(mce_banks)[0].ctl = 0;

/*
* overflow_recov is supported for F15h Models 00h-0fh
@@ -1638,7 +1646,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/

if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
- mce_banks[0].init = 0;
+ this_cpu_read(mce_banks)[0].init = 0;

/*
* All newer Intel systems support MCE broadcasting. Enable
@@ -1952,7 +1960,7 @@ static void mce_disable_error_reporting(void)
int i;

for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
+ struct mce_bank *b = &this_cpu_read(mce_banks)[i];

if (b->init)
wrmsrl(msr_ops.ctl(i), 0);
@@ -2051,26 +2059,41 @@ static struct bus_type mce_subsys = {

DEFINE_PER_CPU(struct device *, mce_device);

-static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
+static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
{
- return container_of(attr, struct mce_bank, attr);
+ return container_of(attr, struct mce_bank_dev, attr);
}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
char *buf)
{
- return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
+ struct mce_bank *b;
+ u8 bank = attr_to_bank(attr)->bank;
+
+ if (bank >= mca_cfg.banks)
+ return -EINVAL;
+
+ b = &per_cpu(mce_banks, s->id)[bank];
+
+ return sprintf(buf, "%llx\n", b->ctl);
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
+ struct mce_bank *b;
+ u8 bank = attr_to_bank(attr)->bank;

if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;

- attr_to_bank(attr)->ctl = new;
+ if (bank >= mca_cfg.banks)
+ return -EINVAL;
+
+ b = &per_cpu(mce_banks, s->id)[bank];
+
+ b->ctl = new;
mce_restart();

return size;
@@ -2185,7 +2208,7 @@ static void mce_device_release(struct device *dev)
kfree(dev);
}

-/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+/* Per cpu device init. All of the cpus still share the same bank device: */
static int mce_device_create(unsigned int cpu)
{
struct device *dev;
@@ -2218,7 +2241,7 @@ static int mce_device_create(unsigned int cpu)
goto error;
}
for (j = 0; j < mca_cfg.banks; j++) {
- err = device_create_file(dev, &mce_banks[j].attr);
+ err = device_create_file(dev, &mce_bank_devs[j].attr);
if (err)
goto error2;
}
@@ -2228,7 +2251,7 @@ static int mce_device_create(unsigned int cpu)
return 0;
error2:
while (--j >= 0)
- device_remove_file(dev, &mce_banks[j].attr);
+ device_remove_file(dev, &mce_bank_devs[j].attr);
error:
while (--i >= 0)
device_remove_file(dev, mce_device_attrs[i]);
@@ -2250,7 +2273,7 @@ static void mce_device_remove(unsigned int cpu)
device_remove_file(dev, mce_device_attrs[i]);

for (i = 0; i < mca_cfg.banks; i++)
- device_remove_file(dev, &mce_banks[i].attr);
+ device_remove_file(dev, &mce_bank_devs[i].attr);

device_unregister(dev);
cpumask_clear_cpu(cpu, mce_device_initialized);
@@ -2279,7 +2302,7 @@ static void mce_reenable_cpu(void)
if (!cpuhp_tasks_frozen)
cmci_reenable();
for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
+ struct mce_bank *b = &this_cpu_read(mce_banks)[i];

if (b->init)
wrmsrl(msr_ops.ctl(i), b->ctl);
@@ -2328,10 +2351,12 @@ static __init void mce_init_banks(void)
{
int i;

- for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
+ for (i = 0; i < MAX_NR_BANKS; i++) {
+ struct mce_bank_dev *b = &mce_bank_devs[i];
struct device_attribute *a = &b->attr;

+ b->bank = i;
+
sysfs_attr_init(&a->attr);
a->attr.name = b->attrname;
snprintf(b->attrname, ATTR_LEN, "bank%d", i);
--
2.17.1