Re: AMD zen microcode updates breaks boot

From: Borislav Petkov
Date: Thu Oct 10 2024 - 09:48:11 EST


On Wed, Oct 09, 2024 at 05:04:23AM -0600, Jens Axboe wrote:
> Yep, 0xaa00215

Found something: I'm not handling the stepping properly, below is a big diff
along with debug printks. Can you pls run it and send me dmesg. I'm assuming
the box will boot with it.

Thx.

---

diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index f63b051f25a0..a86fd2684913 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -158,6 +158,9 @@ static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
c.family = 0xf;
c.ext_fam = p.ext_fam;

+ pr_info("%s: val: 0x%x, p.stepping: 0x%x, c.stepping: 0x%x\n",
+ __func__, val, p.stepping, c.stepping);
+
return c;
}

@@ -613,16 +616,22 @@ static int __init save_microcode_in_initrd(void)
}
early_initcall(save_microcode_in_initrd);

-static inline bool patch_cpus_equivalent(struct ucode_patch *p, struct ucode_patch *n)
+static inline bool patch_cpus_equivalent(struct ucode_patch *p,
+ struct ucode_patch *n,
+ bool ignore_stepping)
{
/* Zen and newer hardcode the f/m/s in the patch ID */
if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id);
union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id);

- /* Zap stepping */
- p_cid.stepping = 0;
- n_cid.stepping = 0;
+ if (ignore_stepping) {
+ p_cid.stepping = 0;
+ n_cid.stepping = 0;
+ }
+
+ pr_info("%s: p_cid.full: 0x%x, n_cid.full: 0x%x\n",
+ __func__, p_cid.full, n_cid.full);

return p_cid.full == n_cid.full;
} else {
@@ -641,16 +650,22 @@ static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equi
n.equiv_cpu = equiv_cpu;
n.patch_id = uci->cpu_sig.rev;

+ pr_info("%s: equiv_cpu: 0x%x, patch_id: 0x%x\n",
+ __func__, equiv_cpu, uci->cpu_sig.rev);
+
WARN_ON_ONCE(!n.patch_id);

- list_for_each_entry(p, &microcode_cache, plist)
- if (patch_cpus_equivalent(p, &n))
+ list_for_each_entry(p, &microcode_cache, plist) {
+ if (patch_cpus_equivalent(p, &n, false)) {
+ pr_info("%s: using 0x%x\n", __func__, p->patch_id);
return p;
+ }
+ }

return NULL;
}

-static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n)
+static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n)
{
/* Zen and newer hardcode the f/m/s in the patch ID */
if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
@@ -659,6 +674,9 @@ static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n)
zp.ucode_rev = p->patch_id;
zn.ucode_rev = n->patch_id;

+ if (zn.stepping != zp.stepping)
+ return -1;
+
return zn.rev > zp.rev;
} else {
return n->patch_id > p->patch_id;
@@ -668,22 +686,32 @@ static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n)
static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
+ int ret;

list_for_each_entry(p, &microcode_cache, plist) {
- if (patch_cpus_equivalent(p, new_patch)) {
- if (!patch_newer(p, new_patch)) {
+ if (patch_cpus_equivalent(p, new_patch, true)) {
+ ret = patch_newer(p, new_patch);
+ if (ret < 0)
+ continue;
+ else if (!ret) {
/* we already have the latest patch */
kfree(new_patch->data);
kfree(new_patch);
return;
}

+ pr_info("%s: replace 0x%x with 0x%x\n",
+ __func__, p->patch_id, new_patch->patch_id);
+
list_replace(&p->plist, &new_patch->plist);
kfree(p->data);
kfree(p);
return;
}
}
+
+ pr_info("%s: add patch: 0x%x\n", __func__, new_patch->patch_id);
+
/* no patch found, add it */
list_add_tail(&new_patch->plist, &microcode_cache);
}

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette