Re: [PATCH v15 22/23] x86/mce: Improve error log of kernel space TDX #MC due to erratum

From: Huang, Kai
Date: Tue Dec 05 2023 - 14:42:49 EST


On Tue, 2023-12-05 at 15:25 +0100, Borislav Petkov wrote:
> On Fri, Nov 10, 2023 at 12:55:59AM +1300, Kai Huang wrote:
> > +static const char *mce_memory_info(struct mce *m)
> > +{
> > + if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
> > + return NULL;
> > +
> > + /*
> > + * Certain initial generations of TDX-capable CPUs have an
> > + * erratum. A kernel non-temporal partial write to TDX private
> > + * memory poisons that memory, and a subsequent read of that
> > + * memory triggers #MC.
> > + *
> > + * However such #MC caused by software cannot be distinguished
> > + * from the real hardware #MC. Just print additional message
> > + * to show such #MC may be result of the CPU erratum.
> > + */
> > + if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
> > + return NULL;
> > +
> > + return !tdx_is_private_mem(m->addr) ? NULL :
> > + "TDX private memory error. Possible kernel bug.";
> > +}
> > +
> > static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
> > {
> > struct llist_node *pending;
> > struct mce_evt_llist *l;
> > int apei_err = 0;
> > + const char *memmsg;
> >
> > /*
> > * Allow instrumentation around external facilities usage. Not that it
> > @@ -283,6 +307,15 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
> > }
> > if (exp)
> > pr_emerg(HW_ERR "Machine check: %s\n", exp);
> > + /*
> > + * Confidential computing platforms such as TDX platforms
> > + * may occur MCE due to incorrect access to confidential
> > + * memory. Print additional information for such error.
> > + */
> > + memmsg = mce_memory_info(final);
> > + if (memmsg)
> > + pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
> > +
>
> No, this is not how this is done. First of all, this function should be
> called something like
>
> mce_dump_aux_info()
>
> or so to state that it is dumping some auxiliary info.
>
> Then, it does:
>
> if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
> return tdx_get_mce_info();
>
> or so and you put that tdx_get_mce_info() function in TDX code and there
> you do all your picking apart of things, what needs to be dumped or what
> not, checking whether it is a memory error and so on.
>
> Thx.
>

Thanks Boris. Looks good to me, with one exception that it is actually TDX
host, but not TDX guest. So that the above X86_FEATURE_TDX_GUEST cpu feature
check needs to be replaced with the host one.

Full incremental diff below. Could you take a look whether this is what you
want?

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index a621721f63dd..0c02b66dcc41 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -115,13 +115,13 @@ bool platform_tdx_enabled(void);
int tdx_cpu_enable(void);
int tdx_enable(void);
void tdx_reset_memory(void);
-bool tdx_is_private_mem(unsigned long phys);
+const char *tdx_get_mce_info(unsigned long phys);
#else
static inline bool platform_tdx_enabled(void) { return false; }
static inline int tdx_cpu_enable(void) { return -ENODEV; }
static inline int tdx_enable(void) { return -ENODEV; }
static inline void tdx_reset_memory(void) { }
-static inline bool tdx_is_private_mem(unsigned long phys) { return false; }
+static inline const char *tdx_get_mce_info(unsigned long phys) { return NULL; }
#endif /* CONFIG_INTEL_TDX_HOST */

#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e33537cfc507..b7e650b5f7ef 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -229,26 +229,20 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}

-static const char *mce_memory_info(struct mce *m)
+static const char *mce_dump_aux_info(struct mce *m)
{
- if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
- return NULL;
-
/*
- * Certain initial generations of TDX-capable CPUs have an
- * erratum. A kernel non-temporal partial write to TDX private
- * memory poisons that memory, and a subsequent read of that
- * memory triggers #MC.
- *
- * However such #MC caused by software cannot be distinguished
- * from the real hardware #MC. Just print additional message
- * to show such #MC may be result of the CPU erratum.
+ * Confidential computing platforms such as TDX platforms
+ * may occur MCE due to incorrect access to confidential
+ * memory. Print additional information for such error.
*/
- if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
return NULL;

- return !tdx_is_private_mem(m->addr) ? NULL :
- "TDX private memory error. Possible kernel bug.";
+ if (platform_tdx_enabled())
+ return tdx_get_mce_info(m->addr);
+
+ return NULL;
}

static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
@@ -256,7 +250,7 @@ static noinstr void mce_panic(const char *msg, struct mce
*final, char *exp)
struct llist_node *pending;
struct mce_evt_llist *l;
int apei_err = 0;
- const char *memmsg;
+ const char *auxinfo;

/*
* Allow instrumentation around external facilities usage. Not that it
@@ -307,14 +301,10 @@ static noinstr void mce_panic(const char *msg, struct mce
*final, char *exp)
}
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
- /*
- * Confidential computing platforms such as TDX platforms
- * may occur MCE due to incorrect access to confidential
- * memory. Print additional information for such error.
- */
- memmsg = mce_memory_info(final);
- if (memmsg)
- pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
+
+ auxinfo = mce_dump_aux_info(final);
+ if (auxinfo)
+ pr_emerg(HW_ERR "Machine check: %s\n", auxinfo);

if (!fake_panic) {
if (panic_timeout == 0)
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 1b84dcdf63cb..cfbaec0f43b2 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1329,7 +1329,7 @@ static bool is_pamt_page(unsigned long phys)
* because it cannot distinguish #MC between software bug and real
* hardware error anyway.
*/
-bool tdx_is_private_mem(unsigned long phys)
+static bool tdx_is_private_mem(unsigned long phys)
{
struct tdx_module_args args = {
.rcx = phys & PAGE_MASK,
@@ -1391,6 +1391,25 @@ bool tdx_is_private_mem(unsigned long phys)
}
}

+const char *tdx_get_mce_info(unsigned long phys)
+{
+ /*
+ * Certain initial generations of TDX-capable CPUs have an
+ * erratum. A kernel non-temporal partial write to TDX private
+ * memory poisons that memory, and a subsequent read of that
+ * memory triggers #MC.
+ *
+ * However such #MC caused by software cannot be distinguished
+ * from the real hardware #MC. Just print additional message
+ * to show such #MC may be result of the CPU erratum.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ return NULL;
+
+ return !tdx_is_private_mem(phys) ? NULL :
+ "TDX private memory error. Possible kernel bug.";
+}
+
static int __init record_keyid_partitioning(u32 *tdx_keyid_start,
u32 *nr_tdx_keyids)
{