[PATCH 1/1] mce-inject: Add an MCE injection case with the yellow status set

From: Qiuxu Zhuo
Date: Tue Nov 12 2024 - 03:06:23 EST


Intel processors with the capability of 'threshold-based error status'
use tracking hardware to monitor corrected errors of certain hardware
components (e.g., CPU caches). If the tracking hardware overflows, a
'yellow' flag will be set in MCI_STATUS[54:53] to indicate that
the corrected errors of the associated hardware component exceed the
predefined threshold. If so, then the system may need to be scheduled
for servicing within a few weeks.

Add an MCE injection case with the yellow status set to test whether
threshold-based corrected error events are handled by the Linux kernel
and/or by user-space tools such as mcelog or rasdaemon.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@xxxxxxxxx>
---
mce.h | 2 ++
mce.lex | 2 ++
mce.y | 8 ++++----
test/corrected-yellow | 4 ++++
4 files changed, 12 insertions(+), 4 deletions(-)
create mode 100644 test/corrected-yellow

diff --git a/mce.h b/mce.h
index d941dce85326..b23232ec6273 100644
--- a/mce.h
+++ b/mce.h
@@ -15,6 +15,7 @@
#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
#define MCG_STATUS_LMCES (1ULL<<3) /* local machine check exception signaled */
+#define MCG_STATUS_TES_P (1ULL<<11) /* Threshold-based error status supported */
#define MCG_STATUS_SEAM_NR (1ULL<<12) /* SEAM NON-ROOT */

#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
@@ -26,6 +27,7 @@
#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
+#define MCI_STATUS_TES_YELLOW (1ULL<<54) /* Threshold-based errors above threshold */

/* MISC register defines */
#define MCM_ADDR_SEGOFF 0 /* segment offset */
diff --git a/mce.lex b/mce.lex
index 81e950c16cf6..5c8cd51d7109 100644
--- a/mce.lex
+++ b/mce.lex
@@ -96,6 +96,7 @@ static struct key {
KEYVAL(MCIP, MCG_STATUS_MCIP),
KEYVAL(LMCES, MCG_STATUS_LMCES),
KEYVAL(SEAM_NR, MCG_STATUS_SEAM_NR),
+ KEYVAL(TES_P, MCG_STATUS_TES_P),
KEYVAL(VAL, MCI_STATUS_VAL),
KEYVAL(OVER, MCI_STATUS_OVER),
KEYVAL(UC, MCI_STATUS_UC),
@@ -103,6 +104,7 @@ static struct key {
KEYVAL(PCC, MCI_STATUS_PCC),
KEYVAL(S, MCI_STATUS_S),
KEYVAL(AR, MCI_STATUS_AR),
+ KEYVAL(TES_YELLOW, MCI_STATUS_TES_YELLOW),
KEYVAL(UCNA, 0),
KEYVAL(SRAO, MCI_STATUS_S),
KEYVAL(SRAR, MCI_STATUS_S|MCI_STATUS_AR),
diff --git a/mce.y b/mce.y
index 90149e046589..9a28fa77a41b 100644
--- a/mce.y
+++ b/mce.y
@@ -49,8 +49,8 @@ static void init(void);
%token SYMBOL
%token MACHINE CHECK EXCEPTION

-%token RIPV EIPV MCIP LMCES SEAM_NR
-%token VAL OVER UC EN PCC S AR UCNA SRAO SRAR
+%token RIPV EIPV MCIP LMCES TES_P SEAM_NR
+%token VAL OVER UC EN PCC S AR TES_YELLOW UCNA SRAO SRAR

%%

@@ -103,13 +103,13 @@ mcgstatus_list: /* empty */ { $$ = 0; }
| mcgstatus_list mcgstatus { $$ = $1 | $2; }
;

-mcgstatus : RIPV | EIPV | MCIP | LMCES | SEAM_NR | NUMBER ;
+mcgstatus : RIPV | EIPV | MCIP | LMCES | TES_P | SEAM_NR | NUMBER ;

status_list: /* empty */ { $$ = 0; }
| status_list status { $$ = $1 | $2; }

status: UC | EN | VAL | OVER | PCC | NUMBER | CORRECTED | UNCORRECTED |
- FATAL | S | AR | UCNA | SRAO | SRAR
+ FATAL | S | AR | TES_YELLOW | UCNA | SRAO | SRAR
;

%%
diff --git a/test/corrected-yellow b/test/corrected-yellow
new file mode 100644
index 000000000000..15c8374cc41b
--- /dev/null
+++ b/test/corrected-yellow
@@ -0,0 +1,4 @@
+# Log corrected errors with yellow status (corrected errors above the predefined threshold).
+CPU 0 BANK 3
+MCGSTATUS TES_P
+STATUS CORRECTED TES_YELLOW
--
2.17.1