[RFC PATCH 2/2] ACPI: APEI: separate synchronous error handling into task work

From: Shuai Xue
Date: Tue Dec 06 2022 - 10:34:14 EST


On Arm64 platform, errors could be signaled by synchronous interrupt, e.g.
when an error is detected by a background scrubber, or signaled by
synchronous exception, e.g. when an uncorrected error is consumed. Both
synchronous and asynchronous error are queued and handled by a dedicated
kthread in workqueue.

commit 7f17b4a121d0 ("ACPI: APEI: Kick the memory_failure() queue for
synchronous errors") keep track of whether memory_failure() work was
queued, and make task_work pending to flush out the workqueue so that the
work for synchronous error is processed before returning to user-space.
The trick ensures that the corrupted page is unmapped and poisoned. And
after returning to user-space, the task starts at current instruction which
triggering a page fault and kernel will send sigbus due to
VM_FAULT_HWPOISON.

Although the task could be killed by page fault, the memory failure is
handled in a kthread context so that the hwpoison-aware mechanisms, e.g.
PF_MCE_EARLY, early kill, does not work as expected.

To this end, separate synchronous and asynchronous error handling into
different paths like X86 does:

- task work for synchronous error.
- and workqueue for asynchronous error.

Signed-off-by: Shuai Xue <xueshuai@xxxxxxxxxxxxxxxxx>
---
drivers/acpi/apei/ghes.c | 118 ++++++++++++++++++++++-----------------
1 file changed, 66 insertions(+), 52 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index a420759fce2d..f13c298f47e6 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -421,46 +421,80 @@ static void ghes_clear_estatus(struct ghes *ghes,
ghes_ack_error(ghes->generic_v2);
}

-/*
- * Called as task_work before returning to user-space.
- * Ensure any queued work has been done before we return to the context that
- * triggered the notification.
+/**
+ * struct mce_task_work - for synchronous RAS event
+ *
+ * @twork: callback_head for task work
+ * @pfn: page frame number of corrupted page
+ * @flags: fine tune action taken
+ *
+ * Structure to pass task work to be handled before
+ * returning to userspace via task_work_add().
*/
-static void ghes_kick_task_work(struct callback_head *head)
+struct mce_task_work {
+ struct callback_head twork;
+ u64 pfn;
+ int flags;
+};
+
+static void memory_failure_cb(struct callback_head *twork)
{
- struct acpi_hest_generic_status *estatus;
- struct ghes_estatus_node *estatus_node;
- u32 node_len;
+ int ret;
+ struct mce_task_work *twcb =
+ container_of(twork, struct mce_task_work, twork);
+ ret = memory_failure(twcb->pfn, twcb->flags);
+ kfree(twcb);

- estatus_node = container_of(head, struct ghes_estatus_node, task_work);
- if (IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
- memory_failure_queue_kick(estatus_node->task_work_cpu);
+ if (!ret)
+ return;
+ /*
+ * -EHWPOISON from memory_failure() means that it already sent SIGBUS
+ * to the current process with the proper error info,
+ * -EOPNOTSUPP means hwpoison_filter() filtered the error event,
+ *
+ * In both cases, no further processing is required.
+ */
+ if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
+ return;

- estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
- node_len = GHES_ESTATUS_NODE_LEN(cper_estatus_len(estatus));
- gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
+ pr_err("Memory error not recovered");
+ force_sig(SIGBUS);
}

-static bool ghes_do_memory_failure(u64 physical_addr, int flags)
+static void ghes_do_memory_failure(u64 physical_addr, int flags)
{
unsigned long pfn;
+ struct mce_task_work *twcb;

if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
- return false;
+ return;

pfn = PHYS_PFN(physical_addr);
if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) {
pr_warn_ratelimited(FW_WARN GHES_PFX
"Invalid address in generic error data: %#llx\n",
physical_addr);
- return false;
+ return;
}

- memory_failure_queue(pfn, flags);
- return true;
+ if (flags == MF_ACTION_REQUIRED && current->mm) {
+ twcb = kmalloc(sizeof(*twcb), GFP_ATOMIC);
+ if (!twcb)
+ return;
+
+ twcb->pfn = pfn;
+ twcb->flags = flags;
+ init_task_work(&twcb->twork, memory_failure_cb);
+ task_work_add(current, &twcb->twork, TWA_RESUME);
+ return;
+ } else {
+ memory_failure_queue(pfn, flags);
+ }
+
+ return;
}

-static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
+static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
int sev)
{
int flags = -1;
@@ -468,7 +502,7 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);

if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
- return false;
+ return;

/* iff following two events can be handled properly by now */
if (sec_sev == GHES_SEV_CORRECTED &&
@@ -478,15 +512,12 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
flags = (gdata->flags & CPER_SEC_SYNC) ? MF_ACTION_REQUIRED : 0;

if (flags != -1)
- return ghes_do_memory_failure(mem_err->physical_addr, flags);
-
- return false;
+ ghes_do_memory_failure(mem_err->physical_addr, flags);
}

-static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
+static void ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
{
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
- bool queued = false;
int sec_sev, i;
char *p;

@@ -494,7 +525,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int s

sec_sev = ghes_severity(gdata->error_severity);
if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
- return false;
+ return;

p = (char *)(err + 1);
for (i = 0; i < err->err_info_num; i++) {
@@ -510,7 +541,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int s
* and don't filter out 'corrected' error here.
*/
if (is_cache && has_pa) {
- queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0);
+ ghes_do_memory_failure(err_info->physical_fault_addr, 0);
p += err_info->length;
continue;
}
@@ -524,7 +555,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int s
p += err_info->length;
}

- return queued;
+ return;
}

/*
@@ -622,7 +653,7 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
schedule_work(&entry->work);
}

-static bool ghes_do_proc(struct ghes *ghes,
+static void ghes_do_proc(struct ghes *ghes,
const struct acpi_hest_generic_status *estatus)
{
int sev, sec_sev;
@@ -630,7 +661,6 @@ static bool ghes_do_proc(struct ghes *ghes,
guid_t *sec_type;
const guid_t *fru_id = &guid_null;
char *fru_text = "";
- bool queued = false;

sev = ghes_severity(estatus->error_severity);
apei_estatus_for_each_section(estatus, gdata) {
@@ -648,13 +678,13 @@ static bool ghes_do_proc(struct ghes *ghes,
ghes_edac_report_mem_error(sev, mem_err);

arch_apei_report_mem_error(sev, mem_err);
- queued = ghes_handle_memory_failure(gdata, sev);
+ ghes_handle_memory_failure(gdata, sev);
}
else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
ghes_handle_aer(gdata);
}
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
- queued = ghes_handle_arm_hw_error(gdata, sev);
+ ghes_handle_arm_hw_error(gdata, sev);
} else {
void *err = acpi_hest_get_payload(gdata);

@@ -664,8 +694,6 @@ static bool ghes_do_proc(struct ghes *ghes,
gdata->error_data_length);
}
}
-
- return queued;
}

static void __ghes_print_estatus(const char *pfx,
@@ -961,9 +989,7 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
struct ghes_estatus_node *estatus_node;
struct acpi_hest_generic *generic;
struct acpi_hest_generic_status *estatus;
- bool task_work_pending;
u32 len, node_len;
- int ret;

llnode = llist_del_all(&ghes_estatus_llist);
/*
@@ -978,26 +1004,15 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
len = cper_estatus_len(estatus);
node_len = GHES_ESTATUS_NODE_LEN(len);
- task_work_pending = ghes_do_proc(estatus_node->ghes, estatus);
+ ghes_do_proc(estatus_node->ghes, estatus);
if (!ghes_estatus_cached(estatus)) {
generic = estatus_node->generic;
if (ghes_print_estatus(NULL, generic, estatus))
ghes_estatus_cache_add(generic, estatus);
}

- if (task_work_pending && current->mm) {
- estatus_node->task_work.func = ghes_kick_task_work;
- estatus_node->task_work_cpu = smp_processor_id();
- ret = task_work_add(current, &estatus_node->task_work,
- TWA_RESUME);
- if (ret)
- estatus_node->task_work.func = NULL;
- }
-
- if (!estatus_node->task_work.func)
- gen_pool_free(ghes_estatus_pool,
- (unsigned long)estatus_node, node_len);
-
+ gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
+ node_len);
llnode = next;
}
}
@@ -1057,7 +1072,6 @@ static int ghes_in_nmi_queue_one_entry(struct ghes *ghes,

estatus_node->ghes = ghes;
estatus_node->generic = ghes->generic;
- estatus_node->task_work.func = NULL;
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);

if (__ghes_read_estatus(estatus, buf_paddr, fixmap_idx, len)) {
--
2.20.1.12.g72788fdb