[PATCH 3/3] RAS/CEC: immediate soft-offline page when count_threshold == 1

From: WANG Chao
Date: Wed Apr 17 2019 - 23:50:51 EST


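count_threshold == 1 isn't working as expected. CEC only soft-offlines
a page the second time the same pfn is hit by a correctable error.

Soft-offline the page immediately on the first correctable error when
count_threshold == 1, instead of first inserting it into the array.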

Signed-off-by: WANG Chao <chao.wang@xxxxxxxxx>
---
drivers/ras/cec.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 702e4c02c713..ac879c45377c 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -272,7 +272,22 @@ static u64 __maybe_unused del_lru_elem(void)
return pfn;
}

+static void cec_valid_soft_offline(u64 pfn)
+{
+ if (!pfn_valid(pfn)) {
+ pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn);
+ } else {
+ /* We have reached max count for this page, soft-offline it. */
+ pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
+ memory_failure_queue(pfn, MF_SOFT_OFFLINE);
+ ce_arr.pfns_poisoned++;
+ }
+}

+/*
+ * Return a >0 value to denote that we've reached the offlining
+ * threshold.
+ */
int cec_add_elem(u64 pfn)
{
struct ce_array *ca = &ce_arr;
@@ -295,6 +310,11 @@ int cec_add_elem(u64 pfn)

ret = find_elem(ca, pfn, &to);
if (ret < 0) {
+ if (count_threshold == 1) {
+ cec_valid_soft_offline(pfn);
+ ret = 1;
+ goto unlock;
+ }
/*
* Shift range [to-end] to make room for one more element.
*/
@@ -320,23 +340,9 @@ int cec_add_elem(u64 pfn)

ret = 0;
} else {
- u64 pfn = ca->array[to] >> PAGE_SHIFT;
-
- if (!pfn_valid(pfn)) {
- pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn);
- } else {
- /* We have reached max count for this page, soft-offline it. */
- pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
- memory_failure_queue(pfn, MF_SOFT_OFFLINE);
- ca->pfns_poisoned++;
- }
-
+ cec_valid_soft_offline(pfn);
del_elem(ca, to);

- /*
- * Return a >0 value to denote that we've reached the offlining
- * threshold.
- */
ret = 1;

goto unlock;
--
2.21.0