Re: [PATCH v3 0/5] HWpoison: further fixes and cleanups

From: Oscar Salvador
Date: Wed Sep 16 2020 - 03:27:15 EST


On Tue, Sep 15, 2020 at 05:22:22PM -0400, Aristeu Rozanski wrote:
> Hi Oscar, Naoya,

Hi Aristeu,

thanks for reporting this.

> I've run these tests using mmotm and mmotm with this patchset on top.

Could you please re-run the tests with the below patch applied, and
then attach the logs here?

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 84a7f228af36..d7b6e7724e47 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -67,6 +67,7 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
{
+ dump_page(page, "page_handle_poison");
if (release) {
put_page(page);
drain_all_pages(page_zone(page));
@@ -77,7 +78,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
* Doing this check for free pages is also fine since dissolve_free_huge_page
* returns 0 for non-hugetlb pages as well.
*/
- if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
+ if (dissolve_free_huge_page(page) || !take_page_off_buddy(page)) {
/*
* We could fail to take off the target page from buddy
* for example due to racy page allocaiton, but that's
@@ -85,7 +86,9 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
* and if someone really want to use it, they should
* take it.
*/
+ pr_info("%s: hugepage_or_freepage failed\n", __func__);
return false;
+ }
}

SetPageHWPoison(page);
@@ -1858,8 +1861,11 @@ static int __soft_offline_page(struct page *page)
if (!ret) {
bool release = !huge;

- if (!page_handle_poison(page, true, release))
+ if (!page_handle_poison(page, true, release)) {
+ pr_info("%s: page_handle_poison -EBUSY\n", __func__);
+ dump_page(page, "__soft_offline_page after migrate");
ret = -EBUSY;
+ }
} else {
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
@@ -1872,6 +1878,7 @@ static int __soft_offline_page(struct page *page)
} else {
pr_info("soft offline: %#lx: %s isolation failed: %d, page count %d, type %lx (%pGp)\n",
pfn, msg_page[huge], ret, page_count(page), page->flags, &page->flags);
+ dump_page(page, "__soft_offline_page isolation failed");
ret = -EBUSY;
}
return ret;
@@ -1882,8 +1889,11 @@ static int soft_offline_in_use_page(struct page *page)
struct page *hpage = compound_head(page);

if (!PageHuge(page) && PageTransHuge(hpage))
- if (try_to_split_thp_page(page, "soft offline") < 0)
+ if (try_to_split_thp_page(page, "soft offline") < 0) {
+ pr_info("%s: try_to_split_thp_page -EBUSY\n", __func__);
+ dump_page(page, "try_to_split_thp_page");
return -EBUSY;
+ }
return __soft_offline_page(page);
}

@@ -1891,8 +1901,11 @@ static int soft_offline_free_page(struct page *page)
{
int rc = 0;

- if (!page_handle_poison(page, true, false))
+ if (!page_handle_poison(page, true, false)) {
+ pr_info("%s: page_handle_poison -EBUSY\n", __func__);
+ dump_page(page, "soft_offline_free_page");
rc = -EBUSY;
+ }

return rc;
}

Thanks

--
Oscar Salvador
SUSE L3