[RFC PATCH 15/25] exchange pages: exchange anonymous page and file-backed page.

From: Zi Yan
Date: Wed Apr 03 2019 - 22:10:08 EST


From: Zi Yan <ziy@xxxxxxxxxx>

This is only done for the basic (non-concurrent) exchange-pages path,
because the concurrent path might need to lock multiple files at once,
which could easily cause deadlocks.

Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
---
mm/exchange.c | 284 ++++++++++++++++++++++++++++++++++++++++++++++------------
mm/internal.h | 9 ++
mm/migrate.c | 6 +-
3 files changed, 241 insertions(+), 58 deletions(-)

diff --git a/mm/exchange.c b/mm/exchange.c
index bbada58..555a72c 100644
--- a/mm/exchange.c
+++ b/mm/exchange.c
@@ -20,6 +20,8 @@
#include <linux/memcontrol.h>
#include <linux/balloon_compaction.h>
#include <linux/buffer_head.h>
+#include <linux/fs.h> /* buffer_migrate_page */
+#include <linux/backing-dev.h>


#include "internal.h"
@@ -147,8 +149,6 @@ static void exchange_page_flags(struct page *to_page, struct page *from_page)
from_page_flags.page_is_idle = page_is_idle(from_page);
clear_page_idle(from_page);
from_page_flags.page_swapcache = PageSwapCache(from_page);
- from_page_flags.page_private = PagePrivate(from_page);
- ClearPagePrivate(from_page);
from_page_flags.page_writeback = test_clear_page_writeback(from_page);


@@ -170,8 +170,6 @@ static void exchange_page_flags(struct page *to_page, struct page *from_page)
to_page_flags.page_is_idle = page_is_idle(to_page);
clear_page_idle(to_page);
to_page_flags.page_swapcache = PageSwapCache(to_page);
- to_page_flags.page_private = PagePrivate(to_page);
- ClearPagePrivate(to_page);
to_page_flags.page_writeback = test_clear_page_writeback(to_page);

/* set to_page */
@@ -268,18 +266,22 @@ static void exchange_page_flags(struct page *to_page, struct page *from_page)
static int exchange_page_move_mapping(struct address_space *to_mapping,
struct address_space *from_mapping,
struct page *to_page, struct page *from_page,
+ struct buffer_head *to_head, struct buffer_head *from_head,
enum migrate_mode mode,
int to_extra_count, int from_extra_count)
{
- int to_expected_count = 1 + to_extra_count,
- from_expected_count = 1 + from_extra_count;
- unsigned long from_page_index = page_index(from_page),
- to_page_index = page_index(to_page);
+ int to_expected_count = expected_page_refs(to_mapping, to_page) + to_extra_count,
+ from_expected_count = expected_page_refs(from_mapping, from_page) + from_extra_count;
+ unsigned long from_page_index = from_page->index;
+ unsigned long to_page_index = to_page->index;
int to_swapbacked = PageSwapBacked(to_page),
from_swapbacked = PageSwapBacked(from_page);
- struct address_space *to_mapping_value = to_page->mapping,
- *from_mapping_value = from_page->mapping;
+ struct address_space *to_mapping_value = to_page->mapping;
+ struct address_space *from_mapping_value = from_page->mapping;

+ VM_BUG_ON_PAGE(to_mapping != page_mapping(to_page), to_page);
+ VM_BUG_ON_PAGE(from_mapping != page_mapping(from_page), from_page);
+ VM_BUG_ON(PageCompound(from_page) != PageCompound(to_page));

if (!to_mapping) {
/* Anonymous page without mapping */
@@ -293,26 +295,125 @@ static int exchange_page_move_mapping(struct address_space *to_mapping,
return -EAGAIN;
}

- /*
- * Now we know that no one else is looking at the page:
- * no turning back from here.
- */
- /* from_page */
- from_page->index = to_page_index;
- from_page->mapping = to_mapping_value;
+ /* both are anonymous pages */
+ if (!from_mapping && !to_mapping) {
+ /* from_page */
+ from_page->index = to_page_index;
+ from_page->mapping = to_mapping_value;
+
+ ClearPageSwapBacked(from_page);
+ if (to_swapbacked)
+ SetPageSwapBacked(from_page);
+
+
+ /* to_page */
+ to_page->index = from_page_index;
+ to_page->mapping = from_mapping_value;
+
+ ClearPageSwapBacked(to_page);
+ if (from_swapbacked)
+ SetPageSwapBacked(to_page);
+ } else if (!from_mapping && to_mapping) {
+ /* from is anonymous, to is file-backed */
+ XA_STATE(to_xas, &to_mapping->i_pages, page_index(to_page));
+ struct zone *from_zone, *to_zone;
+ int dirty;
+
+ from_zone = page_zone(from_page);
+ to_zone = page_zone(to_page);
+
+ xas_lock_irq(&to_xas);
+
+ if (page_count(to_page) != to_expected_count ||
+ xas_load(&to_xas) != to_page) {
+ xas_unlock_irq(&to_xas);
+ return -EAGAIN;
+ }
+
+ if (!page_ref_freeze(to_page, to_expected_count)) {
+ xas_unlock_irq(&to_xas);
+ pr_debug("cannot freeze page count\n");
+ return -EAGAIN;
+ }
+
+ if (!page_ref_freeze(from_page, from_expected_count)) {
+ page_ref_unfreeze(to_page, to_expected_count);
+ xas_unlock_irq(&to_xas);
+
+ return -EAGAIN;
+ }
+ /*
+ * Now we know that no one else is looking at the page:
+ * no turning back from here.
+ */
+ ClearPageSwapBacked(from_page);
+ ClearPageSwapBacked(to_page);
+
+ /* from_page */
+ from_page->index = to_page_index;
+ from_page->mapping = to_mapping_value;
+ /* to_page */
+ to_page->index = from_page_index;
+ to_page->mapping = from_mapping_value;
+
+ if (to_swapbacked)
+ __SetPageSwapBacked(from_page);
+ else
+ VM_BUG_ON_PAGE(PageSwapCache(to_page), to_page);

- ClearPageSwapBacked(from_page);
- if (to_swapbacked)
- SetPageSwapBacked(from_page);
+ if (from_swapbacked)
+ __SetPageSwapBacked(to_page);
+ else
+ VM_BUG_ON_PAGE(PageSwapCache(from_page), from_page);

+ dirty = PageDirty(to_page);

- /* to_page */
- to_page->index = from_page_index;
- to_page->mapping = from_mapping_value;
+ xas_store(&to_xas, from_page);
+ if (PageTransHuge(to_page)) {
+ int i;
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ xas_next(&to_xas);
+ xas_store(&to_xas, from_page + i);
+ }
+ }
+
+ /* move cache reference */
+ page_ref_unfreeze(to_page, to_expected_count - hpage_nr_pages(to_page));
+ page_ref_unfreeze(from_page, from_expected_count + hpage_nr_pages(from_page));
+
+ xas_unlock(&to_xas);
+
+ /*
+ * If moved to a different zone then also account
+ * the page for that zone. Other VM counters will be
+ * taken care of when we establish references to the
+ * new page and drop references to the old page.
+ *
+ * Note that anonymous pages are accounted for
+ * via NR_FILE_PAGES and NR_ANON_MAPPED if they
+ * are mapped to swap space.
+ */
+ if (to_zone != from_zone) {
+ __dec_node_state(to_zone->zone_pgdat, NR_FILE_PAGES);
+ __inc_node_state(from_zone->zone_pgdat, NR_FILE_PAGES);
+ if (PageSwapBacked(to_page) && !PageSwapCache(to_page)) {
+ __dec_node_state(to_zone->zone_pgdat, NR_SHMEM);
+ __inc_node_state(from_zone->zone_pgdat, NR_SHMEM);
+ }
+ if (dirty && mapping_cap_account_dirty(to_mapping)) {
+ __dec_node_state(to_zone->zone_pgdat, NR_FILE_DIRTY);
+ __dec_zone_state(to_zone, NR_ZONE_WRITE_PENDING);
+ __inc_node_state(from_zone->zone_pgdat, NR_FILE_DIRTY);
+ __inc_zone_state(from_zone, NR_ZONE_WRITE_PENDING);
+ }
+ }
+ local_irq_enable();

- ClearPageSwapBacked(to_page);
- if (from_swapbacked)
- SetPageSwapBacked(to_page);
+ } else {
+ /* from is file-backed, to is anonymous: fold this into the case above */
+ /* both are file-backed: not handled here */
+ VM_BUG_ON(1);
+ }

return MIGRATEPAGE_SUCCESS;
}
@@ -322,6 +423,7 @@ static int exchange_from_to_pages(struct page *to_page, struct page *from_page,
{
int rc = -EBUSY;
struct address_space *to_page_mapping, *from_page_mapping;
+ struct buffer_head *to_head = NULL, *to_bh = NULL;

VM_BUG_ON_PAGE(!PageLocked(from_page), from_page);
VM_BUG_ON_PAGE(!PageLocked(to_page), to_page);
@@ -330,15 +432,71 @@ static int exchange_from_to_pages(struct page *to_page, struct page *from_page,
to_page_mapping = page_mapping(to_page);
from_page_mapping = page_mapping(from_page);

+ /* from_page has to be anonymous page */
BUG_ON(from_page_mapping);
- BUG_ON(to_page_mapping);
-
BUG_ON(PageWriteback(from_page));
+ /* writeback has to finish */
BUG_ON(PageWriteback(to_page));

- /* actual page mapping exchange */
- rc = exchange_page_move_mapping(to_page_mapping, from_page_mapping,
- to_page, from_page, mode, 0, 0);
+ /* to_page is anonymous */
+ if (!to_page_mapping) {
+exchange_mappings:
+ /* actual page mapping exchange */
+ rc = exchange_page_move_mapping(to_page_mapping, from_page_mapping,
+ to_page, from_page, NULL, NULL, mode, 0, 0);
+ } else {
+ if (to_page_mapping->a_ops->migratepage == buffer_migrate_page) {
+ if (!page_has_buffers(to_page))
+ goto exchange_mappings;
+
+ to_head = page_buffers(to_page);
+
+ rc = exchange_page_move_mapping(to_page_mapping,
+ from_page_mapping, to_page, from_page,
+ to_head, NULL, mode, 0, 0);
+
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ /*
+ * Async: buffers should already be locked (under an IRQ-safe spinlock);
+ * NOTE(review): confirm exchange_page_move_mapping, not migrate_page_move_mapping, does so.
+ * Sync: the buffers need to be locked now.
+ */
+ if ((mode & MIGRATE_MODE_MASK) != MIGRATE_ASYNC)
+ BUG_ON(!buffer_migrate_lock_buffers(to_head, mode));
+
+ ClearPagePrivate(to_page);
+ set_page_private(from_page, page_private(to_page));
+ set_page_private(to_page, 0);
+ /* transfer private page count */
+ put_page(to_page);
+ get_page(from_page);
+
+ to_bh = to_head;
+ do {
+ set_bh_page(to_bh, from_page, bh_offset(to_bh));
+ to_bh = to_bh->b_this_page;
+
+ } while (to_bh != to_head);
+
+ SetPagePrivate(from_page);
+
+ to_bh = to_head;
+ } else if (!to_page_mapping->a_ops->migratepage) {
+ /* fallback_migrate_page */
+ if (PageDirty(to_page)) {
+ if ((mode & MIGRATE_MODE_MASK) != MIGRATE_SYNC)
+ return -EBUSY;
+ return writeout(to_page_mapping, to_page);
+ }
+ if (page_has_private(to_page) &&
+ !try_to_release_page(to_page, GFP_KERNEL))
+ return -EAGAIN;
+
+ goto exchange_mappings;
+ }
+ }
/* actual page data exchange */
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -356,8 +514,28 @@ static int exchange_from_to_pages(struct page *to_page, struct page *from_page,
rc = 0;
}

+ /*
+ * 1. buffer_migrate_page:
+ * private flag should be transferred from to_page to from_page
+ *
+ * 2. anon<->anon, fallback_migrate_page:
+ * neither page has the private flag set, or to_page's has been cleared.
+ */
+ VM_BUG_ON(!((page_has_private(from_page) && !page_has_private(to_page)) ||
+ (!page_has_private(from_page) && !page_has_private(to_page))));
+
exchange_page_flags(to_page, from_page);

+ if (to_bh) {
+ VM_BUG_ON(to_bh != to_head);
+ do {
+ unlock_buffer(to_bh);
+ put_bh(to_bh);
+ to_bh = to_bh->b_this_page;
+
+ } while (to_bh != to_head);
+ }
+
return rc;
}

@@ -369,34 +547,12 @@ static int unmap_and_exchange(struct page *from_page, struct page *to_page,
pgoff_t from_index, to_index;
struct anon_vma *from_anon_vma = NULL, *to_anon_vma = NULL;

- /* from_page lock down */
if (!trylock_page(from_page)) {
if ((mode & MIGRATE_MODE_MASK) == MIGRATE_ASYNC)
goto out;
-
lock_page(from_page);
}

- BUG_ON(PageWriteback(from_page));
-
- /*
- * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
- * we cannot notice that anon_vma is freed while we migrates a page.
- * This get_anon_vma() delays freeing anon_vma pointer until the end
- * of migration. File cache pages are no problem because of page_lock()
- * File Caches may use write_page() or lock_page() in migration, then,
- * just care Anon page here.
- *
- * Only page_get_anon_vma() understands the subtleties of
- * getting a hold on an anon_vma from outside one of its mms.
- * But if we cannot get anon_vma, then we won't need it anyway,
- * because that implies that the anon page is no longer mapped
- * (and cannot be remapped so long as we hold the page lock).
- */
- if (PageAnon(from_page) && !PageKsm(from_page))
- from_anon_vma = page_get_anon_vma(from_page);
-
- /* to_page lock down */
if (!trylock_page(to_page)) {
if ((mode & MIGRATE_MODE_MASK) == MIGRATE_ASYNC)
goto out_unlock;
@@ -404,7 +560,22 @@ static int unmap_and_exchange(struct page *from_page, struct page *to_page,
lock_page(to_page);
}

- BUG_ON(PageWriteback(to_page));
+ /* from_page is supposed to be an anonymous page */
+ VM_BUG_ON_PAGE(PageWriteback(from_page), from_page);
+
+ if (PageWriteback(to_page)) {
+ /*
+ * Only in the case of a full synchronous migration is it
+ * necessary to wait for PageWriteback. In the async case,
+ * the retry loop is too short and in the sync-light case,
+ * the overhead of stalling is too much
+ */
+ if ((mode & MIGRATE_MODE_MASK) != MIGRATE_SYNC) {
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+ wait_on_page_writeback(to_page);
+ }

/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
@@ -420,6 +591,9 @@ static int unmap_and_exchange(struct page *from_page, struct page *to_page,
* because that implies that the anon page is no longer mapped
* (and cannot be remapped so long as we hold the page lock).
*/
+ if (PageAnon(from_page) && !PageKsm(from_page))
+ from_anon_vma = page_get_anon_vma(from_page);
+
if (PageAnon(to_page) && !PageKsm(to_page))
to_anon_vma = page_get_anon_vma(to_page);

@@ -753,7 +927,7 @@ static int exchange_page_mapping_concur(struct list_head *unmapped_list_ptr,

/* actual page mapping exchange */
rc = exchange_page_move_mapping(to_page_mapping, from_page_mapping,
- to_page, from_page, mode, 0, 0);
+ to_page, from_page, NULL, NULL, mode, 0, 0);

if (rc) {
if (one_pair->from_page_was_mapped)
diff --git a/mm/internal.h b/mm/internal.h
index a039459..cf63bf6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -566,4 +566,13 @@ extern int exchange_page_mthread(struct page *to, struct page *from,
extern int exchange_page_lists_mthread(struct page **to,
struct page **from,
int nr_pages);
+
+extern int exchange_two_pages(struct page *page1, struct page *page2);
+
+bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode);
+int writeout(struct address_space *mapping, struct page *page);
+int expected_page_refs(struct address_space *mapping, struct page *page);
+
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/migrate.c b/mm/migrate.c
index ad02797..a0ca817 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -385,7 +385,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
}
#endif

-static int expected_page_refs(struct address_space *mapping, struct page *page)
+int expected_page_refs(struct address_space *mapping, struct page *page)
{
int expected_count = 1;

@@ -732,7 +732,7 @@ EXPORT_SYMBOL(migrate_page);

#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
-static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+bool buffer_migrate_lock_buffers(struct buffer_head *head,
enum migrate_mode mode)
{
struct buffer_head *bh = head;
@@ -880,7 +880,7 @@ int buffer_migrate_page_norefs(struct address_space *mapping,
/*
* Writeback a page to clean the dirty state
*/
-static int writeout(struct address_space *mapping, struct page *page)
+int writeout(struct address_space *mapping, struct page *page)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
--
2.7.4