[PATCH v18 20/32] mm/lru: replace pgdat lru_lock with lruvec lock

From: Alex Shi
Date: Mon Aug 24 2020 - 08:56:42 EST


This patch moves the per-node lru_lock into the lruvec, so each memcg
gets its own lru_lock on each node. On a large machine, memcgs no
longer have to contend on the single per-node pgdat->lru_lock; each
can proceed under its own lock.
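
The structural change itself is small; condensed from the
include/linux/mmzone.h and mm/mmzone.c hunks below:

    struct lruvec {
        /* per lruvec lru_lock for memcg */
        spinlock_t lru_lock;
        struct list_head lists[NR_LRU_LISTS];
        /* ... */
    };

    void lruvec_init(struct lruvec *lruvec)
    {
        enum lru_list lru;

        memset(lruvec, 0, sizeof(struct lruvec));
        spin_lock_init(&lruvec->lru_lock);

        for_each_lru(lru)
            INIT_LIST_HEAD(&lruvec->lists[lru]);
    }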

Now that memcg charging is done before a page is added to the LRU,
page isolation can serialize the page's memcg, so the per-memcg lruvec
lock is stable and can replace the per-node lru lock.
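
Callers look up and lock the page's own lruvec instead of taking
pgdat->lru_lock directly. For example, condensed from the
__page_cache_release() conversion in the mm/swap.c hunk below:

    static void __page_cache_release(struct page *page)
    {
        if (PageLRU(page)) {
            struct lruvec *lruvec;
            unsigned long flags;

            __ClearPageLRU(page);
            /* was: spin_lock_irqsave(&pgdat->lru_lock, flags) */
            lruvec = lock_page_lruvec_irqsave(page, &flags);
            del_page_from_lru_list(page, lruvec, page_off_lru(page));
            unlock_page_lruvec_irqrestore(lruvec, flags);
        }
        __ClearPageWaiters(page);
    }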

In isolate_migratepages_block(), compact_unlock_should_abort() is
open-coded and the lock_page_lruvec logic is embedded to keep that
path tight. A debug function is also added to the locking helpers,
which may give some clues if anything gets out of hand.
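
Condensed from the mm/compaction.c hunks below, the two fragments in
question look roughly like this (surrounding isolation logic omitted):

    /*
     * Open-coded compact_unlock_should_abort(): periodically drop the
     * lock and abort completely on a fatal signal.
     */
    if (!(low_pfn % SWAP_CLUSTER_MAX)) {
        if (locked) {
            unlock_page_lruvec_irqrestore(locked, flags);
            locked = NULL;
        }

        if (fatal_signal_pending(current)) {
            cc->contended = true;
            low_pfn = 0;
            goto fatal_pending;
        }

        cond_resched();
    }

    ...

    /*
     * Relock only when the page's lruvec differs from the one already
     * held; RCU protects the memcg/lruvec lookup.
     */
    rcu_read_lock();
    lruvec = mem_cgroup_page_lruvec(page, pgdat);
    if (lruvec != locked) {
        if (locked)
            unlock_page_lruvec_irqrestore(locked, flags);

        compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
        locked = lruvec;
        rcu_read_unlock();

        lruvec_memcg_debug(lruvec, page);  /* new CONFIG_DEBUG_VM check */
        ...
    } else
        rcu_read_unlock();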

Following Daniel Jordan's suggestion, I ran 208 'dd' tasks in 104
containers on a 2-socket * 26-core * HT box with a modified case:
https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git/tree/case-lru-file-readtwice

With this and the later patches, readtwice performance increases by
about 80% with concurrent containers.

On a large machine with memcg enabled but not used, looking up a
page's lruvec goes through a few extra pointers, which may increase
lru_lock hold time and cause a slight regression.

Hugh Dickins helped polish this patch, thanks!

Reported-by: kernel test robot <lkp@xxxxxxxxx>
Signed-off-by: Alex Shi <alex.shi@xxxxxxxxxxxxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Yang Shi <yang.shi@xxxxxxxxxxxxxxxxx>
Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
Cc: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
Cc: linux-mm@xxxxxxxxx
Cc: cgroups@xxxxxxxxxxxxxxx
---
include/linux/memcontrol.h | 58 +++++++++++++++++++++++++
include/linux/mmzone.h | 2 +
mm/compaction.c | 56 +++++++++++++++---------
mm/huge_memory.c | 11 ++---
mm/memcontrol.c | 60 +++++++++++++++++++++++++-
mm/mlock.c | 47 +++++++++++++-------
mm/mmzone.c | 1 +
mm/swap.c | 105 +++++++++++++++++++++------------------------
mm/vmscan.c | 70 +++++++++++++++++-------------
9 files changed, 279 insertions(+), 131 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d0b036123c6a..7b170e9028b5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -494,6 +494,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,

struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);

+struct lruvec *lock_page_lruvec(struct page *page);
+struct lruvec *lock_page_lruvec_irq(struct page *page);
+struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+ unsigned long *flags);
+
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+#else
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+#endif
+
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -1035,6 +1048,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}

+static inline struct lruvec *lock_page_lruvec(struct page *page)
+{
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock(&pgdat->__lruvec.lru_lock);
+ return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock_irq(&pgdat->__lruvec.lru_lock);
+ return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+ unsigned long *flagsp)
+{
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
+ return &pgdat->__lruvec;
+}
+
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -1282,6 +1320,10 @@ static inline void count_memcg_page_event(struct page *page,
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
+
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
#endif /* CONFIG_MEMCG */

/* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -1411,6 +1453,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
}

+static inline void unlock_page_lruvec(struct lruvec *lruvec)
+{
+ spin_unlock(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+{
+ spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
+ unsigned long flags)
+{
+ spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+}
+
#ifdef CONFIG_CGROUP_WRITEBACK

struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8379432f4f2f..27a1513a43fc 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -273,6 +273,8 @@ enum lruvec_flags {
};

struct lruvec {
+ /* per lruvec lru_lock for memcg */
+ spinlock_t lru_lock;
struct list_head lists[NR_LRU_LISTS];
/*
* These track the cost of reclaiming one LRU - file or anon -
diff --git a/mm/compaction.c b/mm/compaction.c
index 253382d99969..b724eacf6421 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -805,7 +805,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
- bool locked = false;
+ struct lruvec *locked = NULL;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
@@ -865,11 +865,20 @@ static bool too_many_isolated(pg_data_t *pgdat)
* contention, to give chance to IRQs. Abort completely if
* a fatal signal is pending.
*/
- if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(&pgdat->lru_lock,
- flags, &locked, cc)) {
- low_pfn = 0;
- goto fatal_pending;
+ if (!(low_pfn % SWAP_CLUSTER_MAX)) {
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+
+ if (fatal_signal_pending(current)) {
+ cc->contended = true;
+
+ low_pfn = 0;
+ goto fatal_pending;
+ }
+
+ cond_resched();
}

if (!pfn_valid_within(low_pfn))
@@ -941,9 +950,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock,
- flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}

if (!isolate_movable_page(page, isolate_mode))
@@ -984,10 +992,19 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (!TestClearPageLRU(page))
goto isolate_fail_put;

+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
/* If we already hold the lock, we can skip some rechecking */
- if (!locked) {
- locked = compact_lock_irqsave(&pgdat->lru_lock,
- &flags, cc);
+ if (lruvec != locked) {
+ if (locked)
+ unlock_page_lruvec_irqrestore(locked, flags);
+
+ compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+ locked = lruvec;
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);

/* Try get exclusive access under lock */
if (!skip_updated) {
@@ -1006,9 +1023,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
SetPageLRU(page);
goto isolate_fail_put;
}
- }
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ } else
+ rcu_read_unlock();

/* The whole page is taken off the LRU; skip the tail pages. */
if (PageCompound(page))
@@ -1042,8 +1058,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
isolate_fail_put:
/* Avoid potential deadlock in freeing page under lru_lock */
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
put_page(page);

@@ -1058,8 +1074,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
@@ -1087,7 +1103,7 @@ static bool too_many_isolated(pg_data_t *pgdat)

isolate_abort:
if (locked)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(locked, flags);
if (page) {
SetPageLRU(page);
put_page(page);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6380c925e904..c9e08fdc08e9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2319,7 +2319,7 @@ static void lru_add_page_tail(struct page *head, struct page *page_tail,
VM_BUG_ON_PAGE(!PageHead(head), head);
VM_BUG_ON_PAGE(PageCompound(page_tail), head);
VM_BUG_ON_PAGE(PageLRU(page_tail), head);
- lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
+ lockdep_assert_held(&lruvec->lru_lock);

if (list) {
/* page reclaim is reclaiming a huge page */
@@ -2403,7 +2403,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
pgoff_t end)
{
struct page *head = compound_head(page);
- pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
@@ -2420,10 +2419,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}

- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock(&pgdat->lru_lock);
-
- lruvec = mem_cgroup_page_lruvec(head, pgdat);
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+ lruvec = lock_page_lruvec(head);

for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
@@ -2444,7 +2441,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}

ClearPageCompound(head);
- spin_unlock(&pgdat->lru_lock);
+ unlock_page_lruvec(lruvec);
/* Caller disabled irqs, so they are still disabled here */

split_page_owner(head, HPAGE_PMD_ORDER);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 65c1e873153e..5b95529e64a4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1302,6 +1302,19 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
return ret;
}

+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!page->mem_cgroup)
+ VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+ else
+ VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != page->mem_cgroup, page);
+}
+#endif
+
/**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
@@ -1341,6 +1354,51 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
return lruvec;
}

+struct lruvec *lock_page_lruvec(struct page *page)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock(&lruvec->lru_lock);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock_irq(&lruvec->lru_lock);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock_irqsave(&lruvec->lru_lock, *flags);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
/**
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
@@ -3222,7 +3280,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)

/*
* Because tail pages are not marked as "used", set it. We're under
- * pgdat->lru_lock and migration entries setup in all page mappings.
+ * lruvec->lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
diff --git a/mm/mlock.c b/mm/mlock.c
index 3762d9dd5b31..177d2588e863 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -105,12 +105,10 @@ void mlock_vma_page(struct page *page)
* Isolate a page from LRU with optional get_page() pin.
* Assumes lru_lock already held and page already pinned.
*/
-static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+static bool __munlock_isolate_lru_page(struct page *page,
+ struct lruvec *lruvec, bool getpage)
{
if (TestClearPageLRU(page)) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
if (getpage)
get_page(page);
del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -180,7 +178,7 @@ static void __munlock_isolation_failed(struct page *page)
unsigned int munlock_vma_page(struct page *page)
{
int nr_pages;
- pg_data_t *pgdat = page_pgdat(page);
+ struct lruvec *lruvec;

/* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
@@ -188,11 +186,16 @@ unsigned int munlock_vma_page(struct page *page)
VM_BUG_ON_PAGE(PageTail(page), page);

/*
- * Serialize with any parallel __split_huge_page_refcount() which
+ * Serialize split tail pages in __split_huge_page_tail() which
* might otherwise copy PageMlocked to part of the tail pages before
* we clear it in the head page. It also stabilizes thp_nr_pages().
+ * TestClearPageLRU can't be used here to block page isolation, since
+ * an out-of-lock clear_page_mlock() may interfere with the
+ * PageLRU/PageMlocked sequence, same as __pagevec_lru_add_fn, and put
+ * the page on the wrong lru list. So rely on PageLocked to stop lruvec
+ * changes in mem_cgroup_move_account().
*/
- spin_lock_irq(&pgdat->lru_lock);
+ lruvec = lock_page_lruvec_irq(page);

if (!TestClearPageMlocked(page)) {
/* Potentially, PTE-mapped THP: do not skip the rest PTEs */
@@ -203,15 +206,15 @@ unsigned int munlock_vma_page(struct page *page)
nr_pages = thp_nr_pages(page);
__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);

- if (__munlock_isolate_lru_page(page, true)) {
- spin_unlock_irq(&pgdat->lru_lock);
+ if (__munlock_isolate_lru_page(page, lruvec, true)) {
+ unlock_page_lruvec_irq(lruvec);
__munlock_isolated_page(page);
goto out;
}
__munlock_isolation_failed(page);

unlock_out:
- spin_unlock_irq(&pgdat->lru_lock);
+ unlock_page_lruvec_irq(lruvec);

out:
return nr_pages - 1;
@@ -291,23 +294,34 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
int nr = pagevec_count(pvec);
int delta_munlocked = -nr;
struct pagevec pvec_putback;
+ struct lruvec *lruvec = NULL;
int pgrescued = 0;

pagevec_init(&pvec_putback);

/* Phase 1: page isolation */
- spin_lock_irq(&zone->zone_pgdat->lru_lock);
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
+ struct lruvec *new_lruvec;
+
+ /* block memcg change in mem_cgroup_move_account */
+ lock_page_memcg(page);
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (new_lruvec != lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ lruvec = lock_page_lruvec_irq(page);
+ }

if (TestClearPageMlocked(page)) {
/*
* We already have pin from follow_page_mask()
* so we can spare the get_page() here.
*/
- if (__munlock_isolate_lru_page(page, false))
+ if (__munlock_isolate_lru_page(page, lruvec, false)) {
+ unlock_page_memcg(page);
continue;
- else
+ } else
__munlock_isolation_failed(page);
} else {
delta_munlocked++;
@@ -319,11 +333,14 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
* pin. We cannot do it under lru_lock however. If it's
* the last pin, __page_cache_release() would deadlock.
*/
+ unlock_page_memcg(page);
pagevec_add(&pvec_putback, pvec->pages[i]);
pvec->pages[i] = NULL;
}
- __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- spin_unlock_irq(&zone->zone_pgdat->lru_lock);
+ if (lruvec) {
+ __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+ unlock_page_lruvec_irq(lruvec);
+ }

/* Now we can release pins of pages that we are not munlocking */
pagevec_release(&pvec_putback);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc23bb9..3750a90ed4a0 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -91,6 +91,7 @@ void lruvec_init(struct lruvec *lruvec)
enum lru_list lru;

memset(lruvec, 0, sizeof(struct lruvec));
+ spin_lock_init(&lruvec->lru_lock);

for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
diff --git a/mm/swap.c b/mm/swap.c
index 2d9a86bf93a4..b67959b701c0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -79,15 +79,13 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
- pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
unsigned long flags;

__ClearPageLRU(page);
- spin_lock_irqsave(&pgdat->lru_lock, flags);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(lruvec, flags);
}
__ClearPageWaiters(page);
}
@@ -206,32 +204,30 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
void (*move_fn)(struct page *page, struct lruvec *lruvec))
{
int i;
- struct pglist_data *pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags = 0;

for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
-
- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- pgdat = pagepgdat;
- spin_lock_irqsave(&pgdat->lru_lock, flags);
- }
+ struct lruvec *new_lruvec;

/* block memcg migration during page moving between lru */
if (!TestClearPageLRU(page))
continue;

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (lruvec != new_lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
+ }
+
(*move_fn)(page, lruvec);

SetPageLRU(page);
}
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
@@ -274,9 +270,8 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
{
do {
unsigned long lrusize;
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
lruvec->file_cost += nr_pages;
@@ -300,7 +295,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
} while ((lruvec = parent_lruvec(lruvec)));
}

@@ -364,13 +359,13 @@ static inline void activate_page_drain(int cpu)

void activate_page(struct page *page)
{
- pg_data_t *pgdat = page_pgdat(page);
+ struct lruvec *lruvec;

page = compound_head(page);
- spin_lock_irq(&pgdat->lru_lock);
+ lruvec = lock_page_lruvec_irq(page);
if (PageLRU(page))
- __activate_page(page, mem_cgroup_page_lruvec(page, pgdat));
- spin_unlock_irq(&pgdat->lru_lock);
+ __activate_page(page, lruvec);
+ unlock_page_lruvec_irq(lruvec);
}
#endif

@@ -819,8 +814,7 @@ void release_pages(struct page **pages, int nr)
{
int i;
LIST_HEAD(pages_to_free);
- struct pglist_data *locked_pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags;
unsigned int lock_batch;

@@ -830,21 +824,20 @@ void release_pages(struct page **pages, int nr)
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
- * same pgdat. The lock is held only if pgdat != NULL.
+ * same lruvec. The lock is held only if lruvec != NULL.
*/
- if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
- locked_pgdat = NULL;
+ if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}

if (is_huge_zero_page(page))
continue;

if (is_zone_device_page(page)) {
- if (locked_pgdat) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock,
- flags);
- locked_pgdat = NULL;
+ if (lruvec) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
/*
* ZONE_DEVICE pages that return 'false' from
@@ -863,29 +856,29 @@ void release_pages(struct page **pages, int nr)
continue;

if (PageCompound(page)) {
- if (locked_pgdat) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
- locked_pgdat = NULL;
+ if (lruvec) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
__put_compound_page(page);
continue;
}

if (PageLRU(page)) {
- struct pglist_data *pgdat = page_pgdat(page);
+ struct lruvec *new_lruvec;

- if (pgdat != locked_pgdat) {
- if (locked_pgdat)
- spin_unlock_irqrestore(&locked_pgdat->lru_lock,
+ new_lruvec = mem_cgroup_page_lruvec(page,
+ page_pgdat(page));
+ if (new_lruvec != lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec,
flags);
lock_batch = 0;
- locked_pgdat = pgdat;
- spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
}

VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
- lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
}

@@ -895,8 +888,8 @@ void release_pages(struct page **pages, int nr)

list_add(&page->lru, &pages_to_free);
}
- if (locked_pgdat)
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);

mem_cgroup_uncharge_list(&pages_to_free);
free_unref_page_list(&pages_to_free);
@@ -984,26 +977,24 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
void __pagevec_lru_add(struct pagevec *pvec)
{
int i;
- struct pglist_data *pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags = 0;

for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
+ struct lruvec *new_lruvec;

- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- pgdat = pagepgdat;
- spin_lock_irqsave(&pgdat->lru_lock, flags);
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (lruvec != new_lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
}

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
__pagevec_lru_add_fn(page, lruvec);
}
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 48b50695f883..789444ae4c88 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1768,15 +1768,13 @@ int isolate_lru_page(struct page *page)
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");

if (TestClearPageLRU(page)) {
- pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
int lru = page_lru(page);

get_page(page);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- spin_lock_irq(&pgdat->lru_lock);
+ lruvec = lock_page_lruvec_irq(page);
del_page_from_lru_list(page, lruvec, lru);
- spin_unlock_irq(&pgdat->lru_lock);
+ unlock_page_lruvec_irq(lruvec);
ret = 0;
}

@@ -1843,20 +1841,22 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
struct list_head *list)
{
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
struct page *page;
+ struct lruvec *orig_lruvec = lruvec;
enum lru_list lru;

while (!list_empty(list)) {
+ struct lruvec *new_lruvec = NULL;
+
page = lru_to_page(list);
VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
putback_lru_page(page);
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
continue;
}

@@ -1871,6 +1871,12 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
* list_add(&page->lru,)
* list_add(&page->lru,)
*/
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (new_lruvec != lruvec) {
+ if (lruvec)
+ spin_unlock_irq(&lruvec->lru_lock);
+ lruvec = lock_page_lruvec_irq(page);
+ }
SetPageLRU(page);

if (unlikely(put_page_testzero(page))) {
@@ -1878,16 +1884,15 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
__ClearPageActive(page);

if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
destroy_compound_page(page);
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
} else
list_add(&page->lru, &pages_to_free);

continue;
}

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
lru = page_lru(page);
nr_pages = thp_nr_pages(page);

@@ -1897,6 +1902,11 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
if (PageActive(page))
workingset_age_nonresident(lruvec, nr_pages);
}
+ if (orig_lruvec != lruvec) {
+ if (lruvec)
+ spin_unlock_irq(&lruvec->lru_lock);
+ spin_lock_irq(&orig_lruvec->lru_lock);
+ }

/*
* To save our caller's stack, now use input list for pages to free.
@@ -1952,7 +1962,7 @@ static int current_may_throttle(void)

lru_add_drain();

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);

nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, lru);
@@ -1964,7 +1974,7 @@ static int current_may_throttle(void)
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);

- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

if (nr_taken == 0)
return 0;
@@ -1972,7 +1982,7 @@ static int current_may_throttle(void)
nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
&stat, false);

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
move_pages_to_lru(lruvec, &page_list);

__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -1981,7 +1991,7 @@ static int current_may_throttle(void)
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

lru_note_cost(lruvec, file, stat.nr_pageout);
mem_cgroup_uncharge_list(&page_list);
@@ -2034,7 +2044,7 @@ static void shrink_active_list(unsigned long nr_to_scan,

lru_add_drain();

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);

nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
@@ -2045,7 +2055,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);

- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

while (!list_empty(&l_hold)) {
cond_resched();
@@ -2091,7 +2101,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
/*
* Move pages back to the lru list.
*/
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);

nr_activate = move_pages_to_lru(lruvec, &l_active);
nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
@@ -2102,7 +2112,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);

__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
@@ -2684,10 +2694,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
/*
* Determine the scan balance between anon and file LRUs.
*/
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&target_lruvec->lru_lock);
sc->anon_cost = target_lruvec->anon_cost;
sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&target_lruvec->lru_lock);

/*
* Target desirable inactive:active list ratios for the anon
@@ -4263,24 +4273,22 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
*/
void check_move_unevictable_pages(struct pagevec *pvec)
{
- struct lruvec *lruvec;
- struct pglist_data *pgdat = NULL;
+ struct lruvec *lruvec = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;

for (i = 0; i < pvec->nr; i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
+ struct lruvec *new_lruvec;

pgscanned++;
- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irq(&pgdat->lru_lock);
- pgdat = pagepgdat;
- spin_lock_irq(&pgdat->lru_lock);
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (lruvec != new_lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ lruvec = lock_page_lruvec_irq(page);
}
- lruvec = mem_cgroup_page_lruvec(page, pgdat);

if (!PageLRU(page) || !PageUnevictable(page))
continue;
@@ -4296,10 +4304,10 @@ void check_move_unevictable_pages(struct pagevec *pvec)
}
}

- if (pgdat) {
+ if (lruvec) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- spin_unlock_irq(&pgdat->lru_lock);
+ unlock_page_lruvec_irq(lruvec);
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
--
1.8.3.1