[PATCH 5/10] mm/memcg: introduce page_relock_lruvec

From: Hugh Dickins
Date: Mon Feb 20 2012 - 18:33:43 EST


Delete mem_cgroup_page_lruvec(), which we just added, replacing it and
the nearby spin_lock_irq or spin_lock_irqsave of zone->lru_lock: in
most places by page_lock_lruvec() or page_relock_lruvec() (the former
being a simple case of the latter), or just by lock_lruvec().
unlock_lruvec() does the spin_unlock_irqrestore for them all.

page_relock_lruvec() is born from that "pagezone" pattern in swap.c
and vmscan.c, where we loop over an array of pages, switching the lock
whenever the zone changes: bearing in mind that if we were to refine
that lock to per-memcg per-zone, then we would have to switch whenever
the memcg changes too.
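
To illustrate (this is just the shape of the pagevec_lru_move_fn()
conversion in the swap.c part of this patch, not additional code),
the old pagezone dance

	struct zone *zone = NULL;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irqrestore(&zone->lru_lock, flags);
			zone = pagezone;
			spin_lock_irqsave(&zone->lru_lock, flags);
		}
		/* ... operate on page under zone->lru_lock ... */
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

becomes

	struct lruvec *lruvec = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		page_relock_lruvec(page, &lruvec);
		/* ... operate on page under the lruvec's lock ... */
	}
	if (lruvec)
		unlock_lruvec(lruvec);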

page_relock_lruvec(page, &lruvec) locates the right lruvec for page,
unlocks the old lruvec if different (and not NULL), locks the new,
and updates lruvec on return: so that we shall have just one routine
to locate and lock the lruvec, whereas originally it got re-evaluated
at different stages. But I don't yet know how to satisfy sparse(1).
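
For reference, the "simple case" wrapper is nothing more than
page_relock_lruvec() starting from NULL, as the swap.h hunk below adds:

	static inline struct lruvec *page_lock_lruvec(struct page *page)
	{
		struct lruvec *lruvec = NULL;

		page_relock_lruvec(page, &lruvec);
		return lruvec;
	}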

There are some loops in which we never change zone, so only a change
of memcg could require switching lock, and a non-memcg kernel has no
memcg to change: use mem_cgroup_page_relock_lruvec() there, which
compiles away to a no-op in the non-memcg case.

In compaction's isolate_migratepages(), although we do know the zone,
we don't know the lruvec in advance: allow for taking the lock later,
and reorganize its cond_resched() lock-dropping accordingly.

page_relock_lruvec() (and its wrappers) is actually an _irqsave operation:
there are a few cases in swap.c where it may be needed at interrupt time
(to free or to rotate a page on I/O completion). Ideally(?) we would use
straightforward _irq disabling elsewhere, but the variants get confusing,
and page_relock_lruvec() will itself grow more complicated in subsequent
patches: so keep it simple for now with just the one irqsaver everywhere.

Passing an irqflags argument/pointer down several levels looks messy
too, and I'm reluctant to add any more to the page reclaim stack: so
save the irqflags alongside the lru_lock and restore them from there.
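
Concretely, the swap.h helpers added below stash the flags in the
zone, where they are protected by the lru_lock itself:

	static inline void lock_lruvec(struct lruvec *lruvec)
	{
		struct zone *zone = lruvec->zone;
		unsigned long irqflags;

		spin_lock_irqsave(&zone->lru_lock, irqflags);
		zone->irqflags = irqflags;
	}

	static inline void unlock_lruvec(struct lruvec *lruvec)
	{
		struct zone *zone = lruvec->zone;
		unsigned long irqflags;

		irqflags = zone->irqflags;
		spin_unlock_irqrestore(&zone->lru_lock, irqflags);
	}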

It's a little sad now to be including mm.h in swap.h to get page_zone();
but I think that swap.h (despite its name) is the right place for these
lru functions, and without those inlines the optimizer cannot do so
well in the !MEM_RES_CTLR case.

(Is this an appropriate place to confess that even at the end of the
series, we're left with a small bug in putback_inactive_pages(), one
that I've not yet decided is worth fixing? reclaim_stat there is taken
from the lruvec on entry, but we might update its stats after dropping
that lruvec's lock. And do zone->pages_scanned and
zone->all_unreclaimable need locking? page_alloc.c thinks zone->lock,
vmscan.c thought zone->lru_lock, and that protection weakens if we now
split lru_lock by memcg.)

Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
---
include/linux/memcontrol.h | 7 --
include/linux/mmzone.h | 1
include/linux/swap.h | 65 +++++++++++++++++++++++
mm/compaction.c | 45 ++++++++++------
mm/huge_memory.c | 10 +--
mm/memcontrol.c | 56 ++++++++++++--------
mm/swap.c | 67 +++++++-----------------
mm/vmscan.c | 95 ++++++++++++++++-------------------
8 files changed, 194 insertions(+), 152 deletions(-)

--- mmotm.orig/include/linux/memcontrol.h 2012-02-18 11:57:35.583524425 -0800
+++ mmotm/include/linux/memcontrol.h 2012-02-18 11:57:42.675524592 -0800
@@ -63,7 +63,6 @@ extern int mem_cgroup_cache_charge(struc
gfp_t gfp_mask);

struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
-extern struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
extern struct mem_cgroup *mem_cgroup_from_lruvec(struct lruvec *lruvec);
extern void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);

@@ -241,12 +240,6 @@ static inline struct lruvec *mem_cgroup_
{
return &zone->lruvec;
}
-
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
- struct zone *zone)
-{
- return &zone->lruvec;
-}

static inline struct mem_cgroup *mem_cgroup_from_lruvec(struct lruvec *lruvec)
{
--- mmotm.orig/include/linux/mmzone.h 2012-02-18 11:57:28.371524252 -0800
+++ mmotm/include/linux/mmzone.h 2012-02-18 11:57:42.675524592 -0800
@@ -374,6 +374,7 @@ struct zone {

/* Fields commonly accessed by the page reclaim scanner */
spinlock_t lru_lock;
+ unsigned long irqflags;
struct lruvec lruvec;

unsigned long pages_scanned; /* since last reclaim */
--- mmotm.orig/include/linux/swap.h 2012-02-18 11:57:35.583524425 -0800
+++ mmotm/include/linux/swap.h 2012-02-18 11:57:42.675524592 -0800
@@ -8,7 +8,7 @@
#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/node.h>
-
+#include <linux/mm.h> /* for page_zone(page) */
#include <linux/atomic.h>
#include <asm/page.h>

@@ -250,6 +250,69 @@ static inline void lru_cache_add_file(st
__lru_cache_add(page, LRU_INACTIVE_FILE);
}

+static inline spinlock_t *lru_lockptr(struct lruvec *lruvec)
+{
+ return &lruvec->zone->lru_lock;
+}
+
+static inline void lock_lruvec(struct lruvec *lruvec)
+{
+ struct zone *zone = lruvec->zone;
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&zone->lru_lock, irqflags);
+ zone->irqflags = irqflags;
+}
+
+static inline void unlock_lruvec(struct lruvec *lruvec)
+{
+ struct zone *zone = lruvec->zone;
+ unsigned long irqflags;
+
+ irqflags = zone->irqflags;
+ spin_unlock_irqrestore(&zone->lru_lock, irqflags);
+}
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/* linux/mm/memcontrol.c */
+extern void page_relock_lruvec(struct page *page, struct lruvec **lruvp);
+
+static inline void
+mem_cgroup_page_relock_lruvec(struct page *page, struct lruvec **lruvp)
+{
+ page_relock_lruvec(page, lruvp);
+}
+#else
+static inline void page_relock_lruvec(struct page *page, struct lruvec **lruvp)
+{
+ struct lruvec *lruvec;
+
+ lruvec = &page_zone(page)->lruvec;
+ if (*lruvp && *lruvp != lruvec) {
+ unlock_lruvec(*lruvp);
+ *lruvp = NULL;
+ }
+ if (!*lruvp) {
+ *lruvp = lruvec;
+ lock_lruvec(lruvec);
+ }
+}
+
+static inline void
+mem_cgroup_page_relock_lruvec(struct page *page, struct lruvec **lruvp)
+{
+ /* No-op used in a few places where zone is known not to change */
+}
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
+
+static inline struct lruvec *page_lock_lruvec(struct page *page)
+{
+ struct lruvec *lruvec = NULL;
+
+ page_relock_lruvec(page, &lruvec);
+ return lruvec;
+}
+
/* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
--- mmotm.orig/mm/compaction.c 2012-02-18 11:57:35.583524425 -0800
+++ mmotm/mm/compaction.c 2012-02-18 11:57:42.675524592 -0800
@@ -262,7 +262,7 @@ static isolate_migrate_t isolate_migrate
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;

/* Do not scan outside zone boundaries */
low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -293,26 +293,23 @@ static isolate_migrate_t isolate_migrate
}

/* Time to isolate some pages for migration */
- cond_resched();
- spin_lock_irq(&zone->lru_lock);
for (; low_pfn < end_pfn; low_pfn++) {
struct page *page;
- bool locked = true;

- /* give a chance to irqs before checking need_resched() */
- if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
- spin_unlock_irq(&zone->lru_lock);
- locked = false;
- }
- if (need_resched() || spin_is_contended(&zone->lru_lock)) {
- if (locked)
- spin_unlock_irq(&zone->lru_lock);
+ /* give a chance to irqs before cond_resched() */
+ if (lruvec) {
+ if (!((low_pfn+1) % SWAP_CLUSTER_MAX) ||
+ spin_is_contended(lru_lockptr(lruvec)) ||
+ need_resched()) {
+ unlock_lruvec(lruvec);
+ lruvec = NULL;
+ }
+ }
+ if (!lruvec) {
cond_resched();
- spin_lock_irq(&zone->lru_lock);
if (fatal_signal_pending(current))
break;
- } else if (!locked)
- spin_lock_irq(&zone->lru_lock);
+ }

/*
* migrate_pfn does not necessarily start aligned to a
@@ -359,6 +356,15 @@ static isolate_migrate_t isolate_migrate
continue;
}

+ if (!lruvec) {
+ /*
+ * We do need to take the lock before advancing to
+ * check PageLRU etc., but there's no guarantee that
+ * the page we're peeking at has a stable memcg here.
+ */
+ lruvec = &zone->lruvec;
+ lock_lruvec(lruvec);
+ }
if (!PageLRU(page))
continue;

@@ -379,7 +385,7 @@ static isolate_migrate_t isolate_migrate
if (__isolate_lru_page(page, mode, 0) != 0)
continue;

- lruvec = mem_cgroup_page_lruvec(page, zone);
+ page_relock_lruvec(page, &lruvec);

VM_BUG_ON(PageTransCompound(page));

@@ -396,9 +402,14 @@ static isolate_migrate_t isolate_migrate
}
}

+ if (!lruvec)
+ local_irq_disable();
acct_isolated(zone, cc);
+ if (lruvec)
+ unlock_lruvec(lruvec);
+ else
+ local_irq_enable();

- spin_unlock_irq(&zone->lru_lock);
cc->migrate_pfn = low_pfn;

trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
--- mmotm.orig/mm/huge_memory.c 2012-02-18 11:57:35.583524425 -0800
+++ mmotm/mm/huge_memory.c 2012-02-18 11:57:42.679524592 -0800
@@ -1222,13 +1222,11 @@ static int __split_huge_page_splitting(s
static void __split_huge_page_refcount(struct page *page)
{
int i;
- struct zone *zone = page_zone(page);
struct lruvec *lruvec;
int tail_count = 0;

/* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = page_lock_lruvec(page);

compound_lock(page);
/* complete memcg works before add pages to LRU */
@@ -1310,12 +1308,12 @@ static void __split_huge_page_refcount(s
atomic_sub(tail_count, &page->_count);
BUG_ON(atomic_read(&page->_count) <= 0);

- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+ __mod_zone_page_state(lruvec->zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
+ __mod_zone_page_state(lruvec->zone, NR_ANON_PAGES, HPAGE_PMD_NR);

ClearPageCompound(page);
compound_unlock(page);
- spin_unlock_irq(&zone->lru_lock);
+ unlock_lruvec(lruvec);

for (i = 1; i < HPAGE_PMD_NR; i++) {
struct page *page_tail = page + i;
--- mmotm.orig/mm/memcontrol.c 2012-02-18 11:57:35.587524424 -0800
+++ mmotm/mm/memcontrol.c 2012-02-18 11:57:42.679524592 -0800
@@ -1037,23 +1037,36 @@ struct mem_cgroup *mem_cgroup_from_lruve
*/

/**
- * mem_cgroup_page_lruvec - return lruvec for adding an lru page
+ * page_relock_lruvec - lock and update lruvec for this page, unlocking previous
* @page: the page
- * @zone: zone of the page
+ * @lruvp: pointer to where to output lruvec; unlock input lruvec if non-NULL
*/
-struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
+void page_relock_lruvec(struct page *page, struct lruvec **lruvp)
{
struct mem_cgroup_per_zone *mz;
struct mem_cgroup *memcg;
struct page_cgroup *pc;
+ struct lruvec *lruvec;

if (mem_cgroup_disabled())
- return &zone->lruvec;
+ lruvec = &page_zone(page)->lruvec;
+ else {
+ pc = lookup_page_cgroup(page);
+ memcg = pc->mem_cgroup;
+ mz = page_cgroup_zoneinfo(memcg, page);
+ lruvec = &mz->lruvec;
+ }

- pc = lookup_page_cgroup(page);
- memcg = pc->mem_cgroup;
- mz = page_cgroup_zoneinfo(memcg, page);
- return &mz->lruvec;
+ /*
+ * For the moment, simply lock by zone just as before.
+ */
+ if (*lruvp && (*lruvp)->zone != lruvec->zone) {
+ unlock_lruvec(*lruvp);
+ *lruvp = NULL;
+ }
+ if (!*lruvp)
+ lock_lruvec(lruvec);
+ *lruvp = lruvec;
}

/**
@@ -2631,30 +2644,27 @@ __mem_cgroup_commit_charge_lrucare(struc
enum charge_type ctype)
{
struct page_cgroup *pc = lookup_page_cgroup(page);
- struct zone *zone = page_zone(page);
- unsigned long flags;
- bool removed = false;
struct lruvec *lruvec;
+ bool removed = false;

/*
* In some case, SwapCache, FUSE(splice_buf->radixtree), the page
* is already on LRU. It means the page may on some other page_cgroup's
* LRU. Take care of it.
*/
- spin_lock_irqsave(&zone->lru_lock, flags);
+ lruvec = page_lock_lruvec(page);
if (PageLRU(page)) {
- lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
del_page_from_lru_list(page, lruvec, page_lru(page));
ClearPageLRU(page);
removed = true;
}
__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
if (removed) {
- lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
+ page_relock_lruvec(page, &lruvec);
add_page_to_lru_list(page, lruvec, page_lru(page));
SetPageLRU(page);
}
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ unlock_lruvec(lruvec);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
@@ -3572,15 +3582,15 @@ static int mem_cgroup_force_empty_list(s
int node, int zid, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
- unsigned long flags, loop;
+ unsigned long loop;
struct list_head *list;
struct page *busy;
- struct zone *zone;
+ struct lruvec *lruvec;
int ret = 0;

- zone = &NODE_DATA(node)->node_zones[zid];
mz = mem_cgroup_zoneinfo(memcg, node, zid);
- list = &mz->lruvec.lists[lru];
+ lruvec = &mz->lruvec;
+ list = &lruvec->lists[lru];

loop = mz->lru_size[lru];
/* give some margin against EBUSY etc...*/
@@ -3591,19 +3601,19 @@ static int mem_cgroup_force_empty_list(s
struct page *page;

ret = 0;
- spin_lock_irqsave(&zone->lru_lock, flags);
+ lock_lruvec(lruvec);
if (list_empty(list)) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ unlock_lruvec(lruvec);
break;
}
page = list_entry(list->prev, struct page, lru);
if (busy == page) {
list_move(&page->lru, list);
busy = NULL;
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ unlock_lruvec(lruvec);
continue;
}
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ unlock_lruvec(lruvec);

pc = lookup_page_cgroup(page);

--- mmotm.orig/mm/swap.c 2012-02-18 11:57:35.587524424 -0800
+++ mmotm/mm/swap.c 2012-02-18 11:57:42.679524592 -0800
@@ -47,16 +47,13 @@ static DEFINE_PER_CPU(struct pagevec, lr
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
- struct zone *zone = page_zone(page);
struct lruvec *lruvec;
- unsigned long flags;

- spin_lock_irqsave(&zone->lru_lock, flags);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = page_lock_lruvec(page);
VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ unlock_lruvec(lruvec);
}
}

@@ -208,26 +205,16 @@ static void pagevec_lru_move_fn(struct p
void *arg)
{
int i;
- struct zone *zone = NULL;
- struct lruvec *lruvec;
- unsigned long flags = 0;
+ struct lruvec *lruvec = NULL;

for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- struct zone *pagezone = page_zone(page);

- if (pagezone != zone) {
- if (zone)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = pagezone;
- spin_lock_irqsave(&zone->lru_lock, flags);
- }
-
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ page_relock_lruvec(page, &lruvec);
(*move_fn)(page, lruvec, arg);
}
- if (zone)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ if (lruvec)
+ unlock_lruvec(lruvec);
release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec);
}
@@ -334,11 +321,11 @@ static inline void activate_page_drain(i

void activate_page(struct page *page)
{
- struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;

- spin_lock_irq(&zone->lru_lock);
- __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
- spin_unlock_irq(&zone->lru_lock);
+ lruvec = page_lock_lruvec(page);
+ __activate_page(page, lruvec, NULL);
+ unlock_lruvec(lruvec);
}
#endif

@@ -403,15 +390,13 @@ void lru_cache_add_lru(struct page *page
*/
void add_page_to_unevictable_list(struct page *page)
{
- struct zone *zone = page_zone(page);
struct lruvec *lruvec;

- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = page_lock_lruvec(page);
SetPageUnevictable(page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
- spin_unlock_irq(&zone->lru_lock);
+ unlock_lruvec(lruvec);
}

/*
@@ -577,17 +562,15 @@ void release_pages(struct page **pages,
{
int i;
LIST_HEAD(pages_to_free);
- struct zone *zone = NULL;
- struct lruvec *lruvec;
- unsigned long uninitialized_var(flags);
+ struct lruvec *lruvec = NULL;

for (i = 0; i < nr; i++) {
struct page *page = pages[i];

if (unlikely(PageCompound(page))) {
- if (zone) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = NULL;
+ if (lruvec) {
+ unlock_lruvec(lruvec);
+ lruvec = NULL;
}
put_compound_page(page);
continue;
@@ -597,17 +580,7 @@ void release_pages(struct page **pages,
continue;

if (PageLRU(page)) {
- struct zone *pagezone = page_zone(page);
-
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irqrestore(&zone->lru_lock,
- flags);
- zone = pagezone;
- spin_lock_irqsave(&zone->lru_lock, flags);
- }
-
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ page_relock_lruvec(page, &lruvec);
VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
@@ -615,8 +588,8 @@ void release_pages(struct page **pages,

list_add(&page->lru, &pages_to_free);
}
- if (zone)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ if (lruvec)
+ unlock_lruvec(lruvec);

free_hot_cold_page_list(&pages_to_free, cold);
}
@@ -652,7 +625,7 @@ void lru_add_page_tail(struct page *page
VM_BUG_ON(!PageHead(page));
VM_BUG_ON(PageCompound(page_tail));
VM_BUG_ON(PageLRU(page_tail));
- VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&lruvec->zone->lru_lock));
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(lru_lockptr(lruvec)));

SetPageLRU(page_tail);

--- mmotm.orig/mm/vmscan.c 2012-02-18 11:57:35.587524424 -0800
+++ mmotm/mm/vmscan.c 2012-02-18 11:57:42.679524592 -0800
@@ -1212,8 +1212,8 @@ static unsigned long isolate_lru_pages(u
break;

if (__isolate_lru_page(cursor_page, mode, file) == 0) {
- lruvec = mem_cgroup_page_lruvec(cursor_page,
- lruvec->zone);
+ mem_cgroup_page_relock_lruvec(cursor_page,
+ &lruvec);
isolated_pages = hpage_nr_pages(cursor_page);
mem_cgroup_update_lru_size(lruvec,
page_lru(cursor_page), -isolated_pages);
@@ -1294,11 +1294,9 @@ int isolate_lru_page(struct page *page)
VM_BUG_ON(!page_count(page));

if (PageLRU(page)) {
- struct zone *zone = page_zone(page);
struct lruvec *lruvec;

- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = page_lock_lruvec(page);
if (PageLRU(page)) {
int lru = page_lru(page);
get_page(page);
@@ -1306,7 +1304,7 @@ int isolate_lru_page(struct page *page)
del_page_from_lru_list(page, lruvec, lru);
ret = 0;
}
- spin_unlock_irq(&zone->lru_lock);
+ unlock_lruvec(lruvec);
}
return ret;
}
@@ -1337,10 +1335,9 @@ static int too_many_isolated(struct zone
}

static noinline_for_stack void
-putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
+putback_inactive_pages(struct lruvec **lruvec, struct list_head *page_list)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- struct zone *zone = lruvec->zone;
+ struct zone_reclaim_stat *reclaim_stat = &(*lruvec)->reclaim_stat;
LIST_HEAD(pages_to_free);

/*
@@ -1353,17 +1350,18 @@ putback_inactive_pages(struct lruvec *lr
VM_BUG_ON(PageLRU(page));
list_del(&page->lru);
if (unlikely(!page_evictable(page, NULL))) {
- spin_unlock_irq(&zone->lru_lock);
+ unlock_lruvec(*lruvec);
putback_lru_page(page);
- spin_lock_irq(&zone->lru_lock);
+ lock_lruvec(*lruvec);
continue;
}

- lruvec = mem_cgroup_page_lruvec(page, zone);
+ /* lock lru, occasionally changing lruvec */
+ mem_cgroup_page_relock_lruvec(page, lruvec);

SetPageLRU(page);
lru = page_lru(page);
- add_page_to_lru_list(page, lruvec, lru);
+ add_page_to_lru_list(page, *lruvec, lru);

if (is_active_lru(lru)) {
int file = is_file_lru(lru);
@@ -1373,12 +1371,12 @@ putback_inactive_pages(struct lruvec *lr
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
- del_page_from_lru_list(page, lruvec, lru);
+ del_page_from_lru_list(page, *lruvec, lru);

if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ unlock_lruvec(*lruvec);
(*get_compound_page_dtor(page))(page);
- spin_lock_irq(&zone->lru_lock);
+ lock_lruvec(*lruvec);
} else
list_add(&page->lru, &pages_to_free);
}
@@ -1513,7 +1511,7 @@ shrink_inactive_list(unsigned long nr_to
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;

- spin_lock_irq(&zone->lru_lock);
+ lock_lruvec(lruvec);

nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, 0, file);
@@ -1524,7 +1522,7 @@ shrink_inactive_list(unsigned long nr_to
else
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
}
- spin_unlock_irq(&zone->lru_lock);
+ unlock_lruvec(lruvec);

if (nr_taken == 0)
return 0;
@@ -1541,7 +1539,7 @@ shrink_inactive_list(unsigned long nr_to
priority, &nr_dirty, &nr_writeback);
}

- spin_lock_irq(&zone->lru_lock);
+ lock_lruvec(lruvec);

reclaim_stat->recent_scanned[0] += nr_anon;
reclaim_stat->recent_scanned[1] += nr_file;
@@ -1550,12 +1548,12 @@ shrink_inactive_list(unsigned long nr_to
__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);

- putback_inactive_pages(lruvec, &page_list);
+ putback_inactive_pages(&lruvec, &page_list);

__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);

- spin_unlock_irq(&zone->lru_lock);
+ unlock_lruvec(lruvec);

free_hot_cold_page_list(&page_list, 1);

@@ -1611,42 +1609,44 @@ shrink_inactive_list(unsigned long nr_to
* But we had to alter page->flags anyway.
*/

-static void move_active_pages_to_lru(struct lruvec *lruvec,
+static void move_active_pages_to_lru(struct lruvec **lruvec,
struct list_head *list,
struct list_head *pages_to_free,
enum lru_list lru)
{
- struct zone *zone = lruvec->zone;
unsigned long pgmoved = 0;
struct page *page;
int nr_pages;

while (!list_empty(list)) {
page = lru_to_page(list);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+
+ /* lock lru, occasionally changing lruvec */
+ mem_cgroup_page_relock_lruvec(page, lruvec);

VM_BUG_ON(PageLRU(page));
SetPageLRU(page);

nr_pages = hpage_nr_pages(page);
- list_move(&page->lru, &lruvec->lists[lru]);
- mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+ list_move(&page->lru, &(*lruvec)->lists[lru]);
+ mem_cgroup_update_lru_size(*lruvec, lru, nr_pages);
pgmoved += nr_pages;

if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
- del_page_from_lru_list(page, lruvec, lru);
+ del_page_from_lru_list(page, *lruvec, lru);

if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ unlock_lruvec(*lruvec);
(*get_compound_page_dtor(page))(page);
- spin_lock_irq(&zone->lru_lock);
+ lock_lruvec(*lruvec);
} else
list_add(&page->lru, pages_to_free);
}
}
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+
+ __mod_zone_page_state((*lruvec)->zone, NR_LRU_BASE + lru, pgmoved);
if (!is_active_lru(lru))
__count_vm_events(PGDEACTIVATE, pgmoved);
}
@@ -1676,7 +1676,7 @@ static void shrink_active_list(unsigned
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;

- spin_lock_irq(&zone->lru_lock);
+ lock_lruvec(lruvec);

nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, 1, file);
@@ -1691,7 +1691,8 @@ static void shrink_active_list(unsigned
else
__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
- spin_unlock_irq(&zone->lru_lock);
+
+ unlock_lruvec(lruvec);

while (!list_empty(&l_hold)) {
cond_resched();
@@ -1735,7 +1736,7 @@ static void shrink_active_list(unsigned
/*
* Move pages back to the lru list.
*/
- spin_lock_irq(&zone->lru_lock);
+ lock_lruvec(lruvec);
/*
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated. This
@@ -1744,12 +1745,13 @@ static void shrink_active_list(unsigned
*/
reclaim_stat->recent_rotated[file] += nr_rotated;

- move_active_pages_to_lru(lruvec, &l_active, &l_hold,
+ move_active_pages_to_lru(&lruvec, &l_active, &l_hold,
LRU_ACTIVE + file * LRU_FILE);
- move_active_pages_to_lru(lruvec, &l_inactive, &l_hold,
+ move_active_pages_to_lru(&lruvec, &l_inactive, &l_hold,
LRU_BASE + file * LRU_FILE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&zone->lru_lock);
+
+ unlock_lruvec(lruvec);

free_hot_cold_page_list(&l_hold, 1);
}
@@ -1940,7 +1942,7 @@ static void get_scan_count(struct lruvec
*
* anon in [0], file in [1]
*/
- spin_lock_irq(&zone->lru_lock);
+ lock_lruvec(lruvec);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
@@ -1961,7 +1963,7 @@ static void get_scan_count(struct lruvec

fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
- spin_unlock_irq(&zone->lru_lock);
+ unlock_lruvec(lruvec);

fraction[0] = ap;
fraction[1] = fp;
@@ -3525,25 +3527,16 @@ int page_evictable(struct page *page, st
*/
void check_move_unevictable_pages(struct page **pages, int nr_pages)
{
- struct lruvec *lruvec;
- struct zone *zone = NULL;
+ struct lruvec *lruvec = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;

for (i = 0; i < nr_pages; i++) {
struct page *page = pages[i];
- struct zone *pagezone;

pgscanned++;
- pagezone = page_zone(page);
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ page_relock_lruvec(page, &lruvec);

if (!PageLRU(page) || !PageUnevictable(page))
continue;
@@ -3559,10 +3552,10 @@ void check_move_unevictable_pages(struct
}
}

- if (zone) {
+ if (lruvec) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- spin_unlock_irq(&zone->lru_lock);
+ unlock_lruvec(lruvec);
}
}
#endif /* CONFIG_SHMEM */
--