[PATCH 1/2] mm: workingset: tell cache transitions from workingset thrashing

From: Johannes Weiner
Date: Wed Sep 27 2017 - 12:28:49 EST


Refaults happen during transitions between workingsets as well as
in-place thrashing. Knowing the difference between the two has a range
of applications, including measuring the impact of memory shortage on
the system performance, as well as the ability to smarter balance
pressure between the filesystem cache and the swap-backed workingset.

During workingset transitions, inactive cache refaults and pushes out
established active cache. When that active cache isn't stale, however,
and also ends up refaulting, that's bonafide thrashing.

Introduce a new page flag that tells on eviction whether the page has
been active or not in its lifetime. This bit is then stored in the
shadow entry, to classify refaults as transitioning or thrashing.

Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

Conflicts:
include/linux/page-flags.h
mm/memcontrol.c
mm/workingset.c

Change-Id: Iee21f70d3b20aa5b9bf6b9e40693e170f785c813
---
include/linux/mmzone.h | 1 +
include/linux/page-flags.h | 5 ++-
include/linux/swap.h | 2 +-
include/trace/events/mmflags.h | 1 +
mm/filemap.c | 9 ++--
mm/huge_memory.c | 1 +
mm/memcontrol.c | 2 +
mm/migrate.c | 2 +
mm/swap_state.c | 1 +
mm/vmscan.c | 1 +
mm/vmstat.c | 1 +
mm/workingset.c | 97 +++++++++++++++++++++++++++---------------
12 files changed, 80 insertions(+), 43 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7e273e2..a2d1517 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -152,6 +152,7 @@ enum node_stat_item {
NR_PAGES_SCANNED, /* pages scanned since last reclaim */
WORKINGSET_REFAULT,
WORKINGSET_ACTIVATE,
+ WORKINGSET_RESTORE,
WORKINGSET_NODERECLAIM,
NR_ANON_MAPPED, /* Mapped anonymous pages */
NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 74e4dda..e607142 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -73,12 +73,13 @@
*/
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
- PG_error,
PG_referenced,
PG_uptodate,
PG_dirty,
PG_lru,
PG_active,
+ PG_workingset,
+ PG_error,
PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1,
@@ -262,6 +263,8 @@ static __always_inline int PageCompound(struct page *page)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
TESTCLEARFLAG(Active, active, PF_HEAD)
+PAGEFLAG(Workingset, workingset, PF_HEAD)
+ TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
__PAGEFLAG(Slab, slab, PF_NO_TAIL)
__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 55ff559..cbe483a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -245,7 +245,7 @@ struct swap_info_struct {

/* linux/mm/workingset.c */
void *workingset_eviction(struct address_space *mapping, struct page *page);
-bool workingset_refault(void *shadow);
+void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page);
extern struct list_lru workingset_shadow_nodes;

diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 5a81ab4..5076e11 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -87,6 +87,7 @@
{1UL << PG_dirty, "dirty" }, \
{1UL << PG_lru, "lru" }, \
{1UL << PG_active, "active" }, \
+ {1UL << PG_workingset, "workingset" }, \
{1UL << PG_slab, "slab" }, \
{1UL << PG_owner_priv_1, "owner_priv_1" }, \
{1UL << PG_arch_1, "arch_1" }, \
diff --git a/mm/filemap.c b/mm/filemap.c
index edfb90e..59fe831 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -745,12 +745,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
* data from the working set, only to cache data that will
* get overwritten with something else, is a waste of memory.
*/
- if (!(gfp_mask & __GFP_WRITE) &&
- shadow && workingset_refault(shadow)) {
- SetPageActive(page);
- workingset_activation(page);
- } else
- ClearPageActive(page);
+ WARN_ON_ONCE(PageActive(page));
+ if (!(gfp_mask & __GFP_WRITE) && shadow)
+ workingset_refault(page, shadow);
lru_cache_add(page);
}
return ret;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d5b2b75..22d1961 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1869,6 +1869,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_mlocked) |
(1L << PG_uptodate) |
(1L << PG_active) |
+ (1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
(1L << PG_dirty)));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 47559cc..d636bd6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5239,6 +5239,8 @@ static int memory_stat_show(struct seq_file *m, void *v)
events[MEM_CGROUP_EVENTS_PGFAULT]);
seq_printf(m, "pgmajfault %lu\n",
events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+ seq_printf(m, "workingset_restore %lu\n",
+ stat[WORKINGSET_RESTORE]);

return 0;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 6850f62..9348fd4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -636,6 +636,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)
SetPageActive(newpage);
} else if (TestClearPageUnevictable(page))
SetPageUnevictable(newpage);
+ if (PageWorkingset(page))
+ SetPageWorkingset(newpage);
if (PageChecked(page))
SetPageChecked(newpage);
if (PageMappedToDisk(page))
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 35d7e0e..10fec1f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -368,6 +368,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* Initiate read into locked page and return.
*/
+ SetPageWorkingset(new_page);
lru_cache_add_anon(new_page);
*new_page_allocated = true;
return new_page;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 30a88b9..6361461 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1991,6 +1991,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
}

ClearPageActive(page); /* we are de-activating */
+ SetPageWorkingset(page);
list_add(&page->lru, &l_inactive);
}

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 604f26a..3adf071 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -957,6 +957,7 @@ int fragmentation_index(struct zone *zone, unsigned int order)
"nr_pages_scanned",
"workingset_refault",
"workingset_activate",
+ "workingset_restore",
"workingset_nodereclaim",
"nr_anon_pages",
"nr_mapped",
diff --git a/mm/workingset.c b/mm/workingset.c
index 4c4f056..b058fd3d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -118,7 +118,7 @@
* the only thing eating into inactive list space is active pages.
*
*
- * Activating refaulting pages
+ * Refaulting inactive pages
*
* All that is known about the active list is that the pages have been
* accessed more than once in the past. This means that at any given
@@ -131,6 +131,10 @@
* used less frequently than the refaulting page - or even not used at
* all anymore.
*
+ * That means if inactive cache is refaulting with a suitable refault
+ * distance, we assume the cache workingset is transitioning and put
+ * pressure on the current active list.
+ *
* If this is wrong and demotion kicks in, the pages which are truly
* used more frequently will be reactivated while the less frequently
* used once will be evicted from memory.
@@ -138,6 +142,14 @@
* But if this is right, the stale pages will be pushed out of memory
* and the used pages get to stay in cache.
*
+ * Refaulting active pages
+ *
+ * If on the other hand the refaulting pages have recently been
+ * deactivated, it means that the active list is no longer protecting
+ * actively used cache from reclaim. The cache is NOT transitioning to
+ * a different workingset; the existing workingset is thrashing in the
+ * space allocated to the page cache.
+ *
*
* Implementation
*
@@ -153,8 +165,7 @@
*/

#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
- NODES_SHIFT + \
- MEM_CGROUP_ID_SHIFT)
+ 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)

/*
@@ -167,23 +178,28 @@
*/
static unsigned int bucket_order __read_mostly;

-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+ bool workingset)
{
eviction >>= bucket_order;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+ eviction = (eviction << 1) | workingset;
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);

return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}

static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp)
+ unsigned long *evictionp, bool *workingsetp)
{
unsigned long entry = (unsigned long)shadow;
int memcgid, nid;
+ bool workingset;

entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+ workingset = entry & 1;
+ entry >>= 1;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -192,6 +208,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
+ *workingsetp = workingset;
}

/**
@@ -204,8 +221,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
- struct mem_cgroup *memcg = page_memcg(page);
struct pglist_data *pgdat = page_pgdat(page);
+ struct mem_cgroup *memcg = page_memcg(page);
int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
struct lruvec *lruvec;
@@ -217,30 +234,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)

lruvec = mem_cgroup_lruvec(pgdat, memcg);
eviction = atomic_long_inc_return(&lruvec->inactive_age);
- return pack_shadow(memcgid, pgdat, eviction);
+ return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}

/**
* workingset_refault - evaluate the refault of a previously evicted page
+ * @page: the freshly allocated replacement page
* @shadow: shadow entry of the evicted page
*
* Calculates and evaluates the refault distance of the previously
* evicted page in the context of the node it was allocated in.
- *
- * Returns %true if the page should be activated, %false otherwise.
*/
-bool workingset_refault(void *shadow)
+void workingset_refault(struct page *page, void *shadow)
{
unsigned long refault_distance;
+ struct pglist_data *pgdat;
unsigned long active_file;
struct mem_cgroup *memcg;
unsigned long eviction;
struct lruvec *lruvec;
unsigned long refault;
- struct pglist_data *pgdat;
+ bool workingset;
int memcgid;

- unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);

rcu_read_lock();
/*
@@ -260,40 +277,50 @@ bool workingset_refault(void *shadow)
* configurations instead.
*/
memcg = mem_cgroup_from_id(memcgid);
- if (!mem_cgroup_disabled() && !memcg) {
- rcu_read_unlock();
- return false;
- }
+ if (!mem_cgroup_disabled() && !memcg)
+ goto out;
lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
- rcu_read_unlock();

/*
- * The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases.
+ * Calculate the refault distance
*
- * There is a special case: usually, shadow entries have a
- * short lifetime and are either refaulted or reclaimed along
- * with the inode before they get too old. But it is not
- * impossible for the inactive_age to lap a shadow entry in
- * the field, which can then can result in a false small
- * refault distance, leading to a false activation should this
- * old entry actually refault again. However, earlier kernels
- * used to deactivate unconditionally with *every* reclaim
- * invocation for the longest time, so the occasional
- * inappropriate activation leading to pressure on the active
- * list is not a problem.
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases. There is a
+ * special case: usually, shadow entries have a short lifetime
+ * and are either refaulted or reclaimed along with the inode
+ * before they get too old. But it is not impossible for the
+ * inactive_age to lap a shadow entry in the field, which can
+ * then can result in a false small refault distance, leading
+ * to a false activation should this old entry actually
+ * refault again. However, earlier kernels used to deactivate
+ * unconditionally with *every* reclaim invocation for the
+ * longest time, so the occasional inappropriate activation
+ * leading to pressure on the active list is not a problem.
*/
refault_distance = (refault - eviction) & EVICTION_MASK;

inc_node_state(pgdat, WORKINGSET_REFAULT);

- if (refault_distance <= active_file) {
- inc_node_state(pgdat, WORKINGSET_ACTIVATE);
- return true;
- }
- return false;
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't act on pages that couldn't stay resident even if all
+ * the memory was available to the page cache.
+ */
+ if (refault_distance > active_file)
+ goto out;
+
+ SetPageActive(page);
+ SetPageWorkingset(page);
+ atomic_long_inc(&lruvec->inactive_age);
+ inc_node_state(pgdat, WORKINGSET_ACTIVATE);
+
+ /* Page was active prior to eviction */
+ if (workingset)
+ inc_node_state(pgdat, WORKINGSET_RESTORE);
+out:
+ rcu_read_unlock();
}

/**
--
1.9.1


--------------9B27BB3AED4106EF45BCD875
Content-Type: text/x-patch;
name="0001-proc-stat-add-major-page-faults-time-accounting.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
filename*0="0001-proc-stat-add-major-page-faults-time-accounting.patch"