[RFC PATCH v2 2/8] mm: make zone_reclaim_stat updates thread-safe

From: Daniel Jordan
Date: Mon Sep 10 2018 - 20:43:11 EST


lru_lock needs to be held to update the zone_reclaim_stat statistics.
As in the previous patch, this requirement again arises fairly
naturally because callers already hold lru_lock.

In preparation for allowing concurrent adds and removes from the LRU,
however, make concurrent updates to these statistics safe without
lru_lock. The lock continues to be held until later in the series, when
it is replaced with an rwlock that also disables preemption, maintaining
the assumption stated in the comment introduced here above
__update_page_reclaim_stat.

Use a combination of per-cpu counters and atomics.
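
For illustration, the scheme added to mmzone.h below boils down to the
following standalone sketch: each CPU accumulates into a private counter
and folds it into the shared atomic only once the value exceeds
RECLAIM_STAT_BATCH. The sketch is not part of the patch; it models the
per-cpu counter with a thread-local variable and C11 atomics instead of
this_cpu_ptr() and atomic_long_t, so names and types here are
illustrative rather than kernel API.

/*
 * Standalone sketch of the batching used by __update_page_reclaim_stat()
 * below: a per-CPU counter (modeled here with a thread-local variable)
 * absorbs small increments and spills into the shared atomic once it
 * crosses the batch size.
 */
#include <stdatomic.h>
#include <stdio.h>

#define RECLAIM_STAT_BATCH 32U

static atomic_long shared_stat;                 /* read by reclaim */
static _Thread_local unsigned long percpu_stat; /* stand-in for per-cpu counter */

static void update_stat(unsigned long count)
{
        unsigned long val = percpu_stat + count;

        if (val > RECLAIM_STAT_BATCH) {
                /* Spill the accumulated value into the shared counter. */
                atomic_fetch_add(&shared_stat, val);
                val = 0;
        }
        percpu_stat = val;
}

int main(void)
{
        int i;

        for (i = 0; i < 100; i++)
                update_stat(1);

        /* Up to RECLAIM_STAT_BATCH events may still be pending locally. */
        printf("shared=%ld pending=%lu\n",
               atomic_load(&shared_stat), percpu_stat);
        return 0;
}

As in the patch, readers such as get_scan_count() see only the spilled
atomic value, so up to RECLAIM_STAT_BATCH recent events per CPU may be
unaccounted at any moment; the sketch assumes, as the series appears to,
that this imprecision is acceptable for the reclaim heuristics.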

Signed-off-by: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx>
---
include/linux/mmzone.h | 50 ++++++++++++++++++++++++++++++++++++++++++
init/main.c | 1 +
mm/memcontrol.c | 20 ++++++++---------
mm/memory_hotplug.c | 1 +
mm/mmzone.c | 14 ++++++++++++
mm/swap.c | 14 ++++++++----
mm/vmscan.c | 42 ++++++++++++++++++++---------------
7 files changed, 110 insertions(+), 32 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2dc52a..6d4c23a3069d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -229,6 +229,12 @@ struct zone_reclaim_stat {
*
* The anon LRU stats live in [0], file LRU stats in [1]
*/
+ atomic_long_t recent_rotated[2];
+ atomic_long_t recent_scanned[2];
+};
+
+/* These spill into the counters in struct zone_reclaim_stat beyond a cutoff. */
+struct zone_reclaim_stat_cpu {
unsigned long recent_rotated[2];
unsigned long recent_scanned[2];
};
@@ -236,6 +242,7 @@ struct zone_reclaim_stat {
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
+ struct zone_reclaim_stat_cpu __percpu *reclaim_stat_cpu;
/* Evictions & activations on the inactive file list */
atomic_long_t inactive_age;
/* Refaults at the time of last reclaim cycle */
@@ -245,6 +252,47 @@ struct lruvec {
#endif
};

+#define RECLAIM_STAT_BATCH 32U /* From SWAP_CLUSTER_MAX */
+
+/*
+ * Callers of the below three functions that update reclaim stats must hold
+ * lru_lock and have preemption disabled. Use percpu counters that spill into
+ * atomics to allow concurrent updates when multiple readers hold lru_lock.
+ */
+
+static inline void __update_page_reclaim_stat(unsigned long count,
+ unsigned long *percpu_stat,
+ atomic_long_t *stat)
+{
+ unsigned long val = *percpu_stat + count;
+
+ if (unlikely(val > RECLAIM_STAT_BATCH)) {
+ atomic_long_add(val, stat);
+ val = 0;
+ }
+ *percpu_stat = val;
+}
+
+static inline void update_reclaim_stat_scanned(struct lruvec *lruvec, int file,
+ unsigned long count)
+{
+ struct zone_reclaim_stat_cpu __percpu *percpu_stat =
+ this_cpu_ptr(lruvec->reclaim_stat_cpu);
+
+ __update_page_reclaim_stat(count, &percpu_stat->recent_scanned[file],
+ &lruvec->reclaim_stat.recent_scanned[file]);
+}
+
+static inline void update_reclaim_stat_rotated(struct lruvec *lruvec, int file,
+ unsigned long count)
+{
+ struct zone_reclaim_stat_cpu __percpu *percpu_stat =
+ this_cpu_ptr(lruvec->reclaim_stat_cpu);
+
+ __update_page_reclaim_stat(count, &percpu_stat->recent_rotated[file],
+ &lruvec->reclaim_stat.recent_rotated[file]);
+}
+
/* Mask used at gathering information at once (see memcontrol.c) */
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
@@ -795,6 +843,8 @@ extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn
unsigned long size);

extern void lruvec_init(struct lruvec *lruvec);
+extern void lruvec_init_late(struct lruvec *lruvec);
+extern void lruvecs_init_late(void);

static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
{
diff --git a/init/main.c b/init/main.c
index 3b4ada11ed52..80ad02fe99de 100644
--- a/init/main.c
+++ b/init/main.c
@@ -526,6 +526,7 @@ static void __init mm_init(void)
init_espfix_bsp();
/* Should be run after espfix64 is set up. */
pti_init();
+ lruvecs_init_late();
}

asmlinkage __visible void __init start_kernel(void)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5463ad160e10..f7f9682482cd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3152,22 +3152,22 @@ static int memcg_stat_show(struct seq_file *m, void *v)
pg_data_t *pgdat;
struct mem_cgroup_per_node *mz;
struct zone_reclaim_stat *rstat;
- unsigned long recent_rotated[2] = {0, 0};
- unsigned long recent_scanned[2] = {0, 0};
+ unsigned long rota[2] = {0, 0};
+ unsigned long scan[2] = {0, 0};

for_each_online_pgdat(pgdat) {
mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
rstat = &mz->lruvec.reclaim_stat;

- recent_rotated[0] += rstat->recent_rotated[0];
- recent_rotated[1] += rstat->recent_rotated[1];
- recent_scanned[0] += rstat->recent_scanned[0];
- recent_scanned[1] += rstat->recent_scanned[1];
+ rota[0] += atomic_long_read(&rstat->recent_rotated[0]);
+ rota[1] += atomic_long_read(&rstat->recent_rotated[1]);
+ scan[0] += atomic_long_read(&rstat->recent_scanned[0]);
+ scan[1] += atomic_long_read(&rstat->recent_scanned[1]);
}
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
+ seq_printf(m, "recent_rotated_anon %lu\n", rota[0]);
+ seq_printf(m, "recent_rotated_file %lu\n", rota[1]);
+ seq_printf(m, "recent_scanned_anon %lu\n", scan[0]);
+ seq_printf(m, "recent_scanned_file %lu\n", scan[1]);
}
#endif

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 25982467800b..d3ebb11c3f9f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1009,6 +1009,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_node(nid, zones_size, start_pfn, zholes_size);
pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
+ lruvec_init_late(node_lruvec(pgdat));

/*
* The node we allocated has no zone fallback lists. For avoiding
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc23bb9..090cd4f7effb 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -9,6 +9,7 @@
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
+#include <linux/percpu.h>

struct pglist_data *first_online_pgdat(void)
{
@@ -96,6 +97,19 @@ void lruvec_init(struct lruvec *lruvec)
INIT_LIST_HEAD(&lruvec->lists[lru]);
}

+void lruvec_init_late(struct lruvec *lruvec)
+{
+ lruvec->reclaim_stat_cpu = alloc_percpu(struct zone_reclaim_stat_cpu);
+}
+
+void lruvecs_init_late(void)
+{
+ pg_data_t *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ lruvec_init_late(node_lruvec(pgdat));
+}
+
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
int page_cpupid_xchg_last(struct page *page, int cpupid)
{
diff --git a/mm/swap.c b/mm/swap.c
index 3dd518832096..219c234d632f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,6 +34,7 @@
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
+#include <linux/mmzone.h>

#include "internal.h"

@@ -260,14 +261,19 @@ void rotate_reclaimable_page(struct page *page)
}
}

+/*
+ * Updates page reclaim statistics using per-cpu counters that spill into
+ * atomics above a threshold.
+ *
+ * Assumes that the caller has disabled preemption. IRQs may be enabled
+ * because this function is not called from irq context.
+ */
static void update_page_reclaim_stat(struct lruvec *lruvec,
int file, int rotated)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
-
- reclaim_stat->recent_scanned[file]++;
+ update_reclaim_stat_scanned(lruvec, file, 1);
if (rotated)
- reclaim_stat->recent_rotated[file]++;
+ update_reclaim_stat_rotated(lruvec, file, 1);
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9270a4370d54..730b6d0c6c61 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1655,7 +1655,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
static noinline_for_stack void
putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
LIST_HEAD(pages_to_free);

@@ -1684,7 +1683,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
- reclaim_stat->recent_rotated[file] += numpages;
+ update_reclaim_stat_rotated(lruvec, file, numpages);
}
if (put_page_testzero(page)) {
__ClearPageLRU(page);
@@ -1736,7 +1735,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
bool stalled = false;

while (unlikely(too_many_isolated(pgdat, file, sc))) {
@@ -1763,7 +1761,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
&nr_scanned, sc, isolate_mode, lru);

__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- reclaim_stat->recent_scanned[file] += nr_taken;
+ update_reclaim_stat_scanned(lruvec, file, nr_taken);

if (current_is_kswapd()) {
if (global_reclaim(sc))
@@ -1914,7 +1912,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
isolate_mode_t isolate_mode = 0;
@@ -1932,7 +1929,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
&nr_scanned, sc, isolate_mode, lru);

__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- reclaim_stat->recent_scanned[file] += nr_taken;
+ update_reclaim_stat_scanned(lruvec, file, nr_taken);

__count_vm_events(PGREFILL, nr_scanned);
count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
@@ -1989,7 +1986,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* helps balance scan pressure between file and anonymous pages in
* get_scan_count.
*/
- reclaim_stat->recent_rotated[file] += nr_rotated;
+ update_reclaim_stat_rotated(lruvec, file, nr_rotated);

nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
@@ -2116,7 +2113,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
unsigned long *lru_pages)
{
int swappiness = mem_cgroup_swappiness(memcg);
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ struct zone_reclaim_stat *rstat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2125,6 +2122,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
unsigned long anon, file;
unsigned long ap, fp;
enum lru_list lru;
+ long recent_scanned[2], recent_rotated[2];

/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
@@ -2238,14 +2236,22 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);

spin_lock_irq(&pgdat->lru_lock);
- if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
- reclaim_stat->recent_scanned[0] /= 2;
- reclaim_stat->recent_rotated[0] /= 2;
+ recent_scanned[0] = atomic_long_read(&rstat->recent_scanned[0]);
+ recent_rotated[0] = atomic_long_read(&rstat->recent_rotated[0]);
+ if (unlikely(recent_scanned[0] > anon / 4)) {
+ recent_scanned[0] /= 2;
+ recent_rotated[0] /= 2;
+ atomic_long_set(&rstat->recent_scanned[0], recent_scanned[0]);
+ atomic_long_set(&rstat->recent_rotated[0], recent_rotated[0]);
}

- if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
- reclaim_stat->recent_scanned[1] /= 2;
- reclaim_stat->recent_rotated[1] /= 2;
+ recent_scanned[1] = atomic_long_read(&rstat->recent_scanned[1]);
+ recent_rotated[1] = atomic_long_read(&rstat->recent_rotated[1]);
+ if (unlikely(recent_scanned[1] > file / 4)) {
+ recent_scanned[1] /= 2;
+ recent_rotated[1] /= 2;
+ atomic_long_set(&rstat->recent_scanned[1], recent_scanned[1]);
+ atomic_long_set(&rstat->recent_rotated[1], recent_rotated[1]);
}

/*
@@ -2253,11 +2259,11 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
*/
- ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
- ap /= reclaim_stat->recent_rotated[0] + 1;
+ ap = anon_prio * (recent_scanned[0] + 1);
+ ap /= recent_rotated[0] + 1;

- fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
- fp /= reclaim_stat->recent_rotated[1] + 1;
+ fp = file_prio * (recent_scanned[1] + 1);
+ fp /= recent_rotated[1] + 1;
spin_unlock_irq(&pgdat->lru_lock);

fraction[0] = ap;
--
2.18.0