[PATCH RFC] mm/madvise: implement MADV_STOCKPILE (kswapd from user space)

From: Konstantin Khlebnikov
Date: Mon May 27 2019 - 06:09:18 EST


Memory cgroups have no background memory reclaimer. Reclaim after exceeding
the high limit blocks the task because it runs synchronously in task-work.

This implements manual kswapd-style memory reclaim initiated by userspace.
It reclaims both physical memory and cgroup pages. It runs in the context of
the task that calls madvise(), so CPU time is accounted correctly.

Interface:

ret = madvise(ptr, size, MADV_STOCKPILE)

Returns:
0 - ok, free memory >= size
-EINVAL - not supported
-ENOMEM - not enough memory/cgroup limit
-EINTR - interrupted by pending signal
-EAGAIN - cannot reclaim enough memory

Argument 'size' is interpreted as the required amount of free memory.
The implementation triggers direct reclaim while the amount of free memory
is lower than that size. Argument 'ptr' could point to a vma to specify the
NUMA allocation policy; for now it must be NULL.

Usage scenario: an independent thread or standalone daemon estimates the
rate of allocations and calls MADV_STOCKPILE in a loop to prepare free pages.
Thus the fast path avoids the allocation latency induced by direct reclaim.

We use this embedded in a memory allocator based on MADV_FREE.


Demonstration in memory cgroup with limit 1G:

touch zero
truncate -s 5G zero

Without stockpile:

perf stat -e vmscan:* md5sum zero

Performance counter stats for 'md5sum zero':

0 vmscan:mm_vmscan_kswapd_sleep
0 vmscan:mm_vmscan_kswapd_wake
0 vmscan:mm_vmscan_wakeup_kswapd
0 vmscan:mm_vmscan_direct_reclaim_begin
10147 vmscan:mm_vmscan_memcg_reclaim_begin
0 vmscan:mm_vmscan_memcg_softlimit_reclaim_begin
0 vmscan:mm_vmscan_direct_reclaim_end
10147 vmscan:mm_vmscan_memcg_reclaim_end
0 vmscan:mm_vmscan_memcg_softlimit_reclaim_end
99910 vmscan:mm_shrink_slab_start
99910 vmscan:mm_shrink_slab_end
39654 vmscan:mm_vmscan_lru_isolate
0 vmscan:mm_vmscan_writepage
39652 vmscan:mm_vmscan_lru_shrink_inactive
2 vmscan:mm_vmscan_lru_shrink_active
19982 vmscan:mm_vmscan_inactive_list_is_low

10.886832585 seconds time elapsed

8.928366000 seconds user
1.935212000 seconds sys

With stockpile:

stockpile 100 10 & # up to 100M every 10ms
perf stat -e vmscan:* md5sum zero

Performance counter stats for 'md5sum zero':

0 vmscan:mm_vmscan_kswapd_sleep
0 vmscan:mm_vmscan_kswapd_wake
0 vmscan:mm_vmscan_wakeup_kswapd
0 vmscan:mm_vmscan_direct_reclaim_begin
0 vmscan:mm_vmscan_memcg_reclaim_begin
0 vmscan:mm_vmscan_memcg_softlimit_reclaim_begin
0 vmscan:mm_vmscan_direct_reclaim_end
0 vmscan:mm_vmscan_memcg_reclaim_end
0 vmscan:mm_vmscan_memcg_softlimit_reclaim_end
0 vmscan:mm_shrink_slab_start
0 vmscan:mm_shrink_slab_end
0 vmscan:mm_vmscan_lru_isolate
0 vmscan:mm_vmscan_writepage
0 vmscan:mm_vmscan_lru_shrink_inactive
0 vmscan:mm_vmscan_lru_shrink_active
0 vmscan:mm_vmscan_inactive_list_is_low

10.469776675 seconds time elapsed

8.976261000 seconds user
1.491378000 seconds sys

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>
---
include/linux/memcontrol.h | 6 +++++
include/uapi/asm-generic/mman-common.h | 2 ++
mm/madvise.c | 39 ++++++++++++++++++++++++++++++
mm/memcontrol.c | 41 ++++++++++++++++++++++++++++++++
tools/vm/Makefile | 2 +-
tools/vm/stockpile.c | 30 +++++++++++++++++++++++
6 files changed, 119 insertions(+), 1 deletion(-)
create mode 100644 tools/vm/stockpile.c

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bc74d6a4407c..25325f18ad55 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -517,6 +517,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
}

void mem_cgroup_handle_over_high(void);
+int mem_cgroup_stockpile(unsigned long goal_pages);

unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);

@@ -968,6 +969,11 @@ static inline void mem_cgroup_handle_over_high(void)
{
}

+static inline int mem_cgroup_stockpile(unsigned long goal_pages)
+{
+	return 0;	/* stub: no memcg limits to satisfy, report success */
+}
+
static inline void mem_cgroup_enter_user_fault(void)
{
}
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index abd238d0f7a4..675145864fee 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -64,6 +64,8 @@
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */

+#define MADV_STOCKPILE 20 /* stockpile free pages */
+
/* compatibility flags */
#define MAP_FILE 0

diff --git a/mm/madvise.c b/mm/madvise.c
index 628022e674a7..f908b08ecc9f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -686,6 +686,41 @@ static int madvise_inject_error(int behavior,
}
#endif

+/* Reclaim until 'len' bytes are free above the reserves; 'start' must be 0. */
+static long madvise_stockpile(unsigned long start, size_t len)
+{
+	unsigned long nr_wanted = len >> PAGE_SHIFT;
+	struct zonelist *zl;
+	int err;
+
+	/* No vma-based placement yet, so only a NULL pointer is accepted. */
+	if (start != 0)
+		return -EINVAL;
+
+	/* A goal above all non-reserved RAM can never be satisfied. */
+	if (nr_wanted > totalram_pages() - totalreserve_pages)
+		return -ENOMEM;
+
+	/* Satisfy the memory cgroup side before looking at global memory. */
+	err = mem_cgroup_stockpile(nr_wanted);
+	if (err != 0)
+		return err;
+
+	/* TODO: use vma mempolicy */
+	zl = node_zonelist(numa_node_id(), GFP_HIGHUSER);
+
+	for (;;) {
+		if (global_zone_page_state(NR_FREE_PAGES) >=
+		    nr_wanted + totalreserve_pages)
+			return 0;
+		if (signal_pending(current))
+			return -EINTR;
+		/* A reclaim pass that frees nothing means we cannot get there. */
+		if (!try_to_free_pages(zl, 0, GFP_HIGHUSER, NULL))
+			return -EAGAIN;
+	}
+}
+
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
@@ -728,6 +763,7 @@ madvise_behavior_valid(int behavior)
case MADV_DODUMP:
case MADV_WIPEONFORK:
case MADV_KEEPONFORK:
+ case MADV_STOCKPILE:
#ifdef CONFIG_MEMORY_FAILURE
case MADV_SOFT_OFFLINE:
case MADV_HWPOISON:
@@ -834,6 +870,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
return madvise_inject_error(behavior, start, start + len_in);
#endif

+ if (behavior == MADV_STOCKPILE)
+ return madvise_stockpile(start, len);
+
write = madvise_need_mmap_write(behavior);
if (write) {
if (down_write_killable(&current->mm->mmap_sem))
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e50a2db5b4ff..dc23dc6bbeb3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2276,6 +2276,47 @@ void mem_cgroup_handle_over_high(void)
current->memcg_nr_pages_over_high = 0;
}

+/* Reclaim in current's memcg and all ancestors until goal_pages fit in each. */
+int mem_cgroup_stockpile(unsigned long goal_pages)
+{
+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	unsigned long limit, nr_free, progress;
+	struct mem_cgroup *memcg, *pos;
+	int ret = 0;
+
+	/* get_mem_cgroup_from_mm() yields NULL when memcg is disabled at boot. */
+	pos = memcg = get_mem_cgroup_from_mm(current->mm);
+	if (!memcg)
+		return 0;
+retry:
+	if (signal_pending(current)) {
+		ret = -EINTR;
+		goto out;
+	}
+	limit = min(pos->memory.max, pos->high);
+	if (goal_pages > limit) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	/* nr_free wraps if usage is above the limit; the signed compare catches it. */
+	nr_free = limit - page_counter_read(&pos->memory);
+	if ((long)nr_free < (long)goal_pages) {
+		progress = try_to_free_mem_cgroup_pages(pos,
+				goal_pages - nr_free, GFP_HIGHUSER, true);
+		if (progress || nr_retries--)
+			goto retry;
+		ret = -EAGAIN;
+		goto out;
+	}
+	nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	pos = parent_mem_cgroup(pos);
+	if (pos)
+		goto retry;
+out:
+	css_put(&memcg->css);
+	return ret;
+}
+
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
diff --git a/tools/vm/Makefile b/tools/vm/Makefile
index 20f6cf04377f..e5b5bc0d9421 100644
--- a/tools/vm/Makefile
+++ b/tools/vm/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
# Makefile for vm tools
#
-TARGETS=page-types slabinfo page_owner_sort
+TARGETS=page-types slabinfo page_owner_sort stockpile

LIB_DIR = ../lib/api
LIBS = $(LIB_DIR)/libapi.a
diff --git a/tools/vm/stockpile.c b/tools/vm/stockpile.c
new file mode 100644
index 000000000000..245e24f293ec
--- /dev/null
+++ b/tools/vm/stockpile.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <err.h>
+#include <errno.h>
+
+#ifndef MADV_STOCKPILE
+# define MADV_STOCKPILE 20
+#endif
+
+int main(int argc, char **argv)
+{
+	unsigned long mb, ms;
+	char *end1, *end2;
+
+	if (argc != 3)
+		errx(1, "usage: %s <size_mb> <interval_ms>", argv[0]);
+	/* Parse strictly: atol()/atoi() silently took garbage and negatives. */
+	mb = strtoul(argv[1], &end1, 10);
+	ms = strtoul(argv[2], &end2, 10);
+	if (*end1 || *end2 || mb > ((size_t)-1 >> 20) || ms > ((useconds_t)-1 / 1000))
+		errx(1, "usage: %s <size_mb> <interval_ms>", argv[0]);
+
+	while (1) {	/* ask for mb MiB of free memory every ms milliseconds */
+		if (madvise(NULL, mb << 20, MADV_STOCKPILE) && errno != EAGAIN)
+			err(2, "madvise(NULL, %zu, MADV_STOCKPILE)", (size_t)mb << 20);
+		usleep(ms * 1000);
+	}
+}