[PATCH] mm: allow to decrease swap.max below actual swap usage

From: Roman Gushchin
Date: Thu Apr 12 2018 - 09:28:05 EST


Currently an attempt to set swap.max into a value lower
than the actual swap usage fails. And a user can't do much
with it, except turning off swap globally (using swapoff).

This is an actual problem we met in the production.
The default value for swap.max is "max", so turning swap on
globally may lead to swapping out in some cgroups.
As it's not possible to lower the allowed maximum,
it creates additional configuration issues.

For example, to disable swapping for a particular cgroup
it has to be re-created with memory.max set to 0.

This patch aims to fix this issue by allowing setting swap.max
into any value (which corresponds to cgroup v2 API design),
and schedule a background job to fit swap size into the new limit.

The following script can be used to test the memory.swap behavior:
#!/bin/bash

mkdir -p /sys/fs/cgroup/test_swap
echo 100M > /sys/fs/cgroup/test_swap/memory.max
echo max > /sys/fs/cgroup/test_swap/memory.swap.max

mkdir -p /sys/fs/cgroup/test_swap_2
echo 100M > /sys/fs/cgroup/test_swap_2/memory.max
echo max > /sys/fs/cgroup/test_swap_2/memory.swap.max

echo $$ > /sys/fs/cgroup/test_swap/cgroup.procs
allocate 200M &

echo $$ > /sys/fs/cgroup/test_swap_2/cgroup.procs
allocate 200M &

sleep 2

cat /sys/fs/cgroup/test_swap/memory.swap.current
cat /sys/fs/cgroup/test_swap_2/memory.swap.current

echo max > /sys/fs/cgroup/test_swap/memory.max
echo 50M > /sys/fs/cgroup/test_swap/memory.swap.max

sleep 10

cat /sys/fs/cgroup/test_swap/memory.swap.current
cat /sys/fs/cgroup/test_swap_2/memory.swap.current

pkill allocate

Original test results:
106024960
106348544
./swap.sh: line 23: echo: write error: Device or resource busy
106024960
106348544

With this patch applied:
106045440
106352640
52428800
106201088

Signed-off-by: Roman Gushchin <guro@xxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxxxx>
Cc: Shaohua Li <shli@xxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
Cc: linux-mm@xxxxxxxxx
Cc: cgroups@xxxxxxxxxxxxxxx
---
include/linux/memcontrol.h | 1 +
include/linux/swap.h | 9 +++++++
include/linux/swapfile.h | 3 ++-
mm/frontswap.c | 2 +-
mm/memcontrol.c | 27 +++++++++++++++----
mm/swapfile.c | 64 ++++++++++++++++++++++++++++++++++++++++++----
6 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index af9eed2e3e04..a0825cc61ee7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -186,6 +186,7 @@ struct mem_cgroup {

/* Range enforcement for interrupt charges */
struct work_struct high_work;
+ struct work_struct swap_work;

unsigned long soft_limit;

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1985940af479..878f111d0603 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -650,6 +650,8 @@ extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct page *page);
+extern int mem_cgroup_shrink_swap(struct mem_cgroup *memcg,
+ unsigned long nr_pages);
#else
static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
@@ -675,6 +677,13 @@ static inline bool mem_cgroup_swap_full(struct page *page)
{
return vm_swap_full();
}
+
+static inline int mem_cgroup_shrink_swap(struct mem_cgroup *memcg,
+ unsigned long nr_pages)
+{
+ return 0;
+}
+
#endif

#endif /* __KERNEL__*/
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 06bd7b096167..16844259e802 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -9,6 +9,7 @@
extern spinlock_t swap_lock;
extern struct plist_head swap_active_head;
extern struct swap_info_struct *swap_info[];
-extern int try_to_unuse(unsigned int, bool, unsigned long);
+extern int try_to_unuse(unsigned int type, bool fronstswap,
+ unsigned long pages_to_unuse, struct mem_cgroup *memcg);

#endif /* _LINUX_SWAPFILE_H */
diff --git a/mm/frontswap.c b/mm/frontswap.c
index fec8b5044040..f7cb2e802fce 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -458,7 +458,7 @@ void frontswap_shrink(unsigned long target_pages)
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
spin_unlock(&swap_lock);
if (ret == 0)
- try_to_unuse(type, true, pages_to_unuse);
+ try_to_unuse(type, true, pages_to_unuse, NULL);
return;
}
EXPORT_SYMBOL(frontswap_shrink);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3e7942c301a8..bacb1fd17100 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -60,6 +60,7 @@
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
+#include <linux/swapfile.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
@@ -1870,6 +1871,23 @@ static void high_work_func(struct work_struct *work)
reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}

+static void swap_work_func(struct work_struct *work)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = container_of(work, struct mem_cgroup, swap_work);
+
+ for (;;) {
+ unsigned long usage = page_counter_read(&memcg->swap);
+
+ if (usage <= memcg->swap.limit)
+ break;
+
+ if (mem_cgroup_shrink_swap(memcg, usage - memcg->swap.limit))
+ break;
+ }
+}
+
/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
@@ -4394,6 +4412,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;

INIT_WORK(&memcg->high_work, high_work_func);
+ INIT_WORK(&memcg->swap_work, swap_work_func);
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
mutex_init(&memcg->thresholds_lock);
@@ -4529,6 +4548,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)

vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
+ cancel_work_sync(&memcg->swap_work);
mem_cgroup_remove_from_trees(memcg);
memcg_free_kmem(memcg);
mem_cgroup_free(memcg);
@@ -6415,11 +6435,8 @@ static ssize_t swap_max_write(struct kernfs_open_file *of,
if (err)
return err;

- mutex_lock(&memcg_limit_mutex);
- err = page_counter_limit(&memcg->swap, max);
- mutex_unlock(&memcg_limit_mutex);
- if (err)
- return err;
+ xchg(&memcg->swap.limit, max);
+ schedule_work(&memcg->swap_work);

return nbytes;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5a280972bd87..45ef9858645b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2093,11 +2093,11 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
*
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
- * pages_to_unuse==0 means all pages; ignored if frontswap is false
+ * Only unuse pages_to_unuse pages; pages_to_unuse==0 means all pages.
*/
int try_to_unuse(unsigned int type, bool frontswap,
- unsigned long pages_to_unuse)
+ unsigned long pages_to_unuse,
+ struct mem_cgroup *memcg)
{
struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
@@ -2192,6 +2192,17 @@ int try_to_unuse(unsigned int type, bool frontswap,
lock_page(page);
wait_on_page_writeback(page);

+ if (memcg && do_swap_account) {
+ swp_entry_t ent = { .val = page_private(page), };
+ unsigned short id = lookup_swap_cgroup_id(ent);
+
+ if (memcg != mem_cgroup_from_id(id)) {
+ unlock_page(page);
+ put_page(page);
+ continue;
+ }
+ }
+
/*
* Remove all references to entry.
*/
@@ -2310,7 +2321,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
* interactive performance.
*/
cond_resched();
- if (frontswap && pages_to_unuse > 0) {
+ if (pages_to_unuse > 0) {
if (!--pages_to_unuse)
break;
}
@@ -2618,6 +2629,49 @@ bool has_usable_swap(void)
return ret;
}

+int mem_cgroup_shrink_swap(struct mem_cgroup *memcg, unsigned long nr_pages)
+{
+ struct swap_info_struct *p = NULL;
+ unsigned long to_shrink;
+ int err;
+
+ spin_lock(&swap_lock);
+ plist_for_each_entry(p, &swap_active_head, list) {
+ if (!(p->flags & SWP_WRITEOK))
+ continue;
+
+ to_shrink = min(512UL, nr_pages);
+
+ del_from_avail_list(p);
+ spin_lock(&p->lock);
+ plist_del(&p->list, &swap_active_head);
+ p->flags &= ~SWP_WRITEOK;
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+
+ disable_swap_slots_cache_lock();
+
+ set_current_oom_origin();
+ err = try_to_unuse(p->type, false, to_shrink, memcg);
+ clear_current_oom_origin();
+
+ reinsert_swap_info(p);
+ reenable_swap_slots_cache_unlock();
+
+ if (err)
+ return err;
+
+ nr_pages -= to_shrink;
+ if (!nr_pages)
+ return err;
+
+ spin_lock(&swap_lock);
+ }
+ spin_unlock(&swap_lock);
+
+ return 0;
+}
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -2693,7 +2747,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
disable_swap_slots_cache_lock();

set_current_oom_origin();
- err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+ err = try_to_unuse(p->type, false, 0, NULL); /* force unuse all pages */
clear_current_oom_origin();

if (err) {
--
2.14.3