[PATCH] mm: vmscan: Replace shrinker_rwsem trylocks with SRCU protection

From: Sultan Alsawaf
Date: Mon Sep 27 2021 - 03:48:45 EST


From: Sultan Alsawaf <sultan@xxxxxxxxxxxxxxx>

The shrinker rwsem is problematic because the actual shrinking path must
back off when contention appears, causing some or all shrinkers to be
skipped. This can be especially bad when shrinkers are frequently
registered and unregistered. A high frequency of shrinker registrations/
unregistrations can effectively DoS the shrinker mechanism, rendering it
useless.

Using SRCU to protect iteration through the shrinker list and idr
eliminates this issue entirely. Now, shrinking can happen concurrently
with shrinker registrations/unregistrations.

Signed-off-by: Sultan Alsawaf <sultan@xxxxxxxxxxxxxxx>
---
mm/vmscan.c | 68 ++++++++++++++++++++++++++++-------------------------
1 file changed, 36 insertions(+), 32 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 74296c2d1fed..3dea927be5ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -190,6 +190,7 @@ static void set_task_reclaim_state(struct task_struct *task,

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+DEFINE_STATIC_SRCU(shrinker_srcu);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;
@@ -212,6 +213,20 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
lockdep_is_held(&shrinker_rwsem));
}

+static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
+ int nid)
+{
+ return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
+ &shrinker_srcu);
+}
+
+static void free_shrinker_info_srcu(struct rcu_head *head)
+{
+ struct shrinker_info *info = container_of(head, typeof(*info), rcu);
+
+ kvfree_rcu(info, rcu);
+}
+
static int expand_one_shrinker_info(struct mem_cgroup *memcg,
int map_size, int defer_size,
int old_map_size, int old_defer_size)
@@ -244,7 +259,12 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
defer_size - old_defer_size);

rcu_assign_pointer(pn->shrinker_info, new);
- kvfree_rcu(old, rcu);
+
+ /*
+ * Shrinker info is used under both SRCU and regular RCU, so
+ * synchronize the free against both of them.
+ */
+ call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_srcu);
}

return 0;
@@ -357,7 +377,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
return -ENOSYS;

down_write(&shrinker_rwsem);
- /* This may call shrinker, so it must use down_read_trylock() */
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
@@ -391,7 +410,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;

- info = shrinker_info_protected(memcg, nid);
+ info = shrinker_info_srcu(memcg, nid);
return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}

@@ -400,7 +419,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;

- info = shrinker_info_protected(memcg, nid);
+ info = shrinker_info_srcu(memcg, nid);
return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}

@@ -641,6 +660,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
down_write(&shrinker_rwsem);
unregister_memcg_shrinker(shrinker);
up_write(&shrinker_rwsem);
+ synchronize_srcu(&shrinker_srcu);
return;
}

@@ -651,7 +671,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
void register_shrinker_prepared(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
- list_add_tail(&shrinker->list, &shrinker_list);
+ list_add_tail_rcu(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED;
up_write(&shrinker_rwsem);
}
@@ -676,11 +696,12 @@ void unregister_shrinker(struct shrinker *shrinker)
return;

down_write(&shrinker_rwsem);
- list_del(&shrinker->list);
+ list_del_rcu(&shrinker->list);
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
up_write(&shrinker_rwsem);
+ synchronize_srcu(&shrinker_srcu);

kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
@@ -792,15 +813,13 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
- int i;
+ int i, srcu_idx;

if (!mem_cgroup_online(memcg))
return 0;

- if (!down_read_trylock(&shrinker_rwsem))
- return 0;
-
- info = shrinker_info_protected(memcg, nid);
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
+ info = shrinker_info_srcu(memcg, nid);
if (unlikely(!info))
goto unlock;

@@ -850,14 +869,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
set_shrinker_bit(memcg, nid, i);
}
freed += ret;
-
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
}
unlock:
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
return freed;
}
#else /* CONFIG_MEMCG */
@@ -894,6 +908,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
{
unsigned long ret, freed = 0;
struct shrinker *shrinker;
+ int srcu_idx;

/*
* The root memcg might be allocated even though memcg is disabled
@@ -905,10 +920,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

- if (!down_read_trylock(&shrinker_rwsem))
- goto out;
-
- list_for_each_entry(shrinker, &shrinker_list, list) {
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
+ list_for_each_entry_srcu(shrinker, &shrinker_list, list,
+ srcu_read_lock_held(&shrinker_srcu)) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -919,19 +933,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (ret == SHRINK_EMPTY)
ret = 0;
freed += ret;
- /*
- * Bail out if someone want to register a new shrinker to
- * prevent the registration from being stalled for long periods
- * by parallel ongoing shrinking.
- */
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
}
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);

- up_read(&shrinker_rwsem);
-out:
cond_resched();
return freed;
}
--
2.33.0