Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless
From: Kirill Tkhai
Date: Tue Feb 21 2023 - 16:43:19 EST
On 20.02.2023 12:16, Qi Zheng wrote:
> Like global slab shrink, since commit 1cd0bd06093c
> ("rcu: Remove CONFIG_SRCU"), it's time to use SRCU
> to protect readers who previously held shrinker_rwsem.
>
> We can test with the following script:
>
> ```
> DIR="/root/shrinker/memcg/mnt"
>
> do_create()
> {
> mkdir /sys/fs/cgroup/memory/test
> echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> for i in `seq 0 $1`;
> do
> mkdir /sys/fs/cgroup/memory/test/$i;
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> mkdir -p $DIR/$i;
> done
> }
>
> do_mount()
> {
> for i in `seq $1 $2`;
> do
> mount -t tmpfs $i $DIR/$i;
> done
> }
>
> do_touch()
> {
> for i in `seq $1 $2`;
> do
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
> done
> }
>
> do_create 2000
> do_mount 0 2000
> do_touch 0 1000
> ```
>
> Before applying:
>
> 46.60% [kernel] [k] down_read_trylock
> 18.70% [kernel] [k] up_read
> 15.44% [kernel] [k] shrink_slab
> 4.37% [kernel] [k] _find_next_bit
> 2.75% [kernel] [k] xa_load
> 2.07% [kernel] [k] idr_find
> 1.73% [kernel] [k] do_shrink_slab
> 1.42% [kernel] [k] shrink_lruvec
> 0.74% [kernel] [k] shrink_node
> 0.60% [kernel] [k] list_lru_count_one
>
> After applying:
>
> 19.53% [kernel] [k] _find_next_bit
> 14.63% [kernel] [k] do_shrink_slab
> 14.58% [kernel] [k] shrink_slab
> 11.83% [kernel] [k] shrink_lruvec
> 9.33% [kernel] [k] __blk_flush_plug
> 6.67% [kernel] [k] mem_cgroup_iter
> 3.73% [kernel] [k] list_lru_count_one
> 2.43% [kernel] [k] shrink_node
> 1.96% [kernel] [k] super_cache_count
> 1.78% [kernel] [k] __rcu_read_unlock
> 1.38% [kernel] [k] __srcu_read_lock
> 1.30% [kernel] [k] xas_descend
>
> We can see that the readers are no longer blocked.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
> ---
> mm/vmscan.c | 56 ++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 95a3d6ddc6c1..dc47396ecd0e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -57,6 +57,7 @@
> #include <linux/khugepaged.h>
> #include <linux/rculist_nulls.h>
> #include <linux/random.h>
> +#include <linux/srcu.h>
>
> #include <asm/tlbflush.h>
> #include <asm/div64.h>
> @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
> static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
> int nid)
> {
> - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
> - lockdep_is_held(&shrinker_rwsem));
> + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu,
> + lockdep_is_held(&shrinker_rwsem));
> +}
> +
> +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
> + int nid)
> +{
> + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu);
> +}
> +
> +static void free_shrinker_info_rcu(struct rcu_head *head)
> +{
> + kvfree(container_of(head, struct shrinker_info, rcu));
> }
>
> static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> @@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> defer_size - old_defer_size);
>
> rcu_assign_pointer(pn->shrinker_info, new);
> - kvfree_rcu(old, rcu);
> + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
> }
>
> return 0;
> @@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
> {
> if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
> struct shrinker_info *info;
> + int srcu_idx;
>
> - rcu_read_lock();
> - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> /* Pairs with smp mb in shrink_slab() */
> smp_mb__before_atomic();
> set_bit(shrinker_id, info->map);
> - rcu_read_unlock();
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
> }
>
> @@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> return -ENOSYS;
>
> down_write(&shrinker_rwsem);
> - /* This may call shrinker, so it must use down_read_trylock() */
> id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
> if (id < 0)
> goto unlock;
> @@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
> }
>
> @@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
> }
>
> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> {
> - int i, nid;
> + int i, nid, srcu_idx;
> long nr;
> struct mem_cgroup *parent;
> struct shrinker_info *child_info, *parent_info;
> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> parent = root_mem_cgroup;
>
> /* Prevent from concurrent shrinker_info expand */
> - down_read(&shrinker_rwsem);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> for_each_node(nid) {
> - child_info = shrinker_info_protected(memcg, nid);
> - parent_info = shrinker_info_protected(parent, nid);
> + child_info = shrinker_info_srcu(memcg, nid);
> + parent_info = shrinker_info_srcu(parent, nid);
> for (i = 0; i < shrinker_nr_max; i++) {
> nr = atomic_long_read(&child_info->nr_deferred[i]);
> atomic_long_add(nr, &parent_info->nr_deferred[i]);
> }
> }
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
>
> static bool cgroup_reclaim(struct scan_control *sc)
> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> {
> struct shrinker_info *info;
> unsigned long ret, freed = 0;
> + int srcu_idx;
> int i;
>
> if (!mem_cgroup_online(memcg))
> return 0;
>
> - if (!down_read_trylock(&shrinker_rwsem))
> - return 0;
> -
> - info = shrinker_info_protected(memcg, nid);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> if (unlikely(!info))
> goto unlock;
There is a use of shrinker_nr_max just below this hunk, in code that the patch does not touch:
for_each_set_bit(i, info->map, shrinker_nr_max) {
Since the reader no longer holds shrinker_rwsem, shrinker_nr_max may grow in parallel while we are still looking at an older, smaller info, so this can lead to an access beyond the memory allocated for info->map :(
It looks like we should save the size of info->map as a new member of struct shrinker_info.
>
> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> set_shrinker_bit(memcg, nid, i);
> }
> freed += ret;
> -
> - if (rwsem_is_contended(&shrinker_rwsem)) {
> - freed = freed ? : 1;
> - break;
> - }
> }
> unlock:
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> return freed;
> }
> #else /* CONFIG_MEMCG */