Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking

From: Chengming Zhou
Date: Tue Mar 12 2024 - 00:55:58 EST


On 2024/3/12 00:12, Johannes Weiner wrote:
> Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> total cycles currently going into updating the zswap_pool_total_size.
>
> There are three consumers of this counter:
> - store, to enforce the globally configured pool limit
> - meminfo & debugfs, to report the size to the user
> - shrink, to determine the batch size for each cycle
>
> Instead of aggregating everytime an entry enters or exits the zswap
> pool, aggregate the value from the zpools on-demand:
>
> - Stores aggregate the counter anyway upon success. Aggregating to
> check the limit instead is the same amount of work.
>
> - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> counter, but aren't exactly hotpaths.
>
> - Shrinking can aggregate once for every cycle instead of doing it for
> every freed entry. As the shrinker might work on tens or hundreds of
> objects per scan cycle, this is a large reduction in aggregations.
>
> The paths that benefit dramatically are swapin, swapoff, and
> unmaps. There could be millions of pages being processed until
> somebody asks for the pool size again. This eliminates the pool size
> updates from those paths entirely.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

Great! This is a clever simplification and optimization.

With the incremental diff, feel free to add:

Reviewed-by: Chengming Zhou <chengming.zhou@xxxxxxxxx>

Thanks.

> ---
> fs/proc/meminfo.c | 3 +-
> include/linux/zswap.h | 2 +-
> mm/zswap.c | 98 +++++++++++++++++++++----------------------
> 3 files changed, 49 insertions(+), 54 deletions(-)
>
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 45af9a989d40..245171d9164b 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "SwapTotal: ", i.totalswap);
> show_val_kb(m, "SwapFree: ", i.freeswap);
> #ifdef CONFIG_ZSWAP
> - seq_printf(m, "Zswap: %8lu kB\n",
> - (unsigned long)(zswap_pool_total_size >> 10));
> + show_val_kb(m, "Zswap: ", zswap_total_pages());
> seq_printf(m, "Zswapped: %8lu kB\n",
> (unsigned long)atomic_read(&zswap_stored_pages) <<
> (PAGE_SHIFT - 10));
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index 341aea490070..2a85b941db97 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -7,7 +7,6 @@
>
> struct lruvec;
>
> -extern u64 zswap_pool_total_size;
> extern atomic_t zswap_stored_pages;
>
> #ifdef CONFIG_ZSWAP
> @@ -27,6 +26,7 @@ struct zswap_lruvec_state {
> atomic_long_t nr_zswap_protected;
> };
>
> +unsigned long zswap_total_pages(void);
> bool zswap_store(struct folio *folio);
> bool zswap_load(struct folio *folio);
> void zswap_invalidate(swp_entry_t swp);
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 9a3237752082..7c39327a7cc2 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -43,8 +43,6 @@
> /*********************************
> * statistics
> **********************************/
> -/* Total bytes used by the compressed storage */
> -u64 zswap_pool_total_size;
> /* The number of compressed pages currently stored in zswap */
> atomic_t zswap_stored_pages = ATOMIC_INIT(0);
> /* The number of same-value filled pages currently stored in zswap */
> @@ -264,45 +262,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
> pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
> zpool_get_type((p)->zpools[0]))
>
> -static bool zswap_is_full(void)
> -{
> - return totalram_pages() * zswap_max_pool_percent / 100 <
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static bool zswap_can_accept(void)
> -{
> - return totalram_pages() * zswap_accept_thr_percent / 100 *
> - zswap_max_pool_percent / 100 >
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static u64 get_zswap_pool_size(struct zswap_pool *pool)
> -{
> - u64 pool_size = 0;
> - int i;
> -
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> - pool_size += zpool_get_total_size(pool->zpools[i]);
> -
> - return pool_size;
> -}
> -
> -static void zswap_update_total_size(void)
> -{
> - struct zswap_pool *pool;
> - u64 total = 0;
> -
> - rcu_read_lock();
> -
> - list_for_each_entry_rcu(pool, &zswap_pools, list)
> - total += get_zswap_pool_size(pool);
> -
> - rcu_read_unlock();
> -
> - zswap_pool_total_size = total;
> -}
> -
> /*********************************
> * pool functions
> **********************************/
> @@ -540,6 +499,28 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> return NULL;
> }
>
> +static unsigned long zswap_max_pages(void)
> +{
> + return totalram_pages() * zswap_max_pool_percent / 100;
> +}
> +
> +unsigned long zswap_total_pages(void)
> +{
> + struct zswap_pool *pool;
> + u64 total = 0;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(pool, &zswap_pools, list) {
> + int i;
> +
> + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + total += zpool_get_total_size(pool->zpools[i]);
> + }
> + rcu_read_unlock();
> +
> + return total >> PAGE_SHIFT;
> +}
> +
> /*********************************
> * param callbacks
> **********************************/
> @@ -912,7 +893,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
> }
> zswap_entry_cache_free(entry);
> atomic_dec(&zswap_stored_pages);
> - zswap_update_total_size();
> }
>
> /*
> @@ -1317,7 +1297,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
> nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
> #else
> /* use pool stats instead of memcg stats */
> - nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
> + nr_backing = zswap_total_pages();
> nr_stored = atomic_read(&zswap_nr_stored);
> #endif
>
> @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> {
> struct mem_cgroup *memcg;
> int ret, failures = 0;
> + unsigned long thr;
> +
> + /* Reclaim down to the accept threshold */
> + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
>
> /* global reclaim will select cgroup in a round-robin fashion. */
> do {
> @@ -1432,10 +1416,9 @@ static void shrink_worker(struct work_struct *w)
> break;
> if (ret && ++failures == MAX_RECLAIM_RETRIES)
> break;
> -
> resched:
> cond_resched();
> - } while (!zswap_can_accept());
> + } while (zswap_total_pages() > thr);
> }
>
> static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
> @@ -1476,6 +1459,7 @@ bool zswap_store(struct folio *folio)
> struct zswap_entry *entry, *dupentry;
> struct obj_cgroup *objcg = NULL;
> struct mem_cgroup *memcg = NULL;
> + unsigned long max_pages, cur_pages;
>
> VM_WARN_ON_ONCE(!folio_test_locked(folio));
> VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1487,6 +1471,7 @@ bool zswap_store(struct folio *folio)
> if (!zswap_enabled)
> goto check_old;
>
> + /* Check cgroup limits */
> objcg = get_obj_cgroup_from_folio(folio);
> if (objcg && !obj_cgroup_may_zswap(objcg)) {
> memcg = get_mem_cgroup_from_objcg(objcg);
> @@ -1497,15 +1482,20 @@ bool zswap_store(struct folio *folio)
> mem_cgroup_put(memcg);
> }
>
> - /* reclaim space if needed */
> - if (zswap_is_full()) {
> + /* Check global limits */
> + cur_pages = zswap_total_pages();
> + max_pages = zswap_max_pages();
> +
> + if (cur_pages >= max_pages) {
> zswap_pool_limit_hit++;
> zswap_pool_reached_full = true;
> goto shrink;
> }
>
> if (zswap_pool_reached_full) {
> - if (!zswap_can_accept())
> + unsigned long thr = max_pages * zswap_accept_thr_percent / 100;
> +
> + if (cur_pages > thr)
> goto shrink;
> else
> zswap_pool_reached_full = false;
> @@ -1581,7 +1571,6 @@ bool zswap_store(struct folio *folio)
>
> /* update stats */
> atomic_inc(&zswap_stored_pages);
> - zswap_update_total_size();
> count_vm_event(ZSWPOUT);
>
> return true;
> @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
>
> static struct dentry *zswap_debugfs_root;
>
> +static int debugfs_get_total_size(void *data, u64 *val)
> +{
> + *val = zswap_total_pages() * PAGE_SIZE;
> + return 0;
> +}
> +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
> +
> static int zswap_debugfs_init(void)
> {
> if (!debugfs_initialized())
> @@ -1732,8 +1728,8 @@ static int zswap_debugfs_init(void)
> zswap_debugfs_root, &zswap_reject_compress_poor);
> debugfs_create_u64("written_back_pages", 0444,
> zswap_debugfs_root, &zswap_written_back_pages);
> - debugfs_create_u64("pool_total_size", 0444,
> - zswap_debugfs_root, &zswap_pool_total_size);
> + debugfs_create_file("pool_total_size", 0444,
> + zswap_debugfs_root, NULL, &total_size_fops);
> debugfs_create_atomic_t("stored_pages", 0444,
> zswap_debugfs_root, &zswap_stored_pages);
> debugfs_create_atomic_t("same_filled_pages", 0444,