Re: [PATCH] mm: zswap: limit number of zpools based on CPU and RAM
From: Yosry Ahmed
Date: Thu Jun 06 2024 - 13:46:49 EST
On Thu, Jun 6, 2024 at 9:53 AM Takero Funaki <flintglass@xxxxxxxxx> wrote:
>
> This patch limits the number of zpools used by zswap on smaller systems.
>
> Currently, zswap allocates 32 pools unconditionally. This was
> implemented to reduce contention on per-zpool locks. However, it incurs
> allocation overhead by distributing pages across pools, wasting memory
> on systems with fewer CPUs and less RAM.
>
> This patch allocates approximately 2*CPU zpools, with a minimum of 1
> zpool for single-CPU systems and up to 32 zpools for systems with 16 or
> more CPUs. This number is sufficient to keep the probability of
> busy-waiting by a thread under 40%. The upper limit of 32 zpools remains
> unchanged.
>
> For memory, it allows 1 zpool per 60MB of RAM under the default 20%
> max pool size limit, assuming the best case with no fragmentation in
> zspages. It expects 90% pool usage for zsmalloc.
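> (As a worked example of the 60MB figure: 2950 pages * 4KB/page is
> roughly 11.5MB per zpool, and at the default 20% limit that
> corresponds to about 60MB of system RAM per zpool.)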
>
> Signed-off-by: Takero Funaki <flintglass@xxxxxxxxx>
There are a lot of magic numbers in this patch, and it seems like it's
all based on theory. I don't object to making the number of zpools
dynamic in some way, but unless we do it in a data-driven way where we
understand the implications, I think the added complexity and
inconsistency are not justified.
For example, 2*CPU zpools is overkill and will cause a lot of
fragmentation. We use 32 zpools right now for machines with 100s of
CPUs. I know that you are keeping 32 as the limit, but why 2*CPUs if
nr_cpus <= 16?
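(As a sanity check on where the 40% comes from: with pool = 2*threads,
the formula in the patch tends to

    1 - (1 - 1/(2t))^(t-1)  ->  1 - e^(-1/2)  ~=  0.39

as t grows, so the 40% target looks like whatever doubling the CPU
count happens to produce, not an independently motivated threshold.)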
Also, the limitation based on memory size assumes that zsmalloc is the
only allocator used by zswap, which is unfortunately not the case.
The current implementation using 32 zpools all the time is not
perfect, and I did write a patch at some point to at least make it
min(nr_cpus, 32), but the current scheme is simple and it works.
Complexity should be justified.
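For reference, the simpler alternative I mentioned would look roughly
like this (untested sketch, not the actual patch; the count is rounded
down to a power of 2 so that hash_ptr() in zswap_find_zpool() keeps
working):

    /* Hypothetical sketch of the min(nr_cpus, 32) alternative. */
    static unsigned int zswap_nr_zpools(void)
    {
            unsigned int nr = min_t(unsigned int, num_online_cpus(),
                                    ZSWAP_NR_ZPOOLS_MAX);

            /* hash_ptr() needs a power-of-2 number of zpools */
            return nr > 1 ? rounddown_pow_of_two(nr) : 1;
    }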
> ---
> mm/zswap.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 60 insertions(+), 7 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 4de342a63bc2..e957bfdeaf70 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -124,8 +124,11 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
> module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
> uint, 0644);
>
> -/* Number of zpools in zswap_pool (empirically determined for scalability) */
> -#define ZSWAP_NR_ZPOOLS 32
> +/*
> + * Maximum number of zpools in zswap_pool (empirically determined for scalability)
> + * This must be a power of 2, for pointer hashing.
> + */
> +#define ZSWAP_NR_ZPOOLS_MAX 32
>
> /* Enable/disable memory pressure-based shrinker. */
> static bool zswap_shrinker_enabled = IS_ENABLED(
> @@ -157,12 +160,13 @@ struct crypto_acomp_ctx {
> * needs to be verified that it's still valid in the tree.
> */
> struct zswap_pool {
> - struct zpool *zpools[ZSWAP_NR_ZPOOLS];
> + struct zpool *zpools[ZSWAP_NR_ZPOOLS_MAX];
> struct crypto_acomp_ctx __percpu *acomp_ctx;
> struct percpu_ref ref;
> struct list_head list;
> struct work_struct release_work;
> struct hlist_node node;
> + unsigned char nr_zpools_order;
> char tfm_name[CRYPTO_MAX_ALG_NAME];
> };
>
> @@ -243,11 +247,55 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
> pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
> zpool_get_type((p)->zpools[0]))
>
> +static unsigned long zswap_max_pages(void);
> +
> /*********************************
> * pool functions
> **********************************/
> static void __zswap_pool_empty(struct percpu_ref *ref);
>
> +/*
> + * Estimate the optimal number of zpools based on CPU and memory.
> + *
> + * For CPUs, aim for 40% or lower probability of busy-waiting from a thread,
> + * assuming all cores are accessing zswap concurrently.
> + * The threshold is chosen for the simplicity of the formula:
> + * The probability is 1-(1-(1/pool))^(thr-1). For a 40% threshold, this
> + * gives approximately pool = 2 * threads, rounded up to a power of 2.
> + * Threads \ Pools
> + * 2 4 8 16 32
> + * 2 0.50 0.25 < 0.13 0.06 0.03
> + * 4 0.88 0.58 0.33 < 0.18 0.09
> + * 6 0.97 0.76 0.49 0.28 < 0.15
> + * 8 0.99 0.87 0.61 0.36 < 0.20
> + * 10 1.00 0.92 0.70 0.44 0.25 <
> + * 16 1.00 0.99 0.87 0.62 0.38 <
> + * 18 1.00 0.99 0.90 0.67 0.42
> + * ('<' marks the smallest pool count that keeps the probability at or
> + * below 40%, i.e. 2 * threads rounded up to a power of 2)
> + *
> + * For memory, expect 90% pool usage for zsmalloc in the best case.
> + * Assuming uniform distribution, we need to store:
> + * 590 : sum of pages_per_zspage over all size classes
> + * * 0.5 : about half of each zspage is empty if no fragmentation
> + * / (1-0.9) : the waste must fit in 10% to reach the 90% usage
> + * = 2950 : minimum pages per zpool for the 90% target,
> + * equivalent to 60MB RAM for a 20% max_pool_percent.
> + */
> +static void __zswap_set_nr_zpools(struct zswap_pool *pool)
> +{
> + unsigned long mem = zswap_max_pages();
> + unsigned long cpu = num_online_cpus();
> +
> + mem = DIV_ROUND_UP(mem, 2950);
> + mem = clamp_t(unsigned long, mem, 1, ZSWAP_NR_ZPOOLS_MAX);
> +
> + if (cpu <= 1)
> + cpu = 1;
> + else
> + cpu = roundup_pow_of_two(min_t(unsigned long, cpu * 2, ZSWAP_NR_ZPOOLS_MAX));
> +
> + pool->nr_zpools_order = ilog2(min(mem, cpu));
> +}
> +
> static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> {
> int i;
> @@ -271,7 +319,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> if (!pool)
> return NULL;
>
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
> + __zswap_set_nr_zpools(pool);
> +
> + for (i = 0; i < (1 << pool->nr_zpools_order); i++) {
> /* unique name for each pool specifically required by zsmalloc */
> snprintf(name, 38, "zswap%x",
> atomic_inc_return(&zswap_pools_count));
> @@ -372,7 +422,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> free_percpu(pool->acomp_ctx);
>
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + for (i = 0; i < (1 << pool->nr_zpools_order); i++)
> zpool_destroy_pool(pool->zpools[i]);
> kfree(pool);
> }
> @@ -513,7 +563,7 @@ unsigned long zswap_total_pages(void)
> list_for_each_entry_rcu(pool, &zswap_pools, list) {
> int i;
>
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + for (i = 0; i < (1 << pool->nr_zpools_order); i++)
> total += zpool_get_total_pages(pool->zpools[i]);
> }
> rcu_read_unlock();
> @@ -822,7 +872,10 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
>
> static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> {
> - return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))];
> + if (entry->pool->nr_zpools_order == 0)
> + return entry->pool->zpools[0];
> +
> + return entry->pool->zpools[hash_ptr(entry, entry->pool->nr_zpools_order)];
> }
>
> /*
> --
> 2.43.0
>