Re: [RFC PATCH v2 2/4] mm/zsmalloc: introduce zs_free_deferred() for async handle freeing

From: Nhat Pham

Date: Tue Apr 21 2026 - 15:47:05 EST

On Tue, Apr 21, 2026 at 5:16 AM Wenchao Hao <haowenchao22@xxxxxxxxx> wrote:
>
> zs_free() is expensive due to internal locking (pool->lock, class->lock)
> and potential zspage freeing. On the process exit path, the slow
> zs_free() blocks memory reclamation, delaying overall memory release.
> This has been reported to significantly impact Android low-memory
> killing where slot_free() accounts for over 80% of the total swap
> entry freeing cost.
>
> Introduce zs_free_deferred() which queues handles into a fixed-size
> per-pool array for later processing by a workqueue. This allows callers
> to defer the expensive zs_free() and return quickly, so the process
> exit path can release memory faster. The array capacity is derived from
> a 128MB uncompressed data budget (128MB >> PAGE_SHIFT entries), which
> scales naturally with PAGE_SIZE. When the array reaches half capacity,
> the workqueue is scheduled to drain pending handles.
>
> zs_free_deferred() uses spin_trylock() to access the deferred queue.
> If the lock is contended (e.g. drain in progress) or the queue is full,
> it falls back to synchronous zs_free() to guarantee correctness.
>
> Also introduce zs_free_deferred_flush() for use during pool teardown to
> ensure all pending handles are freed.

Hmmm per-pool workqueue.

Does that mean that if you only have one zs pool (in the case of
zswap, or if you only have one zram device), you'll have less
concurrency in freeing up zsmalloc memory for process teardown? Would
this be problematic?

I think Kairui was also suggesting per-cpu-fying these batches/queues.

>
> Signed-off-by: Wenchao Hao <haowenchao@xxxxxxxxxx>
> ---
> include/linux/zsmalloc.h | 2 +
> mm/zsmalloc.c | 111 +++++++++++++++++++++++++++++++++++++++
> 2 files changed, 113 insertions(+)
>
> diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
> index 478410c880b1..1e5ac1a39d41 100644
> --- a/include/linux/zsmalloc.h
> +++ b/include/linux/zsmalloc.h
> @@ -30,6 +30,8 @@ void zs_destroy_pool(struct zs_pool *pool);
> unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags,
> const int nid);
> void zs_free(struct zs_pool *pool, unsigned long obj);
> +void zs_free_deferred(struct zs_pool *pool, unsigned long handle);
> +void zs_free_deferred_flush(struct zs_pool *pool);
>
> size_t zs_huge_class_size(struct zs_pool *pool);
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 40687c8a7469..defc892555e4 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -53,6 +53,10 @@
>
> #define ZS_HANDLE_SIZE (sizeof(unsigned long))
>
> +#define ZS_DEFERRED_FREE_MAX_BYTES (128 << 20)
> +#define ZS_DEFERRED_FREE_CAPACITY (ZS_DEFERRED_FREE_MAX_BYTES >> PAGE_SHIFT)
> +#define ZS_DEFERRED_FREE_THRESHOLD (ZS_DEFERRED_FREE_CAPACITY / 2)
> +
> /*
> * Object location (<PFN>, <obj_idx>) is encoded as
> * a single (unsigned long) handle value.
> @@ -217,6 +221,13 @@ struct zs_pool {
> /* protect zspage migration/compaction */
> rwlock_t lock;
> atomic_t compaction_in_progress;
> +
> + /* deferred free support */
> + spinlock_t deferred_lock;
> + unsigned long *deferred_handles;
> + unsigned int deferred_count;
> + unsigned int deferred_capacity;
> + struct work_struct deferred_free_work;
> };
>
> static inline void zpdesc_set_first(struct zpdesc *zpdesc)
> @@ -579,6 +590,19 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
> }
> DEFINE_SHOW_ATTRIBUTE(zs_stats_size);
>
> +static int zs_stats_deferred_show(struct seq_file *s, void *v)
> +{
> + struct zs_pool *pool = s->private;
> +
> + spin_lock(&pool->deferred_lock);
> + seq_printf(s, "pending: %u\n", pool->deferred_count);
> + seq_printf(s, "capacity: %u\n", pool->deferred_capacity);
> + spin_unlock(&pool->deferred_lock);
> +
> + return 0;
> +}
> +DEFINE_SHOW_ATTRIBUTE(zs_stats_deferred);
> +
> static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
> {
> if (!zs_stat_root) {
> @@ -590,6 +614,9 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
>
> debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool,
> &zs_stats_size_fops);
> + debugfs_create_file("deferred_free", S_IFREG | 0444,
> + pool->stat_dentry, pool,
> + &zs_stats_deferred_fops);
> }
>
> static void zs_pool_stat_destroy(struct zs_pool *pool)
> @@ -1432,6 +1459,76 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> }
> EXPORT_SYMBOL_GPL(zs_free);
>
> +static void zs_deferred_free_work(struct work_struct *work)
> +{
> + struct zs_pool *pool = container_of(work, struct zs_pool,
> + deferred_free_work);
> + unsigned long handle;
> +
> + while (1) {
> + spin_lock(&pool->deferred_lock);
> + if (pool->deferred_count == 0) {
> + spin_unlock(&pool->deferred_lock);
> + break;
> + }
> + handle = pool->deferred_handles[--pool->deferred_count];
> + spin_unlock(&pool->deferred_lock);

Any reason why we're locking, grabbing a handle, then unlocking, one
at a time? Why dont we just lock, grab all the handles (or at least a
batch of them), unlock, then process the handles one at a time?

We can also have a pair of handle arrays. Whenever defer worker is
woken up, just swap the arrays under the lock, then free the handles
in the old array :)

> +
> + zs_free(pool, handle);
> + cond_resched();
> + }
> +}
> +
> +/**
> + * zs_free_deferred - queue a handle for asynchronous freeing
> + * @pool: pool to free from
> + * @handle: handle to free
> + *
> + * Place @handle into a deferred free queue for later processing by a
> + * workqueue. This is intended for callers that are in atomic context
> + * (e.g. under a spinlock) and cannot afford the cost of zs_free()
> + * directly. When the queue reaches a threshold the work is scheduled.
> + * Falls back to synchronous zs_free() if the lock is contended (drain
> + * in progress) or if the queue is full.
> + */
> +void zs_free_deferred(struct zs_pool *pool, unsigned long handle)
> +{
> + if (IS_ERR_OR_NULL((void *)handle))
> + return;
> +
> + if (!spin_trylock(&pool->deferred_lock))
> + goto sync_free;
> +
> + if (pool->deferred_count >= pool->deferred_capacity) {
> + spin_unlock(&pool->deferred_lock);
> + goto sync_free;
> + }
> +
> + pool->deferred_handles[pool->deferred_count++] = handle;
> + if (pool->deferred_count >= ZS_DEFERRED_FREE_THRESHOLD)
> + queue_work(system_wq, &pool->deferred_free_work);
> + spin_unlock(&pool->deferred_lock);
> + return;
> +
> +sync_free:
> + zs_free(pool, handle);
> +}
> +EXPORT_SYMBOL_GPL(zs_free_deferred);
> +
> +/**
> + * zs_free_deferred_flush - flush all pending deferred frees
> + * @pool: pool to flush
> + *
> + * Wait for any scheduled work to complete, then drain any remaining
> + * handles. Must be called from process context.
> + */
> +void zs_free_deferred_flush(struct zs_pool *pool)
> +{
> + flush_work(&pool->deferred_free_work);
> + zs_deferred_free_work(&pool->deferred_free_work);
> +}
> +EXPORT_SYMBOL_GPL(zs_free_deferred_flush);
> +
> static void zs_object_copy(struct size_class *class, unsigned long dst,
> unsigned long src)
> {
> @@ -2099,6 +2196,18 @@ struct zs_pool *zs_create_pool(const char *name)
> rwlock_init(&pool->lock);
> atomic_set(&pool->compaction_in_progress, 0);
>
> + spin_lock_init(&pool->deferred_lock);
> + pool->deferred_capacity = ZS_DEFERRED_FREE_CAPACITY;
> + pool->deferred_handles = kvmalloc_array(pool->deferred_capacity,
> + sizeof(unsigned long),
> + GFP_KERNEL);
> + if (!pool->deferred_handles) {
> + kfree(pool);
> + return NULL;
> + }
> + pool->deferred_count = 0;
> + INIT_WORK(&pool->deferred_free_work, zs_deferred_free_work);
> +
> pool->name = kstrdup(name, GFP_KERNEL);
> if (!pool->name)
> goto err;
> @@ -2201,6 +2310,7 @@ void zs_destroy_pool(struct zs_pool *pool)
> int i;
>
> zs_unregister_shrinker(pool);
> + zs_free_deferred_flush(pool);
> zs_flush_migration(pool);
> zs_pool_stat_destroy(pool);
>
> @@ -2224,6 +2334,7 @@ void zs_destroy_pool(struct zs_pool *pool)
> kfree(class);
> }
>
> + kvfree(pool->deferred_handles);
> kfree(pool->name);
> kfree(pool);
> }
> --
> 2.34.1
>