Re: [RFC PATCH v3 1/4] mm/zsmalloc: introduce deferred free framework with callback ops

From: Nhat Pham

Date: Fri May 08 2026 - 20:30:15 EST


On Thu, May 7, 2026 at 11:08 PM Wenchao Hao <haowenchao22@xxxxxxxxx> wrote:
>
> Add a per-cpu deferred free mechanism to zsmalloc with a callback
> interface that lets callers (zram, zswap) customize push and drain
> behavior.
>
> Each CPU owns a single-page buffer. The hot path (zs_free_deferred)
> writes a value into the current CPU's buffer via the push callback
> with preemption disabled — no locks, no atomics. When the buffer
> fills, it is swapped with a fresh page from a pre-allocated page
> pool and the full page is queued to a WQ_UNBOUND worker for drain.
>
> The drain worker invokes the drain callback which performs the actual
> expensive work (zs_free, slot_free, etc.) in batch, away from the
> original hot path.
>
> Page pool management:
> - Pool is pre-allocated at enable time (ZS_DEFERRED_POOL_SIZE pages)
> - Full buffers are drained and returned to the pool
> - If no free page is available when buffer is full, the push falls
> back to synchronous processing by the caller
>
> Signed-off-by: Wenchao Hao <haowenchao@xxxxxxxxxx>
> ---
> +#define ZS_DEFERRED_POOL_SIZE (256 * 1024 / PAGE_SIZE)

Seems oddly specific? :) And this doesn't scale with the number of
CPUs or with memory size?

> +
> +struct zs_deferred_percpu {
> + unsigned int count;
> + void *buf;
> +};
> +
> struct zs_pool {
> const char *name;
>
> @@ -217,6 +224,18 @@ struct zs_pool {
> /* protect zspage migration/compaction */
> rwlock_t lock;
> atomic_t compaction_in_progress;
> +
> + /* per-cpu deferred free */
> + const struct zs_deferred_ops *deferred_ops;
> + void *deferred_private;
> + struct zs_deferred_percpu __percpu *deferred;
> + struct work_struct deferred_work;
> + struct workqueue_struct *deferred_wq;
> + struct list_head deferred_pool;
> + unsigned int deferred_pool_count;
> + spinlock_t deferred_pool_lock;
> + struct list_head deferred_drain_list;
> + spinlock_t deferred_drain_lock;
> };
>
> static inline void zpdesc_set_first(struct zpdesc *zpdesc)
> @@ -1416,6 +1435,171 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> }
> EXPORT_SYMBOL_GPL(zs_free);
>
> +static struct page *deferred_pool_get(struct zs_pool *pool)
> +{
> + struct page *page = NULL;
> +
> + spin_lock(&pool->deferred_pool_lock);
> + if (!list_empty(&pool->deferred_pool)) {
> + page = list_first_entry(&pool->deferred_pool, struct page, lru);
> + list_del(&page->lru);
> + pool->deferred_pool_count--;
> + }
> + spin_unlock(&pool->deferred_pool_lock);
> + return page;
> +}
> +
> +static void deferred_pool_put(struct zs_pool *pool, struct page *page)
> +{
> + spin_lock(&pool->deferred_pool_lock);
> + list_add_tail(&page->lru, &pool->deferred_pool);
> + pool->deferred_pool_count++;
> + spin_unlock(&pool->deferred_pool_lock);
> +}
> +
> +static void zs_deferred_work_fn(struct work_struct *work)
> +{
> + struct zs_pool *pool = container_of(work, struct zs_pool, deferred_work);
> + struct page *page;
> +
> + while (true) {
> + unsigned int count;
> +
> + spin_lock(&pool->deferred_drain_lock);
> + if (list_empty(&pool->deferred_drain_list)) {
> + spin_unlock(&pool->deferred_drain_lock);
> + break;
> + }
> + page = list_first_entry(&pool->deferred_drain_list,
> + struct page, lru);
> + list_del(&page->lru);
> + count = page_private(page);
> + spin_unlock(&pool->deferred_drain_lock);
> +
> + pool->deferred_ops->drain(pool->deferred_private,
> + page_address(page), count);
> + deferred_pool_put(pool, page);
> + cond_resched();
> + }
> +}
> +
> +bool zs_free_deferred(struct zs_pool *pool, unsigned long value)
> +{
> + struct zs_deferred_percpu *def;
> + struct page *new_page, *full_page;
> + enum zs_push_ret ret;
> +
> + if (!pool->deferred)
> + return false;
> +
> + def = get_cpu_ptr(pool->deferred);
> +
> + ret = pool->deferred_ops->push(def->buf, def->count, value);
> + if (ret == ZS_PUSH_OK) {
> + def->count++;
> + put_cpu_ptr(pool->deferred);
> + return true;
> + }
> +
> + if (ret == ZS_PUSH_FULL_QUEUED)
> + def->count++;
> +
> + new_page = deferred_pool_get(pool);
> + if (new_page) {
> + full_page = virt_to_page(def->buf);
> + set_page_private(full_page, def->count);
> + def->buf = page_address(new_page);
> + def->count = 0;
> +
> + if (ret == ZS_PUSH_FULL) {
> + pool->deferred_ops->push(def->buf, 0, value);
> + def->count = 1;
> + }
> + put_cpu_ptr(pool->deferred);
> +
> + spin_lock(&pool->deferred_drain_lock);
> + list_add_tail(&full_page->lru, &pool->deferred_drain_list);
> + spin_unlock(&pool->deferred_drain_lock);
> + queue_work(pool->deferred_wq, &pool->deferred_work);
> + return true;
> + }
> + put_cpu_ptr(pool->deferred);
> +
> + /* ret==2: value already queued, will be drained eventually */
> + if (ret == 2)

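== 2? :) This is a bare magic number - presumably it should be
ZS_PUSH_FULL_QUEUED, like the check a few lines up (and the comments
here should name the enum values rather than "ret==2" / "ret==1").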

> + return true;
> +
> + /* ret==1: value not queued, caller must fallback */
> + return false;
> +}
> +EXPORT_SYMBOL_GPL(zs_free_deferred);