From 4227725e8df80e3c6c7493e9cc917e6b5231cc7a Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Fri, 8 May 2026 14:07:21 +0800 Subject: [PATCH 1/4] mm/zsmalloc: introduce deferred free framework with callback ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a per-cpu deferred free mechanism to zsmalloc with a callback interface that lets callers (zram, zswap) customize push and drain behavior. Each CPU owns a single-page buffer. The hot path (zs_free_deferred) writes a value into the current CPU's buffer via the push callback with preemption disabled — no locks, no atomics. When the buffer fills, it is swapped with a fresh page from a pre-allocated page pool and the full page is queued to a WQ_UNBOUND worker for drain. The drain worker invokes the drain callback which performs the actual expensive work (zs_free, slot_free, etc.) in batch, away from the original hot path. Page pool management: - Pool is pre-allocated at enable time (ZS_DEFERRED_POOL_SIZE pages) - Full buffers are drained and returned to the pool - If no free page is available when buffer is full, the push falls back to synchronous processing by the caller Signed-off-by: Wenchao Hao --- include/linux/zsmalloc.h | 16 +++ mm/zsmalloc.c | 208 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 223 insertions(+), 1 deletion(-) diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 478410c880b1f..8d6c675b10dc3 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -24,12 +24,28 @@ struct zs_pool_stats { struct zs_pool; struct scatterlist; +enum zs_push_ret { + ZS_PUSH_OK = 0, + ZS_PUSH_FULL, + ZS_PUSH_FULL_QUEUED, +}; + +struct zs_deferred_ops { + enum zs_push_ret (*push)(void *buf, unsigned int count, + unsigned long value); + void (*drain)(void *private, void *buf, unsigned int count); +}; + struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags, const int nid); void zs_free(struct zs_pool *pool, unsigned long obj); +int zs_pool_enable_deferred_free(struct zs_pool *pool, + const struct zs_deferred_ops *ops, + void *private); +bool zs_free_deferred(struct zs_pool *pool, unsigned long value); size_t zs_huge_class_size(struct zs_pool *pool); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 63128ddb79598..d8220a8753a7d 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -196,6 +196,13 @@ struct link_free { static struct kmem_cache *handle_cachep; static struct kmem_cache *zspage_cachep; +#define ZS_DEFERRED_POOL_SIZE (256 * 1024 / PAGE_SIZE) + +struct zs_deferred_percpu { + unsigned int count; + void *buf; +}; + struct zs_pool { const char *name; @@ -217,6 +224,18 @@ struct zs_pool { /* protect zspage migration/compaction */ rwlock_t lock; atomic_t compaction_in_progress; + + /* per-cpu deferred free */ + const struct zs_deferred_ops *deferred_ops; + void *deferred_private; + struct zs_deferred_percpu __percpu *deferred; + struct work_struct deferred_work; + struct workqueue_struct *deferred_wq; + struct list_head deferred_pool; + unsigned int deferred_pool_count; + spinlock_t deferred_pool_lock; + struct list_head deferred_drain_list; + spinlock_t deferred_drain_lock; }; static inline void zpdesc_set_first(struct zpdesc *zpdesc) @@ -1416,6 +1435,171 @@ void zs_free(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_free); +static struct page *deferred_pool_get(struct zs_pool *pool) +{ + struct page *page = NULL; + + 
spin_lock(&pool->deferred_pool_lock); + if (!list_empty(&pool->deferred_pool)) { + page = list_first_entry(&pool->deferred_pool, struct page, lru); + list_del(&page->lru); + pool->deferred_pool_count--; + } + spin_unlock(&pool->deferred_pool_lock); + return page; +} + +static void deferred_pool_put(struct zs_pool *pool, struct page *page) +{ + spin_lock(&pool->deferred_pool_lock); + list_add_tail(&page->lru, &pool->deferred_pool); + pool->deferred_pool_count++; + spin_unlock(&pool->deferred_pool_lock); +} + +static void zs_deferred_work_fn(struct work_struct *work) +{ + struct zs_pool *pool = container_of(work, struct zs_pool, deferred_work); + struct page *page; + + while (true) { + unsigned int count; + + spin_lock(&pool->deferred_drain_lock); + if (list_empty(&pool->deferred_drain_list)) { + spin_unlock(&pool->deferred_drain_lock); + break; + } + page = list_first_entry(&pool->deferred_drain_list, + struct page, lru); + list_del(&page->lru); + count = page_private(page); + spin_unlock(&pool->deferred_drain_lock); + + pool->deferred_ops->drain(pool->deferred_private, + page_address(page), count); + deferred_pool_put(pool, page); + cond_resched(); + } +} + +bool zs_free_deferred(struct zs_pool *pool, unsigned long value) +{ + struct zs_deferred_percpu *def; + struct page *new_page, *full_page; + enum zs_push_ret ret; + + if (!pool->deferred) + return false; + + def = get_cpu_ptr(pool->deferred); + + ret = pool->deferred_ops->push(def->buf, def->count, value); + if (ret == ZS_PUSH_OK) { + def->count++; + put_cpu_ptr(pool->deferred); + return true; + } + + if (ret == ZS_PUSH_FULL_QUEUED) + def->count++; + + new_page = deferred_pool_get(pool); + if (new_page) { + full_page = virt_to_page(def->buf); + set_page_private(full_page, def->count); + def->buf = page_address(new_page); + def->count = 0; + + if (ret == ZS_PUSH_FULL) { + pool->deferred_ops->push(def->buf, 0, value); + def->count = 1; + } + put_cpu_ptr(pool->deferred); + + spin_lock(&pool->deferred_drain_lock); + list_add_tail(&full_page->lru, &pool->deferred_drain_list); + spin_unlock(&pool->deferred_drain_lock); + queue_work(pool->deferred_wq, &pool->deferred_work); + return true; + } + put_cpu_ptr(pool->deferred); + + /* ret==2: value already queued, will be drained eventually */ + if (ret == 2) + return true; + + /* ret==1: value not queued, caller must fallback */ + return false; +} +EXPORT_SYMBOL_GPL(zs_free_deferred); + +int zs_pool_enable_deferred_free(struct zs_pool *pool, + const struct zs_deferred_ops *ops, + void *private) +{ + int cpu; + unsigned int pg_idx; + struct page *page, *tmp; + + pool->deferred_ops = ops; + pool->deferred_private = private; + + INIT_WORK(&pool->deferred_work, zs_deferred_work_fn); + pool->deferred_wq = alloc_workqueue("zs_drain", WQ_UNBOUND, 0); + if (!pool->deferred_wq) + return -ENOMEM; + + INIT_LIST_HEAD(&pool->deferred_pool); + spin_lock_init(&pool->deferred_pool_lock); + pool->deferred_pool_count = 0; + INIT_LIST_HEAD(&pool->deferred_drain_list); + spin_lock_init(&pool->deferred_drain_lock); + + for (pg_idx = 0; pg_idx < ZS_DEFERRED_POOL_SIZE; pg_idx++) { + page = alloc_page(GFP_KERNEL); + if (!page) + goto err_pages; + list_add_tail(&page->lru, &pool->deferred_pool); + pool->deferred_pool_count++; + } + + pool->deferred = alloc_percpu(struct zs_deferred_percpu); + if (!pool->deferred) + goto err_pages; + + for_each_possible_cpu(cpu) { + struct zs_deferred_percpu *def = per_cpu_ptr(pool->deferred, cpu); + + page = deferred_pool_get(pool); + if (!page) + goto err_percpu; + def->buf = 
page_address(page); + def->count = 0; + } + + return 0; + +err_percpu: + for_each_possible_cpu(cpu) { + struct zs_deferred_percpu *def = per_cpu_ptr(pool->deferred, cpu); + + if (def->buf) + deferred_pool_put(pool, virt_to_page(def->buf)); + } + free_percpu(pool->deferred); + pool->deferred = NULL; +err_pages: + list_for_each_entry_safe(page, tmp, &pool->deferred_pool, lru) { + list_del(&page->lru); + __free_page(page); + } + destroy_workqueue(pool->deferred_wq); + pool->deferred_wq = NULL; + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(zs_pool_enable_deferred_free); + static void zs_object_copy(struct size_class *class, unsigned long dst, unsigned long src) { @@ -2182,9 +2366,31 @@ EXPORT_SYMBOL_GPL(zs_create_pool); void zs_destroy_pool(struct zs_pool *pool) { - int i; + int i, cpu; + struct page *page, *tmp; zs_unregister_shrinker(pool); + + if (pool->deferred) { + flush_work(&pool->deferred_work); + for_each_possible_cpu(cpu) { + struct zs_deferred_percpu *def = + per_cpu_ptr(pool->deferred, cpu); + + if (def->buf && def->count) + pool->deferred_ops->drain(pool->deferred_private, + def->buf, def->count); + if (def->buf) + deferred_pool_put(pool, virt_to_page(def->buf)); + } + free_percpu(pool->deferred); + list_for_each_entry_safe(page, tmp, &pool->deferred_pool, lru) { + list_del(&page->lru); + __free_page(page); + } + destroy_workqueue(pool->deferred_wq); + } + zs_flush_migration(pool); zs_pool_stat_destroy(pool); From 08c12d89ccd9eb8f90c516696593014dc39d165e Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Fri, 8 May 2026 14:07:22 +0800 Subject: [PATCH 2/4] mm/zswap: use zsmalloc deferred free callback for async invalidate Register zswap_deferred_ops to defer the entire zswap_entry_free() to the WQ_UNBOUND worker. The invalidate hot path only stores the entry pointer into the per-cpu buffer (512 entries/page). The drain callback performs the full entry teardown: lru_del, zs_free, memcg uncharge, cache_free, and stats update. On deferred failure, fallback to synchronous zswap_entry_free(). Signed-off-by: Wenchao Hao --- mm/zswap.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 4b5149173b0ec..3f23ddbe525c0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -270,6 +270,8 @@ static void acomp_ctx_free(struct crypto_acomp_ctx *acomp_ctx) acomp_ctx->buffer = NULL; } +static const struct zs_deferred_ops zswap_deferred_ops; + static struct zswap_pool *zswap_pool_create(char *compressor) { struct zswap_pool *pool; @@ -289,6 +291,8 @@ static struct zswap_pool *zswap_pool_create(char *compressor) if (!pool->zs_pool) goto error; + zs_pool_enable_deferred_free(pool->zs_pool, &zswap_deferred_ops, pool); + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); /* Many things rely on the zero-initialization. 
*/ @@ -777,6 +781,36 @@ static void zswap_entry_free(struct zswap_entry *entry) atomic_long_dec(&zswap_stored_pages); } +static enum zs_push_ret zswap_deferred_push(void *buf, + unsigned int count, unsigned long value) +{ + unsigned long *entries = buf; + + if (count >= PAGE_SIZE / sizeof(unsigned long)) + return ZS_PUSH_FULL; + entries[count] = value; + if (count + 1 >= PAGE_SIZE / sizeof(unsigned long)) + return ZS_PUSH_FULL_QUEUED; + return ZS_PUSH_OK; +} + +static void zswap_deferred_drain(void *private, void *buf, unsigned int count) +{ + unsigned long *entries = buf; + unsigned int i; + + for (i = 0; i < count; i++) { + struct zswap_entry *entry = (struct zswap_entry *)entries[i]; + + zswap_entry_free(entry); + } +} + +static const struct zs_deferred_ops zswap_deferred_ops = { + .push = zswap_deferred_push, + .drain = zswap_deferred_drain, +}; + /********************************* * compressed storage functions **********************************/ @@ -1647,7 +1681,9 @@ void zswap_invalidate(swp_entry_t swp) return; entry = xa_erase(tree, offset); - if (entry) + if (!entry) + return; + if (!zs_free_deferred(entry->pool->zs_pool, (unsigned long)entry)) zswap_entry_free(entry); } From 78fd67826a9c917fb1d8c653a711f07551bf1e20 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 8 May 2026 14:07:23 +0800 Subject: [PATCH 3/4] zram: use zsmalloc deferred free callback for async slot free Register zram_deferred_ops with zs_pool_enable_deferred_free() to defer slot freeing to a WQ_UNBOUND worker. The notify hot path only stores a u32 slot index into the per-cpu buffer (1024 entries/page). The drain callback does slot_lock + slot_free + slot_unlock for each index. On deferred failure (no free page), fallback to synchronous slot_lock + slot_free + slot_unlock. 
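For reference, the three push return codes from patch 1 tell
zs_free_deferred() what state the per-cpu buffer is in. A minimal
sketch of the contract that zram_deferred_push() below follows (the
name example_push is illustrative only; the 1024-entry capacity
assumes 4 KiB pages):

  static enum zs_push_ret example_push(void *buf, unsigned int count,
                                       unsigned long value)
  {
          u32 *slots = buf;
          unsigned int capacity = PAGE_SIZE / sizeof(u32); /* 1024 with 4K pages */

          if (count >= capacity)
                  return ZS_PUSH_FULL;          /* not stored: buffer was already full */
          slots[count] = (u32)value;
          if (count + 1 >= capacity)
                  return ZS_PUSH_FULL_QUEUED;   /* stored, buffer is now full */
          return ZS_PUSH_OK;                    /* stored, room remains */
  }

ZS_PUSH_FULL is the only case where the value is not stored, which is
why the notify path keeps a synchronous slot_free() fallback for when
no replacement buffer page is available.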
Signed-off-by: Barry Song Signed-off-by: Wenchao Hao --- drivers/block/zram/zram_drv.c | 39 +++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index aebc710f0d6ae..0d07f0901e555 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -56,6 +56,7 @@ static size_t huge_class_size; static const struct block_device_operations zram_devops; static void slot_free(struct zram *zram, u32 index); +static const struct zs_deferred_ops zram_deferred_ops; #define slot_dep_map(zram, index) (&(zram)->table[(index)].dep_map) static void slot_lock_init(struct zram *zram, u32 index) @@ -1994,6 +1995,8 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); + zs_pool_enable_deferred_free(zram->mem_pool, &zram_deferred_ops, zram); + for (index = 0; index < num_pages; index++) slot_lock_init(zram, index); @@ -2784,6 +2787,39 @@ static void zram_submit_bio(struct bio *bio) } } +static enum zs_push_ret zram_deferred_push(void *buf, + unsigned int count, unsigned long value) +{ + u32 *indices = buf; + + if (count >= PAGE_SIZE / sizeof(u32)) + return ZS_PUSH_FULL; + indices[count] = (u32)value; + if (count + 1 >= PAGE_SIZE / sizeof(u32)) + return ZS_PUSH_FULL_QUEUED; + return ZS_PUSH_OK; +} + +static void zram_deferred_drain(void *private, void *buf, unsigned int count) +{ + struct zram *zram = private; + u32 *indices = buf; + unsigned int i; + + for (i = 0; i < count; i++) { + u32 index = indices[i]; + + slot_lock(zram, index); + slot_free(zram, index); + slot_unlock(zram, index); + } +} + +static const struct zs_deferred_ops zram_deferred_ops = { + .push = zram_deferred_push, + .drain = zram_deferred_drain, +}; + static void zram_slot_free_notify(struct block_device *bdev, unsigned long index) { @@ -2792,6 +2828,9 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; atomic64_inc(&zram->stats.notify_free); + if (zs_free_deferred(zram->mem_pool, (unsigned long)index)) + return; + if (!slot_trylock(zram, index)) { atomic64_inc(&zram->stats.miss_free); return; From d9623ad5fd6a9327c4aa1fd3490e8712eda235f1 Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Fri, 8 May 2026 14:07:24 +0800 Subject: [PATCH 4/4] zram: batch clear flags in slot_free with single write Replace four separate flag clear operations in slot_free() with a single mask write. This reduces redundant read-modify-write cycles on the same flags word. 
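The equivalence can be sketched with two small helpers (hypothetical
names; this assumes set_slot_comp_priority(zram, index, 0) reduces to
clearing the priority bit field, which is what the last term of the
mask covers):

  /* Bit effect of the old four-step sequence on the flags word */
  static inline unsigned long slot_free_clear_old(unsigned long flags)
  {
          flags &= ~BIT(ZRAM_IDLE);
          flags &= ~BIT(ZRAM_INCOMPRESSIBLE);
          flags &= ~BIT(ZRAM_PP_SLOT);
          /* set_slot_comp_priority(..., 0): clear the priority field */
          flags &= ~(ZRAM_COMP_PRIORITY_MASK << ZRAM_COMP_PRIORITY_BIT1);
          return flags;
  }

  /* Equivalent single-mask form used below */
  static inline unsigned long slot_free_clear_new(unsigned long flags)
  {
          return flags & ~ZRAM_SLOT_FREE_CLEAR_MASK;
  }

Both produce the same flags word; slot_free() now does it with one
read-modify-write instead of four.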
Signed-off-by: Wenchao Hao --- drivers/block/zram/zram_drv.c | 5 +---- drivers/block/zram/zram_drv.h | 6 ++++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0d07f0901e555..b1a565d35567e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2011,10 +2011,7 @@ static void slot_free(struct zram *zram, u32 index) zram->table[index].attr.ac_time = 0; #endif - clear_slot_flag(zram, index, ZRAM_IDLE); - clear_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE); - clear_slot_flag(zram, index, ZRAM_PP_SLOT); - set_slot_comp_priority(zram, index, 0); + zram->table[index].attr.flags &= ~ZRAM_SLOT_FREE_CLEAR_MASK; if (test_slot_flag(zram, index, ZRAM_HUGE)) { /* diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 08d1774c15db6..89a7e39a2f4b8 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -57,6 +57,12 @@ enum zram_pageflags { __NR_ZRAM_PAGEFLAGS, }; +#define ZRAM_SLOT_FREE_CLEAR_MASK (BIT(ZRAM_IDLE) | \ + BIT(ZRAM_INCOMPRESSIBLE) | \ + BIT(ZRAM_PP_SLOT) | \ + (ZRAM_COMP_PRIORITY_MASK << \ + ZRAM_COMP_PRIORITY_BIT1)) + /* * Allocated for each disk page. We use bit-lock (ZRAM_ENTRY_LOCK bit * of flags) to save memory. There can be plenty of entries and standard