[PATCH v4 3/4] swap: apply new pw_queue_on() interface

From: Leonardo Bras

Date: Mon May 18 2026 - 21:31:02 EST

Make use of the new pw_{un,}lock*() and pw_queue_on() interface to improve
performance & latency.

For functions that may be scheduled on a different CPU, replace
local_{un,}lock*() with pw_{un,}lock*(), and replace queue_work_on() with
pw_queue_on(). Likewise, flush_work() is replaced with pw_flush().

The change requires allocating pw_structs instead of work_structs,
and changing the parameters of a few functions to include the cpu parameter.

This should bring no relevant performance impact on non-PWLOCKS kernels:
for functions that may be scheduled on a different CPU, the local_*lock's
this_cpu_ptr() becomes a per_cpu_ptr(smp_processor_id()).

Signed-off-by: Leonardo Bras <leobras.c@xxxxxxxxx>
Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
---
mm/internal.h | 4 ++-
mm/mlock.c | 51 ++++++++++++++++++++++++++----------
mm/page_alloc.c | 2 +-
mm/swap.c | 69 ++++++++++++++++++++++++++-----------------------
4 files changed, 79 insertions(+), 47 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 5a2ddcf68e0b..1ec9a11c373b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1209,24 +1209,26 @@ static inline void munlock_vma_folio(struct folio *folio,
* cause folio not fully mapped to VMA.
*
* But it's not easy to confirm that's the situation. So we
* always munlock the folio and page reclaim will correct it
* if it's wrong.
*/
if (unlikely(vma->vm_flags & VM_LOCKED))
munlock_folio(folio);
}

+int __init mlock_init(void);
void mlock_new_folio(struct folio *folio);
bool need_mlock_drain(int cpu);
void mlock_drain_local(void);
-void mlock_drain_remote(int cpu);
+void mlock_drain_cpu(int cpu);
+void mlock_drain_offline(int cpu);

extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);

/**
* vma_address - Find the virtual address a page range is mapped at
* @vma: The vma which maps this object.
* @pgoff: The page offset within its object.
* @nr_pages: The number of pages to consider.
*
* If any page in this range is mapped by this VMA, return the first address
diff --git a/mm/mlock.c b/mm/mlock.c
index 8c227fefa2df..5d25bbbb09e9 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -18,31 +18,30 @@
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>
+#include <linux/pwlocks.h>

#include "internal.h"

struct mlock_fbatch {
- local_lock_t lock;
+ pw_lock_t lock;
struct folio_batch fbatch;
};

-static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
- .lock = INIT_LOCAL_LOCK(lock),
-};
+static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch);

bool can_do_mlock(void)
{
if (rlimit(RLIMIT_MEMLOCK) != 0)
return true;
if (capable(CAP_IPC_LOCK))
return true;
return false;
}
EXPORT_SYMBOL(can_do_mlock);
@@ -202,32 +201,43 @@ static void mlock_folio_batch(struct folio_batch *fbatch)
lruvec = __mlock_new_folio(folio, lruvec);
else
lruvec = __munlock_folio(folio, lruvec);
}

if (lruvec)
lruvec_unlock_irq(lruvec);
folios_put(fbatch);
}

+void mlock_drain_cpu(int cpu)
+{
+ struct folio_batch *fbatch;
+
+ pw_lock(&mlock_fbatch.lock, cpu);
+ fbatch = per_cpu_ptr(&mlock_fbatch.fbatch, cpu);
+ if (folio_batch_count(fbatch))
+ mlock_folio_batch(fbatch);
+ pw_unlock(&mlock_fbatch.lock, cpu);
+}
+
void mlock_drain_local(void)
{
struct folio_batch *fbatch;

- local_lock(&mlock_fbatch.lock);
+ pw_lock_local(&mlock_fbatch.lock);
fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
if (folio_batch_count(fbatch))
mlock_folio_batch(fbatch);
- local_unlock(&mlock_fbatch.lock);
+ pw_unlock_local(&mlock_fbatch.lock);
}

-void mlock_drain_remote(int cpu)
+void mlock_drain_offline(int cpu)
{
struct folio_batch *fbatch;

WARN_ON_ONCE(cpu_online(cpu));
fbatch = &per_cpu(mlock_fbatch.fbatch, cpu);
if (folio_batch_count(fbatch))
mlock_folio_batch(fbatch);
}

bool need_mlock_drain(int cpu)
@@ -236,79 +246,79 @@ bool need_mlock_drain(int cpu)
}

/**
* mlock_folio - mlock a folio already on (or temporarily off) LRU
* @folio: folio to be mlocked.
*/
void mlock_folio(struct folio *folio)
{
struct folio_batch *fbatch;

- local_lock(&mlock_fbatch.lock);
+ pw_lock_local(&mlock_fbatch.lock);
fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);

if (!folio_test_set_mlocked(folio)) {
int nr_pages = folio_nr_pages(folio);

zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}

folio_get(folio);
if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
- local_unlock(&mlock_fbatch.lock);
+ pw_unlock_local(&mlock_fbatch.lock);
}

/**
* mlock_new_folio - mlock a newly allocated folio not yet on LRU
* @folio: folio to be mlocked, either normal or a THP head.
*/
void mlock_new_folio(struct folio *folio)
{
struct folio_batch *fbatch;
int nr_pages = folio_nr_pages(folio);

- local_lock(&mlock_fbatch.lock);
+ pw_lock_local(&mlock_fbatch.lock);
fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
folio_set_mlocked(folio);

zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);

folio_get(folio);
if (!folio_batch_add(fbatch, mlock_new(folio)) ||
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
- local_unlock(&mlock_fbatch.lock);
+ pw_unlock_local(&mlock_fbatch.lock);
}

/**
* munlock_folio - munlock a folio
* @folio: folio to be munlocked, either normal or a THP head.
*/
void munlock_folio(struct folio *folio)
{
struct folio_batch *fbatch;

- local_lock(&mlock_fbatch.lock);
+ pw_lock_local(&mlock_fbatch.lock);
fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
/*
* folio_test_clear_mlocked(folio) must be left to __munlock_folio(),
* which will check whether the folio is multiply mlocked.
*/
folio_get(folio);
if (!folio_batch_add(fbatch, folio) ||
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
- local_unlock(&mlock_fbatch.lock);
+ pw_unlock_local(&mlock_fbatch.lock);
}

static inline unsigned int folio_mlock_step(struct folio *folio,
pte_t *pte, unsigned long addr, unsigned long end)
{
unsigned int count = (end - addr) >> PAGE_SHIFT;
pte_t ptent = ptep_get(pte);

if (!folio_test_large(folio))
return 1;
@@ -822,10 +832,25 @@ int user_shm_lock(size_t size, struct ucounts *ucounts)
return allowed;
}

void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
spin_lock(&shmlock_user_lock);
dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
spin_unlock(&shmlock_user_lock);
put_ucounts(ucounts);
}
+
+int __init mlock_init(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct mlock_fbatch *fbatch = &per_cpu(mlock_fbatch, cpu);
+
+ pw_lock_init(&fbatch->lock);
+ }
+
+ return 0;
+}
+
+module_init(mlock_init);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 227d58dc3de6..fa768f07f88a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6217,21 +6217,21 @@ void free_reserved_page(struct page *page)
__free_page(page);
adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL(free_reserved_page);

static int page_alloc_cpu_dead(unsigned int cpu)
{
struct zone *zone;

lru_add_drain_cpu(cpu);
- mlock_drain_remote(cpu);
+ mlock_drain_offline(cpu);
drain_pages(cpu);

/*
* Spill the event counters of the dead processor
* into the current processors event counters.
* This artificially elevates the count of the current
* processor.
*/
vm_events_fold_cpu(cpu);

diff --git a/mm/swap.c b/mm/swap.c
index ed9b3d371547..42f51bf4bb71 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -28,54 +28,51 @@
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
-#include <linux/local_lock.h>
+#include <linux/pwlocks.h>
#include <linux/buffer_head.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
static const int page_cluster_max = 31;

struct cpu_fbatches {
/*
* The following folio batches are grouped together because they are protected
* by disabling preemption (and interrupts remain enabled).
*/
- local_lock_t lock;
+ pw_lock_t lock;
struct folio_batch lru_add;
struct folio_batch lru_deactivate_file;
struct folio_batch lru_deactivate;
struct folio_batch lru_lazyfree;
#ifdef CONFIG_SMP
struct folio_batch lru_activate;
#endif
/* Protecting the following batches which require disabling interrupts */
- local_lock_t lock_irq;
+ pw_lock_t lock_irq;
struct folio_batch lru_move_tail;
};

-static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
- .lock = INIT_LOCAL_LOCK(lock),
- .lock_irq = INIT_LOCAL_LOCK(lock_irq),
-};
+static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches);

static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
unsigned long *flagsp)
{
if (folio_test_lru(folio)) {
folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
lruvec_del_folio(*lruvecp, folio);
__folio_clear_lru_flags(folio);
}
}
@@ -180,32 +177,32 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
}

static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
struct folio *folio, move_fn_t move_fn, bool disable_irq)
{
unsigned long flags;

folio_get(folio);

if (disable_irq)
- local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
+ pw_lock_local_irqsave(&cpu_fbatches.lock_irq, flags);
else
- local_lock(&cpu_fbatches.lock);
+ pw_lock_local(&cpu_fbatches.lock);

if (!folio_batch_add(this_cpu_ptr(fbatch), folio) ||
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);

if (disable_irq)
- local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
+ pw_unlock_local_irqrestore(&cpu_fbatches.lock_irq, flags);
else
- local_unlock(&cpu_fbatches.lock);
+ pw_unlock_local(&cpu_fbatches.lock);
}

#define folio_batch_add_and_move(folio, op) \
__folio_batch_add_and_move( \
&cpu_fbatches.op, \
folio, \
op, \
offsetof(struct cpu_fbatches, op) >= \
offsetof(struct cpu_fbatches, lock_irq) \
)
@@ -356,21 +353,21 @@ void folio_activate(struct folio *folio)
lruvec_unlock_irq(lruvec);
folio_set_lru(folio);
}
#endif

static void __lru_cache_activate_folio(struct folio *folio)
{
struct folio_batch *fbatch;
int i;

- local_lock(&cpu_fbatches.lock);
+ pw_lock_local(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);

/*
* Search backwards on the optimistic assumption that the folio being
* activated has just been added to this batch. Note that only
* the local batch is examined as a !LRU folio could be in the
* process of being released, reclaimed, migrated or on a remote
* batch that is currently being drained. Furthermore, marking
* a remote batch's folio active potentially hits a race where
* a folio is marked active just after it is added to the inactive
@@ -378,21 +375,21 @@ static void __lru_cache_activate_folio(struct folio *folio)
*/
for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) {
struct folio *batch_folio = fbatch->folios[i];

if (batch_folio == folio) {
folio_set_active(folio);
break;
}
}

- local_unlock(&cpu_fbatches.lock);
+ pw_unlock_local(&cpu_fbatches.lock);
}

#ifdef CONFIG_LRU_GEN

static void lru_gen_inc_refs(struct folio *folio)
{
unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);

if (folio_test_unevictable(folio))
return;
@@ -652,23 +649,23 @@ void lru_add_drain_cpu(int cpu)

if (folio_batch_count(fbatch))
folio_batch_move_lru(fbatch, lru_add);

fbatch = &fbatches->lru_move_tail;
/* Disabling interrupts below acts as a compiler barrier. */
if (data_race(folio_batch_count(fbatch))) {
unsigned long flags;

/* No harm done if a racing interrupt already did this */
- local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
+ pw_lock_irqsave(&cpu_fbatches.lock_irq, flags, cpu);
folio_batch_move_lru(fbatch, lru_move_tail);
- local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
+ pw_unlock_irqrestore(&cpu_fbatches.lock_irq, flags, cpu);
}

fbatch = &fbatches->lru_deactivate_file;
if (folio_batch_count(fbatch))
folio_batch_move_lru(fbatch, lru_deactivate_file);

fbatch = &fbatches->lru_deactivate;
if (folio_batch_count(fbatch))
folio_batch_move_lru(fbatch, lru_deactivate);

@@ -732,56 +729,56 @@ void folio_mark_lazyfree(struct folio *folio)
if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
!folio_test_lru(folio) ||
folio_test_swapcache(folio) || folio_test_unevictable(folio))
return;

folio_batch_add_and_move(folio, lru_lazyfree);
}

void lru_add_drain(void)
{
- local_lock(&cpu_fbatches.lock);
+ pw_lock_local(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
- local_unlock(&cpu_fbatches.lock);
+ pw_unlock_local(&cpu_fbatches.lock);
mlock_drain_local();
}

/*
* It's called from per-cpu workqueue context in SMP case so
* lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on
* the same cpu. It shouldn't be a problem in !SMP case since
* the core is only one and the locks will disable preemption.
*/
-static void lru_add_mm_drain(void)
+static void lru_add_mm_drain(int cpu)
{
- local_lock(&cpu_fbatches.lock);
- lru_add_drain_cpu(smp_processor_id());
- local_unlock(&cpu_fbatches.lock);
- mlock_drain_local();
+ pw_lock(&cpu_fbatches.lock, cpu);
+ lru_add_drain_cpu(cpu);
+ pw_unlock(&cpu_fbatches.lock, cpu);
+ mlock_drain_cpu(cpu);
}

void lru_add_drain_cpu_zone(struct zone *zone)
{
- local_lock(&cpu_fbatches.lock);
+ pw_lock_local(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
drain_local_pages(zone);
- local_unlock(&cpu_fbatches.lock);
+ pw_unlock_local(&cpu_fbatches.lock);
mlock_drain_local();
}

#ifdef CONFIG_SMP

-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+static DEFINE_PER_CPU(struct pw_struct, lru_add_drain_pw);

-static void lru_add_drain_per_cpu(struct work_struct *dummy)
+static void lru_add_drain_per_cpu(struct work_struct *w)
{
- lru_add_mm_drain();
+ lru_add_mm_drain(pw_get_cpu(w));
}

static DEFINE_PER_CPU(struct work_struct, bh_add_drain_work);

static void bh_add_drain_per_cpu(struct work_struct *dummy)
{
invalidate_bh_lrus_cpu();
}

static bool cpu_needs_mm_drain(unsigned int cpu)
@@ -882,38 +879,38 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
* If the paired barrier is done at any later step, e.g. after the
* loop, CPU #x will just exit at (C) and miss flushing out all of its
* added pages.
*/
WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
smp_mb();

cpumask_clear(&has_mm_work);
cpumask_clear(&has_bh_work);
for_each_online_cpu(cpu) {
- struct work_struct *mm_work = &per_cpu(lru_add_drain_work, cpu);
+ struct pw_struct *mm_pw = &per_cpu(lru_add_drain_pw, cpu);
struct work_struct *bh_work = &per_cpu(bh_add_drain_work, cpu);

if (cpu_needs_mm_drain(cpu)) {
- INIT_WORK(mm_work, lru_add_drain_per_cpu);
- queue_work_on(cpu, mm_percpu_wq, mm_work);
+ INIT_PW(mm_pw, lru_add_drain_per_cpu, cpu);
+ pw_queue_on(cpu, mm_percpu_wq, mm_pw);
__cpumask_set_cpu(cpu, &has_mm_work);
}

if (cpu_needs_bh_drain(cpu)) {
INIT_WORK(bh_work, bh_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, bh_work);
__cpumask_set_cpu(cpu, &has_bh_work);
}
}

for_each_cpu(cpu, &has_mm_work)
- flush_work(&per_cpu(lru_add_drain_work, cpu));
+ pw_flush(&per_cpu(lru_add_drain_pw, cpu));

for_each_cpu(cpu, &has_bh_work)
flush_work(&per_cpu(bh_add_drain_work, cpu));

done:
mutex_unlock(&lock);
}

void lru_add_drain_all(void)
{
@@ -949,21 +946,21 @@ void lru_cache_disable(void)
*
* Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on
* preempt_disable() regions of code. So any CPU which sees
* lru_disable_count = 0 will have exited the critical
* section when synchronize_rcu() returns.
*/
synchronize_rcu_expedited();
#ifdef CONFIG_SMP
__lru_add_drain_all(true);
#else
- lru_add_mm_drain();
+ lru_add_mm_drain(smp_processor_id());
invalidate_bh_lrus_cpu();
#endif
}

/**
* folios_put_refs - Reduce the reference count on a batch of folios.
* @folios: The folios.
* @refs: The number of refs to subtract from each folio.
*
* Like folio_put(), but for a batch of folios. This is more efficient
@@ -1156,23 +1153,31 @@ static const struct ctl_table swap_sysctl_table[] = {
.extra2 = (void *)&page_cluster_max,
}
};

/*
* Perform any setup for the swap system
*/
void __init swap_setup(void)
{
unsigned long megs = PAGES_TO_MB(totalram_pages());
+ unsigned int cpu;

/* Use a smaller cluster for small-memory machines */
if (megs < 16)
page_cluster = 2;
else
page_cluster = 3;
/*
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/

register_sysctl_init("vm", swap_sysctl_table);
+
+ for_each_possible_cpu(cpu) {
+ struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
+
+ pw_lock_init(&fbatches->lock);
+ pw_lock_init(&fbatches->lock_irq);
+ }
}
--
2.54.0