Re: [PATCH v11 07/14] mm: multi-gen LRU: exploit locality in rmap

From: Barry Song
Date: Mon Jun 06 2022 - 05:25:48 EST


On Wed, May 18, 2022 at 4:49 PM Yu Zhao <yuzhao@xxxxxxxxxx> wrote:
>
> Searching the rmap for PTEs mapping each page on an LRU list (to test
> and clear the accessed bit) can be expensive because pages from
> different VMAs (PA space) are not cache friendly to the rmap (VA
> space). For workloads mostly using mapped pages, the rmap has a high
> CPU cost in the reclaim path.
>
> This patch exploits spatial locality to reduce the trips into the
> rmap. When shrink_page_list() walks the rmap and finds a young PTE, a
> new function lru_gen_look_around() scans at most BITS_PER_LONG-1
> adjacent PTEs. On finding another young PTE, it clears the accessed
> bit and updates the gen counter of the page mapped by this PTE to
> (max_seq%MAX_NR_GENS)+1.
>
> Server benchmark results:
> Single workload:
> fio (buffered I/O): no change
>
> Single workload:
> memcached (anon): +[5.5, 7.5]%
> Ops/sec KB/sec
> patch1-6: 1120643.70 43588.06
> patch1-7: 1193918.93 46438.15
>
> Configurations:
> no change
>
> Client benchmark results:
> kswapd profiles:
> patch1-6
> 35.99% lzo1x_1_do_compress (real work)
> 19.40% page_vma_mapped_walk
> 6.31% _raw_spin_unlock_irq
> 3.95% do_raw_spin_lock
> 2.39% anon_vma_interval_tree_iter_first
> 2.25% ptep_clear_flush
> 1.92% __anon_vma_interval_tree_subtree_search
> 1.70% folio_referenced_one
> 1.68% __zram_bvec_write
> 1.43% anon_vma_interval_tree_iter_next
>
> patch1-7
> 45.90% lzo1x_1_do_compress (real work)
> 9.14% page_vma_mapped_walk
> 6.81% _raw_spin_unlock_irq
> 2.80% ptep_clear_flush
> 2.34% __zram_bvec_write
> 2.29% do_raw_spin_lock
> 1.84% lru_gen_look_around
> 1.78% memmove
> 1.74% obj_malloc
> 1.50% free_unref_page_list
>
> Configurations:
> no change
>
> Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
> Acked-by: Brian Geffon <bgeffon@xxxxxxxxxx>
> Acked-by: Jan Alexander Steffens (heftig) <heftig@xxxxxxxxxxxxx>
> Acked-by: Oleksandr Natalenko <oleksandr@xxxxxxxxxxxxxx>
> Acked-by: Steven Barrett <steven@xxxxxxxxxxxx>
> Acked-by: Suleiman Souhlal <suleiman@xxxxxxxxxx>
> Tested-by: Daniel Byrne <djbyrne@xxxxxxx>
> Tested-by: Donald Carr <d@xxxxxxxxxxxxxxx>
> Tested-by: Holger Hoffstätte <holger@xxxxxxxxxxxxxxxxxxxxxx>
> Tested-by: Konstantin Kharlamov <Hi-Angel@xxxxxxxxx>
> Tested-by: Shuang Zhai <szhai2@xxxxxxxxxxxxxxxx>
> Tested-by: Sofia Trinh <sofia.trinh@edi.works>
> Tested-by: Vaibhav Jain <vaibhav@xxxxxxxxxxxxx>
> ---
> include/linux/memcontrol.h | 31 ++++++++
> include/linux/mm.h | 5 ++
> include/linux/mmzone.h | 6 ++
> mm/internal.h | 1 +
> mm/memcontrol.c | 1 +
> mm/rmap.c | 7 ++
> mm/swap.c | 4 +-
> mm/vmscan.c | 157 +++++++++++++++++++++++++++++++++++++
> 8 files changed, 210 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 89b14729d59f..2bfdcc77648a 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -438,6 +438,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
> * - LRU isolation
> * - lock_page_memcg()
> * - exclusive reference
> + * - mem_cgroup_trylock_pages()
> *
> * For a kmem folio a caller should hold an rcu read lock to protect memcg
> * associated with a kmem folio from being released.
> @@ -499,6 +500,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
> * - LRU isolation
> * - lock_page_memcg()
> * - exclusive reference
> + * - mem_cgroup_trylock_pages()
> *
> * For a kmem page a caller should hold an rcu read lock to protect memcg
> * associated with a kmem page from being released.
> @@ -948,6 +950,23 @@ void unlock_page_memcg(struct page *page);
>
> void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
>
> +/* try to stablize folio_memcg() for all the pages in a memcg */
> +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
> +{
> + rcu_read_lock();
> +
> + if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
> + return true;
> +
> + rcu_read_unlock();
> + return false;
> +}
> +
> +static inline void mem_cgroup_unlock_pages(void)
> +{
> + rcu_read_unlock();
> +}
> +
> /* idx can be of type enum memcg_stat_item or node_stat_item */
> static inline void mod_memcg_state(struct mem_cgroup *memcg,
> int idx, int val)
> @@ -1386,6 +1405,18 @@ static inline void folio_memcg_unlock(struct folio *folio)
> {
> }
>
> +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
> +{
> + /* to match folio_memcg_rcu() */
> + rcu_read_lock();
> + return true;
> +}
> +
> +static inline void mem_cgroup_unlock_pages(void)
> +{
> + rcu_read_unlock();
> +}
> +
> static inline void mem_cgroup_handle_over_high(void)
> {
> }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 894c289c2c06..4e8ab4ad4473 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1523,6 +1523,11 @@ static inline unsigned long folio_pfn(struct folio *folio)
> return page_to_pfn(&folio->page);
> }
>
> +static inline struct folio *pfn_folio(unsigned long pfn)
> +{
> + return page_folio(pfn_to_page(pfn));
> +}
> +
> static inline atomic_t *folio_pincount_ptr(struct folio *folio)
> {
> return &folio_page(folio, 1)->compound_pincount;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 2d023d243e73..f0b980362186 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -374,6 +374,7 @@ enum lruvec_flags {
> #ifndef __GENERATING_BOUNDS_H
>
> struct lruvec;
> +struct page_vma_mapped_walk;
>
> #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
> #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
> @@ -429,6 +430,7 @@ struct lru_gen_struct {
> };
>
> void lru_gen_init_lruvec(struct lruvec *lruvec);
> +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
>
> #ifdef CONFIG_MEMCG
> void lru_gen_init_memcg(struct mem_cgroup *memcg);
> @@ -441,6 +443,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
> {
> }
>
> +static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
> +{
> +}
> +
> #ifdef CONFIG_MEMCG
> static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
> {
> diff --git a/mm/internal.h b/mm/internal.h
> index cf16280ce132..59d2422b647d 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -68,6 +68,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
> void folio_rotate_reclaimable(struct folio *folio);
> bool __folio_end_writeback(struct folio *folio);
> void deactivate_file_folio(struct folio *folio);
> +void folio_activate(struct folio *folio);
>
> void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
> unsigned long floor, unsigned long ceiling);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2ee074f80e72..98aa720ac639 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2769,6 +2769,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
> * - LRU isolation
> * - lock_page_memcg()
> * - exclusive reference
> + * - mem_cgroup_trylock_pages()
> */
> folio->memcg_data = (unsigned long)memcg;
> }
> diff --git a/mm/rmap.c b/mm/rmap.c
> index fedb82371efe..7cb7ef29088a 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -73,6 +73,7 @@
> #include <linux/page_idle.h>
> #include <linux/memremap.h>
> #include <linux/userfaultfd_k.h>
> +#include <linux/mm_inline.h>
>
> #include <asm/tlbflush.h>
>
> @@ -821,6 +822,12 @@ static bool folio_referenced_one(struct folio *folio,
> }
>
> if (pvmw.pte) {
> + if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
> + !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
> + lru_gen_look_around(&pvmw);
> + referenced++;
> + }
> +
> if (ptep_clear_flush_young_notify(vma, address,

Hello, Yu.
lru_gen_look_around() calls only ptep_test_and_clear_young(pvmw->vma, addr, pte + i),
without the flush and the notify. As for the flush, arm64 does a TLB operation here:
static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
int young = ptep_test_and_clear_young(vma, address, ptep);

if (young) {
/*
* We can elide the trailing DSB here since the worst that can
* happen is that a CPU continues to use the young entry in its
* TLB and we mistakenly reclaim the associated page. The
* window for such an event is bounded by the next
* context-switch, which provides a DSB to complete the TLB
* invalidation.
*/
flush_tlb_page_nosync(vma, address);
}

return young;
}

Does this mean the current kernel is overly cautious? Is it
safe to call only ptep_test_and_clear_young()?

BTW, the range scanned by lru_gen_look_around() already includes 'address'.
Are we checking the PTE for 'address' twice here?


> pvmw.pte)) {
> /*
> diff --git a/mm/swap.c b/mm/swap.c
> index a99d22308f28..0aa1d0b33d42 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -342,7 +342,7 @@ static bool need_activate_page_drain(int cpu)
> return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
> }
>
> -static void folio_activate(struct folio *folio)
> +void folio_activate(struct folio *folio)
> {
> if (folio_test_lru(folio) && !folio_test_active(folio) &&
> !folio_test_unevictable(folio)) {
> @@ -362,7 +362,7 @@ static inline void activate_page_drain(int cpu)
> {
> }
>
> -static void folio_activate(struct folio *folio)
> +void folio_activate(struct folio *folio)
> {
> struct lruvec *lruvec;
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 891f0ab69b3a..cf89a28c3b0e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1554,6 +1554,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
> if (!sc->may_unmap && page_mapped(page))
> goto keep_locked;
>
> + /* folio_update_gen() tried to promote this page? */
> + if (lru_gen_enabled() && !ignore_references &&
> + page_mapped(page) && PageReferenced(page))
> + goto keep_locked;
> +
> may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
> (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
>
> @@ -3137,6 +3142,28 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
> * the aging
> ******************************************************************************/
>
> +static int folio_update_gen(struct folio *folio, int gen)
> +{
> + unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
> +
> + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
> + VM_WARN_ON_ONCE(!rcu_read_lock_held());
> +
> + do {
> + /* lru_gen_del_folio() has isolated this page? */
> + if (!(old_flags & LRU_GEN_MASK)) {
> + /* for shrink_page_list() */
> + new_flags = old_flags | BIT(PG_referenced);
> + continue;
> + }
> +
> + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
> + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
> + } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
> +
> + return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
> +}
> +
> static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
> {
> int type = folio_is_file_lru(folio);
> @@ -3147,6 +3174,11 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
> VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
>
> do {
> + new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
> + /* folio_update_gen() has promoted this page? */
> + if (new_gen >= 0 && new_gen != old_gen)
> + return new_gen;
> +
> new_gen = (old_gen + 1) % MAX_NR_GENS;
>
> new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
> @@ -3365,6 +3397,125 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
> } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
> }
>
> +/*
> + * This function exploits spatial locality when shrink_page_list() walks the
> + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
> + */
> +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
> +{
> + int i;
> + pte_t *pte;
> + unsigned long start;
> + unsigned long end;
> + unsigned long addr;
> + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
> + struct folio *folio = pfn_folio(pvmw->pfn);
> + struct mem_cgroup *memcg = folio_memcg(folio);
> + struct pglist_data *pgdat = folio_pgdat(folio);
> + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
> + DEFINE_MAX_SEQ(lruvec);
> + int old_gen, new_gen = lru_gen_from_seq(max_seq);
> +
> + lockdep_assert_held(pvmw->ptl);
> + VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
> +
> + if (spin_is_contended(pvmw->ptl))
> + return;
> +
> + start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
> + end = pmd_addr_end(pvmw->address, pvmw->vma->vm_end);
> +
> + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
> + if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
> + end = start + MIN_LRU_BATCH * PAGE_SIZE;
> + else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
> + start = end - MIN_LRU_BATCH * PAGE_SIZE;
> + else {
> + start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
> + end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
> + }
> + }
> +
> + pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
> +
> + rcu_read_lock();
> + arch_enter_lazy_mmu_mode();
> +
> + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
> + unsigned long pfn = pte_pfn(pte[i]);
> +
> + VM_WARN_ON_ONCE(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end);
> +
> + if (!pte_present(pte[i]) || is_zero_pfn(pfn))
> + continue;
> +
> + if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
> + continue;
> +
> + if (!pte_young(pte[i]))
> + continue;
> +
> + VM_WARN_ON_ONCE(!pfn_valid(pfn));
> + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
> + continue;
> +
> + folio = pfn_folio(pfn);
> + if (folio_nid(folio) != pgdat->node_id)
> + continue;
> +
> + if (folio_memcg_rcu(folio) != memcg)
> + continue;
> +
> + if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
> + continue;
> +
> + if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
> + !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
> + !folio_test_swapcache(folio)))
> + folio_mark_dirty(folio);
> +
> + old_gen = folio_lru_gen(folio);
> + if (old_gen < 0)
> + folio_set_referenced(folio);
> + else if (old_gen != new_gen)
> + __set_bit(i, bitmap);
> + }
> +
> + arch_leave_lazy_mmu_mode();
> + rcu_read_unlock();
> +
> + if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
> + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
> + folio = pfn_folio(pte_pfn(pte[i]));
> + folio_activate(folio);
> + }
> + return;
> + }
> +
> + /* folio_update_gen() requires stable folio_memcg() */
> + if (!mem_cgroup_trylock_pages(memcg))
> + return;
> +
> + spin_lock_irq(&lruvec->lru_lock);
> + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
> +
> + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
> + folio = pfn_folio(pte_pfn(pte[i]));
> + if (folio_memcg_rcu(folio) != memcg)
> + continue;
> +
> + old_gen = folio_update_gen(folio, new_gen);
> + if (old_gen < 0 || old_gen == new_gen)
> + continue;
> +
> + lru_gen_update_size(lruvec, folio, old_gen, new_gen);
> + }
> +
> + spin_unlock_irq(&lruvec->lru_lock);
> +
> + mem_cgroup_unlock_pages();
> +}
> +
> /******************************************************************************
> * the eviction
> ******************************************************************************/
> @@ -3401,6 +3552,12 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
> return true;
> }
>
> + /* promoted */
> + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
> + list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
> + return true;
> + }
> +
> /* protected */
> if (tier > tier_idx) {
> int hist = lru_hist_from_seq(lrugen->min_seq[type]);
> --
> 2.36.0.550.gb090851708-goog
>

Thanks
Barry