Re: [PATCH v9 10/14] mm: multi-gen LRU: kill switch

From: Barry Song
Date: Tue Mar 22 2022 - 03:47:29 EST


On Wed, Mar 9, 2022 at 3:48 PM Yu Zhao <yuzhao@xxxxxxxxxx> wrote:
>
> Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
> can be disabled include:
> 0x0001: the multi-gen LRU core
> 0x0002: walking page table, when arch_has_hw_pte_young() returns
> true
> 0x0004: clearing the accessed bit in non-leaf PMD entries, when
> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
> [yYnN]: apply to all the components above
> E.g.,
> echo y >/sys/kernel/mm/lru_gen/enabled
> cat /sys/kernel/mm/lru_gen/enabled
> 0x0007
> echo 5 >/sys/kernel/mm/lru_gen/enabled
> cat /sys/kernel/mm/lru_gen/enabled
> 0x0005
>
> NB: the page table walks happen on the scale of seconds under heavy
> memory pressure, in which case the mmap_lock contention is a lesser
> concern, compared with the LRU lock contention and the I/O congestion.
> So far the only well-known case of the mmap_lock contention happens on
> Android, due to Scudo [1] which allocates several thousand VMAs for
> merely a few hundred MBs. The SPF and the Maple Tree also have
> provided their own assessments [2][3]. However, if walking page tables
> does worsen the mmap_lock contention, the kill switch can be used to
> disable it. In this case the multi-gen LRU will suffer a minor
> performance degradation, as shown previously.
>
> Clearing the accessed bit in non-leaf PMD entries can also be
> disabled, since this behavior was not tested on x86 varieties other
> than Intel and AMD.
>
> [1] https://source.android.com/devices/tech/debug/scudo
> [2] https://lore.kernel.org/lkml/20220128131006.67712-1-michel@xxxxxxxxxxxxxx/
> [3] https://lore.kernel.org/lkml/20220202024137.2516438-1-Liam.Howlett@xxxxxxxxxx/
>
> Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
> Acked-by: Brian Geffon <bgeffon@xxxxxxxxxx>
> Acked-by: Jan Alexander Steffens (heftig) <heftig@xxxxxxxxxxxxx>
> Acked-by: Oleksandr Natalenko <oleksandr@xxxxxxxxxxxxxx>
> Acked-by: Steven Barrett <steven@xxxxxxxxxxxx>
> Acked-by: Suleiman Souhlal <suleiman@xxxxxxxxxx>
> Tested-by: Daniel Byrne <djbyrne@xxxxxxx>
> Tested-by: Donald Carr <d@xxxxxxxxxxxxxxx>
> Tested-by: Holger Hoffstätte <holger@xxxxxxxxxxxxxxxxxxxxxx>
> Tested-by: Konstantin Kharlamov <Hi-Angel@xxxxxxxxx>
> Tested-by: Shuang Zhai <szhai2@xxxxxxxxxxxxxxxx>
> Tested-by: Sofia Trinh <sofia.trinh@edi.works>
> Tested-by: Vaibhav Jain <vaibhav@xxxxxxxxxxxxx>
> ---
> include/linux/cgroup.h | 15 +-
> include/linux/mm_inline.h | 12 +-
> include/linux/mmzone.h | 9 ++
> kernel/cgroup/cgroup-internal.h | 1 -
> mm/Kconfig | 6 +
> mm/vmscan.c | 237 +++++++++++++++++++++++++++++++-
> 6 files changed, 271 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 75c151413fda..b145025f3eac 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
> css_put(&cgrp->self);
> }
>
> +extern struct mutex cgroup_mutex;
> +
> +static inline void cgroup_lock(void)
> +{
> + mutex_lock(&cgroup_mutex);
> +}
> +
> +static inline void cgroup_unlock(void)
> +{
> + mutex_unlock(&cgroup_mutex);
> +}
> +
> /**
> * task_css_set_check - obtain a task's css_set with extra access conditions
> * @task: the task to obtain css_set for
> @@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
> * as locks used during the cgroup_subsys::attach() methods.
> */
> #ifdef CONFIG_PROVE_RCU
> -extern struct mutex cgroup_mutex;
> extern spinlock_t css_set_lock;
> #define task_css_set_check(task, __c) \
> rcu_dereference_check((task)->cgroups, \
> @@ -707,6 +718,8 @@ struct cgroup;
> static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
> static inline void css_get(struct cgroup_subsys_state *css) {}
> static inline void css_put(struct cgroup_subsys_state *css) {}
> +static inline void cgroup_lock(void) {}
> +static inline void cgroup_unlock(void) {}
> static inline int cgroup_attach_task_all(struct task_struct *from,
> struct task_struct *t) { return 0; }
> static inline int cgroupstats_build(struct cgroupstats *stats,
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index 15a04a9b5560..1c8d617e73a9 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -106,7 +106,15 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
>
> static inline bool lru_gen_enabled(void)
> {
> - return true;
> +#ifdef CONFIG_LRU_GEN_ENABLED
> + DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
> +
> + return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
> +#else
> + DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
> +
> + return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
> +#endif
> }
>
> static inline bool lru_gen_in_fault(void)
> @@ -196,7 +204,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
> int zone = folio_zonenum(folio);
> struct lru_gen_struct *lrugen = &lruvec->lrugen;
>
> - if (folio_test_unevictable(folio))
> + if (folio_test_unevictable(folio) || !lrugen->enabled)
> return false;
> /*
> * There are three common cases for this page:
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index a2d53025a321..116c9237e401 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -371,6 +371,13 @@ enum {
> LRU_GEN_FILE,
> };
>
> +enum {
> + LRU_GEN_CORE,
> + LRU_GEN_MM_WALK,
> + LRU_GEN_NONLEAF_YOUNG,
> + NR_LRU_GEN_CAPS
> +};
> +
> #define MIN_LRU_BATCH BITS_PER_LONG
> #define MAX_LRU_BATCH (MIN_LRU_BATCH * 128)
>
> @@ -409,6 +416,8 @@ struct lru_gen_struct {
> /* can be modified without holding the LRU lock */
> atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
> atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
> + /* whether the multi-gen LRU is enabled */
> + bool enabled;
> };
>
> enum {
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index 6e36e854b512..929ed3bf1a7c 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -165,7 +165,6 @@ struct cgroup_mgctx {
> #define DEFINE_CGROUP_MGCTX(name) \
> struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
>
> -extern struct mutex cgroup_mutex;
> extern spinlock_t css_set_lock;
> extern struct cgroup_subsys *cgroup_subsys[];
> extern struct list_head cgroup_roots;
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 804c2bca8205..050de1eae2d6 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -901,6 +901,12 @@ config LRU_GEN
> help
> A high performance LRU implementation for memory overcommit.
>
> +config LRU_GEN_ENABLED
> + bool "Enable by default"
> + depends on LRU_GEN
> + help
> + This option enables the multi-gen LRU by default.
> +
> config LRU_GEN_STATS
> bool "Full stats for debugging"
> depends on LRU_GEN
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 7375c9dae08f..55cc7d6b018b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3063,6 +3063,12 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
>
> #ifdef CONFIG_LRU_GEN
>
> +#ifdef CONFIG_LRU_GEN_ENABLED
> +DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
> +#else
> +DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
> +#endif
> +
> /******************************************************************************
> * shorthand helpers
> ******************************************************************************/
> @@ -3099,6 +3105,15 @@ static int folio_lru_tier(struct folio *folio)
> return lru_tier_from_refs(refs);
> }
>
> +static bool get_cap(int cap)
> +{
> +#ifdef CONFIG_LRU_GEN_ENABLED
> + return static_branch_likely(&lru_gen_caps[cap]);
> +#else
> + return static_branch_unlikely(&lru_gen_caps[cap]);
> +#endif
> +}
> +
> static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
> {
> struct pglist_data *pgdat = NODE_DATA(nid);
> @@ -3892,7 +3907,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
> goto next;
>
> if (!pmd_trans_huge(pmd[i])) {
> - if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
> + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
> + get_cap(LRU_GEN_NONLEAF_YOUNG))
> pmdp_test_and_clear_young(vma, addr, pmd + i);
> goto next;
> }
> @@ -3999,10 +4015,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
> priv->mm_stats[MM_PMD_TOTAL]++;
>
> #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
> - if (!pmd_young(val))
> - continue;
> + if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
> + if (!pmd_young(val))
> + continue;
>
> - walk_pmd_range_locked(pud, addr, vma, walk, &pos);
> + walk_pmd_range_locked(pud, addr, vma, walk, &pos);
> + }
> #endif
> if (!priv->full_scan && !test_bloom_filter(priv->lruvec, priv->max_seq, pmd + i))
> continue;
> @@ -4233,7 +4251,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
> * handful of PTEs. Spreading the work out over a period of time usually
> * is less efficient, but it avoids bursty page faults.
> */
> - if (!full_scan && !arch_has_hw_pte_young()) {
> + if (!full_scan && (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK))) {
> success = iterate_mm_list_nowalk(lruvec, max_seq);
> goto done;
> }
> @@ -4946,6 +4964,211 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
> blk_finish_plug(&plug);
> }
>
> +/******************************************************************************
> + * state change
> + ******************************************************************************/
> +
> +static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
> +{
> + struct lru_gen_struct *lrugen = &lruvec->lrugen;
> +
> + if (lrugen->enabled) {
> + enum lru_list lru;
> +
> + for_each_evictable_lru(lru) {
> + if (!list_empty(&lruvec->lists[lru]))
> + return false;
> + }
> + } else {
> + int gen, type, zone;
> +
> + for_each_gen_type_zone(gen, type, zone) {
> + if (!list_empty(&lrugen->lists[gen][type][zone]))
> + return false;
> +
> + /* unlikely but not a bug when reset_batch_size() is pending */
> + VM_WARN_ON(lrugen->nr_pages[gen][type][zone]);
> + }
> + }
> +
> + return true;
> +}
> +
> +static bool fill_evictable(struct lruvec *lruvec)
> +{
> + enum lru_list lru;
> + int remaining = MAX_LRU_BATCH;
> +
> + for_each_evictable_lru(lru) {
> + int type = is_file_lru(lru);
> + bool active = is_active_lru(lru);
> + struct list_head *head = &lruvec->lists[lru];
> +
> + while (!list_empty(head)) {
> + bool success;
> + struct folio *folio = lru_to_folio(head);
> +
> + VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
> + VM_BUG_ON_FOLIO(folio_test_active(folio) != active, folio);
> + VM_BUG_ON_FOLIO(folio_is_file_lru(folio) != type, folio);
> + VM_BUG_ON_FOLIO(folio_lru_gen(folio) < MAX_NR_GENS, folio);
> +
> + lruvec_del_folio(lruvec, folio);
> + success = lru_gen_add_folio(lruvec, folio, false);
> + VM_BUG_ON(!success);
> +
> + if (!--remaining)
> + return false;
> + }
> + }
> +
> + return true;
> +}
> +
> +static bool drain_evictable(struct lruvec *lruvec)
> +{
> + int gen, type, zone;
> + int remaining = MAX_LRU_BATCH;
> +
> + for_each_gen_type_zone(gen, type, zone) {
> + struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
> +
> + while (!list_empty(head)) {
> + bool success;
> + struct folio *folio = lru_to_folio(head);
> +
> + VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
> + VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
> + VM_BUG_ON_FOLIO(folio_is_file_lru(folio) != type, folio);
> + VM_BUG_ON_FOLIO(folio_zonenum(folio) != zone, folio);
> +
> + success = lru_gen_del_folio(lruvec, folio, false);
> + VM_BUG_ON(!success);
> + lruvec_add_folio(lruvec, folio);

for example, max_seq=4(GEN=0) and max_seq-1=3, then we are supposed to put
max_seq in the head of active list. but your code seems to be putting max_seq-1
after putting max_seq, then max_seq is more likely to be evicted
afterwards as it
is in the tail of the active list.

anyway, it might not be so important. I can't imagine we will
frequently switch mglru
with lru dynamically. will we?

> +
> + if (!--remaining)
> + return false;
> + }
> + }
> +
> + return true;
> +}
> +
> +static void lru_gen_change_state(bool enable)
> +{
> + static DEFINE_MUTEX(state_mutex);
> +
> + struct mem_cgroup *memcg;
> +
> + cgroup_lock();
> + cpus_read_lock();
> + get_online_mems();
> + mutex_lock(&state_mutex);
> +
> + if (enable == lru_gen_enabled())
> + goto unlock;
> +
> + if (enable)
> + static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
> + else
> + static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
> +
> + memcg = mem_cgroup_iter(NULL, NULL, NULL);
> + do {
> + int nid;
> +
> + for_each_node(nid) {
> + struct lruvec *lruvec = get_lruvec(memcg, nid);
> +
> + if (!lruvec)
> + continue;
> +
> + spin_lock_irq(&lruvec->lru_lock);
> +
> + VM_BUG_ON(!seq_is_valid(lruvec));
> + VM_BUG_ON(!state_is_valid(lruvec));
> +
> + lruvec->lrugen.enabled = enable;
> +
> + while (!(enable ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
> + spin_unlock_irq(&lruvec->lru_lock);
> + cond_resched();
> + spin_lock_irq(&lruvec->lru_lock);
> + }
> +
> + spin_unlock_irq(&lruvec->lru_lock);
> + }
> +
> + cond_resched();
> + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
> +unlock:
> + mutex_unlock(&state_mutex);
> + put_online_mems();
> + cpus_read_unlock();
> + cgroup_unlock();
> +}
> +
> +/******************************************************************************
> + * sysfs interface
> + ******************************************************************************/
> +
> +static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
> +{
> + unsigned int caps = 0;
> +
> + if (get_cap(LRU_GEN_CORE))
> + caps |= BIT(LRU_GEN_CORE);
> +
> + if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
> + caps |= BIT(LRU_GEN_MM_WALK);
> +
> + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
> + caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
> +
> + return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
> +}
> +
> +static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
> + const char *buf, size_t len)
> +{
> + int i;
> + unsigned int caps;
> +
> + if (tolower(*buf) == 'n')
> + caps = 0;
> + else if (tolower(*buf) == 'y')
> + caps = -1;
> + else if (kstrtouint(buf, 0, &caps))
> + return -EINVAL;
> +
> + for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
> + bool enable = caps & BIT(i);
> +
> + if (i == LRU_GEN_CORE)
> + lru_gen_change_state(enable);
> + else if (enable)
> + static_branch_enable(&lru_gen_caps[i]);
> + else
> + static_branch_disable(&lru_gen_caps[i]);
> + }
> +
> + return len;
> +}
> +
> +static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
> + enabled, 0644, show_enable, store_enable
> +);
> +
> +static struct attribute *lru_gen_attrs[] = {
> + &lru_gen_enabled_attr.attr,
> + NULL
> +};
> +
> +static struct attribute_group lru_gen_attr_group = {
> + .name = "lru_gen",
> + .attrs = lru_gen_attrs,
> +};
> +
> /******************************************************************************
> * initialization
> ******************************************************************************/
> @@ -4956,6 +5179,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
> struct lru_gen_struct *lrugen = &lruvec->lrugen;
>
> lrugen->max_seq = MIN_NR_GENS + 1;
> + lrugen->enabled = lru_gen_enabled();
>
> for_each_gen_type_zone(gen, type, zone)
> INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
> @@ -4996,6 +5220,9 @@ static int __init init_lru_gen(void)
> BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
> BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
>
> + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
> + pr_err("lru_gen: failed to create sysfs group\n");
> +
> return 0;
> };
> late_initcall(init_lru_gen);
> --
> 2.35.1.616.g0bdcbb4464-goog
>

Thanks
Barry