Re: [PATCH v4 1/1] mm: vmscan: retry folios written back while isolated for traditional LRU
From: chenridong
Date: Thu Dec 12 2024 - 19:54:33 EST
On 2024/12/13 7:17, Barry Song wrote:
> On Wed, Dec 11, 2024 at 1:11 AM chenridong <chenridong@xxxxxxxxxx> wrote:
>>
>>
>>
>> On 2024/12/10 16:24, Barry Song wrote:
>>> On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@xxxxxxxxxx> wrote:
>>>>
>>>>
>>>>
>>>> On 2024/12/10 12:54, Barry Song wrote:
>>>>> On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@xxxxxxxxxxxxxxx> wrote:
>>>>>>
>>>>>> From: Chen Ridong <chenridong@xxxxxxxxxx>
>>>>>>
>>>>>> Commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
>>>>>> while isolated") fixed this issue only for mglru. However, the issue
>>>>>> also exists in the traditional active/inactive LRU, and it gets worse
>>>>>> when a THP is split, which makes the list longer and a batch of folio
>>>>>> reclaim take longer to finish.
>>>>>>
>>>>>> Fix the issue in the same way for the traditional LRU: first extract the
>>>>>> common logic into a new helper, 'find_folios_written_back', then reuse
>>>>>> it in 'shrink_inactive_list', and finally retry reclaiming the folios
>>>>>> that may have missed the rotation for the traditional LRU.
>>>>>
>>>>> let's drop the cover-letter and refine the changelog.
>>>>>
>>>> Will update.
>>>>
>>>>>>
>>>>>> Signed-off-by: Chen Ridong <chenridong@xxxxxxxxxx>
>>>>>> ---
>>>>>> include/linux/mmzone.h | 3 +-
>>>>>> mm/vmscan.c | 108 +++++++++++++++++++++++++++++------------
>>>>>> 2 files changed, 77 insertions(+), 34 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>>> index b36124145a16..47c6e8c43dcd 100644
>>>>>> --- a/include/linux/mmzone.h
>>>>>> +++ b/include/linux/mmzone.h
>>>>>> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk;
>>>>>>
>>>>>> #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
>>>>>> #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
>>>>>> +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
>>>>>>
>>>>>> #ifdef CONFIG_LRU_GEN
>>>>>>
>>>>>> @@ -406,8 +407,6 @@ enum {
>>>>>> NR_LRU_GEN_CAPS
>>>>>> };
>>>>>>
>>>>>> -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
>>>>>> -
>>>>>> #define MIN_LRU_BATCH BITS_PER_LONG
>>>>>> #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
>>>>>>
>>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>>> index 76378bc257e3..1f0d194f8b2f 100644
>>>>>> --- a/mm/vmscan.c
>>>>>> +++ b/mm/vmscan.c
>>>>>> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>>> task->reclaim_state = rs;
>>>>>> }
>>>>>>
>>>>>> +/**
>>>>>> + * find_folios_written_back - Find and move the written back folios to a new list.
>>>>>> + * @list: folios list
>>>>>> + * @clean: the written back folios list
>>>>>> + * @skip: whether to skip moving the written back folios to the clean list.
>>>>>> + */
>>>>>> +static inline void find_folios_written_back(struct list_head *list,
>>>>>> + struct list_head *clean, bool skip)
>>>>>> +{
>>>>>> + struct folio *folio;
>>>>>> + struct folio *next;
>>>>>> +
>>>>>> + list_for_each_entry_safe_reverse(folio, next, list, lru) {
>>>>>> + if (!folio_evictable(folio)) {
>>>>>> + list_del(&folio->lru);
>>>>>> + folio_putback_lru(folio);
>>>>>> + continue;
>>>>>> + }
>>>>>> +
>>>>>> + if (folio_test_reclaim(folio) &&
>>>>>> + (folio_test_dirty(folio) || folio_test_writeback(folio))) {
>>>>>> + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
>>>>>> + if (lru_gen_enabled() && folio_test_workingset(folio))
>>>>>> + folio_set_referenced(folio);
>>>>>> + continue;
>>>>>> + }
>>>>>> +
>>>>>> + if (skip || folio_test_active(folio) || folio_test_referenced(folio) ||
>>>>>> + folio_mapped(folio) || folio_test_locked(folio) ||
>>>>>> + folio_test_dirty(folio) || folio_test_writeback(folio)) {
>>>>>> + /* don't add rejected folios to the oldest generation */
>>>>>> + if (lru_gen_enabled())
>>>>>> + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
>>>>>> + BIT(PG_active));
>>>>>> + continue;
>>>>>> + }
>>>>>> +
>>>>>> + /* retry folios that may have missed folio_rotate_reclaimable() */
>>>>>> + list_move(&folio->lru, clean);
>>>>>> + }
>>>>>> +}
>>>>>> +
>>>>>> /*
>>>>>> * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
>>>>>> * scan_control->nr_reclaimed.
>>>>>> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void)
>>>>>> return !(current->flags & PF_LOCAL_THROTTLE);
>>>>>> }
>>>>>>
>>>>>> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
>>>>>> + struct reclaim_stat *curr)
>>>>>> +{
>>>>>> + int i;
>>>>>> +
>>>>>> + stat->nr_dirty += curr->nr_dirty;
>>>>>> + stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
>>>>>> + stat->nr_congested += curr->nr_congested;
>>>>>> + stat->nr_writeback += curr->nr_writeback;
>>>>>> + stat->nr_immediate += curr->nr_immediate;
>>>>>> + stat->nr_pageout += curr->nr_pageout;
>>>>>> + stat->nr_ref_keep += curr->nr_ref_keep;
>>>>>> + stat->nr_unmap_fail += curr->nr_unmap_fail;
>>>>>> + stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
>>>>>> + stat->nr_demoted += curr->nr_demoted;
>>>>>> + for (i = 0; i < ANON_AND_FILE; i++)
>>>>>> + stat->nr_activate[i] = curr->nr_activate[i];
>>>>>> +}
>>>>>
>>>>> you didn't have this before, what's the purpose of this?
>>>>>
>>>>
>>>> We may call shrink_folio_list twice, and 'curr' is reset inside the
>>>> shrink_folio_list function on each call. We should accumulate the stats
>>>> as a whole, which are then used to calculate the cost and returned to
>>>> the caller.
>>>
>>> Does mglru have the same issue? If so, we may need to send a patch to
>>> fix mglru's stat accounting as well. By the way, the code is rather
>>> messy—could it be implemented as shown below instead?
>>>
>>
>> I have checked the code (in the evict_folios function) again, and it
>> appears that 'reclaimed' is accumulated into sc->nr_reclaimed on each
>> pass, which seems to count the results twice. Should I address this issue
>> with a separate patch?
>
> I don't think there is any problem.
>
> reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
> sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
> sc->nr_reclaimed += reclaimed;
>
> reclaimed is always the number of pages we have reclaimed in
> shrink_folio_list(), whether it is a retry or not.
>
Thank you, I made a mistake.
>>
>> if (!cgroup_reclaim(sc))
>> __count_vm_events(item, reclaimed);
>> __count_memcg_events(memcg, item, reclaimed);
>> __count_vm_events(PGSTEAL_ANON + type, reclaimed);
>>
>>
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index 1f0d194f8b2f..40d2ddde21f5 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>>> struct swap_iocb *plug = NULL;
>>>
>>> folio_batch_init(&free_folios);
>>> - memset(stat, 0, sizeof(*stat));
>>> cond_resched();
>>> do_demote_pass = can_demote(pgdat->node_id, sc);
>>>
>>> @@ -1949,25 +1948,6 @@ static int current_may_throttle(void)
>>> return !(current->flags & PF_LOCAL_THROTTLE);
>>> }
>>>
>>> -static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
>>> - struct reclaim_stat *curr)
>>> -{
>>> - int i;
>>> -
>>> - stat->nr_dirty += curr->nr_dirty;
>>> - stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
>>> - stat->nr_congested += curr->nr_congested;
>>> - stat->nr_writeback += curr->nr_writeback;
>>> - stat->nr_immediate += curr->nr_immediate;
>>> - stat->nr_pageout += curr->nr_pageout;
>>> - stat->nr_ref_keep += curr->nr_ref_keep;
>>> - stat->nr_unmap_fail += curr->nr_unmap_fail;
>>> - stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
>>> - stat->nr_demoted += curr->nr_demoted;
>>> - for (i = 0; i < ANON_AND_FILE; i++)
>>> - stat->nr_activate[i] = curr->nr_activate[i];
>>> -}
>>> -
>>> /*
>>> * shrink_inactive_list() is a helper for shrink_node(). It returns the number
>>> * of reclaimed pages
>>> @@ -1981,7 +1961,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>>> unsigned long nr_scanned;
>>> unsigned int nr_reclaimed = 0;
>>> unsigned long nr_taken;
>>> - struct reclaim_stat stat, curr;
>>> + struct reclaim_stat stat;
>>> bool file = is_file_lru(lru);
>>> enum vm_event_item item;
>>> struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>> @@ -2022,9 +2002,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>>>
>>> memset(&stat, 0, sizeof(stat));
>>> retry:
>>> - nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
>>> + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
>>> find_folios_written_back(&folio_list, &clean_list, skip_retry);
>>> - acc_reclaimed_stat(&stat, &curr);
>>>
>>> spin_lock_irq(&lruvec->lru_lock);
>>> move_folios_to_lru(lruvec, &folio_list);
>>>
>>
>> This seems much better. But we have some extra work to do:
>>
>> 1. In the shrink_folio_list function:
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -1089,12 +1089,12 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>> LIST_HEAD(ret_folios);
>> LIST_HEAD(demote_folios);
>> unsigned int nr_reclaimed = 0;
>> - unsigned int pgactivate = 0;
>> + unsigned int pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
>> + unsigned int nr_demote = 0;
>> bool do_demote_pass;
>> struct swap_iocb *plug = NULL;
>>
>> folio_batch_init(&free_folios);
>> - memset(stat, 0, sizeof(*stat));
>> cond_resched();
>> do_demote_pass = can_demote(pgdat->node_id, sc);
>>
>> @@ -1558,7 +1558,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>>
>> /* Migrate folios selected for demotion */
>> stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
>> - nr_reclaimed += stat->nr_demoted;
>> + stat->nr_demoted += nr_demote;
>> + nr_reclaimed += nr_demote;
>> /* Folios that could not be demoted are still in @demote_folios */
>> if (!list_empty(&demote_folios)) {
>> /* Folios which weren't demoted go back on @folio_list */
>> @@ -1586,7 +1587,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>> }
>> }
>>
>> - pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
>> - pgactivate = stat->nr_activate[0] + stat->nr_activate[1] - pgactivate;
>>
>> mem_cgroup_uncharge_folios(&free_folios);
>> try_to_unmap_flush();
>>
>>
>> 2. Outside of the shrink_folio_list function, the callers should memset
>> the stat.
>>
>> If you think this is better, I will update it like this.
>
> no. Please goto retry after you have collected all you need from `stat`,
> just as mglru is doing, and drop the "curr" and acc_reclaimed_stat().
>
> sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
> sc->nr_reclaimed += reclaimed;
>
> move_folios_to_lru() has already moved all the folios we are not interested
> in back to the lruvec before you retry. There is no duplicated counting.
>
I see, thank you.
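
If I understand correctly, the retry loop in shrink_inactive_list() would
then look roughly like this (an untested sketch; any per-iteration stat that
is consumed after the loop would need to be folded into sc->nr the same way
as nr_unqueued_dirty below):

retry:
	nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
	/* collect what this pass produced before a retry resets stat */
	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
	find_folios_written_back(&folio_list, &clean_list, skip_retry);

	spin_lock_irq(&lruvec->lru_lock);
	move_folios_to_lru(lruvec, &folio_list);
	if (!list_empty(&clean_list)) {
		list_splice_init(&clean_list, &folio_list);
		skip_retry = true;
		spin_unlock_irq(&lruvec->lru_lock);
		goto retry;
	}

That matches what evict_folios() already does for mglru.
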
Thanks,
Ridong
>>
>>>>
>>>> Thanks,
>>>> Ridong
>>>>
>>>>>> +
>>>>>> /*
>>>>>> * shrink_inactive_list() is a helper for shrink_node(). It returns the number
>>>>>> * of reclaimed pages
>>>>>> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>>>>>> enum lru_list lru)
>>>>>> {
>>>>>> LIST_HEAD(folio_list);
>>>>>> + LIST_HEAD(clean_list);
>>>>>> unsigned long nr_scanned;
>>>>>> unsigned int nr_reclaimed = 0;
>>>>>> unsigned long nr_taken;
>>>>>> - struct reclaim_stat stat;
>>>>>> + struct reclaim_stat stat, curr;
>>>>>> bool file = is_file_lru(lru);
>>>>>> enum vm_event_item item;
>>>>>> struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>>>>> bool stalled = false;
>>>>>> + bool skip_retry = false;
>>>>>>
>>>>>> while (unlikely(too_many_isolated(pgdat, file, sc))) {
>>>>>> if (stalled)
>>>>>> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>>>>>> if (nr_taken == 0)
>>>>>> return 0;
>>>>>>
>>>>>> - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
>>>>>> + memset(&stat, 0, sizeof(stat));
>>>>>> +retry:
>>>>>> + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
>>>>>> + find_folios_written_back(&folio_list, &clean_list, skip_retry);
>>>>>> + acc_reclaimed_stat(&stat, &curr);
>>>>>>
>>>>>> spin_lock_irq(&lruvec->lru_lock);
>>>>>> move_folios_to_lru(lruvec, &folio_list);
>>>>>> + if (!list_empty(&clean_list)) {
>>>>>> + list_splice_init(&clean_list, &folio_list);
>>>>>> + skip_retry = true;
>>>>>> + spin_unlock_irq(&lruvec->lru_lock);
>>>>>> + goto retry;
>>>
>>> This is rather confusing. We're still jumping to retry even though
>>> skip_retry=true is set. Can we find a clearer approach for this?
>>>
>>> It was somewhat acceptable before we introduced the extracted
>>> function find_folios_written_back(). However, it has become
>>> harder to follow now that skip_retry is passed across functions.
>>>
>>> I find renaming skip_retry to is_retry more intuitive. The logic
>>> is that since we are already retrying, find_folios_written_back()
>>> shouldn’t move folios to the clean list again. The intended semantics
>>> are: we have retried, don’t retry again.
>>>
>>
>> Reasonable. Will update.
>>
>> Thanks,
>> Ridong
>>
>>>
>>>>>> + }
>>>>>>
>>>>>> __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
>>>>>> stat.nr_demoted);
>>>>>> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>>>>>> int reclaimed;
>>>>>> LIST_HEAD(list);
>>>>>> LIST_HEAD(clean);
>>>>>> - struct folio *folio;
>>>>>> - struct folio *next;
>>>>>> enum vm_event_item item;
>>>>>> struct reclaim_stat stat;
>>>>>> struct lru_gen_mm_walk *walk;
>>>>>> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>>>>>> scanned, reclaimed, &stat, sc->priority,
>>>>>> type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
>>>>>>
>>>>>> - list_for_each_entry_safe_reverse(folio, next, &list, lru) {
>>>>>> - if (!folio_evictable(folio)) {
>>>>>> - list_del(&folio->lru);
>>>>>> - folio_putback_lru(folio);
>>>>>> - continue;
>>>>>> - }
>>>>>> -
>>>>>> - if (folio_test_reclaim(folio) &&
>>>>>> - (folio_test_dirty(folio) || folio_test_writeback(folio))) {
>>>>>> - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
>>>>>> - if (folio_test_workingset(folio))
>>>>>> - folio_set_referenced(folio);
>>>>>> - continue;
>>>>>> - }
>>>>>> -
>>>>>> - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
>>>>>> - folio_mapped(folio) || folio_test_locked(folio) ||
>>>>>> - folio_test_dirty(folio) || folio_test_writeback(folio)) {
>>>>>> - /* don't add rejected folios to the oldest generation */
>>>>>> - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
>>>>>> - BIT(PG_active));
>>>>>> - continue;
>>>>>> - }
>>>>>> -
>>>>>> - /* retry folios that may have missed folio_rotate_reclaimable() */
>>>>>> - list_move(&folio->lru, &clean);
>>>>>> - }
>>>>>> -
>>>>>> + find_folios_written_back(&list, &clean, skip_retry);
>>>>>> spin_lock_irq(&lruvec->lru_lock);
>>>>>>
>>>>>> move_folios_to_lru(lruvec, &list);
>>>>>> --
>>>>>> 2.34.1
>>>>>>
>>>>>
>>>
>>> Thanks
>>> Barry