RE: [PATCH] vmscan: add a vmscan event for reclaim_pages

From: Jaewon Kim
Date: Fri Oct 11 2024 - 04:25:53 EST


>> reclaim_folio_list() uses a dummy reclaim_stat whose contents are never
>> used. To expose these memory stats, add a new trace event. This is useful
>> for finding out how many pages were not reclaimed, and why.
>>
>> This is an example.
>> mm_vmscan_reclaim_pages: nr_scanned=17 nr_reclaimed=17 nr_dirty=0 nr_writeback=0 nr_congested=0 nr_immediate=0 nr_activate_anon=0 nr_activate_file=0 nr_ref_keep=0 nr_unmap_fail=0
>>
>> Currently reclaim_folio_list is only called by reclaim_pages, and
>> reclaim_pages is used by damon and madvise. In the latest Android,
>> reclaim_pages is also used by shmem to reclaim all pages in an
>> address_space.
>>
>> Signed-off-by: Jaewon Kim <jaewon31.kim@xxxxxxxxxxx>
>> ---
>> include/trace/events/vmscan.h | 41 +++++++++++++++++++++++++++++++++++
>> mm/vmscan.c | 40 +++++++++++++++++++++++++++++-----
>> 2 files changed, 76 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
>> index 1a488c30afa5..509110a12fa5 100644
>> --- a/include/trace/events/vmscan.h
>> +++ b/include/trace/events/vmscan.h
>> @@ -346,6 +346,47 @@ TRACE_EVENT(mm_vmscan_write_folio,
>> show_reclaim_flags(__entry->reclaim_flags))
>> );
>>
>> +TRACE_EVENT(mm_vmscan_reclaim_pages,
>> +
>> + TP_PROTO(unsigned long nr_scanned, unsigned long nr_reclaimed,
>> + struct reclaim_stat *stat),
>> +
>> + TP_ARGS(nr_scanned, nr_reclaimed, stat),
>> +
>> + TP_STRUCT__entry(
>> + __field(unsigned long, nr_scanned)
>> + __field(unsigned long, nr_reclaimed)
>> + __field(unsigned long, nr_dirty)
>> + __field(unsigned long, nr_writeback)
>> + __field(unsigned long, nr_congested)
>> + __field(unsigned long, nr_immediate)
>> + __field(unsigned int, nr_activate0)
>> + __field(unsigned int, nr_activate1)
>> + __field(unsigned long, nr_ref_keep)
>> + __field(unsigned long, nr_unmap_fail)
>> + ),
>> +
>> + TP_fast_assign(
>> + __entry->nr_scanned = nr_scanned;
>> + __entry->nr_reclaimed = nr_reclaimed;
>> + __entry->nr_dirty = stat->nr_dirty;
>> + __entry->nr_writeback = stat->nr_writeback;
>> + __entry->nr_congested = stat->nr_congested;
>> + __entry->nr_immediate = stat->nr_immediate;
>> + __entry->nr_activate0 = stat->nr_activate[0];
>> + __entry->nr_activate1 = stat->nr_activate[1];
>> + __entry->nr_ref_keep = stat->nr_ref_keep;
>> + __entry->nr_unmap_fail = stat->nr_unmap_fail;
>> + ),
>> +
>> + TP_printk("nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld",
>> + __entry->nr_scanned, __entry->nr_reclaimed,
>> + __entry->nr_dirty, __entry->nr_writeback,
>> + __entry->nr_congested, __entry->nr_immediate,
>> + __entry->nr_activate0, __entry->nr_activate1,
>> + __entry->nr_ref_keep, __entry->nr_unmap_fail)
>> +);
>> +
>> TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
>>
>> TP_PROTO(int nid,
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 749cdc110c74..4776c42dfd2a 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -2126,9 +2126,9 @@ static void shrink_active_list(unsigned long nr_to_scan,
>> }
>>
>> static unsigned int reclaim_folio_list(struct list_head *folio_list,
>> - struct pglist_data *pgdat)
>> + struct pglist_data *pgdat,
>> + struct reclaim_stat *stat)
>> {
>> - struct reclaim_stat dummy_stat;
>> unsigned int nr_reclaimed;
>> struct folio *folio;
>> struct scan_control sc = {
>> @@ -2139,7 +2139,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
>> .no_demotion = 1,
>> };
>>
>> - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true);
>> + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, stat, true);
>> while (!list_empty(folio_list)) {
>> folio = lru_to_folio(folio_list);
>> list_del(&folio->lru);
>> @@ -2149,16 +2149,40 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
>> return nr_reclaimed;
>> }
>>
>> +static void reclaim_stat_add(struct reclaim_stat *stat_from,
>> + struct reclaim_stat *stat_to)
>> +{
>> + int type;
>> +
>> + if (!trace_mm_vmscan_reclaim_pages_enabled())
>> + return;
>> +
>> + stat_to->nr_dirty += stat_from->nr_dirty;
>> + stat_to->nr_unqueued_dirty += stat_from->nr_unqueued_dirty;
>> + stat_to->nr_congested += stat_from->nr_congested;
>> + stat_to->nr_writeback += stat_from->nr_writeback;
>> + stat_to->nr_immediate += stat_from->nr_immediate;
>> + stat_to->nr_pageout += stat_from->nr_pageout;
>> + for (type = 0; type < ANON_AND_FILE; type++)
>> + stat_to->nr_activate[type] += stat_from->nr_activate[type];
>> + stat_to->nr_ref_keep += stat_from->nr_ref_keep;
>> + stat_to->nr_unmap_fail += stat_from->nr_unmap_fail;
>> + stat_to->nr_lazyfree_fail += stat_from->nr_lazyfree_fail;
>> +}
>
>Could we avoid this by using a single stat that just accumulates over
>multiple calls to reclaim_folio_list()?
>
>That means shrink_folio_list() would not do the initial memset(0) and it
>would be caller responsibility.

Hi

Thank you for your comment. Yes, if that is allowed, I can do it that way.
When I checked, the following functions would then have to do the memset():

reclaim_clean_pages_from_list
shrink_inactive_list
reclaim_folio_list
evict_folios

Actually, I was planning to move trace_mm_vmscan_reclaim_pages into
reclaim_folio_list so that we don't have to sum up the stats, and so that
we can print the node number, too. Since we would then see one log line per
node, computing the sum would be the post-processing parser's job.

Option 1. No change to the memset, but print one event per node.
mm_vmscan_reclaim_pages: nid=0 nr_scanned=112 nr_reclaimed=112 nr_dirty=0 nr_writeback=0 nr_congested=0 nr_immediate=0 nr_activate_anon=0 nr_activate_file=0 nr_ref_keep=0 nr_unmap_fail=0
mm_vmscan_reclaim_pages: nid=1 ...
mm_vmscan_reclaim_pages: nid=2 ...

Option 2. Change the memset, but we don't care about the stats from each node.
mm_vmscan_reclaim_pages: nr_scanned=35 nr_reclaimed=35 nr_dirty=0 nr_writeback=0 nr_congested=0 nr_immediate=0 nr_activate_anon=0 nr_activate_file=0 nr_ref_keep=0 nr_unmap_fail=0

Would you give me your preference between the two options?

Thank you
Jaewon Kim

>
>AFAICS shrink_folio_list() only cares about these fields:
>
>pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
>
>in order to do
>
>count_vm_events(PGACTIVATE, pgactivate);
>
>Which could be adjusted to deal with accumulating stat - i.e. take an
>initial sum of the fields in stat and subtract from the final sum to get
>the delta.
>
>> unsigned long reclaim_pages(struct list_head *folio_list)
>> {
>> int nid;
>> + unsigned int nr_scanned = 0;
>> unsigned int nr_reclaimed = 0;
>> LIST_HEAD(node_folio_list);
>> unsigned int noreclaim_flag;
>> + struct reclaim_stat stat_total, stat_one;
>>
>> if (list_empty(folio_list))
>> return nr_reclaimed;
>>
>> + memset(&stat_total, 0, sizeof(stat_total));
>> noreclaim_flag = memalloc_noreclaim_save();
>>
>> nid = folio_nid(lru_to_folio(folio_list));
>> @@ -2168,14 +2192,20 @@ unsigned long reclaim_pages(struct list_head *folio_list)
>> if (nid == folio_nid(folio)) {
>> folio_clear_active(folio);
>> list_move(&folio->lru, &node_folio_list);
>> + nr_scanned += folio_nr_pages(folio);
>> continue;
>> }
>>
>> - nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
>> + nr_reclaimed += reclaim_folio_list(&node_folio_list,
>> + NODE_DATA(nid), &stat_one);
>> + reclaim_stat_add(&stat_one, &stat_total);
>> nid = folio_nid(lru_to_folio(folio_list));
>> } while (!list_empty(folio_list));
>>
>> - nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
>> + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid),
>> + &stat_one);
>> + reclaim_stat_add(&stat_one, &stat_total);
>> + trace_mm_vmscan_reclaim_pages(nr_scanned, nr_reclaimed, &stat_total);
>>
>> memalloc_noreclaim_restore(noreclaim_flag);
>>