Re: [PATCH V2 2/7] mm: move MADV_FREE pages into LRU_INACTIVE_FILE list

From: Michal Hocko
Date: Fri Feb 10 2017 - 08:24:41 EST


On Fri 03-02-17 15:33:18, Shaohua Li wrote:
> Userspace indicates MADV_FREE pages could be freed without pageout, so
> it pretty much likes used once file pages. For such pages, we'd like to
> reclaim them once there is memory pressure. Also it might be unfair
> reclaiming MADV_FREE pages always before used once file pages and we
> definitively want to reclaim the pages before other anonymous and file
> pages.
>
> To speed up MADV_FREE pages reclaim, we put the pages into
> LRU_INACTIVE_FILE list. The rationale is LRU_INACTIVE_FILE list is tiny
> nowadays and should be full of used once file pages. Reclaiming
> MADV_FREE pages will not have much interfere of anonymous and active
> file pages. And the inactive file pages and MADV_FREE pages will be
> reclaimed according to their age, so we don't reclaim too many MADV_FREE
> pages too. Putting the MADV_FREE pages into LRU_INACTIVE_FILE_LIST also
> means we can reclaim the pages without swap support. This idea is
> suggested by Johannes.
>
> We also clear the pages SwapBacked flag to indicate they are MADV_FREE
> pages.

I like this. I had expected this to be more convoluted but it looks
quite straightforward. I didn't get to do a really deep review and add
my acked-by but from a quick look there do not seem to be any surprises.
I was worried about vmstat accounting. There are some places which
isolate a page from the LRU, account based on that LRU, and later use
page_is_file_cache to tell which LRU this was. This should work fine,
though, because you never touch pages which are off-lru.

That being said, I do not see any major issues. There might be some minor
things and this will need a lot of testing, but it is definitely a move
in the right direction. I hope to do a deeper review after I get back
from vacation (20th Feb).

> Cc: Michal Hocko <mhocko@xxxxxxxx>
> Cc: Minchan Kim <minchan@xxxxxxxxxx>
> Cc: Hugh Dickins <hughd@xxxxxxxxxx>
> Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
> Cc: Rik van Riel <riel@xxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>

I guess
Suggested-by: Johannes Weiner <hannes@xxxxxxxxxxx>

would be appropriate.

> Signed-off-by: Shaohua Li <shli@xxxxxx>
> ---
> include/linux/mm_inline.h | 5 +++++
> include/linux/swap.h | 2 +-
> include/linux/vm_event_item.h | 2 +-
> mm/huge_memory.c | 5 ++---
> mm/madvise.c | 3 +--
> mm/swap.c | 50 ++++++++++++++++++++++++-------------------
> mm/vmstat.c | 1 +
> 7 files changed, 39 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index e030a68..fdded06 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -22,6 +22,11 @@ static inline int page_is_file_cache(struct page *page)
> return !PageSwapBacked(page);
> }
>
> +static inline bool page_is_lazyfree(struct page *page)
> +{
> + return PageAnon(page) && !PageSwapBacked(page);
> +}
> +
> static __always_inline void __update_lru_size(struct lruvec *lruvec,
> enum lru_list lru, enum zone_type zid,
> int nr_pages)
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 45e91dd..486494e 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu);
> extern void lru_add_drain_all(void);
> extern void rotate_reclaimable_page(struct page *page);
> extern void deactivate_file_page(struct page *page);
> -extern void deactivate_page(struct page *page);
> +extern void mark_page_lazyfree(struct page *page);
> extern void swap_setup(void);
>
> extern void add_page_to_unevictable_list(struct page *page);
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 6aa1b6c..94e58da 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
> FOR_ALL_ZONES(PGALLOC),
> FOR_ALL_ZONES(ALLOCSTALL),
> FOR_ALL_ZONES(PGSCAN_SKIP),
> - PGFREE, PGACTIVATE, PGDEACTIVATE,
> + PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
> PGFAULT, PGMAJFAULT,
> PGLAZYFREED,
> PGREFILL,
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index ecf569d..ddb9a94 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1391,9 +1391,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> ClearPageDirty(page);
> unlock_page(page);
>
> - if (PageActive(page))
> - deactivate_page(page);
> -
> if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
> orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
> tlb->fullmm);
> @@ -1404,6 +1401,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> set_pmd_at(mm, addr, pmd, orig_pmd);
> tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> }
> +
> + mark_page_lazyfree(page);
> ret = true;
> out:
> spin_unlock(ptl);
> diff --git a/mm/madvise.c b/mm/madvise.c
> index c867d88..c24549e 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -378,10 +378,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> ptent = pte_mkclean(ptent);
> ptent = pte_wrprotect(ptent);
> set_pte_at(mm, addr, pte, ptent);
> - if (PageActive(page))
> - deactivate_page(page);
> tlb_remove_tlb_entry(tlb, pte, addr);
> }
> + mark_page_lazyfree(page);
> }
> out:
> if (nr_swap) {
> diff --git a/mm/swap.c b/mm/swap.c
> index c4910f1..69a7e9d 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -46,7 +46,7 @@ int page_cluster;
> static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
> static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
> static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
> -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
> +static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
> #ifdef CONFIG_SMP
> static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
> #endif
> @@ -268,6 +268,11 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
> int lru = page_lru_base_type(page);
>
> del_page_from_lru_list(page, lruvec, lru);
> + if (page_is_lazyfree(page)) {
> + SetPageSwapBacked(page);
> + file = 0;
> + lru = LRU_INACTIVE_ANON;
> + }
> SetPageActive(page);
> lru += LRU_ACTIVE;
> add_page_to_lru_list(page, lruvec, lru);
> @@ -561,20 +566,21 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
> }
>
>
> -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
> +static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
> void *arg)
> {
> - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
> - int file = page_is_file_cache(page);
> - int lru = page_lru_base_type(page);
> + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
> + !PageUnevictable(page)) {
> + bool active = PageActive(page);
>
> - del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
> + del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + active);
> ClearPageActive(page);
> ClearPageReferenced(page);
> - add_page_to_lru_list(page, lruvec, lru);
> + ClearPageSwapBacked(page);
> + add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
>
> - __count_vm_event(PGDEACTIVATE);
> - update_page_reclaim_stat(lruvec, file, 0);
> + update_page_reclaim_stat(lruvec, 1, 0);
> + count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
> }
> }
>
> @@ -604,9 +610,9 @@ void lru_add_drain_cpu(int cpu)
> if (pagevec_count(pvec))
> pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
>
> - pvec = &per_cpu(lru_deactivate_pvecs, cpu);
> + pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
> if (pagevec_count(pvec))
> - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
> + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
>
> activate_page_drain(cpu);
> }
> @@ -638,22 +644,22 @@ void deactivate_file_page(struct page *page)
> }
>
> /**
> - * deactivate_page - deactivate a page
> + * mark_page_lazyfree - make an anon page lazyfree
> * @page: page to deactivate
> *
> - * deactivate_page() moves @page to the inactive list if @page was on the active
> - * list and was not an unevictable page. This is done to accelerate the reclaim
> - * of @page.
> + * mark_page_lazyfree() moves @page to the inactive file list.
> + * This is done to accelerate the reclaim of @page.
> */
> -void deactivate_page(struct page *page)
> -{
> - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
> - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
> +void mark_page_lazyfree(struct page *page)
> + {
> + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
> + !PageUnevictable(page)) {
> + struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
>
> get_page(page);
> if (!pagevec_add(pvec, page) || PageCompound(page))
> - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
> - put_cpu_var(lru_deactivate_pvecs);
> + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
> + put_cpu_var(lru_lazyfree_pvecs);
> }
> }
>
> @@ -704,7 +710,7 @@ void lru_add_drain_all(void)
> if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
> pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
> pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
> - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
> + pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
> need_activate_page_drain(cpu)) {
> INIT_WORK(work, lru_add_drain_per_cpu);
> queue_work_on(cpu, lru_add_drain_wq, work);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 69f9aff..7774196 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -992,6 +992,7 @@ const char * const vmstat_text[] = {
> "pgfree",
> "pgactivate",
> "pgdeactivate",
> + "pglazyfree",
>
> "pgfault",
> "pgmajfault",
> --
> 2.9.3
>

--
Michal Hocko
SUSE Labs