Re: [PATCH 2/2] mm: get_scan_count consider reclaimable lru pages

From: Minchan Kim
Date: Wed Jul 27 2016 - 21:25:41 EST


On Wed, Jul 27, 2016 at 04:13:33PM +0100, Mel Gorman wrote:
> On Wed, Jul 27, 2016 at 03:22:26PM +0100, Mel Gorman wrote:
> > ---8<---
> > From: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> > Subject: [PATCH] mm, vmscan: Wait on a waitqueue when too many pages are
> > isolated
> >
>
> This is potentially a much better version as it avoids wakeup storms and
> does a better job of handling the case where pages could not be reclaimed.
>
> ---8<---
> mm, vmscan: Wait on a waitqueue when too many pages are isolated
>
> When too many pages are isolated, direct reclaim waits on congestion to
> clear for up to a tenth of a second. There is no reason to believe that too
> many pages are isolated due to dirty pages, reclaim efficiency or congestion.
> It may simply be because an extremely large number of processes have entered
> direct reclaim at the same time.
>
> This patch has processes wait on a waitqueue when too many pages are
> isolated. When parallel reclaimers finish shrink_page_list, they wake the
> waiters to recheck whether too many pages are isolated. While it is difficult
> to trigger this corner case, it's possible by launching an extremely large
> number of hackbench processes on a 32-bit system with limited memory. Without
> the patch, a large number of processes wait uselessly and with the patch
> applied, I was unable to stall the system.
>
> Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> ---
> include/linux/mmzone.h | 1 +
> mm/page_alloc.c | 1 +
> mm/vmscan.c | 24 +++++++++++++++---------
> 3 files changed, 17 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index d572b78b65e1..467878d7af33 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -653,6 +653,7 @@ typedef struct pglist_data {
> int node_id;
> wait_queue_head_t kswapd_wait;
> wait_queue_head_t pfmemalloc_wait;
> + wait_queue_head_t isolated_wait;
> struct task_struct *kswapd; /* Protected by
> mem_hotplug_begin/end() */
> int kswapd_order;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index fbd329e61bf6..3800972f240e 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5859,6 +5859,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
> #endif
> init_waitqueue_head(&pgdat->kswapd_wait);
> init_waitqueue_head(&pgdat->pfmemalloc_wait);
> + init_waitqueue_head(&pgdat->isolated_wait);
> #ifdef CONFIG_COMPACTION
> init_waitqueue_head(&pgdat->kcompactd_wait);
> #endif
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 9c0b2b0fc164..e264fcb7556b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1554,16 +1554,16 @@ int isolate_lru_page(struct page *page)
> * the LRU list will go small and be scanned faster than necessary, leading to
> * unnecessary swapping, thrashing and OOM.
> */
> -static int too_many_isolated(struct pglist_data *pgdat, int file,
> +static bool safe_to_isolate(struct pglist_data *pgdat, int file,
> struct scan_control *sc)
> {
> unsigned long inactive, isolated;
>
> if (current_is_kswapd())
> - return 0;
> + return true;
>
> - if (!sane_reclaim(sc))
> - return 0;
> + if (sane_reclaim(sc))
> + return true;
>
> if (file) {
> inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
> @@ -1581,7 +1581,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
> if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
> inactive >>= 3;
>
> - return isolated > inactive;
> + return isolated < inactive;
> }
>
> static noinline_for_stack void
> @@ -1701,12 +1701,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> if (!inactive_reclaimable_pages(lruvec, sc, lru))
> return 0;
>
> - while (unlikely(too_many_isolated(pgdat, file, sc))) {
> - congestion_wait(BLK_RW_ASYNC, HZ/10);
> + if (!safe_to_isolate(pgdat, file, sc)) {
> + wait_event_killable(pgdat->isolated_wait,
> + safe_to_isolate(pgdat, file, sc));
>
> /* We are about to die and free our memory. Return now. */
> - if (fatal_signal_pending(current))
> - return SWAP_CLUSTER_MAX;
> + if (fatal_signal_pending(current)) {
> + nr_reclaimed = SWAP_CLUSTER_MAX;
> + goto out;
> + }
> }
>
> lru_add_drain();
> @@ -1819,6 +1822,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
> nr_scanned, nr_reclaimed,
> sc->priority, file);
> +
> +out:
> + wake_up(&pgdat->isolated_wait);

Isolation can happen in migration/khugepaged as well as in reclaim, so a waiter
would sleep forever unless every place that can isolate pages also does a wakeup.