Re: [PATCH 2/5] mm/vmscan: Throttle reclaim and compaction when too many pages are isolated

From: Yang Shi
Date: Tue Sep 21 2021 - 14:45:41 EST


On Mon, Sep 20, 2021 at 1:55 AM Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> wrote:
>
> Page reclaim throttles on congestion if too many parallel reclaim instances
> have isolated too many pages. This makes no sense, excessive parallelisation
> has nothing to do with writeback or congestion.
>
> This patch creates an additional waitqueue to sleep on when too many
> pages are isolated. The throttled tasks are woken when the number
> of isolated pages is reduced or a timeout occurs. There may be
> some false positive wakeups for GFP_NOIO/GFP_NOFS callers but
> the tasks will throttle again if necessary.
>
> Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> ---
> include/linux/mmzone.h | 4 +++-
> include/trace/events/vmscan.h | 4 +++-
> mm/compaction.c | 2 +-
> mm/internal.h | 2 ++
> mm/page_alloc.c | 6 +++++-
> mm/vmscan.c | 22 ++++++++++++++++------
> 6 files changed, 30 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index ef0a63ebd21d..ca65d6a64bdd 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -275,6 +275,8 @@ enum lru_list {
>
> enum vmscan_throttle_state {
> VMSCAN_THROTTLE_WRITEBACK,
> + VMSCAN_THROTTLE_ISOLATED,
> + NR_VMSCAN_THROTTLE,
> };
>
> #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
> @@ -846,7 +848,7 @@ typedef struct pglist_data {
> int node_id;
> wait_queue_head_t kswapd_wait;
> wait_queue_head_t pfmemalloc_wait;
> - wait_queue_head_t reclaim_wait; /* wq for throttling reclaim */
> + wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];
> atomic_t nr_reclaim_throttled; /* nr of throtted tasks */
> unsigned long nr_reclaim_start; /* nr pages written while throttled
> * when throttling started. */
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> index c317f9fe0d17..d4905bd9e9c4 100644
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -28,10 +28,12 @@
> ) : "RECLAIM_WB_NONE"
>
> #define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK)
> +#define _VMSCAN_THROTTLE_ISOLATED (1 << VMSCAN_THROTTLE_ISOLATED)
>
> #define show_throttle_flags(flags) \
> (flags) ? __print_flags(flags, "|", \
> - {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"} \
> + {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \
> + {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"} \
> ) : "VMSCAN_THROTTLE_NONE"
>
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> index bfc93da1c2c7..221c9c10ad7e 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -822,7 +822,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
> if (cc->mode == MIGRATE_ASYNC)
> return -EAGAIN;
>
> - congestion_wait(BLK_RW_ASYNC, HZ/10);
> + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED, HZ/10);

It seems the wakeup of throttled tasks is missing from compaction's
too_many_isolated(). There are two too_many_isolated() functions: one
for compaction and one for the reclaimer. I see the wakeup code was
added to the reclaimer's version further below. Or was the compaction
one left out intentionally?

>
> if (fatal_signal_pending(current))
> return -EINTR;
> diff --git a/mm/internal.h b/mm/internal.h
> index e25b3686bfab..e6cd22fb5a43 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -118,6 +118,8 @@ extern unsigned long highest_memmap_pfn;
> */
> extern int isolate_lru_page(struct page *page);
> extern void putback_lru_page(struct page *page);
> +extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason,
> + long timeout);
>
> /*
> * in mm/rmap.c:
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d849ddfc1e51..78e538067651 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -7389,6 +7389,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
>
> static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
> {
> + int i;
> +
> pgdat_resize_init(pgdat);
>
> pgdat_init_split_queue(pgdat);
> @@ -7396,7 +7398,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>
> init_waitqueue_head(&pgdat->kswapd_wait);
> init_waitqueue_head(&pgdat->pfmemalloc_wait);
> - init_waitqueue_head(&pgdat->reclaim_wait);
> +
> + for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
> + init_waitqueue_head(&pgdat->reclaim_wait[i]);
>
> pgdat_page_ext_init(pgdat);
> lruvec_init(&pgdat->__lruvec);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b58ea0b13286..eb81dcac15b2 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1006,11 +1006,10 @@ static void handle_write_error(struct address_space *mapping,
> unlock_page(page);
> }
>
> -static void
> -reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason,
> +void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason,
> long timeout)
> {
> - wait_queue_head_t *wqh = &pgdat->reclaim_wait;
> + wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
> unsigned long start = jiffies;
> long ret;
> DEFINE_WAIT(wait);
> @@ -1044,7 +1043,7 @@ void __acct_reclaim_writeback(pg_data_t *pgdat, struct page *page)
> READ_ONCE(pgdat->nr_reclaim_start);
>
> if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
> - wake_up_interruptible_all(&pgdat->reclaim_wait);
> + wake_up_interruptible_all(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
> }
>
> /* possible outcome of pageout() */
> @@ -2159,6 +2158,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
> struct scan_control *sc)
> {
> unsigned long inactive, isolated;
> + bool too_many;
>
> if (current_is_kswapd())
> return 0;
> @@ -2182,6 +2182,17 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
> if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
> inactive >>= 3;
>
> + too_many = isolated > inactive;
> +
> + /* Wake up tasks throttled due to too_many_isolated. */
> + if (!too_many) {
> + wait_queue_head_t *wqh;
> +
> + wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
> + if (waitqueue_active(wqh))
> + wake_up_interruptible_all(wqh);
> + }
> +
> return isolated > inactive;

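Just return too_many here? It was already computed above, so there is
no need to evaluate "isolated > inactive" a second time.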

> }
>
> @@ -2291,8 +2302,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> return 0;
>
> /* wait a bit for the reclaimer. */
> - msleep(100);
> - stalled = true;
> + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED, HZ/10);
>
> /* We are about to die and free our memory. Return now. */
> if (fatal_signal_pending(current))
> --
> 2.31.1
>
>