Re: [patch] mm, page_alloc: wakeup kcompactd even if kswapd cannot free more memory
From: Vlastimil Babka
Date: Tue Mar 20 2018 - 05:58:12 EST
On 03/12/2018 01:00 AM, David Rientjes wrote:
> Kswapd will not wakeup if per-zone watermarks are not failing or if too
> many previous attempts at background reclaim have failed.
>
> This can be true if there is a lot of free memory available. For high-
> order allocations, kswapd is responsible for waking up kcompactd for
> background compaction. If the zone is now below its watermarks or
Should "now" be "not" here, i.e. "If the zone is not below its watermarks"?
> reclaim has recently failed (lots of free memory, nothing left to
> reclaim), kcompactd does not get woken up.
>
> When __GFP_DIRECT_RECLAIM is not allowed, allow kcompactd to still be
> woken up even if kswapd will not reclaim. This allows high-order
> allocations, such as thp, to still trigger background compaction even
> when the zone has an abundance of free memory.
>
> Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx>
Acked-by: Vlastimil Babka <vbabka@xxxxxxx>
> ---
> .../postprocess/trace-vmscan-postprocess.pl | 4 +--
> include/linux/mmzone.h | 3 +-
> include/trace/events/vmscan.h | 17 ++++++----
> mm/page_alloc.c | 14 ++++----
> mm/vmscan.c | 32 +++++++++++++------
> 5 files changed, 45 insertions(+), 25 deletions(-)
>
> diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
> --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
> +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
> @@ -111,7 +111,7 @@ my $regex_direct_begin_default = 'order=([0-9]*) may_writepage=([0-9]*) gfp_flag
> my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
> my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
> my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
> -my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
> +my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
> my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
> my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
> my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
> @@ -201,7 +201,7 @@ $regex_kswapd_sleep = generate_traceevent_regex(
> $regex_wakeup_kswapd = generate_traceevent_regex(
> "vmscan/mm_vmscan_wakeup_kswapd",
> $regex_wakeup_kswapd_default,
> - "nid", "zid", "order");
> + "nid", "zid", "order", "gfp_flags");
> $regex_lru_isolate = generate_traceevent_regex(
> "vmscan/mm_vmscan_lru_isolate",
> $regex_lru_isolate_default,
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -775,7 +775,8 @@ static inline bool is_dev_zone(const struct zone *zone)
> #include <linux/memory_hotplug.h>
>
> void build_all_zonelists(pg_data_t *pgdat);
> -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
> +void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
> + enum zone_type classzone_idx);
> bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
> int classzone_idx, unsigned int alloc_flags,
> long free_pages);
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -78,26 +78,29 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
>
> TRACE_EVENT(mm_vmscan_wakeup_kswapd,
>
> - TP_PROTO(int nid, int zid, int order),
> + TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
>
> - TP_ARGS(nid, zid, order),
> + TP_ARGS(nid, zid, order, gfp_flags),
>
> TP_STRUCT__entry(
> - __field( int, nid )
> - __field( int, zid )
> - __field( int, order )
> + __field( int, nid )
> + __field( int, zid )
> + __field( int, order )
> + __field( gfp_t, gfp_flags )
> ),
>
> TP_fast_assign(
> __entry->nid = nid;
> __entry->zid = zid;
> __entry->order = order;
> + __entry->gfp_flags = gfp_flags;
> ),
>
> - TP_printk("nid=%d zid=%d order=%d",
> + TP_printk("nid=%d zid=%d order=%d gfp_flags=%s",
> __entry->nid,
> __entry->zid,
> - __entry->order)
> + __entry->order,
> + show_gfp_flags(__entry->gfp_flags))
> );
>
> DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3683,16 +3683,18 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
> return page;
> }
>
> -static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
> +static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
> + const struct alloc_context *ac)
> {
> struct zoneref *z;
> struct zone *zone;
> pg_data_t *last_pgdat = NULL;
> + enum zone_type high_zoneidx = ac->high_zoneidx;
>
> - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
> - ac->high_zoneidx, ac->nodemask) {
> + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
> + ac->nodemask) {
> if (last_pgdat != zone->zone_pgdat)
> - wakeup_kswapd(zone, order, ac->high_zoneidx);
> + wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
> last_pgdat = zone->zone_pgdat;
> }
> }
> @@ -3971,7 +3973,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> goto nopage;
>
> if (gfp_mask & __GFP_KSWAPD_RECLAIM)
> - wake_all_kswapds(order, ac);
> + wake_all_kswapds(order, gfp_mask, ac);
>
> /*
> * The adjusted alloc_flags might result in immediate success, so try
> @@ -4029,7 +4031,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> retry:
> /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
> if (gfp_mask & __GFP_KSWAPD_RECLAIM)
> - wake_all_kswapds(order, ac);
> + wake_all_kswapds(order, gfp_mask, ac);
>
> reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
> if (reserve_flags)
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3546,16 +3546,21 @@ static int kswapd(void *p)
> }
>
> /*
> - * A zone is low on free memory, so wake its kswapd task to service it.
> + * A zone is low on free memory or too fragmented for high-order memory. If
> + * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
> + * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
> + * has failed or is not needed, still wake up kcompactd if only compaction is
> + * needed.
> */
> -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
> +void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
> + enum zone_type classzone_idx)
> {
> pg_data_t *pgdat;
>
> if (!managed_zone(zone))
> return;
>
> - if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
> + if (!cpuset_zone_allowed(zone, gfp_flags))
> return;
> pgdat = zone->zone_pgdat;
> pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
> @@ -3564,14 +3569,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
> if (!waitqueue_active(&pgdat->kswapd_wait))
> return;
>
> - /* Hopeless node, leave it to direct reclaim */
> - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
> - return;
> -
> - if (pgdat_balanced(pgdat, order, classzone_idx))
> + /* Hopeless node, leave it to direct reclaim if possible */
> + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
> + pgdat_balanced(pgdat, order, classzone_idx)) {
> + /*
> + * There may be plenty of free memory available, but it's too
> + * fragmented for high-order allocations. Wake up kcompactd
> + * and rely on compaction_suitable() to determine if it's
> + * needed. If it fails, it will defer subsequent attempts to
> + * ratelimit its work.
> + */
> + if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
> + wakeup_kcompactd(pgdat, order, classzone_idx);
> return;
> + }
>
> - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
> + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
> + gfp_flags);
> wake_up_interruptible(&pgdat->kswapd_wait);
> }
>
>