Re: [PATCH v5 07/14] mm, compaction: khugepaged should not give up due to need_resched()
From: David Rientjes
Date: Mon Jul 28 2014 - 20:59:36 EST
On Mon, 28 Jul 2014, Vlastimil Babka wrote:
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> index b2e4c92..60bdf8d 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -13,6 +13,14 @@
> /* The full zone was compacted */
> #define COMPACT_COMPLETE 4
>
> +/* Used to signal whether compaction detected need_sched() or lock contention */
> +/* No contention detected */
> +#define COMPACT_CONTENDED_NONE 0
> +/* Either need_sched() was true or fatal signal pending */
> +#define COMPACT_CONTENDED_SCHED 1
> +/* Zone lock or lru_lock was contended in async compaction */
> +#define COMPACT_CONTENDED_LOCK 2
> +
Make this an enum?
> #ifdef CONFIG_COMPACTION
> extern int sysctl_compact_memory;
> extern int sysctl_compaction_handler(struct ctl_table *table, int write,
> @@ -24,7 +32,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
> extern int fragmentation_index(struct zone *zone, unsigned int order);
> extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
> int order, gfp_t gfp_mask, nodemask_t *mask,
> - enum migrate_mode mode, bool *contended,
> + enum migrate_mode mode, int *contended,
> struct zone **candidate_zone);
> extern void compact_pgdat(pg_data_t *pgdat, int order);
> extern void reset_isolation_suitable(pg_data_t *pgdat);
> @@ -94,7 +102,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
> #else
> static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
> int order, gfp_t gfp_mask, nodemask_t *nodemask,
> - enum migrate_mode mode, bool *contended,
> + enum migrate_mode mode, int *contended,
> struct zone **candidate_zone)
> {
> return COMPACT_CONTINUE;
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 76a9775..2b8b6d8 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -223,9 +223,21 @@ static void update_pageblock_skip(struct compact_control *cc,
> }
> #endif /* CONFIG_COMPACTION */
>
> -static inline bool should_release_lock(spinlock_t *lock)
> +static int should_release_lock(spinlock_t *lock)
> {
> - return need_resched() || spin_is_contended(lock);
> + /*
> + * Sched contention has higher priority here as we may potentially
> + * have to abort whole compaction ASAP. Returning with lock contention
> + * means we will try another zone, and further decisions are
> + * influenced only when all zones are lock contended. That means
> + * potentially missing a lock contention is less critical.
> + */
> + if (need_resched())
> + return COMPACT_CONTENDED_SCHED;
> + else if (spin_is_contended(lock))
> + return COMPACT_CONTENDED_LOCK;
> + else
> + return COMPACT_CONTENDED_NONE;
I would avoid the last else statement and just return
COMPACT_CONTENDED_NONE.
> }
>
> /*
> @@ -240,7 +252,9 @@ static inline bool should_release_lock(spinlock_t *lock)
> static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
> bool locked, struct compact_control *cc)
> {
> - if (should_release_lock(lock)) {
> + int contended = should_release_lock(lock);
> +
> + if (contended) {
> if (locked) {
> spin_unlock_irqrestore(lock, *flags);
> locked = false;
> @@ -248,7 +262,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>
> /* async aborts if taking too long or contended */
> if (cc->mode == MIGRATE_ASYNC) {
> - cc->contended = true;
> + cc->contended = contended;
> return false;
> }
>
> @@ -274,7 +288,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
> /* async compaction aborts if contended */
> if (need_resched()) {
> if (cc->mode == MIGRATE_ASYNC) {
> - cc->contended = true;
> + cc->contended = COMPACT_CONTENDED_SCHED;
> return true;
> }
>
> @@ -1139,7 +1153,7 @@ out:
> }
>
> static unsigned long compact_zone_order(struct zone *zone, int order,
> - gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
> + gfp_t gfp_mask, enum migrate_mode mode, int *contended)
> {
> unsigned long ret;
> struct compact_control cc = {
> @@ -1154,11 +1168,11 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
> INIT_LIST_HEAD(&cc.migratepages);
>
> ret = compact_zone(zone, &cc);
> + *contended = cc.contended;
>
> VM_BUG_ON(!list_empty(&cc.freepages));
> VM_BUG_ON(!list_empty(&cc.migratepages));
>
> - *contended = cc.contended;
Not sure why, but ok :)
> return ret;
> }
>
> @@ -1171,14 +1185,15 @@ int sysctl_extfrag_threshold = 500;
> * @gfp_mask: The GFP mask of the current allocation
> * @nodemask: The allowed nodes to allocate from
> * @mode: The migration mode for async, sync light, or sync migration
> - * @contended: Return value that is true if compaction was aborted due to lock contention
> + * @contended: Return value that determines if compaction was aborted due to
> + * need_resched() or lock contention
> * @candidate_zone: Return the zone where we think allocation should succeed
> *
> * This is the main entry point for direct page compaction.
> */
> unsigned long try_to_compact_pages(struct zonelist *zonelist,
> int order, gfp_t gfp_mask, nodemask_t *nodemask,
> - enum migrate_mode mode, bool *contended,
> + enum migrate_mode mode, int *contended,
> struct zone **candidate_zone)
> {
> enum zone_type high_zoneidx = gfp_zone(gfp_mask);
> @@ -1188,6 +1203,9 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
> struct zone *zone;
> int rc = COMPACT_DEFERRED;
> int alloc_flags = 0;
> + bool all_zones_lock_contended = true; /* init true for &= operation */
> +
> + *contended = COMPACT_CONTENDED_NONE;
>
> /* Check if the GFP flags allow compaction */
> if (!order || !may_enter_fs || !may_perform_io)
> @@ -1201,13 +1219,20 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
> for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
> nodemask) {
> int status;
> + int zone_contended;
>
> if (compaction_deferred(zone, order))
> continue;
>
> status = compact_zone_order(zone, order, gfp_mask, mode,
> - contended);
> + &zone_contended);
> rc = max(status, rc);
> + /*
> + * It takes at least one zone that wasn't lock contended
> + * to turn all_zones_lock_contended to false.
> + */
> + all_zones_lock_contended &=
> + (zone_contended == COMPACT_CONTENDED_LOCK);
Eek, does this always work? COMPACT_CONTENDED_LOCK is 0x2 and
all_zones_lock_contended is a bool initialized to true. I'm fairly
certain you'd get better code generation if you defined
all_zones_lock_contended as
int all_zones_lock_contended = COMPACT_CONTENDED_LOCK
from the start and the source code would be more clear.
>
> /* If a normal allocation would succeed, stop compacting */
> if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
> @@ -1220,8 +1245,20 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
> * succeeds in this zone.
> */
> compaction_defer_reset(zone, order, false);
> - break;
> - } else if (mode != MIGRATE_ASYNC) {
> + /*
> + * It is possible that async compaction aborted due to
> + * need_resched() and the watermarks were ok thanks to
> + * somebody else freeing memory. The allocation can
> + * however still fail so we better signal the
> + * need_resched() contention anyway.
> + */
Makes sense because the page allocator still tries to allocate the page
even if this is returned, but it's not very clear in the comment.
> + if (zone_contended == COMPACT_CONTENDED_SCHED)
> + *contended = COMPACT_CONTENDED_SCHED;
> +
> + goto break_loop;
> + }
> +
> + if (mode != MIGRATE_ASYNC) {
> /*
> * We think that allocation won't succeed in this zone
> * so we defer compaction there. If it ends up
> @@ -1229,8 +1266,36 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
> */
> defer_compaction(zone, order);
> }
> +
> + /*
> + * We might have stopped compacting due to need_resched() in
> + * async compaction, or due to a fatal signal detected. In that
> + * case do not try further zones and signal need_resched()
> + * contention.
> + */
> + if ((zone_contended == COMPACT_CONTENDED_SCHED)
> + || fatal_signal_pending(current)) {
> + *contended = COMPACT_CONTENDED_SCHED;
> + goto break_loop;
> + }
> +
> + continue;
> +break_loop:
> + /*
> + * We might not have tried all the zones, so be conservative
> + * and assume they are not all lock contended.
> + */
> + all_zones_lock_contended = false;
> + break;
> }
>
> + /*
> + * If at least one zone wasn't deferred or skipped, we report if all
> + * zones that were tried were contended.
> + */
> + if (rc > COMPACT_SKIPPED && all_zones_lock_contended)
> + *contended = COMPACT_CONTENDED_LOCK;
> +
> return rc;
> }
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 5a0738f..4c1d604 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -144,8 +144,8 @@ struct compact_control {
> int order; /* order a direct compactor needs */
> int migratetype; /* MOVABLE, RECLAIMABLE etc */
> struct zone *zone;
> - bool contended; /* True if a lock was contended, or
> - * need_resched() true during async
> + int contended; /* Signal need_sched() or lock
> + * contention detected during
> * compaction
> */
> };
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index f424752..e3c633b 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2296,7 +2296,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> struct zonelist *zonelist, enum zone_type high_zoneidx,
> nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> int classzone_idx, int migratetype, enum migrate_mode mode,
> - bool *contended_compaction, bool *deferred_compaction,
> + int *contended_compaction, bool *deferred_compaction,
> unsigned long *did_some_progress)
> {
> struct zone *last_compact_zone = NULL;
> @@ -2547,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> unsigned long did_some_progress;
> enum migrate_mode migration_mode = MIGRATE_ASYNC;
> bool deferred_compaction = false;
> - bool contended_compaction = false;
> + int contended_compaction = COMPACT_CONTENDED_NONE;
>
> /*
> * In the slowpath, we sanity check order to avoid ever trying to
> @@ -2660,15 +2660,36 @@ rebalance:
> if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
> migration_mode = MIGRATE_SYNC_LIGHT;
>
> - /*
> - * If compaction is deferred for high-order allocations, it is because
> - * sync compaction recently failed. In this is the case and the caller
> - * requested a movable allocation that does not heavily disrupt the
> - * system then fail the allocation instead of entering direct reclaim.
> - */
> - if ((deferred_compaction || contended_compaction) &&
> - (gfp_mask & __GFP_NO_KSWAPD))
> - goto nopage;
Hmm, this check will have unfortunately changed in the latest mmotm due to
mm-thp-restructure-thp-avoidance-of-light-synchronous-migration.patch.
> + /* Checks for THP-specific high-order allocations */
> + if (gfp_mask & __GFP_NO_KSWAPD) {
> + /*
> + * If compaction is deferred for high-order allocations, it is
> + * because sync compaction recently failed. If this is the case
> + * and the caller requested a THP allocation, we do not want
> + * to heavily disrupt the system, so we fail the allocation
> + * instead of entering direct reclaim.
> + */
> + if (deferred_compaction)
> + goto nopage;
> +
> + /*
> + * In all zones where compaction was attempted (and not
> + * deferred or skipped), lock contention has been detected.
> + * For THP allocation we do not want to disrupt the others
> + * so we fallback to base pages instead.
> + */
> + if (contended_compaction == COMPACT_CONTENDED_LOCK)
> + goto nopage;
> +
> + /*
> + * If compaction was aborted due to need_resched(), we do not
> + * want to further increase allocation latency, unless it is
> + * khugepaged trying to collapse.
> + */
> + if (contended_compaction == COMPACT_CONTENDED_SCHED
> + && !(current->flags & PF_KTHREAD))
> + goto nopage;
> + }
>
> /* Try direct reclaim and then allocating */
> page = __alloc_pages_direct_reclaim(gfp_mask, order,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/