Re: [patch for-3.2-rc3] cpusets: stall when updating mems_allowedfor mempolicy or disjoint nodemask
From: Miao Xie
Date: Tue Nov 22 2011 - 23:49:42 EST
On Tue, 22 Nov 2011 19:32:40 -0800 (pst), David Rientjes wrote:
[SNIP]
> One of the major reasons why changing cpuset.mems can take >30s is because
> of lengthy delays in the page allocator because it continuously loops
> while trying reclaim or killing a thread and trying to allocate a page.
>
> I think we should be able to optimize this by dropping it when it's not
> required and moving it to get_page_from_freelist() which is the only thing
> that cares about cpuset_zone_allowed_softwall().
>
> Something like this:
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1643,17 +1643,29 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
> static struct page *
> get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
> struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
> - struct zone *preferred_zone, int migratetype)
> + struct zone **preferred_zone, int migratetype)
> {
> struct zoneref *z;
> struct page *page = NULL;
> int classzone_idx;
> struct zone *zone;
> - nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
> + nodemask_t *allowednodes = NULL;
> int zlc_active = 0; /* set if using zonelist_cache */
> int did_zlc_setup = 0; /* just call zlc_setup() one time */
>
> - classzone_idx = zone_idx(preferred_zone);
> + get_mems_allowed();
> + /*
> + * preferred_zone must come from an allowed node if the allocation is
> + * constrained to either a mempolicy (nodemask != NULL) or otherwise
> + * limited by cpusets.
> + */
> + if (alloc_flags & ALLOC_CPUSET)
> + allowednodes = nodemask ? : &cpuset_current_mems_allowed;
> +
> + first_zones_zonelist(zonelist, high_zoneidx, allowednodes,
> + preferred_zone);
> + classzone_idx = zone_idx(*preferred_zone);
> + allowednodes = NULL;
> zonelist_scan:
> /*
> * Scan zonelist, looking for a zone with enough free.
> @@ -1717,7 +1729,7 @@ zonelist_scan:
> }
>
> try_this_zone:
> - page = buffered_rmqueue(preferred_zone, zone, order,
> + page = buffered_rmqueue(*preferred_zone, zone, order,
> gfp_mask, migratetype);
> if (page)
> break;
> @@ -1731,6 +1743,7 @@ this_zone_full:
> zlc_active = 0;
> goto zonelist_scan;
> }
> + put_mems_allowed();
> return page;
> }
>
> @@ -1832,7 +1845,7 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
> static inline struct page *
> __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
> struct zonelist *zonelist, enum zone_type high_zoneidx,
> - nodemask_t *nodemask, struct zone *preferred_zone,
> + nodemask_t *nodemask, struct zone **preferred_zone,
> int migratetype)
> {
> struct page *page;
> @@ -1885,13 +1898,13 @@ out:
> static struct page *
> __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> struct zonelist *zonelist, enum zone_type high_zoneidx,
> - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> + nodemask_t *nodemask, int alloc_flags, struct zone **preferred_zone,
> int migratetype, unsigned long *did_some_progress,
> bool sync_migration)
> {
> struct page *page;
>
> - if (!order || compaction_deferred(preferred_zone))
> + if (!order || compaction_deferred(*preferred_zone))
> return NULL;
>
> current->flags |= PF_MEMALLOC;
> @@ -1909,8 +1922,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> alloc_flags, preferred_zone,
> migratetype);
> if (page) {
> - preferred_zone->compact_considered = 0;
> - preferred_zone->compact_defer_shift = 0;
> + *preferred_zone->compact_considered = 0;
> + *preferred_zone->compact_defer_shift = 0;
> count_vm_event(COMPACTSUCCESS);
> return page;
> }
> @@ -1921,7 +1934,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> * but not enough to satisfy watermarks.
> */
> count_vm_event(COMPACTFAIL);
> - defer_compaction(preferred_zone);
> + defer_compaction(*preferred_zone);
>
> cond_resched();
> }
> @@ -1932,7 +1945,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> static inline struct page *
> __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> struct zonelist *zonelist, enum zone_type high_zoneidx,
> - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> + nodemask_t *nodemask, int alloc_flags, struct zone **preferred_zone,
> int migratetype, unsigned long *did_some_progress,
> bool sync_migration)
> {
> @@ -1944,7 +1957,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> static inline struct page *
> __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
> struct zonelist *zonelist, enum zone_type high_zoneidx,
> - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> + nodemask_t *nodemask, int alloc_flags, struct zone **preferred_zone,
> int migratetype, unsigned long *did_some_progress)
> {
> struct page *page = NULL;
> @@ -2001,7 +2014,7 @@ retry:
> static inline struct page *
> __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
> struct zonelist *zonelist, enum zone_type high_zoneidx,
> - nodemask_t *nodemask, struct zone *preferred_zone,
> + nodemask_t *nodemask, struct zone **preferred_zone,
> int migratetype)
> {
> struct page *page;
> @@ -2012,7 +2025,8 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
> preferred_zone, migratetype);
>
> if (!page && gfp_mask & __GFP_NOFAIL)
> - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
> + wait_iff_congested(*preferred_zone, BLK_RW_ASYNC,
> + HZ/50);
> } while (!page && (gfp_mask & __GFP_NOFAIL));
>
> return page;
> @@ -2075,7 +2089,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
> static inline struct page *
> __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> struct zonelist *zonelist, enum zone_type high_zoneidx,
> - nodemask_t *nodemask, struct zone *preferred_zone,
> + nodemask_t *nodemask, struct zone **preferred_zone,
> int migratetype)
> {
> const gfp_t wait = gfp_mask & __GFP_WAIT;
> @@ -2110,7 +2124,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> restart:
> if (!(gfp_mask & __GFP_NO_KSWAPD))
> wake_all_kswapd(order, zonelist, high_zoneidx,
> - zone_idx(preferred_zone));
> + zone_idx(*preferred_zone));
>
> /*
> * OK, we're below the kswapd watermark and have kicked background
> @@ -2119,14 +2133,6 @@ restart:
> */
> alloc_flags = gfp_to_alloc_flags(gfp_mask);
>
> - /*
> - * Find the true preferred zone if the allocation is unconstrained by
> - * cpusets.
> - */
> - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
> - first_zones_zonelist(zonelist, high_zoneidx, NULL,
> - &preferred_zone);
> -
> rebalance:
> /* This is the last chance, in general, before the goto nopage. */
> page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
> @@ -2220,7 +2226,7 @@ rebalance:
> pages_reclaimed += did_some_progress;
> if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
> /* Wait for some write requests to complete then retry */
> - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
> + wait_iff_congested(*preferred_zone, BLK_RW_ASYNC, HZ/50);
> goto rebalance;
> } else {
> /*
> @@ -2277,25 +2283,17 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
> if (unlikely(!zonelist->_zonerefs->zone))
> return NULL;
>
> - get_mems_allowed();
> - /* The preferred zone is used for statistics later */
> - first_zones_zonelist(zonelist, high_zoneidx,
> - nodemask ? : &cpuset_current_mems_allowed,
> - &preferred_zone);
> - if (!preferred_zone) {
> - put_mems_allowed();
> - return NULL;
> - }
> -
> /* First allocation attempt */
> page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
> zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
> - preferred_zone, migratetype);
> - if (unlikely(!page))
> + &preferred_zone, migratetype);
> + if (unlikely(!page)) {
> + if (!preferred_zone)
> + return NULL;
> page = __alloc_pages_slowpath(gfp_mask, order,
> zonelist, high_zoneidx, nodemask,
> - preferred_zone, migratetype);
> - put_mems_allowed();
> + &preferred_zone, migratetype);
> + }
>
> trace_mm_page_alloc(page, order, gfp_mask, migratetype);
> return page;
>
> This would significantly reduce the amount of time that it takes to write
> to cpuset.mems because we drop get_mems_allowed() between allocation
> attempts. We really, really want to do this anyway because it's possible
> that a cpuset is being expanded to a larger set of nodes and they are
> inaccessible to concurrent memory allocations because the page allocator
> is holding get_mems_allowed() while looping and trying to find more
> memory.
>
> Comments?
This is a good idea. But I worry that oom will happen easily, because we do
direct reclamation and compact by mems_allowed.
I think we can fix it by using a sequence to make the tasks know their
mems_allowed are changed, and the task will do direct reclamation and
compact again.
The other method is that making the tasks ignore its mems_allowed when
they do direct reclamation and compact at the second time.
By these two way, we can drop get_mems_allowed() as many as possible.
Thanks
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/