Re: [PATCH V2 3/3] mm: Enable Buddy allocation isolation for CDM nodes

From: Anshuman Khandual
Date: Tue Feb 14 2017 - 23:29:15 EST


On 02/14/2017 03:44 PM, Anshuman Khandual wrote:
> On 02/14/2017 01:58 PM, Vlastimil Babka wrote:
>> On 02/10/2017 11:06 AM, Anshuman Khandual wrote:
>>> This implements allocation isolation for CDM nodes in buddy allocator by
>>> discarding CDM memory zones all the time except in the cases where the gfp
>>> flag has got __GFP_THISNODE or the nodemask contains CDM nodes in cases
>>> where it is non NULL (explicit allocation request in the kernel or user
>>> process MPOL_BIND policy based requests).
>>>
>>> Signed-off-by: Anshuman Khandual <khandual@xxxxxxxxxxxxxxxxxx>
>>> ---
>>> mm/page_alloc.c | 16 ++++++++++++++++
>>> 1 file changed, 16 insertions(+)
>>>
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index 84d61bb..392c24a 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -64,6 +64,7 @@
>>> #include <linux/page_owner.h>
>>> #include <linux/kthread.h>
>>> #include <linux/memcontrol.h>
>>> +#include <linux/node.h>
>>>
>>> #include <asm/sections.h>
>>> #include <asm/tlbflush.h>
>>> @@ -2908,6 +2909,21 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>>> struct page *page;
>>> unsigned long mark;
>>>
>>> + /*
>>> + * CDM nodes get skipped if the requested gfp flag
>>> + * does not have __GFP_THISNODE set or the nodemask
>>> + * does not have any CDM nodes in case the nodemask
>>> + * is non NULL (explicit allocation requests from
>>> + * kernel or user process MPOL_BIND policy which has
>>> + * CDM nodes).
>>> + */
>>> + if (is_cdm_node(zone->zone_pgdat->node_id)) {
>>> + if (!(gfp_mask & __GFP_THISNODE)) {
>>> + if (!ac->nodemask)
>>> + continue;
>>> + }
>>> + }
>>
>> With the current cpuset implementation, this will have a subtle corner
>> case when allocating from a cpuset that allows the cdm node, and there
>> is no (task or vma) mempolicy applied for the allocation. In the fast
>> path (__alloc_pages_nodemask()) we'll set ac->nodemask to
>> current->mems_allowed, so your code will wrongly assume that this
>> ac->nodemask is a policy that allows the CDM node. Probably not what you
>> want?
>
> You are right, its a problem and not what we want. We can make the
> function get_page_from_freelist() take another parameter "orig_nodemask"
> which gets passed into __alloc_pages_nodemask() in the first place. So
> inside zonelist iterator we can compare orig_nodemask with current
> ac.nodemask to figure out if cpuset swapping of nodemask happened and
> skip CDM node if necessary. Thats a viable solution IMHO.

Hello Vlastimil,

As I mentioned before yesterday this solution works and tested to verify that
there is no allocation leak happening to CDM even after cpuset_enabled() is
turned ON after changing /sys/fs/cgroup/cpuset/ setup. Major part of the change
is just to add an additional parameter into the function get_page_from_freelist
and changing all it's call sites.

- Anshuman

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 392c24a..9f41e0f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2894,11 +2894,15 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
- const struct alloc_context *ac)
+ const struct alloc_context *ac, nodemask_t *orig_mask)
{
struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
+ bool cpuset_fallback;
+
+ if (ac->nodemask != orig_mask)
+ cpuset_fallback = true;

/*
* Scan zonelist, looking for a zone with enough free.
@@ -2919,6 +2923,9 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
*/
if (is_cdm_node(zone->zone_pgdat->node_id)) {
if (!(gfp_mask & __GFP_THISNODE)) {
+ if (cpuset_fallback)
+ continue;
+
if (!ac->nodemask)
continue;
}
@@ -3066,7 +3073,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)

static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
- const struct alloc_context *ac, unsigned long *did_some_progress)
+ const struct alloc_context *ac, unsigned long *did_some_progress, nodemask_t *orig_mask)
{
struct oom_control oc = {
.zonelist = ac->zonelist,
@@ -3095,7 +3102,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
* we're still under heavy pressure.
*/
page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac, orig_mask);
if (page)
goto out;

@@ -3131,14 +3138,14 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)

if (gfp_mask & __GFP_NOFAIL) {
page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
+ ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac, orig_mask);
/*
* fallback to ignore cpuset restriction if our nodes
* are depleted
*/
if (!page)
page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS, ac);
+ ALLOC_NO_WATERMARKS, ac, orig_mask);
}
}
out:
@@ -3157,7 +3164,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum compact_priority prio, enum compact_result *compact_result)
+ enum compact_priority prio, enum compact_result *compact_result, nodemask_t *orig_mask)
{
struct page *page;

@@ -3178,7 +3185,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
*/
count_vm_event(COMPACTSTALL);

- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);

if (page) {
struct zone *zone = page_zone(page);
@@ -3263,7 +3270,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum compact_priority prio, enum compact_result *compact_result)
+ enum compact_priority prio, enum compact_result *compact_result, nodemask_t *orig_mask)
{
*compact_result = COMPACT_SKIPPED;
return NULL;
@@ -3330,7 +3337,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- unsigned long *did_some_progress)
+ unsigned long *did_some_progress, nodemask_t *orig_mask)
{
struct page *page = NULL;
bool drained = false;
@@ -3340,7 +3347,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
return NULL;

retry:
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);

/*
* If an allocation failed after direct reclaim, it could be because
@@ -3533,7 +3540,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)

static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
- struct alloc_context *ac)
+ struct alloc_context *ac, nodemask_t *orig_mask)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
struct page *page = NULL;
@@ -3597,7 +3604,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
* The adjusted alloc_flags might result in immediate success, so try
* that first
*/
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
if (page)
goto got_pg;

@@ -3612,7 +3619,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
- &compact_result);
+ &compact_result, orig_mask);
if (page)
goto got_pg;

@@ -3661,7 +3668,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
}

/* Attempt with potentially adjusted zonelist and alloc_flags */
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
if (page)
goto got_pg;

@@ -3697,13 +3704,13 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)

/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
- &did_some_progress);
+ &did_some_progress, orig_mask);
if (page)
goto got_pg;

/* Try direct compaction and then allocating */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
- compact_priority, &compact_result);
+ compact_priority, &compact_result, orig_mask);
if (page)
goto got_pg;

@@ -3750,7 +3757,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
goto retry_cpuset;

/* Reclaim has failed us, start killing things */
- page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress, orig_mask);
if (page)
goto got_pg;

@@ -3842,7 +3849,7 @@ struct page *
}

/* First allocation attempt */
- page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
+ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac, nodemask);
if (likely(page))
goto out;

@@ -3861,7 +3868,7 @@ struct page *
if (unlikely(ac.nodemask != nodemask))
ac.nodemask = nodemask;

- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac, nodemask);

out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&