Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

From: Li Zefan
Date: Thu Aug 14 2014 - 23:17:34 EST


On 2014/8/12 5:05, David Rientjes wrote:
> On Mon, 11 Aug 2014, Vladimir Davydov wrote:
>
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -1963,7 +1963,7 @@ zonelist_scan:
>>>
>>> /*
>>> * Scan zonelist, looking for a zone with enough free.
>>> - * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
>>> + * See __cpuset_node_allowed() comment in kernel/cpuset.c.
>>> */
>>> for_each_zone_zonelist_nodemask(zone, z, zonelist,
>>> high_zoneidx, nodemask) {
>>> @@ -1974,7 +1974,7 @@ zonelist_scan:
>>> continue;
>>> if (cpusets_enabled() &&
>>> (alloc_flags & ALLOC_CPUSET) &&
>>> - !cpuset_zone_allowed_softwall(zone, gfp_mask))
>>> + !cpuset_zone_allowed(zone, gfp_mask))
>>> continue;
>>
>> So, this is get_page_from_freelist(). It's called from
>> __alloc_pages_nodemask() with the ALLOC_CPUSET bit always set in
>> alloc_flags, and from __alloc_pages_slowpath() with ALLOC_CPUSET set
>> only for __GFP_WAIT allocations. That is, w/o your patch we try to
>> respect cpusets for all allocations, including atomic ones, and only
>> ignore cpusets when tight on memory (the freelist is empty) for
>> !__GFP_WAIT allocations, while with your patch we always ignore
>> cpusets for !__GFP_WAIT allocations. Not sure it really matters,
>> though, because usually one uses cpuset.mems in conjunction with
>> cpuset.cpus, and then it won't make any difference. It also doesn't
>> conflict with any cpuset documentation.
>>
>
> Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this.

I'm not quite sure. That code was already there before I got involved in cpuset.

> The only thing that we get by falling back to the page allocator
> slowpath is that kswapd gets woken up before the allocation is
> attempted without ALLOC_CPUSET. It seems pointless to wake up kswapd
> when the allocation can succeed on any node. Even with the patch, if
> the allocation fails because all nodes are below their min watermark,
> then we still fall back to the slowpath and wake up kswapd, but there's
> not much else we can do because it's !__GFP_WAIT.

That said, I tend to agree with you. But if we want to do this, we should
split this change from the cleanup.
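
For reference, the slowpath behaviour in question comes from
gfp_to_alloc_flags(). Roughly, as a simplified sketch (not the exact
mm code):

static int gfp_to_alloc_flags_sketch(gfp_t gfp_mask)
{
        int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

        /*
         * Atomic allocations can't wait for reclaim, so the slowpath
         * drops ALLOC_CPUSET and lets them fall back to any node.
         */
        if (!(gfp_mask & __GFP_WAIT))
                alloc_flags &= ~ALLOC_CPUSET;

        return alloc_flags;
}

That is, the fastpath always passes ALLOC_CPUSET, while the slowpath
keeps it only for __GFP_WAIT allocations, as Vladimir described above.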

Regarding the cleanup, I found there used to be a single
cpuset_node_allowed(), and your cleanup is exactly a revert of that
ancient commit:

commit 02a0e53d8227aff5e62e0433f82c12c1c2805fd6
Author: Paul Jackson <pj@xxxxxxx>
Date: Wed Dec 13 00:34:25 2006 -0800

[PATCH] cpuset: rework cpuset_zone_allowed api

It seems the major intention was to avoid accidental sleep-in-atomic
bugs, because callback_mutex might be held.
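
For context, here is roughly what the softwall variant ended up
looking like (abridged; the TIF_MEMDIE and PF_EXITING early exits
are omitted):

int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
        struct cpuset *cs;
        int allowed;

        if (in_interrupt())
                return 1;
        if (node_isset(node, current->mems_allowed))
                return 1;
        if (gfp_mask & __GFP_HARDWALL)  /* hardwall request: stop here */
                return 0;

        /* Not hardwall and node outside mems_allowed: scan up cpusets */
        mutex_lock(&callback_mutex);    /* may sleep, hence the split API */
        rcu_read_lock();
        cs = nearest_hardwall_ancestor(task_cs(current));
        allowed = node_isset(node, cs->mems_allowed);
        rcu_read_unlock();
        mutex_unlock(&callback_mutex);

        return allowed;
}

The hardwall variant just tests current->mems_allowed and never takes
the mutex, so it is safe in atomic context.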

I don't see any reason callback_mutex can't be a spinlock. I thought
about this when Gu Zheng fixed the bug where callback_mutex was nested
inside rcu_read_lock().
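
To illustrate the problem (not the actual call site): sleeping is
forbidden inside an RCU read-side critical section, and mutex_lock()
may sleep, while a spinlock never does:

        unsigned long flags;

        rcu_read_lock();
        mutex_lock(&callback_mutex);    /* BUG: may sleep under rcu_read_lock() */
        mutex_unlock(&callback_mutex);
        rcu_read_unlock();

        rcu_read_lock();
        spin_lock_irqsave(&callback_lock, flags);       /* OK: never sleeps */
        spin_unlock_irqrestore(&callback_lock, flags);
        rcu_read_unlock();

With a spinlock the nesting order no longer matters for correctness.
Hence the conversion below: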

--
kernel/cpuset.c | 81 ++++++++++++++++++++++++++++++++++-----------------------
1 file changed, 49 insertions(+), 32 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index baa155c..9d9e239 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
*/

static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);

/*
* CPU / memory hotplug is handled asynchronously.
@@ -848,6 +848,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
*/
static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
+ unsigned long flags;
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
@@ -875,9 +876,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
continue;
rcu_read_unlock();

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
cpumask_copy(cp->effective_cpus, new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);

WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -910,6 +911,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
+ unsigned long flags;
int retval;

/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -942,9 +944,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);

/* use trialcs->cpus_allowed as a temp variable */
update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1105,6 +1107,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
+ unsigned long flags;
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;

@@ -1132,8 +1135,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
rcu_read_unlock();

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
cp->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);

WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1162,6 +1166,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
+ unsigned long flags;
int retval;

/*
@@ -1201,9 +1206,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
cs->mems_allowed = trialcs->mems_allowed;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);

/* use trialcs->mems_allowed as a temp variable */
update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1264,6 +1269,7 @@ static void update_tasks_flags(struct cpuset *cs)
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on)
{
+ unsigned long flags;
struct cpuset *trialcs;
int balance_flag_changed;
int spread_flag_changed;
@@ -1288,9 +1294,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
cs->flags = trialcs->flags;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);

if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
rebuild_sched_domains_locked();
@@ -1690,11 +1696,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
ssize_t count;
char *buf, *s;
int ret = 0;
+ unsigned long flags;

count = seq_get_buf(sf, &buf);
s = buf;

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);

switch (type) {
case FILE_CPULIST:
@@ -1721,7 +1728,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
seq_commit(sf, -1);
}
out_unlock:
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return ret;
}

@@ -1924,6 +1931,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
struct cpuset *parent = parent_cs(cs);
struct cpuset *tmp_cs;
struct cgroup_subsys_state *pos_css;
+ unsigned long flags;

if (!parent)
return 0;
@@ -1938,12 +1946,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)

cpuset_inc();

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
if (cgroup_on_dfl(cs->css.cgroup)) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);

if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -1970,10 +1978,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
}
rcu_read_unlock();

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
out_unlock:
mutex_unlock(&cpuset_mutex);
return 0;
@@ -2011,8 +2019,10 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)

static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
+ unsigned long flags;
+
mutex_lock(&cpuset_mutex);
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);

if (cgroup_on_dfl(root_css->cgroup)) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2023,7 +2033,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
top_cpuset.mems_allowed = top_cpuset.effective_mems;
}

- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
mutex_unlock(&cpuset_mutex);
}

@@ -2107,13 +2117,14 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
bool cpus_updated, bool mems_updated)
{
bool is_empty;
+ unsigned long flags;

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
cpumask_copy(cs->cpus_allowed, new_cpus);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->mems_allowed = *new_mems;
cs->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);

/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2145,15 +2156,17 @@ hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
+ unsigned long flags;
+
if (cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);

if (cpus_updated)
update_tasks_cpumask(cs);
@@ -2227,6 +2240,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
+ unsigned long flags;

mutex_lock(&cpuset_mutex);

@@ -2239,21 +2253,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)

/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
/* we don't mess with cpumasks of tasks in top_cpuset */
}

/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
if (!on_dfl)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
update_tasks_nodemask(&top_cpuset);
}

@@ -2346,11 +2360,13 @@ void __init cpuset_init_smp(void)

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- mutex_lock(&callback_mutex);
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
}

void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2396,12 +2412,13 @@ void cpuset_init_current_mems_allowed(void)
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
nodemask_t mask;
+ unsigned long flags;

- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);

return mask;
}
@@ -2514,14 +2531,14 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
return 1;

/* Not hardwall and node outside mems_allowed: scan up cpusets */
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);

rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
rcu_read_unlock();

- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}

