[PATCH] workqueue: update numa affinity when node hotplug

From: Gu Zheng
Date: Fri Feb 27 2015 - 05:22:28 EST


Yasuaki Ishimatsu found that the cpu<->node relationship is established
when a node goes online/offline. The workqueue code, however, uses
information that was established at boot time, and that information may
be changed by node hotplugging.

Once pool->node points to a stale node, the following allocation failure
happens:
==
SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
cache: kmalloc-192, object size: 192, buffer size: 192, default order: 1, min order: 0
node 0: slabs: 6172, objs: 259224, free: 245741
node 1: slabs: 3261, objs: 136962, free: 127656
==

This patch uses the present cpumask directly rather than the
self-maintained wq_numa_possible_cpumask, so that the workqueue code is a
simple consumer of NUMA information. It also updates the node affinity of
the per-cpu workqueue pools when a NUMA node changes, via a notifier
callback that is registered for the node on/down events raised in
try_online_node() and try_offline_node().

The per-node pools of unbound workqueues are already updated by the
existing code: wq_update_unbound_numa() runs at CPU_DOWN_PREPARE of the
last cpu.

Reported-by: Yasuaki Ishimatsu <isimatu.yasuaki@xxxxxxxxxxxxxx>
Signed-off-by: Gu Zheng <guz.fnst@xxxxxxxxxxxxxx>
---
include/linux/memory_hotplug.h | 8 +++
kernel/workqueue.c | 93 +++++++++++++++++++++++++--------------
mm/memory_hotplug.c | 44 +++++++++++++++++++
3 files changed, 111 insertions(+), 34 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8f1a419..07bb94e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -98,6 +98,14 @@ extern void __online_page_free(struct page *page);

extern int try_online_node(int nid);

+/* event type of node on/down */
+#define NODE_ON 0x00001
+#define NODE_DOWN 0x00002
+
+extern int __ref register_node_notifier(struct notifier_block *nb);
+extern int __ref unregister_node_notifier(struct notifier_block *nb);
+extern void get_present_cpumask_of_node(cpumask_var_t cpumask, int node);
+
#ifdef CONFIG_MEMORY_HOTREMOVE
extern bool is_pageblock_removable_nolock(struct page *page);
extern int arch_remove_memory(u64 start, u64 size);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beeeac9..60c5d29 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,9 +265,6 @@ struct workqueue_struct {

static struct kmem_cache *pwq_cache;

-static cpumask_var_t *wq_numa_possible_cpumask;
- /* possible CPUs of each node */
-
static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

@@ -3493,10 +3490,18 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
pool->attrs->no_numa = false;

/* if cpumask is contained inside a NUMA node, we belong to that node */
- if (wq_numa_enabled) {
+ if (wq_numa_enabled && !cpumask_empty(pool->attrs->cpumask)) {
for_each_node(node) {
- if (cpumask_subset(pool->attrs->cpumask,
- wq_numa_possible_cpumask[node])) {
+ int cpu;
+ bool is_sub_set = true;
+
+ for_each_cpu(cpu, pool->attrs->cpumask)
+ if (cpu_to_node(cpu) != node) {
+ is_sub_set = false;
+ break;
+ }
+
+ if (is_sub_set) {
pool->node = node;
break;
}
@@ -3717,8 +3722,9 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
if (cpumask_empty(cpumask))
goto use_dfl;

- /* yeap, return possible CPUs in @node that @attrs wants */
- cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+ /* yeap, return present CPUs in @node that @attrs wants */
+ get_present_cpumask_of_node(cpumask, node);
+ cpumask_and(cpumask, attrs->cpumask, cpumask);
return !cpumask_equal(cpumask, attrs->cpumask);

use_dfl:
@@ -4564,6 +4570,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
+ int node = cpu_to_node(cpu);
struct worker_pool *pool;
struct workqueue_struct *wq;
int pi;
@@ -4571,6 +4578,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
for_each_cpu_worker_pool(pool, cpu) {
+ pool->node = node;
if (pool->nr_workers)
continue;
if (!create_worker(pool))
@@ -4787,11 +4795,50 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */

-static void __init wq_numa_init(void)
+static int wq_numa_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
{
- cpumask_var_t *tbl;
- int node, cpu;
+ int node = (unsigned long)arg;
+ struct worker_pool *pool;
+ int pi;
+ int cpu;

+ switch (action) {
+ case NODE_DOWN:
+ mutex_lock(&wq_pool_mutex);
+ for_each_pool(pool, pi) {
+ if (pool->node == node) {
+ pool->node = NUMA_NO_NODE;
+ if (pool->cpu < 0)
+ hash_del(&pool->hash_node);
+ }
+ }
+ mutex_unlock(&wq_pool_mutex);
+ break;
+ case NODE_ON:
+ mutex_lock(&wq_pool_mutex);
+ for_each_present_cpu(cpu) {
+ if (node != cpu_to_node(cpu))
+ continue;
+ for_each_cpu_worker_pool(pool, cpu)
+ pool->node = node;
+ }
+ mutex_unlock(&wq_pool_mutex);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static struct notifier_block wq_numa_nb = {
+ .notifier_call = wq_numa_callback,
+ .priority = 0
+};
+
+static void __init wq_numa_init(void)
+{
if (num_possible_nodes() <= 1)
return;

@@ -4803,29 +4850,7 @@ static void __init wq_numa_init(void)
wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
BUG_ON(!wq_update_unbound_numa_attrs_buf);

- /*
- * We want masks of possible CPUs of each node which isn't readily
- * available. Build one from cpu_to_node() which should have been
- * fully initialized by now.
- */
- tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
- BUG_ON(!tbl);
-
- for_each_node(node)
- BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
- node_online(node) ? node : NUMA_NO_NODE));
-
- for_each_possible_cpu(cpu) {
- node = cpu_to_node(cpu);
- if (WARN_ON(node == NUMA_NO_NODE)) {
- pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
- /* happens iff arch is bonkers, let's just proceed */
- return;
- }
- cpumask_set_cpu(cpu, tbl[node]);
- }
-
- wq_numa_possible_cpumask = tbl;
+ register_node_notifier(&wq_numa_nb);
wq_numa_enabled = true;
}

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fab107..1778628 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1132,6 +1132,44 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
return;
}

+static RAW_NOTIFIER_HEAD(node_chain);
+
+int __ref register_node_notifier(struct notifier_block *nb)
+{
+ int ret;
+
+ ret = raw_notifier_chain_register(&node_chain, nb);
+ return ret;
+}
+EXPORT_SYMBOL(register_node_notifier);
+
+int __ref unregister_node_notifier(struct notifier_block *nb)
+{
+ int ret;
+
+ ret = raw_notifier_chain_unregister(&node_chain, nb);
+ return ret;
+}
+EXPORT_SYMBOL(unregister_node_notifier);
+
+static int call_node_notify(unsigned long val, void *v)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&node_chain, val, v, -1, NULL);
+
+ return notifier_to_errno(ret);
+}
+
+void get_present_cpumask_of_node(cpumask_var_t cpumask, int node)
+{
+ unsigned int cpu;
+
+ for_each_present_cpu(cpu)
+ if (node == cpu_to_node(cpu))
+ cpumask_set_cpu(cpu, cpumask);
+}
+EXPORT_SYMBOL(get_present_cpumask_of_node);

/**
* try_online_node - online a node if offlined
@@ -1157,6 +1195,9 @@ int try_online_node(int nid)
ret = register_one_node(nid);
BUG_ON(ret);

+ /* notify that the node is on */
+ call_node_notify(NODE_ON, (void *)(long)nid);
+
if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL, NULL);
@@ -1978,6 +2019,9 @@ void try_offline_node(int nid)
vfree(zone->wait_table);
}

+ /* notify that the node is down */
+ call_node_notify(NODE_DOWN, (void *)(long)nid);
+
/*
* Since there is no way to guarentee the address of pgdat/zone is not
* on stack of any kernel threads or used by other kernel objects
--
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/