[GIT PULL] workqueue fixes for v4.13-rc3

From: Tejun Heo
Date: Mon Jul 31 2017 - 11:38:21 EST


Hello, Linus.

Two notable fixes.

* While adding NUMA affinity support to unbound workqueues, the
assumption that an unbound workqueue with max_active == 1 is ordered
was broken. The plan was to use explicit alloc_ordered_workqueue()
for those cases. Unfortunately, I forgot to update the
documentation properly and we grew a handful of use cases which
depend on that assumption.

While we want to convert them to alloc_ordered_workqueue(), we don't
really lose anything by enforcing ordered execution on unbound
max_active == 1 workqueues and it doesn't make sense to risk subtle
bugs. Restore the assumption.

* Workqueue assumes that CPU <-> NUMA node mapping remains static.
This is a general assumption - we don't have any synchronization
mechanism around CPU <-> node mapping. Unfortunately, powerpc may
change the mapping dynamically leading to crashes. Michael added a
workaround so that we at least don't crash while powerpc hotplug
code gets updated.

Thanks.

The following changes since commit 74cbd96bc2e00f5daa805e2ebf49e998f7045062:

Merge tag 'md/4.13-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md (2017-07-18 11:51:08 -0700)

are available in the git repository at:

git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git for-4.13-fixes

for you to fetch changes up to 1ad0f0a7aa1bf3bd42dcd108a96713d255eacd9f:

workqueue: Work around edge cases for calc of pool's cpumask (2017-07-28 11:05:52 -0400)

----------------------------------------------------------------
Michael Bringmann (1):
workqueue: Work around edge cases for calc of pool's cpumask

Tejun Heo (2):
workqueue: restore WQ_UNBOUND/max_active==1 to be ordered
workqueue: implicit ordered attribute should be overridable

include/linux/workqueue.h | 4 +++-
kernel/workqueue.c | 30 ++++++++++++++++++++++++++----
2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index c102ef6..db6dc9d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -323,6 +323,7 @@ enum {

__WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */
__WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */
+ __WQ_ORDERED_EXPLICIT = 1 << 18, /* internal: alloc_ordered_workqueue() */
__WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */

WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */
@@ -422,7 +423,8 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
* Pointer to the allocated workqueue on success, %NULL on failure.
*/
#define alloc_ordered_workqueue(fmt, flags, args...) \
- alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)
+ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \
+ __WQ_ORDERED_EXPLICIT | (flags), 1, ##args)

#define create_workqueue(name) \
alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a86688f..ca937b0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,

/* yeap, return possible CPUs in @node that @attrs wants */
cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+
+ if (cpumask_empty(cpumask)) {
+ pr_warn_once("WARNING: workqueue cpumask: online intersect > "
+ "possible intersect\n");
+ return false;
+ }
+
return !cpumask_equal(cpumask, attrs->cpumask);

use_dfl:
@@ -3744,8 +3751,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;

/* creating multiple pwqs breaks ordering guarantee */
- if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
- return -EINVAL;
+ if (!list_empty(&wq->pwqs)) {
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
+ return -EINVAL;
+
+ wq->flags &= ~__WQ_ORDERED;
+ }

ctx = apply_wqattrs_prepare(wq, attrs);
if (!ctx)
@@ -3929,6 +3940,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
struct pool_workqueue *pwq;

+ /*
+ * Unbound && max_active == 1 used to imply ordered, which is no
+ * longer the case on NUMA machines due to per-node pools. While
+ * alloc_ordered_workqueue() is the right way to create an ordered
+ * workqueue, keep the previous behavior to avoid subtle breakages
+ * on NUMA.
+ */
+ if ((flags & WQ_UNBOUND) && max_active == 1)
+ flags |= __WQ_ORDERED;
+
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
flags |= WQ_UNBOUND;
@@ -4119,13 +4140,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
struct pool_workqueue *pwq;

/* disallow meddling with max_active for ordered workqueues */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;

max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);

mutex_lock(&wq->mutex);

+ wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;

for_each_pwq(pwq, wq)
@@ -5253,7 +5275,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
* attributes breaks ordering guarantee. Disallow exposing ordered
* workqueues.
*/
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return -EINVAL;

wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);