[PATCH 2/6] workqueue: generic framework to manage normal&rescuer workers' cpumask

From: Lai Jiangshan
Date: Sat Apr 12 2014 - 06:43:18 EST

Next message: Lai Jiangshan: "[PATCH 6/6] workqueue: destroy worker directly in idle timeout handler"
Previous message: Lai Jiangshan: "[PATCH 1/6] workqueue: generic routine to restore percpu/unbound pools' workers' cpumask"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

If a worker's cpumask needs to be kept coordinated with the pool
during cpu-hotplug, we add this worker to a special set which
is used to manage the workers' cpumasks.

This special set is currently worker_idr, and it serves normal workers only.
We can't add rescuers to this set because we can't allocate IDs from
worker_idr for rescuers. So we need to introduce a new set (bind_list).

When the rescuer adds itself to bind_list, it needs some synchronization,
but we can't re-use manager_mutex because manager_mutex has a dependency
on memory allocation. So we introduce a new mutex - bind_mutex.

Now restore_workers_cpumask() also restores rescuers' cpumasks via bind_list.

Signed-off-by: Lai Jiangshan <laijs@xxxxxxxxxxxxxx>
---
kernel/workqueue.c | 129 +++++++++++++++++++------------------------
kernel/workqueue_internal.h | 1 +
2 files changed, 57 insertions(+), 73 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0b56730..743917d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -135,6 +135,8 @@ enum {
* WR: wq->mutex protected for writes. Sched-RCU protected for reads.
*
* MD: wq_mayday_lock protected.
+ *
+ * B: pool->bind_mutex protected.
*/

/* struct worker is defined in workqueue_internal.h */
@@ -165,6 +167,17 @@ struct worker_pool {
struct mutex manager_mutex; /* manager exclusion */
struct idr worker_idr; /* MG: worker IDs and iteration */

+ /*
+ * A worker is bound to the pool, it means:
+ * 1) the worker's cpumask is bound to the pool.
+ *
+ * bind_mutex is held by the rescuer before processing works,
+ * so bind_mutex must not have any direct or indirect dependency
+ * on sleepable memory allocation.
+ */
+ struct mutex bind_mutex; /* workers binding */
+ struct list_head bind_list; /* B: bound workers*/
+
struct workqueue_attrs *attrs; /* I: worker attributes */
struct hlist_node hash_node; /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */
@@ -1625,70 +1638,6 @@ static void worker_leave_idle(struct worker *worker)
list_del_init(&worker->entry);
}

-/**
- * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
- * @pool: target worker_pool
- *
- * Bind %current to the cpu of @pool if it is associated and lock @pool.
- *
- * Works which are scheduled while the cpu is online must at least be
- * scheduled to a worker which is bound to the cpu so that if they are
- * flushed from cpu callbacks while cpu is going down, they are
- * guaranteed to execute on the cpu.
- *
- * This function is to be used by unbound workers and rescuers to bind
- * themselves to the target cpu and may race with cpu going down or
- * coming online. kthread_bind() can't be used because it may put the
- * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
- * verbatim as it's best effort and blocking and pool may be
- * [dis]associated in the meantime.
- *
- * This function tries set_cpus_allowed() and locks pool and verifies the
- * binding against %POOL_DISASSOCIATED which is set during
- * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
- * enters idle state or fetches works without dropping lock, it can
- * guarantee the scheduling requirement described in the first paragraph.
- *
- * CONTEXT:
- * Might sleep. Called without any lock but returns with pool->lock
- * held.
- *
- * Return:
- * %true if the associated pool is online (@worker is successfully
- * bound), %false if offline.
- */
-static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
-__acquires(&pool->lock)
-{
- while (true) {
- /*
- * The following call may fail, succeed or succeed
- * without actually migrating the task to the cpu if
- * it races with cpu hotunplug operation. Verify
- * against POOL_DISASSOCIATED.
- */
- if (!(pool->flags & POOL_DISASSOCIATED))
- set_cpus_allowed_ptr(current, pool->attrs->cpumask);
-
- spin_lock_irq(&pool->lock);
- if (pool->flags & POOL_DISASSOCIATED)
- return false;
- if (task_cpu(current) == pool->cpu &&
- cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
- return true;
- spin_unlock_irq(&pool->lock);
-
- /*
- * We've raced with CPU hot[un]plug. Give it a breather
- * and retry migration. cond_resched() is required here;
- * otherwise, we might deadlock against cpu_stop trying to
- * bring down the CPU on non-preemptive kernel.
- */
- cpu_relax();
- cond_resched();
- }
-}
-
static struct worker *alloc_worker(void)
{
struct worker *worker;
@@ -1697,12 +1646,33 @@ static struct worker *alloc_worker(void)
if (worker) {
INIT_LIST_HEAD(&worker->entry);
INIT_LIST_HEAD(&worker->scheduled);
+ INIT_LIST_HEAD(&worker->bind_entry);
/* on creation a worker is in !idle && prep state */
worker->flags = WORKER_PREP;
}
return worker;
}

+static void bind_worker(struct worker *worker, struct worker_pool *pool)
+{
+ mutex_lock(&pool->bind_mutex);
+ list_add(&worker->bind_entry, &pool->bind_list);
+ /*
+ * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
+ * online CPUs. It'll be re-applied when any of the CPUs come up.
+ */
+ set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+ mutex_unlock(&pool->bind_mutex);
+}
+
+
+static void unbind_worker(struct worker *worker, struct worker_pool *pool)
+{
+ mutex_lock(&pool->bind_mutex);
+ list_del(&worker->bind_entry);
+ mutex_unlock(&pool->bind_mutex);
+}
+
/**
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
@@ -1762,11 +1732,7 @@ static struct worker *create_worker(struct worker_pool *pool)
/* prevent userland from meddling with cpumask of workqueue workers */
worker->task->flags |= PF_NO_SETAFFINITY;

- /*
- * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
- * online CPUs. It'll be re-applied when any of the CPUs come up.
- */
- set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+ bind_worker(worker, pool);

/*
* The caller is responsible for ensuring %POOL_DISASSOCIATED
@@ -2325,6 +2291,8 @@ woke_up:
spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
worker->task->flags &= ~PF_WQ_WORKER;
+
+ unbind_worker(worker, pool);
return 0;
}

@@ -2447,9 +2415,9 @@ repeat:
spin_unlock_irq(&wq_mayday_lock);

/* migrate to the target cpu if possible */
- worker_maybe_bind_and_lock(pool);
+ bind_worker(rescuer, pool);
+ spin_lock_irq(&pool->lock);
rescuer->pool = pool;
- put_unbound_pwq(pwq);

/*
* Slurp in all works issued via this workqueue and
@@ -2470,6 +2438,17 @@ repeat:
if (keep_working(pool))
wake_up_worker(pool);

+ spin_unlock_irq(&pool->lock);
+
+ /*
+ * We still hold an indirect reference (via pwq) to the pool,
+ * so the pool can't be destroyed even if all works have been
+ * processed.
+ */
+ unbind_worker(rescuer, pool);
+
+ spin_lock_irq(&pool->lock);
+ put_unbound_pwq(pwq);
rescuer->pool = NULL;
spin_unlock(&pool->lock);
spin_lock(&wq_mayday_lock);
@@ -3548,6 +3527,9 @@ static int init_worker_pool(struct worker_pool *pool)
mutex_init(&pool->manager_mutex);
idr_init(&pool->worker_idr);

+ mutex_init(&pool->bind_mutex);
+ INIT_LIST_HEAD(&pool->bind_list);
+
INIT_HLIST_NODE(&pool->hash_node);
pool->refcnt = 1;

@@ -4711,7 +4693,6 @@ static void restore_workers_cpumask(struct worker_pool *pool, int cpu)
{
static cpumask_t cpumask;
struct worker *worker;
- int wi;

lockdep_assert_held(&pool->manager_mutex);

@@ -4724,10 +4705,12 @@ static void restore_workers_cpumask(struct worker_pool *pool, int cpu)
if (cpumask_weight(&cpumask) != 1)
return;

+ mutex_lock(&pool->bind_mutex);
/* as we're called from CPU_ONLINE, the following shouldn't fail */
- for_each_pool_worker(worker, wi, pool)
+ list_for_each_entry(worker, &pool->bind_list, bind_entry)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
pool->attrs->cpumask) < 0);
+ mutex_unlock(&pool->bind_mutex);
}

/*
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 7e2204d..50f2a3a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -37,6 +37,7 @@ struct worker {
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* I: the associated pool */
/* L: for rescuers */
+ struct list_head bind_entry; /* B: bound with the pool */

unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
--
1.7.4.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Lai Jiangshan: "[PATCH 6/6] workqueue: destroy worker directly in idle timeout handler"
Previous message: Lai Jiangshan: "[PATCH 1/6] workqueue: generic routine to restore percpu/unbound pools' workers' cpumask"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]