Re: [PATCH 3/8] workqueue: Implement BH workqueues to eventually replace tasklets

From: Lai Jiangshan
Date: Thu Feb 01 2024 - 06:02:57 EST


Hello, Tejun

On Tue, Jan 30, 2024 at 5:16 PM Tejun Heo <tj@xxxxxxxxxx> wrote:

> @@ -1184,6 +1211,14 @@ static bool kick_pool(struct worker_pool *pool)
> if (!need_more_worker(pool) || !worker)
> return false;
>
> + if (pool->flags & POOL_BH) {
> + if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
> + tasklet_hi_schedule(&worker->bh_tasklet);
> + else
> + tasklet_schedule(&worker->bh_tasklet);
> + return true;
> + }

I think it is more straightforward to call bh_worker_taskletfn[_hi]()
in tasklet_action() and tasklet_hi_action() rather than add a
worker->bh_tasklet.

raise_softirq_irqoff() can be used here (kick_pool()) instead.

As the changelog said, once the conversion is complete, tasklet can be
removed and BH workqueues can directly take over the tasklet softirqs,
in which case, then, bh_worker_taskletfn[_hi]() can directly take over
the tasklet_action() and tasklet_hi_action().


> +
> p = worker->task;
>
> #ifdef CONFIG_SMP
> @@ -1663,8 +1698,16 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
> lockdep_assert_held(&pool->lock);
>
> if (!nna) {
> - /* per-cpu workqueue, pwq->nr_active is sufficient */
> - obtained = pwq->nr_active < READ_ONCE(wq->max_active);
> + /*
> + * BH workqueues always share a single execution context per CPU
> + * and don't impose any max_active limit, so tryinc always
> + * succeeds. For a per-cpu workqueue, checking pwq->nr_active is
> + * sufficient.
> + */
> + if (wq->flags & WQ_BH)
> + obtained = true;
> + else
> + obtained = pwq->nr_active < READ_ONCE(wq->max_active);

I think wq->max_active can be forced to be UINT_MAX or ULONG_MAX
in the max_active management code to avoid a branch here.

> goto out;
> }
>
> @@ -2599,27 +2642,31 @@ static struct worker *create_worker(struct worker_pool *pool)
>
> worker->id = id;
>
> - if (pool->cpu >= 0)
> - snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
> - pool->attrs->nice < 0 ? "H" : "");
> - else
> - snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
> -
> - worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
> - "kworker/%s", id_buf);
> - if (IS_ERR(worker->task)) {
> - if (PTR_ERR(worker->task) == -EINTR) {
> - pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
> - id_buf);
> - } else {
> - pr_err_once("workqueue: Failed to create a worker thread: %pe",
> - worker->task);
> + if (pool->flags & POOL_BH) {
> + tasklet_setup(&worker->bh_tasklet, bh_worker_taskletfn);
> + } else {
> + if (pool->cpu >= 0)
> + snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
> + pool->attrs->nice < 0 ? "H" : "");
> + else
> + snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
> +
> + worker->task = kthread_create_on_node(worker_thread, worker,
> + pool->node, "kworker/%s", id_buf);
> + if (IS_ERR(worker->task)) {
> + if (PTR_ERR(worker->task) == -EINTR) {
> + pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
> + id_buf);
> + } else {
> + pr_err_once("workqueue: Failed to create a worker thread: %pe",
> + worker->task);
> + }
> + goto fail;
> }
> - goto fail;
> - }
>
> - set_user_nice(worker->task, pool->attrs->nice);
> - kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
> + set_user_nice(worker->task, pool->attrs->nice);
> + kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
> + }
>
> /* successful, attach the worker to the pool */
> worker_attach_to_pool(worker, pool);

worker_attach_to_pool() and worker_detach_from_pool also access to
worker->task with kthread_set_per_cpu() and luckily to_kthread()
checks the NULL pointer for it.

IMO, it is better to avoid calling kthread_set_per_cpu() for bh workers.




> @@ -5605,7 +5731,12 @@ static void pr_cont_pool_info(struct worker_pool *pool)
> pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
> if (pool->node != NUMA_NO_NODE)
> pr_cont(" node=%d", pool->node);
> - pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
> + pr_cont(" flags=0x%x", pool->flags);
> + if (pool->flags & POOL_BH)
> + pr_cont(" bh%s",
> + pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
> + else
> + pr_cont(" nice=%d", pool->attrs->nice);

There are also some "worker->task" in show_pwq(), show_one_worker_pool(),
and show_cpu_pool_hog() needing taking care of.

Thanks
Lai