[RFC PATCH v2] workqueue: Introduce a way to set percpu worker_pool's scheduler

From: Wen Yang
Date: Tue Dec 26 2017 - 21:59:02 EST


From: Liu Wei <liu.wei8186@xxxxxxxxxx>

When pinning RT threads to specific cores using CPU affinity, the
kworkers on the same CPU would starve, which may lead to some kind
of priority inversion. In that case, the RT threads would also
suffer high performance impact.

The priority inversion looks like,
CPU 0: libvirtd acquired cgroup_mutex, and triggered
lru_add_drain_per_cpu, then waiting for all the kworkers to complete:
PID: 44145 TASK: ffff8807bec7b980 CPU: 0 COMMAND: "libvirtd"
#0 [ffff8807f2cbb9d0] __schedule at ffffffff816410ed
#1 [ffff8807f2cbba38] schedule at ffffffff81641789
#2 [ffff8807f2cbba48] schedule_timeout at ffffffff8163f479
#3 [ffff8807f2cbbaf8] wait_for_completion at ffffffff81641b56
#4 [ffff8807f2cbbb58] flush_work at ffffffff8109efdc
#5 [ffff8807f2cbbbd0] lru_add_drain_all at ffffffff81179002
#6 [ffff8807f2cbbc08] migrate_prep at ffffffff811c77be
#7 [ffff8807f2cbbc18] do_migrate_pages at ffffffff811b8010
#8 [ffff8807f2cbbcf8] cpuset_migrate_mm at ffffffff810fea6c
#9 [ffff8807f2cbbd10] cpuset_attach at ffffffff810ff91e
#10 [ffff8807f2cbbd50] cgroup_attach_task at ffffffff810f9972
#11 [ffff8807f2cbbe08] attach_task_by_pid at ffffffff810fa520
#12 [ffff8807f2cbbe58] cgroup_tasks_write at ffffffff810fa593
#13 [ffff8807f2cbbe68] cgroup_file_write at ffffffff810f8773
#14 [ffff8807f2cbbef8] vfs_write at ffffffff811dfdfd
#15 [ffff8807f2cbbf38] sys_write at ffffffff811e089f
#16 [ffff8807f2cbbf80] system_call_fastpath at ffffffff8164c809

CPU 43: kworker/43 starved because of the RT threads:
CURRENT: PID: 21294 TASK: ffff883fd2d45080 COMMAND: "lwip"
RT PRIO_ARRAY: ffff883fff3f4950
[ 79] PID: 21294 TASK: ffff883fd2d45080 COMMAND: "lwip"
[ 79] PID: 21295 TASK: ffff88276d481700 COMMAND: "ovdk-ovsvswitch"
[ 79] PID: 21351 TASK: ffff8807be822280 COMMAND: "dispatcher"
[ 79] PID: 21129 TASK: ffff8807bef0f300 COMMAND: "ovdk-ovsvswitch"
[ 79] PID: 21337 TASK: ffff88276d482e00 COMMAND: "handler_3"
[ 79] PID: 21352 TASK: ffff8807be824500 COMMAND: "flow_dumper"
[ 79] PID: 21336 TASK: ffff88276d480b80 COMMAND: "handler_2"
[ 79] PID: 21342 TASK: ffff88276d484500 COMMAND: "handler_8"
[ 79] PID: 21341 TASK: ffff88276d482280 COMMAND: "handler_7"
[ 79] PID: 21338 TASK: ffff88276d483980 COMMAND: "handler_4"
[ 79] PID: 21339 TASK: ffff88276d480000 COMMAND: "handler_5"
[ 79] PID: 21340 TASK: ffff88276d486780 COMMAND: "handler_6"
CFS RB_ROOT: ffff883fff3f4868
[120] PID: 37959 TASK: ffff88276e148000 COMMAND: "kworker/43:1"

CPU 28: Systemd(Victim) was blocked by cgroup_mutex:
PID: 1 TASK: ffff883fd2d40000 CPU: 28 COMMAND: "systemd"
#0 [ffff881fd317bd60] __schedule at ffffffff816410ed
#1 [ffff881fd317bdc8] schedule_preempt_disabled at ffffffff81642869
#2 [ffff881fd317bdd8] __mutex_lock_slowpath at ffffffff81640565
#3 [ffff881fd317be38] mutex_lock at ffffffff8163f9cf
#4 [ffff881fd317be50] proc_cgroup_show at ffffffff810fd256
#5 [ffff881fd317be98] seq_read at ffffffff81203cda
#6 [ffff881fd317bf08] vfs_read at ffffffff811dfc6c
#7 [ffff881fd317bf38] sys_read at ffffffff811e07bf
#8 [ffff881fd317bf80] system_call_fastpath at ffffffff8164c809

The simplest way to fix that is to set the scheduler of kworkers to
higher RT priority, just like,
chrt --fifo -p 61 <kworker_pid>
However, that cannot prevent other WORK_CPU_BOUND worker threads from
running and starving as well.

This patch introduces a way to set the scheduler(policy and priority)
of percpu worker_pool, in that way, user could set proper scheduler
policy and priority of the worker_pool as needed, which could apply
to all the WORK_CPU_BOUND workers on the same CPU. On the other hand,
we could use /sys/devices/virtual/workqueue/cpumask for
WORK_CPU_UNBOUND workers to prevent them from starving.

This patch implements the basic infrastructure and /sys interface,
such as:
# cat /sys/devices/virtual/workqueue/worker_pool@3/scheduler
SCHED_OTHER:120
# echo SCHED_FIFO:3 > /sys/devices/virtual/workqueue/worker_pool@3/scheduler
# cat /sys/devices/virtual/workqueue/worker_pool@3/scheduler
SCHED_FIFO:3
# cat /sys/devices/virtual/workqueue/worker_pool@3H/scheduler
SCHED_OTHER:100
# echo SCHED_FIFO:4 > /sys/devices/virtual/workqueue/worker_pool@3H/scheduler
# cat /sys/devices/virtual/workqueue/worker_pool@3H/scheduler
SCHED_FIFO:4
v2:
wrap dev, move it out of worker_pool, and allocate memory for the dev
wrapper dynamically, to fix the kernel size regression.

CC: Tejun Heo <tj@xxxxxxxxxx>
CC: Lai Jiangshan <jiangshanlai@xxxxxxxxx>
CC: kernel test robot <xiaolong.ye@xxxxxxxxx>
Signed-off-by: Wen Yang <wen.yang99@xxxxxxxxxx>
Signed-off-by: Jiang Biao <jiang.biao2@xxxxxxxxxx>
---
include/linux/workqueue.h | 6 ++
kernel/workqueue.c | 184 +++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 189 insertions(+), 1 deletion(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 4a54ef9..0963831 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -13,6 +13,7 @@
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>
+#include <uapi/linux/sched/types.h>

struct workqueue_struct;

@@ -144,6 +145,11 @@ struct workqueue_attrs {
* doesn't participate in pool hash calculations or equality comparisons.
*/
bool no_numa;
+
+ /**
+ * @sched_attr: scheduling policy and priority applied to the
+ * pool's kworkers
+ */
+ struct sched_attr sched_attr;
};

static inline struct delayed_work *to_delayed_work(struct work_struct *work)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8fdb710..5c5d8ee 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,6 +48,7 @@
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
+#include <linux/sched/prio.h>

#include "workqueue_internal.h"

@@ -106,6 +107,30 @@ enum {
WQ_NAME_LEN = 24,
};

+#ifdef CONFIG_SYSFS
+
+/* Longest policy-name token accepted by the per-pool "scheduler" store. */
+#define SCHED_POLICY_NAME_SIZE 16
+
+/*
+ * Policy names accepted and reported through sysfs, indexed by the
+ * SCHED_* policy value.  Only SCHED_OTHER and the two RT classes are
+ * supported here; any other policy value is rejected by the parser.
+ */
+static const char * const wq_sched_policys[] = {
+ [SCHED_NORMAL] = "SCHED_OTHER",
+ [SCHED_FIFO] = "SCHED_FIFO",
+ [SCHED_RR] = "SCHED_RR",
+};
+
+/*
+ * Sysfs wrapper tying a struct device to its percpu worker_pool.
+ * Allocated in wq_sysfs_init() and freed by the dev release callback,
+ * keeping struct worker_pool itself small (v2 size-regression fix).
+ */
+struct wq_pool_device {
+ struct worker_pool *pool;
+ struct device dev;
+};
+
+/* Map a sysfs device back to the worker_pool it was registered for. */
+static struct worker_pool *dev_to_worker_pool(struct device *dev)
+{
+ struct wq_pool_device *wq_pool_dev = container_of(dev,
+ struct wq_pool_device, dev);
+ return wq_pool_dev->pool;
+}
+
+#endif
+
/*
* Structure fields follow one of the following exclusion rules.
*
@@ -174,6 +199,11 @@ struct worker_pool {
struct ida worker_ida; /* worker IDs for task name */

struct workqueue_attrs *attrs; /* I: worker attributes */
+
+#ifdef CONFIG_SYSFS
+ struct wq_pool_device *dev;
+#endif
+
 	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
 	int			refcnt;		/* PL: refcnt for unbound pools */

@@ -1679,6 +1709,16 @@ static struct worker *alloc_worker(int node)
return worker;
}

+/*
+ * Apply @sched_attr (policy + priority) to @worker's task.
+ * Returns 0 or a negative errno from sched_setscheduler_nocheck().
+ *
+ * sched_setscheduler() demands sched_priority == 0 for SCHED_NORMAL and
+ * a 1..MAX_RT_PRIO-1 value only for the RT classes.  attrs->sched_attr
+ * stores NICE_TO_PRIO() for SCHED_NORMAL pools (for sysfs display), so
+ * it must not be passed through verbatim — doing so makes every
+ * worker_attach_to_pool() call fail with -EINVAL.
+ */
+static int wq_set_worker_scheduler(struct worker *worker,
+ const struct sched_attr *sched_attr)
+{
+ struct sched_param param;
+
+ if (sched_attr->sched_policy == SCHED_NORMAL)
+ param.sched_priority = 0;
+ else
+ param.sched_priority = sched_attr->sched_priority;
+ return sched_setscheduler_nocheck(worker->task,
+ sched_attr->sched_policy, &param);
+}
+
/**
* worker_attach_to_pool() - attach a worker to a pool
* @worker: worker to be attached
@@ -1698,6 +1738,7 @@ static void worker_attach_to_pool(struct worker *worker,
* online CPUs. It'll be re-applied when any of the CPUs come up.
*/
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+ wq_set_worker_scheduler(worker, &pool->attrs->sched_attr);

/*
* The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
@@ -5242,15 +5283,153 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev,
__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
wq_unbound_cpumask_store);

+/* Show the pool's scheduler as "POLICY_NAME:priority\n". */
+static ssize_t percpu_worker_pool_sched_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ unsigned int policy;
+ struct worker_pool *pool;
+
+ pool = dev_to_worker_pool(dev);
+ policy = pool->attrs->sched_attr.sched_policy;
+ /*
+ * Valid indexes are 0..ARRAY_SIZE()-1, so compare with >= rather
+ * than > — otherwise policy == ARRAY_SIZE() (e.g. SCHED_BATCH == 3)
+ * reads past the end of wq_sched_policys[].
+ */
+ if (WARN_ON_ONCE(policy >= ARRAY_SIZE(wq_sched_policys)))
+ policy = SCHED_NORMAL;
+ return scnprintf(buf, PAGE_SIZE, "%s:%u\n", wq_sched_policys[policy],
+ pool->attrs->sched_attr.sched_priority);
+}
+
+/*
+ * Parse a "POLICY_NAME:priority" string (e.g. "SCHED_FIFO:3") into
+ * @sched.  Returns 0 on success, -EINVAL on malformed input, unknown
+ * policy, or out-of-range priority.  @count is accepted for symmetry
+ * with the sysfs store callback; @buf is NUL-terminated by sysfs.
+ */
+static int wq_parse_scheduler(const char *buf, size_t count,
+ struct sched_attr *sched)
+{
+ char *token;
+ unsigned int len;
+ unsigned int policy, prio;
+ char policy_name[SCHED_POLICY_NAME_SIZE] = {0};
+
+ if (!buf)
+ return -EINVAL;
+ token = strchr(buf, ':');
+ if (!token)
+ return -EINVAL;
+
+ len = token - buf;
+ if (len >= sizeof(policy_name))
+ return -EINVAL;
+ strncpy(policy_name, buf, len);
+ policy_name[len] = '\0';
+
+ for (policy = 0; policy < ARRAY_SIZE(wq_sched_policys); policy++)
+ if (!strcmp(wq_sched_policys[policy], policy_name))
+ break;
+ if (policy >= ARRAY_SIZE(wq_sched_policys))
+ return -EINVAL;
+ sched->sched_policy = policy;
+ /*
+ * "%u", not "%d": prio is unsigned, so "%d" is a format-specifier
+ * mismatch and would also let "-1" wrap to a huge value.
+ */
+ if (sscanf(++token, "%u", &prio) != 1)
+ return -EINVAL;
+ /* RT classes only accept 1..MAX_RT_PRIO-1; reject the rest here
+ * instead of letting sched_setscheduler() fail per worker later. */
+ if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
+ (prio < 1 || prio > MAX_RT_PRIO - 1))
+ return -EINVAL;
+ sched->sched_priority = prio;
+ return 0;
+}
+
+/*
+ * Store @new into @pool->attrs and apply it to every worker currently
+ * attached to the pool.
+ *
+ * pool->attach_mutex serialises this sweep against workers attaching or
+ * detaching, so worker_attach_to_pool() — which applies
+ * attrs->sched_attr to newcomers — always sees a consistent value.
+ * A failure on one worker is logged but does not abort the sweep:
+ * attrs have already been updated, so best-effort is the sane choice.
+ */
+static void wq_apply_pool_scheduler(struct worker_pool *pool,
+ const struct sched_attr *new)
+{
+ struct worker *worker;
+ int ret;
+
+ mutex_lock(&pool->attach_mutex);
+ pool->attrs->sched_attr.sched_policy = new->sched_policy;
+ pool->attrs->sched_attr.sched_priority = new->sched_priority;
+ for_each_pool_worker(worker, pool) {
+ ret = wq_set_worker_scheduler(worker, new);
+ if (ret)
+ pr_err("%s:%d err[%d], worker[%s], policy[%d], prio[%d]\n",
+ __func__, __LINE__, ret,
+ worker->task->comm, new->sched_policy,
+ new->sched_priority);
+ }
+ mutex_unlock(&pool->attach_mutex);
+}
+
+/* Sysfs store: parse "POLICY:prio" and apply it to the pool's workers. */
+static ssize_t percpu_worker_pool_sched_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct worker_pool *pool;
+ struct sched_attr new;
+ int ret;
+
+ ret = wq_parse_scheduler(buf, count, &new);
+ if (ret)
+ return ret;
+
+ pool = dev_to_worker_pool(dev);
+ /*
+ * NOTE(review): unlocked peek to skip a no-op store.  A racing
+ * writer may slip through, but the worst case is one redundant
+ * wq_apply_pool_scheduler() pass under attach_mutex.
+ */
+ if (pool->attrs->sched_attr.sched_policy == new.sched_policy &&
+ pool->attrs->sched_attr.sched_priority == new.sched_priority)
+ return count;
+
+ get_online_cpus();
+ wq_apply_pool_scheduler(pool, &new);
+ put_online_cpus();
+ /* ret is provably 0 here; the old "ret ? ret : count" was dead code. */
+ return count;
+}
+
+static struct device_attribute wq_sysfs_percpu_sched_attr =
+ __ATTR(scheduler, 0644, percpu_worker_pool_sched_show,
+ percpu_worker_pool_sched_store);
+
+/*
+ * Device release callback: frees the wq_pool_device wrapper allocated
+ * in wq_sysfs_init() once the last reference to the embedded struct
+ * device is dropped.
+ */
+static void wq_pool_device_release(struct device *dev)
+{
+ kfree(container_of(dev, struct wq_pool_device, dev));
+}
+
static int __init wq_sysfs_init(void)
{
	int err;
+	int cpu;

	err = subsys_virtual_register(&wq_subsys, NULL);
	if (err)
		return err;

-	return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
+	err = device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
+	if (err)
+		return err;
+
+	/* Register one "worker_pool@<cpu>[H]" device per percpu pool. */
+	for_each_possible_cpu(cpu) {
+		struct worker_pool *pool;
+		struct wq_pool_device *wq_pool_dev;
+
+		for_each_cpu_worker_pool(pool, cpu) {
+			wq_pool_dev = kzalloc(sizeof(*wq_pool_dev),
+					GFP_KERNEL);
+			if (!wq_pool_dev)
+				return -ENOMEM;
+			wq_pool_dev->pool = pool;
+			wq_pool_dev->dev.release = wq_pool_device_release;
+			wq_pool_dev->dev.parent = wq_subsys.dev_root;
+			dev_set_name(&wq_pool_dev->dev, "worker_pool@%d%s",
+					cpu, pool->attrs->nice < 0 ? "H" : "");
+			err = device_register(&wq_pool_dev->dev);
+			if (err) {
+				pr_err("%s:%d failed, error:%d\n",
+						__func__, __LINE__, err);
+				/*
+				 * A failed device_register() may leave refs
+				 * on the embedded kobject; the wrapper must
+				 * be released via put_device() ->
+				 * wq_pool_device_release(), never kfree()d
+				 * directly.
+				 */
+				put_device(&wq_pool_dev->dev);
+				return err;
+			}
+			/* Publish only after successful registration so a
+			 * failure path never leaves a dangling pool->dev. */
+			pool->dev = wq_pool_dev;
+
+			err = device_create_file(&wq_pool_dev->dev,
+					&wq_sysfs_percpu_sched_attr);
+			if (err) {
+				pr_err("%s:%d failed, error:%d\n",
+						__func__, __LINE__, err);
+				/*
+				 * device_unregister() drops the last ref and
+				 * frees the wrapper through the release
+				 * callback; an additional kfree() here would
+				 * be a double free.
+				 */
+				pool->dev = NULL;
+				device_unregister(&wq_pool_dev->dev);
+				return err;
+			}
+		}
+	}
+	return 0;
}
core_initcall(wq_sysfs_init);

@@ -5570,6 +5749,9 @@ int __init workqueue_init_early(void)
cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
pool->attrs->nice = std_nice[i++];
pool->node = cpu_to_node(cpu);
+ pool->attrs->sched_attr.sched_policy = SCHED_NORMAL;
+ pool->attrs->sched_attr.sched_priority =
+ NICE_TO_PRIO(pool->attrs->nice);

/* alloc pool ID */
mutex_lock(&wq_pool_mutex);
--
1.8.3.1