Re: [PATCH v2 3/4] sched/isolation: Add HK_TYPE_WQ to isolcpus=domain

From: Frederic Weisbecker
Date: Fri Oct 14 2022 - 09:24:28 EST


On Fri, Oct 14, 2022 at 10:36:19AM +0200, Peter Zijlstra wrote:
>
> + Frederic; who actually does most of this code
>
> On Thu, Oct 13, 2022 at 03:40:28PM -0300, Leonardo Bras wrote:
> > Housekeeping code keeps multiple cpumasks in order to keep track of which
> > cpus can perform a given housekeeping category.
> >
> > Every time the HK_TYPE_WQ cpumask is checked before queueing work on a cpu's
> > WQ, the code also checks HK_TYPE_DOMAIN. So it can be assumed that
> > domain isolation also ends up isolating work queues.
> >
> > Delegating the current HK_TYPE_DOMAIN work queue isolation to HK_TYPE_WQ
> > makes it simpler to check whether a cpu can run work queue items, since the
> > code only needs to go through a single HK_TYPE_* cpumask.
> >
> > Make isolcpus=domain aggregate both HK_TYPE_DOMAIN and HK_TYPE_WQ, and
> > remove a lot of cpumask_and calls.
> >
> > Also, remove an unnecessary '|=' at housekeeping_isolcpus_setup(), since we
> > are sure that 'flags == 0' here.
> >
> > Signed-off-by: Leonardo Bras <leobras@xxxxxxxxxx>
>
> I've long maintained that having all these separate masks is daft;
> Frederic, do we really need that?

Indeed. In my queue for the cpuset interface to nohz_full, I have the following
patch (but note DOMAIN and WQ have to stay separate flags because workqueue
affinity can be modified separately from isolcpus).

---
From: Frederic Weisbecker <frederic@xxxxxxxxxx>
Date: Tue, 26 Jul 2022 17:03:30 +0200
Subject: [PATCH] sched/isolation: Gather nohz_full related isolation features
into common flag

Signed-off-by: Frederic Weisbecker <frederic@xxxxxxxxxx>
---
arch/x86/kvm/x86.c | 2 +-
drivers/pci/pci-driver.c | 2 +-
include/linux/sched/isolation.h | 7 +------
kernel/cpu.c | 4 ++--
kernel/kthread.c | 4 ++--
kernel/rcu/tasks.h | 2 +-
kernel/rcu/tree_plugin.h | 6 +++---
kernel/sched/core.c | 10 +++++-----
kernel/sched/fair.c | 6 +++---
kernel/sched/isolation.c | 25 +++++++------------------
kernel/watchdog.c | 2 +-
kernel/workqueue.c | 2 +-
net/core/net-sysfs.c | 2 +-
13 files changed, 29 insertions(+), 45 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1910e1e78b15..d0b73fcf4a1c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9009,7 +9009,7 @@ int kvm_arch_init(void *opaque)
}

if (pi_inject_timer == -1)
- pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
+ pi_inject_timer = housekeeping_enabled(HK_TYPE_NOHZ_FULL);
#ifdef CONFIG_X86_64
pvclock_gtod_register_notifier(&pvclock_gtod_notifier);

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 49238ddd39ee..af3494a39921 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -378,7 +378,7 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
goto out;
}
cpumask_and(wq_domain_mask,
- housekeeping_cpumask(HK_TYPE_WQ),
+ housekeeping_cpumask(HK_TYPE_NOHZ_FULL),
housekeeping_cpumask(HK_TYPE_DOMAIN));

cpu = cpumask_any_and(cpumask_of_node(node),
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 8c15abd67aed..7ca34e04abe7 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -6,15 +6,10 @@
#include <linux/tick.h>

enum hk_type {
- HK_TYPE_TIMER,
- HK_TYPE_RCU,
- HK_TYPE_MISC,
+ HK_TYPE_NOHZ_FULL,
HK_TYPE_SCHED,
- HK_TYPE_TICK,
HK_TYPE_DOMAIN,
- HK_TYPE_WQ,
HK_TYPE_MANAGED_IRQ,
- HK_TYPE_KTHREAD,
HK_TYPE_MAX
};

diff --git a/kernel/cpu.c b/kernel/cpu.c
index bbad5e375d3b..573f14d75a2e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1500,8 +1500,8 @@ int freeze_secondary_cpus(int primary)
cpu_maps_update_begin();
if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
- if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
- primary = housekeeping_any_cpu(HK_TYPE_TIMER);
+ if (!housekeeping_cpu(primary, HK_TYPE_NOHZ_FULL))
+ primary = housekeeping_any_cpu(HK_TYPE_NOHZ_FULL);
} else {
if (!cpu_online(primary))
primary = cpumask_first(cpu_online_mask);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 544fd4097406..0719035feba0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -355,7 +355,7 @@ static int kthread(void *_create)
* back to default in case they have been changed.
*/
sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_NOHZ_FULL));

/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -721,7 +721,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_NOHZ_FULL));
set_mems_allowed(node_states[N_MEMORY]);

current->flags |= PF_NOFREEZE;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index f5bf6fb430da..b99f79625b26 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -537,7 +537,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
struct rcu_tasks *rtp = arg;

/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current, HK_TYPE_RCU);
+ housekeeping_affine(current, HK_TYPE_NOHZ_FULL);
WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!

/*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index b2219577fbe2..4935b06c3caf 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1237,9 +1237,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
- cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_NOHZ_FULL));
if (cpumask_empty(cm))
- cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_NOHZ_FULL));
set_cpus_allowed_ptr(t, cm);
mutex_unlock(&rnp->boost_kthread_mutex);
free_cpumask_var(cm);
@@ -1294,5 +1294,5 @@ static void rcu_bind_gp_kthread(void)
{
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current, HK_TYPE_RCU);
+ housekeeping_affine(current, HK_TYPE_NOHZ_FULL);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f53c0096860b..5ff205f39197 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1079,13 +1079,13 @@ int get_nohz_timer_target(void)
struct sched_domain *sd;
const struct cpumask *hk_mask;

- if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_NOHZ_FULL)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}

- hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
+ hk_mask = housekeeping_cpumask(HK_TYPE_NOHZ_FULL);

rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -1101,7 +1101,7 @@ int get_nohz_timer_target(void)
}

if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
+ default_cpu = housekeeping_any_cpu(HK_TYPE_NOHZ_FULL);
cpu = default_cpu;
unlock:
rcu_read_unlock();
@@ -5562,7 +5562,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;

- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_NOHZ_FULL))
return;

WARN_ON_ONCE(!tick_work_cpu);
@@ -5583,7 +5583,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;

- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_NOHZ_FULL))
return;

WARN_ON_ONCE(!tick_work_cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77b2048a9326..ac3b33e00451 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10375,7 +10375,7 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
+ * - HK_TYPE_NOHZ_FULL CPUs are used for this task, because HK_TYPE_SCHED not set
* anywhere yet.
*/

@@ -10384,7 +10384,7 @@ static inline int find_new_ilb(void)
int ilb;
const struct cpumask *hk_mask;

- hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
+ hk_mask = housekeeping_cpumask(HK_TYPE_NOHZ_FULL);

for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {

@@ -10400,7 +10400,7 @@ static inline int find_new_ilb(void)

/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * idle CPU in the HK_TYPE_NOHZ_FULL housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 4087718ee5b4..443f1ce83e32 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -4,20 +4,15 @@
* any CPU: unbound workqueues, timers, kthreads and any offloadable work.
*
* Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
- * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
+ * Copyright (C) 2017-2022 SUSE, Frederic Weisbecker
*
*/

enum hk_flags {
- HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
- HK_FLAG_RCU = BIT(HK_TYPE_RCU),
- HK_FLAG_MISC = BIT(HK_TYPE_MISC),
+ HK_FLAG_NOHZ_FULL = BIT(HK_TYPE_NOHZ_FULL),
HK_FLAG_SCHED = BIT(HK_TYPE_SCHED),
- HK_FLAG_TICK = BIT(HK_TYPE_TICK),
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
- HK_FLAG_WQ = BIT(HK_TYPE_WQ),
HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
- HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
};

DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
@@ -88,7 +83,7 @@ void __init housekeeping_init(void)

static_branch_enable(&housekeeping_overridden);

- if (housekeeping.flags & HK_FLAG_TICK)
+ if (housekeeping.flags & HK_FLAG_NOHZ_FULL)
sched_tick_offload_init();

for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
@@ -111,7 +106,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
cpumask_var_t non_housekeeping_mask, housekeeping_staging;
int err = 0;

- if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
+ if ((flags & HK_FLAG_NOHZ_FULL) && !(housekeeping.flags & HK_FLAG_NOHZ_FULL)) {
if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
pr_warn("Housekeeping: nohz unsupported."
" Build with CONFIG_NO_HZ_FULL\n");
@@ -163,7 +158,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
housekeeping_setup_type(type, housekeeping_staging);
}

- if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
+ if ((flags & HK_FLAG_NOHZ_FULL) && !(housekeeping.flags & HK_FLAG_NOHZ_FULL))
tick_nohz_full_setup(non_housekeeping_mask);

housekeeping.flags |= flags;
@@ -179,12 +174,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)

static int __init housekeeping_nohz_full_setup(char *str)
{
- unsigned long flags;
-
- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
- HK_FLAG_MISC | HK_FLAG_KTHREAD;
-
- return housekeeping_setup(str, flags);
+ return housekeeping_setup(str, HK_FLAG_NOHZ_FULL);
}
__setup("nohz_full=", housekeeping_nohz_full_setup);

@@ -198,8 +188,7 @@ static int __init housekeeping_isolcpus_setup(char *str)
while (isalpha(*str)) {
if (!strncmp(str, "nohz,", 5)) {
str += 5;
- flags |= HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER |
- HK_FLAG_RCU | HK_FLAG_MISC | HK_FLAG_KTHREAD;
+ flags |= HK_FLAG_NOHZ_FULL;
continue;
}

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 20a7a55e62b6..3e9636f4bac6 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -852,7 +852,7 @@ void __init lockup_detector_init(void)
pr_info("Disabling watchdog on nohz_full cores by default\n");

cpumask_copy(&watchdog_cpumask,
- housekeeping_cpumask(HK_TYPE_TIMER));
+ housekeeping_cpumask(HK_TYPE_NOHZ_FULL));

if (!watchdog_nmi_probe())
nmi_watchdog_available = true;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ea50f6be843..3eb283d76d81 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5993,7 +5993,7 @@ void __init workqueue_init_early(void)
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
+ cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_NOHZ_FULL));
cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));

pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e319e242dddf..6dddf359b754 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -852,7 +852,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,

if (!cpumask_empty(mask)) {
cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
- cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_NOHZ_FULL));
if (cpumask_empty(mask)) {
free_cpumask_var(mask);
return -EINVAL;
--
2.25.1