[PATCH] sched: idle: Support nohlt_list kernel parameter

From: zhenwei pi
Date: Wed May 22 2019 - 05:11:42 EST


Currently the kernel only supports the hlt and nohlt kernel parameters,
so either all CPUs poll in idle or none of them do. A guest OS can't
control power under KVM virtualization, so the only choice is between
high performance (nohlt) and CPU overcommit (hlt).
The nohlt_list kernel parameter makes only the specified CPU(s) poll
in idle, while all other CPUs still halt.

For example, a guest with 16 vCPUs on x86 can be booted with:
linux ... irqaffinity=0,2,4,6 nohlt_list=0,2,4,6
This means that 25% of the CPUs can always run in VM mode and benefit
from posted interrupts.

Signed-off-by: zhenwei pi <pizhenwei@xxxxxxxxxxxxx>
---
kernel/sched/idle.c | 35 +++++++++++++++++++++++++++++++++--
1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80940939b733..5a0c3498258b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -50,6 +50,37 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
+
+/* CPUs named by "nohlt_list="; they poll in idle instead of halting. */
+static cpumask_var_t cpu_nohlt_cpumask __cpumask_var_read_mostly;
+
+/* Parse "nohlt_list=<cpulist>"; an invalid list leaves the mask empty. */
+static int __init cpu_idle_poll_list_setup(char *str)
+{
+	alloc_bootmem_cpumask_var(&cpu_nohlt_cpumask);
+	if (cpulist_parse(str, cpu_nohlt_cpumask)) {
+		pr_warn("idle: nohlt_list= incorrect CPU range\n");
+		cpumask_clear(cpu_nohlt_cpumask);
+	} else {
+		pr_info("idle: nohlt_list=%s\n", str);
+	}
+
+	return 1;
+}
+__setup("nohlt_list=", cpu_idle_poll_list_setup);
+
+/* Poll if polling is forced globally or this CPU is in nohlt_list. */
+static inline bool cpu_idle_should_poll(void)
+{
+	return cpu_idle_force_poll ||
+	       (cpumask_available(cpu_nohlt_cpumask) &&
+		cpumask_test_cpu(smp_processor_id(), cpu_nohlt_cpumask));
+}
+#else
+static inline bool cpu_idle_should_poll(void)
+{
+	return !!cpu_idle_force_poll;
+}
#endif

static noinline int __cpuidle cpu_idle_poll(void)
@@ -60,7 +91,7 @@ static noinline int __cpuidle cpu_idle_poll(void)
stop_critical_timings();

while (!tif_need_resched() &&
- (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ (cpu_idle_should_poll() || tick_check_broadcast_expired()))
cpu_relax();
start_critical_timings();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
@@ -256,7 +287,7 @@ static void do_idle(void)
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
*/
- if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ if (cpu_idle_should_poll() || tick_check_broadcast_expired()) {
tick_nohz_idle_restart_tick();
cpu_idle_poll();
} else {
--
2.11.0