[RFC 3/3] sched/idle: run-time support for setting idle polling

From: Luiz Capitulino
Date: Tue Sep 22 2015 - 16:34:35 EST


Some architectures allow the system administrator
to make the idle thread spin instead of entering
sleep states. The x86 architecture, for example,
has an idle= command-line parameter for this
purpose.
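
For example, booting with:

  idle=poll

on the kernel command line makes every CPU spin in
its idle loop instead of entering sleep states.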

However, the command-line parameter has two problems:

1. Changing it requires a reboot
2. The setting applies to every core in the system

The second point is relevant for systems where cores
are partitioned into bookkeeping and isolated,
low-latency cores. Usually, it's OK for bookkeeping
cores to enter deeper sleep states. It's only the
isolated ones that should poll when entering idle.

This commit solves both problems by allowing the
system administrator to set the idle thread
behavior at run-time by writing to the following
file:

/sys/devices/system/cpu/idle_loop

Reading this file returns the cpumask of CPUs whose
idle threads poll instead of sleeping; writing a
cpumask to it updates that set.
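
For example, on a hypothetical 4-CPU system, making
CPUs 2 and 3 poll while idle looks like this (the
file uses the usual hex cpumask format):

  # cat /sys/devices/system/cpu/idle_loop
  0
  # echo c > /sys/devices/system/cpu/idle_loop
  # cat /sys/devices/system/cpu/idle_loop
  c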

It's important to note that the system administrator
can't undo what was set by arch code. That is, if
arch code enabled polling by calling
cpu_idle_poll_ctrl(true), the system administrator
won't be able to unset it via sysfs.
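
For example, if the arch forced polling (say, x86
booted with idle=poll), a write clearing those bits
is expected to fail along these lines:

  # echo 0 > /sys/devices/system/cpu/idle_loop
  echo: write error: Invalid argument

with the kernel logging "idle_loop: trying to undo
arch setting".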

Signed-off-by: Luiz Capitulino <lcapitulino@xxxxxxxxxx>
---
drivers/base/cpu.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/cpu.h | 2 ++
kernel/sched/idle.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 97 insertions(+)

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 91bbb19..ac0dc3c 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -277,6 +277,49 @@ static ssize_t print_cpus_isolated(struct device *dev,
}
static DEVICE_ATTR(isolated, 0444, print_cpus_isolated, NULL);

+static ssize_t print_cpus_idle_loop(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	int ret, len = PAGE_SIZE - 2;
+	cpumask_var_t mask;
+
+	ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
+	if (!ret)
+		return -ENOMEM;
+
+	cpu_idle_loop_to_cpumask(mask);
+	ret = scnprintf(buf, len, "%*pb\n", cpumask_pr_args(mask));
+
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static ssize_t store_cpus_idle_loop(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf,
+				    size_t count)
+{
+	cpumask_var_t mask;
+	int ret;
+
+	ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
+	if (!ret)
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, mask);
+	if (ret < 0)
+		goto out;
+
+	ret = cpu_idle_loop_from_cpumask(mask);
+
+out:
+	free_cpumask_var(mask);
+	return ret < 0 ? ret : count;
+}
+
+static DEVICE_ATTR(idle_loop, 0644, print_cpus_idle_loop,
+		   store_cpus_idle_loop);
+
#ifdef CONFIG_NO_HZ_FULL
static ssize_t print_cpus_nohz_full(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -457,6 +500,7 @@ static struct attribute *cpu_root_attrs[] = {
&dev_attr_kernel_max.attr,
&dev_attr_offline.attr,
&dev_attr_isolated.attr,
+	&dev_attr_idle_loop.attr,
#ifdef CONFIG_NO_HZ_FULL
&dev_attr_nohz_full.attr,
#endif
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 23c30bd..8744507 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -274,6 +274,8 @@ enum cpuhp_state {

void cpu_startup_entry(enum cpuhp_state state);

+void cpu_idle_loop_to_cpumask(struct cpumask *mask);
+int cpu_idle_loop_from_cpumask(const struct cpumask *mask);
void cpu_idle_poll_ctrl(bool enable);

void arch_cpu_idle(void);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 3060977..03567d6 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -26,10 +26,12 @@ void sched_idle_set_state(struct cpuidle_state *idle_state)

struct idle_poll {
int force_poll;
+	bool sysfs_set;
};

static DEFINE_PER_CPU(struct idle_poll, idle_poll) = {
.force_poll = 0,
+	.sysfs_set = false,
};

static bool this_cpu_idle_poll(void)
@@ -56,6 +58,55 @@ void cpu_idle_poll_ctrl(bool enable)
}
}

+void cpu_idle_loop_to_cpumask(struct cpumask *mask)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		if (per_cpu(idle_poll, cpu).force_poll)
+			cpumask_set_cpu(cpu, mask);
+}
+
+/*
+ * This function enforces two rules:
+ *
+ * 1. force_poll can be incremented only once for a CPU
+ *    via sysfs
+ * 2. force_poll can only be decremented for a CPU if it
+ *    was previously incremented via sysfs
+ *
+ * This ensures that sysfs changes are independent of
+ * cpu_idle_poll_ctrl().
+ */
+int cpu_idle_loop_from_cpumask(const struct cpumask *mask)
+{
+	struct idle_poll *p;
+	int is_set, cpu;
+
+	for_each_possible_cpu(cpu) {
+		p = &per_cpu(idle_poll, cpu);
+		is_set = cpumask_test_cpu(cpu, mask);
+		if (!is_set && p->force_poll > 0 && !p->sysfs_set) {
+			pr_err("idle_loop: trying to undo arch setting\n");
+			return -EINVAL;
+		}
+	}
+
+	for_each_possible_cpu(cpu) {
+		p = &per_cpu(idle_poll, cpu);
+		is_set = cpumask_test_cpu(cpu, mask);
+		if (is_set && !p->sysfs_set) {
+			p->force_poll++;
+			p->sysfs_set = true;
+		} else if (!is_set && p->sysfs_set) {
+			p->force_poll--;
+			p->sysfs_set = false;
+		}
+	}
+
+	return 0;
+}
+
#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
static int __init cpu_idle_poll_setup(char *__unused)
{
--
2.1.0
