Re: [PATCH] sched: introduce configurable delay before entering idle

From: Wanpeng Li
Date: Mon May 13 2019 - 05:20:55 EST


On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@xxxxxxxxxx> wrote:
>
>
> Certain workloads perform poorly on KVM compared to baremetal
> due to baremetal's ability to perform mwait on the NEED_RESCHED
> bit of the task flags (therefore skipping the IPI).

KVM supports exposing mwait to the guest; could that solve this?

Regards,
Wanpeng Li

>
> This patch introduces a configurable busy-wait delay before entering the
> architecture delay routine, allowing wakeup IPIs to be skipped
> (if the IPI happens in that window).
>
> The real-life workload whose performance this patch improves
> is SAP HANA (by 5-10%), for which setting idle_spin to 30us
> is sufficient.
>
> This patch improves the attached server.py and client.py example
> as follows:
>
> Host: 31.814230202231556
> Guest: 38.17718765199993 (83 %)
> Guest, idle_spin=50us: 33.317709898000004 (95 %)
> Guest, idle_spin=220us: 32.27826551499999 (98 %)
>
> Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
>
> ---
> kernel/sched/idle.c | 86 ++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 86 insertions(+)
>
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index f5516bae0c1b..bca7656a7ea0 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -216,6 +216,29 @@ static void cpuidle_idle_call(void)
> rcu_idle_exit();
> }
>
> +static unsigned int spin_before_idle_us;
>
> +static void do_spin_before_idle(void)
> +{
> + ktime_t now, end_spin;
> +
> + now = ktime_get();
> + end_spin = ktime_add_ns(now, spin_before_idle_us*1000);
> +
> + rcu_idle_enter();
> + local_irq_enable();
> + stop_critical_timings();
> +
> + do {
> + cpu_relax();
> + now = ktime_get();
> + } while (!tif_need_resched() && ktime_before(now, end_spin));
> +
> + start_critical_timings();
> + rcu_idle_exit();
> + local_irq_disable();
> +}
> +
> /*
> * Generic idle loop implementation
> *
> @@ -259,6 +282,8 @@ static void do_idle(void)
> tick_nohz_idle_restart_tick();
> cpu_idle_poll();
> } else {
> + if (spin_before_idle_us)
> + do_spin_before_idle();
> cpuidle_idle_call();
> }
> arch_cpu_idle_exit();
> @@ -465,3 +490,64 @@ const struct sched_class idle_sched_class = {
> .switched_to = switched_to_idle,
> .update_curr = update_curr_idle,
> };
> +
> +
> +static ssize_t store_idle_spin(struct kobject *kobj,
> + struct kobj_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned int val;
> +
> + if (kstrtouint(buf, 10, &val) < 0)
> + return -EINVAL;
> +
> + if (val > USEC_PER_SEC)
> + return -EINVAL;
> +
> + spin_before_idle_us = val;
> + return count;
> +}
> +
> +static ssize_t show_idle_spin(struct kobject *kobj,
> + struct kobj_attribute *attr,
> + char *buf)
> +{
> + ssize_t ret;
> +
> + ret = sprintf(buf, "%d\n", spin_before_idle_us);
> +
> + return ret;
> +}
> +
> +static struct kobj_attribute idle_spin_attr =
> + __ATTR(idle_spin, 0644, show_idle_spin, store_idle_spin);
> +
> +static struct attribute *sched_attrs[] = {
> + &idle_spin_attr.attr,
> + NULL,
> +};
> +
> +static const struct attribute_group sched_attr_group = {
> + .attrs = sched_attrs,
> +};
> +
> +static struct kobject *sched_kobj;
> +
> +static int __init sched_sysfs_init(void)
> +{
> + int error;
> +
> + sched_kobj = kobject_create_and_add("sched", kernel_kobj);
> + if (!sched_kobj)
> + return -ENOMEM;
> +
> + error = sysfs_create_group(sched_kobj, &sched_attr_group);
> + if (error)
> + goto err;
> + return 0;
> +
> +err:
> + kobject_put(sched_kobj);
> + return error;
> +}
> +postcore_initcall(sched_sysfs_init);