Re: [RFC] sched: Add a new lockless wake-from-idle implementation

From: Andy Lutomirski
Date: Thu Feb 13 2014 - 15:35:53 EST


On Thu, Feb 13, 2014 at 12:16 PM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Thu, Feb 13, 2014 at 09:07:10AM -0800, Andy Lutomirski wrote:
>> > I also don't really like how the polling state is an atomic; it's a
>> > CPU-local property.
>>
>> Your patch also makes the polling state an atomic (albeit one that
>> isn't changed remotely).
>
> Yah, sorry for that; I changed the email (and code) a number of
> times before posting :/
>
>> On the subject of major surgery, though: there are very few places in
>> the kernel where TIF_NEED_RESCHED gets set. With something like my
>> patch applied, I think that there is no code at all that sets any
>> other task's TIF_NEED_RESCHED. That suggests that all
>> set_tsk_need_resched callers could just call into the scheduler
>> directly.
>
> One of the main callers would be the timer tick for local preemption;
> that runs from interrupt context, so we can't call schedule() there; it
> really needs to be handled on the interrupt return path.
>
>> If so, the change could probably delete a whole lot of
>> assembly code, and every kernel exit would get faster.
>
> We already need to load that word for all kinds of other things, like
> delivering signals, so testing the one bit in the return path is
> basically free.
>
> Anyway, after all this mucking about I finally remembered Venki once
> attempted something similar:
>
> https://lkml.org/lkml/2012/2/6/357
>
> How about something like this?

This looks remarkably similar to my version, except that it's just for
mwait. Hmm.
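
If I'm reading your patch right, the round trip is: the idle CPU xchg()s
MWAIT_IPI_ENABLED into its per-cpu mwait_ipi word and monitors that word
instead of thread_info->flags; a waker that sees the enabled bit
cmpxchg()s a reason bit into the same word and skips the APIC IPI (the
store to the monitored line is what breaks the mwait); then
arch_cpu_idle_exit() drains the word via mwait_intercept_handler() and
replays scheduler_ipi() or the call-function-single handler with
interrupts off.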

My updated patch is currently completely broken. I'll fix it up and
send it out for comparison.
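
For comparison while I fix mine up, the shape both versions share,
modeled in user space (invented names, C11 atomics standing in for the
kernel primitives; this is not either patch verbatim), is roughly:

#include <stdatomic.h>
#include <stdbool.h>

#define WAKE_POLLING	0x01	/* sleeper is monitoring the word */
#define WAKE_RESCHED	0x02	/* reason bit: reschedule requested */

static _Atomic unsigned int wake_word;	/* one per CPU in the real thing */

/* Waker side: true if flipping the flag suffices, false means send an IPI. */
static bool try_lockless_wake(unsigned int reason)
{
	unsigned int old = atomic_load(&wake_word);

	do {
		if (!(old & WAKE_POLLING))
			return false;	/* target isn't polling; fall back to an IPI */
	} while (!atomic_compare_exchange_weak(&wake_word, &old, old | reason));

	return true;
}

/* Sleeper side: publish the polling bit, wait, then drain the reason bits. */
static unsigned int idle_once(void)
{
	atomic_store(&wake_word, WAKE_POLLING);

	/* The real thing would __monitor()/__mwait() on the word here. */
	while (!(atomic_load(&wake_word) & ~WAKE_POLLING))
		;

	return atomic_exchange(&wake_word, 0) & ~WAKE_POLLING;
}

The real versions obviously need the monitor/mwait pairing and the
CLFLUSH_MONITOR erratum handling on top of this, but the waker-side
cmpxchg loop and the drain-on-exit are the same idea.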

--Andy

>
> ---
> arch/x86/include/asm/mwait.h | 33 ++++++++++++++++--------
> arch/x86/kernel/process.c | 2 ++
> arch/x86/kernel/smp.c | 61 ++++++++++++++++++++++++++++++++++++++++++--
> 3 files changed, 83 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
> index 1da25a5f96f9..cb7bb8bb6617 100644
> --- a/arch/x86/include/asm/mwait.h
> +++ b/arch/x86/include/asm/mwait.h
> @@ -1,6 +1,7 @@
> #ifndef _ASM_X86_MWAIT_H
> #define _ASM_X86_MWAIT_H
>
> +#include <linux/percpu.h>
> #include <linux/sched.h>
>
> #define MWAIT_SUBSTATE_MASK 0xf
> @@ -15,6 +16,14 @@
>
> #define MWAIT_ECX_INTERRUPT_BREAK 0x1
>
> +#define MWAIT_IPI_ENABLED 0x01
> +#define MWAIT_IPI_RESCHED 0x02
> +#define MWAIT_IPI_SINGLE 0x04
> +
> +extern void mwait_intercept_handler(void);
> +
> +DECLARE_PER_CPU_ALIGNED(unsigned int, mwait_ipi);
> +
> static inline void __monitor(const void *eax, unsigned long ecx,
> unsigned long edx)
> {
> @@ -42,18 +51,20 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
> */
> static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
> {
> - if (!current_set_polling_and_test()) {
> - if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
> - mb();
> - clflush((void *)&current_thread_info()->flags);
> - mb();
> - }
> -
> - __monitor((void *)&current_thread_info()->flags, 0, 0);
> - if (!need_resched())
> - __mwait(eax, ecx);
> + unsigned int *ptr = this_cpu_ptr(&mwait_ipi);
> + unsigned int old = xchg(ptr, MWAIT_IPI_ENABLED);
> +
> + WARN_ON_ONCE(old);
> +
> + if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
> + mb();
> + clflush((void *)ptr);
> + mb();
> }
> - current_clr_polling();
> +
> + __monitor((void *)ptr, 0, 0);
> + if (!(*ptr & ~MWAIT_IPI_ENABLED))
> + __mwait(eax, ecx);
> }
>
> #endif /* _ASM_X86_MWAIT_H */
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 4505e2a950d8..00afb2b676b8 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -28,6 +28,7 @@
> #include <asm/fpu-internal.h>
> #include <asm/debugreg.h>
> #include <asm/nmi.h>
> +#include <asm/mwait.h>
>
> /*
> * per-CPU TSS segments. Threads are completely 'soft' on Linux,
> @@ -286,6 +287,7 @@ void arch_cpu_idle_enter(void)
> void arch_cpu_idle_exit(void)
> {
> __exit_idle();
> + mwait_intercept_handler();
> }
>
> void arch_cpu_idle_dead(void)
> diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
> index 7c3a5a61f2e4..4b078a8d6b83 100644
> --- a/arch/x86/kernel/smp.c
> +++ b/arch/x86/kernel/smp.c
> @@ -23,6 +23,8 @@
> #include <linux/interrupt.h>
> #include <linux/cpu.h>
> #include <linux/gfp.h>
> +#include <linux/sched.h>
> +#include <linux/smp.h>
>
> #include <asm/mtrr.h>
> #include <asm/tlbflush.h>
> @@ -31,6 +33,8 @@
> #include <asm/apic.h>
> #include <asm/nmi.h>
> #include <asm/trace/irq_vectors.h>
> +#include <asm/mwait.h>
> +
> /*
> * Some notes on x86 processor bugs affecting SMP operation:
> *
> @@ -113,6 +117,56 @@
> static atomic_t stopping_cpu = ATOMIC_INIT(-1);
> static bool smp_no_nmi_ipi = false;
>
> +DEFINE_PER_CPU_ALIGNED(unsigned int, mwait_ipi);
> +EXPORT_PER_CPU_SYMBOL_GPL(mwait_ipi);
> +
> +static bool mwait_intercept(int cpu, int ipi)
> +{
> + u32 *ptr = &per_cpu(mwait_ipi, cpu);
> + u32 val, new, old;
> +
> + if (!static_cpu_has(X86_FEATURE_MWAIT))
> + return false;
> +
> + val = *ptr;
> + if (!(val & MWAIT_IPI_ENABLED))
> + return false;
> +
> + for (;;) {
> + new = val | ipi;
> + old = cmpxchg(ptr, val, new);
> + if (old == val)
> + break;
> + val = old;
> + }
> +
> + if (!(old & MWAIT_IPI_ENABLED))
> + return false;
> +
> + return true;
> +}
> +
> +void mwait_intercept_handler(void)
> +{
> + unsigned int val, *ptr;
> +
> + if (!static_cpu_has(X86_FEATURE_MWAIT))
> + return;
> +
> + ptr = this_cpu_ptr(&mwait_ipi);
> + val = xchg(ptr, 0);
> +
> + if (!(val & ~MWAIT_IPI_ENABLED))
> + return;
> +
> + local_irq_disable();
> + if (val & MWAIT_IPI_RESCHED)
> + scheduler_ipi();
> + if (val & MWAIT_IPI_SINGLE)
> + generic_smp_call_function_single_interrupt();
> + local_irq_enable();
> +}
> +
> /*
> * this function sends a 'reschedule' IPI to another CPU.
> * it goes straight through and wastes no time serializing
> @@ -124,12 +178,15 @@ static void native_smp_send_reschedule(int cpu)
> WARN_ON(1);
> return;
> }
> - apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
> +
> + if (!mwait_intercept(cpu, MWAIT_IPI_RESCHED))
> + apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
> }
>
> void native_send_call_func_single_ipi(int cpu)
> {
> - apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
> + if (!mwait_intercept(cpu, MWAIT_IPI_SINGLE))
> + apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
> }
>
> void native_send_call_func_ipi(const struct cpumask *mask)



--
Andy Lutomirski
AMA Capital Management, LLC