Re: RFC: Petition Intel/AMD to add POPF_IF insn

From: Denys Vlasenko
Date: Thu Aug 18 2016 - 05:21:59 EST


On 08/17/2016 11:35 PM, Linus Torvalds wrote:
On Wed, Aug 17, 2016 at 2:26 PM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:

Of course, if somebody uses native_restore_fl() to actually *disable*
interrupts (when they weren't already disabled), then this untested
patch will just not work. But why would you do somethign so stupid?
Famous last words...

Looking around, the vsmp code actually uses "native_restore_fl()" to
not just set the interrupt flag, but AC as well.

And the PV spinlock case has that "push;popf" sequence encoded in an alternate.

So that trivial patch may (or may not - still not tested) work for
some quick testing, but needs more effort for any *real* use.

I'm going to test the attached patch.

PV and debug maze is daunting... I discovered that Fedora kernels
compile with the set of .config options which result in
spin_unlock_irqrestore which looks like this:

ffffffff818d0f40 <_raw_spin_unlock_irqrestore>:
ffffffff818d0f40: e8 1b 31 00 00 callq ffffffff818d4060 <__fentry__> ffffffff818d0f41: R_X86_64_PC32 __fentry__-0x4
ffffffff818d0f45: 55 push %rbp
ffffffff818d0f46: 48 89 e5 mov %rsp,%rbp
ffffffff818d0f49: 41 54 push %r12
ffffffff818d0f4b: 53 push %rbx
ffffffff818d0f4c: 48 8b 55 08 mov 0x8(%rbp),%rdx
ffffffff818d0f50: 49 89 fc mov %rdi,%r12
ffffffff818d0f53: 48 89 f3 mov %rsi,%rbx
ffffffff818d0f56: 48 83 c7 18 add $0x18,%rdi
ffffffff818d0f5a: be 01 00 00 00 mov $0x1,%esi
ffffffff818d0f5f: e8 8c b8 83 ff callq ffffffff8110c7f0 <lock_release> ffffffff818d0f60: R_X86_64_PC32 lock_release-0x4
ffffffff818d0f64: 4c 89 e7 mov %r12,%rdi
ffffffff818d0f67: e8 d4 fb 83 ff callq ffffffff81110b40 <do_raw_spin_unlock> ffffffff818d0f68: R_X86_64_PC32 do_raw_spin_unlock-0x4
ffffffff818d0f6c: f6 c7 02 test $0x2,%bh
ffffffff818d0f6f: 74 1b je ffffffff818d0f8c <_raw_spin_unlock_irqrestore+0x4c>
ffffffff818d0f71: e8 aa 9d 83 ff callq ffffffff8110ad20 <trace_hardirqs_on> ffffffff818d0f72: R_X86_64_PC32 trace_hardirqs_on-0x4
ffffffff818d0f76: 48 89 df mov %rbx,%rdi
ffffffff818d0f79: ff 14 25 48 32 e2 81 callq *0xffffffff81e23248 ffffffff818d0f7c: R_X86_64_32S pv_irq_ops+0x8
ffffffff818d0f80: 65 ff 0d c9 c4 73 7e decl %gs:0x7e73c4c9(%rip) # d450 <__preempt_count> ffffffff818d0f83: R_X86_64_PC32 __preempt_count-0x4
ffffffff818d0f87: 5b pop %rbx
ffffffff818d0f88: 41 5c pop %r12
ffffffff818d0f8a: 5d pop %rbp
ffffffff818d0f8b: c3 retq
ffffffff818d0f8c: 48 89 df mov %rbx,%rdi
ffffffff818d0f8f: ff 14 25 48 32 e2 81 callq *0xffffffff81e23248 ffffffff818d0f92: R_X86_64_32S pv_irq_ops+0x8
ffffffff818d0f96: e8 35 5b 83 ff callq ffffffff81106ad0 <trace_hardirqs_off> ffffffff818d0f97: R_X86_64_PC32 trace_hardirqs_off-0x4
ffffffff818d0f9b: eb e3 jmp ffffffff818d0f80 <_raw_spin_unlock_irqrestore+0x40>
ffffffff818d0f9d: 0f 1f 00 nopl (%rax)

Gawd... really Fedora? All this is needed?

Testing with _this_ is not going to show any differences, I need to weed
all the cruft out and test a fast, non-debug .config.

Changed it like this:

+# CONFIG_UPROBES is not set
+# CONFIG_SCHED_OMIT_FRAME_POINTER is not set
+# CONFIG_HYPERVISOR_GUEST is not set
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_KMEMCHECK is not set
+# CONFIG_DEBUG_LOCK_ALLOC is not set
+# CONFIG_PROVE_LOCKING is not set
+# CONFIG_LOCK_STAT is not set
+# CONFIG_PROVE_RCU is not set
+# CONFIG_LATENCYTOP is not set
+# CONFIG_FTRACE is not set
+# CONFIG_BINARY_PRINTF is not set

Looks better (it's with the patch already):

00000000000000f0 <_raw_spin_unlock_irqrestore>:
f0: 53 push %rbx
f1: 48 89 f3 mov %rsi,%rbx
f4: e8 00 00 00 00 callq f9 <_raw_spin_unlock_irqrestore+0x9> f5: R_X86_64_PC32 do_raw_spin_unlock-0x4
f9: 80 e7 02 and $0x2,%bh
fc: 74 01 je ff <_raw_spin_unlock_irqrestore+0xf>
fe: fb sti
ff: 65 ff 0d 00 00 00 00 decl %gs:0x0(%rip) # 106 <_raw_spin_unlock_irqrestore+0x16> 102: R_X86_64_PC32 __preempt_count-0x4
106: 5b pop %rbx
107: c3 retq

This also raises questions. Such as "why do_raw_spin_unlock() is not inlined here?"

Anyway... on to more kernel builds and testing.
Please take a look at the below patch. If it's obviously buggy, let me know.


diff -urp linux-4.7.1.org/arch/x86/include/asm/irqflags.h linux-4.7.1.obj/arch/x86/include/asm/irqflags.h
--- linux-4.7.1.org/arch/x86/include/asm/irqflags.h 2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/include/asm/irqflags.h 2016-08-18 10:01:09.514219644 +0200
@@ -44,6 +44,16 @@ static inline void native_irq_enable(voi
asm volatile("sti": : :"memory");
}
+/*
+ * This produces a "test; jz; sti" insn sequence.
+ * It's faster than "push reg; popf". If sti is skipped, it's much faster.
+ */
+static inline void native_cond_irq_enable(unsigned long flags)
+{
+ if (flags & X86_EFLAGS_IF)
+ native_irq_enable();
+}
+
static inline void native_safe_halt(void)
{
asm volatile("sti; hlt": : :"memory");
@@ -69,7 +79,8 @@ static inline notrace unsigned long arch
static inline notrace void arch_local_irq_restore(unsigned long flags)
{
- native_restore_fl(flags);
+ if (flags & X86_EFLAGS_IF)
+ native_irq_enable();
}
static inline notrace void arch_local_irq_disable(void)
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt.c linux-4.7.1.obj/arch/x86/kernel/paravirt.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt.c 2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt.c 2016-08-18 10:01:24.797155109 +0200
@@ -313,7 +313,7 @@ struct pv_time_ops pv_time_ops = {
__visible struct pv_irq_ops pv_irq_ops = {
.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+ .restore_fl = __PV_IS_CALLEE_SAVE(native_cond_irq_enable),
.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.safe_halt = native_safe_halt,
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt_patch_32.c linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_32.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt_patch_32.c 2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_32.c 2016-08-18 04:43:19.388102755 +0200
@@ -2,7 +2,7 @@
DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, restore_fl, "testb $((1<<9)>>8),%ah; jz 1f; sti; 1: ;");
DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
DEF_NATIVE(pv_cpu_ops, iret, "iret");
DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt_patch_64.c linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_64.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt_patch_64.c 2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_64.c 2016-08-18 04:42:56.364545509 +0200
@@ -4,7 +4,7 @@
DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
+DEF_NATIVE(pv_irq_ops, restore_fl, "testw $(1<<9),%di; jz 1f; sti; 1: ;");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");