[PATCH] x86: Optimize TIF checking in __switch_to_xtra.
From: Kyle Huey
Date: Tue Dec 06 2016 - 18:04:56 EST
For the TIF_BLOCKSTEP and TIF_NOTSC cases, __switch_to_xtra only needs to act
if the value of the flag changes across the task switch. That leads to C code
like:
	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
		/* do work */
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* do some more work */
	}
GCC compiles that to:
        mov %rdi,%r13
        mov (%rdi),%rax
        mov %rsi,%r12
        mov (%rsi),%rdx
        shr $0x19,%rax
        shr $0x19,%rdx
        and $0x1,%eax
        and $0x1,%edx
        cmp %al,%dl
        je 1f
        /* do work */
1:      mov (%r13),%rax
        mov (%r12),%rdx
        shr $0x10,%rax
        shr $0x10,%rdx
        and $0x1,%edx
        and $0x1,%eax
        cmp %al,%dl
        je 2f
        /* do some more work */
2:
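(For reference, the shift counts and mask constants in the assembly correspond
to the bit numbers of these flags on x86; abridged from
arch/x86/include/asm/thread_info.h:)

        #define TIF_NOTSC               16      /* TSC is not accessible in userland */
        #define TIF_BLOCKSTEP           25      /* set when we want DEBUGCTLMSR_BTF */
        #define _TIF_NOTSC              (1 << TIF_NOTSC)        /* 0x00010000 */
        #define _TIF_BLOCKSTEP          (1 << TIF_BLOCKSTEP)    /* 0x02000000 */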
This does a bunch of unnecessary work. If we are not going to do the
TIF_BLOCKSTEP-specific work here, we don't care about the actual values of the
bits, only whether or not they are different. And if we are going to do that
work, GCC won't reuse the computation anyway, because test_tsk_thread_flag
calls test_bit, which takes a volatile argument.
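(Concretely, a simplified sketch of the accessors involved; these are not the
exact kernel definitions, but they show the volatile-qualified load in test_bit
that keeps GCC from caching the flags word across the two tests:)

        /* simplified sketch of the existing helpers, not the exact definitions */
        static inline int test_bit(int nr, const volatile unsigned long *addr)
        {
                /* volatile load: the compiler must re-read the word each time */
                return 1UL & (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG));
        }

        static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
        {
                return test_bit(flag,
                                (unsigned long *)&task_thread_info(tsk)->flags);
        }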
In https://lkml.org/lkml/2016/12/3/120, Ingo is currently blocking my attempt
to add a third flag of this type.
I propose the following patch, which results in GCC generating this code:
        mov (%rsi),%rax
        xor (%rdi),%rax
        mov %rdi,%rbx
        mov %rsi,%r12
        test $0x2000000,%eax
        mov %rax,%rdx
        jne 3f
1:      test $0x10000,%edx
        je 2f
        /* do some more work */
2:      /* the rest of the function */
        ret
3:      /* do work */
        mov (%r12),%rdx
        xor (%rbx),%rdx
        jmp 1b
When we are not doing the work controlled by these flags, this is significantly
better. We only have to load the flags from memory once, and we do a single
XOR that we can test multiple times instead of repeatedly isolating bits.
When we are doing the work controlled by these flags, this is slightly better.
We still get to avoid isolating particular bits, and we simply recompute the
XOR before jumping back if the register was clobbered by the work. Finally,
in either event, additional TIF checks of this form simply become another test
and branch, reducing the overhead of the new check I would like to add.
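(To illustrate the last point: with the helper below, a hypothetical third
check is just one more instance of the same pattern. TIF_FOO, _TIF_FOO,
enable_foo() and disable_foo() are placeholders, not existing flags or
functions:)

        if (test_tsk_threads_flag_differ(prev_p, next_p, _TIF_FOO)) {
                /* placeholder work for the hypothetical new flag */
                if (test_tsk_thread_flag(next_p, TIF_FOO))
                        enable_foo();
                else
                        disable_foo();
        }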
Signed-off-by: Kyle Huey <khuey@xxxxxxxxxxxx>
---
arch/x86/kernel/process.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0888a879120f..bbe85c377345 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -187,37 +187,45 @@ int set_tsc_mode(unsigned int val)
 	else if (val == PR_TSC_ENABLE)
 		enable_TSC();
 	else
 		return -EINVAL;
 
 	return 0;
 }
 
+static inline int test_tsk_threads_flag_differ(struct task_struct *prev,
+					       struct task_struct *next,
+					       unsigned long mask)
+{
+	unsigned long diff;
+
+	diff = task_thread_info(prev)->flags ^ task_thread_info(next)->flags;
+	return (diff & mask) != 0;
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss)
 {
 	struct thread_struct *prev, *next;
 
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
-	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
-	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
+	if (test_tsk_threads_flag_differ(prev_p, next_p, _TIF_BLOCKSTEP)) {
 		unsigned long debugctl = get_debugctlmsr();
 
 		debugctl &= ~DEBUGCTLMSR_BTF;
 		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
 			debugctl |= DEBUGCTLMSR_BTF;
 
 		update_debugctlmsr(debugctl);
 	}
 
-	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
-	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+	if (test_tsk_threads_flag_differ(prev_p, next_p, _TIF_NOTSC)) {
 		/* prev and next are different */
 		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
 			hard_disable_TSC();
 		else
 			hard_enable_TSC();
 	}
 
 	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
--
2.11.0