Re: [PATCHv2] arm: Preserve TPIDRURW on context switch

From: Jonathan Austin
Date: Fri May 03 2013 - 11:24:53 EST


Hi Russell,

Thanks for the comments - you're right about the 'switch_tls'
being more appropriate - needed to take a step back to see that.

I've got a few questions, added inline.

AndrÃ, Assuming I've understood things okay, there's a patch that
uses Russell's asm stuff (with minor modifications, see the questions)
and includes the C-world changes too. Perhaps you could see that it
solves your problem?

On 03/05/13 10:55, Russell King - ARM Linux wrote:
> On Fri, May 03, 2013 at 10:21:34AM +0100, Jonathan Austin wrote:
>> .macro set_tls_v6k, tp, tmp1, tmp2
>> - mcr p15, 0, \tp, c13, c0, 3 @ set TLS register
>> - mov \tmp1, #0
>> - mcr p15, 0, \tmp1, c13, c0, 2 @ clear user r/w TLS register
>> + ldrd \tmp1, \tmp2, [\tp]
>> + mcr p15, 0, \tmp1, c13, c0, 3 @ set user r/o TLS register
>> + mcr p15, 0, \tmp2, c13, c0, 2 @ set user r/w TLS register
>
> So we're still back at stalling the pipeline with result delays on older
> CPUs?

How much older? This particular bit is v6k specific so I wasn't worrying
too much but I guess I'm missing something?

It's an academic question wrt to this patch now, though, as the version
you show below re-orders to reduce the stalls...

>
>> + .endm
>> +
>> + .macro save_tlsuser_v6k, tp, tmp1, tmp2
>> + @ TPIDRURW can be updated from userspace, so we have to re-read it
>> + mrc p15, 0, \tmp2, c13, c0, 2 @ load user r/w TLS register
>> + str \tmp2, [\tp, #4]
>> .endm
>>
>> .macro set_tls_v6, tp, tmp1, tmp2
>> @@ -16,15 +25,26 @@
>> ldr \tmp1, [\tmp1, #0]
>> mov \tmp2, #0xffff0fff
>> tst \tmp1, #HWCAP_TLS @ hardware TLS available?
>> - mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register
>> - movne \tmp1, #0
>> - mcrne p15, 0, \tmp1, c13, c0, 2 @ clear user r/w TLS register
>> - streq \tp, [\tmp2, #-15] @ set TLS value at 0xffff0ff0
>> + ldrned \tmp1, \tmp2, [\tp]
>> + ldreq \tmp1, [\tp]
>> + mcrne p15, 0, \tmp1, c13, c0, 3 @ yes, set user r/o TLS register
>> + mcrne p15, 0, \tmp2, c13, c0, 2 @ set user r/w TLS register
>> + streq \tmp1, [\tmp2, #-15] @ set TLS value at 0xffff0ff0
>
> This at least is better.
>
>> + .endm
>> +
>> + .macro save_tlsuser_v6, tp, tmp1, tmp2
>> + @ TPIDRURW can be updated from userspace, so we have to re-read it
>> + ldr \tmp1, =elf_hwcap
>> + ldr \tmp1, [\tmp1, #0]
>> + tst \tmp1, #HWCAP_TLS @ hardware TLS available?
>
> But this isn't - this involves two delays.

Indeed. You left this section untouched in your asm below - was that
because you didn't look at optimising it, or because you thought there
wasn't much better that could be done with it?

As far as I can see, we can't start doing any mcr/mrc operations until we
know for sure that the hw implements them, so this is something we're
stuck with?

Is V6 but not V6k just 1136?

>
>> + mrcne p15, 0, \tmp2, c13, c0, 2 @ read user r/w TLS register
>> + strne \tmp2, [\tp, #4] @ save in to thread_info
>> .endm
>>
>> .macro set_tls_software, tp, tmp1, tmp2
>> - mov \tmp1, #0xffff0fff
>> - str \tp, [\tmp1, #-15] @ set TLS value at 0xffff0ff0
>> + ldr \tmp1, [\tp]
>> + mov \tmp2, #0xffff0fff
>> + str \tmp1, [\tmp2, #-15] @ set TLS value at 0xffff0ff0
>> .endm
>> #endif
>>
>> @@ -32,18 +52,31 @@
>> #define tls_emu 1
>> #define has_tls_reg 1
>> #define set_tls set_tls_none
>> +#define save_tlsuser save_tlsuser_none
>> +#define get_tlsuser get_tlsuser_none
>> #elif defined(CONFIG_CPU_V6)
>> #define tls_emu 0
>> #define has_tls_reg (elf_hwcap & HWCAP_TLS)
>> #define set_tls set_tls_v6
>> +#define save_tlsuser save_tlsuser_v6
>> +#define get_tlsuser get_tlsuser_v6
>> #elif defined(CONFIG_CPU_32v6K)
>> #define tls_emu 0
>> #define has_tls_reg 1
>> #define set_tls set_tls_v6k
>> +#define save_tlsuser save_tlsuser_v6k
>> +#define get_tlsuser get_tlsuser_v6k
>> #else
>> #define tls_emu 0
>> #define has_tls_reg 0
>> #define set_tls set_tls_software
>> +#define save_tlsuser save_tlsuser_none
>> +#define get_tlsuser get_tlsuser_none
>> #endif
>
> This separation of setting and saving the TLS value is actually quite
> silly. They're called from the same place, so lets just call it
> "switch_tls" instead.

Agreed. Thanks for the suggestion.

>
> Here's just the assembly bits doing that - this is totally untested
> of course:
>
> arch/arm/include/asm/tls.h | 28 +++++++++++++++-------------
> arch/arm/kernel/entry-armv.S | 4 ++--
> 2 files changed, 17 insertions(+), 15 deletions(-)
>
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 73409e6..9c377f1 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -2,27 +2,29 @@
> #define __ASMARM_TLS_H
>
> #ifdef __ASSEMBLY__

+#include <asm/asm-offsets.h>

required for TI_TP_VALUE

> - .macro set_tls_none, tp, tmp1, tmp2
> + .macro switch_tls_none, base, tp, trw, tmp1, tmp2
> .endm
>
> - .macro set_tls_v6k, tp, tmp1, tmp2
> + .macro switch_tls_v6k, base, tp, trw, tmp1, tmp2

How do you feel about calling tp and trw something different? tpidro
and tpidrw, or tp and tpuser?

The naming threw me off slightly first time I read this new signature
(tp=thread_pointer/tls_pointer/etc).

> + mrc p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register
> mcr p15, 0, \tp, c13, c0, 3 @ set TLS register
> - mov \tmp1, #0
> - mcr p15, 0, \tmp1, c13, c0, 2 @ clear user r/w TLS register
> + mcr p15, 0, \trw, c13, c0, 2 @ and the user r/w register
> + str \tmp2, [\base, #TI_TP_VALUE + 4]@ save it
> .endm
>
> - .macro set_tls_v6, tp, tmp1, tmp2
> + .macro switch_tls_v6, base, tp, trw, tmp1, tmp2
> ldr \tmp1, =elf_hwcap
> ldr \tmp1, [\tmp1, #0]
> mov \tmp2, #0xffff0fff
> tst \tmp1, #HWCAP_TLS @ hardware TLS available?
> - mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register
> - movne \tmp1, #0
> - mcrne p15, 0, \tmp1, c13, c0, 2 @ clear user r/w TLS register
> streq \tp, [\tmp2, #-15] @ set TLS value at 0xffff0ff0
> + mrcne p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register
> + mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register
> + mcrne p15, 0, \trw, c13, c0, 2 @ set user r/w register
> + strne \tmp2, [\base, #TI_TP_VALUE + 4]@ save it
> .endm
>
> - .macro set_tls_software, tp, tmp1, tmp2
> + .macro switch_tls_software, base, tp, trw, tmp1, tmp2
> mov \tmp1, #0xffff0fff
> str \tp, [\tmp1, #-15] @ set TLS value at 0xffff0ff0
> .endm
> @@ -31,19 +33,19 @@
> #ifdef CONFIG_TLS_REG_EMUL
> #define tls_emu 1
> #define has_tls_reg 1
> -#define set_tls set_tls_none
> +#define switch_tls switch_tls_none
> #elif defined(CONFIG_CPU_V6)
> #define tls_emu 0
> #define has_tls_reg (elf_hwcap & HWCAP_TLS)
> -#define set_tls set_tls_v6
> +#define switch_tls switch_tls_v6
> #elif defined(CONFIG_CPU_32v6K)
> #define tls_emu 0
> #define has_tls_reg 1
> -#define set_tls set_tls_v6k
> +#define switch_tls switch_tls_v6k
> #else
> #define tls_emu 0
> #define has_tls_reg 0
> -#define set_tls set_tls_software
> +#define switch_tls switch_tls_software
> #endif
>
> #endif /* __ASMARM_TLS_H */
> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
> index 0f82098..81a08b1 100644
> --- a/arch/arm/kernel/entry-armv.S
> +++ b/arch/arm/kernel/entry-armv.S
> @@ -728,15 +728,15 @@ ENTRY(__switch_to)
> UNWIND(.fnstart )
> UNWIND(.cantunwind )
> add ip, r1, #TI_CPU_SAVE
> - ldr r3, [r2, #TI_TP_VALUE]
> ARM( stmia ip!, {r4 - sl, fp, sp, lr} ) @ Store most regs on stack
> THUMB( stmia ip!, {r4 - sl, fp} ) @ Store most regs on stack
> THUMB( str sp, [ip], #4 )
> THUMB( str lr, [ip], #4 )
> + ldrd r4, r5, [r2, #TI_TP_VALUE]
> #ifdef CONFIG_CPU_USE_DOMAINS
> ldr r6, [r2, #TI_CPU_DOMAIN]
> #endif
> - set_tls r3, r4, r5
> + switch_tls r2, r4, r5, r3, r7

Looking at the implementation above and the way you use 'base', I think
that should be
switch_tls r1, r4, r5, r3, r7
not
switch_tls r2, r4, r5, r3, r7

That way we save tpidrurw in to the old thread pointer not the new one.

> #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
> ldr r7, [r2, #TI_TASK]
> ldr r8, =__stack_chk_guard
>
>

Here's a complete patch including Russell's suggested asm. Tested on
TC2, build-tested for 1136

I think we could drop the extra .c file too - though it is kinda nice
for keeping include/asm/tls.h clean...

----8<------
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index cddda1f..d90be6d 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -58,7 +58,7 @@ struct thread_info {
struct cpu_context_save cpu_context; /* cpu context */
__u32 syscall; /* syscall number */
__u8 used_cp[16]; /* thread used copro */
- unsigned long tp_value;
+ unsigned long tp_value[2]; /* TLS registers */
#ifdef CONFIG_CRUNCH
struct crunch_state crunchstate;
#endif
diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 73409e6..39bce5b 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -2,27 +2,30 @@
#define __ASMARM_TLS_H

#ifdef __ASSEMBLY__
- .macro set_tls_none, tp, tmp1, tmp2
+#include <asm/asm-offsets.h>
+ .macro switch_tls_none, base, tp, trw, tmp1, tmp2
.endm

- .macro set_tls_v6k, tp, tmp1, tmp2
+ .macro switch_tls_v6k, base, tp, trw, tmp1, tmp2
+ mrc p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register
mcr p15, 0, \tp, c13, c0, 3 @ set TLS register
- mov \tmp1, #0
- mcr p15, 0, \tmp1, c13, c0, 2 @ clear user r/w TLS register
+ mcr p15, 0, \trw, c13, c0, 2 @ and the user r/w register
+ strne \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
.endm

- .macro set_tls_v6, tp, tmp1, tmp2
+ .macro switch_tls_v6, base, tp, trw, tmp1, tmp2
ldr \tmp1, =elf_hwcap
ldr \tmp1, [\tmp1, #0]
mov \tmp2, #0xffff0fff
tst \tmp1, #HWCAP_TLS @ hardware TLS available?
- mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register
- movne \tmp1, #0
- mcrne p15, 0, \tmp1, c13, c0, 2 @ clear user r/w TLS register
streq \tp, [\tmp2, #-15] @ set TLS value at 0xffff0ff0
+ mrcne p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register
+ mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register
+ mcrne p15, 0, \trw, c13, c0, 2 @ set user r/w register
+ strne \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
.endm

- .macro set_tls_software, tp, tmp1, tmp2
+ .macro switch_tls_software, base, tp, trw, tmp1, tmp2
mov \tmp1, #0xffff0fff
str \tp, [\tmp1, #-15] @ set TLS value at 0xffff0ff0
.endm
@@ -31,19 +34,28 @@
#ifdef CONFIG_TLS_REG_EMUL
#define tls_emu 1
#define has_tls_reg 1
-#define set_tls set_tls_none
+#define switch_tls switch_tls_none
+#define get_tlsuser get_tlsuser_none
#elif defined(CONFIG_CPU_V6)
#define tls_emu 0
#define has_tls_reg (elf_hwcap & HWCAP_TLS)
-#define set_tls set_tls_v6
+#define switch_tls switch_tls_v6
+#define get_tlsuser get_tlsuser_v6
#elif defined(CONFIG_CPU_32v6K)
#define tls_emu 0
#define has_tls_reg 1
-#define set_tls set_tls_v6k
+#define switch_tls switch_tls_v6k
+#define get_tlsuser get_tlsuser_v6k
#else
#define tls_emu 0
#define has_tls_reg 0
-#define set_tls set_tls_software
+#define switch_tls switch_tls_software
+#define get_tlsuser get_tlsuser_none
#endif

+#ifndef __ASSEMBLY__
+extern unsigned long get_tlsuser_none(void);
+extern unsigned long get_tlsuser_v6(void);
+extern unsigned long get_tlsuser_v6k(void);
+#endif
#endif /* __ASMARM_TLS_H */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 5f3338e..4e1114c 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -17,7 +17,7 @@ CFLAGS_REMOVE_return_address.o = -pg

obj-y := elf.o entry-armv.o entry-common.o irq.o opcodes.o \
process.o ptrace.o return_address.o sched_clock.o \
- setup.o signal.o stacktrace.o sys_arm.o time.o traps.o
+ setup.o signal.o stacktrace.o sys_arm.o time.o tls.o traps.o

obj-$(CONFIG_ATAGS) += atags_parse.o
obj-$(CONFIG_ATAGS_PROC) += atags_proc.o
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 0f82098..80f09fe 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -728,15 +728,15 @@ ENTRY(__switch_to)
UNWIND(.fnstart )
UNWIND(.cantunwind )
add ip, r1, #TI_CPU_SAVE
- ldr r3, [r2, #TI_TP_VALUE]
ARM( stmia ip!, {r4 - sl, fp, sp, lr} ) @ Store most regs on stack
THUMB( stmia ip!, {r4 - sl, fp} ) @ Store most regs on stack
THUMB( str sp, [ip], #4 )
THUMB( str lr, [ip], #4 )
+ ldrd r4, r5, [r2, #TI_TP_VALUE]
#ifdef CONFIG_CPU_USE_DOMAINS
ldr r6, [r2, #TI_CPU_DOMAIN]
#endif
- set_tls r3, r4, r5
+ switch_tls r1, r4, r5, r3, r7
#if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
ldr r7, [r2, #TI_TASK]
ldr r8, =__stack_chk_guard
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 047d3e4..24dbc72 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -39,6 +39,7 @@
#include <asm/thread_notify.h>
#include <asm/stacktrace.h>
#include <asm/mach/time.h>
+#include <asm/tls.h>

#ifdef CONFIG_CC_STACKPROTECTOR
#include <linux/stackprotector.h>
@@ -395,7 +396,8 @@ copy_thread(unsigned long clone_flags, unsigned long stack_start,
clear_ptrace_hw_breakpoint(p);

if (clone_flags & CLONE_SETTLS)
- thread->tp_value = childregs->ARM_r3;
+ thread->tp_value[0] = childregs->ARM_r3;
+ thread->tp_value[1] = get_tlsuser();

thread_notify(THREAD_NOTIFY_COPY, thread);

diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 03deeff..2bc1514 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -849,7 +849,7 @@ long arch_ptrace(struct task_struct *child, long request,
#endif

case PTRACE_GET_THREAD_AREA:
- ret = put_user(task_thread_info(child)->tp_value,
+ ret = put_user(task_thread_info(child)->tp_value[0],
datap);
break;

diff --git a/arch/arm/kernel/tls.c b/arch/arm/kernel/tls.c
new file mode 100644
index 0000000..1627f5b
--- /dev/null
+++ b/arch/arm/kernel/tls.c
@@ -0,0 +1,50 @@
+/*
+ * arch/arm/kernel/tls.c
+ *
+ * Copyright (C) 2013 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <asm/tls.h>
+
+/*
+ * Access to the TPIDRURW register, with full certainty that it exists.
+ */
+unsigned long get_tlsuser_v6k(void)
+{
+ unsigned long v;
+ asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r" (v));
+ return v;
+}
+
+/*
+ * Access to the TPIDRURW register if it exists.
+ */
+unsigned long get_tlsuser_v6(void)
+{
+ unsigned long v = 0;
+ if (elf_hwcap & HWCAP_TLS)
+ asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r" (v));
+ return v;
+}
+
+/*
+ * Dummy access for the case that TLS is emulated in software
+ */
+unsigned long get_tlsuser_none(void)
+{
+ return 0;
+}
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 1c08911..f9d6259 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -588,7 +588,7 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
return regs->ARM_r0;

case NR(set_tls):
- thread->tp_value = regs->ARM_r0;
+ thread->tp_value[0] = regs->ARM_r0;
if (tls_emu)
return 0;
if (has_tls_reg) {
@@ -706,7 +706,7 @@ static int get_tp_trap(struct pt_regs *regs, unsigned int instr)
int reg = (instr >> 12) & 15;
if (reg == 15)
return 1;
- regs->uregs[reg] = current_thread_info()->tp_value;
+ regs->uregs[reg] = current_thread_info()->tp_value[0];
regs->ARM_pc += 4;
return 0;
}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/