Re: [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations
From: Mathieu Desnoyers
Date: Wed Dec 08 2010 - 13:17:45 EST
* Christoph Lameter (cl@xxxxxxxxx) wrote:
> Alternate approach: Could also use cmpxchg for xchg..
>
>
> Subject: cpuops: Use cmpxchg for xchg to avoid lock semantics
>
> Cmpxchg has a lower cycle count than xchg because xchg carries implied lock semantics.
>
> Simulate the xchg through cmpxchg for the cpu ops.
Hi Christoph,
Can you show if this provides savings in terms of:
- instruction cache footprint
- cycles required to run
- large-scale impact on the branch prediction buffers
Given that this targets per-cpu data only, the additional cache-line exchange
traffic caused by using cmpxchg instead of xchg (the initial read does not grab
the cache line as exclusive) should not really matter.
I'm CCing Arjan and HPA, because they might have some interesting insight into
the performance impact of lock-prefixed xchg vs using local cmpxchg in a loop.
Thanks,
Mathieu
>
> Signed-off-by: Christoph Lameter <cl@xxxxxxxxx>
>
> ---
> arch/x86/include/asm/percpu.h | 68 +++++++-----------------------------------
> 1 file changed, 12 insertions(+), 56 deletions(-)
>
> Index: linux-2.6/arch/x86/include/asm/percpu.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/percpu.h 2010-12-08 11:43:50.000000000 -0600
> +++ linux-2.6/arch/x86/include/asm/percpu.h 2010-12-08 12:00:21.000000000 -0600
> @@ -212,48 +212,6 @@ do { \
> ret__; \
> })
>
> -/*
> - * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
> - * full lock semantics even though they are not needed.
> - */
> -#define percpu_xchg_op(var, nval) \
> -({ \
> - typeof(var) __ret; \
> - typeof(var) __new = (nval); \
> - switch (sizeof(var)) { \
> - case 1: \
> - asm("xchgb %2, "__percpu_arg(1) \
> - : "=a" (__ret), "+m" (var) \
> - : "q" (__new) \
> - : "memory"); \
> - break; \
> - case 2: \
> - asm("xchgw %2, "__percpu_arg(1) \
> - : "=a" (__ret), "+m" (var) \
> - : "r" (__new) \
> - : "memory"); \
> - break; \
> - case 4: \
> - asm("xchgl %2, "__percpu_arg(1) \
> - : "=a" (__ret), "+m" (var) \
> - : "r" (__new) \
> - : "memory"); \
> - break; \
> - case 8: \
> - asm("xchgq %2, "__percpu_arg(1) \
> - : "=a" (__ret), "+m" (var) \
> - : "r" (__new) \
> - : "memory"); \
> - break; \
> - default: __bad_percpu_size(); \
> - } \
> - __ret; \
> -})
> -
> -/*
> - * cmpxchg has no such implied lock semantics as a result it is much
> - * more efficient for cpu local operations.
> - */
> #define percpu_cmpxchg_op(var, oval, nval) \
> ({ \
> typeof(var) __ret; \
> @@ -412,16 +370,6 @@ do { \
> #define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
> #define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
>
> -#define __this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
> -#define __this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
> -#define __this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
> -#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
> -#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
> -#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
> -#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
> -#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
> -#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
> -
> #ifndef CONFIG_M386
> #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
> #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
> @@ -489,16 +437,24 @@ do { \
> #define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
> #define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
>
> -#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
> -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
> -#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
> -
> #define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
> #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
> #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
>
> #endif
>
> +#define this_cpu_xchg(pcp, val) \
> +({ \
> + typeof(val) __o; \
> + do { \
> + __o = __this_cpu_read(pcp); \
> + } while (this_cpu_cmpxchg(pcp, __o, val) != __o); \
> + __o; \
> +})
> +
> +#define __this_cpu_xchg this_cpu_xchg
> +#define irqsafe_cpu_xchg this_cpu_xchg
> +
> /* This is not atomic against other CPUs -- CPU preemption needs to be off */
> #define x86_test_and_clear_bit_percpu(bit, var) \
> ({ \
>
--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/