Re: [PATCH -v2] x86/hweight: Get rid of the special calling convention
From: Borislav Petkov
Date: Thu May 12 2016 - 07:57:48 EST
On Wed, May 11, 2016 at 09:54:50PM -0700, H. Peter Anvin wrote:
> I was thinking it isn't really very complex code even in assembly as
> it is super-regular; you can even crib the gcc-generated code if you
> wish.
Do I wanna do experiments in asm? Always! :-)
Ok, so I did steal gcc -m32 -O3 output because there it uses only one
additional register. So how about this (only __sw_hweight32 today):
/*
 * Save/restore of the scratch register used by __sw_hweight32() below.
 * The width of the push/pop has to match the native stack slot size.
 */
#ifdef CONFIG_X86_32
# define PUSH_DX "pushl %%edx\n\t"
# define POP_DX "popl %%edx\n\t"
#else
# define PUSH_DX "pushq %%rdx\n\t"
# define POP_DX "popq %%rdx\n\t"
#endif

/*
 * __sw_hweight32 - software population count of a 32-bit word
 * @w: value whose set bits are counted
 *
 * Returns the number of bits set in @w.
 *
 * The asm body uses rDX as its only temporary and saves/restores it by
 * hand instead of listing it as a clobber, so the asm statement leaves
 * rDX exactly as it found it.
 *
 * Because the compiler is not told about rDX, @w must never be allocated
 * to it: with a plain "r" constraint the compiler would be free to pick
 * edx, the computation would then operate on a single register and
 * POP_DX would overwrite the result. Pin @w to eax ("a") instead, which
 * is also the register the result is wanted in.
 */
unsigned int __sw_hweight32(unsigned int w)
{
	asm volatile(PUSH_DX
		     "movl %[w], %%edx\n\t"			/* w -> t */
		     "shrl %%edx\n\t"				/* t >> 1 */
		     "andl $0x55555555, %%edx\n\t"		/* t & 0x55555555 */
		     "subl %%edx, %[w]\n"			/* w -= t */
		     "\n\t"
		     "movl %[w], %%edx\n\t"			/* w -> t */
		     "shrl $2, %[w]\n\t"			/* w_tmp >> 2 */
		     "andl $0x33333333, %%edx\n\t"		/* t & 0x33333333 */
		     "andl $0x33333333, %[w]\n\t"		/* w_tmp & 0x33333333 */
		     "addl %%edx, %[w]\n"			/* w = w_tmp + t */
		     "\n\t"
		     "movl %[w], %%edx\n\t"			/* w -> t */
		     "shrl $4, %%edx\n\t"			/* t >> 4 */
		     "addl %%edx, %[w]\n\t"			/* w_tmp += t */
		     "andl $0x0f0f0f0f, %[w]\n\t"		/* w_tmp &= 0x0f0f0f0f */
		     "imull $0x01010101, %[w], %[w]\n\t"	/* w_tmp *= 0x01010101 */
		     "shrl $24, %[w]\n\t"			/* w = w_tmp >> 24 */
		     POP_DX
		     : [w] "+a" (w)	/* eax, never edx: see above */
		     :
		     : "cc");		/* the arithmetic modifies EFLAGS */

	return w;
}
I've chosen rDX as a temp because gcc takes that one but it doesn't
matter which - we're stashing it.
And then we rely on gcc to figure out which reg to use for w. It ends up
using rAX as that is the return reg which fits nicely with our intention
of returning POPCNT values in rAX.
I'm guessing we can just as well write %%rax in the asm because we're
returning that value and that's ABI.
Generated asm looks ok, only on 64-bit it does one
movl %edi, %eax # w, w
before the inline asm in order to stick w in rAX.
Complaints?
Full diff:
---
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7bb15747fea2..79e0bcd61cb1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -292,11 +292,6 @@ config X86_32_LAZY_GS
def_bool y
depends on X86_32 && !CC_STACKPROTECTOR
-config ARCH_HWEIGHT_CFLAGS
- string
- default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
- default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
-
config ARCH_SUPPORTS_UPROBES
def_bool y
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 02e799fa43d1..7dd97eaba67d 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,10 +2,11 @@
#define _ASM_X86_HWEIGHT_H
#include <asm/cpufeatures.h>
+#include <asm/static_cpu_has.h>
#ifdef CONFIG_64BIT
-/* popcnt %edi, %eax -- redundant REX prefix for alignment */
-#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
+/* popcnt %edi, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7"
/* popcnt %rdi, %rax */
#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
#define REG_IN "D"
@@ -17,19 +18,15 @@
#define REG_OUT "a"
#endif
-/*
- * __sw_hweightXX are called from within the alternatives below
- * and callee-clobbered registers need to be taken care of. See
- * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
- * compiler switches.
- */
+#define __HAVE_ARCH_SW_HWEIGHT
+
static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
- unsigned int res = 0;
+ unsigned int res;
asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
- : "="REG_OUT (res)
- : REG_IN (w));
+ : "="REG_OUT (res)
+ : REG_IN (w));
return res;
}
@@ -53,13 +50,16 @@ static inline unsigned long __arch_hweight64(__u64 w)
#else
static __always_inline unsigned long __arch_hweight64(__u64 w)
{
- unsigned long res = 0;
+ unsigned long res;
- asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
- : "="REG_OUT (res)
- : REG_IN (w));
+ if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
+ asm volatile(POPCNT64
+ : "="REG_OUT (res)
+ : REG_IN (w));
- return res;
+ return res;
+ }
+ return __sw_hweight64(w);
}
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 07c942d84662..9a70b12ae8df 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,6 +6,8 @@
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
#include <asm/asm.h>
+#include <asm/static_cpu_has.h>
+
#include <linux/bitops.h>
enum cpuid_leafs
@@ -45,51 +47,6 @@ extern const char * const x86_power_flags[32];
*/
extern const char * const x86_bug_flags[NBUGINTS*32];
-#define test_cpu_cap(c, bit) \
- test_bit(bit, (unsigned long *)((c)->x86_capability))
-
-#define REQUIRED_MASK_BIT_SET(bit) \
- ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0 )) || \
- (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1 )) || \
- (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2 )) || \
- (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3 )) || \
- (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4 )) || \
- (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5 )) || \
- (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6 )) || \
- (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7 )) || \
- (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8 )) || \
- (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9 )) || \
- (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) || \
- (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) || \
- (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) || \
- (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) || \
- (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) || \
- (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) || \
- (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
-
-#define DISABLED_MASK_BIT_SET(bit) \
- ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0 )) || \
- (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1 )) || \
- (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2 )) || \
- (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3 )) || \
- (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4 )) || \
- (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5 )) || \
- (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6 )) || \
- (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7 )) || \
- (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8 )) || \
- (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9 )) || \
- (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) || \
- (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) || \
- (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) || \
- (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) || \
- (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) || \
- (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) || \
- (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
-
-#define cpu_has(c, bit) \
- (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
- test_cpu_cap(c, bit))
-
#define this_cpu_has(bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
@@ -105,8 +62,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define cpu_feature_enabled(bit) \
(__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
-#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
-
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
#define setup_clear_cpu_cap(bit) do { \
@@ -118,69 +73,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
-#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
-/*
- * Static testing of CPU features. Used the same as boot_cpu_has().
- * These will statically patch the target code for additional
- * performance.
- */
-static __always_inline __pure bool _static_cpu_has(u16 bit)
-{
- asm_volatile_goto("1: jmp 6f\n"
- "2:\n"
- ".skip -(((5f-4f) - (2b-1b)) > 0) * "
- "((5f-4f) - (2b-1b)),0x90\n"
- "3:\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 4f - .\n" /* repl offset */
- " .word %P1\n" /* always replace */
- " .byte 3b - 1b\n" /* src len */
- " .byte 5f - 4f\n" /* repl len */
- " .byte 3b - 2b\n" /* pad len */
- ".previous\n"
- ".section .altinstr_replacement,\"ax\"\n"
- "4: jmp %l[t_no]\n"
- "5:\n"
- ".previous\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 0\n" /* no replacement */
- " .word %P0\n" /* feature bit */
- " .byte 3b - 1b\n" /* src len */
- " .byte 0\n" /* repl len */
- " .byte 0\n" /* pad len */
- ".previous\n"
- ".section .altinstr_aux,\"ax\"\n"
- "6:\n"
- " testb %[bitnum],%[cap_byte]\n"
- " jnz %l[t_yes]\n"
- " jmp %l[t_no]\n"
- ".previous\n"
- : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
- [bitnum] "i" (1 << (bit & 7)),
- [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
- : : t_yes, t_no);
- t_yes:
- return true;
- t_no:
- return false;
-}
-
-#define static_cpu_has(bit) \
-( \
- __builtin_constant_p(boot_cpu_has(bit)) ? \
- boot_cpu_has(bit) : \
- _static_cpu_has(bit) \
-)
-#else
-/*
- * Fall back to dynamic for gcc versions which don't support asm goto. Should be
- * a minority now anyway.
- */
-#define static_cpu_has(bit) boot_cpu_has(bit)
-#endif
-
#define cpu_has_bug(c, bit) cpu_has(c, (bit))
#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit))
diff --git a/arch/x86/include/asm/cpuinfo.h b/arch/x86/include/asm/cpuinfo.h
new file mode 100644
index 000000000000..a6632044f199
--- /dev/null
+++ b/arch/x86/include/asm/cpuinfo.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X86_CPUINFO_H_
+#define _ASM_X86_CPUINFO_H_
+
+/*
+ * CPU type and hardware bug flags. Kept separately for each CPU.
+ * Members of this structure are referenced in head.S, so think twice
+ * before touching them. [mj]
+ */
+struct cpuinfo_x86 {
+ __u8 x86; /* CPU family */
+ __u8 x86_vendor; /* CPU vendor */
+ __u8 x86_model;
+ __u8 x86_mask;
+#ifdef CONFIG_X86_32
+ char wp_works_ok; /* It doesn't on 386's */
+
+ /* Problems on some 486Dx4's and old 386's: */
+ char rfu;
+ char pad0;
+ char pad1;
+#else
+ /* Number of 4K pages in DTLB/ITLB combined(in pages): */
+ int x86_tlbsize;
+#endif
+ __u8 x86_virt_bits;
+ __u8 x86_phys_bits;
+ /* CPUID returned core id bits: */
+ __u8 x86_coreid_bits;
+ /* Max extended CPUID function supported: */
+ __u32 extended_cpuid_level;
+ /* Maximum supported CPUID level, -1=no CPUID: */
+ int cpuid_level;
+ __u32 x86_capability[NCAPINTS + NBUGINTS];
+ char x86_vendor_id[16];
+ char x86_model_id[64];
+ /* in KB - valid for CPUS which support this call: */
+ int x86_cache_size;
+ int x86_cache_alignment; /* In bytes */
+ /* Cache QoS architectural values: */
+ int x86_cache_max_rmid; /* max index */
+ int x86_cache_occ_scale; /* scale to bytes */
+ int x86_power;
+ unsigned long loops_per_jiffy;
+ /* cpuid returned max cores value: */
+ u16 x86_max_cores;
+ u16 apicid;
+ u16 initial_apicid;
+ u16 x86_clflush_size;
+ /* number of cores as seen by the OS: */
+ u16 booted_cores;
+ /* Physical processor id: */
+ u16 phys_proc_id;
+ /* Logical processor id: */
+ u16 logical_proc_id;
+ /* Core id: */
+ u16 cpu_core_id;
+ /* Index into per_cpu list: */
+ u16 cpu_index;
+ u32 microcode;
+};
+
+extern struct cpuinfo_x86 boot_cpu_data;
+extern struct cpuinfo_x86 new_cpu_data;
+
+#endif /* _ASM_X86_CPUINFO_H_ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 62c6cc3cc5d3..6f6555b20e3d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@ struct vm86;
#include <asm/nops.h>
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
+#include <asm/cpuinfo.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -78,65 +79,6 @@ extern u16 __read_mostly tlb_lld_2m[NR_INFO];
extern u16 __read_mostly tlb_lld_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_1g[NR_INFO];
-/*
- * CPU type and hardware bug flags. Kept separately for each CPU.
- * Members of this structure are referenced in head.S, so think twice
- * before touching them. [mj]
- */
-
-struct cpuinfo_x86 {
- __u8 x86; /* CPU family */
- __u8 x86_vendor; /* CPU vendor */
- __u8 x86_model;
- __u8 x86_mask;
-#ifdef CONFIG_X86_32
- char wp_works_ok; /* It doesn't on 386's */
-
- /* Problems on some 486Dx4's and old 386's: */
- char rfu;
- char pad0;
- char pad1;
-#else
- /* Number of 4K pages in DTLB/ITLB combined(in pages): */
- int x86_tlbsize;
-#endif
- __u8 x86_virt_bits;
- __u8 x86_phys_bits;
- /* CPUID returned core id bits: */
- __u8 x86_coreid_bits;
- /* Max extended CPUID function supported: */
- __u32 extended_cpuid_level;
- /* Maximum supported CPUID level, -1=no CPUID: */
- int cpuid_level;
- __u32 x86_capability[NCAPINTS + NBUGINTS];
- char x86_vendor_id[16];
- char x86_model_id[64];
- /* in KB - valid for CPUS which support this call: */
- int x86_cache_size;
- int x86_cache_alignment; /* In bytes */
- /* Cache QoS architectural values: */
- int x86_cache_max_rmid; /* max index */
- int x86_cache_occ_scale; /* scale to bytes */
- int x86_power;
- unsigned long loops_per_jiffy;
- /* cpuid returned max cores value: */
- u16 x86_max_cores;
- u16 apicid;
- u16 initial_apicid;
- u16 x86_clflush_size;
- /* number of cores as seen by the OS: */
- u16 booted_cores;
- /* Physical processor id: */
- u16 phys_proc_id;
- /* Logical processor id: */
- u16 logical_proc_id;
- /* Core id: */
- u16 cpu_core_id;
- /* Index into per_cpu list: */
- u16 cpu_index;
- u32 microcode;
-};
-
#define X86_VENDOR_INTEL 0
#define X86_VENDOR_CYRIX 1
#define X86_VENDOR_AMD 2
@@ -151,9 +93,6 @@ struct cpuinfo_x86 {
/*
* capabilities of CPUs
*/
-extern struct cpuinfo_x86 boot_cpu_data;
-extern struct cpuinfo_x86 new_cpu_data;
-
extern struct tss_struct doublefault_tss;
extern __u32 cpu_caps_cleared[NCAPINTS];
extern __u32 cpu_caps_set[NCAPINTS];
diff --git a/arch/x86/include/asm/static_cpu_has.h b/arch/x86/include/asm/static_cpu_has.h
new file mode 100644
index 000000000000..648ada0c7ffe
--- /dev/null
+++ b/arch/x86/include/asm/static_cpu_has.h
@@ -0,0 +1,116 @@
+#ifndef _ASM_X86_STATIC_CPU_HAS_H
+#define _ASM_X86_STATIC_CPU_HAS_H
+
+#include <asm/cpuinfo.h>
+
+#define test_cpu_cap(c, bit) \
+ test_bit(bit, (unsigned long *)((c)->x86_capability))
+/* True if feature @bit is set in the REQUIRED_MASKn word selected by (bit >> 5); one clause per word n. */
+#define REQUIRED_MASK_BIT_SET(bit)					\
+	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||	\
+	   (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||	\
+	   (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||	\
+	   (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||	\
+	   (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||	\
+	   (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||	\
+	   (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||	\
+	   (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||	\
+	   (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||	\
+	   (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||	\
+	   (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||	\
+	   (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||	\
+	   (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||	\
+	   (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||	\
+	   (((bit)>>5)==15 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||	\
+	   (((bit)>>5)==16 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
+/* True if feature @bit is set in the DISABLED_MASKn word selected by (bit >> 5); one clause per word n. */
+#define DISABLED_MASK_BIT_SET(bit)					\
+	 ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||	\
+	   (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||	\
+	   (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||	\
+	   (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||	\
+	   (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||	\
+	   (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||	\
+	   (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||	\
+	   (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||	\
+	   (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||	\
+	   (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||	\
+	   (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||	\
+	   (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||	\
+	   (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||	\
+	   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||	\
+	   (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||	\
+	   (((bit)>>5)==15 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||	\
+	   (((bit)>>5)==16 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
+
+#define cpu_has(c, bit) \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
+ test_cpu_cap(c, bit))
+
+#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
+
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
+/*
+ * Static testing of CPU features. Used the same as boot_cpu_has().
+ * These will statically patch the target code for additional
+ * performance.
+ */
+static __always_inline __pure bool _static_cpu_has(u16 bit)
+{
+ asm_volatile_goto("1: jmp 6f\n"
+ "2:\n"
+ ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+ "((5f-4f) - (2b-1b)),0x90\n"
+ "3:\n"
+ ".section .altinstructions,\"a\"\n"
+ " .long 1b - .\n" /* src offset */
+ " .long 4f - .\n" /* repl offset */
+ " .word %P1\n" /* always replace */
+ " .byte 3b - 1b\n" /* src len */
+ " .byte 5f - 4f\n" /* repl len */
+ " .byte 3b - 2b\n" /* pad len */
+ ".previous\n"
+ ".section .altinstr_replacement,\"ax\"\n"
+ "4: jmp %l[t_no]\n"
+ "5:\n"
+ ".previous\n"
+ ".section .altinstructions,\"a\"\n"
+ " .long 1b - .\n" /* src offset */
+ " .long 0\n" /* no replacement */
+ " .word %P0\n" /* feature bit */
+ " .byte 3b - 1b\n" /* src len */
+ " .byte 0\n" /* repl len */
+ " .byte 0\n" /* pad len */
+ ".previous\n"
+ ".section .altinstr_aux,\"ax\"\n"
+ "6:\n"
+ " testb %[bitnum],%[cap_byte]\n"
+ " jnz %l[t_yes]\n"
+ " jmp %l[t_no]\n"
+ ".previous\n"
+ : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
+ [bitnum] "i" (1 << (bit & 7)),
+ [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+ : : t_yes, t_no);
+ t_yes:
+ return true;
+ t_no:
+ return false;
+}
+
+#define static_cpu_has(bit) \
+( \
+ __builtin_constant_p(boot_cpu_has(bit)) ? \
+ boot_cpu_has(bit) : \
+ _static_cpu_has(bit) \
+)
+#else
+/*
+ * Fall back to dynamic for gcc versions which don't support asm goto. Should be
+ * a minority now anyway.
+ */
+#define static_cpu_has(bit) boot_cpu_has(bit)
+#endif
+
+#endif /* _ASM_X86_STATIC_CPU_HAS_H */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 72a576752a7e..ec969cc3eb20 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -25,7 +25,7 @@ lib-y += memcpy_$(BITS).o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
-obj-y += msr.o msr-reg.o msr-reg-export.o
+obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
ifeq ($(CONFIG_X86_32),y)
obj-y += atomic64_32.o
diff --git a/arch/x86/lib/hweight.c b/arch/x86/lib/hweight.c
new file mode 100644
index 000000000000..5834fb9af6ff
--- /dev/null
+++ b/arch/x86/lib/hweight.c
@@ -0,0 +1,37 @@
+#include <linux/export.h>
+#include <linux/compiler.h>
+
+#ifdef CONFIG_X86_32
+# define PUSH_DX "pushl %%edx\n\t"
+# define POP_DX "popl %%edx\n\t"
+#else
+# define PUSH_DX "pushq %%rdx\n\t"
+# define POP_DX "popq %%rdx\n\t"
+#endif
+/* Software popcount of @w. rDX is hand-saved scratch, so @w is pinned to eax ("+a") and can never land in edx. */
+unsigned int __sw_hweight32(unsigned int w)
+{
+	asm volatile(PUSH_DX
+		     "movl %[w], %%edx\n\t"			/* w -> t */
+		     "shrl %%edx\n\t"				/* t >> 1 */
+		     "andl $0x55555555, %%edx\n\t"		/* t & 0x55555555 */
+		     "subl %%edx, %[w]\n"			/* w -= t */
+		     "\n\t"
+		     "movl %[w], %%edx\n\t"			/* w -> t */
+		     "shrl $2, %[w]\n\t"			/* w_tmp >> 2 */
+		     "andl $0x33333333, %%edx\n\t"		/* t & 0x33333333 */
+		     "andl $0x33333333, %[w]\n\t"		/* w_tmp & 0x33333333 */
+		     "addl %%edx, %[w]\n"			/* w = w_tmp + t */
+		     "\n\t"
+		     "movl %[w], %%edx\n\t"			/* w -> t */
+		     "shrl $4, %%edx\n\t"			/* t >> 4 */
+		     "addl %%edx, %[w]\n\t"			/* w_tmp += t */
+		     "andl $0x0f0f0f0f, %[w]\n\t"		/* w_tmp &= 0x0f0f0f0f */
+		     "imull $0x01010101, %[w], %[w]\n\t"	/* w_tmp *= 0x01010101 */
+		     "shrl $24, %[w]\n\t"			/* w = w_tmp >> 24 */
+		     POP_DX
+		     : [w] "+a" (w));
+
+	return w;
+}
+EXPORT_SYMBOL(__sw_hweight32);
diff --git a/lib/Makefile b/lib/Makefile
index a65e9a861535..55ad20701dc0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n
KCOV_INSTRUMENT_list_debug.o := n
KCOV_INSTRUMENT_debugobjects.o := n
KCOV_INSTRUMENT_dynamic_debug.o := n
-# Kernel does not boot if we instrument this file as it uses custom calling
-# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
-KCOV_INSTRUMENT_hweight.o := n
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
@@ -72,8 +69,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
-GCOV_PROFILE_hweight.o := n
-CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
obj-$(CONFIG_BTREE) += btree.o
diff --git a/lib/hweight.c b/lib/hweight.c
index 9a5c1f221558..d53137a8def4 100644
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -9,6 +9,7 @@
* The Hamming Weight of a number is the total number of bits set in it.
*/
+#ifndef __HAVE_ARCH_SW_HWEIGHT
unsigned int __sw_hweight32(unsigned int w)
{
#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
@@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w)
#endif
}
EXPORT_SYMBOL(__sw_hweight32);
+#endif
unsigned int __sw_hweight16(unsigned int w)
{
--
Regards/Gruss,
Boris.
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
--