[PATCH -tip 1/2] x86/hweight: Fix false output register dependency of POPCNT insn

From: Uros Bizjak
Date: Tue Mar 25 2025 - 12:49:20 EST


On Sandy/Ivy Bridge and later Intel processors, the POPCNT instruction
appears to have a false dependency on the destination register. Even
though the instruction only writes to it, the instruction will wait
until the destination is ready before executing. This false dependency
was fixed for Cannon Lake (and later) processors.

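Fix the false dependency by clearing the destination register first.
This is done only for 64-bit builds, where the input (%rdi/%edi) and the
output (%rax/%eax) live in different registers. On 32-bit builds both
operands are constrained to %eax, so the destination cannot be cleared
before POPCNT reads its source.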

The x86_64 defconfig object size increases by 779 bytes:

    text    data    bss      dec     hex filename
27341418 4643015 814852 32799285 1f47a35 vmlinux-old.o
27342197 4643015 814852 32800064 1f47d40 vmlinux-new.o

Signed-off-by: Uros Bizjak <ubizjak@xxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
---
arch/x86/include/asm/arch_hweight.h | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index cbc6157f0b4b..aa0b3bd309fc 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -4,12 +4,21 @@

#include <asm/cpufeatures.h>

+/*
+ * On Sandy/Ivy Bridge and later Intel processors, the POPCNT instruction
+ * appears to have a false dependency on the destination register. Even
+ * though the instruction only writes to it, the instruction will wait
+ * until the destination is ready before executing. This false dependency
+ * was fixed for Cannon Lake (and later) processors.
+ */
+#define ASM_FORCE_CLR "xorl %k[cnt], %k[cnt]\n\t"
+
#ifdef CONFIG_64BIT
#define REG_IN "D"
-#define REG_OUT "a"
+#define ASM_CLR ASM_FORCE_CLR
#else
#define REG_IN "a"
-#define REG_OUT "a"
+#define ASM_CLR
#endif

static __always_inline unsigned int __arch_hweight32(unsigned int w)
@@ -18,8 +27,9 @@ static __always_inline unsigned int __arch_hweight32(unsigned int w)

asm_inline (ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
"call __sw_hweight32",
- "popcntl %[val], %[cnt]", X86_FEATURE_POPCNT)
- : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT
+ ASM_CLR "popcntl %[val], %[cnt]",
+ X86_FEATURE_POPCNT)
+ : [cnt] "=a" (res), ASM_CALL_CONSTRAINT
: [val] REG_IN (w));

return res;
@@ -48,8 +58,9 @@ static __always_inline unsigned long __arch_hweight64(__u64 w)

asm_inline (ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
"call __sw_hweight64",
- "popcntq %[val], %[cnt]", X86_FEATURE_POPCNT)
- : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT
+ ASM_CLR "popcntq %[val], %[cnt]",
+ X86_FEATURE_POPCNT)
+ : [cnt] "=a" (res), ASM_CALL_CONSTRAINT
: [val] REG_IN (w));

return res;
--
2.42.0