[PATCH] x86/crc: drop the avx10_256 functions and rename avx10_512 to avx512
From: Eric Biggers <ebiggers@xxxxxxxxxx>
Date: Wed Mar 19 2025 - 14:17:05 EST

Intel made a late change to the AVX10 specification that removes support
for a 256-bit maximum vector length and enumeration of the maximum
vector length. AVX10 will imply a maximum vector length of 512 bits.
I.e. there won't be any such thing as AVX10/256 or AVX10/512; there will
just be AVX10, and it will essentially just consolidate AVX512 features.
As a result of this new development, my strategy of providing both
*_avx10_256 and *_avx10_512 functions didn't turn out to be that useful.
The only remaining motivation for the 256-bit AVX512 / AVX10 functions
is to avoid downclocking on older Intel CPUs. But I already wrote
*_avx2 code too (primarily to support CPUs without AVX512), which
performs almost as well as *_avx10_256. So we should just use that.

Therefore, remove the *_avx10_256 functions, and rename the *_avx10_512
functions to *_avx512. Make Ice Lake and Tiger Lake use the *_avx2
functions instead of *_avx10_256, which they previously used.

Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>
---
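
(Reviewer note, not part of the commit message.)

To make the simplified dispatch easy to see at a glance, here is a minimal
standalone C sketch of the selection order that INIT_CRC_PCLMUL implements
after this patch. It is an illustration only: the have_*() predicates are
hypothetical stand-ins for the boot_cpu_has()/cpu_has_xfeatures() tests, the
crc32_lsb prefix is just one example instantiation, and the build-time
IS_ENABLED(CONFIG_AS_VPCLMULQDQ) check is omitted.

#include <stddef.h>
#include <stdint.h>

typedef uint8_t  u8;
typedef uint32_t u32;

typedef u32 (*crc_fn)(u32 crc, const u8 *p, size_t len, const void *consts);

/* The three implementations that remain after this patch. */
u32 crc32_lsb_pclmul_sse(u32 crc, const u8 *p, size_t len, const void *consts);
u32 crc32_lsb_vpclmul_avx2(u32 crc, const u8 *p, size_t len, const void *consts);
u32 crc32_lsb_vpclmul_avx512(u32 crc, const u8 *p, size_t len, const void *consts);

/* Hypothetical CPU-feature predicates; not kernel APIs. */
int have_vpclmulqdq(void), have_avx2(void), os_saves_ymm(void);
int have_avx512bw(void), have_avx512vl(void), os_saves_zmm(void);
int prefer_ymm(void);   /* X86_FEATURE_PREFER_YMM */

static crc_fn crc32_choose(void)
{
        if (have_vpclmulqdq() && have_avx2() && os_saves_ymm()) {
                /* Use 512-bit vectors only where ZMM is not penalized. */
                if (have_avx512bw() && have_avx512vl() &&
                    !prefer_ymm() && os_saves_zmm())
                        return crc32_lsb_vpclmul_avx512;
                return crc32_lsb_vpclmul_avx2;  /* e.g. Ice Lake, Tiger Lake */
        }
        return crc32_lsb_pclmul_sse;    /* baseline: PCLMULQDQ + SSE4.1 */
}
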
arch/x86/lib/crc-pclmul-template.S | 23 ++++++++---------------
arch/x86/lib/crc-pclmul-template.h | 15 +++++----------
2 files changed, 13 insertions(+), 25 deletions(-)
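
A note on the AVX_LEVEL <= 2 comparisons in the .S hunks below: vpternlogq
is an AVX512 instruction, so pre-AVX512 code must emulate the three-input
XOR with two pxor/vpxor instructions. A scalar C model of the two paths
(illustration only; these function names are made up for this sketch):

#include <stdint.h>

/* AVX_LEVEL <= 2: two XOR instructions (pxor or vpxor). */
static inline uint64_t xor3_two_insns(uint64_t acc, uint64_t data, uint64_t tmp)
{
        tmp ^= data;            /* first pxor/vpxor */
        return acc ^ tmp;       /* second pxor/vpxor */
}

/*
 * AVX512: one vpternlogq.  Immediate 0x96 is the truth table whose bit at
 * index ((a << 2) | (b << 1) | c) is a ^ b ^ c, i.e. a three-way XOR.
 */
static inline uint64_t xor3_ternlog(uint64_t a, uint64_t b, uint64_t c)
{
        return a ^ b ^ c;
}
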
diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S
index a19b730b642d3..ae0b6144c503c 100644
--- a/arch/x86/lib/crc-pclmul-template.S
+++ b/arch/x86/lib/crc-pclmul-template.S
@@ -136,11 +136,11 @@
// is a vector register containing the needed fold constants, and \tmp is a
// temporary vector register. All arguments must be the same length.
.macro _fold_vec acc, data, consts, tmp
_pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
_pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
-.if AVX_LEVEL < 10
+.if AVX_LEVEL <= 2
_cond_vex pxor, \data, \tmp, \tmp
_cond_vex pxor, \tmp, \acc, \acc
.else
vpternlogq $0x96, \data, \tmp, \acc
.endif
@@ -199,23 +199,20 @@
// bit0 to x^0, bit1 to x^1, bit7 to x^7.
//
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
//
// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
-// 10 for AVX10 or AVX512.
+// 512 for AVX512.
//
// If \vl == 16 && \avx_level == 0, the generated code requires:
// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
//
// If \vl == 32 && \avx_level == 2, the generated code requires:
// VPCLMULQDQ && AVX2.
//
-// If \vl == 32 && \avx_level == 10, the generated code requires:
-// VPCLMULQDQ && (AVX10/256 || (AVX512BW && AVX512VL))
-//
-// If \vl == 64 && \avx_level == 10, the generated code requires:
-// VPCLMULQDQ && (AVX10/512 || (AVX512BW && AVX512VL))
+// If \vl == 64 && \avx_level == 512, the generated code requires:
+// VPCLMULQDQ && AVX512BW && AVX512VL.
//
// Other \vl and \avx_level combinations are either not supported or not useful.
.macro _crc_pclmul n, lsb_crc, vl, avx_level
.set LSB_CRC, \lsb_crc
.set VL, \vl
@@ -532,11 +529,11 @@
// floor(t0 / G). The most efficient way to do that is to move it to
// the physically high qword and use a ternlog to combine the two XORs.
.if LSB_CRC && \n == 64
_cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
_pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
- .if AVX_LEVEL < 10
+ .if AVX_LEVEL <= 2
_cond_vex pxor, %xmm2, %xmm0, %xmm0
_cond_vex pxor, %xmm1, %xmm0, %xmm0
.else
vpternlogq $0x96, %xmm2, %xmm1, %xmm0
.endif
@@ -572,17 +569,13 @@ SYM_FUNC_END(prefix##_pclmul_sse); \
\
SYM_FUNC_START(prefix##_vpclmul_avx2); \
_crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
SYM_FUNC_END(prefix##_vpclmul_avx2); \
\
-SYM_FUNC_START(prefix##_vpclmul_avx10_256); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=10; \
-SYM_FUNC_END(prefix##_vpclmul_avx10_256); \
- \
-SYM_FUNC_START(prefix##_vpclmul_avx10_512); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=10; \
-SYM_FUNC_END(prefix##_vpclmul_avx10_512);
+SYM_FUNC_START(prefix##_vpclmul_avx512); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
+SYM_FUNC_END(prefix##_vpclmul_avx512);
#else
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
SYM_FUNC_START(prefix##_pclmul_sse); \
_crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
SYM_FUNC_END(prefix##_pclmul_sse);
diff --git a/arch/x86/lib/crc-pclmul-template.h b/arch/x86/lib/crc-pclmul-template.h
index 7b89f0edbc179..c5b3bfe11d8da 100644
--- a/arch/x86/lib/crc-pclmul-template.h
+++ b/arch/x86/lib/crc-pclmul-template.h
@@ -19,31 +19,26 @@
#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \
crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \
const void *consts_ptr); \
crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \
const void *consts_ptr); \
-crc_t prefix##_vpclmul_avx10_256(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-crc_t prefix##_vpclmul_avx10_512(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
+crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \
+ const void *consts_ptr); \
DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
#define INIT_CRC_PCLMUL(prefix) \
do { \
if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \
boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \
boot_cpu_has(X86_FEATURE_AVX2) && \
cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \
if (boot_cpu_has(X86_FEATURE_AVX512BW) && \
boot_cpu_has(X86_FEATURE_AVX512VL) && \
+ !boot_cpu_has(X86_FEATURE_PREFER_YMM) && \
cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \
- if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) \
- static_call_update(prefix##_pclmul, \
- prefix##_vpclmul_avx10_256); \
- else \
- static_call_update(prefix##_pclmul, \
- prefix##_vpclmul_avx10_512); \
+ static_call_update(prefix##_pclmul, \
+ prefix##_vpclmul_avx512); \
} else { \
static_call_update(prefix##_pclmul, \
prefix##_vpclmul_avx2); \
} \
} \
base-commit: 981b39dc6da6dd11ec40824a224c1e0a7557b5ca
--
2.49.0