Re: [RFC V1 2/7] crypto: crct10dif - Accelerated CRC T10 DIF with vectorized instruction

From: Ard Biesheuvel
Date: Sat Jan 16 2021 - 12:10:28 EST


On Fri, 18 Dec 2020 at 22:07, Megha Dey <megha.dey@xxxxxxxxx> wrote:
>
> From: Kyung Min Park <kyung.min.park@xxxxxxxxx>
>
> Update the crc_pcl function that calculates the T10 Data Integrity Field
> CRC16 (CRC T10 DIF) to use the VPCLMULQDQ instruction. The VPCLMULQDQ
> instruction with AVX-512F adds an EVEX-encoded 512-bit version of the
> PCLMULQDQ instruction. The advantage comes from packing multiples of
> 4 * 128 bits of data into AVX512 registers, reducing instruction latency.
>
> The glue code in the crct10dif module overrides the existing PCLMULQDQ version
> with the VPCLMULQDQ version when the following criteria are met:
> At compile time:
> 1. CONFIG_CRYPTO_AVX512 is enabled
> 2. toolchain(assembler) supports VPCLMULQDQ instructions
> At runtime:
> 1. VPCLMULQDQ and AVX512VL features are supported on a platform (currently
> only Icelake)
> 2. If compiled as built-in module, crct10dif_pclmul.use_avx512 is set at
> boot time or /sys/module/crct10dif_pclmul/parameters/use_avx512 is set
> to 1 after boot.
> If compiled as loadable module, use_avx512 module parameter must be set:
> modprobe crct10dif_pclmul use_avx512=1
>
> A typical run of tcrypt with CRC T10 DIF calculation with PCLMULQDQ
> instruction and VPCLMULQDQ instruction shows the following results:
> For bytes per update >= 1KB, we see an average improvement of 46% (~1.4x).
> For bytes per update < 1KB, we see an average improvement of 13%.
> Test was performed on an Icelake based platform with constant frequency
> set for CPU.
>
> Detailed results for a variety of block sizes and update sizes are in
> the table below.
>
> ---------------------------------------------------------------------------
> | | | cycles/operation | |
> | | | (the lower the better) | |
> | byte | bytes |----------------------------------| percentage |
> | blocks | per update | CRC T10 DIF | CRC T10 DIF | loss/gain |
> | | | with PCLMULQDQ | with VPCLMULQDQ | |
> |------------|------------|----------------|-----------------|------------|
> | 16 | 16 | 77 | 106 | -27.0 |
> | 64 | 16 | 411 | 390 | 5.4 |
> | 64 | 64 | 71 | 85 | -16.0 |
> | 256 | 16 | 1224 | 1308 | -6.4 |
> | 256 | 64 | 393 | 407 | -3.4 |
> | 256 | 256 | 93 | 86 | 8.1 |
> | 1024 | 16 | 4564 | 5020 | -9.0 |
> | 1024 | 256 | 486 | 475 | 2.3 |
> | 1024 | 1024 | 221 | 148 | 49.3 |
> | 2048 | 16 | 8945 | 9851 | -9.1 |
> | 2048 | 256 | 982 | 951 | 3.3 |
> | 2048 | 1024 | 500 | 369 | 35.5 |
> | 2048 | 2048 | 413 | 265 | 55.8 |
> | 4096 | 16 | 17885 | 19351 | -7.5 |
> | 4096 | 256 | 1828 | 1713 | 6.7 |
> | 4096 | 1024 | 968 | 805 | 20.0 |
> | 4096 | 4096 | 739 | 475 | 55.6 |
> | 8192 | 16 | 48339 | 41556 | 16.3 |
> | 8192 | 256 | 3494 | 3342 | 4.5 |
> | 8192 | 1024 | 1959 | 1462 | 34.0 |
> | 8192 | 4096 | 1561 | 1036 | 50.7 |
> | 8192 | 8192 | 1540 | 1004 | 53.4 |
> ---------------------------------------------------------------------------
>
> This work was inspired by the CRC T10 DIF AVX512 optimization published
> in Intel Intelligent Storage Acceleration Library.
> https://github.com/intel/isa-l/blob/master/crc/crc16_t10dif_by16_10.asm
>
> Co-developed-by: Greg Tucker <greg.b.tucker@xxxxxxxxx>
> Signed-off-by: Greg Tucker <greg.b.tucker@xxxxxxxxx>
> Co-developed-by: Tomasz Kantecki <tomasz.kantecki@xxxxxxxxx>
> Signed-off-by: Tomasz Kantecki <tomasz.kantecki@xxxxxxxxx>
> Signed-off-by: Kyung Min Park <kyung.min.park@xxxxxxxxx>
> Signed-off-by: Megha Dey <megha.dey@xxxxxxxxx>
> ---
> arch/x86/crypto/Makefile | 1 +
> arch/x86/crypto/crct10dif-avx512-asm_64.S | 482 ++++++++++++++++++++++++++++++
> arch/x86/crypto/crct10dif-pclmul_glue.c | 24 +-
> arch/x86/include/asm/disabled-features.h | 8 +-
> crypto/Kconfig | 23 ++
> 5 files changed, 535 insertions(+), 3 deletions(-)
> create mode 100644 arch/x86/crypto/crct10dif-avx512-asm_64.S
>
...
> diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
> index 71291d5a..26a6350 100644
> --- a/arch/x86/crypto/crct10dif-pclmul_glue.c
> +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
> @@ -35,6 +35,16 @@
> #include <asm/simd.h>
>
> asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
> +#ifdef CONFIG_CRYPTO_CRCT10DIF_AVX512
> +asmlinkage u16 crct10dif_pcl_avx512(u16 init_crc, const u8 *buf, size_t len);
> +#else
> +static u16 crct10dif_pcl_avx512(u16 init_crc, const u8 *buf, size_t len)
> +{ return 0; }
> +#endif
> +

Please drop the alternative definition. If you code the references
correctly, the alternative is never called.

> +static bool use_avx512;
> +module_param(use_avx512, bool, 0644);
> +MODULE_PARM_DESC(use_avx512, "Use AVX512 optimized algorithm, if available");
>
> struct chksum_desc_ctx {
> __u16 crc;
> @@ -56,7 +66,12 @@ static int chksum_update(struct shash_desc *desc, const u8 *data,
>
> if (length >= 16 && crypto_simd_usable()) {
> kernel_fpu_begin();
> - ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
> + if (IS_ENABLED(CONFIG_CRYPTO_CRCT10DIF_AVX512) &&
> + cpu_feature_enabled(X86_FEATURE_VPCLMULQDQ) &&
> + use_avx512)
> + ctx->crc = crct10dif_pcl_avx512(ctx->crc, data, length);
> + else
> + ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);

Please use a static call or static key here, and initialize its value
in the init code.

> kernel_fpu_end();
> } else
> ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
> @@ -75,7 +90,12 @@ static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
> {
> if (len >= 16 && crypto_simd_usable()) {
> kernel_fpu_begin();
> - *(__u16 *)out = crc_t10dif_pcl(crc, data, len);
> + if (IS_ENABLED(CONFIG_CRYPTO_CRCT10DIF_AVX512) &&
> + cpu_feature_enabled(X86_FEATURE_VPCLMULQDQ) &&
> + use_avx512)
> + *(__u16 *)out = crct10dif_pcl_avx512(crc, data, len);
> + else
> + *(__u16 *)out = crc_t10dif_pcl(crc, data, len);

Same here.

> kernel_fpu_end();
> } else
> *(__u16 *)out = crc_t10dif_generic(crc, data, len);
> diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
> index 5861d34..1192dea 100644
> --- a/arch/x86/include/asm/disabled-features.h
> +++ b/arch/x86/include/asm/disabled-features.h
> @@ -56,6 +56,12 @@
> # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
> #endif
>
> +#if defined(CONFIG_AS_VPCLMULQDQ)
> +# define DISABLE_VPCLMULQDQ 0
> +#else
> +# define DISABLE_VPCLMULQDQ (1 << (X86_FEATURE_VPCLMULQDQ & 31))
> +#endif
> +
> #ifdef CONFIG_IOMMU_SUPPORT
> # define DISABLE_ENQCMD 0
> #else
> @@ -82,7 +88,7 @@
> #define DISABLED_MASK14 0
> #define DISABLED_MASK15 0
> #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
> - DISABLE_ENQCMD)
> + DISABLE_ENQCMD|DISABLE_VPCLMULQDQ)
> #define DISABLED_MASK17 0
> #define DISABLED_MASK18 0
> #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
> diff --git a/crypto/Kconfig b/crypto/Kconfig
> index a367fcf..b090f14 100644
> --- a/crypto/Kconfig
> +++ b/crypto/Kconfig
> @@ -613,6 +613,29 @@ config CRYPTO_CRC32C_VPMSUM
> (vpmsum) instructions, introduced in POWER8. Enable on POWER8
> and newer processors for improved performance.
>
> +config CRYPTO_AVX512
> + bool "AVX512 hardware acceleration for crypto algorithms"
> + depends on X86
> + depends on 64BIT
> + help
> + This option will compile in AVX512 hardware accelerated crypto
> + algorithms. These optimized algorithms provide substantial (2-10x)
> + improvements over existing crypto algorithms for large data sizes.
> + However, it may also incur a frequency penalty (aka. "bin drops")
> + and cause collateral damage to other workloads running on the
> + same core.
> +
> +# We default CRYPTO_CRCT10DIF_AVX512 to Y but depend on CRYPTO_AVX512 in
> +# order to have a singular option (CRYPTO_AVX512) select multiple algorithms
> +# when supported. Specifically, if the platform and/or toolset does not
> +# support VPCLMULQDQ, then this algorithm should not be supported as part of
> +# the set that CRYPTO_AVX512 selects.
> +config CRYPTO_CRCT10DIF_AVX512
> + bool
> + default y
> + depends on CRYPTO_AVX512
> + depends on CRYPTO_CRCT10DIF_PCLMUL
> + depends on AS_VPCLMULQDQ
>
> config CRYPTO_CRC32C_SPARC64
> tristate "CRC32c CRC algorithm (SPARC64)"
> --
> 2.7.4
>