[PATCH v2 05/19] crypto: x86/crc - limit FPU preemption

From: Robert Elliott
Date: Wed Oct 12 2022 - 18:00:48 EST


As done by the ECB and CBC helpers in arch/x86/crypt/ecb_cbc_helpers.h,
limit the number of bytes processed between kernel_fpu_begin() and
kernel_fpu_end() calls.

Those functions call preempt_disable() and preempt_enable(), so
the CPU core is unavailable for scheduling while running, leading to:
rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: ...

Fixes: 78c37d191dd6 ("crypto: crc32 - add crc32 pclmulqdq implementation and wrappers for table implementation")
Fixes: 6a8ce1ef3940 ("crypto: crc32c - Optimize CRC32C calculation with PCLMULQDQ instruction")
Fixes: 0b95a7f85718 ("crypto: crct10dif - Glue code to cast accelerated CRCT10DIF assembly as a crypto transform")
Suggested-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
Signed-off-by: Robert Elliott <elliott@xxxxxxx>
---
arch/x86/crypto/crc32-pclmul_asm.S | 6 ++--
arch/x86/crypto/crc32-pclmul_glue.c | 19 ++++++++----
arch/x86/crypto/crc32c-intel_glue.c | 29 ++++++++++++++----
arch/x86/crypto/crct10dif-pclmul_glue.c | 39 ++++++++++++++++++++-----
4 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index ca53e96996ac..9abd861636c3 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -72,15 +72,15 @@
.text
/**
* Calculate crc32
- * BUF - buffer (16 bytes aligned)
- * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63
+ * BUF - buffer - must be 16 bytes aligned
+ * LEN - sizeof buffer - must be multiple of 16 bytes and greater than 63
* CRC - initial crc32
* return %eax crc32
* uint crc32_pclmul_le_16(unsigned char const *buffer,
* size_t len, uint crc32)
*/

-SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
+SYM_FUNC_START(crc32_pclmul_le_16)
movdqa (BUF), %xmm1
movdqa 0x10(BUF), %xmm2
movdqa 0x20(BUF), %xmm3
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 98cf3b4e4c9f..38539c6edfe5 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -46,6 +46,8 @@
#define SCALE_F 16L /* size of xmm register */
#define SCALE_F_MASK (SCALE_F - 1)

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);

static u32 __attribute__((pure))
@@ -70,12 +72,19 @@ static u32 __attribute__((pure))
iquotient = len & (~SCALE_F_MASK);
iremainder = len & SCALE_F_MASK;

- kernel_fpu_begin();
- crc = crc32_pclmul_le_16(p, iquotient, crc);
- kernel_fpu_end();
+ do {
+ unsigned int chunk = min(iquotient, FPU_BYTES);
+
+ kernel_fpu_begin();
+ crc = crc32_pclmul_le_16(p, chunk, crc);
+ kernel_fpu_end();
+
+ iquotient -= chunk;
+ p += chunk;
+ } while (iquotient >= PCLMUL_MIN_LEN);

- if (iremainder)
- crc = crc32_le(crc, p + iquotient, iremainder);
+ if (iquotient || iremainder)
+ crc = crc32_le(crc, p, iquotient + iremainder);

return crc;
}
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index feccb5254c7e..ece620227057 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -41,6 +41,8 @@
*/
#define CRC32C_PCL_BREAKEVEN 512

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
unsigned int crc_init);
#endif /* CONFIG_X86_64 */
@@ -158,9 +160,16 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
- kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
- kernel_fpu_end();
+ do {
+ unsigned int chunk = min(len, FPU_BYTES);
+
+ kernel_fpu_begin();
+ *crcp = crc_pcl(data, chunk, *crcp);
+ kernel_fpu_end();
+
+ len -= chunk;
+ data += chunk;
+ } while (len);
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
@@ -170,9 +179,17 @@ static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
- kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
- kernel_fpu_end();
+ do {
+ unsigned int chunk = min(len, FPU_BYTES);
+
+ kernel_fpu_begin();
+ *crcp = crc_pcl(data, chunk, *crcp);
+ kernel_fpu_end();
+
+ len -= chunk;
+ data += chunk;
+ } while (len);
+ *(__le32 *)out = ~cpu_to_le32(*crcp);
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index 71291d5af9f4..54a537fc88ee 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -34,6 +34,10 @@
#include <asm/cpu_device_id.h>
#include <asm/simd.h>

+#define PCLMUL_MIN_LEN 16U /* minimum size of buffer for crc_t10dif_pcl */
+
+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);

struct chksum_desc_ctx {
@@ -54,10 +58,19 @@ static int chksum_update(struct shash_desc *desc, const u8 *data,
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

- if (length >= 16 && crypto_simd_usable()) {
- kernel_fpu_begin();
- ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
- kernel_fpu_end();
+ if (length >= PCLMUL_MIN_LEN && crypto_simd_usable()) {
+ do {
+ unsigned int chunk = min(length, FPU_BYTES);
+
+ kernel_fpu_begin();
+ ctx->crc = crc_t10dif_pcl(ctx->crc, data, chunk);
+ kernel_fpu_end();
+
+ length -= chunk;
+ data += chunk;
+ } while (length >= PCLMUL_MIN_LEN);
+ if (length)
+ ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
} else
ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
return 0;
@@ -73,10 +86,20 @@ static int chksum_final(struct shash_desc *desc, u8 *out)

static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
{
- if (len >= 16 && crypto_simd_usable()) {
- kernel_fpu_begin();
- *(__u16 *)out = crc_t10dif_pcl(crc, data, len);
- kernel_fpu_end();
+ if (len >= PCLMUL_MIN_LEN && crypto_simd_usable()) {
+ do {
+ unsigned int chunk = min(len, FPU_BYTES);
+
+ kernel_fpu_begin();
+ crc = crc_t10dif_pcl(crc, data, chunk);
+ kernel_fpu_end();
+
+ len -= chunk;
+ data += chunk;
+ } while (len >= PCLMUL_MIN_LEN);
+ if (len)
+ crc = crc_t10dif_generic(crc, data, len);
+ *(__u16 *)out = crc;
} else
*(__u16 *)out = crc_t10dif_generic(crc, data, len);
return 0;
--
2.37.3