[PATCH v2 09/19] crypto: x86 - use common macro for FPU limit

From: Robert Elliott
Date: Wed Oct 12 2022 - 18:00:42 EST


Use a common macro name (FPU_BYTES) for the limit of the number of bytes
processed within kernel_fpu_begin and kernel_fpu_end rather than
using SZ_4K (which is a signed value), or a magic value of 4096U.

Use unsigned int rather than size_t for some of the arguments to
avoid typecasting for the min() macro.

Signed-off-by: Robert Elliott <elliott@xxxxxxx>
---
arch/x86/crypto/blake2s-glue.c | 7 ++++---
arch/x86/crypto/chacha_glue.c | 4 +++-
arch/x86/crypto/nhpoly1305-avx2-glue.c | 4 +++-
arch/x86/crypto/nhpoly1305-sse2-glue.c | 4 +++-
arch/x86/crypto/poly1305_glue.c | 25 ++++++++++++++-----------
arch/x86/crypto/polyval-clmulni_glue.c | 5 +++--
6 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index aaba21230528..3054ee7fa219 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -16,6 +16,8 @@
#include <asm/processor.h>
#include <asm/simd.h>

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
@@ -29,8 +31,7 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
void blake2s_compress(struct blake2s_state *state, const u8 *block,
size_t nblocks, const u32 inc)
{
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
+ BUILD_BUG_ON(FPU_BYTES / BLAKE2S_BLOCK_SIZE < 8);

if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
blake2s_compress_generic(state, block, nblocks, inc);
@@ -39,7 +40,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block,

do {
const size_t blocks = min_t(size_t, nblocks,
- SZ_4K / BLAKE2S_BLOCK_SIZE);
+ FPU_BYTES / BLAKE2S_BLOCK_SIZE);

kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) &&
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index 7b3a1cf0984b..0d7e172862db 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -15,6 +15,8 @@
#include <linux/sizes.h>
#include <asm/simd.h>

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
@@ -147,7 +149,7 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
return chacha_crypt_generic(state, dst, src, bytes, nrounds);

do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+ unsigned int todo = min(bytes, FPU_BYTES);

kernel_fpu_begin();
chacha_dosimd(state, dst, src, todo, nrounds);
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
index 8ea5ab0f1ca7..59615ae95e86 100644
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -13,6 +13,8 @@
#include <linux/sizes.h>
#include <asm/simd.h>

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
u8 hash[NH_HASH_BYTES]);

@@ -30,7 +32,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);

do {
- unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+ unsigned int n = min(srclen, FPU_BYTES);

kernel_fpu_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
index 2b353d42ed13..bf91c375821a 100644
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -13,6 +13,8 @@
#include <linux/sizes.h>
#include <asm/simd.h>

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
u8 hash[NH_HASH_BYTES]);

@@ -30,7 +32,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);

do {
- unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+ unsigned int n = min(srclen, FPU_BYTES);

kernel_fpu_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 1dfb8af48a3c..3764301bdf1b 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -15,20 +15,24 @@
#include <asm/intel-family.h>
#include <asm/simd.h>

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
asmlinkage void poly1305_init_x86_64(void *ctx,
const u8 key[POLY1305_BLOCK_SIZE]);
asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
- const size_t len, const u32 padbit);
+ const unsigned int len,
+ const u32 padbit);
asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
const u32 nonce[4]);
asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
const u32 nonce[4]);
-asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp,
+ const unsigned int len, const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp,
+ const unsigned int len, const u32 padbit);
asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
- const size_t len, const u32 padbit);
+ const unsigned int len,
+ const u32 padbit);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
@@ -86,14 +90,13 @@ static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
poly1305_init_x86_64(ctx, key);
}

-static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
+static void poly1305_simd_blocks(void *ctx, const u8 *inp, unsigned int len,
const u32 padbit)
{
struct poly1305_arch_internal *state = ctx;

- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
- SZ_4K % POLY1305_BLOCK_SIZE);
+ BUILD_BUG_ON(FPU_BYTES < POLY1305_BLOCK_SIZE ||
+ FPU_BYTES % POLY1305_BLOCK_SIZE);

if (!static_branch_likely(&poly1305_use_avx) ||
(len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
@@ -104,7 +107,7 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
}

do {
- const size_t bytes = min_t(size_t, len, SZ_4K);
+ const unsigned int bytes = min(len, FPU_BYTES);

kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
index b7664d018851..2502964afef6 100644
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ b/arch/x86/crypto/polyval-clmulni_glue.c
@@ -29,6 +29,8 @@

#define NUM_KEY_POWERS 8

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
struct polyval_tfm_ctx {
/*
* These powers must be in the order h^8, ..., h^1.
@@ -123,8 +125,7 @@ static int polyval_x86_update(struct shash_desc *desc,
}

while (srclen >= POLYVAL_BLOCK_SIZE) {
- /* Allow rescheduling every 4K bytes. */
- nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
+ nblocks = min(srclen, FPU_BYTES) / POLYVAL_BLOCK_SIZE;
internal_polyval_update(tctx, src, nblocks, dctx->buffer);
srclen -= nblocks * POLYVAL_BLOCK_SIZE;
src += nblocks * POLYVAL_BLOCK_SIZE;
--
2.37.3