[PATCH] random: add chacha8_block and switch the rng to it

From: Aaron Toponce
Date: Mon Apr 29 2024 - 09:50:36 EST


According to Jean-Philippe Aumasson in his paper "Too Much Crypto" [1]:

> "The best result on ChaCha is a key recovery attack on the 7-round version
> with 2^237.7 time complexity using output data from 2^96 instances of ChaCha,
> that is, 2^105 bytes of data."

He then proposes that ChaCha use 8 rounds instead of 20, providing a 2.5x
speed-up. As such, this patch adds chacha8_block and chacha12_block and switches
the RNG from ChaCha20 to ChaCha8 to take advantage of that efficiency without
sacrificing security.

[1]: https://eprint.iacr.org/2019/1492

On my ThinkPad T480s with an Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz, the
speed-up is close to what would be expected (about 1.8x in throughput).

Without the patch:

$ dd if=/dev/urandom of=/dev/null bs=32M count=300
300+0 records in
300+0 records out
10066329600 bytes (10 GB, 9.4 GiB) copied, 20.4806 s, 492 MB/s

With the patch:

$ dd if=/dev/urandom of=/dev/null bs=32M count=300
300+0 records in
300+0 records out
10066329600 bytes (10 GB, 9.4 GiB) copied, 11.5321 s, 873 MB/s

Signed-off-by: Aaron Toponce <aaron.toponce@xxxxxxxxx>
---
drivers/char/random.c | 8 ++++----
include/crypto/chacha.h | 14 ++++++++++++--
lib/crypto/chacha.c | 6 +++---
3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2597cb43f438..2e14a30b795f 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -302,7 +302,7 @@ static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE],
chacha_init_consts(chacha_state);
memcpy(&chacha_state[4], key, CHACHA_KEY_SIZE);
memset(&chacha_state[12], 0, sizeof(u32) * 4);
- chacha20_block(chacha_state, first_block);
+ chacha8_block(chacha_state, first_block);

memcpy(key, first_block, CHACHA_KEY_SIZE);
memcpy(random_data, first_block + CHACHA_KEY_SIZE, random_data_len);
@@ -388,13 +388,13 @@ static void _get_random_bytes(void *buf, size_t len)

while (len) {
if (len < CHACHA_BLOCK_SIZE) {
- chacha20_block(chacha_state, tmp);
+ chacha8_block(chacha_state, tmp);
memcpy(buf, tmp, len);
memzero_explicit(tmp, sizeof(tmp));
break;
}

- chacha20_block(chacha_state, buf);
+ chacha8_block(chacha_state, buf);
if (unlikely(chacha_state[12] == 0))
++chacha_state[13];
len -= CHACHA_BLOCK_SIZE;
@@ -444,7 +444,7 @@ static ssize_t get_random_bytes_user(struct iov_iter *iter)
}

for (;;) {
- chacha20_block(chacha_state, block);
+ chacha8_block(chacha_state, block);
if (unlikely(chacha_state[12] == 0))
++chacha_state[13];

diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
index b3ea73b81944..64c45121c69a 100644
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -8,8 +8,7 @@
*
* The ChaCha paper specifies 20, 12, and 8-round variants. In general, it is
* recommended to use the 20-round variant ChaCha20. However, the other
- * variants can be needed in some performance-sensitive scenarios. The generic
- * ChaCha code currently allows only the 20 and 12-round variants.
+ * variants can be needed in some performance-sensitive scenarios.
*/

#ifndef _CRYPTO_CHACHA_H
@@ -31,11 +30,22 @@
#define XCHACHA_IV_SIZE 32

void chacha_block_generic(u32 *state, u8 *stream, int nrounds);
+
static inline void chacha20_block(u32 *state, u8 *stream)
{
chacha_block_generic(state, stream, 20);
}

+static inline void chacha12_block(u32 *state, u8 *stream)
+{
+ chacha_block_generic(state, stream, 12);
+}
+
+static inline void chacha8_block(u32 *state, u8 *stream)
+{
+ chacha_block_generic(state, stream, 8);
+}
+
void hchacha_block_arch(const u32 *state, u32 *out, int nrounds);
void hchacha_block_generic(const u32 *state, u32 *out, int nrounds);

diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c
index b748fd3d256e..15e773629f1d 100644
--- a/lib/crypto/chacha.c
+++ b/lib/crypto/chacha.c
@@ -18,7 +18,7 @@ static void chacha_permute(u32 *x, int nrounds)
int i;

/* whitelist the allowed round counts */
- WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
+ WARN_ON_ONCE(nrounds != 20 && nrounds != 12 && nrounds != 8);

for (i = 0; i < nrounds; i += 2) {
x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
@@ -67,7 +67,7 @@ static void chacha_permute(u32 *x, int nrounds)
* chacha_block_generic - generate one keystream block and increment block counter
* @state: input state matrix (16 32-bit words)
* @stream: output keystream block (64 bytes)
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ * @nrounds: number of rounds (20, 12, or 8; 20 is recommended)
*
* This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
* The caller has already converted the endianness of the input. This function
@@ -93,7 +93,7 @@ EXPORT_SYMBOL(chacha_block_generic);
* hchacha_block_generic - abbreviated ChaCha core, for XChaCha
* @state: input state matrix (16 32-bit words)
* @stream: output (8 32-bit words)
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ * @nrounds: number of rounds (20, 12, or 8; 20 is recommended)
*
* HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
* towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
--
2.43.0