Re: [PATCH v4 2/2] arm64: vdso: wire up getrandom() vDSO implementation

From: Ard Biesheuvel
Date: Mon Sep 02 2024 - 16:57:58 EST


Hi Adhemerval,

I have just a couple more points below, on the BE handling in the asm.

On Mon, 2 Sept 2024 at 18:19, Adhemerval Zanella
<adhemerval.zanella@xxxxxxxxxx> wrote:
>
> Hook up the generic vDSO implementation to the aarch64 vDSO data page.
> The _vdso_rng_data required data is placed within the _vdso_data vvar
> page, by using an offset larger than the vdso_data.
>
> The vDSO function requires a ChaCha20 implementation that does not write
> to the stack, and that can do an entire ChaCha20 permutation. The one
> provided uses NEON on the permute operation, with a fallback to the
> syscall for chips that do not support AdvSIMD.
>
> This also passes the vdso_test_chacha test along with
> vdso_test_getrandom. The vdso_test_getrandom bench-single result on
> Neoverse-N1 shows:
>
> vdso: 25000000 times in 0.783884250 seconds
> libc: 25000000 times in 8.780275399 seconds
> syscall: 25000000 times in 8.786581518 seconds
>
> A small fixup to arch/arm64/include/asm/mman.h was required to avoid
> pulling kernel code into the vDSO, similar to what's already done in
> arch/arm64/include/asm/rwonce.h.
>
> Signed-off-by: Adhemerval Zanella <adhemerval.zanella@xxxxxxxxxx>
> ---
> arch/arm64/Kconfig | 1 +
> arch/arm64/include/asm/mman.h | 6 +-
> arch/arm64/include/asm/vdso.h | 6 +
> arch/arm64/include/asm/vdso/getrandom.h | 50 ++++++
> arch/arm64/include/asm/vdso/vsyscall.h | 10 ++
> arch/arm64/kernel/vdso.c | 6 -
> arch/arm64/kernel/vdso/Makefile | 25 ++-
> arch/arm64/kernel/vdso/vdso | 1 +
> arch/arm64/kernel/vdso/vdso.lds.S | 4 +
> arch/arm64/kernel/vdso/vgetrandom-chacha.S | 178 +++++++++++++++++++++
> arch/arm64/kernel/vdso/vgetrandom.c | 15 ++
> tools/arch/arm64/vdso | 1 +
> tools/include/linux/compiler.h | 4 +
> tools/testing/selftests/vDSO/Makefile | 3 +-
> 14 files changed, 294 insertions(+), 16 deletions(-)
> create mode 100644 arch/arm64/include/asm/vdso/getrandom.h
> create mode 120000 arch/arm64/kernel/vdso/vdso
> create mode 100644 arch/arm64/kernel/vdso/vgetrandom-chacha.S
> create mode 100644 arch/arm64/kernel/vdso/vgetrandom.c
> create mode 120000 tools/arch/arm64/vdso
>
...
> diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..4e5f9c349522
> --- /dev/null
> +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,178 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/linkage.h>
> +#include <asm/cache.h>
> +#include <asm/assembler.h>
> +
> + .text
> +
> +#define state0 v0
> +#define state1 v1
> +#define state2 v2
> +#define state3 v3
> +#define copy0 v4
> +#define copy0_q q4
> +#define copy1 v5
> +#define copy2 v6
> +#define copy3 v7
> +#define copy3_d d7
> +#define one_d d16
> +#define one_q q16
> +#define one_v v16
> +#define tmp v17
> +#define rot8 v18
> +
> +/*
> + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
> + * number of blocks of output with nonce 0, taking an input key and an 8-byte
> + * counter. Importantly does not spill to the stack.
> + *
> + * This implementation avoids d8-d15 because they are callee-save in user
> + * space.
> + *
> + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
> + * const uint8_t *key,
> + * uint32_t *counter,
> + * size_t nblocks)
> + *
> + * x0: output bytes
> + * x1: 32-byte key input
> + * x2: 8-byte counter input/output
> + * x3: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +
> + /* copy0 = "expand 32-byte k" */
> + mov_q x8, 0x3320646e61707865
> + mov_q x9, 0x6b20657479622d32
> + mov copy0.d[0], x8
> + mov copy0.d[1], x9
> +
> + /* copy1,copy2 = key */
> + ld1 { copy1.4s, copy2.4s }, [x1]
> + /* copy3 = counter || zero nonce */
> + ldr copy3_d, [x2]
> +CPU_BE( rev64 copy3.4s, copy3.4s)
> +

This loads 2 u32s as a single u64, and then swaps them if we are running on BE.
So better to just use

ld1 {copy3.2s}, [x2]

here, and drop the CPU_BE() special case.

> + movi one_v.2s, #1
> + uzp1 one_v.4s, one_v.4s, one_v.4s
> +
> +.Lblock:
> + /* copy state to auxiliary vectors for the final add after the permute. */
> + mov state0.16b, copy0.16b
> + mov state1.16b, copy1.16b
> + mov state2.16b, copy2.16b
> + mov state3.16b, copy3.16b
> +
> + mov w4, 20
> +.Lpermute:
> + /*
> + * Permute one 64-byte block where the state matrix is stored in the four NEON
> + * registers state0-state3. It performs matrix operations on four words in parallel,
> + * but requires shuffling to rearrange the words after each round.
> + */
> +
> +.Ldoubleround:
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> + add state0.4s, state0.4s, state1.4s
> + eor state3.16b, state3.16b, state0.16b
> + rev32 state3.8h, state3.8h
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> + add state2.4s, state2.4s, state3.4s
> + eor tmp.16b, state1.16b, state2.16b
> + shl state1.4s, tmp.4s, #12
> + sri state1.4s, tmp.4s, #20
> +
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> + add state0.4s, state0.4s, state1.4s
> + eor tmp.16b, state3.16b, state0.16b
> + shl state3.4s, tmp.4s, #8
> + sri state3.4s, tmp.4s, #24
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> + add state2.4s, state2.4s, state3.4s
> + eor tmp.16b, state1.16b, state2.16b
> + shl state1.4s, tmp.4s, #7
> + sri state1.4s, tmp.4s, #25
> +
> + /* state1[0,1,2,3] = state1[1,2,3,0] */
> + ext state1.16b, state1.16b, state1.16b, #4
> + /* state2[0,1,2,3] = state2[2,3,0,1] */
> + ext state2.16b, state2.16b, state2.16b, #8
> + /* state3[0,1,2,3] = state3[3,0,1,2] */
> + ext state3.16b, state3.16b, state3.16b, #12
> +
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> + add state0.4s, state0.4s, state1.4s
> + eor state3.16b, state3.16b, state0.16b
> + rev32 state3.8h, state3.8h
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> + add state2.4s, state2.4s, state3.4s
> + eor tmp.16b, state1.16b, state2.16b
> + shl state1.4s, tmp.4s, #12
> + sri state1.4s, tmp.4s, #20
> +
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> + add state0.4s, state0.4s, state1.4s
> + eor tmp.16b, state3.16b, state0.16b
> + shl state3.4s, tmp.4s, #8
> + sri state3.4s, tmp.4s, #24
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> + add state2.4s, state2.4s, state3.4s
> + eor tmp.16b, state1.16b, state2.16b
> + shl state1.4s, tmp.4s, #7
> + sri state1.4s, tmp.4s, #25
> +
> + /* state1[0,1,2,3] = state1[3,0,1,2] */
> + ext state1.16b, state1.16b, state1.16b, #12
> + /* state2[0,1,2,3] = state2[2,3,0,1] */
> + ext state2.16b, state2.16b, state2.16b, #8
> + /* state3[0,1,2,3] = state3[1,2,3,0] */
> + ext state3.16b, state3.16b, state3.16b, #4
> +
> + subs w4, w4, #2
> + b.ne .Ldoubleround
> +
> + /* output0 = state0 + copy0 */
> + add state0.4s, state0.4s, copy0.4s
> +CPU_BE( rev32 state0.16b, state0.16b)
> + /* output1 = state1 + copy1 */
> + add state1.4s, state1.4s, copy1.4s
> +CPU_BE( rev32 state1.16b, state1.16b)
> + /* output2 = state2 + copy2 */
> + add state2.4s, state2.4s, copy2.4s
> +CPU_BE( rev32 state2.16b, state2.16b)
> + /* output3 = state3 + copy3 */
> + add state3.4s, state3.4s, copy3.4s
> +CPU_BE( rev32 state3.16b, state3.16b)
> + st1 { state0.4s - state3.4s }, [x0]
> +

If the u32s shouldn't be swabbed for BE, you should simply be able to do

st1 {state0.16b - state3.16b}, [x0]

here, and drop the CPU_BE(*).

> + /*
> + * ++copy3.counter, the 'add' clears the upper half of the SIMD register
> + * which is the expected behaviour here.
> + */
> + add copy3_d, copy3_d, one_d
> +
> + /* output += 64, --nblocks */
> + add x0, x0, 64
> + subs x3, x3, #1
> + b.ne .Lblock
> +
> + /* counter = copy3.counter */
> +CPU_BE( rev64 copy3.4s, copy3.4s)
> + str copy3_d, [x2]
> +

... and this could be

st1 {copy3.2s}, [x2]

> + /* Zero out the potentially sensitive regs, in case nothing uses these again. */
> + movi state0.16b, #0
> + movi state1.16b, #0
> + movi state2.16b, #0
> + movi state3.16b, #0
> + movi copy1.16b, #0
> + movi copy2.16b, #0
> + ret
> +SYM_FUNC_END(__arch_chacha20_blocks_nostack)
> +
> +emit_aarch64_feature_1_and