[PATCH 3/3] crypto: x86/sm4 - add AES-NI/AVX/x86_64 assembler implementation

From: Tianjia Zhang
Date: Thu Jun 10 2021 - 09:45:09 EST


This patch adds AES-NI/AVX/x86_64 assembler implementation of SM4
block cipher. Through two affine transforms, we can use the AES
S-Box to simulate the SM4 S-Box to achieve the effect of instruction
acceleration.

Signed-off-by: Tianjia Zhang <tianjia.zhang@xxxxxxxxxxxxxxxxx>
---
arch/x86/crypto/Makefile | 3 +
arch/x86/crypto/sm4-aesni-avx-asm_64.S | 339 +++++++++++++++++++++++++
arch/x86/crypto/sm4_aesni_avx_glue.c | 115 +++++++++
crypto/Kconfig | 29 +++
4 files changed, 486 insertions(+)
create mode 100644 arch/x86/crypto/sm4-aesni-avx-asm_64.S
create mode 100644 arch/x86/crypto/sm4_aesni_avx_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index d0959e7b809f..08f95d4e1e7c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -88,6 +88,9 @@ nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o

obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o

+obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o
+sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o
+
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
$(obj)/%.S: $(src)/%.pl FORCE
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..c7cbb7e90bec
--- /dev/null
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-04.html
+ *
+ * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@xxxxxx>
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@xxxxxx>
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@xxxxxxxxxxxxxxxxx>
+ */
+
+/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
+ * https://github.com/mjosaarinen/sm4ni
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define rRIP (%rip)
+
+#define RX0 %xmm0
+#define RX1 %xmm1
+#define MASK_4BIT %xmm2
+#define RTMP0 %xmm3
+#define RTMP1 %xmm4
+#define RTMP2 %xmm5
+#define RTMP3 %xmm6
+#define RTMP4 %xmm7
+
+#define RA0 %xmm8
+#define RA1 %xmm9
+#define RA2 %xmm10
+#define RA3 %xmm11
+
+#define RB0 %xmm12
+#define RB1 %xmm13
+#define RB2 %xmm14
+#define RB3 %xmm15
+
+#define RNOT %xmm0
+#define RBSWAP %xmm1
+
+
+/* Transpose four 32-bit words between 128-bit vectors. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* pre-SubByte transform. */
+#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
+ * 'vaeslastenc' instruction.
+ */
+#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandn mask4bit, x, tmp0; \
+ vpsrld $4, x, x; \
+ vpand x, mask4bit, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/*
+ * Following four affine transform look-up tables are from work by
+ * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
+ *
+ * These allow exposing SM4 S-Box from AES SubByte.
+ */
+
+/* pre-SubByte affine transform, from SM4 field to AES field. */
+.Lpre_tf_lo_s:
+ .quad 0x9197E2E474720701, 0xC7C1B4B222245157
+.Lpre_tf_hi_s:
+ .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
+
+/* post-SubByte affine transform, from AES field to SM4 field. */
+.Lpost_tf_lo_s:
+ .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
+.Lpost_tf_hi_s:
+ .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_8:
+ .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
+ .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
+
+/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_16:
+ .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
+ .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
+
+/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_24:
+ .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
+ .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+
+.text
+.align 16
+
+/*
+ * void sm4_aesni_avx_expand_key(const u8 *key, u32 *rk_enc,
+ * u32 *rk_dec, const u32 *fk, const u32 *ck);
+ */
+SYM_FUNC_START(sm4_aesni_avx_expand_key)
+ /* input:
+ * %rdi: 128-bit key
+ * %rsi: rkey_enc
+ * %rdx: rkey_dec
+ * %rcx: fk array
+ * %r8: ck array
+ */
+ FRAME_BEGIN
+
+ vmovd 0*4(%rdi), RA0;
+ vmovd 1*4(%rdi), RA1;
+ vmovd 2*4(%rdi), RA2;
+ vmovd 3*4(%rdi), RA3;
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vmovd 0*4(%rcx), RB0;
+ vmovd 1*4(%rcx), RB1;
+ vmovd 2*4(%rcx), RB2;
+ vmovd 3*4(%rcx), RB3;
+ vpxor RB0, RA0, RA0;
+ vpxor RB1, RA1, RA1;
+ vpxor RB2, RA2, RA2;
+ vpxor RB3, RA3, RA3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
+ vmovdqa .Lpre_tf_hi_s rRIP, RB0;
+ vmovdqa .Lpost_tf_lo_s rRIP, RB1;
+ vmovdqa .Lpost_tf_hi_s rRIP, RB2;
+ vmovdqa .Linv_shift_row rRIP, RB3;
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vbroadcastss (4*(round))(%r8), RX0; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RB3, RX0, RX0; \
+ vpxor RX0, s0, s0; /* s0 ^ x */ \
+ vpslld $13, RX0, RTMP0; \
+ vpsrld $19, RX0, RTMP1; \
+ vpslld $23, RX0, RTMP2; \
+ vpsrld $9, RX0, RTMP3; \
+ vpxor RTMP0, RTMP1, RTMP1; \
+ vpxor RTMP2, RTMP3, RTMP3; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,13) */ \
+ vpxor RTMP3, s0, s0; /* s0 ^ x ^ rol(x,13) ^ rol(x,23) */
+
+ leaq (32*4)(%r8), %rax;
+ leaq (32*4)(%rdx), %rdx;
+.align 16
+.Lroundloop_expand_key:
+ leaq (-4*4)(%rdx), %rdx;
+ ROUND(0, RA0, RA1, RA2, RA3);
+ ROUND(1, RA1, RA2, RA3, RA0);
+ ROUND(2, RA2, RA3, RA0, RA1);
+ ROUND(3, RA3, RA0, RA1, RA2);
+ leaq (4*4)(%r8), %r8;
+ vmovd RA0, (0*4)(%rsi);
+ vmovd RA1, (1*4)(%rsi);
+ vmovd RA2, (2*4)(%rsi);
+ vmovd RA3, (3*4)(%rsi);
+ vmovd RA0, (3*4)(%rdx);
+ vmovd RA1, (2*4)(%rdx);
+ vmovd RA2, (1*4)(%rdx);
+ vmovd RA3, (0*4)(%rdx);
+ leaq (4*4)(%rsi), %rsi;
+ cmpq %rax, %r8;
+ jne .Lroundloop_expand_key;
+
+#undef ROUND
+
+ vzeroall;
+ FRAME_END
+ ret;
+SYM_FUNC_END(sm4_aesni_avx_expand_key)
+
+
+/*
+ * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
+ * const u8 *src, int nblocks)
+ */
+SYM_FUNC_START(sm4_aesni_avx_crypt4)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..4 blocks)
+ * %rdx: src (1..4 blocks)
+ * %rcx: num blocks (1..4)
+ */
+ FRAME_BEGIN
+
+ vmovdqu 0*16(%rdx), RA0;
+ vmovdqa RA0, RA1;
+ vmovdqa RA0, RA2;
+ vmovdqa RA0, RA3;
+ cmpq $2, %rcx;
+ jb .Lblk4_load_input_done;
+ vmovdqu 1*16(%rdx), RA1;
+ je .Lblk4_load_input_done;
+ vmovdqu 2*16(%rdx), RA2;
+ cmpq $3, %rcx;
+ je .Lblk4_load_input_done;
+ vmovdqu 3*16(%rdx), RA3;
+
+.Lblk4_load_input_done:
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
+ vmovdqa .Lpre_tf_hi_s rRIP, RB0;
+ vmovdqa .Lpost_tf_lo_s rRIP, RB1;
+ vmovdqa .Lpost_tf_hi_s rRIP, RB2;
+ vmovdqa .Linv_shift_row rRIP, RB3;
+ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
+ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vbroadcastss (4*(round))(%rdi), RX0; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RB3, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP2, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP3, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP1, s0, s0;
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk4:
+ ROUND(0, RA0, RA1, RA2, RA3);
+ ROUND(1, RA1, RA2, RA3, RA0);
+ ROUND(2, RA2, RA3, RA0, RA1);
+ ROUND(3, RA3, RA0, RA1, RA2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk4;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vmovdqu RA0, 0*16(%rsi);
+ cmpq $2, %rcx;
+ jb .Lblk4_store_output_done;
+ vmovdqu RA1, 1*16(%rsi);
+ je .Lblk4_store_output_done;
+ vmovdqu RA2, 2*16(%rsi);
+ cmpq $3, %rcx;
+ je .Lblk4_store_output_done;
+ vmovdqu RA3, 3*16(%rsi);
+
+.Lblk4_store_output_done:
+ vzeroall;
+ FRAME_END
+ ret;
+SYM_FUNC_END(sm4_aesni_avx_crypt4)
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
new file mode 100644
index 000000000000..3f822fa1070a
--- /dev/null
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-04.html
+ *
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@xxxxxxxxxxxxxxxxx>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sm4.h>
+
+asmlinkage void sm4_aesni_avx_expand_key(const u8 *key, u32 *rk_enc,
+ u32 *rk_dec, const u32 *fk, const u32 *ck);
+asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
+ const u8 *src, int nblocks);
+
+static int sm4_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ if (crypto_simd_usable()) {
+ kernel_fpu_begin();
+ sm4_aesni_avx_expand_key(in_key, ctx->rkey_enc,
+ ctx->rkey_dec, crypto_sm4_fk, crypto_sm4_ck);
+ kernel_fpu_end();
+ } else
+ crypto_sm4_expand_key(ctx, in_key, key_len);
+
+ return 0;
+}
+
+static void sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (crypto_simd_usable()) {
+ kernel_fpu_begin();
+ sm4_aesni_avx_crypt4(ctx->rkey_enc, out, in, 1);
+ kernel_fpu_end();
+ } else
+ crypto_sm4_do_crypt(ctx->rkey_enc, out, in);
+}
+
+static void sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (crypto_simd_usable()) {
+ kernel_fpu_begin();
+ sm4_aesni_avx_crypt4(ctx->rkey_dec, out, in, 1);
+ kernel_fpu_end();
+ } else
+ crypto_sm4_do_crypt(ctx->rkey_dec, out, in);
+}
+
+static struct crypto_alg sm4_asm_alg = {
+ .cra_name = "sm4",
+ .cra_driver_name = "sm4-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_sm4_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = SM4_KEY_SIZE,
+ .cia_max_keysize = SM4_KEY_SIZE,
+ .cia_setkey = sm4_setkey,
+ .cia_encrypt = sm4_encrypt,
+ .cia_decrypt = sm4_decrypt
+ }
+ }
+};
+
+static int __init sm4_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_alg(&sm4_asm_alg);
+}
+
+static void __exit sm4_exit(void)
+{
+ crypto_unregister_alg(&sm4_asm_alg);
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@xxxxxxxxxxxxxxxxx>");
+MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-asm");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 4fbc9c080ca9..9f639395c667 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1570,6 +1570,35 @@ config CRYPTO_SM4

If unsure, say N.

+config CRYPTO_SM4_AESNI_AVX_X86_64
+ tristate "SM4 cipher algorithm (x86_64/AES-NI/AVX)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ select CRYPTO_ALGAPI
+ select CRYPTO_LIB_SM4
+ help
+ SM4 cipher algorithms (OSCCA GB/T 32907-2016) (x86_64/AES-NI/AVX).
+
+ SM4 (GBT.32907-2016) is a cryptographic standard issued by the
+ Organization of State Commercial Administration of China (OSCCA)
+ as an authorized cryptographic algorithms for the use within China.
+
+ SMS4 was originally created for use in protecting wireless
+ networks, and is mandated in the Chinese National Standard for
+ Wireless LAN WAPI (Wired Authentication and Privacy Infrastructure)
+ (GB.15629.11-2003).
+
+ The latest SM4 standard (GBT.32907-2016) was proposed by OSCCA and
+ standardized through TC 260 of the Standardization Administration
+ of the People's Republic of China (SAC).
+
+ The input, output, and key of SMS4 are each 128 bits.
+
+ See also: <https://eprint.iacr.org/2008/329.pdf>
+
+ If unsure, say N.
+
config CRYPTO_TEA
tristate "TEA, XTEA and XETA cipher algorithms"
depends on CRYPTO_USER_API_ENABLE_OBSOLETE
--
2.19.1.3.ge56e4f7