Re: [PATCH v3] LoongArch: add checksum optimization for 64-bit system

From: Huacai Chen
Date: Wed Mar 01 2023 - 07:54:26 EST


Hi, Bibo,

I found that the version here [1] provides more functions than this one.
So is it possible to take advantage of both versions?

[1] https://github.com/loongson/linux/commit/92a6df48ccb73dd2c3dc1799add08adf0e0b0deb

Huacai

On Thu, Feb 16, 2023 at 9:09 PM Bibo Mao <maobibo@xxxxxxxxxxx> wrote:
>
> The LoongArch platform is a 64-bit system that supports 8-byte memory
> accesses, while the generic checksum functions only use 4-byte memory
> accesses. This patch adds an 8-byte memory access optimization for the
> checksum functions on LoongArch; the code is derived from the arm64
> implementation.
>
> When network hardware checksumming is disabled, iperf performance
> improves by about 10% with this patch.
>
> Signed-off-by: Bibo Mao <maobibo@xxxxxxxxxxx>
> ---
> Changelog:
> v3: modify function accumulate() so that the compiler generates better
> code on the LoongArch platform; this looks like a compiler optimization
> issue.
> v2: use the rotation API in csum_fold() to save one instruction.
> ---
> arch/loongarch/include/asm/checksum.h | 65 ++++++++++++
> arch/loongarch/lib/Makefile | 2 +-
> arch/loongarch/lib/csum.c | 141 ++++++++++++++++++++++++++
> 3 files changed, 207 insertions(+), 1 deletion(-)
> create mode 100644 arch/loongarch/include/asm/checksum.h
> create mode 100644 arch/loongarch/lib/csum.c
>
> diff --git a/arch/loongarch/include/asm/checksum.h b/arch/loongarch/include/asm/checksum.h
> new file mode 100644
> index 000000000000..8a7d368d801d
> --- /dev/null
> +++ b/arch/loongarch/include/asm/checksum.h
> @@ -0,0 +1,65 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2016 ARM Ltd.
> + * Copyright (C) 2023 Loongson Technology Corporation Limited
> + */
> +#ifndef __ASM_CHECKSUM_H
> +#define __ASM_CHECKSUM_H
> +
> +#include <linux/in6.h>
> +
> +#define _HAVE_ARCH_IPV6_CSUM
> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
> +			const struct in6_addr *daddr,
> +			__u32 len, __u8 proto, __wsum sum);
> +
> +/*
> + * turns a 32-bit partial checksum (e.g. from csum_partial) into a
> + * 1's complement 16-bit checksum.
> + */
> +static inline __sum16 csum_fold(__wsum sum)
> +{
> +	u32 tmp = (__force u32)sum;
> +
> +	/*
> +	 * swap the two 16-bit halves of sum
> +	 * if there is a carry from adding the two 16-bit halves,
> +	 * it will carry from the lower half into the upper half,
> +	 * giving us the correct sum in the upper half.
> +	 */
> +	return (__force __sum16)(~(tmp + rol32(tmp, 16)) >> 16);
> +}
> +#define csum_fold csum_fold
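
For what it's worth, the rol32() form above can be checked against the textbook two-step fold with a few lines of throwaway userspace C (fold_rol/fold_naive are made-up names, not kernel code):

#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

/* The patch's fold: swap the halves, add, invert, keep the upper half. */
static uint16_t fold_rol(uint32_t sum)
{
	return (uint16_t)(~(sum + rol32(sum, 16)) >> 16);
}

/* Textbook fold: add the halves, fold the carry, complement. */
static uint16_t fold_naive(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);	/* at most one extra carry */
	return (uint16_t)~sum;
}

int main(void)
{
	const uint32_t tests[] = { 0, 1, 0xffff, 0x10000, 0x1234abcd, 0xffffffff };

	for (unsigned int i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
		printf("%08x -> %04x %04x\n", tests[i],
		       fold_rol(tests[i]), fold_naive(tests[i]));
	return 0;
}
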
> +
> +/*
> + * This is a version of ip_compute_csum() optimized for IP headers,
> + * which always checksum on 4 octet boundaries. ihl is the number
> + * of 32-bit words and is always >= 5.
> + */
> +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> +{
> +	__uint128_t tmp;
> +	u64 sum;
> +	int n = ihl; /* we want it signed */
> +
> +	tmp = *(const __uint128_t *)iph;
> +	iph += 16;
> +	n -= 4;
> +	tmp += ((tmp >> 64) | (tmp << 64));
> +	sum = tmp >> 64;
> +	do {
> +		sum += *(const u32 *)iph;
> +		iph += 4;
> +	} while (--n > 0);
> +
> +	sum += ror64(sum, 32);
> +	return csum_fold((__force __wsum)(sum >> 32));
> +}
> +#define ip_fast_csum ip_fast_csum
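
A plain 16-bit reference is handy for sanity-checking the 128-bit version above from userspace. Nothing below is taken from the patch; the header bytes are just a textbook example of a valid IPv4 header and ip_csum_ref is a made-up name:

#include <stdint.h>
#include <stdio.h>

/* Reference IPv4 header checksum: sum 16-bit big-endian words, fold, invert.
 * Returns 0 when the checksum field already stored in the header is valid. */
static uint16_t ip_csum_ref(const uint8_t *hdr, unsigned int ihl)
{
	uint32_t sum = 0;

	for (unsigned int i = 0; i < ihl * 4; i += 2)
		sum += ((uint32_t)hdr[i] << 8) | hdr[i + 1];
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* A 20-byte IPv4 header whose checksum field (0xb1e6) is correct. */
	const uint8_t hdr[20] = {
		0x45, 0x00, 0x00, 0x3c, 0x1c, 0x46, 0x40, 0x00,
		0x40, 0x06, 0xb1, 0xe6, 0xac, 0x10, 0x0a, 0x63,
		0xac, 0x10, 0x0a, 0x0c
	};

	printf("%04x (0 means the header checksum is valid)\n",
	       ip_csum_ref(hdr, 5));
	return 0;
}
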
> +
> +extern unsigned int do_csum(const unsigned char *buff, int len);
> +#define do_csum do_csum
> +
> +#include <asm-generic/checksum.h>
> +
> +#endif /* __ASM_CHECKSUM_H */
> diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile
> index 40bde632900f..6ba6df411f90 100644
> --- a/arch/loongarch/lib/Makefile
> +++ b/arch/loongarch/lib/Makefile
> @@ -4,4 +4,4 @@
> #
>
> lib-y += delay.o memset.o memcpy.o memmove.o \
> - clear_user.o copy_user.o dump_tlb.o unaligned.o
> + clear_user.o copy_user.o dump_tlb.o unaligned.o csum.o
> diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c
> new file mode 100644
> index 000000000000..a5e84b403c3b
> --- /dev/null
> +++ b/arch/loongarch/lib/csum.c
> @@ -0,0 +1,141 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +// Copyright (C) 2019-2020 Arm Ltd.
> +
> +#include <linux/compiler.h>
> +#include <linux/kasan-checks.h>
> +#include <linux/kernel.h>
> +
> +#include <net/checksum.h>
> +
> +static u64 accumulate(u64 sum, u64 data)
> +{
> +	sum += data;
> +	if (sum < data)
> +		sum += 1;
> +	return sum;
> +}
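
accumulate() above is just a 64-bit one's-complement add: the "sum < data" comparison detects the unsigned wrap-around and feeds the lost carry back into bit 0. A disposable userspace check of that equivalence, comparing against a 128-bit add with the carry folded back (accumulate_wide is a made-up name), might look like this:

#include <stdint.h>
#include <stdio.h>

/* Same shape as the patch's accumulate(). */
static uint64_t accumulate(uint64_t sum, uint64_t data)
{
	sum += data;
	if (sum < data)		/* unsigned wrap => a carry was dropped */
		sum += 1;	/* end-around carry */
	return sum;
}

/* Reference: do the add in 128 bits, then fold the carry bit back in. */
static uint64_t accumulate_wide(uint64_t sum, uint64_t data)
{
	__uint128_t t = (__uint128_t)sum + data;

	return (uint64_t)t + (uint64_t)(t >> 64);
}

int main(void)
{
	const uint64_t a = 0xfffffffffffffff0ULL, b = 0x25ULL;

	printf("%#llx %#llx\n",
	       (unsigned long long)accumulate(a, b),
	       (unsigned long long)accumulate_wide(a, b));
	return 0;
}
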
> +
> +/*
> + * We over-read the buffer and this makes KASAN unhappy. Instead, disable
> + * instrumentation and call kasan explicitly.
> + */
> +unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len)
> +{
> +	unsigned int offset, shift, sum;
> +	const u64 *ptr;
> +	u64 data, sum64 = 0;
> +
> +	if (unlikely(len == 0))
> +		return 0;
> +
> +	offset = (unsigned long)buff & 7;
> +	/*
> +	 * This is to all intents and purposes safe, since rounding down cannot
> +	 * result in a different page or cache line being accessed, and @buff
> +	 * should absolutely not be pointing to anything read-sensitive. We do,
> +	 * however, have to be careful not to piss off KASAN, which means using
> +	 * unchecked reads to accommodate the head and tail, for which we'll
> +	 * compensate with an explicit check up-front.
> +	 */
> +	kasan_check_read(buff, len);
> +	ptr = (u64 *)(buff - offset);
> +	len = len + offset - 8;
> +
> +	/*
> +	 * Head: zero out any excess leading bytes. Shifting back by the same
> +	 * amount should be at least as fast as any other way of handling the
> +	 * odd/even alignment, and means we can ignore it until the very end.
> +	 */
> +	shift = offset * 8;
> +	data = *ptr++;
> +	data = (data >> shift) << shift;
> +
> +	/*
> +	 * Body: straightforward aligned loads from here on (the paired loads
> +	 * underlying the quadword type still only need dword alignment). The
> +	 * main loop strictly excludes the tail, so the second loop will always
> +	 * run at least once.
> +	 */
> +	while (unlikely(len > 64)) {
> +		__uint128_t tmp1, tmp2, tmp3, tmp4;
> +
> +		tmp1 = *(__uint128_t *)ptr;
> +		tmp2 = *(__uint128_t *)(ptr + 2);
> +		tmp3 = *(__uint128_t *)(ptr + 4);
> +		tmp4 = *(__uint128_t *)(ptr + 6);
> +
> +		len -= 64;
> +		ptr += 8;
> +
> +		/* This is the "don't dump the carry flag into a GPR" idiom */
> +		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> +		tmp2 += (tmp2 >> 64) | (tmp2 << 64);
> +		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
> +		tmp4 += (tmp4 >> 64) | (tmp4 << 64);
> +		tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
> +		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> +		tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
> +		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
> +		tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
> +		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> +		tmp1 = ((tmp1 >> 64) << 64) | sum64;
> +		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> +		sum64 = tmp1 >> 64;
> +	}
> +	while (len > 8) {
> +		__uint128_t tmp;
> +
> +		sum64 = accumulate(sum64, data);
> +		tmp = *(__uint128_t *)ptr;
> +
> +		len -= 16;
> +		ptr += 2;
> +
> +		data = tmp >> 64;
> +		sum64 = accumulate(sum64, tmp);
> +	}
> +	if (len > 0) {
> +		sum64 = accumulate(sum64, data);
> +		data = *ptr;
> +		len -= 8;
> +	}
> +	/*
> +	 * Tail: zero any over-read bytes similarly to the head, again
> +	 * preserving odd/even alignment.
> +	 */
> +	shift = len * -8;
> +	data = (data << shift) >> shift;
> +	sum64 = accumulate(sum64, data);
> +
> +	/* Finally, folding */
> +	sum64 += (sum64 >> 32) | (sum64 << 32);
> +	sum = sum64 >> 32;
> +	sum += (sum >> 16) | (sum << 16);
> +	if (offset & 1)
> +		return (u16)swab32(sum);
> +
> +	return sum >> 16;
> +}
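
The "don't dump the carry flag into a GPR" comment in the main loop above is terse; the trick is that adding a __uint128_t to a copy of itself with the halves swapped leaves the one's-complement sum of the two 64-bit halves in the upper half, so the carries stay inside the wide arithmetic instead of being materialised in a separate register. A rough standalone illustration (fold_halves is a made-up name, values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Add the swapped halves; the upper 64 bits then hold the folded sum. */
static uint64_t fold_halves(__uint128_t x)
{
	x += (x >> 64) | (x << 64);
	return (uint64_t)(x >> 64);
}

int main(void)
{
	const uint64_t hi = 0xfedcba9876543210ULL, lo = 0x0f1e2d3c4b5a6978ULL;
	__uint128_t x = ((__uint128_t)hi << 64) | lo;

	/* One's-complement add of the two halves, done the scalar way. */
	uint64_t ref = hi + lo;
	if (ref < lo)
		ref++;		/* end-around carry */

	printf("%#llx %#llx\n",
	       (unsigned long long)fold_halves(x), (unsigned long long)ref);
	return 0;
}
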
> +
> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
> +			const struct in6_addr *daddr,
> +			__u32 len, __u8 proto, __wsum csum)
> +{
> +	__uint128_t src, dst;
> +	u64 sum = (__force u64)csum;
> +
> +	src = *(const __uint128_t *)saddr->s6_addr;
> +	dst = *(const __uint128_t *)daddr->s6_addr;
> +
> +	sum += (__force u32)htonl(len);
> +	sum += (u32)proto << 24;
> +	src += (src >> 64) | (src << 64);
> +	dst += (dst >> 64) | (dst << 64);
> +
> +	sum = accumulate(sum, src >> 64);
> +	sum = accumulate(sum, dst >> 64);
> +
> +	sum += ((sum >> 32) | (sum << 32));
> +	return csum_fold((__force __wsum)(sum >> 32));
> +}
> +EXPORT_SYMBOL(csum_ipv6_magic);
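
Again purely for cross-checking from userspace, the IPv6 pseudo-header sum that csum_ipv6_magic() computes can be written with plain 16-bit arithmetic over the raw address bytes. Everything below (add16, pseudo_hdr_csum, the addresses and lengths) is made up for the example, the partial checksum is taken as a plain 32-bit value, and byte-order details of __wsum are glossed over:

#include <stdint.h>
#include <stdio.h>

/* Sum n bytes as big-endian 16-bit words into a running wide total. */
static uint64_t add16(uint64_t sum, const uint8_t *p, unsigned int n)
{
	for (unsigned int i = 0; i < n; i += 2)
		sum += ((uint32_t)p[i] << 8) | p[i + 1];
	return sum;
}

/* RFC 2460 pseudo-header: src, dst, upper-layer length, next header. */
static uint16_t pseudo_hdr_csum(const uint8_t saddr[16], const uint8_t daddr[16],
				uint32_t len, uint8_t proto, uint32_t partial)
{
	uint64_t sum = partial;

	sum = add16(sum, saddr, 16);
	sum = add16(sum, daddr, 16);
	sum += len >> 16;
	sum += len & 0xffff;
	sum += proto;

	while (sum >> 16)			/* fold the carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	const uint8_t src[16] = { 0x20, 0x01, 0x0d, 0xb8, [15] = 0x01 };
	const uint8_t dst[16] = { 0x20, 0x01, 0x0d, 0xb8, [15] = 0x02 };

	printf("%04x\n", pseudo_hdr_csum(src, dst, 40, 6 /* TCP */, 0));
	return 0;
}
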
> --
> 2.27.0
>