Re: [PATCH v3] LoongArch: add checksum optimization for 64-bit system

From: Huacai Chen
Date: Wed Mar 01 2023 - 23:55:06 EST


On Thu, Mar 2, 2023 at 10:19 AM maobibo <maobibo@xxxxxxxxxxx> wrote:
>
>
>
> On 2023/3/1 20:54, Huacai Chen wrote:
> > Hi, Bibo,
> >
> > I found that the version here [1] provides more functions than this one. So
> > is it possible to take advantage of both versions?
> The csum-and-copy-user work will take me quite some time :( For csum alone
> there is no obvious difference between the uint128 and interleave methods;
> it also depends on the compiler version. I want to re-investigate the two
> methods once the LoongArch GCC optimizations have stabilized.
This patch targets 6.4, so there is enough time; you can send an updated
version when you have time.

Huacai

>
> I tested three methods on a LoongArch 3A5000 machine: uint128, asm and
> interleave. The interleave method gets double the performance of the uint128
> method on an x86 box, but shows no such advantage on the LoongArch 3A5000
> machine or on an ARM64 machine. The three test methods are available at:
> https://github.com/bibo-mao/bench/tree/master/csum
>
> Here are the results on the 3A5000 machine; the time unit is microseconds,
> and smaller means better performance. A rough sketch of the uint128 and
> interleave styles follows the table.
>
> buf size   loops      uint128   asm      interleave
> 4096       0x100000   279824    373401   291508
> 1472       0x100000   109137    138224   106035
> 250        0x100000   29008     35838    23408
> 40         0x100000   9789      23408    9122
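> 
> For illustration, the two C styles look roughly like the sketch below. This is
> a simplified, untested sketch rather than the actual benchmark code (that is
> in the repo above); the interleave version in particular is only my shorthand
> for keeping independent accumulators, and both helpers assume an even number
> of aligned 8-byte words.
> 
> #include <stdint.h>
> 
> typedef unsigned __int128 u128;
> 
> /* 64-bit one's-complement add: fold the carry back into the sum. */
> static uint64_t accumulate(uint64_t sum, uint64_t data)
> {
>         sum += data;
>         if (sum < data)
>                 sum += 1;
>         return sum;
> }
> 
> /* uint128 style (what the patch uses): add the two halves of a 16-byte
>  * chunk in 128-bit arithmetic, so the carry is folded into the high half
>  * without being moved into a general-purpose register. */
> static uint64_t csum_uint128(const uint64_t *p, int qwords)
> {
>         uint64_t sum = 0;
>         int i;
> 
>         for (i = 0; i < qwords; i += 2) {
>                 u128 tmp = (u128)p[i] | ((u128)p[i + 1] << 64);
> 
>                 tmp += (tmp >> 64) | (tmp << 64);
>                 sum = accumulate(sum, (uint64_t)(tmp >> 64));
>         }
>         return sum;
> }
> 
> /* interleave style (sketch): keep two independent 64-bit accumulators so
>  * consecutive adds do not depend on each other, then merge at the end. */
> static uint64_t csum_interleave(const uint64_t *p, int qwords)
> {
>         uint64_t s0 = 0, s1 = 0;
>         int i;
> 
>         for (i = 0; i < qwords; i += 2) {
>                 s0 = accumulate(s0, p[i]);
>                 s1 = accumulate(s1, p[i + 1]);
>         }
>         return accumulate(s0, s1);
> }
> 
> Both compute the same 64-bit one's-complement partial sum; the difference is
> only in how well the compiler schedules the carry handling, which is why the
> result varies with the compiler version.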
>
> Regards
> Bibo, mao
> >
> > [1] https://github.com/loongson/linux/commit/92a6df48ccb73dd2c3dc1799add08adf0e0b0deb
> >
> > Huacai
> >
> > On Thu, Feb 16, 2023 at 9:09 PM Bibo Mao <maobibo@xxxxxxxxxxx> wrote:
> >>
> >> LoongArch is a 64-bit platform that supports 8-byte memory accesses, while
> >> the generic checksum functions use 4-byte accesses. This patch adds an
> >> 8-byte memory access optimization for the checksum functions on LoongArch;
> >> the code is derived from the arm64 implementation.
> >>
> >> With hardware network checksumming disabled, iperf performance improves by
> >> about 10% with this patch.
> >>
> >> Signed-off-by: Bibo Mao <maobibo@xxxxxxxxxxx>
> >> ---
> >> Changelog:
> >> v3: modify function accumulate() to generate better code on the LoongArch
> >> platform; this may be a compiler optimization issue.
> >> v2: use the rotation API in csum_fold() to save one instruction.
> >> ---
> >> arch/loongarch/include/asm/checksum.h | 65 ++++++++++++
> >> arch/loongarch/lib/Makefile | 2 +-
> >> arch/loongarch/lib/csum.c | 141 ++++++++++++++++++++++++++
> >> 3 files changed, 207 insertions(+), 1 deletion(-)
> >> create mode 100644 arch/loongarch/include/asm/checksum.h
> >> create mode 100644 arch/loongarch/lib/csum.c
> >>
> >> diff --git a/arch/loongarch/include/asm/checksum.h b/arch/loongarch/include/asm/checksum.h
> >> new file mode 100644
> >> index 000000000000..8a7d368d801d
> >> --- /dev/null
> >> +++ b/arch/loongarch/include/asm/checksum.h
> >> @@ -0,0 +1,65 @@
> >> +/* SPDX-License-Identifier: GPL-2.0-only */
> >> +/*
> >> + * Copyright (C) 2016 ARM Ltd.
> >> + * Copyright (C) 2023 Loongson Technology Corporation Limited
> >> + */
> >> +#ifndef __ASM_CHECKSUM_H
> >> +#define __ASM_CHECKSUM_H
> >> +
> >> +#include <linux/in6.h>
> >> +
> >> +#define _HAVE_ARCH_IPV6_CSUM
> >> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
> >> + const struct in6_addr *daddr,
> >> + __u32 len, __u8 proto, __wsum sum);
> >> +
> >> +/*
> >> + * turns a 32-bit partial checksum (e.g. from csum_partial) into a
> >> + * 1's complement 16-bit checksum.
> >> + */
> >> +static inline __sum16 csum_fold(__wsum sum)
> >> +{
> >> + u32 tmp = (__force u32)sum;
> >> +
> >> + /*
> >> + * swap the two 16-bit halves of sum
> >> + * if there is a carry from adding the two 16-bit halves,
> >> + * it will carry from the lower half into the upper half,
> >> + * giving us the correct sum in the upper half.
> >> + */
> >> + return (__force __sum16)(~(tmp + rol32(tmp, 16)) >> 16);
> >> +}
> >> +#define csum_fold csum_fold
> >> +
> >> +/*
> >> + * This is a version of ip_compute_csum() optimized for IP headers,
> >> + * which always checksum on 4 octet boundaries. ihl is the number
> >> + * of 32-bit words and is always >= 5.
> >> + */
> >> +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> >> +{
> >> + __uint128_t tmp;
> >> + u64 sum;
> >> + int n = ihl; /* we want it signed */
> >> +
> >> + tmp = *(const __uint128_t *)iph;
> >> + iph += 16;
> >> + n -= 4;
> >> + tmp += ((tmp >> 64) | (tmp << 64));
> >> + sum = tmp >> 64;
> >> + do {
> >> + sum += *(const u32 *)iph;
> >> + iph += 4;
> >> + } while (--n > 0);
> >> +
> >> + sum += ror64(sum, 32);
> >> + return csum_fold((__force __wsum)(sum >> 32));
> >> +}
> >> +#define ip_fast_csum ip_fast_csum
> >> +
> >> +extern unsigned int do_csum(const unsigned char *buff, int len);
> >> +#define do_csum do_csum
> >> +
> >> +#include <asm-generic/checksum.h>
> >> +
> >> +#endif /* __ASM_CHECKSUM_H */
> >> diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile
> >> index 40bde632900f..6ba6df411f90 100644
> >> --- a/arch/loongarch/lib/Makefile
> >> +++ b/arch/loongarch/lib/Makefile
> >> @@ -4,4 +4,4 @@
> >> #
> >>
> >> lib-y += delay.o memset.o memcpy.o memmove.o \
> >> - clear_user.o copy_user.o dump_tlb.o unaligned.o
> >> + clear_user.o copy_user.o dump_tlb.o unaligned.o csum.o
> >> diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c
> >> new file mode 100644
> >> index 000000000000..a5e84b403c3b
> >> --- /dev/null
> >> +++ b/arch/loongarch/lib/csum.c
> >> @@ -0,0 +1,141 @@
> >> +// SPDX-License-Identifier: GPL-2.0-only
> >> +// Copyright (C) 2019-2020 Arm Ltd.
> >> +
> >> +#include <linux/compiler.h>
> >> +#include <linux/kasan-checks.h>
> >> +#include <linux/kernel.h>
> >> +
> >> +#include <net/checksum.h>
> >> +
> >> +static u64 accumulate(u64 sum, u64 data)
> >> +{
> >> + sum += data;
> >> + if (sum < data)
> >> + sum += 1;
> >> + return sum;
> >> +}
> >> +
> >> +/*
> >> + * We over-read the buffer and this makes KASAN unhappy. Instead, disable
> >> + * instrumentation and call kasan explicitly.
> >> + */
> >> +unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len)
> >> +{
> >> + unsigned int offset, shift, sum;
> >> + const u64 *ptr;
> >> + u64 data, sum64 = 0;
> >> +
> >> + if (unlikely(len == 0))
> >> + return 0;
> >> +
> >> + offset = (unsigned long)buff & 7;
> >> + /*
> >> + * This is to all intents and purposes safe, since rounding down cannot
> >> + * result in a different page or cache line being accessed, and @buff
> >> + * should absolutely not be pointing to anything read-sensitive. We do,
> >> + * however, have to be careful not to piss off KASAN, which means using
> >> + * unchecked reads to accommodate the head and tail, for which we'll
> >> + * compensate with an explicit check up-front.
> >> + */
> >> + kasan_check_read(buff, len);
> >> + ptr = (u64 *)(buff - offset);
> >> + len = len + offset - 8;
> >> +
> >> + /*
> >> + * Head: zero out any excess leading bytes. Shifting back by the same
> >> + * amount should be at least as fast as any other way of handling the
> >> + * odd/even alignment, and means we can ignore it until the very end.
> >> + */
> >> + shift = offset * 8;
> >> + data = *ptr++;
> >> + data = (data >> shift) << shift;
> >> +
> >> + /*
> >> + * Body: straightforward aligned loads from here on (the paired loads
> >> + * underlying the quadword type still only need dword alignment). The
> >> + * main loop strictly excludes the tail, so the second loop will always
> >> + * run at least once.
> >> + */
> >> + while (unlikely(len > 64)) {
> >> + __uint128_t tmp1, tmp2, tmp3, tmp4;
> >> +
> >> + tmp1 = *(__uint128_t *)ptr;
> >> + tmp2 = *(__uint128_t *)(ptr + 2);
> >> + tmp3 = *(__uint128_t *)(ptr + 4);
> >> + tmp4 = *(__uint128_t *)(ptr + 6);
> >> +
> >> + len -= 64;
> >> + ptr += 8;
> >> +
> >> + /* This is the "don't dump the carry flag into a GPR" idiom */
> >> + tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> >> + tmp2 += (tmp2 >> 64) | (tmp2 << 64);
> >> + tmp3 += (tmp3 >> 64) | (tmp3 << 64);
> >> + tmp4 += (tmp4 >> 64) | (tmp4 << 64);
> >> + tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
> >> + tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> >> + tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
> >> + tmp3 += (tmp3 >> 64) | (tmp3 << 64);
> >> + tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
> >> + tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> >> + tmp1 = ((tmp1 >> 64) << 64) | sum64;
> >> + tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> >> + sum64 = tmp1 >> 64;
> >> + }
> >> + while (len > 8) {
> >> + __uint128_t tmp;
> >> +
> >> + sum64 = accumulate(sum64, data);
> >> + tmp = *(__uint128_t *)ptr;
> >> +
> >> + len -= 16;
> >> + ptr += 2;
> >> +
> >> + data = tmp >> 64;
> >> + sum64 = accumulate(sum64, tmp);
> >> + }
> >> + if (len > 0) {
> >> + sum64 = accumulate(sum64, data);
> >> + data = *ptr;
> >> + len -= 8;
> >> + }
> >> + /*
> >> + * Tail: zero any over-read bytes similarly to the head, again
> >> + * preserving odd/even alignment.
> >> + */
> >> + shift = len * -8;
> >> + data = (data << shift) >> shift;
> >> + sum64 = accumulate(sum64, data);
> >> +
> >> + /* Finally, folding */
> >> + sum64 += (sum64 >> 32) | (sum64 << 32);
> >> + sum = sum64 >> 32;
> >> + sum += (sum >> 16) | (sum << 16);
> >> + if (offset & 1)
> >> + return (u16)swab32(sum);
> >> +
> >> + return sum >> 16;
> >> +}
> >> +
> >> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
> >> + const struct in6_addr *daddr,
> >> + __u32 len, __u8 proto, __wsum csum)
> >> +{
> >> + __uint128_t src, dst;
> >> + u64 sum = (__force u64)csum;
> >> +
> >> + src = *(const __uint128_t *)saddr->s6_addr;
> >> + dst = *(const __uint128_t *)daddr->s6_addr;
> >> +
> >> + sum += (__force u32)htonl(len);
> >> + sum += (u32)proto << 24;
> >> + src += (src >> 64) | (src << 64);
> >> + dst += (dst >> 64) | (dst << 64);
> >> +
> >> + sum = accumulate(sum, src >> 64);
> >> + sum = accumulate(sum, dst >> 64);
> >> +
> >> + sum += ((sum >> 32) | (sum << 32));
> >> + return csum_fold((__force __wsum)(sum >> 32));
> >> +}
> >> +EXPORT_SYMBOL(csum_ipv6_magic);
> >> --
> >> 2.27.0
> >>
>