From 6ff7f7a72a4855970b1621ac9724c44c393a6d44 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Jan 2024 09:46:32 -0800 Subject: [PATCH] Add the current kernel version as "New version" --- Makefile | 3 -- csum_partial.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 115 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index e4b1bb3..4e29f8a 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,3 @@ chain2.svg: graphs/chain2.dot chain2a.svg: graphs/chain2a.dot dot -Tsvg -O graphs/chain2a.dot mv graphs/chain2a.dot.svg chain2a.svg - - - \ No newline at end of file diff --git a/csum_partial.c b/csum_partial.c index 4db0d97..ddf6acd 100644 --- a/csum_partial.c +++ b/csum_partial.c @@ -14,13 +14,28 @@ #include typedef uint32_t __wsum; +typedef uint32_t __u32; +typedef uint64_t __u64; typedef uint64_t u64; typedef uint32_t u32; +# define likely(x) __builtin_expect(!!(x), 1) # define unlikely(x) __builtin_expect(!!(x), 0) +#define __force + #define LOOPCOUNT 102400 #define PACKETSIZE 40 +/** + * ror64 - rotate a 64-bit value right + * @word: value to rotate + * @shift: bits to roll + */ +static inline __u64 ror64(__u64 word, unsigned int shift) +{ + return (word >> (shift & 63)) | (word << ((-shift) & 63)); +} + static inline unsigned long load_unaligned_zeropad(const void *addr) { unsigned long ret, dummy; @@ -484,7 +499,105 @@ static inline __wsum nulltest(const void *buff, int len, __wsum sum) { return 2; } +static inline __wsum csum_finalize_sum(u64 temp64) +{ + return (__force __wsum)((temp64 + ror64(temp64, 32)) >> 32); +} +static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5]) +{ + asm("addq %1,%0\n\t" + "adcq %2,%0\n\t" + "adcq %3,%0\n\t" + "adcq %4,%0\n\t" + "adcq %5,%0\n\t" + "adcq $0,%0" + :"+r" (sum) + :"m" (m[0]), "m" (m[1]), "m" (m[2]), + "m" (m[3]), "m" (m[4])); + return sum; +} + +/* + * Do a checksum on an arbitrary memory area. + * Returns a 32bit checksum. + * + * This isn't as time critical as it used to be because many NICs + * do hardware checksumming these days. + * + * Still, with CHECKSUM_COMPLETE this is called to compute + * checksums on IPv6 headers (40 bytes) and other small parts. + * it's best to have buff aligned on a 64-bit boundary + */ +__wsum csum_partial_new(const void *buff, int len, __wsum sum) +{ + u64 temp64 = (__force u64)sum; + + /* Do two 40-byte chunks in parallel to get better ILP */ + if (likely(len >= 80)) { + u64 temp64_2 = 0; + do { + temp64 = update_csum_40b(temp64, buff); + temp64_2 = update_csum_40b(temp64_2, buff + 40); + buff += 80; + len -= 80; + } while (len >= 80); + + asm("addq %1,%0\n\t" + "adcq $0,%0" + :"+r" (temp64): "r" (temp64_2)); + } + + /* + * len == 40 is the hot case due to IPv6 headers, so return + * early for that exact case without checking the tail bytes. + */ + if (len >= 40) { + temp64 = update_csum_40b(temp64, buff); + len -= 40; + if (!len) + return csum_finalize_sum(temp64); + buff += 40; + } + + if (len & 32) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq 1*8(%[src]),%[res]\n\t" + "adcq 2*8(%[src]),%[res]\n\t" + "adcq 3*8(%[src]),%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r"(temp64) + : [src] "r"(buff), "m"(*(const char(*)[32])buff)); + buff += 32; + } + if (len & 16) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq 1*8(%[src]),%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r"(temp64) + : [src] "r"(buff), "m"(*(const char(*)[16])buff)); + buff += 16; + } + if (len & 8) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r"(temp64) + : [src] "r"(buff), "m"(*(const char(*)[8])buff)); + buff += 8; + } + if (len & 7) { + unsigned int shift = (-len << 3) & 63; + unsigned long trail; + + trail = (load_unaligned_zeropad(buff) << shift) >> shift; + + asm("addq %[trail],%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r"(temp64) + : [trail] "r"(trail)); + } + return csum_finalize_sum(temp64); +} double cycles[64]; int cyclecount[64]; @@ -612,6 +725,7 @@ int main(int argc, char **argv) MEASURE(2, csum_partial, "Upcoming linux kernel version"); MEASURE(4, csum_specialized, "Specialized to size 40"); + MEASURE(6, csum_partial_new, "New version"); MEASURE(22, csum_partial_no_odd, "Odd-alignment handling removed"); MEASURE(24, csum_partial_dead_code, "Dead code elimination "); MEASURE(28, csum_partial_ACX, "ADX interleaved "); @@ -619,7 +733,6 @@ int main(int argc, char **argv) MEASURE(34, csum_partial_32bit, "32 bit train "); MEASURE(36, csum_partial_zero_sum, "Assume zero input sum"); - report(); } -} \ No newline at end of file +}