[PATCH] lib/string: Bring optimized memcmp from glibc

From: Nikolay Borisov
Date: Wed Jul 21 2021 - 09:59:43 EST


This is glibc's memcmp version. The upside is that for architectures
which don't have an optimized version the kernel can provide some
solace in the form of a generic, word-sized optimized memcmp. I tested
this with a heavy IOCTL_FIDEDUPERANGE(2) workload and here are the
results I got:

UNPATCHED PATCHED
real 1m13.127s 1m10.611s
user 0m0.030s 0m0.033s
sys 0m5.890s 0m4.001s (32% decrease)

Comparing 2 perf profiles of the workload yield:

30.44% -29.39% [kernel.vmlinux] [k] memcmp

Signed-off-by: Nikolay Borisov <nborisov@xxxxxxxx>
---
lib/string.c | 303 +++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 293 insertions(+), 10 deletions(-)

diff --git a/lib/string.c b/lib/string.c
index 7548eb715ddb..915db7e49804 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -923,22 +923,305 @@ EXPORT_SYMBOL(memmove);
#endif

#ifndef __HAVE_ARCH_MEMCMP
+
+/* Threshold value for when to enter the unrolled loops. */
+#define MEMCMP_THRESH 16
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
+# define CMP_LT_OR_GT(a, b) memcmp_bytes((a), (b))
+/*
+ * Compare A and B bytewise in the byte order of the machine.
+ * A and B are known to be different. This is needed only on little-endian
+ * machines.
+ */
+/*
+ * NOTE(review): the byte cursors are carried in plain signed 'long';
+ * 'unsigned long' (the usual kernel idiom) or 'uintptr_t' would be
+ * the more conventional type for pointer arithmetic -- confirm.
+ */
+static inline int memcmp_bytes(unsigned long a, unsigned long b)
+{
+ long srcp1 = (long) &a;
+ long srcp2 = (long) &b;
+ unsigned long a0, b0;
+
+ do {
+ /* A != B (caller's contract) guarantees a differing byte, so this terminates. */
+ a0 = ((uint8_t *) srcp1)[0];
+ b0 = ((uint8_t *) srcp2)[0];
+ srcp1 += 1;
+ srcp2 += 1;
+ } while (a0 == b0);
+ /* Both operands are byte values (0..255), so the difference fits in int. */
+ return a0 - b0;
+}
+
+#else
+# define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2)))
+# define CMP_LT_OR_GT(a, b) ((a) > (b) ? 1 : -1)
+#endif
+
+/*
+ * The strategy of this memcmp is:
+ * 1. Compare bytes until one of the block pointers is aligned.
+ *
+ * 2. Compare using memcmp_common_alignment or memcmp_not_common_alignment,
+ * regarding the alignment of the other block after the initial byte operations.
+ * The maximum number of full words (of type unsigned long) are compared in
+ * this way.
+ *
+ * 3. Compare the few remaining bytes.
+ */
+
+/*
+ * Compare blocks at SRCP1 and SRCP2 with LEN `unsigned long' objects (not LEN
+ * bytes!). Both SRCP1 and SRCP2 should be aligned for memory operations on
+ * `unsigned long's.
+ */
+static int memcmp_common_alignment(long srcp1, long srcp2, size_t len)
+{
+ unsigned long a0, a1;
+ unsigned long b0, b1;
+
+ /*
+ * Entry switch for the 4x-unrolled loop below: depending on
+ * len % 4, pre-load the first word pair, bias the pointers and
+ * len so that each loop iteration always accounts for exactly
+ * four words, and jump to the matching unrolled stage.
+ */
+ switch (len % 4) {
+ default: /* Avoid warning about uninitialized local variables. */
+ case 2:
+ a0 = ((unsigned long *) srcp1)[0];
+ b0 = ((unsigned long *) srcp2)[0];
+ srcp1 -= 2 * sizeof(unsigned long);
+ srcp2 -= 2 * sizeof(unsigned long);
+ len += 2;
+ goto do1;
+ case 3:
+ a1 = ((unsigned long *) srcp1)[0];
+ b1 = ((unsigned long *) srcp2)[0];
+ srcp1 -= sizeof(unsigned long);
+ srcp2 -= sizeof(unsigned long);
+ len += 1;
+ goto do2;
+ case 0:
+ if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+ return 0;
+ a0 = ((unsigned long *) srcp1)[0];
+ b0 = ((unsigned long *) srcp2)[0];
+ goto do3;
+ case 1:
+ a1 = ((unsigned long *) srcp1)[0];
+ b1 = ((unsigned long *) srcp2)[0];
+ srcp1 += sizeof(unsigned long);
+ srcp2 += sizeof(unsigned long);
+ len -= 1;
+ if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+ goto do0;
+ /* Fallthrough */
+ }
+
+ /*
+ * Software-pipelined compare: each stage loads the next word
+ * pair while testing the pair fetched by the previous stage,
+ * alternating between the a0/b0 and a1/b1 register pairs to
+ * hide load latency.
+ */
+ do {
+ a0 = ((unsigned long *) srcp1)[0];
+ b0 = ((unsigned long *) srcp2)[0];
+ if (a1 != b1)
+ return CMP_LT_OR_GT(a1, b1);
+
+do3:
+ a1 = ((unsigned long *) srcp1)[1];
+ b1 = ((unsigned long *) srcp2)[1];
+ if (a0 != b0)
+ return CMP_LT_OR_GT(a0, b0);
+
+do2:
+ a0 = ((unsigned long *) srcp1)[2];
+ b0 = ((unsigned long *) srcp2)[2];
+ if (a1 != b1)
+ return CMP_LT_OR_GT(a1, b1);
+
+do1:
+ a1 = ((unsigned long *) srcp1)[3];
+ b1 = ((unsigned long *) srcp2)[3];
+ if (a0 != b0)
+ return CMP_LT_OR_GT(a0, b0);
+
+ srcp1 += 4 * sizeof(unsigned long);
+ srcp2 += 4 * sizeof(unsigned long);
+ len -= 4;
+ } while (len != 0);
+
+ /*
+ * This is the right position for do0. Please don't move it into the
+ * loop.
+ */
+do0:
+ /* Test the last pre-loaded pair, left pending by the pipeline. */
+ if (a1 != b1)
+ return CMP_LT_OR_GT(a1, b1);
+ return 0;
+}
+
+/*
+ * Compare blocks at SRCP1 and SRCP2 with LEN `unsigned long' objects (not LEN
+ * bytes!). SRCP2 should be aligned for memory operations on `unsigned long',
+ * but SRCP1 *should be unaligned*.
+ */
+static int memcmp_not_common_alignment(long srcp1, long srcp2, size_t len)
+{
+ unsigned long a0, a1, a2, a3;
+ unsigned long b0, b1, b2, b3;
+ unsigned long x;
+ int shl, shr;
+
+ /*
+ * Calculate how to shift a word read at the memory operation
+ * aligned srcp1 to make it aligned for comparison.
+ */
+
+ shl = 8 * (srcp1 % sizeof(unsigned long));
+ shr = 8 * sizeof(unsigned long) - shl;
+ /*
+ * The caller only reaches here when srcp1 is misaligned, so
+ * shl is meant to be in (0, BITS_PER_LONG) and both shift
+ * counts stay below the word width (no UB shift-by-width).
+ * NOTE(review): srcp1 is *signed* long -- for kernel addresses
+ * with the top bit set, 'srcp1 % sizeof(unsigned long)' is
+ * non-positive in C, which would make shl negative. This should
+ * use unsigned arithmetic; confirm before merging.
+ */
+
+ /*
+ * Make SRCP1 aligned by rounding it down to the beginning of the
+ * `unsigned long' it points in the middle of.
+ */
+ srcp1 &= -sizeof(unsigned long);
+
+ /*
+ * Entry switch for the 4x-unrolled loop: pre-load words so the
+ * MERGE pipeline (a0..a3 rotate as the unaligned-read window)
+ * is primed, then jump to the matching unrolled stage.
+ */
+ switch (len % 4) {
+ default: /* Avoid warning about uninitialized local variables. */
+ case 2:
+ a1 = ((unsigned long *) srcp1)[0];
+ a2 = ((unsigned long *) srcp1)[1];
+ b2 = ((unsigned long *) srcp2)[0];
+ srcp1 -= 1 * sizeof(unsigned long);
+ srcp2 -= 2 * sizeof(unsigned long);
+ len += 2;
+ goto do1;
+ case 3:
+ a0 = ((unsigned long *) srcp1)[0];
+ a1 = ((unsigned long *) srcp1)[1];
+ b1 = ((unsigned long *) srcp2)[0];
+ srcp2 -= 1 * sizeof(unsigned long);
+ len += 1;
+ goto do2;
+ case 0:
+ if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+ return 0;
+ a3 = ((unsigned long *) srcp1)[0];
+ a0 = ((unsigned long *) srcp1)[1];
+ b0 = ((unsigned long *) srcp2)[0];
+ srcp1 += 1 * sizeof(unsigned long);
+ goto do3;
+ case 1:
+ a2 = ((unsigned long *) srcp1)[0];
+ a3 = ((unsigned long *) srcp1)[1];
+ b3 = ((unsigned long *) srcp2)[0];
+ srcp1 += 2 * sizeof(unsigned long);
+ srcp2 += 1 * sizeof(unsigned long);
+ len -= 1;
+ if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+ goto do0;
+ /* Fallthrough */
+ }
+
+ /*
+ * Each stage reconstructs one logical unaligned word of block 1
+ * via MERGE of two adjacent aligned reads, compares it against
+ * the aligned word of block 2, and loads the read needed two
+ * stages ahead to hide latency.
+ */
+ do {
+ a0 = ((unsigned long *) srcp1)[0];
+ b0 = ((unsigned long *) srcp2)[0];
+ x = MERGE(a2, shl, a3, shr);
+ if (x != b3)
+ return CMP_LT_OR_GT(x, b3);
+
+do3:
+ a1 = ((unsigned long *) srcp1)[1];
+ b1 = ((unsigned long *) srcp2)[1];
+ x = MERGE(a3, shl, a0, shr);
+ if (x != b0)
+ return CMP_LT_OR_GT(x, b0);
+
+do2:
+ a2 = ((unsigned long *) srcp1)[2];
+ b2 = ((unsigned long *) srcp2)[2];
+ x = MERGE(a0, shl, a1, shr);
+ if (x != b1)
+ return CMP_LT_OR_GT(x, b1);
+
+do1:
+ a3 = ((unsigned long *) srcp1)[3];
+ b3 = ((unsigned long *) srcp2)[3];
+ x = MERGE(a1, shl, a2, shr);
+ if (x != b2)
+ return CMP_LT_OR_GT(x, b2);
+
+ srcp1 += 4 * sizeof(unsigned long);
+ srcp2 += 4 * sizeof(unsigned long);
+ len -= 4;
+ } while (len != 0);
+
+ /*
+ * This is the right position for do0. Please don't move it into
+ * the loop.
+ */
+do0:
+ /* Drain the final merged word left pending by the pipeline. */
+ x = MERGE(a2, shl, a3, shr);
+ if (x != b3)
+ return CMP_LT_OR_GT(x, b3);
+ return 0;
+}
+
+#undef memcmp
/**
* memcmp - Compare two areas of memory
- * @cs: One area of memory
- * @ct: Another area of memory
+ * @s1: One area of memory
+ * @s2: Another area of memory
- * @count: The size of the area.
+ * @len: The size of the area.
*/
-#undef memcmp
-__visible int memcmp(const void *cs, const void *ct, size_t count)
+__visible int memcmp(const void *s1, const void *s2, size_t len)
{
- const unsigned char *su1, *su2;
- int res = 0;
+ unsigned long a0;
+ unsigned long b0;
+ long srcp1 = (long) s1;
+ long srcp2 = (long) s2;
+ /*
+ * NOTE(review): pointers are cast to *signed* long here; kernel
+ * convention is 'unsigned long' for address arithmetic -- confirm.
+ * res is unsigned long but holds int-valued byte/word differences;
+ * the implicit narrowing on 'return res' recovers the sign on
+ * two's-complement targets, though a plain int would be clearer.
+ */
+ unsigned long res;
+
+ /* Phase 1: word-sized path only pays off above the threshold. */
+ if (len >= MEMCMP_THRESH) {
+ /*
+ * There are at least some bytes to compare. No need to test
+ * for LEN == 0 in this alignment loop.
+ */
+ while (!IS_ALIGNED(srcp2, sizeof(unsigned long))) {
+ a0 = ((uint8_t *) srcp1)[0];
+ b0 = ((uint8_t *) srcp2)[0];
+ srcp1 += 1;
+ srcp2 += 1;
+ res = a0 - b0;
+ if (res != 0)
+ return res;
+ len -= 1;
+ }

- for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--)
- if ((res = *su1 - *su2) != 0)
- break;
- return res;
+ /*
+ * SRCP2 is now aligned for memory operations on `unsigned long'.
+ * SRCP1 alignment determines if we can do a simple, aligned
+ * compare or need to shuffle bits.
+ */
+
+ if (IS_ALIGNED(srcp1, sizeof(unsigned long)))
+ res = memcmp_common_alignment(srcp1, srcp2,
+ len / sizeof(unsigned long));
+ else
+ res = memcmp_not_common_alignment(srcp1, srcp2,
+ len / sizeof(unsigned long));
+
+ if (res != 0)
+ return res;
+
+ /* Number of bytes remaining in the interval [0..sizeof(unsigned long)-1]. */
+ /* Skip past the words the helpers fully compared. */
+ srcp1 += len & -sizeof(unsigned long);
+ srcp2 += len & -sizeof(unsigned long);
+ len %= sizeof(unsigned long);
+ }
+
+ /* There are just a few bytes to compare. Use byte memory operations. */
+ while (len != 0) {
+ a0 = ((uint8_t *) srcp1)[0];
+ b0 = ((uint8_t *) srcp2)[0];
+ srcp1 += 1;
+ srcp2 += 1;
+ res = a0 - b0;
+ if (res != 0)
+ return res;
+ len -= 1;
+ }
+
+ return 0;
}
EXPORT_SYMBOL(memcmp);
#endif
--
2.25.1