[PATCH 1/3] lib/string: optimized memcpy

From: Matteo Croce
Date: Thu Jun 24 2021 - 21:02:36 EST


From: Matteo Croce <mcroce@xxxxxxxxxxxxx>

Rewrite the generic memcpy() to copy a word at time, without generating
unaligned accesses.

The procedure is made of three steps:
First copy data one byte at time until the destination buffer is aligned
to a long boundary.
Then copy the data one long at time shifting the current and the next long
to compose a long at every cycle.
Finally, copy the remainder one byte at time.

Signed-off-by: Matteo Croce <mcroce@xxxxxxxxxxxxx>
---
lib/string.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 78 insertions(+), 3 deletions(-)

diff --git a/lib/string.c b/lib/string.c
index 546d59711a12..15e906f97d9e 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -33,6 +33,24 @@
#include <asm/word-at-a-time.h>
#include <asm/page.h>

+#define MIN_THRESHOLD (sizeof(long) * 2)
+
+/* convenience union to avoid cast between different pointer types */
+union types {
+ u8 *as_u8;
+ unsigned long *as_ulong;
+ uintptr_t as_uptr;
+};
+
+union const_types {
+ const u8 *as_u8;
+ const unsigned long *as_ulong;
+ uintptr_t as_uptr;
+};
+
+static const unsigned int bytes_long = sizeof(long);
+static const unsigned int word_mask = bytes_long - 1;
+
#ifndef __HAVE_ARCH_STRNCASECMP
/**
* strncasecmp - Case insensitive, length-limited string comparison
@@ -878,16 +896,73 @@ EXPORT_SYMBOL(memset64);
* You should not use this function to access IO space, use memcpy_toio()
* or memcpy_fromio() instead.
*/
+
+#ifdef __BIG_ENDIAN
+#define MERGE_UL(h, l, d) ((h) << ((d) * 8) | (l) >> ((bytes_long - (d)) * 8))
+#else
+#define MERGE_UL(h, l, d) ((h) >> ((d) * 8) | (l) << ((bytes_long - (d)) * 8))
+#endif
+
void *memcpy(void *dest, const void *src, size_t count)
{
- char *tmp = dest;
- const char *s = src;
+ union const_types s = { .as_u8 = src };
+ union types d = { .as_u8 = dest };
+ int distance = 0;
+
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+ if (count < MIN_THRESHOLD)
+ goto copy_remainder;
+
+ /* Copy a byte at time until destination is aligned. */
+ for (; d.as_uptr & word_mask; count--)
+ *d.as_u8++ = *s.as_u8++;
+
+ distance = s.as_uptr & word_mask;
+ }

+ if (distance) {
+ unsigned long last, next;
+
+ /*
+ * s is distance bytes ahead of d, and d just reached
+ * the alignment boundary. Move s backward to word align it
+ * and shift data to compensate for distance, in order to do
+ * word-by-word copy.
+ */
+ s.as_u8 -= distance;
+
+ next = s.as_ulong[0];
+ for (; count >= bytes_long + word_mask; count -= bytes_long) {
+ last = next;
+ next = s.as_ulong[1];
+
+ d.as_ulong[0] = MERGE_UL(last, next, distance);
+
+ d.as_ulong++;
+ s.as_ulong++;
+ }
+
+ /* Restore s with the original offset. */
+ s.as_u8 += distance;
+ } else {
+ /*
+ * If the source and dest lower bits are the same, do a simple
+ * 32/64 bit wide copy.
+ */
+ for (; count >= bytes_long; count -= bytes_long)
+ *d.as_ulong++ = *s.as_ulong++;
+ }
+
+copy_remainder:
while (count--)
- *tmp++ = *s++;
+ *d.as_u8++ = *s.as_u8++;
+
return dest;
}
EXPORT_SYMBOL(memcpy);
+
+#undef MERGE_UL
+
#endif

#ifndef __HAVE_ARCH_MEMMOVE
--
2.31.1