[PATCH v3 7/8] x86/string: extend memcpy_flushcache() fixed-size fastpaths

From: Li Zhe

Date: Tue May 26 2026 - 23:41:17 EST


Small constant-sized flushcache copies currently fall back to
__memcpy_flushcache() unless they are exactly 4, 8, or 16 bytes.

Factor the existing inline movnti sequences into small helpers and
extend the fixed-size fastpath coverage to 24..96 bytes for naturally
aligned transfers. This keeps common struct-page-sized copies on the
inline path for the upcoming memcpy_streaming() user, while still
falling back to __memcpy_flushcache() for unaligned or uncommon sizes.
Zero-length copies return immediately.

Issue the fixed-size stores in ascending address order so
write-combining sees a forward stream.

Signed-off-by: Li Zhe <lizhe.67@xxxxxxxxxxxxx>
---
arch/x86/include/asm/string_64.h | 125 ++++++++++++++++++++++++++-----
1 file changed, 107 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 0b57e9e6f3db..8e6fca0185ee 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -82,24 +82,6 @@ int strcmp(const char *cs, const char *ct);
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
-static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
-{
- if (__builtin_constant_p(cnt)) {
- switch (cnt) {
- case 4:
- asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
- return;
- case 8:
- asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
- return;
- case 16:
- asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
- asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
- return;
- }
- }
- __memcpy_flushcache(dst, src, cnt);
-}

/*
* Only reuse memcpy_flushcache() for transfers that can stay entirely
@@ -123,6 +105,113 @@ static __always_inline int memcpy_flushcache_nt_safe(const void *dst,
return cnt == 4 && !(d & 3) && !(s & 3);
}

+static __always_inline void memcpy_flushcache_4(void *dst, const void *src)
+{
+	u32 val = *(const u32 *)src;
+
+	/* Single 32-bit non-temporal store of the value loaded above. */
+	asm volatile("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(val) : "memory");
+}
+
+static __always_inline void memcpy_flushcache_8(void *dst, const void *src)
+{
+	u64 val = *(const u64 *)src;
+
+	/* Single 64-bit non-temporal store of the value loaded above. */
+	asm volatile("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(val) : "memory");
+}
+
+static __always_inline void memcpy_flushcache_16(void *dst, const void *src)
+{
+	/* Two 8-byte stores, lowest address first. */
+	memcpy_flushcache_8(dst, src);
+	memcpy_flushcache_8((char *)dst + 8, (const char *)src + 8);
+}
+
+static __always_inline void memcpy_flushcache_32(void *dst, const void *src)
+{
+	/* Lower 16 bytes first, then the upper 16 bytes. */
+	memcpy_flushcache_16(dst, src);
+	memcpy_flushcache_16((char *)dst + 16, (const char *)src + 16);
+}
+
+static __always_inline void memcpy_flushcache_64(void *dst, const void *src)
+{
+	/* Lower 32 bytes first, then the upper 32 bytes. */
+	memcpy_flushcache_32(dst, src);
+	memcpy_flushcache_32((char *)dst + 32, (const char *)src + 32);
+}
+
+/*
+ * Keep common fixed-size copies on the inline movnti path when they can
+ * stay entirely on aligned non-temporal stores. Issue the stores in
+ * ascending address order so write-combining sees a forward stream.
+ *
+ * Covers @cnt of 4, 8, and each multiple of 8 from 16 through 104, provided
+ * memcpy_flushcache_nt_safe() accepts the request. Returns 1 when the copy
+ * was done here. Returns 0 with no store issued otherwise, so the caller's
+ * fallback never rewrites bytes this helper already stored.
+ */
+static __always_inline int memcpy_flushcache_small(void *dst,
+						   const void *src,
+						   size_t cnt)
+{
+	char *d = dst;
+	const char *s = src;
+	size_t rest;
+
+	if (!memcpy_flushcache_nt_safe(dst, src, cnt))
+		return 0;
+
+	switch (cnt) {
+	case 4:
+		memcpy_flushcache_4(d, s);
+		return 1;
+	case 8:
+		memcpy_flushcache_8(d, s);
+		return 1;
+	}
+
+	/* Bytes left once the optional leading 8-byte store is peeled off. */
+	rest = cnt & ~(size_t)8;
+
+	/* Validate before storing: only 16, 32, 48, 64, 80, 96 are handled. */
+	if (rest < 16 || rest > 96 || (rest & 15))
+		return 0;
+
+	if (cnt & 8) {
+		memcpy_flushcache_8(d, s);
+		d += 8;
+		s += 8;
+	}
+	if (rest & 64) {
+		memcpy_flushcache_64(d, s);
+		d += 64;
+		s += 64;
+	}
+	if (rest & 32) {
+		memcpy_flushcache_32(d, s);
+		d += 32;
+		s += 32;
+	}
+	if (rest & 16)
+		memcpy_flushcache_16(d, s);
+
+	return 1;
+}
+
+/* Copy @cnt bytes, trying the inline movnti path for constant sizes first. */
+static __always_inline void memcpy_flushcache(void *dst, const void *src,
+					      size_t cnt)
+{
+	if (cnt == 0)
+		return;
+
+	/* memcpy_flushcache_small() returns 0 when it did not do the copy. */
+	if (!__builtin_constant_p(cnt) || !memcpy_flushcache_small(dst, src, cnt))
+		__memcpy_flushcache(dst, src, cnt);
+}
+
#define __HAVE_ARCH_MEMCPY_STREAMING 1
static __always_inline void memcpy_streaming(void *dst, const void *src,
size_t cnt)
--
2.20.1