[PATCH v2 6/7] x86/string: extend memcpy_flushcache() fixed-size fastpaths

From: Li Zhe

Date: Thu May 21 2026 - 00:05:05 EST


Small constant-sized flushcache copies currently fall back to
__memcpy_flushcache() unless they are exactly 4, 8, or 16 bytes.

Factor the existing inline movnti sequences into small helpers and
extend the fixed-size fastpath to every multiple of 8 bytes from 24
through 96. This keeps common struct-page-sized copies on the inline
path for the upcoming memcpy_streaming() user, while all other sizes
still fall back to __memcpy_flushcache().

Signed-off-by: Li Zhe <lizhe.67@xxxxxxxxxxxxx>
---
arch/x86/include/asm/string_64.h | 87 +++++++++++++++++++++++++++-----
1 file changed, 73 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 15504b844f1e..94dc92f287f3 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -82,22 +82,81 @@ int strcmp(const char *cs, const char *ct);
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
-static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+
+static __always_inline void memcpy_flushcache_4(void *dst, const void *src)
+{
+ asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src)); /* one 32-bit movnti store */
+}
+
+static __always_inline void memcpy_flushcache_8(void *dst, const void *src)
+{
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src)); /* one 64-bit movnti store */
+}
+
+static __always_inline void memcpy_flushcache_16(void *dst, const void *src)
+{
+ memcpy_flushcache_8(dst, src); /* bytes 0..7 */
+ memcpy_flushcache_8(dst + 8, src + 8); /* bytes 8..15 */
+}
+
+/*
+ * Inline movnti copy for cnt == 4 or a multiple of 8 from 8 to 96, so these
+ * sizes skip the generic helper. Returns 1 if copied, 0 if cnt is unhandled.
+ */
+static __always_inline int memcpy_flushcache_small(void *dst, const void *src,
+ size_t cnt)
{
- if (__builtin_constant_p(cnt)) {
- switch (cnt) {
- case 4:
- asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
- return;
- case 8:
- asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
- return;
- case 16:
- asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
- asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
- return;
- }
+ switch (cnt) {
+ case 96: /* multiples of 16: copy 16-byte chunks from the tail down */
+ memcpy_flushcache_16(dst + 80, src + 80);
+ fallthrough;
+ case 80:
+ memcpy_flushcache_16(dst + 64, src + 64);
+ fallthrough;
+ case 64:
+ memcpy_flushcache_16(dst + 48, src + 48);
+ fallthrough;
+ case 48:
+ memcpy_flushcache_16(dst + 32, src + 32);
+ fallthrough;
+ case 32:
+ memcpy_flushcache_16(dst + 16, src + 16);
+ fallthrough;
+ case 16:
+ memcpy_flushcache_16(dst, src);
+ return 1;
+
+ case 88: /* odd multiples of 8: 16-byte chunks, then the first 8 bytes */
+ memcpy_flushcache_16(dst + 72, src + 72);
+ fallthrough;
+ case 72:
+ memcpy_flushcache_16(dst + 56, src + 56);
+ fallthrough;
+ case 56:
+ memcpy_flushcache_16(dst + 40, src + 40);
+ fallthrough;
+ case 40:
+ memcpy_flushcache_16(dst + 24, src + 24);
+ fallthrough;
+ case 24:
+ memcpy_flushcache_16(dst + 8, src + 8);
+ fallthrough;
+ case 8:
+ memcpy_flushcache_8(dst, src);
+ return 1;
+
+ case 4:
+ memcpy_flushcache_4(dst, src);
+ return 1;
 }
+
+ return 0; /* size not handled here; nothing was copied */
+}
+
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+ if (__builtin_constant_p(cnt) && memcpy_flushcache_small(dst, src, cnt))
+ return; /* compile-time-constant cnt was copied inline */
 __memcpy_flushcache(dst, src, cnt);
}

--
2.20.1