Re: [PATCH] x86: handle the tail in rep_movs_alternative() with an overlapping store

From: Mateusz Guzik
Date: Wed Mar 26 2025 - 19:01:16 EST


On Tue, Mar 25, 2025 at 11:42 PM Herton Krzesinski <hkrzesin@xxxxxxxxxx> wrote:
>
> On Fri, Mar 21, 2025 at 5:47 PM David Laight
> <david.laight.linux@xxxxxxxxx> wrote:
> >
> > On Thu, 20 Mar 2025 16:53:32 -0700
> > Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
> >
> > > On Thu, 20 Mar 2025 at 14:17, Linus Torvalds
> > > <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
> > > >
> > > > On Thu, 20 Mar 2025 at 12:33, Mateusz Guzik <mjguzik@xxxxxxxxx> wrote:
> > > > >
> > > > > I have a recollection that handling the tail after rep movsq with an
> > > > > overlapping store suffered a penalty big enough to warrant a
> > > > > "normal" copy instead, avoiding the just-written area.
> > > >
> > > > Ahh. Good point. The rep movsq might indeed end up having odd effects
> > > > with subsequent aliasing memory operations.
> > > >
> > > > Consider myself convinced.
> > >
> > > Actually, I think there's a solution for this.
> > >
> > > Do not do the last 0-7 bytes as a word that overlaps with the tail of
> > > the 'rep movs'.
> > >
> > > Do the last 8-15 bytes *non-overlapping* (well, they overlap each
> > > other, but not the 'rep movs').
> > >
> > > Something UNTESTED like the appended, in other words. The large case
> > > then ends up without any conditionals, looking something like this:
> > >
> > > mov %rcx,%rax
> > > shr $0x3,%rcx
> > > dec %rcx
> > > and $0x7,%eax
> > > rep movsq %ds:(%rsi),%es:(%rdi)
> > > mov (%rsi),%rcx
> > > mov %rcx,(%rdi)
> > > mov (%rsi,%rax,1),%rcx
> > > mov %rcx,(%rdi,%rax,1)
> > > xor %ecx,%ecx
> > > ret
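
To make the tail trick concrete, a rough C sketch of what that sequence
does (untested and simplified: plain memcpy() stands in for 'rep movsq'
and for the unaligned 8-byte moves, there is no STAC/CLAC or fault
handling, and it assumes len >= 8 with non-overlapping buffers) could
look like this:

#include <stddef.h>
#include <string.h>

static void copy_large(void *dst, const void *src, size_t len)
{
	size_t words = len / 8 - 1;	/* quadwords for 'rep movsq' */
	size_t rem = len & 7;		/* 0..7 bytes past those words */
	unsigned char *d = dst;
	const unsigned char *s = src;
	unsigned long long tmp;

	memcpy(d, s, words * 8);	/* stands in for 'rep movsq' */
	d += words * 8;
	s += words * 8;

	/*
	 * Two 8-byte moves: they may overlap each other,
	 * but never the area 'rep movsq' just wrote.
	 */
	memcpy(&tmp, s, 8);
	memcpy(d, &tmp, 8);
	memcpy(&tmp, s + rem, 8);
	memcpy(d + rem, &tmp, 8);
}
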
> >
> > I think you can save the 'tail end' from copying the same 8 bytes twice by doing:
> > sub $9,%rcx
> > mov %rcx,%rax
> > shr $3,%rcx
> > and $7,%rax
> > inc %rax
> > before the 'rep movsq'.
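
If I follow the arithmetic, with e.g. len = 40: Linus's version does
rcx = 40/8 - 1 = 4 quadwords (bytes 0-31) and rax = 40 & 7 = 0, so the
two trailing 8-byte moves both write bytes 32-39. With the sub $9
variant, rcx = (40-9)/8 = 3 quadwords (bytes 0-23) and
rax = ((40-9) & 7) + 1 = 8, so the trailing moves cover bytes 24-31 and
32-39 -- 3*8 + 8 + 8 = 40 bytes in total, with nothing written twice.
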
>
> Not sure how the above will work for handling the remainder in %rax?
>
> Anyway, another version could look like the one below, to avoid
> the rep movs penalty? Not sure if doing the tail copy before the rep movsq would be ok?
>
> index fc9fb5d06174..a0f9655e364c 100644
> --- a/arch/x86/lib/copy_user_64.S
> +++ b/arch/x86/lib/copy_user_64.S
> @@ -62,10 +62,15 @@ SYM_FUNC_START(rep_movs_alternative)
> je .Lexit
> cmp $8,%ecx
> jae .Lword
> - jmp .Lcopy_user_tail
> +4: movq -8(%rsi,%rcx),%rax
> +5: movq %rax,-8(%rdi,%rcx)
> + xorl %ecx,%ecx
> + RET
>
> _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
> _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
> + _ASM_EXTABLE_UA( 4b, .Lcopy_user_tail)
> + _ASM_EXTABLE_UA( 5b, .Lcopy_user_tail)
>
> .Llarge:
> 0: ALTERNATIVE "jmp .Llarge_movsq", "rep movsb", X86_FEATURE_ERMS
> @@ -74,18 +79,20 @@ SYM_FUNC_START(rep_movs_alternative)
> _ASM_EXTABLE_UA( 0b, 1b)
>
> .Llarge_movsq:
> + /* copy tail bytes first, to avoid overlapping
> + penalty with rep movsq */
> +0: movq -8(%rsi,%rcx),%rax
> +1: movq %rax,-8(%rdi,%rcx)
> movq %rcx,%rax
> shrq $3,%rcx
> - andl $7,%eax
> -0: rep movsq
> - movl %eax,%ecx
> - testl %ecx,%ecx
> - jne .Lcopy_user_tail
> +2: rep movsq
> + xorl %ecx,%ecx
> RET
> -
> -1: leaq (%rax,%rcx,8),%rcx
> +3: movq %rax,%rcx
> jmp .Lcopy_user_tail
>
> - _ASM_EXTABLE_UA( 0b, 1b)
> + _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail)
> + _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail)
> + _ASM_EXTABLE_UA( 2b, 3b)
> SYM_FUNC_END(rep_movs_alternative)
> EXPORT_SYMBOL(rep_movs_alternative)
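
For the .Llarge_movsq part of the patch above, the resulting shape in
rough C (untested, simplified, no fault handling, same headers as the
sketch further up, assuming len >= 8) is: write the overlapping tail
word before the bulk copy, so no store aliases an area that rep movsq
has just written.

static void copy_large_tail_first(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	unsigned long long tmp;

	/*
	 * Last 8 bytes first: this overlaps what the bulk copy will
	 * write, but nothing has been stored there yet.
	 */
	memcpy(&tmp, s + len - 8, 8);
	memcpy(d + len - 8, &tmp, 8);

	memcpy(d, s, (len / 8) * 8);	/* stands in for 'rep movsq' */
}
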
>
>
>
> I have also been trying to measure the impact of changes like the above;
> however, it seems I either don't get an improvement or it is limited due to
> the impact of profiling. To measure it, I tried to uninline/move
> copy_user_generic() like this:
>
> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
> index c52f0133425b..2ae442c8a4b5 100644
> --- a/arch/x86/include/asm/uaccess_64.h
> +++ b/arch/x86/include/asm/uaccess_64.h
> @@ -115,25 +115,8 @@ static inline bool __access_ok(const void __user *ptr, unsigned long size)
> __must_check unsigned long
> rep_movs_alternative(void *to, const void *from, unsigned len);
>
> -static __always_inline __must_check unsigned long
> -copy_user_generic(void *to, const void *from, unsigned long len)
> -{
> - stac();
> - /*
> - * If CPU has FSRM feature, use 'rep movs'.
> - * Otherwise, use rep_movs_alternative.
> - */
> - asm volatile(
> - "1:\n\t"
> - ALTERNATIVE("rep movsb",
> - "call rep_movs_alternative",
> ALT_NOT(X86_FEATURE_FSRM))
> - "2:\n"
> - _ASM_EXTABLE_UA(1b, 2b)
> - :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
> - : : "memory", "rax");
> - clac();
> - return len;
> -}
> +__must_check unsigned long
> +copy_user_generic(void *to, const void *from, unsigned long len);
>
> static __always_inline __must_check unsigned long
> raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
> diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
> index e9251b89a9e9..4585349f8f33 100644
> --- a/arch/x86/lib/usercopy_64.c
> +++ b/arch/x86/lib/usercopy_64.c
> @@ -142,3 +142,24 @@ void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
> }
> EXPORT_SYMBOL_GPL(__memcpy_flushcache);
> #endif
> +
> +__must_check unsigned long
> +copy_user_generic(void *to, const void *from, unsigned long len)
> +{
> + stac();
> + /*
> + * If CPU has FSRM feature, use 'rep movs'.
> + * Otherwise, use rep_movs_alternative.
> + */
> + asm volatile(
> + "1:\n\t"
> + ALTERNATIVE("rep movsb",
> + "call rep_movs_alternative",
> ALT_NOT(X86_FEATURE_FSRM))
> + "2:\n"
> + _ASM_EXTABLE_UA(1b, 2b)
> + :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
> + : : "memory", "rax");
> + clac();
> + return len;
> +}
> +EXPORT_SYMBOL_GPL(copy_user_generic);
>
>
> And then I used bpftrace with this script to try to measure execution times:
>
> #############################
> fentry:copy_user_generic
> /strcontains(comm,"iperf3")/
> {
> /*printf("start %ul %p\n", args.len, kptr(args.to));*/
> @start[kptr(args.to),args.len] = nsecs;
> }
>
> fexit:copy_user_generic
> /strcontains(comm,"iperf3") && @start[kptr(args.to)-args.len,args.len]/
> {
> /*printf("end %ul %p\n", args.len, kptr(args.to)-args.len);*/
>
> $len = args.len;
> $len >>= 1;
> $log_len = 0;
> while ($len) {
> $len >>= 1;
> $log_len++;
> }
> $log1 = 1;
> $log1 <<= $log_len;
> $log2 = $log1;
> $log2 <<= 1;
> $dalign = (uint64)(kptr(args.to) - args.len);
> $dalign &= 0x7;
>
> @us[$dalign,$log1,$log2] = hist((nsecs -
> @start[kptr(args.to)-args.len,args.len]));
> delete(@start, (kptr(args.to)-args.len,args.len))
> }
>
> END
> {
> clear(@start);
> }
> #############################
>
> But the results are mixed, at least in the case of this change; I can't
> prove an improvement with it.
>

I suspect going through eBPF here has enough probe effect to overshadow any impact of the change itself.

You may have a better shot issuing rdtscp before and after and then
calling a dummy probe with the difference -- that way all the eBPF
overhead is incurred outside of the measured area. But this still does not
account for migration between CPUs.
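
Something along these lines, say (untested sketch;
copy_user_timing_probe() and copy_user_generic_timed() are made-up
names purely for illustration, wrapping the uninlined
copy_user_generic()):

#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/uaccess.h>

/* rough serialized-enough TSC read for a quick measurement */
static inline unsigned long long rdtscp_cycles(void)
{
	unsigned long long lo, hi;
	unsigned int aux;

	asm volatile("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
	return (hi << 32) | lo;
}

/*
 * Empty on purpose: attach the bpftrace fentry probe here, so all of
 * the eBPF overhead lands outside of the timed region.
 */
noinline void copy_user_timing_probe(unsigned long len, unsigned long long cycles)
{
	barrier();
}
EXPORT_SYMBOL_GPL(copy_user_timing_probe);

unsigned long copy_user_generic_timed(void *to, const void *from,
				      unsigned long len)
{
	unsigned long long start, end;
	unsigned long ret;

	start = rdtscp_cycles();
	ret = copy_user_generic(to, from, len);
	end = rdtscp_cycles();

	copy_user_timing_probe(len, end - start);
	return ret;
}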

However, all of this convinced me to dig up an (unfinished, but close)
test jig I had for these routines.

While it neglects cache effects (as in, it lets things remain cache-hot),
it issues ops based on a random seed, making sure to screw with the
branch predictor. This is not perfect by any means, but it should be good
enough to justify some of the changes (namely sorting out memset and the
fact that this routine does not use overlapping stores). I can't promise
any specific timeline for it though, apart from this being a matter of
weeks (best case this weekend).
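
For the general shape of the thing (not the actual jig, just a rough
userspace stand-in, untested, with memcpy() in place of the kernel
routine under test):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>		/* __rdtscp() */

#define BUF_SIZE	4096
#define ITERS		(1 << 20)

int main(void)
{
	static unsigned char src[BUF_SIZE], dst[BUF_SIZE];
	unsigned int aux;
	uint64_t start, end;

	srandom(12345);		/* fixed seed: same op mix on every run */

	/* touch the buffers up front so the loop stays cache-hot */
	memset(src, 0xaa, sizeof(src));
	memset(dst, 0x55, sizeof(dst));

	start = __rdtscp(&aux);
	for (long i = 0; i < ITERS; i++) {
		/* varying sizes keep the branch predictor guessing */
		size_t len = 1 + (random() % BUF_SIZE);
		memcpy(dst, src, len);
	}
	end = __rdtscp(&aux);

	printf("%.2f cycles/op on average\n", (double)(end - start) / ITERS);
	return 0;
}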

--
Mateusz Guzik <mjguzik gmail.com>