RE: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast string.

From: Ma, Ling
Date: Wed Nov 11 2009 - 02:06:39 EST


Hi All,
Please use memcpy.c (cc -o memcpy memcpy.c -O2) to test more cases
if you are interested. In this program we made a simple modification
to the memcpy_new function.

Thanks
Ling


>-----Original Message-----
>From: Ingo Molnar [mailto:mingo@xxxxxxx]
>Sent: November 9, 2009 16:09
>To: H. Peter Anvin
>Cc: Ma, Ling; Ingo Molnar; Thomas Gleixner; linux-kernel
>Subject: Re: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast
>string.
>
>
>* H. Peter Anvin <hpa@xxxxxxxxx> wrote:
>
>> On 11/08/2009 11:24 PM, Ma, Ling wrote:
>> > Hi All
>> >
>> > Today we run our benchmark on Core2 and Sandy Bridge:
>> >
>>
>> Hi Ling,
>>
>> Thanks for doing that. Do you also have access to any older CPUs? I
>> suspect that the CPUs Andi is worried about are older ones like the
>> P4, K8, or Pentium M/Core 1. (Andi: please do clarify if you have
>> additional information.)
>>
>> My personal opinion is that if we can show no significant slowdown on
>> P4, K8, P-M/Core 1, Core 2, and Nehalem then we can simply use this
>> code unconditionally. If one of them is radically worse than
>> baseline, then we have to do something conditional, which is a lot
>> more complicated.
>>
>> [Ingo, Thomas: do you agree?]
>
>Yeah. IIRC the worst case was the old P2s, which had really slow,
>microcode-based string ops. (Some of them even had errata in early
>prototypes, although we can certainly ignore those, as string ops get
>relied on quite frequently.)
>
>IIRC the original PPro core came up with some nifty, hardwired string
>ops, but those had to be dumbed down and emulated in microcode due to
>SMP bugs - making it an inferior choice in the end.
>
>But that should be ancient history and i'd suggest we ignore the P4
>dead-end too, unless it's some really big slowdown (which i doubt). If
>anyone cares then some optional assembly implementations could be added
>back.
>
>Ling, if you are interested, could you send a user-space test-app to
>this thread that everyone could just compile and run on various older
>boxes, to gather a performance profile of hand-coded versus string ops
>performance?
>
>( And i think we can make a judgement based on cache-hot performance
> alone - if anything, the string ops will perform comparatively better
> in cache-cold scenarios, so the cache-hot numbers would be a
> conservative estimate. )
>
> Ingo
#include <stdio.h>
#include <stdlib.h>


typedef unsigned long long int hp_timing_t;
#define MAXSAMPLESTPT 100000
#define MAXCOPYSIZE (1024 * 32)
#define ORIG 0
#define NEW 1
static char* buf1 = NULL;
static char* buf2 = NULL;

hp_timing_t _dl_hp_timing_overhead;
#define HP_TIMING_NOW(Var) \
	({ unsigned long long _hi, _lo; \
	   asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
	   (Var) = (_hi << 32) | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End) (Diff) = ((End) - (Start))
#define HP_TIMING_TOTAL(total_time, start, end) \
	do { \
		hp_timing_t tmptime; \
		HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
		total_time += tmptime; \
	} while (0)
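/* Note: _dl_hp_timing_overhead is subtracted by HP_TIMING_TOTAL but is
   never set anywhere, so it stays zero. A minimal sketch of how it could
   be calibrated, taking the minimum over many empty rdtsc-to-rdtsc
   measurements (this helper is an illustrative addition, not part of the
   posted test): */
static void
calibrate_timing_overhead (void)
{
	hp_timing_t start, stop, diff;
	hp_timing_t best = (hp_timing_t) -1;	/* max value, unsigned */
	int i;

	for (i = 0; i < 1000; i++) {
		HP_TIMING_NOW (start);
		HP_TIMING_NOW (stop);
		HP_TIMING_DIFF (diff, start, stop);
		if (diff < best)
			best = diff;
	}
	_dl_hp_timing_overhead = best;
}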

void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);
void (*do_memcpy)(char *dst, char *src, int len);

static void
do_one_throughput (char *dst, char *src, size_t len)
{
	size_t i;
	hp_timing_t start, stop;
	hp_timing_t total_time = (hp_timing_t) 0;

	/* cpuid is a serializing instruction: it keeps earlier work from
	   overlapping into the timed region. */
	__asm__("cpuid" : : : "eax", "ebx", "ecx", "edx");
	for (i = 0; i < MAXSAMPLESTPT; ++i) {
		HP_TIMING_NOW (start);
		do_memcpy (dst, src, (int) len);
		HP_TIMING_NOW (stop);
		HP_TIMING_TOTAL (total_time, start, stop);
	}

	printf ("\t%llu", total_time / MAXSAMPLESTPT);
}

static void
do_tpt_test (size_t align1, size_t align2, size_t len)
{
	char *s1, *s2;

	s1 = buf1 + align1;
	s2 = buf2 + align2;

	printf ("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);
	do_memcpy = memcpy_orig;
	do_one_throughput (s2, s1, len);
	do_memcpy = memcpy_new;
	do_one_throughput (s2, s1, len);

	putchar ('\n');
}

static void
test_init (void)
{
	int i;

	buf1 = valloc (MAXCOPYSIZE);
	buf2 = valloc (MAXCOPYSIZE);

	/* Touch every cache line so both buffers are backed and warm. */
	for (i = 0; i < MAXCOPYSIZE; i += 64)
		buf1[i] = buf2[i] = i & 0xff;
}
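/* Illustrative sanity check (not in the original posting): both variants
   should produce identical bytes before we bother timing them. Assumes
   test_init() has populated buf2; a call such as check_correctness(4097)
   could be placed at the top of main(). */
static int
check_correctness (int len)
{
	char *ref = malloc (len);
	char *out = malloc (len);
	int i, ok = 1;

	memcpy_orig (ref, buf2, len);
	memcpy_new (out, buf2, len);
	for (i = 0; i < len; i++)
		if (ref[i] != out[i])
			ok = 0;
	free (ref);
	free (out);
	return ok;
}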

void memcpy_new(char *dst, char *src, int len)
{
	/* Arguments arrive per the SysV AMD64 ABI: dst in %rdi,
	   src in %rsi, len in %edx. */
	__asm__("movq %rdi, %rax");
	__asm__("movl %edx, %ecx");
	__asm__("shrl $6, %ecx");
	__asm__("jz 2f");

	/* Copies of 1KB and up take the fast-string path below. */
	__asm__("cmp $0x400, %edx");
	__asm__("jae 8f");

	/* Main loop: 64 bytes (eight quadwords) per iteration. */
	__asm__("1:");
	__asm__("decl %ecx");

	__asm__("movq 0*8(%rsi), %r11");
	__asm__("movq 1*8(%rsi), %r8");
	__asm__("movq %r11, 0*8(%rdi)");
	__asm__("movq %r8, 1*8(%rdi)");

	__asm__("movq 2*8(%rsi), %r9");
	__asm__("movq 3*8(%rsi), %r10");
	__asm__("movq %r9, 2*8(%rdi)");
	__asm__("movq %r10, 3*8(%rdi)");

	__asm__("movq 4*8(%rsi), %r11");
	__asm__("movq 5*8(%rsi), %r8");
	__asm__("movq %r11, 4*8(%rdi)");
	__asm__("movq %r8, 5*8(%rdi)");

	__asm__("movq 6*8(%rsi), %r9");
	__asm__("movq 7*8(%rsi), %r10");
	__asm__("movq %r9, 6*8(%rdi)");
	__asm__("movq %r10, 7*8(%rdi)");

	__asm__("leaq 64(%rsi), %rsi");
	__asm__("leaq 64(%rdi), %rdi");

	__asm__("jnz 1b");

	/* Tail: copy the remaining quadwords. */
	__asm__("2:");
	__asm__("movl %edx, %ecx");
	__asm__("andl $63, %ecx");
	__asm__("shrl $3, %ecx");
	__asm__("jz 5f");

	__asm__("3:");
	__asm__("decl %ecx");
	__asm__("movq (%rsi), %r8");
	__asm__("movq %r8, (%rdi)");
	__asm__("leaq 8(%rdi), %rdi");
	__asm__("leaq 8(%rsi), %rsi");
	__asm__("jnz 3b");

	/* Tail: copy the remaining bytes. */
	__asm__("5:");
	__asm__("movl %edx, %ecx");
	__asm__("andl $7, %ecx");
	__asm__("jz 7f");

	__asm__("6:");
	__asm__("movb (%rsi), %r8b");
	__asm__("movb %r8b, (%rdi)");
	__asm__("incq %rdi");
	__asm__("incq %rsi");
	__asm__("decl %ecx");
	__asm__("jnz 6b");

	__asm__("7:");
	__asm__("retq");

	/* Fast-string path: rep movsq for the quadwords,
	   rep movsb for the remainder. */
	__asm__("8:");
	__asm__("movl %edx, %ecx");
	__asm__("shrl $3, %ecx");
	__asm__("andl $7, %edx");
	__asm__("rep movsq");
	__asm__("jz 9f");
	__asm__("movl %edx, %ecx");
	__asm__("rep movsb");

	__asm__("9:");
}
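/* For readability, memcpy_new above is structured roughly like the
   following plain-C sketch (illustrative only, not a drop-in
   replacement): a 64-byte unrolled copy loop for short buffers, with
   copies of 1KB and up handed to the CPU's fast string operations
   (rep movsq / rep movsb), modeled here with library memcpy. */
#include <string.h>
static void
memcpy_new_sketch (char *dst, const char *src, int len)
{
	if (len >= 0x400) {
		/* Stands in for the rep movsq + rep movsb path. */
		memcpy (dst, src, len);
		return;
	}
	while (len >= 64) {	/* eight quadwords per iteration */
		const long *s = (const long *) src;
		long *d = (long *) dst;
		d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
		d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
		src += 64; dst += 64; len -= 64;
	}
	while (len >= 8) {	/* quadword tail */
		*(long *) dst = *(const long *) src;
		dst += 8; src += 8; len -= 8;
	}
	while (len-- > 0)	/* byte tail */
		*dst++ = *src++;
}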
void memcpy_orig(char *dst, char *src, int len)
{
	/* Arguments arrive per the SysV AMD64 ABI: dst in %rdi,
	   src in %rsi, len in %edx. */
	__asm__("movq %rdi, %rax");
	__asm__("movl %edx, %ecx");
	__asm__("shrl $6, %ecx");
	__asm__("jz 2f");

	/* Main loop: 64 bytes (eight quadwords) per iteration. */
	__asm__("1:");
	__asm__("decl %ecx");

	__asm__("movq 0*8(%rsi), %r11");
	__asm__("movq 1*8(%rsi), %r8");
	__asm__("movq %r11, 0*8(%rdi)");
	__asm__("movq %r8, 1*8(%rdi)");

	__asm__("movq 2*8(%rsi), %r9");
	__asm__("movq 3*8(%rsi), %r10");
	__asm__("movq %r9, 2*8(%rdi)");
	__asm__("movq %r10, 3*8(%rdi)");

	__asm__("movq 4*8(%rsi), %r11");
	__asm__("movq 5*8(%rsi), %r8");
	__asm__("movq %r11, 4*8(%rdi)");
	__asm__("movq %r8, 5*8(%rdi)");

	__asm__("movq 6*8(%rsi), %r9");
	__asm__("movq 7*8(%rsi), %r10");
	__asm__("movq %r9, 6*8(%rdi)");
	__asm__("movq %r10, 7*8(%rdi)");

	__asm__("leaq 64(%rsi), %rsi");
	__asm__("leaq 64(%rdi), %rdi");

	__asm__("jnz 1b");

	/* Tail: copy the remaining quadwords. */
	__asm__("2:");
	__asm__("movl %edx, %ecx");
	__asm__("andl $63, %ecx");
	__asm__("shrl $3, %ecx");
	__asm__("jz 5f");

	__asm__("3:");
	__asm__("decl %ecx");
	__asm__("movq (%rsi), %r8");
	__asm__("movq %r8, (%rdi)");
	__asm__("leaq 8(%rdi), %rdi");
	__asm__("leaq 8(%rsi), %rsi");
	__asm__("jnz 3b");

	/* Tail: copy the remaining bytes. */
	__asm__("5:");
	__asm__("movl %edx, %ecx");
	__asm__("andl $7, %ecx");
	__asm__("jz 7f");

	__asm__("6:");
	__asm__("movb (%rsi), %r8b");
	__asm__("movb %r8b, (%rdi)");
	__asm__("incq %rdi");
	__asm__("incq %rsi");
	__asm__("decl %ecx");
	__asm__("jnz 6b");

	__asm__("7:");
	__asm__("retq");
}


int main(void)
{
	int i;

	test_init ();
	printf ("%23s", "");
	printf ("\t%s\t%s\n", "memcpy_orig", "memcpy_new");

	for (i = 1024; i < 1024 * 16; i += 1024)
		do_tpt_test (8, 0, i);

	return 0;
}