Re: [RFC] Improve memset

From: Borislav Petkov
Date: Fri Sep 13 2019 - 12:36:58 EST


On Fri, Sep 13, 2019 at 12:42:32PM +0200, Borislav Petkov wrote:
> Or should we talk to Intel hw folks about it...

Or, I can do something like this, while waiting. Benchmark at the end.

The numbers are from a KBL box:

model : 158
model name : Intel(R) Core(TM) i5-9600K CPU @ 3.70GHz
stepping : 12

and if I'm not doing anything wrong with the benchmark (the asm looks
ok but I could very well be missing something), the numbers say that
the REP; STOSB is better from sizes of 8 and upwards and up to two
cachelines we're pretty much on-par with the builtin variant.

But after that we're pretty close too.

But this is just a single uarch which is pretty new, I guess the 32 or
64 bytes cap should be good enough for most, especially those where REP;
STOSB isn't as good.

Hmm.

Loops: 1000000
size builtin_memset rep;stosb memset_rep
0 46 85 131
8 88 65 104
16 66 65 103
24 66 65 103
32 66 65 104
40 66 65 104
48 66 65 103
56 66 65 103
64 66 65 103
72 66 65 103
80 66 65 103
88 66 65 103
96 66 65 103
104 66 65 103
112 66 65 103
120 66 65 103
128 66 65 103
136 66 71 108
144 73 71 108
152 72 71 108
160 73 71 108
168 73 71 108
176 73 71 108
184 73 71 108
192 72 74 107
200 75 78 116
208 80 78 116
216 80 78 116
224 80 78 116
232 80 78 116
240 80 78 116
248 80 78 116


---
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned long long u64;

#define DECLARE_ARGS(val, low, high) unsigned low, high
#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)

static __always_inline unsigned long long rdtsc(void)
{
DECLARE_ARGS(val, low, high);

asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

return EAX_EDX_VAL(val, low, high);
}

/**
* rdtsc_ordered() - read the current TSC in program order
*
* rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
* It is ordered like a load to a global in-memory counter. It should
* be impossible to observe non-monotonic rdtsc_unordered() behavior
* across multiple CPUs as long as the TSC is synced.
*/
static __always_inline unsigned long long rdtsc_ordered(void)
{
/*
* The RDTSC instruction is not ordered relative to memory
* access. The Intel SDM and the AMD APM are both vague on this
* point, but empirically an RDTSC instruction can be
* speculatively executed before prior loads. An RDTSC
* immediately after an appropriate barrier appears to be
* ordered as a normal load, that is, it provides the same
* ordering guarantees as reading from a global memory location
* that some other imaginary CPU is updating continuously with a
* time stamp.
*/
asm volatile("mfence");
return rdtsc();
}

static __always_inline void *rep_stosb(void *dest, int c, size_t n)
{
void *ret, *dummy;

asm volatile("rep; stosb"
: "=&D" (ret), "=c" (dummy)
: "0" (dest), "a" (c), "c" (n)
: "memory");

return ret;
}

static __always_inline void *memset_rep(void *dest, int c, size_t n)
{
void *ret, *dummy;

asm volatile("push %%rdi\n\t"
"mov %%rax, %%rsi\n\t"
"mov %%rcx, %%rdx\n\t"
"andl $7,%%edx\n\t"
"shrq $3,%%rcx\n\t"
"movzbl %%sil,%%esi\n\t"
"movabs $0x0101010101010101,%%rax\n\t"
"imulq %%rsi,%%rax\n\t"
"rep stosq\n\t"
"movl %%edx,%%ecx\n\t"
"rep stosb\n\t"
"pop %%rax\n\t"
: "=&D" (ret), "=c" (dummy)
: "0" (dest), "a" (c), "c" (n)
: "rsi", "rdx", "memory");

return ret;
}

#define BUF_SIZE 1024
static void bench_memset(unsigned int loops)
{
u64 p0, p1, m_cycles = 0, r_cycles = 0, rep_cyc = 0;
int i, s;
void *dst;

dst = malloc(BUF_SIZE);
if (!dst)
perror("malloc");

printf("Loops: %d\n", loops);
printf("size\t\tbuiltin_memset\trep;stosb\tmemset_rep\n");

for (s = 0; s < BUF_SIZE; s += 16) {
for (i = 0; i < loops; i++) {
p0 = rdtsc_ordered();
__builtin_memset(dst, 0, s);
p1 = rdtsc_ordered();

m_cycles += p1 - p0;
}

for (i = 0; i < loops; i++) {
p0 = rdtsc_ordered();
rep_stosb(dst, 0, s);
p1 = rdtsc_ordered();

r_cycles += p1 - p0;
}

for (i = 0; i < loops; i++) {
p0 = rdtsc_ordered();
memset_rep(dst, 0, s);
p1 = rdtsc_ordered();

rep_cyc += p1 - p0;
}


m_cycles /= loops;
r_cycles /= loops;
rep_cyc /= loops;

printf("%d\t\t%llu\t\t%llu\t\t%llu\n", s, m_cycles, r_cycles, rep_cyc);
m_cycles = r_cycles = rep_cyc = 0;
}

free(dst);
}

int main(void)
{
bench_memset(1000000);

return 0;
}


--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette