Re: [PATCH] x86/asm/entry/64: better check for canonical address

From: Denys Vlasenko
Date: Fri Mar 27 2015 - 08:14:47 EST


On 03/27/2015 12:34 PM, Ingo Molnar wrote:
>
> * Brian Gerst <brgerst@xxxxxxxxx> wrote:
>
>>> Btw., there's a neat trick we could do: in the HLT, MWAIT and
>>> ACPI-idle code we could attempt to set up RCX to match RIP, to
>>> trigger this optimization in the common 'irq interrupted the idle
>>> task' case?
>>
>> sysret only returns to CPL3.
>
> Indeed, an IRET ought to be pretty cheap for same-ring interrupt
> returns in any case.

Unfortunately, it is not. Try the attached program.

On this CPU, 1 ns ~= 3 cycles.

$ ./timing_test64 callret
10000 loops in 0.00008s = 7.87 nsec/loop for callret
100000 loops in 0.00076s = 7.56 nsec/loop for callret
1000000 loops in 0.00548s = 5.48 nsec/loop for callret
10000000 loops in 0.02882s = 2.88 nsec/loop for callret
100000000 loops in 0.18334s = 1.83 nsec/loop for callret
200000000 loops in 0.36051s = 1.80 nsec/loop for callret
400000000 loops in 0.71632s = 1.79 nsec/loop for callret

Near call + near ret = 5 cycles

$ ./timing_test64 lret
10000 loops in 0.00034s = 33.95 nsec/loop for lret
100000 loops in 0.00328s = 32.83 nsec/loop for lret
1000000 loops in 0.04541s = 45.41 nsec/loop for lret
10000000 loops in 0.32130s = 32.13 nsec/loop for lret
20000000 loops in 0.64191s = 32.10 nsec/loop for lret

push my_cs + push next_label + far ret = ~90 cycles

$ ./timing_test64 iret
10000 loops in 0.00344s = 343.90 nsec/loop for iret
100000 loops in 0.01890s = 188.97 nsec/loop for iret
1000000 loops in 0.08228s = 82.28 nsec/loop for iret
10000000 loops in 0.77910s = 77.91 nsec/loop for iret

This is the "same-ring interrupt return". ~230 cycles! :(


// To be unaffected by random cacheline placement, use generous "align":
//
// i686-gcc -O2 -Wall -falign-loops=32 -falign-jumps=32 -falign-labels=32 -static
// x86_64-gcc -O2 -Wall -falign-loops=32 -falign-jumps=32 -falign-labels=32 -static

#include <inttypes.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <stdio.h>

#if !defined(__i386__)
/* The sysenter test only exists on i386; stub the lookup out elsewhere. */
#define get_sysenter_addr() 0
#else
#include <elf.h>
/* Cached address of the kernel's vDSO entry point (AT_SYSINFO). */
long sysenter_addr;
/*
 * Locate the vDSO syscall entry point from the ELF auxiliary vector,
 * which the kernel places in memory immediately after the environment
 * strings' pointer array (hence: walk envp to its terminating NULL).
 * Returns the address and caches it in sysenter_addr.
 * Exits (status 0, deliberately not a failure) when the kernel did not
 * supply AT_SYSINFO, since the sysenter test cannot run then.
 */
long get_sysenter_addr(char **envp)
{
Elf32_auxv_t *auxv;
while (*envp++ != NULL)
continue;
for (auxv = (void *)envp; auxv->a_type != AT_NULL; auxv++)
if( auxv->a_type == AT_SYSINFO)
return (sysenter_addr = auxv->a_un.a_val);
fprintf(stderr, "AT_SYSINFO not supplied, can't test\n");
exit(0); /* this is not a failure */
}

/*
 * Issue getpid (i386 syscall number 20) through the vDSO entry point,
 * i.e. via sysenter rather than int $0x80.
 * Assumes get_sysenter_addr() has already populated sysenter_addr.
 */
void sysenter_getpid(void)
{
asm volatile(
"\n" " mov $20,%eax" // GETPID
"\n" " call *sysenter_addr"
);
}
#endif

/*
 * Width-dependent spelling helpers for the inline asm below:
 * L_or_Q is the operand-size mnemonic suffix ("l" on i386, "q" on
 * x86_64, e.g. lretl vs lretq); E_or_R is the register-name prefix
 * ("e"ax vs "r"ax).
 */
#if defined(__i386__)
#define L_or_Q "l"
#define E_or_R "e"
#else
#define L_or_Q "q"
#define E_or_R "r"
#endif

/*
 * A bare "ret" at a known label: the target of the "callret" test,
 * so each iteration is exactly one near call + one near ret.
 */
asm (
"\n" " .text"
"\n" "ret__: ret"
);

/*
 * Micro-benchmark driver.
 *
 * Usage: timing_test [MILLIONS_OF_ITERATIONS] MODE
 *
 * With both arguments, runs MODE exactly ITERATIONS*1e6 times.
 * With only MODE, starts at 10000 iterations and auto-scales the
 * count (see bottom) until a run is long enough to time reliably.
 * Each branch samples CLOCK_MONOTONIC immediately before its loop;
 * the end timestamp is taken once, after the if/else chain.
 * Returns 0 on success, 1 on usage error or unknown mode.
 */
int main(int argc, char **argv, char **envp)
{
struct timespec start, end;
unsigned long long duration;
size_t loops, i;
const char *mode;

if (argc < 2) {
printf("Usage: timing_test [MILLIONS_OF_ITERATIONS] MODE\n");
return 1;
}
// argv[argc] is NULL per the C standard, so with a single argument
// argv[2] is NULL: treat argv[1] as MODE and start with a small count.
mode = argv[2];
if (!mode) {
mode = argv[1];
loops = 10*1000;
} else {
loops = (size_t)atol(argv[1]) * 1000000;
}

again:
// Baseline: an empty loop body (asm comment only, emits no code).
if (!strcmp(mode, "nothing")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile ("# nothing");
}
// One nop per iteration.
} else if (!strcmp(mode, "nop")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile ("nop");
}
// rdtsc alone (unserialized; may overlap with neighbors).
} else if (!strcmp(mode, "rdtsc")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
unsigned int a, d;
asm volatile ("rdtsc" : "=a" (a), "=d" (d));
}
// rdtsc fenced on the front only.
} else if (!strcmp(mode, "lfence_rdtsc")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
unsigned int a, d;
asm volatile ("lfence;rdtsc" : "=a" (a), "=d" (d));
}
// rdtsc fenced on both sides with lfence.
} else if (!strcmp(mode, "lfence_rdtsc_lfence")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
unsigned int a, d;
// NOTE(review): this empty asm appears to be a leftover no-op.
asm volatile ("");
asm volatile ("lfence;rdtsc;lfence" : "=a" (a), "=d" (d));
}
// Same, but with the heavier mfence.
} else if (!strcmp(mode, "mfence_rdtsc_mfence")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
unsigned int a, d;
asm volatile ("mfence;rdtsc;mfence" : "=a" (a), "=d" (d));
}
// rdtscp (also returns TSC_AUX in ecx).
} else if (!strcmp(mode, "rdtscp")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
unsigned int a, c, d;
asm volatile ("rdtscp" : "=a" (a), "=c" (c), "=d" (d));
}
// gettimeofday through libc (vDSO fast path on most kernels).
} else if (!strcmp(mode, "gettimeofday")) {
struct timeval tv;
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--)
gettimeofday(&tv, 0);
// A real kernel round-trip: getpid via the syscall(2) wrapper.
} else if (!strcmp(mode, "getpid")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--)
syscall(SYS_getpid);
#if defined(__i386__)
// getpid via the vDSO sysenter entry point (i386 only).
} else if (!strcmp(mode, "sysenter_getpid")) {
get_sysenter_addr(envp);
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--)
sysenter_getpid();
// Same-ring iret on i386: build the 3-word frame (EFLAGS, CS, EIP)
// and return to the label right after the iret.
} else if (!strcmp(mode, "iret")) {
/* "push cs" is itself a bit expensive, moving it out of loop */
long saved_cs;
asm volatile ("mov %%cs,%0" : "=r" (saved_cs));
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " push $0" // flags
"\n" " push %0" // cs
"\n" " push $1f" // ip
"\n" " iret"
"\n" "1:"
:
: "r" (saved_cs)
);
}
#endif
#if defined(__x86_64__)
// Same-ring iretq on x86_64: the frame is always 5 words here
// (SS, RSP, RFLAGS, CS, RIP). RSP is captured into RAX before the
// pushes, since the pushes themselves move RSP. Flags are pushed
// as 0 — presumably acceptable in ring 3; TODO confirm reserved
// bit handling is a non-issue on the CPUs of interest.
} else if (!strcmp(mode, "iret")) {
long saved_cs;
long saved_ss;
asm volatile ("mov %%cs,%0" : "=r" (saved_cs));
asm volatile ("mov %%ss,%0" : "=r" (saved_ss));
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " mov %%rsp,%%rax"
"\n" " push %0" // ss
"\n" " push %%rax" // sp
"\n" " push $0" // flags
"\n" " push %1" // cs
"\n" " push $1f" // ip
"\n" " iretq"
"\n" "1:"
:
: "r" (saved_ss), "r" (saved_cs)
: "ax"
);
}
#endif
// Far return: push CS and the target, lretl/lretq to the next label.
} else if (!strcmp(mode, "lret")) {
/* "push cs" is itself a bit expensive, moving it out of loop */
long saved_cs;
asm volatile ("mov %%cs,%0" : "=r" (saved_cs));
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " push %0"
"\n" " push $1f"
"\n" " lret"L_or_Q
"\n" "1:"
:
: "r" (saved_cs)
);
}
// Paired near call + near ret to the ret__ stub defined above.
} else if (!strcmp(mode, "callret")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile ("call ret__");
}
} else if (!strcmp(mode, "ret")) {
/* This is useful to measure delays due to
* return stack branch prediction not working
* (we aren't using paired call/rets here, as CPU expects).
* I observed "callret" test above being 4 times faster than this:
*/
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " push $1f"
"\n" " ret"
"\n" "1:"
);
}
// Reload SS with its current value (sampled once outside the loop).
} else if (!strcmp(mode, "loadss")) {
long saved_ss;
asm volatile ("mov %%ss,%0" : "=r" (saved_ss));
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile ("mov %0,%%ss" : : "r" (saved_ss));
}
// pushf + pop into a scratch register.
} else if (!strcmp(mode, "pushf")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " pushf"
"\n" " pop %%"E_or_R"ax"
:
:
: "ax"
);
}
// popf of the current flags value (sampled once outside the loop).
} else if (!strcmp(mode, "popf")) {
long flags;
asm volatile (
"\n" " pushf"
"\n" " pop %0"
: "=r" (flags)
);
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " push %0"
"\n" " popf"
:
: "r" (flags)
: "ax"
);
}
// rdpmc of counter 0; faults (#GP) unless CR4.PCE is set.
} else if (!strcmp(mode, "rdpmc")) {
// Unlikely to work.
unsigned int eax, edx;
unsigned int ecx = 0;
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--)
asm volatile ("rdpmc" : "=a" (eax), "=d" (edx) : "c" (ecx));
} else {
printf("Unknown mode %s\n", mode);
return 1;
}

// Single end timestamp for whichever branch ran.
clock_gettime(CLOCK_MONOTONIC, &end);
duration = (1000*1000*1000ULL * end.tv_sec + end.tv_nsec)
- (1000*1000*1000ULL * start.tv_sec + start.tv_nsec);
printf("%lu loops in %.5fs = %.2f nsec/loop for %s\n",
(unsigned long)loops, (double)duration * 1e-9,
(double)duration / loops,
mode
);
// Auto-scaling (only when no explicit count was given): grow the
// loop count 10x while the run is under 90 ms, then 2x while under
// 490 ms, and rerun, so short runs converge on a stable measurement.
if (!argv[2]) {
if (duration < 90*1000*1000) {
loops *= 10;
goto again;
}
if (duration < 490*1000*1000) {
loops *= 2;
goto again;
}
}
return 0;
}