Re: [PATCH RFC] [INET]: Get critical word in first 64bit of cache line
From: Ling Ma
Date: Sun Dec 02 2012 - 08:25:00 EST
Hi Eric,
Attached is the benchmark test-cwf.c (build with: cc -o test-cwf test-cwf.c). The result
shows that when the last level cache (LLC) misses and the CPU fetches data from
memory, placing the critical word in the first 64 bits of the cache line gives better
performance (158290336 cycles) than placing it elsewhere in the cache line (offset 0x10,
164100732 cycles); performance improves by about 3.6%
in this case.
The cpu-info file is also attached.
Thanks
Ling
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#include<unistd.h>
/* Number of nodes in the pointer-chasing chain built by init_buf(). */
#define MAX_BUF_NUM (1 << 20)
/* Size in bytes of one node, i.e. the stride between consecutive nodes. */
#define MAX_BUF_SIZE (1 << 8)
/* Byte offset inside each node where the second pointer chain is stored. */
#define ACCESS_OFFSET (0x10)
/*
 * Store the current x86 time-stamp counter in Var.
 *
 * RDTSC returns the low 32 bits in EAX and the high 32 bits in EDX; the two
 * halves are combined into a single 64-bit value.  The __asm__/__volatile__
 * spellings are used (rather than bare asm/volatile) so the macro also
 * compiles under strict -std=c99/-std=c11, matching the rest of the file.
 */
#define HP_TIMING_NOW(Var) \
    ({ unsigned long long _hi, _lo; \
       __asm__ __volatile__ ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
       (Var) = _hi << 32 | _lo; })
/* Number of timed traversals; the fastest one is reported. */
#define repeat_times (64)
/*
 * Allocate a page-aligned buffer of MAX_BUF_NUM nodes of MAX_BUF_SIZE bytes
 * each and link the nodes into two parallel pointer chains: one stored at
 * offset 0 of every node and one stored at offset ACCESS_OFFSET.
 *
 * The chain zig-zags between the two ends of the buffer
 * (first -> last -> second -> second-to-last -> ...), so every hop lands
 * far away from the previous one.  The final node of each chain holds NULL.
 *
 * On return *buf points at the first node.  The pointer originally returned
 * by malloc() is not kept, so the buffer cannot be passed to free(); it
 * lives until the process exits.  On allocation failure a message is printed
 * and the process exits with status 1.
 */
static void init_buf(char **buf)
{
    char *start;
    char *end;
    unsigned long pagesize = (unsigned long)getpagesize();
    size_t total = (size_t)MAX_BUF_SIZE * MAX_BUF_NUM;

    /* One extra page of slack so the start can be rounded to a page boundary. */
    *buf = malloc(total + pagesize);
    if (*buf == NULL) {
        printf("\nfait to malloc space!\n");
        exit(1);
    }

    /* Advance by one page, then round down: the result is page-aligned and
     * still leaves `total` usable bytes inside the allocation. */
    *buf = (char *)(((unsigned long)(*buf + pagesize)) & ~(pagesize - 1));

    start = *buf;
    end = *buf + total - MAX_BUF_SIZE;
    while (1) {
        /* Front node points at the current back node (both chains). */
        *((char **)start) = end;
        *((char **)(start + ACCESS_OFFSET)) = end + ACCESS_OFFSET;
        start = start + MAX_BUF_SIZE;
        if (start == end)
            break;
        /* Back node points at the next front node (both chains). */
        *((char **)end) = start;
        *((char **)(end + ACCESS_OFFSET)) = start + ACCESS_OFFSET;
        end = end - MAX_BUF_SIZE;
    }

    /* start == end is the last node reached by the chain; terminate it so
     * the final load of a traversal does not read uninitialised memory. */
    *((char **)start) = NULL;
    *((char **)(start + ACCESS_OFFSET)) = NULL;
}
/*
 * Follow a chain of pointers for `num` hops.
 *
 * `access` is the address of a location holding a pointer.  Each hop loads
 * the pointer stored at the current address, adds its value to a running
 * sum, and continues from the loaded address.  The last loaded pointer is
 * summed but never dereferenced.
 *
 * Returns the sum of the `num` loaded pointer values, or 0 without touching
 * memory when num <= 0.
 *
 * On x86-64 the loop is a single extended-asm statement so the instruction
 * sequence being timed is fixed and the compiler is told exactly which
 * registers and flags are modified; other targets use an equivalent C loop.
 */
unsigned long lookingup_memmory(char *access, int num)
{
    unsigned long sum = 0;

    if (num <= 0)
        return 0;

#if defined(__x86_64__)
    {
        /* The loop exits when the decrement borrows, so start at num - 1
         * to execute the body exactly num times. */
        long remaining = (long)num - 1;

        __asm__ __volatile__(
            "1:\n\t"
            "mov (%[ptr]), %%r8\n\t"   /* load the next pointer      */
            "add %%r8, %[sum]\n\t"     /* accumulate it              */
            "mov %%r8, %[ptr]\n\t"     /* continue from that address */
            "sub $1, %[cnt]\n\t"
            "jae 1b"
            : [sum] "+r" (sum), [ptr] "+r" (access), [cnt] "+r" (remaining)
            :
            : "r8", "cc", "memory");
    }
#else
    while (num-- > 0) {
        /* volatile keeps the load from being merged or reordered away. */
        access = *(char *volatile *)access;
        sum += (unsigned long)access;
    }
#endif

    return sum;
}
/*
 * Time a full traversal of the pointer chain starting at `buf`.
 *
 * The traversal of MAX_BUF_NUM hops is repeated repeat_times times, each one
 * bracketed by two HP_TIMING_NOW readings; the smallest difference between
 * the two readings is returned.
 */
static unsigned long test_lookup_time(char *buf)
{
    unsigned long fastest = ~0UL;
    unsigned long pass = 0;

    while (pass < repeat_times) {
        unsigned long before, after, elapsed;

        HP_TIMING_NOW(before);
        lookingup_memmory(buf, MAX_BUF_NUM);
        HP_TIMING_NOW(after);

        elapsed = after - before;
        if (elapsed < fastest)
            fastest = elapsed;

        pass++;
    }

    return fastest;
}
/*
 * Build two independent buffers, time a chain traversal in each, and print
 * both results.
 *
 * The "aligned" run follows the chain stored at offset 0 of every node; the
 * "unaligned" run follows the chain stored at offset ACCESS_OFFSET.
 *
 * Returns 0 so the process has a well-defined exit status.
 */
int main(void)
{
    char *buf1 = NULL;
    char *buf2 = NULL;
    unsigned long aligned_time, unaligned_time;

    init_buf(&buf1);
    init_buf(&buf2);

    aligned_time = test_lookup_time(buf1);
    unaligned_time = test_lookup_time(buf2 + ACCESS_OFFSET);

    /* Both values are unsigned long, so print them with %lu. */
    printf("looking-up aligned time %lu, looking-up unaligned time %lu\n",
           aligned_time, unaligned_time);
    return 0;
}
Attachment:
cpu-info
Description: Binary data