Re: [PATCH 02/24] iov_iter: Renumber the ITER_* constants in uio.h
From: David Howells
Date: Tue Oct 23 2018 - 09:20:46 EST
Al Viro <viro@xxxxxxxxxxxxxxxxxx> wrote:
> On Sat, Oct 20, 2018 at 02:10:52AM +0100, David Howells wrote:
> > Renumber the ITER_* constants in uio.h to be contiguous to make comparing
> > them more efficient in a switch-statement.
>
> Are you sure that they *are* more efficient that way? Some of those paths
> are fairly hot, so much that I would really like to see profiles before
> and after these changes (both this and the previous one).
I updated my test program to try and overcome the effects of branch prediction
by going forwards through the set of iterator types and then back again inside
the inner loop rather than putting it on the outside.
I also made it run all the tests itself, do the stats calculations itself on
the output and only output at the end.
Firstly, with 32MiB bufsize to cause the dcache to be fully cycled (values are
times in nanoseconds, lower is better):
TEST TOTAL LOWEST HIGHEST MEAN STDDEV
0-AND 86690784677 10743312 11185591 10836348 36292
1-EQ 86718137278 10743467 11189117 10839767 34984
2-SWITC 86716408086 10724792 11278450 10839551 35031
3-ASM 86755866820 10742027 11214087 10844483 33932
4-ASM 86587355807 10728937 11291730 10823419 34628
5-SW-BT 86532876718 10725349 11303746 10816609 31168
And, secondly, with 1MiB bufsize to keep the data fully within the dcache:
TEST TOTAL LOWEST HIGHEST MEAN STDDEV
0-AND 665992395 81216 136894 83249 2598
1-EQ 665442229 81146 102292 83180 1767
2-SWITC 665263828 81133 103332 83157 1819
3-ASM 667402344 81525 119804 83425 2251
4-ASM 664493028 81178 104506 83061 2230
5-SW-BT 665792835 81378 115603 83224 2145
Each test was run 8000 times (samples parameter). The differences appear to
be around .1% and are quite variable, presumably because of interrupts.
Interestingly, if I comment out all the memcpy calls and set iter.type to 5, I
get something that looks like:
TEST TOTAL LOWEST HIGHEST MEAN STDDEV
0-AND 5091749489 569535 925088 636468 13459
1-EQ 4565513562 569496 620423 570689 3138
2-SWITC 4126907116 498480 650886 515863 10394
3-ASM 9126139444 1137597 1177709 1140767 5006
4-ASM 7411945298 924526 1123532 926493 4301
5-SW-BT 5135087754 640499 679302 641885 3217
This is a pretty consistent picture over all runs. The numbers are larger
than for the 1MiB buffer size because it's doing 32x as many chunks, even
those chunks aren't actually being copied. Reducing bufsize to 1MiB:
TEST TOTAL LOWEST HIGHEST MEAN STDDEV
0-AND 176123187 19299 54659 22015 2837
1-EQ 153711816 19123 36712 19213 810
2-SWITC 140564024 16917 32145 17570 947
3-ASM 296678820 36948 54856 37084 1113
4-ASM 242791274 30210 49183 30348 1188
5-SW-BT 171511145 21351 35625 21438 765
3-ASM sticks out like a sore thumb, presumably because of the adjacent jumps.
Letting gcc optimise a switch-statement with sequentially numbered cases seems
to be optimal.
David
---
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <math.h>
#include <sys/mman.h>
/* Adjustable parameters */
#define bufsize (32ULL * 1024 * 1024)
#define chunksize (1 * 128)
#define samples (8000)
#define noinline __attribute__((noinline))
#define unlikely(x) __builtin_expect(!!(x), 0)
struct iov_iter {
int type;
size_t iov_offset;
size_t count;
char *p;
};
noinline void iterate_iovec(const void *addr, size_t bytes, struct iov_iter *i)
{
memcpy(i->p + i->iov_offset, addr, bytes);
}
noinline void iterate_kvec(const void *addr, size_t bytes, struct iov_iter *i)
{
memcpy(i->p + i->iov_offset, addr, bytes);
}
noinline void iterate_bvec(const void *addr, size_t bytes, struct iov_iter *i)
{
memcpy(i->p + i->iov_offset, addr, bytes);
}
noinline void iterate_pipe(const void *addr, size_t bytes, struct iov_iter *i)
{
memcpy(i->p + i->iov_offset, addr, bytes);
}
noinline void iterate_discard(const void *addr, size_t bytes, struct iov_iter *i)
{
memcpy(i->p + i->iov_offset, addr, bytes);
}
noinline void iterate_mapping(const void *addr, size_t bytes, struct iov_iter *i)
{
memcpy(i->p + i->iov_offset, addr, bytes);
}
enum b_iter_type {
B_ITER_IOVEC = 0,
B_ITER_KVEC = 2,
B_ITER_BVEC = 4,
B_ITER_PIPE = 8,
B_ITER_DISCARD = 16,
B_ITER_MAPPING = 32,
};
enum s_iter_type {
S_ITER_IOVEC = 0,
S_ITER_KVEC = 1,
S_ITER_BVEC = 2,
S_ITER_PIPE = 3,
S_ITER_DISCARD = 4,
S_ITER_MAPPING = 5,
};
/*
* 0-AND: Use bitwise-AND if-chain tests
*/
noinline void TEST0_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
if ( unlikely(i->type & B_ITER_BVEC)) iterate_bvec(addr, bytes, i);
else if (unlikely(i->type & B_ITER_KVEC)) iterate_kvec(addr, bytes, i);
else if (unlikely(i->type & B_ITER_PIPE)) iterate_pipe(addr, bytes, i);
else if (unlikely(i->type & B_ITER_DISCARD)) iterate_discard(addr, bytes, i);
else if (unlikely(i->type & B_ITER_MAPPING)) iterate_mapping(addr, bytes, i);
else iterate_iovec(addr, bytes, i);
i->iov_offset += bytes;
}
/*
* 1-EQ: Use integer-equals if-chain tests on sequential integers
*/
noinline void TEST1_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
if ( unlikely(i->type == S_ITER_IOVEC)) iterate_iovec(addr, bytes, i);
else if (unlikely(i->type == S_ITER_KVEC)) iterate_kvec(addr, bytes, i);
else if (unlikely(i->type == S_ITER_BVEC)) iterate_bvec(addr, bytes, i);
else if (unlikely(i->type == S_ITER_PIPE)) iterate_pipe(addr, bytes, i);
else if (unlikely(i->type == S_ITER_DISCARD)) iterate_discard(addr, bytes, i);
else if (unlikely(i->type == S_ITER_MAPPING)) iterate_mapping(addr, bytes, i);
else {
printf("unknown iter %d\n", i->type);
exit(2);
}
i->iov_offset += bytes;
}
/*
* 2-SWITC: Use switch with sequential integers
*/
noinline void TEST2_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
switch (i->type) {
case S_ITER_IOVEC: iterate_iovec(addr, bytes, i); break;
case S_ITER_BVEC: iterate_bvec(addr, bytes, i); break;
case S_ITER_KVEC: iterate_kvec(addr, bytes, i); break;
case S_ITER_PIPE: iterate_pipe(addr, bytes, i); break;
case S_ITER_DISCARD: iterate_discard(addr, bytes, i); break;
case S_ITER_MAPPING: iterate_mapping(addr, bytes, i); break;
default:
printf("unknown iter %d\n", i->type);
exit(2);
}
i->iov_offset += bytes;
}
/*
* 3-ASM: Like 1-EQ, but hand-optimised to remove half the CMP instructions.
*/
extern void TEST3_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
asm(
"TEST3_copy_to_iter: \n"
" push %rbp \n"
" mov %rsi,%rbp \n"
" push %rbx \n"
" mov %rdx,%rbx \n"
" sub $0x8,%rsp \n"
" mov (%rdx),%esi \n"
" cmp $0x1,%esi \n"
" jc 72f \n"
" je 96f \n"
" cmp $0x3,%esi \n"
" jc 112f \n"
" je 128f \n"
" cmp $0x5,%esi \n"
" jc 144f \n"
" je 160f \n"
" xor %eax,%eax \n"
" jmp 80f \n"
"72: \n"
" mov %rbp,%rsi \n"
" callq iterate_iovec \n"
"80: \n"
" add %rbp,0x8(%rbx) \n"
" add $0x8,%rsp \n"
" pop %rbx \n"
" pop %rbp \n"
" retq \n"
" nopl 0x0(%rax,%rax,1) \n"
"96: \n"
" mov %rbp,%rsi \n"
" callq iterate_kvec \n"
" jmp 80b \n"
"112: \n"
" mov %rbp,%rsi \n"
" callq iterate_bvec \n"
" jmp 80b \n"
"128: \n"
" mov %rbp,%rsi \n"
" callq iterate_pipe \n"
" jmp 80b \n"
"144: \n"
" mov %rbp,%rsi \n"
" callq iterate_discard \n"
" jmp 80b \n"
"160: \n"
" mov %rbp,%rsi \n"
" callq iterate_mapping \n"
" jmp 80b \n"
);
/*
* 4-ASM: Like 3-ASM, but further optimised to move some of the jumps out to
* branches.
*/
extern void TEST4_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
asm(
"TEST4_copy_to_iter: \n"
" push %rbp \n"
" mov %rsi,%rbp \n"
" push %rbx \n"
" mov %rdx,%rbx \n"
" sub $0x8,%rsp \n"
" mov (%rdx),%esi \n"
" cmp $0x1,%esi \n"
" jle 72f \n"
" cmp $0x3,%esi \n"
" jle 112f \n"
" cmp $0x5,%esi \n"
" jle 144f \n"
" xor %eax,%eax \n"
" jmp 80f \n"
"72: \n"
" je 96f \n"
" mov %rbp,%rsi \n"
" callq iterate_iovec \n"
"80: \n"
" add %rbp,0x8(%rbx) \n"
" add $0x8,%rsp \n"
" pop %rbx \n"
" pop %rbp \n"
" retq \n"
" nopl 0x0(%rax,%rax,1) \n"
"96: \n"
" mov %rbp,%rsi \n"
" callq iterate_kvec \n"
" jmp 80b \n"
"112: \n"
" je 128f \n"
" mov %rbp,%rsi \n"
" callq iterate_bvec \n"
" jmp 80b \n"
"128: \n"
" mov %rbp,%rsi \n"
" callq iterate_pipe \n"
" jmp 80b \n"
"144: \n"
" je 160f \n"
" mov %rbp,%rsi \n"
" callq iterate_discard \n"
" jmp 80b \n"
"160: \n"
" mov %rbp,%rsi \n"
" callq iterate_mapping \n"
" jmp 80b \n"
);
/*
* 5-SW-BIT: Like 1-SWITC, but using the bitwise-optimised constants used
* by 0-AND.
*/
noinline void TEST5_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
switch (i->type) {
case B_ITER_IOVEC: iterate_iovec(addr, bytes, i); break;
case B_ITER_BVEC: iterate_bvec(addr, bytes, i); break;
case B_ITER_KVEC: iterate_kvec(addr, bytes, i); break;
case B_ITER_PIPE: iterate_pipe(addr, bytes, i); break;
case B_ITER_DISCARD: iterate_discard(addr, bytes, i); break;
case B_ITER_MAPPING: iterate_mapping(addr, bytes, i); break;
default:
printf("unknown iter %d\n", i->type);
exit(2);
}
i->iov_offset += bytes;
}
#define nr_tests 6
const char *const test_names[nr_tests] = {
"0-AND",
"1-EQ",
"2-SWITC",
"3-ASM",
"4-ASM",
"5-SW-BT",
};
const int b_types[] = {
B_ITER_IOVEC,
B_ITER_KVEC,
B_ITER_BVEC,
B_ITER_PIPE,
B_ITER_DISCARD,
B_ITER_MAPPING,
-1
};
const int s_types[] = {
S_ITER_IOVEC,
S_ITER_KVEC,
S_ITER_BVEC,
S_ITER_PIPE,
S_ITER_DISCARD,
S_ITER_MAPPING,
-1
};
unsigned long long sample_buf[nr_tests][samples];
static __always_inline void test(unsigned int test, void *a, void *b)
{
unsigned long long tmp;
struct timespec tv_1, tv_2;
int i = 0, j, k;
for (j = 0; j < samples; j++) {
struct iov_iter iter = {
.p = a,
.iov_offset = 0,
};
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tv_1);
for (k = 0; k < bufsize / chunksize; k++) {
const int *types;
void *p;
int x;
switch (test) {
case 0: types = b_types; break;
case 1: types = s_types; break;
case 2: types = s_types; break;
case 3: types = s_types; break;
case 4: types = s_types; break;
case 5: types = b_types; break;
default: types = NULL; break;
}
/* For 6 tests, do 0 1 2 3 4 5 5 4 3 2 1 0 */
x = i++ % (nr_tests * 2);
if (x >= nr_tests)
x = nr_tests * 2 - 1 - x;
iter.type = types[x];
p = b + k * chunksize;
switch (test) {
case 0: TEST0_copy_to_iter(p, chunksize, &iter); break;
case 1: TEST1_copy_to_iter(p, chunksize, &iter); break;
case 2: TEST2_copy_to_iter(p, chunksize, &iter); break;
case 3: TEST3_copy_to_iter(p, chunksize, &iter); break;
case 4: TEST4_copy_to_iter(p, chunksize, &iter); break;
case 5: TEST5_copy_to_iter(p, chunksize, &iter); break;
}
}
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tv_2);
tmp = (tv_2.tv_sec - tv_1.tv_sec) * 1000ULL * 1000 * 1000;
tmp += tv_2.tv_nsec - tv_1.tv_nsec;
sample_buf[test][j] = tmp;
}
}
static noinline void test_0(void *a, void *b) { test(0, a, b); }
static noinline void test_1(void *a, void *b) { test(1, a, b); }
static noinline void test_2(void *a, void *b) { test(2, a, b); }
static noinline void test_3(void *a, void *b) { test(3, a, b); }
static noinline void test_4(void *a, void *b) { test(4, a, b); }
static noinline void test_5(void *a, void *b) { test(5, a, b); }
int main()
{
void *a, *b;
int i, j;
a = mmap(NULL, bufsize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (a == MAP_FAILED) {
perror("mmap");
exit(1);
}
b = mmap(NULL, bufsize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (a == MAP_FAILED) {
perror("mmap");
exit(1);
}
memset(a, 1, bufsize);
memset(b, 1, bufsize);
test_0(a, b);
test_1(a, b);
test_2(a, b);
test_3(a, b);
test_4(a, b);
test_5(a, b);
for (i = 0; i < nr_tests; i++) {
for (j = 0; j < samples; j++) {
printf("sample,%s,%llu\n", test_names[i], sample_buf[i][j]);
}
}
printf("\n");
printf("TEST TOTAL LOWEST HIGHEST MEAN STDDEV\n");
for (i = 0; i < nr_tests; i++) {
long long lowest = ULLONG_MAX, highest = 0;;
long long total = 0, mean;
double stddev;
for (j = 0; j < samples; j++) {
total += sample_buf[i][j];
if (sample_buf[i][j] < lowest)
lowest = sample_buf[i][j];
if (sample_buf[i][j] > highest)
highest = sample_buf[i][j];
}
mean = total / samples;
stddev = 0;
for (j = 0; j < samples; j++) {
long long x = sample_buf[i][j];
x -= mean;
stddev += (double)x * (double)x;
}
stddev /= (samples - 1);
stddev = sqrt(stddev);
printf("%7s %12llu %12llu %12llu %12llu %12llu\n",
test_names[i], total, lowest, highest, mean,
(unsigned long long)stddev);
}
return 0;
}