Re: [PATCH] lib/strscpy: remove word-at-a-time optimization.
From: Andrey Ryabinin
Date: Tue Jan 09 2018 - 11:47:05 EST
Attached user space program I used to see the difference.
Usage:
gcc -02 -o strscpy strscpy_test.c
./strscpy {b|w} src_str_len count
src_str_len - length of source string in between 1-4096
count - how many strscpy() to execute.
Also I've noticed something strange. I'm not sure why, but certain
src_len values (e.g. 30) drives branch predictor crazy causing worse than usual results
for byte-at-a-time copy:
$ perf stat ./strscpy b 29 10000000
Performance counter stats for './strscpy b 29 10000000':
165.354974 task-clock:u (msec) # 0.999 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
48 page-faults:u # 0.290 K/sec
640,475,981 cycles:u # 3.873 GHz
2,500,090,080 instructions:u # 3.90 insn per cycle
640,017,126 branches:u # 3870.565 M/sec
1,589 branch-misses:u # 0.00% of all branches
0.165568346 seconds time elapsed
Performance counter stats for './strscpy b 30 10000000':
250.835659 task-clock:u (msec) # 0.999 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
46 page-faults:u # 0.183 K/sec
974,528,780 cycles:u # 3.885 GHz
2,580,090,165 instructions:u # 2.65 insn per cycle
660,017,211 branches:u # 2631.273 M/sec
14,488,234 branch-misses:u # 2.20% of all branches
0.251147341 seconds time elapsed
Performance counter stats for './strscpy b 31 10000000':
176.598368 task-clock:u (msec) # 0.997 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
46 page-faults:u # 0.260 K/sec
681,367,948 cycles:u # 3.858 GHz
2,660,090,092 instructions:u # 3.90 insn per cycle
680,017,138 branches:u # 3850.642 M/sec
1,817 branch-misses:u # 0.00% of all branches
0.177150181 seconds time elapsed
#include <stdlib.h>
#include <string.h>
#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))
#define E2BIG -1
#define PAGE_SIZE 4096
struct word_at_a_time {
const unsigned long one_bits, high_bits;
};
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
static inline long count_masked_bytes(unsigned long mask)
{
return mask*0x0001020304050608ul >> 56;
}
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
*bits = mask;
return mask;
}
static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
{
return bits;
}
static inline unsigned long create_zero_mask(unsigned long bits)
{
bits = (bits - 1) & ~bits;
return bits >> 7;
}
/* The mask we created is directly usable as a bytemask */
#define zero_bytemask(mask) (mask)
static inline unsigned long find_zero(unsigned long mask)
{
return count_masked_bytes(mask);
}
__attribute__((noinline))
int strscpy_word(char *dest, const char *src, size_t count)
{
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
size_t max = count;
long res = 0;
if (count == 0)
return -E2BIG;
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
* If src is unaligned, don't cross a page boundary,
* since we don't know if the next page is mapped.
*/
if ((long)src & (sizeof(long) - 1)) {
size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1));
if (limit < max)
max = limit;
}
#else
/* If src or dest is unaligned, don't do word-at-a-time. */
if (((long) dest | (long) src) & (sizeof(long) - 1))
max = 0;
#endif
while (max >= sizeof(unsigned long)) {
unsigned long c, data;
c = *(unsigned long *)(src+res);
if (has_zero(c, &data, &constants)) {
data = prep_zero_mask(c, data, &constants);
data = create_zero_mask(data);
*(unsigned long *)(dest+res) = c & zero_bytemask(data);
return res + find_zero(data);
}
*(unsigned long *)(dest+res) = c;
res += sizeof(unsigned long);
count -= sizeof(unsigned long);
max -= sizeof(unsigned long);
}
while (count) {
char c;
c = src[res];
dest[res] = c;
if (!c)
return res;
res++;
count--;
}
/* Hit buffer length without finding a NUL; force NUL-termination. */
if (res)
dest[res-1] = '\0';
return -E2BIG;
}
__attribute__((noinline))
int strscpy_byte(char *dest, const char *src, int count)
{
int res = 0;
while (count) {
char c;
c = src[res];
dest[res] = c;
if (!c)
return res;
res++;
count--;
}
/* Hit buffer length without finding a NUL; force NUL-termination. */
if (res)
dest[res-1] = '\0';
return -E2BIG;
}
char dest[4096] __attribute__((aligned(4096)));
char src[4096] __attribute__((aligned(4096)));
int main(int argc, char **argv)
{
unsigned long long i;
unsigned long src_len;
unsigned long count;
if (argc < 4)
return -1;
src_len = atoi(argv[2]);
count = atoi(argv[3]);
memset(src, 1, src_len);
if (argv[1][0] == 'w') {
for (i = 0; i < count; i++) {
strscpy_word(dest, src, sizeof(dest));
}
} else if (argv[1][0] == 'b') {
for (i = 0; i < count; i++) {
strscpy_byte(dest, src, sizeof(dest));
}
}
return 0;
}