Re: [PATCH 2/2] lib/string.c: Optimize memchr()
From: Yu-Jen Chang
Date: Fri Jul 29 2022 - 11:42:44 EST
On Sat, Jul 23, 2022 at 12:56 AM Andy Shevchenko
<andy.shevchenko@xxxxxxxxx> wrote:
>
>
>
> On Friday, July 22, 2022, Yu-Jen Chang <arthurchang09@xxxxxxxxx> wrote:
>>
>> On Wed, Jul 13, 2022 at 6:24 PM David Laight <David.Laight@xxxxxxxxxx> wrote:
>> >
>> > From: Andrey Semashev
>> > > Sent: 13 July 2022 11:03
>> > >
>> > > On 7/13/22 12:49, Andrey Semashev wrote:
>> > > > On 7/13/22 12:39, David Laight wrote:
>> > > >> From: Yu-Jen Chang
>> > > >>> Sent: 12 July 2022 15:59
>> > > >> ...
>> > > >>>> I think you're missing the point. Loads at unaligned addresses may not
>> > > >>>> be allowed by hardware using conventional load instructions or may be
>> > > >>>> inefficient. Given that this memchr implementation is used as a fallback
>> > > >>>> when no hardware-specific version is available, you should be
>> > > >>>> conservative wrt. hardware capabilities and behavior. You should
>> > > >>>> probably have a pre-alignment loop.
>> > > >>>
>> > > >>> Got it. I add pre-alignment loop. It aligns the address to 8 or 4bytes.
>> > > >>
>> > > >> That should be predicated on !HAS_EFFICIENT_UNALIGNED_ACCESS.
>> > > >>
>> > > >> ...
>> > > >>> for (; p <= end - 8; p += 8) {
>> > > >>> val = *(u64*)p ^ mask;
>> > > >>> if ((val + 0xfefefefefefefeffull)
>> > > >>> & (~val & 0x8080808080808080ull))
>> > > >>> break;
>> > > >>
>> > > >> I would add a couple of comments, like:
>> > > >> // Convert to check for zero byte.
>> > > >> // Standard check for a zero byte in a word.
>> > > >> (But not the big 4 line explanation you had.
>> > > >>
>> > > >> It is also worth looking at how that code compiles
>> > > >> on 32bit arch that don't have a carry flag.
>> > > >> That is everything based on MIPS, including riscv.
>> > > >
>> > > > It may be worth looking at how glibc does it:
>> > > >
>> > > >
>> > > https://sourceware.org/git/?p=glibc.git;a=blob;f=string/memchr.c;h=422bcd0cd646ea46711a57fa3cbdb8a3329
>> > > fc302;hb=refs/heads/release/2.35/master#l46
>> > > >
>> > > > They do use 32-bit words on 32-bit targets and 64-bit on 64-bit ones. I
>> > > > think memchr in the kernel should follow this.
>> > >
>> > > Also, if by chance this optimization is aimed for x86-64, it may be
>> > > worth adding an arch-specific version that uses ERMS.
>> >
>> > Don't believe everything you see in glibc.
>> > The common cases in the kernel are different from the ones they
>> > tend to optimise for..
>> >
>> > You might try using:
>> > #define GEN_MASK(x) (x * (unsigned long)0x0101010101010101ull)
>> > for the mask and the two constants.
>> > Then make all the variables 'long'.
>> >
>> > I'm not at all sure what the test for fast multiply is about.
>> > It may be very historic, for modern cpu generating the 64bit
>> > constant is likely to be more problematic (check arm64).
>> > If the input value is 'unsigned char' (or masked) then the
>> > compiler may decide to do the repeated <<= itself.
>> >
>> > David
>> >
>> > -
>> > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
>> > Registration No: 1397386 (Wales)
>>
>>
>> I have rewrite my code as the following. I use several macros
>
>
>
> My gosh, have you checked what strscpy() does under the hood?
>
> Why can’t you just reuse parts of it?
>
>>
>> to
>> generate the mask and detect target char for 32-bit or 64-bit.
>> The DETECT_CHAR macro is the same as this part:
>>
>> (val + 0xfefefefefefefeffull) & (~val & 0x8080808080808080ull)
>>
>> The 0xfefefefefefefeffull is the 2's complement of 0x0101010101010101ULL.
>> I perform subtraction here exactly.
>>
>> If the CPU architecture is unable to perform unaligned access
>> efficiently, the pre-alignment loop will align the string at first.
>>
>>
>> #define LBLOCKSIZE (sizeof(long))
>> #if BITS_PER_LONG == 64
>>
>> #define DETECT_CHAR(X) \
>> (((X) - 0x0101010101010101ULL) & ~(X) & 0x8080808080808080ULL)
>>
>> #define MEMCHR_MASK_GEN(mask) \
>> do { \
>> (mask) |= (mask) << 8; \
>> (mask) |= (mask) << 16; \
>> (mask) |= (mask) << 32; \
>> } while (0)
>>
>> #else
>>
>> #define DETECT_CHAR(X) (((X)-0x01010101UL) & ~(X)&0x80808080UL)
>>
>> #define MEMCHR_MASK_GEN(mask) \
>> do { \
>> (mask) |= (mask) << 8; \
>> (mask) |= (mask) << 16; \
>> } while (0)
>>
>> #endif
>>
>> void *memchr(const void *p, int c, size_t length)
>> {
>> unsigned long mask, val;
>> const void *end = p + length;
>> c &= 0xff;
>> #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>> while ((long)p & (sizeof(long) - 1)) {
>> if (p >= end)
>> return NULL;
>> if (*(unsigned char *)p == c)
>> return (void *)p;
>> p++;
>> }
>> #endif
>> if (p <= end - LBLOCKSIZE) {
>> mask = c;
>> MEMCHR_MASK_GEN(mask);
>>
>> for (; p <= end - LBLOCKSIZE; p += LBLOCKSIZE) {
>> val = *(unsigned long *)p ^ mask;
>> if (DETECT_CHAR(val))
>> break;
>> }
>> }
>>
>> for (; p < end; p++)
>> if (*(unsigned char *)p == c)
>> return (void *)p;
>>
>> return NULL;
>> }
>
>
>
> --
> With Best Regards,
> Andy Shevchenko
>
>
I reuse some parts of the strscpy and use "read_word_at_a_time"
to read strings and "has_zero" to find if the address of the target char
is in the current word.
Yu-Jen Chang
#if BITS_PER_LONG == 64
#define MEMCHR_MASK_GEN(mask) \
do { \
(mask) |= (mask) << 8; \
(mask) |= (mask) << 16; \
(mask) |= (mask) << 32; \
} while (0)
#else
#define MEMCHR_MASK_GEN(mask) \
do { \
(mask) |= (mask) << 8; \
(mask) |= (mask) << 16; \
} while (0)
#endif
void *memchr(const void *p, int c, size_t length)
{
unsigned long mask;
unsigned char *src = (unsigned char *) p;
const void *end = p + length;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
size_t max = length;
c &= 0xff;
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
* If src is unaligned, don't cross a page boundary,
* since we don't know if the next page is mapped.
*/
if ((long)p & (sizeof(long) - 1)) {
size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1));
if (limit < max)
max = limit;
}
#else
/* If src is unaligned, don't do word-at-a-time. */
if (((long) p) & (sizeof(long) - 1))
max = 0;
#endif
mask = c;
MEMCHR_MASK_GEN(mask);
for ( ; max >= sizeof(unsigned long); max -= sizeof(unsigned
long), src += sizeof(unsigned long)) {
unsigned long data, result;
data = read_word_at_a_time(src);
data = data ^ mask;
if (has_zero(data, &result, &constants))
break;
}
for (; src < end; src++)
if (*src == c)
return (void *)src;
return NULL;
}