[PATCH] bitops: Optimize fns() for improved performance

From: Kuan-Wei Chiu
Date: Thu Apr 25 2024 - 23:52:05 EST


The current fns() repeatedly uses __ffs() to find the index of the
least significant bit and then clears the corresponding bit using
__clear_bit(). The method for clearing the least significant bit can be
optimized by using word &= word - 1 instead.

Typically, the execution time of one __ffs() plus one __clear_bit() is
longer than that of a bitwise AND operation and a subtraction. To
improve performance, the loop for clearing the least significant bit
has been replaced with word &= word - 1, followed by a single __ffs()
operation to obtain the answer. This change reduces the number of
__ffs() iterations from n to just one, enhancing overall performance.

The following microbenchmark data, conducted on my x86-64 machine,
shows the execution time (in microseconds) required for 1000000 test
data generated by get_random_u64() and executed by fns() under
different values of n:

+-----+---------------+---------------+
| n | time_old | time_new |
+-----+---------------+---------------+
| 0 | 29194 | 25878 |
| 1 | 25510 | 25497 |
| 2 | 27836 | 25721 |
| 3 | 30140 | 25673 |
| 4 | 32569 | 25426 |
| 5 | 34792 | 25690 |
| 6 | 37117 | 25651 |
| 7 | 39742 | 25383 |
| 8 | 42360 | 25657 |
| 9 | 44672 | 25897 |
| 10 | 47237 | 25819 |
| 11 | 49884 | 26530 |
| 12 | 51864 | 26647 |
| 13 | 54265 | 28915 |
| 14 | 56440 | 28373 |
| 15 | 58839 | 28616 |
| 16 | 62383 | 29128 |
| 17 | 64257 | 30041 |
| 18 | 66805 | 29773 |
| 19 | 69368 | 33203 |
| 20 | 72942 | 33688 |
| 21 | 77006 | 34518 |
| 22 | 80926 | 34298 |
| 23 | 85723 | 35586 |
| 24 | 90324 | 36376 |
| 25 | 95992 | 37465 |
| 26 | 101101 | 37599 |
| 27 | 106520 | 37466 |
| 28 | 113287 | 38163 |
| 29 | 120552 | 38810 |
| 30 | 128040 | 39373 |
| 31 | 135624 | 40500 |
| 32 | 142580 | 40343 |
| 33 | 148915 | 40460 |
| 34 | 154005 | 41294 |
| 35 | 157996 | 41730 |
| 36 | 160806 | 41523 |
| 37 | 162975 | 42088 |
| 38 | 163426 | 41530 |
| 39 | 164872 | 41789 |
| 40 | 164477 | 42505 |
| 41 | 164758 | 41879 |
| 42 | 164182 | 41415 |
| 43 | 164842 | 42119 |
| 44 | 164881 | 42297 |
| 45 | 164870 | 42145 |
| 46 | 164673 | 42066 |
| 47 | 164616 | 42051 |
| 48 | 165055 | 41902 |
| 49 | 164847 | 41862 |
| 50 | 165171 | 41960 |
| 51 | 164851 | 42089 |
| 52 | 164763 | 41717 |
| 53 | 164635 | 42154 |
| 54 | 164757 | 41983 |
| 55 | 165095 | 41419 |
| 56 | 164641 | 42381 |
| 57 | 164601 | 41654 |
| 58 | 164864 | 41834 |
| 59 | 164594 | 41920 |
| 60 | 165207 | 42020 |
| 61 | 165056 | 41185 |
| 62 | 165160 | 41722 |
| 63 | 164923 | 41702 |
| 64 | 164777 | 41880 |
+-----+---------------+---------------+

Signed-off-by: Kuan-Wei Chiu <visitorckw@xxxxxxxxx>
---

This patch has undergone unit testing and micro benchmarking using the
following code:

static int test(void)
{
unsigned int i, n;
volatile unsigned long x, y, z;
const unsigned long round = 1000000;
ktime_t start, end;
s32 time_old[65], time_new[65];

for (n = 0; n <= 64; n++) {
start = ktime_get();
for(i = 0; i < round; i++) {
x = get_random_u64();
y = fns(x, n);
}
end = ktime_get();
time_old[n] = ktime_us_delta(end, start);

start = ktime_get();
for(i = 0; i < round; i++) {
x = get_random_u64();
y = fns_new(x, n);
}
end = ktime_get();
time_new[n] = ktime_us_delta(end, start);
}

for (n = 0; n <= 64; n++) {
for(i = 0; i < round; i++) {
x = get_random_u64();
y = fns(x, n);
z = fns_new(x, n);
if (y != z) {
printk(KERN_INFO "test failed!\n");
return -1;
}
}
}

printk(KERN_INFO "test passed!\n");
for (n = 0; n <= 64; n++) {
printk(KERN_INFO "n = %u\n", n);
printk(KERN_INFO "time old = %d\n", time_old[n]);
printk(KERN_INFO "time new = %d\n", time_new[n]);
}
return 0;
}

include/linux/bitops.h | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 2ba557e067fe..ffeb71e72154 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -254,16 +254,12 @@ static inline unsigned long __ffs64(u64 word)
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
- unsigned int bit;
+ unsigned int i;

- while (word) {
- bit = __ffs(word);
- if (n-- == 0)
- return bit;
- __clear_bit(bit, &word);
- }
+ for (i = 0; word && i < n; i++)
+ word &= word - 1;

- return BITS_PER_LONG;
+ return word ? __ffs(word) : BITS_PER_LONG;
}

/**
--
2.34.1