[PATCH 1/5] lib/sort: Make swap functions more generic

From: George Spelvin
Date: Fri Mar 08 2019 - 22:21:47 EST


Rather than u32_swap and u64_swap working on 4- and 8-byte objects
directly, let them handle any multiple of 4 or 8 bytes. This speeds
up most users of sort() by avoiding fallback to the byte copy loop.

Despite what commit ca96ab859ab4 ("lib/sort: Add 64 bit swap function")
claims, very few users of sort() sort pointers (or pointer-sized
objects); most sort structures containing at least two words.
(E.g. drivers/acpi/fan.c:acpi_fan_get_fps() sorts an array of 40-byte
struct acpi_fan_fps.)

x86-64 code size 872 -> 885 bytes (+8)

Signed-off-by: George Spelvin <lkml@xxxxxxx>
---
lib/sort.c | 117 +++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 96 insertions(+), 21 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index d6b7a202b0b6..dff2ab2e196e 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -11,35 +11,110 @@
#include <linux/export.h>
#include <linux/sort.h>

-static int alignment_ok(const void *base, int align)
+/**
+ * alignment_ok - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required aignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+static bool __attribute_const__
+alignment_ok(const void *base, size_t size, unsigned int align)
{
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
- ((unsigned long)base & (align - 1)) == 0;
+ unsigned int lsbits = (unsigned int)size;
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ (void)base;
+#else
+ lsbits |= (unsigned int)(size_t)base;
+#endif
+ lsbits &= align - 1;
+ return lsbits == 0;
}

+/**
+ * u32_swap - swap two elements in 32-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is stil valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
static void u32_swap(void *a, void *b, int size)
{
- u32 t = *(u32 *)a;
- *(u32 *)a = *(u32 *)b;
- *(u32 *)b = t;
+ size_t n = size;
+
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
}

+/**
+ * u64_swap - swap two elements in 64-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+
static void u64_swap(void *a, void *b, int size)
{
- u64 t = *(u64 *)a;
- *(u64 *)a = *(u64 *)b;
- *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, int size)
-{
- char t;
+ size_t n = size;

do {
- t = *(char *)a;
- *(char *)a++ = *(char *)b;
- *(char *)b++ = t;
- } while (--size > 0);
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
+}
+
+/**
+ * generic_swap - swap two elements a byte at a time
+ * @a, @b: pointers to the elements
+ * @size: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void generic_swap(void *a, void *b, int size)
+{
+ size_t n = size;
+
+ do {
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
}

/**
@@ -67,10 +142,10 @@ void sort(void *base, size_t num, size_t size,
int i = (num/2 - 1) * size, n = num * size, c, r;

if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
+ if (alignment_ok(base, size, 8))
swap_func = u64_swap;
+ else if (alignment_ok(base, size, 4))
+ swap_func = u32_swap;
else
swap_func = generic_swap;
}
--
2.20.1