[POC 11/12] x86-64: implement _rai_bucket_shift

From: Rasmus Villemoes
Date: Wed Oct 17 2018 - 18:33:55 EST


The only slightly tricky issue is that implementing the thunk requires
some temporary registers (%ecx among them, since the variable shift
count has to live in %cl), and we don't know whether the hash input
and/or destination register collide with whichever temporaries we
pick. One _could_ attempt text parsing in asm in order to find a safe
set of temporaries, but they would need to be saved and restored
anyway.

So instead, just pick %rdx and %rcx, and start by pushing them on the
stack. Then compute the result we need, push that on the stack as
well, restore %rcx and %rdx from their saved slots, and finally pop
the result into the destination register (which may itself be %rdx or
%rcx or any other register) and adjust the stack pointer past the two
saved slots.

The patched code does need to do a shr, so I don't think there's a way
around the "cc" clobber.
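
For illustration, here is what the patched sequence (and the thunk) is
meant to compute, in C; ht_base and ht_shift are made-up example
variables, not names from this series:

	static struct hlist_head *ht_base;	/* 8-byte bucket heads */
	static unsigned int ht_shift;

	/* head = _rai_bucket_shift(ht_base, ht_shift, hash);
	   is meant to be equivalent to: */
	struct hlist_head *head = ht_base + (hash >> ht_shift);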

Signed-off-by: Rasmus Villemoes <linux@xxxxxxxxxxxxxxxxxx>
---
arch/x86/include/asm/rai.S | 59 ++++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/rai.h | 21 +++++++++++++-
arch/x86/kernel/rai.c | 13 +++++++++
3 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/rai.S b/arch/x86/include/asm/rai.S
index f42cdd8db876..144697e146b6 100644
--- a/arch/x86/include/asm/rai.S
+++ b/arch/x86/include/asm/rai.S
@@ -54,5 +54,64 @@
	.popsection
.endm /* rai_load */

+/*
+ * For convenience, and because it should not make code generation
+ * much worse, we tie the hash to its own output register, so that it
+ * cannot be allocated in the same register where we must place the
+ * actual output. Since the hash output is unused, gcc is free to
+ * pick that register for anything immediately afterwards.
+ */
+.macro rai_bucket_shift dst, hash, hashq, base, shift
+	.pushsection .rai_templ, "aw"
+10:	movabs	$0x1234567812345678, \dst
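+	/*
+	 * The immediate is a dummy; rai_patch_one() overwrites it at
+	 * boot with the value loaded from \base.
+	 */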
+	/*
+	 * The hash output register does in fact end up containing the
+	 * shifted hash value, but I don't think there's a way to
+	 * inform gcc of that, and I don't know how useful it would be
+	 * anyway. So the thunk below makes no effort to provide the
+	 * same property, though that would be doable.
+	 */
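+	/*
+	 * Likewise, the $6 is a dummy shift count: boot-time patching
+	 * replaces it with the LSB of the value at \shift.
+	 */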
+	shr	$6, \hash
+	lea	(\dst, \hashq, 8), \dst
+11:
+	.popsection
+
+	.pushsection .text.rai_thunk, "ax"
+20:	/* dst and hash are registers, we can clobber hash */
+	push	%rdx
+	push	%rcx
+	mov	\hash, %edx
+	mov	\shift(%rip), %ecx
+	shr	%cl, %edx
+	/*
+	 * Move the shifted value into \hash so that the lea below
+	 * also works when \dst happens to be %rdx.
+	 */
+	mov	%edx, \hash
+	mov	\base(%rip), \dst
+	lea	(\dst, \hashq, 8), \dst
+	/* We have our final value. */
+	push	\dst
+	/*
+	 * Now restore %rcx and %rdx, pop the result into \dst and
+	 * adjust the stack pointer past the two saved registers.
+	 */
+	mov	0x8(%rsp), %rcx
+	mov	0x10(%rsp), %rdx
+	pop	\dst
+	add	$0x10, %rsp
+	jmp	32f
+21:
+	.popsection
+	/* The part that goes into .text */
+30:	jmp	20b
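+	/*
+	 * Pad the stub with nops so it is at least as large as the
+	 * template that will eventually be patched in over it.
+	 */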
+31:	.skip -(((11b - 10b)-(31b - 30b)) > 0)*((11b - 10b)-(31b - 30b)), 0x90
+32:
+
+	.pushsection .rai_data, "a"
+40:
+	rai_entry RAI_BUCKET_SHIFT_8_4_4 30b 32b 10b 11b 20b
+	.quad	\base	/* .bucket_shift.base_addr */
+	.quad	\shift	/* .bucket_shift.shift_addr */
+41:
+	rai_entry_pad 40b 41b
+	.popsection
+.endm /* rai_bucket_shift */
+

#endif
diff --git a/arch/x86/include/asm/rai.h b/arch/x86/include/asm/rai.h
index b57494c98d0f..c9726d1e40ed 100644
--- a/arch/x86/include/asm/rai.h
+++ b/arch/x86/include/asm/rai.h
@@ -3,8 +3,9 @@

#define RAI_LOAD_4 0
#define RAI_LOAD_8 1
+#define RAI_BUCKET_SHIFT_8_4_4 2

-#define STRUCT_RAI_ENTRY_SIZE 32
+#define STRUCT_RAI_ENTRY_SIZE 40

/* Put the asm macros in a separate file for easier editing. */
#include <asm/rai.S>
@@ -23,6 +24,10 @@ struct rai_entry {
		struct {
			void *addr;
		} load;
+		struct {
+			void *base_addr;
+			void *shift_addr;
+		} bucket_shift;
	};
};
_Static_assert(sizeof(struct rai_entry) == STRUCT_RAI_ENTRY_SIZE,
@@ -48,6 +53,20 @@ _Static_assert(sizeof(struct rai_entry) == STRUCT_RAI_ENTRY_SIZE,
	ret__;								\
})

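+/*
+ * Evaluates to &base[hash >> shift] for 8-byte buckets. The hash is
+ * tied to a second, unused output so that gcc cannot pass it in the
+ * register picked for the result; %q1 is the 64-bit name of that
+ * register.
+ */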
+#define _rai_bucket_shift(base, shift, hash) ({			\
+	typeof(base) ret__;						\
+	typeof(hash) unused__;						\
+	if (sizeof(*(base)) == 8 && sizeof(shift) == 4			\
+	    && sizeof(hash) == 4)					\
+		asm("rai_bucket_shift %0 %1 %q1 %c3 %c4"		\
+		    : "=r" (ret__), "=r" (unused__)			\
+		    : "1" (hash), "i" (&(base)), "i" (&(shift))		\
+		    : "cc");						\
+	else								\
+		ret__ = _rai_bucket_shift_fallback(base, shift, hash);	\
+	ret__;								\
+})
+
#endif /* !__ASSEMBLY */

#endif /* _ASM_X86_RAI_H */
diff --git a/arch/x86/kernel/rai.c b/arch/x86/kernel/rai.c
index c4284ce7478f..3aa2e3b2c31b 100644
--- a/arch/x86/kernel/rai.c
+++ b/arch/x86/kernel/rai.c
@@ -32,6 +32,19 @@ rai_patch_one(const struct rai_entry *r)
		memcpy(templ + r->templ_len - sizeof(*imm), imm, sizeof(*imm));
		break;
	}
+	case RAI_BUCKET_SHIFT_8_4_4: {
+		const u32 *shiftp = r->bucket_shift.shift_addr;
+		const u64 *basep = r->bucket_shift.base_addr;
+		/*
+		 * This should be made more robust. For now, assume we
+		 * have a 10-byte movabs followed by a 3-byte shr. And
+		 * while *shiftp is 4 bytes wide, we just need the
+		 * LSB.
+		 */
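+		/*
+		 * The movabs immediate starts at byte 2 (after the
+		 * REX.W prefix and opcode); the shr's imm8 is byte 12.
+		 */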
+		memcpy(templ + 2, basep, sizeof(*basep));
+		memcpy(templ + 12, shiftp, 1);
+		break;
+	}
	default:
		WARN_ONCE(1, "unhandled RAI type %d\n", r->type);
		return;
--
2.19.1.6.gbde171bbf5