[PATCH v5] kasan: speed up mte_set_mem_tag_range

From: Evgenii Stepanov
Date: Thu May 20 2021 - 21:00:34 EST


Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.

The first cacheline is always tagged using STG/STZG even if the address is
cacheline-aligned, as benchmarks show it is faster than a conditional
branch.

Signed-off-by: Evgenii Stepanov <eugenis@xxxxxxxxxx>
Co-developed-by: Peter Collingbourne <pcc@xxxxxxxxxx>
Signed-off-by: Peter Collingbourne <pcc@xxxxxxxxxx>
Reviewed-by: Catalin Marinas <catalin.marinas@xxxxxxx>
---
Changelog since v1:
- Added Co-developed-by.

Changelog since v2:
- Added Signed-off-by.

Changelog since v3:
- Move the implementation back to C with a bit of inline asm.

Changelog since v3:
- Fixed coding style issues.

arch/arm64/include/asm/mte-kasan.h | 93 +++++++++++++++++++++---------
1 file changed, 67 insertions(+), 26 deletions(-)

diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index ddd4d17cf9a0..d952352bd008 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -48,43 +48,84 @@ static inline u8 mte_get_random_tag(void)
return mte_get_ptr_tag(addr);
}

+static inline u64 __stg_post(u64 p)
+{
+ asm volatile(__MTE_PREAMBLE "stg %0, [%0], #16"
+ : "+r"(p)
+ :
+ : "memory");
+ return p;
+}
+
+static inline u64 __stzg_post(u64 p)
+{
+ asm volatile(__MTE_PREAMBLE "stzg %0, [%0], #16"
+ : "+r"(p)
+ :
+ : "memory");
+ return p;
+}
+
+static inline void __dc_gva(u64 p)
+{
+ asm volatile(__MTE_PREAMBLE "dc gva, %0" : : "r"(p) : "memory");
+}
+
+static inline void __dc_gzva(u64 p)
+{
+ asm volatile(__MTE_PREAMBLE "dc gzva, %0" : : "r"(p) : "memory");
+}
+
/*
* Assign allocation tags for a region of memory based on the pointer tag.
* Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
- * size must be non-zero and MTE_GRANULE_SIZE aligned.
+ * size must be MTE_GRANULE_SIZE aligned.
*/
-static inline void mte_set_mem_tag_range(void *addr, size_t size,
- u8 tag, bool init)
+static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag,
+ bool init)
{
- u64 curr, end;
+ u64 curr, mask, dczid_bs, end1, end2, end3;

- if (!size)
- return;
+ /* Read DC G(Z)VA block size from the system register. */
+ dczid_bs = 4ul << (read_cpuid(DCZID_EL0) & 0xf);

curr = (u64)__tag_set(addr, tag);
- end = curr + size;
+ mask = dczid_bs - 1;
+ /* STG/STZG up to the end of the first block. */
+ end1 = curr | mask;
+ end3 = curr + size;
+ /* DC GVA / GZVA in [end1, end2) */
+ end2 = end3 & ~mask;

/*
- * 'asm volatile' is required to prevent the compiler to move
- * the statement outside of the loop.
+ * The following code uses STG on the first DC GVA block even if the
+ * start address is aligned - it appears to be faster than an alignment
+ * check + conditional branch. Also, if the range size is at least 2 DC
+ * GVA blocks, the first two loops can use post-condition to save one
+ * branch each.
*/
- if (init) {
- do {
- asm volatile(__MTE_PREAMBLE "stzg %0, [%0]"
- :
- : "r" (curr)
- : "memory");
- curr += MTE_GRANULE_SIZE;
- } while (curr != end);
- } else {
- do {
- asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
- :
- : "r" (curr)
- : "memory");
- curr += MTE_GRANULE_SIZE;
- } while (curr != end);
- }
+#define SET_MEMTAG_RANGE(stg_post, dc_gva) \
+ do { \
+ if (size >= 2 * dczid_bs) { \
+ do { \
+ curr = stg_post(curr); \
+ } while (curr < end1); \
+ \
+ do { \
+ dc_gva(curr); \
+ curr += dczid_bs; \
+ } while (curr < end2); \
+ } \
+ \
+ while (curr < end3) \
+ curr = stg_post(curr); \
+ } while (0)
+
+ if (init)
+ SET_MEMTAG_RANGE(__stzg_post, __dc_gzva);
+ else
+ SET_MEMTAG_RANGE(__stg_post, __dc_gva);
+#undef SET_MEMTAG_RANGE
}

void mte_enable_kernel_sync(void);
--
2.31.1.818.g46aad6cb9e-goog