[PATCH] LoongArch: Support dbar with different hints

From: Huacai Chen
Date: Tue May 16 2023 - 08:46:11 EST


Traditionally, LoongArch uses "dbar 0" (full completion barrier) for
everything. But the full completion barrier is a performance killer, so
Loongson-3A6000 and newer processors introduce different hints:

Bit4: ordering or completion (0: completion, 1: ordering)
Bit3: barrier for previous read (0: true, 1: false)
Bit2: barrier for previous write (0: true, 1: false)
Bit1: barrier for succeeding read (0: true, 1: false)
Bit0: barrier for succeeding write (0: true, 1: false)

Hint 0x700: barrier for "read after read" from the same address, which
is needed by LL-SC loops.

This patch enables various hints for different memory barriers. It
brings performance improvements for the Loongson-3A6000 series, and
doesn't impact the Loongson-3A5000 series because those processors treat
all hints as "dbar 0".

Signed-off-by: Jun Yi <yijun@xxxxxxxxxxx>
Signed-off-by: Huacai Chen <chenhuacai@xxxxxxxxxxx>
---
arch/loongarch/include/asm/barrier.h | 130 ++++++++++++---------------
arch/loongarch/include/asm/io.h | 2 +-
arch/loongarch/kernel/smp.c | 2 +-
arch/loongarch/mm/tlbex.S | 6 +-
4 files changed, 60 insertions(+), 80 deletions(-)

diff --git a/arch/loongarch/include/asm/barrier.h b/arch/loongarch/include/asm/barrier.h
index cda977675854..0286ae7e3636 100644
--- a/arch/loongarch/include/asm/barrier.h
+++ b/arch/loongarch/include/asm/barrier.h
@@ -5,27 +5,56 @@
#ifndef __ASM_BARRIER_H
#define __ASM_BARRIER_H

-#define __sync() __asm__ __volatile__("dbar 0" : : : "memory")
+/*
+ * Hint types:
+ *
+ * Bit4: ordering or completion (0: completion, 1: ordering)
+ * Bit3: barrier for previous read (0: true, 1: false)
+ * Bit2: barrier for previous write (0: true, 1: false)
+ * Bit1: barrier for succeeding read (0: true, 1: false)
+ * Bit0: barrier for succeeding write (0: true, 1: false)
+ *
+ * Hint 0x700: barrier for "read after read" from the same address
+ */
+
+#define DBAR(hint) __asm__ __volatile__("dbar %0 " : : "I"(hint) : "memory")
+
+#define crwrw 0b00000
+#define cr_r_ 0b00101
+#define c_w_w 0b01010

-#define fast_wmb() __sync()
-#define fast_rmb() __sync()
-#define fast_mb() __sync()
-#define fast_iob() __sync()
-#define wbflush() __sync()
+#define orwrw 0b10000
+#define or_r_ 0b10101
+#define o_w_w 0b11010

-#define wmb() fast_wmb()
-#define rmb() fast_rmb()
-#define mb() fast_mb()
-#define iob() fast_iob()
+#define orw_w 0b10010
+#define or_rw 0b10100

-#define __smp_mb() __asm__ __volatile__("dbar 0" : : : "memory")
-#define __smp_rmb() __asm__ __volatile__("dbar 0" : : : "memory")
-#define __smp_wmb() __asm__ __volatile__("dbar 0" : : : "memory")
+#define c_sync() DBAR(crwrw)
+#define c_rsync() DBAR(cr_r_)
+#define c_wsync() DBAR(c_w_w)
+
+#define o_sync() DBAR(orwrw)
+#define o_rsync() DBAR(or_r_)
+#define o_wsync() DBAR(o_w_w)
+
+#define ldacq_mb() DBAR(or_rw)
+#define strel_mb() DBAR(orw_w)
+
+#define mb() c_sync()
+#define rmb() c_rsync()
+#define wmb() c_wsync()
+#define iob() c_sync()
+#define wbflush() c_sync()
+
+#define __smp_mb() o_sync()
+#define __smp_rmb() o_rsync()
+#define __smp_wmb() o_wsync()

#ifdef CONFIG_SMP
-#define __WEAK_LLSC_MB " dbar 0 \n"
+#define __WEAK_LLSC_MB " dbar 0x700 \n"
#else
-#define __WEAK_LLSC_MB " \n"
+#define __WEAK_LLSC_MB " \n"
#endif

#define __smp_mb__before_atomic() barrier()
@@ -59,68 +88,19 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
return mask;
}

-#define __smp_load_acquire(p) \
-({ \
- union { typeof(*p) __val; char __c[1]; } __u; \
- unsigned long __tmp = 0; \
- compiletime_assert_atomic_type(*p); \
- switch (sizeof(*p)) { \
- case 1: \
- *(__u8 *)__u.__c = *(volatile __u8 *)p; \
- __smp_mb(); \
- break; \
- case 2: \
- *(__u16 *)__u.__c = *(volatile __u16 *)p; \
- __smp_mb(); \
- break; \
- case 4: \
- __asm__ __volatile__( \
- "amor_db.w %[val], %[tmp], %[mem] \n" \
- : [val] "=&r" (*(__u32 *)__u.__c) \
- : [mem] "ZB" (*(u32 *) p), [tmp] "r" (__tmp) \
- : "memory"); \
- break; \
- case 8: \
- __asm__ __volatile__( \
- "amor_db.d %[val], %[tmp], %[mem] \n" \
- : [val] "=&r" (*(__u64 *)__u.__c) \
- : [mem] "ZB" (*(u64 *) p), [tmp] "r" (__tmp) \
- : "memory"); \
- break; \
- } \
- (typeof(*p))__u.__val; \
+#define __smp_load_acquire(p) \
+({ \
+ typeof(*p) ___p1 = READ_ONCE(*p); \
+ compiletime_assert_atomic_type(*p); \
+ ldacq_mb(); \
+ ___p1; \
})

-#define __smp_store_release(p, v) \
-do { \
- union { typeof(*p) __val; char __c[1]; } __u = \
- { .__val = (__force typeof(*p)) (v) }; \
- unsigned long __tmp; \
- compiletime_assert_atomic_type(*p); \
- switch (sizeof(*p)) { \
- case 1: \
- __smp_mb(); \
- *(volatile __u8 *)p = *(__u8 *)__u.__c; \
- break; \
- case 2: \
- __smp_mb(); \
- *(volatile __u16 *)p = *(__u16 *)__u.__c; \
- break; \
- case 4: \
- __asm__ __volatile__( \
- "amswap_db.w %[tmp], %[val], %[mem] \n" \
- : [mem] "+ZB" (*(u32 *)p), [tmp] "=&r" (__tmp) \
- : [val] "r" (*(__u32 *)__u.__c) \
- : ); \
- break; \
- case 8: \
- __asm__ __volatile__( \
- "amswap_db.d %[tmp], %[val], %[mem] \n" \
- : [mem] "+ZB" (*(u64 *)p), [tmp] "=&r" (__tmp) \
- : [val] "r" (*(__u64 *)__u.__c) \
- : ); \
- break; \
- } \
+#define __smp_store_release(p, v) \
+do { \
+ compiletime_assert_atomic_type(*p); \
+ strel_mb(); \
+ WRITE_ONCE(*p, v); \
} while (0)

#define __smp_store_mb(p, v) \
diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h
index 545e2708fbf7..1c9410220040 100644
--- a/arch/loongarch/include/asm/io.h
+++ b/arch/loongarch/include/asm/io.h
@@ -62,7 +62,7 @@ extern pgprot_t pgprot_wc;
#define ioremap_cache(offset, size) \
ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL))

-#define mmiowb() asm volatile ("dbar 0" ::: "memory")
+#define mmiowb() wmb()

/*
* String version of I/O memory access operations.
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index ed167e244cda..8daa97148c8e 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -118,7 +118,7 @@ static u32 ipi_read_clear(int cpu)
action = iocsr_read32(LOONGARCH_IOCSR_IPI_STATUS);
/* Clear the ipi register to clear the interrupt */
iocsr_write32(action, LOONGARCH_IOCSR_IPI_CLEAR);
- smp_mb();
+ wbflush();

return action;
}
diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S
index 244e2f5aeee5..240ced55586e 100644
--- a/arch/loongarch/mm/tlbex.S
+++ b/arch/loongarch/mm/tlbex.S
@@ -184,7 +184,7 @@ tlb_huge_update_load:
ertn

nopage_tlb_load:
- dbar 0
+ dbar 0x700
csrrd ra, EXCEPTION_KS2
la_abs t0, tlb_do_page_fault_0
jr t0
@@ -333,7 +333,7 @@ tlb_huge_update_store:
ertn

nopage_tlb_store:
- dbar 0
+ dbar 0x700
csrrd ra, EXCEPTION_KS2
la_abs t0, tlb_do_page_fault_1
jr t0
@@ -480,7 +480,7 @@ tlb_huge_update_modify:
ertn

nopage_tlb_modify:
- dbar 0
+ dbar 0x700
csrrd ra, EXCEPTION_KS2
la_abs t0, tlb_do_page_fault_1
jr t0
--
2.39.1