[RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions

From: Zhenyu Ye
Date: Mon Nov 11 2019 - 08:24:08 EST


ARMv8.4-TLBI provides TLBI invalidation instructions that apply to a
range of input addresses. This patch adds support for this feature.

I traced __flush_tlb_range() for one minute and collected the
statistics below:

PAGENUM   COUNT
      1   34944
      2    5683
      3    1343
      4    7857
      5     838
      9     339
     16     933
     19     427
     20    5821
     23     279
     41     338
    141     279
    512     428
   1668     120
   2038     100

These data are based on kernel-5.4.0, where PAGENUM is the number of
pages covered by one call ((end - start) >> PAGE_SHIFT) and COUNT is
the number of calls to __flush_tlb_range() during that minute. Only
entries with COUNT >= 100 are shown. The kernel was booted normally,
with transparent hugepage enabled. As the data show, although most
flushed ranges were 1 page long, the number of longer ranges cannot
be ignored.

The new TLBI range feature can bring a large performance improvement
over the current implementation. For example, flushing 512 pages needs
only 1 instruction, as opposed to 512 instructions with the current
implementation.

And for a new hardware feature, supporting it is better than not.
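
To make the arithmetic concrete: one range operation covers
(NUM + 1) * 2^(5 * SCALE + 1) pages, so 512 pages fit into a single
operation with SCALE = 1 and NUM = 7 (8 * 64 = 512). Below is a small
user-space sketch (illustration only, not part of the patch; the
function name tlbi_range_ops() is made up) that mirrors the
decomposition done by __flush_tlb_range_new() and counts the
operations needed:

	#include <stdio.h>

	/* One range op covers (num + 1) * 2^(5 * scale + 1) pages. */
	static int tlbi_range_ops(unsigned long pages)
	{
		int scale, num, ops = 0;

		if (pages % 2 == 1) {	/* odd page: one classic per-VA TLBI */
			ops++;
			pages--;
		}
		pages >>= 1;		/* work in 2-page units */
		for (scale = 0; pages > 0; scale++, pages >>= 5) {
			num = (pages & 0x1f) - 1;	/* 5-bit NUM field */
			if (num >= 0)
				ops++;			/* one TLBI range op */
		}
		return ops;
	}

	int main(void)
	{
		printf("%d\n", tlbi_range_ops(512));	/* prints 1 */
		printf("%d\n", tlbi_range_ops(2038));	/* prints 2 */
		return 0;
	}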

Signed-off-by: Zhenyu Ye <yezhenyu2@xxxxxxxxxx>
---
ChangeLog v1 -> v2:
- Change the main implementation of this feature.
- Add some comments.

---
arch/arm64/include/asm/cpucaps.h | 3 +-
arch/arm64/include/asm/sysreg.h | 4 ++
arch/arm64/include/asm/tlbflush.h | 99 ++++++++++++++++++++++++++++++-
arch/arm64/kernel/cpufeature.c | 10 ++++
4 files changed, 112 insertions(+), 4 deletions(-)
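
(Note, not part of the patch: as a quick sanity check of the operand
layout documented in tlbflush.h below, this stand-alone snippet builds
the TLBI range operand the same way __TLBI_VADDR_RANGE() does;
tlbi_vaddr_range() is a made-up name, and PAGE_SHIFT = 12 assumes 4KB
pages.)

	#include <stdio.h>

	#define PAGE_SHIFT 12	/* 4KB pages assumed for this example */

	/*
	 * Mirrors __TLBI_VADDR_RANGE():
	 * ASID[63:48] TG[47:46] SCALE[45:44] NUM[43:39] TTL[38:37] BADDR[36:0]
	 */
	static unsigned long long tlbi_vaddr_range(unsigned long long addr,
			unsigned int asid, unsigned int tg, unsigned int scale,
			unsigned int num, unsigned int ttl)
	{
		unsigned long long ta = addr >> PAGE_SHIFT;

		ta |= (unsigned long long)ttl << 37;
		ta |= (unsigned long long)num << 39;
		ta |= (unsigned long long)scale << 44;
		ta |= (unsigned long long)tg << 46;
		ta |= (unsigned long long)asid << 48;
		return ta;
	}

	int main(void)
	{
		/* TG = 1 (4KB), SCALE = 1, NUM = 7: a 512-page range */
		printf("%#llx\n", tlbi_vaddr_range(0x400000ULL, 42, 1, 1, 7, 0));
		return 0;
	}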

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index ac1dbca3d0cd..5b5230060e5b 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -54,7 +54,8 @@
#define ARM64_WORKAROUND_1463225 44
#define ARM64_WORKAROUND_CAVIUM_TX2_219_TVM 45
#define ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM 46
+#define ARM64_HAS_TLBI_RANGE 47

-#define ARM64_NCAPS 47
+#define ARM64_NCAPS 48

#endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 6e919fafb43d..a6abbf2b067d 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -539,6 +539,7 @@
ENDIAN_SET_EL1 | SCTLR_EL1_UCI | SCTLR_EL1_RES1)

/* id_aa64isar0 */
+#define ID_AA64ISAR0_TLB_SHIFT 56
#define ID_AA64ISAR0_TS_SHIFT 52
#define ID_AA64ISAR0_FHM_SHIFT 48
#define ID_AA64ISAR0_DP_SHIFT 44
@@ -552,6 +553,9 @@
#define ID_AA64ISAR0_SHA1_SHIFT 8
#define ID_AA64ISAR0_AES_SHIFT 4

+#define ID_AA64ISAR0_TLB_NI 0x0
+#define ID_AA64ISAR0_TLB_RANGE 0x2
+
/* id_aa64isar1 */
#define ID_AA64ISAR1_SB_SHIFT 36
#define ID_AA64ISAR1_FRINTTS_SHIFT 32
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index bc3949064725..f49bed7ecb68 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -59,6 +59,33 @@
__ta; \
})

+/*
+ * This macro creates a properly formatted VA operand for the TLBI range
+ * instructions. The value bit assignments are:
+ *
+ * +----------+------+-------+-------+-------+----------------------+
+ * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
+ * +----------+------+-------+-------+-------+----------------------+
+ * |63      48|47  46|45   44|43   39|38   37|36                   0|
+ *
+ * The address range is determined by the following formula:
+ * [BADDR, BADDR + (NUM + 1) * 2^(5 * SCALE + 1) * PAGESIZE)
+ */
+#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl) \
+ ({ \
+ unsigned long __ta = (addr) >> PAGE_SHIFT; \
+ __ta |= (unsigned long)(ttl) << 37; \
+ __ta |= (unsigned long)(num) << 39; \
+ __ta |= (unsigned long)(scale) << 44; \
+ __ta |= (unsigned long)(tg) << 46; \
+ __ta |= (unsigned long)(asid) << 48; \
+ __ta; \
+ })
+
+#define TLB_RANGE_MASK_SHIFT 5
+#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT - 1, 0)
+
/*
* TLB Invalidation
* ================
@@ -177,9 +204,9 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
*/
#define MAX_TLBI_OPS PTRS_PER_PTE

-static inline void __flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long stride, bool last_level)
+static inline void __flush_tlb_range_old(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long stride, bool last_level)
{
unsigned long asid = ASID(vma->vm_mm);
unsigned long addr;
@@ -211,6 +238,72 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
dsb(ish);
}

+static inline void __flush_tlb_range_new(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long stride, bool last_level)
+{
+ int num = 0;
+ int scale = 0;
+ int ttl = 0;
+ int tg = (PAGE_SHIFT - 12) / 2 + 1;
+ unsigned long asid = ASID(vma->vm_mm);
+ unsigned long addr = 0;
+ unsigned long offset = (end - start) >> PAGE_SHIFT;
+
+ /*
+ * The SCALE/NUM encoding covers at most (1UL << 21) - 1 PAGEs,
+ * so fall back to flushing the whole mm for larger ranges.
+ */
+ if (offset >= (1UL << 21)) {
+ flush_tlb_mm(vma->vm_mm);
+ return;
+ }
+
+ dsb(ishst);
+
+ /*
+ * The minimum range covered by TLBI RANGE is 2 PAGEs;
+ * use a normal TLBI instruction to handle the odd PAGE first.
+ */
+ if (offset % 2 == 1) {
+ addr = __TLBI_VADDR(start, asid);
+ if (last_level) {
+ __tlbi(vale1is, addr);
+ __tlbi_user(vale1is, addr);
+ } else {
+ __tlbi(vae1is, addr);
+ __tlbi_user(vae1is, addr);
+ }
+ start += 1 << PAGE_SHIFT;
+ offset -= 1;
+ }
+
+ /*
+ * Each range operation covers (num + 1) * 2^(5 * scale + 1) PAGEs,
+ * so consume offset in 5-bit chunks of 2-PAGE units.
+ */
+ offset >>= 1;
+ while (offset > 0) {
+ num = (offset & TLB_RANGE_MASK) - 1;
+ if (num >= 0) {
+ addr = __TLBI_VADDR_RANGE(start, asid, tg,
+ scale, num, ttl);
+ if (last_level) {
+ __tlbi(rvale1is, addr);
+ __tlbi_user(rvale1is, addr);
+ } else {
+ __tlbi(rvae1is, addr);
+ __tlbi_user(rvae1is, addr);
+ }
+ start += (num + 1) << (5 * scale + 1) << PAGE_SHIFT;
+ }
+ scale++;
+ offset >>= TLB_RANGE_MASK_SHIFT;
+ }
+ dsb(ish);
+}
+
+static inline void __flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long stride, bool last_level)
+{
+ if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE))
+ __flush_tlb_range_new(vma, start, end, stride, last_level);
+ else
+ __flush_tlb_range_old(vma, start, end, stride, last_level);
+}
+
static inline void flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 80f459ad0190..bdefd8a34729 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1566,6 +1566,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.min_field_value = 1,
},
#endif
+ {
+ .desc = "TLB range maintenance instruction",
+ .capability = ARM64_HAS_TLBI_RANGE,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_cpuid_feature,
+ .sys_reg = SYS_ID_AA64ISAR0_EL1,
+ .field_pos = ID_AA64ISAR0_TLB_SHIFT,
+ .sign = FTR_UNSIGNED,
+ .min_field_value = ID_AA64ISAR0_TLB_RANGE,
+ },
{},
};

--
2.19.1