Hi Guanhui, I did some basic tests based on v1; it works well.
The arm64 can build 2M/1G block/section mapping. When using DMA/DMA32 zone
(enable crashkernel, disable rodata full, disable kfence), the mem_map will
use non block/section mapping (because crashkernel requires shrinking the region
at page granularity). But this degrades performance when doing large
contiguous memory accesses in the kernel (memcpy/memmove, etc).
There are many changes and discussions:
commit 031495635b46 ("arm64: Do not defer reserve_crashkernel() for
platforms with no DMA memory zones")
commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into
mem_init()")
commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel
reservation is required")
This patch changes mem_map to use block/section mapping with crashkernel.
First, do block/section mapping (normally 2M or 1G) for all available memory in
mem_map and reserve the crashkernel memory. Then walk the page table to split
block/section mappings into non block/section mappings (normally 4K) only
for the crashkernel memory. So the linear memory mapping uses block/section
mappings as much as possible. This noticeably reduces CPU dTLB misses and
speeds up memory access by about 10-20%.
I have tested it with pft (Page Fault Test) and fio, and obtained a great
performance improvement.
For fio test:
1.prepare ramdisk
modprobe -r brd
modprobe brd rd_nr=1 rd_size=67108864
dmsetup remove_all
wipefs -a --force /dev/ram0
mkfs -t ext4 -E lazy_itable_init=0,lazy_journal_init=0 -q -F /dev/ram0
mkdir -p /fs/ram0
mount -t ext4 /dev/ram0 /fs/ram0
2.prepare fio parameters in x.fio file:
[global]
bs=4k
ioengine=psync
iodepth=128
size=32G
direct=1
invalidate=1
group_reporting
thread=1
rw=read
directory=/fs/ram0
numjobs=1
[task_0]
cpus_allowed=16
stonewall=1
3.run testcase:
perf stat -e dTLB-load-misses fio x.fio
4.contrast
------------------------
                    without patch       with patch
fio READ            aggrb=1493.2MB/s    aggrb=1775.3MB/s
dTLB-load-misses    1,818,320,693       438,729,774
time elapsed(s)     70.500326434        62.877316408
user(s)             15.926332000        15.684721000
sys(s)              54.211939000        47.046165000
5.conclusion
Using this patch will reduce dTLB misses and improve performance greatly.
There are many comments in init.c which were added by commit 031495635b46 ("arm64:
Do not defer reserve_crashkernel() for platforms with no DMA memory zones")
Signed-off-by: Guanghui Feng <guanghuifeng@xxxxxxxxxxxxxxxxx>
---
arch/arm64/include/asm/mmu.h | 1 +
arch/arm64/mm/init.c | 8 +-
arch/arm64/mm/mmu.c | 231 ++++++++++++++++++++++++++++++-------------
3 files changed, 168 insertions(+), 72 deletions(-)
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 48f8466..1a46b81 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -63,6 +63,7 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
extern void arm64_memblock_init(void);
extern void paging_init(void);
extern void bootmem_init(void);
+extern void map_crashkernel(void);
extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
extern void init_mem_pgprot(void);
extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 339ee84..241d27e 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -190,6 +190,7 @@ static void __init reserve_crashkernel(void)
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
insert_resource(&iomem_resource, &crashk_res);
+ map_crashkernel();
}
/*
ditto
@@ -388,10 +389,6 @@ void __init arm64_memblock_init(void)
}
early_init_fdt_scan_reserved_mem();
-
- if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
- reserve_crashkernel();
-
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
}
@@ -438,8 +435,7 @@ void __init bootmem_init(void)
* request_standard_resources() depends on crashkernel's memory being
* reserved, so do it here.
*/
- if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
Skip mutex_lock too as v1 said.
- reserve_crashkernel();
+ reserve_crashkernel();
memblock_dump_all();
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 626ec32..4b779cf 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -42,6 +42,7 @@
#define NO_BLOCK_MAPPINGS BIT(0)
#define NO_CONT_MAPPINGS BIT(1)
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
+#define NO_SEC_REMAPPINGS BIT(3) /* rebuild with non block/sec mapping*/
u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN);
u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
@@ -156,11 +157,12 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
}
static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
- phys_addr_t phys, pgprot_t prot)
+ phys_addr_t phys, pgprot_t prot, int flags)
{
pte_t *ptep;
- ptep = pte_set_fixmap_offset(pmdp, addr);
+ ptep = (flags & NO_SEC_REMAPPINGS) ? pte_offset_kernel(pmdp, addr) :
+ pte_set_fixmap_offset(pmdp, addr);
do {
pte_t old_pte = READ_ONCE(*ptep);
@@ -176,7 +178,8 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
phys += PAGE_SIZE;
} while (ptep++, addr += PAGE_SIZE, addr != end);
- pte_clear_fixmap();
+ if (!(flags & NO_SEC_REMAPPINGS))
+ pte_clear_fixmap();
}
static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
@@ -208,16 +211,59 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
next = pte_cont_addr_end(addr, end);
/* use a contiguous mapping if the range is suitably aligned */
- if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
+ if (!(flags & NO_SEC_REMAPPINGS) &&
+ (((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
(flags & NO_CONT_MAPPINGS) == 0)
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
- init_pte(pmdp, addr, next, phys, __prot);
+ init_pte(pmdp, addr, next, phys, __prot, flags);
phys += next - addr;
} while (addr = next, addr != end);
}
+/*
+ * init_pmd_remap() - remap [addr, end) under @pudp at PTE granularity.
+ *
+ * @pudp:          PUD entry whose PMD table covers the range
+ * @addr, @end:    virtual range to remap
+ * @phys:          physical address corresponding to @addr
+ * @prot:          protection bits for the new PTE mappings
+ * @pgtable_alloc: allocator returning the physical address of a new table
+ * @flags:         NO_* mapping flags; NO_SEC_REMAPPINGS is expected to be set
+ *
+ * The PMD table is reached with pmd_offset() rather than through the fixmap.
+ * For every PMD entry that is currently a section mapping, a PTE table is
+ * allocated and installed in its place. The parts of the former section
+ * lying outside [addr, next) are mapped again (head and tail) with
+ * NO_SEC_REMAPPINGS cleared, and finally the requested range itself is
+ * mapped through alloc_init_cont_pte().
+ *
+ * NOTE(review): between pmd_clear() and the re-creation of the head/tail
+ * PTEs the whole former section is unmapped; callers presumably must not
+ * touch that range concurrently -- confirm.
+ */
+static void init_pmd_remap(pud_t *pudp, unsigned long addr, unsigned long end,
+			   phys_addr_t phys, pgprot_t prot,
+			   phys_addr_t (*pgtable_alloc)(int), int flags)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+	phys_addr_t map_offset;
+	pmdval_t pmdval;
+
+	pmdp = pmd_offset(pudp, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
+			phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
+
+			/* Replace the section entry with an empty PTE table. */
+			pmd_clear(pmdp);
+			pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
+			if (flags & NO_EXEC_MAPPINGS)
+				pmdval |= PMD_TABLE_PXN;
+			__pmd_populate(pmdp, pte_phys, pmdval);
+			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+			/* Head: [start of this PMD, addr) keeps its mapping. */
+			map_offset = addr - (addr & PMD_MASK);
+			if (map_offset)
+				alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
+						    phys - map_offset, prot,
+						    pgtable_alloc,
+						    flags & (~NO_SEC_REMAPPINGS));
+
+			/*
+			 * Tail: [next, end of this PMD) keeps its mapping.
+			 * The upper bound must be the end of this PMD, not of
+			 * the enclosing PUD: the PTE table installed above
+			 * only spans PMD_SIZE.
+			 */
+			if (next < (addr & PMD_MASK) + PMD_SIZE)
+				alloc_init_cont_pte(pmdp, next,
+						    (addr & PMD_MASK) + PMD_SIZE,
+						    next - addr + phys,
+						    prot, pgtable_alloc,
+						    flags & (~NO_SEC_REMAPPINGS));
+		}
+		/* Map the requested range itself at page granularity. */
+		alloc_init_cont_pte(pmdp, addr, next, phys, prot,
+				    pgtable_alloc, flags);
+		phys += next - addr;
+	} while (pmdp++, addr = next, addr != end);
+}
+
+
static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
phys_addr_t (*pgtable_alloc)(int), int flags)
@@ -286,16 +332,87 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
next = pmd_cont_addr_end(addr, end);
/* use a contiguous mapping if the range is suitably aligned */
- if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
+ if (!(flags & NO_SEC_REMAPPINGS) &&
+ (((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
(flags & NO_CONT_MAPPINGS) == 0)
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
- init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
+ if (flags & NO_SEC_REMAPPINGS)
+ init_pmd_remap(pudp, addr, next, phys, __prot,
+ pgtable_alloc, flags);
+ else
+ init_pmd(pudp, addr, next, phys, __prot,
+ pgtable_alloc, flags);
phys += next - addr;
} while (addr = next, addr != end);
}
+/*
+ * init_pud_remap() - remap [addr, next) under one PUD entry without block
+ * mappings.
+ *
+ * If *pudp is currently a section (block) mapping, it is replaced by a
+ * newly allocated PMD table: the entry is cleared, populated as a table
+ * (UXN, plus PXN when NO_EXEC_MAPPINGS is set) and the TLB is flushed for
+ * [addr, addr + PAGE_SIZE). The parts of the former block outside
+ * [addr, next) are then mapped again with NO_SEC_REMAPPINGS cleared, i.e.
+ * through the regular mapping path.
+ *
+ * In all cases the requested range [addr, next) is finally handed to
+ * alloc_init_cont_pmd() with @flags unchanged.
+ *
+ * NOTE(review): while the entry is cleared the whole former block is
+ * unmapped; presumably nothing may access it concurrently -- confirm.
+ */
+static void init_pud_remap(pud_t *pudp, unsigned long addr, unsigned long next,
+ phys_addr_t phys, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int),
+ int flags)
+{
+ pudval_t pudval;
+ phys_addr_t map_offset;
+
+ if (!pud_none(*pudp) && pud_sect(*pudp)) {
+ phys_addr_t pmd_phys = pgtable_alloc(PMD_SHIFT);
+ /* Swap the block entry for an (initially empty) PMD table. */
+ pud_clear(pudp);
+ pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
+ if (flags & NO_EXEC_MAPPINGS)
+ pudval |= PUD_TABLE_PXN;
+
+ __pud_populate(pudp, pmd_phys, pudval);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+ /* Head: re-create [start of this PUD, addr). */
+ map_offset = addr - (addr & PUD_MASK);
+ if (map_offset)
+ alloc_init_cont_pmd(pudp, addr & PUD_MASK,
+ addr, phys - map_offset,
+ prot, pgtable_alloc,
+ flags & (~NO_SEC_REMAPPINGS));
+
+ /* Tail: re-create [next, end of this PUD). */
+ if (next < (addr & PUD_MASK) + PUD_SIZE)
+ alloc_init_cont_pmd(pudp, next,
+ (addr & PUD_MASK) + PUD_SIZE,
+ next - addr + phys,
+ prot, pgtable_alloc,
+ flags & (~NO_SEC_REMAPPINGS));
+ }
+ /* Remap the requested range itself; flags still carry NO_SEC_REMAPPINGS. */
+ alloc_init_cont_pmd(pudp, addr, next, phys, prot,
+ pgtable_alloc, flags);
+}
+
+
+/*
+ * init_pud() - populate a single PUD entry for [addr, next).
+ *
+ * Installs a 1GB block mapping when the range allows it, otherwise
+ * descends to the PMD level. The entry's previous value is sampled first
+ * so the update can be sanity-checked afterwards.
+ */
+static void init_pud(pud_t *pudp, unsigned long addr, unsigned long next,
+		     phys_addr_t phys, pgprot_t prot,
+		     phys_addr_t (*pgtable_alloc)(int),
+		     int flags)
+{
+	pud_t prev = READ_ONCE(*pudp);
+	bool use_block;
+
+	/* For 4K granule only, attempt to put down a 1GB block */
+	use_block = pud_sect_supported() &&
+		    ((addr | next | phys) & ~PUD_MASK) == 0 &&
+		    (flags & NO_BLOCK_MAPPINGS) == 0;
+
+	if (!use_block) {
+		alloc_init_cont_pmd(pudp, addr, next, phys, prot,
+				    pgtable_alloc, flags);
+
+		/* A previously populated entry must be left untouched. */
+		BUG_ON(pud_val(prev) != 0 &&
+		       pud_val(prev) != READ_ONCE(pud_val(*pudp)));
+		return;
+	}
+
+	pud_set_huge(pudp, phys, prot);
+
+	/*
+	 * Once a PUD entry has been populated, only its permission
+	 * attributes are allowed to change.
+	 */
+	BUG_ON(!pgattr_change_is_safe(pud_val(prev),
+				      READ_ONCE(pud_val(*pudp))));
+}
+
+
static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
phys_addr_t (*pgtable_alloc)(int),
@@ -325,37 +442,24 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
*/
if (system_state != SYSTEM_BOOTING)
mutex_lock(&fixmap_lock);
- pudp = pud_set_fixmap_offset(p4dp, addr);
- do {
- pud_t old_pud = READ_ONCE(*pudp);
+ pudp = (flags & NO_SEC_REMAPPINGS) ? pud_offset(p4dp, addr) :
+ pud_set_fixmap_offset(p4dp, addr);
+ do {
next = pud_addr_end(addr, end);
- /*
- * For 4K granule only, attempt to put down a 1GB block
- */
- if (pud_sect_supported() &&
- ((addr | next | phys) & ~PUD_MASK) == 0 &&
- (flags & NO_BLOCK_MAPPINGS) == 0) {
- pud_set_huge(pudp, phys, prot);
-
- /*
- * After the PUD entry has been populated once, we
- * only allow updates to the permission attributes.
- */
- BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
- READ_ONCE(pud_val(*pudp))));
- } else {
- alloc_init_cont_pmd(pudp, addr, next, phys, prot,
- pgtable_alloc, flags);
-
- BUG_ON(pud_val(old_pud) != 0 &&
- pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
- }
+ if (flags & NO_SEC_REMAPPINGS)
+ init_pud_remap(pudp, addr, next, phys, prot,
+ pgtable_alloc, flags);
+ else
+ init_pud(pudp, addr, next, phys, prot, pgtable_alloc,
+ flags);
phys += next - addr;
} while (pudp++, addr = next, addr != end);
- pud_clear_fixmap();
+ if (!(flags & NO_SEC_REMAPPINGS))
+ pud_clear_fixmap();
+
if (system_state != SYSTEM_BOOTING)
mutex_unlock(&fixmap_lock);
}
@@ -483,20 +587,39 @@ void __init mark_linear_text_alias_ro(void)
PAGE_KERNEL_RO);
}
-static bool crash_mem_map __initdata;
+#ifdef CONFIG_KEXEC_CORE
+/*
+ * Page-table allocator used while remapping the crashkernel region.
+ *
+ * Always allocates exactly one page from memblock, regardless of @shift,
+ * zeroes it through its linear-map address and returns its physical
+ * address. Panics if the allocation fails.
+ *
+ * NOTE(review): the memset() goes through __phys_to_virt(), so this
+ * assumes the allocated page is already mapped in the linear map -- confirm.
+ */
+static phys_addr_t __init early_crashkernel_pgtable_alloc(int shift)
+{
+ phys_addr_t phys;
+ void *ptr;
-static int __init enable_crash_mem_map(char *arg)
+ phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
+ MEMBLOCK_ALLOC_NOLEAKTRACE);
+ if (!phys)
+ panic("Failed to allocate page table page\n");
+
+ ptr = (void *)__phys_to_virt(phys);
+ memset(ptr, 0, PAGE_SIZE);
+ return phys;
+}
+
+/*
+ * Remap the reserved crashkernel region in swapper_pg_dir with
+ * NO_EXEC_MAPPINGS | NO_SEC_REMAPPINGS, i.e. splitting any block/section
+ * mappings that cover it.
+ *
+ * Does nothing when can_set_direct_map() is true or CONFIG_KFENCE is
+ * enabled (presumably the linear map is already page-granular in those
+ * configurations -- verify against map_mem()), or when crashk_res.end is 0.
+ */
+void __init map_crashkernel(void)
 {
- /*
- * Proper parameter parsing is done by reserve_crashkernel(). We only
- * need to know if the linear map has to avoid block mappings so that
- * the crashkernel reservations can be unmapped later.
- */
- crash_mem_map = true;
+ if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
+ return;
- return 0;
+ if (!crashk_res.end)
+ return;
+
+ __create_pgd_mapping(swapper_pg_dir, crashk_res.start,
+ __phys_to_virt(crashk_res.start),
+ crashk_res.end + 1 - crashk_res.start, PAGE_KERNEL,
+ early_crashkernel_pgtable_alloc,
+ NO_EXEC_MAPPINGS | NO_SEC_REMAPPINGS);
 }
-early_param("crashkernel", enable_crash_mem_map);
+#else
+/* Without CONFIG_KEXEC_CORE there is no crashkernel region to remap. */
+void __init map_crashkernel(void) {}
+#endif
static void __init map_mem(pgd_t *pgdp)
{
@@ -527,17 +650,6 @@ static void __init map_mem(pgd_t *pgdp)
*/
memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
-#ifdef CONFIG_KEXEC_CORE
- if (crash_mem_map) {
- if (IS_ENABLED(CONFIG_ZONE_DMA) ||
- IS_ENABLED(CONFIG_ZONE_DMA32))
- flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
- else if (crashk_res.end)
- memblock_mark_nomap(crashk_res.start,
- resource_size(&crashk_res));
- }
-#endif
-
/* map all the memory banks */
for_each_mem_range(i, &start, &end) {
if (start >= end)
@@ -570,19 +682,6 @@ static void __init map_mem(pgd_t *pgdp)
* in page granularity and put back unused memory to buddy system
* through /sys/kernel/kexec_crash_size interface.
*/
-#ifdef CONFIG_KEXEC_CORE
- if (crash_mem_map &&
- !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
- if (crashk_res.end) {
- __map_memblock(pgdp, crashk_res.start,
- crashk_res.end + 1,
- PAGE_KERNEL,
- NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
- memblock_clear_nomap(crashk_res.start,
- resource_size(&crashk_res));
- }
- }
-#endif
}
void mark_rodata_ro(void)