Re: [RFC PATCH v2 17/19] x86/mm/cpa: PKS protect direct map page tables

From: Mike Rapoport
Date: Tue Aug 31 2021 - 06:14:27 EST


On Mon, Aug 30, 2021 at 04:59:25PM -0700, Rick Edgecombe wrote:
> Protecting direct map page tables is a bit more difficult because a page
> table may be needed for a page split as part of setting the PKS
> permission on the new page table. So in the case of an empty cache of page
> tables the page table allocator could get into a situation where it cannot
> create any more page tables.
>
> Several solutions were looked at:
>
> 1. Break the direct map with pages allocated from the large page being
> converted to PKS. This would result in a window where the table could be
> written to right before it was linked into the page tables. It also
> depends on high order pages being available, and so would regress from
> the un-protected behavior in that respect.
> 2. Hold some page tables in reserve to be able to break the large page
> for a new 2MB page, but if there are no 2MB pages available we may need
> to add a single page to the cache, in which case we would use up the
> reserve of page tables needed to break a new page, but not get enough
> page tables back to replenish the reserve.
> 3. Always map the direct map at 4k when protecting page tables so that
> pages don't need to be broken to map them with a PKS key. This would have
> undesirable performance.
>
> 4. Lastly, the strategy employed in this patch, have a separate cache of
> page tables just used for the direct map. Early in boot, squirrel away
> enough page tables to map the direct map at 4k. This comes with the same
> memory overhead of mapping the direct map at 4k, but gets the other
> benefits of mapping the direct map as large pages.
>
> There is also the problem of protecting page tables that are allocated
> during boot. Instead of recording the tables to protect later, create a
> page table traversing infrastructure to walk every page table in init_mm
> and apply protection. This also covers non-direct map odds-and-ends page
> tables that are allocated during boot. The existing page table traversing
> in pagewalk.c cannot be used for this purpose because there are no actual
> vmas for all of the kernel address space.
>
> The algorithm for protecting the direct map page table cache, while also
> allocating from it for direct map splits is described in the comments of
> init_pks_dmap_tables().
>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx>
> ---
> arch/x86/include/asm/set_memory.h | 2 +
> arch/x86/mm/init.c | 89 ++++++++++
> arch/x86/mm/pat/set_memory.c | 263 +++++++++++++++++++++++++++++-
> 3 files changed, 350 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
> index 1ba2fb45ed05..9f8d0d0ae063 100644
> --- a/arch/x86/include/asm/set_memory.h
> +++ b/arch/x86/include/asm/set_memory.h
> @@ -90,6 +90,8 @@ bool kernel_page_present(struct page *page);
>
> extern int kernel_set_to_readonly;
>
> +void add_dmap_table(unsigned long addr);
> +
> #ifdef CONFIG_X86_64
> /*
> * Prevent speculative access to the page by either unmapping
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index c8933c6d5efd..a91696e3da96 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -6,6 +6,7 @@
> #include <linux/swapfile.h>
> #include <linux/swapops.h>
> #include <linux/kmemleak.h>
> +#include <linux/hugetlb.h>
> #include <linux/sched/task.h>
>
> #include <asm/set_memory.h>
> @@ -26,6 +27,7 @@
> #include <asm/pti.h>
> #include <asm/text-patching.h>
> #include <asm/memtype.h>
> +#include <asm/pgalloc.h>
>
> /*
> * We need to define the tracepoints somewhere, and tlb.c
> @@ -119,6 +121,17 @@ __ref void *alloc_low_pages(unsigned int num)
> if (after_bootmem) {
> unsigned int order;
>
> + if (cpu_feature_enabled(X86_FEATURE_PKS_TABLES)) {
> + struct page *page;
> +
> + /* 64 bit only allocates order 0 pages */
> + WARN_ON(num != 1);
> +
> + page = alloc_table(GFP_ATOMIC | __GFP_ZERO);
> + if (!page)
> + return NULL;
> + return (void *)page_address(page);
> + }
> order = get_order((unsigned long)num << PAGE_SHIFT);
> return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
> }
> @@ -504,6 +517,79 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
> return false;
> }
>
> +#ifdef CONFIG_PKS_PG_TABLES
> +/* Page tables needed in bytes */
> +static u64 calc_tables_needed(unsigned int size)
> +{
> + unsigned int puds = size >> PUD_SHIFT;
> + unsigned int pmds = size >> PMD_SHIFT;
> +
> + /*
> + * Catch if direct map ever might need more page tables to split
> + * down to 4k.
> + */
> + BUILD_BUG_ON(p4d_huge(foo));
> + BUILD_BUG_ON(pgd_huge(foo));
> +
> + return (puds + pmds) << PAGE_SHIFT;
> +}
> +
> +/*
> + * If pre boot, reserve large pages from memory that will be mapped. It's ok that this is not
> + * mapped as PKS, other init code in CPA will handle the conversion.
> + */
> +static unsigned int __init reserve_pre_boot(u64 start, u64 end)
> +{
> + u64 cur = memblock_find_in_range(start, end, HPAGE_SIZE, HPAGE_SIZE);
> + int i;

Please use memblock_phys_alloc_range() here.
Besides, it seems these reserved pages are not accessed until late_initcall
time, so there is no need to limit the allocation to already mapped areas;
memblock_alloc_raw() would suffice.

> +
> + if (!cur)
> + return 0;
> + memblock_reserve(cur, HPAGE_SIZE);
> + for (i = 0; i < HPAGE_SIZE; i += PAGE_SIZE)
> + add_dmap_table((unsigned long)__va(cur + i));
> + return HPAGE_SIZE;
> +}
> +
> +/* If post boot, memblock is not available. Just reserve from other memory regions */
> +static unsigned int __init reserve_post_boot(void)
> +{
> + struct page *page = alloc_table(GFP_KERNEL);
> +
> + if (!page)
> + return 0;
> +
> + add_dmap_table((unsigned long)page_address(page));

add_dmap_table() calls use casting everywhere, maybe make it
add_dmap_table(void *)?

> +
> + return PAGE_SIZE;
> +}
> +
> +static void __init reserve_page_tables(u64 start, u64 end)
> +{
> + u64 reserve_size = calc_tables_needed(end - start);
> + u64 reserved = 0;
> + u64 cur_reserved;
> +
> + while (reserved < reserve_size) {
> + if (after_bootmem)
> + cur_reserved = reserve_post_boot();
> + else
> + cur_reserved = reserve_pre_boot(start, end);
> +
> + if (!cur_reserved) {
> + WARN(1, "Could not reserve direct map page tables %llu/%llu\n",
> + reserved,
> + reserve_size);
> + return;
> + }
> +
> + reserved += cur_reserved;
> + }
> +}
> +#else
> +static inline void reserve_page_tables(u64 start, u64 end) { }
> +#endif
> +
> /*
> * Setup the direct mapping of the physical memory at PAGE_OFFSET.
> * This runs before bootmem is initialized and gets pages directly from
> @@ -529,6 +615,9 @@ unsigned long __ref init_memory_mapping(unsigned long start,
>
> add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
>
> + if (cpu_feature_enabled(X86_FEATURE_PKS_TABLES))
> + reserve_page_tables(start, end);
> +
> return ret >> PAGE_SHIFT;
> }

--
Sincerely yours,
Mike.