Re: [tip: x86/urgent] x86/mm: Fix pti_clone_entry_text() for i386

From: Guenter Roeck
Date: Tue Aug 06 2024 - 16:16:34 EST


On 8/6/24 11:48, Peter Zijlstra wrote:
On Tue, Aug 06, 2024 at 09:42:23AM -0700, Guenter Roeck wrote:
On 8/6/24 08:59, Peter Zijlstra wrote:
On Tue, Aug 06, 2024 at 05:46:53PM +0200, Peter Zijlstra wrote:
On Tue, Aug 06, 2024 at 05:05:15PM +0200, Peter Zijlstra wrote:
On Tue, Aug 06, 2024 at 04:56:32PM +0200, Peter Zijlstra wrote:
On Tue, Aug 06, 2024 at 07:25:42AM -0700, Guenter Roeck wrote:

I created http://server.roeck-us.net/qemu/x86-v6.11-rc2/ with all
the relevant information. Please let me know if you need anything else.

So I grabbed that config, stuck it in the build dir I used last time and
upgraded gcc-13 from 13.2 ro 13.3. But alas, my build runs successfully
:/

Is there anything else special I missed?

run.sh is not exacrlty the same this time, different CPU model, that
made it go.

OK, lemme poke at this.

Urgh, so crypto's late_initcall() does user-mode-helper based modprobe
looking for algorithms before we kick off /bin/init :/

This makes things difficult.

Urgh.

So the problem is that mark_readonly() splits a code PMD due to NX. Then
the second pti_clone_entry_text() finds a kernel PTE but a user PMD
mapping for the same address (from the early clone) and gets upset.

And we can't run mark_readonly() sooner, because initcall expect stuff
to be RW. But initcalls do modprobe, which runs user crap before we're
done initializing everything.

This is a right mess, and I really don't know what to do.

And there was me thinking this one should be easy to solve. Oh well.

Maybe Linus has an idea ? I am getting a bit wary to reporting all those
weird problems to him, though.

While I had dinner with the family, Thomas cooked up the below, which
seems to make it happy.

It basically splits the PMD on the late copy.


The patch below passes all my tests.

Tested-by: Guenter Roeck <linux@xxxxxxxxxxxx>

Thanks!

Guenter

---
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index bfdf5f45b137..b6ebbc9c23b1 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -241,7 +241,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
-static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pmd_t *pmd;
@@ -251,10 +251,15 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
if (!pmd)
return NULL;
- /* We can't do anything sensible if we hit a large mapping. */
+ /* Large PMD mapping found */
if (pmd_leaf(*pmd)) {
- WARN_ON(1);
- return NULL;
+ /* Clear the PMD if we hit a large mapping from the first round */
+ if (late_text) {
+ set_pmd(pmd, __pmd(0));
+ } else {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
}
if (pmd_none(*pmd)) {
@@ -283,7 +288,7 @@ static void __init pti_setup_vsyscall(void)
if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
return;
- target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false);
if (WARN_ON(!target_pte))
return;
@@ -301,7 +306,7 @@ enum pti_clone_level {
static void
pti_clone_pgtable(unsigned long start, unsigned long end,
- enum pti_clone_level level)
+ enum pti_clone_level level, bool late_text)
{
unsigned long addr;
@@ -390,7 +395,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
return;
/* Allocate PTE in the user page-table */
- target_pte = pti_user_pagetable_walk_pte(addr);
+ target_pte = pti_user_pagetable_walk_pte(addr, late_text);
if (WARN_ON(!target_pte))
return;
@@ -452,7 +457,7 @@ static void __init pti_clone_user_shared(void)
phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
pte_t *target_pte;
- target_pte = pti_user_pagetable_walk_pte(va);
+ target_pte = pti_user_pagetable_walk_pte(va, false);
if (WARN_ON(!target_pte))
return;
@@ -475,7 +480,7 @@ static void __init pti_clone_user_shared(void)
start = CPU_ENTRY_AREA_BASE;
end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
- pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+ pti_clone_pgtable(start, end, PTI_CLONE_PMD, false);
}
#endif /* CONFIG_X86_64 */
@@ -492,11 +497,11 @@ static void __init pti_setup_espfix64(void)
/*
* Clone the populated PMDs of the entry text and force it RO.
*/
-static void pti_clone_entry_text(void)
+static void pti_clone_entry_text(bool late)
{
pti_clone_pgtable((unsigned long) __entry_text_start,
(unsigned long) __entry_text_end,
- PTI_LEVEL_KERNEL_IMAGE);
+ PTI_LEVEL_KERNEL_IMAGE, late);
}
/*
@@ -571,7 +576,7 @@ static void pti_clone_kernel_text(void)
* pti_set_kernel_image_nonglobal() did to clear the
* global bit.
*/
- pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false);
/*
* pti_clone_pgtable() will set the global bit in any PMDs
@@ -639,7 +644,7 @@ void __init pti_init(void)
/* Undo all global bits from the init pagetables in head_64.S: */
pti_set_kernel_image_nonglobal();
/* Replace some of the global bits just for shared entry text: */
- pti_clone_entry_text();
+ pti_clone_entry_text(false);
pti_setup_espfix64();
pti_setup_vsyscall();
}
@@ -659,7 +664,7 @@ void pti_finalize(void)
* We need to clone everything (again) that maps parts of the
* kernel image.
*/
- pti_clone_entry_text();
+ pti_clone_entry_text(true);
pti_clone_kernel_text();
debug_checkwx_user();