Re: frequent lockups in 3.18rc4

From: Linus Torvalds
Date: Fri Nov 21 2014 - 17:55:34 EST


On Fri, Nov 21, 2014 at 1:11 PM, Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote:
>
> I'm fine with that. I just think it's not horrid enough, but that can
> be fixed easily :)

Oh, I think it's plenty horrid.

Anyway, here's an actual patch. As usual, it has seen absolutely no
actual testing, but I did try to make sure it compiles and seems to do
the right thing on:
- x86-32 no-PAE
- x86-32 no-PAE with PARAVIRT
- x86-32 PAE
- x86-64

Also, I just removed the noise that is "vmalloc_sync_all()", since
it's just all garbage and nothing actually uses it. Yeah, it's used by
"register_die_notifier()", which makes no sense whatsoever.
Whatever. It's gone.
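
For reference, the only caller left is in kernel/notifier.c, and from
memory it looks roughly like this (paraphrased, not verbatim):

    int notrace register_die_notifier(struct notifier_block *nb)
    {
            vmalloc_sync_all();
            return atomic_notifier_chain_register(&die_chain, nb);
    }

The atomic_notifier_chain_register() part is the real work; the
vmalloc_sync_all() call up front is the side effect that makes no
sense there.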

Can somebody actually *test* this? In particular, in any kind of real
paravirt environment? Or, any comments even without testing?

I *really* am not proud of the mess wrt the whole

#ifdef CONFIG_PARAVIRT
#ifdef CONFIG_X86_32
...

but I think that from a long-term perspective, we're actually better
off with this kind of really ugly - but very explicit - hack that very
clearly shows what is going on.

The old code that actually "walked" the page tables was more
"portable", but was somewhat misleading about what was actually going
on.
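
Spelled out, the #ifdef ladder in the new vmalloc_fault() boils down
to four cases (rough summary only, ignoring the index and presence
checks that come before it):

    #ifdef CONFIG_X86_PAE
            return -1;      /* top-level pgd entry is shared, nothing to fix up */
    #elif defined(CONFIG_PARAVIRT) && defined(CONFIG_X86_32)
            set_pmd((pmd_t *)pgd_dst, (pmd_t){(pud_t){pgd_entry}});
    #elif defined(CONFIG_PARAVIRT)
            set_pgd(pgd_dst, pgd_entry);
            arch_flush_lazy_mmu_mode();
    #else
            *pgd_dst = pgd_entry;   /* native: plain store of the kernel entry */
    #endif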

Comments?

Linus
arch/x86/mm/fault.c | 243 +++++++++++++---------------------------------------
1 file changed, 58 insertions(+), 185 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d973e61e450d..4b0a1b9404b1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -42,6 +42,64 @@ enum x86_pf_error_code {
};

/*
+ * Handle a possible vmalloc fault. We just copy the
+ * top-level page table entry if necessary.
+ *
+ * With PAE, the top-most pgd entry is always shared,
+ * and that's where the vmalloc area is. So PAE had
+ * better never have any vmalloc faults.
+ *
+ * NOTE! This on purpose does *NOT* use pgd_present()
+ * and such generic accessor functions, because
+ * the pgd may contain a folded pud/pmd, and is thus
+ * always "present". We access the actual hardware
+ * state directly, except for the final "set_pgd()"
+ * that may go through a paravirtualization layer.
+ *
+ * Also note the disgusting hackery for the whole
+ * paravirtualization case. Since PAE isn't an issue,
+ * we know that the pmd is the top level, and we just
+ * short-circuit it all.
+ *
+ * We *seriously* need to get rid of the crazy
+ * paravirtualization crud.
+ */
+static nokprobe_inline int vmalloc_fault(unsigned long address)
+{
+#ifdef CONFIG_X86_PAE
+ return -1;
+#else
+ pgd_t *pgd_dst, pgd_entry;
+ unsigned index = pgd_index(address);
+
+ if (index < KERNEL_PGD_BOUNDARY)
+ return -1;
+
+ pgd_entry = init_mm.pgd[index];
+ if (!(pgd_entry.pgd & 1))
+ return -1;
+
+ pgd_dst = __va(PAGE_MASK & read_cr3());
+ pgd_dst += index;
+
+ if (pgd_dst->pgd)
+ return -1;
+
+#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_X86_32
+ set_pmd((pmd_t *)pgd_dst, (pmd_t){(pud_t){pgd_entry}});
+#else
+ set_pgd(pgd_dst, pgd_entry);
+ arch_flush_lazy_mmu_mode(); // WTF?
+#endif
+#else
+ *pgd_dst = pgd_entry;
+#endif
+ return 0;
+#endif
+}
+
+/*
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
@@ -189,110 +247,6 @@ DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
-{
- unsigned index = pgd_index(address);
- pgd_t *pgd_k;
- pud_t *pud, *pud_k;
- pmd_t *pmd, *pmd_k;
-
- pgd += index;
- pgd_k = init_mm.pgd + index;
-
- if (!pgd_present(*pgd_k))
- return NULL;
-
- /*
- * set_pgd(pgd, *pgd_k); here would be useless on PAE
- * and redundant with the set_pmd() on non-PAE. As would
- * set_pud.
- */
- pud = pud_offset(pgd, address);
- pud_k = pud_offset(pgd_k, address);
- if (!pud_present(*pud_k))
- return NULL;
-
- pmd = pmd_offset(pud, address);
- pmd_k = pmd_offset(pud_k, address);
- if (!pmd_present(*pmd_k))
- return NULL;
-
- if (!pmd_present(*pmd))
- set_pmd(pmd, *pmd_k);
- else
- BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-
- return pmd_k;
-}
-
-void vmalloc_sync_all(void)
-{
- unsigned long address;
-
- if (SHARED_KERNEL_PMD)
- return;
-
- for (address = VMALLOC_START & PMD_MASK;
- address >= TASK_SIZE && address < FIXADDR_TOP;
- address += PMD_SIZE) {
- struct page *page;
-
- spin_lock(&pgd_lock);
- list_for_each_entry(page, &pgd_list, lru) {
- spinlock_t *pgt_lock;
- pmd_t *ret;
-
- /* the pgt_lock only for Xen */
- pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
-
- spin_lock(pgt_lock);
- ret = vmalloc_sync_one(page_address(page), address);
- spin_unlock(pgt_lock);
-
- if (!ret)
- break;
- }
- spin_unlock(&pgd_lock);
- }
-}
-
-/*
- * 32-bit:
- *
- * Handle a fault on the vmalloc or module mapping area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
- unsigned long pgd_paddr;
- pmd_t *pmd_k;
- pte_t *pte_k;
-
- /* Make sure we are in vmalloc area: */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
-
- WARN_ON_ONCE(in_nmi());
-
- /*
- * Synchronize this task's top level page-table
- * with the 'reference' page table.
- *
- * Do _not_ use "current" here. We might be inside
- * an interrupt in the middle of a task switch..
- */
- pgd_paddr = read_cr3();
- pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
- if (!pmd_k)
- return -1;
-
- pte_k = pte_offset_kernel(pmd_k, address);
- if (!pte_present(*pte_k))
- return -1;
-
- return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
/*
* Did it hit the DOS screen memory VA from vm86 mode?
*/
@@ -347,87 +301,6 @@ out:

#else /* CONFIG_X86_64: */

-void vmalloc_sync_all(void)
-{
- sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
-}
-
-/*
- * 64-bit:
- *
- * Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
- pgd_t *pgd, *pgd_ref;
- pud_t *pud, *pud_ref;
- pmd_t *pmd, *pmd_ref;
- pte_t *pte, *pte_ref;
-
- /* Make sure we are in vmalloc area: */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
-
- WARN_ON_ONCE(in_nmi());
-
- /*
- * Copy kernel mappings over when needed. This can also
- * happen within a race in page table update. In the later
- * case just flush:
- */
- pgd = pgd_offset(current->active_mm, address);
- pgd_ref = pgd_offset_k(address);
- if (pgd_none(*pgd_ref))
- return -1;
-
- if (pgd_none(*pgd)) {
- set_pgd(pgd, *pgd_ref);
- arch_flush_lazy_mmu_mode();
- } else {
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- }
-
- /*
- * Below here mismatches are bugs because these lower tables
- * are shared:
- */
-
- pud = pud_offset(pgd, address);
- pud_ref = pud_offset(pgd_ref, address);
- if (pud_none(*pud_ref))
- return -1;
-
- if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
- BUG();
-
- pmd = pmd_offset(pud, address);
- pmd_ref = pmd_offset(pud_ref, address);
- if (pmd_none(*pmd_ref))
- return -1;
-
- if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
- BUG();
-
- pte_ref = pte_offset_kernel(pmd_ref, address);
- if (!pte_present(*pte_ref))
- return -1;
-
- pte = pte_offset_kernel(pmd, address);
-
- /*
- * Don't use pte_page here, because the mappings can point
- * outside mem_map, and the NUMA hash lookup cannot handle
- * that:
- */
- if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
- BUG();
-
- return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR