[RFC][PATCH 1/6] mm: Dont assume page-table invariance during faults

From: Peter Zijlstra
Date: Mon Oct 20 2014 - 18:42:23 EST


One of the side effects of speculating on faults (without holding
mmap_sem) is that we can race with free_pgtables() and therefore we
cannot assume the page-tables will stick around.

Remove the relyance on the pte pointer.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
mm/memory.c | 76 ++++++++++++++++--------------------------------------------
1 file changed, 21 insertions(+), 55 deletions(-)

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1933,31 +1933,6 @@ int apply_to_page_range(struct mm_struct
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

-/*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically. Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
- pte_t *page_table, pte_t orig_pte)
-{
- int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
- if (sizeof(pte_t) > sizeof(unsigned long)) {
- spinlock_t *ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
- same = pte_same(*page_table, orig_pte);
- spin_unlock(ptl);
- }
-#endif
- pte_unmap(page_table);
- return same;
-}
-
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
debug_dma_assert_idle(src);
@@ -2407,21 +2382,18 @@ EXPORT_SYMBOL(unmap_mapping_range);
* as does filemap_fault().
*/
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned long address, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
struct page *page, *swapcache;
struct mem_cgroup *memcg;
swp_entry_t entry;
- pte_t pte;
+ pte_t *page_table, pte;
int locked;
int exclusive = 0;
int ret = 0;

- if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
- goto out;
-
entry = pte_to_swp_entry(orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
@@ -2624,15 +2596,13 @@ static inline int check_stack_guard_page
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned long address, pmd_t *pmd,
unsigned int flags)
{
struct mem_cgroup *memcg;
struct page *page;
spinlock_t *ptl;
- pte_t entry;
-
- pte_unmap(page_table);
+ pte_t entry, *page_table;

/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
@@ -3031,13 +3001,12 @@ static int do_shared_fault(struct mm_str
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned long address, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

- pte_unmap(page_table);
if (!(flags & FAULT_FLAG_WRITE))
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
@@ -3059,16 +3028,13 @@ static int do_linear_fault(struct mm_str
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned long address, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
pgoff_t pgoff;

flags |= FAULT_FLAG_NONLINEAR;

- if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
- return 0;
-
if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
/*
* Page table corrupted: show pte and kill process.
@@ -3103,7 +3069,7 @@ static int numa_migrate_prep(struct page
}

static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+ unsigned long addr, pte_t pte, pmd_t *pmd)
{
struct page *page = NULL;
spinlock_t *ptl;
@@ -3112,6 +3078,7 @@ static int do_numa_page(struct mm_struct
int target_nid;
bool migrated = false;
int flags = 0;
+ pte_t *ptep;

/*
* The "pte" at this point cannot be used safely without
@@ -3122,8 +3089,7 @@ static int do_numa_page(struct mm_struct
* the _PAGE_NUMA bit and it is not really expected that there
* would be concurrent hardware modifications to the PTE.
*/
- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (unlikely(!pte_same(*ptep, pte))) {
pte_unmap_unlock(ptep, ptl);
goto out;
@@ -3195,34 +3161,32 @@ static int do_numa_page(struct mm_struct
*/
static int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
+ pte_t entry, pmd_t *pmd, unsigned int flags)
{
- pte_t entry;
spinlock_t *ptl;
+ pte_t *pte;

- entry = ACCESS_ONCE(*pte);
if (!pte_present(entry)) {
if (pte_none(entry)) {
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
- pte, pmd, flags, entry);
+ pmd, flags, entry);
}
return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
+ pmd, flags);
}
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
- pte, pmd, flags, entry);
+ pmd, flags, entry);
return do_swap_page(mm, vma, address,
- pte, pmd, flags, entry);
+ pmd, flags, entry);
}

if (pte_numa(entry))
- return do_numa_page(mm, vma, address, entry, pte, pmd);
+ return do_numa_page(mm, vma, address, entry, pmd);

- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
@@ -3261,7 +3225,7 @@ static int __handle_mm_fault(struct mm_s
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte;
+ pte_t *pte, entry;

if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
@@ -3331,8 +3295,10 @@ static int __handle_mm_fault(struct mm_s
* safe to run pte_offset_map().
*/
pte = pte_offset_map(pmd, address);
+ entry = ACCESS_ONCE(*pte);
+ pte_unmap(pte);

- return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+ return handle_pte_fault(mm, vma, address, entry, pmd, flags);
}

/*


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/