Re: [PATCH v8 02/23] mm: Teach core mm about pte markers
From: Alistair Popple
Date: Mon Apr 11 2022 - 21:53:07 EST
I've been reviewing existing pte_none() call sites and noticed the following:
From khugepaged_scan_pmd():
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
	if (++unmapped <= khugepaged_max_ptes_swap) {
		/*
		 * Always be strict with uffd-wp
		 * enabled swap entries. Please see
		 * comment below for pte_uffd_wp().
		 */
		if (pte_swp_uffd_wp(pteval)) {
			result = SCAN_PTE_UFFD_WP;
			goto out_unmap;
		}
		continue;
	} else {
		result = SCAN_EXCEED_SWAP_PTE;
		count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
		goto out_unmap;
	}
}
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
	if (!userfaultfd_armed(vma) &&
	    ++none_or_zero <= khugepaged_max_ptes_none) {
		continue;
	} else {
		result = SCAN_EXCEED_NONE_PTE;
		count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
		goto out_unmap;
	}
}
I think the above could encounter a marker PTE, right? So the behaviour would be
wrong in that case. As I understand things the is_swap_pte() path will be taken
rather than the pte_none() one, but in the absence of any special handling
shouldn't marker PTEs mostly be treated as pte_none() here?
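For reference, this is roughly how I read the pte_none_mostly() helper added
earlier in this series (paraphrased from memory, so the exact definition may
differ slightly):

/* A pte is "mostly none" if it is a real none pte or a pte marker. */
static inline bool pte_none_mostly(pte_t pte)
{
	return pte_none(pte) || is_pte_marker(pte);
}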
I think you need to s/pte_none/pte_none_mostly/ here and swap the order of the
conditionals around so the none/marker check runs before the is_swap_pte() path.
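Roughly what I have in mind, as an untested sketch (the existing swap entry
handling would stay as it is, just moved below the none/marker check):

pte_t pteval = *_pte;
/*
 * Pte markers are encoded as swap entries, so check for none/marker
 * ptes before the is_swap_pte() path, otherwise a marker would be
 * counted as an unmapped swap pte.
 */
if (pte_none_mostly(pteval) || is_zero_pfn(pte_pfn(pteval))) {
	if (!userfaultfd_armed(vma) &&
	    ++none_or_zero <= khugepaged_max_ptes_none) {
		continue;
	} else {
		result = SCAN_EXCEED_NONE_PTE;
		count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
		goto out_unmap;
	}
}
if (is_swap_pte(pteval)) {
	/* unchanged from the current code */
	...
}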
- Alistair
Peter Xu <peterx@xxxxxxxxxx> writes:
> This patch still does not use pte marker in any way, however it teaches the
> core mm about the pte marker idea.
>
> For example, handle_pte_marker() is introduced that will parse and handle all
> the pte marker faults.
>
> Many of the places are more about commenting it up - so that we know there's
> the possibility of pte marker showing up, and why we don't need special code
> for the cases.
>
> Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
> ---
>  fs/userfaultfd.c | 10 ++++++----
>  mm/filemap.c     |  5 +++++
>  mm/hmm.c         |  2 +-
>  mm/memcontrol.c  |  8 ++++++--
>  mm/memory.c      | 23 +++++++++++++++++++++++
>  mm/mincore.c     |  3 ++-
>  mm/mprotect.c    |  3 +++
>  7 files changed, 46 insertions(+), 8 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index aa0c47cb0d16..8b4a94f5a238 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -249,9 +249,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
>
>  	/*
>  	 * Lockless access: we're in a wait_event so it's ok if it
> -	 * changes under us.
> +	 * changes under us. PTE markers should be handled the same as none
> +	 * ptes here.
>  	 */
> -	if (huge_pte_none(pte))
> +	if (huge_pte_none_mostly(pte))
>  		ret = true;
>  	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
>  		ret = true;
> @@ -330,9 +331,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
>  	pte = pte_offset_map(pmd, address);
>  	/*
>  	 * Lockless access: we're in a wait_event so it's ok if it
> -	 * changes under us.
> +	 * changes under us. PTE markers should be handled the same as none
> +	 * ptes here.
>  	 */
> -	if (pte_none(*pte))
> +	if (pte_none_mostly(*pte))
>  		ret = true;
>  	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
>  		ret = true;
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 3a5ffb5587cd..ef77dae8c28d 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3382,6 +3382,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
>  		vmf->pte += xas.xa_index - last_pgoff;
>  		last_pgoff = xas.xa_index;
> 
> +		/*
> +		 * NOTE: If there're PTE markers, we'll leave them to be
> +		 * handled in the specific fault path, and it'll prohibit the
> +		 * fault-around logic.
> +		 */
>  		if (!pte_none(*vmf->pte))
>  			goto unlock;
>
> diff --git a/mm/hmm.c b/mm/hmm.c
> index af71aac3140e..3fd3242c5e50 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -239,7 +239,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>  	pte_t pte = *ptep;
>  	uint64_t pfn_req_flags = *hmm_pfn;
> 
> -	if (pte_none(pte)) {
> +	if (pte_none_mostly(pte)) {
>  		required_fault =
>  			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
>  		if (required_fault)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7a08737bac4b..08af97c73f0f 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5644,10 +5644,14 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>
>  	if (pte_present(ptent))
>  		page = mc_handle_present_pte(vma, addr, ptent);
> +	else if (pte_none_mostly(ptent))
> +		/*
> +		 * PTE markers should be treated as a none pte here, separated
> +		 * from other swap handling below.
> +		 */
> +		page = mc_handle_file_pte(vma, addr, ptent);
>  	else if (is_swap_pte(ptent))
>  		page = mc_handle_swap_pte(vma, ptent, &ent);
> -	else if (pte_none(ptent))
> -		page = mc_handle_file_pte(vma, addr, ptent);
> 
>  	if (!page && !ent.val)
>  		return ret;
> diff --git a/mm/memory.c b/mm/memory.c
> index 2c5d1bb4694f..3f396241a7db 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -100,6 +100,8 @@ struct page *mem_map;
> EXPORT_SYMBOL(mem_map);
> #endif
>
> +static vm_fault_t do_fault(struct vm_fault *vmf);
> +
> /*
> * A number of key systems in x86 including ioremap() rely on the assumption
> * that high_memory defines the upper bound on direct map memory, then end
> @@ -1415,6 +1417,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  			if (!should_zap_page(details, page))
>  				continue;
>  			rss[mm_counter(page)]--;
> +		} else if (is_pte_marker_entry(entry)) {
> +			/* By default, simply drop all pte markers when zap */
>  		} else if (is_hwpoison_entry(entry)) {
>  			if (!should_zap_cows(details))
>  				continue;
> @@ -3555,6 +3559,23 @@ static inline bool should_try_to_free_swap(struct page *page,
>  		page_count(page) == 2;
>  }
> 
> +static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> +{
> +	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
> +	unsigned long marker = pte_marker_get(entry);
> +
> +	/*
> +	 * PTE markers should always be with file-backed memories, and the
> +	 * marker should never be empty. If anything weird happened, the best
> +	 * thing to do is to kill the process along with its mm.
> +	 */
> +	if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
> +		return VM_FAULT_SIGBUS;
> +
> +	/* TODO: handle pte markers */
> +	return 0;
> +}
> +
> /*
> * We enter with non-exclusive mmap_lock (to exclude vma changes,
> * but allow concurrent faults), and pte mapped but not yet locked.
> @@ -3592,6 +3613,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>  		} else if (is_hwpoison_entry(entry)) {
>  			ret = VM_FAULT_HWPOISON;
> +		} else if (is_pte_marker_entry(entry)) {
> +			ret = handle_pte_marker(vmf);
>  		} else {
>  			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
>  			ret = VM_FAULT_SIGBUS;
> diff --git a/mm/mincore.c b/mm/mincore.c
> index f4f627325e12..fa200c14185f 100644
> --- a/mm/mincore.c
> +++ b/mm/mincore.c
> @@ -122,7 +122,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  	for (; addr != end; ptep++, addr += PAGE_SIZE) {
>  		pte_t pte = *ptep;
> 
> -		if (pte_none(pte))
> +		/* We need to do cache lookup too for pte markers */
> +		if (pte_none_mostly(pte))
>  			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
>  						 vma, vec);
>  		else if (pte_present(pte))
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 56060acdabd3..709a6f73b764 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -188,6 +188,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  					newpte = pte_swp_mksoft_dirty(newpte);
>  				if (pte_swp_uffd_wp(oldpte))
>  					newpte = pte_swp_mkuffd_wp(newpte);
> +			} else if (is_pte_marker_entry(entry)) {
> +				/* Skip it, the same as none pte */
> +				continue;
>  			} else {
>  				newpte = oldpte;
>  			}