Re: [PATCH v3 1/2] mm/uffd: UFFD_FEATURE_WP_UNPOPULATED

From: David Hildenbrand
Date: Tue Mar 07 2023 - 11:13:35 EST


On 06.03.23 22:39, Peter Xu wrote:

Note that I wodnered for a second if we'd call it "UFFD_FEATURE_WP_MISSING" instead (similar to the definition of MISSING uffd that triggers when we have nothing mapped).

Just a thought.

[...]

With WP_UNPOPUATED, application like QEMU can avoid pre-read faults all the
memory before wr-protect during taking a live snapshot. Quotting from
Muhammad's test result here [3] based on a simple program [4]:

(1) With huge page disabled
echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
./uffd_wp_perf
Test DEFAULT: 4
Test PRE-READ: 1111453 (pre-fault 1101011)
Test MADVISE: 278276 (pre-fault 266378)
Test WP-UNPOPULATE: 11712

(2) With Huge page enabled
echo always > /sys/kernel/mm/transparent_hugepage/enabled
./uffd_wp_perf
Test DEFAULT: 4
Test PRE-READ: 22521 (pre-fault 22348)
Test MADVISE: 4909 (pre-fault 4743)
Test WP-UNPOPULATE: 14448

There'll be a great perf boost for no-thp case, while for thp enabled with
extreme case of all-thp-zero WP_UNPOPULATED can be slower than MADVISE, but
that's low possibility in reality, also the overhead was not reduced but
postponed until a follow up write on any huge zero thp, so potentitially it

s/potentitially/potentially/

is faster by making the follow up writes slower.

What I realized, interrestingly not only the writes, but also the reads. In case of background snapshots we'll be reading all VM memory I think ... but we could optimize in QEMU by consulting the pagemap if there is anything mapped at all, and not read zeros in that case [an optimization brought up several times already].

I am not sure yet if we want to change the QEMU implementation. But anyhow, that's a different discussion.


[1] https://lore.kernel.org/all/20210401092226.102804-4-andrey.gruzdev@xxxxxxxxxxxxx/
[2] https://lore.kernel.org/all/Y+v2HJ8+3i%2FKzDBu@x1n/
[3] https://lore.kernel.org/all/d0eb0a13-16dc-1ac1-653a-78b7273781e3@xxxxxxxxxxxxx/
[4] https://github.com/xzpeter/clibs/blob/master/uffd-test/uffd-wp-perf.c

Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
fs/userfaultfd.c | 14 ++++++++
include/linux/mm_inline.h | 6 ++++
include/linux/userfaultfd_k.h | 6 ++++
include/uapi/linux/userfaultfd.h | 10 +++++-
mm/memory.c | 56 ++++++++++++++++++++++--------
mm/mprotect.c | 59 ++++++++++++++++++++++++++------
6 files changed, 126 insertions(+), 25 deletions(-)

[...]

+static vm_fault_t handle_pte_missing(struct vm_fault *vmf)
+{
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
+ else
+ return do_fault(vmf);
+}
+
/*
* This is actually a page-missing access, but with uffd-wp special pte
* installed. It means this pte was wr-protected before being unmapped.
@@ -3634,11 +3664,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
* Just in case there're leftover special ptes even after the region
* got unregistered - we can simply clear them.
*/
- if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
+ if (unlikely(!userfaultfd_wp(vmf->vma)))
return pte_marker_clear(vmf);
- /* do_fault() can handle pte markers too like none pte */
- return do_fault(vmf);
+ return handle_pte_missing(vmf);
}
static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
@@ -4008,6 +4037,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
+ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
struct vm_area_struct *vma = vmf->vma;
struct folio *folio;
vm_fault_t ret = 0;
@@ -4041,7 +4071,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (!pte_none(*vmf->pte)) {
+ if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
}
@@ -4081,7 +4111,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (!pte_none(*vmf->pte)) {
+ if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
}
@@ -4101,6 +4131,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
folio_add_new_anon_rmap(folio, vma, vmf->address);
folio_add_lru_vma(folio, vma);
setpte:
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
@@ -4268,7 +4300,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
- bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
+ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool prefault = vmf->address != addr;
pte_t entry;
@@ -4915,12 +4947,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
}
}
- if (!vmf->pte) {
- if (vma_is_anonymous(vmf->vma))
- return do_anonymous_page(vmf);
- else
- return do_fault(vmf);
- }
+ if (!vmf->pte)
+ return handle_pte_missing(vmf);

It would better blend in if it would be called "do_pte_missing()".

if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 231929f119d9..6a2df93158ee 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -276,7 +276,16 @@ static long change_pte_range(struct mmu_gather *tlb,
} else {
/* It must be an none page, or what else?.. */
WARN_ON_ONCE(!pte_none(oldpte));
- if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
+
+ /*
+ * Nobody plays with any none ptes besides
+ * userfaultfd when applying the protections.
+ */
+ if (likely(!uffd_wp))
+ continue;
+
+ if (!vma_is_anonymous(vma) ||
+ userfaultfd_wp_unpopulated(vma)) {

I think it would make sense to replace all 3 instances of this check by a new function (userfaultfd_wp_use_markers() ? ) and move some doc from pgtable_populate_needed() in there.

/*
* For file-backed mem, we need to be able to
* wr-protect a none pte, because even if the
@@ -320,23 +329,53 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
return 0;
}
-/* Return true if we're uffd wr-protecting file-backed memory, or false */
+/*
+ * Return true if we want to split huge thps in change protection
+ * procedure, false otherwise.
+ */
static inline bool
-uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
+pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
{
+ /*
+ * pte markers only resides in pte level, if we need pte markers,
+ * we need to split. We cannot wr-protect shmem thp because file
+ * thp is handled differently when split by erasing the pmd so far.
+ */
return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
}
/*
- * If wr-protecting the range for file-backed, populate pgtable for the case
- * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc()
- * failed we treat it the same way as pgtable allocation failures during
- * page faults by kicking OOM and returning error.
+ * Return true if we want to populate pgtables in change protection
+ * procedure, false otherwise
+ */
+static inline bool
+pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags)
+{
+ /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */
+ if (!(cp_flags & MM_CP_UFFD_WP))
+ return false;
+
+ /* Either if this is file-based, we need it for pte markers */
+ if (!vma_is_anonymous(vma))
+ return true;
+
+ /*
+ * Or anonymous, we only need this if WP_ZEROPAGE enabled (to
+ * install zero pages).

s/WP_ZEROPAGE/WP_UNPOPULATED/

+ */
+ return userfaultfd_wp_unpopulated(vma);
+}
+



--
Thanks,

David / dhildenb