[PATCH 1/1] mm: userfaultfd: avoid leaving stale TLB after userfaultfd_writeprotect()

From: Andrea Arcangeli
Date: Tue Dec 22 2020 - 14:30:16 EST


change_protection() is called by userfaultfd_writeprotect() with the
mmap_lock held for read, like in change_prot_numa().

The page fault code in wp_page_copy() rightfully assumes that if the
CPU issued a write fault and the write bit in the pagetable is not
set, no CPU can write to the page. That is a wrong assumption after
userfaultfd_writeprotect(). That is also a wrong assumption after
change_prot_numa(), where the regular page fault code would likewise
assume that if the present bit is not set while the page fault is
running, there should be no stale TLB entry — but there still is.

So to stay safe, the page fault code must be prevented from running
as long as the TLB flush remains pending. That is already achieved by
the do_numa_page() path for change_prot_numa() and by the
userfaultfd_pte_wp() path for userfaultfd_writeprotect().

The problem that needs fixing is that an un-wrprotect
(i.e. userfaultfd_writeprotect() with UFFDIO_WRITEPROTECT_MODE_WP not
set) could run in between the original wrprotect
(i.e. userfaultfd_writeprotect() with UFFDIO_WRITEPROTECT_MODE_WP set)
and wp_page_copy(), while the TLB flush remains pending.

In such a case the userfaultfd_pte_wp() check will stop being
effective at preventing the regular page fault code from running.

CPU0 CPU 1 CPU 2
------ -------- -------
userfaultfd_wrprotect(mode_wp = true)
PT lock
atomic set _PAGE_UFFD_WP and clear _PAGE_WRITE
PT unlock

do_page_fault FAULT_FLAG_WRITE
userfaultfd_wrprotect(mode_wp = false)
PT lock
ATOMIC clear _PAGE_UFFD_WP <- problem
/* _PAGE_WRITE not set */
PT unlock
XXXXXXXXXXXXXX BUG RACE window open here

PT lock
FAULT_FLAG_WRITE is set by CPU
_PAGE_WRITE is still clear in pte
PT unlock

wp_page_copy
copy_user_page runs with stale TLB

deferred tlb flush <- too late
XXXXXXXXXXXXXX BUG RACE window close here
================================================================================

If the userfaultfd_wrprotect(mode_wp = false) can only run after the
deferred TLB flush the above cannot happen either, because the uffd_wp
bit will remain set until after the TLB flush and the page fault would
reliably hit the handle_userfault() dead end as long as the TLB is
stale.

So to fix this we need to prevent any un-wrprotect to start until the
last outstanding wrprotect completed and to prevent any further
wrprotect until the last outstanding un-wrprotect completed. Then
wp_page_copy can still run in parallel but only with the un-wrprotect,
and that's fine since it's a permission promotion.

We would need a new two_group_semaphore, where each group can run as
many parallel instances as it wants, but no instance of one group can
run in parallel with any instance of the other group. The below rwsem
with two atomics approximates that kind of lock.

Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
---
fs/userfaultfd.c | 39 +++++++++++++++++++++++++++++++++++++++
mm/memory.c | 20 ++++++++++++++++++++
2 files changed, 59 insertions(+)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 894cc28142e7..3729ca99dae5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -76,6 +76,20 @@ struct userfaultfd_ctx {
bool mmap_changing;
/* mm with one ore more vmas attached to this userfaultfd_ctx */
struct mm_struct *mm;
+ /*
+ * We cannot let userfaultfd_writeprotect(mode_wp = false)
+ * clear _PAGE_UFFD_WP from the pgtable, until the last
+ * outstanding userfaultfd_writeprotect(mode_wp = true)
+ * completes, because the TLB flush is deferred. The page
+ * fault must be forced to enter handle_userfault() while the
+ * TLB flush is deferred, and that's achieved with the
+ * _PAGE_UFFD_WP bit. The _PAGE_UFFD_WP can only be cleared
+ * after there are no stale TLB entries left, only then it'll
+ * be safe again for the page fault to continue and not hit
+ * the handle_userfault() dead end anymore.
+ */
+ struct rw_semaphore wrprotect_rwsem;
+ atomic64_t wrprotect_count[2];
};

struct userfaultfd_fork_ctx {
@@ -1783,6 +1797,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
struct uffdio_writeprotect __user *user_uffdio_wp;
struct userfaultfd_wake_range range;
bool mode_wp, mode_dontwake;
+ bool contention;

if (READ_ONCE(ctx->mmap_changing))
return -EAGAIN;
@@ -1808,9 +1823,30 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
if (mode_wp && mode_dontwake)
return -EINVAL;

+ VM_WARN_ON(atomic64_read(&ctx->wrprotect_count[0]) < 0);
+ VM_WARN_ON(atomic64_read(&ctx->wrprotect_count[1]) < 0);
+ atomic64_inc(&ctx->wrprotect_count[mode_wp ? 0 : 1]);
+ smp_mb__after_atomic();
+ contention = atomic64_read(&ctx->wrprotect_count[mode_wp ? 1 : 0]) > 0;
+ if (!contention)
+ down_read(&ctx->wrprotect_rwsem);
+ else {
+ down_write(&ctx->wrprotect_rwsem);
+ if (!atomic64_read(&ctx->wrprotect_count[mode_wp ? 1 : 0])) {
+ contention = false;
+ downgrade_write(&ctx->wrprotect_rwsem);
+ }
+
+ }
ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
uffdio_wp.range.len, mode_wp,
&ctx->mmap_changing);
+ if (!contention)
+ up_read(&ctx->wrprotect_rwsem);
+ else
+ up_write(&ctx->wrprotect_rwsem);
+ smp_mb__before_atomic();
+ atomic64_dec(&ctx->wrprotect_count[mode_wp ? 0 : 1]);
if (ret)
return ret;

@@ -1958,6 +1994,9 @@ static void init_once_userfaultfd_ctx(void *mem)
init_waitqueue_head(&ctx->fault_wqh);
init_waitqueue_head(&ctx->event_wqh);
init_waitqueue_head(&ctx->fd_wqh);
+ init_rwsem(&ctx->wrprotect_rwsem);
+ atomic64_set(&ctx->wrprotect_count[0], 0);
+ atomic64_set(&ctx->wrprotect_count[1], 0);
seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}

diff --git a/mm/memory.c b/mm/memory.c
index 7d608765932b..4ff9f21d5a7b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3085,6 +3085,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;

+ /*
+ * STALE_TLB_WARNING: while the uffd_wp bit is set, the TLB
+ * can be stale. We cannot allow do_wp_page to proceed or
+ * it'll wrongly assume that nobody can still be writing to
+ * the page if !pte_write.
+ */
if (userfaultfd_pte_wp(vma, *vmf->pte)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
@@ -4274,6 +4280,12 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
if (vma_is_anonymous(vmf->vma)) {
+ /*
+ * STALE_TLB_WARNING: while the uffd_wp bit is set,
+ * the TLB can be stale. We cannot allow wp_huge_pmd()
+ * to proceed or it'll wrongly assume that nobody can
+ * still be writing to the page if !pmd_write.
+ */
if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf, orig_pmd);
@@ -4388,6 +4400,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);

+ /*
+ * STALE_TLB_WARNING: if the pte is NUMA protnone the TLB can
+ * be stale.
+ */
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);

@@ -4503,6 +4519,10 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return 0;
}
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
+ /*
+ * STALE_TLB_WARNING: if the pmd is NUMA
+ * protnone the TLB can be stale.
+ */
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf, orig_pmd);