Re: [PATCH v2 4/6] mm: lock vma explicitly before doing vm_flags_reset and vm_flags_reset_once
From: Liam R. Howlett
Date: Wed Aug 02 2023 - 13:14:21 EST
* Suren Baghdasaryan <surenb@xxxxxxxxxx> [230801 18:07]:
> Implicit vma locking inside vm_flags_reset() and vm_flags_reset_once() is
> not obvious and makes it hard to understand where vma locking is happening.
> Also in some cases (like in dup_userfaultfd()) vma should be locked earlier
> than vma_flags modification. To make locking more visible, change these
> functions to assert that the vma write lock is taken and explicitly lock
> the vma beforehand. Fix userfaultfd functions which should lock the vma
> earlier.
>
> Suggested-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxx>
> Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
Reviewed-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
> ---
> arch/powerpc/kvm/book3s_hv_uvmem.c | 1 +
> drivers/infiniband/hw/hfi1/file_ops.c | 1 +
> fs/userfaultfd.c | 6 ++++++
> include/linux/mm.h | 10 +++++++---
> mm/madvise.c | 5 ++---
> mm/mlock.c | 3 ++-
> mm/mprotect.c | 1 +
> 7 files changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index 709ebd578394..e2d6f9327f77 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -410,6 +410,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm,
> ret = H_STATE;
> break;
> }
> + vma_start_write(vma);
> /* Copy vm_flags to avoid partial modifications in ksm_madvise */
> vm_flags = vma->vm_flags;
> ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
> diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
> index a5ab22cedd41..5920bfc1e1c5 100644
> --- a/drivers/infiniband/hw/hfi1/file_ops.c
> +++ b/drivers/infiniband/hw/hfi1/file_ops.c
> @@ -344,6 +344,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
> goto done;
> }
>
> + vma_start_write(vma);
> /*
> * vm_pgoff is used as a buffer selector cookie. Always mmap from
> * the beginning.
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 7cecd49e078b..6cde95533dcd 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -667,6 +667,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
> mmap_write_lock(mm);
> for_each_vma(vmi, vma) {
> if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
> + vma_start_write(vma);
> vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> userfaultfd_set_vm_flags(vma,
> vma->vm_flags & ~__VM_UFFD_FLAGS);
> @@ -702,6 +703,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
>
> octx = vma->vm_userfaultfd_ctx.ctx;
> if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
> + vma_start_write(vma);
> vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
> return 0;
> @@ -783,6 +785,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
> atomic_inc(&ctx->mmap_changing);
> } else {
> /* Drop uffd context if remap feature not enabled */
> + vma_start_write(vma);
> vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
> }
> @@ -940,6 +943,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
> prev = vma;
> }
>
> + vma_start_write(vma);
> userfaultfd_set_vm_flags(vma, new_flags);
> vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> }
> @@ -1502,6 +1506,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> * the next vma was merged into the current one and
> * the current one has not been updated yet.
> */
> + vma_start_write(vma);
> userfaultfd_set_vm_flags(vma, new_flags);
> vma->vm_userfaultfd_ctx.ctx = ctx;
>
> @@ -1685,6 +1690,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> * the next vma was merged into the current one and
> * the current one has not been updated yet.
> */
> + vma_start_write(vma);
> userfaultfd_set_vm_flags(vma, new_flags);
> vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 262b5f44101d..2c720c9bb1ae 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -780,18 +780,22 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
> ACCESS_PRIVATE(vma, __vm_flags) = flags;
> }
>
> -/* Use when VMA is part of the VMA tree and modifications need coordination */
> +/*
> + * Use when VMA is part of the VMA tree and modifications need coordination
> + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
> + * it should be locked explicitly beforehand.
> + */
> static inline void vm_flags_reset(struct vm_area_struct *vma,
> vm_flags_t flags)
> {
> - vma_start_write(vma);
> + vma_assert_write_locked(vma);
> vm_flags_init(vma, flags);
> }
>
> static inline void vm_flags_reset_once(struct vm_area_struct *vma,
> vm_flags_t flags)
> {
> - vma_start_write(vma);
> + vma_assert_write_locked(vma);
> WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
> }
>
> diff --git a/mm/madvise.c b/mm/madvise.c
> index bfe0e06427bd..507b1d299fec 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -173,9 +173,8 @@ static int madvise_update_vma(struct vm_area_struct *vma,
> }
>
> success:
> - /*
> - * vm_flags is protected by the mmap_lock held in write mode.
> - */
> + /* vm_flags is protected by the mmap_lock held in write mode. */
> + vma_start_write(vma);
> vm_flags_reset(vma, new_flags);
> if (!vma->vm_file || vma_is_anon_shmem(vma)) {
> error = replace_anon_vma_name(vma, anon_name);
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 479e09d0994c..06bdfab83b58 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -387,6 +387,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
> */
> if (newflags & VM_LOCKED)
> newflags |= VM_IO;
> + vma_start_write(vma);
> vm_flags_reset_once(vma, newflags);
>
> lru_add_drain();
> @@ -461,9 +462,9 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
> * It's okay if try_to_unmap_one unmaps a page just after we
> * set VM_LOCKED, populate_vma_page_range will bring it back.
> */
> -
> if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
> /* No work to do, and mlocking twice would be wrong */
> + vma_start_write(vma);
> vm_flags_reset(vma, newflags);
> } else {
> mlock_vma_pages_range(vma, start, end, newflags);
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 3aef1340533a..362e190a8f81 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -657,6 +657,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
> * vm_flags and vm_page_prot are protected by the mmap_lock
> * held in write mode.
> */
> + vma_start_write(vma);
> vm_flags_reset(vma, newflags);
> if (vma_wants_manual_pte_write_upgrade(vma))
> mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
> --
> 2.41.0.585.gd2178a4bd4-goog
>