Re: [PATCH v2.5] userfaultfd: support minor fault handling for shmem

From: Axel Rasmussen
Date: Tue Mar 09 2021 - 23:45:43 EST


Adding Stephen to CC, as an FYI.

On Tue, Mar 9, 2021 at 2:58 PM Axel Rasmussen <axelrasmussen@xxxxxxxxxx> wrote:
>
> Modify the userfaultfd register API to allow registering shmem VMAs in
> minor mode. Modify the shmem mcopy implementation to support
> UFFDIO_CONTINUE in order to resolve such faults.
>
> Combine the shmem mcopy handler functions into a single
> shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how
> the hugetlbfs implementation is structured, and lets us remove a good
> chunk of boilerplate.
>
> Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
> ---
> fs/userfaultfd.c | 6 +-
> include/linux/shmem_fs.h | 26 ++++-----
> include/uapi/linux/userfaultfd.h | 4 +-
> mm/memory.c | 8 ++-
> mm/shmem.c | 94 +++++++++++++++-----------------
> mm/userfaultfd.c | 27 ++++-----
> 6 files changed, 81 insertions(+), 84 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 14f92285d04f..9f3b8684cf3c 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
> }
>
> if (vm_flags & VM_UFFD_MINOR) {
> - /* FIXME: Add minor fault interception for shmem. */
> - if (!is_vm_hugetlb_page(vma))
> + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
> return false;
> }
>
> @@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
> /* report all available features and ioctls to userland */
> uffdio_api.features = UFFD_API_FEATURES;
> #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
> - uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
> + uffdio_api.features &=
> + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
> #endif
> uffdio_api.ioctls = UFFD_API_IOCTLS;
> ret = -EFAULT;
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index d82b6f396588..f0919c3722e7 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -9,6 +9,7 @@
> #include <linux/percpu_counter.h>
> #include <linux/xattr.h>
> #include <linux/fs_parser.h>
> +#include <linux/userfaultfd_k.h>
>
> /* inode in-kernel data */
>
> @@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file)
> extern bool shmem_charge(struct inode *inode, long pages);
> extern void shmem_uncharge(struct inode *inode, long pages);
>
> +#ifdef CONFIG_USERFAULTFD
> #ifdef CONFIG_SHMEM
> -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> - struct vm_area_struct *dst_vma,
> - unsigned long dst_addr,
> - unsigned long src_addr,
> - struct page **pagep);
> -extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
> - pmd_t *dst_pmd,
> - struct vm_area_struct *dst_vma,
> - unsigned long dst_addr);
> -#else
> -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
> - src_addr, pagep) ({ BUG(); 0; })
> -#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
> - dst_addr) ({ BUG(); 0; })
> -#endif
> +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> + struct vm_area_struct *dst_vma,
> + unsigned long dst_addr, unsigned long src_addr,
> + enum mcopy_atomic_mode mode, struct page **pagep);
> +#else /* !CONFIG_SHMEM */
> +#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
> + src_addr, mode, pagep) ({ BUG(); 0; })
> +#endif /* CONFIG_SHMEM */
> +#endif /* CONFIG_USERFAULTFD */
>
> #endif
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index bafbeb1a2624..47d9790d863d 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -31,7 +31,8 @@
> UFFD_FEATURE_MISSING_SHMEM | \
> UFFD_FEATURE_SIGBUS | \
> UFFD_FEATURE_THREAD_ID | \
> - UFFD_FEATURE_MINOR_HUGETLBFS)
> + UFFD_FEATURE_MINOR_HUGETLBFS | \
> + UFFD_FEATURE_MINOR_SHMEM)
> #define UFFD_API_IOCTLS \
> ((__u64)1 << _UFFDIO_REGISTER | \
> (__u64)1 << _UFFDIO_UNREGISTER | \
> @@ -196,6 +197,7 @@ struct uffdio_api {
> #define UFFD_FEATURE_SIGBUS (1<<7)
> #define UFFD_FEATURE_THREAD_ID (1<<8)
> #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9)
> +#define UFFD_FEATURE_MINOR_SHMEM (1<<10)
> __u64 features;
>
> __u64 ioctls;
> diff --git a/mm/memory.c b/mm/memory.c
> index c8e357627318..a1e5ff55027e 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
> * something).
> */
> if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
> - ret = do_fault_around(vmf);
> - if (ret)
> - return ret;
> + if (likely(!userfaultfd_minor(vmf->vma))) {
> + ret = do_fault_around(vmf);
> + if (ret)
> + return ret;
> + }
> }
>
> ret = __do_fault(vmf);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index b2db4ed0fbc7..ef8c9f5e92fc 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt;
> #include <linux/syscalls.h>
> #include <linux/fcntl.h>
> #include <uapi/linux/memfd.h>
> -#include <linux/userfaultfd_k.h>
> #include <linux/rmap.h>
> #include <linux/uuid.h>
>
> @@ -1785,8 +1784,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
> * vm. If we swap it in we mark it dirty since we also free the swap
> * entry since a page cannot live in both the swap and page cache.
> *
> - * vmf and fault_type are only supplied by shmem_fault:
> - * otherwise they are NULL.
> + * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they
> + * are NULL.
> */
> static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> struct page **pagep, enum sgp_type sgp, gfp_t gfp,
> @@ -1830,6 +1829,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> return error;
> }
>
> + if (page && vma && userfaultfd_minor(vma)) {
> + unlock_page(page);
> + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
> + return 0;
> + }
> +
> if (page)
> hindex = page->index;
> if (page && sgp == SGP_WRITE)
> @@ -2354,14 +2359,13 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
> return inode;
> }
>
> -static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> - pmd_t *dst_pmd,
> - struct vm_area_struct *dst_vma,
> - unsigned long dst_addr,
> - unsigned long src_addr,
> - bool zeropage,
> - struct page **pagep)
> +#ifdef CONFIG_USERFAULTFD
> +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> + struct vm_area_struct *dst_vma,
> + unsigned long dst_addr, unsigned long src_addr,
> + enum mcopy_atomic_mode mode, struct page **pagep)
> {
> + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
> struct inode *inode = file_inode(dst_vma->vm_file);
> struct shmem_inode_info *info = SHMEM_I(inode);
> struct address_space *mapping = inode->i_mapping;
> @@ -2378,12 +2382,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> if (!shmem_inode_acct_block(inode, 1))
> goto out;
>
> - if (!*pagep) {
> + if (is_continue) {
> + ret = -EFAULT;
> + page = find_lock_page(mapping, pgoff);
> + if (!page)
> + goto out_unacct_blocks;
> + } else if (!*pagep) {
> page = shmem_alloc_page(gfp, info, pgoff);
> if (!page)
> goto out_unacct_blocks;
>
> - if (!zeropage) { /* mcopy_atomic */
> + if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */
> page_kaddr = kmap_atomic(page);
> ret = copy_from_user(page_kaddr,
> (const void __user *)src_addr,
> @@ -2397,7 +2406,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> /* don't free the page */
> return -ENOENT;
> }
> - } else { /* mfill_zeropage_atomic */
> + } else { /* zeropage */
> clear_highpage(page);
> }
> } else {
> @@ -2405,10 +2414,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> *pagep = NULL;
> }
>
> - VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
> - __SetPageLocked(page);
> - __SetPageSwapBacked(page);
> - __SetPageUptodate(page);
> + if (!is_continue) {
> + VM_BUG_ON(PageSwapBacked(page));
> + VM_BUG_ON(PageLocked(page));
> + __SetPageLocked(page);
> + __SetPageSwapBacked(page);
> + __SetPageUptodate(page);
> + }
>
> ret = -EFAULT;
> offset = linear_page_index(dst_vma, dst_addr);
> @@ -2416,10 +2428,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> if (unlikely(offset >= max_off))
> goto out_release;
>
> - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
> - gfp & GFP_RECLAIM_MASK, dst_mm);
> - if (ret)
> - goto out_release;
> + /* If page wasn't already in the page cache, add it. */
> + if (!is_continue) {
> + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
> + gfp & GFP_RECLAIM_MASK, dst_mm);
> + if (ret)
> + goto out_release;
> + }
>
> _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> if (dst_vma->vm_flags & VM_WRITE)
> @@ -2446,13 +2461,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> if (!pte_none(*dst_pte))
> goto out_release_unlock;
>
> - lru_cache_add(page);
> + if (!is_continue) {
> + lru_cache_add(page);
>
> - spin_lock_irq(&info->lock);
> - info->alloced++;
> - inode->i_blocks += BLOCKS_PER_PAGE;
> - shmem_recalc_inode(inode);
> - spin_unlock_irq(&info->lock);
> + spin_lock_irq(&info->lock);
> + info->alloced++;
> + inode->i_blocks += BLOCKS_PER_PAGE;
> + shmem_recalc_inode(inode);
> + spin_unlock_irq(&info->lock);
> + }
>
> inc_mm_counter(dst_mm, mm_counter_file(page));
> page_add_file_rmap(page, false);
> @@ -2476,28 +2493,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> shmem_inode_unacct_blocks(inode, 1);
> goto out;
> }
> -
> -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
> - pmd_t *dst_pmd,
> - struct vm_area_struct *dst_vma,
> - unsigned long dst_addr,
> - unsigned long src_addr,
> - struct page **pagep)
> -{
> - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
> - dst_addr, src_addr, false, pagep);
> -}
> -
> -int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
> - pmd_t *dst_pmd,
> - struct vm_area_struct *dst_vma,
> - unsigned long dst_addr)
> -{
> - struct page *page = NULL;
> -
> - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
> - dst_addr, 0, true, &page);
> -}
> +#endif /* CONFIG_USERFAULTFD */
>
> #ifdef CONFIG_TMPFS
> static const struct inode_operations shmem_symlink_inode_operations;
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index ce6cb4760d2c..6cd7ab531aec 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> unsigned long dst_addr,
> unsigned long src_addr,
> struct page **page,
> - bool zeropage,
> + enum mcopy_atomic_mode mode,
> bool wp_copy)
> {
> ssize_t err;
> @@ -431,22 +431,24 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> * and not in the radix tree.
> */
> if (!(dst_vma->vm_flags & VM_SHARED)) {
> - if (!zeropage)
> + switch (mode) {
> + case MCOPY_ATOMIC_NORMAL:
> err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
> dst_addr, src_addr, page,
> wp_copy);
> - else
> + break;
> + case MCOPY_ATOMIC_ZEROPAGE:
> err = mfill_zeropage_pte(dst_mm, dst_pmd,
> dst_vma, dst_addr);
> + break;
> + case MCOPY_ATOMIC_CONTINUE:
> + err = -EINVAL;
> + break;
> + }
> } else {
> VM_WARN_ON_ONCE(wp_copy);
> - if (!zeropage)
> - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
> - dst_vma, dst_addr,
> - src_addr, page);
> - else
> - err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
> - dst_vma, dst_addr);
> + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> + src_addr, mode, page);
> }
>
> return err;
> @@ -467,7 +469,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> long copied;
> struct page *page;
> bool wp_copy;
> - bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
>
> /*
> * Sanitize the command parameters:
> @@ -530,7 +531,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>
> if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> goto out_unlock;
> - if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
> + if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
> goto out_unlock;
>
> /*
> @@ -578,7 +579,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> BUG_ON(pmd_trans_huge(*dst_pmd));
>
> err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> - src_addr, &page, zeropage, wp_copy);
> + src_addr, &page, mcopy_mode, wp_copy);
> cond_resched();
>
> if (unlikely(err == -ENOENT)) {
> --
> 2.30.1.766.gb4fecdf3b7-goog
>