Re: [PATCH v26 2/5] fs/proc/task_mmu: Implement IOCTL to get and optionally clear info about PTEs

From: Muhammad Usama Anjum
Date: Fri Aug 04 2023 - 02:09:15 EST


On 8/3/23 8:08 PM, Andrei Vagin wrote:
> On Thu, Jul 27, 2023 at 02:36:34PM +0500, Muhammad Usama Anjum wrote:
>
> <snip>
>
>> +
>> +static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
>> + unsigned long addr, unsigned long end,
>> + unsigned long end_addr)
>
> It's hard to figure out the difference between end and end_addr. I would
> add a comment here.
I'll fix these in the next version.
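
Something along these lines, perhaps (a rough, untested sketch; the
exact wording still needs work):

	/*
	 * pagemap_scan_backout_range() - drop the range [addr, end) from
	 * the output buffer and adjust found_pages accordingly.
	 * @addr:     start of the range being backed out
	 * @end:      end of the range being backed out
	 * @end_addr: address reported to userspace as the walk's end
	 *            (0 == the range will be re-walked, so no early
	 *            stop is recorded)
	 */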

>
>> +{
>> + struct page_region *cur_buf = &p->cur_buf;
>> +
>> + if (cur_buf->start != addr)
>> + cur_buf->end = addr;
>> + else
>> + cur_buf->start = cur_buf->end = 0;
>> +
>> + p->end_addr = end_addr;
>> + p->found_pages -= (end - addr) / PAGE_SIZE;
>> +}
>> +
>> +static int pagemap_scan_output(unsigned long categories,
>> + struct pagemap_scan_private *p,
>> + unsigned long addr, unsigned long *end)
>> +{
>> + unsigned long n_pages, total_pages;
>> + int ret = 0;
>> +
>> + if (!pagemap_scan_is_interesting_page(categories, p)) {
>> + *end = addr;
>> + return 0;
>> + }
>> +
>> + if (!p->vec_buf)
>> + return 0;
>> +
>> + categories &= p->arg.return_mask;
>> +
>> + n_pages = (*end - addr) / PAGE_SIZE;
>> + if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
>> + total_pages > p->arg.max_pages) {
>
> why do we need to use check_add_overflow here?
>
>> + size_t n_too_much = total_pages - p->arg.max_pages;
>
> it is unsafe to use total_pages if check_add_overflow returns non-zero.
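
True. found_pages can never exceed max_pages at this point, so the
clamp can be computed without touching total_pages at all when the
addition wraps. A rough, untested sketch (relying on that invariant):

	n_pages = (*end - addr) / PAGE_SIZE;
	if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
	    total_pages > p->arg.max_pages) {
		/* found_pages <= max_pages, so this cannot underflow */
		unsigned long n_allowed = p->arg.max_pages - p->found_pages;

		*end = addr + n_allowed * PAGE_SIZE;
		n_pages = n_allowed;
		ret = -ENOSPC;
	}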
>
>> + *end -= n_too_much * PAGE_SIZE;
>> + n_pages -= n_too_much;
>> + ret = -ENOSPC;
>> + }
>> +
>> + if (!pagemap_scan_push_range(categories, p, addr, *end)) {
>> + *end = addr;
>> + n_pages = 0;
>> + ret = -ENOSPC;
>> + }
>> +
>> + p->found_pages += n_pages;
>> + if (ret)
>> + p->end_addr = *end;
>> +
>> + return ret;
>> +}
>> +
>> +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
>> + unsigned long end, struct mm_walk *walk)
>> +{
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> + struct pagemap_scan_private *p = walk->private;
>> + struct vm_area_struct *vma = walk->vma;
>> + unsigned long categories;
>> + spinlock_t *ptl;
>> + int ret = 0;
>> +
>> + ptl = pmd_trans_huge_lock(pmd, vma);
>> + if (!ptl)
>> + return -ENOENT;
>> +
>> + categories = p->cur_vma_category | pagemap_thp_category(*pmd);
>> +
>> + ret = pagemap_scan_output(categories, p, start, &end);
>> + if (start == end)
>> + goto out_unlock;
>> +
>> + if (~p->arg.flags & PM_SCAN_WP_MATCHING)
>> + goto out_unlock;
>> + if (~categories & PAGE_IS_WRITTEN)
>> + goto out_unlock;
>> +
>> + /*
>> + * Break the huge page into small pages if the WP operation
>> + * needs to be performed on only a portion of the huge page.
>> + */
>> + if (end != start + HPAGE_SIZE) {
>> + spin_unlock(ptl);
>> + split_huge_pmd(vma, pmd, start);
>> + pagemap_scan_backout_range(p, start, end, 0);
>
> pagemap_scan_backout_range looks "weird"... imho, it makes the code
> harder to understand.
>
>> + return -ENOENT;
>
> I think you need to add a comment that this ENOENT is a special case.
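
Agreed, something like this perhaps:

	/*
	 * -ENOENT tells the caller (pagemap_scan_pmd_entry()) to
	 * retry this range with the pte-level walk now that the
	 * pmd has been split.
	 */
	return -ENOENT;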
>
>> + }
>> +
>> + make_uffd_wp_pmd(vma, start, pmd);
>> + flush_tlb_range(vma, start, end);
>> +out_unlock:
>> + spin_unlock(ptl);
>> + return ret;
>> +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
>> + return -ENOENT;
>> +#endif
>> +}
>> +
>> +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
>> + unsigned long end, struct mm_walk *walk)
>> +{
>> + struct pagemap_scan_private *p = walk->private;
>> + struct vm_area_struct *vma = walk->vma;
>> + pte_t *pte, *start_pte;
>> + unsigned long addr;
>> + bool flush = false;
>> + spinlock_t *ptl;
>> + int ret;
>> +
>> + arch_enter_lazy_mmu_mode();
>> +
>> + ret = pagemap_scan_thp_entry(pmd, start, end, walk);
>> + if (ret != -ENOENT) {
>> + arch_leave_lazy_mmu_mode();
>> + return ret;
>> + }
>> +
>> + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
>> + if (!pte) {
>> + arch_leave_lazy_mmu_mode();
>> + walk->action = ACTION_AGAIN;
>> + return 0;
>> + }
>> +
>> + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
>> + unsigned long categories = p->cur_vma_category |
>> + pagemap_page_category(vma, addr, ptep_get(pte));
>> + unsigned long next = addr + PAGE_SIZE;
>> +
>> + ret = pagemap_scan_output(categories, p, addr, &next);
>> + if (next == addr) {
>> + if (!ret)
>> + continue;
>> + break;
>> + }
>> +
>> + if (~p->arg.flags & PM_SCAN_WP_MATCHING)
>> + continue;
>> + if (~categories & PAGE_IS_WRITTEN)
>> + continue;
>> +
>> + make_uffd_wp_pte(vma, addr, pte);
>> + if (!flush) {
>> + start = addr;
>> + flush = true;
>> + }
>> + }
>> +
>> + if (flush)
>> + flush_tlb_range(vma, start, addr);
>> +
>> + pte_unmap_unlock(start_pte, ptl);
>> + arch_leave_lazy_mmu_mode();
>> +
>> + cond_resched();
>> + return ret;
>> +}
>> +
>> +#ifdef CONFIG_HUGETLB_PAGE
>> +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
>> + unsigned long start, unsigned long end,
>> + struct mm_walk *walk)
>> +{
>> + struct pagemap_scan_private *p = walk->private;
>> + struct vm_area_struct *vma = walk->vma;
>> + unsigned long categories;
>> + spinlock_t *ptl;
>> + int ret = 0;
>> + pte_t pte;
>> +
>> + if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
>> + /* Go the short route when not write-protecting pages. */
>> +
>> + pte = huge_ptep_get(ptep);
>> + categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
>> +
>> + return pagemap_scan_output(categories, p, start, &end);
>> + }
>> +
>> + i_mmap_lock_write(vma->vm_file->f_mapping);
>> + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
>> +
>> + pte = huge_ptep_get(ptep);
>> + categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
>> +
>> + ret = pagemap_scan_output(categories, p, start, &end);
>> + if (start == end)
>> + goto out_unlock;
>> +
>> + if (~categories & PAGE_IS_WRITTEN)
>> + goto out_unlock;
>> +
>> + if (end != start + HPAGE_SIZE) {
>> + /* Partial HugeTLB page WP isn't possible. */
>> + pagemap_scan_backout_range(p, start, end, start);
>> + ret = -EINVAL;
>
> Why is it EINVAL in this case?
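
Write-protection can only be applied to the whole HugeTLB page: unlike
a THP, it cannot be split, so a request covering only part of it cannot
be honoured. I'll make that explicit in the comment, e.g.:

	/*
	 * Partial WP of a HugeTLB page isn't possible: the page
	 * cannot be split, so reject the request with -EINVAL.
	 */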
>
>> + goto out_unlock;
>> + }
>> +
>> + make_uffd_wp_huge_pte(vma, start, ptep, pte);
>> + flush_hugetlb_tlb_range(vma, start, end);
>> +
>> +out_unlock:
>> + spin_unlock(ptl);
>> + i_mmap_unlock_write(vma->vm_file->f_mapping);
>> +
>> + return ret;
>> +}
>> +#else
>> +#define pagemap_scan_hugetlb_entry NULL
>> +#endif
>> +
>> +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
>> + int depth, struct mm_walk *walk)
>> +{
>> + struct pagemap_scan_private *p = walk->private;
>> + struct vm_area_struct *vma = walk->vma;
>> + int ret, err;
>> +
>> + if (!vma)
>> + return 0;
>> +
>> + ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
>> + if (addr == end)
>> + return ret;
>> +
>> + if (~p->arg.flags & PM_SCAN_WP_MATCHING)
>> + return ret;
>> +
>> + err = uffd_wp_range(vma, addr, end - addr, true);
>> + if (err < 0)
>> + ret = err;
>> +
>> + return ret;
>> +}
>> +
>> +static const struct mm_walk_ops pagemap_scan_ops = {
>> + .test_walk = pagemap_scan_test_walk,
>> + .pmd_entry = pagemap_scan_pmd_entry,
>> + .pte_hole = pagemap_scan_pte_hole,
>> + .hugetlb_entry = pagemap_scan_hugetlb_entry,
>> +};
>> +
>> +static int pagemap_scan_get_args(struct pm_scan_arg *arg,
>> + unsigned long uarg)
>> +{
>> + if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
>> + return -EFAULT;
>> +
>> + if (arg->size != sizeof(struct pm_scan_arg))
>> + return -EINVAL;
>> +
>> + /* Validate requested features */
>> + if (arg->flags & ~PM_SCAN_FLAGS)
>> + return -EINVAL;
>> + if ((arg->category_inverted | arg->category_mask |
>> + arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
>> + return -EINVAL;
>> +
>> + arg->start = untagged_addr((unsigned long)arg->start);
>> + arg->end = untagged_addr((unsigned long)arg->end);
>> + arg->vec = untagged_addr((unsigned long)arg->vec);
>> +
>> + /* Validate memory pointers */
>> + if (!IS_ALIGNED(arg->start, PAGE_SIZE))
>> + return -EINVAL;
>> + if (!access_ok((void __user *)arg->start, arg->end - arg->start))
>> + return -EFAULT;
>> + if (!arg->vec && arg->vec_len)
>> + return -EFAULT;
>> + if (arg->vec && !access_ok((void __user *)arg->vec,
>> + arg->vec_len * sizeof(struct page_region)))
>> + return -EFAULT;
>> +
>> + /* Fixup default values */
>> + arg->end = ALIGN(arg->end, PAGE_SIZE);
>> + if (!arg->max_pages)
>> + arg->max_pages = ULONG_MAX;
>> +
>> + return 0;
>> +}
>> +
>> +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
>> + unsigned long uargl)
>> +{
>> + struct pm_scan_arg __user *uarg = (void __user *)uargl;
>> +
>> + if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
>> + return -EFAULT;
>> +
>> + return 0;
>> +}
>> +
>> +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
>> +{
>> + if (!p->arg.vec_len) {
>> + /*
>> + * An arbitrary non-page-aligned sentinel value for
>> + * pagemap_scan_push_range().
>> + */
>> + p->cur_buf.start = p->cur_buf.end = ULLONG_MAX;
>> + if (p->arg.vec)
>> + p->vec_buf = ZERO_SIZE_PTR;
>> + return 0;
>> + }
>> +
>> + /*
>> + * Allocate a smaller buffer to get output from inside the page
>> + * walk functions and walk the range in PAGEMAP_WALK_SIZE chunks.
>> + * The last range is always stored in p.cur_buf to allow coalescing
>> + * consecutive ranges that have the same categories returned across
>> + * walk_page_range() calls.
>> + */
>> + p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
>> + p->arg.vec_len - 1);
>> + p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
>> + GFP_KERNEL);
>> + if (!p->vec_buf)
>> + return -ENOMEM;
>> +
>> + p->vec_out = (struct page_region __user *)p->arg.vec;
>> +
>> + return 0;
>> +}
>> +
>> +static int pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
>> +{
>> + const struct page_region *buf = p->vec_buf;
>> + int n = (int)p->vec_buf_index;
>> +
>> + if (!n)
>> + return 0;
>> +
>> + if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
>> + return -EFAULT;
>> +
>> + p->arg.vec_len -= n;
>> + p->vec_out += n;
>> +
>> + p->vec_buf_index = 0;
>> + p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len - 1);
>> +
>> + return n;
>> +}
>> +
>> +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
>> +{
>> + unsigned long walk_start, walk_end;
>> + struct mmu_notifier_range range;
>> + struct pagemap_scan_private p;
>> + size_t n_ranges_out = 0;
>> + int ret;
>> +
>> + memset(&p, 0, sizeof(p));
>> + ret = pagemap_scan_get_args(&p.arg, uarg);
>> + if (ret)
>> + return ret;
>> +
>> + ret = pagemap_scan_init_bounce_buffer(&p);
>> + if (ret)
>> + return ret;
>> +
>> + /* Protection change for the range is going to happen. */
>> + if (p.arg.flags & PM_SCAN_WP_MATCHING) {
>> + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
>> + mm, p.arg.start, p.arg.end);
>> + mmu_notifier_invalidate_range_start(&range);
>> + }
>> +
>> + walk_start = walk_end = p.arg.start;
>> + for (; walk_end != p.arg.end; walk_start = walk_end) {
>> + int n_out;
>> +
>> + walk_end = min_t(unsigned long,
>> + (walk_start + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK,
>> + p.arg.end);
>> +
>
> 	if (fatal_signal_pending(current)) {
> 		ret = -EINTR;
> 		break;
> 	}
>
>> + ret = mmap_read_lock_killable(mm);
>> + if (ret)
>> + break;
>> + ret = walk_page_range(mm, walk_start, walk_end,
>> + &pagemap_scan_ops, &p);
>> + mmap_read_unlock(mm);
>> +
>> + n_out = pagemap_scan_flush_buffer(&p);
>> + if (n_out < 0)
>> + ret = n_out;
>> + else
>> + n_ranges_out += n_out;
>> +
>> + if (ret)
>> + break;
>> + }
>> +
>> + if (p.cur_buf.start != p.cur_buf.end) {
>> + if (copy_to_user(p.vec_out, &p.cur_buf, sizeof(p.cur_buf)))
>> + ret = -EFAULT;
>> + else
>> + ++n_ranges_out;
>> + }
>> +
>> + /* ENOSPC signifies early stop (buffer full) from the walk. */
>> + if (!ret || ret == -ENOSPC)
>> + ret = n_ranges_out;
>> +
>> + p.arg.walk_end = p.end_addr ? p.end_addr : walk_start;
>> + if (pagemap_scan_writeback_args(&p.arg, uarg))
>> + ret = -EFAULT;
>> +
>> + if (p.arg.flags & PM_SCAN_WP_MATCHING)
>> + mmu_notifier_invalidate_range_end(&range);
>> +
>> + kfree(p.vec_buf);
>> + return ret;
>> +}
>> +
>> +static long do_pagemap_cmd(struct file *file, unsigned int cmd,
>> + unsigned long arg)
>> +{
>> + struct mm_struct *mm = file->private_data;
>> +
>> + switch (cmd) {
>> + case PAGEMAP_SCAN:
>> + return do_pagemap_scan(mm, arg);
>> +
>> + default:
>> + return -EINVAL;
>> + }
>> +}
>> +
>> const struct file_operations proc_pagemap_operations = {
>> .llseek = mem_lseek, /* borrow this */
>> .read = pagemap_read,
>> .open = pagemap_open,
>> .release = pagemap_release,
>> + .unlocked_ioctl = do_pagemap_cmd,
>> + .compat_ioctl = do_pagemap_cmd,
>> };
>> #endif /* CONFIG_PROC_PAGE_MONITOR */
>>
>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>> index 0a393bc02f25b..8f8ff07453f22 100644
>> --- a/include/linux/hugetlb.h
>> +++ b/include/linux/hugetlb.h
>> @@ -259,6 +259,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
>> unsigned long cp_flags);
>>
>> bool is_hugetlb_entry_migration(pte_t pte);
>> +bool is_hugetlb_entry_hwpoisoned(pte_t pte);
>> void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
>>
>> #else /* !CONFIG_HUGETLB_PAGE */
>> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
>> index b7b56871029c5..1bb3c625c2381 100644
>> --- a/include/uapi/linux/fs.h
>> +++ b/include/uapi/linux/fs.h
>> @@ -305,4 +305,62 @@ typedef int __bitwise __kernel_rwf_t;
>> #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
>> RWF_APPEND)
>>
>> +/* Pagemap ioctl */
>> +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg)
>> +
>> +/* Bits set in the categories field of page_region and in the masks of pm_scan_arg */
>> +#define PAGE_IS_WPALLOWED (1 << 0)
>> +#define PAGE_IS_WRITTEN (1 << 1)
>> +#define PAGE_IS_FILE (1 << 2)
>> +#define PAGE_IS_PRESENT (1 << 3)
>> +#define PAGE_IS_SWAPPED (1 << 4)
>> +#define PAGE_IS_PFNZERO (1 << 5)
>> +
>> +/*
>> + * struct page_region - Page region with flags
>> + * @start: Start of the region
>> + * @end: End of the region (exclusive)
>> + * @categories: PAGE_IS_* category bitmask for the region
>> + */
>> +struct page_region {
>> + __u64 start;
>> + __u64 end;
>> + __u64 categories;
>> +};
>> +
>> +/* Flags for PAGEMAP_SCAN ioctl */
>> +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */
>> +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */
>> +
>> +/*
>> + * struct pm_scan_arg - Pagemap ioctl argument
>> + * @size: Size of the structure
>> + * @flags: Flags for the IOCTL
>> + * @start: Starting address of the region
>> + * @end: Ending address of the region
>> + * @walk_end: End address of the memory that was actually walked is
>> + * returned here (useful if the entire range couldn't be visited)
>> + * @vec: Address of page_region struct array for output
>> + * @vec_len: Length of the page_region struct array
>> + * @max_pages: Optional limit for number of returned pages (0 = disabled)
>> + * @category_inverted: PAGE_IS_* categories whose values are matched against 0 instead of 1
>> + * @category_mask: Skip pages for which any category doesn't match
>> + * @category_anyof_mask: Skip pages for which no category matches
>> + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned
>> + */
>> +struct pm_scan_arg {
>> + __u64 size;
>> + __u64 flags;
>> + __u64 start;
>> + __u64 end;
>> + __u64 walk_end;
>> + __u64 vec;
>> + __u64 vec_len;
>> + __u64 max_pages;
>> + __u64 category_inverted;
>> + __u64 category_mask;
>> + __u64 category_anyof_mask;
>> + __u64 return_mask;
>> +};
>> +
>> #endif /* _UAPI_LINUX_FS_H */
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index a073e6ed8900b..3b07db0a4f2d9 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -5008,7 +5008,7 @@ bool is_hugetlb_entry_migration(pte_t pte)
>> return false;
>> }
>>
>> -static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
>> +bool is_hugetlb_entry_hwpoisoned(pte_t pte)
>> {
>> swp_entry_t swp;
>>
>> --
>> 2.39.2
>>
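
For anyone who wants to poke at the interface, below is a minimal,
untested userspace sketch of a PAGEMAP_SCAN call based on the uapi
above (it scans the page holding the output buffer itself; the masks
and range are only illustrative, and it needs the updated uapi headers
installed):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* PAGEMAP_SCAN, struct pm_scan_arg */

	int main(void)
	{
		struct page_region regions[32];
		struct pm_scan_arg arg;
		long psz = sysconf(_SC_PAGESIZE);
		int fd, n, i;

		fd = open("/proc/self/pagemap", O_RDONLY);
		if (fd < 0)
			return 1;

		memset(&arg, 0, sizeof(arg));
		arg.size = sizeof(arg);
		/* Scan the (present) stack page containing 'regions'. */
		arg.start = (__u64)(unsigned long)regions & ~(__u64)(psz - 1);
		arg.end = arg.start + psz;
		arg.vec = (__u64)(unsigned long)regions;
		arg.vec_len = 32;
		arg.category_mask = PAGE_IS_PRESENT;
		arg.return_mask = PAGE_IS_PRESENT | PAGE_IS_FILE;

		/* On success the ioctl returns the number of filled regions. */
		n = ioctl(fd, PAGEMAP_SCAN, &arg);
		for (i = 0; i < n; i++)
			printf("%llx-%llx categories %llx\n",
			       (unsigned long long)regions[i].start,
			       (unsigned long long)regions[i].end,
			       (unsigned long long)regions[i].categories);

		close(fd);
		return n < 0;
	}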

--
BR,
Muhammad Usama Anjum