Re: [RFC 6/7] mm: extend process_madvise syscall to support vector array
From: Minchan Kim
Date: Mon May 20 2019 - 22:52:39 EST
On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote:
> [Cc linux-api]
>
> On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> > Currently, the process_madvise syscall works on only one address range,
> > so the user has to call the syscall several times to give hints to
> > multiple address ranges.
>
> Is that a problem? How big of a problem? Any numbers?
We easily have 2000+ VMAs, so it's not a trivial overhead. I will come up
with numbers in the description at the respin.
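For reference, this is the pattern the vectored interface is meant to
replace (illustrative sketch only, not tested; nr_vmas and vmas[] are
made-up names):

    /*
     * With the single-range interface the reclaimer issues one syscall
     * per VMA, easily 2000+ calls for a typical process.
     */
    for (i = 0; i < nr_vmas; i++)
        syscall(__NR_process_madvise, pidfd,
                (unsigned long)vmas[i].start, vmas[i].len, MADV_COLD);

With the extended syscall all of those hints collapse into a single call.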
>
> > This patch extends the process_madvise syscall to support multiple
> > hints, address ranges and return values so the user can give all
> > hints at once.
> >
> > struct pr_madvise_param {
> > int size; /* the size of this structure */
> > const struct iovec __user *vec; /* address range array */
> > }
> >
> > int process_madvise(int pidfd, ssize_t nr_elem,
> > int *behavior,
> > struct pr_madvise_param *results,
> > struct pr_madvise_param *ranges,
> > unsigned long flags);
> >
> > - pidfd
> >
> > target process fd
> >
> > - nr_elem
> >
> > the number of elements in the behavior, results and ranges arrays
> >
> > - behavior
> >
> > hints for each address range in the remote process so that the user
> > can give a different hint for each range.
>
> What is the guarantee of a single call? Do all hints get applied, or does
> the first failure back off? What are the atomicity guarantees?
All hints will be tried even if one of them fails. The user will see the
success or failure of each range via the results parameter.
For a single call, there is no guarantee of atomicity.
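For example (illustrative sketch, not tested; NR_RANGES is a made-up
constant), a caller can detect which ranges failed by checking each result
slot, which receives either the number of processed pages or a negative
errno:

    long ret[NR_RANGES];

    /* result_vec[i].iov_base = &ret[i], iov_len = sizeof(long) */
    if (syscall(__NR_process_madvise, pidfd, NR_RANGES, hints,
                &retp, &rangep, 0) == 0) {
        for (i = 0; i < NR_RANGES; i++)
            if (ret[i] < 0)
                fprintf(stderr, "range %d: error %ld\n", i, ret[i]);
    }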
>
> >
> > - results
> >
> > array of buffers to receive the results of the associated remote
> > address range actions.
> >
> > - ranges
> >
> > array of buffers holding the remote process's address ranges to be
> > processed
> >
> > - flags
> >
> > extra argument for future use. It should be zero for now.
> >
> > Example)
> >
> > struct pr_madvise_param {
> > int size;
> > const struct iovec *vec;
> > };
> >
> > int main(int argc, char *argv[])
> > {
> > struct pr_madvise_param retp, rangep;
> > struct iovec result_vec[2], range_vec[2];
> > int hints[2];
> > long ret[2];
> > void *addr[2];
> >
> > pid_t pid;
> > char cmd[64] = {0,};
> > addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
> > MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> >
> > if (MAP_FAILED == addr[0])
> > return 1;
> >
> > addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
> > MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> >
> > if (MAP_FAILED == addr[1])
> > return 1;
> >
> > hints[0] = MADV_COLD;
> > range_vec[0].iov_base = addr[0];
> > range_vec[0].iov_len = ALLOC_SIZE;
> > result_vec[0].iov_base = &ret[0];
> > result_vec[0].iov_len = sizeof(long);
> > retp.vec = result_vec;
> > retp.size = sizeof(struct pr_madvise_param);
> >
> > hints[1] = MADV_COOL;
> > range_vec[1].iov_base = addr[1];
> > range_vec[1].iov_len = ALLOC_SIZE;
> > result_vec[1].iov_base = &ret[1];
> > result_vec[1].iov_len = sizeof(long);
> > rangep.vec = range_vec;
> > rangep.size = sizeof(struct pr_madvise_param);
> >
> > pid = fork();
> > if (!pid) {
> > sleep(10);
> > } else {
> > snprintf(cmd, sizeof(cmd), "/proc/%d", pid);
> > int pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC);
> > if (pidfd < 0)
> > return 1;
> >
> > /* munmap to make pages private for the child */
> > munmap(addr[0], ALLOC_SIZE);
> > munmap(addr[1], ALLOC_SIZE);
> > system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
> > if (syscall(__NR_process_madvise, pidfd, 2, hints,
> > &retp, &rangep, 0))
> > perror("process_madvise fail\n");
> > system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
> > }
> >
> > return 0;
> > }
> >
> > Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
> > ---
> > include/uapi/asm-generic/mman-common.h | 5 +
> > mm/madvise.c | 184 +++++++++++++++++++++----
> > 2 files changed, 166 insertions(+), 23 deletions(-)
> >
> > diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> > index b9b51eeb8e1a..b8e230de84a6 100644
> > --- a/include/uapi/asm-generic/mman-common.h
> > +++ b/include/uapi/asm-generic/mman-common.h
> > @@ -74,4 +74,9 @@
> > #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
> > PKEY_DISABLE_WRITE)
> >
> > +struct pr_madvise_param {
> > + int size; /* the size of this structure */
> > + const struct iovec __user *vec; /* address range array */
> > +};
> > +
> > #endif /* __ASM_GENERIC_MMAN_COMMON_H */
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index af02aa17e5c1..f4f569dac2bd 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
> > struct page *page;
> > struct vm_area_struct *vma = walk->vma;
> > unsigned long next;
> > + long nr_pages = 0;
> >
> > next = pmd_addr_end(addr, end);
> > if (pmd_trans_huge(*pmd)) {
> > @@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
> >
> > ptep_test_and_clear_young(vma, addr, pte);
> > deactivate_page(page);
> > + nr_pages++;
> > +
> > }
> >
> > pte_unmap_unlock(orig_pte, ptl);
> > + *(long *)walk->private += nr_pages;
> > cond_resched();
> >
> > return 0;
> > @@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
> >
> > static void madvise_cool_page_range(struct mmu_gather *tlb,
> > struct vm_area_struct *vma,
> > - unsigned long addr, unsigned long end)
> > + unsigned long addr, unsigned long end,
> > + long *nr_pages)
> > {
> > struct mm_walk cool_walk = {
> > .pmd_entry = madvise_cool_pte_range,
> > .mm = vma->vm_mm,
> > + .private = nr_pages
> > };
> >
> > tlb_start_vma(tlb, vma);
> > @@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb,
> > }
> >
> > static long madvise_cool(struct vm_area_struct *vma,
> > - unsigned long start_addr, unsigned long end_addr)
> > + unsigned long start_addr, unsigned long end_addr,
> > + long *nr_pages)
> > {
> > struct mm_struct *mm = vma->vm_mm;
> > struct mmu_gather tlb;
> > @@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma,
> >
> > lru_add_drain();
> > tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> > - madvise_cool_page_range(&tlb, vma, start_addr, end_addr);
> > + madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
> > tlb_finish_mmu(&tlb, start_addr, end_addr);
> >
> > return 0;
> > @@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> > int isolated = 0;
> > struct vm_area_struct *vma = walk->vma;
> > unsigned long next;
> > + long nr_pages = 0;
> >
> > next = pmd_addr_end(addr, end);
> > if (pmd_trans_huge(*pmd)) {
> > @@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> > list_add(&page->lru, &page_list);
> > if (isolated >= SWAP_CLUSTER_MAX) {
> > pte_unmap_unlock(orig_pte, ptl);
> > - reclaim_pages(&page_list);
> > + nr_pages += reclaim_pages(&page_list);
> > isolated = 0;
> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > orig_pte = pte;
> > @@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> > }
> >
> > pte_unmap_unlock(orig_pte, ptl);
> > - reclaim_pages(&page_list);
> > + nr_pages += reclaim_pages(&page_list);
> > cond_resched();
> >
> > + *(long *)walk->private += nr_pages;
> > return 0;
> > }
> >
> > static void madvise_cold_page_range(struct mmu_gather *tlb,
> > struct vm_area_struct *vma,
> > - unsigned long addr, unsigned long end)
> > + unsigned long addr, unsigned long end,
> > + long *nr_pages)
> > {
> > struct mm_walk warm_walk = {
> > .pmd_entry = madvise_cold_pte_range,
> > .mm = vma->vm_mm,
> > + .private = nr_pages,
> > };
> >
> > tlb_start_vma(tlb, vma);
> > @@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
> >
> >
> > static long madvise_cold(struct vm_area_struct *vma,
> > - unsigned long start_addr, unsigned long end_addr)
> > + unsigned long start_addr, unsigned long end_addr,
> > + long *nr_pages)
> > {
> > struct mm_struct *mm = vma->vm_mm;
> > struct mmu_gather tlb;
> > @@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma,
> >
> > lru_add_drain();
> > tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> > - madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
> > + madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
> > tlb_finish_mmu(&tlb, start_addr, end_addr);
> >
> > return 0;
> > @@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior,
> > static long
> > madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
> > struct vm_area_struct **prev, unsigned long start,
> > - unsigned long end, int behavior)
> > + unsigned long end, int behavior, long *nr_pages)
> > {
> > switch (behavior) {
> > case MADV_REMOVE:
> > @@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
> > case MADV_WILLNEED:
> > return madvise_willneed(vma, prev, start, end);
> > case MADV_COOL:
> > - return madvise_cool(vma, start, end);
> > + return madvise_cool(vma, start, end, nr_pages);
> > case MADV_COLD:
> > - return madvise_cold(vma, start, end);
> > + return madvise_cold(vma, start, end, nr_pages);
> > case MADV_FREE:
> > case MADV_DONTNEED:
> > return madvise_dontneed_free(tsk, vma, prev, start,
> > @@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior)
> > }
> >
> > static int madvise_core(struct task_struct *tsk, unsigned long start,
> > - size_t len_in, int behavior)
> > + size_t len_in, int behavior, long *nr_pages)
> > {
> > unsigned long end, tmp;
> > struct vm_area_struct *vma, *prev;
> > @@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
> >
> > if (start & ~PAGE_MASK)
> > return error;
> > +
> > len = (len_in + ~PAGE_MASK) & PAGE_MASK;
> >
> > /* Check to see whether len was rounded up from small -ve to zero */
> > @@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
> > blk_start_plug(&plug);
> > for (;;) {
> > /* Still start < end. */
> > + long pages = 0;
> > +
> > error = -ENOMEM;
> > if (!vma)
> > goto out;
> > @@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
> > tmp = end;
> >
> > /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
> > - error = madvise_vma(tsk, vma, &prev, start, tmp, behavior);
> > + error = madvise_vma(tsk, vma, &prev, start, tmp,
> > + behavior, &pages);
> > if (error)
> > goto out;
> > + *nr_pages += pages;
> > start = tmp;
> > if (prev && start < prev->vm_end)
> > start = prev->vm_end;
> > @@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
> > */
> > SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
> > {
> > - return madvise_core(current, start, len_in, behavior);
> > + unsigned long dummy;
> > +
> > + return madvise_core(current, start, len_in, behavior, &dummy);
> > }
> >
> > -SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
> > - size_t, len_in, int, behavior)
> > +static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param,
> > + struct pr_madvise_param *param)
> > +{
> > + u32 size;
> > + int ret;
> > +
> > + memset(param, 0, sizeof(*param));
> > +
> > + ret = get_user(size, &u_param->size);
> > + if (ret)
> > + return ret;
> > +
> > + if (size > PAGE_SIZE)
> > + return -E2BIG;
> > +
> > + if (!size || size > sizeof(struct pr_madvise_param))
> > + return -EINVAL;
> > +
> > + ret = copy_from_user(param, u_param, size);
> > + if (ret)
> > + return -EFAULT;
> > +
> > + return ret;
> > +}
> > +
> > +static int process_madvise_core(struct task_struct *tsk, int *behaviors,
> > + struct iov_iter *iter,
> > + const struct iovec *range_vec,
> > + unsigned long riovcnt,
> > + unsigned long flags)
> > +{
> > + int i;
> > + long err;
> > +
> > + for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) {
> > + long ret = 0;
> > +
> > + err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base,
> > + range_vec[i].iov_len, behaviors[i],
> > + &ret);
> > + if (err)
> > + ret = err;
> > +
> > + if (copy_to_iter(&ret, sizeof(long), iter) !=
> > + sizeof(long)) {
> > + err = -EFAULT;
> > + break;
> > + }
> > +
> > + err = 0;
> > + }
> > +
> > + return err;
> > +}
> > +
> > +SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem,
> > + const int __user *, hints,
> > + struct pr_madvise_param __user *, results,
> > + struct pr_madvise_param __user *, ranges,
> > + unsigned long, flags)
> > {
> > int ret;
> > struct fd f;
> > struct pid *pid;
> > struct task_struct *tsk;
> > struct mm_struct *mm;
> > + struct pr_madvise_param result_p, range_p;
> > + const struct iovec __user *result_vec, __user *range_vec;
> > + int *behaviors;
> > + struct iovec iovstack_result[UIO_FASTIOV];
> > + struct iovec iovstack_r[UIO_FASTIOV];
> > + struct iovec *iov_l = iovstack_result;
> > + struct iovec *iov_r = iovstack_r;
> > + struct iov_iter iter;
> > +
> > + if (flags != 0)
> > + return -EINVAL;
> > +
> > + ret = pr_madvise_copy_param(results, &result_p);
> > + if (ret)
> > + return ret;
> > +
> > + ret = pr_madvise_copy_param(ranges, &range_p);
> > + if (ret)
> > + return ret;
> > +
> > + result_vec = result_p.vec;
> > + range_vec = range_p.vec;
> > +
> > + if (result_p.size != sizeof(struct pr_madvise_param) ||
> > + range_p.size != sizeof(struct pr_madvise_param))
> > + return -EINVAL;
> > +
> > + behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL);
> > + if (!behaviors)
> > + return -ENOMEM;
> > +
> > + ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem);
> > + if (ret < 0)
> > + goto free_behavior_vec;
> > +
> > + ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV,
> > + &iov_l, &iter);
> > + if (ret < 0)
> > + goto free_behavior_vec;
> > +
> > + if (!iov_iter_count(&iter)) {
> > + ret = -EINVAL;
> > + goto free_iovecs;
> > + }
> > +
> > + ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem,
> > + UIO_FASTIOV, iovstack_r, &iov_r);
> > + if (ret <= 0)
> > + goto free_iovecs;
> >
> > f = fdget(pidfd);
> > - if (!f.file)
> > - return -EBADF;
> > + if (!f.file) {
> > + ret = -EBADF;
> > + goto free_iovecs;
> > + }
> >
> > pid = pidfd_to_pid(f.file);
> > if (IS_ERR(pid)) {
> > ret = PTR_ERR(pid);
> > - goto err;
> > + goto put_fd;
> > }
> >
> > ret = -EINVAL;
> > @@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
> > tsk = pid_task(pid, PIDTYPE_PID);
> > if (!tsk) {
> > rcu_read_unlock();
> > - goto err;
> > + goto put_fd;
> > }
> > get_task_struct(tsk);
> > rcu_read_unlock();
> > @@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
> > ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
> > if (ret == -EACCES)
> > ret = -EPERM;
> > - goto err;
> > + goto put_task;
> > }
> > - ret = madvise_core(tsk, start, len_in, behavior);
> > +
> > + ret = process_madvise_core(tsk, behaviors, &iter, iov_r,
> > + nr_elem, flags);
> > mmput(mm);
> > +put_task:
> > put_task_struct(tsk);
> > -err:
> > +put_fd:
> > fdput(f);
> > +free_iovecs:
> > + if (iov_r != iovstack_r)
> > + kfree(iov_r);
> > + kfree(iov_l);
> > +free_behavior_vec:
> > + kfree(behaviors);
> > +
> > return ret;
> > }
> > --
> > 2.21.0.1020.gf2820cf01a-goog
> >
>
> --
> Michal Hocko
> SUSE Labs