Re: [RFC 6/7] mm: extend process_madvise syscall to support vector arrary

From: Michal Hocko
Date: Mon May 20 2019 - 05:25:23 EST


[Cc linux-api]

On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> Currently, process_madvise syscall works for only one address range
> so user should call the syscall several times to give hints to
> multiple address range.

Is that a problem? How big of a problem? Any numbers?

> This patch extends process_madvise syscall to support multiple
> hints, address ranges and return vaules so user could give hints
> all at once.
>
> struct pr_madvise_param {
> int size; /* the size of this structure */
> const struct iovec __user *vec; /* address range array */
> }
>
> int process_madvise(int pidfd, ssize_t nr_elem,
> int *behavior,
> struct pr_madvise_param *results,
> struct pr_madvise_param *ranges,
> unsigned long flags);
>
> - pidfd
>
> target process fd
>
> - nr_elem
>
> the number of elemenent of array behavior, results, ranges
>
> - behavior
>
> hints for each address range in remote process so that user could
> give different hints for each range.

What is the guarantee of a single call? Do all hints get applied or the
first failure backs of? What are the atomicity guarantees?

>
> - results
>
> array of buffers to get results for associated remote address range
> action.
>
> - ranges
>
> array to buffers to have remote process's address ranges to be
> processed
>
> - flags
>
> extra argument for the future. It should be zero this moment.
>
> Example)
>
> struct pr_madvise_param {
> int size;
> const struct iovec *vec;
> };
>
> int main(int argc, char *argv[])
> {
> struct pr_madvise_param retp, rangep;
> struct iovec result_vec[2], range_vec[2];
> int hints[2];
> long ret[2];
> void *addr[2];
>
> pid_t pid;
> char cmd[64] = {0,};
> addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
> MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
>
> if (MAP_FAILED == addr[0])
> return 1;
>
> addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
> MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
>
> if (MAP_FAILED == addr[1])
> return 1;
>
> hints[0] = MADV_COLD;
> range_vec[0].iov_base = addr[0];
> range_vec[0].iov_len = ALLOC_SIZE;
> result_vec[0].iov_base = &ret[0];
> result_vec[0].iov_len = sizeof(long);
> retp.vec = result_vec;
> retp.size = sizeof(struct pr_madvise_param);
>
> hints[1] = MADV_COOL;
> range_vec[1].iov_base = addr[1];
> range_vec[1].iov_len = ALLOC_SIZE;
> result_vec[1].iov_base = &ret[1];
> result_vec[1].iov_len = sizeof(long);
> rangep.vec = range_vec;
> rangep.size = sizeof(struct pr_madvise_param);
>
> pid = fork();
> if (!pid) {
> sleep(10);
> } else {
> int pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC);
> if (pidfd < 0)
> return 1;
>
> /* munmap to make pages private for the child */
> munmap(addr[0], ALLOC_SIZE);
> munmap(addr[1], ALLOC_SIZE);
> system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
> if (syscall(__NR_process_madvise, pidfd, 2, behaviors,
> &retp, &rangep, 0))
> perror("process_madvise fail\n");
> system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
> }
>
> return 0;
> }
>
> Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
> ---
> include/uapi/asm-generic/mman-common.h | 5 +
> mm/madvise.c | 184 +++++++++++++++++++++----
> 2 files changed, 166 insertions(+), 23 deletions(-)
>
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index b9b51eeb8e1a..b8e230de84a6 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -74,4 +74,9 @@
> #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
> PKEY_DISABLE_WRITE)
>
> +struct pr_madvise_param {
> + int size; /* the size of this structure */
> + const struct iovec __user *vec; /* address range array */
> +};
> +
> #endif /* __ASM_GENERIC_MMAN_COMMON_H */
> diff --git a/mm/madvise.c b/mm/madvise.c
> index af02aa17e5c1..f4f569dac2bd 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
> struct page *page;
> struct vm_area_struct *vma = walk->vma;
> unsigned long next;
> + long nr_pages = 0;
>
> next = pmd_addr_end(addr, end);
> if (pmd_trans_huge(*pmd)) {
> @@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>
> ptep_test_and_clear_young(vma, addr, pte);
> deactivate_page(page);
> + nr_pages++;
> +
> }
>
> pte_unmap_unlock(orig_pte, ptl);
> + *(long *)walk->private += nr_pages;
> cond_resched();
>
> return 0;
> @@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>
> static void madvise_cool_page_range(struct mmu_gather *tlb,
> struct vm_area_struct *vma,
> - unsigned long addr, unsigned long end)
> + unsigned long addr, unsigned long end,
> + long *nr_pages)
> {
> struct mm_walk cool_walk = {
> .pmd_entry = madvise_cool_pte_range,
> .mm = vma->vm_mm,
> + .private = nr_pages
> };
>
> tlb_start_vma(tlb, vma);
> @@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb,
> }
>
> static long madvise_cool(struct vm_area_struct *vma,
> - unsigned long start_addr, unsigned long end_addr)
> + unsigned long start_addr, unsigned long end_addr,
> + long *nr_pages)
> {
> struct mm_struct *mm = vma->vm_mm;
> struct mmu_gather tlb;
> @@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma,
>
> lru_add_drain();
> tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> - madvise_cool_page_range(&tlb, vma, start_addr, end_addr);
> + madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
> tlb_finish_mmu(&tlb, start_addr, end_addr);
>
> return 0;
> @@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> int isolated = 0;
> struct vm_area_struct *vma = walk->vma;
> unsigned long next;
> + long nr_pages = 0;
>
> next = pmd_addr_end(addr, end);
> if (pmd_trans_huge(*pmd)) {
> @@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> list_add(&page->lru, &page_list);
> if (isolated >= SWAP_CLUSTER_MAX) {
> pte_unmap_unlock(orig_pte, ptl);
> - reclaim_pages(&page_list);
> + nr_pages += reclaim_pages(&page_list);
> isolated = 0;
> pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> orig_pte = pte;
> @@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> }
>
> pte_unmap_unlock(orig_pte, ptl);
> - reclaim_pages(&page_list);
> + nr_pages += reclaim_pages(&page_list);
> cond_resched();
>
> + *(long *)walk->private += nr_pages;
> return 0;
> }
>
> static void madvise_cold_page_range(struct mmu_gather *tlb,
> struct vm_area_struct *vma,
> - unsigned long addr, unsigned long end)
> + unsigned long addr, unsigned long end,
> + long *nr_pages)
> {
> struct mm_walk warm_walk = {
> .pmd_entry = madvise_cold_pte_range,
> .mm = vma->vm_mm,
> + .private = nr_pages,
> };
>
> tlb_start_vma(tlb, vma);
> @@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
>
>
> static long madvise_cold(struct vm_area_struct *vma,
> - unsigned long start_addr, unsigned long end_addr)
> + unsigned long start_addr, unsigned long end_addr,
> + long *nr_pages)
> {
> struct mm_struct *mm = vma->vm_mm;
> struct mmu_gather tlb;
> @@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma,
>
> lru_add_drain();
> tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> - madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
> + madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
> tlb_finish_mmu(&tlb, start_addr, end_addr);
>
> return 0;
> @@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior,
> static long
> madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
> struct vm_area_struct **prev, unsigned long start,
> - unsigned long end, int behavior)
> + unsigned long end, int behavior, long *nr_pages)
> {
> switch (behavior) {
> case MADV_REMOVE:
> @@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
> case MADV_WILLNEED:
> return madvise_willneed(vma, prev, start, end);
> case MADV_COOL:
> - return madvise_cool(vma, start, end);
> + return madvise_cool(vma, start, end, nr_pages);
> case MADV_COLD:
> - return madvise_cold(vma, start, end);
> + return madvise_cold(vma, start, end, nr_pages);
> case MADV_FREE:
> case MADV_DONTNEED:
> return madvise_dontneed_free(tsk, vma, prev, start,
> @@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior)
> }
>
> static int madvise_core(struct task_struct *tsk, unsigned long start,
> - size_t len_in, int behavior)
> + size_t len_in, int behavior, long *nr_pages)
> {
> unsigned long end, tmp;
> struct vm_area_struct *vma, *prev;
> @@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>
> if (start & ~PAGE_MASK)
> return error;
> +
> len = (len_in + ~PAGE_MASK) & PAGE_MASK;
>
> /* Check to see whether len was rounded up from small -ve to zero */
> @@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
> blk_start_plug(&plug);
> for (;;) {
> /* Still start < end. */
> + long pages = 0;
> +
> error = -ENOMEM;
> if (!vma)
> goto out;
> @@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
> tmp = end;
>
> /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
> - error = madvise_vma(tsk, vma, &prev, start, tmp, behavior);
> + error = madvise_vma(tsk, vma, &prev, start, tmp,
> + behavior, &pages);
> if (error)
> goto out;
> + *nr_pages += pages;
> start = tmp;
> if (prev && start < prev->vm_end)
> start = prev->vm_end;
> @@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
> */
> SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
> {
> - return madvise_core(current, start, len_in, behavior);
> + unsigned long dummy;
> +
> + return madvise_core(current, start, len_in, behavior, &dummy);
> }
>
> -SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
> - size_t, len_in, int, behavior)
> +static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param,
> + struct pr_madvise_param *param)
> +{
> + u32 size;
> + int ret;
> +
> + memset(param, 0, sizeof(*param));
> +
> + ret = get_user(size, &u_param->size);
> + if (ret)
> + return ret;
> +
> + if (size > PAGE_SIZE)
> + return -E2BIG;
> +
> + if (!size || size > sizeof(struct pr_madvise_param))
> + return -EINVAL;
> +
> + ret = copy_from_user(param, u_param, size);
> + if (ret)
> + return -EFAULT;
> +
> + return ret;
> +}
> +
> +static int process_madvise_core(struct task_struct *tsk, int *behaviors,
> + struct iov_iter *iter,
> + const struct iovec *range_vec,
> + unsigned long riovcnt,
> + unsigned long flags)
> +{
> + int i;
> + long err;
> +
> + for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) {
> + long ret = 0;
> +
> + err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base,
> + range_vec[i].iov_len, behaviors[i],
> + &ret);
> + if (err)
> + ret = err;
> +
> + if (copy_to_iter(&ret, sizeof(long), iter) !=
> + sizeof(long)) {
> + err = -EFAULT;
> + break;
> + }
> +
> + err = 0;
> + }
> +
> + return err;
> +}
> +
> +SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem,
> + const int __user *, hints,
> + struct pr_madvise_param __user *, results,
> + struct pr_madvise_param __user *, ranges,
> + unsigned long, flags)
> {
> int ret;
> struct fd f;
> struct pid *pid;
> struct task_struct *tsk;
> struct mm_struct *mm;
> + struct pr_madvise_param result_p, range_p;
> + const struct iovec __user *result_vec, __user *range_vec;
> + int *behaviors;
> + struct iovec iovstack_result[UIO_FASTIOV];
> + struct iovec iovstack_r[UIO_FASTIOV];
> + struct iovec *iov_l = iovstack_result;
> + struct iovec *iov_r = iovstack_r;
> + struct iov_iter iter;
> +
> + if (flags != 0)
> + return -EINVAL;
> +
> + ret = pr_madvise_copy_param(results, &result_p);
> + if (ret)
> + return ret;
> +
> + ret = pr_madvise_copy_param(ranges, &range_p);
> + if (ret)
> + return ret;
> +
> + result_vec = result_p.vec;
> + range_vec = range_p.vec;
> +
> + if (result_p.size != sizeof(struct pr_madvise_param) ||
> + range_p.size != sizeof(struct pr_madvise_param))
> + return -EINVAL;
> +
> + behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL);
> + if (!behaviors)
> + return -ENOMEM;
> +
> + ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem);
> + if (ret < 0)
> + goto free_behavior_vec;
> +
> + ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV,
> + &iov_l, &iter);
> + if (ret < 0)
> + goto free_behavior_vec;
> +
> + if (!iov_iter_count(&iter)) {
> + ret = -EINVAL;
> + goto free_iovecs;
> + }
> +
> + ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem,
> + UIO_FASTIOV, iovstack_r, &iov_r);
> + if (ret <= 0)
> + goto free_iovecs;
>
> f = fdget(pidfd);
> - if (!f.file)
> - return -EBADF;
> + if (!f.file) {
> + ret = -EBADF;
> + goto free_iovecs;
> + }
>
> pid = pidfd_to_pid(f.file);
> if (IS_ERR(pid)) {
> ret = PTR_ERR(pid);
> - goto err;
> + goto put_fd;
> }
>
> ret = -EINVAL;
> @@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
> tsk = pid_task(pid, PIDTYPE_PID);
> if (!tsk) {
> rcu_read_unlock();
> - goto err;
> + goto put_fd;
> }
> get_task_struct(tsk);
> rcu_read_unlock();
> @@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
> ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
> if (ret == -EACCES)
> ret = -EPERM;
> - goto err;
> + goto put_task;
> }
> - ret = madvise_core(tsk, start, len_in, behavior);
> +
> + ret = process_madvise_core(tsk, behaviors, &iter, iov_r,
> + nr_elem, flags);
> mmput(mm);
> +put_task:
> put_task_struct(tsk);
> -err:
> +put_fd:
> fdput(f);
> +free_iovecs:
> + if (iov_r != iovstack_r)
> + kfree(iov_r);
> + kfree(iov_l);
> +free_behavior_vec:
> + kfree(behaviors);
> +
> return ret;
> }
> --
> 2.21.0.1020.gf2820cf01a-goog
>

--
Michal Hocko
SUSE Labs