[RFC 6/7] mm: extend process_madvise syscall to support vector array

From: Minchan Kim
Date: Sun May 19 2019 - 23:57:03 EST


Currently, the process_madvise syscall works on only one address range,
so the user has to call the syscall several times to give hints for
multiple address ranges.

This patch extends the process_madvise syscall to support multiple
hints, address ranges and return values so the user can give hints
all at once.

/* Parameter block used for both the results and the ranges arguments. */
struct pr_madvise_param {
	int size; /* the size of this structure */
	const struct iovec __user *vec; /* address range array */
};

int process_madvise(int pidfd, ssize_t nr_elem,
int *behavior,
struct pr_madvise_param *results,
struct pr_madvise_param *ranges,
unsigned long flags);

- pidfd

target process fd

- nr_elem

the number of elements in each of the behavior, results and ranges arrays

- behavior

hints for each address range in remote process so that user could
give different hints for each range.

- results

array of buffers that receive the result of the action on each
associated remote address range.

- ranges

array of buffers holding the remote process's address ranges to be
processed

- flags

extra argument reserved for future use. It must be zero for now.

Example)

/* Userspace view of the parameter block handed to process_madvise(). */
struct pr_madvise_param {
int size; /* size of this structure in bytes; set via sizeof below */
const struct iovec *vec; /* iovec array: result buffers or address ranges */
};

/*
 * Example driver for the vectored process_madvise() syscall.
 *
 * Maps two anonymous regions, forks, and from the parent asks the kernel
 * to apply MADV_COLD to the first region and MADV_COOL to the second one
 * in the child, dumping the relevant /proc/vmstat counters before and
 * after the call.
 *
 * Returns 0 on success (and in the child), 1 when any setup step fails.
 */
int main(int argc, char *argv[])
{
	struct pr_madvise_param retp, rangep;
	struct iovec result_vec[2], range_vec[2];
	int hints[2];
	long ret[2];
	void *addr[2];

	pid_t pid;
	char cmd[64] = {0,};
	addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
			MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);

	if (MAP_FAILED == addr[0])
		return 1;

	addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
			MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);

	if (MAP_FAILED == addr[1])
		return 1;

	/* Element 0: hint, target range and result slot. */
	hints[0] = MADV_COLD;
	range_vec[0].iov_base = addr[0];
	range_vec[0].iov_len = ALLOC_SIZE;
	result_vec[0].iov_base = &ret[0];
	result_vec[0].iov_len = sizeof(long);
	retp.vec = result_vec;
	retp.size = sizeof(struct pr_madvise_param);

	/* Element 1: hint, target range and result slot. */
	hints[1] = MADV_COOL;
	range_vec[1].iov_base = addr[1];
	range_vec[1].iov_len = ALLOC_SIZE;
	result_vec[1].iov_base = &ret[1];
	result_vec[1].iov_len = sizeof(long);
	rangep.vec = range_vec;
	rangep.size = sizeof(struct pr_madvise_param);

	pid = fork();
	if (pid < 0)
		return 1;

	if (!pid) {
		/* Child: stay alive long enough for the parent to hint us. */
		sleep(10);
	} else {
		int pidfd;

		/*
		 * The path was previously left empty, so open() could never
		 * succeed.  NOTE(review): the O_DIRECTORY open suggests the
		 * pidfd is the child's /proc/<pid> directory -- confirm.
		 */
		snprintf(cmd, sizeof(cmd), "/proc/%d", pid);
		pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC);
		if (pidfd < 0)
			return 1;

		/* munmap to make pages private for the child */
		munmap(addr[0], ALLOC_SIZE);
		munmap(addr[1], ALLOC_SIZE);
		system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
		/* Pass the hints[] array filled in above (was an undeclared name). */
		if (syscall(__NR_process_madvise, pidfd, 2, hints,
					&retp, &rangep, 0))
			perror("process_madvise fail\n");
		system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
	}

	return 0;
}

Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
---
include/uapi/asm-generic/mman-common.h | 5 +
mm/madvise.c | 184 +++++++++++++++++++++----
2 files changed, 166 insertions(+), 23 deletions(-)

diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index b9b51eeb8e1a..b8e230de84a6 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -74,4 +74,9 @@
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
PKEY_DISABLE_WRITE)

+struct pr_madvise_param {
+ int size; /* size of this structure in bytes; the syscall rejects any other value */
+ const struct iovec __user *vec; /* user-space iovec array: address ranges or result buffers */
+};
+
#endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index af02aa17e5c1..f4f569dac2bd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
struct vm_area_struct *vma = walk->vma;
unsigned long next;
+ long nr_pages = 0;

next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
@@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,

ptep_test_and_clear_young(vma, addr, pte);
deactivate_page(page);
+ nr_pages++;
+
}

pte_unmap_unlock(orig_pte, ptl);
+ *(long *)walk->private += nr_pages;
cond_resched();

return 0;
@@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,

static void madvise_cool_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ unsigned long addr, unsigned long end,
+ long *nr_pages)
{
struct mm_walk cool_walk = {
.pmd_entry = madvise_cool_pte_range,
.mm = vma->vm_mm,
+ .private = nr_pages
};

tlb_start_vma(tlb, vma);
@@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb,
}

static long madvise_cool(struct vm_area_struct *vma,
- unsigned long start_addr, unsigned long end_addr)
+ unsigned long start_addr, unsigned long end_addr,
+ long *nr_pages)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
@@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma,

lru_add_drain();
tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
- madvise_cool_page_range(&tlb, vma, start_addr, end_addr);
+ madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
tlb_finish_mmu(&tlb, start_addr, end_addr);

return 0;
@@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
int isolated = 0;
struct vm_area_struct *vma = walk->vma;
unsigned long next;
+ long nr_pages = 0;

next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
@@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
list_add(&page->lru, &page_list);
if (isolated >= SWAP_CLUSTER_MAX) {
pte_unmap_unlock(orig_pte, ptl);
- reclaim_pages(&page_list);
+ nr_pages += reclaim_pages(&page_list);
isolated = 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
orig_pte = pte;
@@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
}

pte_unmap_unlock(orig_pte, ptl);
- reclaim_pages(&page_list);
+ nr_pages += reclaim_pages(&page_list);
cond_resched();

+ *(long *)walk->private += nr_pages;
return 0;
}

static void madvise_cold_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ unsigned long addr, unsigned long end,
+ long *nr_pages)
{
struct mm_walk warm_walk = {
.pmd_entry = madvise_cold_pte_range,
.mm = vma->vm_mm,
+ .private = nr_pages,
};

tlb_start_vma(tlb, vma);
@@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,


static long madvise_cold(struct vm_area_struct *vma,
- unsigned long start_addr, unsigned long end_addr)
+ unsigned long start_addr, unsigned long end_addr,
+ long *nr_pages)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
@@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma,

lru_add_drain();
tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
- madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
tlb_finish_mmu(&tlb, start_addr, end_addr);

return 0;
@@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior,
static long
madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
- unsigned long end, int behavior)
+ unsigned long end, int behavior, long *nr_pages)
{
switch (behavior) {
case MADV_REMOVE:
@@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_COOL:
- return madvise_cool(vma, start, end);
+ return madvise_cool(vma, start, end, nr_pages);
case MADV_COLD:
- return madvise_cold(vma, start, end);
+ return madvise_cold(vma, start, end, nr_pages);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(tsk, vma, prev, start,
@@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior)
}

static int madvise_core(struct task_struct *tsk, unsigned long start,
- size_t len_in, int behavior)
+ size_t len_in, int behavior, long *nr_pages)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,

if (start & ~PAGE_MASK)
return error;
+
len = (len_in + ~PAGE_MASK) & PAGE_MASK;

/* Check to see whether len was rounded up from small -ve to zero */
@@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
blk_start_plug(&plug);
for (;;) {
/* Still start < end. */
+ long pages = 0;
+
error = -ENOMEM;
if (!vma)
goto out;
@@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
tmp = end;

/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
- error = madvise_vma(tsk, vma, &prev, start, tmp, behavior);
+ error = madvise_vma(tsk, vma, &prev, start, tmp,
+ behavior, &pages);
if (error)
goto out;
+ *nr_pages += pages;
start = tmp;
if (prev && start < prev->vm_end)
start = prev->vm_end;
@@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
*/
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
- return madvise_core(current, start, len_in, behavior);
+ long dummy = 0; /* madvise_core() adds into this long; the count is discarded here */
+
+ return madvise_core(current, start, len_in, behavior, &dummy);
}

-SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
- size_t, len_in, int, behavior)
+static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param,
+ struct pr_madvise_param *param) /* copies *u_param into *param; returns 0 or -errno */
+{
+ u32 size;
+ int ret;
+
+ memset(param, 0, sizeof(*param)); /* bytes not covered by a short copy stay zero */
+
+ ret = get_user(size, &u_param->size); /* fetch the caller-declared struct size first */
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ if (!size || size > sizeof(struct pr_madvise_param)) /* empty, or larger than the layout known here */
+ return -EINVAL;
+
+ ret = copy_from_user(param, u_param, size);
+ if (ret) /* any non-zero result is reported as -EFAULT */
+ return -EFAULT;
+
+ return ret; /* ret is 0 on this path */
+}
+
+static int process_madvise_core(struct task_struct *tsk, int *behaviors,
+ struct iov_iter *iter,
+ const struct iovec *range_vec,
+ unsigned long riovcnt,
+ unsigned long flags) /* NOTE(review): flags is not used in this body */
+{
+ int i; /* NOTE(review): signed index compared against unsigned long riovcnt */
+ long err;
+
+ for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) { /* also stops once iter has no bytes left */
+ long ret = 0;
+
+ err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base,
+ range_vec[i].iov_len, behaviors[i],
+ &ret); /* ret receives the page count for range i */
+ if (err)
+ ret = err; /* on failure the slot carries the error code instead */
+
+ if (copy_to_iter(&ret, sizeof(long), iter) !=
+ sizeof(long)) {
+ err = -EFAULT;
+ break;
+ }
+
+ err = 0; /* a per-range error was already written to iter; it does not fail the call */
+ }
+
+ return err; /* 0, or -EFAULT when a result could not be written */
+}
+
+SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem,
+ const int __user *, hints,
+ struct pr_madvise_param __user *, results,
+ struct pr_madvise_param __user *, ranges,
+ unsigned long, flags)
{
int ret;
struct fd f;
struct pid *pid;
struct task_struct *tsk;
struct mm_struct *mm;
+ struct pr_madvise_param result_p, range_p;
+ const struct iovec __user *result_vec, __user *range_vec;
+ int *behaviors;
+ struct iovec iovstack_result[UIO_FASTIOV];
+ struct iovec iovstack_r[UIO_FASTIOV];
+ struct iovec *iov_l = iovstack_result;
+ struct iovec *iov_r = iovstack_r;
+ struct iov_iter iter;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ ret = pr_madvise_copy_param(results, &result_p);
+ if (ret)
+ return ret;
+
+ ret = pr_madvise_copy_param(ranges, &range_p);
+ if (ret)
+ return ret;
+
+ result_vec = result_p.vec;
+ range_vec = range_p.vec;
+
+ if (result_p.size != sizeof(struct pr_madvise_param) ||
+ range_p.size != sizeof(struct pr_madvise_param))
+ return -EINVAL;
+
+ behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL);
+ if (!behaviors)
+ return -ENOMEM;
+
+ ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem);
+ if (ret < 0)
+ goto free_behavior_vec;
+
+ ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV,
+ &iov_l, &iter);
+ if (ret < 0)
+ goto free_behavior_vec;
+
+ if (!iov_iter_count(&iter)) {
+ ret = -EINVAL;
+ goto free_iovecs;
+ }
+
+ ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem,
+ UIO_FASTIOV, iovstack_r, &iov_r);
+ if (ret <= 0)
+ goto free_iovecs;

f = fdget(pidfd);
- if (!f.file)
- return -EBADF;
+ if (!f.file) {
+ ret = -EBADF;
+ goto free_iovecs;
+ }

pid = pidfd_to_pid(f.file);
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
- goto err;
+ goto put_fd;
}

ret = -EINVAL;
@@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
tsk = pid_task(pid, PIDTYPE_PID);
if (!tsk) {
rcu_read_unlock();
- goto err;
+ goto put_fd;
}
get_task_struct(tsk);
rcu_read_unlock();
@@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
if (ret == -EACCES)
ret = -EPERM;
- goto err;
+ goto put_task;
}
- ret = madvise_core(tsk, start, len_in, behavior);
+
+ ret = process_madvise_core(tsk, behaviors, &iter, iov_r,
+ nr_elem, flags);
mmput(mm);
+put_task:
put_task_struct(tsk);
-err:
+put_fd:
fdput(f);
+free_iovecs:
+ if (iov_r != iovstack_r)
+ kfree(iov_r);
+ kfree(iov_l);
+free_behavior_vec:
+ kfree(behaviors);
+
return ret;
}
--
2.21.0.1020.gf2820cf01a-goog