Re: [PATCH] [RFC] vm: add a syscall to map a process memory into a pipe

From: Michael Kerrisk (man-pages)
Date: Mon Oct 30 2017 - 08:48:15 EST


Hi Andrei,

On 10 August 2017 at 20:46, Andrei Vagin <avagin@xxxxxxxxxx> wrote:
> It is a hybrid of process_vm_readv() and vmsplice().
>
> vmsplice can map memory from the current address space into a pipe.
> process_vm_readv can read memory of another process.
>
> The new system call can map memory of another process into a pipe.
>
> ssize_t process_vmsplice(pid_t pid, int fd, const struct iovec *iov,
> unsigned long nr_segs, unsigned int flags)
>
> All arguments are identical to those of vmsplice except pid, which specifies a
> target process.

Can we have a man page for this new syscall please?

Thanks,

Michael


> Currently if we want to dump a process memory to a file or to a socket,
> we can use process_vm_readv() + write(), but it is slow, because the data
> is copied into a temporary user-space buffer.
>
> A second way is to use vmsplice() + splice(). It is more effective,
> because data are not copied into a temporary buffer, but here is another
> problem. vmsplice works with the current address space, so it can be
> used only if we inject our code into a target process.
>
> The second way suffers from a few other issues:
> * a process has to be stopped to run parasite code
> * the number of pipes is limited, so it may be impossible to dump all
> memory in one iteration, and we have to stop process and inject our
> code a few times.
> * pages in pipes are unreclaimable, so it isn't good to hold a lot of
> memory in pipes.
>
> The introduced syscall allows using the second way without injecting any
> code into a target process.
>
> My experiments show that process_vmsplice() + splice() works two times
> faster than process_vm_readv() + write().
>
> It is particularly useful on a pre-dump stage. On this stage we enable a
> memory tracker, and then we are dumping a process memory while a
> process continues to work. On the first iteration we are dumping all
> memory, and then we are dumping only memory modified since the previous
> iteration. After a few pre-dump operations, a process is stopped and
> dumped finally. The pre-dump operations significantly decrease
> a process downtime, when a process is migrated to another host.
>
> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
> Cc: Arnd Bergmann <arnd@xxxxxxxx>
> Cc: Pavel Emelyanov <xemul@xxxxxxxxxxxxx>
> Cc: Michael Kerrisk <mtk.manpages@xxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Signed-off-by: Andrei Vagin <avagin@xxxxxxxxxx>
> ---
> fs/splice.c | 219 ++++++++++++++++++++++++++++++++++++++
> include/linux/compat.h | 3 +
> include/linux/syscalls.h | 4 +
> include/uapi/asm-generic/unistd.h | 5 +-
> 4 files changed, 230 insertions(+), 1 deletion(-)
>
> diff --git a/fs/splice.c b/fs/splice.c
> index ae41201..4b050a4 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -34,6 +34,7 @@
> #include <linux/socket.h>
> #include <linux/compat.h>
> #include <linux/sched/signal.h>
> +#include <linux/sched/mm.h>
>
> #include "internal.h"
>
> @@ -1374,6 +1375,201 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
> return error;
> }
>
> +/*
> + * Map pages from a specified task into a pipe
> + */
> +static int remote_single_vec_to_pipe(struct task_struct *task,
> + struct mm_struct *mm,
> + const struct iovec *rvec,
> + struct pipe_inode_info *pipe,
> + unsigned int flags,
> + size_t *total)
> +{
> + struct pipe_buffer buf = {
> + .ops = &user_page_pipe_buf_ops,
> + .flags = flags
> + };
> + unsigned long addr = (unsigned long) rvec->iov_base;
> + unsigned long pa = addr & PAGE_MASK;
> + unsigned long start_offset = addr - pa;
> + unsigned long nr_pages;
> + ssize_t len = rvec->iov_len;
> + struct page *process_pages[16];
> + bool failed = false;
> + int ret = 0;
> +
> + nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
> + while (nr_pages) {
> + long pages = min(nr_pages, 16UL);
> + int locked = 1, n;
> + ssize_t copied;
> +
> + /*
> + * Get the pages we're interested in. We must
> + * access remotely because task/mm might not be
> + * current/current->mm
> + */
> + down_read(&mm->mmap_sem);
> + pages = get_user_pages_remote(task, mm, pa, pages, flags,
> + process_pages, NULL, &locked);
> + if (locked)
> + up_read(&mm->mmap_sem);
> + if (pages <= 0) {
> + failed = true;
> + ret = -EFAULT;
> + break;
> + }
> +
> + copied = pages * PAGE_SIZE - start_offset;
> + if (copied > len)
> + copied = len;
> + len -= copied;
> +
> + for (n = 0; copied; n++, start_offset = 0) {
> + int size = min_t(int, copied, PAGE_SIZE - start_offset);
> +
> + if (!failed) {
> + buf.page = process_pages[n];
> + buf.offset = start_offset;
> + buf.len = size;
> + ret = add_to_pipe(pipe, &buf);
> + if (unlikely(ret < 0))
> + failed = true;
> + else
> + *total += ret;
> + } else {
> + put_page(process_pages[n]);
> + }
> + copied -= size;
> + }
> + if (failed)
> + break;
> + start_offset = 0;
> + nr_pages -= pages;
> + pa += pages * PAGE_SIZE;
> + }
> + return ret < 0 ? ret : 0;
> +}
> +
> +static ssize_t remote_iovec_to_pipe(struct task_struct *task,
> + struct mm_struct *mm,
> + const struct iovec *rvec,
> + unsigned long riovcnt,
> + struct pipe_inode_info *pipe,
> + unsigned int flags)
> +{
> + size_t total = 0;
> + int ret = 0, i;
> +
> + for (i = 0; i < riovcnt; i++) {
> + /* Work out address and page range required */
> + if (rvec[i].iov_len == 0)
> + continue;
> +
> + ret = remote_single_vec_to_pipe(
> + task, mm, &rvec[i], pipe, flags, &total);
> + if (ret < 0)
> + break;
> + }
> + return total ? total : ret;
> +}
> +
> +static long process_vmsplice_to_pipe(struct task_struct *task,
> + struct mm_struct *mm, struct file *file,
> + const struct iovec __user *uiov,
> + unsigned long nr_segs, unsigned int flags)
> +{
> + struct pipe_inode_info *pipe;
> + struct iovec iovstack[UIO_FASTIOV];
> + struct iovec *iov = iovstack;
> + unsigned int buf_flag = 0;
> + long ret;
> +
> + if (flags & SPLICE_F_GIFT)
> + buf_flag = PIPE_BUF_FLAG_GIFT;
> +
> + pipe = get_pipe_info(file);
> + if (!pipe)
> + return -EBADF;
> +
> + ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, uiov, nr_segs,
> + UIO_FASTIOV, iovstack, &iov);
> + if (ret < 0)
> + return ret;
> +
> + pipe_lock(pipe);
> + ret = wait_for_space(pipe, flags);
> + if (!ret)
> + ret = remote_iovec_to_pipe(task, mm, iov,
> + nr_segs, pipe, buf_flag);
> + pipe_unlock(pipe);
> + if (ret > 0)
> + wakeup_pipe_readers(pipe);
> +
> + if (iov != iovstack)
> + kfree(iov);
> + return ret;
> +}
> +
> +/* process_vmsplice splices a process address range into a pipe. */
> +SYSCALL_DEFINE5(process_vmsplice, int, pid, int, fd,
> + const struct iovec __user *, iov,
> + unsigned long, nr_segs, unsigned int, flags)
> +{
> + struct task_struct *task;
> + struct mm_struct *mm;
> + struct fd f;
> + long ret;
> +
> + if (unlikely(flags & ~SPLICE_F_ALL))
> + return -EINVAL;
> + if (unlikely(nr_segs > UIO_MAXIOV))
> + return -EINVAL;
> + else if (unlikely(!nr_segs))
> + return 0;
> +
> + f = fdget(fd);
> + if (!f.file)
> + return -EBADF;
> +
> + /* Get process information */
> + rcu_read_lock();
> + task = find_task_by_vpid(pid);
> + if (task)
> + get_task_struct(task);
> + rcu_read_unlock();
> + if (!task) {
> + ret = -ESRCH;
> + goto out_fput;
> + }
> +
> + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
> + if (!mm || IS_ERR(mm)) {
> + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
> + /*
> + * Explicitly map EACCES to EPERM as EPERM is a more
> + * appropriate error code for process_vm_readv/writev
> + */
> + if (ret == -EACCES)
> + ret = -EPERM;
> + goto put_task_struct;
> + }
> +
> + ret = -EBADF;
> + if (f.file->f_mode & FMODE_WRITE)
> + ret = process_vmsplice_to_pipe(task, mm, f.file,
> + iov, nr_segs, flags);
> + mmput(mm);
> +
> +put_task_struct:
> + put_task_struct(task);
> +
> +out_fput:
> + fdput(f);
> +
> + return ret;
> +}
> +
> #ifdef CONFIG_COMPAT
> COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
> unsigned int, nr_segs, unsigned int, flags)
> @@ -1393,6 +1589,29 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
> }
> return sys_vmsplice(fd, iov, nr_segs, flags);
> }
> +
> +COMPAT_SYSCALL_DEFINE5(process_vmsplice, pid_t, pid, int, fd,
> + const struct compat_iovec __user *, iov32,
> + unsigned int, nr_segs, unsigned int, flags)
> +{
> + struct iovec __user *iov;
> + unsigned int i;
> +
> + if (nr_segs > UIO_MAXIOV)
> + return -EINVAL;
> +
> + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
> + for (i = 0; i < nr_segs; i++) {
> + struct compat_iovec v;
> +
> + if (get_user(v.iov_base, &iov32[i].iov_base) ||
> + get_user(v.iov_len, &iov32[i].iov_len) ||
> + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
> + put_user(v.iov_len, &iov[i].iov_len))
> + return -EFAULT;
> + }
> + return sys_process_vmsplice(pid, fd, iov, nr_segs, flags);
> +}
> #endif
>
> SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
> diff --git a/include/linux/compat.h b/include/linux/compat.h
> index 5a6a109..3590cc7 100644
> --- a/include/linux/compat.h
> +++ b/include/linux/compat.h
> @@ -550,6 +550,9 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
> unsigned int count);
> asmlinkage long compat_sys_vmsplice(int fd, const struct compat_iovec __user *,
> unsigned int nr_segs, unsigned int flags);
> +asmlinkage long compat_sys_process_vmsplice(pid_t pid, int fd,
> + const struct compat_iovec __user *,
> + unsigned int nr_segs, unsigned int flags);
> asmlinkage long compat_sys_open(const char __user *filename, int flags,
> umode_t mode);
> asmlinkage long compat_sys_openat(int dfd, const char __user *filename,
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 3cb15ea..49bdf96 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -906,4 +906,8 @@ asmlinkage long sys_pkey_free(int pkey);
> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> unsigned mask, struct statx __user *buffer);
>
> +asmlinkage long sys_process_vmsplice(pid_t pid,
> + int fd, const struct iovec __user *iov,
> + unsigned long nr_segs, unsigned int flags);
> +
> #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 061185a..d18019d 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -731,9 +731,12 @@ __SYSCALL(__NR_pkey_alloc, sys_pkey_alloc)
> __SYSCALL(__NR_pkey_free, sys_pkey_free)
> #define __NR_statx 291
> __SYSCALL(__NR_statx, sys_statx)
> +#define __NR_process_vmsplice 292
> +__SC_COMP(__NR_process_vmsplice, sys_process_vmsplice,
> + compat_sys_process_vmsplice)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 292
> +#define __NR_syscalls 293
>
> /*
> * All syscalls below here should go away really,
> --
> 2.9.4
>



--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/