Re: [PATCH 3/8] fork: add option to not clone or dup files

From: Christian Brauner
Date: Fri Sep 17 2021 - 04:55:04 EST


On Thu, Sep 16, 2021 at 04:20:46PM -0500, Mike Christie wrote:
> Each vhost device gets a thread that is used to perform IO and management
> operations. Instead of a thread that is accessing a device, the thread is
> part of the device, so when it calls kernel_copy_process we can't dup or
> clone the parent's (Qemu thread that does the VHOST_SET_OWNER ioctl)
> files/FDS because it would do an extra increment on ourself.
>
> Later, when we do:
>
> Qemu process exits:
> do_exit -> exit_files -> put_files_struct -> close_files
>
> we would leak the device's resources because of that extra refcount
> on the fd or file_struct.
>
> This patch adds a no_files option so these worker threads can prevent
> taking an extra refcount on themselves.
>
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
> ---
> include/linux/sched/task.h | 3 ++-
> kernel/fork.c | 14 +++++++++++---
> 2 files changed, 13 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index c55f1eb69d41..d0b0872f56cc 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -32,6 +32,7 @@ struct kernel_clone_args {
> size_t set_tid_size;
> int cgroup;
> int io_thread;
> + int no_files;
> struct cgroup *cgrp;
> struct css_set *cset;
> };
> @@ -86,7 +87,7 @@ extern pid_t kernel_clone(struct kernel_clone_args *kargs);
> struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
> struct task_struct *kernel_copy_process(int (*fn)(void *), void *arg, int node,
> unsigned long clone_flags,
> - int io_thread);
> + int io_thread, int no_files);
> struct task_struct *fork_idle(int);
> struct mm_struct *copy_init_mm(void);
> extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cec7b6011beb..a0468e30b27e 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1532,7 +1532,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
> return 0;
> }
>
> -static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
> +static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
> + int no_files)
> {
> struct files_struct *oldf, *newf;
> int error = 0;
> @@ -1544,6 +1545,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
> if (!oldf)
> goto out;
>
> + if (no_files) {
> + tsk->files = NULL;
> + goto out;
> + }
> +
> if (clone_flags & CLONE_FILES) {
> atomic_inc(&oldf->count);
> goto out;
> @@ -2179,7 +2185,7 @@ static __latent_entropy struct task_struct *copy_process(
> retval = copy_semundo(clone_flags, p);
> if (retval)
> goto bad_fork_cleanup_security;
> - retval = copy_files(clone_flags, p);
> + retval = copy_files(clone_flags, p, args->no_files);
> if (retval)
> goto bad_fork_cleanup_semundo;
> retval = copy_fs(clone_flags, p);
> @@ -2539,6 +2545,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
> * @node: numa node to allocate task from
> * @clone_flags: CLONE flags
> * @io_thread: 1 if this will be a PF_IO_WORKER else 0.
> + * @no_files: Do not duplicate or copy the parent's open files.
> *
> * This returns a created task, or an error pointer. The returned task is
> * inactive, and the caller must fire it up through wake_up_new_task(p). If
> @@ -2546,7 +2553,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
> */
> struct task_struct *kernel_copy_process(int (*fn)(void *), void *arg, int node,
> unsigned long clone_flags,
> - int io_thread)
> + int io_thread, int no_files)

I think that the addition of no_files together with io_thread might be a
sign to introduce a set of kernel internal clone flags in
kernel_clone_args.