Re: [PATCH v2 3/3] mm/maps: read proc/pid/maps under RCU

From: Suren Baghdasaryan
Date: Thu Jan 25 2024 - 16:32:22 EST


On Tue, Jan 23, 2024 at 3:10 PM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote:
>
> With maple_tree supporting vma tree traversal under RCU and per-vma locks
> making vma access RCU-safe, /proc/pid/maps can be read under RCU and
> without the need to read-lock mmap_lock. However vma content can change
> from under us, therefore we make a copy of the vma and we pin pointer
> fields used when generating the output (currently only vm_file and
> anon_name). Afterwards we check for concurrent address space
> modifications, wait for them to end and retry. That last check is needed
> to avoid possibility of missing a vma during concurrent maple_tree
> node replacement, which might report a NULL when a vma is replaced
> with another one. While we take the mmap_lock for reading during such
> contention, we do that momentarily only to record new mm_wr_seq counter.
> This change is designed to reduce mmap_lock contention and prevent a
> process reading /proc/pid/maps files (often a low priority task, such as
> monitoring/data collection services) from blocking address space updates.
>
> Note that this change has a userspace visible disadvantage: it allows for
> sub-page data tearing as opposed to the previous mechanism where data
> tearing could happen only between pages of generated output data.
> Since current userspace considers data tearing between pages to be
> acceptable, we assume it will be able to handle sub-page data tearing
> as well.
>
> Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> ---
> Changes since v1 [1]:
> - Fixed CONFIG_ANON_VMA_NAME=n build by introducing
> anon_vma_name_{get|put}_if_valid, per SeongJae Park
> - Fixed misspelling of get_vma_snapshot()
>
> [1] https://lore.kernel.org/all/20240122071324.2099712-3-surenb@xxxxxxxxxx/
>
> fs/proc/internal.h | 2 +
> fs/proc/task_mmu.c | 113 +++++++++++++++++++++++++++++++++++---
> include/linux/mm_inline.h | 18 ++++++
> 3 files changed, 126 insertions(+), 7 deletions(-)
>
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index a71ac5379584..e0247225bb68 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -290,6 +290,8 @@ struct proc_maps_private {
> struct task_struct *task;
> struct mm_struct *mm;
> struct vma_iterator iter;
> + unsigned long mm_wr_seq;
> + struct vm_area_struct vma_copy;
> #ifdef CONFIG_NUMA
> struct mempolicy *task_mempolicy;
> #endif
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 3f78ebbb795f..0d5a515156ee 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -126,11 +126,95 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
> }
> #endif
>
> -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
> - loff_t *ppos)
> +#ifdef CONFIG_PER_VMA_LOCK
> +
> +static const struct seq_operations proc_pid_maps_op;
> +
> +/*
> + * Take VMA snapshot and pin vm_file and anon_name as they are used by
> + * show_map_vma.
> + */
> +static int get_vma_snapshot(struct proc_maps_private *priv, struct vm_area_struct *vma)
> +{
> + struct vm_area_struct *copy = &priv->vma_copy;
> + int ret = -EAGAIN;
> +
> + memcpy(copy, vma, sizeof(*vma));
> + if (copy->vm_file && !get_file_rcu(&copy->vm_file))

There is a problem in this patchset. I assumed that get_file_rcu() can
be used against vma->vm_file but that's not true. vma->vm_file is
freed via a call to fput(), which schedules its freeing using
schedule_delayed_work(..., 1), but I don't think that constitutes an
RCU grace period, so it can be freed from under us.
Andrew, could you please remove this patchset from your tree until I
sort this out?
Thanks,
Suren.

> + goto out;
> +
> + if (!anon_vma_name_get_if_valid(copy))
> + goto put_file;
> +
> + if (priv->mm_wr_seq == mmap_write_seq_read(priv->mm))
> + return 0;
> +
> + /* Address space got modified, vma might be stale. Wait and retry */
> + rcu_read_unlock();
> + ret = mmap_read_lock_killable(priv->mm);
> + mmap_write_seq_record(priv->mm, &priv->mm_wr_seq);
> + mmap_read_unlock(priv->mm);
> + rcu_read_lock();
> +
> + if (!ret)
> + ret = -EAGAIN; /* no other errors, ok to retry */
> +
> + anon_vma_name_put_if_valid(copy);
> +put_file:
> + if (copy->vm_file)
> + fput(copy->vm_file);
> +out:
> + return ret;
> +}
> +
> +static void put_vma_snapshot(struct proc_maps_private *priv)
> +{
> + struct vm_area_struct *vma = &priv->vma_copy;
> +
> + anon_vma_name_put_if_valid(vma);
> + if (vma->vm_file)
> + fput(vma->vm_file);
> +}
> +
> +static inline bool needs_mmap_lock(struct seq_file *m)
> +{
> + /*
> + * smaps and numa_maps perform page table walk, therefore require
> + * mmap_lock but maps can be read under RCU.
> + */
> + return m->op != &proc_pid_maps_op;
> +}
> +
> +#else /* CONFIG_PER_VMA_LOCK */
> +
> +/* Without per-vma locks VMA access is not RCU-safe */
> +static inline bool needs_mmap_lock(struct seq_file *m) { return true; }
> +
> +#endif /* CONFIG_PER_VMA_LOCK */
> +
> +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
> {
> + struct proc_maps_private *priv = m->private;
> struct vm_area_struct *vma = vma_next(&priv->iter);
>
> +#ifdef CONFIG_PER_VMA_LOCK
> + if (vma && !needs_mmap_lock(m)) {
> + int ret;
> +
> + put_vma_snapshot(priv);
> + while ((ret = get_vma_snapshot(priv, vma)) == -EAGAIN) {
> + /* lookup the vma at the last position again */
> + vma_iter_init(&priv->iter, priv->mm, *ppos);
> + vma = vma_next(&priv->iter);
> + }
> +
> + if (ret) {
> + put_vma_snapshot(priv);
> + return NULL;
> + }
> + vma = &priv->vma_copy;
> + }
> +#endif
> if (vma) {
> *ppos = vma->vm_start;
> } else {
> @@ -169,12 +253,20 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
> return ERR_PTR(-EINTR);
> }
>
> + /* Drop mmap_lock if possible */
> + if (!needs_mmap_lock(m)) {
> + mmap_write_seq_record(priv->mm, &priv->mm_wr_seq);
> + mmap_read_unlock(priv->mm);
> + rcu_read_lock();
> + memset(&priv->vma_copy, 0, sizeof(priv->vma_copy));
> + }
> +
> vma_iter_init(&priv->iter, mm, last_addr);
> hold_task_mempolicy(priv);
> if (last_addr == -2UL)
> return get_gate_vma(mm);
>
> - return proc_get_vma(priv, ppos);
> + return proc_get_vma(m, ppos);
> }
>
> static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
> @@ -183,7 +275,7 @@ static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
> *ppos = -1UL;
> return NULL;
> }
> - return proc_get_vma(m->private, ppos);
> + return proc_get_vma(m, ppos);
> }
>
> static void m_stop(struct seq_file *m, void *v)
> @@ -195,7 +287,10 @@ static void m_stop(struct seq_file *m, void *v)
> return;
>
> release_task_mempolicy(priv);
> - mmap_read_unlock(mm);
> + if (needs_mmap_lock(m))
> + mmap_read_unlock(mm);
> + else
> + rcu_read_unlock();
> mmput(mm);
> put_task_struct(priv->task);
> priv->task = NULL;
> @@ -283,8 +378,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
> start = vma->vm_start;
> end = vma->vm_end;
> show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
> - if (mm)
> - anon_name = anon_vma_name(vma);
> + if (mm) {
> + anon_name = needs_mmap_lock(m) ? anon_vma_name(vma) :
> + anon_vma_name_get_rcu(vma);
> + }
>
> /*
> * Print the dentry name for named mappings, and a
> @@ -338,6 +435,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
> seq_puts(m, name);
> }
> seq_putc(m, '\n');
> + if (anon_name && !needs_mmap_lock(m))
> + anon_vma_name_put(anon_name);
> }
>
> static int show_map(struct seq_file *m, void *v)
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index bbdb0ca857f1..a4a644fe005e 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -413,6 +413,21 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
>
> struct anon_vma_name *anon_vma_name_get_rcu(struct vm_area_struct *vma);
>
> +/*
> + * Takes a reference if anon_vma is valid and stable (has references).
> + * Fails only if anon_vma is valid but we failed to get a reference.
> + */
> +static inline bool anon_vma_name_get_if_valid(struct vm_area_struct *vma)
> +{
> + return !vma->anon_name || anon_vma_name_get_rcu(vma);
> +}
> +
> +static inline void anon_vma_name_put_if_valid(struct vm_area_struct *vma)
> +{
> + if (vma->anon_name)
> + anon_vma_name_put(vma->anon_name);
> +}
> +
> #else /* CONFIG_ANON_VMA_NAME */
> static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
> static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
> @@ -432,6 +447,9 @@ struct anon_vma_name *anon_vma_name_get_rcu(struct vm_area_struct *vma)
> return NULL;
> }
>
> +static inline bool anon_vma_name_get_if_valid(struct vm_area_struct *vma) { return true; }
> +static inline void anon_vma_name_put_if_valid(struct vm_area_struct *vma) {}
> +
> #endif /* CONFIG_ANON_VMA_NAME */
>
> static inline void init_tlb_flush_pending(struct mm_struct *mm)
> --
> 2.43.0.429.g432eaa2c6b-goog
>