Re: [PATCH] aio: Convert ioctx_table to XArray

From: Matthew Wilcox
Date: Thu Dec 06 2018 - 16:08:09 EST


ping

On Wed, Nov 28, 2018 at 10:35:31AM -0800, Matthew Wilcox wrote:
> This custom resizing array was vulnerable to a Spectre attack (speculating
> off the end of an array to a user-controlled offset). The XArray is
> not vulnerable to Spectre as it always masks its lookups to be within
> the bounds of the array.
>
> Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> ---
> fs/aio.c | 182 ++++++++++++++-------------------------
> include/linux/mm_types.h | 5 +-
> kernel/fork.c | 3 +-
> mm/debug.c | 6 --
> 4 files changed, 67 insertions(+), 129 deletions(-)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index 301e6314183b..51ba7446a24f 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -71,12 +71,6 @@ struct aio_ring {
>
> #define AIO_RING_PAGES 8
>
> -struct kioctx_table {
> - struct rcu_head rcu;
> - unsigned nr;
> - struct kioctx __rcu *table[];
> -};
> -
> struct kioctx_cpu {
> unsigned reqs_available;
> };
> @@ -313,27 +307,22 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
> {
> struct file *file = vma->vm_file;
> struct mm_struct *mm = vma->vm_mm;
> - struct kioctx_table *table;
> - int i, res = -EINVAL;
> + struct kioctx *ctx;
> + unsigned long index;
> + int res = -EINVAL;
>
> - spin_lock(&mm->ioctx_lock);
> - rcu_read_lock();
> - table = rcu_dereference(mm->ioctx_table);
> - for (i = 0; i < table->nr; i++) {
> - struct kioctx *ctx;
> -
> - ctx = rcu_dereference(table->table[i]);
> - if (ctx && ctx->aio_ring_file == file) {
> - if (!atomic_read(&ctx->dead)) {
> - ctx->user_id = ctx->mmap_base = vma->vm_start;
> - res = 0;
> - }
> - break;
> + xa_lock(&mm->ioctx);
> + xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) {
> + if (ctx->aio_ring_file != file)
> + continue;
> + if (!atomic_read(&ctx->dead)) {
> + ctx->user_id = ctx->mmap_base = vma->vm_start;
> + res = 0;
> }
> + break;
> }
> + xa_unlock(&mm->ioctx);
>
> - rcu_read_unlock();
> - spin_unlock(&mm->ioctx_lock);
> return res;
> }
>
> @@ -617,57 +606,21 @@ static void free_ioctx_users(struct percpu_ref *ref)
>
> static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
> {
> - unsigned i, new_nr;
> - struct kioctx_table *table, *old;
> struct aio_ring *ring;
> + int err;
>
> - spin_lock(&mm->ioctx_lock);
> - table = rcu_dereference_raw(mm->ioctx_table);
> -
> - while (1) {
> - if (table)
> - for (i = 0; i < table->nr; i++)
> - if (!rcu_access_pointer(table->table[i])) {
> - ctx->id = i;
> - rcu_assign_pointer(table->table[i], ctx);
> - spin_unlock(&mm->ioctx_lock);
> -
> - /* While kioctx setup is in progress,
> - * we are protected from page migration
> - * changes ring_pages by ->ring_lock.
> - */
> - ring = kmap_atomic(ctx->ring_pages[0]);
> - ring->id = ctx->id;
> - kunmap_atomic(ring);
> - return 0;
> - }
> -
> - new_nr = (table ? table->nr : 1) * 4;
> - spin_unlock(&mm->ioctx_lock);
> -
> - table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
> - new_nr, GFP_KERNEL);
> - if (!table)
> - return -ENOMEM;
> -
> - table->nr = new_nr;
> -
> - spin_lock(&mm->ioctx_lock);
> - old = rcu_dereference_raw(mm->ioctx_table);
> -
> - if (!old) {
> - rcu_assign_pointer(mm->ioctx_table, table);
> - } else if (table->nr > old->nr) {
> - memcpy(table->table, old->table,
> - old->nr * sizeof(struct kioctx *));
> + err = xa_alloc(&mm->ioctx, &ctx->id, UINT_MAX, ctx, GFP_KERNEL);
> + if (err)
> + return err;
>
> - rcu_assign_pointer(mm->ioctx_table, table);
> - kfree_rcu(old, rcu);
> - } else {
> - kfree(table);
> - table = old;
> - }
> - }
> + /*
> + * While kioctx setup is in progress, we are protected from
> + * page migration changing ring_pages by ->ring_lock.
> + */
> + ring = kmap_atomic(ctx->ring_pages[0]);
> + ring->id = ctx->id;
> + kunmap_atomic(ring);
> + return 0;
> }
>
> static void aio_nr_sub(unsigned nr)
> @@ -793,27 +746,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
> return ERR_PTR(err);
> }
>
> -/* kill_ioctx
> - * Cancels all outstanding aio requests on an aio context. Used
> - * when the processes owning a context have all exited to encourage
> - * the rapid destruction of the kioctx.
> - */
> -static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
> - struct ctx_rq_wait *wait)
> +static void __kill_ioctx(struct kioctx *ctx, struct ctx_rq_wait *wait)
> {
> - struct kioctx_table *table;
> -
> - spin_lock(&mm->ioctx_lock);
> - if (atomic_xchg(&ctx->dead, 1)) {
> - spin_unlock(&mm->ioctx_lock);
> - return -EINVAL;
> - }
> -
> - table = rcu_dereference_raw(mm->ioctx_table);
> - WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
> - RCU_INIT_POINTER(table->table[ctx->id], NULL);
> - spin_unlock(&mm->ioctx_lock);
> -
> /* free_ioctx_reqs() will do the necessary RCU synchronization */
> wake_up_all(&ctx->wait);
>
> @@ -831,6 +765,30 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
>
> ctx->rq_wait = wait;
> percpu_ref_kill(&ctx->users);
> +}
> +
> +/* kill_ioctx
> + * Cancels all outstanding aio requests on an aio context. Used
> + * when the processes owning a context have all exited to encourage
> + * the rapid destruction of the kioctx.
> + */
> +static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
> + struct ctx_rq_wait *wait)
> +{
> + struct kioctx *old;
> +
> + /* I don't understand what this lock is protecting against */
> + xa_lock(&mm->ioctx);
> + if (atomic_xchg(&ctx->dead, 1)) {
> + xa_unlock(&mm->ioctx);
> + return -EINVAL;
> + }
> +
> + old = __xa_erase(&mm->ioctx, ctx->id);
> + WARN_ON(old != ctx);
> + xa_unlock(&mm->ioctx);
> +
> + __kill_ioctx(ctx, wait);
> return 0;
> }
>
> @@ -844,26 +802,21 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
> */
> void exit_aio(struct mm_struct *mm)
> {
> - struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
> + struct kioctx *ctx;
> struct ctx_rq_wait wait;
> - int i, skipped;
> + unsigned long index;
>
> - if (!table)
> + if (xa_empty(&mm->ioctx))
> return;
>
> - atomic_set(&wait.count, table->nr);
> + /*
> + * Prevent count from getting to 0 until we're ready for it
> + */
> + atomic_set(&wait.count, 1);
> init_completion(&wait.comp);
>
> - skipped = 0;
> - for (i = 0; i < table->nr; ++i) {
> - struct kioctx *ctx =
> - rcu_dereference_protected(table->table[i], true);
> -
> - if (!ctx) {
> - skipped++;
> - continue;
> - }
> -
> + xa_lock(&mm->ioctx);
> + xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) {
> /*
> * We don't need to bother with munmap() here - exit_mmap(mm)
> * is coming and it'll unmap everything. And we simply can't,
> @@ -872,16 +825,16 @@ void exit_aio(struct mm_struct *mm)
> * that it needs to unmap the area, just set it to 0.
> */
> ctx->mmap_size = 0;
> - kill_ioctx(mm, ctx, &wait);
> + atomic_inc(&wait.count);
> + __xa_erase(&mm->ioctx, ctx->id);
> + __kill_ioctx(ctx, &wait);
> }
> + xa_unlock(&mm->ioctx);
>
> - if (!atomic_sub_and_test(skipped, &wait.count)) {
> + if (!atomic_sub_and_test(1, &wait.count)) {
> /* Wait until all IO for the context are done. */
> wait_for_completion(&wait.comp);
> }
> -
> - RCU_INIT_POINTER(mm->ioctx_table, NULL);
> - kfree(table);
> }
>
> static void put_reqs_available(struct kioctx *ctx, unsigned nr)
> @@ -1026,24 +979,17 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
> struct aio_ring __user *ring = (void __user *)ctx_id;
> struct mm_struct *mm = current->mm;
> struct kioctx *ctx, *ret = NULL;
> - struct kioctx_table *table;
> unsigned id;
>
> if (get_user(id, &ring->id))
> return NULL;
>
> rcu_read_lock();
> - table = rcu_dereference(mm->ioctx_table);
> -
> - if (!table || id >= table->nr)
> - goto out;
> -
> - ctx = rcu_dereference(table->table[id]);
> + ctx = xa_load(&mm->ioctx, id);
> if (ctx && ctx->user_id == ctx_id) {
> if (percpu_ref_tryget_live(&ctx->users))
> ret = ctx;
> }
> -out:
> rcu_read_unlock();
> return ret;
> }
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 5ed8f6292a53..1a95bb27f5a7 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -14,6 +14,7 @@
> #include <linux/uprobes.h>
> #include <linux/page-flags-layout.h>
> #include <linux/workqueue.h>
> +#include <linux/xarray.h>
>
> #include <asm/mmu.h>
>
> @@ -336,7 +337,6 @@ struct core_state {
> struct completion startup;
> };
>
> -struct kioctx_table;
> struct mm_struct {
> struct {
> struct vm_area_struct *mmap; /* list of VMAs */
> @@ -431,8 +431,7 @@ struct mm_struct {
> atomic_t membarrier_state;
> #endif
> #ifdef CONFIG_AIO
> - spinlock_t ioctx_lock;
> - struct kioctx_table __rcu *ioctx_table;
> + struct xarray ioctx;
> #endif
> #ifdef CONFIG_MEMCG
> /*
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 07cddff89c7b..acb775f9592d 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -946,8 +946,7 @@ __setup("coredump_filter=", coredump_filter_setup);
> static void mm_init_aio(struct mm_struct *mm)
> {
> #ifdef CONFIG_AIO
> - spin_lock_init(&mm->ioctx_lock);
> - mm->ioctx_table = NULL;
> + xa_init_flags(&mm->ioctx, XA_FLAGS_ALLOC);
> #endif
> }
>
> diff --git a/mm/debug.c b/mm/debug.c
> index cdacba12e09a..d45ec63aed8c 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
> @@ -127,9 +127,6 @@ void dump_mm(const struct mm_struct *mm)
> "start_brk %lx brk %lx start_stack %lx\n"
> "arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
> "binfmt %px flags %lx core_state %px\n"
> -#ifdef CONFIG_AIO
> - "ioctx_table %px\n"
> -#endif
> #ifdef CONFIG_MEMCG
> "owner %px "
> #endif
> @@ -158,9 +155,6 @@ void dump_mm(const struct mm_struct *mm)
> mm->start_brk, mm->brk, mm->start_stack,
> mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
> mm->binfmt, mm->flags, mm->core_state,
> -#ifdef CONFIG_AIO
> - mm->ioctx_table,
> -#endif
> #ifdef CONFIG_MEMCG
> mm->owner,
> #endif
> --
> 2.19.1
>