Re: [PATCH] vmalloc: respect the GFP_NOIO and GFP_NOFS flags

From: Michal Hocko
Date: Fri Jun 30 2017 - 04:13:23 EST


On Thu 29-06-17 22:25:09, Mikulas Patocka wrote:
> The __vmalloc function has a parameter gfp_mask with the allocation flags,
> however it doesn't fully respect the GFP_NOIO and GFP_NOFS flags. The
> pages are allocated with the specified gfp flags, but the pagetables are
> always allocated with GFP_KERNEL. This allocation can cause unexpected
> recursion into the filesystem or I/O subsystem.
>
> It is not practical to extend page table allocation routines with gfp
> flags because it would require modification of architecture-specific code
> in all architecturs. However, the process can temporarily request that all
> allocations are done with GFP_NOFS or GFP_NOIO with with the functions
> memalloc_nofs_save and memalloc_noio_save.
>
> This patch makes the vmalloc code use memalloc_nofs_save or
> memalloc_noio_save if the supplied gfp flags do not contain __GFP_FS or
> __GFP_IO. It fixes some possible deadlocks in drivers/mtd/ubi/io.c,
> fs/gfs2/, fs/btrfs/free-space-tree.c, fs/ubifs/,
> fs/nfs/blocklayout/extent_tree.c where __vmalloc is used with the GFP_NOFS
> flag.

I strongly believe this is a step in the _wrong_ direction. Why? Because
the memalloc_no{io,fs}_save API is for the scope allocation context. We
want users of the scope to define it and document why it is needed.
GFP_NOFS (I haven't checked GFP_NOIO users) is overused a _lot_ mostly
based on the filesystem should rather use it to prevent deadlock cargo
cult. This should change longterm because heavy fs workloads can cause
troubles to the memory reclaim. So we really want to encourage those
users to define nofs scopes (e.g. on journal locked contexts etc.)
rather than have them use the GFP_NOFS explicitly and very often
mindlessly.

I am not going to nack this patch because it not incorrect but I would
really like to discourage you from it because while it saves 24 lines of
code it (ab)uses the scope allocation context at a wrong layer.

> The patch also simplifies code in dm-bufio.c, dm-ioctl.c and fs/xfs/kmem.c
> by removing explicit calls to memalloc_nofs_save and memalloc_noio_save
> before the call to __vmalloc.
>
> Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>
>
> ---
> drivers/md/dm-bufio.c | 24 +-----------------------
> drivers/md/dm-ioctl.c | 6 +-----
> fs/xfs/kmem.c | 14 --------------
> mm/util.c | 6 +++---
> mm/vmalloc.c | 18 +++++++++++++++++-
> 5 files changed, 22 insertions(+), 46 deletions(-)
>
> Index: linux-2.6/mm/vmalloc.c
> ===================================================================
> --- linux-2.6.orig/mm/vmalloc.c
> +++ linux-2.6/mm/vmalloc.c
> @@ -31,6 +31,7 @@
> #include <linux/compiler.h>
> #include <linux/llist.h>
> #include <linux/bitops.h>
> +#include <linux/sched/mm.h>
>
> #include <linux/uaccess.h>
> #include <asm/tlbflush.h>
> @@ -1670,6 +1671,8 @@ static void *__vmalloc_area_node(struct
> unsigned int nr_pages, array_size, i;
> const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
> const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN;
> + unsigned noio_flag;
> + int r;
>
> nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> array_size = (nr_pages * sizeof(struct page *));
> @@ -1712,8 +1715,21 @@ static void *__vmalloc_area_node(struct
> cond_resched();
> }
>
> - if (map_vm_area(area, prot, pages))
> + if (unlikely(!(gfp_mask & __GFP_IO)))
> + noio_flag = memalloc_noio_save();
> + else if (unlikely(!(gfp_mask & __GFP_FS)))
> + noio_flag = memalloc_nofs_save();
> +
> + r = map_vm_area(area, prot, pages);
> +
> + if (unlikely(!(gfp_mask & __GFP_IO)))
> + memalloc_noio_restore(noio_flag);
> + else if (unlikely(!(gfp_mask & __GFP_FS)))
> + memalloc_nofs_restore(noio_flag);
> +
> + if (unlikely(r))
> goto fail;
> +
> return area->addr;
>
> fail:
> Index: linux-2.6/mm/util.c
> ===================================================================
> --- linux-2.6.orig/mm/util.c
> +++ linux-2.6/mm/util.c
> @@ -351,10 +351,10 @@ void *kvmalloc_node(size_t size, gfp_t f
> void *ret;
>
> /*
> - * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
> - * so the given set of flags has to be compatible.
> + * vmalloc uses blocking allocations for some internal allocations
> + * (e.g page tables) so the given set of flags has to be compatible.
> */
> - WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
> + WARN_ON_ONCE(!gfpflags_allow_blocking(flags));
>
> /*
> * We want to attempt a large physically contiguous block first because
> Index: linux-2.6/drivers/md/dm-bufio.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-bufio.c
> +++ linux-2.6/drivers/md/dm-bufio.c
> @@ -386,9 +386,6 @@ static void __cache_size_refresh(void)
> static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
> enum data_mode *data_mode)
> {
> - unsigned noio_flag;
> - void *ptr;
> -
> if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
> *data_mode = DATA_MODE_SLAB;
> return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
> @@ -402,26 +399,7 @@ static void *alloc_buffer_data(struct dm
> }
>
> *data_mode = DATA_MODE_VMALLOC;
> -
> - /*
> - * __vmalloc allocates the data pages and auxiliary structures with
> - * gfp_flags that were specified, but pagetables are always allocated
> - * with GFP_KERNEL, no matter what was specified as gfp_mask.
> - *
> - * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
> - * all allocations done by this process (including pagetables) are done
> - * as if GFP_NOIO was specified.
> - */
> -
> - if (gfp_mask & __GFP_NORETRY)
> - noio_flag = memalloc_noio_save();
> -
> - ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
> -
> - if (gfp_mask & __GFP_NORETRY)
> - memalloc_noio_restore(noio_flag);
> -
> - return ptr;
> + return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
> }
>
> /*
> Index: linux-2.6/drivers/md/dm-ioctl.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-ioctl.c
> +++ linux-2.6/drivers/md/dm-ioctl.c
> @@ -1691,7 +1691,6 @@ static int copy_params(struct dm_ioctl _
> struct dm_ioctl *dmi;
> int secure_data;
> const size_t minimum_data_size = offsetof(struct dm_ioctl, data);
> - unsigned noio_flag;
>
> if (copy_from_user(param_kernel, user, minimum_data_size))
> return -EFAULT;
> @@ -1714,10 +1713,7 @@ static int copy_params(struct dm_ioctl _
> * suspended and the ioctl is needed to resume it.
> * Use kmalloc() rather than vmalloc() when we can.
> */
> - dmi = NULL;
> - noio_flag = memalloc_noio_save();
> - dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL | __GFP_HIGH);
> - memalloc_noio_restore(noio_flag);
> + dmi = kvmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH);
>
> if (!dmi) {
> if (secure_data && clear_user(user, param_kernel->data_size))
> Index: linux-2.6/fs/xfs/kmem.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/kmem.c
> +++ linux-2.6/fs/xfs/kmem.c
> @@ -48,7 +48,6 @@ kmem_alloc(size_t size, xfs_km_flags_t f
> void *
> kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
> {
> - unsigned nofs_flag = 0;
> void *ptr;
> gfp_t lflags;
>
> @@ -56,22 +55,9 @@ kmem_zalloc_large(size_t size, xfs_km_fl
> if (ptr)
> return ptr;
>
> - /*
> - * __vmalloc() will allocate data pages and auxillary structures (e.g.
> - * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
> - * here. Hence we need to tell memory reclaim that we are in such a
> - * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
> - * the filesystem here and potentially deadlocking.
> - */
> - if (flags & KM_NOFS)
> - nofs_flag = memalloc_nofs_save();
> -
> lflags = kmem_flags_convert(flags);
> ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL);
>
> - if (flags & KM_NOFS)
> - memalloc_nofs_restore(nofs_flag);
> -
> return ptr;
> }
>

--
Michal Hocko
SUSE Labs