[PATCH v2 1/3] mm/vmalloc: honor GFP constraints in pcpu_get_vm_areas()
From: Kaitao Cheng
Date: Thu Jun 04 2026 - 07:39:23 EST
From: Kaitao Cheng <chengkaitao@xxxxxxxxxx>
pcpu_alloc_noprof() derives pcpu_gfp from the caller-supplied GFP mask
and passes it down to the backing percpu allocator. However, when the
percpu vmalloc allocator has to create a new chunk, pcpu_create_chunk()
calls pcpu_get_vm_areas() to allocate the corresponding vmalloc areas.
pcpu_get_vm_areas() currently performs its internal allocations with
GFP_KERNEL, including vmap area metadata, vm_struct metadata and KASAN
vmalloc shadow population. This means that a caller which deliberately
uses GFP_NOFS or GFP_NOIO can still enter FS or IO reclaim while creating
the vmalloc areas for a new percpu chunk.
One possible case is blk-cgroup after commit 5d726c4dbeed
("blk-cgroup: fix possible deadlock while configuring policy").
blkg_conf_prep() now serializes against blkcg_deactivate_policy() with
q->blkcg_mutex, and blkg_alloc() was changed to GFP_NOIO for that reason:
  CPU0: blkg_conf_prep()
          mutex_lock(q->blkcg_mutex)
          blkg_alloc(..., GFP_NOIO)
            alloc_percpu_gfp(..., GFP_NOIO)
              pcpu_alloc_noprof(..., GFP_NOIO)
                pcpu_create_chunk(GFP_NOIO)
                  pcpu_get_vm_areas()
                    -> if percpu chunks are exhausted, chunk create may do
                       internal GFP_KERNEL allocations
                    -> direct reclaim / writeback can issue IO to this queue
                    -> IO waits because the queue is frozen
  CPU1: blkcg_deactivate_policy()
          blk_mq_freeze_queue(q)
          mutex_lock(q->blkcg_mutex)
            -> waits for CPU0
          ... unfreeze only happens after q->blkcg_mutex is acquired/released
So the concern is that the caller deliberately uses GFP_NOIO because it
may hold a lock that another task tries to acquire after freezing the
queue, but the percpu slow path can temporarily drop that GFP constraint
while creating a new chunk.
Pass the caller-supplied GFP mask from pcpu_create_chunk() to
pcpu_get_vm_areas(), and use it for the internal vmalloc metadata and
KASAN shadow allocations.
Fixes: 9a5b183941b5 ("mm, percpu: do not consider sleepable allocations atomic")
Signed-off-by: Kaitao Cheng <chengkaitao@xxxxxxxxxx>
---
include/linux/vmalloc.h | 4 ++--
mm/percpu-vm.c | 2 +-
mm/vmalloc.c | 23 ++++++++++++-----------
3 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3b02c0c6b371..9601e06624c8 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -308,14 +308,14 @@ static inline void set_vm_flush_reset_perms(void *addr) {}
#if defined(CONFIG_MMU) && defined(CONFIG_SMP)
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
- size_t align);
+ size_t align, gfp_t gfp);
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
# else
static inline struct vm_struct **
pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
- size_t align)
+ size_t align, gfp_t gfp)
{
return NULL;
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 4f5937090590..69b00741dc68 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -340,7 +340,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
return NULL;
vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
- pcpu_nr_groups, pcpu_atom_size);
+ pcpu_nr_groups, pcpu_atom_size, gfp);
if (!vms) {
pcpu_free_chunk(chunk);
return NULL;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1afca3568b9b..08f468135e4d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4946,16 +4946,17 @@ pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
* @sizes: array containing size of each area
* @nr_vms: the number of areas to allocate
* @align: alignment, all entries in @offsets and @sizes must be aligned to this
+ * @gfp: allocation flags passed to the underlying memory allocator
*
* Returns: kmalloc'd vm_struct pointer array pointing to allocated
* vm_structs on success, %NULL on failure
*
* Percpu allocator wants to use congruent vm areas so that it can
* maintain the offsets among percpu areas. This function allocates
- * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
- * be scattered pretty far, distance between two areas easily going up
- * to gigabytes. To avoid interacting with regular vmallocs, these
- * areas are allocated from top.
+ * congruent vmalloc areas for it. These areas tend to be scattered
+ * pretty far, distance between two areas easily going up to gigabytes.
+ * To avoid interacting with regular vmallocs, these areas are allocated
+ * from top.
*
* Despite its complicated look, this allocator is rather simple. It
* does everything top-down and scans free blocks from the end looking
@@ -4966,7 +4967,7 @@ pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
- size_t align)
+ size_t align, gfp_t gfp)
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -5004,14 +5005,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
return NULL;
}
- vms = kzalloc_objs(vms[0], nr_vms);
- vas = kzalloc_objs(vas[0], nr_vms);
+ vms = kzalloc_objs(vms[0], nr_vms, gfp);
+ vas = kzalloc_objs(vas[0], nr_vms, gfp);
if (!vas || !vms)
goto err_free2;
for (area = 0; area < nr_vms; area++) {
- vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
- vms[area] = kzalloc_obj(struct vm_struct);
+ vas[area] = kmem_cache_zalloc(vmap_area_cachep, gfp);
+ vms[area] = kzalloc_obj(struct vm_struct, gfp);
if (!vas[area] || !vms[area])
goto err_free;
}
@@ -5101,7 +5102,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
/* populate the kasan shadow space */
for (area = 0; area < nr_vms; area++) {
- if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL))
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], gfp))
goto err_free_shadow;
}
@@ -5158,7 +5159,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
continue;
vas[area] = kmem_cache_zalloc(
- vmap_area_cachep, GFP_KERNEL);
+ vmap_area_cachep, gfp);
if (!vas[area])
goto err_free;
}
--
2.43.0