[PATCH v2 10/13] fork: Store task pointer in unpopulated stack ptes

From: David Stevens

Date: Fri Apr 24 2026 - 15:22:38 EST


Store the task pointer in the ptes of the unpopulated pages of dynamic
stacks. This allows the owning task, and through it the stack's
vm_struct pointer, to be retrieved from a stack address without relying
on any locks or on current.

This relies on being able to pack the struct task_struct pointer into a
pte. Since the struct is 64-byte aligned, the low 6 bits of the pointer
are always zero, which gives 6 bits of leeway and should be viable on
most architectures. Any architecture which enables dynamic thread
stacks must provide make_data_kpte() and unpack_data_kpte(), which
pack/unpack a right-shifted pointer value into/from a pte.

Signed-off-by: David Stevens <stevensd@xxxxxxxxxx>
---
include/linux/sched/task_stack.h | 1 +
kernel/fork.c | 74 +++++++++++++++++++++++++++++---
mm/vmalloc.c | 2 +-
3 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index 7dcff2836d7e..7cf00ce97f7c 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -105,6 +105,7 @@ void exit_task_stack_account(struct task_struct *tsk);
void dynamic_stack_refill_pages(void);
unsigned long dynamic_stack_accounting(struct task_struct *tsk, bool finalize);
bool dynamic_stack_fault(struct task_struct *tsk, unsigned long address, bool *on_stack);
+struct task_struct *task_from_stack_address(unsigned long address);

/*
* Refill and charge for the used pages.
diff --git a/kernel/fork.c b/kernel/fork.c
index 9ac9d23f5f4b..733fc1f58b8b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -296,16 +296,40 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)

static DEFINE_PER_CPU(struct page *, dynamic_stack_pages[DYNSTK_PAGE_POOL_NR]);

+#define TASK_PTR_SHIFT (ilog2(__alignof__(struct task_struct)))
+
static void link_vmap_stack_to_task(struct task_struct *tsk, struct vm_struct *vm_area)
{
+ int i;
+ unsigned long addr;
+ pte_t *ptep, pte;
+
+ pte = make_data_kpte(((unsigned long)tsk) >> TASK_PTR_SHIFT);
+
tsk->stack_vm_area = vm_area;
tsk->packed_stack = (unsigned long)kasan_reset_tag(vm_area->addr);
+
+ addr = (unsigned long)vm_area->addr;
+ ptep = virt_to_kpte(addr);
+ for (i = vm_area->nr_pages; i < THREAD_SIZE >> PAGE_SHIFT;
+ i++, addr += PAGE_SIZE, ptep++)
+ set_pte_at(&init_mm, addr, ptep, pte);
}

-static void free_vmap_stack(struct vm_struct *vm_area)
+static void free_vmap_stack(struct vm_struct *vm_area, bool was_mapped)
{
int i;

+ /* Clear data kptes since vunmap expects present or none. */
+ if (was_mapped) {
+ unsigned long addr = (unsigned long)vm_area->addr;
+ pte_t *ptep = virt_to_kpte(addr);
+ unsigned int nr_to_clear = (THREAD_SIZE >> PAGE_SHIFT) - vm_area->nr_pages;
+
+ if (nr_to_clear)
+ clear_ptes(&init_mm, addr, ptep, nr_to_clear);
+ }
+
remove_vm_area(vm_area->addr);

for (i = 0; i < vm_area->nr_pages; i++)
@@ -354,7 +378,7 @@ static struct vm_struct *alloc_vmap_stack(int node)

return vm_area;
cleanup_err:
- free_vmap_stack(vm_area);
+ free_vmap_stack(vm_area, false);
return NULL;
}

@@ -477,6 +501,42 @@ unsigned long dynamic_stack_accounting(struct task_struct *tsk, bool finalize)
return i;
}

+noinstr struct task_struct *task_from_stack_address(unsigned long address)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ BUILD_BUG_ON((BITS_PER_LONG - TASK_PTR_SHIFT) > KPTE_AVAILABLE_DATA_BITS);
+
+ if (!is_vmalloc_addr((void *)address))
+ return NULL;
+
+ pgd = pgd_offset_k(address);
+ if (pgd_none(*pgd) || pgd_leaf(*pgd))
+ return NULL;
+
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d) || p4d_leaf(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, address);
+ if (pud_none(*pud) || pud_leaf(*pud))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd) || pmd_leaf(*pmd))
+ return NULL;
+
+ pte = pte_offset_kernel(pmd, address);
+ if (pte_present(*pte) || pte_none(*pte))
+ return NULL;
+
+ return (struct task_struct *)(unpack_data_kpte(*pte) << TASK_PTR_SHIFT);
+}
+
bool noinstr dynamic_stack_fault(struct task_struct *tsk, unsigned long address, bool *on_stack)
{
unsigned long stack, hole_end, addr;
@@ -570,7 +630,7 @@ static inline struct vm_struct *alloc_vmap_stack(int node)
return stack ? find_vm_area(stack) : NULL;
}

-static inline void free_vmap_stack(struct vm_struct *vm_area)
+static inline void free_vmap_stack(struct vm_struct *vm_area, bool was_mapped)
{
vfree(vm_area->addr);
}
@@ -590,7 +650,7 @@ static void thread_stack_free_work(struct work_struct *work)
if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
return;

- free_vmap_stack(vm_area);
+ free_vmap_stack(vm_area, true);
}

static void thread_stack_delayed_free(struct task_struct *tsk)
@@ -618,7 +678,7 @@ static int free_vm_stack_cache(unsigned int cpu)
if (!vm_area)
continue;

- free_vmap_stack(vm_area);
+ free_vmap_stack(vm_area, true);
cached_vm_stack_areas[i] = NULL;
}

@@ -653,7 +713,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
unsigned long memset_offset = 0;

if (memcg_charge_kernel_stack(vm_area)) {
- free_vmap_stack(vm_area);
+ free_vmap_stack(vm_area, true);
return -ENOMEM;
}

@@ -674,7 +734,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
return -ENOMEM;

if (memcg_charge_kernel_stack(vm_area)) {
- free_vmap_stack(vm_area);
+ free_vmap_stack(vm_area, true);
return -ENOMEM;
}
link_vmap_stack_to_task(tsk, vm_area);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 39b7e118cbce..76955c101180 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -76,7 +76,7 @@ early_param("nohugevmalloc", set_nohugevmalloc);
static const bool vmap_allow_huge = false;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

-bool is_vmalloc_addr(const void *x)
+noinstr bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)kasan_reset_tag(x);

--
2.54.0.rc2.544.gc7ae2d5bb8-goog