[RFC PATCH 8/8] mm: add MADV_RESERVED_THP range policy
From: Qi Zheng
Date: Sat Jun 27 2026 - 03:29:14 EST
From: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
Introduce MADV_RESERVED_THP as a new madvise() policy to enable PMD-sized
reserved THP allocations for anonymous VMAs.
The policy enforces the following rules:
- Limited to private anonymous VMAs without VM_SPECIAL flags, with
PMD-aligned ranges, and only on 64-bit kernels.
- Mutually exclusive with other hugepage VMA flags (handled via
hugepage_madvise()).
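- Charges the reserved THP quota when a range is newly marked
VM_RESERVED_THP, and rolls back the charge if madvise_update_vma() fails.
The quota is uncharged again when MADV_HUGEPAGE or MADV_NOHUGEPAGE clears
the flag.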
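- Rejects MADV_COLD, MADV_PAGEOUT, MADV_FREE and MADV_DONTNEED{,_LOCKED}
requests on VM_RESERVED_THP VMAs when the range is not PMD-aligned, as such
partial operations could split PMD THPs.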
- Rejects MADV_COLLAPSE on VM_RESERVED_THP VMAs, as khugepaged cannot
currently allocate from the reserved capacity.
Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
arch/alpha/include/uapi/asm/mman.h | 2 +
arch/mips/include/uapi/asm/mman.h | 2 +
arch/parisc/include/uapi/asm/mman.h | 2 +
arch/xtensa/include/uapi/asm/mman.h | 2 +
include/uapi/asm-generic/mman-common.h | 2 +
mm/khugepaged.c | 8 +++
mm/madvise.c | 83 +++++++++++++++++++++++++-
7 files changed, 100 insertions(+), 1 deletion(-)
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 1e700468a6858..672a2fc343861 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -78,6 +78,8 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_RESERVED_THP 26 /* Use reserved transparent hugepages */
+
#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
#define MADV_GUARD_REMOVE 103 /* unguard range */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index b700dae28c482..a94bf74dee21c 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -105,6 +105,8 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_RESERVED_THP 26 /* Use reserved transparent hugepages */
+
#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
#define MADV_GUARD_REMOVE 103 /* unguard range */
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index b6a709506987e..fe2fddefb6c5d 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -72,6 +72,8 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_RESERVED_THP 26 /* Use reserved transparent hugepages */
+
#define MADV_HWPOISON 100 /* poison a page for testing */
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 99d4ccee7f6e8..bb603530ba799 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -113,6 +113,8 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_RESERVED_THP 26 /* Use reserved transparent hugepages */
+
#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
#define MADV_GUARD_REMOVE 103 /* unguard range */
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index ef1c27fa3c570..b3d1448935ead 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -79,6 +79,8 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_RESERVED_THP 26 /* Use reserved transparent hugepages */
+
#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
#define MADV_GUARD_REMOVE 103 /* unguard range */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 617bca76db49b..80293e8c1e4e7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -451,6 +451,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
switch (advice) {
case MADV_HUGEPAGE:
*vm_flags &= ~VM_NOHUGEPAGE;
+ *vm_flags &= ~VM_RESERVED_THP;
*vm_flags |= VM_HUGEPAGE;
/*
* If the vma become good for khugepaged to scan,
@@ -461,6 +462,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
break;
case MADV_NOHUGEPAGE:
*vm_flags &= ~VM_HUGEPAGE;
+ *vm_flags &= ~VM_RESERVED_THP;
*vm_flags |= VM_NOHUGEPAGE;
/*
* Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
@@ -468,6 +470,12 @@ int hugepage_madvise(struct vm_area_struct *vma,
* it got registered before VM_NOHUGEPAGE was set.
*/
break;
+ case MADV_RESERVED_THP:
+ *vm_flags &= ~(VM_HUGEPAGE | VM_NOHUGEPAGE);
+ *vm_flags |= VM_RESERVED_THP;
+ break;
+ default:
+ return -EINVAL;
}
return 0;
diff --git a/mm/madvise.c b/mm/madvise.c
index cd9bb077072cc..dd91105db68c7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -13,6 +13,7 @@
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
+#include <linux/huge_mm.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
@@ -1331,6 +1332,104 @@ static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
}
#endif
+/*
+ * Check that a madvise request leaves the PMD-sized units of a reserved-THP
+ * VMA intact.
+ *
+ * VMAs without VM_RESERVED_THP carry no alignment constraint and always pass.
+ * For VM_RESERVED_THP VMAs, both ends of @range must be HPAGE_PMD_SIZE
+ * aligned.
+ */
+static bool reserved_thp_madvise_aligned(struct vm_area_struct *vma,
+					 struct madvise_behavior_range *range)
+{
+	if (!(vma->vm_flags & VM_RESERVED_THP))
+		return true;
+
+	return IS_ALIGNED(range->start, HPAGE_PMD_SIZE) &&
+	       IS_ALIGNED(range->end, HPAGE_PMD_SIZE);
+}
+
+/*
+ * Compute the new VMA flags for MADV_HUGEPAGE, MADV_NOHUGEPAGE and
+ * MADV_RESERVED_THP, and report the reserved THP quota adjustment that goes
+ * with them.
+ *
+ * @madv_behavior: the request; its vma, range and behavior are read.
+ * @new_flags: in/out VMA flags, updated through hugepage_madvise().
+ * @reserved_hpages: out, number of huge pages charged here; only written
+ *	when *@charge_reserved_thp is set.
+ * @charge_reserved_thp: out, set to true when quota was charged here, so the
+ *	caller can roll the charge back if the VMA update fails.
+ * @uncharge_reserved_thp: out, written for MADV_HUGEPAGE/MADV_NOHUGEPAGE;
+ *	true when the request drops VM_RESERVED_THP from the VMA.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported or misaligned request, or
+ * the error returned by hugepage_madvise() or reserved_thp_charge().
+ */
+static int madvise_hugepage_policy(struct madvise_behavior *madv_behavior,
+				   vm_flags_t *new_flags,
+				   unsigned long *reserved_hpages,
+				   bool *charge_reserved_thp,
+				   bool *uncharge_reserved_thp)
+{
+	struct vm_area_struct *vma = madv_behavior->vma;
+	struct madvise_behavior_range *range = &madv_behavior->range;
+	unsigned long hpages;
+	int behavior = madv_behavior->behavior;
+	int error;
+
+	switch (behavior) {
+	case MADV_HUGEPAGE:
+	case MADV_NOHUGEPAGE:
+		/*
+		 * On a reserved-THP VMA both advices clear VM_RESERVED_THP,
+		 * which splits the VMA at the range boundaries and uncharges
+		 * quota for the range. Only allow that on PMD boundaries, as
+		 * for the other operations on reserved-THP VMAs, so that no
+		 * sub-PMD reserved VMA is left behind.
+		 */
+		if (!reserved_thp_madvise_aligned(vma, range))
+			return -EINVAL;
+
+		error = hugepage_madvise(vma, new_flags, behavior);
+		if (error)
+			return error;
+		*uncharge_reserved_thp = (vma->vm_flags & VM_RESERVED_THP) &&
+					 !(*new_flags & VM_RESERVED_THP);
+		return 0;
+	case MADV_RESERVED_THP:
+		if (!IS_ENABLED(CONFIG_64BIT))
+			return -EINVAL;
+		if (!vma_is_anonymous(vma) || (*new_flags & VM_SHARED) ||
+		    (*new_flags & VM_SPECIAL))
+			return -EINVAL;
+		if (!IS_ALIGNED(range->start, HPAGE_PMD_SIZE) ||
+		    !IS_ALIGNED(range->end, HPAGE_PMD_SIZE))
+			return -EINVAL;
+
+		error = hugepage_madvise(vma, new_flags, behavior);
+		if (error)
+			return error;
+
+		/*
+		 * A VMA that already has VM_RESERVED_THP is not charged
+		 * again; only a newly reserved range consumes quota.
+		 */
+		if (!(vma->vm_flags & VM_RESERVED_THP)) {
+			hpages = reserved_thp_hpage_nr(range->start, range->end);
+			error = reserved_thp_charge(hpages);
+			if (error)
+				return error;
+			*reserved_hpages = hpages;
+			*charge_reserved_thp = true;
+		}
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
@@ -1342,6 +1402,9 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
struct vm_area_struct *vma = madv_behavior->vma;
vm_flags_t new_flags = vma->vm_flags;
struct madvise_behavior_range *range = &madv_behavior->range;
+ unsigned long reserved_hpages = 0;
+ bool charge_reserved_thp = false;
+ bool uncharge_reserved_thp = false;
int error;
if (unlikely(!can_madvise_modify(madv_behavior)))
@@ -1353,14 +1416,22 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
case MADV_WILLNEED:
return madvise_willneed(madv_behavior);
case MADV_COLD:
+ if (!reserved_thp_madvise_aligned(vma, range))
+ return -EINVAL;
return madvise_cold(madv_behavior);
case MADV_PAGEOUT:
+ if (!reserved_thp_madvise_aligned(vma, range))
+ return -EINVAL;
return madvise_pageout(madv_behavior);
case MADV_FREE:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
+ if (!reserved_thp_madvise_aligned(vma, range))
+ return -EINVAL;
return madvise_dontneed_free(madv_behavior);
case MADV_COLLAPSE:
+ if (vma->vm_flags & VM_RESERVED_THP)
+ return -EINVAL;
return madvise_collapse(vma, range->start, range->end,
&madv_behavior->lock_dropped);
case MADV_GUARD_INSTALL:
@@ -1416,7 +1487,11 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
- error = hugepage_madvise(vma, &new_flags, behavior);
+ case MADV_RESERVED_THP:
+ error = madvise_hugepage_policy(madv_behavior, &new_flags,
+ &reserved_hpages,
+ &charge_reserved_thp,
+ &uncharge_reserved_thp);
if (error)
goto out;
break;
@@ -1431,6 +1506,11 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK);
error = madvise_update_vma(new_flags, madv_behavior);
+ if (error && charge_reserved_thp)
+ reserved_thp_uncharge(reserved_hpages);
+ else if (!error && uncharge_reserved_thp)
+ reserved_thp_uncharge(reserved_thp_hpage_nr(range->start,
+ range->end));
out:
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1541,6 +1621,7 @@ madvise_behavior_valid(int behavior)
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
case MADV_COLLAPSE:
+ case MADV_RESERVED_THP:
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
--
2.54.0