Re: [RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support

From: Dev Jain
Date: Sun Jan 12 2025 - 06:23:54 EST




On 11/01/25 3:18 am, Nico Pache wrote:
On Fri, Jan 10, 2025 at 2:06 AM Dev Jain <dev.jain@xxxxxxx> wrote:



On 09/01/25 5:01 am, Nico Pache wrote:
khugepaged scans PMD ranges for potential collapse to a hugepage. To add
mTHP support we use this scan to instead record chunks of fully utilized
sections of the PMD.

create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
by default we will set this to order 3. The reasoning is that for 4K 512
PMD size this results in a 64 bit bitmap which has some optimizations.
For other arches like ARM64 64K, we can set a larger order if needed.

khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
that represents chunks of fully utilized regions. We can then determine
what mTHP size fits best and in the following patch, we set this bitmap
while scanning the PMD.

max_ptes_none is used as a scale to determine how "full" an order must
be before being considered for collapse.

Signed-off-by: Nico Pache <npache@xxxxxxxxxx>
---
include/linux/khugepaged.h | 4 +-
mm/khugepaged.c | 129 +++++++++++++++++++++++++++++++++++--
2 files changed, 126 insertions(+), 7 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 1f46046080f5..31cff8aeec4a 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -1,7 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KHUGEPAGED_H
#define _LINUX_KHUGEPAGED_H
-

Nit: I don't think this line needs to be deleted.

+#define MIN_MTHP_ORDER 3
+#define MIN_MTHP_NR (1<<MIN_MTHP_ORDER)

Nit: Insert a space: (1 << MIN_MTHP_ORDER)

+#define MTHP_BITMAP_SIZE (1<<(HPAGE_PMD_ORDER - MIN_MTHP_ORDER))
extern unsigned int khugepaged_max_ptes_none __read_mostly;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct attribute_group khugepaged_attr_group;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 9eb161b04ee4..de1dc6ea3c71 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct kmem_cache *mm_slot_cache __ro_after_init;

+struct scan_bit_state {
+ u8 order;
+ u8 offset;
+};
+
struct collapse_control {
bool is_khugepaged;

@@ -102,6 +107,15 @@ struct collapse_control {

/* nodemask for allocation fallback */
nodemask_t alloc_nmask;
+
+ /* bitmap used to collapse mTHP sizes. 1bit = order MIN_MTHP_ORDER mTHP */
+ unsigned long *mthp_bitmap;
+ unsigned long *mthp_bitmap_temp;
+ struct scan_bit_state *mthp_bitmap_stack;
+};
+
+struct collapse_control khugepaged_collapse_control = {
+ .is_khugepaged = true,
};

/**
@@ -389,6 +403,25 @@ int __init khugepaged_init(void)
if (!mm_slot_cache)
return -ENOMEM;

+ /*
+ * allocate the bitmaps dynamically since MTHP_BITMAP_SIZE is not known at
+ * compile time for some architectures.
+ */
+ khugepaged_collapse_control.mthp_bitmap = kmalloc_array(
+ BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
+ if (!khugepaged_collapse_control.mthp_bitmap)
+ return -ENOMEM;
+
+ khugepaged_collapse_control.mthp_bitmap_temp = kmalloc_array(
+ BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
+ if (!khugepaged_collapse_control.mthp_bitmap_temp)
+ return -ENOMEM;
+
+ khugepaged_collapse_control.mthp_bitmap_stack = kmalloc_array(
+ MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
+ if (!khugepaged_collapse_control.mthp_bitmap_stack)
+ return -ENOMEM;
+
khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
@@ -400,6 +433,9 @@ int __init khugepaged_init(void)
void __init khugepaged_destroy(void)
{
kmem_cache_destroy(mm_slot_cache);
+ kfree(khugepaged_collapse_control.mthp_bitmap);
+ kfree(khugepaged_collapse_control.mthp_bitmap_temp);
+ kfree(khugepaged_collapse_control.mthp_bitmap_stack);
}

static inline int khugepaged_test_exit(struct mm_struct *mm)
@@ -850,10 +886,6 @@ static void khugepaged_alloc_sleep(void)
remove_wait_queue(&khugepaged_wait, &wait);
}

-struct collapse_control khugepaged_collapse_control = {
- .is_khugepaged = true,
-};
-
static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
{
int i;
@@ -1102,7 +1134,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,

static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
int referenced, int unmapped,
- struct collapse_control *cc)
+ struct collapse_control *cc, bool *mmap_locked,
+ int order, int offset)
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
@@ -1115,6 +1148,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
struct mmu_notifier_range range;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);

+ /* if collapsing mTHPs we may have already released the read_lock, and
+ * need to reaquire it to keep the proper locking order.
+ */
+ if (!*mmap_locked)
+ mmap_read_lock(mm);

There is no need to take the read lock again, because we drop it just
after this.

collapse_huge_page expects the mmap_lock to already be taken, and it
returns with it unlocked. If we are collapsing multiple mTHPs under
the same PMD, then I think we need to reacquire the lock before
calling unlock on it.

I cannot figure out a potential place where we drop the lock before entering collapse_huge_page(). In any case, wouldn't this be better:
if (*mmap_locked)
mmap_read_unlock(mm);

Basically, instead of putting the if condition around the lock, you do it around the unlock?



/*
* Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
@@ -1122,6 +1160,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* that. We will recheck the vma after taking it again in write mode.
*/
mmap_read_unlock(mm);
+ *mmap_locked = false;

result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
@@ -1256,12 +1295,71 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
out_up_write:
mmap_write_unlock(mm);
out_nolock:
+ *mmap_locked = false;
if (folio)
folio_put(folio);
trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
return result;
}

+// Recursive function to consume the bitmap
+static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address,
+ int referenced, int unmapped, struct collapse_control *cc,
+ bool *mmap_locked, unsigned long enabled_orders)
+{
+ u8 order, offset;
+ int num_chunks;
+ int bits_set, max_percent, threshold_bits;
+ int next_order, mid_offset;
+ int top = -1;
+ int collapsed = 0;
+ int ret;
+ struct scan_bit_state state;
+
+ cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
+ { HPAGE_PMD_ORDER - MIN_MTHP_ORDER, 0 };
+
+ while (top >= 0) {
+ state = cc->mthp_bitmap_stack[top--];
+ order = state.order;
+ offset = state.offset;
+ num_chunks = 1 << order;
+ // Skip mTHP orders that are not enabled
+ if (!(enabled_orders >> (order + MIN_MTHP_ORDER)) & 1)
+ goto next;
+
+ // copy the relavant section to a new bitmap
+ bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
+ MTHP_BITMAP_SIZE);
+
+ bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
+
+ // Check if the region is "almost full" based on the threshold
+ max_percent = ((HPAGE_PMD_NR - khugepaged_max_ptes_none - 1) * 100)
+ / (HPAGE_PMD_NR - 1);
+ threshold_bits = (max_percent * num_chunks) / 100;
+
+ if (bits_set >= threshold_bits) {
+ ret = collapse_huge_page(mm, address, referenced, unmapped, cc,
+ mmap_locked, order + MIN_MTHP_ORDER, offset * MIN_MTHP_NR);
+ if (ret == SCAN_SUCCEED)
+ collapsed += (1 << (order + MIN_MTHP_ORDER));
+ continue;
+ }
+
+next:
+ if (order > 0) {
+ next_order = order - 1;
+ mid_offset = offset + (num_chunks / 2);
+ cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
+ { next_order, mid_offset };
+ cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
+ { next_order, offset };
+ }
+ }
+ return collapsed;
+}
+
static int khugepaged_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, bool *mmap_locked,
@@ -1430,7 +1528,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
result = collapse_huge_page(mm, address, referenced,
- unmapped, cc);
+ unmapped, cc, mmap_locked, HPAGE_PMD_ORDER, 0);
/* collapse_huge_page will return with the mmap_lock released */
*mmap_locked = false;
}
@@ -2767,6 +2865,21 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
return -ENOMEM;
cc->is_khugepaged = false;

+ cc->mthp_bitmap = kmalloc_array(
+ BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
+ if (!cc->mthp_bitmap)
+ return -ENOMEM;
+
+ cc->mthp_bitmap_temp = kmalloc_array(
+ BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
+ if (!cc->mthp_bitmap_temp)
+ return -ENOMEM;
+
+ cc->mthp_bitmap_stack = kmalloc_array(
+ MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
+ if (!cc->mthp_bitmap_stack)
+ return -ENOMEM;
+
mmgrab(mm);
lru_add_drain_all();

@@ -2831,8 +2944,12 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
out_nolock:
mmap_assert_locked(mm);
mmdrop(mm);
+ kfree(cc->mthp_bitmap);
+ kfree(cc->mthp_bitmap_temp);
+ kfree(cc->mthp_bitmap_stack);
kfree(cc);

+
return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
: madvise_collapse_errno(last_fail);
}