Re: [PATCH v3 1/2] mm/hugetlb: extract sysfs into hugetlb_sysfs.c

From: David Hildenbrand

Date: Wed Nov 05 2025 - 06:42:15 EST

On 04.11.25 09:37, Hui Zhu wrote:

From: Hui Zhu <zhuhui@xxxxxxxxxx>

Currently, hugetlb.c contains both core management logic and sysfs
interface implementations, making it difficult to maintain. This patch
extracts the sysfs-related code into a dedicated file to improve code
organization.

The following components are moved to mm/hugetlb_sysfs.c:
- hugetlb page demote functions (demote_free_hugetlb_folios,
demote_pool_huge_page)
- sysfs attribute definitions and handlers
- sysfs kobject management functions
- NUMA per-node hstate attribute registration

Several inline helper functions and macros are moved to
mm/hugetlb_internal.h:
- hstate_is_gigantic_no_runtime()
- next_node_allowed()
- get_valid_node_allowed()
- hstate_next_node_to_alloc()
- hstate_next_node_to_free()
- for_each_node_mask_to_alloc/to_free macros

To support code sharing, these functions are changed from static to
exported symbols:
- remove_hugetlb_folio()
- add_hugetlb_folio()
- init_new_hugetlb_folio()
- prep_and_add_allocated_folios()
- __nr_hugepages_store_common()

The Makefile is updated to compile hugetlb_sysfs.o when
CONFIG_HUGETLBFS is enabled. This maintains all existing functionality
while improving maintainability by separating concerns.

MAINTAINERS is updated to add new file hugetlb_sysfs.c.

Signed-off-by: Geliang Tang <geliang@xxxxxxxxxx>
Signed-off-by: Hui Zhu <zhuhui@xxxxxxxxxx>
---

[...]

--- /dev/null
+++ b/mm/hugetlb_internal.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Internal HugeTLB definitions.
+ */
+
+#ifndef _LINUX_HUGETLB_INTERNAL_H
+#define _LINUX_HUGETLB_INTERNAL_H
+
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+
+/*
+ * Check if the hstate represents gigantic pages but gigantic page
+ * runtime support is not available. This is a common condition used to
+ * skip operations that cannot be performed on gigantic pages when runtime
+ * support is disabled.
+ */
+static inline bool hstate_is_gigantic_no_runtime(struct hstate *h)
+{
+ return hstate_is_gigantic(h) && !gigantic_page_runtime_supported();
+}
+
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed. Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static inline int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ nid = next_node_in(nid, *nodes_allowed);
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+
+ return nid;
+}
+
+static inline int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ if (!node_isset(nid, *nodes_allowed))
+ nid = next_node_allowed(nid, nodes_allowed);
+ return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static inline int hstate_next_node_to_alloc(int *next_node,
+ nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(*next_node, nodes_allowed);
+ *next_node = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+/*
+ * helper for remove_pool_hugetlb_folio() - return the previously saved
+ * node ["this node"] from which to free a huge page. Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static inline int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+ h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \
+ nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+ nr_nodes--)
+
+extern void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool adjust_surplus);
+extern void add_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool adjust_surplus);
+extern void init_new_hugetlb_folio(struct folio *folio);
+extern void prep_and_add_allocated_folios(struct hstate *h,
+ struct list_head *folio_list);
+extern ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
+ struct hstate *h, int nid,
+ unsigned long count, size_t len);
+

Using extern for functions is no longer used.

+extern void hugetlb_sysfs_init(void) __init;
+
+#endif /* _LINUX_HUGETLB_INTERNAL_H */
diff --git a/mm/hugetlb_sysfs.c b/mm/hugetlb_sysfs.c
new file mode 100644
index 000000000000..a2567947d32c
--- /dev/null
+++ b/mm/hugetlb_sysfs.c
@@ -0,0 +1,629 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * HugeTLB sysfs interfaces.
+ */
+

As I said, we usually keep the copyright from the original file.

+#include <linux/swap.h>
+#include <linux/page_owner.h>
+#include <linux/page-isolation.h>
+
+#include "hugetlb_vmemmap.h"
+#include "hugetlb_internal.h"
+
+static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
+ struct list_head *src_list)
+{
+ long rc;
+ struct folio *folio, *next;
+ LIST_HEAD(dst_list);
+ LIST_HEAD(ret_list);
+
+ rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list);
+ list_splice_init(&ret_list, src_list);
+
+ /*
+ * Taking target hstate mutex synchronizes with set_max_huge_pages.
+ * Without the mutex, pages added to target hstate could be marked
+ * as surplus.
+ *
+ * Note that we already hold src->resize_lock. To prevent deadlock,
+ * use the convention of always taking larger size hstate mutex first.
+ */
+ mutex_lock(&dst->resize_lock);
+
+ list_for_each_entry_safe(folio, next, src_list, lru) {
+ int i;
+ bool cma;
+
+ if (folio_test_hugetlb_vmemmap_optimized(folio))
+ continue;
+
+ cma = folio_test_hugetlb_cma(folio);
+
+ list_del(&folio->lru);
+
+ split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
+ pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst));
+
+ for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
+ struct page *page = folio_page(folio, i);
+ /* Careful: see __split_huge_page_tail() */
+ struct folio *new_folio = (struct folio *)page;
+
+ clear_compound_head(page);
+ prep_compound_page(page, dst->order);
+
+ new_folio->mapping = NULL;
+ init_new_hugetlb_folio(new_folio);
+ /* Copy the CMA flag so that it is freed correctly */
+ if (cma)
+ folio_set_hugetlb_cma(new_folio);
+ list_add(&new_folio->lru, &dst_list);
+ }
+ }
+
+ prep_and_add_allocated_folios(dst, &dst_list);
+
+ mutex_unlock(&dst->resize_lock);
+
+ return rc;
+}
+
+static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
+ unsigned long nr_to_demote)
+ __must_hold(&hugetlb_lock)
+{
+ int nr_nodes, node;
+ struct hstate *dst;
+ long rc = 0;
+ long nr_demoted = 0;
+
+ lockdep_assert_held(&hugetlb_lock);
+
+ /* We should never get here if no demote order */
+ if (!src->demote_order) {
+ pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
+ return -EINVAL; /* internal error */
+ }
+ dst = size_to_hstate(PAGE_SIZE << src->demote_order);
+
+ for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) {
+ LIST_HEAD(list);
+ struct folio *folio, *next;
+
+ list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) {
+ if (folio_test_hwpoison(folio))
+ continue;
+
+ remove_hugetlb_folio(src, folio, false);
+ list_add(&folio->lru, &list);
+
+ if (++nr_demoted == nr_to_demote)
+ break;
+ }
+
+ spin_unlock_irq(&hugetlb_lock);
+
+ rc = demote_free_hugetlb_folios(src, dst, &list);
+
+ spin_lock_irq(&hugetlb_lock);
+
+ list_for_each_entry_safe(folio, next, &list, lru) {
+ list_del(&folio->lru);
+ add_hugetlb_folio(src, folio, false);
+
+ nr_demoted--;
+ }
+
+ if (rc < 0 || nr_demoted == nr_to_demote)
+ break;
+ }
+
+ /*
+ * Not absolutely necessary, but for consistency update max_huge_pages
+ * based on pool changes for the demoted page.
+ */
+ src->max_huge_pages -= nr_demoted;
+ dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst));
+
+ if (rc < 0)
+ return rc;
+
+ if (nr_demoted)
+ return nr_demoted;
+ /*
+ * Only way to get here is if all pages on free lists are poisoned.
+ * Return -EBUSY so that caller will not retry.
+ */
+ return -EBUSY;
+}

[...]

The core demotion logic should stay in hugetlb.c. In egenral, I think hugetlb_sysfs.c should not contain any actualy hugetlb logic, but primarily only the sysfs interface.

This will also avoid having to export low-level functions like init_new_hugetlb_folio() in the internal header.

Likely, demote_store() should simply call a new function demote_pool_huge_pages() that ...

+
+static ssize_t demote_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ unsigned long nr_demote;
+ unsigned long nr_available;
+ nodemask_t nodes_allowed, *n_mask;
+ struct hstate *h;
+ int err;
+ int nid;
+
+ err = kstrtoul(buf, 10, &nr_demote);
+ if (err)
+ return err;
+ h = kobj_to_hstate(kobj, &nid);
+
+ if (nid != NUMA_NO_NODE) {
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ } else {
+ n_mask = &node_states[N_MEMORY];
+ }
+

... encapsulated the locking + loop below.

+ /* Synchronize with other sysfs operations modifying huge pages */
+ mutex_lock(&h->resize_lock);
+ spin_lock_irq(&hugetlb_lock);
+> + while (nr_demote) {
+ long rc;
+
+ /*
+ * Check for available pages to demote each time thorough the
+ * loop as demote_pool_huge_page will drop hugetlb_lock.
+ */
+ if (nid != NUMA_NO_NODE)
+ nr_available = h->free_huge_pages_node[nid];
+ else
+ nr_available = h->free_huge_pages;
+ nr_available -= h->resv_huge_pages;
+ if (!nr_available)
+ break;
+
+ rc = demote_pool_huge_page(h, n_mask, nr_demote);
+ if (rc < 0) {
+ err = rc;
+ break;
+ }
+
+ nr_demote -= rc;
+ }
+
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
+
+ if (err)
+ return err;
+ return len;
+}
+HSTATE_ATTR_WO(demote);

--
Cheers

David