[PATCH 6/6] hugetlb: support free hugepage pre zero out

From: Liang Li
Date: Tue Jan 05 2021 - 22:52:56 EST


This patch adds support for pre-zeroing free hugepages. The feature can be
used to speed up page population and page fault handling.
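
As a rough usage sketch (the paths assume the sysfs knobs created by
zeropage_init_sysfs() below, i.e. /sys/kernel/mm/hugepages/pre_zero/):

  echo 512 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
  echo 1 > /sys/kernel/mm/hugepages/pre_zero/enabled
  echo 2000 > /sys/kernel/mm/hugepages/pre_zero/delay_millisecs

Once enabled, free hugepages are cleared in the background and marked with
PG_zero, so clear_huge_page() can skip the clearing work at fault time. A
minimal user space test that exercises this path could look like the
hypothetical snippet below (it only relies on MAP_HUGETLB and the 2MB
hugepages reserved above, nothing from this series):

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <sys/mman.h>
  #include <time.h>

  int main(void)
  {
          size_t hpage = 2UL << 20;       /* one 2MB hugepage */
          size_t len = 512 * hpage;       /* matches nr_hugepages above */
          struct timespec t0, t1;

          char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
          if (p == MAP_FAILED) {
                  perror("mmap");
                  return 1;
          }

          clock_gettime(CLOCK_MONOTONIC, &t0);
          for (size_t off = 0; off < len; off += hpage)
                  p[off] = 1;             /* fault in one hugepage per step */
          clock_gettime(CLOCK_MONOTONIC, &t1);

          printf("fault-in: %.3f s\n",
                 (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
          munmap(p, len);
          return 0;
  }

Comparing the fault-in time with pre_zero/enabled set to 0 and 1 gives an
idea of how much clearing work is moved out of the fault path.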

Cc: Alexander Duyck <alexander.h.duyck@xxxxxxxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxx>
Cc: David Hildenbrand <david@xxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Alex Williamson <alex.williamson@xxxxxxxxxx>
Cc: Michael S. Tsirkin <mst@xxxxxxxxxx>
Cc: Jason Wang <jasowang@xxxxxxxxxx>
Cc: Liang Li <liliang324@xxxxxxxxx>
Signed-off-by: Liang Li <liliangleo@xxxxxxxxxxxxxx>
---
include/linux/page-flags.h | 12 ++
mm/Kconfig | 10 ++
mm/huge_memory.c | 3 +-
mm/hugetlb.c | 243 +++++++++++++++++++++++++++++++++++++
mm/memory.c | 4 +
5 files changed, 271 insertions(+), 1 deletion(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ec5d0290e0ee..f177c5e85632 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -173,6 +173,9 @@ enum pageflags {

/* Only valid for buddy pages. Used to track pages that are reported */
PG_reported = PG_uptodate,
+
+ /* Only valid for hugetlb pages. Used to mark pre-zeroed pages */
+ PG_zero = PG_slab,
};

#ifndef __GENERATING_BOUNDS_H
@@ -451,6 +454,15 @@ PAGEFLAG(Idle, idle, PF_ANY)
*/
__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)

+/*
+ * PageZero() is used to track pre-zeroed free hugetlb pages on the free
+ * lists of hugetlbfs. The non-atomic versions of the test and set
+ * operations can be used because setting and clearing of the bit are
+ * both serialized by the hugetlb lock, which prevents any races.
+ */
+__PAGEFLAG(Zero, zero, PF_ONLY_HEAD)
+
+
/*
* On an anonymous page mapped into a user virtual memory area,
* page->mapping points to its anon_vma, not to a struct address_space;
diff --git a/mm/Kconfig b/mm/Kconfig
index 630cde982186..1d91e182825d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -254,6 +254,16 @@ config PAGE_REPORTING
those pages to another entity, such as a hypervisor, so that the
memory can be freed within the host for other uses.

+#
+# support for pre-zeroing hugetlbfs free pages
+config PREZERO_HPAGE
+ bool "Pre zero out hugetlbfs free pages"
+ default n
+ depends on PAGE_REPORTING
+ help
+ Allows free hugetlbfs pages on the free lists to be zeroed out in the
+ background, based on the free page reporting infrastructure.
+
#
# support for page migration
#
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9237976abe72..4ff99724d669 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2407,7 +2407,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
#ifdef CONFIG_64BIT
(1L << PG_arch_2) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ (1L << PG_zero)));

/* ->mapping in first tail page is compound_mapcount */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0fccd5f96954..2029668a0864 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1029,6 +1029,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
+ __ClearPageZero(page);
if (hugepage_reported(page))
__ClearPageReported(page);
hugepage_reporting_notify_free(h->order);
@@ -1315,6 +1316,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
set_page_refcounted(page);
+ __ClearPageZero(page);
if (hstate_is_gigantic(h)) {
/*
* Temporarily drop the hugetlb_lock, because
@@ -2963,6 +2965,237 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
return retval;
}

+#ifdef CONFIG_PREZERO_HPAGE
+
+#define PRE_ZERO_STOP 0
+#define PRE_ZERO_RUN 1
+
+static unsigned int mini_page_order;
+static unsigned long batch_size = 16 * 1024 * 1024;
+static unsigned long delay_millisecs = 1000;
+static unsigned long prezero_enable __read_mostly;
+static DEFINE_MUTEX(kprezerod_mutex);
+static struct page_reporting_dev_info pre_zero_hpage_dev_info;
+
+static int zero_out_pages(struct page_reporting_dev_info *pr_dev_info,
+ struct scatterlist *sgl, unsigned int nents)
+{
+ struct scatterlist *sg = sgl;
+
+ might_sleep();
+ do {
+ struct page *page = sg_page(sg);
+ unsigned int i, order = get_order(sg->length);
+
+ VM_BUG_ON(PageBuddy(page) || buddy_order(page));
+
+ if (PageZero(page))
+ continue;
+ for (i = 0; i < (1 << order); i++) {
+ cond_resched();
+ clear_highpage(page + i);
+ }
+ __SetPageZero(page);
+ } while ((sg = sg_next(sg)));
+
+ return 0;
+}
+
+static int start_kprezerod(void)
+{
+ int err = 0;
+
+ if (prezero_enable == PRE_ZERO_RUN) {
+ pre_zero_hpage_dev_info.report = zero_out_pages;
+ pre_zero_hpage_dev_info.mini_order = mini_page_order;
+ pre_zero_hpage_dev_info.batch_size = batch_size;
+ pre_zero_hpage_dev_info.delay_jiffies = msecs_to_jiffies(delay_millisecs);
+
+ err = hugepage_reporting_register(&pre_zero_hpage_dev_info);
+ pr_info("Pre zero hugepage enabled\n");
+ } else {
+ hugepage_reporting_unregister(&pre_zero_hpage_dev_info);
+ pr_info("Pre zero hugepage disabled\n");
+ }
+
+ return err;
+}
+
+static int restart_kprezerod(void)
+{
+ int err = 0;
+
+ mutex_lock(&kprezerod_mutex);
+ if (prezero_enable == PRE_ZERO_RUN) {
+ hugepage_reporting_unregister(&pre_zero_hpage_dev_info);
+
+ pre_zero_hpage_dev_info.report = zero_out_pages;
+ pre_zero_hpage_dev_info.mini_order = mini_page_order;
+ pre_zero_hpage_dev_info.batch_size = batch_size;
+ pre_zero_hpage_dev_info.delay_jiffies = msecs_to_jiffies(delay_millisecs);
+
+ err = hugepage_reporting_register(&pre_zero_hpage_dev_info);
+ }
+ mutex_unlock(&kprezerod_mutex);
+
+ return err;
+}
+
+static ssize_t enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", prezero_enable);
+}
+
+static ssize_t enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret = 0;
+ unsigned long flags;
+ int err;
+
+ err = kstrtoul(buf, 10, &flags);
+ if (err || flags > UINT_MAX)
+ return -EINVAL;
+ if (flags > PRE_ZERO_RUN)
+ return -EINVAL;
+
+ mutex_lock(&kprezerod_mutex);
+ if (prezero_enable != flags) {
+ prezero_enable = flags;
+ ret = start_kprezerod();
+ }
+ mutex_unlock(&kprezerod_mutex);
+
+ return count;
+}
+
+static struct kobj_attribute enabled_attr =
+ __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+
+static ssize_t batch_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", batch_size);
+}
+
+static ssize_t batch_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long size;
+ int err;
+
+ err = kstrtoul(buf, 10, &size);
+ if (err || size >= UINT_MAX)
+ return -EINVAL;
+
+ batch_size = size;
+
+ restart_kprezerod();
+ return count;
+}
+
+static struct kobj_attribute batch_size_attr =
+ __ATTR(batch_size, 0644, batch_size_show, batch_size_store);
+
+static ssize_t delay_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", delay_millisecs);
+}
+
+static ssize_t delay_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs);
+ if (err || msecs >= UINT_MAX)
+ return -EINVAL;
+
+ delay_millisecs = msecs;
+
+ restart_kprezerod();
+
+ return count;
+}
+
+static struct kobj_attribute wake_delay_millisecs_attr =
+ __ATTR(delay_millisecs, 0644, delay_millisecs_show,
+ delay_millisecs_store);
+
+static ssize_t mini_order_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", mini_page_order);
+}
+
+static ssize_t mini_order_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int order;
+ int err;
+
+ err = kstrtouint(buf, 10, &order);
+ if (err || order >= MAX_ORDER)
+ return -EINVAL;
+
+ if (mini_page_order != order) {
+ mutex_lock(&kprezerod_mutex);
+ mini_page_order = order;
+ mutex_unlock(&kprezerod_mutex);
+ }
+
+ restart_kprezerod();
+ return count;
+}
+
+static struct kobj_attribute mini_order_attr =
+ __ATTR(mini_order, 0644, mini_order_show, mini_order_store);
+
+static struct attribute *pre_zero_attr[] = {
+ &enabled_attr.attr,
+ &mini_order_attr.attr,
+ &wake_delay_millisecs_attr.attr,
+ &batch_size_attr.attr,
+ NULL,
+};
+
+static struct attribute_group pre_zero_attr_group = {
+ .attrs = pre_zero_attr,
+};
+
+static int __init zeropage_init_sysfs(struct kobject *parent_kobj)
+{
+ int err;
+ struct kobject *pre_zero_kobj;
+
+ pre_zero_kobj = kobject_create_and_add("pre_zero", parent_kobj);
+ if (unlikely(!pre_zero_kobj)) {
+ pr_err("pre_zero: failed to create pre_zero kobject\n");
+ return -ENOMEM;
+ }
+
+ err = sysfs_create_group(pre_zero_kobj, &pre_zero_attr_group);
+ if (err) {
+ pr_err("pre_zero: failed to register pre_zero group\n");
+ goto delete_obj;
+ }
+
+ return 0;
+
+delete_obj:
+ kobject_put(pre_zero_kobj);
+ return err;
+}
+#endif
+
static void __init hugetlb_sysfs_init(void)
{
struct hstate *h;
@@ -2978,6 +3211,16 @@ static void __init hugetlb_sysfs_init(void)
if (err)
pr_err("HugeTLB: Unable to add hstate %s", h->name);
}
+
+ if (err)
+ return;
+#ifdef CONFIG_PREZERO_HPAGE
+ err = zeropage_init_sysfs(hugepages_kobj);
+ if (err)
+ return;
+
+ start_kprezerod();
+#endif
}

#ifdef CONFIG_NUMA
diff --git a/mm/memory.c b/mm/memory.c
index 7d608765932b..e98eed1a59a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5100,6 +5100,10 @@ void clear_huge_page(struct page *page,
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

+ if (PageZero(page)) {
+ __ClearPageZero(page);
+ return;
+ }
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, pages_per_huge_page);
return;
--
2.18.2