[PATCH] dynamic allocation of huge contiguous pages

From: Hirokazu Takahashi
Date: Thu Jan 08 2004 - 06:40:29 EST


Hello,

I just implemented a patch which allows us to allocate huge
contiguous pages easily. As we know, it's very hard to allocate
them on demand, since free memory on the system may be fragmented.
My approach is to move the pages that are in the way somewhere else
so that we can create free contiguous space in memory. Iwamoto's
memory hot-removal patch will help to do this.

I believe this page-migration approach will work much better than
randomly swapping pages out for this purpose.
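
Roughly, the strategy can be sketched in user-space C like this (just
a toy model of the idea, not kernel code; frames[] stands in for a
zone's mem_map and migrate() stands in for the real page migration):

#include <stdio.h>
#include <string.h>

#define NFRAMES 32
#define ORDER   3                /* want 1 << ORDER contiguous frames */
#define BLOCK   (1 << ORDER)

static char frames[NFRAMES];     /* 0 = free, 1 = movable data, 2 = pinned */

/* Move a movable frame to any free frame outside the booked block. */
static int migrate(int from, int lo, int hi)
{
        int to;

        for (to = 0; to < NFRAMES; to++) {
                if (to >= lo && to < hi)
                        continue;   /* never move into the block being cleared */
                if (frames[to] == 0) {
                        frames[to] = frames[from];
                        frames[from] = 0;
                        return 0;
                }
        }
        return -1;                  /* no room anywhere else */
}

/*
 * Scan naturally aligned blocks, skip any block that contains a pinned
 * frame, migrate the movable frames out of a candidate block and
 * return the index of its first frame.
 */
static int alloc_contig(void)
{
        int base, i;

        for (base = 0; base + BLOCK <= NFRAMES; base += BLOCK) {
                int movable_only = 1;

                for (i = base; i < base + BLOCK; i++)
                        if (frames[i] == 2)
                                movable_only = 0;
                if (!movable_only)
                        continue;
                for (i = base; i < base + BLOCK; i++)
                        if (frames[i] == 1 && migrate(i, base, base + BLOCK))
                                return -1;
                return base;
        }
        return -1;
}

int main(void)
{
        memset(frames, 0, sizeof(frames));
        frames[1] = frames[5] = frames[9] = 1;   /* scattered movable pages */
        frames[20] = 2;                          /* an unmovable page */
        printf("contiguous block starts at frame %d\n", alloc_contig());
        return 0;
}

In the real patch below, try_to_book_pages() marks the candidate
block, sweepout_pages() migrates the busy pages out of it via
remap_onepage(), and __rmqueue() then hands out the freed block.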

iwamoto> This is an update of the memory hot removal patch.
:
iwamoto> http://people.valinux.co.jp/~iwamoto/mh.html

My patch needs Iwamoto's memory hot-removal patch.
You should apply both of them against linux-2.6.0.

Known problems:
- This patch doesn't work if CONFIG_HUGETLB_PAGE isn't set.
Does anybody have a good idea how to solve this? Without the
PG_compound flag it is difficult to know whether a specified page
is free or part of a larger contiguous allocation.

ToDos:
- It's still hard to allocate HugePages for hugetlbfs on a box
which doesn't have any HighMem zones.
- Make several contiguous-page allocations able to proceed
at the same time.

Thank you,
Hirokazu Takahashi.


--- include/linux/page-flags.h.ORG Thu Jan 8 19:06:48 2032
+++ include/linux/page-flags.h Thu Jan 8 19:08:42 2032
@@ -77,6 +77,7 @@
#define PG_compound 19 /* Part of a compound page */

#define PG_again 20
+#define PG_booked 21


/*
@@ -274,6 +275,10 @@ extern void get_full_page_state(struct p
#define PageAgain(page) test_bit(PG_again, &(page)->flags)
#define SetPageAgain(page) set_bit(PG_again, &(page)->flags)
#define ClearPageAgain(page) clear_bit(PG_again, &(page)->flags)
+
+#define PageBooked(page) test_bit(PG_booked, &(page)->flags)
+#define SetPageBooked(page) set_bit(PG_booked, &(page)->flags)
+#define ClearPageBooked(page) clear_bit(PG_booked, &(page)->flags)

/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
--- include/linux/mmzone.h.ORG Thu Jan 8 19:06:56 2032
+++ include/linux/mmzone.h Thu Jan 8 19:12:07 2032
@@ -154,6 +154,9 @@ struct zone {
char *name;
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
+ unsigned long contig_pages_alloc_hint;
+ unsigned long booked_pages;
+ long scan_pages;
} ____cacheline_maxaligned_in_smp;

#define ZONE_DMA 0
--- mm/page_alloc.c.ORG Thu Jan 8 19:07:27 2032
+++ mm/page_alloc.c Thu Jan 8 19:51:24 2032
@@ -185,7 +185,11 @@ static inline void __free_pages_bulk (st
BUG();
index = page_idx >> (1 + order);

- zone->free_pages -= mask;
+ if (!PageBooked(page))
+ zone->free_pages -= mask;
+ else {
+ zone->booked_pages -= mask;
+ }
while (mask + (1 << (MAX_ORDER-1))) {
struct page *buddy1, *buddy2;

@@ -204,6 +208,9 @@ static inline void __free_pages_bulk (st
buddy2 = base + page_idx;
BUG_ON(bad_range(zone, buddy1));
BUG_ON(bad_range(zone, buddy2));
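+ /* never coalesce a booked buddy with an unbooked one */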
+ if (PageBooked(buddy1) != PageBooked(buddy2)) {
+ break;
+ }
list_del(&buddy1->list);
mask <<= 1;
area++;
@@ -352,13 +359,20 @@ static struct page *__rmqueue(struct zon
unsigned int current_order;
struct page *page;
unsigned int index;
+ struct list_head *p;

for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = zone->free_area + current_order;
if (list_empty(&area->free_list))
continue;
+ list_for_each(p, &area->free_list) {
+ page = list_entry(p, struct page, list);
+ if (!PageBooked(page))
+ goto gotit;
+ }
+ continue;

- page = list_entry(area->free_list.next, struct page, list);
+gotit:
list_del(&page->list);
index = page - zone->zone_mem_map;
if (current_order != MAX_ORDER-1)
@@ -456,6 +470,11 @@ static void free_hot_cold_page(struct pa
struct per_cpu_pages *pcp;
unsigned long flags;

+ if (PageBooked(page)) {
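+ /* keep booked pages off the per-cpu lists and give them straight
+ back to the buddy allocator, where __free_pages_bulk() accounts
+ them as booked */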
+ __free_pages_ok(page, 0);
+ return;
+ }
+
kernel_map_pages(page, 1, 0);
inc_page_state(pgfree);
free_pages_check(__FUNCTION__, page);
@@ -542,6 +561,242 @@ zone_activep(const struct zone *z)
}
#endif

+#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_MEMHOTPLUGTEST)
+/*
+ * Check whether the page is freeable or not.
+ * It might not be freeable even if this function says OK,
+ * e.g. when the page is just being allocated.
+ * This check is almost sufficient but not perfect.
+ */
+static inline int is_page_freeable(struct page *page)
+{
+ return (page->mapping || page_mapped(page) || !page_count(page)) &&
+ !(page->flags & (1<<PG_reserved|1<<PG_compound|1<<PG_booked|1<<PG_slab));
+}
+
+static inline int is_free_page(struct page *page)
+{
+ return !(page_mapped(page) ||
+ page->mapping != NULL ||
+ page_count(page) != 0 ||
+ (page->flags & (
+ 1 << PG_reserved|
+ 1 << PG_compound|
+ 1 << PG_booked |
+ 1 << PG_lru |
+ 1 << PG_private |
+ 1 << PG_locked |
+ 1 << PG_active |
+ 1 << PG_reclaim |
+ 1 << PG_dirty |
+ 1 << PG_slab |
+ 1 << PG_writeback )));
+}
+
+static int
+try_to_book_pages(struct zone *zone, struct page *page, unsigned int order)
+{
+ struct page *p;
+ int booked_count = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ for (p = page; p < &page[1<<order]; p++) {
+ if (!is_page_freeable(p))
+ goto out;
+ if (is_free_page(p))
+ booked_count++;
+ SetPageBooked(p);
+ }
+
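+ /* pages that are already free are accounted as booked from now on;
+ __rmqueue() skips PageBooked pages, so nobody else can grab them */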
+ zone->booked_pages = booked_count;
+ zone->free_pages -= booked_count;
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return 1;
+out:
+ for (p--; p >= page; p--) {
+ ClearPageBooked(p);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return 0;
+}
+
+static struct page *
+book_pages(struct zone *zone, unsigned int gfp_mask, unsigned int order)
+{
+ unsigned long num = 1<<order;
+ unsigned long slot = zone->contig_pages_alloc_hint;
+ struct page *page;
+
+ slot = (slot + num - 1) & ~(num - 1); /* align */
+
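+ /* scan the zone in naturally aligned blocks, starting from where the
+ last allocation succeeded; scan_pages bounds the total work */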
+ for ( ; zone->scan_pages > 0; slot += num) {
+ zone->scan_pages -= num;
+ if (slot + num > zone->present_pages)
+ slot = 0;
+ page = &zone->zone_mem_map[slot];
+ if (try_to_book_pages(zone, page, order)) {
+ zone->contig_pages_alloc_hint = slot + num;
+ return page;
+ }
+ }
+ return NULL;
+}
+
+static void
+unbook_pages(struct zone *zone, struct page *page, unsigned int order)
+{
+ struct page *p;
+ for (p = page; p < &page[1<<order]; p++) {
+ ClearPageBooked(p);
+ }
+}
+
+extern int remap_onepage(struct page *);
+/*
+ * sweepout_pages() might not work well as the booked pages
+ * may include some unfreeable pages.
+ */
+static int
+sweepout_pages(struct zone *zone, struct page *page, int num)
+{
+ struct page *p;
+ int failed = 0;
+ int retry = 0;
+ int retry_save = 0;
+ int retry_count = 20;
+
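+ /* pages not yet visible on the LRU are retried a limited number of
+ times after a short sleep */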
+again:
+ on_each_cpu((void (*)(void*))drain_local_pages, NULL, 1, 1);
+ for (p = page; p <= &page[num - 1]; p++) {
+ if (!page_count(p))
+ continue;
+ if (!PageBooked(p)) {
+ printk(KERN_ERR "ERROR sweepout_pages: page:%p isn't booked. page(%p) num(%d)\n", p, page, num);
+ }
+
+ spin_lock_irq(&zone->lru_lock);
+ if (!PageLRU(p)) {
+ spin_unlock_irq(&zone->lru_lock);
+ retry++;
+ continue;
+ }
+ list_del(&p->lru);
+ if (!TestClearPageLRU(p))
+ BUG();
+ if (PageActive(p)) {
+ zone->nr_active--;
+ if (page_count(p) == 0) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ SetPageLRU(p);
+ list_add(&p->lru, &zone->active_list);
+ spin_unlock_irq(&zone->lru_lock);
+ continue;
+ }
+ } else {
+ zone->nr_inactive--;
+ if (page_count(p) == 0) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ SetPageLRU(p);
+ list_add(&p->lru, &zone->inactive_list);
+ spin_unlock_irq(&zone->lru_lock);
+ continue;
+ }
+ }
+ page_cache_get(p);
+ spin_unlock_irq(&zone->lru_lock);
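+ /* remap_onepage() moves this page elsewhere; nonzero means it could
+ not be moved, so put it back on the LRU */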
+ if (remap_onepage(p)) {
+ failed++;
+ spin_lock_irq(&zone->lru_lock);
+ if (PageActive(p)) {
+ list_add(&p->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&p->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ SetPageLRU(p);
+ spin_unlock_irq(&zone->lru_lock);
+ page_cache_release(p);
+ }
+ }
+ if (retry && (retry_count--)) {
+ retry_save = retry;
+ retry = 0;
+ schedule_timeout(HZ/4);
+ /* Actually we should wait on the pages */
+ goto again;
+ }
+ on_each_cpu((void (*)(void*))drain_local_pages, NULL, 1, 1);
+ return failed;
+}
+
+/*
+ * Allocate contiguous pages even if the pages in the zones are fragmented.
+ * Migrating pages helps to make enough contiguous space in them.
+ */
+static struct page *
+force_alloc_pages(unsigned int gfp_mask, unsigned int order,
+ struct zonelist *zonelist)
+{
+ struct zone **zones = zonelist->zones;
+ struct zone *zone;
+ struct page *page = NULL;
+ unsigned long flags;
+ int i;
+ int ret;
+
+ static DECLARE_MUTEX(bookedpage_sem);
+
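+ /* serialize defragmentation attempts for now; see the ToDo about
+ allowing several contiguous allocations at the same time */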
+ if (down_trylock(&bookedpage_sem)) {
+ down(&bookedpage_sem);
+ }
+
+ for (i = 0; zones[i] != NULL; i++) {
+ zone = zones[i];
+ zone->scan_pages = zone->present_pages;
+ while (zone->scan_pages > 0) {
+ page = book_pages(zone, gfp_mask, order);
+ if (!page)
+ break;
+ ret = sweepout_pages(zone, page, 1<<order);
+ if (ret) {
+ spin_lock_irqsave(&zone->lock, flags);
+ unbook_pages(zone, page, order);
+ page = NULL;
+
+ zone->free_pages += zone->booked_pages;
+ spin_unlock_irqrestore(&zone->lock, flags);
+ continue;
+ }
+ spin_lock_irqsave(&zone->lock, flags);
+ unbook_pages(zone, page, order);
+ zone->free_pages += zone->booked_pages;
+ page = __rmqueue(zone, order);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ if (page) {
+ prep_compound_page(page, order);
+ up(&bookedpage_sem);
+ return page;
+ }
+ }
+ }
+ up(&bookedpage_sem);
+ return NULL;
+}
+#endif /* CONFIG_HUGETLB_PAGE && CONFIG_MEMHOTPLUGTEST */
+
+static inline int
+enough_pages(struct zone *zone, unsigned long min, const int wait)
+{
+ return (long)zone->free_pages - (long)min >= 0 ||
+ (!wait && (long)zone->free_pages - (long)zone->pages_high >= 0);
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*
@@ -602,8 +857,7 @@ __alloc_pages(unsigned int gfp_mask, uns
local_low >>= 1;
min += local_low;

- if (z->free_pages >= min ||
- (!wait && z->free_pages >= z->pages_high)) {
+ if (enough_pages(z, min, wait)) {
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
@@ -631,8 +885,7 @@ __alloc_pages(unsigned int gfp_mask, uns
if (rt_task(p))
local_min >>= 1;
min += local_min;
- if (z->free_pages >= min ||
- (!wait && z->free_pages >= z->pages_high)) {
+ if (enough_pages(z, min, wait)) {
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
@@ -682,14 +935,27 @@ rebalance:
continue;
#endif
min += z->pages_min;
- if (z->free_pages >= min ||
- (!wait && z->free_pages >= z->pages_high)) {
+ if (enough_pages(z, min, wait)) {
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
}
min += z->pages_low * sysctl_lower_zone_protection;
}
+
+#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_MEMHOTPLUGTEST)
+ /*
+ * Defragment memory to allocate large contiguous pages.
+ *
+ * FIXME: The following code works only if CONFIG_HUGETLB_PAGE
+ * is set.
+ */
+ if (order) {
+ page = force_alloc_pages(gfp_mask, order, zonelist);
+ if (page)
+ goto got_pg;
+ }
+#endif /* CONFIG_HUGETLB_PAGE && CONFIG_MEMHOTPLUGTEST */

/*
* Don't let big-order allocations loop unless the caller explicitly
--- mm/vmscan.c.ORG Thu Jan 8 19:07:40 2032
+++ mm/vmscan.c Thu Jan 8 19:08:42 2032
@@ -1183,7 +1183,7 @@ bufferdone:
}
/* don't __put_page(page) here. truncate may be in progress */
newpage->flags |= page->flags & ~(1 << PG_uptodate) &
- ~(1 << PG_highmem) & ~(1 << PG_chainlock) &
+ ~(1 << PG_highmem) & ~(1 << PG_chainlock) & ~(1 << PG_booked) &
~(1 << PG_direct) & ~(~0UL << ZONE_SHIFT);

/* list_del(&page->list); XXX */