[PATCH] mm: sort freelist by rank number
From: pullip . cho
Date:  Mon Aug 03 2020 - 02:18:15 EST
From: Cho KyongHo <pullip.cho@xxxxxxxxxxx>
LPDDR5 introduces rank switch delay. If three successive DRAM accesses
happens and the first and the second ones access one rank and the last
access happens on the other rank, the latency of the last access will
be longer than the second one.
To address this panelty, we can sort the freelist so that a specific
rank is allocated prior to another rank. We expect the page allocator
can allocate the pages from the same rank successively with this
change. It will hopefully improves the proportion of the consecutive
memory accesses to the same rank.
Signed-off-by: Cho KyongHo <pullip.cho@xxxxxxxxxxx>
---
 mm/Kconfig      |  23 +++++++++++
 mm/compaction.c |   6 +--
 mm/internal.h   |  11 ++++++
 mm/page_alloc.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 mm/shuffle.h    |   6 ++-
 5 files changed, 144 insertions(+), 21 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 6c97488..789c02b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -868,4 +868,27 @@ config ARCH_HAS_HUGEPD
 config MAPPING_DIRTY_HELPERS
         bool
 
+config RANK_SORTED_FREELIST
+	bool "Prefer allocating free pages in a half of DRAM ranks"
+
+	help
+	  Rank switch delay in DRAM access become larger in LPDDR5 than before.
+	  If two successive memory accesses happen on different ranks in LPDDR5,
+	  the latency of the second access becomes longer due to the rank switch
+	  overhead. This is a power-performance tradeoff achieved in LPDDR5.
+	  By default, sorting freelist by rank number is disabled even though
+	  RANK_SORTED_FREELIST is set to y. To enable, set "dram_rank_granule"
+	  boot argument to a larger or an equal value to pageblock_nr_pages. The
+	  values should be the exact the rank interleaving granule that your
+	  system is using. The rank interleaving granule is 2^(the lowest CS bit
+	  number). CS stands for Chip Select and is also called SS which stands
+	  for Slave Select.
+	  This is not beneficial to single rank memory system. Also this is not
+	  necessary to quad rank and octal rank memory systems because they are
+	  not in LPDDR5 specifications.
+
+	  This is marked experimental because this disables freelist shuffling
+	  (SHUFFLE_PAGE_ALLOCATOR). Also you should set the correct rank
+	  interleaving granule.
+
 endmenu
diff --git a/mm/compaction.c b/mm/compaction.c
index 061dacf..bee9567 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1218,8 +1218,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
 
 	if (!list_is_last(freelist, &freepage->lru)) {
 		list_cut_before(&sublist, freelist, &freepage->lru);
-		if (!list_empty(&sublist))
-			list_splice_tail(&sublist, freelist);
+		freelist_splice_tail(&sublist, freelist);
 	}
 }
 
@@ -1236,8 +1235,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
 
 	if (!list_is_first(freelist, &freepage->lru)) {
 		list_cut_position(&sublist, freelist, &freepage->lru);
-		if (!list_empty(&sublist))
-			list_splice_tail(&sublist, freelist);
+		freelist_splice_tail(&sublist, freelist);
 	}
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 10c6776..c945b3d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -266,6 +266,17 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
 
 #endif
 
+#ifdef CONFIG_RANK_SORTED_FREELIST
+void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist);
+#else
+#include <linux/list.h>
+static inline
+void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist)
+{
+	if (!list_empty(sublist))
+		list_splice_tail(sublist, freelist);
+}
+#endif
 /*
  * This function returns the order of a free page in the buddy system. In
  * general, page_zone(page)->lock must be held by the caller to prevent the
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2824e116..7823a3b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -854,6 +854,69 @@ compaction_capture(struct capture_control *capc, struct page *page,
 }
 #endif /* CONFIG_COMPACTION */
 
+#ifdef CONFIG_RANK_SORTED_FREELIST
+static unsigned long dram_rank_nr_pages __read_mostly;
+
+static inline bool preferred_rank_enabled(void)
+{
+	return dram_rank_nr_pages >= pageblock_nr_pages;
+}
+
+static int __init dram_rank_granule(char *buf)
+{
+	unsigned long val = (unsigned long)(memparse(buf, NULL) / PAGE_SIZE);
+
+	if (val < pageblock_nr_pages) {
+		pr_err("too small rank granule %lu\n", val);
+		return -EINVAL;
+	}
+
+	dram_rank_nr_pages = val;
+
+	return 0;
+}
+
+early_param("dram_rank_granule", dram_rank_granule);
+
+static inline bool __preferred_rank(struct page *page)
+{
+	return !(page_to_pfn(page) & dram_rank_nr_pages);
+}
+
+static inline bool preferred_rank(struct page *page)
+{
+	return !preferred_rank_enabled() || __preferred_rank(page);
+}
+
+void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist)
+{
+	while (!list_empty(sublist)) {
+		struct page *page;
+
+		page = list_first_entry(sublist, struct page, lru);
+		if (!preferred_rank_enabled() || !__preferred_rank(page))
+			list_move_tail(&page->lru, freelist);
+		else
+			list_move(&page->lru, freelist);
+	}
+}
+#else
+static inline bool __preferred_rank(struct page *page)
+{
+	return true;
+}
+
+static inline bool preferred_rank(struct page *page)
+{
+	return true;
+}
+
+static inline bool preferred_rank_enabled(void)
+{
+	return false;
+}
+#endif
+
 /* Used for pages not on another list */
 static inline void add_to_free_list(struct page *page, struct zone *zone,
 				    unsigned int order, int migratetype)
@@ -880,7 +943,10 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_move(&page->lru, &area->free_list[migratetype]);
+	if (preferred_rank(page))
+		list_move(&page->lru, &area->free_list[migratetype]);
+	else
+		list_move_tail(&page->lru, &area->free_list[migratetype]);
 }
 
 static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -1029,7 +1095,9 @@ static inline void __free_one_page(struct page *page,
 done_merging:
 	set_page_order(page, order);
 
-	if (is_shuffle_order(order))
+	if (preferred_rank_enabled())
+		to_tail = !__preferred_rank(page);
+	else if (is_shuffle_order(order))
 		to_tail = shuffle_pick_tail();
 	else
 		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
@@ -2257,20 +2325,29 @@ static __always_inline
 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 						int migratetype)
 {
+	int retry = preferred_rank_enabled() ? 2 : 1;
 	unsigned int current_order;
 	struct free_area *area;
 	struct page *page;
 
-	/* Find a page of the appropriate size in the preferred list */
-	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
-		area = &(zone->free_area[current_order]);
-		page = get_page_from_free_area(area, migratetype);
-		if (!page)
-			continue;
-		del_page_from_free_list(page, zone, current_order);
-		expand(zone, page, order, current_order, migratetype);
-		set_pcppage_migratetype(page, migratetype);
-		return page;
+	while (retry-- > 0) {
+		/* Find a page of the appropriate size in the preferred list */
+		for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+			area = &zone->free_area[current_order];
+			page = get_page_from_free_area(area, migratetype);
+			if (!page)
+				continue;
+			/*
+			 * In the first try, search for a page in the preferred
+			 * rank upward even though a free page is found.
+			 */
+			if (retry > 0 && !preferred_rank(page))
+				continue;
+			del_page_from_free_list(page, zone, current_order);
+			expand(zone, page, order, current_order, migratetype);
+			set_pcppage_migratetype(page, migratetype);
+			return page;
+		}
 	}
 
 	return NULL;
@@ -2851,8 +2928,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * head, thus also in the physical page order. This is useful
 		 * for IO devices that can merge IO requests if the physical
 		 * pages are ordered properly.
+		 * However, preferred_rank_enabled() is true, we always sort
+		 * freelists in the buddy and the pcp in the order of rank
+		 * number for the performance reason.
 		 */
-		list_add_tail(&page->lru, list);
+		if (preferred_rank_enabled() && __preferred_rank(page))
+			list_add(&page->lru, list);
+		else
+			list_add_tail(&page->lru, list);
 		alloced++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -3136,7 +3219,10 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
 	}
 
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
-	list_add(&page->lru, &pcp->lists[migratetype]);
+	if (preferred_rank(page))
+		list_add(&page->lru, &pcp->lists[migratetype]);
+	else
+		list_add_tail(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
 		unsigned long batch = READ_ONCE(pcp->batch);
@@ -8813,7 +8899,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page,
 			continue;
 
 		if (current_buddy != target) {
-			add_to_free_list(current_buddy, zone, high, migratetype);
+			if (preferred_rank(current_buddy))
+				add_to_free_list(current_buddy, zone, high, migratetype);
+			else
+				add_to_free_list_tail(current_buddy, zone, high, migratetype);
 			set_page_order(current_buddy, high);
 			page = next_page;
 		}
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 71b784f..59cbfde 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -12,7 +12,8 @@ extern void __shuffle_free_memory(pg_data_t *pgdat);
 extern bool shuffle_pick_tail(void);
 static inline void shuffle_free_memory(pg_data_t *pgdat)
 {
-	if (!static_branch_unlikely(&page_alloc_shuffle_key))
+	if (!static_branch_unlikely(&page_alloc_shuffle_key) ||
+	    preferred_rank_enabled())
 		return;
 	__shuffle_free_memory(pgdat);
 }
@@ -20,7 +21,8 @@ static inline void shuffle_free_memory(pg_data_t *pgdat)
 extern void __shuffle_zone(struct zone *z);
 static inline void shuffle_zone(struct zone *z)
 {
-	if (!static_branch_unlikely(&page_alloc_shuffle_key))
+	if (!static_branch_unlikely(&page_alloc_shuffle_key) ||
+	    preferred_rank_enabled())
 		return;
 	__shuffle_zone(z);
 }
-- 
2.7.4