Re: [PATCH 4/6] x86, mm: make split_mem_range() more easy to read

From: Wei Yang
Date: Wed Mar 27 2019 - 18:05:19 EST


On Sun, Mar 24, 2019 at 03:29:04PM +0100, Thomas Gleixner wrote:
>Wei,
>
>On Tue, 12 Feb 2019, Wei Yang wrote:
>>
>> This patch changes the implementation from the first perception to the
>> second to reduce one different handling on end_pfn. After doing so, the
>> code is easier to read.
>
>It's maybe slightly easier to read, but it's still completely unreadable
>garbage.
>
> Not your fault, it was garbage before.
>
>But refining garbage still results in garbage. Just the smell is slightly
>different.
>
>Why?
>
> 1) Doing all the calculations PFN based is just a pointless
> indirection. Just look at all the rounding magic and back and forth
> conversions.
>
> All of that can be done purely address/size based which makes the code
> truly readable.
>
> 2) The 5(3) sections are more or less copied code with a few changes of
> constants, except for the first section (A) which has an extra quirk
> for 32bit. Plus all the 64bit vs. 32bit #ifdeffery which is not making
> it any better.
>
> This copied mess can be avoided by using helper functions and proper
> loops.
>
> 3) During the bootmem phase the code tries to preserve large mappings
> _AFTER_ splitting them up and then it tries to merge the resulting
> overlaps.
>
> This is completely backwards because the expansion of the range can be
> tried right away when then mapping of a large page is attempted. Surely
> not with the current mess, but with a proper loop based approach it can
> be done nicely.
>
> Btw, there is a bug in that expansion code which could result in
> undoing the enforced 4K mapping of the lower 2M/4M range on 32bit. It's
> probably not an issue in practice because the low range is usually not
> contiguous. But it works by chance not by design.
>
> 4) The debug print string retrieval function is just silly.
>
>The logic for rewriting this is pretty obvious:
>
> while (addr < end) {
> setup_mr(mr, addr, end);
> for_each_map_size(1G, 2M, 4K) {
> if (try_map(mr, size))
> break;
> }
> addr = mr->end;
> }
>
> setup_mr() takes care of the 32bit 0-2/4M range by limiting the
> range and setting the allowed size mask in mr to 4k only.
>
> try_map() does:
>
> 1) Try to expand the range to preserve large pages during bootmem
>
> 2) If the range start is not aligned, limit the end to the large
> size boundary, so the next lower map size will only cover the
> unaligned portion.
>
> 3) If the range end is not aligned, fit as much large
> size as possible.
>
> No magic A-E required, because it just falls into place naturally and
> the expansion is done at the right place and not after the fact.
>
>Find a mostly untested patch which implements this below. I just booted it
>in a 64bit guest and it did not explode.
>
>It removes 55 lines of code instead of adding 35 and reduces the binary
>size by 408 bytes on 64bit and 128 bytes on 32bit.
>
>Note, it's a combo of changes (including your patch 1/6) and needs to be
>split up. It would be nice if you have time to split it up into separate
>patches, add proper changelogs and test the heck out of it on both 32 and
>64 bit. If you don't have time, please let me know.
>

Thanks for your suggestions :-)

Just get my head up, will try to understand the code and test on both
arch.

BTW, do you have some suggestions in the test? Currently I just use
bootup test. Basicly I think this is fine to cover the cases. Maybe you
would have some better idea.

>Thanks,
>
> tglx
>
>8<---------------
> arch/x86/mm/init.c | 259 ++++++++++++++++++++---------------------------------
> 1 file changed, 102 insertions(+), 157 deletions(-)
>
>--- a/arch/x86/mm/init.c
>+++ b/arch/x86/mm/init.c
>@@ -157,17 +157,28 @@ void __init early_alloc_pgt_buf(void)
> pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
> }
>
>-int after_bootmem;
>+int after_bootmem __ro_after_init;
>
> early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
>
> struct map_range {
>- unsigned long start;
>- unsigned long end;
>- unsigned page_size_mask;
>+ unsigned long start;
>+ unsigned long end;
>+ unsigned int page_size_mask;
> };
>
>-static int page_size_mask;
>+#ifdef CONFIG_X86_32
>+#define NR_RANGE_MR 3
>+#else
>+#define NR_RANGE_MR 5
>+#endif
>+
>+struct mapinfo {
>+ unsigned int mask;
>+ unsigned int size;
>+};
>+
>+static unsigned int page_size_mask __ro_after_init;
>
> static void __init probe_page_size_mask(void)
> {
>@@ -177,7 +188,7 @@ static void __init probe_page_size_mask(
> * large pages into small in interrupt context, etc.
> */
> if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
>- page_size_mask |= 1 << PG_LEVEL_2M;
>+ page_size_mask |= 1U << PG_LEVEL_2M;
> else
> direct_gbpages = 0;
>
>@@ -201,7 +212,7 @@ static void __init probe_page_size_mask(
> /* Enable 1 GB linear kernel mappings if available: */
> if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
> printk(KERN_INFO "Using GB pages for direct mapping\n");
>- page_size_mask |= 1 << PG_LEVEL_1G;
>+ page_size_mask |= 1U << PG_LEVEL_1G;
> } else {
> direct_gbpages = 0;
> }
>@@ -249,185 +260,119 @@ static void setup_pcid(void)
> }
> }
>
>-#ifdef CONFIG_X86_32
>-#define NR_RANGE_MR 3
>-#else /* CONFIG_X86_64 */
>-#define NR_RANGE_MR 5
>+static void __meminit mr_print(struct map_range *mr, unsigned int maxidx)
>+{
>+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
>+ static const char *sz[2] = { "4k", "4M" };
>+#else
>+ static const char *sz[4] = { "4k", "2M", "1G", "" };
> #endif
>+ unsigned int idx, s;
>
>-static int __meminit save_mr(struct map_range *mr, int nr_range,
>- unsigned long start_pfn, unsigned long end_pfn,
>- unsigned long page_size_mask)
>-{
>- if (start_pfn < end_pfn) {
>- if (nr_range >= NR_RANGE_MR)
>- panic("run out of range for init_memory_mapping\n");
>- mr[nr_range].start = start_pfn<<PAGE_SHIFT;
>- mr[nr_range].end = end_pfn<<PAGE_SHIFT;
>- mr[nr_range].page_size_mask = page_size_mask;
>- nr_range++;
>+ for (idx = 0; idx < maxidx; idx++, mr++) {
>+ s = (mr->page_size_mask >> PG_LEVEL_2M) & (ARRAY_SIZE(sz) -1);
>+ pr_debug(" [mem %#010lx-%#010lx] page size %s\n",
>+ mr->start, mr->end - 1, sz[s]);
> }
>-
>- return nr_range;
> }
>
> /*
>- * adjust the page_size_mask for small range to go with
>- * big page size instead small one if nearby are ram too.
>+ * Try to preserve large mappings during bootmem by expanding the current
>+ * range to large page mapping of @size and verifying that the result is
>+ * within a memory region.
> */
>-static void __ref adjust_range_page_size_mask(struct map_range *mr,
>- int nr_range)
>+static void __meminit mr_expand(struct map_range *mr, unsigned int size)
> {
>- int i;
>-
>- for (i = 0; i < nr_range; i++) {
>- if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
>- !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
>- unsigned long start = round_down(mr[i].start, PMD_SIZE);
>- unsigned long end = round_up(mr[i].end, PMD_SIZE);
>+ unsigned long start = round_down(mr->start, size);
>+ unsigned long end = round_up(mr->end, size);
>
>-#ifdef CONFIG_X86_32
>- if ((end >> PAGE_SHIFT) > max_low_pfn)
>- continue;
>-#endif
>+ if (IS_ENABLED(CONFIG_X86_32) && (end >> PAGE_SHIFT) > max_low_pfn)
>+ return;
>
>- if (memblock_is_region_memory(start, end - start))
>- mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
>- }
>- if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
>- !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
>- unsigned long start = round_down(mr[i].start, PUD_SIZE);
>- unsigned long end = round_up(mr[i].end, PUD_SIZE);
>-
>- if (memblock_is_region_memory(start, end - start))
>- mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
>- }
>+ if (memblock_is_region_memory(start, end - start)) {
>+ mr->start = start;
>+ mr->end = end;
> }
> }
>
>-static const char *page_size_string(struct map_range *mr)
>+static bool __meminit mr_try_map(struct map_range *mr, const struct mapinfo *mi)
> {
>- static const char str_1g[] = "1G";
>- static const char str_2m[] = "2M";
>- static const char str_4m[] = "4M";
>- static const char str_4k[] = "4k";
>+ unsigned long len;
>
>- if (mr->page_size_mask & (1<<PG_LEVEL_1G))
>- return str_1g;
>- /*
>- * 32-bit without PAE has a 4M large page size.
>- * PG_LEVEL_2M is misnamed, but we can at least
>- * print out the right size in the string.
>- */
>- if (IS_ENABLED(CONFIG_X86_32) &&
>- !IS_ENABLED(CONFIG_X86_PAE) &&
>- mr->page_size_mask & (1<<PG_LEVEL_2M))
>- return str_4m;
>+ /* Check whether the map size is supported. PAGE_SIZE always is. */
>+ if (mi->mask && !(mr->page_size_mask & mi->mask))
>+ return false;
>
>- if (mr->page_size_mask & (1<<PG_LEVEL_2M))
>- return str_2m;
>+ if (!after_bootmem)
>+ mr_expand(mr, mi->size);
>
>- return str_4k;
>-}
>+ if (!IS_ALIGNED(mr->start, mi->size)) {
>+ /* Limit the range to the next boundary of this size. */
>+ mr->end = min_t(unsigned long, mr->end,
>+ round_up(mr->start, mi->size));
>+ return false;
>+ }
>
>-static int __meminit split_mem_range(struct map_range *mr, int nr_range,
>- unsigned long start,
>- unsigned long end)
>-{
>- unsigned long start_pfn, end_pfn, limit_pfn;
>- unsigned long pfn;
>- int i;
>+ if (!IS_ALIGNED(mr->end, mi->size)) {
>+ /* Try to fit as much as possible */
>+ len = round_down(mr->end - mr->start, mi->size);
>+ if (!len)
>+ return false;
>+ mr->end = mr->start + len;
>+ }
>
>- limit_pfn = PFN_DOWN(end);
>+ /* Store the effective page size mask */
>+ mr->page_size_mask = mi->mask;
>+ return true;
>+}
>
>- /* head if not big page alignment ? */
>- pfn = start_pfn = PFN_DOWN(start);
>-#ifdef CONFIG_X86_32
>+static void __meminit mr_setup(struct map_range *mr, unsigned long start,
>+ unsigned long end)
>+{
> /*
>- * Don't use a large page for the first 2/4MB of memory
>- * because there are often fixed size MTRRs in there
>- * and overlapping MTRRs into large pages can cause
>- * slowdowns.
>+ * On 32bit the first 2/4MB are often covered by fixed size MTRRs.
>+ * Overlapping MTRRs on large pages can cause slowdowns. Force 4k
>+ * mappings.
> */
>- if (pfn == 0)
>- end_pfn = PFN_DOWN(PMD_SIZE);
>- else
>- end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
>-#else /* CONFIG_X86_64 */
>- end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
>-#endif
>- if (end_pfn > limit_pfn)
>- end_pfn = limit_pfn;
>- if (start_pfn < end_pfn) {
>- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
>- pfn = end_pfn;
>- }
>-
>- /* big page (2M) range */
>- start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
>-#ifdef CONFIG_X86_32
>- end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
>-#else /* CONFIG_X86_64 */
>- end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
>- if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
>- end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
>-#endif
>-
>- if (start_pfn < end_pfn) {
>- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
>- page_size_mask & (1<<PG_LEVEL_2M));
>- pfn = end_pfn;
>+ if (IS_ENABLED(CONFIG_X86_32) && start < PMD_SIZE) {
>+ mr->page_size_mask = 0;
>+ mr->end = min_t(unsigned long, end, PMD_SIZE);
>+ } else {
>+ /* Set the possible mapping sizes and allow full range. */
>+ mr->page_size_mask = page_size_mask;
>+ mr->end = end;
> }
>+ mr->start = start;
>+}
>
>+static int __meminit split_mem_range(struct map_range *mr, unsigned long start,
>+ unsigned long end)
>+{
>+ static const struct mapinfo mapinfos[] = {
> #ifdef CONFIG_X86_64
>- /* big page (1G) range */
>- start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
>- end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
>- if (start_pfn < end_pfn) {
>- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
>- page_size_mask &
>- ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
>- pfn = end_pfn;
>- }
>-
>- /* tail is not big page (1G) alignment */
>- start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
>- end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
>- if (start_pfn < end_pfn) {
>- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
>- page_size_mask & (1<<PG_LEVEL_2M));
>- pfn = end_pfn;
>- }
>+ { .mask = 1U << PG_LEVEL_1G, .size = PUD_SIZE },
> #endif
>+ { .mask = 1U << PG_LEVEL_2M, .size = PMD_SIZE },
>+ { .mask = 0, .size = PAGE_SIZE },
>+ };
>+ const struct mapinfo *mi;
>+ struct map_range *curmr;
>+ unsigned long addr;
>+ int idx;
>+
>+ for (idx = 0, addr = start, curmr = mr; addr < end; idx++, curmr++) {
>+ BUG_ON(idx == NR_RANGE_MR);
>
>- /* tail is not big page (2M) alignment */
>- start_pfn = pfn;
>- end_pfn = limit_pfn;
>- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
>+ mr_setup(curmr, addr, end);
>
>- if (!after_bootmem)
>- adjust_range_page_size_mask(mr, nr_range);
>+ /* Try map sizes top down. PAGE_SIZE will always succeed. */
>+ for (mi = mapinfos; !mr_try_map(curmr, mi); mi++);
>
>- /* try to merge same page size and continuous */
>- for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
>- unsigned long old_start;
>- if (mr[i].end != mr[i+1].start ||
>- mr[i].page_size_mask != mr[i+1].page_size_mask)
>- continue;
>- /* move it */
>- old_start = mr[i].start;
>- memmove(&mr[i], &mr[i+1],
>- (nr_range - 1 - i) * sizeof(struct map_range));
>- mr[i--].start = old_start;
>- nr_range--;
>+ /* Get the start address for the next range */
>+ addr = curmr->end;
> }
>-
>- for (i = 0; i < nr_range; i++)
>- pr_debug(" [mem %#010lx-%#010lx] page %s\n",
>- mr[i].start, mr[i].end - 1,
>- page_size_string(&mr[i]));
>-
>- return nr_range;
>+ mr_print(mr, idx);
>+ return idx;
> }
>
> struct range pfn_mapped[E820_MAX_ENTRIES];
>@@ -474,7 +419,7 @@ unsigned long __ref init_memory_mapping(
> start, end - 1);
>
> memset(mr, 0, sizeof(mr));
>- nr_range = split_mem_range(mr, 0, start, end);
>+ nr_range = split_mem_range(mr, start, end);
>
> for (i = 0; i < nr_range; i++)
> ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,

--
Wei Yang
Help you, Help me