[RFC PATCH 3/7] x86, mm: arch_add_dev_memory()

From: Dan Williams
Date: Wed Aug 12 2015 - 23:56:11 EST


Use struct vmem_altmap to augment vmemmap_{populate|free}().

In support of providing struct page coverage for persistent memory,
use struct vmem_altmap to change the default policy for mapping pfns for
a page range. The default vmemmap_populate() allocates page table
storage area from the page allocator. In support of storing struct page
infrastructure directly on device memory (pmem), vmem_altmap directs
vmemmap_populate() to use a pre-allocated block of contiguous pfns for
storage of the new vmemmap entries.

Cc: H. Peter Anvin <hpa@xxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: linux-mm@xxxxxxxxx
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
arch/x86/mm/init_64.c | 55 +++++++++++++++++++++++++++++++++++++---
include/linux/memory_hotplug.h | 4 +++
include/linux/mm.h | 38 +++++++++++++++++++++++++++-
mm/memory_hotplug.c | 12 +++++++++
mm/page_alloc.c | 4 +++
mm/sparse-vmemmap.c | 31 +++++++++++++++++++++++
mm/sparse.c | 17 +++++++++++-
7 files changed, 154 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index c2f872a379d2..eda65ec8484e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -719,6 +719,21 @@ int arch_add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(arch_add_memory);

+#ifdef CONFIG_ZONE_DEVICE
+/*
+ * Add a device memory range to the ZONE_DEVICE zone of node @nid.
+ *
+ * The primary difference vs arch_add_memory() is that the zone is known
+ * a priori: it is always the node's ZONE_DEVICE zone. @altmap is handed
+ * to __arch_add_memory() as-is, and its return value is returned as-is.
+ */
+int arch_add_dev_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *zone = pgdat->node_zones + ZONE_DEVICE;
+
+ return __arch_add_memory(nid, start, size, zone, altmap);
+}
+#endif
+
#define PAGE_INUSE 0xFD

static void __meminit free_pagetable(struct page *page, int order)
@@ -771,8 +786,13 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud,
return;
}

- /* free a pmd talbe */
- free_pagetable(pud_page(*pud), 0);
+ /*
+ * Free a pmd table if it came from the page allocator (i.e. !altmap).
+ * In the altmap case the pages are being freed implicitly by the
+ * section becoming unmapped / unplugged.
+ */
+ if (!altmap)
+ free_pagetable(pud_page(*pud), 0);
spin_lock(&init_mm.page_table_lock);
pud_clear(pud);
spin_unlock(&init_mm.page_table_lock);
@@ -890,7 +910,7 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
if (pmd_large(*pmd)) {
if (IS_ALIGNED(addr, PMD_SIZE) &&
IS_ALIGNED(next, PMD_SIZE)) {
- if (!direct)
+ if (!direct && !altmap)
free_pagetable(pmd_page(*pmd),
get_order(PMD_SIZE));

@@ -946,7 +966,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
if (pud_large(*pud)) {
if (IS_ALIGNED(addr, PUD_SIZE) &&
IS_ALIGNED(next, PUD_SIZE)) {
- if (!direct)
+ if (!direct && !altmap)
free_pagetable(pud_page(*pud),
get_order(PUD_SIZE));

@@ -993,6 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct,
pud_t *pud;
bool pgd_changed = false;

+ WARN_ON_ONCE(direct && altmap);
+
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);

@@ -1041,6 +1063,31 @@ static int __ref __arch_remove_memory(u64 start, u64 size, struct zone *zone,
__phys_to_pfn(size), altmap);
}

+int __ref arch_remove_dev_memory(u64 start, u64 size,
+ struct vmem_altmap *altmap)
+{
+ unsigned long pfn = __phys_to_pfn(start);
+ struct zone *zone;
+ int rc;
+
+ /*
+ * Reserved pages will not have been initialized, so we need to
+ * calculate the page zone from the first valid pfn, i.e. the first
+ * pfn past the altmap's reserve. An altmap whose base_pfn does not
+ * match the start of the range is rejected with -EINVAL.
+ */
+ if (altmap) {
+ if (altmap->base_pfn != pfn) {
+ WARN_ONCE(1, "pfn: %#lx expected: %#lx\n",
+ pfn, altmap->base_pfn);
+ return -EINVAL;
+ }
+ pfn += altmap->reserve;
+ }
+ zone = page_zone(pfn_to_page(pfn));
+ rc = __arch_remove_memory(start, size, zone, altmap);
+ WARN_ON_ONCE(rc);
+ return rc;
+}
+
int __ref arch_remove_memory(u64 start, u64 size)
{
struct zone *zone = page_zone(pfn_to_page(__phys_to_pfn(start)));
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 48a4e0a5e13d..6a9f05e2c02f 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -102,6 +102,8 @@ extern int try_online_node(int nid);
#ifdef CONFIG_MEMORY_HOTREMOVE
extern bool is_pageblock_removable_nolock(struct page *page);
extern int arch_remove_memory(u64 start, u64 size);
+extern int arch_remove_dev_memory(u64 start, u64 size,
+ struct vmem_altmap *altmap);
extern int __remove_pages_altmap(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap);
extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
@@ -279,6 +281,8 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
extern int add_memory(int nid, u64 start, u64 size);
extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default);
extern int arch_add_memory(int nid, u64 start, u64 size);
+extern int arch_add_dev_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap);
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
extern bool is_memblock_offlined(struct memory_block *mem);
extern void remove_memory(int nid, u64 start, u64 size);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index de44de70e63a..8a4f24d7fdb0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2215,7 +2215,43 @@ void sparse_mem_maps_populate_node(struct page **map_map,
unsigned long map_count,
int nodeid);

-struct vmem_altmap;
+/**
+ * struct vmem_altmap - augment vmemmap_populate() with pre-allocated pte storage
+ * @base_pfn: first pfn of the allocation
+ * @reserve: number of pfns reserved by the device relative to @base_pfn
+ * @free: range of memmap storage / offset to data from section0
+ * @alloc: tracks num pfns consumed for page map, private to vmemmap_populate()
+ *
+ * The inline helpers that follow hand out pfns sequentially:
+ * vmem_altmap_nr_free() is @free - @alloc (0 once @alloc reaches @free),
+ * vmem_altmap_next_pfn() is @base_pfn + @alloc, and vmem_altmap_alloc()
+ * returns the next pfn and advances @alloc by the requested count, or
+ * returns ULONG_MAX and leaves @alloc untouched when too few pfns remain.
+ *
+ * NOTE(review): vmem_altmap_next_pfn() does not add @reserve; confirm
+ * whether allocations are meant to start past the reserved pfns.
+ */
+struct vmem_altmap {
+ const unsigned long base_pfn;
+ const unsigned long reserve;
+ unsigned long free;
+ unsigned long alloc;
+};
+
+static inline unsigned long vmem_altmap_nr_free(struct vmem_altmap *altmap)
+{
+ if (altmap->free > altmap->alloc)
+ return altmap->free - altmap->alloc;
+ return 0;
+}
+
+static inline unsigned long vmem_altmap_next_pfn(struct vmem_altmap *altmap)
+{
+ return altmap->base_pfn + altmap->alloc;
+}
+
+static inline unsigned long vmem_altmap_alloc(struct vmem_altmap *altmap,
+ unsigned long nr_pfns)
+{
+ unsigned long pfn = vmem_altmap_next_pfn(altmap);
+
+ if (nr_pfns > vmem_altmap_nr_free(altmap))
+ return ULONG_MAX;
+ altmap->alloc += nr_pfns;
+ return pfn;
+}
+
+
struct page *sparse_mem_map_populate(unsigned long pnum, int nid);
struct page *sparse_alt_map_populate(unsigned long pnum, int nid,
struct vmem_altmap *altmap);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d4bcfeaaec37..79cb7595b659 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -505,6 +505,18 @@ int __ref __add_pages_altmap(int nid, struct zone *zone,
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

+ if (altmap) {
+ /*
+ * Validate altmap is within bounds of the total request
+ */
+ if (altmap->base_pfn != phys_start_pfn || (altmap->reserve
+ + altmap->free) > nr_pages) {
+ pr_warn_once("memory add fail, invalid altmap\n");
+ return -EINVAL;
+ }
+ altmap->alloc = 0;
+ }
+
for (i = start_sec; i <= end_sec; i++) {
err = __add_section(nid, zone, section_nr_to_pfn(i), altmap);

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c18520831dbc..498193b8811d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4590,6 +4590,10 @@ void __meminit __memmap_init_zone(unsigned long size, int nid,
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;

+ /* skip initializing a number of pfns from the start of the section */
+ if (altmap && start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+
z = &pgdat->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 16ec1675b793..6ea8027daf00 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -86,10 +86,41 @@ static void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node)
return ptr;
}

+/*
+ * Carve a naturally aligned block out of the altmap's pre-allocated pfn
+ * range instead of going to the page allocator.
+ *
+ * @size: allocation size in bytes; must be a power of 2 and >= PAGE_SIZE
+ * @altmap: pfn range to allocate from
+ *
+ * Returns the kernel virtual address (via __va()) of the block, or NULL
+ * when @size is invalid or the altmap has too few free pfns left.
+ */
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+		struct vmem_altmap *altmap)
+{
+	unsigned long pfn, start_pfn = vmem_altmap_next_pfn(altmap);
+	unsigned long align = 0;
+	void *ptr;
+
+	if (!is_power_of_2(size) || size < PAGE_SIZE) {
+		pr_warn_once("%s: allocations must be a power-of-2 multiple of PAGE_SIZE (%lu)\n",
+				__func__, PAGE_SIZE);
+		return NULL;
+	}
+
+	/* from here on 'size' counts pfns rather than bytes */
+	size >>= PAGE_SHIFT;
+	if (start_pfn & (size - 1))
+		align = ALIGN(start_pfn, size) - start_pfn;
+
+	/*
+	 * The alignment padding is consumed together with the block, and
+	 * vmem_altmap_alloc() hands back the first pfn of that combined
+	 * range. Skip the padding so the caller gets the aligned block
+	 * rather than the unaligned start of the padding.
+	 */
+	pfn = vmem_altmap_alloc(altmap, align + size);
+	if (pfn < ULONG_MAX)
+		ptr = __va(__pfn_to_phys(pfn + align));
+	else
+		ptr = NULL;
+	pr_debug("%s: start: %#lx align: %#lx next: %#lx nr: %#lx %p\n",
+			__func__, start_pfn, align,
+			vmem_altmap_next_pfn(altmap), size + align, ptr);
+
+	return ptr;
+}
+
+
/* need to make sure size is all the same during early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
struct vmem_altmap *altmap)
{
+ if (altmap)
+ return altmap_alloc_block_buf(size, altmap);
return __vmemmap_alloc_block_buf(size, node);
}

diff --git a/mm/sparse.c b/mm/sparse.c
index eda783903b1d..529b16509eca 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -369,6 +369,13 @@ static void __init sparse_early_usemaps_alloc_node(void *data,
}

#ifndef CONFIG_SPARSEMEM_VMEMMAP
+struct page __init *sparse_alt_map_populate(unsigned long pnum, int nid,
+ struct vmem_altmap *altmap)
+{
+ pr_warn_once("%s: requires CONFIG_SPARSEMEM_VMEMMAP=y\n", __func__);
+ return NULL; /* !CONFIG_SPARSEMEM_VMEMMAP stub: altmap-backed memmap is unsupported, always fails */
+}
+
struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
{
struct page *map;
@@ -598,7 +605,10 @@ void __init sparse_init(void)
static struct page *alloc_section_memmap(unsigned long pnum, int nid,
struct vmem_altmap *altmap)
{
- return sparse_mem_map_populate(pnum, nid);
+ if (altmap)
+ return sparse_alt_map_populate(pnum, nid, altmap);
+ else
+ return sparse_mem_map_populate(pnum, nid);
}

static inline void free_section_memmap(struct page *memmap,
@@ -607,7 +617,10 @@ static inline void free_section_memmap(struct page *memmap,
unsigned long start = (unsigned long)memmap;
unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);

- __vmemmap_free(start, end, NULL);
+ if (altmap)
+ __vmemmap_free(start, end, altmap);
+ else
+ __vmemmap_free(start, end, NULL);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/