Re: [v3 0/9] parallelized "struct page" zeroing

From: Michal Hocko
Date: Tue May 09 2017 - 14:12:43 EST


On Fri 05-05-17 13:03:07, Pavel Tatashin wrote:
> Changelog:
> v2 - v3
> - Addressed David's comments about one change per patch:
> * Splited changes to platforms into 4 patches
> * Made "do not zero vmemmap_buf" as a separate patch
> v1 - v2
> - Per request, added s390 to deferred "struct page" zeroing
> - Collected performance data on x86 which proofs the importance to
> keep memset() as prefetch (see below).
>
> When deferred struct page initialization feature is enabled, we get a
> performance gain of initializing vmemmap in parallel after other CPUs are
> started. However, we still zero the memory for vmemmap using one boot CPU.
> This patch-set fixes the memset-zeroing limitation by deferring it as well.

I like the idea of postponing the zeroing from the allocation to the
init time. To be honest the improvement looks much larger than I would
expect (Btw. this should be a part of the changelog rather than a
outside link).

The implementation just looks too large to what I would expect. E.g. do
we really need to add zero argument to the large part of the memblock
API? Wouldn't it be easier to simply export memblock_virt_alloc_internal
(or its tiny wrapper memblock_virt_alloc_core) and move the zeroing
outside to its 2 callers? A completely untested scratched version at the
end of the email.

Also it seems that this is not 100% correct either as it only cares
about VMEMMAP while DEFERRED_STRUCT_PAGE_INIT might be enabled also for
SPARSEMEM. This would suggest that we would zero out pages twice,
right?

A similar concern would go to the memory hotplug patch which will
fall back to the slab/page allocator IIRC. On the other hand
__init_single_page is shared with the hotplug code so again we would
initialize 2 times.

So I suspect more changes are needed. I will have a closer look tomorrow.

> arch/powerpc/mm/init_64.c | 4 +-
> arch/s390/mm/vmem.c | 5 ++-
> arch/sparc/mm/init_64.c | 26 +++++++----------------
> arch/x86/mm/init_64.c | 3 +-
> include/linux/bootmem.h | 3 ++
> include/linux/mm.h | 15 +++++++++++--
> mm/memblock.c | 46 ++++++++++++++++++++++++++++++++++++------
> mm/page_alloc.c | 3 ++
> mm/sparse-vmemmap.c | 48 +++++++++++++++++++++++++++++---------------
> 9 files changed, 103 insertions(+), 50 deletions(-)


The bootmem API change mentioned above.

include/linux/bootmem.h | 3 +++
mm/memblock.c | 41 ++++++++++++++++++++++++++---------------
mm/sparse-vmemmap.c | 2 +-
3 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 962164d36506..c9a08463d9a8 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -160,6 +160,9 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
#define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)

/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
+void * memblock_virt_alloc_core(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid);
void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
phys_addr_t align, phys_addr_t min_addr,
phys_addr_t max_addr, int nid);
diff --git a/mm/memblock.c b/mm/memblock.c
index b049c9b2dba8..eab7da94f873 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1271,8 +1271,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
*
* The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
*
- * The phys address of allocated boot memory block is converted to virtual and
- * allocated memory is reset to 0.
+ * The function has to be zeroed out explicitly.
*
* In addition, function sets the min_count to 0 using kmemleak_alloc for
* allocated boot memory block, so that it is never reported as leaks.
@@ -1280,15 +1279,18 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
* RETURNS:
* Virtual address of allocated memory block on success, NULL on failure.
*/
-static void * __init memblock_virt_alloc_internal(
+static inline void * __init memblock_virt_alloc_internal(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
- int nid)
+ int nid, void *caller)
{
phys_addr_t alloc;
void *ptr;
ulong flags = choose_memblock_flags();

+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr, caller);
if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
nid = NUMA_NO_NODE;

@@ -1334,7 +1336,6 @@ static void * __init memblock_virt_alloc_internal(
return NULL;
done:
ptr = phys_to_virt(alloc);
- memset(ptr, 0, size);

/*
* The min_count is set to 0 so that bootmem allocated blocks
@@ -1347,6 +1348,14 @@ static void * __init memblock_virt_alloc_internal(
return ptr;
}

+void * __init memblock_virt_alloc_core(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ return memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid,
+ (void *)_RET_IP_);
+}
+
/**
* memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
* @size: size of memory block to be allocated in bytes
@@ -1369,11 +1378,14 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
{
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
- return memblock_virt_alloc_internal(size, align, min_addr,
- max_addr, nid);
+ void *ptr;
+
+ ptr = memblock_virt_alloc_internal(size, align, min_addr,
+ max_addr, nid, (void *)_RET_IP_);
+ if (ptr)
+ memset(ptr, 0, size);
+
+ return ptr;
}

/**
@@ -1401,13 +1413,12 @@ void * __init memblock_virt_alloc_try_nid(
{
void *ptr;

- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
- min_addr, max_addr, nid);
- if (ptr)
+ min_addr, max_addr, nid, (void *)_RET_IP_);
+ if (ptr) {
+ memset(ptr, 0, size);
return ptr;
+ }

panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
__func__, (u64)size, (u64)align, nid, (u64)min_addr,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a56c3989f773..4e060f0f9fe5 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -41,7 +41,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return memblock_virt_alloc_try_nid(size, align, goal,
+ return memblock_virt_alloc_core(size, align, goal,
BOOTMEM_ALLOC_ACCESSIBLE, node);
}

--
Michal Hocko
SUSE Labs