Re: [Xen-devel] [PATCH] xen/p2m: fix extra memory regions accounting

From: Juergen Gross
Date: Fri Sep 04 2015 - 01:07:49 EST


Hi Roger,

On 09/03/2015 05:39 PM, Roger Pau Monné wrote:
> On 03/09/15 at 17:20, Juergen Gross wrote:
>> On 09/03/2015 05:01 PM, David Vrabel wrote:
>>> On 03/09/15 15:55, Juergen Gross wrote:
>>>> On 09/03/2015 04:52 PM, David Vrabel wrote:
>>>>> On 03/09/15 15:45, David Vrabel wrote:
>>>>>> On 03/09/15 15:38, Roger Pau Monné wrote:
>>>>>>> On 03/09/15 at 14:25, Juergen Gross wrote:
>>>>>>>> On 09/03/2015 02:05 PM, Roger Pau Monne wrote:
>>>>>>>>> On systems with memory maps with ranges that don't end at page
>>>>>>>>> boundaries, like:
>>>>>>>>>
>>>>>>>>> [...]
>>>>>>>>> (XEN) 0000000000100000 - 00000000dfdf9c00 (usable)
>>>>>>>>> (XEN) 00000000dfdf9c00 - 00000000dfe4bc00 (ACPI NVS)
>>>>>>>>> [...]
>>>>>>>>>
>>>>>>>>> xen_add_extra_mem will create a protected range that ends up at
>>>>>>>>> 0xdfdf9c00, but the function used to check if a memory address is
>>>>>>>>> inside of a protected range works with pfns, which means that an
>>>>>>>>> attempt to map 0xdfdf9c00 will be refused because the check is
>>>>>>>>> performed against 0xdfdf9000 instead of 0xdfdf9c00.
>>>>>>>>>
>>>>>>>>> In order to fix this, make sure that the ranges that are added to
>>>>>>>>> the xen_extra_mem array are aligned to page boundaries.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
>>>>>>>>> Cc: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
>>>>>>>>> Cc: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
>>>>>>>>> Cc: David Vrabel <david.vrabel@xxxxxxxxxx>
>>>>>>>>> Cc: Juergen Gross <jgross@xxxxxxxx>
>>>>>>>>> Cc: xen-devel@xxxxxxxxxxxxxxxxxxxx
>>>>>>>>> ---
>>>>>>>>> AFAICT this patch needs to be backported to 3.19, 4.0, 4.1 and 4.2.
>>>>>>>>> ---
>>>>>>>>>  arch/x86/xen/setup.c | 6 ++++++
>>>>>>>>>  1 file changed, 6 insertions(+)

>>>>>>>>> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
>>>>>>>>> index 55f388e..dcf5865 100644
>>>>>>>>> --- a/arch/x86/xen/setup.c
>>>>>>>>> +++ b/arch/x86/xen/setup.c
>>>>>>>>> @@ -68,6 +68,9 @@ static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
>>>>>>>>>  {
>>>>>>>>>  	int i;
>>>>>>>>>
>>>>>>>>> +	start = PAGE_ALIGN(start);
>>>>>>>>> +	size &= PAGE_MASK;

>>>>>>>> This is not correct. If start wasn't page aligned and size was,
>>>>>>>> you'll add one additional page to xen_extra_mem.

>>>>>>> I'm not understanding this, let's put an example:
>>>>>>>
>>>>>>> start = 0x8c00
>>>>>>> size = 0x1000
>>>>>>>
>>>>>>> After the fixup added above this would become:
>>>>>>>
>>>>>>> start = 0x9000
>>>>>>> size = 0x1000
>>>>>>>
>>>>>>> So if anything, I'm adding one page less (because 0x8000 was partly
>>>>>>> added, and with the fixup it is not added).
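
The arithmetic both sides describe here can be checked in isolation; a
minimal userspace sketch, assuming 4 KiB pages, with PAGE_ALIGN/PAGE_MASK
defined locally to mirror the kernel's semantics:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long start = 0x8c00, size = 0x1000;

	printf("before: [%#lx, %#lx)\n", start, start + size);	/* [0x8c00, 0x9c00) */
	start = PAGE_ALIGN(start);	/* rounds up to 0x9000 */
	size &= PAGE_MASK;		/* 0x1000, already aligned */
	printf("after:  [%#lx, %#lx)\n", start, start + size);	/* [0x9000, 0xa000) */
	/*
	 * The partial page at 0x8000 is dropped (Roger's point), but the
	 * end moves from 0x9c00 to 0xa000, pulling 0x9c00-0x9fff into the
	 * range even though it was never part of it (Juergen's point below).
	 */
	return 0;
}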

>>>>>> We expand the reserved (i.e., non-RAM) areas down so they're fully
>>>>>> covered with whole pages when we depopulate and 1:1 map them; we
>>>>>> should add extra memory regions that cover these same areas.

>>>>> Ignore this. This was nonsense.
>>>>>
>>>>> We expand the reserved (i.e., non-RAM) areas so they're fully covered
>>>>> with whole pages when we depopulate and 1:1 map them; we should add
>>>>> the extra memory such that it does not overlap with the expanded
>>>>> regions, i.e., round up the start and round down the end (like Roger's
>>>>> patch does).

>>>> Nearly. Roger's patch rounds up start and rounds down the size. It
>>>> might add non-RAM partial pages to xen_extra_mem.

>>> Yes. You're right.

>> Hmm, thinking more about it, I'd prefer to change xen_extra_mem to use
>> pfns instead of physical addresses. This would make things much more
>> clear.
>>
>> Roger, do you want to do the patch or should I?

> I can certainly take care of it if you are busy, otherwise I leave it to
> you since you have more expertise on it :).

Could you try the attached patch? It should do the job. It is booting
fine on my laptop, but I think you should try it on the machine with
the memory ranges that don't end at page boundaries.
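
For illustration only (not part of the patch): applying the PFN_UP/PFN_DOWN
rounding that the patched xen_memory_setup() uses to the boundary from the
original report, with the pfn macros defined locally under a 4 KiB page
assumption:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PFN_UP(x)	(((x) + (1UL << PAGE_SHIFT) - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
#define PFN_PHYS(x)	((x) << PAGE_SHIFT)

int main(void)
{
	/* a chunk ending at the problematic 0xdfdf9c00 boundary */
	unsigned long addr = 0x100000, chunk_end = 0xdfdf9c00;
	unsigned long pfn_s = PFN_UP(addr);			/* 0x100 */
	unsigned long n_pfns = PFN_DOWN(chunk_end) - pfn_s;	/* up to pfn 0xdfdf9 */

	printf("extra mem: pfns [%#lx, %#lx) = addrs [%#lx, %#lx)\n",
	       pfn_s, pfn_s + n_pfns,
	       PFN_PHYS(pfn_s), PFN_PHYS(pfn_s + n_pfns));
	/*
	 * The partial page 0xdfdf9000-0xdfdf9bff is excluded entirely, so a
	 * pfn-based check like xen_chk_extra_mem() and the stored region
	 * can no longer disagree about it.
	 */
	return 0;
}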


Juergen

commit 3d0f8aa4d1b4c9c16a81902a197b5d6a77e182a0
Author: Juergen Gross <jgross@xxxxxxxx>
Date: Thu Sep 3 17:44:04 2015 +0200

    xen: switch extra memory accounting to use pfns

    Instead of using physical addresses for accounting of extra memory
    areas available for ballooning, switch to pfns, as this is much less
    error prone regarding partial pages.

    Reported-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
    Signed-off-by: Juergen Gross <jgross@xxxxxxxx>

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 7a5d566..aa58bc4 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -90,62 +90,65 @@ static void __init xen_parse_512gb(void)
 	xen_512gb_limit = val;
 }
 
-static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
+static void __init xen_add_extra_mem(unsigned long start_pfn,
+				     unsigned long n_pfns)
 {
 	int i;
 
 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
 		/* Add new region. */
-		if (xen_extra_mem[i].size == 0) {
-			xen_extra_mem[i].start = start;
-			xen_extra_mem[i].size = size;
+		if (xen_extra_mem[i].n_pfns == 0) {
+			xen_extra_mem[i].start_pfn = start_pfn;
+			xen_extra_mem[i].n_pfns = n_pfns;
 			break;
 		}
 		/* Append to existing region. */
-		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
-			xen_extra_mem[i].size += size;
+		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+		    start_pfn) {
+			xen_extra_mem[i].n_pfns += n_pfns;
 			break;
 		}
 	}
 	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
 		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
 
-	memblock_reserve(start, size);
+	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
-static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
+static void __init xen_del_extra_mem(unsigned long start_pfn,
+				     unsigned long n_pfns)
 {
 	int i;
-	phys_addr_t start_r, size_r;
+	unsigned long start_r, size_r;
 
 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-		start_r = xen_extra_mem[i].start;
-		size_r = xen_extra_mem[i].size;
+		start_r = xen_extra_mem[i].start_pfn;
+		size_r = xen_extra_mem[i].n_pfns;
 
 		/* Start of region. */
-		if (start_r == start) {
-			BUG_ON(size > size_r);
-			xen_extra_mem[i].start += size;
-			xen_extra_mem[i].size -= size;
+		if (start_r == start_pfn) {
+			BUG_ON(n_pfns > size_r);
+			xen_extra_mem[i].start_pfn += n_pfns;
+			xen_extra_mem[i].n_pfns -= n_pfns;
 			break;
 		}
 		/* End of region. */
-		if (start_r + size_r == start + size) {
-			BUG_ON(size > size_r);
-			xen_extra_mem[i].size -= size;
+		if (start_r + size_r == start_pfn + n_pfns) {
+			BUG_ON(n_pfns > size_r);
+			xen_extra_mem[i].n_pfns -= n_pfns;
 			break;
 		}
 		/* Mid of region. */
-		if (start > start_r && start < start_r + size_r) {
-			BUG_ON(start + size > start_r + size_r);
-			xen_extra_mem[i].size = start - start_r;
+		if (start_pfn > start_r && start_pfn < start_r + size_r) {
+			BUG_ON(start_pfn + n_pfns > start_r + size_r);
+			xen_extra_mem[i].n_pfns = start_pfn - start_r;
 			/* Calling memblock_reserve() again is okay. */
-			xen_add_extra_mem(start + size, start_r + size_r -
-					  (start + size));
+			xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
+					  (start_pfn + n_pfns));
 			break;
 		}
 	}
-	memblock_free(start, size);
+	memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
 /*
@@ -156,11 +159,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
 unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
 {
 	int i;
-	phys_addr_t addr = PFN_PHYS(pfn);
 
 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-		if (addr >= xen_extra_mem[i].start &&
-		    addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
+		if (pfn >= xen_extra_mem[i].start_pfn &&
+		    pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
 			return INVALID_P2M_ENTRY;
 	}
@@ -176,10 +178,10 @@ void __init xen_inv_extra_mem(void)
 	int i;
 
 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-		if (!xen_extra_mem[i].size)
+		if (!xen_extra_mem[i].n_pfns)
 			continue;
-		pfn_s = PFN_DOWN(xen_extra_mem[i].start);
-		pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
+		pfn_s = xen_extra_mem[i].start_pfn;
+		pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
 		for (pfn = pfn_s; pfn < pfn_e; pfn++)
 			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 	}
@@ -507,7 +509,7 @@ void __init xen_remap_memory(void)
 		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
 			len += xen_remap_buf.size;
 		} else {
-			xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+			xen_del_extra_mem(pfn_s, len);
 			pfn_s = xen_remap_buf.target_pfn;
 			len = xen_remap_buf.size;
 		}
@@ -517,7 +519,7 @@ void __init xen_remap_memory(void)
 	}
 
 	if (pfn_s != ~0UL && len)
-		xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+		xen_del_extra_mem(pfn_s, len);
 
 	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);

@@ -744,7 +746,7 @@ static void __init xen_reserve_xen_mfnlist(void)
  **/
 char * __init xen_memory_setup(void)
 {
-	unsigned long max_pfn;
+	unsigned long max_pfn, pfn_s, n_pfns;
 	phys_addr_t mem_end, addr, size, chunk_size;
 	u32 type;
 	int rc;
@@ -831,9 +833,11 @@ char * __init xen_memory_setup(void)
 			chunk_size = min(size, mem_end - addr);
 		} else if (extra_pages) {
 			chunk_size = min(size, PFN_PHYS(extra_pages));
-			extra_pages -= PFN_DOWN(chunk_size);
-			xen_add_extra_mem(addr, chunk_size);
-			xen_max_p2m_pfn = PFN_DOWN(addr + chunk_size);
+			pfn_s = PFN_UP(addr);
+			n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
+			extra_pages -= n_pfns;
+			xen_add_extra_mem(pfn_s, n_pfns);
+			xen_max_p2m_pfn = pfn_s + n_pfns;
 		} else
 			type = E820_UNUSABLE;
 	}
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index bf4a23c..1fa633b 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -638,9 +638,9 @@ static int __init balloon_init(void)
 	 * regions (see arch/x86/xen/setup.c).
 	 */
 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++)
-		if (xen_extra_mem[i].size)
-			balloon_add_region(PFN_UP(xen_extra_mem[i].start),
-					   PFN_DOWN(xen_extra_mem[i].size));
+		if (xen_extra_mem[i].n_pfns)
+			balloon_add_region(xen_extra_mem[i].start_pfn,
+					   xen_extra_mem[i].n_pfns);
 
 	return 0;
 }
diff --git a/include/xen/page.h b/include/xen/page.h
index c5ed20b..a5983da 100644
--- a/include/xen/page.h
+++ b/include/xen/page.h
@@ -9,8 +9,8 @@ static inline unsigned long page_to_mfn(struct page *page)
 }
 
 struct xen_memory_region {
-	phys_addr_t start;
-	phys_addr_t size;
+	unsigned long start_pfn;
+	unsigned long n_pfns;
 };
 
 #define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */
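
As a closing illustration of the new accounting, here is a hypothetical
standalone harness (not kernel code) that mirrors the patched
xen_add_extra_mem() merge logic, minus the memblock_reserve() call, and
shows how adjacent pfn ranges coalesce:

#include <stdio.h>

#define XEN_EXTRA_MEM_MAX_REGIONS 128

struct xen_memory_region {
	unsigned long start_pfn;
	unsigned long n_pfns;
};

static struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS];

static void add_extra_mem(unsigned long start_pfn, unsigned long n_pfns)
{
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		if (xen_extra_mem[i].n_pfns == 0) {		/* new region */
			xen_extra_mem[i].start_pfn = start_pfn;
			xen_extra_mem[i].n_pfns = n_pfns;
			break;
		}
		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
		    start_pfn) {				/* append */
			xen_extra_mem[i].n_pfns += n_pfns;
			break;
		}
	}
}

int main(void)
{
	add_extra_mem(0x100, 0x200);
	add_extra_mem(0x300, 0x100);	/* adjacent: merges with region 0 */
	printf("region 0: pfns [%#lx, %#lx)\n", xen_extra_mem[0].start_pfn,
	       xen_extra_mem[0].start_pfn + xen_extra_mem[0].n_pfns);
	return 0;
}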