[RFC PATCH 1/1] Move two pinned pages to non-movable node in kvm.

From: Tang Chen
Date: Wed Jun 18 2014 - 01:49:19 EST


Hi,

I met a problem when offlining memory with a kvm guest running.


[Problem]
When qemu creates vpus, it will call the following two functions
to allocate two pages:
1. alloc_apic_access_page(): allocate apic access page for FlexPriority in intel cpu.
2. alloc_identity_pagetable(): allocate ept identity pagetable for real mode.

And unfortunately, these two pages will be pinned in memory, and they cannot
be migrated. As a result, they cannot be offlined. And memory hot-remove will fail.



[The way I tried]
I tried to migrate these two pages, but I think I cannot find a proper way
to migrate them.

Let's take ept identity pagetable for example:
In my opinion, since it is pagetable, CPU will access this page every time the guest
read/write memory. For example, the following code will access memory:
int a;
a = 0;
So this ept identity pagetable page can be accessed at any time by CPU automatically.



[Solution]
I have a basic idea to solve this problem: allocate these two pages in non-movable nodes.
(For now, we can only hot-remove memory in movable nodes.)

alloc_identity_pagetable()
|-> __kvm_set_memory_region()
| |-> kvm_arch_prepare_memory_region()
| |-> userspace_addr = vm_mmap();
| |-> memslot->userspace_addr = userspace_addr; /* map usespace address (qemu) */
|
| /*
| * Here, set memory policy for the mapped but not allocated page,
| * make it can only be allocated in non-movable nodes.
| * (We can reuse "numa_kernel_nodes" node mask in movable_node functionality.)
| */
|
|-> page = gfn_to_page() /* allocate and pin page */

Please refer to the attached patch for detail.
I did some basic test for the patch, and it will make memory offline succeed.



[Questions]
And by the way, would you guys please answer the following questions for me ?

1. What's the ept identity pagetable for ? Only one page is enough ?

2. Is the ept identity pagetable only used in realmode ?
Can we free it once the guest is up (vcpu in protect mode)?

3. Now, ept identity pagetable is allocated in qemu userspace.
Can we allocate it in kernel space ?

4. If I want to migrate these two pages, what do you think is the best way ?

Thanks.


Signed-off-by: Tang Chen <tangchen@xxxxxxxxxxxxxx>
---
arch/x86/include/asm/numa.h | 1 +
arch/x86/kvm/vmx.c | 5 +++++
arch/x86/kvm/x86.c | 1 +
arch/x86/mm/numa.c | 3 ++-
include/linux/mempolicy.h | 6 ++++++
mm/mempolicy.c | 9 +++++++++
6 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 4064aca..6312577 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -30,6 +30,7 @@ extern int numa_off;
*/
extern s16 __apicid_to_node[MAX_LOCAL_APIC];
extern nodemask_t numa_nodes_parsed __initdata;
+extern nodemask_t numa_kernel_nodes;

extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
extern void __init numa_set_distance(int from, int to, int distance);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 801332e..4a3b5b5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -32,6 +32,7 @@
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
+#include <linux/mempolicy.h>
#include "kvm_cache_regs.h"
#include "x86.h"

@@ -3988,6 +3989,8 @@ static int alloc_apic_access_page(struct kvm *kvm)
if (r)
goto out;

+ numa_bind_non_movable(kvm_userspace_mem.userspace_addr, PAGE_SIZE);
+
page = gfn_to_page(kvm, 0xfee00);
if (is_error_page(page)) {
r = -EFAULT;
@@ -4018,6 +4021,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
if (r)
goto out;

+ numa_bind_non_movable(kvm_userspace_mem.userspace_addr, PAGE_SIZE);
+
page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
if (is_error_page(page)) {
r = -EFAULT;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f32a025..3962a23 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7295,6 +7295,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
return PTR_ERR((void *)userspace_addr);

memslot->userspace_addr = userspace_addr;
+ mem->userspace_addr = userspace_addr;
}

return 0;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a32b706..d706148 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -22,6 +22,8 @@

int __initdata numa_off;
nodemask_t numa_nodes_parsed __initdata;
+nodemask_t numa_kernel_nodes;
+EXPORT_SYMBOL(numa_kernel_nodes);

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
@@ -557,7 +559,6 @@ static void __init numa_init_array(void)
static void __init numa_clear_kernel_node_hotplug(void)
{
int i, nid;
- nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
unsigned long start, end;
struct memblock_region *r;

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index f230a97..14f3f04 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -140,6 +140,7 @@ bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma);

extern void numa_default_policy(void);
extern void numa_policy_init(void);
+extern long numa_bind_non_movable(unsigned long start, unsigned long len);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
enum mpol_rebind_step step);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
@@ -243,6 +244,11 @@ static inline void numa_default_policy(void)
{
}

+static inline long numa_bind_non_movable(unsigned long start, unsigned long len)
+{
+ return -EINVAL;
+}
+
static inline void mpol_rebind_task(struct task_struct *tsk,
const nodemask_t *new,
enum mpol_rebind_step step)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2849742..20065a9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -97,6 +97,7 @@

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
+#include <asm/numa.h>
#include <linux/random.h>

#include "internal.h"
@@ -2663,6 +2664,14 @@ void numa_default_policy(void)
do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

+/* Bind a memory range to non-movable nodes. */
+long numa_bind_non_movable(unsigned long start, unsigned long len)
+{
+ return do_mbind(start, len, MPOL_BIND, MPOL_MODE_FLAGS,
+ &numa_kernel_nodes, MPOL_MF_STRICT);
+}
+EXPORT_SYMBOL(numa_bind_non_movable);
+
/*
* Parse and format mempolicy from/to strings
*/
--
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/