Re: [RFC][Patch v8 6/7] KVM: Enables the kernel to isolate and report free pages

From: Michael S. Tsirkin
Date: Tue Feb 05 2019 - 15:45:36 EST


On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> This patch enables the kernel to scan the per cpu array and
> compress it by removing the repetitive/re-allocated pages.
> Once the per cpu array is completely filled with pages in the
> buddy it wakes up the kernel per cpu thread which re-scans the
> entire per cpu array by acquiring a zone lock corresponding to
> the page which is being scanned. If the page is still free and
> present in the buddy it tries to isolate the page and adds it
> to another per cpu array.
>
> Once this scanning process is complete and if there are any
> isolated pages added to the new per cpu array kernel thread
> invokes hyperlist_ready().
>
> In hyperlist_ready() a hypercall is made to report these pages to
> the host using the virtio-balloon framework. In order to do so
> another virtqueue 'hinting_vq' is added to the balloon framework.
> As the host frees all the reported pages, the kernel thread returns
> them back to the buddy.
>
> Signed-off-by: Nitesh Narayan Lal <nitesh@xxxxxxxxxx>


This looks kind of like what early iterations of Wei's patches did.

But this has lots of issues, for example you might end up with
a hypercall per a 4K page.

So in the end, he switched over to just reporting only
MAX_ORDER - 1 pages.

Would that be a good idea for you too?

An alternative would be a different much lighter weight
way to report these pages and to free them on the host.

> ---
> drivers/virtio/virtio_balloon.c | 56 +++++++-
> include/linux/page_hinting.h | 18 ++-
> include/uapi/linux/virtio_balloon.h | 1 +
> mm/page_alloc.c | 2 +-
> virt/kvm/page_hinting.c | 202 +++++++++++++++++++++++++++-
> 5 files changed, 269 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 728ecd1eea30..8af34e0b9a32 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
> VIRTIO_BALLOON_VQ_INFLATE,
> VIRTIO_BALLOON_VQ_DEFLATE,
> VIRTIO_BALLOON_VQ_STATS,
> + VIRTIO_BALLOON_VQ_HINTING,
> VIRTIO_BALLOON_VQ_FREE_PAGE,
> VIRTIO_BALLOON_VQ_MAX
> };
>
> struct virtio_balloon {
> struct virtio_device *vdev;
> - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
> + *hinting_vq;
>
> /* Balloon's own wq for cpu-intensive work items */
> struct workqueue_struct *balloon_wq;
> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
> { 0 },
> };
>
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
> + int hyper_entries)
> +{
> + u64 gpaddr = virt_to_phys((void *)gvaddr);
> +
> + virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
> + virtqueue_kick_sync(vb->hinting_vq);
> +}
> +
> +static void hinting_ack(struct virtqueue *vq)
> +{
> + struct virtio_balloon *vb = vq->vdev->priv;
> +
> + wake_up(&vb->acked);
> +}
> +
> +static void enable_hinting(struct virtio_balloon *vb)
> +{
> + guest_page_hinting_flag = 1;
> + static_branch_enable(&guest_page_hinting_key);
> + request_hypercall = (void *)&virtballoon_page_hinting;
> + balloon_ptr = vb;
> + WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
> +}
> +
> +static void disable_hinting(void)
> +{
> + guest_page_hinting_flag = 0;
> + static_branch_enable(&guest_page_hinting_key);
> + balloon_ptr = NULL;
> +}
> +#endif
> +
> static u32 page_to_balloon_pfn(struct page *page)
> {
> unsigned long pfn = page_to_pfn(page);
> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
> names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> + names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
>
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
> callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> }
>
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
> + names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
> + callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
> + }
> err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> vqs, callbacks, names, NULL, NULL);
> if (err)
> return err;
>
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> + vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
> +
> vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
> if (err)
> goto out_del_balloon_wq;
> }
> +
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> + enable_hinting(vb);
> +#endif
> virtio_device_ready(vdev);
>
> if (towards_target(vb))
> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev)
> cancel_work_sync(&vb->update_balloon_size_work);
> cancel_work_sync(&vb->update_balloon_stats_work);
>
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> + disable_hinting();
> +#endif
> if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> cancel_work_sync(&vb->report_free_page_work);
> destroy_workqueue(vb->balloon_wq);
> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
> VIRTIO_BALLOON_F_MUST_TELL_HOST,
> VIRTIO_BALLOON_F_STATS_VQ,
> VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> + VIRTIO_BALLOON_F_HINTING,
> VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> VIRTIO_BALLOON_F_PAGE_POISON,
> };
> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
> index e800c6b07561..3ba8c1f3b4a4 100644
> --- a/include/linux/page_hinting.h
> +++ b/include/linux/page_hinting.h
> @@ -1,15 +1,12 @@
> #include <linux/smpboot.h>
>
> -/*
> - * Size of the array which is used to store the freed pages is defined by
> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
> - * we can get rid of the hardcoded array size.
> - */
> #define MAX_FGPT_ENTRIES 1000
> /*
> * hypervisor_pages - It is a dummy structure passed with the hypercall.
> - * @pfn: page frame number for the page which needs to be sent to the host.
> - * @order: order of the page needs to be reported to the host.
> + * @pfn - page frame number for the page which is to be freed.
> + * @pages - number of pages which are supposed to be freed.
> + * A global array object is used to to hold the list of pfn and pages and is
> + * passed as part of the hypercall.
> */
> struct hypervisor_pages {
> unsigned long pfn;
> @@ -19,11 +16,18 @@ struct hypervisor_pages {
> extern int guest_page_hinting_flag;
> extern struct static_key_false guest_page_hinting_key;
> extern struct smp_hotplug_thread hinting_threads;
> +extern void (*request_hypercall)(void *, u64, int);
> +extern void *balloon_ptr;
> extern bool want_page_poisoning;
>
> int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> void __user *buffer, size_t *lenp, loff_t *ppos);
> void guest_free_page(struct page *page, int order);
> +extern int __isolate_free_page(struct page *page, unsigned int order);
> +extern void free_one_page(struct zone *zone,
> + struct page *page, unsigned long pfn,
> + unsigned int order,
> + int migratetype);
>
> static inline void disable_page_poisoning(void)
> {

I guess you will want to put this in some other header. Function
declarations belong close to where they are implemented, not used.

> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index a1966cd7b677..2b0f62814e22 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -36,6 +36,7 @@
> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */
> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
> +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */
>
> /* Size of a PFN in the balloon interface. */
> #define VIRTIO_BALLOON_PFN_SHIFT 12
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d295c9bc01a8..93224cba9243 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
> spin_unlock(&zone->lock);
> }
>
> -static void free_one_page(struct zone *zone,
> +void free_one_page(struct zone *zone,
> struct page *page, unsigned long pfn,
> unsigned int order,
> int migratetype)
> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> index be529f6f2bc0..315099fcda43 100644
> --- a/virt/kvm/page_hinting.c
> +++ b/virt/kvm/page_hinting.c
> @@ -1,6 +1,8 @@
> #include <linux/gfp.h>
> #include <linux/mm.h>
> +#include <linux/page_ref.h>
> #include <linux/kvm_host.h>
> +#include <linux/sort.h>
> #include <linux/kernel.h>
>
> /*
> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
> EXPORT_SYMBOL(guest_page_hinting_flag);
> static DEFINE_PER_CPU(struct task_struct *, hinting_task);
>
> +void (*request_hypercall)(void *, u64, int);
> +EXPORT_SYMBOL(request_hypercall);
> +void *balloon_ptr;
> +EXPORT_SYMBOL(balloon_ptr);
> +
> int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> void __user *buffer, size_t *lenp,
> loff_t *ppos)
> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> return ret;
> }
>
> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
> +{
> + int i = 0;
> + int mt = 0;
> +
> + if (balloon_ptr)
> + request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
> + entries);
> +
> + while (i < entries) {
> + struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
> +
> + mt = get_pageblock_migratetype(page);
> + free_one_page(page_zone(page), page, page_to_pfn(page),
> + guest_isolated_pages[i].order, mt);
> + i++;
> + }
> +}
> +
> +struct page *get_buddy_page(struct page *page)
> +{
> + unsigned long pfn = page_to_pfn(page);
> + unsigned int order;
> +
> + for (order = 0; order < MAX_ORDER; order++) {
> + struct page *page_head = page - (pfn & ((1 << order) - 1));
> +
> + if (PageBuddy(page_head) && page_private(page_head) >= order)
> + return page_head;
> + }
> + return NULL;
> +}
> +
> static void hinting_fn(unsigned int cpu)
> {
> struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> + int idx = 0, ret = 0;
> + struct zone *zone_cur;
> + unsigned long flags = 0;
> +
> + while (idx < MAX_FGPT_ENTRIES) {
> + unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
> + unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
> + (1 << page_hinting_obj->kvm_pt[idx].order) - 1;
> +
> + while (pfn <= pfn_end) {
> + struct page *page = pfn_to_page(pfn);
> + struct page *buddy_page = NULL;
> +
> + zone_cur = page_zone(page);
> + spin_lock_irqsave(&zone_cur->lock, flags);
> +
> + if (PageCompound(page)) {
> + struct page *head_page = compound_head(page);
> + unsigned long head_pfn = page_to_pfn(head_page);
> + unsigned int alloc_pages =
> + 1 << compound_order(head_page);
> +
> + pfn = head_pfn + alloc_pages;
> + spin_unlock_irqrestore(&zone_cur->lock, flags);
> + continue;
> + }
> +
> + if (page_ref_count(page)) {
> + pfn++;
> + spin_unlock_irqrestore(&zone_cur->lock, flags);
> + continue;
> + }
> +
> + if (PageBuddy(page)) {
> + int buddy_order = page_private(page);
>
> + ret = __isolate_free_page(page, buddy_order);
> + if (!ret) {
> + } else {
> + int l_idx = page_hinting_obj->hyp_idx;
> + struct hypervisor_pages *l_obj =
> + page_hinting_obj->hypervisor_pagelist;
> +
> + l_obj[l_idx].pfn = pfn;
> + l_obj[l_idx].order = buddy_order;
> + page_hinting_obj->hyp_idx += 1;
> + }
> + pfn = pfn + (1 << buddy_order);
> + spin_unlock_irqrestore(&zone_cur->lock, flags);
> + continue;
> + }
> +
> + buddy_page = get_buddy_page(page);
> + if (buddy_page) {
> + int buddy_order = page_private(buddy_page);
> +
> + ret = __isolate_free_page(buddy_page,
> + buddy_order);
> + if (!ret) {
> + } else {
> + int l_idx = page_hinting_obj->hyp_idx;
> + struct hypervisor_pages *l_obj =
> + page_hinting_obj->hypervisor_pagelist;
> + unsigned long buddy_pfn =
> + page_to_pfn(buddy_page);
> +
> + l_obj[l_idx].pfn = buddy_pfn;
> + l_obj[l_idx].order = buddy_order;
> + page_hinting_obj->hyp_idx += 1;
> + }
> + pfn = page_to_pfn(buddy_page) +
> + (1 << buddy_order);
> + spin_unlock_irqrestore(&zone_cur->lock, flags);
> + continue;
> + }
> + spin_unlock_irqrestore(&zone_cur->lock, flags);
> + pfn++;
> + }
> + page_hinting_obj->kvm_pt[idx].pfn = 0;
> + page_hinting_obj->kvm_pt[idx].order = -1;
> + page_hinting_obj->kvm_pt[idx].zonenum = -1;
> + idx++;
> + }
> + if (page_hinting_obj->hyp_idx > 0) {
> + hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
> + page_hinting_obj->hyp_idx);
> + page_hinting_obj->hyp_idx = 0;
> + }
> page_hinting_obj->kvm_pt_idx = 0;
> put_cpu_var(hinting_obj);
> }
>
> +int if_exist(struct page *page)
> +{
> + int i = 0;
> + struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +
> + while (i < MAX_FGPT_ENTRIES) {
> + if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
> + return 1;
> + i++;
> + }
> + return 0;
> +}
> +
> +void pack_array(void)
> +{
> + int i = 0, j = 0;
> + struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +
> + while (i < MAX_FGPT_ENTRIES) {
> + if (page_hinting_obj->kvm_pt[i].pfn != 0) {
> + if (i != j) {
> + page_hinting_obj->kvm_pt[j].pfn =
> + page_hinting_obj->kvm_pt[i].pfn;
> + page_hinting_obj->kvm_pt[j].order =
> + page_hinting_obj->kvm_pt[i].order;
> + page_hinting_obj->kvm_pt[j].zonenum =
> + page_hinting_obj->kvm_pt[i].zonenum;
> + }
> + j++;
> + }
> + i++;
> + }
> + i = j;
> + page_hinting_obj->kvm_pt_idx = j;
> + while (j < MAX_FGPT_ENTRIES) {
> + page_hinting_obj->kvm_pt[j].pfn = 0;
> + page_hinting_obj->kvm_pt[j].order = -1;
> + page_hinting_obj->kvm_pt[j].zonenum = -1;
> + j++;
> + }
> +}
> +
> void scan_array(void)
> {
> struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> + int i = 0;
>
> + while (i < MAX_FGPT_ENTRIES) {
> + struct page *page =
> + pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
> + struct page *buddy_page = get_buddy_page(page);
> +
> + if (!PageBuddy(page) && buddy_page) {
> + if (if_exist(buddy_page)) {
> + page_hinting_obj->kvm_pt[i].pfn = 0;
> + page_hinting_obj->kvm_pt[i].order = -1;
> + page_hinting_obj->kvm_pt[i].zonenum = -1;
> + } else {
> + page_hinting_obj->kvm_pt[i].pfn =
> + page_to_pfn(buddy_page);
> + page_hinting_obj->kvm_pt[i].order =
> + page_private(buddy_page);
> + }
> + }
> + i++;
> + }
> + pack_array();
> if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> wake_up_process(__this_cpu_read(hinting_task));
> }
> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
> page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
> order;
> page_hinting_obj->kvm_pt_idx += 1;
> - if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> + if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
> + /*
> + * We are depending on the buddy free-list to identify
> + * if a page is free or not. Hence, we are dumping all
> + * the per-cpu pages back into the buddy allocator. This
> + * will ensure less failures when we try to isolate free
> + * captured pages and hence more memory reporting to the
> + * host.
> + */
> + drain_local_pages(NULL);
> scan_array();
> + }
> }
> local_irq_restore(flags);
> }
> --
> 2.17.2