Re: [RFC PATCH 4/4] x86/Hyper-V: Add memory hot remove function

From: Vitaly Kuznetsov
Date: Wed Dec 11 2019 - 10:06:44 EST


lantianyu1986@xxxxxxxxx writes:

> From: Tianyu Lan <Tianyu.Lan@xxxxxxxxxxxxx>
>
> Hyper-V provides a dynamic memory hot add/remove function.
> Memory hot-add has already been enabled in the Hyper-V balloon
> driver. Now add the memory hot-remove function.
>
> When the driver receives a hot-remove msg, it first checks whether
> the requested number of pages to remove is aligned with the
> hot-plug unit (128MB). If there are remainder pages (pages % 128MB),
> handle the remainder pages the balloon way (allocate pages, offline
> them and return them to Hyper-V).
>
> To remove memory chunks, search memory in the hot-add blocks first
> and then in other system memory.
>
> Hyper-V has a bug where it sends an unballoon msg to request memory
> hot-add after doing memory hot-remove. Fix this by handling all
> unballoon msgs with the memory hot-add operation.
>
> Signed-off-by: Tianyu Lan <Tianyu.Lan@xxxxxxxxxxxxx>
> ---
> drivers/hv/hv_balloon.c | 686 +++++++++++++++++++++++++++++++++++++++++++-----

This patch is too big to review and the logic in it is not trivial at
all. Please try to split this into a series so we can take a look.

> 1 file changed, 616 insertions(+), 70 deletions(-)
>
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 4d1a3b1e2490..015e9e993188 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -19,6 +19,7 @@
> #include <linux/completion.h>
> #include <linux/memory_hotplug.h>
> #include <linux/memory.h>
> +#include <linux/memblock.h>
> #include <linux/notifier.h>
> #include <linux/percpu_counter.h>
>
> @@ -46,12 +47,17 @@
> * Changes to 0.2 on 2009/05/14
> * Changes to 0.3 on 2009/12/03
> * Changed to 1.0 on 2011/04/05
> + * Changed to 2.0 on 2019/12/10
> */
>
> #define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor)))
> #define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
> #define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff)
>
> +#define MAX_HOT_REMOVE_ENTRIES \
> + ((PAGE_SIZE - sizeof(struct dm_hot_remove_response)) \
> + / sizeof(union dm_mem_page_range))
> +
> enum {
> DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
> DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
> @@ -91,7 +97,13 @@ enum dm_message_type {
> * Version 1.0.
> */
> DM_INFO_MESSAGE = 12,
> - DM_VERSION_1_MAX = 12
> + DM_VERSION_1_MAX = 12,
> +
> + /*
> + * Version 2.0
> + */
> + DM_MEM_HOT_REMOVE_REQUEST = 13,
> + DM_MEM_HOT_REMOVE_RESPONSE = 14
> };
>
>
> @@ -120,7 +132,8 @@ union dm_caps {
> * represents an alignment of 2^n in mega bytes.
> */
> __u64 hot_add_alignment:4;
> - __u64 reservedz:58;
> + __u64 hot_remove:1;
> + __u64 reservedz:57;
> } cap_bits;
> __u64 caps;
> } __packed;
> @@ -231,7 +244,9 @@ struct dm_capabilities {
> struct dm_capabilities_resp_msg {
> struct dm_header hdr;
> __u64 is_accepted:1;
> - __u64 reservedz:63;
> + __u64 hot_remove:1;
> + __u64 suppress_pressure_reports:1;
> + __u64 reservedz:61;
> } __packed;
>
> /*
> @@ -376,6 +391,27 @@ struct dm_hot_add_response {
> __u32 result;
> } __packed;
>
> +struct dm_hot_remove {
> + struct dm_header hdr;
> + __u32 virtual_node;
> + __u32 page_count;
> + __u32 qos_flags;
> + __u32 reservedZ;
> +} __packed;
> +
> +struct dm_hot_remove_response {
> + struct dm_header hdr;
> + __u32 result;
> + __u32 range_count;
> + __u64 more_pages:1;
> + __u64 reservedz:63;
> + union dm_mem_page_range range_array[];
> +} __packed;
> +
> +#define DM_REMOVE_QOS_LARGE (1 << 0)
> +#define DM_REMOVE_QOS_LOCAL (1 << 1)
> +#define DM_REMOVE_QoS_MASK (0x3)

Capitalize 'QoS' (i.e. DM_REMOVE_QOS_MASK) so it matches the previous
two lines, please.

> +
> /*
> * Types of information sent from host to the guest.
> */
> @@ -457,6 +493,13 @@ struct hot_add_wrk {
> struct work_struct wrk;
> };
>
> +struct hot_remove_wrk {
> + __u32 virtual_node;
> + __u32 page_count;
> + __u32 qos_flags;
> + struct work_struct wrk;
> +};
> +
> static bool hot_add = true;
> static bool do_hot_add;
> /*
> @@ -489,6 +532,7 @@ enum hv_dm_state {
> DM_BALLOON_UP,
> DM_BALLOON_DOWN,
> DM_HOT_ADD,
> + DM_HOT_REMOVE,
> DM_INIT_ERROR
> };
>
> @@ -515,11 +559,13 @@ struct hv_dynmem_device {
> * State to manage the ballooning (up) operation.
> */
> struct balloon_state balloon_wrk;
> + struct balloon_state unballoon_wrk;
>
> /*
> * State to execute the "hot-add" operation.

This comment is stale now: the newly added hr_wrk ends up under it too.

> */
> struct hot_add_wrk ha_wrk;
> + struct hot_remove_wrk hr_wrk;

Do we actually want two work structs and all the problems with their
serialization? Can we get away with one?
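
Something like the below is what I have in mind, a rough, untested
sketch (the names 'hotplug_wrk', 'hotplug_req' and the op enum are
invented, and hv_dynmem_device would carry a single 'hotplug_wrk'
member instead of ha_wrk/hr_wrk): one work item with an op field, so
hot-add and hot-remove requests are serialized by construction:

enum dm_hotplug_op {
        DM_HOTPLUG_ADD,
        DM_HOTPLUG_REMOVE,
};

struct hotplug_wrk {
        enum dm_hotplug_op op;

        /* hot-add parameters (from DM_MEM_HOT_ADD_REQUEST) */
        union dm_mem_page_range ha_page_range;
        union dm_mem_page_range ha_region_range;

        /* hot-remove parameters (from DM_MEM_HOT_REMOVE_REQUEST) */
        __u32 virtual_node;
        __u32 page_count;
        __u32 qos_flags;

        struct work_struct wrk;
};

static void hotplug_req(struct work_struct *dummy)
{
        struct hotplug_wrk *wrk = &dm_device.hotplug_wrk;

        switch (wrk->op) {
        case DM_HOTPLUG_ADD:
                /* current hot_add_req() body */
                break;
        case DM_HOTPLUG_REMOVE:
                /* current hot_remove_req() body */
                break;
        }
}

The channel callback would then fill in the relevant fields and
schedule the single work item; the DM_HOT_ADD/DM_HOT_REMOVE states
could stay as they are.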

>
> /*
> * This state tracks if the host has specified a hot-add
> @@ -569,6 +615,42 @@ static struct hv_dynmem_device dm_device;
>
> static void post_status(struct hv_dynmem_device *dm);
>
> +static int hv_send_hot_remove_response(
> + struct dm_hot_remove_response *resp,
> + long array_index, bool more_pages)
> +{
> + struct hv_dynmem_device *dm = &dm_device;
> + int ret;
> +
> + resp->hdr.type = DM_MEM_HOT_REMOVE_RESPONSE;
> + resp->range_count = array_index;
> + resp->more_pages = more_pages;
> + resp->hdr.size = sizeof(struct dm_hot_remove_response)
> + + sizeof(union dm_mem_page_range) * array_index;
> +
> + if (array_index)
> + resp->result = 0;
> + else
> + resp->result = 1;
> +
> + do {
> + resp->hdr.trans_id = atomic_inc_return(&trans_id);
> + ret = vmbus_sendpacket(dm->dev->channel, resp,
> + resp->hdr.size,
> + (unsigned long)NULL,
> + VM_PKT_DATA_INBAND, 0);
> +
> + if (ret == -EAGAIN)
> + msleep(20);
> + post_status(&dm_device);
> + } while (ret == -EAGAIN);
> +
> + if (ret)
> + pr_err("Fail to send hot-remove response msg.\n");
> +
> + return ret;
> +}
> +
> #ifdef CONFIG_MEMORY_HOTPLUG
> static inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
> unsigned long pfn)
> @@ -628,7 +710,9 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
> void *v)
> {
> struct memory_notify *mem = (struct memory_notify *)v;
> - unsigned long flags, pfn_count;
> + unsigned long pfn_count;
> + unsigned long flags = 0;
> + int unlocked;
>
> switch (val) {
> case MEM_ONLINE:
> @@ -640,7 +724,11 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
> break;
>
> case MEM_OFFLINE:
> - spin_lock_irqsave(&dm_device.ha_lock, flags);
> + if (dm_device.lock_thread != current) {
> + spin_lock_irqsave(&dm_device.ha_lock, flags);
> + unlocked = 1;
> + }
> +
> pfn_count = hv_page_offline_check(mem->start_pfn,
> mem->nr_pages);
> if (pfn_count <= dm_device.num_pages_onlined) {
> @@ -654,7 +742,10 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
> WARN_ON_ONCE(1);
> dm_device.num_pages_onlined = 0;
> }
> - spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +
> + if (unlocked)
> + spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +
> break;
> case MEM_GOING_ONLINE:
> case MEM_GOING_OFFLINE:
> @@ -727,9 +818,17 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
> init_completion(&dm_device.ol_waitevent);
> dm_device.ha_waiting = !memhp_auto_online;
>
> - nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
> - ret = add_memory(nid, PFN_PHYS((start_pfn)),
> - (HA_CHUNK << PAGE_SHIFT));
> + /*
> + * If the memory section of the hot-add region is online,
> + * just bring the pages online in the region.
> + */
> + if (online_section_nr(pfn_to_section_nr(start_pfn))) {
> + hv_bring_pgs_online(has, start_pfn, processed_pfn);
> + } else {
> + nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
> + ret = add_memory(nid, PFN_PHYS((start_pfn)),
> + (HA_CHUNK << PAGE_SHIFT));
> + }
>
> if (ret) {
> pr_err("hot_add memory failed error is %d\n", ret);
> @@ -765,8 +864,8 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
> static void hv_online_page(struct page *pg, unsigned int order)
> {
> struct hv_hotadd_state *has;
> - unsigned long flags;
> unsigned long pfn = page_to_pfn(pg);
> + unsigned long flags = 0;

Why is this change needed? (If it is just to silence a 'flags may be
used uninitialized' warning now that the lock may be skipped, please
mention that in the changelog.)

> int unlocked;
>
> if (dm_device.lock_thread != current) {
> @@ -806,10 +905,12 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
> continue;
>
> /*
> - * If the current start pfn is not where the covered_end
> - * is, create a gap and update covered_end_pfn.
> + * If the current start pfn is greater than covered_end_pfn,
> + * create a gap and update covered_end_pfn. The start pfn may
> + * be located in a gap created during hot remove. The gap
> + * range is below covered_end_pfn.
> */
> - if (has->covered_end_pfn != start_pfn) {
> + if (has->covered_end_pfn < start_pfn) {
> gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
> if (!gap) {
> ret = -ENOMEM;
> @@ -848,6 +949,91 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
> return ret;
> }
>
> +static int handle_hot_add_in_gap(unsigned long start, unsigned long pg_cnt,
> + struct hv_hotadd_state *has)
> +{
> + struct hv_hotadd_gap *gap, *new_gap, *tmp_gap;
> + unsigned long pfn_cnt = pg_cnt;
> + unsigned long start_pfn = start;
> + unsigned long end_pfn;
> + unsigned long pages;
> + unsigned long pgs_ol;
> + unsigned long block_pages = HA_CHUNK;
> + unsigned long pfn;
> + int nid;
> + int ret;
> +
> + list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) {
> +
> + if ((start_pfn < gap->start_pfn)
> + || (start_pfn >= gap->end_pfn))
> + continue;
> +
> + end_pfn = min(gap->end_pfn, start_pfn + pfn_cnt);
> + pgs_ol = end_pfn - start_pfn;
> +
> + /*
> + * hv_bring_pgs_online() decides whether a pfn should be
> + * brought online by checking whether it is in the hot-add
> + * covered range or in a gap range (see has_pfn_is_backed()
> + * for details). So adjust the gap before bringing pages
> + * online or adding memory.
> + */
> + if (gap->end_pfn - gap->start_pfn == pgs_ol) {
> + list_del(&gap->list);
> + kfree(gap);
> + } else if (gap->start_pfn < start && gap->end_pfn == end_pfn) {
> + gap->end_pfn = start_pfn;
> + } else if (gap->end_pfn > end_pfn
> + && gap->start_pfn == start_pfn) {
> + gap->start_pfn = end_pfn;
> + } else {
> + gap->end_pfn = start_pfn;
> +
> + new_gap = kzalloc(sizeof(struct hv_hotadd_gap),
> + GFP_ATOMIC);
> + if (!new_gap) {
> + do_hot_add = false;
> + return -ENOMEM;
> + }
> +
> + INIT_LIST_HEAD(&new_gap->list);
> + new_gap->start_pfn = end_pfn;
> + new_gap->end_pfn = gap->end_pfn;
> + list_add_tail(&gap->list, &has->gap_list);
> + }
> +
> + /* Bring online or add memory in gaps. */
> + for (pfn = start_pfn; pfn < end_pfn;
> + pfn = round_up(pfn + 1, block_pages)) {
> + pages = min(round_up(pfn + 1, block_pages),
> + end_pfn) - pfn;
> +
> + if (online_section_nr(pfn_to_section_nr(pfn))) {
> + hv_bring_pgs_online(has, pfn, pages);
> + } else {
> + nid = memory_add_physaddr_to_nid(PFN_PHYS(pfn));
> + ret = add_memory(nid, PFN_PHYS(pfn),
> + round_up(pages, block_pages)
> + << PAGE_SHIFT);
> + if (ret) {
> + pr_err("Fail to add memory in gaps(error=%d).\n",
> + ret);
> + do_hot_add = false;
> + return ret;
> + }
> + }
> + }
> +
> + start_pfn += pgs_ol;
> + pfn_cnt -= pgs_ol;
> + if (!pfn_cnt)
> + break;
> + }
> +
> + return pg_cnt - pfn_cnt;
> +}
> +
> static unsigned long handle_pg_range(unsigned long pg_start,
> unsigned long pg_count)
> {
> @@ -874,6 +1060,22 @@ static unsigned long handle_pg_range(unsigned long pg_start,
>
> old_covered_state = has->covered_end_pfn;
>
> + /*
> + * If start_pfn is less than covered_end_pfn, the hot-add memory
> + * area is in the gap range.
> + */
> + if (start_pfn < has->covered_end_pfn) {
> + pgs_ol = handle_hot_add_in_gap(start_pfn, pfn_cnt, has);
> +
> + pfn_cnt -= pgs_ol;
> + if (!pfn_cnt) {
> + res = pgs_ol;
> + break;
> + }
> +
> + start_pfn += pgs_ol;
> + }
> +
> if (start_pfn < has->ha_end_pfn) {
> /*
> * This is the case where we are backing pages
> @@ -931,6 +1133,23 @@ static unsigned long handle_pg_range(unsigned long pg_start,
> return res;
> }
>
> +static void free_allocated_pages(__u64 start_frame, int num_pages)
> +{
> + struct page *pg;
> + int i;
> +
> + for (i = 0; i < num_pages; i++) {
> + pg = pfn_to_page(i + start_frame);
> +
> + if (page_private(pfn_to_page(i)))
> + set_page_private(pfn_to_page(i), 0);
> +
> + __ClearPageOffline(pg);
> + __free_page(pg);
> + dm_device.num_pages_ballooned--;
> + }
> +}
> +
> static unsigned long process_hot_add(unsigned long pg_start,
> unsigned long pfn_cnt,
> unsigned long rg_start,
> @@ -940,18 +1159,40 @@ static unsigned long process_hot_add(unsigned long pg_start,
> int covered;
> unsigned long flags;
>
> - if (pfn_cnt == 0)
> - return 0;
> + /*
> + * Check via page private data whether the page was allocated
> + * by the driver to handle remainder pages.
> + */
> + if (present_section_nr(pfn_to_section_nr(pg_start))
> + && page_private(pfn_to_page(pg_start))) {
> + free_allocated_pages(pg_start, pfn_cnt);
> + return pfn_cnt;
> + }
>
> - if (!dm_device.host_specified_ha_region) {
> - covered = pfn_covered(pg_start, pfn_cnt);
> - if (covered < 0)
> - return 0;
> + if ((rg_start == 0) && (!dm_device.host_specified_ha_region)) {
> + /*
> + * The host has not specified the hot-add region.
> + * Based on the hot-add page range being specified,
> + * compute a hot-add region that can cover the pages
> + * that need to be hot-added while ensuring the alignment
> + * and size requirements of Linux as it relates to hot-add.
> + */
> + rg_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
> + if (pfn_cnt % HA_CHUNK)
> + rg_size += HA_CHUNK;
>
> - if (covered)
> - goto do_pg_range;
> + rg_start = (pg_start / HA_CHUNK) * HA_CHUNK;
> }
>
> + if (pfn_cnt == 0)
> + return 0;
> +
> + covered = pfn_covered(pg_start, pfn_cnt);
> + if (covered < 0)
> + return 0;
> + else if (covered)
> + goto do_pg_range;
> +
> /*
> * If the host has specified a hot-add range; deal with it first.
> */
> @@ -983,8 +1224,321 @@ static unsigned long process_hot_add(unsigned long pg_start,
> return handle_pg_range(pg_start, pfn_cnt);
> }
>
> +static int check_memblock_online(struct memory_block *mem, void *arg)
> +{
> + if (mem->state != MEM_ONLINE)
> + return -1;
> +
> + return 0;
> +}
> +
> +static int change_memblock_state(struct memory_block *mem, void *arg)
> +{
> + unsigned long state = (unsigned long)arg;
> +
> + mem->state = state;
> +
> + return 0;
> +}
> +
> +static bool hv_offline_pages(unsigned long start_pfn, unsigned long nr_pages)
> +{
> + const unsigned long start = PFN_PHYS(start_pfn);
> + const unsigned long size = PFN_PHYS(nr_pages);
> +
> + lock_device_hotplug();
> +
> + if (walk_memory_blocks(start, size, NULL, check_memblock_online)) {
> + unlock_device_hotplug();
> + return false;
> + }
> +
> + walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
> + change_memblock_state);
> +
> + if (offline_pages(start_pfn, nr_pages)) {
> + walk_memory_blocks(start_pfn, nr_pages, (void *)MEM_ONLINE,
> + change_memblock_state);
> + unlock_device_hotplug();
> + return false;
> + }
> +
> + walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
> + change_memblock_state);
> +
> + unlock_device_hotplug();
> + return true;
> +}
> +
> +static int hv_hot_remove_range(unsigned int nid, unsigned long start_pfn,
> + unsigned long end_pfn, unsigned long nr_pages,
> + unsigned long *array_index,
> + union dm_mem_page_range *range_array,
> + struct hv_hotadd_state *has)
> +{
> + unsigned long block_pages = HA_CHUNK;
> + unsigned long rm_pages = nr_pages;
> + unsigned long pfn;
> +
> + for (pfn = start_pfn; pfn < end_pfn; pfn += block_pages) {
> + struct hv_hotadd_gap *gap;
> + int in_gaps = 0;
> +
> + if (*array_index >= MAX_HOT_REMOVE_ENTRIES) {
> + struct dm_hot_remove_response *resp =
> + (struct dm_hot_remove_response *)
> + balloon_up_send_buffer;
> + int ret;
> +
> + /* Flush out all remove response entries. */
> + ret = hv_send_hot_remove_response(resp, *array_index,
> + true);
> + if (ret)
> + return ret;
> +
> + memset(resp, 0x00, PAGE_SIZE);
> + *array_index = 0;
> + }
> +
> + if (has) {
> + /*
> + * Memory in gaps has already been offlined or removed,
> + * so skip it if the remove range overlaps with a gap.
> + */
> + list_for_each_entry(gap, &has->gap_list, list)
> + if (!(pfn >= gap->end_pfn ||
> + pfn + block_pages < gap->start_pfn)) {
> + in_gaps = 1;
> + break;
> + }
> +
> + if (in_gaps)
> + continue;
> + }
> +
> + if (online_section_nr(pfn_to_section_nr(pfn))
> + && is_mem_section_removable(pfn, block_pages)
> + && hv_offline_pages(pfn, block_pages)) {
> + remove_memory(nid, pfn << PAGE_SHIFT,
> + block_pages << PAGE_SHIFT);
> +
> + range_array[*array_index].finfo.start_page = pfn;
> + range_array[*array_index].finfo.page_cnt = block_pages;
> +
> + (*array_index)++;
> + nr_pages -= block_pages;
> +
> + if (!nr_pages)
> + break;
> + }
> + }
> +
> + return rm_pages - nr_pages;
> +}
> +
> +static int hv_hot_remove_from_ha_list(unsigned int nid, unsigned long nr_pages,
> + unsigned long *array_index,
> + union dm_mem_page_range *range_array)
> +{
> + struct hv_hotadd_state *has;
> + unsigned long start_pfn, end_pfn;
> + unsigned long flags, rm_pages;
> + int old_index;
> + int ret, i;
> +
> + spin_lock_irqsave(&dm_device.ha_lock, flags);
> + dm_device.lock_thread = current;
> + list_for_each_entry(has, &dm_device.ha_region_list, list) {
> + start_pfn = has->start_pfn;
> + end_pfn = has->covered_end_pfn;
> + rm_pages = min(nr_pages, has->covered_end_pfn - has->start_pfn);
> + old_index = *array_index;
> +
> + if (!rm_pages || pfn_to_nid(start_pfn) != nid)
> + continue;
> +
> + rm_pages = hv_hot_remove_range(nid, start_pfn, end_pfn,
> + rm_pages, array_index, range_array, has);
> +
> + if (rm_pages < 0)
> + return rm_pages;
> + else if (!rm_pages)
> + continue;
> +
> + nr_pages -= rm_pages;
> + dm_device.num_pages_added -= rm_pages;
> +
> + /* Create gaps for hot remove regions. */
> + for (i = old_index; i < *array_index; i++) {
> + struct hv_hotadd_gap *gap;
> +
> + gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
> + if (!gap) {
> + ret = -ENOMEM;
> + do_hot_add = false;
> + return ret;
> + }
> +
> + INIT_LIST_HEAD(&gap->list);
> + gap->start_pfn = range_array[i].finfo.start_page;
> + gap->end_pfn =
> + gap->start_pfn + range_array[i].finfo.page_cnt;
> + list_add_tail(&gap->list, &has->gap_list);
> + }
> +
> + if (!nr_pages)
> + break;
> + }
> + dm_device.lock_thread = NULL;
> + spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +
> + return nr_pages;
> +}
> +
> +static void free_balloon_pages(struct hv_dynmem_device *dm,
> + union dm_mem_page_range *range_array)
> +{
> + int num_pages = range_array->finfo.page_cnt;
> + __u64 start_frame = range_array->finfo.start_page;
> +
> + free_allocated_pages(start_frame, num_pages);
> +}
> +
> +static int hv_hot_remove_pages(struct dm_hot_remove_response *resp,
> + u64 nr_pages, unsigned long *array_index,
> + bool more_pages)
> +{
> + int i, j, alloc_unit = PAGES_IN_2M;
> + struct page *pg;
> + int ret;
> +
> + for (i = 0; i < nr_pages; i += alloc_unit) {
> + if (*array_index >= MAX_HOT_REMOVE_ENTRIES) {
> + /* Flush out all remove response entries. */
> + ret = hv_send_hot_remove_response(resp,
> + *array_index, true);
> + if (ret)
> + goto free_pages;
> +
> + /*
> + * Continue to allocate memory for hot remove
> + * after resetting send buffer and array index.
> + */
> + memset(resp, 0x00, PAGE_SIZE);
> + *array_index = 0;
> + }
> +retry:
> + pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY |
> + __GFP_NOMEMALLOC | __GFP_NOWARN,
> + get_order(alloc_unit << PAGE_SHIFT));
> + if (!pg) {
> + if (alloc_unit == 1) {
> + ret = -ENOMEM;
> + goto free_pages;
> + }
> +
> + alloc_unit = 1;
> + goto retry;
> + }
> +
> + if (alloc_unit != 1)
> + split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
> +
> + for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT));
> + j++) {
> + __SetPageOffline(pg + j);
> +
> + /*
> + * Set the page's private data to non-zero and use it
> + * in process_hot_add() to identify whether the page was
> + * allocated by the driver or is new hot-add memory.
> + */
> + set_page_private(pg + j, 1);
> + }
> +
> + resp->range_array[*array_index].finfo.start_page
> + = page_to_pfn(pg);
> + resp->range_array[*array_index].finfo.page_cnt
> + = alloc_unit;
> + (*array_index)++;
> +
> + dm_device.num_pages_ballooned += alloc_unit;
> + }
> +
> + ret = hv_send_hot_remove_response(resp, *array_index, more_pages);
> + if (ret)
> + goto free_pages;
> +
> + return 0;
> +
> +free_pages:
> + for (i = 0; i < *array_index; i++)
> + free_balloon_pages(&dm_device, &resp->range_array[i]);
> +
> + /* Report hot-remove failure in the response. */
> + hv_send_hot_remove_response(resp, 0, false);
> + return ret;
> +}
> +
> +static void hv_hot_remove_mem_from_node(unsigned int nid, u64 nr_pages)
> +{
> + struct dm_hot_remove_response *resp
> + = (struct dm_hot_remove_response *)balloon_up_send_buffer;
> + unsigned long remainder = nr_pages % HA_CHUNK;
> + unsigned long start_pfn = node_start_pfn(nid);
> + unsigned long end_pfn = node_end_pfn(nid);
> + unsigned long array_index = 0;
> + int ret;
> +
> + /*
> + * If the page count isn't aligned with the memory hot-plug
> + * unit, handle the remainder pages the balloon way.
> + */
> + if (remainder) {
> + memset(resp, 0x00, PAGE_SIZE);
> + ret = hv_hot_remove_pages(resp, remainder, &array_index,
> + !!(nr_pages - remainder));
> + if (ret)
> + return;
> +
> + nr_pages -= remainder;
> + if (!nr_pages)
> + return;
> + }
> +
> + memset(resp, 0x00, PAGE_SIZE);
> + array_index = 0;
> + nr_pages = hv_hot_remove_from_ha_list(nid, nr_pages, &array_index,
> + resp->range_array);
> + if (nr_pages < 0) {
> + /* Set array_index to 0 and report failure in the response msg. */
> + array_index = 0;
> + } else if (nr_pages) {
> + start_pfn = ALIGN(start_pfn, HA_CHUNK);
> + hv_hot_remove_range(nid, start_pfn, end_pfn, nr_pages,
> + &array_index, resp->range_array, NULL);
> + }
> +
> + hv_send_hot_remove_response(resp, array_index, false);
> +}
> +
> #endif
>
> +static void hot_remove_req(struct work_struct *dummy)
> +{
> + struct hv_dynmem_device *dm = &dm_device;
> + unsigned int numa_node = dm->hr_wrk.virtual_node;
> + unsigned int page_count = dm->hr_wrk.page_count;
> +
> + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) || do_hot_add)
> + hv_hot_remove_mem_from_node(numa_node, page_count);
> + else
> + hv_send_hot_remove_response((struct dm_hot_remove_response *)
> + balloon_up_send_buffer, 0, false);
> +
> + dm->state = DM_INITIALIZED;
> +}
> +
> static void hot_add_req(struct work_struct *dummy)
> {
> struct dm_hot_add_response resp;
> @@ -1005,28 +1559,6 @@ static void hot_add_req(struct work_struct *dummy)
> rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
> rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
>
> - if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
> - unsigned long region_size;
> - unsigned long region_start;
> -
> - /*
> - * The host has not specified the hot-add region.
> - * Based on the hot-add page range being specified,
> - * compute a hot-add region that can cover the pages
> - * that need to be hot-added while ensuring the alignment
> - * and size requirements of Linux as it relates to hot-add.
> - */
> - region_start = pg_start;
> - region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
> - if (pfn_cnt % HA_CHUNK)
> - region_size += HA_CHUNK;
> -
> - region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
> -
> - rg_start = region_start;
> - rg_sz = region_size;
> - }
> -
> if (do_hot_add)
> resp.page_count = process_hot_add(pg_start, pfn_cnt,
> rg_start, rg_sz);
> @@ -1190,24 +1722,6 @@ static void post_status(struct hv_dynmem_device *dm)
>
> }
>
> -static void free_balloon_pages(struct hv_dynmem_device *dm,
> - union dm_mem_page_range *range_array)
> -{
> - int num_pages = range_array->finfo.page_cnt;
> - __u64 start_frame = range_array->finfo.start_page;
> - struct page *pg;
> - int i;
> -
> - for (i = 0; i < num_pages; i++) {
> - pg = pfn_to_page(i + start_frame);
> - __ClearPageOffline(pg);
> - __free_page(pg);
> - dm->num_pages_ballooned--;
> - }
> -}
> -
> -
> -
> static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
> unsigned int num_pages,
> struct dm_balloon_response *bl_resp,
> @@ -1354,22 +1868,38 @@ static void balloon_up(struct work_struct *dummy)
>
> }
>
> -static void balloon_down(struct hv_dynmem_device *dm,
> - struct dm_unballoon_request *req)
> +static void balloon_down(struct work_struct *dummy)
> {
> + struct dm_unballoon_request *req =
> + (struct dm_unballoon_request *)recv_buffer;
> union dm_mem_page_range *range_array = req->range_array;
> int range_count = req->range_count;
> struct dm_unballoon_response resp;
> - int i;
> + struct hv_dynmem_device *dm = &dm_device;
> unsigned int prev_pages_ballooned = dm->num_pages_ballooned;
> + int i;
>
> for (i = 0; i < range_count; i++) {
> - free_balloon_pages(dm, &range_array[i]);
> - complete(&dm_device.config_event);
> + /*
> + * Hyper-V has a bug where it sends an unballoon msg instead
> + * of a hot-add msg when no balloon msg was sent before.
> + * Do the hot-add operation for all unballoon msgs if the
> + * hot-add capability is enabled.
> + */
> + if (do_hot_add) {
> + dm->host_specified_ha_region = false;
> + dm->num_pages_added +=
> + process_hot_add(range_array[i].finfo.start_page,
> + range_array[i].finfo.page_cnt, 0, 0);
> + } else {
> + free_balloon_pages(dm, &range_array[i]);
> + }
> }
> + complete(&dm_device.config_event);
>
> - pr_debug("Freed %u ballooned pages.\n",
> - prev_pages_ballooned - dm->num_pages_ballooned);
> + if (!do_hot_add)
> + pr_debug("Freed %u ballooned pages.\n",
> + prev_pages_ballooned - dm->num_pages_ballooned);
>
> if (req->more_pages == 1)
> return;
> @@ -1489,6 +2019,7 @@ static void balloon_onchannelcallback(void *context)
> struct hv_dynmem_device *dm = hv_get_drvdata(dev);
> struct dm_balloon *bal_msg;
> struct dm_hot_add *ha_msg;
> + struct dm_hot_remove *hr_msg;
> union dm_mem_page_range *ha_pg_range;
> union dm_mem_page_range *ha_region;
>
> @@ -1522,8 +2053,7 @@ static void balloon_onchannelcallback(void *context)
>
> case DM_UNBALLOON_REQUEST:
> dm->state = DM_BALLOON_DOWN;
> - balloon_down(dm,
> - (struct dm_unballoon_request *)recv_buffer);
> + schedule_work(&dm_device.unballoon_wrk.wrk);
> break;
>
> case DM_MEM_HOT_ADD_REQUEST:
> @@ -1554,6 +2084,19 @@ static void balloon_onchannelcallback(void *context)
> }
> schedule_work(&dm_device.ha_wrk.wrk);
> break;
> + case DM_MEM_HOT_REMOVE_REQUEST:
> + if (dm->state == DM_HOT_REMOVE)
> + pr_warn("Currently hot-removing.\n");
> +
> + dm->state = DM_HOT_REMOVE;
> + hr_msg = (struct dm_hot_remove *)recv_buffer;
> +
> + dm->hr_wrk.virtual_node = hr_msg->virtual_node;
> + dm->hr_wrk.page_count = hr_msg->page_count;
> + dm->hr_wrk.qos_flags = hr_msg->qos_flags;
> +
> + schedule_work(&dm_device.hr_wrk.wrk);
> + break;
>
> case DM_INFO_MESSAGE:
> process_info(dm, (struct dm_info_msg *)dm_msg);
> @@ -1628,6 +2171,7 @@ static int balloon_connect_vsp(struct hv_device *dev)
>
> cap_msg.caps.cap_bits.balloon = 1;
> cap_msg.caps.cap_bits.hot_add = 1;
> + cap_msg.caps.cap_bits.hot_remove = 1;
>
> /*
> * Specify our alignment requirements as it relates
> @@ -1688,7 +2232,9 @@ static int balloon_probe(struct hv_device *dev,
> INIT_LIST_HEAD(&dm_device.ha_region_list);
> spin_lock_init(&dm_device.ha_lock);
> INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
> + INIT_WORK(&dm_device.unballoon_wrk.wrk, balloon_down);
> INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
> + INIT_WORK(&dm_device.hr_wrk.wrk, hot_remove_req);
> dm_device.host_specified_ha_region = false;
>
> #ifdef CONFIG_MEMORY_HOTPLUG

--
Vitaly