RE: [PATCH v3] x86/Hyper-V: Support for free page reporting
From: Michael Kelley
Date: Thu Feb 04 2021 - 18:37:03 EST
From: Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx> Sent: Wednesday, January 6, 2021 3:21 PM
>
> Linux has support for free page reporting now (36e66c554b5c) for
> virtualized environment. On Hyper-V when virtually backed VMs are
> configured, Hyper-V will advertise cold memory discard capability,
> when supported. This patch adds the support to hook into the free
> page reporting infrastructure and leverage the Hyper-V cold memory
> discard hint hypercall to report/free these pages back to the host.
>
> Signed-off-by: Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx>
> Tested-by: Matheus Castello <matheus@xxxxxxxxxxxxxxx>
> ---
> In V2:
> - Addressed feedback comments
> - Added page reporting config option tied to hyper-v balloon config
>
> In V3:
> - Addressed feedback from Vitaly
> ---
> arch/x86/hyperv/hv_init.c | 31 +++++++++++
> arch/x86/kernel/cpu/mshyperv.c | 6 +-
> drivers/hv/Kconfig | 1 +
> drivers/hv/hv_balloon.c | 93 +++++++++++++++++++++++++++++++
> include/asm-generic/hyperv-tlfs.h | 32 ++++++++++-
> include/asm-generic/mshyperv.h | 2 +
> 6 files changed, 162 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index e04d90af4c27..5b610e47d091 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -528,3 +528,34 @@ bool hv_is_hibernation_supported(void)
> return acpi_sleep_state_supported(ACPI_STATE_S4);
> }
> EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
> +
> +/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
> +bool hv_query_ext_cap(u64 cap_query)
> +{
> + u64 *cap;
> + unsigned long flags;
> + u64 ext_cap = 0;
> +
> + /*
> + * Querying extended capabilities is an extended hypercall. Check if the
> + * partition supports extended hypercall, first.
> + */
> + if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
> + return 0;
Return 'false' since the function is declared as bool?
> +
> + /*
> + * Repurpose the input page arg to accept output from Hyper-V for
> + * now because this is the only call that needs output from the
> + * hypervisor. It should be fixed properly by introducing an
> + * output arg once we have more places that require output.
> + */
> + local_irq_save(flags);
> + cap = *(u64 **)this_cpu_ptr(hyperv_pcpu_input_arg);
> + if (hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, cap) ==
> + HV_STATUS_SUCCESS)
Need to mask before checking for HV_STATUS_SUCCESS. With regard to the
reserved fields in the returned 64 bit status, the TLFS says "Callers should ignore the
value in these bits". There's no promise that they are zero.
> + ext_cap = *cap;
> +
> + local_irq_restore(flags);
> + return ext_cap & cap_query;
> +}
As I noted in a review comment back in May, the output arg here is
only 64 bits in size and could just live on the stack with assurance that
it won't cross a page boundary. So the code could be:
/*
 * Query Hyper-V extended capabilities.
 *
 * cap_query is a bit mask of the extended capabilities to test for
 * (see HV_EXT_CAPABILITY_xxx). Returns true if any requested bit is
 * reported by the hypervisor; returns false if extended hypercalls are
 * not enabled for the partition or if the query hypercall fails.
 */
bool hv_query_ext_cap(u64 cap_query)
{
	u64 cap;
	u64 status;

	/* The capability query is itself an extended hypercall. */
	if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
		return false;

	status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, &cap);

	/*
	 * Only the result field of the status is meaningful; the reserved
	 * bits must be ignored, so mask before comparing.
	 */
	if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
		cap = 0;

	return cap_query & cap;
}
But if you think there's value in using the designated page for hypercall args,
I'm OK with just fixing the testing of the status.
> +EXPORT_SYMBOL_GPL(hv_query_ext_cap);
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 05ef1f4550cb..f4c0d69c61ae 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -225,11 +225,13 @@ static void __init ms_hyperv_init_platform(void)
> * Extract the features and hints
> */
> ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
> + ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
> ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
> ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
>
> - pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
> - ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
> + pr_info("Hyper-V: privilege flags low:0x%x, high:0x%x, hints:0x%x, misc:0x%x\n",
Nit. Could we just use a space instead of a colon before each of the printed hex values?
> + ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
> + ms_hyperv.misc_features);
>
> ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
> ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 79e5356a737a..66c794d92391 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -23,6 +23,7 @@ config HYPERV_UTILS
> config HYPERV_BALLOON
> tristate "Microsoft Hyper-V Balloon driver"
> depends on HYPERV
> + select PAGE_REPORTING
With this selection made, are the #ifdef CONFIG_PAGE_REPORTING occurrences
below really needed? I looked at the virtio balloon driver, which also does
"select PAGE_REPORTING", and it does not have any #ifdef's.
> help
> Select this option to enable Hyper-V Balloon driver.
>
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 8c471823a5af..c0ff0a48f540 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -21,6 +21,7 @@
> #include <linux/memory.h>
> #include <linux/notifier.h>
> #include <linux/percpu_counter.h>
> +#include <linux/page_reporting.h>
>
> #include <linux/hyperv.h>
> #include <asm/hyperv-tlfs.h>
> @@ -563,6 +564,10 @@ struct hv_dynmem_device {
> * The negotiated version agreed by host.
> */
> __u32 version;
> +
> +#ifdef CONFIG_PAGE_REPORTING
> + struct page_reporting_dev_info pr_dev_info;
> +#endif
> };
>
> static struct hv_dynmem_device dm_device;
> @@ -1568,6 +1573,84 @@ static void balloon_onchannelcallback(void *context)
>
> }
>
> +#ifdef CONFIG_PAGE_REPORTING
> +/* Hyper-V only supports reporting 2MB pages or higher */
I'm guessing the above is the same on ARM64 where the guest is using 16K
or 64K page size, because Hyper-V always uses 4K pages and expects all guest
communication to be in units of 4K pages.
> +#define HV_MIN_PAGE_REPORTING_ORDER 9
> +#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER)
> +static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
> + struct scatterlist *sgl, unsigned int nents)
> +{
> + unsigned long flags;
> + struct hv_memory_hint *hint;
> + int i;
> + u64 status;
> + struct scatterlist *sg;
> +
> + WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
> + local_irq_save(flags);
> + hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
> + if (!hint) {
> + local_irq_restore(flags);
> + return -ENOSPC;
> + }
> +
> + hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
> + hint->reserved = 0;
> + for_each_sg(sgl, sg, nents, i) {
> + union hv_gpa_page_range *range;
> +
> + range = &hint->ranges[i];
> + range->address_space = 0;
> + /* page reportting only reports 2MB pages or higher */
> + range->page.largepage = 1;
> + range->page.additional_pages =
> + (sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1;
Perhaps verify that sg->length is at least 2 Meg? (similar to verifying that nents
isn't too big). If it isn't at least 2 Meg, then additional_pages will get set to -1,
and I suspect weird things will happen.
I was also thinking about whether sg->length could be big enough to overflow
the additional_pages field. sg->length is an unsigned int, so I don't think so.
> + range->base_large_pfn =
> + page_to_pfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER;
page_to_pfn() will do the wrong thing on ARM64 with 16K or 64K pages.
Use page_to_hvpfn() instead.
> + }
> +
> + status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
> + hint, NULL);
> + local_irq_restore(flags);
> + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
> + pr_err("Cold memory discard hypercall failed with status %llx\n",
> + status);
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static void enable_page_reporting(void)
> +{
> + int ret;
> +
> + BUILD_BUG_ON(pageblock_order < HV_MIN_PAGE_REPORTING_ORDER);
The BUILD_BUG_ON won't work in the case where pageblock_order is
actually a variable rather than a constant, though that's currently only ia64 and
powerpc, which we don't directly care about. Nonetheless, this would break if
pageblock_order were to become a variable.
> + if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
> + pr_debug("Cold memory discard hint not supported by Hyper-V\n");
> + return;
> + }
> +
> + BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
> + dm_device.pr_dev_info.report = hv_free_page_report;
> + ret = page_reporting_register(&dm_device.pr_dev_info);
> + if (ret < 0) {
> + dm_device.pr_dev_info.report = NULL;
> + pr_err("Failed to enable cold memory discard: %d\n", ret);
> + } else {
> + pr_info("Cold memory discard hint enabled\n");
> + }
Should the above two messages be prefixed with "Hyper-V: "?
> +}
> +
> +static void disable_page_reporting(void)
> +{
> + if (dm_device.pr_dev_info.report) {
> + page_reporting_unregister(&dm_device.pr_dev_info);
> + dm_device.pr_dev_info.report = NULL;
> + }
> +}
> +#endif /* CONFIG_PAGE_REPORTING */
> +
> static int balloon_connect_vsp(struct hv_device *dev)
> {
> struct dm_version_request version_req;
> @@ -1713,6 +1796,10 @@ static int balloon_probe(struct hv_device *dev,
> if (ret != 0)
> return ret;
>
> +#ifdef CONFIG_PAGE_REPORTING
> + enable_page_reporting();
> +#endif
> +
> dm_device.state = DM_INITIALIZED;
>
> dm_device.thread =
> @@ -1731,6 +1818,9 @@ static int balloon_probe(struct hv_device *dev,
> #ifdef CONFIG_MEMORY_HOTPLUG
> unregister_memory_notifier(&hv_memory_nb);
> restore_online_page_callback(&hv_online_page);
> +#endif
> +#ifdef CONFIG_PAGE_REPORTING
> + disable_page_reporting();
> #endif
Nit: Typically the error path undoes things in the reverse order. So
the disable_page_reporting() would occur before the call to
vmbus_close().
> return ret;
> }
> @@ -1753,6 +1843,9 @@ static int balloon_remove(struct hv_device *dev)
> #ifdef CONFIG_MEMORY_HOTPLUG
> unregister_memory_notifier(&hv_memory_nb);
> restore_online_page_callback(&hv_online_page);
> +#endif
> +#ifdef CONFIG_PAGE_REPORTING
> + disable_page_reporting();
> #endif
Same here regarding the ordering.
> spin_lock_irqsave(&dm_device.ha_lock, flags);
> list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
> diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
> index e73a11850055..75c20be2cc44 100644
> --- a/include/asm-generic/hyperv-tlfs.h
> +++ b/include/asm-generic/hyperv-tlfs.h
> @@ -89,6 +89,7 @@
> #define HV_ACCESS_STATS BIT(8)
> #define HV_DEBUGGING BIT(11)
> #define HV_CPU_POWER_MANAGEMENT BIT(12)
> +#define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20)
>
>
> /*
> @@ -152,11 +153,18 @@ struct ms_hyperv_tsc_page {
> #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
> #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
>
> +/* Extended hypercalls */
> +#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001
> +#define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003
> +
> #define HV_FLUSH_ALL_PROCESSORS BIT(0)
> #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1)
> #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2)
> #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)
>
> +/* Extended capability bits */
> +#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8)
> +
> enum HV_GENERIC_SET_FORMAT {
> HV_GENERIC_SET_SPARSE_4K,
> HV_GENERIC_SET_ALL,
> @@ -367,7 +375,7 @@ struct hv_guest_mapping_flush {
> */
> #define HV_MAX_FLUSH_PAGES (2048)
>
> -/* HvFlushGuestPhysicalAddressList hypercall */
> +/* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */
> union hv_gpa_page_range {
> u64 address_space;
> struct {
> @@ -375,6 +383,12 @@ union hv_gpa_page_range {
> u64 largepage:1;
> u64 basepfn:52;
> } page;
> + struct {
> + u64 reserved:12;
> + u64 page_size:1;
> + u64 reserved1:8;
> + u64 base_large_pfn:43;
> + };
> };
>
> /*
> @@ -494,4 +508,20 @@ struct hv_set_vp_registers_input {
> } element[];
> } __packed;
>
> +/*
> + * The whole argument should fit in a page to be able to pass to the hypervisor
> + * in one hypercall.
> + */
> +#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES \
> + ((PAGE_SIZE - sizeof(struct hv_memory_hint)) / \
Use HV_HYP_PAGE_SIZE instead of PAGE_SIZE.
> + sizeof(union hv_gpa_page_range))
> +
> +/* HvExtCallMemoryHeatHint hypercall */
> +#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2
> +struct hv_memory_hint {
> + u64 type:2;
> + u64 reserved:62;
> + union hv_gpa_page_range ranges[];
> +} __packed;
> +
> #endif
> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> index c57799684170..93c1303f5e00 100644
> --- a/include/asm-generic/mshyperv.h
> +++ b/include/asm-generic/mshyperv.h
> @@ -27,6 +27,7 @@
>
> struct ms_hyperv_info {
> u32 features;
> + u32 priv_high;
> u32 misc_features;
> u32 hints;
> u32 nested_features;
> @@ -170,6 +171,7 @@ void hyperv_report_panic_msg(phys_addr_t pa, size_t size);
> bool hv_is_hyperv_initialized(void);
> bool hv_is_hibernation_supported(void);
> void hyperv_cleanup(void);
> +bool hv_query_ext_cap(u64 cap_query);
> #else /* CONFIG_HYPERV */
> static inline bool hv_is_hyperv_initialized(void) { return false; }
> static inline bool hv_is_hibernation_supported(void) { return false; }
> --
> 2.17.1