[RFC PATCH 2/4] kvm: Add host side support for free memory hints

From: Alexander Duyck
Date: Mon Feb 04 2019 - 13:15:53 EST


From: Alexander Duyck <alexander.h.duyck@xxxxxxxxxxxxxxx>

Add the host side of the KVM memory hinting support. With this we expose a
feature bit indicating that the host will pass the messages along to the
new madvise function.

This functionality is mutually exclusive with device assignment. If a
device is assigned we will disable the functionality as it could lead to a
potential memory corruption if a device writes to a page after KVM has
flagged it as not being used.

The logic as it is currently defined limits the hint to only supporting a
hugepage or larger notifications. This is meant to help prevent us from
potentially breaking up huge pages by hinting that only a portion of the
page is not needed.

Signed-off-by: Alexander Duyck <alexander.h.duyck@xxxxxxxxxxxxxxx>
---
Documentation/virtual/kvm/cpuid.txt | 4 +++
Documentation/virtual/kvm/hypercalls.txt | 14 ++++++++++++
arch/x86/include/uapi/asm/kvm_para.h | 3 +++
arch/x86/kvm/cpuid.c | 6 ++++-
arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++++++
include/uapi/linux/kvm_para.h | 1 +
6 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 97ca1940a0dc..fe3395a58b7e 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -66,6 +66,10 @@ KVM_FEATURE_PV_SEND_IPI || 11 || guest checks this feature bit
|| || before using paravirtualized
|| || send IPIs.
------------------------------------------------------------------------------
+KVM_FEATURE_PV_UNUSED_PAGE_HINT || 12 || guest checks this feature bit
+ || || before using paravirtualized
+ || || unused page hints.
+------------------------------------------------------------------------------
KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
|| || per-cpu warps are expected in
|| || kvmclock.
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index da24c138c8d1..b374678ac1f9 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -141,3 +141,17 @@ a0 corresponds to the APIC ID in the third argument (a2), bit 1
corresponds to the APIC ID a2+1, and so on.

Returns the number of CPUs to which the IPIs were delivered successfully.
+
+7. KVM_HC_UNUSED_PAGE_HINT
+------------------------
+Architecture: x86
+Status: active
+Purpose: Send unused page hint to host
+
+a0: physical address of region unused, page aligned
+a1: size of unused region, page aligned
+
+The hypercall lets a guest send notifications to the host that it will no
+longer be using a given page in memory. Multiple pages can be hinted at by
+using the size field to hint that a higher order page is available by
+specifying the higher order page size.
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 19980ec1a316..f066c23060df 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -29,6 +29,7 @@
#define KVM_FEATURE_PV_TLB_FLUSH 9
#define KVM_FEATURE_ASYNC_PF_VMEXIT 10
#define KVM_FEATURE_PV_SEND_IPI 11
+#define KVM_FEATURE_PV_UNUSED_PAGE_HINT 12

#define KVM_HINTS_REALTIME 0

@@ -119,4 +120,6 @@ struct kvm_vcpu_pv_apf_data {
#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
#define KVM_PV_EOI_DISABLED 0x0

+#define KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER HUGETLB_PAGE_ORDER
+
#endif /* _UAPI_ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index bbffa6c54697..b82bcbfbc420 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -136,6 +136,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (kvm_hlt_in_guest(vcpu->kvm) && best &&
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+ if (kvm_arch_has_assigned_device(vcpu->kvm) && best &&
+ (best->eax & KVM_FEATURE_PV_UNUSED_PAGE_HINT))
+ best->eax &= ~(1 << KVM_FEATURE_PV_UNUSED_PAGE_HINT);

/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -637,7 +640,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
(1 << KVM_FEATURE_PV_UNHALT) |
(1 << KVM_FEATURE_PV_TLB_FLUSH) |
(1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
- (1 << KVM_FEATURE_PV_SEND_IPI);
+ (1 << KVM_FEATURE_PV_SEND_IPI) |
+ (1 << KVM_FEATURE_PV_UNUSED_PAGE_HINT);

if (sched_info_on())
entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d27206f6c01..3ec75ab849e2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,6 +55,7 @@
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/mem_encrypt.h>
+#include <linux/mm.h>

#include <trace/events/kvm.h>

@@ -7052,6 +7053,37 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
}

+static int kvm_pv_unused_page_hint_op(struct kvm *kvm, gpa_t gpa, size_t len)
+{
+ unsigned long start;
+
+ /*
+ * Guarantee the following:
+ * len meets minimum size
+ * len is a power of 2
+ * gpa is aligned to len
+ */
+ if (len < (PAGE_SIZE << KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER))
+ return -KVM_EINVAL;
+ if (!is_power_of_2(len) || !IS_ALIGNED(gpa, len))
+ return -KVM_EINVAL;
+
+ /*
+ * If a device is assigned we cannot use use madvise as memory
+ * is shared with the device and could lead to memory corruption
+ * if the device writes to it after free.
+ */
+ if (kvm_arch_has_assigned_device(kvm))
+ return -KVM_EOPNOTSUPP;
+
+ start = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+
+ if (kvm_is_error_hva(start + len))
+ return -KVM_EFAULT;
+
+ return do_madvise_dontneed(start, len);
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
@@ -7098,6 +7130,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_SEND_IPI:
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
break;
+ case KVM_HC_UNUSED_PAGE_HINT:
+ ret = kvm_pv_unused_page_hint_op(vcpu->kvm, a0, a1);
+ break;
default:
ret = -KVM_ENOSYS;
break;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 6c0ce49931e5..75643b862a4e 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -28,6 +28,7 @@
#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
#define KVM_HC_CLOCK_PAIRING 9
#define KVM_HC_SEND_IPI 10
+#define KVM_HC_UNUSED_PAGE_HINT 11

/*
* hypercalls use architecture specific