[PATCH V4 4/5] KVM: X86: Adding support for byte granular memory ROE

From: Ahmed Abd El Mawgood
Date: Sat Oct 20 2018 - 18:23:37 EST


This patch documents and implements ROE_MPROTECT_CHUNK, a part of the ROE
hypercall designed to protect regions of a memory page with byte
granularity. This feature provides a key primitive for defending against
attacks that involve page remapping; the remapping attack itself will be
addressed in future patches.

Signed-off-by: Ahmed Abd El Mawgood <ahmedsoliman0x666@xxxxxxxxx>
---
Documentation/virtual/kvm/hypercalls.txt | 9 ++
arch/x86/kvm/mmu.c | 6 +-
arch/x86/kvm/x86.c | 156 +++++++++++++++++++++--
include/linux/kvm_host.h | 26 ++++
include/uapi/linux/kvm_para.h | 1 +
virt/kvm/kvm_main.c | 88 +++++++++++--
6 files changed, 266 insertions(+), 20 deletions(-)

diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index 8af64d826f03..8708d69a7725 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -164,6 +164,15 @@ This configuration lets a guest kernel have part of its read/write memory
converted into read-only. This action is irreversible.
Upon successful run, the number of pages protected is returned.

+Usage 3:
+ a0: ROE_MPROTECT_CHUNK (requires version >= 2)
+ a1: Start address aligned to page boundary.
+ a2: Number of bytes to be protected.
+This configuration lets a guest kernel have part of its read/write memory
+converted into read-only with byte granularity. ROE_MPROTECT_CHUNK is
+relatively slow compared to ROE_MPROTECT. This action is irreversible.
+Upon successful run, the number of bytes protected is returned.
+
Error codes:
-KVM_ENOSYS: system call being triggered from ring 3 or it is not
implemented.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c54aa5287e14..c3d681bfa105 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1507,9 +1507,11 @@ static bool __rmap_write_protect_roe(struct kvm *kvm,
struct rmap_iterator iter;
bool prot;
bool flush = false;
-
+ void *full_bmp = d->memslot->roe_bitmap;
+ void *part_bmp = d->memslot->partial_roe_bitmap;
for_each_rmap_spte(rmap_head, &iter, sptep) {
- prot = !test_bit(d->i, d->memslot->roe_bitmap) && pt_protect;
+ prot = !(test_bit(d->i, full_bmp) || test_bit(d->i, part_bmp));
+ prot = prot && pt_protect;
flush |= spte_write_protect(sptep, prot);
d->i++;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 70f2b42a2f91..0c767ddd26a2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6800,17 +6800,23 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,

#ifdef CONFIG_KVM_ROE
static void kvm_roe_protect_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, u64 npages)
+ gfn_t gfn, u64 npages, bool partial)
{
int i;
+ void *bitmap;

+ if (partial)
+ bitmap = slot->partial_roe_bitmap;
+ else
+ bitmap = slot->roe_bitmap;
for (i = gfn - slot->base_gfn; i < gfn + npages - slot->base_gfn; i++)
- set_bit(i, slot->roe_bitmap);
+ set_bit(i, bitmap);
kvm_mmu_slot_apply_write_access(kvm, slot);
kvm_arch_flush_shadow_memslot(kvm, slot);
}

-static int __kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages)
+static int __kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages,
+ bool partial)
{
struct kvm_memory_slot *slot;
gfn_t gfn = gpa >> PAGE_SHIFT;
@@ -6826,12 +6832,12 @@ static int __kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages)
if (gfn + npages > slot->base_gfn + slot->npages) {
u64 _npages = slot->base_gfn + slot->npages - gfn;

- kvm_roe_protect_slot(kvm, slot, gfn, _npages);
+ kvm_roe_protect_slot(kvm, slot, gfn, _npages, partial);
gfn += _npages;
count += _npages;
npages -= _npages;
} else {
- kvm_roe_protect_slot(kvm, slot, gfn, npages);
+ kvm_roe_protect_slot(kvm, slot, gfn, npages, partial);
count += npages;
npages = 0;
}
@@ -6841,12 +6847,13 @@ static int __kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages)
return count;
}

-static int kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages)
+static int kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages,
+ bool partial)
{
int r;

mutex_lock(&kvm->slots_lock);
- r = __kvm_roe_protect_range(kvm, gpa, npages);
+ r = __kvm_roe_protect_range(kvm, gpa, npages, partial);
mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -6895,7 +6902,7 @@ static int kvm_roe_full_protect_range(struct kvm_vcpu *vcpu, u64 gva,
continue;
if (!access_ok(VERIFY_WRITE, hva, 1 << PAGE_SHIFT))
continue;
- status = kvm_roe_protect_range(vcpu->kvm, gpa, 1);
+ status = kvm_roe_protect_range(vcpu->kvm, gpa, 1, false);
if (status > 0)
count += status;
}
@@ -6903,7 +6910,135 @@ static int kvm_roe_full_protect_range(struct kvm_vcpu *vcpu, u64 gva,
return -EINVAL;
return count;
}
+/*
+ * Allocate a protected chunk describing [gpa, gpa + size) and link it into
+ * the list directly after @pos.
+ *
+ * Returns @size (narrowed to the int return type) on success, or -ENOMEM
+ * when the chunk cannot be allocated.
+ */
+static int kvm_roe_insert_chunk_next(struct list_head *pos, u64 gpa, u64 size)
+{
+	struct protected_chunk *chunk;
+
+	chunk = kvzalloc(sizeof(*chunk), GFP_KERNEL);
+	if (!chunk)
+		return -ENOMEM;
+	chunk->gpa = gpa;
+	chunk->size = size;
+	/* list_add() writes both link pointers, so no INIT_LIST_HEAD() first */
+	list_add(&chunk->list, pos);
+	return size;
+}
+/*
+ * Grow @pos so that it covers the union of its current range and
+ * [gpa, gpa + size).
+ *
+ * Start and size are recomputed together: moving the start down without
+ * adjusting the size would cut off the tail of the existing chunk.
+ *
+ * Returns @size (narrowed to the int return type).
+ */
+static int kvm_roe_expand_chunk(struct protected_chunk *pos, u64 gpa, u64 size)
+{
+	u64 old_end = pos->gpa + pos->size;
+	u64 new_end = gpa + size;
+	u64 start = gpa < pos->gpa ? gpa : pos->gpa;
+	u64 end = new_end > old_end ? new_end : old_end;
+
+	pos->gpa = start;
+	pos->size = end - start;
+	return size;
+}
+
+/*
+ * Try to fold the list entry that follows @chunk into @chunk.
+ *
+ * This function does not check that a following entry exists; the caller
+ * has to make sure @chunk is not the last entry before calling.
+ *
+ * Returns true when the two ranges overlapped: @chunk was expanded and the
+ * following entry unlinked and freed. Returns false, leaving the list
+ * untouched, otherwise.
+ */
+static bool kvm_roe_merge_chunks(struct protected_chunk *chunk)
+{
+	struct protected_chunk *succ = list_next_entry(chunk, list);
+	bool overlaps = kvm_roe_range_overlap(chunk, succ->gpa, succ->size);
+
+	if (overlaps) {
+		kvm_roe_expand_chunk(chunk, succ->gpa, succ->size);
+		list_del(&succ->list);
+		kvfree(succ);
+	}
+	return overlaps;
+}
+/*
+ * Record [gpa, gpa + size) in the slot's protected-chunk list.
+ *
+ * The list is walked front to back. The first entry that overlaps the new
+ * range is expanded to cover it and then merged with as many following
+ * entries as overlap the grown range. Otherwise the new chunk is inserted
+ * in front of the first entry that starts above @gpa, or at the tail when
+ * no such entry exists.
+ *
+ * The caller must hold kvm->slots_lock.
+ *
+ * Returns whatever the expand/insert helper returned.
+ */
+static int __kvm_roe_insert_chunk(struct kvm_memory_slot *slot, u64 gpa,
+		u64 size)
+{
+	struct list_head *head = slot->prot_list;
+	struct protected_chunk *cur;
+
+	if (list_empty(head))
+		return kvm_roe_insert_chunk_next(head, gpa, size);
+
+	/*
+	 * Merging only ever deletes the entry after cur, never cur itself,
+	 * so the plain iterator is the right one here; the _safe variant
+	 * caches the next pointer, which is exactly the node that may go away.
+	 */
+	list_for_each_entry(cur, head, list) {
+		int ret;
+
+		if (!kvm_roe_range_overlap(cur, gpa, size)) {
+			/* first entry above the new range: insert before it */
+			if (cur->gpa > gpa)
+				return kvm_roe_insert_chunk_next(cur->list.prev,
+						gpa, size);
+			continue;
+		}
+
+		ret = kvm_roe_expand_chunk(cur, gpa, size);
+		/* swallow following entries while one exists and overlaps */
+		while (cur->list.next != head && kvm_roe_merge_chunks(cur))
+			;
+		return ret;
+	}
+
+	/* every existing entry starts at or below gpa: append at the tail */
+	return kvm_roe_insert_chunk_next(head->prev, gpa, size);
+}
+/*
+ * Look up the memslot containing @gpa and record [gpa, gpa + size) in its
+ * protected-chunk list, all under kvm->slots_lock.
+ *
+ * Returns the result of __kvm_roe_insert_chunk(), or -EINVAL when no
+ * memslot covers @gpa.
+ */
+static int kvm_roe_insert_chunk(struct kvm *kvm, u64 gpa, u64 size)
+{
+	struct kvm_memory_slot *slot;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int ret = -EINVAL;
+
+	mutex_lock(&kvm->slots_lock);
+	slot = gfn_to_memslot(kvm, gfn);
+	/* gfn_to_memslot() yields NULL for a gfn outside every slot */
+	if (slot)
+		ret = __kvm_roe_insert_chunk(slot, gpa, size);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+/*
+ * Protect @size bytes starting at guest virtual address @gva; the range is
+ * expected to stay within a single page.
+ *
+ * The page is first flagged in the partial ROE bitmap, then the byte range
+ * is recorded as a protected chunk.
+ *
+ * Returns the result of kvm_roe_insert_chunk() on success, or -EINVAL when
+ * @gva does not translate or the page could not be flagged.
+ */
+static int kvm_roe_partial_page_protect(struct kvm_vcpu *vcpu, u64 gva,
+		u64 size)
+{
+	gpa_t gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+
+	if (gpa == UNMAPPED_GVA)
+		return -EINVAL;
+	/* a chunk is only consulted for pages flagged as partially protected */
+	if (kvm_roe_protect_range(vcpu->kvm, gpa, 1, true) <= 0)
+		return -EINVAL;
+	return kvm_roe_insert_chunk(vcpu->kvm, gpa, size);
+}
+
+/*
+ * Protect the guest virtual byte range [gva, gva + size).
+ *
+ * The range is split into up to three parts: a leading piece up to the
+ * first page boundary (byte-granular), the pages lying entirely inside the
+ * range (full-page protection), and a trailing piece in the last page
+ * (byte-granular).
+ *
+ * Returns the number of bytes that were protected, or -EINVAL when the
+ * range is empty, wraps around, or nothing could be protected. Parts that
+ * fail are skipped and do not contribute to the count.
+ */
+static int kvm_roe_partial_protect(struct kvm_vcpu *vcpu, u64 gva, u64 size)
+{
+	u64 gva_end = gva + size;
+	u64 gpn_start = gva >> PAGE_SHIFT;
+	u64 gpn_end = gva_end >> PAGE_SHIFT;
+	u64 head_size;
+	int count = 0;
+	int ret;
+
+	/* reject zero-sized requests and ranges whose end wraps around */
+	if (gva_end <= gva)
+		return -EINVAL;
+
+	/*
+	 * Leading piece: from gva up to the end of its page when the range
+	 * crosses a page boundary, otherwise the whole request. PAGE_MASK
+	 * selects the page-base bits, so ~PAGE_MASK yields the in-page offset.
+	 */
+	if (gpn_end > gpn_start)
+		head_size = PAGE_SIZE - (gva & ~PAGE_MASK);
+	else
+		head_size = size;
+	size -= head_size;
+	ret = kvm_roe_partial_page_protect(vcpu, gva, head_size);
+	if (ret > 0)
+		count += ret;
+
+	/* pages lying entirely inside the range get full-page protection */
+	if (gpn_end - gpn_start > 1) {
+		u64 mid_gva = (gpn_start + 1) << PAGE_SHIFT;
+		u64 npages = gpn_end - gpn_start - 1;
+
+		size -= npages << PAGE_SHIFT;
+		ret = kvm_roe_full_protect_range(vcpu, mid_gva, npages);
+		if (ret > 0)
+			count += ret << PAGE_SHIFT;
+	}
+
+	/* trailing piece, present when the range ends inside a page */
+	if (size != 0) {
+		ret = kvm_roe_partial_page_protect(vcpu,
+				gpn_end << PAGE_SHIFT, size);
+		if (ret > 0)
+			count += ret;
+	}
+
+	if (count == 0)
+		return -EINVAL;
+	return count;
+}
static int kvm_roe(struct kvm_vcpu *vcpu, u64 a0, u64 a1, u64 a2, u64 a3)
{
int ret;
@@ -6915,11 +7050,14 @@ static int kvm_roe(struct kvm_vcpu *vcpu, u64 a0, u64 a1, u64 a2, u64 a3)
return -KVM_ENOSYS;
switch (a0) {
case ROE_VERSION:
- ret = 1; //current version
+ ret = 2; //current version
break;
case ROE_MPROTECT:
ret = kvm_roe_full_protect_range(vcpu, a1, a2);
break;
+ case ROE_MPROTECT_CHUNK:
+ ret = kvm_roe_partial_protect(vcpu, a1, a2);
+ break;
default:
ret = -EINVAL;
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index be6885bc28bc..a6749a52386b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -294,11 +294,37 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
*/
#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)

+#ifdef CONFIG_KVM_ROE
+/*
+ * One byte range inside a memslot that must stay read-only while the rest
+ * of the page(s) it touches may remain writable. Chunks are kept in a
+ * per-memslot list (kvm_memory_slot::prot_list).
+ */
+struct protected_chunk {
+ gpa_t gpa; /* guest physical address of the first protected byte */
+ u64 size; /* length of the protected range in bytes */
+ struct list_head list; /* link in the owning memslot's prot_list */
+};
+
+/*
+ * Return true when the byte range [gpa, gpa + len) intersects the range
+ * [chunk->gpa, chunk->gpa + chunk->size) held by @chunk.
+ *
+ * Two half-open ranges intersect exactly when each one starts before the
+ * other one ends. @len is u64 because callers pass u64 chunk sizes, which
+ * an int parameter would truncate.
+ */
+static inline bool kvm_roe_range_overlap(struct protected_chunk *chunk,
+		gpa_t gpa, u64 len)
+{
+	return gpa < chunk->gpa + chunk->size && chunk->gpa < gpa + len;
+}
+#endif
+
struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
#ifdef CONFIG_KVM_ROE
unsigned long *roe_bitmap;
+ unsigned long *partial_roe_bitmap;
+ struct list_head *prot_list;
#endif
unsigned long *dirty_bitmap;
struct kvm_arch_memory_slot arch;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index e6004e0750fd..4a84f974bc58 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -33,6 +33,7 @@
/* ROE Functionality parameters */
#define ROE_VERSION 0
#define ROE_MPROTECT 1
+#define ROE_MPROTECT_CHUNK 2
/*
* hypercalls use architecture specific
*/
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 423a9c014120..d4f36faacd29 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -555,10 +555,19 @@ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
#ifdef CONFIG_KVM_ROE
- if (!dont)
+ if (!dont) {
+ //TODO still this might leak
+ struct protected_chunk *pos, *n;
+ struct list_head *head = free->prot_list;
kvfree(free->roe_bitmap);
+ kvfree(free->partial_roe_bitmap);
+ list_for_each_entry_safe(pos, n, head, list) {
+ list_del(&pos->list);
+ kvfree(pos);
+ }
+ kvfree(free->prot_list);
+ }
#endif
-
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
kvm_destroy_dirty_bitmap(free);

@@ -805,13 +814,22 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
return 0;
}

-static int kvm_init_roe_bitmap(struct kvm_memory_slot *slot)
+/*
+ * Allocate the per-slot ROE state: the full-page bitmap, the partial-page
+ * bitmap and the (empty) protected-chunk list head.
+ * Returns 0 on success or -ENOMEM, freeing whatever was already allocated.
+ */
+static int kvm_init_roe(struct kvm_memory_slot *slot)
{
#ifdef CONFIG_KVM_ROE
slot->roe_bitmap = kvzalloc(BITS_TO_LONGS(slot->npages) *
sizeof(unsigned long), GFP_KERNEL);
if (!slot->roe_bitmap)
return -ENOMEM;
+ slot->partial_roe_bitmap = kvzalloc(BITS_TO_LONGS(slot->npages) *
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!slot->partial_roe_bitmap) {
+ kvfree(slot->roe_bitmap);
+ return -ENOMEM;
+ }
+ slot->prot_list = kvzalloc(sizeof(struct list_head), GFP_KERNEL);
+ if (!slot->prot_list) {
+ kvfree(slot->partial_roe_bitmap);
+ kvfree(slot->roe_bitmap);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(slot->prot_list);
+
#endif
return 0;
}
@@ -1033,7 +1051,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (kvm_create_dirty_bitmap(&new) < 0)
goto out_free;
}
- if (kvm_init_roe_bitmap(&new) < 0)
+ if (kvm_init_roe(&new) < 0)
goto out_free;

slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
@@ -1287,26 +1305,37 @@ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
return slot->flags & KVM_MEM_READONLY;
}
+#ifdef CONFIG_KVM_ROE
+/* Report whether @gfn's bit is set in the slot's partial ROE bitmap. */
+static bool gfn_is_partially_protected(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	gfn_t rel_gfn = gfn - slot->base_gfn;
+
+	return test_bit(rel_gfn, slot->partial_roe_bitmap);
+}

+/* Report whether @gfn's bit is set in the slot's full-page ROE bitmap. */
+static bool gfn_is_fully_protected(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	gfn_t rel_gfn = gfn - slot->base_gfn;
+
+	return test_bit(rel_gfn, slot->roe_bitmap);
+}
+#endif
static bool gfn_is_readonly(struct kvm_memory_slot *slot, gfn_t gfn)
{
#ifdef CONFIG_KVM_ROE
- return test_bit(gfn - slot->base_gfn, slot->roe_bitmap) ||
- memslot_is_readonly(slot);
+ return gfn_is_fully_protected(slot, gfn) ||
+ gfn_is_partially_protected(slot, gfn) ||
+ memslot_is_readonly(slot);
#else
return memslot_is_readonly(slot);
#endif
}

+
static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t *nr_pages, bool write)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return KVM_HVA_ERR_BAD;
-
if (gfn_is_readonly(slot, gfn) && write)
return KVM_HVA_ERR_RO_BAD;
-
if (nr_pages)
*nr_pages = slot->npages - (gfn - slot->base_gfn);

@@ -1864,14 +1893,55 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
+#ifdef CONFIG_KVM_ROE

+/*
+ * Return true when [gpa, gpa + len) overlaps any chunk on the slot's
+ * protected-chunk list, false when it overlaps none.
+ */
+static bool kvm_roe_protected_range(struct kvm_memory_slot *slot, gpa_t gpa,
+		int len)
+{
+	struct protected_chunk *chunk;
+
+	list_for_each_entry(chunk, slot->prot_list, list) {
+		if (kvm_roe_range_overlap(chunk, gpa, len))
+			return true;
+	}
+	return false;
+}
+/*
+ * Return true when the @len bytes at @offset inside page @gfn touch a
+ * protected chunk. The chunk list is consulted only for pages flagged in
+ * the partial ROE bitmap; any other page yields false.
+ */
+static bool kvm_roe_check_range(struct kvm_memory_slot *slot,
+		gfn_t gfn, int offset, int len)
+{
+	gpa_t gpa;
+
+	if (gfn_is_partially_protected(slot, gfn)) {
+		gpa = (gfn << PAGE_SHIFT) + offset;
+		return kvm_roe_protected_range(slot, gpa, len);
+	}
+	return false;
+}
+#endif
+/*
+ * Translate @gfn to a host virtual address for a write of @len bytes at
+ * @offset within the page.
+ *
+ * With CONFIG_KVM_ROE the write is refused (KVM_HVA_ERR_RO_BAD) when the
+ * slot is read-only, the page is fully protected, or the written bytes
+ * touch a protected chunk; writes to the unprotected bytes of a partially
+ * protected page are let through. A missing or invalid slot yields
+ * KVM_HVA_ERR_BAD before any per-slot ROE state is dereferenced.
+ * Without CONFIG_KVM_ROE this is gfn_to_hva_memslot().
+ */
+static u64 roe_gfn_to_hva(struct kvm_memory_slot *slot, gfn_t gfn, int offset,
+		int len)
+{
+#ifdef CONFIG_KVM_ROE
+	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+		return KVM_HVA_ERR_BAD;
+	if (memslot_is_readonly(slot) || gfn_is_fully_protected(slot, gfn))
+		return KVM_HVA_ERR_RO_BAD;
+	if (kvm_roe_check_range(slot, gfn, offset, len))
+		return KVM_HVA_ERR_RO_BAD;
+	/*
+	 * write=false on purpose: the generic read-only check also rejects
+	 * partially protected pages, which were already vetted per byte above.
+	 */
+	return __gfn_to_hva_many(slot, gfn, NULL, false);
+#else
+	return gfn_to_hva_memslot(slot, gfn);
+#endif
+}
static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
const void *data, int offset, int len)
{
int r;
unsigned long addr;

- addr = gfn_to_hva_memslot(memslot, gfn);
+ addr = roe_gfn_to_hva(memslot, gfn, offset, len);
if (kvm_is_error_hva(addr))
return -EFAULT;
r = __copy_to_user((void __user *)addr + offset, data, len);
--
2.18.1