[RFC] kvm: Adding skeleton for Memory ROE

From: Ahmed Abd El Mawgood
Date: Mon Jul 16 2018 - 15:37:07 EST


This is my first patch, an attempt to implement Memory ROE (Read-Only
Enforcement), which I discussed earlier as a way to prevent rootkits. I have
already explained it in detail in this thread:
https://www.mail-archive.com/kernelnewbies@xxxxxxxxxxxxxxxxx/msg18826.html
so I think there is no need to repeat the same thing here.
The problem is that the code isn't working and I can't figure out why.

I tried implementing the protection so that it behaves similarly to
KVM_MEM_READONLY, but at the page (SPTE) level.
The current problem I am facing is that, when handling the hypercall,
vcpu->mode turns out to be OUTSIDE_GUEST_MODE but KVM_REQ_TLB_FLUSH doesn't
seem to be handled correctly. The KVM documentation promises that when a VCPU
is not in GUEST_MODE, pending VCPU requests are handled as soon as possible,
and that kvm_vcpu_kick(vcpu) will even force that, but this doesn't seem to be
the case for me. This is the kind of logging I am getting:

[3556.312299] kvm_mmu_slot_apply_flags: visited
[3556.312301] kvm_mmu_slot_apply_write_access: Flush = false
[3557.034243] gfn_is_readonly: test_bit = 0
[3557.034251] gfn_is_readonly: test_bit = 0
[3557.034254] gfn_is_readonly: test_bit = 0
[3557.034463] Hypercall received, page address 0x0
[3557.034466] gfn_is_readonly: test_bit = 0
[3557.034469] kvm_mroe: flush state = Done
[3557.034472] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034475] Setting page number 0 in slot number 0
[3557.034480] slot_rmap_apply_protection: The 0th page is readonly, Flush = True
[3557.034483] kvm_mmu_slot_apply_write_access: Flush = true
[3557.034486] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034488] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034490] kvm_mroe: flush state = Waiting

For some reason kvm_vcpu_kick() didn't force the KVM_REQ_TLB_FLUSH request to
be processed by the virtual CPU (I am referring to the last two log lines).

I am aware that there is still a lot missing (like dealing with malicious
guest remappings) and that the code quality is poor, but any ideas about what
I could be doing wrong (or ideas in general) would be appreciated. I am
already planning to do everything cleanly once it works.

Thanks.

Signed-off-by: Ahmed Abd El Mawgood <ahmedsoliman0x666@xxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 7 ++-
arch/x86/kvm/Kconfig | 7 +++
arch/x86/kvm/mmu.c | 127 +++++++++++++++++++++++++++-------------
arch/x86/kvm/x86.c | 83 ++++++++++++++++++++++++--
include/linux/kvm_host.h | 17 ++++++
include/uapi/linux/kvm_para.h | 4 +-
virt/kvm/kvm_main.c | 36 +++++++++---
7 files changed, 226 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..c66e9245f750 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -235,7 +235,10 @@ struct kvm_mmu_memory_cache {
int nobjs;
void *objects[KVM_NR_MEM_OBJS];
};
-
+struct kvm_write_access_data {
+ int i; /* running bit index into memslot->mroe_bitmap; callers start it at 0 */
+ struct kvm_memory_slot *memslot; /* slot whose mroe_bitmap and flags are read */
+};
/*
* the pages used as guest page table on soft mmu are tracked by
* kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
@@ -1130,7 +1133,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 acc_track_mask, u64 me_mask);

void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 92fd433c50b9..8ae822a8dc7a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -96,6 +96,13 @@ config KVM_MMU_AUDIT
This option adds a R/W kVM module parameter 'mmu_audit', which allows
auditing of KVM MMU events at runtime.

+config KVM_MROE
+ bool "Hypercall Memory Read-Only Enforcement"
+ depends on KVM && X86
+ help
+ This option adds the KVM_HC_HMROE hypercall to KVM as a hardening
+ mechanism to protect memory pages from being modified.
+
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..946545b8b8cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,7 +70,7 @@ enum {
#undef MMU_DEBUG

#ifdef MMU_DEBUG
-static bool dbg = 0;
+static bool dbg = 1;
module_param(dbg, bool, 0644);

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
@@ -1402,7 +1402,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
static bool spte_write_protect(u64 *sptep, bool pt_protect)
{
u64 spte = *sptep;
-
if (!is_writable_pte(spte) &&
!(pt_protect && spte_can_locklessly_be_made_writable(spte)))
return false;
@@ -1418,15 +1417,23 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)

static bool __rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
- bool pt_protect)
+ bool pt_protect,
+ struct kvm_write_access_data *d)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
-
- for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_write_protect(sptep, pt_protect);
-
+ /* NULL d: plain write-protect. Else drop pt_protect on mroe_bitmap bits. */
+ if (d == NULL) {
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ flush |= spte_write_protect(sptep, pt_protect);
+ return flush;
+ }
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ flush |= spte_write_protect(sptep, pt_protect &&
+ !test_bit(d->i, d->memslot->mroe_bitmap));
+ d->i++;
+ }
return flush;
}

@@ -1457,7 +1464,8 @@ static bool wrprot_ad_disabled_spte(u64 *sptep)
* - W bit on ad-disabled SPTEs.
* Returns true iff any D or W bits were cleared.
*/
-static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ void *data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1483,7 +1491,8 @@ static bool spte_set_dirty(u64 *sptep)
return mmu_spte_update(sptep, spte);
}

-static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ void *data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1515,7 +1524,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PT_PAGE_TABLE_LEVEL, slot);
- __rmap_write_protect(kvm, rmap_head, false);
+ __rmap_write_protect(kvm, rmap_head, false, NULL);

/* clear the first set bit */
mask &= mask - 1;
@@ -1541,7 +1550,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PT_PAGE_TABLE_LEVEL, slot);
- __rmap_clear_dirty(kvm, rmap_head);
+ __rmap_clear_dirty(kvm, rmap_head, NULL);

/* clear the first set bit */
mask &= mask - 1;
@@ -1591,10 +1600,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head;
int i;
bool write_protected = false;
-
+ struct kvm_write_access_data data = {
+ .i = 0,
+ .memslot = slot,
+ };
for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ write_protected |= __rmap_write_protect(kvm, rmap_head, true,
+ &data);
}

return write_protected;
@@ -1608,7 +1621,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
}

-static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ void *data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1628,7 +1642,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
- return kvm_zap_rmapp(kvm, rmap_head);
+ return kvm_zap_rmapp(kvm, rmap_head, NULL);
}

static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -5086,13 +5100,15 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
}

/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+typedef bool (*slot_level_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head, void *data);

/* The caller should hold mmu-lock before calling this function. */
static __always_inline bool
slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+ gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb,
+ void *data)
{
struct slot_rmap_walk_iterator iterator;
bool flush = false;
@@ -5100,7 +5116,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap);
+ flush |= fn(kvm, iterator.rmap, data);

if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
if (flush && lock_flush_tlb) {
@@ -5122,36 +5138,36 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
static __always_inline bool
slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- bool lock_flush_tlb)
+ bool lock_flush_tlb, void *data)
{
return slot_handle_level_range(kvm, memslot, fn, start_level,
end_level, memslot->base_gfn,
memslot->base_gfn + memslot->npages - 1,
- lock_flush_tlb);
+ lock_flush_tlb, data);
}

static __always_inline bool
slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+ slot_level_handler fn, bool lock_flush_tlb, void *data)
{
return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
- PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
}

static __always_inline bool
slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+ slot_level_handler fn, bool lock_flush_tlb, void *data)
{
return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
- PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
}

static __always_inline bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+ slot_level_handler fn, bool lock_flush_tlb, void *data)
{
return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
- PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+ PT_PAGE_TABLE_LEVEL, lock_flush_tlb, data);
}

void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
@@ -5173,7 +5189,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)

slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true);
+ start, end - 1, true, NULL);
}
}

@@ -5181,23 +5197,52 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
}

static bool slot_rmap_write_protect(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head)
+ struct kvm_rmap_head *rmap_head,
+ void *data)
{
- return __rmap_write_protect(kvm, rmap_head, false);
+ return __rmap_write_protect(kvm, rmap_head, false,
+ (struct kvm_write_access_data *)data);
}

-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+/* Run spte_write_protect() on each SPTE of @rmap_head; @data is a kvm_write_access_data. */
+static bool slot_rmap_apply_protection(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head, void *data)
+{
+ struct kvm_write_access_data *wad = data;
+ struct kvm_memory_slot *slot = wad->memslot;
+ bool slot_ro = slot->flags & KVM_MEM_READONLY;
+ struct rmap_iterator iter;
+ bool flush = false;
+ u64 *sptep;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ /* NOTE(review): index advances per SPTE, yet bits are set per gfn offset. */
+ bool page_ro = test_bit(wad->i, slot->mroe_bitmap);
+ flush |= spte_write_protect(sptep, !page_ro && !slot_ro);
+ if (page_ro)
+ pr_info("%s: The %dth page is readonly, Flush = %s\n",
+ __func__, wad->i, flush ? "True" : "False");
+ wad->i++;
+ }
+ return flush;
+}
+
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
bool flush;
-
+ struct kvm_write_access_data data = {
+ .i = 0,
+ .memslot = memslot,
+ };
spin_lock(&kvm->mmu_lock);
- flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
- false);
+ flush = slot_handle_all_level(kvm, memslot, slot_rmap_apply_protection,
+ false, &data);
+ pr_info("%s: Flush = %s\n", __func__, flush ? "true":"false");
spin_unlock(&kvm->mmu_lock);

/*
- * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+ * kvm_mmu_slot_apply_write_access() and kvm_vm_ioctl_get_dirty_log()
* which do tlb flush out of mmu-lock should be serialized by
* kvm->slots_lock otherwise tlb flush would be missed.
*/
@@ -5219,7 +5264,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}

static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head)
+ struct kvm_rmap_head *rmap_head,
+ void *data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -5257,7 +5303,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
spin_lock(&kvm->mmu_lock);
slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
- kvm_mmu_zap_collapsible_spte, true);
+ kvm_mmu_zap_collapsible_spte, true, NULL);
spin_unlock(&kvm->mmu_lock);
}

@@ -5267,7 +5313,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
bool flush;

spin_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+ flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false, NULL);
spin_unlock(&kvm->mmu_lock);

lockdep_assert_held(&kvm->slots_lock);
@@ -5290,10 +5336,10 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,

spin_lock(&kvm->mmu_lock);
flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
- false);
+ false, NULL);
spin_unlock(&kvm->mmu_lock);

- /* see kvm_mmu_slot_remove_write_access */
+ /* see kvm_mmu_slot_apply_write_access */
lockdep_assert_held(&kvm->slots_lock);

if (flush)
@@ -5307,7 +5353,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
bool flush;

spin_lock(&kvm->mmu_lock);
- flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+ flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false,
+ NULL);
spin_unlock(&kvm->mmu_lock);

lockdep_assert_held(&kvm->slots_lock);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0046aa70205a..96e967199fda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,7 +55,7 @@
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/mem_encrypt.h>
-
+#include <linux/mempolicy.h>
#include <trace/events/kvm.h>

#include <asm/debugreg.h>
@@ -4177,7 +4177,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)

/*
* All the TLBs can be flushed out of mmu lock, see the comments in
- * kvm_mmu_slot_remove_write_access().
+ * kvm_mmu_slot_apply_write_access().
*/
lockdep_assert_held(&kvm->slots_lock);
if (is_dirty)
@@ -6669,7 +6669,74 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
return ret;
}
#endif
+#ifdef CONFIG_KVM_MROE
+/* Mark the frame holding @gpa read-only in its memslot; 0 on success, -1 on error. */
+static int roe_protect_frame(struct kvm *kvm, gpa_t gpa)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+ /* No slot, or gfn outside [base_gfn, base_gfn + npages): reject. */
+ if (!slot || gfn < slot->base_gfn ||
+ gfn >= slot->base_gfn + slot->npages) {
+ pr_err("You have an overflow\n");
+ return -1;
+ }
+ pr_info("Setting page number %llu in slot number %d\n",
+ gfn - slot->base_gfn, slot->id);
+ /* TODO: the bitmap is modified here without any lock held; the */
+ /* region should be locked before the bit is written. */
+ set_bit(gfn - slot->base_gfn, slot->mroe_bitmap);
+ kvm_mmu_slot_apply_write_access(kvm, slot);
+ return 0;
+}
+/* Log vcpu->mode by name; "Unknown" for any value not listed below. */
+void debug_cpu_mode(struct kvm_vcpu *vcpu)
+{
+ const char *name = "Unknown";
+
+ switch (vcpu->mode) {
+ case OUTSIDE_GUEST_MODE: name = "OUTSIDE_GUEST_MODE"; break;
+ case IN_GUEST_MODE: name = "IN_GUEST_MODE"; break;
+ case EXITING_GUEST_MODE: name = "EXITING_GUEST_MODE"; break;
+ case READING_SHADOW_PAGE_TABLES: name = "READING_SHADOW_PAGE_TABLES"; break;
+ default: break;
+ }
+ pr_info("kvm_mroe: cpu mode = %s\n", name);
+}
+/* KVM_HC_HMROE handler. Returns -EINVAL for an unaligned @gva, -KVM_EROEDUPLICATR
+ * if access_ok() rejects the hva, otherwise the result of roe_protect_frame(). */
+static int kvm_mroe(struct kvm_vcpu *vcpu, u64 gva)
+{
+ gpa_t gpa;
+ u64 hva;
+ int ret;

+ //XXX check that the hypercall is done from kernel mode
+ if (gva & ~PAGE_MASK)
+ return -EINVAL;
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+ hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
+ //XXX This doesn't work but it will be ok to check that we can access
+ // the address and make sure that the mapping makes sense
+ if (!access_ok(VERIFY_WRITE, hva, PAGE_SIZE)) {
+ pr_info("Duplicate request\n");
+ return -KVM_EROEDUPLICATR;
+ }
+ /* Peek with kvm_test_request(); kvm_check_request() would also clear
+ * the pending KVM_REQ_TLB_FLUSH, so it must not be used for logging. */
+ pr_info("%s: flush state = %s\n", __func__,
+ kvm_test_request(KVM_REQ_TLB_FLUSH, vcpu) ? "Waiting" : "Done");
+ debug_cpu_mode(vcpu);
+ ret = roe_protect_frame(vcpu->kvm, gpa);
+ debug_cpu_mode(vcpu);
+ kvm_vcpu_kick(vcpu);
+ debug_cpu_mode(vcpu);
+ pr_info("%s: flush state = %s\n", __func__,
+ kvm_test_request(KVM_REQ_TLB_FLUSH, vcpu) ? "Waiting" : "Done");
+ return ret;
+}
+#endif
/*
* kvm_pv_kick_cpu_op: Kick a vcpu.
*
@@ -6737,6 +6804,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_CLOCK_PAIRING:
ret = kvm_pv_clock_pairing(vcpu, a0, a1);
break;
+#endif
+#ifdef CONFIG_KVM_MROE
+ case KVM_HC_HMROE:
+ pr_info("Hypercall received, page address 0x%lx\n", a0);
+ ret = kvm_mroe(vcpu, a0);
+ break;
#endif
default:
ret = -KVM_ENOSYS;
@@ -8971,8 +9044,10 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
struct kvm_memory_slot *new)
{
/* Still write protect RO slot */
+ pr_info("%s: visited\n", __func__);
+ kvm_mmu_slot_apply_write_access(kvm, new);
+ return;
if (new->flags & KVM_MEM_READONLY) {
- kvm_mmu_slot_remove_write_access(kvm, new);
return;
}

@@ -9010,7 +9085,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if (kvm_x86_ops->slot_enable_log_dirty)
kvm_x86_ops->slot_enable_log_dirty(kvm, new);
else
- kvm_mmu_slot_remove_write_access(kvm, new);
+ kvm_mmu_slot_apply_write_access(kvm, new);
} else {
if (kvm_x86_ops->slot_disable_log_dirty)
kvm_x86_ops->slot_disable_log_dirty(kvm, new);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4ee7bc548a83..1ca6db7b8931 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -7,6 +7,7 @@
*/

#include <linux/types.h>
+#include <linux/hashtable.h>
#include <linux/hardirq.h>
#include <linux/list.h>
#include <linux/mutex.h>
@@ -297,6 +298,9 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
+#ifdef CONFIG_KVM_MROE
+ unsigned long *mroe_bitmap;
+#endif
unsigned long *dirty_bitmap;
struct kvm_arch_memory_slot arch;
unsigned long userspace_addr;
@@ -387,6 +391,13 @@ struct kvm_memslots {
int used_slots;
};

+#ifdef CONFIG_KVM_MROE
+struct roe_page {
+ void *page_start; /* NOTE(review): never read or written anywhere in this patch */
+ struct hlist_node hash_list; /* presumably links into kvm->roe_pages — TODO confirm */
+};
+#endif
+
struct kvm {
spinlock_t mmu_lock;
struct mutex slots_lock;
@@ -440,6 +451,12 @@ struct kvm {
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
#endif
+
+#ifdef CONFIG_KVM_MROE
+ //TODO tune hash size;
+ #define KVM_MROE_HASH_SIZE 8
+ DECLARE_HASHTABLE(roe_pages, KVM_MROE_HASH_SIZE);
+#endif
long tlbs_dirty;
struct list_head devices;
struct dentry *debugfs_dentry;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index dcf629dd2889..2be960477649 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -17,6 +17,8 @@
#define KVM_EPERM EPERM
#define KVM_EOPNOTSUPP 95

+#define KVM_EROEDUPLICATR 1
+
#define KVM_HC_VAPIC_POLL_IRQ 1
#define KVM_HC_MMU_OP 2
#define KVM_HC_FEATURES 3
@@ -26,7 +28,7 @@
#define KVM_HC_MIPS_EXIT_VM 7
#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
#define KVM_HC_CLOCK_PAIRING 9
-
+#define KVM_HC_HMROE 10
/*
* hypercalls use architecture specific
*/
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b47507faab5..ca1b95a16a8b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -634,7 +634,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
mutex_init(&kvm->slots_lock);
refcount_set(&kvm->users_count, 1);
INIT_LIST_HEAD(&kvm->devices);
-
r = kvm_arch_init_vm(kvm, type);
if (r)
goto out_err_no_disable;
@@ -794,6 +793,17 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
return 0;
}

+/* Give @slot a zeroed mroe_bitmap sized from slot->npages; 0 or -ENOMEM. */
+static int kvm_init_mroe_bitmap(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_KVM_MROE
+ size_t nbytes = BITS_TO_LONGS(slot->npages) * sizeof(unsigned long);
+ slot->mroe_bitmap = kvzalloc(nbytes, GFP_KERNEL);
+ return slot->mroe_bitmap ? 0 : -ENOMEM;
+#endif
+ return 0;
+}
+
/*
* Insert memslot and re-sort memslots based on their GFN,
* so binary search could be used to lookup GFN.
@@ -1011,7 +1021,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (kvm_create_dirty_bitmap(&new) < 0)
goto out_free;
}
-
+ if (kvm_init_mroe_bitmap(&new) < 0)
+ goto out_free;
slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!slots)
goto out_free;
@@ -1263,16 +1274,25 @@ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
return slot->flags & KVM_MEM_READONLY;
}
-
+/* True when @gfn's bit is set in slot->mroe_bitmap or the slot is read-only. */
+static bool gfn_is_readonly(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+#ifdef CONFIG_KVM_MROE
+ bool roe = test_bit(gfn - slot->base_gfn, slot->mroe_bitmap);
+ /* Debug trace; newline-terminated so log records do not run together. */
+ pr_info("%s: test_bit = %d\n", __func__, roe);
+ return roe || memslot_is_readonly(slot);
+#else
+ return memslot_is_readonly(slot);
+#endif
+}
static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t *nr_pages, bool write)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return KVM_HVA_ERR_BAD;
-
- if (memslot_is_readonly(slot) && write)
+ if (gfn_is_readonly(slot, gfn) && write)
return KVM_HVA_ERR_RO_BAD;
-
if (nr_pages)
*nr_pages = slot->npages - (gfn - slot->base_gfn);

@@ -1314,7 +1334,7 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);

if (!kvm_is_error_hva(hva) && writable)
- *writable = !memslot_is_readonly(slot);
+ *writable = !gfn_is_readonly(slot, gfn);

return hva;
}
@@ -1554,7 +1574,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
}

/* Do not map writable pfn in the readonly memslot. */
- if (writable && memslot_is_readonly(slot)) {
+ if (writable && gfn_is_readonly(slot, gfn)) {
*writable = false;
writable = NULL;
}
--
2.16.4