[RFC 3/3] KVM, HWPoison, unpoison address across rebooting

From: Huang Ying
Date: Tue Dec 21 2010 - 21:53:12 EST


In the HWPoison processing code, not only is the struct page
corresponding to the faulty physical memory page marked as HWPoison,
but the virtual addresses in the processes mapping that page are
marked as HWPoison as well. As a result, any further access to such a
virtual address kills the corresponding process with SIGBUS.

If the faulty physical memory page is used by a KVM guest, the SIGBUS
is sent to QEMU, and QEMU simulates an MCE to report the memory error
to the guest OS. If the guest OS cannot recover from the error (for
example, because the page is accessed by kernel code), the guest OS
reboots the system. But because the underlying host virtual address
backing the guest physical memory is still poisoned, if the guest
accesses the corresponding guest physical memory after rebooting, the
SIGBUS is sent to QEMU again and another MCE is simulated. That is,
the guest system cannot recover by rebooting.

In fact, the contents of a guest physical memory page need not be
preserved across a reboot. We can allocate a new host physical page
to back the corresponding guest physical address.

To do that, this patch provides a mechanism in KVM to "unpoison" a
poisoned virtual address by clearing the corresponding PTE. When the
guest reboots, QEMU can unpoison the poisoned virtual address, and
when the unpoisoned memory is accessed again, a new physical memory
page is allocated if possible.

Signed-off-by: Huang Ying <ying.huang@xxxxxxxxx>
---
include/linux/kvm.h | 1 +
include/linux/mm.h | 8 ++++++++
mm/memory-failure.c | 39 +++++++++++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 14 ++++++++++++++
4 files changed, 62 insertions(+)

--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -676,6 +676,7 @@ struct kvm_clock_data {
#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
/* Available with KVM_CAP_PPC_GET_PVINFO */
#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
+#define KVM_UNPOISON_ADDRESS _IO(KVMIO, 0xa2) /* arg: address to unpoison */

/*
* ioctls for vcpu fds
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1512,6 +1512,14 @@ extern int sysctl_memory_failure_recover
extern void shake_page(struct page *p, int access);
extern atomic_long_t mce_bad_pages;
extern int soft_offline_page(struct page *page, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+int unpoison_address(unsigned long addr);
+#else /* !CONFIG_MEMORY_FAILURE */
+static inline int unpoison_address(unsigned long addr)
+{
+ return -EINVAL; /* stub: always fails without CONFIG_MEMORY_FAILURE */
+}
+#endif /* CONFIG_MEMORY_FAILURE */

extern void dump_page(struct page *page);

--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1433,3 +1433,56 @@ done:
/* keep elevated page count for bad page */
return ret;
}
+
+/**
+ * unpoison_address - clear the HWPoison PTE that maps @addr in current->mm
+ * @addr: virtual address whose PTE should be cleared
+ *
+ * Walks the page tables of current->mm and, if the PTE for @addr holds a
+ * HWPoison swap entry, clears it. Huge mappings are rejected.
+ *
+ * The caller in this patch, kvm_unpoison_address(), holds
+ * current->mm->mmap_sem for reading around the call.
+ *
+ * Return: 0 if a HWPoison entry was cleared, -EINVAL otherwise.
+ */
+int unpoison_address(unsigned long addr)
+{
+	struct mm_struct *mm = current->mm;
+	pgd_t *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t pte, *ptep;
+	spinlock_t *ptl;
+	swp_entry_t entry;
+	int rc = -EINVAL;
+
+	pgdp = pgd_offset(mm, addr);
+	if (!pgd_present(*pgdp))
+		return -EINVAL;
+	pudp = pud_offset(pgdp, addr);
+	pud = *pudp;
+	/* NOTE(review): pud_large()/pmd_large() may not exist on all archs */
+	if (!pud_present(pud) || pud_large(pud))
+		return -EINVAL;
+	pmdp = pmd_offset(pudp, addr);
+	pmd = *pmdp;
+	/* can not unpoison huge page yet */
+	if (!pmd_present(pmd) || pmd_large(pmd))
+		return -EINVAL;
+	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	pte = *ptep;
+	/* Only a swap-type PTE carrying a HWPoison entry may be cleared. */
+	if (!is_swap_pte(pte))
+		goto out;
+	entry = pte_to_swp_entry(pte);
+	if (!is_hwpoison_entry(entry))
+		goto out;
+	pte_clear(mm, addr, ptep);
+	/* The poison entry is gone; report success to the caller. */
+	rc = 0;
+out:
+	pte_unmap_unlock(ptep, ptl);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(unpoison_address);
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -774,6 +774,17 @@ int kvm_vm_ioctl_set_memory_region(struc
return kvm_set_memory_region(kvm, mem, user_alloc);
}

+static int kvm_unpoison_address(struct kvm *kvm, unsigned long address)
+{
+	struct mm_struct *mm = current->mm;	/* @kvm itself is not used */
+	int ret;
+
+	down_read(&mm->mmap_sem);	/* held for reading around the call */
+	ret = unpoison_address(address);
+	up_read(&mm->mmap_sem);
+	return ret;
+}
+
int kvm_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log, int *is_dirty)
{
@@ -1728,6 +1739,9 @@ static long kvm_vm_ioctl(struct file *fi
mutex_unlock(&kvm->lock);
break;
#endif
+ case KVM_UNPOISON_ADDRESS:
+ r = kvm_unpoison_address(kvm, arg);
+ break;
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
if (r == -ENOTTY)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/