From: Zhang, Yanmin <yanmin_zhang@xxxxxxxxxxxxxxx>
Based on Ingo's idea, I implemented a paravirt interface for perf to support
statistics collection in the guest os. That means we can run the perf tool in the
guest os directly.

Great thanks to Peter Zijlstra, who is really the architect of this work and gave me
the architecture design suggestions. I also want to thank Yangsheng and LinMing for
their generous help.
The design is:
1) Add a kvm_pmu whose callbacks mostly just issue hypercalls that vmexit to the host kernel;
2) Create a host perf_event for every guest perf_event (the host-side shadow bookkeeping is
sketched right after this list);
3) The host kernel syncs perf_event count/overflow data changes back to the guest perf_event
when it processes perf_event overflows after an NMI arrives, and injects an NMI into the guest
kernel when a guest event overflows (a sketch of this path is at the end of the mail);
4) The guest kernel goes through all enabled events on the current cpu and outputs their data
when they overflow;
5) No change in user space.
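
For reference, the host-side shadow structure used further down, struct kvmperf_event,
is not included in this excerpt. A minimal sketch of what it presumably looks like, with
field names taken from their uses in kvm_pv_perf_op_open() and kvm_sync_event_to_guest()
below and the types inferred:

/*
 * Sketch only: per-guest-event bookkeeping on the host side. The real
 * definition lives elsewhere in the full patch.
 */
struct kvmperf_event {
	struct page *event_page;	/* guest page holding the start of the guest perf_event */
	struct page *event_page2;	/* second page, if the guest perf_event crosses a page boundary */
	unsigned long event_offset;	/* offset of the guest perf_event within event_page */
};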
-#define KVM_MAX_MMU_OP_BATCH 32
+/* Operations for KVM_PERF_OP */
+#define KVM_PERF_OP_OPEN 1
+#define KVM_PERF_OP_CLOSE 2
+#define KVM_PERF_OP_ENABLE 3
+#define KVM_PERF_OP_DISABLE 4
+#define KVM_PERF_OP_START 5
+#define KVM_PERF_OP_STOP 6
+#define KVM_PERF_OP_READ 7
+static bool kvm_reserve_pmc_hardware(void)
+{
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ disable_lapic_nmi_watchdog();
+
+ return true;
+}
+
+static void kvm_release_pmc_hardware(void)
+{
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ enable_lapic_nmi_watchdog();
+}
+
+static int kvm_pmu_enable(struct perf_event *event)
+{
+ int ret;
+ unsigned long addr = __pa(event);
+
+ if (kvm_add_event(event))
+ return -1;
+
+ ret = kvm_hypercall3(KVM_PERF_OP, KVM_PERF_OP_ENABLE,
+ addr, (unsigned long) event->shadow);
+
+ return ret;
+}
+
+int __init kvm_init_hw_perf_events(void)
+{
+ if (!kvm_para_available())
+ return -1;
+
+ x86_pmu.handle_irq = kvm_default_x86_handle_irq;
+
+ pr_cont("KVM PARA PMU driver.\n");
+ register_die_notifier(&kvm_perf_event_nmi_notifier);
+
+ return 0;
+}
+
+static int __kvm_hw_perf_event_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int err;
+ unsigned long result;
+ unsigned long addr;
+
+ err = 0;
+ if (!atomic_inc_not_zero(&active_events)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&active_events) == 0) {
+ if (!kvm_reserve_pmc_hardware())
+ err = -EBUSY;
+ }
+ if (!err)
+ atomic_inc(&active_events);
+ mutex_unlock(&pmc_reserve_mutex);
+ if (err)
+ return err;
+ }
+
+ event->destroy = kvm_hw_perf_event_destroy;
+
+ hwc->idx = -1;
+ hwc->last_cpu = -1;
+ hwc->last_tag = ~0ULL;
+
+ addr = __pa(event);
+ result = kvm_hypercall3(KVM_PERF_OP, KVM_PERF_OP_OPEN, addr, 0);
+
+ if (result)
+ event->shadow = (void *) result;
+ else
+ err = -1;
+
+ return err;
+}
+
+static void kvm_sync_event_to_guest(struct perf_event *event, int overflows)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct kvmperf_event *kvmevent;
+ int offset, len, data_len, copied, page_offset;
+ struct page *event_page;
+ void *shared_kaddr;
+
+ kvmevent = event->shadow;
+ offset = kvmevent->event_offset;
+
+ /* Copy perf_event->count firstly */
+ offset += offsetof(struct perf_event, count);
+ if (offset < PAGE_SIZE) {
+ event_page = kvmevent->event_page;
+ page_offset = offset;
+ } else {
+ event_page = kvmevent->event_page2;
+ page_offset = offset - PAGE_SIZE;
+ }
+ shared_kaddr = kmap_atomic(event_page, KM_USER0);
+ *((atomic64_t *)(shared_kaddr + page_offset)) = event->count;
+
+ offset = kvmevent->event_offset;
+ offset += offsetof(struct perf_event, hw);
+ /* Remap if the hw field sits on the other guest page than the one mapped above */
+ if (offset < PAGE_SIZE) {
+ if (event_page == kvmevent->event_page2) {
+ kunmap_atomic(shared_kaddr, KM_USER0);
+ event_page = kvmevent->event_page;
+ shared_kaddr = kmap_atomic(event_page, KM_USER0);
+ }
+ page_offset = offset;
+ } else {
+ if (event_page == kvmevent->event_page) {
+ kunmap_atomic(shared_kaddr, KM_USER0);
+ event_page = kvmevent->event_page2;
+ shared_kaddr = kmap_atomic(event_page, KM_USER0);
+ }
+ page_offset = offset - PAGE_SIZE;
+ }
+
+ if (overflows)
+ atomic_add(overflows, (atomic_t *)(shared_kaddr + page_offset));
+
+ kunmap_atomic(shared_kaddr, KM_USER0);
+#if 0
+ offset += offsetof(struct hw_perf_event, prev_count);
+ data_len = sizeof(struct hw_perf_event) -
+ offsetof(struct hw_perf_event, prev_count);
+ if (event_page == kvmevent->event_page2) {
+ page_offset += offsetof(struct hw_perf_event, prev_count);
+ memcpy(shared_kaddr + page_offset,
+ &hwc->prev_count, data_len);
+ kunmap_atomic(shared_kaddr, KM_USER0);
+
+ return;
+ }
+
+ copied = 0;
+ if (offset < PAGE_SIZE) {
+ len = PAGE_SIZE - offset;
+ if (len > data_len)
+ len = data_len;
+ memcpy(shared_kaddr + offset,
+ &hwc->prev_count, len);
+ copied = len;
+ page_offset = 0;
+ } else
+ page_offset = offset - PAGE_SIZE;
+
+ kunmap_atomic(shared_kaddr, KM_USER0);
+ len = data_len - copied;
+ if (len) {
+ /* Copy across pages */
+ shared_kaddr = kmap_atomic(kvmevent->event_page2, KM_USER0);
+ memcpy(shared_kaddr + page_offset,
+ ((void *)&hwc->prev_count) + copied, len);
+ kunmap_atomic(shared_kaddr, KM_USER0);
+ }
+#endif
+}
+
+static struct perf_event *
+kvm_pv_perf_op_open(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ int ret;
+ struct perf_event *event;
+ struct perf_event *host_event = NULL;
+ struct kvmperf_event *shadow = NULL;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ goto out;
+
+ shadow = kzalloc(sizeof(*shadow), GFP_KERNEL);
+ if (!shadow)
+ goto out;
+
+ shadow->event_page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
+ shadow->event_offset = addr & ~PAGE_MASK;
+ if (shadow->event_offset + sizeof(struct perf_event) > PAGE_SIZE) {
+ shadow->event_page2 = gfn_to_page(vcpu->kvm,
+ (addr >> PAGE_SHIFT) + 1);
+ }
+
+ ret = kvm_read_guest(vcpu->kvm, addr, event, sizeof(*event));
+ if (ret)
+ goto out;
+
+ /*
+ * By default, we disable the host event. Later on, guest os
+ * triggers a perf_event_attach to enable it.
+ */
+ event->attr.disabled = 1;
+ event->attr.inherit = 0;
+ event->attr.enable_on_exec = 0;
+ /*
+ * We don't support exclude modes of user and kernel for the guest os,
+ * which means we always collect both user and kernel data for the guest os.
+ */
+ event->attr.exclude_user = 0;
+ event->attr.exclude_kernel = 0;
+ /* We always create a cpu context host perf event */
+
+ host_event = perf_event_create_kernel_counter(&event->attr, -1,
+ current->pid, kvm_perf_event_overflow);
+
+ if (IS_ERR(host_event)) {
+ host_event = NULL;
+ goto out;
+ }
+ host_event->shadow = shadow;
+
+out:
+ if (!host_event)
+ kfree(shadow);
+ kfree(event);
+
+ return host_event;
+}
+
+
+int kvm_pv_perf_op(struct kvm_vcpu *vcpu, int op_code, unsigned long a1,
+ unsigned long a2, unsigned long *result)
+{
+ unsigned long ret;
+ struct perf_event *host_event;
+ gpa_t addr;
+
+ addr = (gpa_t)(a1);
+
+ switch (op_code) {
+ case KVM_PERF_OP_OPEN:
+ ret = (unsigned long) kvm_pv_perf_op_open(vcpu, addr);
+ break;
+ case KVM_PERF_OP_CLOSE:
+ host_event = (struct perf_event *) a2;
@@ -4052,6 +4054,16 @@ static unsigned long kvm_get_guest_ip(void)
return ip;
}
+int kvm_notify_event_overflow(void)
+{
+ if (percpu_read(current_vcpu)) {
+ kvm_inject_nmi(percpu_read(current_vcpu));
+ return 0;
+ }
+
+ return -1;
+}
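
The overflow handler installed in kvm_pv_perf_op_open() above, kvm_perf_event_overflow(),
is referenced but not included in this excerpt. Going by design points 3) and 4), it
presumably just pushes the updated count/overflow data into the guest's pages and then
requests an NMI in the guest. A rough sketch (the handler signature is assumed to match
perf_overflow_handler_t of this kernel generation):

/*
 * Sketch only: ties together the two helpers shown above. The real
 * kvm_perf_event_overflow() in the full patch may do more bookkeeping.
 */
static void kvm_perf_event_overflow(struct perf_event *event, int nmi,
			struct perf_sample_data *data, struct pt_regs *regs)
{
	/* Mirror the new count and one more overflow into the guest perf_event. */
	kvm_sync_event_to_guest(event, 1);

	/* Inject an NMI into the vcpu currently running on this cpu, if any. */
	kvm_notify_event_overflow();
}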