Re: [PART1 RFC v2 05/10] KVM: x86: Detect and Initialize AVIC support

From: Paolo Bonzini
Date: Mon Mar 07 2016 - 11:41:36 EST




On 04/03/2016 21:46, Suravee Suthikulpanit wrote:

> @@ -162,6 +170,37 @@ struct vcpu_svm {
>
> /* cached guest cpuid flags for faster access */
> bool nrips_enabled : 1;
> +
> + struct page *avic_bk_page;
> + void *in_kernel_lapic_regs;
> +};
> +
> +struct __attribute__ ((__packed__))
> +svm_avic_log_ait_entry {
> + u32 guest_phy_apic_id : 8,
> + res : 23,
> + valid : 1;
> +};
> +
> +struct __attribute__ ((__packed__))
> +svm_avic_phy_ait_entry {
> + u64 host_phy_apic_id : 8,
> + res1 : 4,
> + bk_pg_ptr : 40,
> + res2 : 10,
> + is_running : 1,
> + valid : 1;
> +};

Please don't use bitfields.

> +/* Note: This structure is per VM */
> +struct svm_vm_data {
> + atomic_t count;
> + u32 ldr_mode;
> + u32 avic_max_vcpu_id;
> + u32 avic_tag;
> +
> + struct page *avic_log_ait_page;
> + struct page *avic_phy_ait_page;

You can put these directly in kvm_arch. Do not use abbreviations:

struct page *avic_logical_apic_id_table_page;
struct page *avic_physical_apic_id_table_page;

> @@ -1000,6 +1057,41 @@ static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
> mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> }
>
> +static void avic_init_vmcb(struct vcpu_svm *svm)
> +{
> + void *vapic_bkpg;
> + struct vmcb *vmcb = svm->vmcb;
> + struct svm_vm_data *vm_data = svm->vcpu.kvm->arch.arch_data;
> + phys_addr_t bpa = PFN_PHYS(page_to_pfn(svm->avic_bk_page));
> + phys_addr_t lpa = PFN_PHYS(page_to_pfn(vm_data->avic_log_ait_page));
> + phys_addr_t ppa = PFN_PHYS(page_to_pfn(vm_data->avic_phy_ait_page));

Please use page_to_phys.

> + if (!vmcb)
> + return;
> +
> + pr_debug("%s: bpa=%#llx, lpa=%#llx, ppa=%#llx\n",
> + __func__, (unsigned long long)bpa,
> + (unsigned long long)lpa, (unsigned long long)ppa);

No pr_debug.

> +
> + /*
> + * Here we copy the value from the emulated LAPIC register
> + * to the vAPIC backign page, and then flip regs frame.
> + */
> + if (!svm->in_kernel_lapic_regs)
> + svm->in_kernel_lapic_regs = svm->vcpu.arch.apic->regs;
> +
> + vapic_bkpg = pfn_to_kaddr(page_to_pfn(svm->avic_bk_page));

Please use kmap instead.

> + memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE);
> + svm->vcpu.arch.apic->regs = vapic_bkpg;

Can you explain the flipping logic, and why you cannot just use the
existing apic.regs?

> + vmcb->control.avic_bk_page = bpa & AVIC_HPA_MASK;

No abbreviations ("avic_backing_page").

> + vmcb->control.avic_log_apic_id = lpa & AVIC_HPA_MASK;
> + vmcb->control.avic_phy_apic_id = ppa & AVIC_HPA_MASK;
> + vmcb->control.avic_phy_apic_id |= AVIC_PHY_APIC_ID_MAX;
> + vmcb->control.avic_enable = 1;
> +}
> +
> static void init_vmcb(struct vcpu_svm *svm)
> {
> struct vmcb_control_area *control = &svm->vmcb->control;
> @@ -1113,6 +1205,293 @@ static void init_vmcb(struct vcpu_svm *svm)
> mark_all_dirty(svm->vmcb);
>
> enable_gif(svm);
> +
> + if (avic)
> + avic_init_vmcb(svm);
> +}
> +
> +static struct svm_avic_phy_ait_entry *
> +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index)
> +{
> + struct svm_avic_phy_ait_entry *avic_phy_ait;
> + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
> +
> + if (!vm_data)
> + return NULL;
> +
> + /* Note: APIC ID = 0xff is used for broadcast.
> + * APIC ID > 0xff is reserved.
> + */
> + if (index >= 0xff)
> + return NULL;
> +
> + avic_phy_ait = page_address(vm_data->avic_phy_ait_page);
> +
> + return &avic_phy_ait[index];
> +}
> +
> +struct svm_avic_log_ait_entry *
> +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat)
> +{
> + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
> + int index;
> + struct svm_avic_log_ait_entry *avic_log_ait;
> +
> + if (!vm_data)
> + return NULL;
> +
> + if (is_flat) { /* flat */
> + if (mda > 7)
> + return NULL;
> + index = mda;
> + } else { /* cluster */
> + int apic_id = mda & 0xf;
> + int cluster_id = (mda & 0xf0) >> 8;
> +
> + if (apic_id > 4 || cluster_id >= 0xf)
> + return NULL;
> + index = (cluster_id << 2) + apic_id;
> + }
> + avic_log_ait = (struct svm_avic_log_ait_entry *)
> + page_address(vm_data->avic_log_ait_page);
> +
> + return &avic_log_ait[index];
> +}

Instead of these functions, create a complete function to handle APIC_ID
and APIC_LDR writes. Then use kmap/kunmap instead of page_address.

> +static inline void
> +avic_set_bk_page_entry(struct vcpu_svm *svm, int reg_off, u32 val)
> +{
> + void *avic_bk = page_address(svm->avic_bk_page);
> +
> + *((u32 *) (avic_bk + reg_off)) = val;
> +}

Unused function.

> +static inline u32 *avic_get_bk_page_entry(struct vcpu_svm *svm, u32 offset)
> +{
> + char *tmp = (char *)page_address(svm->avic_bk_page);
> +
> + return (u32 *)(tmp+offset);
> +}

I think you should be able to use kvm_apic_get_reg instead.

> +static int avic_init_bk_page(struct kvm_vcpu *vcpu)

No abbreviations (but again, please try reusing apic.regs).

> +static int avic_alloc_bk_page(struct vcpu_svm *svm, int id)
> +{
> + int ret = 0, i;
> + bool realloc = false;
> + struct kvm_vcpu *vcpu;
> + struct kvm *kvm = svm->vcpu.kvm;
> + struct svm_vm_data *vm_data = kvm->arch.arch_data;
> +
> + mutex_lock(&kvm->slots_lock);
> +
> + /* Check if we have already allocated vAPIC backing
> + * page for this vCPU. If not, we need to realloc
> + * a new one and re-assign all other vCPU.
> + */
> + if (kvm->arch.apic_access_page_done &&
> + (id > vm_data->avic_max_vcpu_id)) {
> + kvm_for_each_vcpu(i, vcpu, kvm)
> + avic_unalloc_bk_page(vcpu);
> +
> + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
> + 0, 0);
> + realloc = true;
> + vm_data->avic_max_vcpu_id = 0;
> + }
> +
> + /*
> + * We are allocating vAPIC backing page
> + * upto the max vCPU ID
> + */
> + if (id >= vm_data->avic_max_vcpu_id) {
> + ret = __x86_set_memory_region(kvm,
> + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
> + APIC_DEFAULT_PHYS_BASE,
> + PAGE_SIZE * (id + 1));

Why is this necessary? The APIC access page is a peculiarity of Intel
processors (and the special memslot for only needs to map 0xfee00000 to
0xfee00fff; after that there is the MSI area).

> + if (ret)
> + goto out;
> +
> + vm_data->avic_max_vcpu_id = id;
> + }
> +
> + /* Reinit vAPIC backing page for exisinting vcpus */
> + if (realloc)
> + kvm_for_each_vcpu(i, vcpu, kvm)
> + avic_init_bk_page(vcpu);

Why is this necessary?

> + avic_init_bk_page(&svm->vcpu);
> +
> + kvm->arch.apic_access_page_done = true;
> +
> +out:
> + mutex_unlock(&kvm->slots_lock);
> + return ret;
> +}
> +
> +static void avic_vm_uninit(struct kvm *kvm)
> +{
> + struct svm_vm_data *vm_data = kvm->arch.arch_data;
> +
> + if (!vm_data)
> + return;
> +
> + if (vm_data->avic_log_ait_page)
> + __free_page(vm_data->avic_log_ait_page);
> + if (vm_data->avic_phy_ait_page)
> + __free_page(vm_data->avic_phy_ait_page);
> + kfree(vm_data);
> + kvm->arch.arch_data = NULL;
> +}
> +
> +static void avic_vcpu_uninit(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
> +
> + if (!avic)
> + return;
> +
> + avic_unalloc_bk_page(vcpu);
> + svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs;
> + svm->in_kernel_lapic_regs = NULL;
> +
> + if (vm_data &&
> + (atomic_read(&vm_data->count) == 0 ||
> + atomic_dec_and_test(&vm_data->count)))
> + avic_vm_uninit(vcpu->kvm);

Add a new kvm_x86_ops callback .free_vcpus and call it from
kvm_free_vcpus; then count is not necessary.

We probably should use it for Intel too. The calls to
x86_set_memory_region in kvm_arch_destroy_vm are hideous.

> +}
> +
> +static atomic_t avic_tag_gen = ATOMIC_INIT(1);
> +
> +static inline u32 avic_get_next_tag(void)
> +{
> + u32 tag = atomic_read(&avic_tag_gen);
> +
> + atomic_inc(&avic_tag_gen);
> + return tag;
> +}
> +
> +static int avic_vm_init(struct kvm *kvm)
> +{
> + int err = -ENOMEM;
> + struct svm_vm_data *vm_data;
> + struct page *avic_phy_ait_page;
> + struct page *avic_log_ait_page;

This is probably also best moved to a new callback .init_vm (called from
kvm_arch_init_vm).

> + vm_data = kzalloc(sizeof(struct svm_vm_data),
> + GFP_KERNEL);
> + if (!vm_data)
> + return err;
> +
> + kvm->arch.arch_data = vm_data;
> + atomic_set(&vm_data->count, 0);
> +
> + /* Allocating physical APIC ID table (4KB) */
> + avic_phy_ait_page = alloc_page(GFP_KERNEL);
> + if (!avic_phy_ait_page)
> + goto free_avic;
> +
> + vm_data->avic_phy_ait_page = avic_phy_ait_page;
> + clear_page(page_address(avic_phy_ait_page));

You can use get_zeroed_page and free_page, instead of
alloc_page/clear_page/__free_page.

Thanks,

Paolo

> + /* Allocating logical APIC ID table (4KB) */
> + avic_log_ait_page = alloc_page(GFP_KERNEL);
> + if (!avic_log_ait_page)
> + goto free_avic;
> +
> + vm_data->avic_log_ait_page = avic_log_ait_page;
> + clear_page(page_address(avic_log_ait_page));
> +
> + vm_data->avic_tag = avic_get_next_tag();
> +
> + return 0;
> +
> +free_avic:
> + avic_vm_uninit(kvm);
> + return err;
> +}
> +
> +static int avic_vcpu_init(struct kvm *kvm, struct vcpu_svm *svm, int id)
> +{
> + int err;
> + struct svm_vm_data *vm_data = NULL;
> +
> + /* Note: svm_vm_data is per VM */
> + if (!kvm->arch.arch_data) {
> + err = avic_vm_init(kvm);
> + if (err)
> + return err;
> + }
> +
> + err = avic_alloc_bk_page(svm, id);
> + if (err) {
> + avic_vcpu_uninit(&svm->vcpu);
> + return err;
> + }
> +
> + vm_data = kvm->arch.arch_data;
> + atomic_inc(&vm_data->count);
> +
> + return 0;
> }
>
> static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> @@ -1131,6 +1510,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>
> kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
> kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
> +
> + if (avic && !init_event)
> + avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
> }
>
> static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
> @@ -1169,6 +1551,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
> if (!hsave_page)
> goto free_page3;
>
> + if (avic) {
> + err = avic_vcpu_init(kvm, svm, id);
> + if (err)
> + goto free_page4;
> + }
> +
> svm->nested.hsave = page_address(hsave_page);
>
> svm->msrpm = page_address(msrpm_pages);
> @@ -1187,6 +1575,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
>
> return &svm->vcpu;
>
> +free_page4:
> + __free_page(hsave_page);
> free_page3:
> __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
> free_page2:
> @@ -1209,6 +1599,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
> __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
> __free_page(virt_to_page(svm->nested.hsave));
> __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
> + avic_vcpu_uninit(vcpu);
> kvm_vcpu_uninit(vcpu);
> kmem_cache_free(kvm_vcpu_cache, svm);
> }
> @@ -3382,6 +3773,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
> pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
> pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
> pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
> + pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
> pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
> pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
> pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
> @@ -3613,11 +4005,31 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
>
> static bool svm_get_enable_apicv(void)
> {
> - return false;
> + return avic;
> +}
> +
> +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
> +{
> }
>
> +static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
> +{
> +}
> +
> +/* Note: Currently only used by Hyper-V. */
> static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
> {
> + struct vcpu_svm *svm = to_svm(vcpu);
> + struct vmcb *vmcb = svm->vmcb;
> +
> + if (!avic)
> + return;
> +
> + if (!svm->in_kernel_lapic_regs)
> + return;
> +
> + svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs;
> + vmcb->control.avic_enable = 0;
> }
>
> static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
> @@ -4387,6 +4799,8 @@ static struct kvm_x86_ops svm_x86_ops = {
> .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
> .load_eoi_exitmap = svm_load_eoi_exitmap,
> .sync_pir_to_irr = svm_sync_pir_to_irr,
> + .hwapic_irr_update = svm_hwapic_irr_update,
> + .hwapic_isr_update = svm_hwapic_isr_update,
>
> .set_tss_addr = svm_set_tss_addr,
> .get_tdp_level = get_npt_level,
>