Re: [RFC 40/55] KVM: arm/arm64: Handle vttbr_el2 write operation from the guest hypervisor

From: Christoffer Dall
Date: Wed Feb 22 2017 - 13:07:20 EST


On Mon, Jan 09, 2017 at 01:24:36AM -0500, Jintack Lim wrote:
> Each nested VM is supposed to have a mmu (i.e. shadow stage-2 page

to have a 'struct kvm_mmu' ?

> table), and we create it when the guest hypervisor writes to vttbr_el2
> with a new vmid.

I think the commit message should also mention that you maintain a list
of seen nested stage 2 translation contexts and associated shadow page
tables.

>
> In case the guest hypervisor writes to vttbr_el2 with existing vmid, we
> check if the base address is changed. If so, then what we have in the
> shadow page table is not valid any more. So ummap it.

unmap? We clear the entire shadow stage 2 page table, right?

>
> Signed-off-by: Jintack Lim <jintack@xxxxxxxxxxxxxxx>
> ---
> arch/arm/include/asm/kvm_host.h | 1 +
> arch/arm/kvm/arm.c | 1 +
> arch/arm64/include/asm/kvm_host.h | 1 +
> arch/arm64/include/asm/kvm_mmu.h | 6 ++++
> arch/arm64/kvm/mmu-nested.c | 71 +++++++++++++++++++++++++++++++++++++++
> arch/arm64/kvm/sys_regs.c | 15 ++++++++-
> 6 files changed, 94 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index fbde48d..ebf2810 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -84,6 +84,7 @@ struct kvm_arch {
>
> /* Never used on arm but added to be compatible with arm64 */
> struct list_head nested_mmu_list;
> + spinlock_t mmu_list_lock;
>
> /* Interrupt controller */
> struct vgic_dist vgic;
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 147df97..6fa5754 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -147,6 +147,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> kvm->arch.mmu.vmid.vmid_gen = 0;
> kvm->arch.mmu.el2_vmid.vmid_gen = 0;
> INIT_LIST_HEAD(&kvm->arch.nested_mmu_list);
> + spin_lock_init(&kvm->arch.mmu_list_lock);
>
> /* The maximum number of VCPUs is limited by the host's GIC model */
> kvm->arch.max_vcpus = vgic_present ?
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 23e2267..52eea76 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -99,6 +99,7 @@ struct kvm_arch {
>
> /* Stage 2 shadow paging contexts for nested L2 VM */
> struct list_head nested_mmu_list;
> + spinlock_t mmu_list_lock;

I'm wondering if we really need the separate spin lock or if we could
just grab the KVM mutex?

> };
>
> #define KVM_NR_MEM_OBJS 40
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index d1ef650..fdc9327 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -327,6 +327,7 @@ static inline unsigned int kvm_get_vmid_bits(void)
> #ifdef CONFIG_KVM_ARM_NESTED_HYP
> struct kvm_nested_s2_mmu *get_nested_mmu(struct kvm_vcpu *vcpu, u64 vttbr);
> struct kvm_s2_mmu *vcpu_get_active_s2_mmu(struct kvm_vcpu *vcpu);
> +bool handle_vttbr_update(struct kvm_vcpu *vcpu, u64 vttbr);
> #else
> static inline struct kvm_nested_s2_mmu *get_nested_mmu(struct kvm_vcpu *vcpu,
> u64 vttbr)
> @@ -337,6 +338,11 @@ static inline struct kvm_s2_mmu *vcpu_get_active_s2_mmu(struct kvm_vcpu *vcpu)
> {
> return &vcpu->kvm->arch.mmu;
> }
> +
> +static inline bool handle_vttbr_update(struct kvm_vcpu *vcpu, u64 vttbr)
> +{
> + return false;
> +}
> #endif
>
> static inline u64 kvm_get_vttbr(struct kvm_s2_vmid *vmid,
> diff --git a/arch/arm64/kvm/mmu-nested.c b/arch/arm64/kvm/mmu-nested.c
> index d52078f..0811d94 100644
> --- a/arch/arm64/kvm/mmu-nested.c
> +++ b/arch/arm64/kvm/mmu-nested.c
> @@ -53,3 +53,74 @@ struct kvm_s2_mmu *vcpu_get_active_s2_mmu(struct kvm_vcpu *vcpu)
>
> return &nested_mmu->mmu;
> }
> +
> +static struct kvm_nested_s2_mmu *create_nested_mmu(struct kvm_vcpu *vcpu,
> + u64 vttbr)
> +{
> + struct kvm_nested_s2_mmu *nested_mmu, *tmp_mmu;
> + struct list_head *nested_mmu_list = &vcpu->kvm->arch.nested_mmu_list;
> + bool need_free = false;
> + int ret;
> +
> + nested_mmu = kzalloc(sizeof(struct kvm_nested_s2_mmu), GFP_KERNEL);
> + if (!nested_mmu)
> + return NULL;
> +
> + ret = __kvm_alloc_stage2_pgd(&nested_mmu->mmu);
> + if (ret) {
> + kfree(nested_mmu);
> + return NULL;
> + }
> +
> + spin_lock(&vcpu->kvm->arch.mmu_list_lock);
> + tmp_mmu = get_nested_mmu(vcpu, vttbr);
> + if (!tmp_mmu)
> + list_add_rcu(&nested_mmu->list, nested_mmu_list);
> + else /* Somebody already created and put a new nested_mmu to the list */
> + need_free = true;
> + spin_unlock(&vcpu->kvm->arch.mmu_list_lock);
> +
> + if (need_free) {
> + __kvm_free_stage2_pgd(&nested_mmu->mmu);
> + kfree(nested_mmu);
> + nested_mmu = tmp_mmu;
> + }
> +
> + return nested_mmu;
> +}
> +
> +static void kvm_nested_s2_unmap(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_nested_s2_mmu *nested_mmu;
> + struct list_head *nested_mmu_list = &vcpu->kvm->arch.nested_mmu_list;
> +
> + list_for_each_entry_rcu(nested_mmu, nested_mmu_list, list)
> + kvm_unmap_stage2_range(&nested_mmu->mmu, 0, KVM_PHYS_SIZE);
> +}
> +
> +bool handle_vttbr_update(struct kvm_vcpu *vcpu, u64 vttbr)
> +{
> + struct kvm_nested_s2_mmu *nested_mmu;
> +
> + /* See if we can relax this */

huh?

> + if (!vttbr)

why is this a special case?

Theoretically an IPA of zero and VMID zero could be a valid page table
base pointer, right?

I'm gussing because the guest hypervisor occasionally writes zero into
VTTBR_EL2, for example when not using stage 2 translation, so perhaps
what you need to do is to defer creating a new nested mmu structure
until you actually enter the VM with stage 2 paging enabled?

> + return true;
> +
> + nested_mmu = (struct kvm_nested_s2_mmu *)get_nested_mmu(vcpu, vttbr);
> + if (!nested_mmu) {
> + nested_mmu = create_nested_mmu(vcpu, vttbr);
> + if (!nested_mmu)
> + return false;

I'm wondering if this can be simplified by having get_nested_mmu lookup
and allocate the struct and renaming the get_nested_mmu funtion to
lookup_nested_mmu? This caller looks racy, even though it isn't, which
would be improved by my suggestion as well.

> + } else {
> + /*
> + * unmap the shadow page table if vttbr_el2 is

While the function is called unmap, what we really do is
clearing/flushing the shadow stage 2 page table.

> + * changed to different value
> + */
> + if (vttbr != nested_mmu->virtual_vttbr)
> + kvm_nested_s2_unmap(vcpu);
> + }
> +
> + nested_mmu->virtual_vttbr = vttbr;
> +
> + return true;
> +}
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index e66f40d..ddb641c 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -960,6 +960,19 @@ static bool access_cpacr(struct kvm_vcpu *vcpu,
> return true;
> }
>
> +static bool access_vttbr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> + const struct sys_reg_desc *r)
> +{
> + u64 vttbr = p->regval;
> +
> + if (!p->is_write) {
> + p->regval = vcpu_el2_reg(vcpu, r->reg);
> + return true;
> + }
> +
> + return handle_vttbr_update(vcpu, vttbr);
> +}
> +
> static bool trap_el2_reg(struct kvm_vcpu *vcpu,
> struct sys_reg_params *p,
> const struct sys_reg_desc *r)
> @@ -1306,7 +1319,7 @@ static bool trap_el2_reg(struct kvm_vcpu *vcpu,
> trap_el2_reg, reset_el2_val, TCR_EL2, 0 },
> /* VTTBR_EL2 */
> { Op0(0b11), Op1(0b100), CRn(0b0010), CRm(0b0001), Op2(0b000),
> - trap_el2_reg, reset_el2_val, VTTBR_EL2, 0 },
> + access_vttbr, reset_el2_val, VTTBR_EL2, 0 },
> /* VTCR_EL2 */
> { Op0(0b11), Op1(0b100), CRn(0b0010), CRm(0b0001), Op2(0b010),
> trap_el2_reg, reset_el2_val, VTCR_EL2, 0 },
> --
> 1.9.1
>
>

Thanks,
-Christoffer