[PATCH RFC 2/2] KVM: RCU protected dynamic vcpus array
From: Radim Krčmář
Date: Wed Aug 16 2017 - 15:41:17 EST
This is a prototype with many TODO comments to give a better idea of
what would be needed.
The main missing piece is a rework of every kvm_for_each_vcpu() into a less
inefficient loop, but because RCU readers cannot block, the rewrite cannot be
scripted. Is there a more suitable protection scheme?
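To make the trade-off concrete, here is a rough sketch (not part of the patch)
of a converted caller using the macros added below: the plain
kvm_for_each_vcpu() enters and leaves an RCU read-side critical section on
every iteration, which is what makes it inefficient, while the _rcu variant
keeps the whole walk inside one critical section at the price of a
non-blocking loop body (the request shown is just an arbitrary non-sleeping
example):

	struct kvm_vcpus *vcpus;
	struct kvm_vcpu *vcpu;
	int i;

	rcu_read_lock();
	kvm_for_each_vcpu_rcu(i, vcpu, vcpus, kvm)
		kvm_make_request(KVM_REQ_UNHALT, vcpu); /* must not sleep here */
	rcu_read_unlock();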
I didn't test it much ... I am still leaning towards the internally
simpler version, (1), even if it requires userspace changes.
---
arch/mips/kvm/mips.c | 8 +++---
arch/powerpc/kvm/powerpc.c | 6 +++--
arch/s390/kvm/kvm-s390.c | 27 ++++++++++++++------
arch/x86/kvm/hyperv.c | 3 +--
arch/x86/kvm/vmx.c | 3 ++-
arch/x86/kvm/x86.c | 5 ++--
include/linux/kvm_host.h | 61 ++++++++++++++++++++++++++++++++++------------
virt/kvm/arm/arm.c | 10 +++-----
virt/kvm/kvm_main.c | 58 +++++++++++++++++++++++++++++++++++--------
9 files changed, 132 insertions(+), 49 deletions(-)
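For reviewers skimming the diffstat: the update side in
kvm_vm_ioctl_create_vcpu() boils down to the copy-and-publish sequence below
(condensed from the kvm_main.c hunk, with the unlock/error paths trimmed).
The array is only ever grown under kvm->lock, and the old copy is freed after
a grace period:

	old = rcu_dereference_protected(kvm->vcpus,
					lockdep_is_held(&kvm->lock));
	new = kvzalloc(sizeof(*new) + (old->online + 1) * sizeof(*new->array),
		       GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->online = old->online + 1;
	memcpy(new->array, old->array, old->online * sizeof(*old->array));
	new->array[old->online] = vcpu;
	rcu_assign_pointer(kvm->vcpus, new);	/* readers switch to the new array */
	mutex_unlock(&kvm->lock);
	synchronize_rcu();			/* wait out readers of the old array */
	kfree(old);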
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index bce2a6431430..4c9d383babe7 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -164,6 +164,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
{
unsigned int i;
struct kvm_vcpu *vcpu;
+ struct kvm_vcpus *vcpus;
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_arch_vcpu_free(vcpu);
@@ -171,8 +172,9 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
mutex_lock(&kvm->lock);
- for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
- kvm->vcpus[i] = NULL;
+ // TODO: resetting online vcpus shouldn't be needed
+ vcpus = rcu_dereference_protected(kvm->vcpus, lockdep_is_held(&kvm->lock));
+ vcpus->online = 0;
atomic_set(&kvm->online_vcpus, 0);
@@ -499,7 +501,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
if (irq->cpu == -1)
dvcpu = vcpu;
else
- dvcpu = vcpu->kvm->vcpus[irq->cpu];
+ dvcpu = kvm_get_vcpu(vcpu->kvm, irq->cpu);
if (intr == 2 || intr == 3 || intr == 4) {
kvm_mips_callbacks->queue_io_int(dvcpu, irq);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 3480faaf1ef8..9d1a16fd629f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -460,6 +460,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
{
unsigned int i;
struct kvm_vcpu *vcpu;
+ struct kvm_vcpus *vcpus;
#ifdef CONFIG_KVM_XICS
/*
@@ -475,8 +476,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_arch_vcpu_free(vcpu);
mutex_lock(&kvm->lock);
- for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
- kvm->vcpus[i] = NULL;
+
+ vcpus = rcu_dereference_protected(kvm->vcpus, lockdep_is_held(&kvm->lock));
+ vcpus->online = 0;
atomic_set(&kvm->online_vcpus, 0);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 9f23a9e81a91..dd8592c67ef4 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1945,15 +1945,16 @@ static void kvm_free_vcpus(struct kvm *kvm)
{
unsigned int i;
struct kvm_vcpu *vcpu;
+ struct kvm_vcpus *vcpus;
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_destroy(vcpu);
mutex_lock(&kvm->lock);
- for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
- kvm->vcpus[i] = NULL;
- atomic_set(&kvm->online_vcpus, 0);
+ vcpus = rcu_dereference_protected(kvm->vcpus, lockdep_is_held(&kvm->lock));
+ vcpus->online = 0;
+
mutex_unlock(&kvm->lock);
}
@@ -3415,6 +3416,7 @@ static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
{
int i, online_vcpus, started_vcpus = 0;
+ struct kvm_vcpus *vcpus;
if (!is_vcpu_stopped(vcpu))
return;
@@ -3422,12 +3424,16 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1);
/* Only one cpu at a time may enter/leave the STOPPED state. */
spin_lock(&vcpu->kvm->arch.start_stop_lock);
- online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
- for (i = 0; i < online_vcpus; i++) {
- if (!is_vcpu_stopped(vcpu->kvm->vcpus[i]))
+ rcu_read_lock();
+ vcpus = rcu_dereference(vcpu->kvm->vcpus);
+ // TODO: this pattern is kvm_for_each_vcpu
+ for (i = 0; i < vcpus->online; i++) {
+ if (!is_vcpu_stopped(vcpus->array[i]))
started_vcpus++;
+ // TODO: if (started_vcpus > 1) break;
}
+ rcu_read_unlock();
if (started_vcpus == 0) {
/* we're the only active VCPU -> speed it up */
@@ -3455,6 +3461,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
{
int i, online_vcpus, started_vcpus = 0;
struct kvm_vcpu *started_vcpu = NULL;
+ struct kvm_vcpus *vcpus;
if (is_vcpu_stopped(vcpu))
return;
@@ -3470,12 +3477,16 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
atomic_or(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
__disable_ibs_on_vcpu(vcpu);
+ rcu_read_lock();
+ vcpus = rcu_dereference(vcpu->kvm->vcpus);
+ // TODO: use kvm_for_each_vcpu
for (i = 0; i < online_vcpus; i++) {
- if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) {
+ if (!is_vcpu_stopped(vcpus->array[i])) {
started_vcpus++;
- started_vcpu = vcpu->kvm->vcpus[i];
+ started_vcpu = vcpus->array[i];
}
}
+ rcu_read_unlock();
if (started_vcpus == 1) {
/*
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index dc97f2544b6f..bf4037344729 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -111,8 +111,7 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
struct kvm_vcpu *vcpu = NULL;
int i;
- if (vpidx < KVM_MAX_VCPUS)
- vcpu = kvm_get_vcpu(kvm, vpidx);
+ vcpu = kvm_get_vcpu(kvm, vpidx);
if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
return vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index df8d2f127508..1f492a1b64e9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11742,7 +11742,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
if (!kvm_arch_has_assigned_device(kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ // TODO: make apicv state accessible directly from struct kvm
+ !kvm_vcpu_apicv_active(kvm_get_vcpu(kvm, 0)))
return 0;
idx = srcu_read_lock(&kvm->irq_srcu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e10eda86bc7b..5d8af3e4eab1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8081,6 +8081,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
{
unsigned int i;
struct kvm_vcpu *vcpu;
+ struct kvm_vcpus *vcpus;
/*
* Unpin any mmu pages first.
@@ -8093,8 +8094,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
kvm_arch_vcpu_free(vcpu);
mutex_lock(&kvm->lock);
- for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
- kvm->vcpus[i] = NULL;
+ vcpus = rcu_dereference_protected(kvm->vcpus, lockdep_is_held(&kvm->lock));
+ vcpus->online = 0;
atomic_set(&kvm->online_vcpus, 0);
mutex_unlock(&kvm->lock);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c8df733eed41..eb9fb5b493ac 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -386,12 +386,17 @@ struct kvm_memslots {
int used_slots;
};
+struct kvm_vcpus {
+ u32 online;
+ struct kvm_vcpu *array[];
+};
+
struct kvm {
spinlock_t mmu_lock;
struct mutex slots_lock;
struct mm_struct *mm; /* userspace tied to this vm */
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
- struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+ struct kvm_vcpus *vcpus;
/*
* created_vcpus is protected by kvm->lock, and is incremented
@@ -483,45 +488,71 @@ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx)
static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
{
- /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu, in case
- * the caller has read kvm->online_vcpus before (as is the case
- * for kvm_for_each_vcpu, for example).
- */
- smp_rmb();
- return kvm->vcpus[i];
+ struct kvm_vcpu *r = NULL;
+ struct kvm_vcpus *vcpus;
+
+ rcu_read_lock();
+ vcpus = rcu_dereference(kvm->vcpus);
+ if (i < vcpus->online)
+ r = vcpus->array[i];
+ // TODO: check for bounds & return NULL in that case?
+ rcu_read_unlock();
+
+ return r;
}
+// This is unacceptably inefficient implementation, but rcu critical section
+// imposes limitations on preemption and we'll have to check all users before
+// converting them.
#define kvm_for_each_vcpu(idx, vcpup, kvm) \
for (idx = 0; \
- idx < atomic_read(&kvm->online_vcpus) && \
- (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \
+ ({struct kvm_vcpus *__vcpus; \
+ rcu_read_lock(); \
+ __vcpus = rcu_dereference(kvm->vcpus); \
+ vcpup = idx < __vcpus->online ? __vcpus->array[idx] : NULL; \
+ rcu_read_unlock(); \
+ vcpup;}); \
idx++)
+#define kvm_for_each_vcpu_rcu(idx, vcpup, vcpus, kvm) \
+ for (vcpus = rcu_dereference(kvm->vcpus), idx = 0; \
+ idx < vcpus->online && (vcpup = vcpus->array[idx]); \
+ idx++) \
+
static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
{
struct kvm_vcpu *vcpu = NULL;
+ struct kvm_vcpus *vcpus;
int i;
if (id < 0)
return NULL;
- if (id < KVM_MAX_VCPUS)
- vcpu = kvm_get_vcpu(kvm, id);
+ vcpu = kvm_get_vcpu(kvm, id);
if (vcpu && vcpu->vcpu_id == id)
return vcpu;
- kvm_for_each_vcpu(i, vcpu, kvm)
- if (vcpu->vcpu_id == id)
+ rcu_read_lock();
+ kvm_for_each_vcpu_rcu(i, vcpu, vcpus, kvm)
+ if (vcpu->vcpu_id == id) {
+ rcu_read_unlock();
return vcpu;
+ }
+ rcu_read_unlock();
return NULL;
}
static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *tmp;
+ struct kvm_vcpus *vcpus;
int idx;
- kvm_for_each_vcpu(idx, tmp, vcpu->kvm)
- if (tmp == vcpu)
+ rcu_read_lock();
+ kvm_for_each_vcpu_rcu(idx, tmp, vcpus, vcpu->kvm)
+ if (tmp == vcpu) {
+ rcu_read_unlock();
return idx;
+ }
+ rcu_read_unlock();
BUG();
}
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index b9f68e4add71..f0774a08083d 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -174,16 +174,14 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
int i;
+ struct kvm_vcpu *vcpu;
free_percpu(kvm->arch.last_vcpu_ran);
kvm->arch.last_vcpu_ran = NULL;
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_arch_vcpu_free(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
- }
- }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+ // other arches zeroed online here, for no apparent reason :)
kvm_vgic_destroy(kvm);
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2eac2c62795f..578285ff74a4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -640,13 +640,31 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
return 0;
}
+// TODO: preallocate some VCPUs
static inline struct kvm *kvm_alloc_vm(void)
{
- return kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ struct kvm *kvm;
+ struct kvm_vcpus *vcpus;
+
+ kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ if (!kvm)
+ return NULL;
+
+ vcpus = kvzalloc(sizeof(*vcpus), GFP_KERNEL);
+ if (!vcpus) {
+ kfree(kvm);
+ return NULL;
+ }
+ vcpus->online = 0;
+ rcu_assign_pointer(kvm->vcpus, vcpus);
+
+ return kvm;
}
static inline void kvm_free_vm(struct kvm *kvm)
{
+ if (kvm)
+ kvfree(rcu_dereference_protected(kvm->vcpus, 1));
kfree(kvm);
}
@@ -2473,6 +2491,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
int r;
struct kvm_vcpu *vcpu;
+ struct kvm_vcpus *old, *new = NULL;
if (id >= KVM_MAX_VCPU_ID)
return -EINVAL;
@@ -2508,7 +2527,22 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
goto unlock_vcpu_destroy;
}
- BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+ old = rcu_dereference_protected(kvm->vcpus, lockdep_is_held(&kvm->lock));
+
+ // TODO: make it a function for it that returns new/NULL
+ {
+ // OPTIMIZE: do not allocate every time. Requires atomic online
+ // counter.
+ u32 size = old->online + 1;
+
+ new = kvzalloc(sizeof(*new) + size * sizeof(*new->array), GFP_KERNEL);
+ if (!new) {
+ r = -ENOMEM;
+ goto unlock_vcpu_destroy;
+ }
+ new->online = size;
+ memcpy(new->array, old->array, old->online * sizeof(*old->array));
+ }
/* Now it's all set up, let userspace reach it */
kvm_get_kvm(kvm);
@@ -2518,20 +2552,24 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
goto unlock_vcpu_destroy;
}
- kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
-
- /*
- * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus
- * before kvm->online_vcpu's incremented value.
- */
- smp_wmb();
- atomic_inc(&kvm->online_vcpus);
+ new->array[old->online] = vcpu;
+ rcu_assign_pointer(kvm->vcpus, new);
mutex_unlock(&kvm->lock);
+
+ // we could schedule a callback instead
+ synchronize_rcu();
+ kfree(old);
+
+ // TODO: No longer synchronizes anything in the common code.
+ // Remove if the arch-specific uses were mostly hacks.
+ atomic_inc(&kvm->online_vcpus);
+
kvm_arch_vcpu_postcreate(vcpu);
return r;
unlock_vcpu_destroy:
+ kvfree(new);
mutex_unlock(&kvm->lock);
debugfs_remove_recursive(vcpu->debugfs_dentry);
vcpu_destroy:
--
2.13.3