On Thu, Aug 01, 2019 at 04:06:13AM -0400, Jason Wang wrote:
On 2019/8/1 äå2:29, Michael S. Tsirkin wrote:If we need an mb in the begin() we can switch to
On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:I admit the patch is tricky. Some questions:
We used to use RCU to synchronize MMU notifier with worker. This leadsUnfortunately this needs a lot of review and testing, so this can't make
calling synchronize_rcu() in invalidate_range_start(). But on a busy
system, there would be many factors that may slow down the
synchronize_rcu() which makes it unsuitable to be called in MMU
notifier.
A solution is SRCU but its overhead is obvious with the expensive full
memory barrier. Another choice is to use seqlock, but it doesn't
provide a synchronization method between readers and writers. The last
choice is to use vq mutex, but it need to deal with the worst case
that MMU notifier must be blocked and wait for the finish of swap in.
So this patch switches use a counter to track whether or not the map
was used. The counter was increased when vq try to start or finish
uses the map. This means, when it was even, we're sure there's no
readers and MMU notifier is synchronized. When it was odd, it means
there's a reader we need to wait it to be even again then we are
synchronized. To avoid full memory barrier, store_release +
load_acquire on the counter is used.
rc2, and I don't think this is the kind of patch I can merge after rc3.
Subtle memory barrier tricks like this can introduce new bugs while they
are fixing old ones.
- Do we must address the case of e.g swap in? If not, a simple
vhost_work_flush() instead of synchronize_rcu() may work.
- Having some hard thought, I think we can use seqlock, it looks
to me smp_wmb() is in write_segcount_begin() is sufficient, we don't
care vq->map read before smp_wmb(), and for the other we all have
good data devendency so smp_wmb() in the write_seqbegin_end() is
sufficient.
dependent_ptr_mb. if you need me to fix it up
and repost, let me know.
Why isn't it a problem if the map is
accessed outside the lock?
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index db2c81cb1e90..6d9501303258 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -363,39 +363,29 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
{
- int ref = READ_ONCE(vq->ref);
-
- smp_store_release(&vq->ref, ref + 1);
- /* Make sure ref counter is visible before accessing the map */
- smp_load_acquire(&vq->ref);
+ write_seqcount_begin(&vq->seq);
}
static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
{
- int ref = READ_ONCE(vq->ref);
-
- /* Make sure vq access is done before increasing ref counter */
- smp_store_release(&vq->ref, ref + 1);
+ write_seqcount_end(&vq->seq);
}
static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
{
- int ref;
+ unsigned int ret;
/* Make sure map change was done before checking ref counter */
smp_mb();
-
- ref = READ_ONCE(vq->ref);
- if (ref & 0x1) {
- /* When ref change, we are sure no reader can see
+ ret = raw_read_seqcount(&vq->seq);
+ if (ret & 0x1) {
+ /* When seq changes, we are sure no reader can see
* previous map */
- while (READ_ONCE(vq->ref) == ref) {
- set_current_state(TASK_RUNNING);
+ while (raw_read_seqcount(&vq->seq) == ret)
schedule();
So why do we set state here?
And should not we
check need_sched?
- }OK is the feature still a performance win? or should we drop it for now?
}
- /* Make sure ref counter was checked before any other
- * operations that was dene on map. */
+ /* Make sure seq was checked before any other operations that
+ * was dene on map. */
smp_mb();
}
@@ -691,7 +681,7 @@ void vhost_dev_init(struct vhost_dev *dev,
vq->indirect = NULL;
vq->heads = NULL;
vq->dev = dev;
- vq->ref = 0;
+ seqcount_init(&vq->seq);
mutex_init(&vq->mutex);
spin_lock_init(&vq->mmu_lock);
vhost_vq_reset(dev, vq);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 3d10da0ae511..1a705e181a84 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -125,7 +125,7 @@ struct vhost_virtqueue {
*/
struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
#endif
- int ref;
+ seqcount_t seq;
const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
struct file *kick;
--
2.18.1
Compare to without this patch.
Consider the read critical section is pretty small the synchronizationSorry what do you mean by this last sentence? This degrades performance
should be done very fast.
Note the patch lead about 3% PPS dropping.
compared to what?
I don't think it's correct since by this logicYes.Reported-by: Michael S. Tsirkin <mst@xxxxxxxxxx>The map access is after this sequence, correct?
Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
---
drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++----------------
drivers/vhost/vhost.h | 7 +-
2 files changed, 94 insertions(+), 58 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index cfc11f9ed9c9..db2c81cb1e90 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
spin_lock(&vq->mmu_lock);
for (i = 0; i < VHOST_NUM_ADDRS; i++) {
- map[i] = rcu_dereference_protected(vq->maps[i],
- lockdep_is_held(&vq->mmu_lock));
+ map[i] = vq->maps[i];
if (map[i]) {
vhost_set_map_dirty(vq, map[i], i);
- rcu_assign_pointer(vq->maps[i], NULL);
+ vq->maps[i] = NULL;
}
}
spin_unlock(&vq->mmu_lock);
- /* No need for synchronize_rcu() or kfree_rcu() since we are
- * serialized with memory accessors (e.g vq mutex held).
+ /* No need for synchronization since we are serialized with
+ * memory accessors (e.g vq mutex held).
*/
for (i = 0; i < VHOST_NUM_ADDRS; i++)
@@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
}
+static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
+{
+ int ref = READ_ONCE(vq->ref);
+
+ smp_store_release(&vq->ref, ref + 1);
+ /* Make sure ref counter is visible before accessing the map */
+ smp_load_acquire(&vq->ref);
Just going by the rules in Documentation/memory-barriers.txt,Yes, but it's not clear about the order of ACQUIRE the same location
I think that this pair will not order following accesses with ref store.
Documentation/memory-barriers.txt says:
+ In addition, a RELEASE+ACQUIRE
+ pair is -not- guaranteed to act as a full memory barrier.
The guarantee that is made is this:
after
an ACQUIRE on a given variable, all memory accesses preceding any prior
RELEASE on that same variable are guaranteed to be visible.
of previous RELEASE. And it only has a example like:
"
*A = a;
RELEASE M
ACQUIRE N
*B = b;
could occur as:
ACQUIRE N, STORE *B, STORE *A, RELEASE M
"
But it doesn't explain what happen when
*A = a
RELEASE M
ACQUIRE M
*B = b;
And tools/memory-model/Documentation said
"
First, when a lock-acquire reads from a lock-release, the LKMM
requires that every instruction po-before the lock-release must
execute before any instruction po-after the lock-acquire.
"
Is this a hint that I was correct?
memory barriers can be nops on x86.
That kinds of violates the RELEASE?
And if we also had the reverse rule we'd end up with a full barrier,
won't we?
Cc Paul in case I missed something here. And if I'm right,
maybe we should call this out, adding
"The opposite is not true: a prior RELEASE is not
guaranteed to be visible before memory accesses following
the subsequent ACQUIRE".
"
This also acts as a one-way permeable barrier. It guarantees that all
memory operations before the RELEASE operation will appear to happen
before the RELEASE operation with respect to the other components of the
"
yes but we are talking about RELEASE itself versus stuff
that comes after it.
any answers here?Ok.
+}Please document the even/odd trick here too, not just in the commit log.
+
+static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
+{
+ int ref = READ_ONCE(vq->ref);
+
+ /* Make sure vq access is done before increasing ref counter */
+ smp_store_release(&vq->ref, ref + 1);
+}
+
+static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
+{
+ int ref;
+
+ /* Make sure map change was done before checking ref counter */
+ smp_mb();
+
+ ref = READ_ONCE(vq->ref);
+ if (ref & 0x1) {
+ /* When ref change,changes
we are sure no reader can see
+ * previous map */
+ while (READ_ONCE(vq->ref) == ref) {
what is the below line in aid of?
+ set_current_state(TASK_RUNNING);
But we will spin for a very long time in this case.Yes, better.+ schedule();if (need_resched())
schedule();
?
I'm not sure I get, if the odd is not the original value we read,+ }On an interruptible kernel, there's a risk here is that
a task got preempted with an odd ref.
So I suspect we'll have to disable preemption when we
make ref odd.
we're sure it won't read the new map here I believe.
Yes.
+ }was dene -> were done?
+ /* Make sure ref counter was checked before any other
+ * operations that was dene on map. */
Not a must, let me fix.+ smp_mb();Is it important that this is signed? If not I'd do unsigned here:
+}
+
static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
int index,
unsigned long start,
@@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
spin_lock(&vq->mmu_lock);
++vq->invalidate_count;
- map = rcu_dereference_protected(vq->maps[index],
- lockdep_is_held(&vq->mmu_lock));
+ map = vq->maps[index];
if (map) {
vhost_set_map_dirty(vq, map, index);
- rcu_assign_pointer(vq->maps[index], NULL);
+ vq->maps[index] = NULL;
}
spin_unlock(&vq->mmu_lock);
if (map) {
- synchronize_rcu();
+ vhost_vq_sync_access(vq);
vhost_map_unprefetch(map);
}
}
@@ -457,7 +493,7 @@ static void vhost_init_maps(struct vhost_dev *dev)
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
for (j = 0; j < VHOST_NUM_ADDRS; j++)
- RCU_INIT_POINTER(vq->maps[j], NULL);
+ vq->maps[j] = NULL;
}
}
#endif
@@ -655,6 +691,7 @@ void vhost_dev_init(struct vhost_dev *dev,
vq->indirect = NULL;
vq->heads = NULL;
vq->dev = dev;
+ vq->ref = 0;
mutex_init(&vq->mutex);
spin_lock_init(&vq->mmu_lock);
vhost_vq_reset(dev, vq);
@@ -921,7 +958,7 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq,
map->npages = npages;
map->pages = pages;
- rcu_assign_pointer(vq->maps[index], map);
+ vq->maps[index] = map;
/* No need for a synchronize_rcu(). This function should be
* called by dev->worker so we are serialized with all
* readers.
@@ -1216,18 +1253,18 @@ static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
struct vring_used *used;
if (!vq->iotlb) {
- rcu_read_lock();
+ vhost_vq_access_map_begin(vq);
- map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+ map = vq->maps[VHOST_ADDR_USED];
if (likely(map)) {
used = map->addr;
*((__virtio16 *)&used->ring[vq->num]) =
cpu_to_vhost16(vq, vq->avail_idx);
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
return 0;
}
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
}
#endif
@@ -1245,18 +1282,18 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
size_t size;
if (!vq->iotlb) {
- rcu_read_lock();
+ vhost_vq_access_map_begin(vq);
- map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+ map = vq->maps[VHOST_ADDR_USED];
if (likely(map)) {
used = map->addr;
size = count * sizeof(*head);
memcpy(used->ring + idx, head, size);
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
return 0;
}
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
}
#endif
@@ -1272,17 +1309,17 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
struct vring_used *used;
if (!vq->iotlb) {
- rcu_read_lock();
+ vhost_vq_access_map_begin(vq);
- map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+ map = vq->maps[VHOST_ADDR_USED];
if (likely(map)) {
used = map->addr;
used->flags = cpu_to_vhost16(vq, vq->used_flags);
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
return 0;
}
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
}
#endif
@@ -1298,17 +1335,17 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
struct vring_used *used;
if (!vq->iotlb) {
- rcu_read_lock();
+ vhost_vq_access_map_begin(vq);
- map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+ map = vq->maps[VHOST_ADDR_USED];
if (likely(map)) {
used = map->addr;
used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
return 0;
}
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
}
#endif
@@ -1362,17 +1399,17 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
struct vring_avail *avail;
if (!vq->iotlb) {
- rcu_read_lock();
+ vhost_vq_access_map_begin(vq);
- map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+ map = vq->maps[VHOST_ADDR_AVAIL];
if (likely(map)) {
avail = map->addr;
*idx = avail->idx;
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
return 0;
}
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
}
#endif
@@ -1387,17 +1424,17 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
struct vring_avail *avail;
if (!vq->iotlb) {
- rcu_read_lock();
+ vhost_vq_access_map_begin(vq);
- map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+ map = vq->maps[VHOST_ADDR_AVAIL];
if (likely(map)) {
avail = map->addr;
*head = avail->ring[idx & (vq->num - 1)];
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
return 0;
}
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
}
#endif
@@ -1413,17 +1450,17 @@ static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
struct vring_avail *avail;
if (!vq->iotlb) {
- rcu_read_lock();
+ vhost_vq_access_map_begin(vq);
- map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+ map = vq->maps[VHOST_ADDR_AVAIL];
if (likely(map)) {
avail = map->addr;
*flags = avail->flags;
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
return 0;
}
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
}
#endif
@@ -1438,15 +1475,15 @@ static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
struct vring_avail *avail;
if (!vq->iotlb) {
- rcu_read_lock();
- map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+ vhost_vq_access_map_begin(vq);
+ map = vq->maps[VHOST_ADDR_AVAIL];
if (likely(map)) {
avail = map->addr;
*event = (__virtio16)avail->ring[vq->num];
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
return 0;
}
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
}
#endif
@@ -1461,17 +1498,17 @@ static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
struct vring_used *used;
if (!vq->iotlb) {
- rcu_read_lock();
+ vhost_vq_access_map_begin(vq);
- map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+ map = vq->maps[VHOST_ADDR_USED];
if (likely(map)) {
used = map->addr;
*idx = used->idx;
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
return 0;
}
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
}
#endif
@@ -1486,17 +1523,17 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq,
struct vring_desc *d;
if (!vq->iotlb) {
- rcu_read_lock();
+ vhost_vq_access_map_begin(vq);
- map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
+ map = vq->maps[VHOST_ADDR_DESC];
if (likely(map)) {
d = map->addr;
*desc = *(d + idx);
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
return 0;
}
- rcu_read_unlock();
+ vhost_vq_access_map_end(vq);
}
#endif
@@ -1843,13 +1880,11 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
#if VHOST_ARCH_CAN_ACCEL_UACCESS
static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
{
- struct vhost_map __rcu *map;
+ struct vhost_map *map;
int i;
for (i = 0; i < VHOST_NUM_ADDRS; i++) {
- rcu_read_lock();
- map = rcu_dereference(vq->maps[i]);
- rcu_read_unlock();
+ map = vq->maps[i];
if (unlikely(!map))
vhost_map_prefetch(vq, i);
}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a9a2a93857d2..f9e9558a529d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -115,16 +115,17 @@ struct vhost_virtqueue {
#if VHOST_ARCH_CAN_ACCEL_UACCESS
/* Read by memory accessors, modified by meta data
* prefetching, MMU notifier and vring ioctl().
- * Synchonrized through mmu_lock (writers) and RCU (writers
- * and readers).
+ * Synchonrized through mmu_lock (writers) and ref counters,
+ * see vhost_vq_access_map_begin()/vhost_vq_access_map_end().
*/
- struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
+ struct vhost_map *maps[VHOST_NUM_ADDRS];
/* Read by MMU notifier, modified by vring ioctl(),
* synchronized through MMU notifier
* registering/unregistering.
*/
struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
#endif
+ int ref;
even though kernel does compile with 2s complement sign overflow,
it seems cleaner not to depend on that.
Thanks
const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
struct file *kick;
--
2.18.1