Re: [PATCH] virtio_ring: Fix the stale index in available ring
From: Gavin Shan
Date: Sun Mar 17 2024 - 19:42:06 EST
On 3/18/24 02:50, Michael S. Tsirkin wrote:
On Fri, Mar 15, 2024 at 09:24:36PM +1000, Gavin Shan wrote:
On 3/15/24 21:05, Michael S. Tsirkin wrote:
On Fri, Mar 15, 2024 at 08:45:10PM +1000, Gavin Shan wrote:
Yes, I guess smp_wmb() ('dmb') is buggy on NVidia's grace-hopper platform. I tried
to reproduce it with my own driver where one thread writes to the shared buffer
and another thread reads from the buffer. I don't hit the out-of-order issue so
far.
Make sure the 2 areas you are accessing are in different cache lines.
Yes, I already put those 2 areas to separate cache lines.
My driver may be not correct somewhere and I will update if I can reproduce
the issue with my driver in the future.
Then maybe your change is just making virtio slower and masks the bug
that is actually elsewhere?
You don't really need a driver. Here's a simple test: without barriers
assertion will fail. With barriers it will not.
(Warning: didn't bother testing too much, could be buggy.
---
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define FIRST values[0]
#define SECOND values[64]
volatile int values[100] = {};
void* writer_thread(void* arg) {
while (1) {
FIRST++;
// NEED smp_wmb here
__asm__ volatile("dmb ishst" : : : "memory");
SECOND++;
}
}
void* reader_thread(void* arg) {
while (1) {
int first = FIRST;
// NEED smp_rmb here
__asm__ volatile("dmb ishld" : : : "memory");
int second = SECOND;
assert(first - second == 1 || first - second == 0);
}
}
int main() {
pthread_t writer, reader;
pthread_create(&writer, NULL, writer_thread, NULL);
pthread_create(&reader, NULL, reader_thread, NULL);
pthread_join(writer, NULL);
pthread_join(reader, NULL);
return 0;
}
Had a quick test on NVidia's grace-hopper and Ampere's CPUs. I hit
the assert on both of them. After replacing 'dmb' with 'dsb', I can
hit assert on both of them too. I need to look at the code closely.
[root@virt-mtcollins-02 test]# ./a
a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed.
Aborted (core dumped)
[root@nvidia-grace-hopper-05 test]# ./a
a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed.
Aborted (core dumped)
Thanks,
Gavin
Actually this test is broken. No need for ordering it's a simple race.
The following works on x86 though (x86 does not need barriers
though).
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#if 0
#define x86_rmb() asm volatile("lfence":::"memory")
#define x86_mb() asm volatile("mfence":::"memory")
#define x86_smb() asm volatile("sfence":::"memory")
#else
#define x86_rmb() asm volatile("":::"memory")
#define x86_mb() asm volatile("":::"memory")
#define x86_smb() asm volatile("":::"memory")
#endif
#define FIRST values[0]
#define SECOND values[640]
#define FLAG values[1280]
volatile unsigned values[2000] = {};
void* writer_thread(void* arg) {
while (1) {
/* Now synchronize with reader */
while(FLAG);
FIRST++;
x86_smb();
SECOND++;
x86_smb();
FLAG = 1;
}
}
void* reader_thread(void* arg) {
while (1) {
/* Now synchronize with writer */
while(!FLAG);
x86_rmb();
unsigned first = FIRST;
x86_rmb();
unsigned second = SECOND;
assert(first - second == 1 || first - second == 0);
FLAG = 0;
if (!(first %1000000))
printf("%d\n", first);
}
}
int main() {
pthread_t writer, reader;
pthread_create(&writer, NULL, writer_thread, NULL);
pthread_create(&reader, NULL, reader_thread, NULL);
pthread_join(writer, NULL);
pthread_join(reader, NULL);
return 0;
}
I tried it on host and VM of NVidia's grace-hopper. Without the barriers, I
can hit assert. With the barriers, it's working fine without hitting the
assert.
I also had some code to mimic virtio vring last weekend, and it's just
working well. Back to our original issue, __smb_wmb() is issued by guest
while __smb_rmb() is executed on host. The VM and host are running at
different exception level: EL2 vs EL1. I'm not sure it's the cause. I
need to modify my code so that __smb_wmb() and __smb_rmb() can be executed
from guest and host.
[gshan@gshan code]$ cat test.h
#ifndef __TEST_H
#define __TEST_H
struct vring_desc {
uint64_t addr;
uint32_t len;
uint16_t flags;
uint16_t next;
} __attribute__((aligned(4)));
struct vring_avail {
uint16_t flags;
uint16_t idx;
uint16_t ring[];
} __attribute__((aligned(4)));
struct vring_used_elem {
uint32_t id;
uint32_t len;
} __attribute__((aligned(4)));
struct vring_used {
uint16_t flags;
uint16_t idx;
struct vring_used_elem ring[];
} __attribute__((aligned(4)));
struct vring {
struct vring_desc *desc;
struct vring_avail *avail;
struct vring_used *used;
uint8_t pad0[64];
/* Writer */
uint32_t num;
uint32_t w_num_free;
uint32_t w_free_head;
uint16_t w_avail_idx;
uint16_t w_last_used_idx;
uint16_t *w_extra_data;
uint16_t *w_extra_next;
uint8_t pad1[64];
/* Reader */
uint16_t r_avail_idx;
uint16_t r_last_avail_idx;
uint16_t r_last_used_idx;
uint8_t pad2[64];
};
static inline unsigned int vring_size(unsigned int num, unsigned long align)
{
return ((sizeof(struct vring_desc) * num +
sizeof(uint16_t) * (3 + num) + (align - 1)) & ~(align - 1)) +
sizeof(uint16_t) * 3 + sizeof(struct vring_used_elem) * num;
}
static inline void __smp_rmb(void)
{
#ifdef WEAK_BARRIER
__asm__ volatile("dmb ishld" : : : "memory");
#else
__asm__ volatile("dsb sy" : : : "memory");
#endif
}
static inline void __smp_wmb(void)
{
#ifdef WEAK_BARRIER
__asm__ volatile("dmb ishst" : : : "memory");
#else
__asm__ volatile("dsb sy" : : : "memory");
#endif
}
#endif /* __TEST_H */
[gshan@gshan code]$ cat test.c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/types.h>
#include <assert.h>
#include <sched.h>
#include <pthread.h>
#include "test.h"
static struct vring *vring;
static int bind_cpu(int cpuid)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(cpuid, &cpuset);
return pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
}
static void write_free_used_desc(void)
{
uint16_t last_used;
uint32_t idx;
if ((uint16_t)(vring->used->idx - vring->w_last_used_idx) < 64)
return;
while (true) {
if (vring->w_last_used_idx == vring->used->idx)
return;
__smp_rmb();
/* Retrieve the head */
last_used = vring->w_last_used_idx & (vring->num - 1);
idx = vring->used->ring[last_used].id;
assert(idx < vring->num);
assert(vring->w_extra_data[idx]);
/* Reclaim the descriptor */
vring->w_extra_data[idx] = 0;
vring->w_extra_next[idx] = vring->w_free_head;
vring->w_free_head = idx;
/* Update statistics */
vring->w_num_free++;
vring->w_last_used_idx++;
}
}
static void write_push_desc(void)
{
uint32_t head = vring->w_free_head;
uint32_t avail_idx;
if (vring->w_num_free < 1)
return;
/*
* The data in the descriptor doesn't matter. The idea here
* is to dirty the cache line.
*/
vring->desc[head].flags = 1;
vring->desc[head].addr = 0xffffffffffffffff;
vring->desc[head].len = 0xffffffff;
vring->desc[head].next = vring->w_extra_next[head];
vring->desc[head].flags = 0;
vring->w_num_free--;
vring->w_free_head = vring->w_extra_next[head];
vring->w_extra_data[head] = 1;
avail_idx = vring->w_avail_idx & (vring->num - 1);
vring->avail->ring[avail_idx] = head;
__smp_wmb();
vring->w_avail_idx++;
vring->avail->idx = vring->w_avail_idx;
}
static void *write_worker(void *arg)
{
assert(!bind_cpu(10));
while (true) {
write_free_used_desc();
write_push_desc();
}
return NULL;
}
static void read_pull_desc(void)
{
uint16_t avail_idx, last_avail_idx;
uint32_t head;
last_avail_idx = vring->r_last_avail_idx;
if (vring->r_avail_idx == vring->r_last_avail_idx) {
vring->r_avail_idx = vring->avail->idx;
if (vring->r_avail_idx == last_avail_idx)
return;
__smp_rmb();
}
head = vring->avail->ring[last_avail_idx & (vring->num - 1)];
assert(head < vring->num);
vring->r_last_avail_idx++;
vring->used->ring[vring->r_last_used_idx & (vring->num - 1)].id = head;
vring->used->ring[vring->r_last_used_idx & (vring->num - 1)].len = 0;
vring->r_last_used_idx++;
__smp_wmb();
vring->used->idx = vring->r_last_used_idx;
}
static void *read_worker(void *arg)
{
assert(!bind_cpu(60));
while (true) {
read_pull_desc();
}
return NULL;
}
static void init_vring(unsigned int num, unsigned long align)
{
unsigned int size, i;
/* vring */
vring = malloc(sizeof(*vring));
assert(vring);
memset(vring, 0, sizeof(*vring));
/* Descriptors */
size = vring_size(num, align);
vring->desc = (struct vring_desc *)malloc(size);
assert(vring->desc);
memset(vring->desc, 0, size);
vring->avail = (struct vring_avail *)((void *)vring->desc +
num * sizeof(struct vring_desc));
vring->used = (struct vring_used *)(((unsigned long)&vring->avail->ring[num] +
sizeof(uint16_t) + (align - 1)) & ~(align - 1));
/* Writer's extra data */
vring->w_extra_data = malloc(sizeof(uint16_t) * num);
assert(vring->w_extra_data);
memset(vring->w_extra_data, 0, sizeof(uint16_t) * num);
vring->w_extra_next = malloc(sizeof(uint16_t) * num);
assert(vring->w_extra_next);
memset(vring->w_extra_next, 0, sizeof(uint16_t) * num);
for (i = 0; i < num - 1; i++)
vring->w_extra_next[i] = i + 1;
/* Statistics */
vring->num = num;
vring->w_num_free = num;
vring->w_free_head = 0;
vring->w_avail_idx = 0;
vring->w_last_used_idx = 0;
vring->r_avail_idx = 0;
vring->r_last_avail_idx = 0;
vring->r_last_used_idx = 0;
}
int main(int argc, char **argv)
{
pthread_t w_tid, r_tid;
int ret;
assert(!bind_cpu(0));
init_vring(256, 64);
ret = pthread_create(&w_tid, NULL, write_worker, NULL);
assert(!ret);
ret = pthread_create(&r_tid, NULL, read_worker, NULL);
assert(!ret);
ret = pthread_join(w_tid, NULL);
assert(!ret);
ret = pthread_join(r_tid, NULL);
assert(!ret);
while (true) {
sleep(1);
}
return 0;
}
Thanks,
Gavin