Re: [PATCH net-next v4 5/5] tools: virtio: introduce vhost_net_test

From: Jason Wang
Date: Sat Feb 03 2024 - 20:31:00 EST


On Fri, Feb 2, 2024 at 8:24 PM Yunsheng Lin <linyunsheng@xxxxxxxxxx> wrote:
>
> On 2024/2/2 12:05, Jason Wang wrote:
> > On Tue, Jan 30, 2024 at 7:38 PM Yunsheng Lin <linyunsheng@xxxxxxxxxx> wrote:
> >>
> >> introduce vhost_net_test basing on virtio_test to test
> >> vhost_net changing in the kernel.
> >
> > Let's describe what kind of test is being done and how it is done here.
>
> How about something like below:
>
> This patch introduces testing for both vhost_net tx and rx.
> Steps for vhost_net tx testing:
> 1. Prepare a out buf
> 2. Kick the vhost_net to do tx processing
> 3. Do the receiving in the tun side
> 4. verify the data received by tun is correct
>
> Steps for vhost_net rx testing::
> 1. Prepare a in buf
> 2. Do the sending in the tun side
> 3. Kick the vhost_net to do rx processing
> 4. verify the data received by vhost_net is correct

It looks like some important details were lost, e.g the logic for batching etc.

>
>
> >> +
> >> +static int tun_alloc(struct vdev_info *dev)
> >> +{
> >> + struct ifreq ifr;
> >> + int len = HDR_LEN;
> >
> > Any reason you can't just use the virtio_net uapi?
>
> I didn't find a macro for that in include/uapi/linux/virtio_net.h.
>
> Did you mean using something like below?
> sizeof(struct virtio_net_hdr_mrg_rxbuf)

Yes.

>
> >
> >> + int fd, e;
> >> +
> >> + fd = open("/dev/net/tun", O_RDWR);
> >> + if (fd < 0) {
> >> + perror("Cannot open /dev/net/tun");
> >> + return fd;
> >> + }
> >> +
> >> + memset(&ifr, 0, sizeof(ifr));
> >> +
> >> + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
> >> + snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
> >> +
> >> + e = ioctl(fd, TUNSETIFF, &ifr);
> >> + if (e < 0) {
> >> + perror("ioctl[TUNSETIFF]");
> >> + close(fd);
> >> + return e;
> >> + }
> >> +
> >> + e = ioctl(fd, TUNSETVNETHDRSZ, &len);
> >> + if (e < 0) {
> >> + perror("ioctl[TUNSETVNETHDRSZ]");
> >> + close(fd);
> >> + return e;
> >> + }
> >> +
> >> + e = ioctl(fd, SIOCGIFHWADDR, &ifr);
> >> + if (e < 0) {
> >> + perror("ioctl[SIOCGIFHWADDR]");
> >> + close(fd);
> >> + return e;
> >> + }
> >> +
> >> + memcpy(dev->mac, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
> >> + return fd;
> >> +}
> >> +
> >> +static void vdev_create_socket(struct vdev_info *dev)
> >> +{
> >> + struct ifreq ifr;
> >> +
> >> + dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
> >> + assert(dev->sock != -1);
> >> +
> >> + snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
> >
> > Nit: it might be better to accept the device name instead of repeating
> > the snprintf trick here, this would facilitate the future changes.
>
> I am not sure I understand what did you mean by "accept the device name"
> here.
>
> The above is used to get ifindex of the tun netdevice created in
> tun_alloc(), so that we can use it in vdev_send_packet() to send
> a packet using the tun netdevice created in tun_alloc(). Is there
> anything obvious I missed here?

I meant a const char *ifname for this function and let the caller to
pass the name.

>
> >
> >> + assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >= 0);
> >> +
> >> + dev->ifindex = ifr.ifr_ifindex;
> >> +
> >> + /* Set the flags that bring the device up */
> >> + assert(ioctl(dev->sock, SIOCGIFFLAGS, &ifr) >= 0);
> >> + ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
> >> + assert(ioctl(dev->sock, SIOCSIFFLAGS, &ifr) >= 0);
> >> +}
> >> +
> >> +static void vdev_send_packet(struct vdev_info *dev)
> >> +{
> >> + char *sendbuf = dev->test_buf + HDR_LEN;
> >> + struct sockaddr_ll saddrll = {0};
> >> + int sockfd = dev->sock;
> >> + int ret;
> >> +
> >> + saddrll.sll_family = PF_PACKET;
> >> + saddrll.sll_ifindex = dev->ifindex;
> >> + saddrll.sll_halen = ETH_ALEN;
> >> + saddrll.sll_protocol = htons(TEST_PTYPE);
> >> +
> >> + ret = sendto(sockfd, sendbuf, TEST_BUF_LEN, 0,
> >> + (struct sockaddr *)&saddrll,
> >> + sizeof(struct sockaddr_ll));
> >> + assert(ret >= 0);
> >> +}
> >> +
>
> ...
>
> >> +
> >> +static void vq_info_add(struct vdev_info *dev, int idx, int num, int fd)
> >> +{
> >> + struct vhost_vring_file backend = { .index = idx, .fd = fd };
> >> + struct vq_info *info = &dev->vqs[idx];
> >> + int r;
> >> +
> >> + info->idx = idx;
> >> + info->kick = eventfd(0, EFD_NONBLOCK);
> >> + info->call = eventfd(0, EFD_NONBLOCK);
> >
> > If we don't care about the callback, let's just avoid to set the call here?
> >
> > (As I see vq_callback is a NULL)
>
> Sure, will remove the vq_callback related code.
>
> >
> >> + r = posix_memalign(&info->ring, 4096, vring_size(num, 4096));
> >> + assert(r >= 0);
> >> + vq_reset(info, num, &dev->vdev);
> >> + vhost_vq_setup(dev, info);
> >> + info->fds.fd = info->call;
> >> + info->fds.events = POLLIN;
> >> +
> >> + r = ioctl(dev->control, VHOST_NET_SET_BACKEND, &backend);
> >> + assert(!r);
> >> +}
> >> +
> >> +static void vdev_info_init(struct vdev_info *dev, unsigned long long features)
> >> +{
> >> + struct ether_header *eh;
> >> + int i, r;
> >> +
> >> + dev->vdev.features = features;
> >> + INIT_LIST_HEAD(&dev->vdev.vqs);
> >> + spin_lock_init(&dev->vdev.vqs_list_lock);
> >> +
> >> + dev->buf_size = (HDR_LEN + TEST_BUF_LEN) * 2;
> >> + dev->buf = malloc(dev->buf_size);
> >> + assert(dev->buf);
> >> + dev->test_buf = dev->buf;
> >> + dev->res_buf = dev->test_buf + HDR_LEN + TEST_BUF_LEN;
> >> +
> >> + memset(dev->test_buf, 0, HDR_LEN + TEST_BUF_LEN);
> >> + eh = (struct ether_header *)(dev->test_buf + HDR_LEN);
> >> + eh->ether_type = htons(TEST_PTYPE);
> >> + memcpy(eh->ether_dhost, dev->mac, ETHER_ADDR_LEN);
> >> + memcpy(eh->ether_shost, dev->mac, ETHER_ADDR_LEN);
> >> +
> >> + for (i = sizeof(*eh); i < TEST_BUF_LEN; i++)
> >> + dev->test_buf[i + HDR_LEN] = (char)i;
> >> +
> >> + dev->control = open("/dev/vhost-net", O_RDWR);
> >> + assert(dev->control >= 0);
> >> +
> >> + r = ioctl(dev->control, VHOST_SET_OWNER, NULL);
> >> + assert(r >= 0);
> >> +
> >> + dev->mem = malloc(offsetof(struct vhost_memory, regions) +
> >> + sizeof(dev->mem->regions[0]));
> >> + assert(dev->mem);
> >> + memset(dev->mem, 0, offsetof(struct vhost_memory, regions) +
> >> + sizeof(dev->mem->regions[0]));
> >> + dev->mem->nregions = 1;
> >> + dev->mem->regions[0].guest_phys_addr = (long)dev->buf;
> >> + dev->mem->regions[0].userspace_addr = (long)dev->buf;
> >> + dev->mem->regions[0].memory_size = dev->buf_size;
> >> +
> >> + r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
> >> + assert(r >= 0);
> >> +
> >> + r = ioctl(dev->control, VHOST_SET_FEATURES, &features);
> >> + assert(r >= 0);
> >> +
> >> + dev->nvqs = 2;
> >> +}
> >> +
> >> +static void wait_for_interrupt(struct vq_info *vq)
> >> +{
> >> + unsigned long long val;
> >> +
> >> + poll(&vq->fds, 1, -1);
> >> +
> >> + if (vq->fds.revents & POLLIN)
> >> + read(vq->fds.fd, &val, sizeof(val));
> >> +}
> >> +
> >> +static void verify_res_buf(char *res_buf)
> >> +{
> >> + int i;
> >> +
> >> + for (i = ETHER_HDR_LEN; i < TEST_BUF_LEN; i++)
> >> + assert(res_buf[i] == (char)i);
> >> +}
> >> +
> >> +static void run_tx_test(struct vdev_info *dev, struct vq_info *vq,
> >> + bool delayed, int batch, int bufs)
> >
> > It might be better to describe the test design briefly above as a
> > comment. Or we can start from simple test logic and add sophisticated
> > ones on top.
>
> Does something described in the comment log as suggested by you make
> sense to you?
> Steps for vhost_net tx testing:
> 1. Prepare a out buf
> 2. Kick the vhost_net to do tx processing
> 3. Do the receiving in the tun side
> 4. verify the data received by tun is correct

See my reply above.

>
> >
> >> +{
> >> + const bool random_batch = batch == RANDOM_BATCH;
> >> + long long spurious = 0;
> >> + struct scatterlist sl;
> >> + unsigned int len;
> >> + int r;
> >> +
> >> + for (;;) {
> >> + long started_before = vq->started;
> >> + long completed_before = vq->completed;
> >> +
> >> + virtqueue_disable_cb(vq->vq);
> >> + do {
> >> + if (random_batch)
> >> + batch = (random() % vq->vring.num) + 1;
> >> +
> >> + while (vq->started < bufs &&
> >> + (vq->started - vq->completed) < batch) {
> >> + sg_init_one(&sl, dev->test_buf, HDR_LEN + TEST_BUF_LEN);
> >> + r = virtqueue_add_outbuf(vq->vq, &sl, 1,
> >> + dev->test_buf + vq->started,
> >> + GFP_ATOMIC);
> >> + if (unlikely(r != 0)) {
> >> + if (r == -ENOSPC &&
> >> + vq->started > started_before)
> >> + r = 0;
> >> + else
> >> + r = -1;
> >> + break;
> >> + }
> >> +
> >> + ++vq->started;
> >> +
> >> + if (unlikely(!virtqueue_kick(vq->vq))) {
> >> + r = -1;
> >> + break;
> >> + }
> >> + }
> >> +
> >> + if (vq->started >= bufs)
> >> + r = -1;
> >> +
> >> + /* Flush out completed bufs if any */
> >> + while (virtqueue_get_buf(vq->vq, &len)) {
> >> + int n;
> >> +
> >> + n = recvfrom(dev->sock, dev->res_buf, TEST_BUF_LEN, 0, NULL, NULL);
> >> + assert(n == TEST_BUF_LEN);
> >> + verify_res_buf(dev->res_buf);
> >> +
> >> + ++vq->completed;
> >> + r = 0;
> >> + }
> >> + } while (r == 0);
> >> +
> >> + if (vq->completed == completed_before && vq->started == started_before)
> >> + ++spurious;
> >> +
> >> + assert(vq->completed <= bufs);
> >> + assert(vq->started <= bufs);
> >> + if (vq->completed == bufs)
> >> + break;
> >> +
> >> + if (delayed) {
> >> + if (virtqueue_enable_cb_delayed(vq->vq))
> >> + wait_for_interrupt(vq);
> >> + } else {
> >> + if (virtqueue_enable_cb(vq->vq))
> >> + wait_for_interrupt(vq);
> >> + }
> >> + }
> >> + printf("TX spurious wakeups: 0x%llx started=0x%lx completed=0x%lx\n",
> >> + spurious, vq->started, vq->completed);
> >> +}
> >> +
> >> +static void run_rx_test(struct vdev_info *dev, struct vq_info *vq,
> >> + bool delayed, int batch, int bufs)
> >> +{
> >> + const bool random_batch = batch == RANDOM_BATCH;
> >> + long long spurious = 0;
> >> + struct scatterlist sl;
> >> + unsigned int len;
> >> + int r;
> >> +
> >> + for (;;) {
> >> + long started_before = vq->started;
> >> + long completed_before = vq->completed;
> >> +
> >> + do {
> >> + if (random_batch)
> >> + batch = (random() % vq->vring.num) + 1;
> >> +
> >> + while (vq->started < bufs &&
> >> + (vq->started - vq->completed) < batch) {
> >> + sg_init_one(&sl, dev->res_buf, HDR_LEN + TEST_BUF_LEN);
> >> +
> >> + r = virtqueue_add_inbuf(vq->vq, &sl, 1,
> >> + dev->res_buf + vq->started,
> >> + GFP_ATOMIC);
> >> + if (unlikely(r != 0)) {
> >> + if (r == -ENOSPC &&
> >
> > Drivers usually maintain a #free_slots, this can help to avoid the
> > trick for checking ENOSPC?
>
> The above "(vq->started - vq->completed) < batch" seems to ensure that
> the 'r' can't be '-ENOSPC'?

Well, if this is true any reason we still check ENOSPEC here?

> We just need to ensure the batch <= desc_num,
> and the 'r == -ENOSPC' checking seems to be unnecessary.
>
> >
> >> + vq->started > started_before)
> >> + r = 0;
> >> + else
> >> + r = -1;
> >> + break;
> >> + }
> >> +
> >> + ++vq->started;
> >> +
> >> + vdev_send_packet(dev);
> >> +
> >> + if (unlikely(!virtqueue_kick(vq->vq))) {
> >> + r = -1;
> >> + break;
> >> + }
> >> + }
> >> +
> >> + if (vq->started >= bufs)
> >> + r = -1;
> >> +
> >> + /* Flush out completed bufs if any */
> >> + while (virtqueue_get_buf(vq->vq, &len)) {
> >> + struct ether_header *eh;
> >> +
> >> + eh = (struct ether_header *)(dev->res_buf + HDR_LEN);
> >> +
> >> + /* tun netdev is up and running, ignore the
> >> + * non-TEST_PTYPE packet.
> >> + */
> >
> > I wonder if it's better to set up some kind of qdisc to avoid the
> > unexpected packet here, or is it too complicated?
>
> Yes, at least I don't know to do that yet.

For example, the blackhole qdisc?

Thanks