Re: BUG: vnet0 selects TX queue 11, but real number of TX queues is 11

From: Stanislav Fomichev
Date: Tue Jan 29 2019 - 21:52:43 EST


On Tue, Jan 29, 2019 at 6:16 PM George Amanakis <gamanakis@xxxxxxxxx> wrote:
>
> Since 4.20.4, when running a KVM guest with vhost_net, I am seeing the following in dmesg:
> vnet0 selects TX queue 11, but real number of TX queues is 11
>
> The corresponding part in the xml definition of the virtual machine is:
> -------8<-------
> <interface type='bridge'>
> <mac address='xx:xx:xx:xx:xx:xx'/>
> <source bridge='br0'/>
> <model type='virtio'/>
> <driver name='vhost' queues='12'>
> </driver>
> <address type='pci' domain='0x0000' bus='0x01' slot='0x00' function='0x0'/>
> </interface>
> -------8<-------
>
> Doing a git-bisect with 4.20.3 last known good, and 4.20.4 as bad, this
> commit turned up:
> -------8<-------
> commit 9ff0436e2c3575ffe64d359fb3b67aee237dc519
> Author: Stanislav Fomichev <sdf@xxxxxxxxxx>
> Date: Mon Jan 7 13:38:38 2019 -0800
>
> tun: publish tfile after it's fully initialized
>
> [ Upstream commit 0b7959b6257322f7693b08a459c505d4938646f2 ]
>
> BUG: unable to handle kernel NULL pointer dereference at 00000000000000d1
> -------8<-------
>
>
> Applying the following patch corrects it in 4.20.5. Would this be the
> correct thing to do?
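Ouch, tun_set_real_num_queues() uses tun->numqueues internally :-(
That means it is now called before tun->numqueues++ in tun_attach(), so
the real number of queues ends up one short (11 instead of 12 here).
Your patch looks good to me; care to do a proper submission (with a Fixes tag)?
I wonder whether we should use this as an opportunity to also do
something like the following (to make the queue count explicit):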

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 18656c4094b3..ea9928b3b930 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -632,10 +632,10 @@ static inline bool tun_not_capable(struct tun_struct *tun)
!ns_capable(net->user_ns, CAP_NET_ADMIN);
}

-static void tun_set_real_num_queues(struct tun_struct *tun)
+static void tun_set_real_num_queues(struct tun_struct *tun, unsigned int nr)
{
- netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
- netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
+ netif_set_real_num_tx_queues(tun->dev, nr);
+ netif_set_real_num_rx_queues(tun->dev, nr);
}

static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
@@ -712,7 +712,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun_flow_delete_by_queue(tun, tun->numqueues + 1);
/* Drop read queue */
tun_queue_purge(tfile);
- tun_set_real_num_queues(tun);
+ tun_set_real_num_queues(tun, tun->numqueues);
} else if (tfile->detached && clean) {
tun = tun_enable_queue(tfile);
sock_put(&tfile->sk);
@@ -866,8 +866,6 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
if (rtnl_dereference(tun->xdp_prog))
sock_set_flag(&tfile->sk, SOCK_XDP);

- tun_set_real_num_queues(tun);
-
/* device is allowed to go away first, so no need to hold extra
* refcnt.
*/
@@ -879,6 +877,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
rcu_assign_pointer(tfile->tun, tun);
rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
tun->numqueues++;
+ tun_set_real_num_queues(tun, tun->numqueues);
out:
return err;
}


> ---
> drivers/net/tun.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 6658658246d2..e0dc004c6483 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -862,8 +862,6 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
> if (rtnl_dereference(tun->xdp_prog))
> sock_set_flag(&tfile->sk, SOCK_XDP);
>
> - tun_set_real_num_queues(tun);
> -
> /* device is allowed to go away first, so no need to hold extra
> * refcnt.
> */
> @@ -875,6 +873,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
> rcu_assign_pointer(tfile->tun, tun);
> rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
> tun->numqueues++;
> +
> + tun_set_real_num_queues(tun);
> +
> out:
> return err;
> }
> --
> 2.20.1
>