Re: [RFC PATCH] nvme-tcp: fix possible crash when only partial queues are allocated by nvme_tcp_alloc_queue

From: Sagi Grimberg
Date: Mon Mar 13 2023 - 08:08:54 EST



 So we suspect that directly freeing these allocated
I/O queues may result in this issue. To avoid this issue, we have tried the
following code modifications, but have not yet verified that they are valid
because of the intermittent nature of this issue.

For now, we cannot confirm who queues work onto the io_work while
the I/O queues are being allocated, but we suspect that the socket callback
sk_data_ready=nvme_tcp_data_ready, installed by nvme_tcp_alloc_queue,
may trigger this in that interval. We hope to get some suggestions. Thanks!

I would start by determining that the running io_work is an io queue or
the admin queue. It is also possible that the admin queue cleanup after
we io queues allocation failed is the problematic teardown that is
causing this bug.


Signed-off-by: Yanjun Zhang <zhangyanjun@xxxxxxxx>
---
  drivers/nvme/host/tcp.c | 7 +++++--
  1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 9b47dcb2a..5782183b8 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1761,6 +1761,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
  static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
  {
      int i, ret;
+    struct nvme_tcp_ctrl *tcp_ctrl = NULL;
      for (i = 1; i < ctrl->queue_count; i++) {
          ret = nvme_tcp_alloc_queue(ctrl, i);
@@ -1771,9 +1772,11 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
      return 0;
  out_free_queues:
-    for (i--; i >= 1; i--)
+    for (i--; i >= 1; i--) {
+        tcp_ctrl = to_tcp_ctrl(ctrl);
+        __nvme_tcp_stop_queue(&tcp_ctrl->queues[i]);
          nvme_tcp_free_queue(ctrl, i);
-
+    }
      return ret;
  }

The crash dump reminds me of a reported issue fixed by:
160f3549a907 ("nvme-tcp: fix UAF when detecting digest errors")

Is this applied in your host kernel?

I'd also add this trace to your reproduction attempts:
--
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 0ec7d3329595..0cbc8afb41c8 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -919,6 +919,10 @@ static void nvme_tcp_data_ready(struct sock *sk)

        read_lock_bh(&sk->sk_callback_lock);
        queue = sk->sk_user_data;
+       if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) {
+               pr_err("got data on a deallocated queue %d\n", nvme_tcp_queue_id(queue));
+               WARN_ON_ONCE(1);
+       }
        if (likely(queue && queue->rd_enabled) &&
            !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
--

If indeed this is the io_queue and for some reason it is running,
perhaps the correct fix would be to set the socket ops later in
start_queue. It will also be symmetric because we clear them in
stop_queue:
--
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 0ec7d3329595..b3b1e39966c0 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1617,22 +1617,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
if (ret)
goto err_init_connect;

- queue->rd_enabled = true;
set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
- nvme_tcp_init_recv_ctx(queue);
-
- write_lock_bh(&queue->sock->sk->sk_callback_lock);
- queue->sock->sk->sk_user_data = queue;
- queue->state_change = queue->sock->sk->sk_state_change;
- queue->data_ready = queue->sock->sk->sk_data_ready;
- queue->write_space = queue->sock->sk->sk_write_space;
- queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
- queue->sock->sk->sk_state_change = nvme_tcp_state_change;
- queue->sock->sk->sk_write_space = nvme_tcp_write_space;
-#ifdef CONFIG_NET_RX_BUSY_POLL
- queue->sock->sk->sk_ll_usec = 1;
-#endif
- write_unlock_bh(&queue->sock->sk->sk_callback_lock);

return 0;

@@ -1690,6 +1675,21 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
int ret;

+ queue->rd_enabled = true;
+ nvme_tcp_init_recv_ctx(queue);
+ write_lock_bh(&queue->sock->sk->sk_callback_lock);
+ queue->sock->sk->sk_user_data = queue;
+ queue->state_change = queue->sock->sk->sk_state_change;
+ queue->data_ready = queue->sock->sk->sk_data_ready;
+ queue->write_space = queue->sock->sk->sk_write_space;
+ queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
+ queue->sock->sk->sk_state_change = nvme_tcp_state_change;
+ queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ queue->sock->sk->sk_ll_usec = 1;
+#endif
+ write_unlock_bh(&queue->sock->sk->sk_callback_lock);
+
if (idx)
ret = nvmf_connect_io_queue(nctrl, idx);
else
--