[PATCH v3 19/21] nvme-tcp: Extend FENCING state per TP4129 on CCR failure

From: Mohamed Khalfella

Date: Fri Feb 13 2026 - 23:34:40 EST


If CCR operations fail and CQT is supported, we must defer the retry of
inflight requests per TP4129. Update ctrl->fencing_work to schedule
ctrl->fenced_work, effectively extending the FENCING state. This delay
ensures that inflight requests are held until it is safe for them to be
retried.

Signed-off-by: Mohamed Khalfella <mkhalfella@xxxxxxxxxxxxxxx>
---
drivers/nvme/host/tcp.c | 39 +++++++++++++++++++++++++++++++++++----
1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 229cfdffd848..054e8a350d75 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -194,6 +194,7 @@ struct nvme_tcp_ctrl {
struct nvme_ctrl ctrl;

struct work_struct fencing_work;
+ struct delayed_work fenced_work;
struct work_struct err_work;
struct delayed_work connect_work;
struct nvme_tcp_request async_req;
@@ -2477,6 +2478,18 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
nvme_tcp_reconnect_or_remove(ctrl, ret);
}

+static void nvme_tcp_fenced_work(struct work_struct *work)
+{
+ struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
+ struct nvme_tcp_ctrl, fenced_work);
+ struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+ dev_info(ctrl->device, "Time-based recovery finished\n");
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
+}
+
static void nvme_tcp_fencing_work(struct work_struct *work)
{
struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
@@ -2485,23 +2498,40 @@ static void nvme_tcp_fencing_work(struct work_struct *work)
unsigned long rem;

rem = nvme_fence_ctrl(ctrl);
- if (rem) {
+ if (!rem)
+ goto done;
+
+ if (!ctrl->cqt) {
dev_info(ctrl->device,
- "CCR failed, skipping time-based recovery\n");
+ "CCR failed, CQT not supported, skip time-based recovery\n");
+ goto done;
}

+ dev_info(ctrl->device,
+ "CCR failed, switch to time-based recovery, timeout = %ums\n",
+ jiffies_to_msecs(rem));
+ queue_delayed_work(nvme_wq, &tcp_ctrl->fenced_work, rem);
+ return;
+
+done:
nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
}

+static void nvme_tcp_flush_fencing_works(struct nvme_ctrl *ctrl)
+{
+ flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
+ flush_delayed_work(&to_tcp_ctrl(ctrl)->fenced_work);
+}
+
static void nvme_tcp_error_recovery_work(struct work_struct *work)
{
struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
struct nvme_tcp_ctrl, err_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;

- flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
+ nvme_tcp_flush_fencing_works(ctrl);
if (nvme_tcp_key_revoke_needed(ctrl))
nvme_auth_revoke_tls_key(ctrl);
nvme_stop_keep_alive(ctrl);
@@ -2544,7 +2574,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
container_of(work, struct nvme_ctrl, reset_work);
int ret;

- flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
+ nvme_tcp_flush_fencing_works(ctrl);
if (nvme_tcp_key_revoke_needed(ctrl))
nvme_auth_revoke_tls_key(ctrl);
nvme_stop_ctrl(ctrl);
@@ -2934,6 +2964,7 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
INIT_DELAYED_WORK(&ctrl->connect_work,
nvme_tcp_reconnect_ctrl_work);
INIT_WORK(&ctrl->fencing_work, nvme_tcp_fencing_work);
+ INIT_DELAYED_WORK(&ctrl->fenced_work, nvme_tcp_fenced_work);
INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);

--
2.52.0