Re: [PATCH v2 10/14] nvme-tcp: Use CCR to recover controller that hits an error
From: Hannes Reinecke
Date: Tue Feb 03 2026 - 00:37:23 EST
On 1/30/26 23:34, Mohamed Khalfella wrote:
An alive nvme controller that hits an error now will move to FENCING
state instead of RESETTING state. ctrl->fencing_work attempts CCR to
terminate inflight IOs. If CCR succeeds, switch to FENCED -> RESETTING
and continue error recovery as usual. If CCR fails, the behavior depends
on whether the subsystem supports CQT or not. If CQT is not supported
then reset the controller immediately as if CCR succeeded in order to
maintain the current behavior. If CQT is supported switch to time-based
recovery. The scheduled ctrl->fenced_work resets the controller when
time-based recovery finishes.
Either ctrl->err_work or ctrl->reset_work can run after a controller is
fenced. Flush the fencing work when either work runs.
Signed-off-by: Mohamed Khalfella <mkhalfella@xxxxxxxxxxxxxxx>
---
drivers/nvme/host/tcp.c | 62 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 61 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 69cb04406b47..af8d3b36a4bb 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -193,6 +193,8 @@ struct nvme_tcp_ctrl {
struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
+ struct work_struct fencing_work;
+ struct delayed_work fenced_work;
struct work_struct err_work;
struct delayed_work connect_work;
struct nvme_tcp_request async_req;
@@ -611,6 +613,12 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
{
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCING)) {
+ dev_warn(ctrl->device, "starting controller fencing\n");
+ queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->fencing_work);
+ return;
+ }
+
Don't you need to flush any outstanding 'fenced_work' queue items here
before calling 'queue_work()'?
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return;
@@ -2470,12 +2478,59 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
nvme_tcp_reconnect_or_remove(ctrl, ret);
}
+static void nvme_tcp_fenced_work(struct work_struct *work)
+{
+ struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
+ struct nvme_tcp_ctrl, fenced_work);
+ struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
+}
+
+static void nvme_tcp_fencing_work(struct work_struct *work)
+{
+ struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
+ struct nvme_tcp_ctrl, fencing_work);
+ struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+ unsigned long rem;
+
+ rem = nvme_fence_ctrl(ctrl);
+ if (!rem)
+ goto done;
+
+ if (!ctrl->cqt) {
+ dev_info(ctrl->device,
+ "CCR failed, CQT not supported, skip time-based recovery\n");
+ goto done;
+ }
+
As mentioned, cqt handling should be part of another patchset.
+ dev_info(ctrl->device,
+ "CCR failed, switch to time-based recovery, timeout = %ums\n",
+ jiffies_to_msecs(rem));
+ queue_delayed_work(nvme_wq, &tcp_ctrl->fenced_work, rem);
+ return;
+
Why do you need the 'fenced' workqueue at all? All it does is queuing yet another workqueue item, which certainly can be done from the 'fencing' workqueue directly, no?
+done:
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
+}
+
+static void nvme_tcp_flush_fencing_work(struct nvme_ctrl *ctrl)
+{
+ flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
+ flush_delayed_work(&to_tcp_ctrl(ctrl)->fenced_work);
+}
+
static void nvme_tcp_error_recovery_work(struct work_struct *work)
{
struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
struct nvme_tcp_ctrl, err_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+ nvme_tcp_flush_fencing_work(ctrl);
Why not 'fenced_work' ?
if (nvme_tcp_key_revoke_needed(ctrl))
nvme_auth_revoke_tls_key(ctrl);
nvme_stop_keep_alive(ctrl);
@@ -2518,6 +2573,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
container_of(work, struct nvme_ctrl, reset_work);
int ret;
+ nvme_tcp_flush_fencing_work(ctrl);
Same.
if (nvme_tcp_key_revoke_needed(ctrl))
nvme_auth_revoke_tls_key(ctrl);
nvme_stop_ctrl(ctrl);
@@ -2643,13 +2699,15 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
struct nvme_command *cmd = &pdu->cmd;
int qid = nvme_tcp_queue_id(req->queue);
+ enum nvme_ctrl_state state;
dev_warn(ctrl->device,
"I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
nvme_fabrics_opcode_str(qid, cmd), qid);
- if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
+ state = nvme_ctrl_state(ctrl);
+ if (state != NVME_CTRL_LIVE && state != NVME_CTRL_FENCING) {
'FENCED' too, presumably?
/*
* If we are resetting, connecting or deleting we should
* complete immediately because we may block controller
@@ -2904,6 +2962,8 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
INIT_DELAYED_WORK(&ctrl->connect_work,
nvme_tcp_reconnect_ctrl_work);
+ INIT_DELAYED_WORK(&ctrl->fenced_work, nvme_tcp_fenced_work);
+ INIT_WORK(&ctrl->fencing_work, nvme_tcp_fencing_work);
INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
Here you are calling CCR whenever error recovery is triggered.
This will cause CCR to be sent from a command timeout, which is
technically wrong (CCR should be sent when the KATO timeout expires,
not when a command timeout expires). Both could be vastly different.
So I'd prefer to have CCR sent whenever the KATO timeout triggers, and
leave the current command timeout mechanism in place.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@xxxxxxx +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich