[PATCH v6 5/5] nvme-fabrics: handle transient auth failures

From: Daniel Wagner
Date: Mon Apr 15 2024 - 08:43:57 EST


Retry authentication a couple of times to avoid error out on transient
errors. E.g. if a new key is deployed on the host and the target. At the
same time the connection drops. The host might use the wrong key to
reconnect.

Suggested-by: Sagi Grimberg <sagi@xxxxxxxxxxx>
Signed-off-by: Daniel Wagner <dwagner@xxxxxxx>
---
drivers/nvme/host/auth.c | 4 ++--
drivers/nvme/host/fabrics.c | 10 +++++++++-
drivers/nvme/host/fc.c | 2 ++
drivers/nvme/host/nvme.h | 1 +
drivers/nvme/host/rdma.c | 1 +
drivers/nvme/host/tcp.c | 1 +
6 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index a264b3ae078b..368dcc6ee11b 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -797,7 +797,7 @@ static void nvme_queue_auth_work(struct work_struct *work)
NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1);
if (ret) {
chap->status = ret;
- chap->error = -ECONNREFUSED;
+ chap->error = -EKEYREJECTED;
return;
}

@@ -818,7 +818,7 @@ static void nvme_queue_auth_work(struct work_struct *work)
ret = nvme_auth_process_dhchap_success1(ctrl, chap);
if (ret) {
/* Controller authentication failed */
- chap->error = -ECONNREFUSED;
+ chap->error = -EKEYREJECTED;
goto fail2;
}

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index d9a73b1b41c4..6dfa45dce232 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -573,8 +573,16 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
*/
bool nvmf_should_reconnect(struct nvme_ctrl *ctrl, int status)
{
- if (status < 0)
+ if (status < 0) {
+ /*
+ * authentication errors can be transient, thus retry a couple
+ * of times before giving up.
+ */
+ if (status == -EKEYREJECTED &&
+ ++ctrl->nr_auth_retries < 3)
+ return true;
return false;
+ }
else if (status > 0 && (status & NVME_SC_DNR))
return false;

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index f0b081332749..a22d2af18553 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3176,6 +3176,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);

ctrl->ctrl.nr_reconnects = 0;
+ ctrl->ctrl.nr_auth_retries = 0;

if (changed)
nvme_start_ctrl(&ctrl->ctrl);
@@ -3491,6 +3492,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,

ctrl->ctrl.opts = opts;
ctrl->ctrl.nr_reconnects = 0;
+ ctrl->ctrl.nr_auth_retries = 0;
INIT_LIST_HEAD(&ctrl->ctrl_list);
ctrl->lport = lport;
ctrl->rport = rport;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9b8904a476b8..a9af20f25405 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -382,6 +382,7 @@ struct nvme_ctrl {
u16 icdoff;
u16 maxcmd;
int nr_reconnects;
+ int nr_auth_retries;
unsigned long flags;
struct nvmf_ctrl_options *opts;

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 821ab3e0fd3b..f5b3201a11b5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1117,6 +1117,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
ctrl->ctrl.nr_reconnects);

ctrl->ctrl.nr_reconnects = 0;
+ ctrl->ctrl.nr_auth_retries = 0;

return;

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 3e0c33323320..45270c626c87 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2266,6 +2266,7 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
ctrl->nr_reconnects);

ctrl->nr_reconnects = 0;
+ ctrl->nr_auth_retries = 0;

return;

--
2.44.0