Re: [PATCH 2/2] nvme/host: add delayed retries upon non-fatal error during ns validation
From: Sagi Grimberg
Date: Thu Dec 25 2025 - 08:00:23 EST
On 21/12/2025 23:26, Alex Tran wrote:
If a non-fatal error is received during nvme namespace validation, it
should not be ignored and the namespace should not be removed immediately.
Rather, delayed retries should be performed on the namespace validation
process.
This handles non-fatal issues more robustly, by retrying a few times before
giving up and removing the namespace. The number of retries is set
to 3 and the interval between retries is set to 3 seconds.
Signed-off-by: Alex Tran <alex.t.tran@xxxxxxxxx>
---
drivers/nvme/host/core.c | 43 +++++++++++++++++++++++++++++++++++++++----
drivers/nvme/host/nvme.h | 9 +++++++++
2 files changed, 48 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fab321e79b7cdbb89d96d950c1cc8c1128906770..2e208d894b27f85f7f6358eb697be262ce45aed6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -139,6 +139,7 @@ static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
struct nvme_command *cmd);
static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
+static void nvme_validate_ns_work(struct work_struct *work);
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
@@ -4118,6 +4119,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
ns->ctrl = ctrl;
kref_init(&ns->kref);
+ INIT_DELAYED_WORK(&ns->validate_work, nvme_validate_ns_work);
+
if (nvme_init_ns_head(ns, info))
goto out_cleanup_disk;
@@ -4215,6 +4218,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
{
bool last_path = false;
+ cancel_delayed_work_sync(&ns->validate_work);
+
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
@@ -4285,12 +4290,42 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
out:
/*
* Only remove the namespace if we got a fatal error back from the
- * device, otherwise ignore the error and just move on.
- *
- * TODO: we should probably schedule a delayed retry here.
+ * device, otherwise delayed retries are performed.
*/
- if (ret > 0 && (ret & NVME_STATUS_DNR))
+ if (ret > 0 && (ret & NVME_STATUS_DNR)) {
nvme_ns_remove(ns);
+ } else if (ret > 0) {
+ if (ns->validate_retries < NVME_NS_VALIDATION_MAX_RETRIES) {
+ ns->validate_retries++;
+
+ if (!nvme_get_ns(ns))
+ return;
+
+ dev_warn(
+ ns->ctrl->device,
+ "validation failed for nsid %d, retry %d/%d in %ds\n",
+ ns->head->ns_id, ns->validate_retries,
+ NVME_NS_VALIDATION_MAX_RETRIES,
+ NVME_NS_VALIDATION_RETRY_INTERVAL);
+ memcpy(&ns->pending_info, info, sizeof(*info));
+ schedule_delayed_work(
+ &ns->validate_work,
+ NVME_NS_VALIDATION_RETRY_INTERVAL * HZ);
Given that ns scanning is already async, wouldn't it be simpler to just
retry locally in a loop?