[PATCH 1/2] scsi: megaraid_sas: Fix DCMD issue command handling
From: mengfanhui
Date: Thu May 30 2024 - 05:46:31 EST
If a DCMD timeout is not handled, the next interaction between the driver and
firmware will also result in a DCMD timeout, which may cause the system to crash or hang.
Add proper error handling for DCMD commands
on Fusion adapters:
1. The action to take on a DCMD timeout is decided by
dcmd_timeout_ocr_possible(), which returns one of the following values:
INITIATE_OCR: reset the controller with megasas_reset_fusion()
KILL_ADAPTER: call megaraid_sas_kill_hba()
IGNORE_TIMEOUT: log the timeout and continue
2. If those DCMDs fail, the driver bails out.
Error log:
[ 201.689759] megaraid_sas 0001:05:00.0: megasas_sync_pd_seq_num DCMD timed out, continue without JBOD sequence map
[ 242.649061] [] megasas_init+0x114/0x4000 [megaraid_sas]
[ 363.481009] [] megasas_issue_blocked_cmd+0x1d8/0x268 [megaraid_sas]
[ 363.481159] [] megasas_get_pd_list+0x548/0x688 [megaraid_sas]
[ 363.481309] [] megasas_init_fw+0xb38/0x1104 [megaraid_sas]
[ 363.481459] [] megasas_probe_one+0x1f4/0x5c4 [megaraid_sas]
[ 363.482419] [] megasas_init+0x114/0x4000 [megaraid_sas]
[ 381.912298] megaraid_sas 0001:05:00.0: DCMD(opcode: 0x2010100) is timed out, func:megasas_issue_blocked_cmd
[ 381.912979] megaraid_sas 0001:05:00.0: Ignore DCMD timeout: megasas_get_pd_list 4727
[ 484.313526] [] megasas_init+0x114/0x4000 [megaraid_sas]
[ 562.136294] megaraid_sas 0001:05:00.0: DCMD(opcode: 0x3010100) is timed out, func:megasas_issue_blocked_cmd
[ 562.137074] megaraid_sas 0001:05:00.0: Ignore DCMD timeout: megasas_ld_list_query 4973
[ 562.137081] megaraid_sas 0001:05:00.0: failed to get LD list
[ 562.137425] megaraid_sas 0001:05:00.0: megasas_init_fw: megasas_get_device_list failed
[ 562.137767] megaraid_sas 0001:05:00.0: megasas_disable_intr_fusion is called outbound_intr_mask:0x40000009
[ 562.139232] megaraid_sas 0001:05:00.0: Failed from megasas_init_fw 6572
Co-developed-by: Jackie Liu <liuyun01@xxxxxxxxxx>
Signed-off-by: Jackie Liu <liuyun01@xxxxxxxxxx>
Signed-off-by: mengfanhui <mengfanhui@xxxxxxxxxx>
Suggested-by: Geliang Tang <geliang@xxxxxxxxxx>
---
drivers/scsi/megaraid/megaraid_sas.h | 1 +
drivers/scsi/megaraid/megaraid_sas_base.c | 4 +-
drivers/scsi/megaraid/megaraid_sas_fusion.c | 71 +++++++++++++++++----
3 files changed, 62 insertions(+), 14 deletions(-)
diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h
index 5680c6cdb221..91570c5e8456 100644
--- a/drivers/scsi/megaraid/megaraid_sas.h
+++ b/drivers/scsi/megaraid/megaraid_sas.h
@@ -2760,5 +2760,6 @@ void megasas_exit_debugfs(void);
void megasas_setup_debugfs(struct megasas_instance *instance);
void megasas_destroy_debugfs(struct megasas_instance *instance);
int megasas_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num);
+int dcmd_timeout_ocr_possible(struct megasas_instance *instance);
#endif /*LSI_MEGARAID_SAS_H */
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 170b38f04655..ba8061ea2078 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -4518,8 +4518,8 @@ int megasas_alloc_cmds(struct megasas_instance *instance)
* Return 0 for only Fusion adapter, if driver load/unload is not in progress
* or FW is not under OCR.
*/
-inline int
-dcmd_timeout_ocr_possible(struct megasas_instance *instance) {
+int dcmd_timeout_ocr_possible(struct megasas_instance *instance)
+{
if (instance->adapter_type == MFI_SERIES)
return KILL_ADAPTER;
diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
index 6c1fb8149553..f0aeb1ee83a2 100644
--- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
+++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
@@ -1363,17 +1363,42 @@ megasas_sync_pd_seq_num(struct megasas_instance *instance, bool pend) {
"driver supports max %d JBOD, but FW reports %d\n",
MAX_PHYSICAL_DEVICES, le32_to_cpu(pd_sync->count));
ret = -EINVAL;
+ goto out;
}
- if (ret == DCMD_TIMEOUT)
- dev_warn(&instance->pdev->dev,
- "%s DCMD timed out, continue without JBOD sequence map\n",
- __func__);
-
- if (ret == DCMD_SUCCESS)
+ switch (ret) {
+ case DCMD_SUCCESS:
instance->pd_seq_map_id++;
+ break;
+ case DCMD_TIMEOUT:
+ switch (dcmd_timeout_ocr_possible(instance)) {
+ case INITIATE_OCR:
+ cmd->flags |= DRV_DCMD_SKIP_REFIRE;
+ mutex_unlock(&instance->reset_mutex);
+ megasas_reset_fusion(instance->host,
+ MFI_IO_TIMEOUT_OCR);
+ mutex_lock(&instance->reset_mutex);
+ break;
+ case KILL_ADAPTER:
+ megaraid_sas_kill_hba(instance);
+ break;
+ case IGNORE_TIMEOUT:
+ dev_info(&instance->pdev->dev, "Ignore DCMD timeout: %s %d\n",
+ __func__, __LINE__);
+ break;
+ }
+ break;
+ case DCMD_FAILED:
+ dev_err(&instance->pdev->dev,
+ "%s: MR_DCMD_SYSTEM_PD_MAP_GET_INFO failed\n",
+ __func__);
+ break;
+ }
+
+out:
+ if (ret != DCMD_TIMEOUT)
+ megasas_return_cmd(instance, cmd);
- megasas_return_cmd(instance, cmd);
return ret;
}
@@ -1449,12 +1474,34 @@ megasas_get_ld_map_info(struct megasas_instance *instance)
else
ret = megasas_issue_polled(instance, cmd);
- if (ret == DCMD_TIMEOUT)
- dev_warn(&instance->pdev->dev,
- "%s DCMD timed out, RAID map is disabled\n",
- __func__);
+ switch (ret) {
+ case DCMD_TIMEOUT:
+ switch (dcmd_timeout_ocr_possible(instance)) {
+ case INITIATE_OCR:
+ cmd->flags |= DRV_DCMD_SKIP_REFIRE;
+ mutex_unlock(&instance->reset_mutex);
+ megasas_reset_fusion(instance->host,
+ MFI_IO_TIMEOUT_OCR);
+ mutex_lock(&instance->reset_mutex);
+ break;
+ case KILL_ADAPTER:
+ megaraid_sas_kill_hba(instance);
+ break;
+ case IGNORE_TIMEOUT:
+ dev_info(&instance->pdev->dev, "Ignore DCMD timeout: %s %d\n",
+ __func__, __LINE__);
+ break;
+ }
+ break;
+ case DCMD_FAILED:
+ dev_err(&instance->pdev->dev,
+ "%s: MR_DCMD_LD_MAP_GET_INFO failed\n",
+ __func__);
+ break;
+ }
- megasas_return_cmd(instance, cmd);
+ if (ret != DCMD_TIMEOUT)
+ megasas_return_cmd(instance, cmd);
return ret;
}
--
2.25.1