Re: [PATCH 1/2] scsi: megaraid_sas: Fix DCMD issue command handling

From: mengfanhui
Date: Thu Jun 13 2024 - 02:52:08 EST


Can someone help review it? Thank you!

在 2024/5/30 17:45, mengfanhui 写道:
> If a DCMD timeout is not handled, the next interaction between the driver and the firmware
> will also result in a DCMD timeout, which may cause the system to crash or hang.
>
> This patch adds proper error handling for DCMD commands
> on Fusion adapters:
>
> 1. The action to take when a DCMD times out is decided by the
> function dcmd_timeout_ocr_possible(), which returns one of the
> following values:
> INITIATE_OCR
> KILL_ADAPTER
> IGNORE_TIMEOUT
>
> 2. If those DCMDs fail, the driver bails out.
>
> Error log:
> [ 201.689759] megaraid_sas 0001:05:00.0: megasas_sync_pd_seq_num DCMD timed out, continue without JBOD sequence map
> [ 242.649061] [] megasas_init+0x114/0x4000 [megaraid_sas]
> [ 363.481009] [] megasas_issue_blocked_cmd+0x1d8/0x268 [megaraid_sas]
> [ 363.481159] [] megasas_get_pd_list+0x548/0x688 [megaraid_sas]
> [ 363.481309] [] megasas_init_fw+0xb38/0x1104 [megaraid_sas]
> [ 363.481459] [] megasas_probe_one+0x1f4/0x5c4 [megaraid_sas]
> [ 363.482419] [] megasas_init+0x114/0x4000 [megaraid_sas]
> [ 381.912298] megaraid_sas 0001:05:00.0: DCMD(opcode: 0x2010100) is timed out, func:megasas_issue_blocked_cmd
> [ 381.912979] megaraid_sas 0001:05:00.0: Ignore DCMD timeout: megasas_get_pd_list 4727
> [ 484.313526] [] megasas_init+0x114/0x4000 [megaraid_sas]
> [ 562.136294] megaraid_sas 0001:05:00.0: DCMD(opcode: 0x3010100) is timed out, func:megasas_issue_blocked_cmd
> [ 562.137074] megaraid_sas 0001:05:00.0: Ignore DCMD timeout: megasas_ld_list_query 4973
> [ 562.137081] megaraid_sas 0001:05:00.0: failed to get LD list
> [ 562.137425] megaraid_sas 0001:05:00.0: megasas_init_fw: megasas_get_device_list failed
> [ 562.137767] megaraid_sas 0001:05:00.0: megasas_disable_intr_fusion is called outbound_intr_mask:0x40000009
> [ 562.139232] megaraid_sas 0001:05:00.0: Failed from megasas_init_fw 6572
>
> Co-developed-by: Jackie Liu <liuyun01@xxxxxxxxxx>
> Signed-off-by: Jackie Liu <liuyun01@xxxxxxxxxx>
> Signed-off-by: mengfanhui <mengfanhui@xxxxxxxxxx>
> Suggested-by: Geliang Tang <geliang@xxxxxxxxxx>
> ---
> drivers/scsi/megaraid/megaraid_sas.h | 1 +
> drivers/scsi/megaraid/megaraid_sas_base.c | 4 +-
> drivers/scsi/megaraid/megaraid_sas_fusion.c | 71 +++++++++++++++++----
> 3 files changed, 62 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h
> index 5680c6cdb221..91570c5e8456 100644
> --- a/drivers/scsi/megaraid/megaraid_sas.h
> +++ b/drivers/scsi/megaraid/megaraid_sas.h
> @@ -2760,5 +2760,6 @@ void megasas_exit_debugfs(void);
> void megasas_setup_debugfs(struct megasas_instance *instance);
> void megasas_destroy_debugfs(struct megasas_instance *instance);
> int megasas_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num);
> +int dcmd_timeout_ocr_possible(struct megasas_instance *instance);
>
> #endif /*LSI_MEGARAID_SAS_H */
> diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
> index 170b38f04655..ba8061ea2078 100644
> --- a/drivers/scsi/megaraid/megaraid_sas_base.c
> +++ b/drivers/scsi/megaraid/megaraid_sas_base.c
> @@ -4518,8 +4518,8 @@ int megasas_alloc_cmds(struct megasas_instance *instance)
> * Return 0 for only Fusion adapter, if driver load/unload is not in progress
> * or FW is not under OCR.
> */
> -inline int
> -dcmd_timeout_ocr_possible(struct megasas_instance *instance) {
> +int dcmd_timeout_ocr_possible(struct megasas_instance *instance)
> +{
>
> if (instance->adapter_type == MFI_SERIES)
> return KILL_ADAPTER;
> diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
> index 6c1fb8149553..f0aeb1ee83a2 100644
> --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
> +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
> @@ -1363,17 +1363,42 @@ megasas_sync_pd_seq_num(struct megasas_instance *instance, bool pend) {
> "driver supports max %d JBOD, but FW reports %d\n",
> MAX_PHYSICAL_DEVICES, le32_to_cpu(pd_sync->count));
> ret = -EINVAL;
> + goto out;
> }
>
> - if (ret == DCMD_TIMEOUT)
> - dev_warn(&instance->pdev->dev,
> - "%s DCMD timed out, continue without JBOD sequence map\n",
> - __func__);
> -
> - if (ret == DCMD_SUCCESS)
> + switch (ret) {
> + case DCMD_SUCCESS:
> instance->pd_seq_map_id++;
> + break;
> + case DCMD_TIMEOUT:
> + switch (dcmd_timeout_ocr_possible(instance)) {
> + case INITIATE_OCR:
> + cmd->flags |= DRV_DCMD_SKIP_REFIRE;
> + mutex_unlock(&instance->reset_mutex);
> + megasas_reset_fusion(instance->host,
> + MFI_IO_TIMEOUT_OCR);
> + mutex_lock(&instance->reset_mutex);
> + break;
> + case KILL_ADAPTER:
> + megaraid_sas_kill_hba(instance);
> + break;
> + case IGNORE_TIMEOUT:
> + dev_info(&instance->pdev->dev, "Ignore DCMD timeout: %s %d\n",
> + __func__, __LINE__);
> + break;
> + }
> + break;
> + case DCMD_FAILED:
> + dev_err(&instance->pdev->dev,
> + "%s: MR_DCMD_SYSTEM_PD_MAP_GET_INFO failed\n",
> + __func__);
> + break;
> + }
> +
> +out:
> + if (ret != DCMD_TIMEOUT)
> + megasas_return_cmd(instance, cmd);
>
> - megasas_return_cmd(instance, cmd);
> return ret;
> }
>
> @@ -1449,12 +1474,34 @@ megasas_get_ld_map_info(struct megasas_instance *instance)
> else
> ret = megasas_issue_polled(instance, cmd);
>
> - if (ret == DCMD_TIMEOUT)
> - dev_warn(&instance->pdev->dev,
> - "%s DCMD timed out, RAID map is disabled\n",
> - __func__);
> + switch (ret) {
> + case DCMD_TIMEOUT:
> + switch (dcmd_timeout_ocr_possible(instance)) {
> + case INITIATE_OCR:
> + cmd->flags |= DRV_DCMD_SKIP_REFIRE;
> + mutex_unlock(&instance->reset_mutex);
> + megasas_reset_fusion(instance->host,
> + MFI_IO_TIMEOUT_OCR);
> + mutex_lock(&instance->reset_mutex);
> + break;
> + case KILL_ADAPTER:
> + megaraid_sas_kill_hba(instance);
> + break;
> + case IGNORE_TIMEOUT:
> + dev_info(&instance->pdev->dev, "Ignore DCMD timeout: %s %d\n",
> + __func__, __LINE__);
> + break;
> + }
> + break;
> + case DCMD_FAILED:
> + dev_err(&instance->pdev->dev,
> + "%s: MR_DCMD_LD_MAP_GET_INFO failed\n",
> + __func__);
> + break;
> + }
>
> - megasas_return_cmd(instance, cmd);
> + if (ret != DCMD_TIMEOUT)
> + megasas_return_cmd(instance, cmd);
>
> return ret;
> }