Re: [PATCH V5 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work

From: Stanley Chu
Date: Wed Dec 02 2020 - 03:49:51 EST


Hi Can,

On Wed, 2020-12-02 at 00:24 -0800, Can Guo wrote:
> In current task abort routine, if task abort happens to the device W-LU,
> the code directly jumps to ufshcd_eh_host_reset_handler() to perform a
> full reset and restore then returns FAIL or SUCCESS. Commands sent to the
> device W-LU are most likely the SSU cmds sent during UFS PM operations. If
> such SSU cmd enters task abort routine, when ufshcd_eh_host_reset_handler()
> flushes eh_work, it will get stuck there since err_handler is serialized
> with PM operations.
>
> In order to unblock above call path, we merely clean up the lrb taken by
> this cmd, queue the eh_work and return SUCCESS. Once the cmd is aborted,
> the PM operation which sends out the cmd just errors out, then err_handler
> shall be able to proceed with the full reset and restore.
>
> In this scenario, the cmd is aborted even before it is actually cleared by
> HW, set the lrb->in_use flag to prevent subsequent cmds, including SCSI
> cmds and dev cmds, from taking the lrb released from abort. The flag shall
> evetually be cleared in __ufshcd_transfer_req_compl() invoked by the full
> reset and restore from err_handler.
>
> Reviewed-by: Asutosh Das <asutoshd@xxxxxxxxxxxxxx>
> Signed-off-by: Can Guo <cang@xxxxxxxxxxxxxx>
> ---
> drivers/scsi/ufs/ufshcd.c | 55 ++++++++++++++++++++++++++++++++++++-----------
> drivers/scsi/ufs/ufshcd.h | 2 ++
> 2 files changed, 45 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index f0bb3fc..fa90e15 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -2539,6 +2539,14 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
> (hba->clk_gating.state != CLKS_ON));
>
> lrbp = &hba->lrb[tag];
> + if (unlikely(lrbp->in_use)) {
> + if (hba->pm_op_in_progress)
> + set_host_byte(cmd, DID_BAD_TARGET);
> + else
> + err = SCSI_MLQUEUE_HOST_BUSY;
> + ufshcd_release(hba);
> + goto out;
> + }
>
> WARN_ON(lrbp->cmd);
> lrbp->cmd = cmd;
> @@ -2781,6 +2789,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
>
> init_completion(&wait);
> lrbp = &hba->lrb[tag];
> + if (unlikely(lrbp->in_use)) {
> + err = -EBUSY;
> + goto out;
> + }
> +
> WARN_ON(lrbp->cmd);
> err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
> if (unlikely(err))
> @@ -2797,6 +2810,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
>
> err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
>
> +out:
> ufshcd_add_query_upiu_trace(hba, tag,
> err ? "query_complete_err" : "query_complete");
>
> @@ -4932,6 +4946,7 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
>
> for_each_set_bit(index, &completed_reqs, hba->nutrs) {
> lrbp = &hba->lrb[index];
> + lrbp->in_use = false;
> lrbp->compl_time_stamp = ktime_get();
> cmd = lrbp->cmd;
> if (cmd) {
> @@ -6374,8 +6389,12 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
>
> init_completion(&wait);
> lrbp = &hba->lrb[tag];
> - WARN_ON(lrbp->cmd);
> + if (unlikely(lrbp->in_use)) {
> + err = -EBUSY;
> + goto out;
> + }
>
> + WARN_ON(lrbp->cmd);
> lrbp->cmd = NULL;
> lrbp->sense_bufflen = 0;
> lrbp->sense_buffer = NULL;
> @@ -6447,6 +6466,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
> }
> }
>
> +out:
> blk_put_request(req);
> out_unlock:
> up_read(&hba->clk_scaling_lock);
> @@ -6696,16 +6716,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> BUG();
> }
>
> - /*
> - * Task abort to the device W-LUN is illegal. When this command
> - * will fail, due to spec violation, scsi err handling next step
> - * will be to send LU reset which, again, is a spec violation.
> - * To avoid these unnecessary/illegal step we skip to the last error
> - * handling stage: reset and restore.
> - */
> - if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN)
> - return ufshcd_eh_host_reset_handler(cmd);
> -
> ufshcd_hold(hba, false);
> reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
> /* If command is already aborted/completed, return SUCCESS */
> @@ -6726,7 +6736,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> * to reduce repeated printouts. For other aborted requests only print
> * basic details.
> */
> - scsi_print_command(hba->lrb[tag].cmd);
> + scsi_print_command(cmd);
> if (!hba->req_abort_count) {
> ufshcd_update_reg_hist(&hba->ufs_stats.task_abort, 0);
> ufshcd_print_host_regs(hba);
> @@ -6745,6 +6755,27 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> goto cleanup;
> }
>
> + /*
> + * Task abort to the device W-LUN is illegal. When this command
> + * will fail, due to spec violation, scsi err handling next step
> + * will be to send LU reset which, again, is a spec violation.
> + * To avoid these unnecessary/illegal steps, first we clean up
> + * the lrb taken by this cmd and mark the lrb as in_use, then
> + * queue the eh_work and bail.
> + */
> + if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
> + spin_lock_irqsave(host->host_lock, flags);
> + if (lrbp->cmd) {
> + __ufshcd_transfer_req_compl(hba, (1UL << tag));

The timed out "tag" is cleared once here.

> + __set_bit(tag, &hba->outstanding_reqs);
> + lrbp->in_use = true;
> + hba->force_reset = true;
> + ufshcd_schedule_eh_work(hba);

The same "tag" would be cleared again in error handler with
__ufshcd_transfer_req_compl().

In __ufshcd_transfer_req_compl, hba->clk_scaling.active_reqs will be
decreased if clk-scaling is supported as below,

if (ufshcd_is_clkscaling_supported(hba))
hba->clk_scaling.active_reqs--;

Will be the hba->clk_scaling.active_reqs value become abnormal by this
flow?

Thanks,
Stanley Chu

> + }
> + spin_unlock_irqrestore(host->host_lock, flags);
> + goto out;
> + }
> +
> /* Skip task abort in case previous aborts failed and report failure */
> if (lrbp->req_abort_skip)
> err = -EIO;
> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> index 1e680bf..66e5338 100644
> --- a/drivers/scsi/ufs/ufshcd.h
> +++ b/drivers/scsi/ufs/ufshcd.h
> @@ -163,6 +163,7 @@ struct ufs_pm_lvl_states {
> * @crypto_key_slot: the key slot to use for inline crypto (-1 if none)
> * @data_unit_num: the data unit number for the first block for inline crypto
> * @req_abort_skip: skip request abort task flag
> + * @in_use: indicates that this lrb is still in use
> */
> struct ufshcd_lrb {
> struct utp_transfer_req_desc *utr_descriptor_ptr;
> @@ -192,6 +193,7 @@ struct ufshcd_lrb {
> #endif
>
> bool req_abort_skip;
> + bool in_use;
> };
>
> /**