Re: ata: BUG in ata_sff_hsm_move

From: Tejun Heo
Date: Fri Jan 29 2016 - 15:23:55 EST


Hello,

On Fri, Jan 29, 2016 at 02:40:33PM +0100, Dmitry Vyukov wrote:
> It now deadlocks at this stack. It is probably not OK to call
> ata_sff_hsm_move holding the spinlock.

Yeah, the locking is pretty messed up with the polling path. Can you
please try the following?

Thanks.

diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index cdf6215..7dbba38 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -997,12 +997,9 @@ static inline int ata_hsm_ok_in_wq(struct ata_port *ap,
static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
{
struct ata_port *ap = qc->ap;
- unsigned long flags;

if (ap->ops->error_handler) {
if (in_wq) {
- spin_lock_irqsave(ap->lock, flags);
-
/* EH might have kicked in while host lock is
* released.
*/
@@ -1014,8 +1011,6 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
} else
ata_port_freeze(ap);
}
-
- spin_unlock_irqrestore(ap->lock, flags);
} else {
if (likely(!(qc->err_mask & AC_ERR_HSM)))
ata_qc_complete(qc);
@@ -1024,10 +1019,8 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
}
} else {
if (in_wq) {
- spin_lock_irqsave(ap->lock, flags);
ata_sff_irq_on(ap);
ata_qc_complete(qc);
- spin_unlock_irqrestore(ap->lock, flags);
} else
ata_qc_complete(qc);
}
@@ -1048,9 +1041,10 @@ int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
{
struct ata_link *link = qc->dev->link;
struct ata_eh_info *ehi = &link->eh_info;
- unsigned long flags = 0;
int poll_next;

+ lockdep_assert_held(ap->lock);
+
WARN_ON_ONCE((qc->flags & ATA_QCFLAG_ACTIVE) == 0);

/* Make sure ata_sff_qc_issue() does not throw things
@@ -1112,14 +1106,6 @@ int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
}
}

- /* Send the CDB (atapi) or the first data block (ata pio out).
- * During the state transition, interrupt handler shouldn't
- * be invoked before the data transfer is complete and
- * hsm_task_state is changed. Hence, the following locking.
- */
- if (in_wq)
- spin_lock_irqsave(ap->lock, flags);
-
if (qc->tf.protocol == ATA_PROT_PIO) {
/* PIO data out protocol.
* send first data block.
@@ -1135,9 +1121,6 @@ int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
/* send CDB */
atapi_send_cdb(ap, qc);

- if (in_wq)
- spin_unlock_irqrestore(ap->lock, flags);
-
/* if polling, ata_sff_pio_task() handles the rest.
* otherwise, interrupt handler takes over from here.
*/
@@ -1361,12 +1344,14 @@ static void ata_sff_pio_task(struct work_struct *work)
u8 status;
int poll_next;

+ spin_lock_irq(ap->lock);
+
BUG_ON(ap->sff_pio_task_link == NULL);
/* qc can be NULL if timeout occurred */
qc = ata_qc_from_tag(ap, link->active_tag);
if (!qc) {
ap->sff_pio_task_link = NULL;
- return;
+ goto out_unlock;
}

fsm_start:
@@ -1381,11 +1366,14 @@ static void ata_sff_pio_task(struct work_struct *work)
*/
status = ata_sff_busy_wait(ap, ATA_BUSY, 5);
if (status & ATA_BUSY) {
+ spin_unlock_irq(ap->lock);
ata_msleep(ap, 2);
+ spin_lock_irq(ap->lock);
+
status = ata_sff_busy_wait(ap, ATA_BUSY, 10);
if (status & ATA_BUSY) {
ata_sff_queue_pio_task(link, ATA_SHORT_PAUSE);
- return;
+ goto out_unlock;
}
}

@@ -1402,6 +1390,8 @@ static void ata_sff_pio_task(struct work_struct *work)
*/
if (poll_next)
goto fsm_start;
+out_unlock:
+ spin_unlock_irq(ap->lock);
}

/**