Re: 2.6.29 regression: ATA bus errors on resume

From: Tejun Heo
Date: Thu Apr 02 2009 - 02:52:29 EST


Niel Lambrechts wrote:
>> Attached is the debug patch. Please reproduce the problem with the
>> patch applied and post full kernel log.
>>
> Hi there,
>
> Here is the output, it took 2 attempts to reproduce the EXT4 corruption
> (clean with errors).

Strange. Maybe IO commands are getting through while the sdev is
still in quiesce state? Can you please repeat the test with the
attached patch?

Thanks.

--
tejun
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 0183131..87f5f4e 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1274,7 +1274,7 @@ void ata_eh_about_to_do(struct ata_link *link, struct ata_device *dev,
{
struct ata_port *ap = link->ap;
struct ata_eh_info *ehi = &link->eh_info;
- struct ata_eh_context *ehc = &link->eh_context;
+ //struct ata_eh_context *ehc = &link->eh_context;
unsigned long flags;

spin_lock_irqsave(ap->lock, flags);
@@ -1284,7 +1284,7 @@ void ata_eh_about_to_do(struct ata_link *link, struct ata_device *dev,
/* About to take EH action, set RECOVERED. Ignore actions on
* slave links as master will do them again.
*/
- if (!(ehc->i.flags & ATA_EHI_QUIET) && link != ap->slave_link)
+ if (/*!(ehc->i.flags & ATA_EHI_QUIET) && */link != ap->slave_link)
ap->pflags |= ATA_PFLAG_RECOVERED;

spin_unlock_irqrestore(ap->lock, flags);
@@ -2017,8 +2017,13 @@ static void ata_eh_link_autopsy(struct ata_link *link)

/* determine whether the command is worth retrying */
if (!(qc->err_mask & AC_ERR_INVALID) &&
- ((qc->flags & ATA_QCFLAG_IO) || qc->err_mask != AC_ERR_DEV))
+ ((qc->flags & ATA_QCFLAG_IO) || qc->err_mask != AC_ERR_DEV)) {
+ ata_dev_printk(qc->dev, KERN_INFO,
+ "XXX setting retry on qc%d\n", tag);
qc->flags |= ATA_QCFLAG_RETRY;
+ } else
+ ata_dev_printk(qc->dev, KERN_INFO,
+ "XXX no retry for qc%d\n", tag);

/* accumulate error info */
ehc->i.dev = qc->dev;
@@ -2126,8 +2131,8 @@ static void ata_eh_link_report(struct ata_link *link)
char tries_buf[6];
int tag, nr_failed = 0;

- if (ehc->i.flags & ATA_EHI_QUIET)
- return;
+ /*if (ehc->i.flags & ATA_EHI_QUIET)
+ return;*/

desc = NULL;
if (ehc->i.desc[0] != '\0')
@@ -2147,8 +2152,8 @@ static void ata_eh_link_report(struct ata_link *link)
nr_failed++;
}

- if (!nr_failed && !ehc->i.err_mask)
- return;
+ /*if (!nr_failed && !ehc->i.err_mask)
+ return;*/

frozen = "";
if (ap->pflags & ATA_PFLAG_FROZEN)
@@ -3350,16 +3355,23 @@ void ata_eh_finish(struct ata_port *ap)
* generate sense data in this function,
* considering both err_mask and tf.
*/
- if (qc->flags & ATA_QCFLAG_RETRY)
+ if (qc->flags & ATA_QCFLAG_RETRY) {
+ ata_dev_printk(qc->dev, KERN_INFO, "XXX retrying qc%d, retries=%d allowed=%d\n",
+ tag, qc->scsicmd->retries, qc->scsicmd->allowed);
ata_eh_qc_retry(qc);
- else
+ } else {
+ ata_dev_printk(qc->dev, KERN_INFO, "XXX terminating qc%d\n", tag);
ata_eh_qc_complete(qc);
+ }
} else {
if (qc->flags & ATA_QCFLAG_SENSE_VALID) {
+ ata_dev_printk(qc->dev, KERN_INFO, "XXX terminating qc%d (SENSE), retries=%d\n",
+ tag, qc->scsicmd->retries);
ata_eh_qc_complete(qc);
} else {
/* feed zero TF to sense generation */
memset(&qc->result_tf, 0, sizeof(qc->result_tf));
+ ata_dev_printk(qc->dev, KERN_INFO, "XXX retrying qc%d (bogus SENSE)\n", tag);
ata_eh_qc_retry(qc);
}
}
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 0c2c73b..608bacd 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1569,13 +1569,16 @@ void scsi_eh_flush_done_q(struct list_head *done_q)

list_for_each_entry_safe(scmd, next, done_q, eh_entry) {
list_del_init(&scmd->eh_entry);
+ printk("XXX scsi_eh_flush_done_q: online=%d(%d) noretry=%d retries=%d allowed=%d\n",
+ scsi_device_online(scmd->device), scmd->device->sdev_state,
+ scsi_noretry_cmd(scmd), scmd->retries, scmd->allowed);
if (scsi_device_online(scmd->device) &&
!scsi_noretry_cmd(scmd) &&
(++scmd->retries <= scmd->allowed)) {
- SCSI_LOG_ERROR_RECOVERY(3, printk("%s: flush"
+ /*SCSI_LOG_ERROR_RECOVERY(3, */printk("%s: flush"
" retry cmd: %p\n",
current->comm,
- scmd));
+ scmd)/*)*/;
scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
} else {
/*
@@ -1585,9 +1588,9 @@ void scsi_eh_flush_done_q(struct list_head *done_q)
*/
if (!scmd->result)
scmd->result |= (DRIVER_TIMEOUT << 24);
- SCSI_LOG_ERROR_RECOVERY(3, printk("%s: flush finish"
+ /*SCSI_LOG_ERROR_RECOVERY(3,*/ printk("%s: flush finish"
" cmd: %p\n",
- current->comm, scmd));
+ current->comm, scmd)/*)*/;
scsi_finish_command(scmd);
}
}