Re: 2.6.31-rc5 regression: hd don't show up

From: Tejun Heo
Date: Tue Sep 01 2009 - 22:22:42 EST


Tim Blechmann wrote:
>>> booting the machine today, one hd is missing again ... bootlog attached
>> Hmmm... strange. I don't really see how it could be escaping. Can
>> you please apply the attached patch? It still won't change the
>> behavior but should be able to catch where it's escaping.
>
> attached you find two bootlogs, for a correct boot, and with one hd
> missing ...

Heh heh, this is getting a bit embarrassing. Seems like I wasn't
looking at the right path. Can you please try this one too? If it
says "XXX D7 pulldown quick exit path" and then succeed to probe,
that's the previous failure case so you don't need to keep trying to
reproduce the problem.

Thanks.

--
tejun
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 98af50f..6edd2c4 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3656,12 +3656,19 @@ int ata_wait_ready(struct ata_link *link, unsigned long deadline,
!ata_link_offline(link) &&
time_before(now, nodev_deadline))
ready = 0;
+ else
+ ata_link_printk(link, KERN_INFO, "XXX wait_ready ready=%d link_offline=%d\n",
+ ready, ata_link_offline(link));
}

- if (ready)
+ if (ready) {
+ ata_link_printk(link, KERN_INFO, "XXX wait_read returning %d\n", ready);
return ready;
- if (time_after(now, deadline))
+ }
+ if (time_after(now, deadline)) {
+ ata_link_printk(link, KERN_INFO, "XXX wait_read returning -EBUSY\n");
return -EBUSY;
+ }

if (!warned && time_after(now, start + 5 * HZ) &&
(deadline - now > 3 * HZ)) {
@@ -3792,6 +3799,7 @@ int sata_link_resume(struct ata_link *link, const unsigned long *params,

scontrol = (scontrol & 0x0f0) | 0x300;

+ ata_link_printk(link, KERN_INFO, "XXX bringing up link\n");
if ((rc = sata_scr_write(link, SCR_CONTROL, scontrol)))
return rc;

@@ -3800,7 +3808,9 @@ int sata_link_resume(struct ata_link *link, const unsigned long *params,
*/
msleep(200);

- if ((rc = sata_link_debounce(link, params, deadline)))
+ rc = sata_link_debounce(link, params, deadline);
+ ata_link_printk(link, KERN_INFO, "XXX debounced rc=%d\n", rc);
+ if (rc)
return rc;

/* clear SError, some PHYs require this even for SRST to work */
@@ -3926,12 +3936,16 @@ int sata_link_hardreset(struct ata_link *link, const unsigned long *timing,
if (rc)
goto out;
/* if link is offline nothing more to do */
- if (ata_phys_link_offline(link))
+ if (ata_phys_link_offline(link)) {
+ ata_link_printk(link, KERN_INFO, "XXX phys link offline\n");
goto out;
+ }

/* Link is online. From this point, -ENODEV too is an error. */
- if (online)
+ if (online) {
+ ata_link_printk(link, KERN_INFO, "XXX setting online\n");
*online = true;
+ }

if (sata_pmp_supported(link->ap) && ata_is_host_link(link)) {
/* If PMP is supported, we have to do follow-up SRST.
@@ -3964,6 +3978,8 @@ int sata_link_hardreset(struct ata_link *link, const unsigned long *timing,
"COMRESET failed (errno=%d)\n", rc);
}
DPRINTK("EXIT, rc=%d\n", rc);
+ ata_link_printk(link, KERN_INFO, "XXX link_hardreset online=%d returning %d\n",
+ online ? *online : -1, rc);
return rc;
}

@@ -6082,7 +6098,7 @@ static void async_port_probe(void *data, async_cookie_t cookie)

ehi->probe_mask |= ATA_ALL_DEVICES;
ehi->action |= ATA_EH_RESET | ATA_EH_LPM;
- ehi->flags |= ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET;
+ ehi->flags |= ATA_EHI_NO_AUTOPSY/* | ATA_EHI_QUIET*/;

ap->pflags &= ~ATA_PFLAG_INITIALIZING;
ap->pflags |= ATA_PFLAG_LOADING;
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index bbbb1fa..efd6fdf 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -1998,6 +1998,9 @@ unsigned int ata_sff_dev_classify(struct ata_device *dev, int present,
if (r_err)
*r_err = err;

+ ata_dev_printk(dev, KERN_INFO, "XXX CLASSIFY TF %02x/%02x:%02x:%02x:%02x\n",
+ tf.command, tf.feature, tf.lbal, tf.lbam, tf.lbah);
+
/* see if device passed diags: continue and warn later */
if (err == 0)
/* diagnostic fail : do nothing _YET_ */
@@ -2006,11 +2009,14 @@ unsigned int ata_sff_dev_classify(struct ata_device *dev, int present,
/* do nothing */ ;
else if ((dev->devno == 0) && (err == 0x81))
/* do nothing */ ;
- else
+ else {
+ ata_dev_printk(dev, KERN_INFO, "XXX diag nodev\n");
return ATA_DEV_NONE;
+ }

/* determine if device is ATA or ATAPI */
class = ata_dev_classify(&tf);
+ ata_dev_printk(dev, KERN_INFO, "XXX ata_dev_classify=%d\n", class);

if (class == ATA_DEV_UNKNOWN) {
/* If the device failed diagnostic, it's likely to
@@ -2019,13 +2025,18 @@ unsigned int ata_sff_dev_classify(struct ata_device *dev, int present,
* device signature is invalid with diagnostic
* failure.
*/
- if (present && (dev->horkage & ATA_HORKAGE_DIAGNOSTIC))
+ if (present && (dev->horkage & ATA_HORKAGE_DIAGNOSTIC)) {
+ ata_dev_printk(dev, KERN_INFO, "XXX UNK && present -> ATA\n");
class = ATA_DEV_ATA;
- else
+ } else {
class = ATA_DEV_NONE;
+ ata_dev_printk(dev, KERN_INFO, "XXX UNK && !present -> NONE\n");
+ }
} else if ((class == ATA_DEV_ATA) &&
- (ap->ops->sff_check_status(ap) == 0))
+ (ap->ops->sff_check_status(ap) == 0)) {
class = ATA_DEV_NONE;
+ ata_dev_printk(dev, KERN_INFO, "XXX stat==0 -> NONE\n");
+ }

return class;
}
@@ -2064,8 +2075,16 @@ int ata_sff_wait_after_reset(struct ata_link *link, unsigned int devmask,
/* -ENODEV means the odd clown forgot the D7 pulldown resistor
* and TF status is 0xff, bail out on it too.
*/
- if (rc)
- return rc;
+ if (rc) {
+ ata_link_printk(link, KERN_INFO, "XXX D7 pulldown quick exit path stat=%02x, retrying in 50ms\n",
+ link->ap->ops->sff_check_status(link->ap));
+ msleep(50);
+ rc = ata_sff_wait_ready(link, deadline);
+ ata_link_printk(link, KERN_INFO, "XXX retried, rc=%d, stat=%x\n",
+ rc, link->ap->ops->sff_check_status(link->ap));
+ if (rc)
+ return rc;
+ }

/* if device 1 was found in ata_devchk, wait for register
* access briefly, then wait for BSY to clear.
@@ -2162,7 +2181,8 @@ int ata_sff_softreset(struct ata_link *link, unsigned int *classes,
ap->ops->sff_dev_select(ap, 0);

/* issue bus reset */
- DPRINTK("about to softreset, devmask=%x\n", devmask);
+ ata_link_printk(link, KERN_INFO,
+ "about to softreset, devmask=%x\n", devmask);
rc = ata_bus_softreset(ap, devmask, deadline);
/* if link is occupied, -ENODEV too is an error */
if (rc && (rc != -ENODEV || sata_scr_valid(link))) {
@@ -2211,6 +2231,8 @@ int sata_sff_hardreset(struct ata_link *link, unsigned int *class,
*class = ata_sff_dev_classify(link->device, 1, NULL);

DPRINTK("EXIT, class=%u\n", *class);
+ ata_link_printk(link, KERN_INFO, "XXX sff_hardreset rc=%d online=%d *class=%d\n",
+ rc, online, *class);
return rc;
}
EXPORT_SYMBOL_GPL(sata_sff_hardreset);