Re: [REGRESSION] 2.6.24/25: random lockups when accessing externalUSB harddrive

From: Stefan Becker
Date: Thu Jun 26 2008 - 18:15:45 EST


Hi,

ext Alan Stern wrote:

Do you have any of the RT patches installed?

No.

Something else you should try is clearing your "owner" string just
before the spinlock is released. You could also add a check after the
release; if the spinlock can't be locked again immediately then
something is wrong.

Yes, the initial try was misleading. I tinkered around a little bit more and finally figured out that it is usb_hcd_unlink_urb_from_ep() itself that is called with interrupts enabled!


So with this code in place the error disappears:

void usb_hcd_unlink_urb_from_ep(struct usb_hcd *hcd, struct urb *urb)
{
/* clear all state linking urb to this dev (and hcd) */
unsigned int flags;
spin_lock_irqsave(&hcd_urb_list_lock, flags);
list_del_init(&urb->urb_list);
spin_unlock_irqrestore(&hcd_urb_list_lock, flags);
}

This seems to impact USB performance though. In 2.6.23 (without the problem) I get 21MB/s with dd, but with the above "fix" only 14MB/s. But I'll recheck once we have a real error fix in place.


After that I added the following code

if (!raw_irqs_disabled()) {
printk(KERN_CRIT "usb_hcd_unlink_urb_from_ep called with interrupts enabled!\n");
dump_stack();
}

and collected the attached kernel messages. I checked the messages briefly and it seems that the following code paths have the interrupts enabled when calling usb_hcd_unlink_urb_from_ep():

[<c0574d9d>] usb_hcd_unlink_urb_from_ep+0x25/0x6b
[<de850559>] uhci_giveback_urb+0xcd/0x1e3 [uhci_hcd]
[<de850e02>] uhci_scan_schedule+0x511/0x720 [uhci_hcd]
...
[<de8529c3>] uhci_irq+0x131/0x142 [uhci_hcd]
[<c05750cb>] usb_hcd_irq+0x23/0x51

and

[<c0574d9d>] usb_hcd_unlink_urb_from_ep+0x25/0x6b
[<de839d55>] ehci_urb_done+0x73/0x92 [ehci_hcd]
[<de83a92f>] qh_completions+0x373/0x3eb [ehci_hcd]
[<de83aa43>] ehci_work+0x9c/0x6a9 [ehci_hcd]
...
[<de83ec3c>] ehci_irq+0x241/0x265 [ehci_hcd]
...
[<c05750cb>] usb_hcd_irq+0x23/0x51


Is that enough information to fix the problem?

Regards,

Stefan

---
Stefan Becker
E-Mail: Stefan.Becker@xxxxxxxxx
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 09a53e7..400433c 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -45,6 +45,14 @@
#include "hcd.h"
#include "hub.h"

+#ifdef DEBUG
+const char *_urb_list_holder = "NONE";
+#define URB_LIST_HOLDER(n) _urb_list_holder = #n;
+#define URB_LIST_RELEASE _urb_list_holder = "FREE";
+#else
+#define URB_LIST_HOLDER(n)
+#define URB_LIST_RELEASE
+#endif

/*-------------------------------------------------------------------------*/

@@ -1001,7 +1009,12 @@ int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb)
{
int rc = 0;

+#ifdef DEBUG
+ if (!raw_irqs_disabled()) dump_stack();
+#endif
+
spin_lock(&hcd_urb_list_lock);
+ URB_LIST_HOLDER(usb_hcd_link_urb_to_ep)

/* Check that the URB isn't being killed */
if (unlikely(urb->reject)) {
@@ -1034,6 +1047,7 @@ int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb)
goto done;
}
done:
+ URB_LIST_RELEASE
spin_unlock(&hcd_urb_list_lock);
return rc;
}
@@ -1107,9 +1121,33 @@ EXPORT_SYMBOL_GPL(usb_hcd_check_unlink_urb);
void usb_hcd_unlink_urb_from_ep(struct usb_hcd *hcd, struct urb *urb)
{
/* clear all state linking urb to this dev (and hcd) */
+#ifdef DEBUG
+#if 0
+ if (!spin_trylock(&hcd_urb_list_lock)) {
+ int i;
+ printk(KERN_CRIT "HCD URB list locked by %s!\n", _urb_list_holder);
+ dump_stack();
+ for (int i = 0; i < 100; i++) schedule();
+ }
+#endif
+ unsigned int flags;
+
+ if (!raw_irqs_disabled()) {
+ printk(KERN_CRIT "usb_hcd_unlink_urb_from_ep called with interrupts enabled!\n");
+ dump_stack();
+ }
+ spin_lock_irqsave(&hcd_urb_list_lock, flags);
+#else
spin_lock(&hcd_urb_list_lock);
+#endif
+ URB_LIST_HOLDER(usb_hcd_unlink_urb_from_ep)
list_del_init(&urb->urb_list);
+ URB_LIST_RELEASE
+#ifdef DEBUG
+ spin_unlock_irqrestore(&hcd_urb_list_lock, flags);
+#else
spin_unlock(&hcd_urb_list_lock);
+#endif
}
EXPORT_SYMBOL_GPL(usb_hcd_unlink_urb_from_ep);

@@ -1448,6 +1486,7 @@ void usb_hcd_flush_endpoint(struct usb_device *udev,

/* No more submits can occur */
spin_lock_irq(&hcd_urb_list_lock);
+ URB_LIST_HOLDER(usb_hcd_flush_endpoint_INITIAL)
rescan:
list_for_each_entry (urb, &ep->urb_list, urb_list) {
int is_in;
@@ -1456,6 +1495,7 @@ rescan:
continue;
usb_get_urb (urb);
is_in = usb_urb_dir_in(urb);
+ URB_LIST_RELEASE
spin_unlock(&hcd_urb_list_lock);

/* kick hcd */
@@ -1482,13 +1522,16 @@ rescan:

/* list contents may have changed */
spin_lock(&hcd_urb_list_lock);
+ URB_LIST_HOLDER(usb_hcd_flush_endpoint_RESCAN)
goto rescan;
}
+ URB_LIST_RELEASE
spin_unlock_irq(&hcd_urb_list_lock);

/* Wait until the endpoint queue is completely empty */
while (!list_empty (&ep->urb_list)) {
spin_lock_irq(&hcd_urb_list_lock);
+ URB_LIST_HOLDER(usb_hcd_flush_endpoint_LIST)

/* The list may have changed while we acquired the spinlock */
urb = NULL;
@@ -1497,6 +1540,7 @@ rescan:
urb_list);
usb_get_urb (urb);
}
+ URB_LIST_RELEASE
spin_unlock_irq(&hcd_urb_list_lock);

if (urb) {

Attachment: dump_stack.txt.bz2
Description: application/bzip