Re: BUG in drivers/dma/ioat/dma_v2.c:314

From: Dan Williams
Date: Wed Jul 07 2010 - 13:48:05 EST


On 7/6/2010 8:40 PM, David Woodhouse wrote:
On Tue, 2010-07-06 at 17:51 -0700, Dan Williams wrote:
It should always trigger, and I have verified as much with the attached
replacement patch (by forcing the error on a working system), but we run
into a new problem.

IOMMU: can't find DMAR for device 0000:00:0f.0
Allocating domain for 0000:00:0f.0 failed

Yeah, we're actually doing the check in the wrong place. This should
work better, I think.


Ok.

Chris this change will make the driver's dma mapping calls succeed which means we once again need to be prepared for the case where the engine's iommu is not configured correctly. Attached is David's latest combined with the driver error handling from v1. The expectation is that this reports the BIOS misconfiguration each boot and the driver fails cleanly if its transactions are blocked.

Thanks for going through these iterations.

--
Dan
diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h
index 6d3a73b..5216c8a 100644
--- a/drivers/dma/ioat/dma.h
+++ b/drivers/dma/ioat/dma.h
@@ -97,6 +97,7 @@ struct ioat_chan_common {
#define IOAT_RESET_PENDING 2
#define IOAT_KOBJ_INIT_FAIL 3
#define IOAT_RESHAPE_PENDING 4
+ #define IOAT_RUN 5
struct timer_list timer;
#define COMPLETION_TIMEOUT msecs_to_jiffies(100)
#define IDLE_TIMEOUT msecs_to_jiffies(2000)
diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c
index 3c8b32a..779f4da 100644
--- a/drivers/dma/ioat/dma_v2.c
+++ b/drivers/dma/ioat/dma_v2.c
@@ -287,7 +287,10 @@ void ioat2_timer_event(unsigned long data)
chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
dev_err(to_dev(chan), "%s: Channel halted (%x)\n",
__func__, chanerr);
- BUG_ON(is_ioat_bug(chanerr));
+ if (test_bit(IOAT_RUN, &chan->state))
+ BUG_ON(is_ioat_bug(chanerr));
+ else /* we never got off the ground */
+ return;
}

/* if we haven't made progress and we have already
@@ -492,6 +495,8 @@ static struct ioat_ring_ent **ioat2_alloc_ring(struct dma_chan *c, int order, gf
return ring;
}

+void ioat2_free_chan_resources(struct dma_chan *c);
+
/* ioat2_alloc_chan_resources - allocate/initialize ioat2 descriptor ring
* @chan: channel to be initialized
*/
@@ -500,6 +505,7 @@ int ioat2_alloc_chan_resources(struct dma_chan *c)
struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
struct ioat_chan_common *chan = &ioat->base;
struct ioat_ring_ent **ring;
+ u64 status;
int order;

/* have we already been set up? */
@@ -540,7 +546,20 @@ int ioat2_alloc_chan_resources(struct dma_chan *c)
tasklet_enable(&chan->cleanup_task);
ioat2_start_null_desc(ioat);

- return 1 << ioat->alloc_order;
+ /* check that we got off the ground */
+ udelay(5);
+ status = ioat_chansts(chan);
+ if (is_ioat_active(status) || is_ioat_idle(status)) {
+ set_bit(IOAT_RUN, &chan->state);
+ return 1 << ioat->alloc_order;
+ } else {
+ u32 chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+
+ dev_err(to_dev(chan), "failed to start channel chanerr: %#x\n",
+ chanerr);
+ ioat2_free_chan_resources(c);
+ return -EFAULT;
+ }
}

bool reshape_ring(struct ioat2_dma_chan *ioat, int order)
@@ -778,6 +797,7 @@ void ioat2_free_chan_resources(struct dma_chan *c)
del_timer_sync(&chan->timer);
device->cleanup_fn((unsigned long) c);
device->reset_hw(chan);
+ clear_bit(IOAT_RUN, &chan->state);

spin_lock_bh(&chan->cleanup_lock);
spin_lock_bh(&ioat->prep_lock);
diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c
index 1cdd22e..d0f4990 100644
--- a/drivers/dma/ioat/dma_v3.c
+++ b/drivers/dma/ioat/dma_v3.c
@@ -361,7 +361,10 @@ static void ioat3_timer_event(unsigned long data)
chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
dev_err(to_dev(chan), "%s: Channel halted (%x)\n",
__func__, chanerr);
- BUG_ON(is_ioat_bug(chanerr));
+ if (test_bit(IOAT_RUN, &chan->state))
+ BUG_ON(is_ioat_bug(chanerr));
+ else /* we never got off the ground */
+ return;
}

/* if we haven't made progress and we have already
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 796828f..333dfdf 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3029,6 +3029,32 @@ static void __init iommu_exit_mempool(void)

}

+static void quirk_ioat_snb_no_catchall(struct pci_dev *pdev)
+{
+ struct dmar_drhd_unit *drhd;
+ int i;
+
+ /* We know that this device on this chipset has its own
+ IOMMU. If we find it under the catch-all IOMMU, then
+ the BIOS is lying to us. Hope that the IOMMU for
+ this device is actually disabled, and it needs no
+ translation... */
+
+ for_each_drhd_unit(drhd) {
+ if (!drhd->include_all)
+ continue;
+ for (i = 0; i < drhd->devices_cnt; i++) {
+ if (drhd->devices[i] == pdev) {
+ WARN_TAINT_ONCE(1, TAINT_FIRMWARE_WORKAROUND,
+ "BIOS wrongly included I/OAT device under catch-all VT-d unit\n");
+ pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+ return;
+ }
+ }
+ }
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_no_catchall);
+
static void __init init_no_remapping_devices(void)
{
struct dmar_drhd_unit *drhd;