Re: [PATCH 07/11] Xen/x86/PCI: Add support for the Xen PCI subsytem

From: Joerg Roedel
Date: Mon May 11 2009 - 05:40:46 EST


On Thu, May 07, 2009 at 02:45:33PM -0700, Jeremy Fitzhardinge wrote:
> From: Alex Nixon <alex.nixon@xxxxxxxxxx>
>
> Impact: add core of Xen PCI support
>
> On boot, the system will search to see if a Xen iommu/pci subsystem is
> available. If the kernel detects it's running in a domain rather than
> on bare hardware, this subsystem will be used. Otherwise, it falls
> back to using hardware as usual.
>
> The frontend stub lives in arch/x86/pci-xen.c, alongside other
> sub-arch PCI init code (e.g. olpc.c)
>
> (All subsequent fixes, API changes and swiotlb operations folded in.)
>
> Signed-off-by: Alex Nixon <alex.nixon@xxxxxxxxxx>
> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@xxxxxxxxxx>
> Signed-off-by: Ian Campbell <ian.campbell@xxxxxxxxxx>
> Reviewed-by: "H. Peter Anvin" <hpa@xxxxxxxxx>
> ---
> arch/x86/Kconfig | 4 +
> arch/x86/include/asm/pci_x86.h | 1 +
> arch/x86/include/asm/xen/iommu.h | 12 ++
> arch/x86/kernel/pci-dma.c | 3 +
> arch/x86/pci/Makefile | 1 +
> arch/x86/pci/init.c | 6 +
> arch/x86/pci/xen.c | 52 +++++++
> drivers/pci/Makefile | 2 +
> drivers/pci/xen-iommu.c | 302 ++++++++++++++++++++++++++++++++++++++
> 9 files changed, 383 insertions(+), 0 deletions(-)
> create mode 100644 arch/x86/include/asm/xen/iommu.h
> create mode 100644 arch/x86/pci/xen.c
> create mode 100644 drivers/pci/xen-iommu.c
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 4b34082..4a62659 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1830,6 +1830,10 @@ config PCI_OLPC
> def_bool y
> depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
>
> +config PCI_XEN
> + def_bool y
> + depends on XEN_PCI_PASSTHROUGH || XEN_DOM0_PCI
> +
> config PCI_DOMAINS
> def_bool y
> depends on PCI
> diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
> index 5401ca2..34f03a4 100644
> --- a/arch/x86/include/asm/pci_x86.h
> +++ b/arch/x86/include/asm/pci_x86.h
> @@ -107,6 +107,7 @@ extern int pci_direct_probe(void);
> extern void pci_direct_init(int type);
> extern void pci_pcbios_init(void);
> extern int pci_olpc_init(void);
> +extern int pci_xen_init(void);
> extern void __init dmi_check_pciprobe(void);
> extern void __init dmi_check_skip_isa_align(void);
>
> diff --git a/arch/x86/include/asm/xen/iommu.h b/arch/x86/include/asm/xen/iommu.h
> new file mode 100644
> index 0000000..75df312
> --- /dev/null
> +++ b/arch/x86/include/asm/xen/iommu.h
> @@ -0,0 +1,12 @@
> +#ifndef ASM_X86__XEN_IOMMU_H
> +
> +#ifdef CONFIG_PCI_XEN
> +extern void xen_iommu_init(void);
> +#else
> +static inline void xen_iommu_init(void)
> +{
> +}
> +#endif
> +
> +#endif
> +
> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
> index 745579b..2fffc22 100644
> --- a/arch/x86/kernel/pci-dma.c
> +++ b/arch/x86/kernel/pci-dma.c
> @@ -10,6 +10,7 @@
> #include <asm/gart.h>
> #include <asm/calgary.h>
> #include <asm/amd_iommu.h>
> +#include <asm/xen/iommu.h>
>
> static int forbid_dac __read_mostly;
>
> @@ -275,6 +276,8 @@ static int __init pci_iommu_init(void)
> dma_debug_add_bus(&pci_bus_type);
> #endif
>
> + xen_iommu_init();
> +
> calgary_iommu_init();
>
> intel_iommu_init();
> diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
> index d49202e..64182c5 100644
> --- a/arch/x86/pci/Makefile
> +++ b/arch/x86/pci/Makefile
> @@ -4,6 +4,7 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o
> obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
> obj-$(CONFIG_PCI_DIRECT) += direct.o
> obj-$(CONFIG_PCI_OLPC) += olpc.o
> +obj-$(CONFIG_PCI_XEN) += xen.o
>
> obj-y += fixup.o
> obj-$(CONFIG_ACPI) += acpi.o
> diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
> index 25a1f8e..4e2f90a 100644
> --- a/arch/x86/pci/init.c
> +++ b/arch/x86/pci/init.c
> @@ -15,10 +15,16 @@ static __init int pci_arch_init(void)
> if (!(pci_probe & PCI_PROBE_NOEARLY))
> pci_mmcfg_early_init();
>
> +#ifdef CONFIG_PCI_XEN
> + if (!pci_xen_init())
> + return 0;
> +#endif
> +
> #ifdef CONFIG_PCI_OLPC
> if (!pci_olpc_init())
> return 0; /* skip additional checks if it's an XO */
> #endif
> +
> #ifdef CONFIG_PCI_BIOS
> pci_pcbios_init();
> #endif
> diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
> new file mode 100644
> index 0000000..76f803f
> --- /dev/null
> +++ b/arch/x86/pci/xen.c
> @@ -0,0 +1,52 @@
> +/*
> + * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
> + * x86 PCI core to support the Xen PCI Frontend
> + *
> + * Author: Ryan Wilson <hap9@xxxxxxxxxxxxxx>
> + */
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/pci.h>
> +#include <linux/acpi.h>
> +
> +#include <asm/pci_x86.h>
> +
> +#include <asm/xen/hypervisor.h>
> +
> +static int xen_pcifront_enable_irq(struct pci_dev *dev)
> +{
> + return 0;
> +}
> +
> +extern int isapnp_disable;
> +
> +int __init pci_xen_init(void)
> +{
> + if (!xen_pv_domain() || xen_initial_domain())
> + return -ENODEV;
> +
> + printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
> +
> + pcibios_set_cache_line_size();
> +
> + pcibios_enable_irq = xen_pcifront_enable_irq;
> + pcibios_disable_irq = NULL;
> +
> +#ifdef CONFIG_ACPI
> + /* Keep ACPI out of the picture */
> + acpi_noirq = 1;
> +#endif
> +
> +#ifdef CONFIG_ISAPNP
> + /* Stop isapnp from probing */
> + isapnp_disable = 1;
> +#endif
> +
> + /* Ensure a device still gets scanned even if it's fn number
> + * is non-zero.
> + */
> + pci_scan_all_fns = 1;
> +
> + return 0;
> +}
> +
> diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
> index ba6af16..8db0cb5 100644
> --- a/drivers/pci/Makefile
> +++ b/drivers/pci/Makefile
> @@ -27,6 +27,8 @@ obj-$(CONFIG_HT_IRQ) += htirq.o
> # Build Intel IOMMU support
> obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
>
> +# Build Xen IOMMU support
> +obj-$(CONFIG_PCI_XEN) += xen-iommu.o
> obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
>
> obj-$(CONFIG_PCI_IOV) += iov.o
> diff --git a/drivers/pci/xen-iommu.c b/drivers/pci/xen-iommu.c
> new file mode 100644
> index 0000000..e7a22f1
> --- /dev/null
> +++ b/drivers/pci/xen-iommu.c
> @@ -0,0 +1,302 @@
> +#include <linux/types.h>
> +#include <linux/mm.h>
> +#include <linux/string.h>
> +#include <linux/pci.h>
> +#include <linux/module.h>
> +#include <linux/version.h>
> +#include <linux/scatterlist.h>
> +#include <linux/io.h>
> +#include <linux/bug.h>
> +
> +#include <xen/interface/xen.h>
> +#include <xen/grant_table.h>
> +#include <xen/page.h>
> +#include <xen/xen-ops.h>
> +
> +#include <asm/iommu.h>
> +#include <asm/swiotlb.h>
> +#include <asm/tlbflush.h>
> +
> +#define IOMMU_BUG_ON(test) \
> +do { \
> + if (unlikely(test)) { \
> + printk(KERN_ALERT "Fatal DMA error! " \
> + "Please use 'swiotlb=force'\n"); \
> + BUG(); \
> + } \
> +} while (0)
> +
> +/* Print address range with message */
> +#define PAR(msg, addr, size) \
> +do { \
> + printk(msg "[%#llx - %#llx]\n", \
> + (unsigned long long)addr, \
> + (unsigned long long)addr + size); \
> +} while (0)
> +
> +struct dma_coherent_mem {
> + void *virt_base;
> + u32 device_base;
> + int size;
> + int flags;
> + unsigned long *bitmap;
> +};
> +
> +static inline int address_needs_mapping(struct device *hwdev,
> + dma_addr_t addr)
> +{
> + dma_addr_t mask = 0xffffffff;

You can use DMA_32BIT_MASK here.

> + int ret;
> +
> + /* If the device has a mask, use it, otherwise default to 32 bits */
> + if (hwdev && hwdev->dma_mask)
> + mask = *hwdev->dma_mask;

I think the check for a valid hwdev->dma_mask is not necessary. Other
IOMMU drivers also don't check for this.

> +
> + ret = (addr & ~mask) != 0;
> +
> + if (ret) {
> + printk(KERN_ERR "dma address needs mapping\n");
> + printk(KERN_ERR "mask: %#llx\n address: [%#llx]\n", mask, addr);
> + }
> + return ret;
> +}
> +
> +static int check_pages_physically_contiguous(unsigned long pfn,
> + unsigned int offset,
> + size_t length)
> +{
> + unsigned long next_mfn;
> + int i;
> + int nr_pages;
> +
> + next_mfn = pfn_to_mfn(pfn);
> + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
> +
> + for (i = 1; i < nr_pages; i++) {
> + if (pfn_to_mfn(++pfn) != ++next_mfn)
> + return 0;
> + }
> + return 1;
> +}
> +
> +static int range_straddles_page_boundary(phys_addr_t p, size_t size)
> +{
> + unsigned long pfn = PFN_DOWN(p);
> + unsigned int offset = p & ~PAGE_MASK;
> +
> + if (offset + size <= PAGE_SIZE)
> + return 0;

You can use iommu_num_pages here from lib/iommu_helpers.c

> + if (check_pages_physically_contiguous(pfn, offset, size))
> + return 0;
> + return 1;
> +}
> +
> +static inline void xen_dma_unmap_page(struct page *page)
> +{
> + /* Xen TODO: 2.6.18 xen calls __gnttab_dma_unmap_page here
> + * to deal with foreign pages. We'll need similar logic here at
> + * some point.
> + */
> +}
> +
> +/* Gets dma address of a page */
> +static inline dma_addr_t xen_dma_map_page(struct page *page)
> +{
> + /* Xen TODO: 2.6.18 xen calls __gnttab_dma_map_page here to deal
> + * with foreign pages. We'll need similar logic here at some
> + * point.
> + */
> + return ((dma_addr_t)pfn_to_mfn(page_to_pfn(page))) << PAGE_SHIFT;
> +}
> +
> +static int xen_map_sg(struct device *hwdev, struct scatterlist *sg,
> + int nents, int direction)
> +{
> + struct scatterlist *s;
> + struct page *page;
> + int i, rc;
> +
> + BUG_ON(direction == DMA_NONE);
> + WARN_ON(nents == 0 || sg[0].length == 0);
> +
> + for_each_sg(sg, s, nents, i) {
> + BUG_ON(!sg_page(s));
> + page = sg_page(s);
> + s->dma_address = xen_dma_map_page(page) + s->offset;
> + s->dma_length = s->length;
> + IOMMU_BUG_ON(range_straddles_page_boundary(
> + page_to_phys(page), s->length));

I have a question on this. How do you make sure that the memory to map
does not cross page boundarys? I have a stats counter for x-page
requests in amd iommu code and around 10% of the requests are actually
x-page for me.

> + }
> +
> + rc = nents;
> +
> + flush_write_buffers();
> + return rc;
> +}
> +
> +static void xen_unmap_sg(struct device *hwdev, struct scatterlist *sg,
> + int nents, int direction)
> +{
> + struct scatterlist *s;
> + struct page *page;
> + int i;
> +
> + for_each_sg(sg, s, nents, i) {
> + page = pfn_to_page(mfn_to_pfn(PFN_DOWN(s->dma_address)));
> + xen_dma_unmap_page(page);
> + }
> +}
> +
> +static void *xen_alloc_coherent(struct device *dev, size_t size,
> + dma_addr_t *dma_handle, gfp_t gfp)
> +{
> + void *ret;
> + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
> + unsigned int order = get_order(size);
> + unsigned long vstart;
> + u64 mask;
> +
> + /* ignore region specifiers */
> + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
> +
> + if (mem) {
> + int page = bitmap_find_free_region(mem->bitmap, mem->size,
> + order);

Can you use iommu_area_alloc here?

> + if (page >= 0) {
> + *dma_handle = mem->device_base + (page << PAGE_SHIFT);
> + ret = mem->virt_base + (page << PAGE_SHIFT);
> + memset(ret, 0, size);
> + return ret;
> + }
> + if (mem->flags & DMA_MEMORY_EXCLUSIVE)
> + return NULL;
> + }
> +
> + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))

DMA_32BIT_MASK

> + gfp |= GFP_DMA;
> +
> + vstart = __get_free_pages(gfp, order);
> + ret = (void *)vstart;
> +
> + if (dev != NULL && dev->coherent_dma_mask)
> + mask = dev->coherent_dma_mask;
> + else
> + mask = 0xffffffff;

DMA_32BIT_MASK

> +
> + if (ret != NULL) {
> + if (xen_create_contiguous_region(vstart, order,
> + fls64(mask)) != 0) {
> + free_pages(vstart, order);
> + return NULL;
> + }
> + memset(ret, 0, size);
> + *dma_handle = virt_to_machine(ret).maddr;
> + }
> + return ret;
> +}
> +
> +static void xen_free_coherent(struct device *dev, size_t size,
> + void *vaddr, dma_addr_t dma_addr)
> +{
> + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
> + int order = get_order(size);
> +
> + if (mem && vaddr >= mem->virt_base &&
> + vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
> + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
> + bitmap_release_region(mem->bitmap, page, order);

iommu_area_free

> + } else {
> + xen_destroy_contiguous_region((unsigned long)vaddr, order);
> + free_pages((unsigned long)vaddr, order);
> + }
> +}
> +
> +static dma_addr_t xen_swiotlb_map_single(struct device *dev, phys_addr_t paddr,
> + size_t size, int direction)
> +{
> + dma_addr_t dma;
> + BUG_ON(direction == DMA_NONE);
> +
> + WARN_ON(size == 0);
> + dma = swiotlb_map_single(dev, phys_to_virt(paddr), size, direction);
> +
> + flush_write_buffers();
> + return dma;
> +}
> +
> +static dma_addr_t xen_map_single(struct device *dev, phys_addr_t paddr,
> + size_t size, int direction)
> +{
> + struct page *page;
> + dma_addr_t dma;
> +
> + BUG_ON(direction == DMA_NONE);
> +
> + WARN_ON(size == 0);
> + page = pfn_to_page(PFN_DOWN(paddr));
> +
> + dma = xen_dma_map_page(page) + offset_in_page(paddr);
> +
> + IOMMU_BUG_ON(address_needs_mapping(dev, dma));
> + IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
> + flush_write_buffers();
> + return dma;
> +}
> +
> +static void xen_unmap_single(struct device *dev, dma_addr_t dma_addr,
> + size_t size, int direction)
> +{
> + BUG_ON(direction == DMA_NONE);
> + xen_dma_unmap_page(pfn_to_page(mfn_to_pfn(PFN_DOWN(dma_addr))));
> +}
> +
> +static struct dma_mapping_ops xen_dma_ops = {
> + .dma_supported = NULL,
> +
> + .alloc_coherent = xen_alloc_coherent,
> + .free_coherent = xen_free_coherent,
> +
> + .map_single = xen_map_single,
> + .unmap_single = xen_unmap_single,
> +
> + .map_sg = xen_map_sg,
> + .unmap_sg = xen_unmap_sg,
> +
> + .mapping_error = NULL,
> +
> + .is_phys = 0,
> +};
> +
> +static struct dma_mapping_ops xen_swiotlb_dma_ops = {
> + .dma_supported = swiotlb_dma_supported,
> +
> + .alloc_coherent = xen_alloc_coherent,
> + .free_coherent = xen_free_coherent,
> +
> + .map_single = xen_swiotlb_map_single, /* swiotlb_map_single has a different prototype */
> + .unmap_single = swiotlb_unmap_single,
> +
> + .map_sg = swiotlb_map_sg,
> + .unmap_sg = swiotlb_unmap_sg,
> +
> + .mapping_error = swiotlb_dma_mapping_error,
> +
> + .is_phys = 0,
> +};
> +
> +void __init xen_iommu_init(void)
> +{
> + if (!xen_pv_domain())
> + return;
> +
> + printk(KERN_INFO "Xen: Initializing Xen DMA ops\n");
> +
> + force_iommu = 0;
> + dma_ops = &xen_dma_ops;
> +
> + if (swiotlb) {
> + printk(KERN_INFO "Xen: Enabling DMA fallback to swiotlb\n");
> + dma_ops = &xen_swiotlb_dma_ops;
> + }
> +}
> +
> --
> 1.6.0.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/