Re: [PATCH v3 2/3] uio_pci_generic: add MSI/MSI-X support

From: Michael S. Tsirkin
Date: Mon Oct 05 2015 - 15:16:54 EST


On Sun, Oct 04, 2015 at 11:43:17PM +0300, Vlad Zolotarov wrote:
> Add support for MSI and MSI-X interrupt modes:
> - Interrupt mode selection order is:
> INT#X (for backward compatibility) -> MSI-X -> MSI.
> - Add ioctl() commands:
> - UIO_PCI_GENERIC_INT_MODE_GET: query the current interrupt mode.
> - UIO_PCI_GENERIC_IRQ_NUM_GET: query the maximum number of IRQs.

This might be something that humans might want to
read or configure, as well.
In fact, # of interrupts is a limited resource
so it seems very reasonable to have a system
admin configure it rather than the application.
So why not use sysfs attributes for this?

> - UIO_PCI_GENERIC_IRQ_SET: bind the IRQ to eventfd (similar to vfio).

I think that as a first step, you should just use the regular
uio sysfs interface to report interrupts.
Adding eventfd support to uio sounds like a reasonable
extension but I don't see why it needs to be done
as part of the same patch.

> - Add mappings to all bars (memory and portio):

I don't think it's a good idea.
People already can poke at BARs through device sysfs
(and already abuse this interface).
We don't want to add another API to do exactly the same thing.
In any case, this should be a separate patch if necessary.

> some devices have
> registers related to MSI/MSI-X handling outside BAR0.

This is very vague. Definitely not enough to justify a new
kernel/userspace API.


>
> Signed-off-by: Vlad Zolotarov <vladz@xxxxxxxxxxxxxxxxxxxx>
> ---
> New in v3:
> - Add __iomem qualifier to temp buffer receiving ioremap value.
>
> New in v2:
> - Added #include <linux/uaccess.h> to uio_pci_generic.c
>
> Signed-off-by: Vlad Zolotarov <vladz@xxxxxxxxxxxxxxxxxxxx>
> ---
> drivers/uio/uio_pci_generic.c | 410 +++++++++++++++++++++++++++++++++++++---
> include/linux/uio_pci_generic.h | 36 ++++
> 2 files changed, 423 insertions(+), 23 deletions(-)
> create mode 100644 include/linux/uio_pci_generic.h
>
> diff --git a/drivers/uio/uio_pci_generic.c b/drivers/uio/uio_pci_generic.c
> index d0b508b..6b8b1789 100644
> --- a/drivers/uio/uio_pci_generic.c
> +++ b/drivers/uio/uio_pci_generic.c
> @@ -22,16 +22,32 @@
> #include <linux/device.h>
> #include <linux/module.h>
> #include <linux/pci.h>
> +#include <linux/msi.h>
> #include <linux/slab.h>
> #include <linux/uio_driver.h>
> +#include <linux/uio_pci_generic.h>
> +#include <linux/eventfd.h>
> +#include <linux/uaccess.h>
>
> #define DRIVER_VERSION "0.01.0"
> #define DRIVER_AUTHOR "Michael S. Tsirkin <mst@xxxxxxxxxx>"
> #define DRIVER_DESC "Generic UIO driver for PCI 2.3 devices"
>
> +struct msix_info {
> + int num_irqs;
> + struct msix_entry *table;
> + struct uio_msix_irq_ctx {
> + struct eventfd_ctx *trigger; /* MSI-x vector to eventfd */
> + char *name; /* name in /proc/interrupts */
> + } *ctx;
> +};
> +
> struct uio_pci_generic_dev {
> struct uio_info info;
> struct pci_dev *pdev;
> + struct mutex msix_state_lock; /* ioctl mutex */
> + enum uio_int_mode int_mode;
> + struct msix_info msix;
> };
>
> static inline struct uio_pci_generic_dev *
> @@ -40,9 +56,177 @@ to_uio_pci_generic_dev(struct uio_info *info)
> return container_of(info, struct uio_pci_generic_dev, info);
> }
>
> -/* Interrupt handler. Read/modify/write the command register to disable
> - * the interrupt. */
> -static irqreturn_t irqhandler(int irq, struct uio_info *info)
> +/* Unmap previously ioremap'd resources */
> +static void release_iomaps(struct uio_pci_generic_dev *gdev)
> +{
> + int i;
> + struct uio_mem *mem = gdev->info.mem;
> +
> + for (i = 0; i < MAX_UIO_MAPS; i++, mem++) {
> + if (mem->internal_addr) {
> + iounmap(mem->internal_addr);
> + mem->internal_addr = NULL;
> + }
> + }
> +}
> +
> +static int setup_maps(struct pci_dev *pdev, struct uio_info *info)
> +{
> + int i, m = 0, p = 0, err;
> + static const char * const bar_names[] = {
> + "BAR0", "BAR1", "BAR2", "BAR3", "BAR4", "BAR5",
> + };
> +
> + for (i = 0; i < ARRAY_SIZE(bar_names); i++) {
> + unsigned long start = pci_resource_start(pdev, i);
> + unsigned long flags = pci_resource_flags(pdev, i);
> + unsigned long len = pci_resource_len(pdev, i);
> +
> + if (start == 0 || len == 0)
> + continue;
> +
> + if (flags & IORESOURCE_MEM) {
> + void __iomem *addr;
> +
> + if (m >= MAX_UIO_MAPS)
> + continue;
> +
> + addr = ioremap(start, len);
> + if (addr == NULL) {
> + err = -EINVAL;
> + goto fail;
> + }
> +
> + info->mem[m].name = bar_names[i];
> + info->mem[m].addr = start;
> + info->mem[m].internal_addr = addr;
> + info->mem[m].size = len;
> + info->mem[m].memtype = UIO_MEM_PHYS;
> + ++m;
> + } else if (flags & IORESOURCE_IO) {
> + if (p >= MAX_UIO_PORT_REGIONS)
> + continue;
> +
> + info->port[p].name = bar_names[i];
> + info->port[p].start = start;
> + info->port[p].size = len;
> + info->port[p].porttype = UIO_PORT_X86;
> + ++p;
> + }
> + }
> +
> + return 0;
> +fail:
> + for (i = 0; i < m; i++) {
> + iounmap(info->mem[i].internal_addr);
> + info->mem[i].internal_addr = NULL;
> + }
> +
> + return err;
> +}
> +
> +static irqreturn_t msix_irqhandler(int irq, void *arg);
> +
> +/* set the mapping between vector # and existing eventfd. */
> +static int set_irq_eventfd(struct uio_pci_generic_dev *gdev, int vec, int fd)
> +{
> + struct uio_msix_irq_ctx *ctx;
> + struct eventfd_ctx *trigger;
> + struct pci_dev *pdev = gdev->pdev;
> + int irq, err;
> +
> + if (vec >= gdev->msix.num_irqs) {
> + dev_notice(&gdev->pdev->dev, "vec %u >= num_vec %u\n",
> + vec, gdev->msix.num_irqs);
> + return -ERANGE;
> + }
> +
> + irq = gdev->msix.table[vec].vector;
> +
> + /* Cleanup existing irq mapping */
> + ctx = &gdev->msix.ctx[vec];
> + if (ctx->trigger) {
> + free_irq(irq, ctx->trigger);
> + eventfd_ctx_put(ctx->trigger);
> + ctx->trigger = NULL;
> + }
> +
> + /* Passing -1 is used to disable interrupt */
> + if (fd < 0)
> + return 0;
> +
> +
> + trigger = eventfd_ctx_fdget(fd);
> + if (IS_ERR(trigger)) {
> + err = PTR_ERR(trigger);
> + dev_notice(&gdev->pdev->dev,
> + "eventfd ctx get failed: %d\n", err);
> + return err;
> + }
> +
> + err = request_irq(irq, msix_irqhandler, 0, ctx->name, trigger);
> + if (err) {
> + dev_notice(&pdev->dev, "request irq failed: %d\n", err);
> + eventfd_ctx_put(trigger);
> + return err;
> + }
> +
> + dev_dbg(&pdev->dev, "map vector %u to fd %d trigger %p\n",
> + vec, fd, trigger);
> + ctx->trigger = trigger;
> +
> + return 0;
> +}
> +
> +static int uio_pci_generic_ioctl(struct uio_info *info, unsigned int cmd,
> + unsigned long arg)
> +{
> + struct uio_pci_generic_dev *gdev = to_uio_pci_generic_dev(info);
> + struct uio_pci_generic_irq_set hdr;
> + int err;
> +
> + switch (cmd) {
> + case UIO_PCI_GENERIC_IRQ_SET:
> + if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr)))
> + return -EFAULT;
> +
> + /* Locking is needed to ensure two things:
> + * 1) Two IRQ_SET ioctl()'s are not running in parallel.
> + * 2) IRQ_SET ioctl() is not running in parallel with remove().
> + */
> + mutex_lock(&gdev->msix_state_lock);
> + if (gdev->int_mode != UIO_INT_MODE_MSIX) {
> + mutex_unlock(&gdev->msix_state_lock);
> + return -EOPNOTSUPP;
> + }
> +
> + err = set_irq_eventfd(gdev, hdr.vec, hdr.fd);
> + mutex_unlock(&gdev->msix_state_lock);
> +
> + break;
> + case UIO_PCI_GENERIC_IRQ_NUM_GET:
> + if (gdev->int_mode == UIO_INT_MODE_NONE)
> + err = put_user(0, (u32 __user *)arg);
> + else if (gdev->int_mode != UIO_INT_MODE_MSIX)
> + err = put_user(1, (u32 __user *)arg);
> + else
> + err = put_user(gdev->msix.num_irqs,
> + (u32 __user *)arg);
> +
> + break;
> + case UIO_PCI_GENERIC_INT_MODE_GET:
> + err = put_user(gdev->int_mode, (u32 __user *)arg);
> +
> + break;
> + default:
> + err = -EOPNOTSUPP;
> + }
> +
> + return err;
> +}
> +
> +/* INT#X interrupt handler. */
> +static irqreturn_t intx_irqhandler(int irq, struct uio_info *info)
> {
> struct uio_pci_generic_dev *gdev = to_uio_pci_generic_dev(info);
>
> @@ -53,8 +237,162 @@ static irqreturn_t irqhandler(int irq, struct uio_info *info)
> return IRQ_HANDLED;
> }
>
> -static int probe(struct pci_dev *pdev,
> - const struct pci_device_id *id)
> +/* MSI interrupt handler. */
> +static irqreturn_t msi_irqhandler(int irq, struct uio_info *info)
> +{
> + /* UIO core will signal the user process. */
> + return IRQ_HANDLED;
> +}
> +
> +/* MSI-X interrupt handler. */
> +static irqreturn_t msix_irqhandler(int irq, void *arg)
> +{
> + struct eventfd_ctx *trigger = arg;
> +
> + pr_devel("irq %u trigger %p\n", irq, trigger);
> +
> + eventfd_signal(trigger, 1);
> + return IRQ_HANDLED;
> +}
> +
> +static bool enable_intx(struct uio_pci_generic_dev *gdev)
> +{
> + struct pci_dev *pdev = gdev->pdev;
> +
> + if (!pdev->irq || !pci_intx_mask_supported(pdev))
> + return false;
> +
> + gdev->int_mode = UIO_INT_MODE_INTX;
> + gdev->info.irq = pdev->irq;
> + gdev->info.irq_flags = IRQF_SHARED;
> + gdev->info.handler = intx_irqhandler;
> +
> + return true;
> +}
> +
> +static void set_pci_master(struct pci_dev *pdev)
> +{
> + pci_set_master(pdev);
> + dev_warn(&pdev->dev, "Enabling PCI bus mastering. Bogus userspace application is able to trash kernel memory using DMA");
> + add_taint(TAINT_USER, LOCKDEP_STILL_OK);

Basically, users won't notice this until their kernel
crashes, and then it's too late.

Further, this happens silently: it's not an admin doing
something insecure intentionally.
It's the driver doing something insecure.

This doesn't make sense.

IIUC, the issue is that userspace can reprogram MSI/MSI-X
configuration, causing it to write into kernel memory.

So IMHO the right thing to do is to prevent userspace from
poking at MSI/MSI-X configuration when this is enabled.

This includes the MSI/MSI-X capability in config space
and the MSI-X table in the relevant BAR.

For MSI-X it's especially tricky.
Ideally, we'd map a zero page over that region, because
existing applications expect to be able to map the whole
BAR.

This will need some infrastructure work.

> +}
> +
> +static bool enable_msi(struct uio_pci_generic_dev *gdev)
> +{
> + struct pci_dev *pdev = gdev->pdev;
> +
> + set_pci_master(pdev);
> +
> + if (pci_enable_msi(pdev))
> + return false;
> +
> + gdev->int_mode = UIO_INT_MODE_MSI;
> + gdev->info.irq = pdev->irq;
> + gdev->info.irq_flags = 0;
> + gdev->info.handler = msi_irqhandler;
> +
> + return true;
> +}
> +
> +static bool enable_msix(struct uio_pci_generic_dev *gdev)
> +{
> + struct pci_dev *pdev = gdev->pdev;
> + int i, vectors = pci_msix_vec_count(pdev);
> +
> + if (vectors <= 0)
> + return false;
> +
> + gdev->msix.table = kcalloc(vectors, sizeof(struct msix_entry),
> + GFP_KERNEL);
> + if (!gdev->msix.table) {
> + dev_err(&pdev->dev, "Failed to allocate memory for MSI-X table");
> + return false;
> + }
> +
> + gdev->msix.ctx = kcalloc(vectors, sizeof(struct uio_msix_irq_ctx),
> + GFP_KERNEL);
> + if (!gdev->msix.ctx) {
> + dev_err(&pdev->dev, "Failed to allocate memory for MSI-X contexts");
> + goto err_ctx_alloc;
> + }
> +
> + for (i = 0; i < vectors; i++) {
> + gdev->msix.table[i].entry = i;
> + gdev->msix.ctx[i].name = kasprintf(GFP_KERNEL,
> + KBUILD_MODNAME "[%d](%s)",
> + i, pci_name(pdev));
> + if (!gdev->msix.ctx[i].name)
> + goto err_name_alloc;
> + }
> +
> + set_pci_master(pdev);
> +
> + if (pci_enable_msix(pdev, gdev->msix.table, vectors))
> + goto err_msix_enable;
> +
> + gdev->int_mode = UIO_INT_MODE_MSIX;
> + gdev->info.irq = UIO_IRQ_CUSTOM;
> + gdev->msix.num_irqs = vectors;
> +
> + return true;
> +
> +err_msix_enable:
> + pci_clear_master(pdev);
> +err_name_alloc:
> + for (i = 0; i < vectors; i++)
> + kfree(gdev->msix.ctx[i].name);
> +
> + kfree(gdev->msix.ctx);
> +err_ctx_alloc:
> + kfree(gdev->msix.table);
> +
> + return false;
> +}
> +
> +/**
> + * Disable interrupts and free related resources.
> + *
> + * @gdev device handle
> + *
> + * This function should be called after the corresponding UIO device has been
> + * unregistered. This will ensure that there are no currently running ioctl()s
> + * and there won't be any new ones until next probe() call.
> + */
> +static void disable_intr(struct uio_pci_generic_dev *gdev)
> +{
> + struct pci_dev *pdev = gdev->pdev;
> + int i;
> +
> + switch (gdev->int_mode) {
> + case UIO_INT_MODE_MSI:
> + pci_disable_msi(pdev);
> + pci_clear_master(pdev);
> +
> + break;
> + case UIO_INT_MODE_MSIX:
> + /* No need for locking here since there shouldn't be any
> + * ioctl()s running by now.
> + */
> + for (i = 0; i < gdev->msix.num_irqs; i++) {
> + if (gdev->msix.ctx[i].trigger)
> + set_irq_eventfd(gdev, i, -1);
> +
> + kfree(gdev->msix.ctx[i].name);
> + }
> +
> + pci_disable_msix(pdev);
> + pci_clear_master(pdev);
> + kfree(gdev->msix.ctx);
> + kfree(gdev->msix.table);
> +
> + break;
> + default:
> + break;
> + }
> +}
> +
> +
> +static int probe(struct pci_dev *pdev, const struct pci_device_id *id)
> {
> struct uio_pci_generic_dev *gdev;
> int err;
> @@ -66,42 +404,64 @@ static int probe(struct pci_dev *pdev,
> return err;
> }
>
> - if (!pdev->irq) {
> - dev_warn(&pdev->dev, "No IRQ assigned to device: "
> - "no support for interrupts?\n");
> - pci_disable_device(pdev);
> - return -ENODEV;
> - }
> -
> - if (!pci_intx_mask_supported(pdev)) {
> - err = -ENODEV;
> - goto err_verify;
> - }
> -
> gdev = kzalloc(sizeof(struct uio_pci_generic_dev), GFP_KERNEL);
> if (!gdev) {
> err = -ENOMEM;
> goto err_alloc;
> }
>
> + gdev->pdev = pdev;
> gdev->info.name = "uio_pci_generic";
> gdev->info.version = DRIVER_VERSION;
> - gdev->info.irq = pdev->irq;
> - gdev->info.irq_flags = IRQF_SHARED;
> - gdev->info.handler = irqhandler;
> - gdev->pdev = pdev;
> + gdev->info.ioctl = uio_pci_generic_ioctl;
> + mutex_init(&gdev->msix_state_lock);
> +
> + err = pci_request_regions(pdev, "uio_pci_generic");
> + if (err != 0) {
> + dev_err(&pdev->dev, "Cannot request regions\n");
> + goto err_request_regions;
> + }
> +
> + /* Enable the corresponding interrupt mode. Try to enable INT#X first
> + * for backward compatibility.
> + */
> + if (enable_intx(gdev))
> + dev_info(&pdev->dev, "Using INT#x mode: IRQ %ld",
> + gdev->info.irq);
> + else if (enable_msix(gdev))
> + dev_info(&pdev->dev, "Using MSI-X mode: number of IRQs %d",
> + gdev->msix.num_irqs);
> + else if (enable_msi(gdev))
> + dev_info(&pdev->dev, "Using MSI mode: IRQ %ld", gdev->info.irq);
> + else {
> + err = -ENODEV;
> + goto err_verify;
> + }
> +
> + /* remap resources */
> + err = setup_maps(pdev, &gdev->info);
> + if (err)
> + goto err_maps;
>
> err = uio_register_device(&pdev->dev, &gdev->info);
> if (err)
> goto err_register;
> +
> pci_set_drvdata(pdev, gdev);
>
> return 0;
> +
> err_register:
> + release_iomaps(gdev);
> +err_maps:
> + disable_intr(gdev);
> +err_verify:
> + pci_release_regions(pdev);
> +err_request_regions:
> kfree(gdev);
> err_alloc:
> -err_verify:
> pci_disable_device(pdev);
> +
> return err;
> }
>
> @@ -110,8 +470,12 @@ static void remove(struct pci_dev *pdev)
> struct uio_pci_generic_dev *gdev = pci_get_drvdata(pdev);
>
> uio_unregister_device(&gdev->info);
> - pci_disable_device(pdev);
> + disable_intr(gdev);
> + release_iomaps(gdev);
> + pci_release_regions(pdev);
> kfree(gdev);
> + pci_disable_device(pdev);
> + pci_set_drvdata(pdev, NULL);
> }
>
> static struct pci_driver uio_pci_driver = {
> diff --git a/include/linux/uio_pci_generic.h b/include/linux/uio_pci_generic.h
> new file mode 100644
> index 0000000..10716fc
> --- /dev/null
> +++ b/include/linux/uio_pci_generic.h
> @@ -0,0 +1,36 @@
> +/*
> + * include/linux/uio_pci_generic.h
> + *
> + * Userspace generic PCI IO driver.
> + *
> + * Licensed under the GPLv2 only.
> + */
> +
> +#ifndef _UIO_PCI_GENERIC_H_
> +#define _UIO_PCI_GENERIC_H_
> +
> +#include <linux/ioctl.h>
> +
> +enum uio_int_mode {
> + UIO_INT_MODE_NONE,
> + UIO_INT_MODE_INTX,
> + UIO_INT_MODE_MSI,
> + UIO_INT_MODE_MSIX
> +};
> +
> +/* bind the requested IRQ to the given eventfd */
> +struct uio_pci_generic_irq_set {
> + int vec; /* index of the IRQ to connect to starting from 0 */
> + int fd;
> +};
> +
> +#define UIO_PCI_GENERIC_BASE 0x86
> +
> +#define UIO_PCI_GENERIC_IRQ_SET _IOW('I', UIO_PCI_GENERIC_BASE + 1, \
> + struct uio_pci_generic_irq_set)
> +#define UIO_PCI_GENERIC_IRQ_NUM_GET _IOW('I', UIO_PCI_GENERIC_BASE + 2, \
> + uint32_t)
> +#define UIO_PCI_GENERIC_INT_MODE_GET _IOW('I', UIO_PCI_GENERIC_BASE + 3, \
> + uint32_t)
> +
> +#endif /* _UIO_PCI_GENERIC_H_ */
> --
> 2.1.0
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/