Re: [PATCH 07/13] ocxl: Add AFU interrupt support

From: Benjamin Herrenschmidt
Date: Mon Dec 18 2017 - 23:05:57 EST


On Mon, 2017-12-18 at 16:21 +0100, Frederic Barrat wrote:
> Add user APIs through ioctl to allocate, free, and be notified of an
> AFU interrupt.
>
> For opencapi, an AFU can trigger an interrupt on the host by sending a
> specific command targeting a 64-bit object handle. On POWER9, this is
> implemented by mapping a special page in the address space of a
> process and a write to that page will trigger an interrupt.

We need to figure out how that plays with KVM. +Cedric..

For all those "generic xive" interrupts, whether they are used for
OpenCAPI, plain guest IPIs, NX interrupts, etc., but also for actual
pass-through ones, we'll need a mechanism to map the trigger and ESB
pages into qemu.

We can't have a bazillion VMAs and KVM memory regions either, so we'll
need some kind of mechanism/driver which allows for a single fairly
large mmap'ed VMA which can then be "populated" with interrupt control
pages.

The issue of course is that we can't really do a "generic" system that
allows mapping any interrupt; that would be a security issue. So we need
the interrupt "owner" to be the one allowing this: VFIO for PCI, for
example, possibly a specific VFIO variant for OpenCAPI, and something
else for guest IPIs?

Food for thought...

Ben.

>
> Signed-off-by: Frederic Barrat <fbarrat@xxxxxxxxxxxxxxxxxx>
> ---
> arch/powerpc/include/asm/pnv-ocxl.h | 3 +
> arch/powerpc/platforms/powernv/ocxl.c | 30 +++++
> drivers/misc/ocxl/afu_irq.c | 204 ++++++++++++++++++++++++++++++++++
> drivers/misc/ocxl/context.c | 40 ++++++-
> drivers/misc/ocxl/file.c | 33 ++++++
> drivers/misc/ocxl/link.c | 28 +++++
> drivers/misc/ocxl/ocxl_internal.h | 7 ++
> include/uapi/misc/ocxl.h | 9 ++
> 8 files changed, 352 insertions(+), 2 deletions(-)
> create mode 100644 drivers/misc/ocxl/afu_irq.c
>
> diff --git a/arch/powerpc/include/asm/pnv-ocxl.h b/arch/powerpc/include/asm/pnv-ocxl.h
> index 5a7ae7f28209..1e26f0a39500 100644
> --- a/arch/powerpc/include/asm/pnv-ocxl.h
> +++ b/arch/powerpc/include/asm/pnv-ocxl.h
> @@ -37,4 +37,7 @@ extern int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask,
> extern void pnv_ocxl_spa_release(void *platform_data);
> extern int pnv_ocxl_spa_remove_pe(void *platform_data, int pe_handle);
>
> +extern int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr);
> +extern void pnv_ocxl_free_xive_irq(u32 irq);
> +
> #endif /* _ASM_PVN_OCXL_H */
> diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
> index 6c79924b95c8..96cafba6aef1 100644
> --- a/arch/powerpc/platforms/powernv/ocxl.c
> +++ b/arch/powerpc/platforms/powernv/ocxl.c
> @@ -9,6 +9,7 @@
>
> #include <asm/pnv-ocxl.h>
> #include <asm/opal.h>
> +#include <asm/xive.h>
> #include <misc/ocxl-config.h>
> #include "pci.h"
>
> @@ -487,3 +488,32 @@ int pnv_ocxl_spa_remove_pe(void *platform_data, int pe_handle)
> return rc;
> }
> EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe);
> +
> +/*
> + * Allocate a XIVE interrupt and look up its trigger page through OPAL.
> + *
> + * On success, store the hardware interrupt number in *irq and the
> + * CPU-endian trigger page address in *trigger_addr, then return 0.
> + * Return -ENOENT if no interrupt could be allocated, or if OPAL fails
> + * or reports no trigger page (the interrupt is released in that case).
> + */
> +int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr)
> +{
> +	__be64 be_flags, be_trigger;
> +	s64 opal_rc;
> +	u32 hw;
> +
> +	hw = xive_native_alloc_irq();
> +	if (!hw)
> +		return -ENOENT;
> +
> +	opal_rc = opal_xive_get_irq_info(hw, &be_flags, NULL, &be_trigger,
> +					 NULL, NULL);
> +	if (!opal_rc && be_trigger) {
> +		*irq = hw;
> +		*trigger_addr = be64_to_cpu(be_trigger);
> +		return 0;
> +	}
> +
> +	/* No usable trigger page: give the interrupt back */
> +	xive_native_free_irq(hw);
> +	return -ENOENT;
> +}
> +EXPORT_SYMBOL_GPL(pnv_ocxl_alloc_xive_irq);
> +
> +/*
> + * Release a XIVE interrupt previously obtained from
> + * pnv_ocxl_alloc_xive_irq(). @irq is the hardware interrupt number.
> + */
> +void pnv_ocxl_free_xive_irq(u32 irq)
> +{
> + xive_native_free_irq(irq);
> +}
> +EXPORT_SYMBOL_GPL(pnv_ocxl_free_xive_irq);
> diff --git a/drivers/misc/ocxl/afu_irq.c b/drivers/misc/ocxl/afu_irq.c
> new file mode 100644
> index 000000000000..0b217a854837
> --- /dev/null
> +++ b/drivers/misc/ocxl/afu_irq.c
> @@ -0,0 +1,204 @@
> +/*
> + * Copyright 2017 IBM Corp.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +
> +#include <linux/interrupt.h>
> +#include <linux/eventfd.h>
> +#include <asm/pnv-ocxl.h>
> +#include "ocxl_internal.h"
> +
> +/*
> + * Per-interrupt state for an AFU interrupt owned by a context.
> + *
> + * id:           index of this entry in the context's irq_idr
> + * hw_irq:       hardware interrupt number returned by the link layer
> + * virq:         Linux interrupt number mapped from hw_irq
> + * name:         kasprintf()-allocated name passed to request_irq()
> + * trigger_page: address of the trigger page returned by the link layer
> + * ev_ctx:       eventfd context signalled by the handler, NULL if unset
> + */
> +struct afu_irq {
> + int id;
> + int hw_irq;
> + unsigned int virq;
> + char *name;
> + u64 trigger_page;
> + struct eventfd_ctx *ev_ctx;
> +};
> +
> +/*
> + * Convert an mmap/ioctl offset into the id of an AFU interrupt.
> + *
> + * Offsets below the interrupt area, or so large that the page index
> + * does not fit in an int, cannot name a valid interrupt. Return -1 for
> + * those so that a later idr lookup fails, instead of letting the value
> + * wrap or truncate onto the id of an unrelated interrupt.
> + */
> +static int irq_offset_to_id(struct ocxl_context *ctx, u64 offset)
> +{
> +	u64 base = ctx->afu->irq_base_offset;
> +	u64 id;
> +
> +	if (offset < base)
> +		return -1;
> +
> +	id = (offset - base) >> PAGE_SHIFT;
> +	if (id > INT_MAX)
> +		return -1;
> +	return id;
> +}
> +
> +/*
> + * Convert an AFU interrupt id into its offset in the context mapping.
> + * Each interrupt occupies one page after the AFU's irq_base_offset.
> + */
> +static u64 irq_id_to_offset(struct ocxl_context *ctx, int id)
> +{
> +	/* widen before shifting so the shift is not done in signed int */
> +	return ctx->afu->irq_base_offset + ((u64) id << PAGE_SHIFT);
> +}
> +
> +/*
> + * Interrupt handler for an AFU interrupt: notify user space through the
> + * eventfd attached to the interrupt, if there is one. @data is the
> + * struct afu_irq registered with request_irq().
> + */
> +static irqreturn_t afu_irq_handler(int virq, void *data)
> +{
> +	struct afu_irq *afu_irq = data;
> +
> +	/* nobody listening yet: the interrupt is simply consumed */
> +	if (!afu_irq->ev_ctx)
> +		return IRQ_HANDLED;
> +
> +	eventfd_signal(afu_irq->ev_ctx, 1);
> +	return IRQ_HANDLED;
> +}
> +
> +/*
> + * Map irq->hw_irq to a Linux interrupt, build its name and install
> + * afu_irq_handler() for it.
> + *
> + * Returns 0 on success. On failure, everything set up so far is undone
> + * and a negative errno is returned: -ENOMEM if the mapping or the name
> + * could not be created, or the request_irq() error code.
> + */
> +static int setup_afu_irq(struct ocxl_context *ctx, struct afu_irq *irq)
> +{
> +	int rc;
> +
> +	irq->virq = irq_create_mapping(NULL, irq->hw_irq);
> +	if (!irq->virq) {
> +		pr_err("irq_create_mapping failed\n");
> +		return -ENOMEM;
> +	}
> +	pr_debug("hw_irq %d mapped to virq %u\n", irq->hw_irq, irq->virq);
> +
> +	irq->name = kasprintf(GFP_KERNEL, "ocxl-afu-%u", irq->virq);
> +	if (!irq->name) {
> +		rc = -ENOMEM;
> +		goto err_mapping;
> +	}
> +
> +	rc = request_irq(irq->virq, afu_irq_handler, 0, irq->name, irq);
> +	if (!rc)
> +		return 0;
> +
> +	kfree(irq->name);
> +	irq->name = NULL;
> +	irq_dispose_mapping(irq->virq);
> +	pr_err("request_irq failed: %d\n", rc);
> +	return rc;
> +
> +err_mapping:
> +	irq_dispose_mapping(irq->virq);
> +	return rc;
> +}
> +
> +/*
> + * Undo setup_afu_irq(): remove the handler, dispose of the virq mapping
> + * and free the interrupt name. The handler is removed first, before the
> + * resources it was registered with are released.
> + */
> +static void release_afu_irq(struct afu_irq *irq)
> +{
> + free_irq(irq->virq, irq);
> + irq_dispose_mapping(irq->virq);
> + kfree(irq->name);
> +}
> +
> +/*
> + * Allocate an AFU interrupt for a context.
> + *
> + * On success, return 0 and store in *irq_offset the offset identifying
> + * the interrupt (and its trigger page) in the context mapping.
> + * On failure, return a negative errno: -ENOMEM if memory is exhausted,
> + * -ENOSPC if the per-context limit is reached, or the error reported
> + * by the link layer or by the interrupt setup.
> + */
> +int ocxl_afu_irq_alloc(struct ocxl_context *ctx, u64 *irq_offset)
> +{
> +	struct afu_irq *irq;
> +	int rc;
> +
> +	irq = kzalloc(sizeof(*irq), GFP_KERNEL);
> +	if (!irq)
> +		return -ENOMEM;
> +
> +	/*
> +	 * We limit the number of afu irqs per context and per link to
> +	 * avoid a single process or user depleting the pool of IPIs
> +	 */
> +
> +	mutex_lock(&ctx->irq_lock);
> +
> +	/*
> +	 * Propagate the idr_alloc() error as is, so that an allocation
> +	 * failure is not misreported as the per-context limit being hit.
> +	 */
> +	rc = idr_alloc(&ctx->irq_idr, irq, 0, MAX_IRQ_PER_CONTEXT,
> +		       GFP_KERNEL);
> +	if (rc < 0)
> +		goto err_unlock;
> +	irq->id = rc;
> +
> +	rc = ocxl_link_irq_alloc(ctx->afu->fn->link, &irq->hw_irq,
> +				 &irq->trigger_page);
> +	if (rc)
> +		goto err_idr;
> +
> +	rc = setup_afu_irq(ctx, irq);
> +	if (rc)
> +		goto err_alloc;
> +
> +	*irq_offset = irq_id_to_offset(ctx, irq->id);
> +
> +	mutex_unlock(&ctx->irq_lock);
> +	return 0;
> +
> +err_alloc:
> +	ocxl_link_free_irq(ctx->afu->fn->link, irq->hw_irq);
> +err_idr:
> +	idr_remove(&ctx->irq_idr, irq->id);
> +err_unlock:
> +	mutex_unlock(&ctx->irq_lock);
> +	kfree(irq);
> +	return rc;
> +}
> +
> +/*
> + * Tear down one AFU interrupt and free its descriptor.
> + *
> + * The callers in this file hold ctx->irq_lock and are responsible for
> + * the entry in ctx->irq_idr; this function does not touch the idr.
> + */
> +static void afu_irq_free(struct afu_irq *irq, struct ocxl_context *ctx)
> +{
> + /* drop any user mapping of the one-page trigger area of this irq */
> + if (ctx->mapping)
> + unmap_mapping_range(ctx->mapping,
> + irq_id_to_offset(ctx, irq->id),
> + 1 << PAGE_SHIFT, 1);
> + /* remove the handler before dropping the eventfd it may signal */
> + release_afu_irq(irq);
> + if (irq->ev_ctx)
> + eventfd_ctx_put(irq->ev_ctx);
> + /* hand the hardware interrupt back to the link */
> + ocxl_link_free_irq(ctx->afu->fn->link, irq->hw_irq);
> + kfree(irq);
> +}
> +
> +/*
> + * Free the AFU interrupt identified by @irq_offset in a context.
> + *
> + * Returns 0 on success, or -EINVAL if the offset does not match any
> + * interrupt allocated for the context.
> + */
> +int ocxl_afu_irq_free(struct ocxl_context *ctx, u64 irq_offset)
> +{
> +	int id = irq_offset_to_id(ctx, irq_offset);
> +	struct afu_irq *entry;
> +	int rc = -EINVAL;
> +
> +	mutex_lock(&ctx->irq_lock);
> +	entry = idr_find(&ctx->irq_idr, id);
> +	if (entry) {
> +		idr_remove(&ctx->irq_idr, entry->id);
> +		afu_irq_free(entry, ctx);
> +		rc = 0;
> +	}
> +	mutex_unlock(&ctx->irq_lock);
> +	return rc;
> +}
> +
> +/*
> + * Free every AFU interrupt still allocated for a context.
> + *
> + * Each entry is removed from the idr before being freed, so that the
> + * idr never holds a pointer to a freed descriptor.
> + */
> +void ocxl_afu_irq_free_all(struct ocxl_context *ctx)
> +{
> +	struct afu_irq *irq;
> +	int id;
> +
> +	mutex_lock(&ctx->irq_lock);
> +	idr_for_each_entry(&ctx->irq_idr, irq, id) {
> +		idr_remove(&ctx->irq_idr, id);
> +		afu_irq_free(irq, ctx);
> +	}
> +	mutex_unlock(&ctx->irq_lock);
> +}
> +
> +/*
> + * Attach an eventfd to an AFU interrupt, so that user space is notified
> + * when the interrupt fires.
> + *
> + * If an eventfd was already attached, it is replaced and the reference
> + * held on the previous one is dropped.
> + *
> + * Returns 0 on success, or -EINVAL if @irq_offset does not match an
> + * allocated interrupt or @eventfd is not a valid eventfd descriptor.
> + */
> +int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset, int eventfd)
> +{
> +	struct afu_irq *irq;
> +	struct eventfd_ctx *ev_ctx, *old_ctx;
> +	int rc = 0, id = irq_offset_to_id(ctx, irq_offset);
> +
> +	mutex_lock(&ctx->irq_lock);
> +	irq = idr_find(&ctx->irq_idr, id);
> +	if (!irq) {
> +		rc = -EINVAL;
> +		goto unlock;
> +	}
> +
> +	ev_ctx = eventfd_ctx_fdget(eventfd);
> +	if (IS_ERR(ev_ctx)) {
> +		rc = -EINVAL;
> +		goto unlock;
> +	}
> +
> +	old_ctx = irq->ev_ctx;
> +	irq->ev_ctx = ev_ctx;
> +	if (old_ctx) {
> +		/*
> +		 * The handler reads ev_ctx without taking irq_lock: wait
> +		 * for any handler still running before dropping the
> +		 * reference on the context it may be signalling.
> +		 */
> +		synchronize_irq(irq->virq);
> +		eventfd_ctx_put(old_ctx);
> +	}
> +unlock:
> +	mutex_unlock(&ctx->irq_lock);
> +	return rc;
> +}
> +
> +/*
> + * Look up the trigger page address of the AFU interrupt identified by
> + * @irq_offset. Returns 0 if no such interrupt is allocated for the
> + * context.
> + */
> +u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 irq_offset)
> +{
> +	struct afu_irq *entry;
> +	u64 trigger;
> +
> +	mutex_lock(&ctx->irq_lock);
> +	entry = idr_find(&ctx->irq_idr, irq_offset_to_id(ctx, irq_offset));
> +	trigger = entry ? entry->trigger_page : 0;
> +	mutex_unlock(&ctx->irq_lock);
> +
> +	return trigger;
> +}
> diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c
> index 0bc0dd97d784..19575269ed22 100644
> --- a/drivers/misc/ocxl/context.c
> +++ b/drivers/misc/ocxl/context.c
> @@ -38,6 +38,8 @@ int ocxl_context_init(struct ocxl_context *ctx, struct ocxl_afu *afu,
> mutex_init(&ctx->mapping_lock);
> init_waitqueue_head(&ctx->events_wq);
> mutex_init(&ctx->xsl_error_lock);
> + mutex_init(&ctx->irq_lock);
> + idr_init(&ctx->irq_idr);
> /*
> * Keep a reference on the AFU to make sure it's valid for the
> * duration of the life of the context
> @@ -87,6 +89,19 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr)
> return rc;
> }
>
> +/*
> + * Fault handler helper: map the trigger page of the AFU interrupt found
> + * at @offset into @vma at @address.
> + *
> + * Returns VM_FAULT_NOPAGE once the page is in place, VM_FAULT_SIGBUS if
> + * @offset matches no allocated interrupt or the insertion fails, and
> + * VM_FAULT_OOM if the insertion runs out of memory.
> + */
> +static int map_afu_irq(struct vm_area_struct *vma, unsigned long address,
> +		u64 offset, struct ocxl_context *ctx)
> +{
> +	u64 trigger_addr;
> +	int rc;
> +
> +	trigger_addr = ocxl_afu_irq_get_addr(ctx, offset);
> +	if (!trigger_addr)
> +		return VM_FAULT_SIGBUS;
> +
> +	rc = vm_insert_pfn(vma, address, trigger_addr >> PAGE_SHIFT);
> +	if (rc == -ENOMEM)
> +		return VM_FAULT_OOM;
> +	/* -EBUSY means the page is already mapped, which is fine */
> +	if (rc < 0 && rc != -EBUSY)
> +		return VM_FAULT_SIGBUS;
> +	return VM_FAULT_NOPAGE;
> +}
> +
> static int map_pp_mmio(struct vm_area_struct *vma, unsigned long address,
> u64 offset, struct ocxl_context *ctx)
> {
> @@ -125,7 +140,10 @@ static int ocxl_mmap_fault(struct vm_fault *vmf)
> pr_debug("%s: pasid %d address 0x%lx offset 0x%llx\n", __func__,
> ctx->pasid, vmf->address, offset);
>
> - rc = map_pp_mmio(vma, vmf->address, offset, ctx);
> + if (offset < ctx->afu->irq_base_offset)
> + rc = map_pp_mmio(vma, vmf->address, offset, ctx);
> + else
> + rc = map_afu_irq(vma, vmf->address, offset, ctx);
> return rc;
> }
>
> @@ -133,6 +151,19 @@ static const struct vm_operations_struct ocxl_vmops = {
> .fault = ocxl_mmap_fault,
> };
>
> +/*
> + * Validate an mmap request targeting the AFU interrupt area.
> + *
> + * The mapping must cover exactly one page and its offset must match an
> + * interrupt allocated for the context. Returns 0 if the request is
> + * acceptable, -EINVAL otherwise.
> + */
> +static int check_mmap_afu_irq(struct ocxl_context *ctx,
> +			struct vm_area_struct *vma)
> +{
> +	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
> +
> +	/* one page only, and the offset must name a known interrupt */
> +	if (vma_pages(vma) != 1 || !ocxl_afu_irq_get_addr(ctx, offset))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> static int check_mmap_mmio(struct ocxl_context *ctx,
> struct vm_area_struct *vma)
> {
> @@ -146,7 +177,10 @@ int ocxl_context_mmap(struct ocxl_context *ctx, struct vm_area_struct *vma)
> {
> int rc;
>
> - rc = check_mmap_mmio(ctx, vma);
> + if ((vma->vm_pgoff << PAGE_SHIFT) < ctx->afu->irq_base_offset)
> + rc = check_mmap_mmio(ctx, vma);
> + else
> + rc = check_mmap_afu_irq(ctx, vma);
> if (rc)
> return rc;
>
> @@ -231,6 +265,8 @@ void ocxl_context_free(struct ocxl_context *ctx)
> idr_remove(&ctx->afu->contexts_idr, ctx->pasid);
> mutex_unlock(&ctx->afu->contexts_lock);
>
> + ocxl_afu_irq_free_all(ctx);
> + idr_destroy(&ctx->irq_idr);
> /* reference to the AFU taken in ocxl_context_init */
> ocxl_afu_put(ctx->afu);
> kfree(ctx);
> diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
> index a51386eff4f5..0a73e2c11ba6 100644
> --- a/drivers/misc/ocxl/file.c
> +++ b/drivers/misc/ocxl/file.c
> @@ -110,12 +110,17 @@ static long afu_ioctl_attach(struct ocxl_context *ctx,
> }
>
> #define CMD_STR(x) (x == OCXL_IOCTL_ATTACH ? "ATTACH" : \
> + x == OCXL_IOCTL_IRQ_ALLOC ? "IRQ_ALLOC" : \
> + x == OCXL_IOCTL_IRQ_FREE ? "IRQ_FREE" : \
> + x == OCXL_IOCTL_IRQ_SET_FD ? "IRQ_SET_FD" : \
> "UNKNOWN")
>
> static long afu_ioctl(struct file *file, unsigned int cmd,
> unsigned long args)
> {
> struct ocxl_context *ctx = file->private_data;
> + struct ocxl_ioctl_irq_fd irq_fd;
> + u64 irq_offset;
> long rc;
>
> pr_debug("%s for context %d, command %s\n", __func__, ctx->pasid,
> @@ -130,6 +135,34 @@ static long afu_ioctl(struct file *file, unsigned int cmd,
> (struct ocxl_ioctl_attach __user *) args);
> break;
>
> + case OCXL_IOCTL_IRQ_ALLOC:
> + rc = ocxl_afu_irq_alloc(ctx, &irq_offset);
> + if (!rc) {
> + rc = copy_to_user((u64 *) args, &irq_offset,
> + sizeof(irq_offset));
> + if (rc)
> + ocxl_afu_irq_free(ctx, irq_offset);
> + }
> + break;
> +
> + case OCXL_IOCTL_IRQ_FREE:
> + rc = copy_from_user(&irq_offset, (u64 *) args,
> + sizeof(irq_offset));
> + if (rc)
> + return -EFAULT;
> + rc = ocxl_afu_irq_free(ctx, irq_offset);
> + break;
> +
> + case OCXL_IOCTL_IRQ_SET_FD:
> + rc = copy_from_user(&irq_fd, (u64 *) args, sizeof(irq_fd));
> + if (rc)
> + return -EFAULT;
> + if (irq_fd.reserved)
> + return -EINVAL;
> + rc = ocxl_afu_irq_set_fd(ctx, irq_fd.irq_offset,
> + irq_fd.eventfd);
> + break;
> +
> default:
> rc = -EINVAL;
> }
> diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
> index 6b184cd7d2a6..5f12564eea99 100644
> --- a/drivers/misc/ocxl/link.c
> +++ b/drivers/misc/ocxl/link.c
> @@ -608,3 +608,31 @@ int ocxl_link_remove_pe(void *link_handle, int pasid)
> mutex_unlock(&spa->spa_lock);
> return rc;
> }
> +
> +/*
> + * Allocate a hardware interrupt on a link.
> + *
> + * The number of interrupts available on the link is bounded by its
> + * irq_available counter. On success, return 0 and store the hardware
> + * interrupt number in *hw_irq and its trigger page address in
> + * *trigger_addr. Return -ENOSPC if the link has no interrupt left, or
> + * the error reported by the platform code.
> + */
> +int ocxl_link_irq_alloc(void *link_handle, int *hw_irq, u64 *trigger_addr)
> +{
> +	struct link *link = (struct link *) link_handle;
> +	u32 irq;	/* matches the u32 * expected by the platform code */
> +	u64 addr;
> +	int rc;
> +
> +	if (atomic_dec_if_positive(&link->irq_available) < 0)
> +		return -ENOSPC;
> +
> +	rc = pnv_ocxl_alloc_xive_irq(&irq, &addr);
> +	if (rc) {
> +		/* give back the slot reserved above */
> +		atomic_inc(&link->irq_available);
> +		return rc;
> +	}
> +
> +	*hw_irq = irq;
> +	*trigger_addr = addr;
> +	return 0;
> +}
> +
> +/*
> + * Release a hardware interrupt obtained from ocxl_link_irq_alloc() and
> + * return its slot to the link's irq_available counter.
> + */
> +void ocxl_link_free_irq(void *link_handle, int hw_irq)
> +{
> + struct link *link = (struct link *) link_handle;
> +
> + pnv_ocxl_free_xive_irq(hw_irq);
> + atomic_inc(&link->irq_available);
> +}
> diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h
> index e07f7d523275..829369c5f004 100644
> --- a/drivers/misc/ocxl/ocxl_internal.h
> +++ b/drivers/misc/ocxl/ocxl_internal.h
> @@ -197,4 +197,11 @@ extern void ocxl_context_free(struct ocxl_context *ctx);
> extern int ocxl_sysfs_add_afu(struct ocxl_afu *afu);
> extern void ocxl_sysfs_remove_afu(struct ocxl_afu *afu);
>
> +extern int ocxl_afu_irq_alloc(struct ocxl_context *ctx, u64 *irq_offset);
> +extern int ocxl_afu_irq_free(struct ocxl_context *ctx, u64 irq_offset);
> +extern void ocxl_afu_irq_free_all(struct ocxl_context *ctx);
> +extern int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset,
> + int eventfd);
> +extern u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 irq_offset);
> +
> #endif /* _OCXL_INTERNAL_H_ */
> diff --git a/include/uapi/misc/ocxl.h b/include/uapi/misc/ocxl.h
> index 71fa387f2efd..488e75228c33 100644
> --- a/include/uapi/misc/ocxl.h
> +++ b/include/uapi/misc/ocxl.h
> @@ -39,9 +39,18 @@ struct ocxl_ioctl_attach {
> __u64 reserved3;
> };
>
> +/*
> + * Argument of OCXL_IOCTL_IRQ_SET_FD.
> + *
> + * irq_offset: offset of the AFU interrupt, as returned by
> + *             OCXL_IOCTL_IRQ_ALLOC
> + * eventfd:    file descriptor of the eventfd to attach to the interrupt
> + * reserved:   must be zero, the ioctl fails with EINVAL otherwise
> + */
> +struct ocxl_ioctl_irq_fd {
> + __u64 irq_offset;
> + __s32 eventfd;
> + __u32 reserved;
> +};
> +
> /* ioctl numbers */
> #define OCXL_MAGIC 0xCA
> /* AFU devices */
> #define OCXL_IOCTL_ATTACH _IOW(OCXL_MAGIC, 0x10, struct ocxl_ioctl_attach)
> +#define OCXL_IOCTL_IRQ_ALLOC _IOR(OCXL_MAGIC, 0x11, __u64)
> +#define OCXL_IOCTL_IRQ_FREE _IOW(OCXL_MAGIC, 0x12, __u64)
> +#define OCXL_IOCTL_IRQ_SET_FD _IOW(OCXL_MAGIC, 0x13, struct ocxl_ioctl_irq_fd)
>
> #endif /* _UAPI_MISC_OCXL_H */