Re: [PATCH kernel v4 11/11] powerpc/powernv/npu: Enable NVLink pass through

From: Alistair Popple
Date: Thu May 05 2016 - 21:03:49 EST


On Thu, 5 May 2016 15:49:18 Alexey Kardashevskiy wrote:
> On 05/04/2016 12:08 AM, Alistair Popple wrote:
> > Hi Alexey,
> >
> > On Fri, 29 Apr 2016 18:55:24 Alexey Kardashevskiy wrote:
> >> IBM POWER8 NVlink systems come with Tesla K40-ish GPUs each of which
> >> also has a couple of fast speed links (NVLink). The interface to links
> >> is exposed as an emulated PCI bridge which is included into the same
> >> IOMMU group as the corresponding GPU.
> >>
> >> In the kernel, NPUs get a separate PHB of the PNV_PHB_NPU type and a PE
> >> which behave pretty much as the standard IODA2 PHB except NPU PHB has
> >> just a single TVE in the hardware which means it can have either
> >> 32bit window or 64bit window or DMA bypass but never two of these.
> >>
> >> In order to make these links work when GPU is passed to the guest,
> >> these bridges need to be passed as well; otherwise performance will
> >> degrade.
> >>
> >> This implements and exports API to manage NPU state in regard to VFIO;
> >> it replicates iommu_table_group_ops.
> >>
> >> This defines a new pnv_pci_ioda2_npu_ops which is assigned to
> >> the IODA2 bridge if there are NPUs for a GPU on the bridge.
> >> The new callbacks call the default IODA2 callbacks plus new NPU API.
> >> This adds a gpe_table_group_to_npe() helper to find NPU PE for the IODA2
> >> table_group, it is not expected to fail as the helper is only called
> >> from the pnv_pci_ioda2_npu_ops.
> >>
> >> This does not define NPU-specific .release_ownership() so after
> >> VFIO is finished, DMA on NPU is disabled which is ok as the nvidia
> >> driver sets DMA mask when probing which enables 32 or 64bit DMA on NPU.
> >>
> >> This adds a pnv_pci_npu_setup_iommu() helper which adds NPUs to
> >> the GPU group if any found. The helper uses helpers to look for
> >> the "ibm,gpu" property in the device tree which is a phandle of
> >> the corresponding GPU.
> >>
> >> This adds an additional loop over PEs in pnv_ioda_setup_dma() as the main
> >> loop skips NPU PEs as they do not have 32bit DMA segments.
> >>
> >> As pnv_npu_set_window() and pnv_npu_unset_window() are now being used
> >> by the new IODA2-NPU IOMMU group, this makes the helpers public and
> >> adds the DMA window number parameter.
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>
> >> ---
> >> Changes:
> >> v4:
> >> * reused pnv_npu_set_window/pnv_npu_unset_window where possible
> >> * added comments, changed commit log
> >>
> >> v3:
> >> * moved NPU-to-GPU IOMMU grouping later after all PHBs are discovered
> >> * removed hack to make iommu_add_device() work, iommu_group_add_device()
> >> is used instead
> >> * cleanup in gpe_table_group_to_npe_cb()
> >>
> >> v2:
> >> * reimplemented to support NPU + GPU in the same group
> >> * merged "powerpc/powernv/npu: Add NPU devices to IOMMU group" and
> >> "powerpc/powernv/npu: Enable passing through via VFIO" into this patch
> >> ---
> >> arch/powerpc/platforms/powernv/npu-dma.c | 64 +++++++++++++++++--
> >> arch/powerpc/platforms/powernv/pci-ioda.c | 102
> > ++++++++++++++++++++++++++++++
> >> arch/powerpc/platforms/powernv/pci.h | 6 ++
> >> 3 files changed, 166 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c
> > b/arch/powerpc/platforms/powernv/npu-dma.c
> >> index cb2d1da..0459e10 100644
> >> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> >> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> >> @@ -12,6 +12,7 @@
> >> #include <linux/export.h>
> >> #include <linux/pci.h>
> >> #include <linux/memblock.h>
> >> +#include <linux/iommu.h>
> >>
> >> #include <asm/iommu.h>
> >> #include <asm/pnv-pci.h>
> >> @@ -154,7 +155,7 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct
> > pnv_ioda_pe *npe,
> >> return pe;
> >> }
> >>
> >> -static long pnv_npu_set_window(struct pnv_ioda_pe *npe,
> >> +long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
> >> struct iommu_table *tbl)
> >> {
> >> struct pnv_phb *phb = npe->phb;
> >> @@ -182,13 +183,13 @@ static long pnv_npu_set_window(struct pnv_ioda_pe
> > *npe,
> >> pnv_pci_ioda2_tce_invalidate_entire(phb, false);
> >>
> >> /* Add the table to the list so its TCE cache will get invalidated */
> >> - pnv_pci_link_table_and_group(phb->hose->node, 0,
> >> + pnv_pci_link_table_and_group(phb->hose->node, num,
> >> tbl, &npe->table_group);
> >>
> >> return 0;
> >> }
> >>
> >> -static long pnv_npu_unset_window(struct pnv_ioda_pe *npe)
> >> +long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
> >> {
> >> struct pnv_phb *phb = npe->phb;
> >> int64_t rc;
> >> @@ -205,7 +206,7 @@ static long pnv_npu_unset_window(struct pnv_ioda_pe
> > *npe)
> >> }
> >> pnv_pci_ioda2_tce_invalidate_entire(phb, false);
> >>
> >> - pnv_pci_unlink_table_and_group(npe->table_group.tables[0],
> >> + pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
> >> &npe->table_group);
> >>
> >> return 0;
> >> @@ -231,7 +232,7 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
> >> if (!gpe)
> >> return;
> >>
> >> - rc = pnv_npu_set_window(npe, gpe->table_group.tables[0]);
> >> + rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
> >>
> >> /*
> >> * We don't initialise npu_pe->tce32_table as we always use
> >> @@ -255,7 +256,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe
> > *npe)
> >> if (phb->type != PNV_PHB_NPU || !npe->pdev)
> >> return -EINVAL;
> >>
> >> - rc = pnv_npu_unset_window(npe);
> >> + rc = pnv_npu_unset_window(npe, 0);
> >> if (rc != OPAL_SUCCESS)
> >> return rc;
> >>
> >> @@ -307,3 +308,54 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev,
> > bool bypass)
> >> }
> >> }
> >> }
> >> +
> >> +/* Switch ownership from platform code to external user (e.g. VFIO) */
> >> +void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
> >> +{
> >> + struct pnv_phb *phb = npe->phb;
> >> + int64_t rc;
> >> +
> >> + /*
> >> + * Note: NPU has just a single TVE in the hardware which means that
> >> + * while used by the kernel, it can have either 32bit window or
> >> + * DMA bypass but never both. So we deconfigure 32bit window only
> >> + * if it was enabled at the moment of ownership change.
> >> + */
> >> + if (npe->table_group.tables[0]) {
> >> + pnv_npu_unset_window(npe, 0);
> >> + return;
> >> + }
> >> +
> >> + /* Disable bypass */
> >> + rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
> >> + npe->pe_number, npe->pe_number,
> >> + 0 /* bypass base */, 0);
> >> + if (rc) {
> >> + pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
> >> + return;
> >> + }
> >> + pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
> >> +}
> >> +
> >> +struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
> >> +{
> >> + struct pnv_phb *phb = npe->phb;
> >> + struct pci_bus *pbus = phb->hose->bus;
> >> + struct pci_dev *npdev, *gpdev = NULL, *gptmp;
> >> + struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
> >> +
> >> + if (!gpe || !gpdev)
> >> + return NULL;
> >> +
> >> + list_for_each_entry(npdev, &pbus->devices, bus_list) {
> >> + gptmp = pnv_pci_get_gpu_dev(npdev);
> >> +
> >> + if (gptmp != gpdev)
> >> + continue;
> >
> > If I'm not mistaken it looks like you are trying to find all the NPU PEs also
> > attached to the same GPU on the same (firmware emulated) NPU PHB? I'm just
> > curious - does this work if the GPU has say 2 NPU PE#s (i.e. links) on
> > different NPU PHBs?
> >
> >> + pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
> >> + iommu_group_add_device(gpe->table_group.group, &npdev->dev);
> >> + }
> >> +
> >> + return gpe;
> >> +}
> >> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
> > b/arch/powerpc/platforms/powernv/pci-ioda.c
> >> index 922c74c..feabe50 100644
> >> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> >> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> >> @@ -2273,6 +2273,90 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops
> > = {
> >> .take_ownership = pnv_ioda2_take_ownership,
> >> .release_ownership = pnv_ioda2_release_ownership,
> >> };
> >> +
> >> +static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
> >> +{
> >> + struct pci_controller *hose;
> >> + struct pnv_phb *phb;
> >> + struct pnv_ioda_pe **ptmppe = opaque;
> >> + struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
> >> + struct pci_dn *pdn = pci_get_pdn(pdev);
> >> +
> >> + if (!pdn || pdn->pe_number == IODA_INVALID_PE)
> >> + return 0;
> >> +
> >> + hose = pci_bus_to_host(pdev->bus);
> >> + phb = hose->private_data;
> >> + if (phb->type != PNV_PHB_NPU)
> >> + return 0;
> >> +
> >> + *ptmppe = &phb->ioda.pe_array[pdn->pe_number];
> >> +
> >> + return 1;
> >> +}
> >> +
> >> +/*
> >> + * This returns PE of associated NPU.
> >> + * This assumes that NPU is in the same IOMMU group with GPU and there is
> >> + * no other PEs.
> >> + */
> >
> > Which answers my above question as this doesn't look like it supports GPUs
> > with multiple NPU PE#s attached. I don't think this is actually a problem
> > though as no hardware I know of will ever do this, even though theoretically
> > it's possible.
>
>
> I believe when such a situation happens in the future, it will be different
> PVR, PHB4 (or 5 or whatever) and IODA3 (or 4, or...).

In *theory* it could still happen on PHB3/IODA2.

> I could write code in assumption that there can be more NPU PEs per one GPU
> PE but it does not make much sense (to me) to design the hardware this way
> and when/if it will be designed this way, the details most likely will
> differ from what I can imagine today.

I completely agree. No point reworking it and adding complexity for a situation
that most likely will never exist. I just wanted to get an understanding of any
restrictions in case something does change in the future.

> >
> > It might be nice if we could warn if this configuration is detected but not
> > really a big issue.
> >
> > Everything else looks reasonable as far as I can tell
> > (although again I'm no vfio iommu groups expert) so happy for you to add my
> > reviewed-by:
> >
> > Reviewed-By: Alistair Popple <alistair@xxxxxxxxxxxx>
>
>
> Thanks!
>
> >
> >> +static struct pnv_ioda_pe *gpe_table_group_to_npe(
> >> + struct iommu_table_group *table_group)
> >> +{
> >> + struct pnv_ioda_pe *npe = NULL;
> >> + int ret = iommu_group_for_each_dev(table_group->group, &npe,
> >> + gpe_table_group_to_npe_cb);
> >> +
> >> + BUG_ON(!ret || !npe);
> >> +
> >> + return npe;
> >> +}
> >> +
> >> +static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group
> > *table_group,
> >> + int num, struct iommu_table *tbl)
> >> +{
> >> + long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
> >> +
> >> + if (ret)
> >> + return ret;
> >> +
> >> + ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num,
> > tbl);
> >> + if (ret)
> >> + pnv_pci_ioda2_unset_window(table_group, num);
> >> +
> >> + return ret;
> >> +}
> >> +
> >> +static long pnv_pci_ioda2_npu_unset_window(
> >> + struct iommu_table_group *table_group,
> >> + int num)
> >> +{
> >> + long ret = pnv_pci_ioda2_unset_window(table_group, num);
> >> +
> >> + if (ret)
> >> + return ret;
> >> +
> >> + return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num);
> >> +}
> >> +
> >> +static void pnv_ioda2_npu_take_ownership(struct iommu_table_group
> > *table_group)
> >> +{
> >> + /*
> >> + * Detach NPU first as pnv_ioda2_take_ownership() will destroy
> >> + * the iommu_table if 32bit DMA is enabled.
> >> + */
> >> + pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
> >> + pnv_ioda2_take_ownership(table_group);
> >> +}
> >> +
> >> +static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
> >> + .get_table_size = pnv_pci_ioda2_get_table_size,
> >> + .create_table = pnv_pci_ioda2_create_table,
> >> + .set_window = pnv_pci_ioda2_npu_set_window,
> >> + .unset_window = pnv_pci_ioda2_npu_unset_window,
> >> + .take_ownership = pnv_ioda2_npu_take_ownership,
> >> + .release_ownership = pnv_ioda2_release_ownership,
> >> +};
> >> #endif
> >>
> >> static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
> >> @@ -3012,6 +3096,7 @@ static void pnv_pci_ioda_setup_DMA(void)
> >> {
> >> struct pci_controller *hose, *tmp;
> >> struct pnv_phb *phb;
> >> + struct pnv_ioda_pe *pe, *gpe;
> >>
> >> list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
> >> pnv_ioda_setup_dma(hose->private_data);
> >> @@ -3020,6 +3105,23 @@ static void pnv_pci_ioda_setup_DMA(void)
> >> phb = hose->private_data;
> >> phb->initialized = 1;
> >> }
> >> +
> >> + /*
> >> + * Now we have all PHBs discovered, time to add NPU devices to
> >> + * the corresponding IOMMU groups.
> >> + */
> >> + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
> >> + phb = hose->private_data;
> >> +
> >> + if (phb->type != PNV_PHB_NPU)
> >> + continue;
> >> +
> >> + list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
> >> + gpe = pnv_pci_npu_setup_iommu(pe);
> >> + if (gpe)
> >> + gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
> >> + }
> >> + }
> >> }
> >>
> >> static void pnv_pci_ioda_create_dbgfs(void)
> >> diff --git a/arch/powerpc/platforms/powernv/pci.h
> > b/arch/powerpc/platforms/powernv/pci.h
> >> index e117bd8..ebc6ee4 100644
> >> --- a/arch/powerpc/platforms/powernv/pci.h
> >> +++ b/arch/powerpc/platforms/powernv/pci.h
> >> @@ -244,5 +244,11 @@ extern void pe_level_printk(const struct pnv_ioda_pe
> > *pe, const char *level,
> >> /* Nvlink functions */
> >> extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
> >> extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool
> > rm);
> >> +extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe
> > *npe);
> >> +extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
> >> + struct iommu_table *tbl);
> >> +extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
> >> +extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
> >> +extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
> >>
> >> #endif /* __POWERNV_PCI_H */
> >>
> >
>
>
>