Re: [PATCH 2/2] PCI: hv: Support for Hyper-V vPCI for ARM64

From: Marc Zyngier
Date: Mon Sep 13 2021 - 15:03:06 EST


Sunil,

On Mon, 13 Sep 2021 18:37:22 +0100,
Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx> wrote:
>
> This patch adds support for Hyper-V vPCI by adding a PCI MSI
> IRQ domain specific to Hyper-V that is based on SPIs. The IRQ
> domain parents itself to the arch GIC IRQ domain for basic
> vector management.

Given that we literally spent *weeks* discussing this, I would have
appreciated it if you had Cc'd me directly as a basic courtesy, rather
than leaving me to spot it on the list.

>
> Signed-off-by: Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx>
> ---
> arch/arm64/hyperv/Makefile | 2 +-
> arch/arm64/hyperv/hv_pci.c | 275 +++++++++++++++++++++++++++
> arch/arm64/include/asm/hyperv-tlfs.h | 9 +
> arch/arm64/include/asm/mshyperv.h | 26 +++
> drivers/pci/Kconfig | 2 +-
> drivers/pci/controller/Kconfig | 2 +-
> drivers/pci/controller/pci-hyperv.c | 5 +
> 7 files changed, 318 insertions(+), 3 deletions(-)
> create mode 100644 arch/arm64/hyperv/hv_pci.c
>
> diff --git a/arch/arm64/hyperv/Makefile b/arch/arm64/hyperv/Makefile
> index 87c31c001da9..af7a66e43ef4 100644
> --- a/arch/arm64/hyperv/Makefile
> +++ b/arch/arm64/hyperv/Makefile
> @@ -1,2 +1,2 @@
> # SPDX-License-Identifier: GPL-2.0
> -obj-y := hv_core.o mshyperv.o
> +obj-y := hv_core.o mshyperv.o hv_pci.o
> diff --git a/arch/arm64/hyperv/hv_pci.c b/arch/arm64/hyperv/hv_pci.c
> new file mode 100644
> index 000000000000..06179e4a6a2d
> --- /dev/null
> +++ b/arch/arm64/hyperv/hv_pci.c

Nit: this is definitely the wrong location. There isn't anything arm64
specific here that warrants hiding it away. Like most other bizarre
MSI implementations, it should live either in drivers/pci or in
drivers/irqchip.

> @@ -0,0 +1,275 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Architecture specific vector management for the Hyper-V vPCI.
> + *
> + * Copyright (C) 2018, Microsoft, Inc.
> + *
> + * Author : Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published
> + * by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for more
> + * details.

What is the point of this if you have the SPDX tag?

> + */
> +
> +#include <asm/mshyperv.h>
> +#include <linux/acpi.h>
> +#include <linux/irqdomain.h>
> +#include <linux/irq.h>
> +#include <acpi/acpi_bus.h>
> +
> +/*
> + * SPI vectors to use for vPCI; arch SPIs range is [32, 1019], but leaving a bit
> + * of room at the start to allow for SPIs to be specified through ACPI.
> + */
> +#define HV_PCI_MSI_SPI_START 50

If that's the start, it has a good chance of being the wrong
start. Given that the HyperV PCI controller advertises Multi-MSI
support, INTID 50 cannot be used for any device that requires more
than 2 vectors: a Multi-MSI block of N vectors must start on an
N-aligned base, and 50 is only 2-aligned.

> +#define HV_PCI_MSI_SPI_NR (1020 - HV_PCI_MSI_SPI_START)
> +
> +struct hv_pci_chip_data {
> + spinlock_t lock;

Why a spinlock? Either this can be used in interrupt context, and we
require a raw_spinlock_t instead, or it is never used in interrupt
context and should be a good old mutex.

> + DECLARE_BITMAP(bm, HV_PCI_MSI_SPI_NR);
> +};
> +
> +/* Hyper-V vPCI MSI GIC IRQ domain */
> +static struct irq_domain *hv_msi_gic_irq_domain;
> +
> +static struct irq_chip hv_msi_irq_chip = {
> + .name = "Hyper-V ARM64 PCI MSI",

That's a mouthful! How about "MSI" instead?

> + .irq_set_affinity = irq_chip_set_affinity_parent,
> + .irq_eoi = irq_chip_eoi_parent,
> + .irq_mask = irq_chip_mask_parent,
> + .irq_unmask = irq_chip_unmask_parent
> +};
> +
> +/**
> + * Frees the specified number of interrupts.
> + * @domain: The IRQ domain
> + * @virq: The virtual IRQ number.
> + * @nr_irqs: Number of IRQ's to free.
> + */
> +static void hv_pci_vec_irq_domain_free(struct irq_domain *domain,
> + unsigned int virq, unsigned int nr_irqs)
> +{
> + struct hv_pci_chip_data *chip_data = domain->host_data;
> + unsigned long flags;
> + unsigned int i;
> +
> + for (i = 0; i < nr_irqs; i++) {
> + struct irq_data *irqd = irq_domain_get_irq_data(domain,
> + virq + i);
> +
> + spin_lock_irqsave(&chip_data->lock, flags);
> + clear_bit(irqd->hwirq - HV_PCI_MSI_SPI_START, chip_data->bm);
> + spin_unlock_irqrestore(&chip_data->lock, flags);

Really? Why should you disable interrupts here? Why do you need to
lock/unlock on each iteration of this loop?

> + irq_domain_reset_irq_data(irqd);
> + }
> +
> + irq_domain_free_irqs_parent(domain, virq, nr_irqs);
> +}
> +
> +/**
> + * Allocate an interrupt from the domain.
> + * @hwirq: Will be set to the allocated H/W IRQ.
> + *
> + * Return: 0 on success and error value on failure.
> + */
> +static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain,
> + unsigned int virq, irq_hw_number_t *hwirq)
> +{
> + struct hv_pci_chip_data *chip_data = domain->host_data;
> + unsigned long flags;
> + unsigned int index;
> +
> + spin_lock_irqsave(&chip_data->lock, flags);
> + index = find_first_zero_bit(chip_data->bm, HV_PCI_MSI_SPI_NR);
> + if (index == HV_PCI_MSI_SPI_NR) {
> + spin_unlock_irqrestore(&chip_data->lock, flags);
> + pr_err("No more free IRQ vector available\n");

No, we don't shout because we're out of MSIs. It happens, and drivers
can nicely use fewer vectors if needed.

But more importantly, this totally breaks MultiMSI, see below.

> + return -ENOSPC;
> + }
> +
> + set_bit(index, chip_data->bm);
> + spin_unlock_irqrestore(&chip_data->lock, flags);
> + *hwirq = index + HV_PCI_MSI_SPI_START;
> +
> + return 0;
> +}
> +
> +/**
> + * Allocate an interrupt from the parent GIC domain.
> + * @domain: The IRQ domain.
> + * @virq: The virtual IRQ number.
> + * @hwirq: The H/W IRQ number that needs to be allocated.
> + *
> + * Return: 0 on success and error value on failure.
> + */
> +static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain,
> + unsigned int virq,
> + irq_hw_number_t hwirq)
> +{
> + struct irq_fwspec fwspec;
> +
> + fwspec.fwnode = domain->parent->fwnode;
> + fwspec.param_count = 2;
> + fwspec.param[0] = hwirq;
> + fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
> +
> + return irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
> +}
> +
> +/**
> + * Allocate specified number of interrupts from the domain.
> + * @domain: The IRQ domain.
> + * @virq: The starting virtual IRQ number.
> + * @nr_irqs: Number of IRQ's to allocate.
> + * @args: The MSI alloc information.
> + *
> + * Return: 0 on success and error value on failure.
> + */
> +static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain,
> + unsigned int virq, unsigned int nr_irqs,
> + void *args)
> +{
> + irq_hw_number_t hwirq;
> + unsigned int i;
> + int ret;
> +
> + for (i = 0; i < nr_irqs; i++) {
> + ret = hv_pci_vec_alloc_device_irq(domain, virq, &hwirq);
> + if (ret)
> + goto free_irq;
> +
> + ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i, hwirq);

Please read the specification for PCI MultiMSI. You offer none of the
alignment and contiguity guarantees that are required.

> + if (ret)
> + goto free_irq;
> +
> + ret = irq_domain_set_hwirq_and_chip(domain, virq + i,
> + hwirq, &hv_msi_irq_chip,
> + domain->host_data);
> + if (ret)
> + goto free_irq;
> +
> + irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(virq + i)));

Why? The GIC is responsible for the distribution, not the MSI layer.
This looks completely bogus.

> + pr_debug("pID:%d vID:%u\n", (int)hwirq, virq + i);
> + }
> +
> + return 0;
> +
> +free_irq:
> + if (i > 0)
> + hv_pci_vec_irq_domain_free(domain, virq, i - 1);
> +
> + return ret;
> +}
> +
> +/**
> + * Activate the interrupt.
> + * @domain: The IRQ domain.
> + * @irqd: IRQ data.
> + * @reserve: Indicates whether the IRQ's can be reserved.
> + *
> + * Return: 0 on success and error value on failure.
> + */
> +static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain,
> + struct irq_data *irqd, bool reserve)
> +{
> + /* All available online CPUs are available for targeting */
> + irq_data_update_effective_affinity(irqd, cpu_online_mask);

Which completely contradicts what you have written above, and doesn't
match what the GIC does either.

> + return 0;
> +}
> +
> +static const struct irq_domain_ops hv_pci_domain_ops = {
> + .alloc = hv_pci_vec_irq_domain_alloc,
> + .free = hv_pci_vec_irq_domain_free,
> + .activate = hv_pci_vec_irq_domain_activate,
> +};
> +
> +
> +/**
> + * This routine performs the architecture specific initialization for vector
> + * domain to operate. It allocates an IRQ domain tree as a child of the GIC
> + * IRQ domain.
> + *
> + * Return: 0 on success and error value on failure.
> + */
> +int hv_pci_vector_init(void)

Why isn't this static?

> +{
> + static struct hv_pci_chip_data *chip_data;
> + struct fwnode_handle *fn = NULL;
> + int ret = -ENOMEM;
> +
> + chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL);
> + if (!chip_data)
> + return ret;
> +
> + spin_lock_init(&chip_data->lock);
> + fn = irq_domain_alloc_named_fwnode("Hyper-V ARM64 vPCI");
> + if (!fn)
> + goto free_chip;
> +
> + hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR,
> + fn, &hv_pci_domain_ops, chip_data);
> +
> + if (!hv_msi_gic_irq_domain) {
> + pr_err("Failed to create Hyper-V ARMV vPCI MSI IRQ domain\n");
> + goto free_chip;
> + }
> +
> + return 0;
> +
> +free_chip:
> + kfree(chip_data);
> + if (fn)
> + irq_domain_free_fwnode(fn);
> +
> + return ret;
> +}
> +
> +/* This routine performs the cleanup for the IRQ domain. */
> +void hv_pci_vector_free(void)

Why isn't this static?

> +{
> + static struct hv_pci_chip_data *chip_data;
> +
> + if (!hv_msi_gic_irq_domain)
> + return;
> +
> + /* Host data cannot be null if the domain was created successfully */
> + chip_data = hv_msi_gic_irq_domain->host_data;
> + irq_domain_remove(hv_msi_gic_irq_domain);
> + hv_msi_gic_irq_domain = NULL;
> + kfree(chip_data);
> +}
> +
> +/* Performs the architecture specific initialization for Hyper-V vPCI. */
> +int hv_pci_arch_init(void)
> +{
> + return hv_pci_vector_init();
> +}
> +EXPORT_SYMBOL_GPL(hv_pci_arch_init);
> +
> +/* Architecture specific cleanup for Hyper-V vPCI. */
> +void hv_pci_arch_free(void)
> +{
> + hv_pci_vector_free();
> +}
> +EXPORT_SYMBOL_GPL(hv_pci_arch_free);
> +
> +struct irq_domain *hv_msi_parent_vector_domain(void)
> +{
> + return hv_msi_gic_irq_domain;
> +}
> +EXPORT_SYMBOL_GPL(hv_msi_parent_vector_domain);
> +
> +unsigned int hv_msi_get_int_vector(struct irq_data *irqd)
> +{
> + irqd = irq_domain_get_irq_data(hv_msi_gic_irq_domain, irqd->irq);
> +
> + return irqd->hwirq;
> +}
> +EXPORT_SYMBOL_GPL(hv_msi_get_int_vector);

I fail to understand why this is all exported instead of being part of
the HyperV PCI module.

> diff --git a/arch/arm64/include/asm/hyperv-tlfs.h b/arch/arm64/include/asm/hyperv-tlfs.h
> index 4d964a7f02ee..bc6c7ac934a1 100644
> --- a/arch/arm64/include/asm/hyperv-tlfs.h
> +++ b/arch/arm64/include/asm/hyperv-tlfs.h
> @@ -64,6 +64,15 @@
> #define HV_REGISTER_STIMER0_CONFIG 0x000B0000
> #define HV_REGISTER_STIMER0_COUNT 0x000B0001
>
> +union hv_msi_entry {
> + u64 as_uint64[2];
> + struct {
> + u64 address;
> + u32 data;
> + u32 reserved;
> + } __packed;
> +};
> +
> #include <asm-generic/hyperv-tlfs.h>
>
> #endif
> diff --git a/arch/arm64/include/asm/mshyperv.h b/arch/arm64/include/asm/mshyperv.h
> index 20070a847304..68bc1617707b 100644
> --- a/arch/arm64/include/asm/mshyperv.h
> +++ b/arch/arm64/include/asm/mshyperv.h
> @@ -20,6 +20,8 @@
>
> #include <linux/types.h>
> #include <linux/arm-smccc.h>
> +#include <linux/interrupt.h>
> +#include <linux/msi.h>
> #include <asm/hyperv-tlfs.h>
>
> /*
> @@ -49,6 +51,30 @@ static inline u64 hv_get_register(unsigned int reg)
> ARM_SMCCC_OWNER_VENDOR_HYP, \
> HV_SMCCC_FUNC_NUMBER)
>
> +#define hv_msi_handler NULL
> +#define hv_msi_handler_name NULL
> +#define hv_msi_irq_delivery_mode 0
> +#define hv_msi_prepare NULL
> +
> +int hv_pci_arch_init(void);
> +void hv_pci_arch_free(void);
> +struct irq_domain *hv_msi_parent_vector_domain(void);
> +unsigned int hv_msi_get_int_vector(struct irq_data *data);
> +static inline irq_hw_number_t
> +hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
> + msi_alloc_info_t *arg)
> +{
> + return arg->hwirq;
> +}
> +
> +static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
> + struct msi_desc *msi_desc)
> +{
> + msi_entry->address = ((u64)msi_desc->msg.address_hi << 32) |
> + msi_desc->msg.address_lo;
> + msi_entry->data = msi_desc->msg.data;
> +}

Why do we need any of this? Why inline? Please explain what you are
trying to achieve here.

Thanks,

M.

--
Without deviation from the norm, progress is not possible.