Re: [RFC PATCH] perf/smmuv3: Fix shared interrupt handling

From: Tuan Phan
Date: Thu Apr 30 2020 - 18:11:14 EST


Hi Robin,

> On Apr 8, 2020, at 9:49 AM, Robin Murphy <robin.murphy@xxxxxxx> wrote:
>
> IRQF_SHARED is dangerous, since it allows other agents to retarget the
> IRQ's affinity without migrating PMU contexts to match, breaking the way
> in which perf manages mutual exclusion for accessing events. Although
> this means it's not realistically possible to support PMU IRQs being
> shared with other drivers, we *can* handle sharing between multiple PMU
> instances with some explicit affinity bookkeeping and manual interrupt
> multiplexing.
>
> RCU helps us handle interrupts efficiently without having to worry about
> fine-grained locking for relatively-theoretical race conditions with the
> probe/remove/CPU hotplug slow paths. The resulting machinery ends up
> looking largely generic, so it should be feasible to factor out with a
> "system PMU" base class for similar multi-instance drivers.
>
> Signed-off-by: Robin Murphy <robin.murphy@xxxxxxx>
> ---
>
> RFC because I don't have the means to test it, and if the general
> approach passes muster then I'd want to tackle the aforementioned
> factoring-out before merging anything anyway.
>
> Robin.
>
>
> drivers/perf/arm_smmuv3_pmu.c | 215 ++++++++++++++++++++++++----------
> 1 file changed, 152 insertions(+), 63 deletions(-)
>
> diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
> index d704eccc548f..8daa7ac6e801 100644
> --- a/drivers/perf/arm_smmuv3_pmu.c
> +++ b/drivers/perf/arm_smmuv3_pmu.c
> @@ -47,8 +47,11 @@
> #include <linux/kernel.h>
> #include <linux/list.h>
> #include <linux/msi.h>
> +#include <linux/mutex.h>
> #include <linux/perf_event.h>
> #include <linux/platform_device.h>
> +#include <linux/rculist.h>
> +#include <linux/refcount.h>
> #include <linux/smp.h>
> #include <linux/sysfs.h>
> #include <linux/types.h>
> @@ -98,13 +101,24 @@
>
> static int cpuhp_state_num;
>
> -struct smmu_pmu {
> +static LIST_HEAD(smmu_pmu_affinities);
> +static DEFINE_MUTEX(smmu_pmu_affinity_lock);
> +
> +struct smmu_pmu_affinity {
> struct hlist_node node;
> + struct list_head affinity_list;
> + struct list_head instance_list;
> + refcount_t refcount;
> + unsigned int irq;
> + unsigned int cpu;
> +};
> +
> +struct smmu_pmu {
> struct perf_event *events[SMMU_PMCG_MAX_COUNTERS];
> DECLARE_BITMAP(used_counters, SMMU_PMCG_MAX_COUNTERS);
> DECLARE_BITMAP(supported_events, SMMU_PMCG_ARCH_MAX_EVENTS);
> - unsigned int irq;
> - unsigned int on_cpu;
> + struct smmu_pmu_affinity *affinity;
> + struct list_head affinity_list;
> struct pmu pmu;
> unsigned int num_counters;
> struct device *dev;
> @@ -394,7 +408,7 @@ static int smmu_pmu_event_init(struct perf_event *event)
> * Ensure all events are on the same cpu so all events are in the
> * same cpu context, to avoid races on pmu_enable etc.
> */
> - event->cpu = smmu_pmu->on_cpu;
> + event->cpu = smmu_pmu->affinity->cpu;
>
> return 0;
> }
> @@ -478,9 +492,10 @@ static ssize_t smmu_pmu_cpumask_show(struct device *dev,
> struct device_attribute *attr,
> char *buf)
> {
> - struct smmu_pmu *smmu_pmu = to_smmu_pmu(dev_get_drvdata(dev));
> + struct pmu *pmu = dev_get_drvdata(dev);
> + struct smmu_pmu_affinity *aff = to_smmu_pmu(pmu)->affinity;
>
> - return cpumap_print_to_pagebuf(true, buf, cpumask_of(smmu_pmu->on_cpu));
> + return cpumap_print_to_pagebuf(true, buf, cpumask_of(aff->cpu));
> }
>
> static struct device_attribute smmu_pmu_cpumask_attr =
> @@ -584,50 +599,140 @@ static const struct attribute_group *smmu_pmu_attr_grps[] = {
>
> static int smmu_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
> {
> - struct smmu_pmu *smmu_pmu;
> + struct smmu_pmu_affinity *aff;
> + struct smmu_pmu *pmu;
> unsigned int target;
>
> - smmu_pmu = hlist_entry_safe(node, struct smmu_pmu, node);
> - if (cpu != smmu_pmu->on_cpu)
> + aff = hlist_entry_safe(node, struct smmu_pmu_affinity, node);
> + if (cpu != aff->cpu)
> return 0;
>
> target = cpumask_any_but(cpu_online_mask, cpu);
> if (target >= nr_cpu_ids)
> return 0;
>
> - perf_pmu_migrate_context(&smmu_pmu->pmu, cpu, target);
> - smmu_pmu->on_cpu = target;
> - WARN_ON(irq_set_affinity_hint(smmu_pmu->irq, cpumask_of(target)));
> + /* We're only reading, but this isn't the place to be involving RCU */
> + mutex_lock(&smmu_pmu_affinity_lock);
> + list_for_each_entry(pmu, &aff->instance_list, affinity_list)
> + perf_pmu_migrate_context(&pmu->pmu, aff->cpu, target);
> + mutex_unlock(&smmu_pmu_affinity_lock);
> +
> + WARN_ON(irq_set_affinity_hint(aff->irq, cpumask_of(target)));
> + aff->cpu = target;
>
> return 0;
> }
>
> static irqreturn_t smmu_pmu_handle_irq(int irq_num, void *data)
> {
> - struct smmu_pmu *smmu_pmu = data;
> - u64 ovsr;
> - unsigned int idx;
> + struct smmu_pmu_affinity *aff = data;
> + struct smmu_pmu *smmu_pmu;
> + irqreturn_t ret = IRQ_NONE;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(smmu_pmu, &aff->instance_list, affinity_list) {
> + unsigned int idx;
> + u64 ovsr = readq(smmu_pmu->reloc_base + SMMU_PMCG_OVSSET0);
>
> - ovsr = readq(smmu_pmu->reloc_base + SMMU_PMCG_OVSSET0);
> - if (!ovsr)
> - return IRQ_NONE;
> + if (!ovsr)
> + continue;
>
> - writeq(ovsr, smmu_pmu->reloc_base + SMMU_PMCG_OVSCLR0);
> + writeq(ovsr, smmu_pmu->reloc_base + SMMU_PMCG_OVSCLR0);
>
> - for_each_set_bit(idx, (unsigned long *)&ovsr, smmu_pmu->num_counters) {
> - struct perf_event *event = smmu_pmu->events[idx];
> - struct hw_perf_event *hwc;
> + for_each_set_bit(idx, (unsigned long *)&ovsr, smmu_pmu->num_counters) {
> + struct perf_event *event = smmu_pmu->events[idx];
>
> - if (WARN_ON_ONCE(!event))
> - continue;
> + if (WARN_ON_ONCE(!event))
> + continue;
> +
> + smmu_pmu_event_update(event);
> + smmu_pmu_set_period(smmu_pmu, &event->hw);
> + }
> + ret = IRQ_HANDLED;
> + }
> + rcu_read_unlock();
> +
> + return ret;
> +}
> +
> +static struct smmu_pmu_affinity *__smmu_pmu_get_affinity(int irq)
> +{
> + struct smmu_pmu_affinity *aff;
> + int ret;
> +
> + list_for_each_entry(aff, &smmu_pmu_affinities, affinity_list)
> + if (aff->irq == irq && refcount_inc_not_zero(&aff->refcount))
> + return aff;
> +
> + aff = kzalloc(sizeof(*aff), GFP_KERNEL);
> + if (!aff)
> + return ERR_PTR(-ENOMEM);
> +
Do we need to initialize aff->instance_list?

> + /* Pick one CPU to be the preferred one to use */
> + aff->cpu = raw_smp_processor_id();
> + refcount_set(&aff->refcount, 1);
> +
> + ret = request_irq(irq, smmu_pmu_handle_irq,
> + IRQF_NOBALANCING | IRQF_NO_THREAD,
> + "smmuv3-pmu", aff);
Do we need to save irq to aff->irq?

> + if (ret)
> + goto out_free_aff;
> +
> + ret = irq_set_affinity_hint(irq, cpumask_of(aff->cpu));
> + if (ret)
> + goto out_free_irq;
> +
> + ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num, &aff->node);
> + if (ret)
> + goto out_free_irq;
> +
> + list_add(&aff->affinity_list, &smmu_pmu_affinities);
> + return aff;
> +
> +out_free_irq:
> + free_irq(irq, aff);
> +out_free_aff:
> + kfree(aff);
> + return ERR_PTR(ret);
> +}
> +
> +static int smmu_pmu_get_affinity(struct smmu_pmu *pmu, int irq)
> +{
> + struct smmu_pmu_affinity *aff;
> +
> + mutex_lock(&smmu_pmu_affinity_lock);
> + aff = __smmu_pmu_get_affinity(irq);
> + mutex_unlock(&smmu_pmu_affinity_lock);
> +
> + if (IS_ERR(aff))
> + return PTR_ERR(aff);
> +
> + pmu->affinity = aff;
> + mutex_lock(&smmu_pmu_affinity_lock);
> + list_add_rcu(&pmu->affinity_list, &aff->instance_list);
> + mutex_unlock(&smmu_pmu_affinity_lock);
> +
> + return 0;
> +}
> +
> +static void smmu_pmu_put_affinity(struct smmu_pmu *pmu)
> +{
> + struct smmu_pmu_affinity *aff = pmu->affinity;
>
> - smmu_pmu_event_update(event);
> - hwc = &event->hw;
> + mutex_lock(&smmu_pmu_affinity_lock);
> + list_del_rcu(&pmu->affinity_list);
>
> - smmu_pmu_set_period(smmu_pmu, hwc);
> + if (!refcount_dec_and_test(&aff->refcount)) {
> + mutex_unlock(&smmu_pmu_affinity_lock);
> + return;
> }
>
> - return IRQ_HANDLED;
> + list_del(&aff->affinity_list);
> + mutex_unlock(&smmu_pmu_affinity_lock);
> +
> + free_irq(aff->irq, aff);
> + cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &aff->node);
> + kfree(aff);
> }
>
> static void smmu_pmu_free_msis(void *data)
> @@ -652,7 +757,7 @@ static void smmu_pmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
> pmu->reg_base + SMMU_PMCG_IRQ_CFG2);
> }
>
> -static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
> +static int smmu_pmu_setup_msi(struct smmu_pmu *pmu)
> {
> struct msi_desc *desc;
> struct device *dev = pmu->dev;
> @@ -663,34 +768,34 @@ static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
>
> /* MSI supported or not */
> if (!(readl(pmu->reg_base + SMMU_PMCG_CFGR) & SMMU_PMCG_CFGR_MSI))
> - return;
> + return 0;
>
> ret = platform_msi_domain_alloc_irqs(dev, 1, smmu_pmu_write_msi_msg);
> if (ret) {
> dev_warn(dev, "failed to allocate MSIs\n");
> - return;
> + return ret;
> }
>
> desc = first_msi_entry(dev);
> if (desc)
> - pmu->irq = desc->irq;
> + ret = desc->irq;
>
> /* Add callback to free MSIs on teardown */
> devm_add_action(dev, smmu_pmu_free_msis, dev);
> + return ret;
> }
>
> static int smmu_pmu_setup_irq(struct smmu_pmu *pmu)
> {
> - unsigned long flags = IRQF_NOBALANCING | IRQF_SHARED | IRQF_NO_THREAD;
> - int irq, ret = -ENXIO;
> + int irq;
>
> - smmu_pmu_setup_msi(pmu);
> + irq = smmu_pmu_setup_msi(pmu);
> + if (irq <= 0)
> + irq = platform_get_irq(to_platform_device(pmu->dev), 0);
> + if (irq < 0)
> + return irq;
>
> - irq = pmu->irq;
> - if (irq)
> - ret = devm_request_irq(pmu->dev, irq, smmu_pmu_handle_irq,
> - flags, "smmuv3-pmu", pmu);
> - return ret;
> + return smmu_pmu_get_affinity(pmu, irq);
> }
>
> static void smmu_pmu_reset(struct smmu_pmu *smmu_pmu)
> @@ -730,7 +835,7 @@ static int smmu_pmu_probe(struct platform_device *pdev)
> struct resource *res_0;
> u32 cfgr, reg_size;
> u64 ceid_64[2];
> - int irq, err;
> + int err;
> char *name;
> struct device *dev = &pdev->dev;
>
> @@ -771,10 +876,6 @@ static int smmu_pmu_probe(struct platform_device *pdev)
> smmu_pmu->reloc_base = smmu_pmu->reg_base;
> }
>
> - irq = platform_get_irq(pdev, 0);
> - if (irq > 0)
> - smmu_pmu->irq = irq;
> -
> ceid_64[0] = readq_relaxed(smmu_pmu->reg_base + SMMU_PMCG_CEID0);
> ceid_64[1] = readq_relaxed(smmu_pmu->reg_base + SMMU_PMCG_CEID1);
> bitmap_from_arr32(smmu_pmu->supported_events, (u32 *)ceid_64,
> @@ -789,12 +890,6 @@ static int smmu_pmu_probe(struct platform_device *pdev)
>
> smmu_pmu_reset(smmu_pmu);
>
> - err = smmu_pmu_setup_irq(smmu_pmu);
> - if (err) {
> - dev_err(dev, "Setup irq failed, PMU @%pa\n", &res_0->start);
> - return err;
> - }
> -
> name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "smmuv3_pmcg_%llx",
> (res_0->start) >> SMMU_PMCG_PA_SHIFT);
> if (!name) {
> @@ -804,16 +899,9 @@ static int smmu_pmu_probe(struct platform_device *pdev)
>
> smmu_pmu_get_acpi_options(smmu_pmu);
>
> - /* Pick one CPU to be the preferred one to use */
> - smmu_pmu->on_cpu = raw_smp_processor_id();
> - WARN_ON(irq_set_affinity_hint(smmu_pmu->irq,
> - cpumask_of(smmu_pmu->on_cpu)));
> -
> - err = cpuhp_state_add_instance_nocalls(cpuhp_state_num,
> - &smmu_pmu->node);
> + err = smmu_pmu_setup_irq(smmu_pmu);
> if (err) {
> - dev_err(dev, "Error %d registering hotplug, PMU @%pa\n",
> - err, &res_0->start);
> + dev_err(dev, "Setup irq failed, PMU @%pa\n", &res_0->start);
> return err;
> }
>
> @@ -832,7 +920,8 @@ static int smmu_pmu_probe(struct platform_device *pdev)
> return 0;
>
> out_unregister:
> - cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &smmu_pmu->node);
> + smmu_pmu_put_affinity(smmu_pmu);
> + synchronize_rcu();
> return err;
> }
>
> @@ -840,9 +929,9 @@ static int smmu_pmu_remove(struct platform_device *pdev)
> {
> struct smmu_pmu *smmu_pmu = platform_get_drvdata(pdev);
>
> + smmu_pmu_put_affinity(smmu_pmu);
> + /* perf will synchronise RCU before devres can free smmu_pmu */
> perf_pmu_unregister(&smmu_pmu->pmu);
> - cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &smmu_pmu->node);
> -
> return 0;
> }
>
> --
> 2.23.0.dirty
>