Re: [PATCH 1/2] irqchip: RISC-V Local Interrupt Controller Driver

From: Thomas Gleixner
Date: Wed Jun 28 2017 - 16:47:48 EST


On Mon, 26 Jun 2017, Palmer Dabbelt wrote:
> +DEFINE_PER_CPU(struct riscv_irq_data, riscv_irq_data);
> +DEFINE_PER_CPU(atomic_long_t, riscv_early_sie);
> +
> +static void riscv_software_interrupt(void)
> +{
> +#ifdef CONFIG_SMP
> + irqreturn_t ret;
> +
> + ret = handle_ipi();
> + if (ret != IRQ_NONE)
> + return;
> +#endif
> +
> + BUG();

Are you sure you want to crash the system just because a spurious interrupt
happened?

> +}
> +
> +asmlinkage void __irq_entry do_IRQ(unsigned int cause, struct pt_regs *regs)
> +{
> + struct pt_regs *old_regs = set_irq_regs(regs);
> + struct irq_domain *domain;
> +
> + irq_enter();
> +
> + /* There are three classes of interrupt: timer, software, and

Please use proper multiline comment formatting:

/*
* bla.....
* foo.....
*/

> + * external devices. We dispatch between them here. External
> + * device interrupts use the generic IRQ mechanisms.
> + */
> + switch (cause) {
> + case INTERRUPT_CAUSE_TIMER:
> + riscv_timer_interrupt();
> + break;
> + case INTERRUPT_CAUSE_SOFTWARE:
> + riscv_software_interrupt();
> + break;
> + default:
> + domain = per_cpu(riscv_irq_data, smp_processor_id()).domain;
> + generic_handle_irq(irq_find_mapping(domain, cause));
> + break;
> + }
> +
> + irq_exit();
> + set_irq_regs(old_regs);
> +}
> +
> +static int riscv_irqdomain_map(struct irq_domain *d, unsigned int irq,
> + irq_hw_number_t hwirq)
> +{
> + struct riscv_irq_data *data = d->host_data;
> +
> + irq_set_chip_and_handler(irq, &data->chip, handle_simple_irq);
> + irq_set_chip_data(irq, data);
> + irq_set_noprobe(irq);
> +
> + return 0;
> +}
> +
> +static const struct irq_domain_ops riscv_irqdomain_ops = {
> + .map = riscv_irqdomain_map,
> + .xlate = irq_domain_xlate_onecell,
> +};
> +
> +static void riscv_irq_mask(struct irq_data *d)
> +{
> + struct riscv_irq_data *data = irq_data_get_irq_chip_data(d);
> +
> + BUG_ON(smp_processor_id() != data->hart);

Crashing the machine is the last resort if there is no chance to handle a
situation gracefully. Something like

if (WARN_ON_ONCE(smp_processor_id() != data->hart))
do_something_sensible();
else
.....

might at least keep the machine halfways functional for debugging.

> + csr_clear(sie, 1 << (long)d->hwirq);
> +}
> +
> +static void riscv_irq_unmask(struct irq_data *d)
> +{
> + struct riscv_irq_data *data = irq_data_get_irq_chip_data(d);
> +
> + BUG_ON(smp_processor_id() != data->hart);
> + csr_set(sie, 1 << (long)d->hwirq);
> +}
> +
> +static void riscv_irq_enable_helper(void *d)
> +{
> + riscv_irq_unmask(d);
> +}
> +
> +static void riscv_irq_enable(struct irq_data *d)
> +{
> + struct riscv_irq_data *data = irq_data_get_irq_chip_data(d);
> +
> + /* There are two phases to setting up an interrupt: first we set a bit
> + * in this bookkeeping structure, which is used by trap_init to
> + * initialize SIE for each hart as it comes up.

And what exactly has this to do with irq_enable()? Why would you call that
for an interrupt which solely goes to a offline cpu?

> +static void riscv_irq_disable(struct irq_data *d)
> +{
> + struct riscv_irq_data *data = irq_data_get_irq_chip_data(d);
> +
> + /* This is the mirror of riscv_irq_enable. */
> + atomic_long_and(~(1 << (long)d->hwirq),
> + &per_cpu(riscv_early_sie, data->hart));
> + if (data->hart == smp_processor_id())
> + riscv_irq_mask(d);
> + else if (cpu_online(data->hart))
> + smp_call_function_single(data->hart,
> + riscv_irq_disable_helper,
> + d,
> + true);

Same question as above.

> +}
> +
> +static void riscv_irq_mask_noop(struct irq_data *d) { }
> +
> +static void riscv_irq_unmask_noop(struct irq_data *d) { }
>
> +static void riscv_irq_enable_noop(struct irq_data *d)
> +{
> + struct device_node *data = irq_data_get_irq_chip_data(d);
> + u32 hart;
> +
> + if (!of_property_read_u32(data, "reg", &hart))
> + printk(
> + KERN_WARNING "enabled interrupt %d for missing hart %d (this interrupt has no handler)\n",

Has no handler? I really have a hard time to understand the logic here.

> + (int)d->hwirq, hart);
> +}
> +
> +static struct irq_chip riscv_noop_chip = {
> + .name = "riscv,cpu-intc,noop",
> + .irq_mask = riscv_irq_mask_noop,
> + .irq_unmask = riscv_irq_unmask_noop,
> + .irq_enable = riscv_irq_enable_noop,

Please write that in tabular fashion:

.name = "riscv,cpu-intc,noop",
.irq_mask = riscv_irq_mask_noop,

> +};
> +
> +static int riscv_irqdomain_map_noop(struct irq_domain *d, unsigned int irq,
> + irq_hw_number_t hwirq)
> +{
> + struct device_node *data = d->host_data;
> +
> + irq_set_chip_and_handler(irq, &riscv_noop_chip, handle_simple_irq);
> + irq_set_chip_data(irq, data);
> + return 0;
> +}
> +
> +static const struct irq_domain_ops riscv_irqdomain_ops_noop = {
> + .map = riscv_irqdomain_map_noop,
> + .xlate = irq_domain_xlate_onecell,
> +};
> +
> +static int riscv_intc_init(struct device_node *node, struct device_node *parent)
> +{
> + int hart;
> + struct riscv_irq_data *data;
> +
> + if (parent)
> + return 0;
> +
> + hart = riscv_of_processor_hart(node->parent);
> + if (hart < 0) {
> + /* If a hart is disabled, create a no-op irq domain. Devices
> + * may still have interrupts connected to those harts. This is
> + * not wrong... unless they actually load a driver that needs
> + * it!
> + */
> + irq_domain_add_linear(
> + node,
> + 8*sizeof(uintptr_t),
> + &riscv_irqdomain_ops_noop,
> + node->parent);
> + return 0;

I have a hard time to understand the logic here. Why do you need an
interrupt domain for something which does not exist? That does not make any
sense.

> + }
> +
> + data = &per_cpu(riscv_irq_data, hart);
> + snprintf(data->name, sizeof(data->name), "riscv,cpu_intc,%d", hart);
> + data->hart = hart;
> + data->chip.name = data->name;
> + data->chip.irq_mask = riscv_irq_mask;
> + data->chip.irq_unmask = riscv_irq_unmask;
> + data->chip.irq_enable = riscv_irq_enable;
> + data->chip.irq_disable = riscv_irq_disable;
> + data->domain = irq_domain_add_linear(
> + node,
> + 8*sizeof(uintptr_t),
> + &riscv_irqdomain_ops,
> + data);
> + WARN_ON(!data->domain);

That warn_on is great. You warn and then continue as if nothing
happened. Machine will crash later dereferencing a NULL pointer.

Some more epxlanations about the inner workings of all of this would be
appreciated.

Thanks,

tglx