Re: [PATCH 05/14] irqchip/gic-v3-its: Prepare shadow structures for KVM host deprivilege
From: Fuad Tabba
Date: Fri Mar 13 2026 - 07:26:59 EST
Hi Sebastian,
On Tue, 10 Mar 2026 at 12:49, Sebastian Ene <sebastianene@xxxxxxxxxx> wrote:
>
> Expose two helper functions to support emulated ITS in the hypervisor.
> These allow the KVM layer to notify the driver when hypervisor
> initialization is complete.
> The caller is expected to use the functions as follows:
> 1. its_start_deprivilege(): Acquire the ITS locks.
> 2. on_each_cpu(_kvm_host_prot_finalize, ...): Finalizes pKVM init
> 3. its_end_deprivilege(): Shadow the ITS structures, invoke the KVM
> callback, and release locks.
> Specifically, this shadows the ITS command queue and the 1st level
> indirect tables. These shadow buffers will be used by the driver after
> host deprivilege, while the hypervisor unmaps and takes ownership of the
> original structures.
Just a note again on preferring not to use the "shadow" terminology. I
thought about it a bit more, since these are not at the host, perhaps
"proxy" is a better term, to convey that the host is writing to a
middle-man buffer.
Another term is "staging," which is common in DMA: the host "stages"
the commands here, and EL2 "commits" them to the hardware.
>
> Signed-off-by: Sebastian Ene <sebastianene@xxxxxxxxxx>
> ---
> drivers/irqchip/irq-gic-v3-its.c | 165 +++++++++++++++++++++++++++--
> include/linux/irqchip/arm-gic-v3.h | 24 +++++
> 2 files changed, 178 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index 291d7668cc8d..278dbc56f962 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -78,17 +78,6 @@ struct its_collection {
> u16 col_id;
> };
>
> -/*
> - * The ITS_BASER structure - contains memory information, cached
> - * value of BASER register configuration and ITS page size.
> - */
> -struct its_baser {
> - void *base;
> - u64 val;
> - u32 order;
> - u32 psz;
> -};
> -
> struct its_device;
>
> /*
> @@ -5232,6 +5221,160 @@ static int __init its_compute_its_list_map(struct its_node *its)
> return its_number;
> }
>
> +static void its_free_shadow_tables(struct its_shadow_tables *shadow)
> +{
> + int i;
> +
> + if (shadow->cmd_shadow)
> + its_free_pages(shadow->cmd_shadow, get_order(ITS_CMD_QUEUE_SZ));
> +
> + for (i = 0; i < GITS_BASER_NR_REGS; i++) {
> + if (!shadow->tables[i].shadow)
> + continue;
> +
> + its_free_pages(shadow->tables[i].shadow, 0);
> + }
> +
> + its_free_pages(shadow, 0);
> +}
> +
> +static struct its_shadow_tables *its_get_shadow_tables(struct its_node *its)
> +{
> + void *page;
> + struct its_shadow_tables *shadow;
> + int i;
Prefer RCT declarations.
> +
> + page = its_alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, 0);
This is called with the raw_spin_lock_irqsave held, and GFP_KERNEL can
sleep. You have one of two options, either use GFP_ATOMIC, but that's
more likely to fail. The alternative is to move this to
its_start_deprivilege(), before any lock is held.
> + if (!page)
> + return NULL;
> +
> + shadow = (void *)page_address(page);
> + page = its_alloc_pages_node(its->numa_node,
> + GFP_KERNEL | __GFP_ZERO,
> + get_order(ITS_CMD_QUEUE_SZ));
> + if (!page)
> + goto err_alloc_shadow;
> +
> + shadow->cmd_shadow = page_address(page);
> + shadow->cmdq_len = ITS_CMD_QUEUE_SZ;
> + shadow->cmd_original = its->cmd_base;
> +
> + memcpy(shadow->tables, its->tables, sizeof(struct its_baser) * GITS_BASER_NR_REGS);
> +
> + for (i = 0; i < GITS_BASER_NR_REGS; i++) {
> + if (!(shadow->tables[i].val & GITS_BASER_VALID))
> + continue;
> +
> + if (!(shadow->tables[i].val & GITS_BASER_INDIRECT))
> + continue;
> +
> + page = its_alloc_pages_node(its->numa_node,
> + GFP_KERNEL | __GFP_ZERO,
> + shadow->tables[i].order);
> + if (!page)
> + goto err_alloc_shadow;
> +
> + shadow->tables[i].shadow = page_address(page);
> +
> + memcpy(shadow->tables[i].shadow, shadow->tables[i].base,
> + PAGE_ORDER_TO_SIZE(shadow->tables[i].order));
> + }
> +
> + return shadow;
> +
> +err_alloc_shadow:
> + its_free_shadow_tables(shadow);
> + return NULL;
> +}
> +
> +void *its_start_depriviledge(void)
Typo here and elsewhere in this patch:
s/depriviledge/deprivilege/g
This is particularly important because it also appears in exported
symbols as well (later in this patch).
> +{
> + struct its_node *its;
> + int num_nodes = 0, i = 0;
> + unsigned long *flags;
RCT declaration order, and please untagle them, i.e., don't declare
the num_nodes and the iterator in the same line.
> +
> + raw_spin_lock(&its_lock);
> + list_for_each_entry(its, &its_nodes, entry) {
> + num_nodes++;
> + }
> +
> + flags = kzalloc(num_nodes * sizeof(unsigned long), GFP_KERNEL_ACCOUNT);
Same as the other allocation. This can sleep. I think that for this as
well, it's better to move it before lock acquisition. Even if you use
a different allocator, it's still better to keep the critical section
short.
> + if (!flags) {
> + raw_spin_unlock(&its_lock);
> + return NULL;
> + }
> +
> + list_for_each_entry(its, &its_nodes, entry) {
> + raw_spin_lock_irqsave(&its->lock, flags[i++]);
> + }
> +
> + return flags;
> +}
> +EXPORT_SYMBOL_GPL(its_start_depriviledge);
> +
> +static int its_switch_to_shadow_locked(struct its_node *its, its_init_emulate init_emulate_cb)
> +{
> + struct its_shadow_tables *hyp_shadow, shadow;
> + int i, ret;
> + u64 baser, baser_phys;
> +
> + hyp_shadow = its_get_shadow_tables(its);
> + if (!hyp_shadow)
> + return -ENOMEM;
> +
> + memcpy(&shadow, hyp_shadow, sizeof(shadow));
> + ret = init_emulate_cb(its->phys_base, hyp_shadow);
You are performing this callback with the lock held and local
interrupts disabled. The hvc call is byitself expensive, especially
since it's going to do stage-2 manipulations.
You should decouple the synchronous pointer swapping (which must be
locked) from the hypervisor notification (which can be done outside
the lock). Instead of executing the callback inside the critical
section, its_end_deprivilege should:
- Lock everything.
- Perform the pointer swaps in the host driver structures.
- Save the hyp_shadow pointers to a temporary array.
- Unlock everything.
- Loop through the temporary array and call the KVM cb to notify EL2.
You should probably split this patch into two. The first patch would
implement the freeze/unfreeze locking mechanism, and the second would
swap the driver's internal memory pointers to the shadow structures,
and invoke the KVM callback to lock down the real hardware.
Cheers,
/fuad
> + if (ret) {
> + its_free_shadow_tables(hyp_shadow);
> + return ret;
> + }
> +
> + /* Switch the driver command queue to use the shadow and save the original */
> + its->cmd_write = (its->cmd_write - its->cmd_base) +
> + (struct its_cmd_block *)shadow.cmd_shadow;
> + its->cmd_base = shadow.cmd_shadow;
> +
> + /* Shadow the first level of the indirect tables */
> + for (i = 0; i < GITS_BASER_NR_REGS; i++) {
> + baser = shadow.tables[i].val;
> +
> + if (!shadow.tables[i].shadow)
> + continue;
> +
> + baser_phys = virt_to_phys(shadow.tables[i].shadow);
> + if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && (baser_phys >> 48))
> + baser_phys = GITS_BASER_PHYS_52_to_48(baser_phys);
> +
> + its->tables[i].val &= ~GENMASK(47, 12);
> + its->tables[i].val |= baser_phys;
> + its->tables[i].base = shadow.tables[i].shadow;
> + }
> +
> + return 0;
> +}
> +
> +int its_end_depriviledge(int ret_pkvm_finalize, unsigned long *flags, its_init_emulate cb)
> +{
> + struct its_node *its;
> + int i = 0, ret = 0;
> +
> + if (!flags || !cb)
> + return -EINVAL;
> +
> + list_for_each_entry(its, &its_nodes, entry) {
> + if (!ret_pkvm_finalize && !ret)
> + ret = its_switch_to_shadow_locked(its, cb);
> +
> + raw_spin_unlock_irqrestore(&its->lock, flags[i++]);
> + }
> +
> + kfree(flags);
> + raw_spin_unlock(&its_lock);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(its_end_depriviledge);
> +
> static int __init its_probe_one(struct its_node *its)
> {
> u64 baser, tmp;
> diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
> index 0225121f3013..40457a4375d4 100644
> --- a/include/linux/irqchip/arm-gic-v3.h
> +++ b/include/linux/irqchip/arm-gic-v3.h
> @@ -657,6 +657,30 @@ static inline bool gic_enable_sre(void)
> return !!(val & ICC_SRE_EL1_SRE);
> }
>
> +/*
> + * The ITS_BASER structure - contains memory information, cached
> + * value of BASER register configuration and ITS page size.
> + */
> +struct its_baser {
> + void *base;
> + void *shadow;
> + u64 val;
> + u32 order;
> + u32 psz;
> +};
> +
> +struct its_shadow_tables {
> + struct its_baser tables[GITS_BASER_NR_REGS];
> + void *cmd_shadow;
> + void *cmd_original;
> + size_t cmdq_len;
> +};
> +
> +typedef int (*its_init_emulate)(phys_addr_t its_phys_base, struct its_shadow_tables *shadow);
> +
> +void *its_start_depriviledge(void);
> +int its_end_depriviledge(int ret, unsigned long *flags, its_init_emulate cb);
> +
> #endif
>
> #endif
> --
> 2.53.0.473.g4a7958ca14-goog
>