Re: [PATCH v14 10/44] arm64: RMI: Add support for SRO

From: Steven Price

Date: Thu Jun 04 2026 - 11:29:10 EST

On 21/05/2026 15:35, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:18 +0100,
> Steven Price <steven.price@xxxxxxx> wrote:
>>
>> RMM v2.0 introduces the concept of "Stateful RMI Operations" (SRO). This
>> means that an SMC can return with an operation still in progress. The
>> host is excepted to continue the operation until is reaches a conclusion
>> (either success or failure). During this process the RMM can request
>> additional memory ('donate') or hand memory back to the host
>> ('reclaim'). The host can request an in progress operation is cancelled,
>> but still continue the operation until it has completed (otherwise the
>> incomplete operation may cause future RMM operations to fail).
>>
>> The SRO is tracked using a struct rmi_sro_state object which keeps track
>> of any memory which has been allocated but not yet consumed by the RMM
>> or reclaimed from the RMM. This allows the memory to be reused in a
>> future request within the same operation. It will also permit an
>> operation to be done in a context where memory allocation may be
>> difficult (e.g. atomic context) with the option to abort the operation
>> and retry the memory allocation outside of the atomic context. The
>> memory stored in the struct rmi_sro_state object can then be reused on
>> the subsequent attempt.
>>
>> Signed-off-by: Steven Price <steven.price@xxxxxxx>
>> ---
>> v14:
>> * SRO support has improved although is still not fully complete. The
>> infrastructure has been moved out of KVM.
>> ---
>> arch/arm64/include/asm/rmi_cmds.h | 1 +
>> arch/arm64/kernel/rmi.c | 359 ++++++++++++++++++++++++++++++
>> 2 files changed, 360 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
>> index eb213c8e6f26..1a7b0c8f1e38 100644
>> --- a/arch/arm64/include/asm/rmi_cmds.h
>> +++ b/arch/arm64/include/asm/rmi_cmds.h
>> @@ -35,6 +35,7 @@ struct rmi_sro_state {
>>
>> int rmi_delegate_range(phys_addr_t phys, unsigned long size);
>> int rmi_undelegate_range(phys_addr_t phys, unsigned long size);
>> +int free_delegated_page(phys_addr_t phys);
>>
>> static inline int rmi_delegate_page(phys_addr_t phys)
>> {
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> index 08cef54acadb..a8107ca9bb6d 100644
>> --- a/arch/arm64/kernel/rmi.c
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -48,6 +48,365 @@ int rmi_undelegate_range(phys_addr_t phys, unsigned long size)
>> return ret;
>> }
>>
>> +static unsigned long donate_req_to_size(unsigned long donatereq)
>> +{
>> + unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +
>> + switch (unit_size) {
>> + case 0:
>> + return PAGE_SIZE;
>> + case 1:
>> + return PMD_SIZE;
>> + case 2:
>> + return PUD_SIZE;
>> + case 3:
>> + return P4D_SIZE;
>
> How does this work when we have folded levels? If this is supposed to
> be the architected size, then it should actively express that:
>
> return BIT(unit_size * (PAGE_SHIFT - 3) + PAGE_SHIFT);

It doesn't work (as Gavin also pointed out). There's an existing macro
to make this even cleaner:

return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(3 - unit_size));

>> + }
>> + unreachable();
>> +}
>> +
>> +static void rmi_smccc_invoke(struct arm_smccc_1_2_regs *regs_in,
>> + struct arm_smccc_1_2_regs *regs_out)
>> +{
>> + struct arm_smccc_1_2_regs regs = *regs_in;
>> + unsigned long status;
>> +
>> + do {
>> + arm_smccc_1_2_invoke(&regs, regs_out);
>> + status = RMI_RETURN_STATUS(regs_out->a0);
>> + } while (status == RMI_BUSY || status == RMI_BLOCKED);
>> +}
>> +
>> +int free_delegated_page(phys_addr_t phys)
>> +{
>> + if (WARN_ON(rmi_undelegate_page(phys))) {
>
> Please drop this WARN_ON(). Or at least make it ONCE. Everywhere.

Happy to change to WARN_ON_ONCE(). I think we should keep a WARN of some
sort as this is causing Linux to leak pages - it's definitely something
the sysadmin would want to know about.

>> + /* Undelegate failed: leak the page */
>> + return -EBUSY;
>> + }
>> +
>> + free_page((unsigned long)phys_to_virt(phys));
>> +
>> + return 0;
>> +}
>> +
>> +static int rmi_sro_ensure_capacity(struct rmi_sro_state *sro,
>> + unsigned long count)
>> +{
>> + if (WARN_ON_ONCE(sro->addr_count > RMI_MAX_ADDR_LIST))
>> + return -EOVERFLOW;
>> +
>> + if (count > RMI_MAX_ADDR_LIST - sro->addr_count)
>> + return -ENOSPC;
>> +
>> + return 0;
>> +}
>> +
>> +static int rmi_sro_donate_contig(struct rmi_sro_state *sro,
>> + unsigned long sro_handle,
>> + unsigned long donatereq,
>> + struct arm_smccc_1_2_regs *out_regs,
>> + gfp_t gfp)
>> +{
>> + unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> + unsigned long unit_size_bytes = donate_req_to_size(donatereq);
>> + unsigned long count = RMI_DONATE_COUNT(donatereq);
>> + unsigned long state = RMI_DONATE_STATE(donatereq);
>> + unsigned long size = unit_size_bytes * count;
>> + unsigned long addr_range;
>> + int ret;
>> + void *virt;
>> + phys_addr_t phys;
>> + struct arm_smccc_1_2_regs regs = {
>> + SMC_RMI_OP_MEM_DONATE,
>> + sro_handle
>> + };
>> +
>> + for (int i = 0; i < sro->addr_count; i++) {
>> + unsigned long entry = sro->addr_list[i];
>> +
>> + if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
>> + RMI_ADDR_RANGE_COUNT(entry) == count &&
>> + RMI_ADDR_RANGE_STATE(entry) == state) {
>> + sro->addr_count--;
>> + swap(sro->addr_list[sro->addr_count],
>> + sro->addr_list[i]);
>> +
>> + goto out;
>> + }
>> + }
>> +
>> + ret = rmi_sro_ensure_capacity(sro, 1);
>> + if (ret)
>> + return ret;
>> +
>> + virt = alloc_pages_exact(size, gfp);
>> + if (!virt)
>> + return -ENOMEM;
>> + phys = virt_to_phys(virt);
>> +
>> + if (state == RMI_OP_MEM_DELEGATED) {
>> + if (rmi_delegate_range(phys, size)) {
>> + free_pages_exact(virt, size);
>> + return -ENXIO;
>> + }
>> + }
>> +
>> + addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
>> + FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
>> + FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, count);
>> + FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
>> +
>> + sro->addr_list[sro->addr_count] = addr_range;
>> +
>
> Shouldn't this be moved to a helper that ensures capacity, and returns
> an error otherwise?

I'm not sure quite what you are suggesting. I already have a
rmi_sro_ensure_capacity() helper. By this point we know there's space.

>> +out:
>> + regs.a2 = virt_to_phys(&sro->addr_list[sro->addr_count]);
>> + regs.a3 = 1;
>
> This could really do with context specific helpers that populate regs
> based on a set of parameters. I have no idea what this 1 here is, and
> the init is spread over too much code. Think of the children!
>
> That's valid for the whole patch.

That's a good point. SRO is a bit tricky because I wanted the actual SMC
call to be done in one place so we can handle all the RMI_INCOMPLETE
cases together. But I could certainly add some helpers to setup the
registers rather than assigning directly to regs.a<n>.

Thanks,
Steve

> M.
>> + rmi_smccc_invoke(&regs, out_regs);
>> +
>> + unsigned long donated_granules = out_regs->a1;
>> + unsigned long donated_size = donated_granules << PAGE_SHIFT;
>> +
>> + if (donated_granules == 0) {
>> + /* No pages used by the RMM */
>> + sro->addr_count++;
>> + } else if (donated_size < size) {
>> + phys = sro->addr_list[sro->addr_count] & RMI_ADDR_RANGE_ADDR_MASK;
>> +
>> + /* Not all granules used by the RMM, free the remaining pages */
>> + for (long i = donated_size; i < size; i += PAGE_SIZE) {
>> + if (state == RMI_OP_MEM_DELEGATED)
>> + free_delegated_page(phys + i);
>> + else
>> + __free_page(phys_to_page(phys + i));
>> + }
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int rmi_sro_donate_noncontig(struct rmi_sro_state *sro,
>> + unsigned long sro_handle,
>> + unsigned long donatereq,
>> + struct arm_smccc_1_2_regs *out_regs,
>> + gfp_t gfp)
>> +{
>> + unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> + unsigned long unit_size_bytes = donate_req_to_size(donatereq);
>> + unsigned long count = RMI_DONATE_COUNT(donatereq);
>> + unsigned long state = RMI_DONATE_STATE(donatereq);
>> + unsigned long found = 0;
>> + unsigned long addr_list_start = sro->addr_count;
>> + int ret;
>> + struct arm_smccc_1_2_regs regs = {
>> + SMC_RMI_OP_MEM_DONATE,
>> + sro_handle
>> + };
>> +
>> + for (int i = 0; i < addr_list_start && found < count; i++) {
>> + unsigned long entry = sro->addr_list[i];
>> +
>> + if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
>> + RMI_ADDR_RANGE_COUNT(entry) == 1 &&
>> + RMI_ADDR_RANGE_STATE(entry) == state) {
>> + addr_list_start--;
>> + swap(sro->addr_list[addr_list_start],
>> + sro->addr_list[i]);
>> + found++;
>> + i--;
>> + }
>> + }
>> +
>> + ret = rmi_sro_ensure_capacity(sro, count - found);
>> + if (ret)
>> + return ret;
>> +
>> + while (found < count) {
>> + unsigned long addr_range;
>> + void *virt = alloc_pages_exact(unit_size_bytes, gfp);
>> + phys_addr_t phys;
>> +
>> + if (!virt)
>> + return -ENOMEM;
>> +
>> + phys = virt_to_phys(virt);
>> +
>> + if (state == RMI_OP_MEM_DELEGATED) {
>> + if (rmi_delegate_range(phys, unit_size_bytes)) {
>> + free_pages_exact(virt, unit_size_bytes);
>> + return -ENXIO;
>> + }
>> + }
>> +
>> + addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
>> + FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
>> + FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, 1);
>> + FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
>> +
>> + sro->addr_list[sro->addr_count++] = addr_range;
>> + found++;
>> + }
>> +
>> + regs.a2 = virt_to_phys(&sro->addr_list[addr_list_start]);
>> + regs.a3 = found;
>> + rmi_smccc_invoke(&regs, out_regs);
>> +
>> + unsigned long donated_granules = out_regs->a1;
>> +
>> + if (WARN_ON(donated_granules & ((unit_size_bytes >> PAGE_SHIFT) - 1))) {
>> + /*
>> + * FIXME: RMM has only consumed part of a huge page, this leaks
>> + * the rest of the huge page
>> + */
>> + donated_granules = ALIGN(donated_granules,
>> + (unit_size_bytes >> PAGE_SHIFT));
>> + }
>> + unsigned long donated_blocks = donated_granules / (unit_size_bytes >> PAGE_SHIFT);
>> +
>> + if (WARN_ON(donated_blocks > found))
>> + donated_blocks = found;
>> +
>> + unsigned long undonated_blocks = found - donated_blocks;
>> +
>> + while (donated_blocks && undonated_blocks) {
>> + sro->addr_count--;
>> + swap(sro->addr_list[addr_list_start],
>> + sro->addr_list[sro->addr_count]);
>> + addr_list_start++;
>> +
>> + donated_blocks--;
>> + undonated_blocks--;
>> + }
>> + sro->addr_count -= donated_blocks;
>> +
>> + return 0;
>> +}
>> +
>> +static int rmi_sro_donate(struct rmi_sro_state *sro,
>> + unsigned long sro_handle,
>> + unsigned long donatereq,
>> + struct arm_smccc_1_2_regs *regs,
>> + gfp_t gfp)
>> +{
>> + unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +
>> + if (WARN_ON(!count))
>> + return 0;
>> +
>> + if (RMI_DONATE_CONTIG(donatereq)) {
>> + return rmi_sro_donate_contig(sro, sro_handle, donatereq,
>> + regs, gfp);
>> + } else {
>> + return rmi_sro_donate_noncontig(sro, sro_handle, donatereq,
>> + regs, gfp);
>> + }
>> +}
>> +
>> +static int rmi_sro_reclaim(struct rmi_sro_state *sro,
>> + unsigned long sro_handle,
>> + struct arm_smccc_1_2_regs *out_regs)
>> +{
>> + unsigned long capacity;
>> + struct arm_smccc_1_2_regs regs;
>> + int ret;
>> +
>> + ret = rmi_sro_ensure_capacity(sro, 1);
>> + if (ret)
>> + rmi_sro_free(sro);
>> +
>> + capacity = RMI_MAX_ADDR_LIST - sro->addr_count;
>> +
>> + regs = (struct arm_smccc_1_2_regs){
>> + SMC_RMI_OP_MEM_RECLAIM,
>> + sro_handle,
>> + virt_to_phys(&sro->addr_list[sro->addr_count]),
>> + capacity
>> + };
>> + rmi_smccc_invoke(&regs, out_regs);
>> +
>> + if (WARN_ON_ONCE(out_regs->a1 > capacity))
>> + out_regs->a1 = capacity;
>> +
>> + sro->addr_count += out_regs->a1;
>> +
>> + return 0;
>> +}
>> +
>> +void rmi_sro_free(struct rmi_sro_state *sro)
>> +{
>> + for (int i = 0; i < sro->addr_count; i++) {
>> + unsigned long entry = sro->addr_list[i];
>> + unsigned long addr = RMI_ADDR_RANGE_ADDR(entry);
>> + unsigned long unit_size = RMI_ADDR_RANGE_SIZE(entry);
>> + unsigned long count = RMI_ADDR_RANGE_COUNT(entry);
>> + unsigned long state = RMI_ADDR_RANGE_STATE(entry);
>> + unsigned long size = donate_req_to_size(unit_size) * count;
>> +
>> + if (state == RMI_OP_MEM_DELEGATED) {
>> + if (WARN_ON(rmi_undelegate_range(addr, size))) {
>> + /* Leak the pages */
>> + continue;
>> + }
>> + }
>> + free_pages_exact(phys_to_virt(addr), size);
>> + }
>> +
>> + sro->addr_count = 0;
>> +}
>> +
>> +unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp)
>> +{
>> + unsigned long sro_handle;
>> + struct arm_smccc_1_2_regs regs;
>> + struct arm_smccc_1_2_regs *regs_in = &sro->regs;
>> +
>> + rmi_smccc_invoke(regs_in, &regs);
>> +
>> + sro_handle = regs.a1;
>> +
>> + while (RMI_RETURN_STATUS(regs.a0) == RMI_INCOMPLETE) {
>> + bool can_cancel = RMI_RETURN_CAN_CANCEL(regs.a0);
>> + int ret;
>> +
>> + switch (RMI_RETURN_MEMREQ(regs.a0)) {
>> + case RMI_OP_MEM_REQ_NONE:
>> + regs = (struct arm_smccc_1_2_regs){
>> + SMC_RMI_OP_CONTINUE, sro_handle, 0
>> + };
>> + rmi_smccc_invoke(&regs, &regs);
>> + break;
>> + case RMI_OP_MEM_REQ_DONATE:
>> + ret = rmi_sro_donate(sro, sro_handle, regs.a2, &regs,
>> + gfp);
>> + break;
>> + case RMI_OP_MEM_REQ_RECLAIM:
>> + ret = rmi_sro_reclaim(sro, sro_handle, &regs);
>> + break;
>> + default:
>> + ret = WARN_ON(1);
>> + break;
>> + }
>> +
>> + if (ret) {
>> + if (can_cancel) {
>> + /*
>> + * FIXME: Handle cancelling properly!
>> + *
>> + * If the operation has failed due to memory
>> + * allocation failure then the information on
>> + * the memory allocation should be saved, so
>> + * that the allocation can be repeated outside
>> + * of any context which prevented the
>> + * allocation.
>
> Honestly, this is the sort of stuff that I'd expect to be solved
> *before* posting this code. Since this is so central to the whole
> memory management, it needs to be correct from day-1.
>
> If you can't make it work in time, then tone the supported features
> down. But FIXMEs and WARN_ONs are not the way to go.
>
> M.
>