Re: [PATCH v5 2/2] remoteproc: xlnx: add crash detection mechanism

From: Shah, Tanmay

Date: Tue Jun 16 2026 - 15:53:09 EST


Hello,

Please find my comments below:

On 6/16/2026 11:01 AM, Mathieu Poirier wrote:
> On Wed, Jun 10, 2026 at 11:27:11AM -0700, Tanmay Shah wrote:
>> Remote processor will report the crash reason via the resource table
>> and notify the host via mailbox notification. The host checks this
>> crash reason on every mailbox notification from the remote and report
>> to the rproc core framework. Then the rproc core framework will start
>> the recovery process.
>>
>> Signed-off-by: Tanmay Shah <tanmay.shah@xxxxxxx>
>> ---
>>
>> changes in v5
>> - use local variable to access crash report pointer
>> - End crash report string with '\0' without relying on fw
>>
>> drivers/remoteproc/xlnx_r5_remoteproc.c | 73 ++++++++++++++++++++++++-
>> 1 file changed, 72 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c b/drivers/remoteproc/xlnx_r5_remoteproc.c
>> index 3349d1877751..86afff9f3b40 100644
>> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c
>> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
>> @@ -112,6 +112,10 @@ struct rsc_tbl_data {
>> const uintptr_t rsc_tbl;
>> } __packed;
>>
>> +enum xlnx_rproc_fw_rsc {
>> + XLNX_RPROC_FW_CRASH_REASON = RSC_VENDOR_START,
>
> I would call this XLNX_RPROC_FW_CRASH_REPORT
>

Ack.

>> +};
>> +
>> /*
>> * Hardcoded TCM bank values. This will stay in driver to maintain backward
>> * compatibility with device-tree that does not have TCM information.
>> @@ -131,9 +135,27 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
>> {0xffe30000UL, 0x30000, 0x10000UL, PD_R5_1_BTCM, "btcm1"},
>> };
>>
>> +#define CRASH_REASON_STR_LEN 16
>> +
>> +/**
>> + * struct xlnx_rproc_crash_report - resource to know crash status and reason
>> + *
>> + * @version: version of this resource
>> + * @crash_state: if true, the rproc is notifying crash, time to recover
>> + * @crash_reason: number to describe reason of crash
>> + * @crash_reason_str: short string description of crash reason
>> + */
>> +struct xlnx_rproc_crash_report {
>> + u8 version;
>> + u8 crash_state;
>> + u8 crash_reason;
>
> Do you need 2 variables? Would it be possible for @crash_reason != 0 to
> indicate a crash, making @cash_state irrelevant?
>

Actually initially I had defined crash_reason only, but when I looked at
how crash reason/type is defined in the kernel, I got idea to keep
crash_state separate than crash_reason:

https://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git/tree/include/linux/remoteproc.h?h=for-next#n183

enum rproc_crash_type {
RPROC_MMUFAULT,
RPROC_WATCHDOG,
RPROC_FATAL_ERROR,
};

So, if crash_reason is defined like above, then it's easy to make
mistake and not define no_crash = 0. Different firmware projects can
treat crash_reason differently. I would like to keep crash_state and
crash_reason separate and not impose policy on firmware projects on how
to define crash_reason.

>> + char crash_reason_str[CRASH_REASON_STR_LEN];
>> +} __packed;
>> +
>> /**
>> * struct zynqmp_r5_core - remoteproc core's internal data
>> *
>> + * @crash_report: rproc crash state and reason
>> * @rsc_tbl_va: resource table virtual address
>> * @sram: Array of sram memories assigned to this core
>> * @num_sram: number of sram for this core
>> @@ -147,6 +169,7 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
>> * @ipi: pointer to mailbox information
>> */
>> struct zynqmp_r5_core {
>> + struct xlnx_rproc_crash_report *crash_report;
>> void __iomem *rsc_tbl_va;
>> struct zynqmp_sram_bank *sram;
>> int num_sram;
>> @@ -204,11 +227,27 @@ static int event_notified_idr_cb(int id, void *ptr, void *data)
>> */
>> static void handle_event_notified(struct work_struct *work)
>> {
>> + struct xlnx_rproc_crash_report *report;
>> + struct zynqmp_r5_core *r5_core;
>> struct mbox_info *ipi;
>> struct rproc *rproc;
>>
>> ipi = container_of(work, struct mbox_info, mbox_work);
>> rproc = ipi->r5_core->rproc;
>> + r5_core = ipi->r5_core;
>> + report = r5_core->crash_report;
>> +
>> + /* report crash only if expected */
>> + if (report && report->crash_state) {
>> + if (rproc->state == RPROC_ATTACHED || rproc->state == RPROC_RUNNING) {
>> + report->crash_reason_str[CRASH_REASON_STR_LEN - 1] = '\0';
>> + dev_warn(&rproc->dev, "crash reason id: %d %s\n",
>> + report->crash_reason, report->crash_reason_str);
>> + rproc_report_crash(rproc, RPROC_FATAL_ERROR);
>> + report->crash_state = false;
>> + return;
>> + }
>> + }
>>
>> /*
>> * We only use IPI for interrupt. The RPU firmware side may or may
>> @@ -448,6 +487,13 @@ static int zynqmp_r5_rproc_stop(struct rproc *rproc)
>> if (ret)
>> dev_err(r5_core->dev, "core force power down failed\n");
>>
>> + /*
>> + * Clear attach on recovery flag during stop operation. The next state
>> + * of the remote processor is expected to be "Running" state. In this
>> + * state boot recovery method must take place over attach on recovery.
>> + */
>> + test_and_clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
>
> Why is there a need to play set/clear the RPROC_FEAT_ATTACH_ON_RECOVERY flag
> here and in zynqmp_r5_attach() and zynqmp_r5_detach()? I think we talked about
> that before but can't remember the outcome of that conversation.
>

The remote processor can go through following life cycle:

attach() -> stop() -> start(). When this happens, ATTACH_ON_RECOVERY is
not valid anymore. At this point, it should be BOOT_RECOVERY which is
indicated by clearing ATTACH_RECOVERY flag. That is why it is important
to clear this bit on stop().

Now in detach() it is not needed. I am just clearing the flag as part of
a cleanup sequence.

>> +
>> return ret;
>> }
>>
>> @@ -869,6 +915,9 @@ static int zynqmp_r5_get_rsc_table_va(struct zynqmp_r5_core *r5_core)
>>
>> static int zynqmp_r5_attach(struct rproc *rproc)
>> {
>> + /* Enable attach on recovery method. Clear it during rproc stop. */
>> + rproc_set_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY);
>> +
>> dev_dbg(&rproc->dev, "rproc %d attached\n", rproc->index);
>>
>> return 0;
>> @@ -883,9 +932,30 @@ static int zynqmp_r5_detach(struct rproc *rproc)
>> */
>> zynqmp_r5_rproc_kick(rproc, 0);
>>
>> + clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
>> +
>> return 0;
>> }
>>
>> +static int zynqmp_r5_handle_rsc(struct rproc *rproc, u32 rsc_type, void *rsc,
>> + int offset, int avail)
>> +{
>> + struct zynqmp_r5_core *r5_core = rproc->priv;
>> + void *rsc_offset = (r5_core->rsc_tbl_va + offset);
>> +
>
> if (rsc_type != XLNX_RPROC_FW_CRASH_REASON)
> return RSC_IGNORED;
>

Ack.

>> + if (rsc_type == XLNX_RPROC_FW_CRASH_REASON) {
>> + r5_core->crash_report = rsc_offset;
>> + /* reset all values */
>> + r5_core->crash_report->crash_state = false;
>> + r5_core->crash_report->crash_reason = 0;
>> + r5_core->crash_report->crash_reason_str[0] = '\0';
>> + } else {
>> + return RSC_IGNORED;
>> + }
>> +
>> + return RSC_HANDLED;
>> +}
>> +
>> static const struct rproc_ops zynqmp_r5_rproc_ops = {
>> .prepare = zynqmp_r5_rproc_prepare,
>> .unprepare = zynqmp_r5_rproc_unprepare,
>> @@ -900,6 +970,7 @@ static const struct rproc_ops zynqmp_r5_rproc_ops = {
>> .get_loaded_rsc_table = zynqmp_r5_get_loaded_rsc_table,
>> .attach = zynqmp_r5_attach,
>> .detach = zynqmp_r5_detach,
>> + .handle_rsc = zynqmp_r5_handle_rsc,
>> };
>>
>> /**
>> @@ -939,7 +1010,7 @@ static struct zynqmp_r5_core *zynqmp_r5_alloc_rproc_core(struct device *cdev)
>>
>> rproc_coredump_set_elf_info(r5_rproc, ELFCLASS32, EM_ARM);
>>
>> - r5_rproc->recovery_disabled = true;
>> + r5_rproc->recovery_disabled = false;
>
> I'm good with the overall architecture of this set.
>

Okay. If above comments looks good, I will send v6 soon.

> Regards,
> Mathieu
>
>> r5_rproc->has_iommu = false;
>> r5_rproc->auto_boot = false;
>>
>> --
>> 2.34.1
>>