Re: [PATCH V4 13/17] i3c: mipi-i3c-hci: Add DMA-mode recovery for internal controller errors

From: Frank Li

Date: Tue Jun 02 2026 - 12:51:15 EST


On Fri, May 15, 2026 at 07:26:17PM +0300, Adrian Hunter wrote:
> Handle internal I3C HCI errors when operating in DMA mode by adding a
> simple recovery mechanism.
>
> On detection of an internal controller error, mark recovery as needed and
> attempt to restore operation by performing a software reset followed by
> state restore. To keep recovery straightforward on this unlikely error
> path, all currently queued transfers are terminated and completed with an
> error.
>
> This allows the controller to resume operation after internal failures
> rather than remaining permanently stuck.
>
> Note, internal errors indicated by INTR_HC_INTERNAL_ERR, cause the
> controller to stop.
>
> Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
> ---

Reviewed-by: Frank Li <Frank.Li@xxxxxxx>

>
>
> Changes in V4:
>
> None
>
> Changes in V3:
>
> When erroring out transfers, ensure the final transfer of a
> transfer list is processed last
>
> Changes in V2:
>
> Rename completing_xfer to final_xfer
> Add hci_dma_xfer_done() before checking for an already complete
> transfer
> Improve commit message
>
>
> drivers/i3c/master/mipi-i3c-hci/cmd.h | 6 ++
> drivers/i3c/master/mipi-i3c-hci/core.c | 1 +
> drivers/i3c/master/mipi-i3c-hci/dma.c | 93 +++++++++++++++++++++++---
> drivers/i3c/master/mipi-i3c-hci/hci.h | 1 +
> 4 files changed, 92 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/i3c/master/mipi-i3c-hci/cmd.h b/drivers/i3c/master/mipi-i3c-hci/cmd.h
> index b1bf87daa651..7bada7b4b2de 100644
> --- a/drivers/i3c/master/mipi-i3c-hci/cmd.h
> +++ b/drivers/i3c/master/mipi-i3c-hci/cmd.h
> @@ -65,4 +65,10 @@ struct hci_cmd_ops {
> extern const struct hci_cmd_ops mipi_i3c_hci_cmd_v1;
> extern const struct hci_cmd_ops mipi_i3c_hci_cmd_v2;
>
> +static inline void hci_cmd_set_resp_err(u32 *response, int resp_err)
> +{
> + *response &= ~RESP_ERR_FIELD;
> + *response |= FIELD_PREP(RESP_ERR_FIELD, resp_err);
> +}
> +
> #endif
> diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c
> index 12a0122fb709..69dcf5dad3a5 100644
> --- a/drivers/i3c/master/mipi-i3c-hci/core.c
> +++ b/drivers/i3c/master/mipi-i3c-hci/core.c
> @@ -668,6 +668,7 @@ static irqreturn_t i3c_hci_irq_handler(int irq, void *dev_id)
> if (val & INTR_HC_INTERNAL_ERR) {
> dev_err(&hci->master.dev, "Host Controller Internal Error\n");
> val &= ~INTR_HC_INTERNAL_ERR;
> + hci->recovery_needed = true;
> }
>
> if (val)
> diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c
> index f9023cb3c5a2..f39a6ce2aad5 100644
> --- a/drivers/i3c/master/mipi-i3c-hci/dma.c
> +++ b/drivers/i3c/master/mipi-i3c-hci/dma.c
> @@ -9,6 +9,7 @@
> */
>
> #include <linux/bitfield.h>
> +#include <linux/delay.h>
> #include <linux/device.h>
> #include <linux/dma-mapping.h>
> #include <linux/errno.h>
> @@ -258,6 +259,10 @@ static void hci_dma_init_rh(struct i3c_hci *hci, struct hci_rh_data *rh, int i)
> rh_reg_write(RING_CONTROL, RING_CTRL_ENABLE);
> rh_reg_write(RING_CONTROL, RING_CTRL_ENABLE | RING_CTRL_RUN_STOP);
>
> + /*
> + * Do not clear the entries of rh->src_xfers because the recovery uses
> + * them. In other cases they should be NULL anyway.
> + */
> rh->done_ptr = 0;
> rh->ibi_chunk_ptr = 0;
> rh->xfer_space = rh->xfer_entries;
> @@ -362,7 +367,7 @@ static int hci_dma_init(struct i3c_hci *hci)
> rh->resp = dma_alloc_coherent(rings->sysdev, resps_sz,
> &rh->resp_dma, GFP_KERNEL);
> rh->src_xfers =
> - kmalloc_objs(*rh->src_xfers, rh->xfer_entries);
> + kzalloc_objs(*rh->src_xfers, rh->xfer_entries);
> ret = -ENOMEM;
> if (!rh->xfer || !rh->resp || !rh->src_xfers)
> goto err_out;
> @@ -572,13 +577,15 @@ static void hci_dma_xfer_done(struct i3c_hci *hci, struct hci_rh_data *rh)
> hci_dma_unmap_xfer(hci, xfer, 1);
> rh->src_xfers[done_ptr] = NULL;
> xfer->ring_entry = -1;
> - xfer->response = resp;
> if (tid != xfer->cmd_tid) {
> dev_err(&hci->master.dev,
> "response tid=%d when expecting %d\n",
> tid, xfer->cmd_tid);
> - /* TODO: do something about it? */
> + hci->recovery_needed = true;
> + if (!RESP_STATUS(resp))
> + hci_cmd_set_resp_err(&resp, RESP_ERR_HC_TERMINATED);
> }
> + xfer->response = resp;
> if (xfer == xfer->final_xfer || RESP_STATUS(resp))
> complete(xfer->final_xfer->completion);
> if (RESP_STATUS(resp))
> @@ -625,6 +632,60 @@ static void hci_dma_unblock_enqueue(struct i3c_hci *hci)
> }
> }
>
> +static void hci_dma_error_out_rh(struct i3c_hci *hci, struct hci_rh_data *rh)
> +{
> + /*
> + * The entries of rh->src_xfers are not cleared by
> + * i3c_hci_reset_and_restore(), so can be used here. Do 2 passes so
> + * that the final_xfer of an xfer list is always processed last.
> + */
> + for (int pass = 0; pass < 2; pass++)
> + for (int i = 0; i < rh->xfer_entries; i++) {
> + struct hci_xfer *xfer = rh->src_xfers[i];
> +
> + if (!xfer || (!pass && xfer == xfer->final_xfer))
> + continue;
> + hci_dma_unmap_xfer(hci, xfer, 1);
> + rh->src_xfers[i] = NULL;
> + xfer->ring_entry = -1;
> + hci_cmd_set_resp_err(&xfer->response, RESP_ERR_HC_TERMINATED);
> + if (xfer == xfer->final_xfer)
> + complete(xfer->final_xfer->completion);
> + }
> +}
> +
> +static void hci_dma_error_out_all(struct i3c_hci *hci)
> +{
> + struct hci_rings_data *rings = hci->io_data;
> +
> + for (int i = 0; i < rings->total; i++)
> + hci_dma_error_out_rh(hci, &rings->headers[i]);
> +}
> +
> +static void hci_dma_recovery(struct i3c_hci *hci)
> +{
> + int ret;
> +
> + dev_err(&hci->master.dev, "Attempting to recover from internal errors\n");
> +
> + for (int i = 0; i < 3; i++) {
> + ret = i3c_hci_reset_and_restore(hci);
> + if (!ret)
> + break;
> + dev_err(&hci->master.dev, "Reset and restore failed, error %d\n", ret);
> + /* Just in case the controller is busy, give it some time */
> + msleep(1000);
> + }
> +
> + spin_lock_irq(&hci->lock);
> + hci_dma_error_out_all(hci);
> + hci_dma_unblock_enqueue(hci);
> + hci->recovery_needed = false;
> + spin_unlock_irq(&hci->lock);
> +
> + dev_err(&hci->master.dev, "Recovery %s\n", ret ? "failed!" : "done");
> +}
> +
> static bool hci_dma_dequeue_xfer(struct i3c_hci *hci,
> struct hci_xfer *xfer_list, int n)
> {
> @@ -640,6 +701,17 @@ static bool hci_dma_dequeue_xfer(struct i3c_hci *hci,
>
> ring_status = rh_reg_read(RING_STATUS);
> if (ring_status & RING_STATUS_RUNNING) {
> + /*
> + * The transfer may have already completed, especially
> + * if recovery has just run. Do nothing in that case.
> + */
> + hci_dma_xfer_done(hci, rh);
> + if (xfer_list->final_xfer->ring_entry < 0 &&
> + !hci->recovery_needed && !hci->enqueue_blocked &&
> + ring_status == (RING_STATUS_ENABLED | RING_STATUS_RUNNING)) {
> + spin_unlock_irq(&hci->lock);
> + return false;
> + }
> hci->enqueue_blocked = true;
> spin_unlock_irq(&hci->lock);
> /* stop the ring */
> @@ -647,12 +719,8 @@ static bool hci_dma_dequeue_xfer(struct i3c_hci *hci,
> spin_lock_irq(&hci->lock);
> ring_status = rh_reg_read(RING_STATUS);
> if (ring_status & RING_STATUS_RUNNING) {
> - /*
> - * We're deep in it if ever this condition is ever met.
> - * Hardware might still be writing to memory, etc.
> - */
> - dev_crit(&hci->master.dev, "unable to abort the ring\n");
> - WARN_ON(1);
> + dev_err(&hci->master.dev, "Unable to abort the DMA ring\n");
> + hci->recovery_needed = true;
> }
> }
>
> @@ -662,6 +730,13 @@ static bool hci_dma_dequeue_xfer(struct i3c_hci *hci,
>
> hci_dma_xfer_done(hci, rh);
>
> + if (hci->recovery_needed) {
> + hci->enqueue_blocked = true;
> + spin_unlock_irq(&hci->lock);
> + hci_dma_recovery(hci);
> + return true;
> + }
> +
> for (i = 0; i < n; i++) {
> struct hci_xfer *xfer = xfer_list + i;
> int idx = xfer->ring_entry;
> diff --git a/drivers/i3c/master/mipi-i3c-hci/hci.h b/drivers/i3c/master/mipi-i3c-hci/hci.h
> index a3151c26827e..4bf2c66c97b4 100644
> --- a/drivers/i3c/master/mipi-i3c-hci/hci.h
> +++ b/drivers/i3c/master/mipi-i3c-hci/hci.h
> @@ -55,6 +55,7 @@ struct i3c_hci {
> atomic_t next_cmd_tid;
> bool irq_inactive;
> bool enqueue_blocked;
> + bool recovery_needed;
> wait_queue_head_t enqueue_wait_queue;
> u32 caps;
> unsigned int quirks;
> --
> 2.51.0
>