[PATCH v3 10/13] spi: cadence-quadspi: enable PHY for direct reads and indirect writes

From: Santhosh Kumar K

Date: Wed May 27 2026 - 13:58:45 EST


After PHY tuning completes, data transfers still use the default
read-capture path. The PHY pipeline must be activated around each
eligible transfer to benefit from the calibrated delay settings.

Add cqspi_phy_enable() to toggle PHY mode. Enabling sets the calibrated
read-capture delay, asserts PHY_EN and PHY_PIPELINE, and decrements the
dummy cycle count by one since the PHY pipeline absorbs that latency.
Disabling reverses all three. Returns cqspi_wait_idle() so callers can
abort if the controller stalls on enable; disable is best-effort.

Split cqspi_direct_read_execute() so PHY-eligible reads run DMA over the
16-byte-aligned middle section with PHY active, while unaligned head and
tail bytes are transferred without PHY. PHY is used when use_phy is set,
the transfer exceeds 16 bytes, and the frequency matches the tuned rate.
cqspi_memcpy_fromio() handles small and non-DMA-able transfers, with
special handling for 8D-8D-8D to ensure 2-byte-aligned I/O accesses.

For indirect writes, PHY is enabled for transfers of at least 1 KB
where the setup overhead is amortized.

Signed-off-by: Santhosh Kumar K <s-k6@xxxxxx>
---
drivers/spi/spi-cadence-quadspi.c | 181 ++++++++++++++++++++++++++++--
1 file changed, 171 insertions(+), 10 deletions(-)

diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
index 72208d376305..80e7c572ab80 100644
--- a/drivers/spi/spi-cadence-quadspi.c
+++ b/drivers/spi/spi-cadence-quadspi.c
@@ -564,6 +564,61 @@ static void cqspi_readdata_capture(struct cqspi_st *cqspi, const bool bypass,
writel(reg, reg_base + CQSPI_REG_READCAPTURE);
}

+static int cqspi_phy_enable(struct cqspi_flash_pdata *f_pdata, bool enable)
+{
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *reg_base = cqspi->iobase;
+ u32 reg;
+ u8 dummy;
+
+ if (enable) {
+ cqspi_readdata_capture(cqspi, true, f_pdata->use_dqs,
+ f_pdata->phy_setting.read_delay);
+
+ reg = readl(reg_base + CQSPI_REG_CONFIG);
+ reg |= CQSPI_REG_CONFIG_PHY_EN | CQSPI_REG_CONFIG_PHY_PIPELINE;
+ writel(reg, reg_base + CQSPI_REG_CONFIG);
+
+ /*
+ * The PHY data-capture pipeline absorbs one dummy cycle's
+ * worth of latency; reduce the count to avoid over-compensation.
+ */
+ reg = readl(reg_base + CQSPI_REG_RD_INSTR);
+ dummy = FIELD_GET(CQSPI_REG_RD_INSTR_DUMMY_MASK
+ << CQSPI_REG_RD_INSTR_DUMMY_LSB,
+ reg);
+ dummy--;
+ reg &= ~(CQSPI_REG_RD_INSTR_DUMMY_MASK
+ << CQSPI_REG_RD_INSTR_DUMMY_LSB);
+ reg |= FIELD_PREP(CQSPI_REG_RD_INSTR_DUMMY_MASK
+ << CQSPI_REG_RD_INSTR_DUMMY_LSB,
+ dummy);
+ writel(reg, reg_base + CQSPI_REG_RD_INSTR);
+ } else {
+ cqspi_readdata_capture(cqspi, !cqspi->rclk_en, false,
+ f_pdata->read_delay);
+
+ reg = readl(reg_base + CQSPI_REG_CONFIG);
+ reg &= ~(CQSPI_REG_CONFIG_PHY_EN |
+ CQSPI_REG_CONFIG_PHY_PIPELINE);
+ writel(reg, reg_base + CQSPI_REG_CONFIG);
+
+ reg = readl(reg_base + CQSPI_REG_RD_INSTR);
+ dummy = FIELD_GET(CQSPI_REG_RD_INSTR_DUMMY_MASK
+ << CQSPI_REG_RD_INSTR_DUMMY_LSB,
+ reg);
+ dummy++;
+ reg &= ~(CQSPI_REG_RD_INSTR_DUMMY_MASK
+ << CQSPI_REG_RD_INSTR_DUMMY_LSB);
+ reg |= FIELD_PREP(CQSPI_REG_RD_INSTR_DUMMY_MASK
+ << CQSPI_REG_RD_INSTR_DUMMY_LSB,
+ dummy);
+ writel(reg, reg_base + CQSPI_REG_RD_INSTR);
+ }
+
+ return cqspi_wait_idle(cqspi);
+}
+
static int cqspi_exec_flash_cmd(struct cqspi_st *cqspi, unsigned int reg)
{
void __iomem *reg_base = cqspi->iobase;
@@ -1191,6 +1246,7 @@ static int cqspi_indirect_write_execute(struct cqspi_flash_pdata *f_pdata,
void __iomem *reg_base = cqspi->iobase;
unsigned int remaining = n_tx;
unsigned int write_bytes;
+ bool use_phy_write;
int ret;

if (!refcount_read(&cqspi->refcount))
@@ -1226,6 +1282,15 @@ static int cqspi_indirect_write_execute(struct cqspi_flash_pdata *f_pdata,
if (cqspi->apb_ahb_hazard)
readl(reg_base + CQSPI_REG_INDIRECTWR);

+ /* Use PHY only for large writes where setup overhead is amortized */
+ use_phy_write = n_tx >= SZ_1K && f_pdata->use_phy;
+
+ if (use_phy_write) {
+ ret = cqspi_phy_enable(f_pdata, true);
+ if (ret)
+ goto failwr;
+ }
+
while (remaining > 0) {
size_t write_words, mod_bytes;

@@ -1266,6 +1331,9 @@ static int cqspi_indirect_write_execute(struct cqspi_flash_pdata *f_pdata,
goto failwr;
}

+ if (use_phy_write)
+ cqspi_phy_enable(f_pdata, false);
+
/* Disable interrupt. */
writel(0, reg_base + CQSPI_REG_IRQMASK);

@@ -1277,6 +1345,9 @@ static int cqspi_indirect_write_execute(struct cqspi_flash_pdata *f_pdata,
return 0;

failwr:
+ if (use_phy_write)
+ cqspi_phy_enable(f_pdata, false);
+
/* Disable interrupt. */
writel(0, reg_base + CQSPI_REG_IRQMASK);

@@ -1448,8 +1519,15 @@ static void cqspi_rx_dma_callback(void *param)
complete(&cqspi->rx_dma_complete);
}

-static int cqspi_direct_read_execute(struct cqspi_flash_pdata *f_pdata,
- u_char *buf, loff_t from, size_t len)
+static bool cqspi_use_phy(struct cqspi_flash_pdata *f_pdata,
+ const struct spi_mem_op *op)
+{
+ return f_pdata->use_phy && op->data.nbytes > 16 &&
+ op->max_freq == f_pdata->max_clk_rate;
+}
+
+static int cqspi_direct_read_dma(struct cqspi_flash_pdata *f_pdata, u_char *buf,
+ loff_t from, size_t len)
{
struct cqspi_st *cqspi = f_pdata->cqspi;
struct device *dev = &cqspi->pdev->dev;
@@ -1461,19 +1539,14 @@ static int cqspi_direct_read_execute(struct cqspi_flash_pdata *f_pdata,
dma_addr_t dma_dst;
struct device *ddev;

- if (!cqspi->rx_chan || !virt_addr_valid(buf)) {
- memcpy_fromio(buf, cqspi->ahb_base + from, len);
- return 0;
- }
-
ddev = cqspi->rx_chan->device->dev;
dma_dst = dma_map_single(ddev, buf, len, DMA_FROM_DEVICE);
if (dma_mapping_error(ddev, dma_dst)) {
dev_err(dev, "dma mapping failed\n");
return -ENOMEM;
}
- tx = dmaengine_prep_dma_memcpy(cqspi->rx_chan, dma_dst, dma_src,
- len, flags);
+ tx = dmaengine_prep_dma_memcpy(cqspi->rx_chan, dma_dst, dma_src, len,
+ flags);
if (!tx) {
dev_err(dev, "device_prep_dma_memcpy error\n");
ret = -EIO;
@@ -1507,6 +1580,94 @@ static int cqspi_direct_read_execute(struct cqspi_flash_pdata *f_pdata,
return ret;
}

+static void cqspi_memcpy_fromio(const struct spi_mem_op *op, void *to,
+ const void __iomem *from, size_t count)
+{
+ if (op->data.buswidth == 8 && op->data.dtr) {
+ unsigned long from_addr = (unsigned long)from;
+
+ /* Handle unaligned start with 2-byte read */
+ if (count && !IS_ALIGNED(from_addr, 4)) {
+ *(u16 *)to = __raw_readw(from);
+ from += 2;
+ to += 2;
+ count -= 2;
+ }
+
+ /* Use 4-byte reads for aligned bulk (no readq for 32-bit) */
+ if (count >= 4) {
+ size_t len = round_down(count, 4);
+
+ memcpy_fromio(to, from, len);
+ from += len;
+ to += len;
+ count -= len;
+ }
+
+ /* Handle remaining 2 bytes */
+ if (count)
+ *(u16 *)to = __raw_readw(from);
+
+ return;
+ }
+
+ memcpy_fromio(to, from, count);
+}
+
+static int cqspi_direct_read_execute(struct cqspi_flash_pdata *f_pdata,
+ const struct spi_mem_op *op)
+{
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ loff_t from = op->addr.val;
+ loff_t from_aligned, to_aligned;
+ size_t len = op->data.nbytes;
+ size_t len_aligned;
+ u_char *buf = op->data.buf.in;
+ int ret;
+
+ if (!cqspi->rx_chan || !virt_addr_valid(buf) || len <= 16) {
+ cqspi_memcpy_fromio(op, buf, cqspi->ahb_base + from, len);
+ return 0;
+ }
+
+ if (!cqspi_use_phy(f_pdata, op))
+ return cqspi_direct_read_dma(f_pdata, buf, from, len);
+
+ /* Split into unaligned head, aligned middle, unaligned tail */
+ from_aligned = ALIGN(from, 16);
+ to_aligned = ALIGN_DOWN(from + len, 16);
+ len_aligned = to_aligned - from_aligned;
+
+ if (from != from_aligned) {
+ ret = cqspi_direct_read_dma(f_pdata, buf, from,
+ from_aligned - from);
+ if (ret)
+ return ret;
+ buf += from_aligned - from;
+ }
+
+ if (len_aligned) {
+ ret = cqspi_phy_enable(f_pdata, true);
+ if (ret)
+ return ret;
+ ret = cqspi_direct_read_dma(f_pdata, buf, from_aligned,
+ len_aligned);
+ cqspi_phy_enable(f_pdata, false);
+ if (ret)
+ return ret;
+ buf += len_aligned;
+ }
+
+ if (to_aligned != (from + len)) {
+ ret = cqspi_direct_read_dma(f_pdata, buf, to_aligned,
+ (from + len) - to_aligned);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
static ssize_t cqspi_read(struct cqspi_flash_pdata *f_pdata,
const struct spi_mem_op *op)
{
@@ -1524,7 +1685,7 @@ static ssize_t cqspi_read(struct cqspi_flash_pdata *f_pdata,

if ((cqspi->use_direct_mode && ((from + len) <= cqspi->ahb_size)) ||
(cqspi->ddata && cqspi->ddata->quirks & CQSPI_NO_INDIRECT_MODE))
- return cqspi_direct_read_execute(f_pdata, buf, from, len);
+ return cqspi_direct_read_execute(f_pdata, op);

if (cqspi->use_dma_read && ddata && ddata->indirect_read_dma &&
virt_addr_valid(buf) && ((dma_align & CQSPI_DMA_UNALIGN) == 0))
--
2.34.1