[PATCH] dmaengine: add DMA_MEMCPY_SG transaction type

From: Shivank Garg

Date: Sun May 17 2026 - 14:19:10 EST

Currently, a client that wants to copy N (src, dst, len) tuples calls
dmaengine_prep_dma_memcpy() N times, allocating N independent
dma_async_tx_descriptors. The provider has to treat them as independent
transactions, even when the underlying hardware can program them as
one descriptor group with a single completion. These overheads
are dominant for small page sizes such as 4KB.

To reduce these overheads, add DMA_MEMCPY_SG and the matching
device_prep_dma_memcpy_sg() callback taking paired src and dst
scatterlists, plus the dmaengine_prep_dma_memcpy_sg() inline wrapper.
The provider walks both lists in lockstep and retires the whole batch
as one async tx.

This API was removed by commit 0cae04373b77 ("dmaengine: remove
DMA_MEMCPY_SG once again") for lack of in-tree users. The user
this time is the page-migration copy offload driver.

Suggested-by: Vinod Koul <vkoul@xxxxxxxxxx>
Signed-off-by: Shivank Garg <shivankg@xxxxxxx>
---
.../driver-api/dmaengine/provider.rst | 7 ++++
drivers/dma/dmaengine.c | 1 +
include/linux/dmaengine.h | 36 +++++++++++++++++++
3 files changed, 44 insertions(+)

diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst
index f4ed98f701c9..fc3cab78e61a 100644
--- a/Documentation/driver-api/dmaengine/provider.rst
+++ b/Documentation/driver-api/dmaengine/provider.rst
@@ -175,6 +175,13 @@ Currently, the types available are:
``glReadPixels()``, which might require a verbatim copy of a huge
framebuffer from local device memory onto host memory.

+- DMA_MEMCPY_SG
+
+ - The device is able to do memory to memory scatter-gather transfers.
+
+ - This takes a pair of src and dst scatterlists and retires the whole batch
+ as one async tx.
+
- DMA_XOR

- The device is able to perform XOR operations on memory areas
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 405bd2fbb4a3..665e00a96c8f 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -1197,6 +1197,7 @@ int dma_async_device_register(struct dma_device *device)
}

CHECK_CAP(dma_memcpy, DMA_MEMCPY);
+ CHECK_CAP(dma_memcpy_sg, DMA_MEMCPY_SG);
CHECK_CAP(dma_xor, DMA_XOR);
CHECK_CAP(dma_xor_val, DMA_XOR_VAL);
CHECK_CAP(dma_pq, DMA_PQ);
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index b3d251c9734e..9378e7a7803a 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -50,6 +50,7 @@ enum dma_status {
*/
enum dma_transaction_type {
DMA_MEMCPY,
+ DMA_MEMCPY_SG,
DMA_XOR,
DMA_PQ,
DMA_XOR_VAL,
@@ -824,6 +825,7 @@ struct dma_filter {
* @device_router_config: optional callback for DMA router configuration
* @device_free_chan_resources: release DMA channel's resources
* @device_prep_dma_memcpy: prepares a memcpy operation
+ * @device_prep_dma_memcpy_sg: prepares a memcpy over src/dst scatterlists
* @device_prep_dma_xor: prepares a xor operation
* @device_prep_dma_xor_val: prepares a xor validation operation
* @device_prep_dma_pq: prepares a pq operation
@@ -903,6 +905,11 @@ struct dma_device {
struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)(
struct dma_chan *chan, dma_addr_t dst, dma_addr_t src,
size_t len, unsigned long flags);
+ struct dma_async_tx_descriptor *(*device_prep_dma_memcpy_sg)(
+ struct dma_chan *chan,
+ struct scatterlist *dst_sg, unsigned int dst_nents,
+ struct scatterlist *src_sg, unsigned int src_nents,
+ unsigned long flags);
struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
unsigned int src_cnt, size_t len, unsigned long flags);
@@ -1091,6 +1098,35 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy(
len, flags);
}

+/**
+ * dmaengine_prep_dma_memcpy_sg - Prepare a scatter-gather memcpy
+ * @chan: The channel to prepare on
+ * @dst_sg: Destination scatterlist
+ * @dst_nents: Number of mapped entries in @dst_sg
+ * @src_sg: Source scatterlist
+ * @src_nents: Number of mapped entries in @src_sg
+ * @flags: Dmaengine flags (e.g. DMA_PREP_INTERRUPT, DMA_CTRL_ACK)
+ *
+ * Submit a batch of memcpy operations described by two scatterlists as a
+ * single async transaction; all segments retire as one tx, one completion.
+ *
+ * Return: the provider callback's return value, or NULL if @chan or
+ * @chan->device is NULL or the device does not implement this operation.
+ */
+static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy_sg(
+ struct dma_chan *chan,
+ struct scatterlist *dst_sg, unsigned int dst_nents,
+ struct scatterlist *src_sg, unsigned int src_nents,
+ unsigned long flags)
+{
+ if (!chan || !chan->device ||
+ !chan->device->device_prep_dma_memcpy_sg)
+ return NULL;
+
+ return chan->device->device_prep_dma_memcpy_sg(chan,
+ dst_sg, dst_nents, src_sg, src_nents, flags);
+}
+
static inline bool dmaengine_is_metadata_mode_supported(struct dma_chan *chan,
enum dma_desc_metadata_mode mode)
{
--
2.43.0