Re: [PATCH 6/7] drivers/migrate_offload: add DMA batch copy driver (dcbm)

From: Karim Manaouil

Date: Fri Jun 19 2026 - 12:13:37 EST

Hi again Shivank,

I just got some time to resume testing this on Intel Sapphire Rapids and
something caught my attention, below

On Tue, Apr 28, 2026 at 03:50:49PM +0000, Shivank Garg wrote:
> +static int submit_dma_transfers(struct dma_work *work)
> +{
> + struct scatterlist *sg_src, *sg_dst;
> + struct dma_async_tx_descriptor *tx;
> + unsigned long flags = DMA_CTRL_ACK;
> + dma_cookie_t cookie;
> + int i;
> +
> + atomic_set(&work->pending, 1);
> +
> + sg_src = work->src_sgt->sgl;
> + sg_dst = work->dst_sgt->sgl;
> + for_each_sgtable_dma_sg(work->src_sgt, sg_src, i) {
> + if (i == work->src_sgt->nents - 1)
> + flags |= DMA_PREP_INTERRUPT;
> +
> + tx = dmaengine_prep_dma_memcpy(work->chan,
> + sg_dma_address(sg_dst),
> + sg_dma_address(sg_src),
> + sg_dma_len(sg_src), flags);
> + if (!tx) {
> + atomic_set(&work->pending, 0);
> + return -EIO;
> + }
> +
> + if (i == work->src_sgt->nents - 1) {
> + tx->callback = dma_completion_callback;
> + tx->callback_param = work;
> + }
> +

Here, you are submitting the descriptors one after the other and only
the last descriptor has a callback, which in theory sounds correct as
you expect the DMA engine to complete the descriptors in the same order
they were submitted. However, in reality that's not really gauranteed.

Intel DSA in particular can complete descriptors out of order. That
means, the last descriptor submitted may not necessarily be the last
descriptors that completes. In that case, you will return in
folios_copy_dma() before the copy truly completes for all the folios.

For correctness, we have to add a callback to every descriptor and
initialize work->pending to the number of descriptors submitted then
every time a descriptor completes, you call atomic_dec(&work->pending)
and only complete the completion the moment it reaches zero.

Btw, waiting for an interrupt adds massive scheduling overhead. If we
also add the logic above, it'll get even worse. In my measurements, this
can easily add up to 6ms, by which CPU page copy have easily completed
the entire copy, which again adds to the list of latency concerns I
raised in my other reply.

> + cookie = dmaengine_submit(tx);
> + if (dma_submit_error(cookie)) {
> + atomic_set(&work->pending, 0);
> + return -EIO;
> + }
> + sg_dst = sg_next(sg_dst);
> + }
> + return 0;
> +}
> +
> +/**
> + * folios_copy_dma - copy a batch of folios via DMA memcpy
> + * @dst_list: destination folio list
> + * @src_list: source folio list
> + * @nr_folios: number of folios in each list
> + *
> + * Return: 0 on success, negative errno on failure.
> + */
> +static int folios_copy_dma(struct list_head *dst_list,
> + struct list_head *src_list, unsigned int nr_folios)
> +{
> + struct dma_work *works;
> + struct list_head *src_pos = src_list->next;
> + struct list_head *dst_pos = dst_list->next;
> + int i, folios_per_chan, ret;
> + dma_cap_mask_t mask;
> + int actual_channels = 0;
> + unsigned int max_channels;
> +
> + max_channels = min3(nr_dma_channels, nr_folios,
> + (unsigned int)MAX_DMA_CHANNELS);
> +
> + works = kcalloc(max_channels, sizeof(*works), GFP_KERNEL);
> + if (!works)
> + return -ENOMEM;
> +
> + dma_cap_zero(mask);
> + dma_cap_set(DMA_MEMCPY, mask);
> +
> + for (i = 0; i < max_channels; i++) {
> + works[actual_channels].chan = dma_request_chan_by_mask(&mask);
> + if (IS_ERR(works[actual_channels].chan))
> + break;
> + init_completion(&works[actual_channels].done);
> + actual_channels++;
> + }
> +
> + if (actual_channels == 0) {
> + kfree(works);
> + return -ENODEV;
> + }
> +
> + for (i = 0; i < actual_channels; i++) {
> + folios_per_chan = nr_folios * (i + 1) / actual_channels -
> + (nr_folios * i) / actual_channels;
> + if (folios_per_chan == 0)
> + continue;
> +
> + ret = setup_sg_tables(&works[i], &src_pos, &dst_pos,
> + folios_per_chan);
> + if (ret)
> + goto err_cleanup;
> + }
> +
> + for (i = 0; i < actual_channels; i++) {
> + ret = submit_dma_transfers(&works[i]);
> + if (ret)
> + goto err_cleanup;
> + }
> +
> + for (i = 0; i < actual_channels; i++) {
> + if (atomic_read(&works[i].pending) > 0)
> + dma_async_issue_pending(works[i].chan);
> + }
> +
> + for (i = 0; i < actual_channels; i++) {
> + if (atomic_read(&works[i].pending) == 0)
> + continue;
> + if (!wait_for_completion_timeout(&works[i].done,
> + msecs_to_jiffies(10000))) {
> + ret = -ETIMEDOUT;
> + goto err_cleanup;
> + }
> + }
> +
> + cleanup_dma_work(works, actual_channels);
> +
> + atomic_long_add(nr_folios, &folios_migrated);
> + return 0;
> +
> +err_cleanup:
> + pr_warn_ratelimited("dcbm: DMA copy failed (%d), falling back to CPU\n",
> + ret);
> + cleanup_dma_work(works, actual_channels);
> +
> + atomic_long_add(nr_folios, &folios_failures);
> + return ret;
> +}

--
~karim