Re: [PATCH v20 02/12] Add infrastructure for copy offload in block and request layer.

From: Nitesh Shetty
Date: Thu May 30 2024 - 06:11:21 EST


On 29/05/24 03:41PM, Bart Van Assche wrote:
> On 5/29/24 12:48 AM, Damien Le Moal wrote:
>> On 5/29/24 15:17, Nitesh Shetty wrote:
>>> On 24/05/24 01:33PM, Bart Van Assche wrote:
>>>> On 5/20/24 03:20, Nitesh Shetty wrote:
>>>>> We add two new opcodes, REQ_OP_COPY_DST and REQ_OP_COPY_SRC.
>>>>> Since copy is a composite operation involving src and dst sectors/LBAs,
>>>>> each needs to be represented by a separate bio to make it compatible
>>>>> with device mapper.
>>>>> We expect the caller to take a plug and send a bio with destination
>>>>> information, followed by a bio with source information.
>>>>> Once the dst bio arrives we form a request and wait for the source
>>>>> bio. Upon arrival of the source bio we merge the two bios and send the
>>>>> corresponding request down to the device driver.
>>>>> Merging of non-copy-offload bios is avoided by checking for the copy
>>>>> specific opcodes in the merge function.
>>>>
>>>> In this patch I don't see any changes for blk_attempt_bio_merge(). Does
>>>> this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
>>>> happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?
>>>>
>>> Yes, in this case copy won't work, as both the src and dst bios reach
>>> the driver as part of separate requests.
>>> We will add this as part of the documentation.
>>
>> So that means that 2 major SAS HBAs which set this flag (megaraid and
>> mpt3sas) will not get support for copy offload? Not ideal, by far.
>
> QUEUE_FLAG_NOMERGES can also be set through sysfs (see also
> queue_nomerges_store()). This is one of the reasons why using the merge
> infrastructure for combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC is
> unacceptable.


Bart, Damien, Hannes,
Thanks for your review.
We have tried a slightly modified approach which simplifies this patch
and also avoids the merge path.
With this we should also be able to solve the QUEUE_FLAG_NOMERGES issue.
Previously we also tried a payload/token based approach, which avoids
the merge path and combines the bios in the driver, but we received
feedback that it wasn't the right approach [1].
Do the changes below look any better, or do you have something else in
mind? (A rough sketch of the expected submission flow is included before
the diff, and a driver-side sketch after it, purely for illustration.)

[1] https://lore.kernel.org/linux-block/ZIKphgDavKVPREnw@xxxxxxxxxxxxx/
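
For illustration only (this is not part of the patch; the helper and
end_io names below are made up): a minimal sketch of how a submitter
could drive the two new opcodes, assuming the copy bios carry only
sector/length information and no data pages, and that the caller keeps
the length within q->limits.max_copy_sectors.

#include <linux/bio.h>
#include <linux/blkdev.h>

static void example_copy_end_io(struct bio *bio)
{
	/* A real caller would propagate bio->bi_status before freeing. */
	bio_put(bio);
}

static int example_submit_copy(struct block_device *bdev, sector_t dst_sector,
			       sector_t src_sector, unsigned int nr_sectors)
{
	struct bio *dst_bio, *src_bio;
	struct blk_plug plug;

	/* nr_sectors must not exceed q->limits.max_copy_sectors. */
	dst_bio = bio_alloc(bdev, 0, REQ_OP_COPY_DST, GFP_KERNEL);
	src_bio = bio_alloc(bdev, 0, REQ_OP_COPY_SRC, GFP_KERNEL);
	if (!dst_bio || !src_bio) {
		if (dst_bio)
			bio_put(dst_bio);
		if (src_bio)
			bio_put(src_bio);
		return -ENOMEM;
	}

	dst_bio->bi_iter.bi_sector = dst_sector;
	dst_bio->bi_iter.bi_size = nr_sectors << SECTOR_SHIFT;
	dst_bio->bi_end_io = example_copy_end_io;

	src_bio->bi_iter.bi_sector = src_sector;
	src_bio->bi_iter.bi_size = nr_sectors << SECTOR_SHIFT;
	src_bio->bi_end_io = example_copy_end_io;

	/*
	 * The DST bio must be submitted first: it forms the request, which
	 * then waits (inside the plug) for the SRC bio to be combined.
	 */
	blk_start_plug(&plug);
	submit_bio(dst_bio);
	submit_bio(src_bio);
	blk_finish_plug(&plug);

	return 0;
}

The important part is the ordering: DST first, SRC second, both inside
the same plug and with matching bi_size, since that is what the pairing
in the diff below keys off.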


diff --git a/block/blk-core.c b/block/blk-core.c
index 82c3ae22d76d..7158bac8cc57 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -122,6 +122,8 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(ZONE_FINISH),
REQ_OP_NAME(ZONE_APPEND),
REQ_OP_NAME(WRITE_ZEROES),
+ REQ_OP_NAME(COPY_SRC),
+ REQ_OP_NAME(COPY_DST),
REQ_OP_NAME(DRV_IN),
REQ_OP_NAME(DRV_OUT),
};
@@ -839,6 +841,11 @@ void submit_bio_noacct(struct bio *bio)
* requests.
*/
fallthrough;
+ case REQ_OP_COPY_SRC:
+ case REQ_OP_COPY_DST:
+ if (!q->limits.max_copy_sectors)
+ goto not_supported;
+ break;
default:
goto not_supported;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8534c35e0497..a651e7c114d0 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -154,6 +154,20 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
}
+static struct bio *bio_split_copy(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned int *nsegs)
+{
+ *nsegs = 1;
+ if (bio_sectors(bio) <= lim->max_copy_sectors)
+ return NULL;
+ /*
+ * We don't support splitting for a copy bio. End it with EIO if
+ * splitting is required and return an error pointer.
+ */
+ return ERR_PTR(-EIO);
+}
+
/*
* Return the maximum number of sectors from the start of a bio that may be
* submitted as a single request to a block device. If enough sectors remain,
@@ -362,6 +376,12 @@ struct bio *__bio_split_to_limits(struct bio *bio,
case REQ_OP_WRITE_ZEROES:
split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
break;
+ case REQ_OP_COPY_SRC:
+ case REQ_OP_COPY_DST:
+ split = bio_split_copy(bio, lim, nr_segs);
+ if (IS_ERR(split))
+ return NULL;
+ break;
default:
split = bio_split_rw(bio, lim, nr_segs, bs,
get_max_io_size(bio, lim) << SECTOR_SHIFT);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3b4df8e5ac9e..6d4ffbdade28 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2833,6 +2833,63 @@ static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
blk_mq_commit_rqs(hctx, queued, false);
}
+/*
+ * Copy offload sends a pair of bios, REQ_OP_COPY_DST and REQ_OP_COPY_SRC,
+ * under a single plug.
+ * The DST bio is sent first and forms a request, which then waits for
+ * the SRC bio to arrive. Once the SRC bio arrives we combine the two
+ * and send the request down to the driver.
+ */
+static inline bool blk_copy_offload_combine_ok(struct request *req,
+ struct bio *bio)
+{
+ return (req_op(req) == REQ_OP_COPY_DST &&
+ bio_op(bio) == REQ_OP_COPY_SRC);
+}
+
+static int blk_copy_offload_combine(struct request *req, struct bio *bio)
+{
+ if (!blk_copy_offload_combine_ok(req, bio))
+ return 1;
+
+ if (req->__data_len != bio->bi_iter.bi_size)
+ return 1;
+
+ req->biotail->bi_next = bio;
+ req->biotail = bio;
+ req->nr_phys_segments++;
+ req->__data_len += bio->bi_iter.bi_size;
+
+ return 0;
+}
+
+static inline bool blk_copy_offload_attempt_combine(struct request_queue *q,
+ struct bio *bio)
+{
+ struct blk_plug *plug = current->plug;
+ struct request *rq;
+
+ if (!plug || rq_list_empty(plug->mq_list))
+ return false;
+
+ rq_list_for_each(&plug->mq_list, rq) {
+ if (rq->q == q) {
+ if (!blk_copy_offload_combine(rq, bio))
+ return true;
+ break;
+ }
+
+ /*
+ * Only keep iterating plug list for combines if we have multiple
+ * queues
+ */
+ if (!plug->multiple_queues)
+ break;
+ }
+ return false;
+}
+
static bool blk_mq_attempt_bio_merge(struct request_queue *q,
struct bio *bio, unsigned int nr_segs)
{
@@ -2977,6 +3034,9 @@ void blk_mq_submit_bio(struct bio *bio)
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
+ if (blk_copy_offload_attempt_combine(q, bio))
+ goto queue_exit;
+
if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
goto queue_exit;
diff --git a/block/blk.h b/block/blk.h
index 189bc25beb50..112c6736f44c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -323,6 +323,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_COPY_SRC:
+ case REQ_OP_COPY_DST:
return true; /* non-trivial splitting decisions */
default:
break;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 781c4500491b..22a08425d13e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -342,6 +342,10 @@ enum req_op {
/* reset all the zone present on the device */
REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)15,
+ /* copy offload source and destination operations */
+ REQ_OP_COPY_SRC = (__force blk_opf_t)18,
+ REQ_OP_COPY_DST = (__force blk_opf_t)19,
+
/* Driver private requests */
REQ_OP_DRV_IN = (__force blk_opf_t)34,
REQ_OP_DRV_OUT = (__force blk_opf_t)35,
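
To illustrate what the combined request looks like once the pairing in
blk_mq_submit_bio() has happened (again not part of the patch; the
function name and the pr_debug placeholder are made up): the DST bio
formed the request, so it sits at rq->bio, and the SRC bio was appended
at rq->biotail by blk_copy_offload_combine(). A hypothetical driver hook
could then pick out the two ranges like this:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/printk.h>

static blk_status_t example_handle_copy_rq(struct request *rq)
{
	struct bio *dst_bio = rq->bio;		/* submitted first, formed the request */
	struct bio *src_bio = rq->biotail;	/* appended by blk_copy_offload_combine() */

	if (req_op(rq) != REQ_OP_COPY_DST || dst_bio == src_bio)
		return BLK_STS_IOERR;	/* SRC bio never got paired with DST */

	/*
	 * A real driver would translate these two LBA ranges into its
	 * native copy command (e.g. NVMe Copy) here.
	 */
	pr_debug("copy %u bytes: sector %llu -> sector %llu\n",
		 src_bio->bi_iter.bi_size,
		 (unsigned long long)src_bio->bi_iter.bi_sector,
		 (unsigned long long)dst_bio->bi_iter.bi_sector);

	return BLK_STS_OK;
}

A lone REQ_OP_COPY_DST or REQ_OP_COPY_SRC request (e.g. when the
submitter did not plug, or the SRC bio never arrived) can still reach
the driver and would have to be failed or completed there.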
--
2.34.1