Re: [PATCH v20 02/12] Add infrastructure for copy offload in block and request layer.

From: Damien Le Moal
Date: Mon May 20 2024 - 11:00:27 EST


On 2024/05/20 12:20, Nitesh Shetty wrote:
> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
> Since copy is a composite operation involving src and dst sectors/lba,
> each needs to be represented by a separate bio to make it compatible
> with device mapper.

Why ? The beginning of the sentence isn't justification enough for the two new
operation codes ? The 2 sentences should be reversed for easier reading:
justification first naturally leads to the reader understanding why the codes
are needed.

Also: s/opcode/operations


> We expect caller to take a plug and send bio with destination information,
> followed by bio with source information.

expect ? Plugging is optional. Does copy offload require it ? Please clarify this.

> Once the dst bio arrives we form a request and wait for source

arrives ? You mean "is submitted" ?

s/and wait for/and wait for the

> bio. Upon arrival of source bio we merge these two bio's and send

s/arrival/submission ?

s/of/of the
s/bio's/BIOs
s/and send/and send the
s/down to/down to the

> corresponding request down to device driver.
> Merging non copy offload bio is avoided by checking for copy specific
> opcodes in merge function.

Super unclear... What are you trying to say here ? That merging copy offload
BIOs with other BIOs is not allowed ? That is already handled. Only BIOs &
requests with the same operation can be merged. The code below also suggests
that you allow merging copy offloads... So I really do not understand this.

>
> Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx>
> Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx>
> ---
> block/blk-core.c | 7 +++++++
> block/blk-merge.c | 41 +++++++++++++++++++++++++++++++++++++++
> block/blk.h | 16 +++++++++++++++
> block/elevator.h | 1 +
> include/linux/bio.h | 6 +-----
> include/linux/blk_types.h | 10 ++++++++++
> 6 files changed, 76 insertions(+), 5 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index ea44b13af9af..f18ee5f709c0 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -122,6 +122,8 @@ static const char *const blk_op_name[] = {
> REQ_OP_NAME(ZONE_FINISH),
> REQ_OP_NAME(ZONE_APPEND),
> REQ_OP_NAME(WRITE_ZEROES),
> + REQ_OP_NAME(COPY_SRC),
> + REQ_OP_NAME(COPY_DST),
> REQ_OP_NAME(DRV_IN),
> REQ_OP_NAME(DRV_OUT),
> };
> @@ -838,6 +840,11 @@ void submit_bio_noacct(struct bio *bio)
> * requests.
> */
> fallthrough;
> + case REQ_OP_COPY_SRC:
> + case REQ_OP_COPY_DST:
> + if (!q->limits.max_copy_sectors)
> + goto not_supported;
> + break;
> default:
> goto not_supported;
> }
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 8534c35e0497..f8dc48a03379 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -154,6 +154,20 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
> return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
> }
>
> +static struct bio *bio_split_copy(struct bio *bio,
> + const struct queue_limits *lim,
> + unsigned int *nsegs)
> +{
> + *nsegs = 1;
> + if (bio_sectors(bio) <= lim->max_copy_sectors)
> + return NULL;
> + /*
> + * We don't support splitting for a copy bio. End it with EIO if
> + * splitting is required and return an error pointer.
> + */
> + return ERR_PTR(-EIO);
> +}

Hmm... Why not check that the copy request is small enough and will not be split
when it is submitted ? Something like blk_check_zone_append() does with
REQ_OP_ZONE_APPEND ? So adding a blk_check_copy_offload(). That would also
include the limits check from the previous hunk.

> +
> /*
> * Return the maximum number of sectors from the start of a bio that may be
> * submitted as a single request to a block device. If enough sectors remain,
> @@ -362,6 +376,12 @@ struct bio *__bio_split_to_limits(struct bio *bio,
> case REQ_OP_WRITE_ZEROES:
> split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
> break;
> + case REQ_OP_COPY_SRC:
> + case REQ_OP_COPY_DST:
> + split = bio_split_copy(bio, lim, nr_segs);
> + if (IS_ERR(split))
> + return NULL;
> + break;

See above.

> default:
> split = bio_split_rw(bio, lim, nr_segs, bs,
> get_max_io_size(bio, lim) << SECTOR_SHIFT);
> @@ -925,6 +945,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
> if (!rq_mergeable(rq) || !bio_mergeable(bio))
> return false;
>
> + if (blk_copy_offload_mergable(rq, bio))
> + return true;
> +
> if (req_op(rq) != bio_op(bio))
> return false;
>
> @@ -958,6 +981,8 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
> {
> if (blk_discard_mergable(rq))
> return ELEVATOR_DISCARD_MERGE;
> + else if (blk_copy_offload_mergable(rq, bio))
> + return ELEVATOR_COPY_OFFLOAD_MERGE;
> else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
> return ELEVATOR_BACK_MERGE;
> else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
> @@ -1065,6 +1090,20 @@ static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
> return BIO_MERGE_FAILED;
> }
>
> +static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
> + struct bio *bio)
> +{
> + if (req->__data_len != bio->bi_iter.bi_size)
> + return BIO_MERGE_FAILED;
> +
> + req->biotail->bi_next = bio;
> + req->biotail = bio;
> + req->nr_phys_segments++;
> + req->__data_len += bio->bi_iter.bi_size;

Arg... You seem to be assuming that the source BIO always comes right after the
destination request... What if copy offloads are being concurrently issued ?
Shouldn't you check somehow that the pair is a match ? Or are you relying on the
per-context plugging which prevents that from happening in the first place ? But
that would assumes that you never ever sleep trying to allocate the source BIO
after the destination BIO/request are prepared and plugged.

> +
> + return BIO_MERGE_OK;
> +}
> +
> static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
> struct request *rq,
> struct bio *bio,
> @@ -1085,6 +1124,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
> break;
> case ELEVATOR_DISCARD_MERGE:
> return bio_attempt_discard_merge(q, rq, bio);
> + case ELEVATOR_COPY_OFFLOAD_MERGE:
> + return bio_attempt_copy_offload_merge(rq, bio);
> default:
> return BIO_MERGE_NONE;
> }
> diff --git a/block/blk.h b/block/blk.h
> index 189bc25beb50..6528a2779b84 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -174,6 +174,20 @@ static inline bool blk_discard_mergable(struct request *req)
> return false;
> }
>
> +/*
> + * Copy offload sends a pair of bio with REQ_OP_COPY_DST and REQ_OP_COPY_SRC
> + * operation by taking a plug.
> + * Initially DST bio is sent which forms a request and
> + * waits for SRC bio to arrive. Once SRC bio arrives
> + * we merge it and send request down to driver.
> + */
> +static inline bool blk_copy_offload_mergable(struct request *req,
> + struct bio *bio)
> +{
> + return (req_op(req) == REQ_OP_COPY_DST &&
> + bio_op(bio) == REQ_OP_COPY_SRC);
> +}

This function is really not needed at all (used in one place only).

> +
> static inline unsigned int blk_rq_get_max_segments(struct request *rq)
> {
> if (req_op(rq) == REQ_OP_DISCARD)
> @@ -323,6 +337,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
> case REQ_OP_DISCARD:
> case REQ_OP_SECURE_ERASE:
> case REQ_OP_WRITE_ZEROES:
> + case REQ_OP_COPY_SRC:
> + case REQ_OP_COPY_DST:
> return true; /* non-trivial splitting decisions */

See above. Limits should be checked on submission.

> default:
> break;
> diff --git a/block/elevator.h b/block/elevator.h
> index e9a050a96e53..c7a45c1f4156 100644
> --- a/block/elevator.h
> +++ b/block/elevator.h
> @@ -18,6 +18,7 @@ enum elv_merge {
> ELEVATOR_FRONT_MERGE = 1,
> ELEVATOR_BACK_MERGE = 2,
> ELEVATOR_DISCARD_MERGE = 3,
> + ELEVATOR_COPY_OFFLOAD_MERGE = 4,
> };
>
> struct blk_mq_alloc_data;
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index d5379548d684..528ef22dd65b 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -53,11 +53,7 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs)
> */
> static inline bool bio_has_data(struct bio *bio)
> {
> - if (bio &&
> - bio->bi_iter.bi_size &&
> - bio_op(bio) != REQ_OP_DISCARD &&
> - bio_op(bio) != REQ_OP_SECURE_ERASE &&
> - bio_op(bio) != REQ_OP_WRITE_ZEROES)
> + if (bio && (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE))
> return true;

This change seems completely broken and out of place. This would cause a return
of false for zone append operations.

>
> return false;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 781c4500491b..7f692bade271 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -342,6 +342,10 @@ enum req_op {
> /* reset all the zone present on the device */
> REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)15,
>
> + /* copy offload src and dst operation */

s/src/source
s/dst/destination
s/operation/operations

> + REQ_OP_COPY_SRC = (__force blk_opf_t)18,
> + REQ_OP_COPY_DST = (__force blk_opf_t)19,
> +
> /* Driver private requests */
> REQ_OP_DRV_IN = (__force blk_opf_t)34,
> REQ_OP_DRV_OUT = (__force blk_opf_t)35,
> @@ -430,6 +434,12 @@ static inline bool op_is_write(blk_opf_t op)
> return !!(op & (__force blk_opf_t)1);
> }
>
> +static inline bool op_is_copy(blk_opf_t op)
> +{
> + return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
> + (op & REQ_OP_MASK) == REQ_OP_COPY_DST);
> +}

May be use a switch here to avoid the double masking of op ?

> +
> /*
> * Check if the bio or request is one that needs special treatment in the
> * flush state machine.

--
Damien Le Moal
Western Digital Research