Re: [PATCH] ceph: re-org copy_file_range and fix error handling paths

From: Jeff Layton
Date: Thu Feb 20 2020 - 15:41:19 EST


On Mon, 2020-02-17 at 12:36 +0000, Luis Henriques wrote:
> This patch re-organizes copy_file_range, trying to fix a few issues in
> error handling. Here's the summary:
>
> - Abort copy if initial do_splice_direct() returns fewer bytes than
> requested.
>
> - Move the 'size' initialization (with i_size_read()) further down in the
> code, after the initial call to do_splice_direct(). This avoids issues
> with a possibly stale value if a manual copy is done.
>
> - Move the object copy loop into a separate function. This makes it
> easier to handle errors (e.g, dirtying caps and updating the MDS
> metadata if only some objects have been copied before an error has
> occurred).
>
> - Added calls to ceph_oloc_destroy() to avoid leaking memory with src_oloc
> and dst_oloc
>
> - After the object copy loop, the new file size to be reported to the MDS
> (if there's file size change) is now the actual file size, and not the
> size after an eventual extra manual copy.
>
> - Added a few dout() to show the number of bytes copied in the two manual
> copies and in the object copy loop.
>
> Signed-off-by: Luis Henriques <lhenriques@xxxxxxxx>
> ---
> Hi!
>
> Initially I was going to have this patch split in a series, but then I
> decided not to do that as this big patch allows (IMO) to better see the
> whole picture. But please let me know if you think otherwise and I can
> split it in a few smaller patches.
>
> I tried to cover all the issues that have been pointed out by Ilya, but I
> may have missed something or, more likely, introduced new bugs ;-)
>
> Cheers,
> --
> Luis
>

Sorry for the delay in review!

> fs/ceph/file.c | 169 ++++++++++++++++++++++++++++---------------------
> 1 file changed, 96 insertions(+), 73 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index c3b8e8e0bf17..4d90a275f9a5 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1931,6 +1931,71 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
> return 0;
> }
>
> +static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
> + struct ceph_inode_info *dst_ci, u64 *dst_off,
> + struct ceph_fs_client *fsc,
> + size_t len, unsigned int flags)
> +{
> + struct ceph_object_locator src_oloc, dst_oloc;
> + struct ceph_object_id src_oid, dst_oid;
> + size_t bytes = 0;
> + u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
> + u32 src_objlen, dst_objlen;
> + u32 object_size = src_ci->i_layout.object_size;
> + int ret;
> +
> + src_oloc.pool = src_ci->i_layout.pool_id;
> + src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
> + dst_oloc.pool = dst_ci->i_layout.pool_id;
> + dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
> +
> + while (len >= object_size) {
> + ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
> + object_size, &src_objnum,
> + &src_objoff, &src_objlen);
> + ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
> + object_size, &dst_objnum,
> + &dst_objoff, &dst_objlen);
> + ceph_oid_init(&src_oid);
> + ceph_oid_printf(&src_oid, "%llx.%08llx",
> + src_ci->i_vino.ino, src_objnum);
> + ceph_oid_init(&dst_oid);
> + ceph_oid_printf(&dst_oid, "%llx.%08llx",
> + dst_ci->i_vino.ino, dst_objnum);
> + /* Do an object remote copy */
> + ret = ceph_osdc_copy_from(&fsc->client->osdc,
> + src_ci->i_vino.snap, 0,
> + &src_oid, &src_oloc,
> + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
> + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
> + &dst_oid, &dst_oloc,
> + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
> + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
> + dst_ci->i_truncate_seq,
> + dst_ci->i_truncate_size,
> + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
> + if (ret) {
> + if (ret == -EOPNOTSUPP) {
> + fsc->have_copy_from2 = false;
> + pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
> + }
> + dout("ceph_osdc_copy_from returned %d\n", ret);
> + if (!bytes)
> + bytes = ret;
> + goto out;
> + }
> + len -= object_size;
> + bytes += object_size;
> + *src_off += object_size;
> + *dst_off += object_size;
> + }
> +
> +out:
> + ceph_oloc_destroy(&src_oloc);
> + ceph_oloc_destroy(&dst_oloc);
> + return bytes;
> +}
> +
> static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
> struct file *dst_file, loff_t dst_off,
> size_t len, unsigned int flags)
> @@ -1941,14 +2006,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
> struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
> struct ceph_cap_flush *prealloc_cf;
> struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
> - struct ceph_object_locator src_oloc, dst_oloc;
> - struct ceph_object_id src_oid, dst_oid;
> - loff_t endoff = 0, size;
> - ssize_t ret = -EIO;
> + loff_t size;
> + ssize_t ret = -EIO, bytes;
> u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
> - u32 src_objlen, dst_objlen, object_size;
> + u32 src_objlen, dst_objlen;
> int src_got = 0, dst_got = 0, err, dirty;
> - bool do_final_copy = false;
>
> if (src_inode->i_sb != dst_inode->i_sb) {
> struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
> @@ -2026,22 +2088,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
> if (ret < 0)
> goto out_caps;
>
> - size = i_size_read(dst_inode);
> - endoff = dst_off + len;
> -
> /* Drop dst file cached pages */
> ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
> dst_off >> PAGE_SHIFT,
> - endoff >> PAGE_SHIFT);
> + (dst_off + len) >> PAGE_SHIFT);
> if (ret < 0) {
> dout("Failed to invalidate inode pages (%zd)\n", ret);
> ret = 0; /* XXX */
> }
> - src_oloc.pool = src_ci->i_layout.pool_id;
> - src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
> - dst_oloc.pool = dst_ci->i_layout.pool_id;
> - dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
> -
> ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
> src_ci->i_layout.object_size,
> &src_objnum, &src_objoff, &src_objlen);
> @@ -2060,6 +2114,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
> * starting at the src_off
> */
> if (src_objoff) {
> + dout("Initial partial copy of %u bytes\n", src_objlen);
> +
> /*
> * we need to temporarily drop all caps as we'll be calling
> * {read,write}_iter, which will get caps again.
> @@ -2067,8 +2123,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
> put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
> ret = do_splice_direct(src_file, &src_off, dst_file,
> &dst_off, src_objlen, flags);
> - if (ret < 0) {
> - dout("do_splice_direct returned %d\n", err);
> + /* Abort on short copies or on error */
> + if (ret < src_objlen) {
> + dout("Failed partial copy (%zd)\n", ret);
> goto out;
> }
> len -= ret;
> @@ -2081,62 +2138,29 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
> if (err < 0)
> goto out_caps;
> }
> - object_size = src_ci->i_layout.object_size;
> - while (len >= object_size) {
> - ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
> - object_size, &src_objnum,
> - &src_objoff, &src_objlen);
> - ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
> - object_size, &dst_objnum,
> - &dst_objoff, &dst_objlen);
> - ceph_oid_init(&src_oid);
> - ceph_oid_printf(&src_oid, "%llx.%08llx",
> - src_ci->i_vino.ino, src_objnum);
> - ceph_oid_init(&dst_oid);
> - ceph_oid_printf(&dst_oid, "%llx.%08llx",
> - dst_ci->i_vino.ino, dst_objnum);
> - /* Do an object remote copy */
> - err = ceph_osdc_copy_from(
> - &src_fsc->client->osdc,
> - src_ci->i_vino.snap, 0,
> - &src_oid, &src_oloc,
> - CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
> - CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
> - &dst_oid, &dst_oloc,
> - CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
> - CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
> - dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
> - CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
> - if (err) {
> - if (err == -EOPNOTSUPP) {
> - src_fsc->have_copy_from2 = false;
> - pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
> - }
> - dout("ceph_osdc_copy_from returned %d\n", err);
> - if (!ret)
> - ret = err;
> - goto out_caps;
> - }
> - len -= object_size;
> - src_off += object_size;
> - dst_off += object_size;
> - ret += object_size;
> - }
>
> - if (len)
> - /* We still need one final local copy */
> - do_final_copy = true;
> + size = i_size_read(dst_inode);
> + bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
> + src_fsc, len, flags);
> + if (bytes <= 0) {
> + if (!ret)
> + ret = bytes;
> + goto out_caps;
> + }

Suppose we did the front part with do_splice_direct (ret > 0), but then
ceph_do_objects_copy fails (bytes < 0). We "goto out_caps" but then...

[...]

> @@ -2152,15 +2176,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
> out_caps:
> put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
>
> - if (do_final_copy) {
> - err = do_splice_direct(src_file, &src_off, dst_file,
> - &dst_off, len, flags);
> - if (err < 0) {
> - dout("do_splice_direct returned %d\n", err);
> - goto out;
> - }
> - len -= err;
> - ret += err;
> + if (len && (ret >= 0)) {

...len is still positive and we do_splice_direct again (probably at the
wrong offset?), instead of just returning a short copy. I think we
probably want to just stop any copying if it fails at any point along
the way, right?

> + dout("Final partial copy of %zu bytes\n", len);
> + bytes = do_splice_direct(src_file, &src_off, dst_file,
> + &dst_off, len, flags);
> + if (bytes > 0)
> + ret += bytes;
> + else
> + dout("Failed partial copy (%zd)\n", bytes);
> }
>
> out:

--
Jeff Layton <jlayton@xxxxxxxxxx>