[PATCH 3/3] ext4: Add support IOC_MOV_DATA ioctl

From: Namjae Jeon
Date: Tue Jul 08 2014 - 08:00:15 EST


This patch implements fs ioctl's IOC_MOV_DATA for Ext4.

The semantics of this ioctl are:
1) Like collapse range, offsets and length should be file system block size
aligned.
2) In the receiver file, atleast length size hole should be present at
receiver_offset
3) It does not change file size of any of donor or receiver file.
4) It leaves a hole at the place from where blocks are moved out in donor file.
5) Both (donor_offset + length) and (receiver_offset + length) should be within
size of donor file and receiver file respectively.
Only unwritten extents resides beyond file size and it does not make sense
to transfer unwritten extents, leave apart the security issues it may raise.
6) If the range to be transfered from donor file contain any holes, they are
replicated as it is in receiver file. It mean holes are preserved and
the length of hole will be added to moved_len signifying that the hole range
is succesfully transfered.

Signed-off-by: Namjae Jeon <namjae.jeon@xxxxxxxxxxx>
Signed-off-by: Ashish Sangwan <a.sangwan@xxxxxxxxxxx>
---
fs/ext4/ext4.h | 2 +
fs/ext4/extents.c | 375 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/file.c | 1 +
3 files changed, 378 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6386c5f..26478eb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2725,6 +2725,8 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
extern int ext4_ext_precache(struct inode *inode);
extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
extern int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
+extern int ext4_mov_data(struct inode *, struct inode *, loff_t, loff_t, loff_t,
+ loff_t *);

/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0c2432e..511db03 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5811,3 +5811,378 @@ out_mutex:
mutex_unlock(&inode->i_mutex);
return ret;
}
+
+/*
+ * If offset_lblk does not lie on the extent start boundary, split extent
+ */
+int ext4_find_and_split_extent_at(struct inode *inode, ext4_lblk_t offset_lblk)
+{
+ struct ext4_ext_path *path;
+ handle_t *handle;
+ int credits, err = 0, split_flag, ex_len;
+ struct ext4_extent *ex;
+ int depth = ext_depth(inode);
+ ext4_lblk_t ex_start;
+
+ path = ext4_ext_find_extent(inode, offset_lblk, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ ex = path[depth].p_ext;
+ if (!ex)
+ goto free_path;
+ ex_start = le32_to_cpu(ex->ee_block);
+ ex_len = ext4_ext_get_actual_len(ex);
+
+ if (offset_lblk > ex_start && offset_lblk < (ex_start + ex_len)) {
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto free_path;
+ }
+ if (ext4_ext_is_unwritten(ex))
+ split_flag = EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2;
+ else
+ split_flag = 0;
+
+ err = ext4_split_extent_at(handle, inode, path, offset_lblk,
+ split_flag, EXT4_EX_NOCACHE |
+ EXT4_GET_BLOCKS_PRE_IO);
+ ext4_journal_stop(handle);
+ }
+
+free_path:
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return err;
+}
+
+/*
+ * Compute the size of hole in terms of filesystem blocks present at offset_lblk
+ * until the next extent is found OR till we reach the last block within isize.
+ * Store the computed value in hole_blkcnt.
+ * offset_lblk should be within isize of inode.
+ */
+int ext4_compute_hole_size(struct inode *inode, ext4_lblk_t offset_lblk,
+ ext4_lblk_t *hole_blkcnt)
+{
+ struct ext4_ext_path *path;
+ struct ext4_extent *ex;
+ ext4_lblk_t ex_start, isize_lblk;
+ int ret = 0, depth, ex_len;
+
+ isize_lblk = (inode->i_size + EXT4_BLOCK_SIZE(inode->i_sb) - 1) >>
+ EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+
+ if (offset_lblk > isize_lblk)
+ return -EINVAL;
+
+ *hole_blkcnt = 0;
+ path = ext4_ext_find_extent(inode, offset_lblk, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ /* No blocks allocated in this file */
+ *hole_blkcnt = isize_lblk - offset_lblk;
+ goto out;
+ }
+ ex_start = le32_to_cpu(ex->ee_block);
+ ex_len = ext4_ext_get_actual_len(ex);
+
+ /* if offset_lblk lies within extent? */
+ if (offset_lblk >= ex_start && offset_lblk < (ex_start + ex_len))
+ goto out;
+
+ if (ex_start < offset_lblk) {
+ ret = mext_next_extent(inode, path, &ex);
+ if (!ret) {
+ ex_start = le32_to_cpu(ex->ee_block);
+ } else {
+ if (ret == 1) {
+ *hole_blkcnt = isize_lblk - offset_lblk;
+ ret = 0;
+ }
+ goto out;
+ }
+ }
+ *hole_blkcnt = (ex_start < isize_lblk) ? (ex_start - offset_lblk) :
+ (isize_lblk - offset_lblk);
+out:
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ return ret;
+}
+
+/*
+ * Remove a complete extent from in memory and on-disk extent tree
+ * without freeing any data blocks covered by the extent. Caller must call
+ * ext4_mark_inode_dirty() to sync the changes to disk.
+ */
+int ext4_ext_rm_extent(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path, struct ext4_extent *ex)
+{
+ struct ext4_extent_header *eh;
+ int depth = ext_depth(inode);
+ int credits, err, correct_index = 0;
+ int ex_ee_len = ext4_ext_get_actual_len(ex);
+
+ if (!path[depth].p_hdr)
+ path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
+ eh = path[depth].p_hdr;
+
+ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
+ if (ex == EXT_FIRST_EXTENT(eh)) {
+ correct_index = 1;
+ credits += (ext_depth(inode)) + 1;
+ }
+ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+ if (err)
+ return err;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ return err;
+
+ ext4_ext_store_pblock(ex, 0);
+ memmove(ex, ex+1,
+ (EXT_LAST_EXTENT(eh) - ex) * sizeof(struct ext4_extent));
+ memset(EXT_LAST_EXTENT(eh), 0, sizeof(struct ext4_extent));
+ le16_add_cpu(&eh->eh_entries, -1);
+
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ return err;
+
+ if (correct_index && eh->eh_entries)
+ err = ext4_ext_correct_indexes(handle, inode, path);
+
+ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
+ err = ext4_ext_rm_idx(handle, inode, path, depth);
+
+ return err;
+}
+
+/*
+ * Move len_lblk amount of blocks from donor inode to receiver inode.
+ * Blocks are to be moved from doffset_lblk and moved to roffset_lblk.
+ * Caller of this function must make sure there is atleast len_lblk size
+ * hole at roffset_lblk. Also doffset_lblk and doffset_lblk + len_lblk
+ * should fall on extent boundary.
+ */
+int ext4_ext_mov_data(struct inode *donor, struct inode *receiver,
+ ext4_lblk_t doffset_lblk, ext4_lblk_t roffset_lblk,
+ ext4_lblk_t len_lblk, loff_t *bytes_moved)
+{
+ int error = 0, depth = ext_depth(donor);
+ struct ext4_ext_path *path;
+ struct ext4_extent *ex;
+ loff_t blocks_moved = 0;
+ handle_t *handle;
+ int credits = ext4_writepage_trans_blocks(donor) +
+ ext4_writepage_trans_blocks(receiver);
+
+ while (blocks_moved < len_lblk && !error) {
+ struct ext4_ext_path *rpath = NULL;
+ ext4_lblk_t ex_start;
+ int ex_len;
+
+ path = ext4_ext_find_extent(donor, doffset_lblk, NULL, 0);
+ if (IS_ERR(path)) {
+ error = PTR_ERR(path);
+ break;
+ }
+ ex = path[depth].p_ext;
+ /*
+ * No allocated blocks? This could only happen during
+ * 1st iteration. Otherwise it is en error.
+ */
+ if (!ex) {
+ if (blocks_moved)
+ error = -EIO;
+ else
+ blocks_moved = len_lblk;
+ goto out;
+ }
+ ex_start = le32_to_cpu(ex->ee_block);
+ ex_len = ext4_ext_get_actual_len(ex);
+
+ if (doffset_lblk != ex_start) {
+ /* Hole within range, move to the next extent */
+ if (ex_start < doffset_lblk)
+ error = mext_next_extent(donor, path, &ex);
+ /* Below if will also handle ex_start > doffset_lblk */
+ if (error == 0) {
+ ex_start = le32_to_cpu(ex->ee_block);
+ blocks_moved += ex_start - doffset_lblk;
+ roffset_lblk += ex_start - doffset_lblk;
+ doffset_lblk = ex_start;
+ }
+ if (error == 1) {
+ /* doffset_lblk till EOF is hole. Success!! */
+ blocks_moved = len_lblk;
+ error = 0;
+ }
+ goto out;
+ }
+
+ /* Add this extent to receiver */
+ handle = ext4_journal_start(donor, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto out;
+ }
+
+ rpath = ext4_ext_find_extent(receiver, roffset_lblk, NULL, 0);
+ if (IS_ERR(rpath)) {
+ error = PTR_ERR(rpath);
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ ex->ee_block = cpu_to_le32(roffset_lblk);
+ error = ext4_ext_insert_extent(handle, receiver, rpath, ex, 0);
+ if (error)
+ goto hout;
+
+ /* Remove this extent from donor */
+ error = ext4_ext_rm_extent(handle, donor, path, ex);
+ if (error)
+ goto hout;
+
+ /* Extent moved successfully */
+ roffset_lblk += ex_len;
+ doffset_lblk += ex_len;
+ blocks_moved += ex_len;
+
+ donor->i_blocks -= (ex_len << (donor->i_blkbits - 9));
+ receiver->i_blocks += (ex_len << (receiver->i_blkbits - 9));
+ donor->i_mtime = donor->i_ctime = ext4_current_time(donor);
+ receiver->i_mtime = receiver->i_ctime =
+ ext4_current_time(receiver);
+ ext4_mark_inode_dirty(handle, donor);
+ ext4_mark_inode_dirty(handle, receiver);
+hout:
+ ext4_journal_stop(handle);
+ ext4_ext_drop_refs(rpath);
+ kfree(rpath);
+out:
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+
+ /* This can happen when (doffset_lblk + len_lblk) is in a hole */
+ if (blocks_moved > len_lblk)
+ blocks_moved = len_lblk;
+
+ *bytes_moved = blocks_moved << EXT4_BLOCK_SIZE_BITS(donor->i_sb);
+ return error;
+}
+
+int ext4_mov_data(struct inode *donor, struct inode *receiver, loff_t doffset,
+ loff_t roffset, loff_t len, loff_t *moved_len)
+{
+ struct super_block *sb = donor->i_sb;
+ loff_t d_pg_off, r_pg_off, pg_len;
+ ext4_lblk_t doffset_lblk, roffset_lblk, len_lblk, hole_size;
+ int error;
+
+ if (doffset & (EXT4_BLOCK_SIZE(sb) - 1) ||
+ roffset & (EXT4_BLOCK_SIZE(sb) - 1) ||
+ len & (EXT4_BLOCK_SIZE(sb) - 1))
+ return -EINVAL;
+
+ if (EXT4_SB(sb)->s_cluster_ratio > 1)
+ return -EOPNOTSUPP;
+
+ if (!ext4_test_inode_flag(donor, EXT4_INODE_EXTENTS) ||
+ !ext4_test_inode_flag(receiver, EXT4_INODE_EXTENTS))
+ return -EOPNOTSUPP;
+
+ doffset_lblk = doffset >> EXT4_BLOCK_SIZE_BITS(sb);
+ roffset_lblk = roffset >> EXT4_BLOCK_SIZE_BITS(sb);
+ len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ d_pg_off = round_down(doffset, PAGE_SIZE);
+ r_pg_off = round_down(roffset, PAGE_SIZE);
+ pg_len = round_up(len, PAGE_SIZE);
+
+ if (ext4_should_journal_data(donor)) {
+ error = ext4_force_commit(donor->i_sb);
+ if (error)
+ return error;
+ error = ext4_force_commit(receiver->i_sb);
+ if (error)
+ return error;
+ }
+
+ error = filemap_write_and_wait_range(donor->i_mapping,
+ d_pg_off, d_pg_off + pg_len);
+ if (error)
+ return error;
+ error = filemap_write_and_wait_range(receiver->i_mapping,
+ r_pg_off, r_pg_off + pg_len);
+ if (error)
+ return error;
+
+ lock_two_nondirectories(donor, receiver);
+
+ /* Check for isize limits for both files */
+ if (doffset + len > donor->i_size ||
+ roffset + len > receiver->i_size) {
+ error = -EINVAL;
+ goto out_mutex;
+ }
+
+ truncate_pagecache_range(donor, d_pg_off, d_pg_off + pg_len - 1);
+ truncate_pagecache_range(receiver, r_pg_off, r_pg_off + pg_len - 1);
+
+ ext4_inode_block_unlocked_dio(donor);
+ inode_dio_wait(donor);
+ ext4_inode_block_unlocked_dio(receiver);
+ inode_dio_wait(receiver);
+
+ ext4_discard_preallocations(donor);
+ ext4_discard_preallocations(receiver);
+
+ error = ext4_es_remove_extent(donor, doffset_lblk, len_lblk);
+ if (error)
+ goto out_sem;
+ error = ext4_es_remove_extent(receiver, roffset_lblk, len_lblk);
+ if (error)
+ goto out_sem;
+
+ error = ext4_compute_hole_size(receiver, roffset_lblk, &hole_size);
+ if (error)
+ goto out_sem;
+ if (len_lblk > hole_size) {
+ error = -EINVAL;
+ goto out_sem;
+ }
+
+ error = ext4_find_and_split_extent_at(donor, doffset_lblk);
+ if (error)
+ goto out_sem;
+
+ error = ext4_find_and_split_extent_at(donor, doffset_lblk + len_lblk);
+ if (error)
+ goto out_sem;
+
+ error = ext4_ext_mov_data(donor, receiver, doffset_lblk,
+ roffset_lblk, len_lblk, moved_len);
+
+ ext4_discard_preallocations(donor);
+ ext4_discard_preallocations(receiver);
+out_sem:
+ ext4_inode_resume_unlocked_dio(donor);
+ ext4_inode_resume_unlocked_dio(receiver);
+
+out_mutex:
+ unlock_two_nondirectories(donor, receiver);
+ return error;
+}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8695f70..d2feaba 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -614,5 +614,6 @@ const struct inode_operations ext4_file_inode_operations = {
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
+ .mov_data = ext4_mov_data,
};

--
1.7.11-rc0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/