[PATCH 2/3] ext4: limit the length of per-inode prealloc list

From: brookxu
Date: Thu Jul 23 2020 - 22:58:21 EST


When writing sparse files, the per-inode prealloc list can grow very
long, which results in high overhead in ext4_mb_use_preallocated().
To avoid this, limit the length of the per-inode prealloc list to 512
entries by default, and allow users to change the limit via the new
mb_max_inode_prealloc sysfs attribute.

Signed-off-by: Chunguang Xu <brookxu@xxxxxxxxxxx>
---
 fs/ext4/ext4.h        |  3 ++-
 fs/ext4/extents.c     | 10 ++++-----
 fs/ext4/file.c        |  2 +-
 fs/ext4/indirect.c    |  2 +-
 fs/ext4/inode.c       |  6 +++---
 fs/ext4/ioctl.c       |  2 +-
 fs/ext4/mballoc.c     | 57 +++++++++++++++++++++++++++++++++++++++++++++++----
 fs/ext4/mballoc.h     |  4 ++++
 fs/ext4/move_extent.c |  4 ++--
 fs/ext4/super.c       |  2 +-
 fs/ext4/sysfs.c       |  2 ++
 11 files changed, 75 insertions(+), 19 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 42f5060..68e0ebe 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1501,6 +1501,7 @@ struct ext4_sb_info {
ÂÂÂÂ unsigned int s_mb_stats;
ÂÂÂÂ unsigned int s_mb_order2_reqs;
ÂÂÂÂ unsigned int s_mb_group_prealloc;
+ÂÂÂ unsigned int s_mb_max_inode_prealloc;
ÂÂÂÂ unsigned int s_max_dir_size_kb;
ÂÂÂÂ /* where last allocation was done - for stream allocation */
ÂÂÂÂ unsigned long s_mb_last_group;
@@ -2651,7 +2652,7 @@ extern int ext4_init_inode_table(struct super_block *sb,
Âextern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
ÂÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ struct ext4_allocation_request *, int *);
Âextern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_discard_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *, unsigned int);
Âextern int __init ext4_init_mballoc(void);
Âextern void ext4_exit_mballoc(void);
Âextern void ext4_free_blocks(handle_t *handle, struct inode *inode,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 221f240..a40f928 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
ÂÂÂÂ Â* i_mutex. So we can safely drop the i_data_sem here.
ÂÂÂÂ Â*/
ÂÂÂÂ BUG_ON(EXT4_JOURNAL(inode) == NULL);
-ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ext4_discard_preallocations(inode, 0);
ÂÂÂÂ up_write(&EXT4_I(inode)->i_data_sem);
ÂÂÂÂ *dropped = 1;
ÂÂÂÂ return 0;
@@ -4272,7 +4272,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ÂÂÂÂ ÂÂÂ ÂÂÂ Â* not a good idea to call discard here directly,
ÂÂÂÂ ÂÂÂ ÂÂÂ Â* but otherwise we'd need to call it every free().
ÂÂÂÂ ÂÂÂ ÂÂÂ Â*/
-ÂÂÂ ÂÂÂ ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ÂÂÂ ÂÂÂ ext4_discard_preallocations(inode, 0);
ÂÂÂÂ ÂÂÂ ÂÂÂ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
ÂÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
ÂÂÂÂ ÂÂÂ ÂÂÂ ext4_free_blocks(handle, inode, NULL, newblock,
@@ -5299,7 +5299,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ÂÂÂÂ }
Â
ÂÂÂÂ down_write(&EXT4_I(inode)->i_data_sem);
-ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ext4_discard_preallocations(inode, 0);
Â
ÂÂÂÂ ret = ext4_es_remove_extent(inode, punch_start,
ÂÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ EXT_MAX_BLOCKS - punch_start);
@@ -5313,7 +5313,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ÂÂÂÂ ÂÂÂ up_write(&EXT4_I(inode)->i_data_sem);
ÂÂÂÂ ÂÂÂ goto out_stop;
ÂÂÂÂ }
-ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ext4_discard_preallocations(inode, 0);
Â
ÂÂÂÂ ret = ext4_ext_shift_extents(inode, handle, punch_stop,
ÂÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂÂ punch_stop - punch_start, SHIFT_LEFT);
@@ -5445,7 +5445,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
ÂÂÂÂ ÂÂÂ goto out_stop;
Â
ÂÂÂÂ down_write(&EXT4_I(inode)->i_data_sem);
-ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ext4_discard_preallocations(inode, 0);
Â
ÂÂÂÂ path = ext4_find_extent(inode, offset_lblk, NULL, 0);
ÂÂÂÂ if (IS_ERR(path)) {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2a01e31..e3ab8ea 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -148,7 +148,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
ÂÂÂÂ ÂÂÂ ÂÂÂÂÂÂÂ !EXT4_I(inode)->i_reserved_data_blocks)
ÂÂÂÂ {
ÂÂÂÂ ÂÂÂ down_write(&EXT4_I(inode)->i_data_sem);
-ÂÂÂ ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ÂÂÂ ext4_discard_preallocations(inode, 0);
ÂÂÂÂ ÂÂÂ up_write(&EXT4_I(inode)->i_data_sem);
ÂÂÂÂ }
ÂÂÂÂ if (is_dx(inode) && filp->private_data)
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index be2b66e..ec6b930 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
ÂÂÂÂ Â* i_mutex. So we can safely drop the i_data_sem here.
ÂÂÂÂ Â*/
ÂÂÂÂ BUG_ON(EXT4_JOURNAL(inode) == NULL);
-ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ext4_discard_preallocations(inode, 0);
ÂÂÂÂ up_write(&EXT4_I(inode)->i_data_sem);
ÂÂÂÂ *dropped = 1;
ÂÂÂÂ return 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 10dd470..bb9e1cd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
ÂÂÂÂ Â*/
ÂÂÂÂ if ((ei->i_reserved_data_blocks == 0) &&
ÂÂÂÂ ÂÂÂ !inode_is_open_for_write(inode))
-ÂÂÂ ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ÂÂÂ ext4_discard_preallocations(inode, 0);
Â}
Â
Âstatic int __check_block_validity(struct inode *inode, const char *func,
@@ -4056,7 +4056,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
ÂÂÂÂ if (stop_block > first_block) {
Â
ÂÂÂÂ ÂÂÂ down_write(&EXT4_I(inode)->i_data_sem);
-ÂÂÂ ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ÂÂÂ ext4_discard_preallocations(inode, 0);
Â
ÂÂÂÂ ÂÂÂ ret = ext4_es_remove_extent(inode, first_block,
ÂÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ stop_block - first_block);
@@ -4211,7 +4211,7 @@ int ext4_truncate(struct inode *inode)
Â
ÂÂÂÂ down_write(&EXT4_I(inode)->i_data_sem);
Â
-ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ext4_discard_preallocations(inode, 0);
Â
ÂÂÂÂ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ÂÂÂÂ ÂÂÂ err = ext4_ext_truncate(handle, inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 999cf6a..a5fcc23 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
ÂÂÂÂ reset_inode_seed(inode);
ÂÂÂÂ reset_inode_seed(inode_bl);
Â
-ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ext4_discard_preallocations(inode, 0);
Â
ÂÂÂÂ err = ext4_mark_inode_dirty(handle, inode);
ÂÂÂÂ if (err < 0) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4f21f34..28a139f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2736,6 +2736,7 @@ int ext4_mb_init(struct super_block *sb)
ÂÂÂÂ sbi->s_mb_stats = MB_DEFAULT_STATS;
ÂÂÂÂ sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
ÂÂÂÂ sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ÂÂÂ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
ÂÂÂÂ /*
ÂÂÂÂ Â* The default group preallocation is 512, which for 4k block
 Â* sizes translates to 2 megabytes. However for bigalloc file
@@ -4103,7 +4104,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
 *
 * FIXME!! Make sure it is valid at all the call sites
 */
-void ext4_discard_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
Â{
ÂÂÂÂ struct ext4_inode_info *ei = EXT4_I(inode);
ÂÂÂÂ struct super_block *sb = inode->i_sb;
@@ -4121,15 +4122,18 @@ void ext4_discard_preallocations(struct inode *inode)
Â
ÂÂÂÂ mb_debug(sb, "discard preallocation for inode %lu\n",
ÂÂÂÂ ÂÂÂ Âinode->i_ino);
-ÂÂÂ trace_ext4_discard_preallocations(inode);
+ trace_ext4_discard_preallocations(inode, needed);
Â
ÂÂÂÂ INIT_LIST_HEAD(&list);
Â
+ÂÂÂ if (needed == 0)
+ÂÂÂ ÂÂÂ needed = UINT_MAX;
+
Ârepeat:
ÂÂÂÂ /* first, collect all pa's in the inode */
ÂÂÂÂ spin_lock(&ei->i_prealloc_lock);
-ÂÂÂ while (!list_empty(&ei->i_prealloc_list)) {
-ÂÂÂ ÂÂÂ pa = list_entry(ei->i_prealloc_list.next,
+ÂÂÂ while (!list_empty(&ei->i_prealloc_list) && needed) {
+ÂÂÂ ÂÂÂ pa = list_entry(ei->i_prealloc_list.prev,
ÂÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ struct ext4_prealloc_space, pa_inode_list);
ÂÂÂÂ ÂÂÂ BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
ÂÂÂÂ ÂÂÂ spin_lock(&pa->pa_lock);
@@ -4150,6 +4154,7 @@ void ext4_discard_preallocations(struct inode *inode)
ÂÂÂÂ ÂÂÂ ÂÂÂ spin_unlock(&pa->pa_lock);
ÂÂÂÂ ÂÂÂ ÂÂÂ list_del_rcu(&pa->pa_inode_list);
ÂÂÂÂ ÂÂÂ ÂÂÂ list_add(&pa->u.pa_tmp_list, &list);
+ÂÂÂ ÂÂÂ ÂÂÂ needed--;
ÂÂÂÂ ÂÂÂ ÂÂÂ continue;
ÂÂÂÂ ÂÂÂ }
Â
@@ -4549,10 +4554,42 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
Â}
Â
Â/*
+ * If the per-inode prealloc list is too long, trim some of its PAs.
+ */
+static void
+ext4_mb_trim_inode_pa(struct inode *inode)
+{
+ÂÂÂ struct ext4_inode_info *ei = EXT4_I(inode);
+ÂÂÂ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ÂÂÂ struct ext4_prealloc_space *pa;
+ÂÂÂ int count = 0, delta;
+
+ÂÂÂ rcu_read_lock();
+ÂÂÂ list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+ÂÂÂ ÂÂÂ spin_lock(&pa->pa_lock);
+ÂÂÂ ÂÂÂ if (pa->pa_deleted) {
+ÂÂÂ ÂÂÂ ÂÂÂ spin_unlock(&pa->pa_lock);
+ÂÂÂ ÂÂÂ ÂÂÂ continue;
+ÂÂÂ ÂÂÂ }
+ÂÂÂ ÂÂÂ count++;
+ÂÂÂ ÂÂÂ spin_unlock(&pa->pa_lock);
+ÂÂÂ }
+ÂÂÂ rcu_read_unlock();
+
+ÂÂÂ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
+ÂÂÂ if (count > sbi->s_mb_max_inode_prealloc + delta) {
+ÂÂÂ ÂÂÂ count -= sbi->s_mb_max_inode_prealloc;
+ÂÂÂ ÂÂÂ ext4_discard_preallocations(inode, count);
+ÂÂÂ }
+}
+
+/*
 * release all resource we used in allocation
 */
Âstatic int ext4_mb_release_context(struct ext4_allocation_context *ac)
Â{
+ÂÂÂ struct inode *inode = ac->ac_inode;
+ÂÂÂ struct ext4_inode_info *ei = EXT4_I(inode);
ÂÂÂÂ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
ÂÂÂÂ struct ext4_prealloc_space *pa = ac->ac_pa;
ÂÂÂÂ if (pa) {
@@ -4578,6 +4615,17 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
ÂÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ ext4_mb_add_n_trim(ac);
ÂÂÂÂ ÂÂÂ ÂÂÂ }
ÂÂÂÂ ÂÂÂ }
+
+ÂÂÂ ÂÂÂ if (pa->pa_type == MB_INODE_PA) {
+ÂÂÂ ÂÂÂ ÂÂÂ /*
+ÂÂÂ ÂÂÂ ÂÂÂ Â* treat per-inode prealloc list as a lru list, then try
+ÂÂÂ ÂÂÂ ÂÂÂ Â* to trim the least recently used PA.
+ÂÂÂ ÂÂÂ ÂÂÂ Â*/
+ÂÂÂ ÂÂÂ ÂÂÂ spin_lock(pa->pa_obj_lock);
+ÂÂÂ ÂÂÂ ÂÂÂ list_move(&ei->i_prealloc_list, &pa->pa_inode_list);
+ÂÂÂ ÂÂÂ ÂÂÂ spin_unlock(pa->pa_obj_lock);
+ÂÂÂ ÂÂÂ }
+
ÂÂÂÂ ÂÂÂ ext4_mb_put_pa(ac, ac->ac_sb, pa);
ÂÂÂÂ }
ÂÂÂÂ if (ac->ac_bitmap_page)
@@ -4587,6 +4635,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
ÂÂÂÂ if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
ÂÂÂÂ ÂÂÂ mutex_unlock(&ac->ac_lg->lg_mutex);
ÂÂÂÂ ext4_mb_collect_stats(ac);
+ÂÂÂ ext4_mb_trim_inode_pa(inode);
ÂÂÂÂ return 0;
Â}
Â
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 6b4d17c..e75b474 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -73,6 +73,10 @@
 */
Â#define MB_DEFAULT_GROUP_PREALLOCÂÂÂ 512
Â
+/*
+ * maximum length of inode prealloc list
+ */
+#define MB_DEFAULT_MAX_INODE_PREALLOCÂÂÂ 512
Â
Âstruct ext4_free_data {
ÂÂÂÂ /* this links the free block information from sb_info */
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 1ed86fb..0d601b8 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -686,8 +686,8 @@
Â
Âout:
ÂÂÂÂ if (*moved_len) {
-ÂÂÂ ÂÂÂ ext4_discard_preallocations(orig_inode);
-ÂÂÂ ÂÂÂ ext4_discard_preallocations(donor_inode);
+ÂÂÂ ÂÂÂ ext4_discard_preallocations(orig_inode, 0);
+ÂÂÂ ÂÂÂ ext4_discard_preallocations(donor_inode, 0);
ÂÂÂÂ }
Â
ÂÂÂÂ ext4_ext_drop_refs(path);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 330957e..8ce61f3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1216,7 +1216,7 @@ void ext4_clear_inode(struct inode *inode)
Â{
ÂÂÂÂ invalidate_inode_buffers(inode);
ÂÂÂÂ clear_inode(inode);
-ÂÂÂ ext4_discard_preallocations(inode);
+ÂÂÂ ext4_discard_preallocations(inode, 0);
ÂÂÂÂ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
ÂÂÂÂ dquot_drop(inode);
ÂÂÂÂ if (EXT4_I(inode)->jinode) {
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 6c9fc9e..92f04e9 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -215,6 +215,7 @@ static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
ÂEXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
ÂEXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
ÂEXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
ÂEXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
ÂEXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
ÂEXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
@@ -257,6 +258,7 @@ static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
ÂÂÂÂ ATTR_LIST(mb_order2_req),
ÂÂÂÂ ATTR_LIST(mb_stream_req),
ÂÂÂÂ ATTR_LIST(mb_group_prealloc),
+ÂÂÂ ATTR_LIST(mb_max_inode_prealloc),
ÂÂÂÂ ATTR_LIST(max_writeback_mb_bump),
ÂÂÂÂ ATTR_LIST(extent_max_zeroout_kb),
ÂÂÂÂ ATTR_LIST(trigger_fs_error),
--
1.8.3.1