Re: [PATCH] ext4: make mballoc max prealloc size configurable

From: Jan Kara

Date: Mon Apr 13 2026 - 04:37:39 EST


On Fri 10-04-26 11:56:35, guzebing wrote:
> From: Guzebing <guzebing@xxxxxxxxxxxxx>
>
> Add per-superblock sysfs knob mb_max_prealloc_kb (min 8MiB, roundup
> pow2) and use it in request normalization.
>
> When multiple tasks write to different files on the same filesystem
> concurrently, each file ends up with 8 MiB extents. If the preallocation
> size is increased, the resulting extent size grows accordingly. Due
> to the readahead mechanism on NVMe SSDs, files with larger extents
> achieve higher sequential read throughput.
>
> On an ext4 filesystem on an NVMe Gen4 data drive, dd read throughput
> for a file with 8 MiB extents is 455 MB/s, while for a file with
> 32 MiB extents it reaches 702 MB/s.

Hum, I think you are not speaking about the general Linux readahead code here.

> Steps to reproduce:
> 1.Configure the maximum preallocation size to 8 MiB or 32 MiB:
> echo 8192 > /sys/fs/ext4/nvme13n1/mb_max_prealloc_kb
> echo 32768 > /sys/fs/ext4/nvme13n1/mb_max_prealloc_kb
>
> 2.Run the following commands simultaneously so that the extents of
> the two files are physically interleaved, resulting in 8 MiB or 32 MiB
> extents:
> dd if=/dev/zero of=/mnt/store1/501.txt bs=128K count=80K oflag=direct
> dd if=/dev/zero of=/mnt/store1/502.txt bs=128K count=80K oflag=direct
>
> 3.Read back the file and measure the read throughput:
> dd if=/mnt/store1/501.txt of=/dev/null bs=128K count=80K iflag=direct

OK, seeing that you are using direct IO here, you are likely speaking about
some internal mechanism within the SSD that is happier when the IO is more
contiguous in the LBA space?

In general I find the example you show with dd not very relevant for
performance. If you care about performance, then you should be running
multiple direct IO requests in parallel (either with AIO/DIO or with
io_uring). Or you should be using buffered IO to do this for you behind the
scenes. So do you have a more realistic use case where the extent allocation
size matters so much, or is this mostly a benchmarking exercise?

Honza
>
> Signed-off-by: Guzebing <guzebing@xxxxxxxxxxxxx>
> ---
> Documentation/ABI/testing/sysfs-fs-ext4 | 8 +++++++
> fs/ext4/ext4.h | 1 +
> fs/ext4/mballoc.c | 2 +-
> fs/ext4/super.c | 1 +
> fs/ext4/sysfs.c | 28 ++++++++++++++++++++++++-
> 5 files changed, 38 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
> index 2edd0a6672d3a..316ae1d1ec18b 100644
> --- a/Documentation/ABI/testing/sysfs-fs-ext4
> +++ b/Documentation/ABI/testing/sysfs-fs-ext4
> @@ -48,6 +48,14 @@ Description:
> will have its blocks allocated out of its own unique
> preallocation pool.
>
> +What: /sys/fs/ext4/<disk>/mb_max_prealloc_kb
> +Date: April 2026
> +Contact: "Linux Ext4 Development List" <linux-ext4@xxxxxxxxxxxxxxx>
> +Description:
> + Maximum size (in kilobytes) used by the multiblock allocator's
> + normalized request preallocation heuristic. Values are rounded
> + up to a power of two and clamped to a minimum of 8192 (8MiB).
> +
> What: /sys/fs/ext4/<disk>/inode_readahead_blks
> Date: March 2008
> Contact: "Theodore Ts'o" <tytso@xxxxxxx>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 7617e2d454ea5..bce99740740f5 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1634,6 +1634,7 @@ struct ext4_sb_info {
> unsigned int s_mb_best_avail_max_trim_order;
> unsigned int s_sb_update_sec;
> unsigned int s_sb_update_kb;
> + unsigned int s_mb_max_prealloc_kb;
>
> /* where last allocation was done - for stream allocation */
> ext4_group_t *s_mb_last_groups;
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index bb58eafb87bcd..f5f63c56fcdac 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4589,7 +4589,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
> (8<<20)>>bsbits, max, 8 * 1024)) {
> start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
> (23 - bsbits)) << 23;
> - size = 8 * 1024 * 1024;
> + size = (loff_t)sbi->s_mb_max_prealloc_kb << 10;
> } else {
> start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
> size = (loff_t) EXT4_C2B(sbi,
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index a34efb44e73d7..f815e31657cc9 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -5447,6 +5447,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
> sbi->s_stripe = 0;
> }
> sbi->s_extent_max_zeroout_kb = 32;
> + sbi->s_mb_max_prealloc_kb = 8 * 1024;
>
> /*
> * set up enough so that it can read an inode
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 923b375e017fa..6339492eb2fa7 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -10,6 +10,8 @@
>
> #include <linux/time.h>
> #include <linux/fs.h>
> +#include <linux/log2.h>
> +#include <linux/limits.h>
> #include <linux/seq_file.h>
> #include <linux/slab.h>
> #include <linux/proc_fs.h>
> @@ -41,6 +43,7 @@ typedef enum {
> attr_pointer_atomic,
> attr_journal_task,
> attr_err_report_sec,
> + attr_mb_max_prealloc_kb,
> } attr_id_t;
>
> typedef enum {
> @@ -115,6 +118,25 @@ static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi,
> return count;
> }
>
> +static ssize_t mb_max_prealloc_kb_store(struct ext4_sb_info *sbi,
> + const char *buf, size_t count)
> +{
> + unsigned int v;
> + int ret;
> + unsigned long rounded;
> +
> + ret = kstrtouint(skip_spaces(buf), 0, &v);
> + if (ret)
> + return ret;
> + if (v < 8192)
> + v = 8192;
> + rounded = roundup_pow_of_two((unsigned long)v);
> + if (rounded > UINT_MAX)
> + return -EINVAL;
> + sbi->s_mb_max_prealloc_kb = (unsigned int)rounded;
> + return count;
> +}
> +
> static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
> const char *buf, size_t count)
> {
> @@ -288,6 +310,7 @@ EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
> EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
> EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec);
> EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb);
> +EXT4_ATTR_OFFSET(mb_max_prealloc_kb, 0644, mb_max_prealloc_kb, ext4_sb_info, s_mb_max_prealloc_kb);
>
> static unsigned int old_bump_val = 128;
> EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
> @@ -341,6 +364,7 @@ static struct attribute *ext4_attrs[] = {
> ATTR_LIST(last_trim_minblks),
> ATTR_LIST(sb_update_sec),
> ATTR_LIST(sb_update_kb),
> + ATTR_LIST(mb_max_prealloc_kb),
> ATTR_LIST(err_report_sec),
> NULL,
> };
> @@ -431,6 +455,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
> case attr_mb_order:
> case attr_pointer_pi:
> case attr_pointer_ui:
> + case attr_mb_max_prealloc_kb:
> if (a->attr_ptr == ptr_ext4_super_block_offset)
> return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
> return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr));
> @@ -557,6 +582,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
> return reserved_clusters_store(sbi, buf, len);
> case attr_inode_readahead:
> return inode_readahead_blks_store(sbi, buf, len);
> + case attr_mb_max_prealloc_kb:
> + return mb_max_prealloc_kb_store(sbi, buf, len);
> case attr_trigger_test_error:
> return trigger_test_error(sbi, buf, len);
> case attr_err_report_sec:
> @@ -695,4 +722,3 @@ void ext4_exit_sysfs(void)
> remove_proc_entry(proc_dirname, NULL);
> ext4_proc_root = NULL;
> }
> -
> --
> 2.20.1
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR