[RFC 1/5] ext4: add aligned allocation hint in mballoc

From: Ojaswin Mujoo
Date: Wed Sep 11 2024 - 05:02:28 EST


Add support in mballoc for allocating blocks that are aligned
to a certain power-of-2 offset.

1. We define a new flag EXT4_MB_HINT_ALIGNED to indicate that we want
an aligned allocation. This is just a hint: mballoc tries its best to
provide aligned blocks, but if it can't, it will fall back to normal
allocation.

2. The alignment is determined by the length of the allocation. For
example, if we ask for 8192 bytes, then the physical blocks will also
be 8192-byte aligned (i.e. 2-block aligned with a 4k blocksize).

3. We don't yet support arbitrary alignment. For aligned writes, the
length/alignment must be a power of 2 in blocks, i.e. for a 4k blocksize
we can get 4k-, 8k-, 16k-, ... byte aligned allocations, but not a
12k-byte aligned one.

4. We use the CR_POWER2_ALIGNED criteria for aligned allocation, which by
design allocates in an aligned manner. Since CR_POWER2_ALIGNED needs
ac->ac_g_ex.fe_len to be a power of 2, that's where the restriction in
point 3 above comes from. Since aligned allocation support is currently
added mainly for the atomic writes use case, this restriction should be
fine, as atomic write capable devices usually support only power-of-2
alignments.

5. For ease of review, inode preallocation support is disabled in this
patch and will be enabled in upcoming patches.

Signed-off-by: Ojaswin Mujoo <ojaswin@xxxxxxxxxxxxx>
---
fs/ext4/ext4.h | 2 ++
fs/ext4/mballoc.c | 60 +++++++++++++++++++++++++++++++++----
include/trace/events/ext4.h | 1 +
3 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8cc15d00e5c8..17964994a049 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -222,6 +222,8 @@ enum criteria {
/* Avg fragment size rb tree lookup succeeded at least once for
* CR_BEST_AVAIL_LEN */
#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
+/* Hint: try to allocate physical blocks aligned to the request length */
+#define EXT4_MB_HINT_ALIGNED 0x40000

struct ext4_allocation_request {
/* target inode for block we're allocating */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d73e38323879..724905552f3b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2177,8 +2177,11 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
* user requested originally, we store allocated
* space in a special descriptor.
*/
- if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) {
+ /* Aligned allocation doesn't have preallocation support */
+ WARN_ON(ac->ac_flags & EXT4_MB_HINT_ALIGNED);
ext4_mb_new_preallocation(ac);
+ }

}

@@ -2814,10 +2817,15 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)

BUG_ON(ac->ac_status == AC_STATUS_FOUND);

- /* first, try the goal */
- err = ext4_mb_find_by_goal(ac, &e4b);
- if (err || ac->ac_status == AC_STATUS_FOUND)
- goto out;
+ /*
+ * first, try the goal. Skip trying goal for aligned allocations since
+ * goal determination logic is not alignment aware (yet)
+ */
+ if (!(ac->ac_flags & EXT4_MB_HINT_ALIGNED)) {
+ err = ext4_mb_find_by_goal(ac, &e4b);
+ if (err || ac->ac_status == AC_STATUS_FOUND)
+ goto out;
+ }

if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
goto out;
@@ -2858,9 +2866,22 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
*/
if (ac->ac_2order)
cr = CR_POWER2_ALIGNED;
+ else
+ WARN_ON_ONCE(ac->ac_g_ex.fe_len > 1 &&
+ ac->ac_flags & EXT4_MB_HINT_ALIGNED);
repeat:
for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
+
+ if (ac->ac_criteria > CR_POWER2_ALIGNED &&
+ ac->ac_flags & EXT4_MB_HINT_ALIGNED &&
+ ac->ac_g_ex.fe_len > 1) {
+ ext4_warning_inode(
+ ac->ac_inode,
+ "Aligned allocation not possible, using unaligned allocation");
+ ac->ac_flags &= ~EXT4_MB_HINT_ALIGNED;
+ }
+
/*
* searching for the right group start
* from the goal value specified
@@ -2993,6 +3014,24 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
err = first_err;

+ if (ac->ac_flags & EXT4_MB_HINT_ALIGNED && ac->ac_status == AC_STATUS_FOUND) {
+ ext4_fsblk_t start = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+ ext4_grpblk_t len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+
+ if (!len) {
+ ext4_warning_inode(ac->ac_inode,
+ "Expected a non zero len extent");
+ ac->ac_status = AC_STATUS_BREAK;
+ goto exit;
+ }
+
+ WARN_ON_ONCE(!is_power_of_2(len));
+ WARN_ON_ONCE(start % len);
+ /* We don't support preallocation yet */
+ WARN_ON_ONCE(ac->ac_b_ex.fe_len != ac->ac_o_ex.fe_len);
+ }
+
+ exit:
mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
ac->ac_flags, cr, err);
@@ -4440,6 +4479,13 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
return;

+ /*
+ * caller may have strict alignment requirements. In this case, avoid
+ * normalization since it is not alignment aware.
+ */
+ if (ac->ac_flags & EXT4_MB_HINT_ALIGNED)
+ return;
+
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
ext4_mb_normalize_group_request(ac);
return ;
@@ -4794,6 +4840,10 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return false;

+ /* using preallocated blocks is not alignment aware. */
+ if (ac->ac_flags & EXT4_MB_HINT_ALIGNED)
+ return false;
+
/*
* first, try per-file preallocation by searching the inode pa rbtree.
*
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index cc5e9b7b2b44..05441f87c5d2 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -36,6 +36,7 @@ struct partial_cluster;
{ EXT4_MB_STREAM_ALLOC, "STREAM_ALLOC" }, \
{ EXT4_MB_USE_ROOT_BLOCKS, "USE_ROOT_BLKS" }, \
{ EXT4_MB_USE_RESERVED, "USE_RESV" }, \
+ { EXT4_MB_HINT_ALIGNED, "HINT_ALIGNED" }, \
{ EXT4_MB_STRICT_CHECK, "STRICT_CHECK" })

#define show_map_flags(flags) __print_flags(flags, "|", \
--
2.43.5