[PATCH RFC 1/1] ocfs2: fix write IO performance improvement for high fragmentation
From: Gautham Ananthakrishna
Date: Mon Mar 24 2025 - 01:49:25 EST
The commit 4eb7b93e03101fd3f35e69affe566e4b1e3e3dca caused a regression
in our test suite in discontig extent tests. Upon troubleshooting I found
The following issues.
1. The function ocfs2_cluster_group_search() was called for discontig allocations
as well. But it checks only the contiguous bits 'bg_contig_free_bits'.
It hit the ENOSPC in the following case in one of the tests.
ocfs2_mkdir()
ocfs2_reserve_new_inode()
ocfs2_reserve_suballoc_bits()
ocfs2_block_group_alloc()
ocfs2_block_group_alloc_discontig()
__ocfs2_claim_clusters()
ocfs2_claim_suballoc_bits()
ocfs2_search_chain()
ocfs2_cluster_group_search()
Looked like the commit did not consider discontig searches. To fix this,
I have split ocfs2_cluster_group_search() into *_common(), *_contig() and
*_discontig()
2. That commit enforced ocfs2_cluster_group_search() to search only the
'bits_wanted' number of bits whereas ocfs2_block_group_find_clear_bits()
fills the best available size and the function ocfs2_cluster_group_search()
itself is supposed to search 'min_bits' at the minimum and need not be
'bits_wanted' always.
Fixed the above issues in this patch.
This patch fixes 4eb7b93e03101fd3f35e69affe566e4b1e3e3dca
Signed-off-by: Gautham Ananthakrishna <gautham.ananthakrishna@xxxxxxxxxx>
---
fs/ocfs2/suballoc.c | 146 ++++++++++++++++++++++++++++----------------
1 file changed, 95 insertions(+), 51 deletions(-)
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 9bffa218bc084..be386e1d8e34d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -82,7 +82,17 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
u64 *last_alloc_group,
int flags);
-static int ocfs2_cluster_group_search(struct inode *inode,
+static int ocfs2_cluster_group_search_contig(struct inode *inode,
+ struct buffer_head *group_bh,
+ u32 bits_wanted, u32 min_bits,
+ u64 max_block,
+ struct ocfs2_suballoc_result *res);
+static int ocfs2_cluster_group_search_common(struct inode *inode,
+ struct buffer_head *group_bh,
+ u32 bits_wanted, u32 min_bits,
+ u64 max_block,
+ struct ocfs2_suballoc_result *res);
+static int ocfs2_cluster_group_search_discontig(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
@@ -593,6 +603,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,
goto bail;
}
+ ac->ac_group_search = ocfs2_cluster_group_search_discontig;
status = ocfs2_extend_trans(handle,
ocfs2_calc_bg_discontig_credits(osb->sb));
if (status) {
@@ -1127,7 +1138,9 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
int status;
ac->ac_which = OCFS2_AC_USE_MAIN;
- ac->ac_group_search = ocfs2_cluster_group_search;
+
+ /* Make contig search as default */
+ ac->ac_group_search = ocfs2_cluster_group_search_contig;
status = ocfs2_reserve_suballoc_bits(osb, ac,
GLOBAL_BITMAP_SYSTEM_INODE,
@@ -1525,9 +1538,7 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
return le16_to_cpu(bg->bg_free_bits_count) > wanted;
}
-/* return 0 on success, -ENOSPC to keep searching and any other < 0
- * value on error. */
-static int ocfs2_cluster_group_search(struct inode *inode,
+static int ocfs2_cluster_group_search_common(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
@@ -1542,57 +1553,90 @@ static int ocfs2_cluster_group_search(struct inode *inode,
BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+ max_bits = le16_to_cpu(gd->bg_bits);
+
+ /* Tail groups in cluster bitmaps which aren't cpg
+ * aligned are prone to partial extension by a failed
+ * fs resize. If the file system resize never got to
+ * update the dinode cluster count, then we don't want
+ * to trust any clusters past it, regardless of what
+ * the group descriptor says. */
+ gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
+ le64_to_cpu(gd->bg_blkno));
+ if ((gd_cluster_off + max_bits) >
+ OCFS2_I(inode)->ip_clusters) {
+ max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
+ trace_ocfs2_cluster_group_search_wrong_max_bits(
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits),
+ OCFS2_I(inode)->ip_clusters, max_bits);
+ }
+
+ ret = ocfs2_block_group_find_clear_bits(osb,
+ group_bh, bits_wanted,
+ max_bits, res);
+ if (ret)
+ return ret;
+
+ if (max_block) {
+ blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
+ gd_cluster_off +
+ res->sr_bit_offset +
+ res->sr_bits);
+ trace_ocfs2_cluster_group_search_max_block(
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
+ if (blkoff > max_block)
+ return -ENOSPC;
+ }
+
+ /* ocfs2_block_group_find_clear_bits() might
+ * return success, but we still want to return
+ * -ENOSPC unless it found the minimum number
+ * of bits. */
+ if (min_bits <= res->sr_bits)
+ search = 0; /* success */
+
+ return search;
+}
+
+static int ocfs2_cluster_group_search_discontig(struct inode *inode,
+ struct buffer_head *group_bh,
+ u32 bits_wanted, u32 min_bits,
+ u64 max_block,
+ struct ocfs2_suballoc_result *res)
+{
+ int search = -ENOSPC;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
+
+ if (le16_to_cpu(gd->bg_free_bits_count) >= min_bits)
+ search = ocfs2_cluster_group_search_common(inode, group_bh,
+ bits_wanted, min_bits,
+ max_block, res);
+ return search;
+}
+
+/* return 0 on success, -ENOSPC to keep searching and any other < 0
+ * value on error. */
+static int ocfs2_cluster_group_search_contig(struct inode *inode,
+ struct buffer_head *group_bh,
+ u32 bits_wanted, u32 min_bits,
+ u64 max_block,
+ struct ocfs2_suballoc_result *res)
+{
+ int search = -ENOSPC;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
+
+
if (le16_to_cpu(gd->bg_contig_free_bits) &&
le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
return -ENOSPC;
/* ->bg_contig_free_bits may un-initialized, so compare again */
- if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
- max_bits = le16_to_cpu(gd->bg_bits);
-
- /* Tail groups in cluster bitmaps which aren't cpg
- * aligned are prone to partial extension by a failed
- * fs resize. If the file system resize never got to
- * update the dinode cluster count, then we don't want
- * to trust any clusters past it, regardless of what
- * the group descriptor says. */
- gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
- le64_to_cpu(gd->bg_blkno));
- if ((gd_cluster_off + max_bits) >
- OCFS2_I(inode)->ip_clusters) {
- max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
- trace_ocfs2_cluster_group_search_wrong_max_bits(
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- OCFS2_I(inode)->ip_clusters, max_bits);
- }
-
- ret = ocfs2_block_group_find_clear_bits(osb,
- group_bh, bits_wanted,
- max_bits, res);
- if (ret)
- return ret;
-
- if (max_block) {
- blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
- gd_cluster_off +
- res->sr_bit_offset +
- res->sr_bits);
- trace_ocfs2_cluster_group_search_max_block(
- (unsigned long long)blkoff,
- (unsigned long long)max_block);
- if (blkoff > max_block)
- return -ENOSPC;
- }
-
- /* ocfs2_block_group_find_clear_bits() might
- * return success, but we still want to return
- * -ENOSPC unless it found the minimum number
- * of bits. */
- if (min_bits <= res->sr_bits)
- search = 0; /* success */
- }
-
+ if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted)
+ search = ocfs2_cluster_group_search_common(inode, group_bh,
+ bits_wanted, min_bits,
+ max_block, res);
return search;
}
--
2.43.5