[PATCHv5 1/5] ext4: mballoc: Add blocks to PA list under same spinlock after allocating blocks

From: Ritesh Harjani
Date: Wed May 20 2020 - 02:41:22 EST


ext4_mb_discard_preallocations() only checks for grp->bb_prealloc_list
of every group to discard the group's PA to free up the space if
allocation request fails. Consider below race:-

Process A Process B

1. allocate blocks
1. Fails block allocation from
ext4_mb_regular_allocator()
ext4_lock_group()
allocated blocks
more than ac_o_ex.fe_len
ext4_unlock_group()
2. Scans the
grp->bb_prealloc_list (under
ext4_lock_group()) and
find nothing and thus return
-ENOSPC.

2. Add the additional blocks to PA list

ext4_lock_group()
add blocks to grp->bb_prealloc_list
ext4_unlock_group()

Above race could be avoided if we add those additional blocks to
grp->bb_prealloc_list at the same time with block allocation when
ext4_lock_group() was still held.
With this discard-PA will know if there are actually any blocks which
could be freed from the PA

Signed-off-by: Ritesh Harjani <riteshh@xxxxxxxxxxxxx>
---
fs/ext4/mballoc.c | 97 ++++++++++++++++++++++++++++++-----------------
1 file changed, 62 insertions(+), 35 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 33a69424942c..decc5168d126 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -349,6 +349,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -1701,6 +1702,14 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
spin_unlock(&sbi->s_md_lock);
}
+ /*
+ * As we've just preallocated more space than
+ * user requested originally, we store allocated
+ * space in a special descriptor.
+ */
+ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ ext4_mb_new_preallocation(ac);
+
}

/*
@@ -1949,7 +1958,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,

ext4_mb_use_best_found(ac, e4b);

- BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+ BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);

if (EXT4_SB(sb)->s_mb_stats)
atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
@@ -3675,7 +3684,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
/*
* creates new preallocated space for given inode
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -3688,10 +3697,9 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+ BUG_ON(ac->ac_pa == NULL);

- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
- if (pa == NULL)
- return -ENOMEM;
+ pa = ac->ac_pa;

if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
int winl;
@@ -3735,7 +3743,6 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
- atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
@@ -3755,21 +3762,17 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_obj_lock = &ei->i_prealloc_lock;
pa->pa_inode = ac->ac_inode;

- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);

spin_lock(pa->pa_obj_lock);
list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
-
- return 0;
}

/*
* creates new preallocated space for locality group inodes belongs to
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -3781,11 +3784,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+ BUG_ON(ac->ac_pa == NULL);

- BUG_ON(ext4_pspace_cachep == NULL);
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
- if (pa == NULL)
- return -ENOMEM;
+ pa = ac->ac_pa;

/* preallocation can change ac_b_ex, thus we store actually
* allocated blocks for history */
@@ -3795,7 +3796,6 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_lstart = pa->pa_pstart;
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
- atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
@@ -3816,26 +3816,20 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_obj_lock = &lg->lg_prealloc_lock;
pa->pa_inode = NULL;

- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);

/*
* We will later add the new pa to the right bucket
* after updating the pa_free in ext4_mb_release_context
*/
- return 0;
}

-static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
{
- int err;
-
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
- err = ext4_mb_new_group_pa(ac);
+ ext4_mb_new_group_pa(ac);
else
- err = ext4_mb_new_inode_pa(ac);
- return err;
+ ext4_mb_new_inode_pa(ac);
}

/*
@@ -4150,6 +4144,29 @@ void ext4_discard_preallocations(struct inode *inode)
}
}

+static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa;
+
+ BUG_ON(ext4_pspace_cachep == NULL);
+ pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
+ if (!pa)
+ return -ENOMEM;
+ atomic_set(&pa->pa_count, 1);
+ ac->ac_pa = pa;
+ return 0;
+}
+
+static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+
+ BUG_ON(!pa);
+ ac->ac_pa = NULL;
+ WARN_ON(!atomic_dec_and_test(&pa->pa_count));
+ kmem_cache_free(ext4_pspace_cachep, pa);
+}
+
#ifdef CONFIG_EXT4_DEBUG
static inline void ext4_mb_show_pa(struct super_block *sb)
{
@@ -4606,23 +4623,28 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
+
+ *errp = ext4_mb_pa_alloc(ac);
+ if (*errp)
+ goto errout;
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
- if (*errp)
- goto discard_and_exit;
-
- /* as we've just preallocated more space than
- * user requested originally, we store allocated
- * space in a special descriptor */
- if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- *errp = ext4_mb_new_preallocation(ac);
+ /*
+ * pa allocated above is added to grp->bb_prealloc_list only
+ * when we were able to allocate some block i.e. when
+ * ac->ac_status == AC_STATUS_FOUND.
+ * And error from above mean ac->ac_status != AC_STATUS_FOUND
+ * So we have to free this pa here itself.
+ */
if (*errp) {
- discard_and_exit:
+ ext4_mb_pa_free(ac);
ext4_discard_allocated_blocks(ac);
goto errout;
}
+ if (ac->ac_status == AC_STATUS_FOUND &&
+ ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
+ ext4_mb_pa_free(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4637,6 +4659,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
if (freed)
goto repeat;
+ /*
+ * If block allocation fails then the pa allocated above
+ * needs to be freed here itself.
+ */
+ ext4_mb_pa_free(ac);
*errp = -ENOSPC;
}

--
2.21.0