[PATCH 3/6] EXT4: Remove ENOMEM/congestion_wait() loops.

From: NeilBrown
Date: Mon Sep 13 2021 - 20:28:30 EST


Indefinite loops waiting for memory allocation are discouraged by
documentation in gfp.h which says the use of __GFP_NOFAIL that it

is definitely preferable to use the flag rather than opencode endless
loop around allocator.

Such loops that use congestion_wait() are particularly unwise as
congestion_wait() is indistinguishable from
schedule_timeout_uninterruptible() in practice - and should be
deprecated.

So this patch changes the two loops in ext4_ext_truncate() to use
__GFP_NOFAIL instead of looping.

As the allocation is multiple layers deeper in the call stack, this
requires passing the EXT4_EX_NOFAIL flag down and handling it in various
places.

Of particular interest is the ext4_journal_start family of calls which
can now have EXT4_EX_NOFAIL 'or'ed in to the 'type'. This could be seen
as a blurring of types. However 'type' is 8 bits, and EXT4_EX_NOFAIL is
a high bit, so it is safe in practice.

jbd2__journal_start() is enhanced so that the gfp_t flags passed are
used for *all* allocations.

Signed-off-by: NeilBrown <neilb@xxxxxxx>
---
fs/ext4/ext4.h | 2 +-
fs/ext4/ext4_jbd2.c | 8 +++++++-
fs/ext4/extents.c | 49 +++++++++++++++++-----------------------------
fs/ext4/extents_status.c | 35 +++++++++++++++++++--------------
fs/ext4/extents_status.h | 2 +-
fs/ext4/indirect.c | 2 +-
fs/ext4/inode.c | 6 +++---
fs/ext4/ioctl.c | 4 ++--
fs/ext4/super.c | 2 +-
fs/jbd2/transaction.c | 8 ++++----
10 files changed, 58 insertions(+), 60 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 90ff5acaf11f..52a34f5dfda2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3720,7 +3720,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_truncate(handle_t *, struct inode *);
extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t end);
+ ext4_lblk_t end, int nofail);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6def7339056d..2bdda3b7a3e6 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -92,6 +92,12 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
{
journal_t *journal;
int err;
+ gfp_t gfp_mask = GFP_NOFS;
+
+ if (type & EXT4_EX_NOFAIL) {
+ gfp_mask |= __GFP_NOFAIL;
+ type &= ~EXT4_EX_NOFAIL;
+ }

trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
_RET_IP_);
@@ -103,7 +109,7 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return ext4_get_nojournal();
return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
- GFP_NOFS, type, line);
+ gfp_mask, type, line);
}

int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c0de30f25185..b7bc12aedf78 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1488,7 +1488,7 @@ static int ext4_ext_search_left(struct inode *inode,
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys,
- struct ext4_extent *ret_ex)
+ struct ext4_extent *ret_ex, int nofail)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
@@ -1565,7 +1565,7 @@ static int ext4_ext_search_right(struct inode *inode,
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
bh = read_extent_tree_block(inode, block,
- path->p_depth - depth, 0);
+ path->p_depth - depth, nofail);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -1574,7 +1574,7 @@ static int ext4_ext_search_right(struct inode *inode,
put_bh(bh);
}

- bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, block, path->p_depth - depth, nofail);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -2773,7 +2773,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
}

int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t end)
+ ext4_lblk_t end, int nofail)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int depth = ext_depth(inode);
@@ -2789,7 +2789,8 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext_debug(inode, "truncate since %u to %u\n", start, end);

/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
+ handle = ext4_journal_start_with_revoke(inode,
+ EXT4_HT_TRUNCATE | nofail,
depth + 1,
ext4_free_metadata_revoke_credits(inode->i_sb, depth));
if (IS_ERR(handle))
@@ -2877,7 +2878,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
- NULL);
+ NULL, nofail);
if (err < 0)
goto out;
if (pblk) {
@@ -2899,10 +2900,6 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
} else {
path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
GFP_NOFS | __GFP_NOFAIL);
- if (path == NULL) {
- ext4_journal_stop(handle);
- return -ENOMEM;
- }
path[0].p_maxdepth = path[0].p_depth = depth;
path[0].p_hdr = ext_inode_hdr(inode);
i = 0;
@@ -2955,7 +2952,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
memset(path + i + 1, 0, sizeof(*path));
bh = read_extent_tree_block(inode,
ext4_idx_pblock(path[i].p_idx), depth - i - 1,
- EXT4_EX_NOCACHE);
+ EXT4_EX_NOCACHE | nofail);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -4186,7 +4183,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto out;
ar.lright = map->m_lblk;
- err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2, 0);
if (err < 0)
goto out;

@@ -4368,23 +4365,13 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)

last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
-retry:
err = ext4_es_remove_extent(inode, last_block,
- EXT_MAX_BLOCKS - last_block);
- if (err == -ENOMEM) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto retry;
- }
+ EXT_MAX_BLOCKS - last_block,
+ EXT4_EX_NOFAIL);
if (err)
return err;
-retry_remove_space:
- err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
- if (err == -ENOMEM) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto retry_remove_space;
- }
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1,
+ EXT4_EX_NOFAIL);
return err;
}

@@ -5322,13 +5309,13 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ext4_discard_preallocations(inode, 0);

ret = ext4_es_remove_extent(inode, punch_start,
- EXT_MAX_BLOCKS - punch_start);
+ EXT_MAX_BLOCKS - punch_start, 0);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}

- ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1, 0);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
@@ -5510,7 +5497,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
}

ret = ext4_es_remove_extent(inode, offset_lblk,
- EXT_MAX_BLOCKS - offset_lblk);
+ EXT_MAX_BLOCKS - offset_lblk, 0);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
@@ -5574,10 +5561,10 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
BUG_ON(!inode_is_locked(inode1));
BUG_ON(!inode_is_locked(inode2));

- *erp = ext4_es_remove_extent(inode1, lblk1, count);
+ *erp = ext4_es_remove_extent(inode1, lblk1, count, 0);
if (unlikely(*erp))
return 0;
- *erp = ext4_es_remove_extent(inode2, lblk2, count);
+ *erp = ext4_es_remove_extent(inode2, lblk2, count, 0);
if (unlikely(*erp))
return 0;

diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 9a3a8996aacf..7f7711a2ea44 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -144,9 +144,10 @@
static struct kmem_cache *ext4_es_cachep;
static struct kmem_cache *ext4_pending_cachep;

-static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
+ int nofail);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved);
+ ext4_lblk_t end, int *reserved, int nofail);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
@@ -452,10 +453,11 @@ static void ext4_es_list_del(struct inode *inode)

static struct extent_status *
ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
- ext4_fsblk_t pblk)
+ ext4_fsblk_t pblk, int nofail)
{
struct extent_status *es;
- es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+ es = kmem_cache_alloc(ext4_es_cachep,
+ GFP_ATOMIC | (nofail ? __GFP_NOFAIL : 0));
if (es == NULL)
return NULL;
es->es_lblk = lblk;
@@ -754,7 +756,8 @@ static inline void ext4_es_insert_extent_check(struct inode *inode,
}
#endif

-static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
+ int nofail)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node **p = &tree->root.rb_node;
@@ -795,7 +798,7 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
}

es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
- newes->es_pblk);
+ newes->es_pblk, nofail);
if (!es)
return -ENOMEM;
rb_link_node(&es->rb_node, parent, p);
@@ -848,11 +851,11 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);

write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, NULL);
+ err = __es_remove_extent(inode, lblk, end, NULL, 0);
if (err != 0)
goto error;
retry:
- err = __es_insert_extent(inode, &newes);
+ err = __es_insert_extent(inode, &newes, 0);
if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
128, EXT4_I(inode)))
goto retry;
@@ -902,7 +905,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,

es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
if (!es || es->es_lblk > end)
- __es_insert_extent(inode, &newes);
+ __es_insert_extent(inode, &newes, 0);
write_unlock(&EXT4_I(inode)->i_es_lock);
}

@@ -1294,6 +1297,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* @lblk - first block in range
* @end - last block in range
* @reserved - number of cluster reservations released
+ * @nofail - EXT4_EX_NOFAIL if __GFP_NOFAIL should be used
*
* If @reserved is not NULL and delayed allocation is enabled, counts
* block/cluster reservations freed by removing range and if bigalloc
@@ -1301,7 +1305,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* error code on failure.
*/
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved)
+ ext4_lblk_t end, int *reserved, int nofail)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node *node;
@@ -1350,7 +1354,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
orig_es.es_len - len2;
ext4_es_store_pblock_status(&newes, block,
ext4_es_status(&orig_es));
- err = __es_insert_extent(inode, &newes);
+ err = __es_insert_extent(inode, &newes, nofail);
if (err) {
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
@@ -1426,12 +1430,13 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
* @inode - file containing range
* @lblk - first block in range
* @len - number of blocks to remove
+ * @nofail - EXT4_EX_NOFAIL if __GFP_NOFAIL should be used
*
* Reduces block/cluster reservation count and for bigalloc cancels pending
* reservations as needed. Returns 0 on success, error code on failure.
*/
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
+ ext4_lblk_t len, int nofail)
{
ext4_lblk_t end;
int err = 0;
@@ -1456,7 +1461,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, &reserved);
+ err = __es_remove_extent(inode, lblk, end, &reserved, nofail);
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
@@ -2003,11 +2008,11 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,

write_lock(&EXT4_I(inode)->i_es_lock);

- err = __es_remove_extent(inode, lblk, lblk, NULL);
+ err = __es_remove_extent(inode, lblk, lblk, NULL, 0);
if (err != 0)
goto error;
retry:
- err = __es_insert_extent(inode, &newes);
+ err = __es_insert_extent(inode, &newes, 0);
if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
128, EXT4_I(inode)))
goto retry;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 4ec30a798260..23d77094a165 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -134,7 +134,7 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+ ext4_lblk_t len, int nofail);
extern void ext4_es_find_extent_range(struct inode *inode,
int (*match_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end,
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 89efa78ed4b2..910e87aea7be 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1125,7 +1125,7 @@ void ext4_ind_truncate(handle_t *handle, struct inode *inode)
return;
}

- ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
+ ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block, 0);

/*
* The orphan list entry will now protect us from any crash which
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d18852d6029c..24246043d94b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1575,7 +1575,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
ext4_lblk_t start, last;
start = index << (PAGE_SHIFT - inode->i_blkbits);
last = end << (PAGE_SHIFT - inode->i_blkbits);
- ext4_es_remove_extent(inode, start, last - start + 1);
+ ext4_es_remove_extent(inode, start, last - start + 1, 0);
}

pagevec_init(&pvec);
@@ -4109,7 +4109,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
ext4_discard_preallocations(inode, 0);

ret = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
+ stop_block - first_block, 0);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
@@ -4117,7 +4117,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)

if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_remove_space(inode, first_block,
- stop_block - 1);
+ stop_block - 1, 0);
else
ret = ext4_ind_remove_space(handle, inode, first_block,
stop_block);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 606dee9e08a3..e4de05a6b976 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -79,8 +79,8 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
(ei1->i_flags & ~EXT4_FL_SHOULD_SWAP);
ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP);
swap(ei1->i_disksize, ei2->i_disksize);
- ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
- ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+ ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS, 0);
+ ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS, 0);

isize = i_size_read(inode1);
i_size_write(inode1, i_size_read(inode2));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0775950ee84e..947e8376a35a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1393,7 +1393,7 @@ void ext4_clear_inode(struct inode *inode)
invalidate_inode_buffers(inode);
clear_inode(inode);
ext4_discard_preallocations(inode, 0);
- ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS, 0);
dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6a3caedd2285..23e0f003d43b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -476,9 +476,9 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
}

/* Allocate a new handle. This should probably be in a slab... */
-static handle_t *new_handle(int nblocks)
+static handle_t *new_handle(int nblocks, gfp_t gfp)
{
- handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
+ handle_t *handle = jbd2_alloc_handle(gfp);
if (!handle)
return NULL;
handle->h_total_credits = nblocks;
@@ -505,13 +505,13 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,

nblocks += DIV_ROUND_UP(revoke_records,
journal->j_revoke_records_per_block);
- handle = new_handle(nblocks);
+ handle = new_handle(nblocks, gfp_mask);
if (!handle)
return ERR_PTR(-ENOMEM);
if (rsv_blocks) {
handle_t *rsv_handle;

- rsv_handle = new_handle(rsv_blocks);
+ rsv_handle = new_handle(rsv_blocks, gfp_mask);
if (!rsv_handle) {
jbd2_free_handle(handle);
return ERR_PTR(-ENOMEM);