[PATCH 1/5] ext4: Avoid races caused by on-line resizing and SMP memory reordering

From: Theodore Ts'o
Date: Sat Apr 25 2009 - 23:51:32 EST


Ext4's on-line resizing adds a new block group and then, only as the
last step, adjusts s_groups_count. However, it's possible on SMP
systems that another CPU could see the updated s_groups_count but not
yet see the newly initialized data structures for the just-added block
group. For this reason, it's important to insert an SMP read barrier
after reading s_groups_count and before reading any per-group data
structures (block group descriptors, for example) whose validity is
implied by that count.
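
For illustration only, here is a minimal userspace sketch of the
publish/observe ordering described above. The names (group_table,
publish_new_group, count_free_blocks) are made up for this sketch,
and C11 fences stand in for the kernel's smp_wmb()/smp_rmb(); this is
not the ext4 code itself.

    #include <stdatomic.h>

    struct group_desc { unsigned free_blocks; };

    #define MAX_GROUPS 128
    static struct group_desc *group_table[MAX_GROUPS];
    static atomic_uint groups_count;

    /* Resizer (assumed single): initialize the new group fully, then
     * publish the new count. */
    void publish_new_group(struct group_desc *gd)
    {
            unsigned n = atomic_load_explicit(&groups_count,
                                              memory_order_relaxed);

            group_table[n] = gd;                       /* new group's metadata */
            atomic_thread_fence(memory_order_release); /* kernel: smp_wmb() */
            atomic_store_explicit(&groups_count, n + 1,
                                  memory_order_relaxed);
    }

    /* Reader: sample the count once, fence, then touch per-group data. */
    unsigned long count_free_blocks(void)
    {
            unsigned n = atomic_load_explicit(&groups_count,
                                              memory_order_relaxed);
            unsigned long total = 0;

            atomic_thread_fence(memory_order_acquire); /* kernel: smp_rmb() */
            for (unsigned i = 0; i < n; i++)
                    total += group_table[i]->free_blocks;
            return total;
    }

The hunks below add only the reader side of this: each function takes
a single snapshot of s_groups_count into a local ngroups variable and
issues smp_rmb() before reading per-group data structures.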

Unfortunately, we rather blatantly violate this locking protocol as
documented in fs/ext4/resize.c. Fortunately, (1) on-line resizes
happen relatively rarely, and (2) it is rare for the filesystem code
to use a just-added block group before the memory ordering issues
resolve themselves, so problems here are hard to hit in practice;
ext3 is vulnerable to the same race, and no one seems to have
complained.

Signed-off-by: "Theodore Ts'o" <tytso@xxxxxxx>
---
fs/ext4/balloc.c | 6 ++++--
fs/ext4/ialloc.c | 34 +++++++++++++++++++++-------------
fs/ext4/mballoc.c | 49 ++++++++++++++++++++++++++++++++-----------------
3 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 53c72ad..d1615f2 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -88,9 +88,11 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
ext4_group_t block_group, struct ext4_group_desc *gdp)
{
int bit, bit_max;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
unsigned free_blocks, group_blocks;
struct ext4_sb_info *sbi = EXT4_SB(sb);

+ smp_rmb(); /* after reading s_groups_count first */
if (bh) {
J_ASSERT_BH(bh, buffer_locked(bh));

@@ -123,7 +125,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
bit_max += ext4_bg_num_gdb(sb, block_group);
}

- if (block_group == sbi->s_groups_count - 1) {
+ if (block_group == ngroups - 1) {
/*
* Even though mke2fs always initialize first and last group
* if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
@@ -131,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
group_blocks = ext4_blocks_count(sbi->s_es) -
le32_to_cpu(sbi->s_es->s_first_data_block) -
- (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
+ (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
} else {
group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f18e0a0..52ce274 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -322,6 +322,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
ext4_group_t group;
int ret = -1;

+ smp_rmb(); /* after reading s_groups_count first */
freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
avefreei = freei / ngroups;

@@ -362,7 +363,8 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
ext4_group_t n_fbg_groups;
ext4_group_t i;

- n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+ smp_rmb(); /* after reading s_groups_count first */
+ n_fbg_groups = (ngroups + flex_size - 1) >>
sbi->s_log_groups_per_flex;

find_close_to_parent:
@@ -478,18 +480,18 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_group_t ngroups = sbi->s_groups_count;
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
ext4_fsblk_t freeb, avefreeb;
unsigned int ndirs;
int max_dirs, min_inodes;
ext4_grpblk_t min_blocks;
- ext4_group_t i, grp, g;
+ ext4_group_t i, grp, g, ngroups = sbi->s_groups_count;
struct ext4_group_desc *desc;
struct orlov_stats stats;
int flex_size = ext4_flex_bg_size(sbi);

+ smp_rmb(); /* after reading s_groups_count first */
if (flex_size > 1) {
ngroups = (ngroups + flex_size - 1) >>
sbi->s_log_groups_per_flex;
@@ -585,6 +587,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
fallback:
ngroups = sbi->s_groups_count;
avefreei = freei / ngroups;
+ smp_rmb();
fallback_retry:
parent_group = EXT4_I(parent)->i_block_group;
for (i = 0; i < ngroups; i++) {
@@ -613,11 +616,11 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t i, last, ngroups = EXT4_SB(sb)->s_groups_count;
struct ext4_group_desc *desc;
- ext4_group_t i, last;
int flex_size = ext4_flex_bg_size(EXT4_SB(sb));

+ smp_rmb(); /* after reading s_groups_count first */
/*
* Try to place the inode is the same flex group as its
* parent. If we can't find space, use the Orlov algorithm to
@@ -799,7 +802,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
struct buffer_head *group_desc_bh;
- ext4_group_t group = 0;
+ ext4_group_t ngroups, group = 0;
unsigned long ino = 0;
struct inode *inode;
struct ext4_group_desc *gdp = NULL;
@@ -851,12 +854,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
ret2 = find_group_other(sb, dir, &group, mode);

got_group:
+ ngroups = sbi->s_groups_count;
+ smp_rmb();
EXT4_I(dir)->i_last_alloc_group = group;
err = -ENOSPC;
if (ret2 == -1)
goto out;

- for (i = 0; i < sbi->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
err = -EIO;

gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -917,7 +922,7 @@ repeat_in_this_group:
* group descriptor metadata has not yet been updated.
* So we just go onto the next blockgroup.
*/
- if (++group == sbi->s_groups_count)
+ if (++group == ngroups)
group = 0;
}
err = -ENOSPC;
@@ -1158,17 +1163,18 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
{
unsigned long desc_count;
struct ext4_group_desc *gdp;
- ext4_group_t i;
+ ext4_group_t i, ngroups = EXT4_SB(sb)->s_groups_count;
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
unsigned long bitmap_count, x;
struct buffer_head *bitmap_bh = NULL;

+ smp_rmb(); /* after reading s_groups_count first */
es = EXT4_SB(sb)->s_es;
desc_count = 0;
bitmap_count = 0;
gdp = NULL;
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
@@ -1189,8 +1195,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
return desc_count;
#else
+ smp_rmb(); /* after reading s_groups_count first */
desc_count = 0;
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
@@ -1205,9 +1212,10 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
unsigned long ext4_count_dirs(struct super_block * sb)
{
unsigned long count = 0;
- ext4_group_t i;
+ ext4_group_t i, ngroups = EXT4_SB(sb)->s_groups_count;

- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ smp_rmb(); /* after reading s_groups_count first */
+ for (i = 0; i < ngroups; i++) {
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f871677..ecc2d49 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -739,6 +739,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,

static int ext4_mb_init_cache(struct page *page, char *incore)
{
+ ext4_group_t ngroups;
int blocksize;
int blocks_per_page;
int groups_per_page;
@@ -757,6 +758,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)

inode = page->mapping->host;
sb = inode->i_sb;
+ ngroups = EXT4_SB(sb)->s_groups_count;
+ smp_rmb();
blocksize = 1 << inode->i_blkbits;
blocks_per_page = PAGE_CACHE_SIZE / blocksize;

@@ -780,7 +783,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
for (i = 0; i < groups_per_page; i++) {
struct ext4_group_desc *desc;

- if (first_group + i >= EXT4_SB(sb)->s_groups_count)
+ if (first_group + i >= ngroups)
break;

err = -EIO;
@@ -852,7 +855,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
struct ext4_group_info *grinfo;

group = (first_block + i) >> 1;
- if (group >= EXT4_SB(sb)->s_groups_count)
+ if (group >= ngroups)
break;

/*
@@ -1788,9 +1791,11 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
int block, pnum;
int blocks_per_page;
int groups_per_page;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t first_group;
struct ext4_group_info *grp;

+ smp_rmb(); /* after reading s_groups_count first */
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
/*
* the buddy cache inode stores the block bitmap
@@ -1807,7 +1812,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
/* read all groups the page covers into the cache */
for (i = 0; i < groups_per_page; i++) {

- if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+ if ((first_group + i) >= ngroups)
break;
grp = ext4_get_group_info(sb, first_group + i);
/* take all groups write allocation
@@ -1945,8 +1950,7 @@ err:
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t group;
- ext4_group_t i;
+ ext4_group_t ngroups, group, i;
int cr;
int err = 0;
int bsbits;
@@ -1957,6 +1961,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)

sb = ac->ac_sb;
sbi = EXT4_SB(sb);
+ ngroups = EXT4_SB(sb)->s_groups_count;
+ smp_rmb();
BUG_ON(ac->ac_status == AC_STATUS_FOUND);

/* first, try the goal */
@@ -2017,11 +2023,11 @@ repeat:
*/
group = ac->ac_g_ex.fe_group;

- for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
+ for (i = 0; i < ngroups; group++, i++) {
struct ext4_group_info *grp;
struct ext4_group_desc *desc;

- if (group == EXT4_SB(sb)->s_groups_count)
+ if (group == ngroups)
group = 0;

/* quick check to skip empty groups */
@@ -2320,7 +2326,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)

if (*pos < 0 || *pos >= sbi->s_groups_count)
return NULL;
-
+ smp_rmb();
group = *pos + 1;
return (void *) ((unsigned long) group);
}
@@ -2334,6 +2340,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
if (*pos < 0 || *pos >= sbi->s_groups_count)
return NULL;
+ smp_rmb();
group = *pos + 1;
return (void *) ((unsigned long) group);
}
@@ -2587,6 +2594,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)

static int ext4_mb_init_backend(struct super_block *sb)
{
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t i;
int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2597,8 +2605,10 @@ static int ext4_mb_init_backend(struct super_block *sb)
struct ext4_group_info **meta_group_info;
struct ext4_group_desc *desc;

+ smp_rmb(); /* after reading s_groups_count first */
+
/* This is the number of blocks used by GDT */
- num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+ num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
1) >> EXT4_DESC_PER_BLOCK_BITS(sb);

/*
@@ -2644,7 +2654,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
for (i = 0; i < num_meta_group_infos; i++) {
if ((i + 1) == num_meta_group_infos)
metalen = sizeof(*meta_group_info) *
- (sbi->s_groups_count -
+ (ngroups -
(i << EXT4_DESC_PER_BLOCK_BITS(sb)));
meta_group_info = kmalloc(metalen, GFP_KERNEL);
if (meta_group_info == NULL) {
@@ -2655,7 +2665,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_group_info[i] = meta_group_info;
}

- for (i = 0; i < sbi->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
printk(KERN_ERR
@@ -2781,13 +2791,15 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)

int ext4_mb_release(struct super_block *sb)
{
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t i;
int num_meta_group_infos;
struct ext4_group_info *grinfo;
struct ext4_sb_info *sbi = EXT4_SB(sb);

+ smp_rmb(); /* after reading s_groups_count first */
if (sbi->s_group_info) {
- for (i = 0; i < sbi->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
grinfo = ext4_get_group_info(sb, i);
#ifdef DOUBLE_CHECK
kfree(grinfo->bb_bitmap);
@@ -2797,7 +2809,7 @@ int ext4_mb_release(struct super_block *sb)
ext4_unlock_group(sb, i);
kfree(grinfo);
}
- num_meta_group_infos = (sbi->s_groups_count +
+ num_meta_group_infos = (ngroups +
EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
for (i = 0; i < num_meta_group_infos; i++)
@@ -4121,7 +4133,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- ext4_group_t i;
+ ext4_group_t ngroups, i;

printk(KERN_ERR "EXT4-fs: Can't allocate:"
" Allocation context details:\n");
@@ -4145,7 +4157,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
ac->ac_found);
printk(KERN_ERR "EXT4-fs: groups: \n");
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ ngroups = EXT4_SB(ac->ac_sb)->s_groups_count;
+ smp_rmb();
+ for (i = 0; i < ngroups; i++) {
struct ext4_group_info *grp = ext4_get_group_info(sb, i);
struct ext4_prealloc_space *pa;
ext4_grpblk_t start;
@@ -4469,13 +4483,14 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)

static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
- ext4_group_t i;
+ ext4_group_t i, ngroups = EXT4_SB(sb)->s_groups_count;
int ret;
int freed = 0;

+ smp_rmb(); /* after reading s_groups_count first */
trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
sb->s_id, needed);
- for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
+ for (i = 0; i < ngroups && needed > 0; i++) {
ret = ext4_mb_discard_group_preallocations(sb, i, needed);
freed += ret;
needed -= ret;
--
1.5.6.3
