[patch 23/36] ext4: Use an rbtree for tracking blocks freed during transaction.

From: Greg KH
Date: Wed Feb 18 2009 - 17:40:58 EST



2.6.27-stable review patch. If anyone has any objections, please let us know.

------------------

From: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>

(cherry picked from commit c894058d66637c7720569fbe12957f4de64d9991 to allow
commit e21675d4 to be included in 2.6.27.y)

With this patch we track the blocks freed during a transaction using a
red-black tree, and make sure that contiguous freed blocks are collected
into a single node of the tree.
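
[Editor's note: a minimal userspace sketch of the merge rule, for readers
skimming the diff. It mirrors the fields of struct ext4_free_data and the
three conditions checked by can_merge() in the patch, but leaves out the
rbtree bookkeeping, the slab cache and all locking; struct free_extent,
merge() and the plain C types below are illustrative stand-ins for the
kernel's types, not kernel code.]

#include <stdio.h>

struct free_extent {
	unsigned long group;	/* block group (ext4_group_t in the kernel) */
	int start_blk;		/* first freed block within the group (ext4_grpblk_t) */
	int count;		/* number of contiguous freed blocks */
	unsigned int tid;	/* transaction that freed the blocks (tid_t) */
};

/* Same three conditions as can_merge() in the patch. */
static int can_merge(const struct free_extent *e1, const struct free_extent *e2)
{
	return e1->tid == e2->tid &&
	       e1->group == e2->group &&
	       e1->start_blk + e1->count == e2->start_blk;
}

/* Fold the right-hand extent into the left-hand one. */
static void merge(struct free_extent *left, const struct free_extent *right)
{
	left->count += right->count;
}

int main(void)
{
	struct free_extent a = { .group = 5, .start_blk = 100, .count = 8,  .tid = 42 };
	struct free_extent b = { .group = 5, .start_blk = 108, .count = 16, .tid = 42 };

	if (can_merge(&a, &b)) {
		merge(&a, &b);
		printf("merged extent: group %lu, blocks %d-%d (%d blocks)\n",
		       a.group, a.start_blk, a.start_blk + a.count - 1, a.count);
	}
	return 0;
}

Built with any C compiler, this prints "merged extent: group 5, blocks
100-123 (24 blocks)", i.e. two adjacent extents freed by the same
transaction in the same group collapse into one node, which is what the
rb-tree insertion path below does for its left and right neighbours.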

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Theodore Ts'o <tytso@xxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxx>

---
fs/ext4/mballoc.c | 186 ++++++++++++++++++++++++++++++++++--------------------
fs/ext4/mballoc.h | 25 ++++---
2 files changed, 134 insertions(+), 77 deletions(-)

--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -332,6 +332,7 @@
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
@@ -2506,6 +2507,7 @@ int ext4_mb_add_groupinfo(struct super_b

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
+ meta_group_info[i]->bb_free_root.rb_node = NULL;

#ifdef DOUBLE_CHECK
{
@@ -2819,13 +2821,11 @@ int ext4_mb_release(struct super_block *
static noinline_for_stack void
ext4_mb_free_committed_blocks(struct super_block *sb)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
- int i;
- int count = 0;
- int count2 = 0;
- struct ext4_free_metadata *md;
struct ext4_buddy e4b;
+ struct ext4_group_info *db;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err, count = 0, count2 = 0;
+ struct ext4_free_data *entry;

if (list_empty(&sbi->s_committed_transaction))
return;
@@ -2833,44 +2833,46 @@ ext4_mb_free_committed_blocks(struct sup
/* there is committed blocks to be freed yet */
do {
/* get next array of blocks */
- md = NULL;
+ entry = NULL;
spin_lock(&sbi->s_md_lock);
if (!list_empty(&sbi->s_committed_transaction)) {
- md = list_entry(sbi->s_committed_transaction.next,
- struct ext4_free_metadata, list);
- list_del(&md->list);
+ entry = list_entry(sbi->s_committed_transaction.next,
+ struct ext4_free_data, list);
+ list_del(&entry->list);
}
spin_unlock(&sbi->s_md_lock);

- if (md == NULL)
+ if (entry == NULL)
break;

mb_debug("gonna free %u blocks in group %lu (0x%p):",
- md->num, md->group, md);
+ entry->count, entry->group, entry);

- err = ext4_mb_load_buddy(sb, md->group, &e4b);
+ err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);

+ db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
- count += md->num;
+ count += entry->count;
count2++;
- ext4_lock_group(sb, md->group);
- for (i = 0; i < md->num; i++) {
- mb_debug(" %u", md->blocks[i]);
- mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
- }
- mb_debug("\n");
- ext4_unlock_group(sb, md->group);
-
- /* balance refcounts from ext4_mb_free_metadata() */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
+ ext4_lock_group(sb, entry->group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
+ }
+ ext4_unlock_group(sb, entry->group);

- kfree(md);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
-
- } while (md);
+ } while (1);

mb_debug("freed %u blocks in %u structures\n", count, count2);
}
@@ -3025,6 +3027,16 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
+
+ ext4_free_ext_cachep =
+ kmem_cache_create("ext4_free_block_extents",
+ sizeof(struct ext4_free_data),
+ 0, SLAB_RECLAIM_ACCOUNT, NULL);
+ if (ext4_free_ext_cachep == NULL) {
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ return -ENOMEM;
+ }
#ifdef CONFIG_PROC_FS
proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
if (proc_root_ext4 == NULL)
@@ -3041,6 +3053,7 @@ void exit_ext4_mballoc(void)
#ifdef CONFIG_PROC_FS
remove_proc_entry("fs/ext4", NULL);
#endif
+ kmem_cache_destroy(ext4_free_ext_cachep);
}


@@ -3561,6 +3574,7 @@ ext4_mb_use_preallocated(struct ext4_all
ac->ac_criteria = 20;
return 1;
}
+
return 0;
}

@@ -4678,6 +4692,21 @@ static void ext4_mb_poll_new_transaction
ext4_mb_free_committed_blocks(sb);
}

+/*
+ * We can merge two free data extents only if the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ if ((entry1->t_tid == entry2->t_tid) &&
+ (entry1->group == entry2->group) &&
+ ((entry1->start_blk + entry1->count) == entry2->start_blk))
+ return 1;
+ return 0;
+}
+
static noinline_for_stack int
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_t group, ext4_grpblk_t block, int count)
@@ -4685,57 +4714,80 @@ ext4_mb_free_metadata(handle_t *handle,
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_metadata *md;
- int i;
+ struct ext4_free_data *entry, *new_entry;
+ struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node;
+

BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);

+ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry->start_blk = block;
+ new_entry->group = group;
+ new_entry->count = count;
+ new_entry->t_tid = handle->h_transaction->t_tid;
+ new_node = &new_entry->node;
+
ext4_lock_group(sb, group);
- for (i = 0; i < count; i++) {
- md = db->bb_md_cur;
- if (md && db->bb_tid != handle->h_transaction->t_tid) {
- db->bb_md_cur = NULL;
- md = NULL;
+ if (!*n) {
+ /* first free block extent. We need to
+ * protect buddy cache from being freed,
+ * otherwise we'll refresh it from
+ * on-disk bitmap and lose not-yet-available
+ * blocks */
+ page_cache_get(e4b->bd_buddy_page);
+ page_cache_get(e4b->bd_bitmap_page);
+ }
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_free_data, node);
+ if (block < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (block >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
+ ext4_error(sb, __func__,
+ "Double free of blocks %d (%d %d)\n",
+ block, entry->start_blk, entry->count);
+ return 0;
}
+ }

- if (md == NULL) {
- ext4_unlock_group(sb, group);
- md = kmalloc(sizeof(*md), GFP_NOFS);
- if (md == NULL)
- return -ENOMEM;
- md->num = 0;
- md->group = group;
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &db->bb_free_root);

- ext4_lock_group(sb, group);
- if (db->bb_md_cur == NULL) {
- spin_lock(&sbi->s_md_lock);
- list_add(&md->list, &sbi->s_active_transaction);
- spin_unlock(&sbi->s_md_lock);
- /* protect buddy cache from being freed,
- * otherwise we'll refresh it from
- * on-disk bitmap and lose not-yet-available
- * blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
- db->bb_md_cur = md;
- db->bb_tid = handle->h_transaction->t_tid;
- mb_debug("new md 0x%p for group %lu\n",
- md, md->group);
- } else {
- kfree(md);
- md = db->bb_md_cur;
- }
+ /* Now see whether the extent can be merged to the left and right */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(entry, new_entry)) {
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
+ }

- BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
- md->blocks[md->num] = block + i;
- md->num++;
- if (md->num == EXT4_BB_MAX_BLOCKS) {
- /* no more space, put full container on a sb's list */
- db->bb_md_cur = NULL;
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(new_entry, entry)) {
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
+ /* Add the extent to active_transaction list */
+ spin_lock(&sbi->s_md_lock);
+ list_add(&new_entry->list, &sbi->s_active_transaction);
+ spin_unlock(&sbi->s_md_lock);
ext4_unlock_group(sb, group);
return 0;
}
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -97,22 +97,27 @@
*/
#define MB_DEFAULT_GROUP_PREALLOC 512

-#ifdef EXT4_BB_MAX_BLOCKS
-#undef EXT4_BB_MAX_BLOCKS
-#endif
-#define EXT4_BB_MAX_BLOCKS 30
+struct ext4_free_data {
+ /* this links the free block information from group_info */
+ struct rb_node node;

-struct ext4_free_metadata {
- ext4_group_t group;
- unsigned short num;
- ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+ /* this links the free block information from ext4_sb_info */
struct list_head list;
+
+ /* group to which the free block extent belongs */
+ ext4_group_t group;
+
+ /* free block extent */
+ ext4_grpblk_t start_blk;
+ ext4_grpblk_t count;
+
+ /* transaction which freed this extent */
+ tid_t t_tid;
};

struct ext4_group_info {
unsigned long bb_state;
- unsigned long bb_tid;
- struct ext4_free_metadata *bb_md_cur;
+ struct rb_root bb_free_root;
unsigned short bb_first_free;
unsigned short bb_free;
unsigned short bb_fragments;

--