Re: [f2fs-dev] [PATCH 2/3] f2fs: add bitmaps for empty or full NAT blocks

From: Chao Yu
Date: Thu Feb 23 2017 - 06:46:55 EST


On 2017/2/14 10:06, Jaegeuk Kim wrote:
> This patches adds bitmaps to represent empty or full NAT blocks containing
> free nid entries.
>
> If we can find valid crc|cp_ver in the last block of checkpoint pack, we'll
> use these bitmaps when building free nids. In order to avoid checkpointing
> burden, up-to-date bitmaps will be flushed only during umount time. So,
> normally we can get this gain, but when power-cut happens, we rely on fsck.f2fs
> which recovers this bitmap again.
>
> After this patch, we build free nids from nid #0 at mount time to make more
> full NAT blocks, but in runtime, we check empty NAT blocks to load free nids
> without loading any NAT pages from disk.
>
> Signed-off-by: Jaegeuk Kim <jaegeuk@xxxxxxxxxx>
> ---
> fs/f2fs/checkpoint.c | 29 +++++++-
> fs/f2fs/debug.c | 1 +
> fs/f2fs/f2fs.h | 23 +++++-
> fs/f2fs/node.c | 188 +++++++++++++++++++++++++++++++++++++++++++-----
> fs/f2fs/segment.c | 2 +-
> include/linux/f2fs_fs.h | 1 +
> 6 files changed, 224 insertions(+), 20 deletions(-)
>
> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> index 042f8d9afe44..783c5c3f16a4 100644
> --- a/fs/f2fs/checkpoint.c
> +++ b/fs/f2fs/checkpoint.c
> @@ -1024,6 +1024,10 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>
> spin_lock(&sbi->cp_lock);
>
> + if (ckpt->cp_pack_total_block_count >
> + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
> + disable_nat_bits(sbi, false);

I think we need to drop nat full/empty bitmap only if there is no enough space
in CP area while doing umount, otherwise we can keep this in memory.

> +
> if (cpc->reason == CP_UMOUNT)
> __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
> else
> @@ -1136,6 +1140,29 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>
> start_blk = __start_cp_next_addr(sbi);
>
> + /* write nat bits */
> + if (cpc->reason == CP_UMOUNT &&
> + is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) {
> + __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver);

__u64 cp_ver = cur_cp_version(ckpt);

> + unsigned int i;
> + block_t blk;
> +
> + cp_ver |= ((__u64)crc32 << 32);
> + *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);
> +
> + blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks;
> + for (i = 0; i < nm_i->nat_bits_blocks; i++)
> + update_meta_page(sbi, nm_i->nat_bits +
> + (i << F2FS_BLKSIZE_BITS), blk + i);
> +
> + /* Flush all the NAT BITS pages */
> + while (get_pages(sbi, F2FS_DIRTY_META)) {
> + sync_meta_pages(sbi, META, LONG_MAX);
> + if (unlikely(f2fs_cp_error(sbi)))
> + return -EIO;
> + }
> + }
> +
> /* need to wait for end_io results */
> wait_on_all_pages_writeback(sbi);
> if (unlikely(f2fs_cp_error(sbi)))
> @@ -1272,7 +1299,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
>
> /* write cached NAT/SIT entries to NAT/SIT area */
> - flush_nat_entries(sbi);
> + flush_nat_entries(sbi, cpc);
> flush_sit_entries(sbi, cpc);
>
> /* unlock all the fs_lock[] in do_checkpoint() */
> diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
> index de8da9fc5c99..015ad2b73a92 100644
> --- a/fs/f2fs/debug.c
> +++ b/fs/f2fs/debug.c
> @@ -193,6 +193,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
> /* build nm */
> si->base_mem += sizeof(struct f2fs_nm_info);
> si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
> + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS);
>
> get_cache:
> si->cache_mem = 0;
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 5c87d763a347..d263dade5e4c 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -547,6 +547,7 @@ struct f2fs_nm_info {
> struct list_head nat_entries; /* cached nat entry list (clean) */
> unsigned int nat_cnt; /* the # of cached nat entries */
> unsigned int dirty_nat_cnt; /* total num of nat entries in set */
> + unsigned int nat_blocks; /* # of nat blocks */
>
> /* free node ids management */
> struct radix_tree_root free_nid_root;/* root of the free_nid cache */
> @@ -557,6 +558,11 @@ struct f2fs_nm_info {
>
> /* for checkpoint */
> char *nat_bitmap; /* NAT bitmap pointer */
> +
> + unsigned int nat_bits_blocks; /* # of nat bits blocks */
> + unsigned char *nat_bits; /* NAT bits blocks */
> + unsigned char *full_nat_bits; /* full NAT pages */
> + unsigned char *empty_nat_bits; /* empty NAT pages */
> #ifdef CONFIG_F2FS_CHECK_FS
> char *nat_bitmap_mir; /* NAT bitmap mirror */
> #endif
> @@ -1158,6 +1164,19 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
> spin_unlock(&sbi->cp_lock);
> }
>
> +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
> +{
> + set_sbi_flag(sbi, SBI_NEED_FSCK);
> +
> + if (lock)
> + spin_lock(&sbi->cp_lock);
> + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG);
> + kfree(NM_I(sbi)->nat_bits);
> + NM_I(sbi)->nat_bits = NULL;
> + if (lock)
> + spin_unlock(&sbi->cp_lock);
> +}
> +
> static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
> {
> down_read(&sbi->cp_rwsem);
> @@ -2118,7 +2137,7 @@ void move_node_page(struct page *node_page, int gc_type);
> int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
> struct writeback_control *wbc, bool atomic);
> int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc);
> -void build_free_nids(struct f2fs_sb_info *sbi, bool sync);
> +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount);
> bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid);
> void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid);
> void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid);
> @@ -2129,7 +2148,7 @@ int recover_xattr_data(struct inode *inode, struct page *page,
> int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
> int restore_node_summary(struct f2fs_sb_info *sbi,
> unsigned int segno, struct f2fs_summary_block *sum);
> -void flush_nat_entries(struct f2fs_sb_info *sbi);
> +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
> int build_node_manager(struct f2fs_sb_info *sbi);
> void destroy_node_manager(struct f2fs_sb_info *sbi);
> int __init create_node_manager_caches(void);
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index f8fb25a27b46..e001d084667f 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -338,6 +338,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
> set_nat_flag(e, IS_CHECKPOINTED, false);
> __set_nat_cache_dirty(nm_i, e);
>
> + if (is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
> + new_blkaddr == NEW_ADDR)
> + test_and_clear_bit_le(ni->nid / NAT_ENTRY_PER_BLOCK,
> + nm_i->empty_nat_bits);

clear_bit_le(NAT_BLOCK_OFFSET,)

> +
> /* update fsync_mark if its inode nat entry is still alive */
> if (ni->nid != ni->ino)
> e = __lookup_nat_cache(nm_i, ni->ino);
> @@ -1839,7 +1844,59 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
> }
> }
>
> -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync)
> +static int scan_nat_bits(struct f2fs_sb_info *sbi)
> +{
> + struct f2fs_nm_info *nm_i = NM_I(sbi);
> + struct page *page;
> + unsigned int i = 0;
> + nid_t target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK;
> + nid_t nid;
> +
> + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
> + return -EAGAIN;
> +
> + down_read(&nm_i->nat_tree_lock);
> +check_empty:
> + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i);
> + if (i >= nm_i->nat_blocks) {
> + i = 0;
> + goto check_partial;
> + }
> +
> + for (nid = i * NAT_ENTRY_PER_BLOCK; nid < (i + 1) * NAT_ENTRY_PER_BLOCK;
> + nid++) {
> + if (unlikely(nid >= nm_i->max_nid))
> + break;
> + add_free_nid(sbi, nid, true);
> + }
> +
> + if (nm_i->nid_cnt[FREE_NID_LIST] >= target)
> + goto out;
> + i++;
> + goto check_empty;
> +
> +check_partial:
> + i = find_next_zero_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i);
> + if (i >= nm_i->nat_blocks) {
> + disable_nat_bits(sbi, true);

Can this happen in real world? Should be a bug in somewhere?

> + return -EINVAL;
> + }
> +
> + nid = i * NAT_ENTRY_PER_BLOCK;
> + page = get_current_nat_page(sbi, nid);
> + scan_nat_page(sbi, page, nid);
> + f2fs_put_page(page, 1);
> +
> + if (nm_i->nid_cnt[FREE_NID_LIST] < target) {
> + i++;
> + goto check_partial;
> + }
> +out:
> + up_read(&nm_i->nat_tree_lock);
> + return 0;
> +}
> +
> +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
> {
> struct f2fs_nm_info *nm_i = NM_I(sbi);
> struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> @@ -1854,6 +1911,20 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync)
> if (!sync && !available_free_memory(sbi, FREE_NIDS))
> return;
>
> + /* try to find free nids with nat_bits */
> + if (!mount && !scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST])
> + return;
> +
> + /* find next valid candidate */

This is just for mount case?

> + if (is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) {
> + int idx = find_next_zero_bit_le(nm_i->full_nat_bits,
> + nm_i->nat_blocks, 0);
> + if (idx >= nm_i->nat_blocks)
> + set_sbi_flag(sbi, SBI_NEED_FSCK);
> + else
> + nid = idx * NAT_ENTRY_PER_BLOCK;
> + }
> +
> /* readahead nat pages to be scanned */
> ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
> META_NAT, true);
> @@ -1896,10 +1967,10 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync)
> nm_i->ra_nid_pages, META_NAT, false);
> }
>
> -void build_free_nids(struct f2fs_sb_info *sbi, bool sync)
> +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
> {
> mutex_lock(&NM_I(sbi)->build_lock);
> - __build_free_nids(sbi, sync);
> + __build_free_nids(sbi, sync, mount);
> mutex_unlock(&NM_I(sbi)->build_lock);
> }
>
> @@ -1941,7 +2012,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
> spin_unlock(&nm_i->nid_list_lock);
>
> /* Let's scan nat pages and its caches to get free nids */
> - build_free_nids(sbi, true);
> + build_free_nids(sbi, true, false);
> goto retry;
> }
>
> @@ -2233,8 +2304,39 @@ static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> list_add_tail(&nes->set_list, head);
> }
>
> +void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
> + struct page *page)
> +{
> + struct f2fs_nm_info *nm_i = NM_I(sbi);
> + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
> + struct f2fs_nat_block *nat_blk = page_address(page);
> + int valid = 0;
> + int i;
> +
> + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
> + return;
> +
> + for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) {
> + if (start_nid == 0 && i == 0)
> + valid++;
> + if (nat_blk->entries[i].block_addr)
> + valid++;
> + }
> + if (valid == 0) {
> + test_and_set_bit_le(nat_index, nm_i->empty_nat_bits);
> + test_and_clear_bit_le(nat_index, nm_i->full_nat_bits);

set_bit_le/clear_bit_le

> + return;
> + }
> +
> + test_and_clear_bit_le(nat_index, nm_i->empty_nat_bits);

ditto

> + if (valid == NAT_ENTRY_PER_BLOCK)
> + test_and_set_bit_le(nat_index, nm_i->full_nat_bits);
> + else
> + test_and_clear_bit_le(nat_index, nm_i->full_nat_bits);

ditto

> +}
> +
> static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> - struct nat_entry_set *set)
> + struct nat_entry_set *set, struct cp_control *cpc)
> {
> struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> struct f2fs_journal *journal = curseg->journal;
> @@ -2249,7 +2351,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> * #1, flush nat entries to journal in current hot data summary block.
> * #2, flush nat entries to nat page.
> */
> - if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
> + if (cpc->reason == CP_UMOUNT ||

if ((cpc->reason == CP_UMOUNT && is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) ||

> + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
> to_journal = false;
>
> if (to_journal) {
> @@ -2289,10 +2392,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> }
> }
>
> - if (to_journal)
> + if (to_journal) {
> up_write(&curseg->journal_rwsem);
> - else
> + } else {
> + __update_nat_bits(sbi, start_nid, page);
> f2fs_put_page(page, 1);
> + }
>
> f2fs_bug_on(sbi, set->entry_cnt);
>
> @@ -2303,7 +2408,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> /*
> * This function is called during the checkpointing process.
> */
> -void flush_nat_entries(struct f2fs_sb_info *sbi)
> +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> {
> struct f2fs_nm_info *nm_i = NM_I(sbi);
> struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> @@ -2324,7 +2429,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
> * entries, remove all entries from journal and merge them
> * into nat entry set.
> */
> - if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
> + if (cpc->reason == CP_UMOUNT ||

if ((cpc->reason == CP_UMOUNT && is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) ||

> + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
> remove_nats_in_journal(sbi);
>
> while ((found = __gang_lookup_nat_set(nm_i,
> @@ -2338,27 +2444,72 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
>
> /* flush dirty nats in nat entry set */
> list_for_each_entry_safe(set, tmp, &sets, set_list)
> - __flush_nat_entry_set(sbi, set);
> + __flush_nat_entry_set(sbi, set, cpc);
>
> up_write(&nm_i->nat_tree_lock);
>
> f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
> }
>
> +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
> +{
> + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
> + struct f2fs_nm_info *nm_i = NM_I(sbi);
> + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE;
> + unsigned int i;
> + __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver);

__u64 cp_ver = cur_cp_version(ckpt);

Thanks,

> + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset);
> + __u64 crc = le32_to_cpu(*((__le32 *)
> + ((unsigned char *)ckpt + crc_offset)));
> + block_t nat_bits_addr;
> +
> + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
> + return 0;
> +
> + nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 +
> + F2FS_BLKSIZE - 1);
> + nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS,
> + GFP_KERNEL);
> + if (!nm_i->nat_bits)
> + return -ENOMEM;
> +
> + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
> + nm_i->nat_bits_blocks;
> + for (i = 0; i < nm_i->nat_bits_blocks; i++) {
> + struct page *page = get_meta_page(sbi, nat_bits_addr++);
> +
> + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
> + page_address(page), F2FS_BLKSIZE);
> + f2fs_put_page(page, 1);
> + }
> +
> + cp_ver |= (crc << 32);
> + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
> + disable_nat_bits(sbi, true);
> + return 0;
> + }
> +
> + nm_i->full_nat_bits = nm_i->nat_bits + 8;
> + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
> +
> + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint");
> + return 0;
> +}
> +
> static int init_node_manager(struct f2fs_sb_info *sbi)
> {
> struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
> struct f2fs_nm_info *nm_i = NM_I(sbi);
> unsigned char *version_bitmap;
> - unsigned int nat_segs, nat_blocks;
> + unsigned int nat_segs;
> + int err;
>
> nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
>
> /* segment_count_nat includes pair segment so divide to 2. */
> nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
> - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
> -
> - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
> + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
> + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks;
>
> /* not used nids: 0, node, meta, (and root counted as valid node) */
> nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
> @@ -2392,6 +2543,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> if (!nm_i->nat_bitmap)
> return -ENOMEM;
>
> + err = __get_nat_bitmaps(sbi);
> + if (err)
> + return err;
> +
> #ifdef CONFIG_F2FS_CHECK_FS
> nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size,
> GFP_KERNEL);
> @@ -2414,7 +2569,7 @@ int build_node_manager(struct f2fs_sb_info *sbi)
> if (err)
> return err;
>
> - build_free_nids(sbi, true);
> + build_free_nids(sbi, true, true);
> return 0;
> }
>
> @@ -2473,6 +2628,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
> up_write(&nm_i->nat_tree_lock);
>
> kfree(nm_i->nat_bitmap);
> + kfree(nm_i->nat_bits);
> #ifdef CONFIG_F2FS_CHECK_FS
> kfree(nm_i->nat_bitmap_mir);
> #endif
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index df2ff5cfe8f4..8e1ec248c653 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -386,7 +386,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
> if (!available_free_memory(sbi, FREE_NIDS))
> try_to_free_nids(sbi, MAX_FREE_NIDS);
> else
> - build_free_nids(sbi, false);
> + build_free_nids(sbi, false, false);
>
> if (!is_idle(sbi))
> return;
> diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
> index f0748524ca8c..1c92ace2e8f8 100644
> --- a/include/linux/f2fs_fs.h
> +++ b/include/linux/f2fs_fs.h
> @@ -114,6 +114,7 @@ struct f2fs_super_block {
> /*
> * For checkpoint
> */
> +#define CP_NAT_BITS_FLAG 0x00000080
> #define CP_CRC_RECOVERY_FLAG 0x00000040
> #define CP_FASTBOOT_FLAG 0x00000020
> #define CP_FSCK_FLAG 0x00000010
>