Re: [f2fs-dev] [PATCH 07/10] f2fs: use meta_inode cache to improve roll-forward speed

From: Jaegeuk Kim
Date: Tue Sep 23 2014 - 00:47:26 EST


Hi Chao,

On Mon, Sep 22, 2014 at 10:36:25AM +0800, Chao Yu wrote:
> Hi Jaegeuk,
>
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@xxxxxxxxxx]
> > Sent: Monday, September 15, 2014 6:14 AM
> > To: linux-kernel@xxxxxxxxxxxxxxx; linux-fsdevel@xxxxxxxxxxxxxxx;
> > linux-f2fs-devel@xxxxxxxxxxxxxxxxxxxxx
> > Cc: Jaegeuk Kim
> > Subject: [f2fs-dev] [PATCH 07/10] f2fs: use meta_inode cache to improve roll-forward speed
> >
> > Previously, all the dnode pages should be read during the roll-forward recovery.
> > Even worsely, whole the chain was traversed twice.
> > This patch removes that redundant and costly read operations by using page cache
> > of meta_inode and readahead function as well.
> >
> > Signed-off-by: Jaegeuk Kim <jaegeuk@xxxxxxxxxx>
> > ---
> > fs/f2fs/checkpoint.c | 11 ++++++++--
> > fs/f2fs/f2fs.h | 5 +++--
> > fs/f2fs/recovery.c | 59 +++++++++++++++++++++++++---------------------------
> > fs/f2fs/segment.h | 5 +++--
> > 4 files changed, 43 insertions(+), 37 deletions(-)
> >
> > diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> > index 7262d99..d1ed889 100644
> > --- a/fs/f2fs/checkpoint.c
> > +++ b/fs/f2fs/checkpoint.c
> > @@ -82,6 +82,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
> > case META_SSA:
> > case META_CP:
> > return 0;
> > + case META_POR:
> > + return SM_I(sbi)->main_blkaddr + sbi->user_block_count;
>
> Here we will skip virtual over-provision segments, so better to use TOTAL_BLKS(sbi).
>
> > default:
> > BUG();
> > }
> > @@ -90,11 +92,11 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
> > /*
> > * Readahead CP/NAT/SIT/SSA pages
> > */
> > -int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
> > +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
> > {
> > block_t prev_blk_addr = 0;
> > struct page *page;
> > - int blkno = start;
> > + block_t blkno = start;
> > int max_blks = get_max_meta_blks(sbi, type);
> >
> > struct f2fs_io_info fio = {
> > @@ -128,6 +130,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int
> > type)
> > /* get ssa/cp block addr */
> > blk_addr = blkno;
> > break;
> > + case META_POR:
> > + if (unlikely(blkno >= max_blks))
> > + goto out;
> > + blk_addr = blkno;
> > + break;
>
> The real modification in patch which is merged to dev of f2fs is as following:
>
> - /* get ssa/cp block addr */
> + case META_POR:
> + if (blkno >= max_blks || blkno < min_blks)
> + goto out;
>
> IMHO, it's better to verify boundary separately for META_{SSA,CP,POR} with unlikely.
> How do you think?

Not bad.
Could you check the v2 below?

>
> > default:
> > BUG();
> > }
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 4f84d2a..48d7d46 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -103,7 +103,8 @@ enum {
> > META_CP,
> > META_NAT,
> > META_SIT,
> > - META_SSA
> > + META_SSA,
> > + META_POR,
> > };
> >
> > /* for the list of ino */
> > @@ -1291,7 +1292,7 @@ void destroy_segment_manager_caches(void);
> > */
> > struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
> > struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
> > -int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
> > +int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
> > long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
> > void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
> > void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
> > diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
> > index 3736728..6f7fbfa 100644
> > --- a/fs/f2fs/recovery.c
> > +++ b/fs/f2fs/recovery.c
> > @@ -173,7 +173,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head
> > *head)
> > {
> > unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
> > struct curseg_info *curseg;
> > - struct page *page;
> > + struct page *page = NULL;
> > block_t blkaddr;
> > int err = 0;
> >
> > @@ -181,20 +181,19 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head
> > *head)
> > curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
> > blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
> >
> > - /* read node page */
> > - page = alloc_page(GFP_F2FS_ZERO);
> > - if (!page)
> > - return -ENOMEM;
> > - lock_page(page);
> > -
> > while (1) {
> > struct fsync_inode_entry *entry;
> >
> > - err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
> > - if (err)
> > - return err;
> > + if (blkaddr < SM_I(sbi)->main_blkaddr ||
> > + blkaddr > TOTAL_BLKS(sbi))
>
> blkaddr >= TOTAL_BLKS(sbi) ?

Right.

>
> > + break;
> >
> > - lock_page(page);
> > + page = get_meta_page(sbi, blkaddr);
> > + if (IS_ERR(page))
> > + return PTR_ERR(page);
> > +
> > + ra_meta_pages(sbi, next_blkaddr_of_node(page),
> > + MAX_BIO_BLOCKS(max_hw_blocks(sbi)), META_POR);
>
> In first round we readahead [0,n] pages, then in second round we readahead [1,n+1] pages,
> but as [1,n] pages is already uptodate, so we will readahead only one page, the same for the
> following rounds. It's low efficiency.
>
> So how about modify in find_fsync_dnodes/recover_data like this:
>
> block_t blkaddr, ra_start_blk;
> int err = 0;
>
> /* get node pages in the current segment */
> curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
> blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
> ra_start_blk = blkaddr;
>
> while (1) {
> struct fsync_inode_entry *entry;
>
> if (blkaddr < SM_I(sbi)->main_blkaddr ||
> blkaddr > TOTAL_BLKS(sbi))
> return 0;
>
> if (ra_start_blk == blkaddr) {
> ra_meta_pages(sbi, ra_start_blk, MAX_BIO_BLOCKS(sbi),
> META_POR);
> ra_start_blk += MAX_BIO_BLOCKS(sbi);
> }
>
> page = get_meta_page(sbi, blkaddr);

Well. the blkaddr would not be alway able to be matched.
How about this?

commit 1cdbd53437f32046af8d94f551505754446c9720
Author: Jaegeuk Kim <jaegeuk@xxxxxxxxxx>
Date: Thu Sep 11 13:49:55 2014 -0700

f2fs: use meta_inode cache to improve roll-forward speed

Previously, all the dnode pages should be read during the roll-forward recovery.
Even worsely, whole the chain was traversed twice.
This patch removes that redundant and costly read operations by using page cache
of meta_inode and readahead function as well.

Signed-off-by: Jaegeuk Kim <jaegeuk@xxxxxxxxxx>

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index d44d287..700869c 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,7 +72,23 @@ out:
return page;
}

-static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ bool readahead = false;
+ struct page *page;
+
+ page = find_get_page(META_MAPPING(sbi), index);
+ if (!page || (page && !PageUptodate(page)))
+ readahead = true;
+ f2fs_put_page(page, 0);
+
+ if (readahead)
+ ra_meta_pages(sbi, index,
+ MAX_BIO_BLOCKS(max_hw_blocks(sbi)), META_POR);
+ return get_meta_page(sbi, index);
+}
+
+static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
{
switch (type) {
case META_NAT:
@@ -82,6 +98,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
case META_SSA:
case META_CP:
return 0;
+ case META_POR:
+ return SM_I(sbi)->main_blkaddr + TOTAL_BLKS(sbi);
default:
BUG();
}
@@ -90,12 +108,13 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
/*
* Readahead CP/NAT/SIT/SSA pages
*/
-int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
{
block_t prev_blk_addr = 0;
struct page *page;
- int blkno = start;
- int max_blks = get_max_meta_blks(sbi, type);
+ block_t blkno = start;
+ block_t max_blks = get_max_meta_blks(sbi, type);
+ block_t min_blks = SM_I(sbi)->seg0_blkaddr;

struct f2fs_io_info fio = {
.type = META,
@@ -125,7 +144,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
break;
case META_SSA:
case META_CP:
- /* get ssa/cp block addr */
+ case META_POR:
+ if (unlikely(blkno >= max_blks))
+ goto out;
+ if (unlikely(blkno < min_blks))
+ goto out;
blk_addr = blkno;
break;
default:
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 8efa170..b6439c3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -103,7 +103,8 @@ enum {
META_CP,
META_NAT,
META_SIT,
- META_SSA
+ META_SSA,
+ META_POR,
};

/* for the list of ino */
@@ -1294,7 +1295,8 @@ void destroy_segment_manager_caches(void);
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
-int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
+struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index e587ee1..3a259a9 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -138,7 +138,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
- struct page *page;
+ struct page *page = NULL;
block_t blkaddr;
int err = 0;

@@ -146,20 +146,14 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);

- /* read node page */
- page = alloc_page(GFP_F2FS_ZERO);
- if (!page)
- return -ENOMEM;
- lock_page(page);
-
while (1) {
struct fsync_inode_entry *entry;

- err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
- if (err)
- return err;
+ if (blkaddr < SM_I(sbi)->main_blkaddr ||
+ blkaddr >= TOTAL_BLKS(sbi))
+ return 0;

- lock_page(page);
+ page = get_meta_page_ra(sbi, blkaddr);

if (cp_ver != cpver_of_node(page))
break;
@@ -202,11 +196,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
+ f2fs_put_page(page, 1);
}
-
- unlock_page(page);
- __free_pages(page, 0);
-
+ f2fs_put_page(page, 1);
return err;
}

@@ -400,7 +392,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
- struct page *page;
+ struct page *page = NULL;
int err = 0;
block_t blkaddr;

@@ -408,32 +400,29 @@ static int recover_data(struct f2fs_sb_info *sbi,
curseg = CURSEG_I(sbi, type);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);

- /* read node page */
- page = alloc_page(GFP_F2FS_ZERO);
- if (!page)
- return -ENOMEM;
-
- lock_page(page);
-
while (1) {
struct fsync_inode_entry *entry;

- err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
- if (err)
- return err;
+ if (blkaddr < SM_I(sbi)->main_blkaddr ||
+ blkaddr > TOTAL_BLKS(sbi))
+ break;

- lock_page(page);
+ page = get_meta_page_ra(sbi, blkaddr);

- if (cp_ver != cpver_of_node(page))
+ if (cp_ver != cpver_of_node(page)) {
+ f2fs_put_page(page, 1);
break;
+ }

entry = get_fsync_inode(head, ino_of_node(page));
if (!entry)
goto next;

err = do_recover_data(sbi, entry->inode, page, blkaddr);
- if (err)
+ if (err) {
+ f2fs_put_page(page, 1);
break;
+ }

if (entry->blkaddr == blkaddr) {
iput(entry->inode);
@@ -443,11 +432,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
+ f2fs_put_page(page, 1);
}
-
- unlock_page(page);
- __free_pages(page, 0);
-
if (!err)
allocate_new_segments(sbi);
return err;
@@ -493,6 +479,10 @@ out:
destroy_fsync_dnodes(&inode_list);
kmem_cache_destroy(fsync_entry_slab);

+ /* truncate meta pages to be used by the recovery */
+ truncate_inode_pages_range(META_MAPPING(sbi),
+ SM_I(sbi)->main_blkaddr << PAGE_CACHE_SHIFT, -1);
+
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
truncate_inode_pages_final(META_MAPPING(sbi));
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 013aed2..c6f37f2 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -87,6 +87,7 @@
(BITS_TO_LONGS(nr) * sizeof(unsigned long))
#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
#define TOTAL_SECS(sbi) (sbi->total_sections)
+#define TOTAL_BLKS(sbi) (SM_I(sbi)->segment_count << sbi->log_blocks_per_seg)

#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
(((sector_t)blk_addr) << (sbi)->log_sectors_per_block)
@@ -553,7 +554,7 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
{
struct f2fs_sm_info *sm_info = SM_I(sbi);
- block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
+ block_t total_blks = TOTAL_BLKS(sbi);
block_t start_addr = sm_info->seg0_blkaddr;
block_t end_addr = start_addr + total_blks - 1;
BUG_ON(blk_addr < start_addr);
@@ -606,7 +607,7 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
{
struct f2fs_sm_info *sm_info = SM_I(sbi);
- block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
+ block_t total_blks = TOTAL_BLKS(sbi);
block_t start_addr = sm_info->seg0_blkaddr;
block_t end_addr = start_addr + total_blks - 1;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/