RE:(2) [f2fs-dev] [PATCH] f2fs: support dio write for zoned storage

From: Daejun Park
Date: Mon Oct 14 2024 - 22:12:34 EST


>--------- Original Message ---------
>Sender : Daeho Jeong <daeho43@xxxxxxxxx>
>Date : 2024-10-11 05:42 (GMT+9)
>Title : Re: [f2fs-dev] [PATCH] f2fs: support dio write for zoned storage
> 
>On Fri, Sep 27, 2024 at 12:25 AM Daejun Park <daejun7.park@xxxxxxxxxxx> wrote:
>>
>> With zoned storage, F2FS avoids direct IO writes and uses buffered writes
>> with page cache flushes to prevent unaligned writes. However, the
>> unaligned write can be avoided by allowing only a single thread per zone
>> to perform direct writes.
>>
>> To achieve direct writes in zoned storage, it uses semephores to serialize
>> block allocation and writes per zone.
>>
>> Signed-off-by: Daejun Park <daejun7.park@xxxxxxxxxxx>
>> ---
>>  fs/f2fs/data.c    28 ++++++++++++++++++++++++++-
>>  fs/f2fs/f2fs.h      7 +++++--
>>  fs/f2fs/file.c    48 ++++++++++++++++++++++++++++++++++++++++-------
>>  fs/f2fs/gc.c        4 ++--
>>  fs/f2fs/segment.c   6 +++---
>>  fs/f2fs/super.c    5 ++++-
>>  6 files changed, 82 insertions(+), 16 deletions(-)
>>
>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
>> index b94cf6eea2f9..fa2bd88a2ed2 100644
>> --- a/fs/f2fs/data.c
>> +++ b/fs/f2fs/data.c
>> @@ -922,7 +922,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
>>  }
>>
>>  #ifdef CONFIG_BLK_DEV_ZONED
>> -static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr)
>> +bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr)
>>  {
>>        struct block_device *bdev = sbi->sb->s_bdev;
>>        int devi = 0;
>> @@ -4207,6 +4207,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>>                            struct iomap *srcmap)
>>  {
>>        struct f2fs_map_blocks map = {};
>> +      struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
>>        pgoff_t next_pgofs = 0;
>>        int err;
>>
>> @@ -4218,6 +4219,18 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>>        if (flags & IOMAP_WRITE)
>>                map.m_may_create = true;
>>
>> +      if (f2fs_sb_has_blkzoned(sbi) && !f2fs_is_pinned_file(inode)) {
>
>I think it's better that we can skip this for conventional LU by
>examining the block address.

At this line, the examining is not possible because the block address has not been allocated yet.

>
>> +              struct f2fs_rwsem *io_order_lock =
>> +                              &sbi->io_order_lock[map.m_seg_type];
>> +
>> +              f2fs_down_write(io_order_lock);
>> +
>> +              /* set io order lock */
>> +              iomap->private = io_order_lock;
>> +      } else {
>> +              iomap->private = NULL;
>> +      }
>> +
>>        err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO);
>>        if (err)
>>                return err;
>> @@ -4273,6 +4286,19 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>>        return 0;
>>  }
>>
>> +static int f2fs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
>> +              ssize_t written, unsigned int flags, struct iomap *iomap)
>> +{
>> +      struct f2fs_rwsem *io_order_lock = iomap->private;
>> +
>> +      /* ordered write */
>> +      if (io_order_lock)
>> +              f2fs_up_write(io_order_lock);
>> +
>> +      return 0;
>> +}
>> +
>>  const struct iomap_ops f2fs_iomap_ops = {
>>        .iomap_begin    = f2fs_iomap_begin,
>> +      .iomap_end      = f2fs_iomap_end,
>>  };
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index 33f5449dc22d..06ed132f22ad 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -1582,8 +1582,8 @@ struct f2fs_sb_info {
>>
>>        /* for bio operations */
>>        struct f2fs_bio_info *write_io[NR_PAGE_TYPE];  /* for write bios */
>> -      /* keep migration IO order for LFS mode */
>> -      struct f2fs_rwsem io_order_lock;
>> +      /* keep IO order for LFS mode */
>> +      struct f2fs_rwsem io_order_lock[NR_CURSEG_DATA_TYPE];
>>        pgoff_t page_eio_ofs[NR_PAGE_TYPE];    /* EIO page offset */
>>        int page_eio_cnt[NR_PAGE_TYPE];        /* EIO count */
>>
>> @@ -3863,6 +3863,9 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
>>  void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi);
>>  int f2fs_submit_page_bio(struct f2fs_io_info *fio);
>>  int f2fs_merge_page_bio(struct f2fs_io_info *fio);
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr);
>> +#endif
>>  void f2fs_submit_page_write(struct f2fs_io_info *fio);
>>  struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
>>                block_t blk_addr, sector_t *sector);
>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>> index 99903eafa7fe..fde49f3e54cf 100644
>> --- a/fs/f2fs/file.c
>> +++ b/fs/f2fs/file.c
>> @@ -869,13 +869,7 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw)
>>        /* disallow direct IO if any of devices has unaligned blksize */
>>        if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
>>                return true;
>> -      /*
>> -        * for blkzoned device, fallback direct IO to buffered IO, so
>> -        * all IOs can be serialized by log-structured write.
>> -        */
>> -      if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE) &&
>> -          !f2fs_is_pinned_file(inode))
>> -              return true;
>> +
>>        if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
>>                return true;
>>
>> @@ -4815,6 +4809,17 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
>>        return 0;
>>  }
>>
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +static void f2fs_dio_zone_write_end_io(struct bio *bio)
>> +{
>> +      struct f2fs_bio_info *io = (struct f2fs_bio_info *)bio->bi_private;
>> +
>> +      bio->bi_private = io->bi_private;
>> +      complete(&io->zone_wait);
>> +      iomap_dio_bio_end_io(bio);
>> +}
>> +#endif
>> +
>>  static void f2fs_dio_write_submit_io(const struct iomap_iter *iter,
>>                                        struct bio *bio, loff_t file_offset)
>>  {
>> @@ -4824,6 +4829,31 @@ static void f2fs_dio_write_submit_io(const struct iomap_iter *iter,
>>        enum temp_type temp = f2fs_get_segment_temp(seg_type);
>>
>>        bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp);
>> +
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +      if (f2fs_sb_has_blkzoned(sbi) && !f2fs_is_pinned_file(inode)) {
>> +              struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
>> +              block_t last_blkaddr = SECTOR_TO_BLOCK(bio_end_sector(bio) - 1);
>> +
>> +              f2fs_down_write(&io->io_rwsem);
>> +              if (io->zone_pending_bio) {
>> +                      wait_for_completion_io(&io->zone_wait);
>> +                      bio_put(io->zone_pending_bio);
>> +                      io->zone_pending_bio = NULL;
>> +                      io->bi_private = NULL;
>> +              }
>> +
>> +              if (is_end_zone_blkaddr(sbi, last_blkaddr)) {
>> +                      bio_get(bio);
>> +                      reinit_completion(&io->zone_wait);
>> +                      io->bi_private = bio->bi_private;
>> +                      bio->bi_private = io;
>> +                      bio->bi_end_io = f2fs_dio_zone_write_end_io;
>> +                      io->zone_pending_bio = bio;
>> +              }
>> +              f2fs_up_write(&io->io_rwsem);
>> +      }
>> +#endif
>>        submit_bio(bio);
>>  }
>>
>> @@ -4897,6 +4927,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>        dio_flags = 0;
>>        if (pos + count > inode->i_size)
>>                dio_flags = IOMAP_DIO_FORCE_WAIT;
>> +
>> +      if (f2fs_sb_has_blkzoned(sbi) && !f2fs_is_pinned_file(inode))
>> +              dio_flags = IOMAP_DIO_FORCE_WAIT;
>> +
>>        dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops,
>>                              &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0);
>>        if (IS_ERR_OR_NULL(dio)) {
>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>> index 9322a7200e31..49270713f739 100644
>> --- a/fs/f2fs/gc.c
>> +++ b/fs/f2fs/gc.c
>> @@ -1361,7 +1361,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
>>        fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
>>
>>        if (lfs_mode)
>> -              f2fs_down_write(&fio.sbi->io_order_lock);
>> +              f2fs_down_write(&fio.sbi->io_order_lock[CURSEG_COLD_DATA]);
>>
>>        mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
>>                                        fio.old_blkaddr, false);
>> @@ -1444,7 +1444,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
>>                                                        true, true, true);
>>  up_out:
>>        if (lfs_mode)
>> -              f2fs_up_write(&fio.sbi->io_order_lock);
>> +              f2fs_up_write(&fio.sbi->io_order_lock[CURSEG_COLD_DATA]);
>>  put_out:
>>        f2fs_put_dnode(&dn);
>>  out:
>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>> index 1766254279d2..d602ae4d79e3 100644
>> --- a/fs/f2fs/segment.c
>> +++ b/fs/f2fs/segment.c
>> @@ -3796,10 +3796,10 @@ void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
>>  static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
>>  {
>>        int type = __get_segment_type(fio);
>> -      bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
>> +      bool keep_order = (f2fs_lfs_mode(fio->sbi) && type <= CURSEG_COLD_DATA);
>
>ditto

It is the same as above.

After the block allocation, the semaphore can be released early by checking zone type.
However, I'm not sure if this is necessary.

Thanks for review.
Daejun

>
>>
>>        if (keep_order)
>> -              f2fs_down_read(&fio->sbi->io_order_lock);
>> +              f2fs_down_read(&fio->sbi->io_order_lock[type]);
>>
>>        if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
>>                        &fio->new_blkaddr, sum, type, fio)) {
>> @@ -3819,7 +3819,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
>>        f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
>>  out:
>>        if (keep_order)
>> -              f2fs_up_read(&fio->sbi->io_order_lock);
>> +              f2fs_up_read(&fio->sbi->io_order_lock[type]);
>>  }
>>
>>  void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio,
>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>> index fc2c586c7619..5289b6f5f6f3 100644
>> --- a/fs/f2fs/super.c
>> +++ b/fs/f2fs/super.c
>> @@ -3833,7 +3833,10 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
>>
>>        INIT_LIST_HEAD(&sbi->s_list);
>>        mutex_init(&sbi->umount_mutex);
>> -      init_f2fs_rwsem(&sbi->io_order_lock);
>> +
>> +      for (i = 0; i < NR_CURSEG_DATA_TYPE; i++)
>> +              init_f2fs_rwsem(&sbi->io_order_lock[i]);
>> +
>>        spin_lock_init(&sbi->cp_lock);
>>
>>        sbi->dirty_device = 0;
>> --
>> 2.25.1
>>
>>
>>
>> _______________________________________________
>> Linux-f2fs-devel mailing list
>> Linux-f2fs-devel@xxxxxxxxxxxxxxxxxxxxx
>> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel