Re: [PATCH v4 1/2] btrfs: zoned: reserve relocation block-group on mount
From: Filipe Manana
Date: Fri May 24 2024 - 10:08:49 EST
On Thu, May 23, 2024 at 4:32 PM Johannes Thumshirn <jth@xxxxxxxxxx> wrote:
>
> From: Johannes Thumshirn <johannes.thumshirn@xxxxxxx>
>
> Reserve one zone as a data relocation target on each mount. If we already
> find one empty block group, there's no need to force a chunk allocation,
> but we can use this empty data block group as our relocation target.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@xxxxxxx>
> ---
> fs/btrfs/block-group.c | 17 +++++++++++++
> fs/btrfs/disk-io.c | 2 ++
> fs/btrfs/zoned.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/btrfs/zoned.h | 3 +++
> 4 files changed, 89 insertions(+)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 9910bae89966..1195f6721c90 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1500,6 +1500,15 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> btrfs_put_block_group(block_group);
> continue;
> }
> +
> + spin_lock(&fs_info->relocation_bg_lock);
> + if (block_group->start == fs_info->data_reloc_bg) {
> + btrfs_put_block_group(block_group);
> + spin_unlock(&fs_info->relocation_bg_lock);
> + continue;
> + }
> + spin_unlock(&fs_info->relocation_bg_lock);
> +
> spin_unlock(&fs_info->unused_bgs_lock);
>
> btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
> @@ -1835,6 +1844,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
> bg_list);
> list_del_init(&bg->bg_list);
>
> + spin_lock(&fs_info->relocation_bg_lock);
> + if (bg->start == fs_info->data_reloc_bg) {
> + btrfs_put_block_group(bg);
> + spin_unlock(&fs_info->relocation_bg_lock);
> + continue;
> + }
> + spin_unlock(&fs_info->relocation_bg_lock);
Ok, so the reclaim task and cleaner kthread will not remove the
reserved block group.
But nothing prevents someone from running balance manually, which
will delete the block group.
E.g. block group X is empty and reserved as the data relocation bg.
The balance ioctl is invoked and goes through all block groups for relocation.
It happens to find bg X first and deletes it.
Now there's no reserved bg for data relocation anymore, and other tasks
can come in and use the freed space, filling all or most of it.
Shouldn't we prevent the data reloc bg from being a target of a manual
relocation too?
E.g. have btrfs_relocate_chunk() do nothing if the bg is the data reloc bg.
Thanks.
> +
> space_info = bg->space_info;
> spin_unlock(&fs_info->unused_bgs_lock);
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 78d3966232ae..16bb52bcb69e 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3547,6 +3547,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
> }
> btrfs_discard_resume(fs_info);
>
> + btrfs_reserve_relocation_bg(fs_info);
> +
> if (fs_info->uuid_root &&
> (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
> fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index c52a0063f7db..d291cf4f565e 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -17,6 +17,7 @@
> #include "fs.h"
> #include "accessors.h"
> #include "bio.h"
> +#include "transaction.h"
>
> /* Maximum number of zones to report per blkdev_report_zones() call */
> #define BTRFS_REPORT_NR_ZONES 4096
> @@ -2637,3 +2638,69 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
> }
> spin_unlock(&fs_info->zone_active_bgs_lock);
> }
> +
> +static u64 find_empty_block_group(struct btrfs_space_info *sinfo, u64 flags)
> +{
> + struct btrfs_block_group *bg;
> +
> + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
> + list_for_each_entry(bg, &sinfo->block_groups[i], list) {
> + if (bg->flags != flags)
> + continue;
> + if (bg->used == 0)
> + return bg->start;
> + }
> + }
> +
> + return 0;
> +}
> +
> +void btrfs_reserve_relocation_bg(struct btrfs_fs_info *fs_info)
> +{
> + struct btrfs_root *tree_root = fs_info->tree_root;
> + struct btrfs_space_info *sinfo = fs_info->data_sinfo;
> + struct btrfs_trans_handle *trans;
> + struct btrfs_block_group *bg;
> + u64 flags = btrfs_get_alloc_profile(fs_info, sinfo->flags);
> + u64 bytenr = 0;
> +
> + lockdep_assert_not_held(&fs_info->relocation_bg_lock);
> +
> + if (!btrfs_is_zoned(fs_info))
> + return;
> +
> + if (fs_info->data_reloc_bg)
> + return;
> +
> + bytenr = find_empty_block_group(sinfo, flags);
> + if (!bytenr) {
> + int ret;
> +
> + trans = btrfs_join_transaction(tree_root);
> + if (IS_ERR(trans))
> + return;
> +
> + ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
> + btrfs_end_transaction(trans);
> + if (ret)
> + return;
> +
> + bytenr = find_empty_block_group(sinfo, flags);
> + if (!bytenr)
> + return;
> +
> + }
> +
> + bg = btrfs_lookup_block_group(fs_info, bytenr);
> + if (!bg)
> + return;
> +
> + if (!btrfs_zone_activate(bg))
> + bytenr = 0;
> +
> + btrfs_put_block_group(bg);
> +
> + spin_lock(&fs_info->relocation_bg_lock);
> + fs_info->data_reloc_bg = bytenr;
> + spin_unlock(&fs_info->relocation_bg_lock);
> +}
> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
> index ff605beb84ef..56c1c19d52bc 100644
> --- a/fs/btrfs/zoned.h
> +++ b/fs/btrfs/zoned.h
> @@ -95,6 +95,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
> int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
> struct btrfs_space_info *space_info, bool do_finish);
> void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
> +void btrfs_reserve_relocation_bg(struct btrfs_fs_info *fs_info);
> #else /* CONFIG_BLK_DEV_ZONED */
>
> static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
> @@ -264,6 +265,8 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
>
> static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }
>
> +static inline void btrfs_reserve_relocation_bg(struct btrfs_fs_info *fs_info) { }
> +
> #endif
>
> static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
>
> --
> 2.43.0
>
>