Re: [PATCH AUTOSEL 6.12 13/19] btrfs: reduce lock contention when eb cache miss for btree search

From: Filipe Manana
Date: Mon Nov 25 2024 - 06:24:04 EST


On Sun, Nov 24, 2024 at 12:46 PM Sasha Levin <sashal@xxxxxxxxxx> wrote:
>
> From: Robbie Ko <robbieko@xxxxxxxxxxxx>
>
> [ Upstream commit 99785998ed1cea142e20f4904ced26537a37bf74 ]

Why is this being picked for stable?

It's not a bug fix or anything critical.
It's just a performance optimization, and it's not even one where we
know (AFAIK) of any workload where it would give very significant
gains to justify backporting to stable.

Thanks.

>
> When traversing the btree, if an extent buffer (eb) cache miss occurs, we
> switch to taking the eb read lock and release all previous locks
> (including the parent lock) to reduce lock contention.
>
> If an eb cache miss occurs in a leaf and IO needs to be performed, before
> this change we released locks only from level 2 and up, and we read the
> leaf's content from disk while still holding a lock on its parent
> (level 1), causing unnecessary lock contention on the parent. After this
> change we release locks from level 1 and up, lock level 0 (the leaf)
> instead, and then read the leaf's content from disk.
>
> Because we have already prepared the check parameters and hold the eb's
> read lock, we can ensure that no race will occur during the check and
> cause unexpected errors.
>
> Reviewed-by: Filipe Manana <fdmanana@xxxxxxxx>
> Signed-off-by: Robbie Ko <robbieko@xxxxxxxxxxxx>
> Signed-off-by: David Sterba <dsterba@xxxxxxxx>
> Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>
> ---
> fs/btrfs/ctree.c | 101 ++++++++++++++++++++++++++++++++---------------
> 1 file changed, 70 insertions(+), 31 deletions(-)
>
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 0cc919d15b144..dd92acd66624f 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -1515,12 +1515,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
> struct btrfs_tree_parent_check check = { 0 };
> u64 blocknr;
> u64 gen;
> - struct extent_buffer *tmp;
> - int ret;
> + struct extent_buffer *tmp = NULL;
> + int ret = 0;
> int parent_level;
> - bool unlock_up;
> + int err;
> + bool read_tmp = false;
> + bool tmp_locked = false;
> + bool path_released = false;
>
> - unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
> blocknr = btrfs_node_blockptr(*eb_ret, slot);
> gen = btrfs_node_ptr_generation(*eb_ret, slot);
> parent_level = btrfs_header_level(*eb_ret);
> @@ -1551,68 +1553,105 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
> */
> if (btrfs_verify_level_key(tmp,
> parent_level - 1, &check.first_key, gen)) {
> - free_extent_buffer(tmp);
> - return -EUCLEAN;
> + ret = -EUCLEAN;
> + goto out;
> }
> *eb_ret = tmp;
> - return 0;
> + tmp = NULL;
> + ret = 0;
> + goto out;
> }
>
> if (p->nowait) {
> - free_extent_buffer(tmp);
> - return -EAGAIN;
> + ret = -EAGAIN;
> + goto out;
> }
>
> - if (unlock_up)
> + if (!p->skip_locking) {
> btrfs_unlock_up_safe(p, level + 1);
> -
> - /* now we're allowed to do a blocking uptodate check */
> - ret = btrfs_read_extent_buffer(tmp, &check);
> - if (ret) {
> - free_extent_buffer(tmp);
> + tmp_locked = true;
> + btrfs_tree_read_lock(tmp);
> btrfs_release_path(p);
> - return ret;
> + ret = -EAGAIN;
> + path_released = true;
> }
>
> - if (unlock_up)
> - ret = -EAGAIN;
> + /* Now we're allowed to do a blocking uptodate check. */
> + err = btrfs_read_extent_buffer(tmp, &check);
> + if (err) {
> + ret = err;
> + goto out;
> + }
>
> + if (ret == 0) {
> + ASSERT(!tmp_locked);
> + *eb_ret = tmp;
> + tmp = NULL;
> + }
> goto out;
> } else if (p->nowait) {
> - return -EAGAIN;
> + ret = -EAGAIN;
> + goto out;
> }
>
> - if (unlock_up) {
> + if (!p->skip_locking) {
> btrfs_unlock_up_safe(p, level + 1);
> ret = -EAGAIN;
> - } else {
> - ret = 0;
> }
>
> if (p->reada != READA_NONE)
> reada_for_search(fs_info, p, level, slot, key->objectid);
>
> - tmp = read_tree_block(fs_info, blocknr, &check);
> + tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
> if (IS_ERR(tmp)) {
> + ret = PTR_ERR(tmp);
> + tmp = NULL;
> + goto out;
> + }
> + read_tmp = true;
> +
> + if (!p->skip_locking) {
> + ASSERT(ret == -EAGAIN);
> + tmp_locked = true;
> + btrfs_tree_read_lock(tmp);
> btrfs_release_path(p);
> - return PTR_ERR(tmp);
> + path_released = true;
> + }
> +
> + /* Now we're allowed to do a blocking uptodate check. */
> + err = btrfs_read_extent_buffer(tmp, &check);
> + if (err) {
> + ret = err;
> + goto out;
> }
> +
> /*
> * If the read above didn't mark this buffer up to date,
> * it will never end up being up to date. Set ret to EIO now
> * and give up so that our caller doesn't loop forever
> * on our EAGAINs.
> */
> - if (!extent_buffer_uptodate(tmp))
> + if (!extent_buffer_uptodate(tmp)) {
> ret = -EIO;
> + goto out;
> + }
>
> -out:
> if (ret == 0) {
> + ASSERT(!tmp_locked);
> *eb_ret = tmp;
> - } else {
> - free_extent_buffer(tmp);
> - btrfs_release_path(p);
> + tmp = NULL;
> + }
> +out:
> + if (tmp) {
> + if (tmp_locked)
> + btrfs_tree_read_unlock(tmp);
> + if (read_tmp && ret && ret != -EAGAIN)
> + free_extent_buffer_stale(tmp);
> + else
> + free_extent_buffer(tmp);
> }
> + if (ret && !path_released)
> + btrfs_release_path(p);
>
> return ret;
> }
> @@ -2198,7 +2237,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> }
>
> err = read_block_for_search(root, p, &b, level, slot, key);
> - if (err == -EAGAIN)
> + if (err == -EAGAIN && !p->nowait)
> goto again;
> if (err) {
> ret = err;
> @@ -2325,7 +2364,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
> }
>
> err = read_block_for_search(root, p, &b, level, slot, key);
> - if (err == -EAGAIN)
> + if (err == -EAGAIN && !p->nowait)
> goto again;
> if (err) {
> ret = err;
> --
> 2.43.0
>
>