Re: [PATCH v8 05/11] btrfs: lookup physical address from stripe extent

From: Qu Wenruo
Date: Thu Sep 14 2023 - 05:18:45 EST




On 2023/9/11 22:22, Johannes Thumshirn wrote:
Lookup the physical address from the raid stripe tree when a read on an
RAID volume formatted with the raid stripe tree was attempted.

If the requested logical address was not found in the stripe tree, it may
still be in the in-memory ordered stripe tree, so fallback to searching
the ordered stripe tree in this case.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@xxxxxxx>
---
fs/btrfs/raid-stripe-tree.c | 159 ++++++++++++++++++++++++++++++++++++++++++++
fs/btrfs/raid-stripe-tree.h | 11 +++
fs/btrfs/volumes.c | 37 ++++++++---
3 files changed, 198 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 5b12f40877b5..7ed02e4b79ec 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -324,3 +324,162 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,

return ret;
}
+
+static bool btrfs_check_for_extent(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length, struct btrfs_path *path)
+{
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
+ struct btrfs_key key;
+ int ret;
+
+ btrfs_release_path(path);
+
+ key.objectid = logical;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = length;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+
+ return ret;
+}
+
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index,
+ struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_root *stripe_root = btrfs_stripe_tree_root(fs_info);
+ struct btrfs_stripe_extent *stripe_extent;
+ struct btrfs_key stripe_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int num_stripes;
+ u8 encoding;
+ u64 offset;
+ u64 found_logical;
+ u64 found_length;
+ u64 end;
+ u64 found_end;
+ int slot;
+ int ret;
+ int i;
+
+ stripe_key.objectid = logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
+ if (ret < 0)
+ goto free_path;
+ if (ret) {
+ if (path->slots[0] != 0)
+ path->slots[0]--;

IIRC we have btrfs_previous_item() to do the forward search.

+ }
+
+ end = logical + *length;

IMHO we can make it const and initialize it at the definition part.

+
+ while (1) {

Here we only can hit at most one RST item, thus I'd recommend to remove
the while().

Although this would mean we will need a if () to handle (ret > 0) case,
but it may still be a little easier to read than a loop.

You may want to refer to btrfs_lookup_csum() for the non-loop
implementation.

+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ found_logical = found_key.objectid;
+ found_length = found_key.offset;
+ found_end = found_logical + found_length;
+
+ if (found_logical > end) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (in_range(logical, found_logical, found_length))
+ break;
+
+ ret = btrfs_next_item(stripe_root, path);
+ if (ret)
+ goto out;
+ }
+
+ offset = logical - found_logical;
+
+ /*
+ * If we have a logically contiguous, but physically noncontinuous
+ * range, we need to split the bio. Record the length after which we
+ * must split the bio.
+ */
+ if (end > found_end)
+ *length -= end - found_end;
+
+ num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+ stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+ encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent);
+
+ if (encoding != btrfs_bg_type_to_raid_encoding(map_type)) {
+ ret = -ENOENT;
+ goto out;

This looks like a very weird situation, we have a bg with a different type.
Should we do some warning or is there some valid situation for this?

+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
+ u64 devid = btrfs_raid_stride_devid(leaf, stride);
+ u64 len = btrfs_raid_stride_length(leaf, stride);
+ u64 physical = btrfs_raid_stride_physical(leaf, stride);
+
+ if (offset >= len) {
+ offset -= len;
+
+ if (offset >= BTRFS_STRIPE_LEN)
+ continue;
+ }
+
+ if (devid != stripe->dev->devid)
+ continue;
+
+ if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i)
+ continue;
+
+ stripe->physical = physical + offset;
+
+ ret = 0;
+ goto free_path;
+ }
+
+ /*
+ * If we're here, we haven't found the requested devid in the stripe.
+ */
+ ret = -ENOENT;
+out:
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret && ret != -EIO) {
+ /*
+ * Check if the range we're looking for is actually backed by
+ * an extent. This can happen, e.g. when scrub is running on a
+ * block-group and the extent it is trying to scrub get's
+ * deleted in the meantime. Although scrub is setting the
+ * block-group to read-only, deletion of extents are still
+ * allowed. If the extent is gone, simply return ENOENT and be
+ * good.
+ */

As mentioned in the next patch (sorry for the reversed order), this
should be handled in a different way (by only searching commit root for
scrub usage).
+ if (btrfs_check_for_extent(fs_info, logical, *length, path)) {
+ ret = -ENOENT;
+ goto free_path;
+ }
+
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ btrfs_print_tree(leaf, 1);
+ btrfs_err(fs_info,
+ "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
+ logical, logical + *length, stripe->dev->devid,
+ btrfs_bg_type_to_raid_name(map_type));
+ }
+free_path:
+ btrfs_free_path(path);
+
+ return ret;
+}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 7560dc501a65..40aa553ae8aa 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -13,6 +13,10 @@ struct btrfs_io_stripe;

int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
u64 length);
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index,
+ struct btrfs_io_stripe *stripe);
int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *ordered_extent);

@@ -33,4 +37,11 @@ static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,

return false;
}
+
+static inline int btrfs_num_raid_stripes(u32 item_size)
+{
+ return (item_size - offsetof(struct btrfs_stripe_extent, strides)) /
+ sizeof(struct btrfs_raid_stride);
+}
+
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0c0fd4eb4848..7c25f5c77788 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -35,6 +35,7 @@
#include "relocation.h"
#include "scrub.h"
#include "super.h"
+#include "raid-stripe-tree.h"

#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
@@ -6206,12 +6207,22 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
return U64_MAX;
}

-static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
- u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length, struct btrfs_io_stripe *dst,
+ struct map_lookup *map, u32 stripe_index,
+ u64 stripe_offset, u64 stripe_nr)
Do we need @length to be a pointer?
IIRC we can return the number of bytes we mapped, or <0 for errors.
Thus at least @length doesn't need to be a pointer.

Thanks,
Qu

{
dst->dev = map->stripes[stripe_index].dev;
+
+ if (op == BTRFS_MAP_READ &&
+ btrfs_need_stripe_tree_update(fs_info, map->type))
+ return btrfs_get_raid_extent_offset(fs_info, logical, length,
+ map->type, stripe_index,
+ dst);
+
dst->physical = map->stripes[stripe_index].physical +
stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+ return 0;
}

/*
@@ -6428,11 +6439,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
*/
if (smap && num_alloc_stripes == 1 &&
!((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
- set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length, smap, map,
+ stripe_index, stripe_offset, stripe_nr);
if (mirror_num_ret)
*mirror_num_ret = mirror_num;
*bioc_ret = NULL;
- ret = 0;
goto out;
}

@@ -6463,21 +6474,29 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
bioc->full_stripe_logical = em->start +
btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
for (i = 0; i < num_stripes; i++)
- set_io_stripe(&bioc->stripes[i], map,
- (i + stripe_nr) % num_stripes,
- stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map,
+ (i + stripe_nr) % num_stripes,
+ stripe_offset, stripe_nr);
} else {
/*
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
for (i = 0; i < num_stripes; i++) {
- set_io_stripe(&bioc->stripes[i], map, stripe_index,
- stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map, stripe_index,
+ stripe_offset, stripe_nr);
stripe_index++;
}
}

+ if (ret) {
+ *bioc_ret = NULL;
+ btrfs_put_bioc(bioc);
+ goto out;
+ }
+
if (op != BTRFS_MAP_READ)
max_errors = btrfs_chunk_max_errors(map);