[PATCH 5.17 285/343] btrfs: remove device item and update super block in the same transaction

From: Greg Kroah-Hartman
Date: Tue Apr 12 2022 - 04:58:58 EST


From: Qu Wenruo <wqu@xxxxxxxx>

commit bbac58698a55cc0a6f0c0d69a6dcd3f9f3134c11 upstream.

[BUG]
There is a report that a btrfs has a bad super block num devices.

This makes btrfs to reject the fs completely.

BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here
BTRFS error (device sdd3): failed to read chunk tree: -22
BTRFS error (device sdd3): open_ctree failed

[CAUSE]
During btrfs device removal, chunk tree and super block num devs are
updated in two different transactions:

btrfs_rm_device()
|- btrfs_rm_dev_item(device)
| |- trans = btrfs_start_transaction()
| | Now we got transaction X
| |
| |- btrfs_del_item()
| | Now device item is removed from chunk tree
| |
| |- btrfs_commit_transaction()
| Transaction X got committed, super num devs untouched,
| but device item removed from chunk tree.
| (AKA, super num devs is already incorrect)
|
|- cur_devices->num_devices--;
|- cur_devices->total_devices--;
|- btrfs_set_super_num_devices()
All those operations are not in transaction X, thus it will
only be written back to disk in next transaction.

So after the transaction X in btrfs_rm_dev_item() committed, but before
transaction X+1 (which can be minutes away), a power loss happen, then
we got the super num mismatch.

[FIX]
Instead of starting and committing a transaction inside
btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and
pass it to btrfs_rm_dev_item().

And only commit the transaction after everything is done.

Reported-by: Luca Béla Palkovics <luca.bela.palkovics@xxxxxxxxx>
Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46CKmp3igr44okQ@xxxxxxxxxxxxxx/
CC: stable@xxxxxxxxxxxxxxx # 4.14+
Reviewed-by: Anand Jain <anand.jain@xxxxxxxxxx>
Signed-off-by: Qu Wenruo <wqu@xxxxxxxx>
Signed-off-by: David Sterba <dsterba@xxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
---
fs/btrfs/volumes.c | 65 ++++++++++++++++++++++-------------------------------
1 file changed, 28 insertions(+), 37 deletions(-)

--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1945,23 +1945,18 @@ static void update_dev_time(const char *
path_put(&path);
}

-static int btrfs_rm_dev_item(struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_trans_handle *trans;

path = btrfs_alloc_path();
if (!path)
return -ENOMEM;

- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- return PTR_ERR(trans);
- }
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
@@ -1972,21 +1967,12 @@ static int btrfs_rm_dev_item(struct btrf
if (ret) {
if (ret > 0)
ret = -ENOENT;
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
goto out;
}

ret = btrfs_del_item(trans, root, path);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- }
-
out:
btrfs_free_path(path);
- if (!ret)
- ret = btrfs_commit_transaction(trans);
return ret;
}

@@ -2127,6 +2113,7 @@ int btrfs_rm_device(struct btrfs_fs_info
struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
@@ -2142,7 +2129,7 @@ int btrfs_rm_device(struct btrfs_fs_info

ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
- goto out;
+ return ret;

device = btrfs_find_device(fs_info->fs_devices, args);
if (!device) {
@@ -2150,27 +2137,22 @@ int btrfs_rm_device(struct btrfs_fs_info
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = -ENOENT;
- goto out;
+ return ret;
}

if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
rcu_str_deref(device->name), device->devid);
- ret = -ETXTBSY;
- goto out;
+ return -ETXTBSY;
}

- if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
- ret = BTRFS_ERROR_DEV_TGT_REPLACE;
- goto out;
- }
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ return BTRFS_ERROR_DEV_TGT_REPLACE;

if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
- fs_info->fs_devices->rw_devices == 1) {
- ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
- goto out;
- }
+ fs_info->fs_devices->rw_devices == 1)
+ return BTRFS_ERROR_DEV_ONLY_WRITABLE;

if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
@@ -2183,14 +2165,22 @@ int btrfs_rm_device(struct btrfs_fs_info
if (ret)
goto error_undo;

- /*
- * TODO: the superblock still includes this device in its num_devices
- * counter although write_all_supers() is not locked out. This
- * could give a filesystem state which requires a degraded mount.
- */
- ret = btrfs_rm_dev_item(device);
- if (ret)
+ trans = btrfs_start_transaction(fs_info->chunk_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto error_undo;
+ }
+
+ ret = btrfs_rm_dev_item(trans, device);
+ if (ret) {
+ /* Any error in dev item removal is critical */
+ btrfs_crit(fs_info,
+ "failed to remove device item for devid %llu: %d",
+ device->devid, ret);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }

clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device);
@@ -2273,7 +2263,8 @@ int btrfs_rm_device(struct btrfs_fs_info
free_fs_devices(cur_devices);
}

-out:
+ ret = btrfs_commit_transaction(trans);
+
return ret;

error_undo:
@@ -2284,7 +2275,7 @@ error_undo:
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
- goto out;
+ return ret;
}

void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)