Re: [PATCH RFC 2/8] fs: add a global device to super block hash table

From: Jan Kara

Date: Mon Jun 08 2026 - 06:33:17 EST

On Tue 02-06-26 12:10:08, Christian Brauner wrote:
> fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
> forces the holder to be exactly one superblock and prevents several
> superblocks from sharing one block device. That's what erofs is doing.
>
> Introduce a global dev_t-keyed rhltable mapping each block device to the
> superblock(s) using it. The holder argument becomes purely the block
> layer's exclusivity token (a superblock, or a file_system_type for
> shared devices) and is no longer needed by the fs specific callbacks.
>
> Registration keeps one entry per (device, superblock). When a filesystem
> claims a device it already uses (xfs with its log on the data device), no
> second entry is added, so each superblock is acted on once.
>
> Each table entry holds a passive reference (s_count) on its superblock,
> so the struct stays valid for as long as the entry is reachable. The
> callbacks look the device up in the table and act on every superblock
> using it:
>
> Unlinking an entry is deferred to the last unpin, so a cursor never
> resumes from a removed node. After this it's possible to act on all
> superblocks that share a given device.
>
> Signed-off-by: Christian Brauner (Amutable) <brauner@xxxxxxxxxx>

Looks good! One comment below:

> static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
> {
> - struct super_block *sb;
> + struct fs_bdev_holder *h;
> + dev_t dev = bdev->bd_dev;
>
> - sb = bdev_super_lock(bdev, false);
> - if (!sb)
> - return;
> + mutex_unlock(&bdev->bd_holder_lock);

The moment we drop bd_holder_lock, there's nothing which prevents the bdev
owner from changing. So this can lead to a situation where we miss calling
->mark_dead callback of the new holder. Similarly for all the other holder
ops. I didn't find a situation where it would actually matter so I think
we're fine but it's a potential catch. Anyway, feel free to add:

Reviewed-by: Jan Kara <jack@xxxxxxx>

Honza

>
> - if (sb->s_op->remove_bdev) {
> - int ret;
> + for (h = fs_bdev_first(dev); h; h = fs_bdev_next(h)) {
> + struct super_block *sb = h->sb;
>
> - ret = sb->s_op->remove_bdev(sb, bdev);
> - if (!ret) {
> - super_unlock_shared(sb);
> - return;
> + if (!super_lock_shared(sb))
> + continue;
> + if (sb->s_root && (sb->s_flags & SB_ACTIVE)) {
> + if (!sb->s_op->remove_bdev ||
> + sb->s_op->remove_bdev(sb, bdev)) {
> + if (!surprise)
> + sync_filesystem(sb);
> + shrink_dcache_sb(sb);
> + evict_inodes(sb);
> + if (sb->s_op->shutdown)
> + sb->s_op->shutdown(sb);
> + }
> }
> - /* Fallback to shutdown. */
> + super_unlock_shared(sb);
> }
> -
> - if (!surprise)
> - sync_filesystem(sb);
> - shrink_dcache_sb(sb);
> - evict_inodes(sb);
> - if (sb->s_op->shutdown)
> - sb->s_op->shutdown(sb);
> -
> - super_unlock_shared(sb);
> }
>
> static void fs_bdev_sync(struct block_device *bdev)
> {
> - struct super_block *sb;
> + struct fs_bdev_holder *h;
> + dev_t dev = bdev->bd_dev;
>
> - sb = bdev_super_lock(bdev, false);
> - if (!sb)
> - return;
> + mutex_unlock(&bdev->bd_holder_lock);
>
> - sync_filesystem(sb);
> - super_unlock_shared(sb);
> -}
> + for (h = fs_bdev_first(dev); h; h = fs_bdev_next(h)) {
> + struct super_block *sb = h->sb;
>
> -static struct super_block *get_bdev_super(struct block_device *bdev)
> -{
> - bool active = false;
> - struct super_block *sb;
> -
> - sb = bdev_super_lock(bdev, true);
> - if (sb) {
> - active = atomic_inc_not_zero(&sb->s_active);
> - super_unlock_excl(sb);
> + if (!super_lock_shared(sb))
> + continue;
> + if (sb->s_root && (sb->s_flags & SB_ACTIVE))
> + sync_filesystem(sb);
> + super_unlock_shared(sb);
> }
> - if (!active)
> - return NULL;
> - return sb;
> }
>
> /**
> - * fs_bdev_freeze - freeze owning filesystem of block device
> + * fs_bdev_freeze - freeze every superblock using a block device
> * @bdev: block device
> *
> - * Freeze the filesystem that owns this block device if it is still
> - * active.
> - *
> - * A filesystem that owns multiple block devices may be frozen from each
> - * block device and won't be unfrozen until all block devices are
> - * unfrozen. Each block device can only freeze the filesystem once as we
> - * nest freezes for block devices in the block layer.
> + * Freeze each live superblock using @bdev. A superblock owning several block
> + * devices is frozen once per device and stays frozen until all are thawed; the
> + * block layer nests these freezes so the count stays balanced.
> *
> - * Return: If the freeze was successful zero is returned. If the freeze
> - * failed a negative error code is returned.
> + * Return: 0, or the error from the one superblock on a single-fs device. When
> + * several superblocks share @bdev a per-superblock failure is swallowed
> + * (see below), but a sync_blockdev() failure is always reported.
> */
> static int fs_bdev_freeze(struct block_device *bdev)
> {
> - struct super_block *sb;
> - int error = 0;
> + dev_t dev = bdev->bd_dev;
> + struct fs_bdev_holder *h;
> + unsigned int count = 0;
> + int error = 0, err;
>
> lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
>
> - sb = get_bdev_super(bdev);
> - if (!sb)
> - return -EINVAL;
> + mutex_unlock(&bdev->bd_holder_lock);
>
> - if (sb->s_op->freeze_super)
> - error = sb->s_op->freeze_super(sb,
> - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
> - else
> - error = freeze_super(sb,
> - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
> + for (h = fs_bdev_first(dev); h; h = fs_bdev_next(h)) {
> + if (!atomic_inc_not_zero(&h->sb->s_active))
> + continue;
> + err = fs_super_freeze(h->sb);
> + if (err && !error)
> + error = err;
> + deactivate_super(h->sb);
> + count++;
> + }
> +
> + /*
> + * When several superblocks share the device, keep it frozen even if some
> + * of them failed to freeze and swallow the error: rolling the rest back
> + * via thaw_super() can fail too, so neither is a clear win. A single
> + * filesystem (count == 1) still reports its error.
> + */
> + if (error && count > 1)
> + error = 0;
> if (!error)
> error = sync_blockdev(bdev);
> - deactivate_super(sb);
> return error;
> }
>
> /**
> - * fs_bdev_thaw - thaw owning filesystem of block device
> + * fs_bdev_thaw - thaw every superblock using a block device
> * @bdev: block device
> *
> - * Thaw the filesystem that owns this block device.
> + * The counterpart to fs_bdev_freeze(): thaw each live superblock using @bdev.
> + * A zero return does not imply a superblock is fully unfrozen; it may have been
> + * frozen more than once (by the kernel or via another device).
> *
> - * A filesystem that owns multiple block devices may be frozen from each
> - * block device and won't be unfrozen until all block devices are
> - * unfrozen. Each block device can only freeze the filesystem once as we
> - * nest freezes for block devices in the block layer.
> - *
> - * Return: If the thaw was successful zero is returned. If the thaw
> - * failed a negative error code is returned. If this function
> - * returns zero it doesn't mean that the filesystem is unfrozen
> - * as it may have been frozen multiple times (kernel may hold a
> - * freeze or might be frozen from other block devices).
> + * Return: 0, or the first error on a single-fs device; a shared device swallows
> + * per-superblock errors, as fs_bdev_freeze() does.
> */
> static int fs_bdev_thaw(struct block_device *bdev)
> {
> - struct super_block *sb;
> - int error;
> + dev_t dev = bdev->bd_dev;
> + struct fs_bdev_holder *h;
> + unsigned int count = 0;
> + int error = 0, err;
>
> lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
>
> - /*
> - * The block device may have been frozen before it was claimed by a
> - * filesystem. Concurrently another process might try to mount that
> - * frozen block device and has temporarily claimed the block device for
> - * that purpose causing a concurrent fs_bdev_thaw() to end up here. The
> - * mounter is already about to abort mounting because they still saw an
> - * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return
> - * NULL in that case.
> - */
> - sb = get_bdev_super(bdev);
> - if (!sb)
> - return -EINVAL;
> + mutex_unlock(&bdev->bd_holder_lock);
>
> - if (sb->s_op->thaw_super)
> - error = sb->s_op->thaw_super(sb,
> - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
> - else
> - error = thaw_super(sb,
> - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
> - deactivate_super(sb);
> + for (h = fs_bdev_first(dev); h; h = fs_bdev_next(h)) {
> + if (!atomic_inc_not_zero(&h->sb->s_active))
> + continue;
> + err = fs_super_thaw(h->sb);
> + if (err && !error)
> + error = err;
> + deactivate_super(h->sb);
> + count++;
> + }
> +
> + /* Shared device: swallow per-superblock errors, like fs_bdev_freeze(). */
> + if (error && count > 1)
> + error = 0;
> return error;
> }
>
> @@ -1602,6 +1651,131 @@ const struct blk_holder_ops fs_holder_ops = {
> };
> EXPORT_SYMBOL_GPL(fs_holder_ops);
>
> +static int fs_bdev_register(struct file *bdev_file, struct super_block *sb)
> +{
> + dev_t dev = file_bdev(bdev_file)->bd_dev;
> + struct rhlist_head *list, *pos;
> + struct fs_bdev_holder *h;
> + int err;
> +
> + /*
> + * A superblock may claim one device more than once (xfs with its log on
> + * the data device). Keep a single entry per (device, superblock) and
> + * count the claims in @fs_bdev_active; the entry lives until the last one
> + * is released.
> + */
> + scoped_guard(rcu) {
> + list = rhltable_lookup(&fs_bdev_supers, &dev, fs_bdev_params);
> + rhl_for_each_entry_rcu(h, pos, list, node)
> + if (h->sb == sb && refcount_inc_not_zero(&h->fs_bdev_active))
> + return 0;
> + }
> +
> + h = kmalloc(sizeof(*h), GFP_KERNEL);
> + if (!h)
> + return -ENOMEM;
> + h->dev = dev;
> + h->sb = sb;
> + refcount_set(&h->fs_bdev_passive, 1);
> + refcount_set(&h->fs_bdev_active, 1);
> +
> + err = rhltable_insert(&fs_bdev_supers, &h->node, fs_bdev_params);
> + if (err) {
> + kfree(h);
> + return err;
> + }
> +
> + /* The sb->s_count ref keeps @h->sb valid for as long as the entry exists. */
> + spin_lock(&sb_lock);
> + sb->s_count++;
> + spin_unlock(&sb_lock);
> +
> + return 0;
> +}
> +
> +/**
> + * fs_bdev_file_open_by_dev - claim a block device on behalf of a superblock
> + * @dev: block device number
> + * @mode: open mode
> + * @holder: block-layer exclusivity token (a superblock, or the file_system_type
> + * when the device may be shared by several superblocks of that type)
> + * @sb: superblock to drive fs_holder_ops events for
> + *
> + * Open @dev with &fs_holder_ops and register that @sb uses it, so device
> + * removal/sync/freeze/thaw are propagated to @sb (and any other superblock
> + * sharing @dev). Must be paired with fs_bdev_file_release().
> + *
> + * Return: an opened block-device file or an ERR_PTR().
> + */
> +struct file *fs_bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
> + struct super_block *sb)
> +{
> + struct file *bdev_file;
> + int err;
> +
> + bdev_file = bdev_file_open_by_dev(dev, mode, holder, &fs_holder_ops);
> + if (IS_ERR(bdev_file))
> + return bdev_file;
> +
> + err = fs_bdev_register(bdev_file, sb);
> + if (err) {
> + bdev_fput(bdev_file);
> + return ERR_PTR(err);
> + }
> + return bdev_file;
> +}
> +EXPORT_SYMBOL_GPL(fs_bdev_file_open_by_dev);
> +
> +struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode,
> + void *holder, struct super_block *sb)
> +{
> + struct file *bdev_file;
> + int err;
> +
> + bdev_file = bdev_file_open_by_path(path, mode, holder, &fs_holder_ops);
> + if (IS_ERR(bdev_file))
> + return bdev_file;
> +
> + err = fs_bdev_register(bdev_file, sb);
> + if (err) {
> + bdev_fput(bdev_file);
> + return ERR_PTR(err);
> + }
> + return bdev_file;
> +}
> +EXPORT_SYMBOL_GPL(fs_bdev_file_open_by_path);
> +
> +/**
> + * fs_bdev_file_release - release a block device claimed for a superblock
> + * @bdev_file: file returned by fs_bdev_file_open_by_{dev,path}()
> + * @sb: superblock the device was claimed for
> + *
> + * Drop one claim on the {dev, @sb} entry; the last claim unregisters it (a
> + * pinning cursor defers the actual unlink). Then close the block device.
> + */
> +void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
> +{
> + dev_t dev = file_bdev(bdev_file)->bd_dev;
> + struct fs_bdev_holder *h, *found = NULL;
> + struct rhlist_head *list, *pos;
> +
> + rcu_read_lock();
> + list = rhltable_lookup(&fs_bdev_supers, &dev, fs_bdev_params);
> + rhl_for_each_entry_rcu(h, pos, list, node) {
> + if (h->sb != sb)
> + continue;
> + /* At most one entry per (dev, sb); the last claim drops the bias. */
> + if (refcount_dec_and_test(&h->fs_bdev_active))
> + found = h;
> + break;
> + }
> + rcu_read_unlock();
> + if (found)
> + fs_bdev_holder_put(found);
> + bdev_fput(bdev_file);
> +}
> +EXPORT_SYMBOL_GPL(fs_bdev_file_release);
> +
> int setup_bdev_super(struct super_block *sb, int sb_flags,
> struct fs_context *fc)
> {
> @@ -1609,7 +1783,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
> struct file *bdev_file;
> struct block_device *bdev;
>
> - bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
> + bdev_file = fs_bdev_file_open_by_dev(sb->s_dev, mode, sb, sb);
> if (IS_ERR(bdev_file)) {
> if (fc)
> errorf(fc, "%s: Can't open blockdev", fc->source);
> @@ -1623,7 +1797,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
> * writable from userspace even for a read-only block device.
> */
> if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
> - bdev_fput(bdev_file);
> + fs_bdev_file_release(bdev_file, sb);
> return -EACCES;
> }
>
> @@ -1634,7 +1808,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
> if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
> if (fc)
> warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
> - bdev_fput(bdev_file);
> + fs_bdev_file_release(bdev_file, sb);
> return -EBUSY;
> }
> spin_lock(&sb_lock);
> @@ -1725,7 +1899,7 @@ void kill_block_super(struct super_block *sb)
> generic_shutdown_super(sb);
> if (bdev) {
> sync_blockdev(bdev);
> - bdev_fput(sb->s_bdev_file);
> + fs_bdev_file_release(sb->s_bdev_file, sb);
> }
> }
>
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index c8494d64a69d..43d37c02febf 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1760,13 +1760,6 @@ struct blk_holder_ops {
> int (*thaw)(struct block_device *bdev);
> };
>
> -/*
> - * For filesystems using @fs_holder_ops, the @holder argument passed to
> - * helpers used to open and claim block devices via
> - * bd_prepare_to_claim() must point to a superblock.
> - */
> -extern const struct blk_holder_ops fs_holder_ops;
> -
> /*
> * Return the correct open flags for blkdev_get_by_* for super block flags
> * as stored in sb->s_flags.
> diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h
> index f21ffbb6dea5..721d842e3b24 100644
> --- a/include/linux/fs/super.h
> +++ b/include/linux/fs/super.h
> @@ -235,4 +235,11 @@ int freeze_super(struct super_block *super, enum freeze_holder who,
> int thaw_super(struct super_block *super, enum freeze_holder who,
> const void *freeze_owner);
>
> +struct file;
> +struct file *fs_bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
> + struct super_block *sb);
> +struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode,
> + void *holder, struct super_block *sb);
> +void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb);
> +
> #endif /* _LINUX_FS_SUPER_H */
>
> --
> 2.47.3
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR