Re: [PATCH 3/6] fs: distinguish between user initiated freeze and kernel initiated freeze

From: Darrick J. Wong
Date: Mon May 22 2023 - 20:41:51 EST


How about this as an alternative patch? Kernel and userspace freeze
state are stored in s_writers; each type cannot block the other (though
you still can't have nested kernel or userspace freezes); and the freeze
is maintained until /both/ freeze types are dropped.

AFAICT this should work for the two other usecases (quiescing pagefaults
for fsdax pmem pre-removal; and freezing fses during suspend) besides
online fsck for xfs.

--D

From: Darrick J. Wong <djwong@xxxxxxxxxx>
Subject: fs: distinguish between user initiated freeze and kernel initiated freeze

Userspace can freeze a filesystem using the FIFREEZE ioctl or by
suspending the block device; this state persists until userspace thaws
the filesystem with the FITHAW ioctl or resuming the block device.
Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for
the fsfreeze ioctl") we only allow the first freeze command to succeed.

The kernel may decide that it is necessary to freeze a filesystem for
its own internal purposes, such as suspends in progress, filesystem fsck
activities, or quiescing a device prior to removal. Userspace thaw
commands must never break a kernel freeze, and kernel thaw commands
shouldn't undo userspace's freeze command.

Introduce a couple of freeze holder flags and wire it into the
sb_writers state. One kernel and one userspace freeze are allowed to
coexist at the same time; the filesystem will not thaw until both are
lifted.

Inspired-by: Luis Chamberlain <mcgrof@xxxxxxxxxx>
Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx>
---
fs/super.c | 172 +++++++++++++++++++++++++++++++++++++++++++++++++---
include/linux/fs.h | 8 ++
2 files changed, 170 insertions(+), 10 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 34afe411cf2b..7496d51affb9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -39,7 +39,7 @@
#include <uapi/linux/mount.h>
#include "internal.h"

-static int thaw_super_locked(struct super_block *sb);
+static int thaw_super_locked(struct super_block *sb, unsigned short who);

static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);
@@ -1027,7 +1027,7 @@ static void do_thaw_all_callback(struct super_block *sb)
down_write(&sb->s_umount);
if (sb->s_root && sb->s_flags & SB_BORN) {
emergency_thaw_bdev(sb);
- thaw_super_locked(sb);
+ thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
} else {
up_write(&sb->s_umount);
}
@@ -1636,13 +1636,21 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
}

/**
- * freeze_super - lock the filesystem and force it into a consistent state
+ * __freeze_super - lock the filesystem and force it into a consistent state
* @sb: the super to lock
+ * @who: FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
+ * FREEZE_HOLDER_KERNEL if the kernel wants to freeze it
*
* Syncs the super to make sure the filesystem is consistent and calls the fs's
- * freeze_fs. Subsequent calls to this without first thawing the fs will return
+ * freeze_fs. Subsequent calls to this without first thawing the fs may return
* -EBUSY.
*
+ * The @who argument distinguishes between the kernel and userspace trying to
+ * freeze the filesystem. Although there cannot be multiple kernel freezes or
+ * multiple userspace freezes in effect at any given time, the kernel and
+ * userspace can both hold a filesystem frozen. The filesystem remains frozen
+ * until there are no kernel or userspace freezes in effect.
+ *
* During this function, sb->s_writers.frozen goes through these values:
*
* SB_UNFROZEN: File system is normal, all writes progress as usual.
@@ -1668,12 +1676,61 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
*
* sb->s_writers.frozen is protected by sb->s_umount.
*/
-int freeze_super(struct super_block *sb)
+static int __freeze_super(struct super_block *sb, unsigned short who)
{
+ struct sb_writers *sbw = &sb->s_writers;
int ret;

atomic_inc(&sb->s_active);
down_write(&sb->s_umount);
+
+ if (sbw->frozen == SB_FREEZE_COMPLETE) {
+ switch (who) {
+ case FREEZE_HOLDER_KERNEL:
+ if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
+ /*
+ * Kernel freeze already in effect; caller can
+ * try again.
+ */
+ deactivate_locked_super(sb);
+ return -EBUSY;
+ }
+ if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
+ /*
+ * Share the freeze state with the userspace
+ * freeze already in effect.
+ */
+ sbw->freeze_holders |= who;
+ deactivate_locked_super(sb);
+ return 0;
+ }
+ break;
+ case FREEZE_HOLDER_USERSPACE:
+ if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
+ /*
+ * Userspace freeze already in effect; tell
+ * the caller we're busy.
+ */
+ deactivate_locked_super(sb);
+ return -EBUSY;
+ }
+ if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
+ /*
+ * Share the freeze state with the kernel
+ * freeze already in effect.
+ */
+ sbw->freeze_holders |= who;
+ deactivate_locked_super(sb);
+ return 0;
+ }
+ break;
+ default:
+ BUG();
+ deactivate_locked_super(sb);
+ return -EINVAL;
+ }
+ }
+
if (sb->s_writers.frozen != SB_UNFROZEN) {
deactivate_locked_super(sb);
return -EBUSY;
@@ -1686,6 +1743,7 @@ int freeze_super(struct super_block *sb)

if (sb_rdonly(sb)) {
/* Nothing to do really... */
+ sb->s_writers.freeze_holders |= who;
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
up_write(&sb->s_umount);
return 0;
@@ -1731,23 +1789,103 @@ int freeze_super(struct super_block *sb)
* For debugging purposes so that fs can warn if it sees write activity
* when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
*/
+ sb->s_writers.freeze_holders |= who;
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
lockdep_sb_freeze_release(sb);
up_write(&sb->s_umount);
return 0;
}
+
+/*
+ * freeze_super - lock the filesystem and force it into a consistent state
+ * @sb: the super to lock
+ *
+ * Syncs the super to make sure the filesystem is consistent and calls the fs's
+ * freeze_fs. Subsequent calls to this without first calling thaw_super will
+ * return -EBUSY. See the comment for __freeze_super for more information.
+ */
+int freeze_super(struct super_block *sb)
+{
+ return __freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+}
EXPORT_SYMBOL(freeze_super);

-static int thaw_super_locked(struct super_block *sb)
+/**
+ * freeze_super_kernel - lock the filesystem for an internal kernel operation
+ * and force it into a consistent state.
+ * @sb: the super to lock
+ *
+ * Syncs the super to make sure the filesystem is consistent and calls the fs's
+ * freeze_fs. Subsequent calls to this without first calling thaw_super_excl
+ * will return -EBUSY.
+ */
+int freeze_super_kernel(struct super_block *sb)
{
+ return __freeze_super(sb, FREEZE_HOLDER_KERNEL);
+}
+EXPORT_SYMBOL_GPL(freeze_super_kernel);
+
+/*
+ * Undoes the effect of a freeze_super_locked call. If the filesystem is
+ * frozen both by userspace and the kernel, a thaw call from either source
+ * removes that state without releasing the other state or unlocking the
+ * filesystem.
+ */
+static int thaw_super_locked(struct super_block *sb, unsigned short who)
+{
+ struct sb_writers *sbw = &sb->s_writers;
int error;

+ if (sbw->frozen == SB_FREEZE_COMPLETE) {
+ switch (who) {
+ case FREEZE_HOLDER_KERNEL:
+ if (!(sbw->freeze_holders & FREEZE_HOLDER_KERNEL)) {
+ /* Caller doesn't hold a kernel freeze. */
+ up_write(&sb->s_umount);
+ return -EINVAL;
+ }
+ if (sbw->freeze_holders & FREEZE_HOLDER_USERSPACE) {
+ /*
+ * We were sharing the freeze with userspace,
+ * so drop the userspace freeze but exit
+ * without unfreezing.
+ */
+ sbw->freeze_holders &= ~who;
+ up_write(&sb->s_umount);
+ return 0;
+ }
+ break;
+ case FREEZE_HOLDER_USERSPACE:
+ if (!(sbw->freeze_holders & FREEZE_HOLDER_USERSPACE)) {
+ /* Caller doesn't hold a userspace freeze. */
+ up_write(&sb->s_umount);
+ return -EINVAL;
+ }
+ if (sbw->freeze_holders & FREEZE_HOLDER_KERNEL) {
+ /*
+ * We were sharing the freeze with the kernel,
+ * so drop the kernel freeze but exit without
+ * unfreezing.
+ */
+ sbw->freeze_holders &= ~who;
+ up_write(&sb->s_umount);
+ return 0;
+ }
+ break;
+ default:
+ BUG();
+ up_write(&sb->s_umount);
+ return -EINVAL;
+ }
+ }
+
if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
up_write(&sb->s_umount);
return -EINVAL;
}

if (sb_rdonly(sb)) {
+ sb->s_writers.freeze_holders &= ~who;
sb->s_writers.frozen = SB_UNFROZEN;
goto out;
}
@@ -1765,6 +1903,7 @@ static int thaw_super_locked(struct super_block *sb)
}
}

+ sb->s_writers.freeze_holders &= ~who;
sb->s_writers.frozen = SB_UNFROZEN;
sb_freeze_unlock(sb, SB_FREEZE_FS);
out:
@@ -1774,18 +1913,33 @@ static int thaw_super_locked(struct super_block *sb)
}

/**
- * thaw_super -- unlock filesystem
+ * thaw_super -- unlock filesystem frozen with freeze_super
* @sb: the super to thaw
*
- * Unlocks the filesystem and marks it writeable again after freeze_super().
+ * Unlocks the filesystem after freeze_super, and make it writeable again if
+ * there is not a freeze_super_kernel still in effect.
*/
int thaw_super(struct super_block *sb)
{
down_write(&sb->s_umount);
- return thaw_super_locked(sb);
+ return thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
}
EXPORT_SYMBOL(thaw_super);

+/**
+ * thaw_super_kernel -- unlock filesystem frozen with freeze_super_kernel
+ * @sb: the super to thaw
+ *
+ * Unlocks the filesystem after freeze_super_kernel, and make it writeable
+ * again if there is not a freeze_super still in effect.
+ */
+int thaw_super_kernel(struct super_block *sb)
+{
+ down_write(&sb->s_umount);
+ return thaw_super_locked(sb, FREEZE_HOLDER_KERNEL);
+}
+EXPORT_SYMBOL_GPL(thaw_super_kernel);
+
/*
* Create workqueue for deferred direct IO completions. We allocate the
* workqueue when it's first needed. This avoids creating workqueue for
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 21a981680856..147644b5d648 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1145,11 +1145,15 @@ enum {
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)

struct sb_writers {
- int frozen; /* Is sb frozen? */
+ unsigned short frozen; /* Is sb frozen? */
+ unsigned short freeze_holders; /* Who froze fs? */
wait_queue_head_t wait_unfrozen; /* wait for thaw */
struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
};

+#define FREEZE_HOLDER_USERSPACE (1U << 1) /* userspace froze fs */
+#define FREEZE_HOLDER_KERNEL (1U << 2) /* kernel froze fs */
+
struct super_block {
struct list_head s_list; /* Keep this first */
dev_t s_dev; /* search index; _not_ kdev_t */
@@ -2288,6 +2292,8 @@ extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
extern int freeze_super(struct super_block *super);
extern int thaw_super(struct super_block *super);
+extern int freeze_super_kernel(struct super_block *super);
+extern int thaw_super_kernel(struct super_block *super);
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);