[RFC PATCH 1/2] fs: introduce FALLOC_FL_FORCE_ZERO to fallocate

From: Zhang Yi
Date: Fri Dec 27 2024 - 20:50:05 EST


From: Zhang Yi <yi.zhang@xxxxxxxxxx>

Thanks to the development of flash-based storage devices, we can quickly
write zeros to SSDs using the WRITE_ZERO command. Therefore, we
introduce a new flag FALLOC_FL_FORCE_ZERO to fallocate, which is only
valid in combination with FALLOC_FL_ZERO_RANGE. This flag forces the file
system to issue zeroes and allocate written extents. The process of
zeroing out can be accelerated with the REQ_OP_WRITE_ZEROES operation
when the underlying storage device supports the WRITE_ZERO command with
the UNMAP bit on SCSI SSDs or the DEAC bit on NVMe SSDs.

This provides users with a new method to quickly generate a zeroed file.
Users no longer need to write zero data to create a file with written
extents. The subsequent overwriting of this file range can save
significant metadata changes, which should greatly improve overwrite
performance on certain filesystems.

This flag should not be used in conjunction with FALLOC_FL_KEEP_SIZE,
since allocating written extents beyond file EOF is not permitted.

Signed-off-by: Zhang Yi <yi.zhang@xxxxxxxxxx>
---
fs/open.c | 14 +++++++++++---
include/linux/falloc.h | 5 ++++-
include/uapi/linux/falloc.h | 12 ++++++++++++
3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index e6911101fe71..d3afaddfcf27 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -246,7 +246,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (offset < 0 || len <= 0)
return -EINVAL;

- if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE))
+ if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_SUPPORT_MASK))
return -EOPNOTSUPP;

/*
@@ -259,15 +259,23 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
switch (mode & FALLOC_FL_MODE_MASK) {
case FALLOC_FL_ALLOCATE_RANGE:
case FALLOC_FL_UNSHARE_RANGE:
+ if (mode & FALLOC_FL_FORCE_ZERO)
+ return -EOPNOTSUPP;
+ break;
case FALLOC_FL_ZERO_RANGE:
+ if ((mode & FALLOC_FL_KEEP_SIZE) &&
+ (mode & FALLOC_FL_FORCE_ZERO))
+ return -EOPNOTSUPP;
break;
case FALLOC_FL_PUNCH_HOLE:
- if (!(mode & FALLOC_FL_KEEP_SIZE))
+ if (!(mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & FALLOC_FL_FORCE_ZERO))
return -EOPNOTSUPP;
break;
case FALLOC_FL_COLLAPSE_RANGE:
case FALLOC_FL_INSERT_RANGE:
- if (mode & FALLOC_FL_KEEP_SIZE)
+ if ((mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & FALLOC_FL_FORCE_ZERO))
return -EOPNOTSUPP;
break;
default:
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index 3f49f3df6af5..75ac063d7eab 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -29,7 +29,8 @@ struct space_resv {
* Mask of all supported fallocate modes. Only one can be set at a time.
*
* In addition to the mode bit, the mode argument can also encode flags.
- * FALLOC_FL_KEEP_SIZE is the only supported flag so far.
+ * FALLOC_FL_KEEP_SIZE and FALLOC_FL_FORCE_ZERO are the only supported
+ * flags so far.
*/
#define FALLOC_FL_MODE_MASK (FALLOC_FL_ALLOCATE_RANGE | \
FALLOC_FL_PUNCH_HOLE | \
@@ -37,6 +38,8 @@ struct space_resv {
FALLOC_FL_ZERO_RANGE | \
FALLOC_FL_INSERT_RANGE | \
FALLOC_FL_UNSHARE_RANGE)
+#define FALLOC_FL_SUPPORT_MASK (FALLOC_FL_KEEP_SIZE | \
+ FALLOC_FL_FORCE_ZERO)

/* on ia32 l_start is on a 32-bit boundary */
#if defined(CONFIG_X86_64)
diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
index 5810371ed72b..7c12bcdff7d3 100644
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -78,4 +78,16 @@
*/
#define FALLOC_FL_UNSHARE_RANGE 0x40

+/*
+ * FALLOC_FL_FORCE_ZERO must be used in conjunction with FALLOC_FL_ZERO_RANGE.
+ * It forces the file system to issue zeroes and allocate written extents.
+ * The zeroing can be sped up with the REQ_OP_WRITE_ZEROES command, and
+ * subsequent overwrites of this range can avoid significant metadata
+ * changes, which should improve overwrite performance on such a
+ * preallocated range.
+ *
+ * This flag cannot be used in conjunction with FALLOC_FL_KEEP_SIZE.
+ */
+#define FALLOC_FL_FORCE_ZERO 0x80
+
#endif /* _UAPI_FALLOC_H_ */
--
2.39.2