Re: [PATCH] 9p: fix data corruption with writeback caching during concurrent stat

From: David Howells

Date: Wed Feb 18 2026 - 11:32:43 EST


How about something along these lines instead? It's a bit more convoluted and
will probably still get it wrong if a third party interferes - but I don't
think there's a whole lot we can do about that without changing the protocol.

David
---
9p: fix data corruption with writeback caching during concurrent stat

When using writeback caching (cache=mmap), v9fs_vfs_getattr/setattr have
two issues that can cause data corruption:

1. filemap_fdatawrite() initiates writeback but doesn't wait for
completion. The subsequent server stat sees stale file size.

2. v9fs_stat2inode()/v9fs_stat2inode_dotl() unconditionally overwrite
i_size from the server response, even when dirty pages exist locally.
This causes processes using lseek(SEEK_END) to see incorrect file sizes.

Fix by the following means:

(1) Make cache_validity a bitmask and use atomic bitops on it.

(2) Use a flag to keep track of when the mtime is 'uncertain'. Set this
upon completion of a write subrequest and clear it when we either
fetch and update the mtime or do a setattr.

(3) Alter getattr to make the following choices, in order of preference:

(a) If in CACHE_META/LOOSE mode, don't care about keeping mtime/i_size
uptodate.

(b) If AT_STATX_FORCE_SYNC is supplied, then sync all outstanding
writes and wait, then fetch the new stats.

(c) If AT_STATX_DONT_SYNC is supplied, then the user specifically said
they don't care and we do as LOOSE.

(d) If the mtime is marked uncertain and STATX_MTIME is specifically
requested, then do as FORCE_SYNC (stat(2) will do this).

(e) If in CACHE_WRITEBACK mode, start the writeback, but disregard the
mtime change on the server for now. Note there's nothing to
guarantee the ordering writeback will write to the server versus
the stat retrieval, so mtime may or may not be seen to be updated.

(f) Otherwise, writeback is not initiated.

(4) Make setattr write and wait and clear the uncertain mtime flag.

(5) Update ->netfs.remote_i_size after a successful Write RPC call that
appears to move the EOF marker. Writes are done synchronously, so we
only have to worry about one at a time and netfslib uses a lock to
serialise writebacks.

(6) Fix the updating of i_size in stat2inode by:

(a) Just set it for a new inode.

(b) If the retrieved file length matches ->netfs.remote_i_size then
skip updating i_size. Assume nothing has changed on the server
and the local i_size is correct.

(c) If they don't match, then assume a third party interfered and
invalidate the local cache and discard any local changes, then
update i_size and remote_i_size. The invalidation has no effect
if we're doing unbuffered I/O.

Note that I don't think that we really need to start a writeback for
getattr in CACHE_WRITEBACK mode unless specifically told to sync - and then
we need to wait if STATX_MTIME is set.

Further, if the pagecache is dirty, can we just go with the local mtime
unless the mtime is marked uncertain? And if it is marked uncertain, and
STATX_MTIME is set, we should just be able to get the stats without
flushing. Possibly, the act of writing in to the pagecache and setting
i_mtime locally should clear the uncertainty flag - it will be set again
when we write back.

One final thought: should getattr() take i_rwsem when it flushes if it is
forced to sync? It could take a write lock on i_rwsem and then downgrade
it so that reads can happen concurrently.

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Pierre Barre <pierre@xxxxxxxx>
---
fs/9p/v9fs.h | 5 +-
fs/9p/v9fs_vfs.h | 3 +
fs/9p/vfs_addr.c | 11 +++++
fs/9p/vfs_dentry.c | 4 +-
fs/9p/vfs_inode.c | 84 ++++++++++++++++++++++++++++++-------------
fs/9p/vfs_inode_dotl.c | 95 ++++++++++++++++++++++++++++++++++++-------------
6 files changed, 147 insertions(+), 55 deletions(-)

diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 6a12445d3858..bc4a86c289f4 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -134,12 +134,13 @@ struct v9fs_session_info {
};

/* cache_validity flags */
-#define V9FS_INO_INVALID_ATTR 0x01
+#define V9FS_INO_INVALID_ATTR 0
+#define V9FS_INO_MTIME_UNCERTAIN 1

struct v9fs_inode {
struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct p9_qid qid;
- unsigned int cache_validity;
+ unsigned long cache_validity;
struct mutex v_mutex;
};

diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index d3aefbec4de6..58f953788791 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -27,6 +27,7 @@

/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */
#define V9FS_STAT2INODE_KEEP_ISIZE 1
+#define V9FS_STAT2INODE_NEW_INODE 2

extern struct file_system_type v9fs_fs_type;
extern const struct address_space_operations v9fs_addr_operations;
@@ -70,7 +71,7 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
struct v9fs_inode *v9inode;

v9inode = V9FS_I(inode);
- v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
+ set_bit(V9FS_INO_INVALID_ATTR, &v9inode->cache_validity);
}

int v9fs_open_to_dotl_flags(int flags);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 862164181bac..1dbf05df8fc5 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -53,12 +53,21 @@ static void v9fs_begin_writeback(struct netfs_io_request *wreq)
*/
static void v9fs_issue_write(struct netfs_io_subrequest *subreq)
{
+ struct inode *inode = subreq->rreq->inode;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct p9_fid *fid = subreq->rreq->netfs_priv;
int err, len;

len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
- if (len > 0)
+ if (len > 0) {
+ unsigned long long end = subreq->start + len;
+ spin_lock(&inode->i_lock);
+ if (end > v9inode->netfs.remote_i_size)
+ v9inode->netfs.remote_i_size = end;
+ set_bit(V9FS_INO_MTIME_UNCERTAIN, &v9inode->cache_validity);
+ spin_unlock(&inode->i_lock);
__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+ }
netfs_write_subrequest_terminated(subreq, len ?: err);
}

diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index c5bf74d547e8..aca71d0ba4b8 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -75,7 +75,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
goto out_valid;

v9inode = V9FS_I(inode);
- if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
+ if (test_bit(V9FS_INO_INVALID_ATTR, &v9inode->cache_validity)) {
int retval;
struct v9fs_session_info *v9ses;

@@ -100,7 +100,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
dentry, dentry);
return 0;
}
- if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
+ if (test_bit(V9FS_INO_INVALID_ATTR, &v9inode->cache_validity)) {
p9_debug(P9_DEBUG_VFS, "dentry: %pd (%p) invalidated due to type change\n",
dentry, dentry);
return 0;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 97abe65bf7c1..8bb7c4bf8ed3 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -435,7 +435,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
if (retval)
goto error;

- v9fs_stat2inode(st, inode, sb, 0);
+ v9fs_stat2inode(st, inode, sb, V9FS_STAT2INODE_NEW_INODE);
v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
@@ -966,23 +966,33 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path,
{
struct dentry *dentry = path->dentry;
struct inode *inode = d_inode(dentry);
- struct v9fs_session_info *v9ses;
+ struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct p9_fid *fid;
- struct p9_wstat *st;
+ struct p9_wstat *st = NULL;
+ bool force_sync = flags & AT_STATX_FORCE_SYNC;
+ int retval;

p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
- v9ses = v9fs_dentry2v9ses(dentry);
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- return 0;
- } else if (v9ses->cache & CACHE_WRITEBACK) {
- if (S_ISREG(inode->i_mode)) {
- int retval = filemap_fdatawrite(inode->i_mapping);

- if (retval)
- p9_debug(P9_DEBUG_ERROR,
- "flushing writeback during getattr returned %d\n", retval);
- }
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
+ goto fill;
+ if ((flags & AT_STATX_DONT_SYNC) && !force_sync)
+ goto fill;
+ if (test_bit(V9FS_INO_MTIME_UNCERTAIN, &v9inode->cache_validity) &&
+ (request_mask & STATX_MTIME))
+ force_sync = true;
+
+ if (S_ISREG(inode->i_mode)) {
+ if (force_sync)
+ retval = filemap_write_and_wait(inode->i_mapping);
+ else if (v9ses->cache & CACHE_WRITEBACK)
+ retval = filemap_fdatawrite(inode->i_mapping);
+ else
+ retval = 0;
+ if (retval)
+ p9_debug(P9_DEBUG_ERROR,
+ "flushing writeback during getattr returned %d\n", retval);
}
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
@@ -993,11 +1003,20 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path,
if (IS_ERR(st))
return PTR_ERR(st);

- v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
- generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
+ /*
+ * With writeback caching, the client is authoritative for i_size.
+ * Don't let the server overwrite it with a potentially stale value.
+ */
+ v9fs_stat2inode(st, inode, dentry->d_sb,
+ (v9ses->cache & CACHE_WRITEBACK) ? V9FS_STAT2INODE_KEEP_ISIZE : 0);
+
+fill:
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);

- p9stat_free(st);
- kfree(st);
+ if (st) {
+ p9stat_free(st);
+ kfree(st);
+ }
return 0;
}

@@ -1014,6 +1033,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
{
int retval, use_dentry = 0;
struct inode *inode = d_inode(dentry);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL;
struct p9_wstat wstat;
@@ -1058,7 +1078,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,

/* Write all dirty data */
if (d_is_reg(dentry)) {
- retval = filemap_fdatawrite(inode->i_mapping);
+ retval = filemap_write_and_wait(inode->i_mapping);
if (retval)
p9_debug(P9_DEBUG_ERROR,
"flushing writeback during setattr returned %d\n", retval);
@@ -1072,6 +1092,8 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
if (retval < 0)
return retval;

+ if (iattr->ia_valid & ATTR_MTIME)
+ clear_bit(V9FS_INO_MTIME_UNCERTAIN, &v9inode->cache_validity);
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
@@ -1113,6 +1135,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
inode_set_atime(inode, stat->atime, 0);
inode_set_mtime(inode, stat->mtime, 0);
inode_set_ctime(inode, stat->mtime, 0);
+ clear_bit(V9FS_INO_MTIME_UNCERTAIN, &v9inode->cache_validity);

inode->i_uid = v9ses->dfltuid;
inode->i_gid = v9ses->dfltgid;
@@ -1141,12 +1164,25 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;

- v9inode->netfs.remote_i_size = stat->length;
- if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+ bool set_size;
+ if (flags & V9FS_STAT2INODE_NEW_INODE) {
+ set_size = true;
+ } else if (stat->length == v9inode->netfs.remote_i_size) {
+ set_size = false;
+ } else {
+ /* Uh oh - the server copy appears to have changed. */
+ filemap_invalidate_inode(inode, false, 0, LLONG_MAX);
+ set_size = true;
+ }
+ if (set_size) {
+ v9inode->netfs.remote_i_size = stat->length;
v9fs_i_size_write(inode, stat->length);
- /* not real number of blocks, but 512 byte ones ... */
- inode->i_blocks = (stat->length + 512 - 1) >> 9;
- v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
+ /* not real number of blocks, but 512 byte ones ... */
+ inode->i_blocks = (stat->length + 512 - 1) >> 9;
+ }
+
+no_size_change:
+ clear_bit(V9FS_INO_INVALID_ATTR, &v9inode->cache_validity);
}

/**
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 643e759eacb2..cbf789752183 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -125,7 +125,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
if (retval)
goto error;

- v9fs_stat2inode_dotl(st, inode, 0);
+ v9fs_stat2inode_dotl(st, inode, V9FS_STAT2INODE_NEW_INODE);
v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
@@ -421,25 +421,36 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
u32 request_mask, unsigned int flags)
{
struct dentry *dentry = path->dentry;
- struct v9fs_session_info *v9ses;
- struct p9_fid *fid;
struct inode *inode = d_inode(dentry);
- struct p9_stat_dotl *st;
+ struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_fid *fid;
+ struct p9_stat_dotl *st = NULL;
+ bool force_sync = flags & AT_STATX_FORCE_SYNC;
+ int retval;

p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
- v9ses = v9fs_dentry2v9ses(dentry);
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- return 0;
- } else if (v9ses->cache) {
- if (S_ISREG(inode->i_mode)) {
- int retval = filemap_fdatawrite(inode->i_mapping);

- if (retval)
- p9_debug(P9_DEBUG_ERROR,
- "flushing writeback during getattr returned %d\n", retval);
- }
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
+ goto fill;
+ if ((flags & AT_STATX_DONT_SYNC) && !force_sync)
+ goto fill;
+ if (test_bit(V9FS_INO_MTIME_UNCERTAIN, &v9inode->cache_validity) &&
+ (request_mask & STATX_MTIME))
+ force_sync = true;
+
+ if (S_ISREG(inode->i_mode)) {
+ if (force_sync)
+ retval = filemap_write_and_wait(inode->i_mapping);
+ else if (v9ses->cache & CACHE_WRITEBACK)
+ retval = filemap_fdatawrite(inode->i_mapping);
+ else
+ retval = 0;
+ if (retval)
+ p9_debug(P9_DEBUG_ERROR,
+ "flushing writeback during getattr returned %d\n", retval);
}
+
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
@@ -453,8 +464,15 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
if (IS_ERR(st))
return PTR_ERR(st);

- v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
- generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
+ /*
+ * With writeback caching, the client is authoritative for i_size.
+ * Don't let the server overwrite it with a potentially stale value.
+ */
+ v9fs_stat2inode_dotl(st, inode,
+ (v9ses->cache & CACHE_WRITEBACK) ? V9FS_STAT2INODE_KEEP_ISIZE : 0);
+
+fill:
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
/* Change block size to what the server returned */
stat->blksize = st->st_blksize;

@@ -516,6 +534,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
int retval, use_dentry = 0;
struct inode *inode = d_inode(dentry);
struct v9fs_session_info __maybe_unused *v9ses;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct p9_fid *fid = NULL;
struct p9_iattr_dotl p9attr = {
.uid = INVALID_UID,
@@ -561,7 +580,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,

/* Write all dirty data */
if (S_ISREG(inode->i_mode)) {
- retval = filemap_fdatawrite(inode->i_mapping);
+ retval = filemap_write_and_wait(inode->i_mapping);
if (retval < 0)
p9_debug(P9_DEBUG_ERROR,
"Flushing file prior to setattr failed: %d\n", retval);
@@ -574,6 +593,8 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
return retval;
}

+ if (iattr->ia_valid & ATTR_MTIME)
+ clear_bit(V9FS_INO_MTIME_UNCERTAIN, &v9inode->cache_validity);
if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size !=
i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
@@ -624,6 +645,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
stat->st_atime_nsec);
inode_set_mtime(inode, stat->st_mtime_sec,
stat->st_mtime_nsec);
+ clear_bit(V9FS_INO_MTIME_UNCERTAIN, &v9inode->cache_validity);
inode_set_ctime(inode, stat->st_ctime_sec,
stat->st_ctime_nsec);
inode->i_uid = stat->st_uid;
@@ -634,9 +656,20 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;

- v9inode->netfs.remote_i_size = stat->st_size;
- if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+ bool set_size;
+ if (flags & V9FS_STAT2INODE_NEW_INODE) {
+ set_size = true;
+ } else if (stat->st_size == v9inode->netfs.remote_i_size) {
+ set_size = false;
+ } else {
+ /* Uh oh - the server copy appears to have changed. */
+ filemap_invalidate_inode(inode, false, 0, LLONG_MAX);
+ set_size = true;
+ }
+ if (set_size) {
+ v9inode->netfs.remote_i_size = stat->st_size;
v9fs_i_size_write(inode, stat->st_size);
+ }
inode->i_blocks = stat->st_blocks;
} else {
if (stat->st_result_mask & P9_STATS_ATIME) {
@@ -647,6 +680,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode_set_mtime(inode, stat->st_mtime_sec,
stat->st_mtime_nsec);
}
+ clear_bit(V9FS_INO_MTIME_UNCERTAIN, &v9inode->cache_validity);
if (stat->st_result_mask & P9_STATS_CTIME) {
inode_set_ctime(inode, stat->st_ctime_sec,
stat->st_ctime_nsec);
@@ -662,10 +696,21 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
}
- if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
- stat->st_result_mask & P9_STATS_SIZE) {
- v9inode->netfs.remote_i_size = stat->st_size;
- v9fs_i_size_write(inode, stat->st_size);
+ if (stat->st_result_mask & P9_STATS_SIZE) {
+ bool set_size;
+ if (flags & V9FS_STAT2INODE_NEW_INODE) {
+ set_size = true;
+ } else if (stat->st_size == v9inode->netfs.remote_i_size) {
+ set_size = false;
+ } else {
+ /* Uh oh - the server copy appears to have changed. */
+ filemap_invalidate_inode(inode, false, 0, LLONG_MAX);
+ set_size = true;
+ }
+ if (set_size) {
+ v9inode->netfs.remote_i_size = stat->st_size;
+ v9fs_i_size_write(inode, stat->st_size);
+ }
}
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
@@ -676,7 +721,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
/* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
* because the inode structure does not have fields for them.
*/
- v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
+ clear_bit(V9FS_INO_INVALID_ATTR, &v9inode->cache_validity);
}

static int