[PATCH v2] statx: optimize copy of struct statx to userspace

From: Eric Biggers
Date: Sat Mar 11 2017 - 16:47:43 EST


From: Eric Biggers <ebiggers@xxxxxxxxxx>

I found that statx() was significantly slower than stat(). As a
microbenchmark, I compared 10,000,000 invocations of fstat() on a tmpfs
file against the same number of statx() calls with a NULL path, so that
both operate directly on the file descriptor:

$ time ./stat_benchmark

real 0m1.464s
user 0m0.275s
sys 0m1.187s

$ time ./statx_benchmark

real 0m5.530s
user 0m0.281s
sys 0m5.247s
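
The benchmark programs aren't included in this mail; the statx side is
roughly the following sketch. The file path and iteration count here
are just illustrative, and since glibc has no statx() wrapper yet it
invokes the syscall directly (needs the 4.11 uapi headers for struct
statx and __NR_statx). The stat side is the same loop with fstat():

#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/stat.h>

int main(void)
{
        struct statx stx;
        int fd = open("/tmp/testfile", O_RDONLY);
        long i;

        if (fd < 0) {
                perror("open");
                return 1;
        }
        for (i = 0; i < 10000000; i++) {
                /* NULL path: operate directly on the open fd, like fstat() */
                if (syscall(__NR_statx, fd, NULL, 0,
                            STATX_BASIC_STATS, &stx)) {
                        perror("statx");
                        return 1;
                }
        }
        return 0;
}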

statx() is expected to be a little slower than stat() because struct
statx is larger than struct stat, but not by *that* much. It turns out
that most of the overhead was in copying struct statx to userspace,
apparently dominated by the stac/clac instruction pairs generated for
each individual __put_user() call. (This was on x86_64, but some other
architectures, e.g. arm64, now have a similar per-access cost too.)

stat() instead initializes its struct on the stack and copies it to
userspace with a single call to copy_to_user(). This turns out to be
much faster, and changing statx() to do the same makes it almost as
fast as stat():

$ time ./statx_benchmark

real 0m1.573s
user 0m0.229s
sys 0m1.344s
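
For reference, this single-copy pattern is what stat() has been doing
all along; cp_new_stat() in fs/stat.c looks roughly like the following,
abridged, with most field assignments and the overflow checks omitted:

static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
{
	struct stat tmp;

	/* Zero any implicit padding in struct stat. */
	INIT_STRUCT_STAT_PADDING(tmp);

	/* Fill the struct on the kernel stack... */
	tmp.st_dev = encode_dev(stat->dev);
	tmp.st_ino = stat->ino;
	tmp.st_mode = stat->mode;
	tmp.st_nlink = stat->nlink;
	/* ... remaining fields ... */

	/* ...then cross the user/kernel boundary exactly once. */
	return copy_to_user(statbuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}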

Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>
---
fs/stat.c | 72 +++++++++++++++++++++++++++++----------------------------------
1 file changed, 33 insertions(+), 39 deletions(-)

diff --git a/fs/stat.c b/fs/stat.c
index fa0be59340cc..5cc267ec7865 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -509,46 +509,41 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
}
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */

-static inline int __put_timestamp(struct timespec *kts,
- struct statx_timestamp __user *uts)
+static inline void init_statx_timestamp(struct statx_timestamp *uts,
+ const struct timespec *kts)
{
- return (__put_user(kts->tv_sec, &uts->tv_sec ) ||
- __put_user(kts->tv_nsec, &uts->tv_nsec ) ||
- __put_user(0, &uts->__reserved ));
+ uts->tv_sec = kts->tv_sec;
+ uts->tv_nsec = kts->tv_nsec;
+ uts->__reserved = 0;
}

-/*
- * Set the statx results.
- */
-static long statx_set_result(struct kstat *stat, struct statx __user *buffer)
+static int cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
- uid_t uid = from_kuid_munged(current_user_ns(), stat->uid);
- gid_t gid = from_kgid_munged(current_user_ns(), stat->gid);
-
- if (__put_user(stat->result_mask, &buffer->stx_mask ) ||
- __put_user(stat->mode, &buffer->stx_mode ) ||
- __clear_user(&buffer->__spare0, sizeof(buffer->__spare0)) ||
- __put_user(stat->nlink, &buffer->stx_nlink ) ||
- __put_user(uid, &buffer->stx_uid ) ||
- __put_user(gid, &buffer->stx_gid ) ||
- __put_user(stat->attributes, &buffer->stx_attributes ) ||
- __put_user(stat->blksize, &buffer->stx_blksize ) ||
- __put_user(MAJOR(stat->rdev), &buffer->stx_rdev_major ) ||
- __put_user(MINOR(stat->rdev), &buffer->stx_rdev_minor ) ||
- __put_user(MAJOR(stat->dev), &buffer->stx_dev_major ) ||
- __put_user(MINOR(stat->dev), &buffer->stx_dev_minor ) ||
- __put_timestamp(&stat->atime, &buffer->stx_atime ) ||
- __put_timestamp(&stat->btime, &buffer->stx_btime ) ||
- __put_timestamp(&stat->ctime, &buffer->stx_ctime ) ||
- __put_timestamp(&stat->mtime, &buffer->stx_mtime ) ||
- __put_user(stat->ino, &buffer->stx_ino ) ||
- __put_user(stat->size, &buffer->stx_size ) ||
- __put_user(stat->blocks, &buffer->stx_blocks ) ||
- __clear_user(&buffer->__spare1, sizeof(buffer->__spare1)) ||
- __clear_user(&buffer->__spare2, sizeof(buffer->__spare2)))
- return -EFAULT;
-
- return 0;
+ struct statx tmp;
+
+ tmp.stx_mask = stat->result_mask;
+ tmp.stx_blksize = stat->blksize;
+ tmp.stx_attributes = stat->attributes;
+ tmp.stx_nlink = stat->nlink;
+ tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid);
+ tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid);
+ tmp.stx_mode = stat->mode;
+ memset(tmp.__spare0, 0, sizeof(tmp.__spare0));
+ tmp.stx_ino = stat->ino;
+ tmp.stx_size = stat->size;
+ tmp.stx_blocks = stat->blocks;
+ memset(tmp.__spare1, 0, sizeof(tmp.__spare1));
+ init_statx_timestamp(&tmp.stx_atime, &stat->atime);
+ init_statx_timestamp(&tmp.stx_btime, &stat->btime);
+ init_statx_timestamp(&tmp.stx_ctime, &stat->ctime);
+ init_statx_timestamp(&tmp.stx_mtime, &stat->mtime);
+ tmp.stx_rdev_major = MAJOR(stat->rdev);
+ tmp.stx_rdev_minor = MINOR(stat->rdev);
+ tmp.stx_dev_major = MAJOR(stat->dev);
+ tmp.stx_dev_minor = MINOR(stat->dev);
+ memset(tmp.__spare2, 0, sizeof(tmp.__spare2));
+
+ return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}

/**
@@ -572,8 +567,6 @@ SYSCALL_DEFINE5(statx,

if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
return -EINVAL;
- if (!access_ok(VERIFY_WRITE, buffer, sizeof(*buffer)))
- return -EFAULT;

if (filename)
error = vfs_statx(dfd, filename, flags, &stat, mask);
@@ -581,7 +574,8 @@ SYSCALL_DEFINE5(statx,
error = vfs_statx_fd(dfd, &stat, mask, flags);
if (error)
return error;
- return statx_set_result(&stat, buffer);
+
+ return cp_statx(&stat, buffer);
}

/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
--
2.12.0