[PATCH] fs: use __fput_sync in close(2)

From: Mateusz Guzik
Date: Sun Aug 06 2023 - 19:06:48 EST


Making close(2) delegate fput finalization with task_work_add() runs
into a slowdown (atomics are needed to do it), which is artificially
worsened in the presence of rseq, which glibc blindly uses if available
(and it normally is) -- rseq added a user memory-touching handler to
resume_user_mode_work(), where the thread leaving the kernel lands after
issuing task_work_add() from fput(). Touching user memory requires a
SMAP round-trip, which is quite expensive, and it always executes when
landing in the resume routine.

I'm going to write a separate e-mail about the rseq problem later, but
even if it gets sorted out there is still perf to gain (or rather,
overhead to avoid).

Numbers are below in the proposed patch, but tl;dr: even without
CONFIG_RSEQ making things worse for the stock kernel, I see about a 7%
increase in ops/s with open+close.

Searching mailing lists for discussions explaining why close(2) was not
already doing this I found a patch with the easiest way out (call
__fput_sync() in filp_close()):
https://lore.kernel.org/all/20150831120525.GA31015@xxxxxxxxxx/

There was no response to it though.

From poking around, there are tons of filp_close() users (including
from close_fd()) and it is unclear to me if they are going to be fine
with such a change.

With the assumption this is not going to work, I wrote my own patch
which adds close_fd_sync() and filp_close_sync(). They are shipped as
dedicated func entry points, but perhaps inlines which internally add a
flag to the underlying routine would be preferred? Also adding __ in
front would be in line with __fput_sync, but having __filp_close_sync
call __filp_close looks weird to me.

All that said, if the simpler patch by Oleg Nesterov works, then I'm
happy to drop this one. I just would like to see this sorted out,
whichever way.

Thoughts?

============================================================

fs: use __fput_sync in close(2)

close(2) is a special close which guarantees a shallow kernel stack,
making delegation to task_work machinery unnecessary. Said delegation is
problematic as it involves atomic ops and interrupt masking trips, none
of which are cheap on x86-64. Forcing close(2) to do it looks like an
oversight in the original work.

Moreover, the presence of CONFIG_RSEQ adds additional overhead, as
fput() -> task_work_add(..., TWA_RESUME) -> set_notify_resume() makes
the thread returning to userspace land in resume_user_mode_work(), where
rseq_handle_notify_resume() takes a SMAP round-trip if rseq is enabled
for the thread (and it is by default with contemporary glibc).

Sample result when benchmarking open1_processes -t 1 from will-it-scale
(that's an open + close loop) + tmpfs on /tmp, running on a Sapphire
Rapids CPU (ops/s):
stock+RSEQ: 1329857
stock-RSEQ: 1421667 (+7%)
patched: 1523521 (+14.5% / +7%) (with / without rseq)

The patched result is the same with and without rseq, as it dodges rseq.

As there are numerous close_fd() and filp_close() consumers which may or
may not tolerate __fput_sync() behavior, dedicated routines are added
for close(2).

Signed-off-by: Mateusz Guzik <mjguzik@xxxxxxxxx>
---
fs/file.c | 20 +++++++++++++++++---
fs/file_table.c | 2 --
fs/open.c | 21 ++++++++++++++++++---
include/linux/fdtable.h | 1 +
include/linux/fs.h | 1 +
5 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 3fd003a8604f..eedb8a9fb6d2 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -651,7 +651,7 @@ static struct file *pick_file(struct files_struct *files, unsigned fd)
return file;
}

-int close_fd(unsigned fd)
+static __always_inline int __close_fd(unsigned fd, bool sync)
{
struct files_struct *files = current->files;
struct file *file;
@@ -662,9 +662,23 @@ int close_fd(unsigned fd)
if (!file)
return -EBADF;

- return filp_close(file, files);
+ if (sync)
+ return filp_close_sync(file, files);
+ else
+ return filp_close(file, files);
+}
+
+int close_fd_sync(unsigned fd)
+{
+ return __close_fd(fd, true);
+}
+EXPORT_SYMBOL(close_fd_sync); /* for ksys_close() */
+
+int close_fd(unsigned fd)
+{
+ return __close_fd(fd, false);
}
-EXPORT_SYMBOL(close_fd); /* for ksys_close() */
+EXPORT_SYMBOL(close_fd);

/**
* last_fd - return last valid index into fd table
diff --git a/fs/file_table.c b/fs/file_table.c
index fc7d677ff5ad..c7b7fcd7a8b5 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -462,8 +462,6 @@ void fput(struct file *file)
void __fput_sync(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
- struct task_struct *task = current;
- BUG_ON(!(task->flags & PF_KTHREAD));
__fput(file);
}
}
diff --git a/fs/open.c b/fs/open.c
index e6ead0f19964..e5f03f891977 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1503,7 +1503,7 @@ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
* "id" is the POSIX thread ID. We use the
* files pointer for this..
*/
-int filp_close(struct file *filp, fl_owner_t id)
+static __always_inline int __filp_close(struct file *filp, fl_owner_t id, bool sync)
{
int retval = 0;

@@ -1520,12 +1520,27 @@ int filp_close(struct file *filp, fl_owner_t id)
dnotify_flush(filp, id);
locks_remove_posix(filp, id);
}
- fput(filp);
+ if (sync)
+ __fput_sync(filp);
+ else
+ fput(filp);
return retval;
}

+int filp_close_sync(struct file *filp, fl_owner_t id)
+{
+ return __filp_close(filp, id, true);
+}
+EXPORT_SYMBOL(filp_close_sync);
+
+int filp_close(struct file *filp, fl_owner_t id)
+{
+ return __filp_close(filp, id, false);
+}
EXPORT_SYMBOL(filp_close);

+extern unsigned long sysctl_fput_sync;
+
/*
* Careful here! We test whether the file pointer is NULL before
* releasing the fd. This ensures that one clone task can't release
@@ -1533,7 +1548,7 @@ EXPORT_SYMBOL(filp_close);
*/
SYSCALL_DEFINE1(close, unsigned int, fd)
{
- int retval = close_fd(fd);
+ int retval = close_fd_sync(fd);

/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..dd3d0505d34b 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -123,6 +123,7 @@ int iterate_fd(struct files_struct *, unsigned,
int (*)(const void *, struct file *, unsigned),
const void *);

+extern int close_fd_sync(unsigned int fd);
extern int close_fd(unsigned int fd);
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
extern struct file *close_fd_get_file(unsigned int fd);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 562f2623c9c9..300ce66eef0a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2388,6 +2388,7 @@ static inline struct file *file_clone_open(struct file *file)
{
return dentry_open(&file->f_path, file->f_flags, file->f_cred);
}
+extern int filp_close_sync(struct file *, fl_owner_t id);
extern int filp_close(struct file *, fl_owner_t id);

extern struct filename *getname_flags(const char __user *, int, int *);
--
2.39.2