[PATCH/RFC] core/nfsd: allow kernel threads to use task_work.
From: NeilBrown
Date: Mon Nov 27 2023 - 17:05:41 EST
I have evidence from a customer site of 256 nfsd threads adding files to
delayed_fput_lists nearly twice as fast they are retired by a single
work-queue thread running delayed_fput(). As you might imagine this
does not end well (20 million files in the queue at the time a snapshot
was taken for analysis).
While this might point to a problem with the filesystem not handling the
final close efficiently, such problems should only hurt throughput, not
lead to memory exhaustion.
For normal threads, the thread that closes the file also calls the
final fput so there is natural rate limiting preventing excessive growth
in the list of delayed fputs. For kernel threads, and particularly for
nfsd, delayed in the final fput do not impose any throttling to prevent
the thread from closing more files.
A simple way to fix this is to treat nfsd threads like normal processes
for task_work. Thus the pending files are queued for the thread, and
the same thread finishes the work.
Currently KTHREADs are assumed never to call task_work_run(). With this
patch that it still the default but it is implemented by storing the
magic value TASK_WORKS_DISABLED in ->task_works. If a kthread, such as
nfsd, will call task_work_run() periodically, it sets ->task_works
to NULL to indicate this.
Signed-off-by: NeilBrown <neilb@xxxxxxx>
---
I wonder which tree this should go through assuming everyone likes it.
VFS maybe??
Thanks.
fs/file_table.c | 2 +-
fs/nfsd/nfssvc.c | 4 ++++
include/linux/sched.h | 1 +
include/linux/task_work.h | 4 +++-
kernel/fork.c | 2 +-
kernel/task_work.c | 7 ++++---
6 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/fs/file_table.c b/fs/file_table.c
index de4a2915bfd4..e79351df22be 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -445,7 +445,7 @@ void fput(struct file *file)
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
- if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+ if (likely(!in_interrupt())) {
init_task_work(&file->f_rcuhead, ____fput);
if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
return;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 66ca50b38b27..c047961262ca 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -13,6 +13,7 @@
#include <linux/fs_struct.h>
#include <linux/swap.h>
#include <linux/siphash.h>
+#include <linux/task_work.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svcsock.h>
@@ -941,6 +942,7 @@ nfsd(void *vrqstp)
}
current->fs->umask = 0;
+ current->task_works = NULL; /* Declare that I will call task_work_run() */
atomic_inc(&nfsdstats.th_cnt);
@@ -955,6 +957,8 @@ nfsd(void *vrqstp)
svc_recv(rqstp);
validate_process_creds();
+ if (task_work_pending(current))
+ task_work_run();
}
atomic_dec(&nfsdstats.th_cnt);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..c63c2bedbf71 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1117,6 +1117,7 @@ struct task_struct {
unsigned int sas_ss_flags;
struct callback_head *task_works;
+#define TASK_WORKS_DISABLED ((void*)1)
#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 795ef5a68429..3c74e3de81ed 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -22,7 +22,9 @@ enum task_work_notify_mode {
static inline bool task_work_pending(struct task_struct *task)
{
- return READ_ONCE(task->task_works);
+ struct callback_head *works = READ_ONCE(task->task_works);
+
+ return works && works != TASK_WORKS_DISABLED;
}
int task_work_add(struct task_struct *task, struct callback_head *twork,
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..903b29804fe1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2577,7 +2577,7 @@ __latent_entropy struct task_struct *copy_process(
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
- p->task_works = NULL;
+ p->task_works = args->kthread ? TASK_WORKS_DISABLED : NULL;
clear_posix_cputimers_work(p);
#ifdef CONFIG_KRETPROBES
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 95a7e1b7f1da..ffdf4b0d7a0e 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -49,7 +49,8 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
head = READ_ONCE(task->task_works);
do {
- if (unlikely(head == &work_exited))
+ if (unlikely(head == &work_exited ||
+ head == TASK_WORKS_DISABLED))
return -ESRCH;
work->next = head;
} while (!try_cmpxchg(&task->task_works, &head, work));
@@ -157,7 +158,7 @@ void task_work_run(void)
work = READ_ONCE(task->task_works);
do {
head = NULL;
- if (!work) {
+ if (!work || work == TASK_WORKS_DISABLED) {
if (task->flags & PF_EXITING)
head = &work_exited;
else
@@ -165,7 +166,7 @@ void task_work_run(void)
}
} while (!try_cmpxchg(&task->task_works, &work, head));
- if (!work)
+ if (!work || work == TASK_WORKS_DISABLED)
break;
/*
* Synchronize with task_work_cancel(). It can not remove
--
2.42.1