[RFC PATCH] timer: Improve itimers scalability

From: Jason Low
Date: Tue Aug 04 2015 - 20:32:57 EST


When running a database workload on a 16 socket machine, there were
scalability issues related to itimers.

Commit 1018016c706f addressed the issue with the thread_group_cputimer
spinlock taking up a significant portion of total run time.

This patch addresses the other issue where a lot of time is spent
trying to acquire the sighand lock. It was found in some cases that
200+ threads were simultaneously contending for the same sighand lock.

The issue was that whenever an itimer expired, many threads ended up
simultaneously trying to send the signal. Most of the time, nothing
happened after acquiring the sighand lock because another thread
had already sent the signal and updated the "next expire" time. The
fastpath_timer_check() didn't help much since the "next expire" time
was updated later.

The contention for the sighand lock reduced throughput by more than 30%.

This patch addresses this by having the thread_group_cputimer structure
maintain a boolean to signify when a thread in the group is already
checking for process wide timers, and adds extra logic in the fastpath
to check the boolean.

Signed-off-by: Jason Low <jason.low2@xxxxxx>
---
include/linux/init_task.h | 5 +++--
include/linux/sched.h | 3 +++
kernel/time/posix-cpu-timers.c | 33 ++++++++++++++++++++++++++-------
3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d0b380e..c5d216c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -51,8 +51,9 @@ extern struct fs_struct init_fs;
.cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
.rlim = INIT_RLIMITS, \
.cputimer = { \
- .cputime_atomic = INIT_CPUTIME_ATOMIC, \
- .running = 0, \
+ .cputime_atomic = INIT_CPUTIME_ATOMIC, \
+ .running = 0, \
+ .is_checking_timer = 0, \
}, \
INIT_PREV_CPUTIME(sig) \
.cred_guard_mutex = \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5b50082..37e952c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -619,6 +619,8 @@ struct task_cputime_atomic {
* @cputime_atomic: atomic thread group interval timers.
* @running: non-zero when there are timers running and
* @cputime receives updates.
+ * @is_checking_timer: non-zero when a thread is in the process of
+ * checking for thread group timers.
*
* This structure contains the version of task_cputime, above, that is
* used for thread group CPU timer calculations.
@@ -626,6 +628,7 @@ struct task_cputime_atomic {
struct thread_group_cputimer {
struct task_cputime_atomic cputime_atomic;
int running;
+ int is_checking_timer;
};

#include <linux/rwsem.h>
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 892e3da..fba1fc0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -962,6 +962,14 @@ static void check_process_timers(struct task_struct *tsk,
unsigned long soft;

/*
+ * Signify that a thread is checking for process timers.
+ * The is_checking_timer field is only modified in this function,
+ * which is called with the sighand lock held. Thus, we can
+ * just use WRITE_ONCE() without any further locking.
+ */
+ WRITE_ONCE(sig->cputimer.is_checking_timer, 1);
+
+ /*
* Collect the current process totals.
*/
thread_group_cputimer(tsk, &cputime);
@@ -973,13 +981,6 @@ static void check_process_timers(struct task_struct *tsk,
virt_expires = check_timers_list(++timers, firing, utime);
sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);

- /*
- * Check for the special case process timers.
- */
- check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
- SIGPROF);
- check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
- SIGVTALRM);
soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
@@ -1010,11 +1011,21 @@ static void check_process_timers(struct task_struct *tsk,
}
}

+ /*
+ * Check for the special case process timers.
+ */
+ check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
+ SIGPROF);
+ check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
+ SIGVTALRM);
+
sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
sig->cputime_expires.sched_exp = sched_expires;
if (task_cputime_zero(&sig->cputime_expires))
stop_process_timers(sig);
+
+ WRITE_ONCE(sig->cputimer.is_checking_timer, 0);
}

/*
@@ -1137,6 +1148,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (READ_ONCE(sig->cputimer.running)) {
struct task_cputime group_sample;

+ /*
+ * If another thread in the group is already checking
+ * for the thread group cputimer, then we will skip that.
+ */
+ if (READ_ONCE(sig->cputimer.is_checking_timer))
+ return 0;
+
sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);

if (task_cputime_expired(&group_sample, &sig->cputime_expires))
@@ -1174,6 +1192,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* put them on the firing list.
*/
check_thread_timers(tsk, &firing);
+
/*
* If there are any active process wide timers (POSIX 1.b, itimers,
* RLIMIT_CPU) cputimer must be running.
--
1.7.2.5



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/