author		Jason Low <jason.low2@hp.com>	2015-10-14 15:07:56 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2015-10-15 05:23:41 -0400
commit		c8d75aa47dd585c9538a8205e9bb9847e12cfb84
tree		2d02f5e5041dcec03648a69730420ae9fbafbfcb	/kernel/time/posix-cpu-timers.c
parent		d5c373eb5610686162ff50429f63f4c00c554799
posix_cpu_timer: Reduce unnecessary sighand lock contention
It was found while running a database workload on large systems that significant time was spent trying to acquire the sighand lock.

The issue was that whenever an itimer expired, many threads ended up simultaneously trying to send the signal. Most of the time, nothing happened after acquiring the sighand lock because another thread had already sent the signal and updated the "next expire" time. fastpath_timer_check() didn't help much, since the "next expire" time is only updated after threads exit fastpath_timer_check().

This patch addresses the contention by having the thread_group_cputimer structure maintain a boolean to signify when a thread in the group is already checking for process-wide timers, and by adding extra logic in the fastpath to check that boolean.

Signed-off-by: Jason Low <jason.low2@hp.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: George Spelvin <linux@horizon.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: hideaki.kimura@hpe.com
Cc: terry.rudd@hpe.com
Cc: scott.norton@hpe.com
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1444849677-29330-5-git-send-email-jason.low2@hp.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
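To make the pattern concrete outside the kernel, here is a minimal userspace sketch in C11 with pthreads. All names (timer_state, try_check_timers, check_timers_locked) are illustrative, not kernel APIs; the mutex stands in for the sighand lock, and the two atomic booleans mirror 'running' and 'checking_timer'.

/*
 * Minimal sketch of the contention-avoidance pattern: a shared
 * "checking" flag lets every thread but one skip the contended lock
 * when the expensive work is already in progress.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct timer_state {
	pthread_mutex_t lock;		/* stands in for the sighand lock */
	atomic_bool running;		/* group timer armed? */
	atomic_bool checking_timer;	/* someone already checking? */
};

static struct timer_state gts = { .lock = PTHREAD_MUTEX_INITIALIZER };

static void check_timers_locked(struct timer_state *ts)
{
	/* The expensive work: sample group cputime, fire expired timers. */
}

static void try_check_timers(struct timer_state *ts)
{
	/*
	 * Fastpath: racy, lock-free reads. A stale value only delays the
	 * check until the next caller, which this workload tolerates.
	 */
	if (!atomic_load_explicit(&ts->running, memory_order_relaxed) ||
	    atomic_load_explicit(&ts->checking_timer, memory_order_relaxed))
		return;

	pthread_mutex_lock(&ts->lock);
	/* Mirroring the patch: the flag is only written under the lock. */
	atomic_store_explicit(&ts->checking_timer, true, memory_order_relaxed);
	check_timers_locked(ts);
	atomic_store_explicit(&ts->checking_timer, false, memory_order_relaxed);
	pthread_mutex_unlock(&ts->lock);
}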
Diffstat (limited to 'kernel/time/posix-cpu-timers.c')
-rw-r--r--	kernel/time/posix-cpu-timers.c	26
1 file changed, 24 insertions(+), 2 deletions(-)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2d58153074d9..f5e86d282d52 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -975,6 +975,12 @@ static void check_process_timers(struct task_struct *tsk,
 	if (!READ_ONCE(tsk->signal->cputimer.running))
 		return;
 
+	/*
+	 * Signify that a thread is checking for process timers.
+	 * Write access to this field is protected by the sighand lock.
+	 */
+	sig->cputimer.checking_timer = true;
+
 	/*
 	 * Collect the current process totals.
 	 */
@@ -1029,6 +1035,8 @@ static void check_process_timers(struct task_struct *tsk,
 	sig->cputime_expires.sched_exp = sched_expires;
 	if (task_cputime_zero(&sig->cputime_expires))
 		stop_process_timers(sig);
+
+	sig->cputimer.checking_timer = false;
 }
 
 /*
@@ -1142,8 +1150,22 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	}
 
 	sig = tsk->signal;
-	/* Check if cputimer is running. This is accessed without locking. */
-	if (READ_ONCE(sig->cputimer.running)) {
+	/*
+	 * Check if thread group timers expired when the cputimer is
+	 * running and no other thread in the group is already checking
+	 * for thread group cputimers. These fields are read without the
+	 * sighand lock. However, this is fine because this is meant to
+	 * be a fastpath heuristic to determine whether we should try to
+	 * acquire the sighand lock to check/handle timers.
+	 *
+	 * In the worst case scenario, if 'running' or 'checking_timer' gets
+	 * set but the current thread doesn't see the change yet, we'll wait
+	 * until the next thread in the group gets a scheduler interrupt to
+	 * handle the timer. This isn't an issue in practice because these
+	 * types of delays with signals actually getting sent are expected.
+	 */
+	if (READ_ONCE(sig->cputimer.running) &&
+	    !READ_ONCE(sig->cputimer.checking_timer)) {
 		struct task_cputime group_sample;
 
 		sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
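Continuing the userspace sketch above (same illustrative names, nothing kernel-specific), a hypothetical per-thread tick loop shows why the fastpath matters: under contention, every thread except the one doing the work pays only for two relaxed boolean reads instead of a mutex round trip.

/*
 * Hypothetical driver for the earlier sketch: each thread's periodic
 * "tick" calls the fastpath; at most one of them takes the lock.
 */
static void *tick_loop(void *arg)
{
	struct timer_state *ts = arg;

	for (int i = 0; i < 1000; i++)
		try_check_timers(ts);	/* stands in for the scheduler interrupt */
	return NULL;
}

This mirrors the patch's trade-off: writes to checking_timer stay serialized by the sighand lock, reads stay lockless, and the worst case is that an expired timer is handled one tick later by another thread in the group.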