author	Jason Low <jason.low2@hp.com>	2015-04-28 16:00:22 -0400
committer	Ingo Molnar <mingo@kernel.org>	2015-05-08 06:15:31 -0400
commit	1018016c706f7ff9f56fde3a649789c47085a293 (patch)
tree	1f564aa153c8b4d3610c354ebb3e963c6a29e165 /kernel/time/posix-cpu-timers.c
parent	7e5a2c1729f1612618ed236249a15bf15f309325 (diff)
sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability
While running a database workload, we found a scalability issue with itimers.

Much of the problem was caused by the thread_group_cputimer spinlock. Each time we account for group system/user time, we need to obtain a thread_group_cputimer's spinlock to update the timers. On larger systems (such as a 16-socket machine), more than 30% of total time was spent trying to acquire this kernel lock to update these group timer stats.

This patch converts the timers to 64-bit atomic variables and uses atomic adds to update them without a lock. With this patch, the percent of total time spent updating thread group cputimer timers was reduced from 30% down to less than 1%.

Note: On 32-bit systems using the generic 64-bit atomics, this causes sample_group_cputimer() to take locks 3 times instead of just 1 time. However, we tested this patch on a 32-bit ARM system using the generic atomics and did not find the overhead to be much of an issue. This isn't much of an issue because 32-bit systems usually have small numbers of CPUs, and cacheline contention from extra spinlocks taken periodically is not really apparent on smaller systems.

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Waiman Long <Waiman.Long@hp.com>
Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
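The core of the change is a two-part lockless pattern: the hot accounting path becomes a single atomic add, and the slow resynchronization path raises the cached totals with a cmpxchg retry loop. Below is a minimal userspace sketch of that pattern, using C11 stdatomic as a stand-in for the kernel's atomic64_t API; the function names are illustrative only, not taken from the patch.

/*
 * Sketch only: userspace analogue of the patch's lockless update
 * pattern, with C11 stdatomic in place of the kernel's atomic64_t.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t cputime;

/* Hot accounting path: no lock, just one atomic add per update. */
static void account_time(uint64_t delta)
{
	atomic_fetch_add(&cputime, delta);
}

/*
 * Slow path: raise cputime to sum if sum is larger (cf. the patch's
 * __update_gt_cputime). atomic_compare_exchange_weak() reloads 'curr'
 * on failure, so a plain loop handles races with concurrent adds.
 */
static void update_gt_cputime(uint64_t sum)
{
	uint64_t curr = atomic_load(&cputime);

	while (sum > curr) {
		if (atomic_compare_exchange_weak(&cputime, &curr, sum))
			break;
	}
}

int main(void)
{
	account_time(100);
	update_gt_cputime(250);	/* 250 > 100: raises the total */
	update_gt_cputime(50);	/* 50 < 250: leaves it alone */
	printf("cputime = %llu\n", (unsigned long long)atomic_load(&cputime));
	return 0;
}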
Diffstat (limited to 'kernel/time/posix-cpu-timers.c')
-rw-r--r--	kernel/time/posix-cpu-timers.c	79
1 file changed, 50 insertions(+), 29 deletions(-)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e072d982f64c..d85730669410 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
 {
-	if (b->utime > a->utime)
-		a->utime = b->utime;
+	u64 curr_cputime;
+retry:
+	curr_cputime = atomic64_read(cputime);
+	if (sum_cputime > curr_cputime) {
+		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+			goto retry;
+	}
+}
 
-	if (b->stime > a->stime)
-		a->stime = b->stime;
+static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+{
+	__update_gt_cputime(&cputimer->utime, sum->utime);
+	__update_gt_cputime(&cputimer->stime, sum->stime);
+	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+}
 
-	if (b->sum_exec_runtime > a->sum_exec_runtime)
-		a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
+static inline void sample_group_cputimer(struct task_cputime *times,
+					 struct thread_group_cputimer *cputimer)
+{
+	times->utime = atomic64_read(&cputimer->utime);
+	times->stime = atomic64_read(&cputimer->stime);
+	times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
 }
 
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	struct task_cputime sum;
-	unsigned long flags;
 
-	if (!cputimer->running) {
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running)) {
 		/*
 		 * The POSIX timer interface allows for absolute time expiry
 		 * values through the TIMER_ABSTIME flag, therefore we have
-		 * to synchronize the timer to the clock every time we start
-		 * it.
+		 * to synchronize the timer to the clock every time we start it.
 		 */
 		thread_group_cputime(tsk, &sum);
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-		cputimer->running = 1;
-		update_gt_cputime(&cputimer->cputime, &sum);
-	} else
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-	*times = cputimer->cputime;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+		update_gt_cputime(cputimer, &sum);
+
+		/*
+		 * We're setting cputimer->running without a lock. Ensure
+		 * this only gets written to in one operation. We set
+		 * running after update_gt_cputime() as a small optimization,
+		 * but barriers are not required because update_gt_cputime()
+		 * can handle concurrent updates.
+		 */
+		WRITE_ONCE(cputimer->running, 1);
+	}
+	sample_group_cputimer(times, cputimer);
 }
 
 /*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
 	if (!task_cputime_zero(&tsk->cputime_expires))
 		return false;
 
-	if (tsk->signal->cputimer.running)
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		return false;
 
 	return true;
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }
 
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
 {
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 0;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+	/* Turn off cputimer->running. This is done without locking. */
+	WRITE_ONCE(cputimer->running, 0);
 }
 
 static u32 onecputick;
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	}
 
 	sig = tsk->signal;
-	if (sig->cputimer.running) {
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(sig->cputimer.running)) {
 		struct task_cputime group_sample;
 
-		raw_spin_lock(&sig->cputimer.lock);
-		group_sample = sig->cputimer.cputime;
-		raw_spin_unlock(&sig->cputimer.lock);
+		sample_group_cputimer(&group_sample, &sig->cputimer);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * If there are any active process wide timers (POSIX 1.b, itimers,
 	 * RLIMIT_CPU) cputimer must be running.
 	 */
-	if (tsk->signal->cputimer.running)
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		check_process_timers(tsk, &firing);
 
 	/*
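The cputimer->running flag above is read and written with no lock held; READ_ONCE()/WRITE_ONCE() only guarantee that each access happens as a single operation, and readers tolerate a momentarily stale value. A rough userspace analogue of that flag, again a sketch using relaxed C11 atomics rather than the kernel macros:

/*
 * Sketch only: relaxed C11 atomics as a stand-in for the
 * READ_ONCE()/WRITE_ONCE() accesses to cputimer->running.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool running;	/* stand-in for cputimer->running */

static void start_process_timers(void)
{
	/* Single store, no lock; no barrier needed because the cputime
	 * update path tolerates concurrent updates. */
	atomic_store_explicit(&running, true, memory_order_relaxed);
}

static void stop_process_timers(void)
{
	atomic_store_explicit(&running, false, memory_order_relaxed);
}

static bool timers_running(void)
{
	/* Readers may observe a slightly stale value, which the
	 * fastpath check explicitly tolerates. */
	return atomic_load_explicit(&running, memory_order_relaxed);
}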