author	Jason Low <jason.low2@hp.com>	2015-04-28 16:00:22 -0400
committer	Ingo Molnar <mingo@kernel.org>	2015-05-08 06:15:31 -0400
commit	1018016c706f7ff9f56fde3a649789c47085a293 (patch)
tree	1f564aa153c8b4d3610c354ebb3e963c6a29e165 /kernel/time/posix-cpu-timers.c
parent	7e5a2c1729f1612618ed236249a15bf15f309325 (diff)
sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability
While running a database workload, we found a scalability issue with itimers.

Much of the problem was caused by the thread_group_cputimer spinlock. Each time we account for group system/user time, we need to obtain a thread_group_cputimer's spinlock to update the timers. On larger systems (such as a 16-socket machine), more than 30% of total time was spent trying to acquire this kernel lock to update these group timer stats.

This patch converts the timers to 64-bit atomic variables and uses atomic adds to update them without a lock. With this patch, the percent of total time spent updating thread group cputimer timers was reduced from 30% down to less than 1%.

Note: On 32-bit systems using the generic 64-bit atomics, this causes sample_group_cputimer() to take locks 3 times instead of just 1 time. However, we tested this patch on a 32-bit ARM system using the generic atomics and did not find the overhead to be much of an issue. This isn't much of an issue because 32-bit systems usually have small numbers of CPUs, and cacheline contention from extra spinlocks taken periodically is not really apparent on smaller systems.

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Waiman Long <Waiman.Long@hp.com>
Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
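The core of the change is a two-part lockless pattern: the hot accounting path becomes a single atomic add, and the slow resynchronization path raises the cached totals with a cmpxchg retry loop. Below is a minimal userspace sketch of that pattern, using C11 stdatomic as a stand-in for the kernel's atomic64_t API; the function names are illustrative only, not taken from the patch.

/*
 * Sketch only: userspace analogue of the patch's lockless update
 * pattern, with C11 stdatomic in place of the kernel's atomic64_t.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t cputime;

/* Hot accounting path: no lock, just one atomic add per update. */
static void account_time(uint64_t delta)
{
	atomic_fetch_add(&cputime, delta);
}

/*
 * Slow path: raise cputime to sum if sum is larger (cf. the patch's
 * __update_gt_cputime). atomic_compare_exchange_weak() reloads 'curr'
 * on failure, so a plain loop handles races with concurrent adds.
 */
static void update_gt_cputime(uint64_t sum)
{
	uint64_t curr = atomic_load(&cputime);

	while (sum > curr) {
		if (atomic_compare_exchange_weak(&cputime, &curr, sum))
			break;
	}
}

int main(void)
{
	account_time(100);
	update_gt_cputime(250);	/* 250 > 100: raises the total */
	update_gt_cputime(50);	/* 50 < 250: leaves it alone */
	printf("cputime = %llu\n", (unsigned long long)atomic_load(&cputime));
	return 0;
}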
Diffstat (limited to 'kernel/time/posix-cpu-timers.c')
-rw-r--r--	kernel/time/posix-cpu-timers.c	79
1 file changed, 50 insertions(+), 29 deletions(-)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e072d982f64c..d85730669410 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
 {
-	if (b->utime > a->utime)
-		a->utime = b->utime;
+	u64 curr_cputime;
+retry:
+	curr_cputime = atomic64_read(cputime);
+	if (sum_cputime > curr_cputime) {
+		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+			goto retry;
+	}
+}
 
-	if (b->stime > a->stime)
-		a->stime = b->stime;
+static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+{
+	__update_gt_cputime(&cputimer->utime, sum->utime);
+	__update_gt_cputime(&cputimer->stime, sum->stime);
+	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+}
 
-	if (b->sum_exec_runtime > a->sum_exec_runtime)
-		a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
+static inline void sample_group_cputimer(struct task_cputime *times,
+					 struct thread_group_cputimer *cputimer)
+{
+	times->utime = atomic64_read(&cputimer->utime);
+	times->stime = atomic64_read(&cputimer->stime);
+	times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
 }
 
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	struct task_cputime sum;
-	unsigned long flags;
 
-	if (!cputimer->running) {
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running)) {
 		/*
 		 * The POSIX timer interface allows for absolute time expiry
 		 * values through the TIMER_ABSTIME flag, therefore we have
-		 * to synchronize the timer to the clock every time we start
-		 * it.
+		 * to synchronize the timer to the clock every time we start it.
 		 */
 		thread_group_cputime(tsk, &sum);
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-		cputimer->running = 1;
-		update_gt_cputime(&cputimer->cputime, &sum);
-	} else
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-	*times = cputimer->cputime;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+		update_gt_cputime(cputimer, &sum);
+
+		/*
+		 * We're setting cputimer->running without a lock. Ensure
+		 * this only gets written to in one operation. We set
+		 * running after update_gt_cputime() as a small optimization,
+		 * but barriers are not required because update_gt_cputime()
+		 * can handle concurrent updates.
+		 */
+		WRITE_ONCE(cputimer->running, 1);
+	}
+	sample_group_cputimer(times, cputimer);
 }
 
 /*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
 	if (!task_cputime_zero(&tsk->cputime_expires))
 		return false;
 
-	if (tsk->signal->cputimer.running)
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		return false;
 
 	return true;
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }
 
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
 {
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 0;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+	/* Turn off cputimer->running. This is done without locking. */
+	WRITE_ONCE(cputimer->running, 0);
 }
 
 static u32 onecputick;
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	}
 
 	sig = tsk->signal;
-	if (sig->cputimer.running) {
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(sig->cputimer.running)) {
 		struct task_cputime group_sample;
 
-		raw_spin_lock(&sig->cputimer.lock);
-		group_sample = sig->cputimer.cputime;
-		raw_spin_unlock(&sig->cputimer.lock);
+		sample_group_cputimer(&group_sample, &sig->cputimer);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * If there are any active process wide timers (POSIX 1.b, itimers,
 	 * RLIMIT_CPU) cputimer must be running.
 	 */
-	if (tsk->signal->cputimer.running)
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		check_process_timers(tsk, &firing);
 
 	/*
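The cputimer->running flag above is read and written with no lock held; READ_ONCE()/WRITE_ONCE() only guarantee that each access happens as a single operation, and readers tolerate a momentarily stale value. A rough userspace analogue of that flag, again a sketch using relaxed C11 atomics rather than the kernel macros:

/*
 * Sketch only: relaxed C11 atomics as a stand-in for the
 * READ_ONCE()/WRITE_ONCE() accesses to cputimer->running.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool running;	/* stand-in for cputimer->running */

static void start_process_timers(void)
{
	/* Single store, no lock; no barrier needed because the cputime
	 * update path tolerates concurrent updates. */
	atomic_store_explicit(&running, true, memory_order_relaxed);
}

static void stop_process_timers(void)
{
	atomic_store_explicit(&running, false, memory_order_relaxed);
}

static bool timers_running(void)
{
	/* Readers may observe a slightly stale value, which the
	 * fastpath check explicitly tolerates. */
	return atomic_load_explicit(&running, memory_order_relaxed);
}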