aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorJason Low <jason.low2@hp.com>2015-04-28 16:00:22 -0400
committerIngo Molnar <mingo@kernel.org>2015-05-08 06:15:31 -0400
commit1018016c706f7ff9f56fde3a649789c47085a293 (patch)
tree1f564aa153c8b4d3610c354ebb3e963c6a29e165 /kernel
parent7e5a2c1729f1612618ed236249a15bf15f309325 (diff)
sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability
While running a database workload, we found a scalability issue with itimers. Much of the problem was caused by the thread_group_cputimer spinlock. Each time we account for group system/user time, we need to obtain a thread_group_cputimer's spinlock to update the timers. On larger systems (such as a 16 socket machine), this caused more than 30% of total time spent trying to obtain this kernel lock to update these group timer stats. This patch converts the timers to 64-bit atomic variables and use atomic add to update them without a lock. With this patch, the percent of total time spent updating thread group cputimer timers was reduced from 30% down to less than 1%. Note: On 32-bit systems using the generic 64-bit atomics, this causes sample_group_cputimer() to take locks 3 times instead of just 1 time. However, we tested this patch on a 32-bit system ARM system using the generic atomics and did not find the overhead to be much of an issue. An explanation for why this isn't an issue is that 32-bit systems usually have small numbers of CPUs, and cacheline contention from extra spinlocks called periodically is not really apparent on smaller systems. Signed-off-by: Jason Low <jason.low2@hp.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Rik van Riel <riel@redhat.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Aswin Chandramouleeswaran <aswin@hp.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mel Gorman <mgorman@suse.de> Cc: Mike Galbraith <umgwanakikbuti@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com> Cc: Scott J Norton <scott.norton@hp.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Waiman Long <Waiman.Long@hp.com> Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/sched/stats.h15
-rw-r--r--kernel/time/posix-cpu-timers.c79
3 files changed, 55 insertions, 42 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 47c37a411a62..2e670864174f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
1091{ 1091{
1092 unsigned long cpu_limit; 1092 unsigned long cpu_limit;
1093 1093
1094 /* Thread group counters. */
1095 thread_group_cputime_init(sig);
1096
1097 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); 1094 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1098 if (cpu_limit != RLIM_INFINITY) { 1095 if (cpu_limit != RLIM_INFINITY) {
1099 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); 1096 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339656..c6d1c7da3ea5 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
174{ 174{
175 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 175 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
176 176
177 if (!cputimer->running) 177 /* Check if cputimer isn't running. This is accessed without locking. */
178 if (!READ_ONCE(cputimer->running))
178 return false; 179 return false;
179 180
180 /* 181 /*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
215 if (!cputimer_running(tsk)) 216 if (!cputimer_running(tsk))
216 return; 217 return;
217 218
218 raw_spin_lock(&cputimer->lock); 219 atomic64_add(cputime, &cputimer->utime);
219 cputimer->cputime.utime += cputime;
220 raw_spin_unlock(&cputimer->lock);
221} 220}
222 221
223/** 222/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
238 if (!cputimer_running(tsk)) 237 if (!cputimer_running(tsk))
239 return; 238 return;
240 239
241 raw_spin_lock(&cputimer->lock); 240 atomic64_add(cputime, &cputimer->stime);
242 cputimer->cputime.stime += cputime;
243 raw_spin_unlock(&cputimer->lock);
244} 241}
245 242
246/** 243/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
261 if (!cputimer_running(tsk)) 258 if (!cputimer_running(tsk))
262 return; 259 return;
263 260
264 raw_spin_lock(&cputimer->lock); 261 atomic64_add(ns, &cputimer->sum_exec_runtime);
265 cputimer->cputime.sum_exec_runtime += ns;
266 raw_spin_unlock(&cputimer->lock);
267} 262}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e072d982f64c..d85730669410 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
196 return 0; 196 return 0;
197} 197}
198 198
199static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 199/*
200 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
201 * to avoid race conditions with concurrent updates to cputime.
202 */
203static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
200{ 204{
201 if (b->utime > a->utime) 205 u64 curr_cputime;
202 a->utime = b->utime; 206retry:
207 curr_cputime = atomic64_read(cputime);
208 if (sum_cputime > curr_cputime) {
209 if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
210 goto retry;
211 }
212}
203 213
204 if (b->stime > a->stime) 214static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
205 a->stime = b->stime; 215{
216 __update_gt_cputime(&cputimer->utime, sum->utime);
217 __update_gt_cputime(&cputimer->stime, sum->stime);
218 __update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
219}
206 220
207 if (b->sum_exec_runtime > a->sum_exec_runtime) 221/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
208 a->sum_exec_runtime = b->sum_exec_runtime; 222static inline void sample_group_cputimer(struct task_cputime *times,
223 struct thread_group_cputimer *cputimer)
224{
225 times->utime = atomic64_read(&cputimer->utime);
226 times->stime = atomic64_read(&cputimer->stime);
227 times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
209} 228}
210 229
211void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) 230void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
212{ 231{
213 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 232 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
214 struct task_cputime sum; 233 struct task_cputime sum;
215 unsigned long flags;
216 234
217 if (!cputimer->running) { 235 /* Check if cputimer isn't running. This is accessed without locking. */
236 if (!READ_ONCE(cputimer->running)) {
218 /* 237 /*
219 * The POSIX timer interface allows for absolute time expiry 238 * The POSIX timer interface allows for absolute time expiry
220 * values through the TIMER_ABSTIME flag, therefore we have 239 * values through the TIMER_ABSTIME flag, therefore we have
221 * to synchronize the timer to the clock every time we start 240 * to synchronize the timer to the clock every time we start it.
222 * it.
223 */ 241 */
224 thread_group_cputime(tsk, &sum); 242 thread_group_cputime(tsk, &sum);
225 raw_spin_lock_irqsave(&cputimer->lock, flags); 243 update_gt_cputime(cputimer, &sum);
226 cputimer->running = 1; 244
227 update_gt_cputime(&cputimer->cputime, &sum); 245 /*
228 } else 246 * We're setting cputimer->running without a lock. Ensure
229 raw_spin_lock_irqsave(&cputimer->lock, flags); 247 * this only gets written to in one operation. We set
230 *times = cputimer->cputime; 248 * running after update_gt_cputime() as a small optimization,
231 raw_spin_unlock_irqrestore(&cputimer->lock, flags); 249 * but barriers are not required because update_gt_cputime()
250 * can handle concurrent updates.
251 */
252 WRITE_ONCE(cputimer->running, 1);
253 }
254 sample_group_cputimer(times, cputimer);
232} 255}
233 256
234/* 257/*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
582 if (!task_cputime_zero(&tsk->cputime_expires)) 605 if (!task_cputime_zero(&tsk->cputime_expires))
583 return false; 606 return false;
584 607
585 if (tsk->signal->cputimer.running) 608 /* Check if cputimer is running. This is accessed without locking. */
609 if (READ_ONCE(tsk->signal->cputimer.running))
586 return false; 610 return false;
587 611
588 return true; 612 return true;
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
882 } 906 }
883} 907}
884 908
885static void stop_process_timers(struct signal_struct *sig) 909static inline void stop_process_timers(struct signal_struct *sig)
886{ 910{
887 struct thread_group_cputimer *cputimer = &sig->cputimer; 911 struct thread_group_cputimer *cputimer = &sig->cputimer;
888 unsigned long flags;
889 912
890 raw_spin_lock_irqsave(&cputimer->lock, flags); 913 /* Turn off cputimer->running. This is done without locking. */
891 cputimer->running = 0; 914 WRITE_ONCE(cputimer->running, 0);
892 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
893} 915}
894 916
895static u32 onecputick; 917static u32 onecputick;
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1111 } 1133 }
1112 1134
1113 sig = tsk->signal; 1135 sig = tsk->signal;
1114 if (sig->cputimer.running) { 1136 /* Check if cputimer is running. This is accessed without locking. */
1137 if (READ_ONCE(sig->cputimer.running)) {
1115 struct task_cputime group_sample; 1138 struct task_cputime group_sample;
1116 1139
1117 raw_spin_lock(&sig->cputimer.lock); 1140 sample_group_cputimer(&group_sample, &sig->cputimer);
1118 group_sample = sig->cputimer.cputime;
1119 raw_spin_unlock(&sig->cputimer.lock);
1120 1141
1121 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1142 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1122 return 1; 1143 return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1157 * If there are any active process wide timers (POSIX 1.b, itimers, 1178 * If there are any active process wide timers (POSIX 1.b, itimers,
1158 * RLIMIT_CPU) cputimer must be running. 1179 * RLIMIT_CPU) cputimer must be running.
1159 */ 1180 */
1160 if (tsk->signal->cputimer.running) 1181 if (READ_ONCE(tsk->signal->cputimer.running))
1161 check_process_timers(tsk, &firing); 1182 check_process_timers(tsk, &firing);
1162 1183
1163 /* 1184 /*