aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Peter Zijlstra <a.p.zijlstra@chello.nl> 2009-02-05 06:24:16 -0500
committer Ingo Molnar <mingo@elte.hu> 2009-02-05 07:04:33 -0500
commit 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53 (patch)
tree b1e580d5284648d6884e951d995509094a92cca4
parent 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 (diff)
timers: split process wide cpu clocks/timers
Change the process wide cpu timers/clocks so that we: 1) don't mess up the kernel with too many threads, 2) don't have a per-cpu allocation for each process, 3) have no impact when not used. In order to accomplish this we're going to split it into two parts: - clocks; which can take all the time they want since they run from user context -- i.e. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID) - timers; which need constant time sampling but since they're explicitly used, the user can pay the overhead. The clock readout will go back to a full sum of the thread group, while the timers will run off a global 'clock' that only runs when needed, so only programs that make use of the facility pay the price. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Reviewed-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- include/linux/init_task.h 11
-rw-r--r-- include/linux/sched.h 54
-rw-r--r-- kernel/itimer.c 4
-rw-r--r-- kernel/posix-cpu-timers.c 95
-rw-r--r-- kernel/sched_stats.h 45
5 files changed, 155 insertions, 54 deletions
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index ea0ea1a4c36f..e752d973fa21 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -48,12 +48,11 @@ extern struct fs_struct init_fs;
48 .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ 48 .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \
49 .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ 49 .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
50 .rlim = INIT_RLIMITS, \ 50 .rlim = INIT_RLIMITS, \
51 .cputime = { .totals = { \ 51 .cputimer = { \
52 .utime = cputime_zero, \ 52 .cputime = INIT_CPUTIME, \
53 .stime = cputime_zero, \ 53 .running = 0, \
54 .sum_exec_runtime = 0, \ 54 .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
55 .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \ 55 }, \
56 }, }, \
57} 56}
58 57
59extern struct nsproxy init_nsproxy; 58extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e0646a30314..082d7619b3a1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -443,7 +443,6 @@ struct pacct_struct {
443 * @utime: time spent in user mode, in &cputime_t units 443 * @utime: time spent in user mode, in &cputime_t units
444 * @stime: time spent in kernel mode, in &cputime_t units 444 * @stime: time spent in kernel mode, in &cputime_t units
445 * @sum_exec_runtime: total time spent on the CPU, in nanoseconds 445 * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
446 * @lock: lock for fields in this struct
447 * 446 *
448 * This structure groups together three kinds of CPU time that are 447 * This structure groups together three kinds of CPU time that are
449 * tracked for threads and thread groups. Most things considering 448 * tracked for threads and thread groups. Most things considering
@@ -454,23 +453,33 @@ struct task_cputime {
454 cputime_t utime; 453 cputime_t utime;
455 cputime_t stime; 454 cputime_t stime;
456 unsigned long long sum_exec_runtime; 455 unsigned long long sum_exec_runtime;
457 spinlock_t lock;
458}; 456};
459/* Alternate field names when used to cache expirations. */ 457/* Alternate field names when used to cache expirations. */
460#define prof_exp stime 458#define prof_exp stime
461#define virt_exp utime 459#define virt_exp utime
462#define sched_exp sum_exec_runtime 460#define sched_exp sum_exec_runtime
463 461
462#define INIT_CPUTIME \
463 (struct task_cputime) { \
464 .utime = cputime_zero, \
465 .stime = cputime_zero, \
466 .sum_exec_runtime = 0, \
467 }
468
464/** 469/**
465 * struct thread_group_cputime - thread group interval timer counts 470 * struct thread_group_cputimer - thread group interval timer counts
466 * @totals: thread group interval timers; substructure for 471 * @cputime: thread group interval timers.
467 * uniprocessor kernel, per-cpu for SMP kernel. 472 * @running: non-zero when there are timers running and
473 * @cputime receives updates.
474 * @lock: lock for fields in this struct.
468 * 475 *
469 * This structure contains the version of task_cputime, above, that is 476 * This structure contains the version of task_cputime, above, that is
470 * used for thread group CPU clock calculations. 477 * used for thread group CPU timer calculations.
471 */ 478 */
472struct thread_group_cputime { 479struct thread_group_cputimer {
473 struct task_cputime totals; 480 struct task_cputime cputime;
481 int running;
482 spinlock_t lock;
474}; 483};
475 484
476/* 485/*
@@ -519,10 +528,10 @@ struct signal_struct {
519 cputime_t it_prof_incr, it_virt_incr; 528 cputime_t it_prof_incr, it_virt_incr;
520 529
521 /* 530 /*
522 * Thread group totals for process CPU clocks. 531 * Thread group totals for process CPU timers.
523 * See thread_group_cputime(), et al, for details. 532 * See thread_group_cputimer(), et al, for details.
524 */ 533 */
525 struct thread_group_cputime cputime; 534 struct thread_group_cputimer cputimer;
526 535
527 /* Earliest-expiration cache. */ 536 /* Earliest-expiration cache. */
528 struct task_cputime cputime_expires; 537 struct task_cputime cputime_expires;
@@ -2191,27 +2200,26 @@ static inline int spin_needbreak(spinlock_t *lock)
2191/* 2200/*
2192 * Thread group CPU time accounting. 2201 * Thread group CPU time accounting.
2193 */ 2202 */
2203void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
2194 2204
2195static inline 2205static inline
2196void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 2206void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
2197{ 2207{
2198 struct task_cputime *totals = &tsk->signal->cputime.totals; 2208 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
2199 unsigned long flags; 2209 unsigned long flags;
2200 2210
2201 spin_lock_irqsave(&totals->lock, flags); 2211 WARN_ON(!cputimer->running);
2202 *times = *totals; 2212
2203 spin_unlock_irqrestore(&totals->lock, flags); 2213 spin_lock_irqsave(&cputimer->lock, flags);
2214 *times = cputimer->cputime;
2215 spin_unlock_irqrestore(&cputimer->lock, flags);
2204} 2216}
2205 2217
2206static inline void thread_group_cputime_init(struct signal_struct *sig) 2218static inline void thread_group_cputime_init(struct signal_struct *sig)
2207{ 2219{
2208 sig->cputime.totals = (struct task_cputime){ 2220 sig->cputimer.cputime = INIT_CPUTIME;
2209 .utime = cputime_zero, 2221 spin_lock_init(&sig->cputimer.lock);
2210 .stime = cputime_zero, 2222 sig->cputimer.running = 0;
2211 .sum_exec_runtime = 0,
2212 };
2213
2214 spin_lock_init(&sig->cputime.totals.lock);
2215} 2223}
2216 2224
2217static inline void thread_group_cputime_free(struct signal_struct *sig) 2225static inline void thread_group_cputime_free(struct signal_struct *sig)
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 6a5fe93dd8bd..58762f7077ec 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value)
62 struct task_cputime cputime; 62 struct task_cputime cputime;
63 cputime_t utime; 63 cputime_t utime;
64 64
65 thread_group_cputime(tsk, &cputime); 65 thread_group_cputimer(tsk, &cputime);
66 utime = cputime.utime; 66 utime = cputime.utime;
67 if (cputime_le(cval, utime)) { /* about to fire */ 67 if (cputime_le(cval, utime)) { /* about to fire */
68 cval = jiffies_to_cputime(1); 68 cval = jiffies_to_cputime(1);
@@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value)
82 struct task_cputime times; 82 struct task_cputime times;
83 cputime_t ptime; 83 cputime_t ptime;
84 84
85 thread_group_cputime(tsk, &times); 85 thread_group_cputimer(tsk, &times);
86 ptime = cputime_add(times.utime, times.stime); 86 ptime = cputime_add(times.utime, times.stime);
87 if (cputime_le(cval, ptime)) { /* about to fire */ 87 if (cputime_le(cval, ptime)) { /* about to fire */
88 cval = jiffies_to_cputime(1); 88 cval = jiffies_to_cputime(1);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index fa07da94d7be..db107c9bbc05 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -230,6 +230,37 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
230 return 0; 230 return 0;
231} 231}
232 232
233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
234{
235 struct sighand_struct *sighand;
236 struct signal_struct *sig;
237 struct task_struct *t;
238
239 *times = INIT_CPUTIME;
240
241 rcu_read_lock();
242 sighand = rcu_dereference(tsk->sighand);
243 if (!sighand)
244 goto out;
245
246 sig = tsk->signal;
247
248 t = tsk;
249 do {
250 times->utime = cputime_add(times->utime, t->utime);
251 times->stime = cputime_add(times->stime, t->stime);
252 times->sum_exec_runtime += t->se.sum_exec_runtime;
253
254 t = next_thread(t);
255 } while (t != tsk);
256
257 times->utime = cputime_add(times->utime, sig->utime);
258 times->stime = cputime_add(times->stime, sig->stime);
259 times->sum_exec_runtime += sig->sum_sched_runtime;
260out:
261 rcu_read_unlock();
262}
263
233/* 264/*
234 * Sample a process (thread group) clock for the given group_leader task. 265 * Sample a process (thread group) clock for the given group_leader task.
235 * Must be called with tasklist_lock held for reading. 266 * Must be called with tasklist_lock held for reading.
@@ -476,6 +507,29 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
476} 507}
477 508
478/* 509/*
510 * Enable the process wide cpu timer accounting.
511 *
512 * serialized using ->sighand->siglock
513 */
514static void start_process_timers(struct task_struct *tsk)
515{
516 tsk->signal->cputimer.running = 1;
517 barrier();
518}
519
520/*
521 * Release the process wide timer accounting -- timer stops ticking when
522 * nobody cares about it.
523 *
524 * serialized using ->sighand->siglock
525 */
526static void stop_process_timers(struct task_struct *tsk)
527{
528 tsk->signal->cputimer.running = 0;
529 barrier();
530}
531
532/*
479 * Insert the timer on the appropriate list before any timers that 533 * Insert the timer on the appropriate list before any timers that
480 * expire later. This must be called with the tasklist_lock held 534 * expire later. This must be called with the tasklist_lock held
481 * for reading, and interrupts disabled. 535 * for reading, and interrupts disabled.
@@ -495,6 +549,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
495 BUG_ON(!irqs_disabled()); 549 BUG_ON(!irqs_disabled());
496 spin_lock(&p->sighand->siglock); 550 spin_lock(&p->sighand->siglock);
497 551
552 if (!CPUCLOCK_PERTHREAD(timer->it_clock))
553 start_process_timers(p);
554
498 listpos = head; 555 listpos = head;
499 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 556 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
500 list_for_each_entry(next, head, entry) { 557 list_for_each_entry(next, head, entry) {
@@ -987,13 +1044,15 @@ static void check_process_timers(struct task_struct *tsk,
987 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && 1044 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
988 list_empty(&timers[CPUCLOCK_VIRT]) && 1045 list_empty(&timers[CPUCLOCK_VIRT]) &&
989 cputime_eq(sig->it_virt_expires, cputime_zero) && 1046 cputime_eq(sig->it_virt_expires, cputime_zero) &&
990 list_empty(&timers[CPUCLOCK_SCHED])) 1047 list_empty(&timers[CPUCLOCK_SCHED])) {
1048 stop_process_timers(tsk);
991 return; 1049 return;
1050 }
992 1051
993 /* 1052 /*
994 * Collect the current process totals. 1053 * Collect the current process totals.
995 */ 1054 */
996 thread_group_cputime(tsk, &cputime); 1055 thread_group_cputimer(tsk, &cputime);
997 utime = cputime.utime; 1056 utime = cputime.utime;
998 ptime = cputime_add(utime, cputime.stime); 1057 ptime = cputime_add(utime, cputime.stime);
999 sum_sched_runtime = cputime.sum_exec_runtime; 1058 sum_sched_runtime = cputime.sum_exec_runtime;
@@ -1259,7 +1318,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1259 if (!task_cputime_zero(&sig->cputime_expires)) { 1318 if (!task_cputime_zero(&sig->cputime_expires)) {
1260 struct task_cputime group_sample; 1319 struct task_cputime group_sample;
1261 1320
1262 thread_group_cputime(tsk, &group_sample); 1321 thread_group_cputimer(tsk, &group_sample);
1263 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1322 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1264 return 1; 1323 return 1;
1265 } 1324 }
@@ -1329,6 +1388,33 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1329} 1388}
1330 1389
1331/* 1390/*
1391 * Sample a process (thread group) timer for the given group_leader task.
1392 * Must be called with tasklist_lock held for reading.
1393 */
1394static int cpu_timer_sample_group(const clockid_t which_clock,
1395 struct task_struct *p,
1396 union cpu_time_count *cpu)
1397{
1398 struct task_cputime cputime;
1399
1400 thread_group_cputimer(p, &cputime);
1401 switch (CPUCLOCK_WHICH(which_clock)) {
1402 default:
1403 return -EINVAL;
1404 case CPUCLOCK_PROF:
1405 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
1406 break;
1407 case CPUCLOCK_VIRT:
1408 cpu->cpu = cputime.utime;
1409 break;
1410 case CPUCLOCK_SCHED:
1411 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
1412 break;
1413 }
1414 return 0;
1415}
1416
1417/*
1332 * Set one of the process-wide special case CPU timers. 1418 * Set one of the process-wide special case CPU timers.
1333 * The tsk->sighand->siglock must be held by the caller. 1419 * The tsk->sighand->siglock must be held by the caller.
1334 * The *newval argument is relative and we update it to be absolute, *oldval 1420 * The *newval argument is relative and we update it to be absolute, *oldval
@@ -1341,7 +1427,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1341 struct list_head *head; 1427 struct list_head *head;
1342 1428
1343 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1429 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1344 cpu_clock_sample_group(clock_idx, tsk, &now); 1430 start_process_timers(tsk);
1431 cpu_timer_sample_group(clock_idx, tsk, &now);
1345 1432
1346 if (oldval) { 1433 if (oldval) {
1347 if (!cputime_eq(*oldval, cputime_zero)) { 1434 if (!cputime_eq(*oldval, cputime_zero)) {
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8ab0cef8ecab..a8f93dd374e1 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -296,19 +296,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
296static inline void account_group_user_time(struct task_struct *tsk, 296static inline void account_group_user_time(struct task_struct *tsk,
297 cputime_t cputime) 297 cputime_t cputime)
298{ 298{
299 struct task_cputime *times; 299 struct thread_group_cputimer *cputimer;
300 struct signal_struct *sig;
301 300
302 /* tsk == current, ensure it is safe to use ->signal */ 301 /* tsk == current, ensure it is safe to use ->signal */
303 if (unlikely(tsk->exit_state)) 302 if (unlikely(tsk->exit_state))
304 return; 303 return;
305 304
306 sig = tsk->signal; 305 cputimer = &tsk->signal->cputimer;
307 times = &sig->cputime.totals;
308 306
309 spin_lock(&times->lock); 307 if (!cputimer->running)
310 times->utime = cputime_add(times->utime, cputime); 308 return;
311 spin_unlock(&times->lock); 309
310 spin_lock(&cputimer->lock);
311 cputimer->cputime.utime =
312 cputime_add(cputimer->cputime.utime, cputime);
313 spin_unlock(&cputimer->lock);
312} 314}
313 315
314/** 316/**
@@ -324,19 +326,21 @@ static inline void account_group_user_time(struct task_struct *tsk,
324static inline void account_group_system_time(struct task_struct *tsk, 326static inline void account_group_system_time(struct task_struct *tsk,
325 cputime_t cputime) 327 cputime_t cputime)
326{ 328{
327 struct task_cputime *times; 329 struct thread_group_cputimer *cputimer;
328 struct signal_struct *sig;
329 330
330 /* tsk == current, ensure it is safe to use ->signal */ 331 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state)) 332 if (unlikely(tsk->exit_state))
332 return; 333 return;
333 334
334 sig = tsk->signal; 335 cputimer = &tsk->signal->cputimer;
335 times = &sig->cputime.totals; 336
337 if (!cputimer->running)
338 return;
336 339
337 spin_lock(&times->lock); 340 spin_lock(&cputimer->lock);
338 times->stime = cputime_add(times->stime, cputime); 341 cputimer->cputime.stime =
339 spin_unlock(&times->lock); 342 cputime_add(cputimer->cputime.stime, cputime);
343 spin_unlock(&cputimer->lock);
340} 344}
341 345
342/** 346/**
@@ -352,7 +356,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
352static inline void account_group_exec_runtime(struct task_struct *tsk, 356static inline void account_group_exec_runtime(struct task_struct *tsk,
353 unsigned long long ns) 357 unsigned long long ns)
354{ 358{
355 struct task_cputime *times; 359 struct thread_group_cputimer *cputimer;
356 struct signal_struct *sig; 360 struct signal_struct *sig;
357 361
358 sig = tsk->signal; 362 sig = tsk->signal;
@@ -361,9 +365,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
361 if (unlikely(!sig)) 365 if (unlikely(!sig))
362 return; 366 return;
363 367
364 times = &sig->cputime.totals; 368 cputimer = &sig->cputimer;
369
370 if (!cputimer->running)
371 return;
365 372
366 spin_lock(&times->lock); 373 spin_lock(&cputimer->lock);
367 times->sum_exec_runtime += ns; 374 cputimer->cputime.sum_exec_runtime += ns;
368 spin_unlock(&times->lock); 375 spin_unlock(&cputimer->lock);
369} 376}