diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2009-02-05 06:24:16 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-02-05 07:04:33 -0500 |
commit | 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53 (patch) | |
tree | b1e580d5284648d6884e951d995509094a92cca4 | |
parent | 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 (diff) |
timers: split process wide cpu clocks/timers
Change the process wide cpu timers/clocks so that we:
1) don't mess up the kernel with too many threads,
2) don't have a per-cpu allocation for each process,
3) have no impact when not used.
In order to accomplish this we're going to split it into two parts:
- clocks; which can take all the time they want since they run
from user context -- ie. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID)
- timers; which need constant time sampling but since they're
explicitly used, the user can pay the overhead.
The clock readout will go back to a full sum of the thread group, while the
timers will run off a global 'clock' that only runs when needed, so only
programs that make use of the facility pay the price.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | include/linux/init_task.h | 11 | ||||
-rw-r--r-- | include/linux/sched.h | 54 | ||||
-rw-r--r-- | kernel/itimer.c | 4 | ||||
-rw-r--r-- | kernel/posix-cpu-timers.c | 95 | ||||
-rw-r--r-- | kernel/sched_stats.h | 45 |
5 files changed, 155 insertions, 54 deletions
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index ea0ea1a4c36f..e752d973fa21 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h | |||
@@ -48,12 +48,11 @@ extern struct fs_struct init_fs; | |||
48 | .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ | 48 | .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ |
49 | .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ | 49 | .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ |
50 | .rlim = INIT_RLIMITS, \ | 50 | .rlim = INIT_RLIMITS, \ |
51 | .cputime = { .totals = { \ | 51 | .cputimer = { \ |
52 | .utime = cputime_zero, \ | 52 | .cputime = INIT_CPUTIME, \ |
53 | .stime = cputime_zero, \ | 53 | .running = 0, \ |
54 | .sum_exec_runtime = 0, \ | 54 | .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ |
55 | .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \ | 55 | }, \ |
56 | }, }, \ | ||
57 | } | 56 | } |
58 | 57 | ||
59 | extern struct nsproxy init_nsproxy; | 58 | extern struct nsproxy init_nsproxy; |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 2e0646a30314..082d7619b3a1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -443,7 +443,6 @@ struct pacct_struct { | |||
443 | * @utime: time spent in user mode, in &cputime_t units | 443 | * @utime: time spent in user mode, in &cputime_t units |
444 | * @stime: time spent in kernel mode, in &cputime_t units | 444 | * @stime: time spent in kernel mode, in &cputime_t units |
445 | * @sum_exec_runtime: total time spent on the CPU, in nanoseconds | 445 | * @sum_exec_runtime: total time spent on the CPU, in nanoseconds |
446 | * @lock: lock for fields in this struct | ||
447 | * | 446 | * |
448 | * This structure groups together three kinds of CPU time that are | 447 | * This structure groups together three kinds of CPU time that are |
449 | * tracked for threads and thread groups. Most things considering | 448 | * tracked for threads and thread groups. Most things considering |
@@ -454,23 +453,33 @@ struct task_cputime { | |||
454 | cputime_t utime; | 453 | cputime_t utime; |
455 | cputime_t stime; | 454 | cputime_t stime; |
456 | unsigned long long sum_exec_runtime; | 455 | unsigned long long sum_exec_runtime; |
457 | spinlock_t lock; | ||
458 | }; | 456 | }; |
459 | /* Alternate field names when used to cache expirations. */ | 457 | /* Alternate field names when used to cache expirations. */ |
460 | #define prof_exp stime | 458 | #define prof_exp stime |
461 | #define virt_exp utime | 459 | #define virt_exp utime |
462 | #define sched_exp sum_exec_runtime | 460 | #define sched_exp sum_exec_runtime |
463 | 461 | ||
462 | #define INIT_CPUTIME \ | ||
463 | (struct task_cputime) { \ | ||
464 | .utime = cputime_zero, \ | ||
465 | .stime = cputime_zero, \ | ||
466 | .sum_exec_runtime = 0, \ | ||
467 | } | ||
468 | |||
464 | /** | 469 | /** |
465 | * struct thread_group_cputime - thread group interval timer counts | 470 | * struct thread_group_cputimer - thread group interval timer counts |
466 | * @totals: thread group interval timers; substructure for | 471 | * @cputime: thread group interval timers. |
467 | * uniprocessor kernel, per-cpu for SMP kernel. | 472 | * @running: non-zero when there are timers running and |
473 | * @cputime receives updates. | ||
474 | * @lock: lock for fields in this struct. | ||
468 | * | 475 | * |
469 | * This structure contains the version of task_cputime, above, that is | 476 | * This structure contains the version of task_cputime, above, that is |
470 | * used for thread group CPU clock calculations. | 477 | * used for thread group CPU timer calculations. |
471 | */ | 478 | */ |
472 | struct thread_group_cputime { | 479 | struct thread_group_cputimer { |
473 | struct task_cputime totals; | 480 | struct task_cputime cputime; |
481 | int running; | ||
482 | spinlock_t lock; | ||
474 | }; | 483 | }; |
475 | 484 | ||
476 | /* | 485 | /* |
@@ -519,10 +528,10 @@ struct signal_struct { | |||
519 | cputime_t it_prof_incr, it_virt_incr; | 528 | cputime_t it_prof_incr, it_virt_incr; |
520 | 529 | ||
521 | /* | 530 | /* |
522 | * Thread group totals for process CPU clocks. | 531 | * Thread group totals for process CPU timers. |
523 | * See thread_group_cputime(), et al, for details. | 532 | * See thread_group_cputimer(), et al, for details. |
524 | */ | 533 | */ |
525 | struct thread_group_cputime cputime; | 534 | struct thread_group_cputimer cputimer; |
526 | 535 | ||
527 | /* Earliest-expiration cache. */ | 536 | /* Earliest-expiration cache. */ |
528 | struct task_cputime cputime_expires; | 537 | struct task_cputime cputime_expires; |
@@ -2191,27 +2200,26 @@ static inline int spin_needbreak(spinlock_t *lock) | |||
2191 | /* | 2200 | /* |
2192 | * Thread group CPU time accounting. | 2201 | * Thread group CPU time accounting. |
2193 | */ | 2202 | */ |
2203 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); | ||
2194 | 2204 | ||
2195 | static inline | 2205 | static inline |
2196 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | 2206 | void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) |
2197 | { | 2207 | { |
2198 | struct task_cputime *totals = &tsk->signal->cputime.totals; | 2208 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
2199 | unsigned long flags; | 2209 | unsigned long flags; |
2200 | 2210 | ||
2201 | spin_lock_irqsave(&totals->lock, flags); | 2211 | WARN_ON(!cputimer->running); |
2202 | *times = *totals; | 2212 | |
2203 | spin_unlock_irqrestore(&totals->lock, flags); | 2213 | spin_lock_irqsave(&cputimer->lock, flags); |
2214 | *times = cputimer->cputime; | ||
2215 | spin_unlock_irqrestore(&cputimer->lock, flags); | ||
2204 | } | 2216 | } |
2205 | 2217 | ||
2206 | static inline void thread_group_cputime_init(struct signal_struct *sig) | 2218 | static inline void thread_group_cputime_init(struct signal_struct *sig) |
2207 | { | 2219 | { |
2208 | sig->cputime.totals = (struct task_cputime){ | 2220 | sig->cputimer.cputime = INIT_CPUTIME; |
2209 | .utime = cputime_zero, | 2221 | spin_lock_init(&sig->cputimer.lock); |
2210 | .stime = cputime_zero, | 2222 | sig->cputimer.running = 0; |
2211 | .sum_exec_runtime = 0, | ||
2212 | }; | ||
2213 | |||
2214 | spin_lock_init(&sig->cputime.totals.lock); | ||
2215 | } | 2223 | } |
2216 | 2224 | ||
2217 | static inline void thread_group_cputime_free(struct signal_struct *sig) | 2225 | static inline void thread_group_cputime_free(struct signal_struct *sig) |
diff --git a/kernel/itimer.c b/kernel/itimer.c index 6a5fe93dd8bd..58762f7077ec 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c | |||
@@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value) | |||
62 | struct task_cputime cputime; | 62 | struct task_cputime cputime; |
63 | cputime_t utime; | 63 | cputime_t utime; |
64 | 64 | ||
65 | thread_group_cputime(tsk, &cputime); | 65 | thread_group_cputimer(tsk, &cputime); |
66 | utime = cputime.utime; | 66 | utime = cputime.utime; |
67 | if (cputime_le(cval, utime)) { /* about to fire */ | 67 | if (cputime_le(cval, utime)) { /* about to fire */ |
68 | cval = jiffies_to_cputime(1); | 68 | cval = jiffies_to_cputime(1); |
@@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value) | |||
82 | struct task_cputime times; | 82 | struct task_cputime times; |
83 | cputime_t ptime; | 83 | cputime_t ptime; |
84 | 84 | ||
85 | thread_group_cputime(tsk, &times); | 85 | thread_group_cputimer(tsk, &times); |
86 | ptime = cputime_add(times.utime, times.stime); | 86 | ptime = cputime_add(times.utime, times.stime); |
87 | if (cputime_le(cval, ptime)) { /* about to fire */ | 87 | if (cputime_le(cval, ptime)) { /* about to fire */ |
88 | cval = jiffies_to_cputime(1); | 88 | cval = jiffies_to_cputime(1); |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index fa07da94d7be..db107c9bbc05 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -230,6 +230,37 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, | |||
230 | return 0; | 230 | return 0; |
231 | } | 231 | } |
232 | 232 | ||
233 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | ||
234 | { | ||
235 | struct sighand_struct *sighand; | ||
236 | struct signal_struct *sig; | ||
237 | struct task_struct *t; | ||
238 | |||
239 | *times = INIT_CPUTIME; | ||
240 | |||
241 | rcu_read_lock(); | ||
242 | sighand = rcu_dereference(tsk->sighand); | ||
243 | if (!sighand) | ||
244 | goto out; | ||
245 | |||
246 | sig = tsk->signal; | ||
247 | |||
248 | t = tsk; | ||
249 | do { | ||
250 | times->utime = cputime_add(times->utime, t->utime); | ||
251 | times->stime = cputime_add(times->stime, t->stime); | ||
252 | times->sum_exec_runtime += t->se.sum_exec_runtime; | ||
253 | |||
254 | t = next_thread(t); | ||
255 | } while (t != tsk); | ||
256 | |||
257 | times->utime = cputime_add(times->utime, sig->utime); | ||
258 | times->stime = cputime_add(times->stime, sig->stime); | ||
259 | times->sum_exec_runtime += sig->sum_sched_runtime; | ||
260 | out: | ||
261 | rcu_read_unlock(); | ||
262 | } | ||
263 | |||
233 | /* | 264 | /* |
234 | * Sample a process (thread group) clock for the given group_leader task. | 265 | * Sample a process (thread group) clock for the given group_leader task. |
235 | * Must be called with tasklist_lock held for reading. | 266 | * Must be called with tasklist_lock held for reading. |
@@ -476,6 +507,29 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) | |||
476 | } | 507 | } |
477 | 508 | ||
478 | /* | 509 | /* |
510 | * Enable the process wide cpu timer accounting. | ||
511 | * | ||
512 | * serialized using ->sighand->siglock | ||
513 | */ | ||
514 | static void start_process_timers(struct task_struct *tsk) | ||
515 | { | ||
516 | tsk->signal->cputimer.running = 1; | ||
517 | barrier(); | ||
518 | } | ||
519 | |||
520 | /* | ||
521 | * Release the process wide timer accounting -- timer stops ticking when | ||
522 | * nobody cares about it. | ||
523 | * | ||
524 | * serialized using ->sighand->siglock | ||
525 | */ | ||
526 | static void stop_process_timers(struct task_struct *tsk) | ||
527 | { | ||
528 | tsk->signal->cputimer.running = 0; | ||
529 | barrier(); | ||
530 | } | ||
531 | |||
532 | /* | ||
479 | * Insert the timer on the appropriate list before any timers that | 533 | * Insert the timer on the appropriate list before any timers that |
480 | * expire later. This must be called with the tasklist_lock held | 534 | * expire later. This must be called with the tasklist_lock held |
481 | * for reading, and interrupts disabled. | 535 | * for reading, and interrupts disabled. |
@@ -495,6 +549,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) | |||
495 | BUG_ON(!irqs_disabled()); | 549 | BUG_ON(!irqs_disabled()); |
496 | spin_lock(&p->sighand->siglock); | 550 | spin_lock(&p->sighand->siglock); |
497 | 551 | ||
552 | if (!CPUCLOCK_PERTHREAD(timer->it_clock)) | ||
553 | start_process_timers(p); | ||
554 | |||
498 | listpos = head; | 555 | listpos = head; |
499 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { | 556 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { |
500 | list_for_each_entry(next, head, entry) { | 557 | list_for_each_entry(next, head, entry) { |
@@ -987,13 +1044,15 @@ static void check_process_timers(struct task_struct *tsk, | |||
987 | sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && | 1044 | sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && |
988 | list_empty(&timers[CPUCLOCK_VIRT]) && | 1045 | list_empty(&timers[CPUCLOCK_VIRT]) && |
989 | cputime_eq(sig->it_virt_expires, cputime_zero) && | 1046 | cputime_eq(sig->it_virt_expires, cputime_zero) && |
990 | list_empty(&timers[CPUCLOCK_SCHED])) | 1047 | list_empty(&timers[CPUCLOCK_SCHED])) { |
1048 | stop_process_timers(tsk); | ||
991 | return; | 1049 | return; |
1050 | } | ||
992 | 1051 | ||
993 | /* | 1052 | /* |
994 | * Collect the current process totals. | 1053 | * Collect the current process totals. |
995 | */ | 1054 | */ |
996 | thread_group_cputime(tsk, &cputime); | 1055 | thread_group_cputimer(tsk, &cputime); |
997 | utime = cputime.utime; | 1056 | utime = cputime.utime; |
998 | ptime = cputime_add(utime, cputime.stime); | 1057 | ptime = cputime_add(utime, cputime.stime); |
999 | sum_sched_runtime = cputime.sum_exec_runtime; | 1058 | sum_sched_runtime = cputime.sum_exec_runtime; |
@@ -1259,7 +1318,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
1259 | if (!task_cputime_zero(&sig->cputime_expires)) { | 1318 | if (!task_cputime_zero(&sig->cputime_expires)) { |
1260 | struct task_cputime group_sample; | 1319 | struct task_cputime group_sample; |
1261 | 1320 | ||
1262 | thread_group_cputime(tsk, &group_sample); | 1321 | thread_group_cputimer(tsk, &group_sample); |
1263 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) | 1322 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) |
1264 | return 1; | 1323 | return 1; |
1265 | } | 1324 | } |
@@ -1329,6 +1388,33 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1329 | } | 1388 | } |
1330 | 1389 | ||
1331 | /* | 1390 | /* |
1391 | * Sample a process (thread group) timer for the given group_leader task. | ||
1392 | * Must be called with tasklist_lock held for reading. | ||
1393 | */ | ||
1394 | static int cpu_timer_sample_group(const clockid_t which_clock, | ||
1395 | struct task_struct *p, | ||
1396 | union cpu_time_count *cpu) | ||
1397 | { | ||
1398 | struct task_cputime cputime; | ||
1399 | |||
1400 | thread_group_cputimer(p, &cputime); | ||
1401 | switch (CPUCLOCK_WHICH(which_clock)) { | ||
1402 | default: | ||
1403 | return -EINVAL; | ||
1404 | case CPUCLOCK_PROF: | ||
1405 | cpu->cpu = cputime_add(cputime.utime, cputime.stime); | ||
1406 | break; | ||
1407 | case CPUCLOCK_VIRT: | ||
1408 | cpu->cpu = cputime.utime; | ||
1409 | break; | ||
1410 | case CPUCLOCK_SCHED: | ||
1411 | cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); | ||
1412 | break; | ||
1413 | } | ||
1414 | return 0; | ||
1415 | } | ||
1416 | |||
1417 | /* | ||
1332 | * Set one of the process-wide special case CPU timers. | 1418 | * Set one of the process-wide special case CPU timers. |
1333 | * The tsk->sighand->siglock must be held by the caller. | 1419 | * The tsk->sighand->siglock must be held by the caller. |
1334 | * The *newval argument is relative and we update it to be absolute, *oldval | 1420 | * The *newval argument is relative and we update it to be absolute, *oldval |
@@ -1341,7 +1427,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1341 | struct list_head *head; | 1427 | struct list_head *head; |
1342 | 1428 | ||
1343 | BUG_ON(clock_idx == CPUCLOCK_SCHED); | 1429 | BUG_ON(clock_idx == CPUCLOCK_SCHED); |
1344 | cpu_clock_sample_group(clock_idx, tsk, &now); | 1430 | start_process_timers(tsk); |
1431 | cpu_timer_sample_group(clock_idx, tsk, &now); | ||
1345 | 1432 | ||
1346 | if (oldval) { | 1433 | if (oldval) { |
1347 | if (!cputime_eq(*oldval, cputime_zero)) { | 1434 | if (!cputime_eq(*oldval, cputime_zero)) { |
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8ab0cef8ecab..a8f93dd374e1 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -296,19 +296,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
296 | static inline void account_group_user_time(struct task_struct *tsk, | 296 | static inline void account_group_user_time(struct task_struct *tsk, |
297 | cputime_t cputime) | 297 | cputime_t cputime) |
298 | { | 298 | { |
299 | struct task_cputime *times; | 299 | struct thread_group_cputimer *cputimer; |
300 | struct signal_struct *sig; | ||
301 | 300 | ||
302 | /* tsk == current, ensure it is safe to use ->signal */ | 301 | /* tsk == current, ensure it is safe to use ->signal */ |
303 | if (unlikely(tsk->exit_state)) | 302 | if (unlikely(tsk->exit_state)) |
304 | return; | 303 | return; |
305 | 304 | ||
306 | sig = tsk->signal; | 305 | cputimer = &tsk->signal->cputimer; |
307 | times = &sig->cputime.totals; | ||
308 | 306 | ||
309 | spin_lock(&times->lock); | 307 | if (!cputimer->running) |
310 | times->utime = cputime_add(times->utime, cputime); | 308 | return; |
311 | spin_unlock(&times->lock); | 309 | |
310 | spin_lock(&cputimer->lock); | ||
311 | cputimer->cputime.utime = | ||
312 | cputime_add(cputimer->cputime.utime, cputime); | ||
313 | spin_unlock(&cputimer->lock); | ||
312 | } | 314 | } |
313 | 315 | ||
314 | /** | 316 | /** |
@@ -324,19 +326,21 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
324 | static inline void account_group_system_time(struct task_struct *tsk, | 326 | static inline void account_group_system_time(struct task_struct *tsk, |
325 | cputime_t cputime) | 327 | cputime_t cputime) |
326 | { | 328 | { |
327 | struct task_cputime *times; | 329 | struct thread_group_cputimer *cputimer; |
328 | struct signal_struct *sig; | ||
329 | 330 | ||
330 | /* tsk == current, ensure it is safe to use ->signal */ | 331 | /* tsk == current, ensure it is safe to use ->signal */ |
331 | if (unlikely(tsk->exit_state)) | 332 | if (unlikely(tsk->exit_state)) |
332 | return; | 333 | return; |
333 | 334 | ||
334 | sig = tsk->signal; | 335 | cputimer = &tsk->signal->cputimer; |
335 | times = &sig->cputime.totals; | 336 | |
337 | if (!cputimer->running) | ||
338 | return; | ||
336 | 339 | ||
337 | spin_lock(&times->lock); | 340 | spin_lock(&cputimer->lock); |
338 | times->stime = cputime_add(times->stime, cputime); | 341 | cputimer->cputime.stime = |
339 | spin_unlock(&times->lock); | 342 | cputime_add(cputimer->cputime.stime, cputime); |
343 | spin_unlock(&cputimer->lock); | ||
340 | } | 344 | } |
341 | 345 | ||
342 | /** | 346 | /** |
@@ -352,7 +356,7 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
352 | static inline void account_group_exec_runtime(struct task_struct *tsk, | 356 | static inline void account_group_exec_runtime(struct task_struct *tsk, |
353 | unsigned long long ns) | 357 | unsigned long long ns) |
354 | { | 358 | { |
355 | struct task_cputime *times; | 359 | struct thread_group_cputimer *cputimer; |
356 | struct signal_struct *sig; | 360 | struct signal_struct *sig; |
357 | 361 | ||
358 | sig = tsk->signal; | 362 | sig = tsk->signal; |
@@ -361,9 +365,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, | |||
361 | if (unlikely(!sig)) | 365 | if (unlikely(!sig)) |
362 | return; | 366 | return; |
363 | 367 | ||
364 | times = &sig->cputime.totals; | 368 | cputimer = &sig->cputimer; |
369 | |||
370 | if (!cputimer->running) | ||
371 | return; | ||
365 | 372 | ||
366 | spin_lock(&times->lock); | 373 | spin_lock(&cputimer->lock); |
367 | times->sum_exec_runtime += ns; | 374 | cputimer->cputime.sum_exec_runtime += ns; |
368 | spin_unlock(&times->lock); | 375 | spin_unlock(&cputimer->lock); |
369 | } | 376 | } |