author    Paul Mackerras <paulus@samba.org>    2009-03-25 07:46:58 -0400
committer Ingo Molnar <mingo@elte.hu>          2009-04-06 03:30:36 -0400
commit    53cfbf593758916aac41db728f029986a62f1254 (patch)
tree      c58a9c0f6e3cc050235e736e288e268bdb1f37eb /kernel
parent    7730d8655880f41f2ea519aca2ca6a1413dfd2c9 (diff)
perf_counter: record time running and time enabled for each counter
Impact: new functionality

Currently, if there are more counters enabled than can fit on the CPU, the kernel will multiplex the counters on to the hardware using round-robin scheduling. That isn't too bad for sampling counters, but for counting counters it means that the value read from a counter represents some unknown fraction of the true count of events that occurred while the counter was enabled.

This remedies the situation by keeping track of how long each counter is enabled for, and how long it is actually on the cpu and counting events. These times are recorded in nanoseconds using the task clock for per-task counters and the cpu clock for per-cpu counters.

These values can be supplied to userspace on a read from the counter. Userspace requests that they be supplied after the counter value by setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field when creating the counter. (There is no way to change the read format after the counter is created, though it would be possible to add some way to do that.)

Using this information it is possible for userspace to scale the count it reads from the counter to get an estimate of the true count:

	true_count_estimate = count * total_time_enabled / total_time_running

This also lets userspace detect the situation where the counter never got to go on the cpu: total_time_running == 0.

This functionality has been requested by the PAPI developers, and will be generally needed for interpreting the count values from counting counters correctly.

In the implementation, this keeps 5 time values (in nanoseconds) for each counter: total_time_enabled and total_time_running are used when the counter is in state OFF or ERROR and for reporting back to userspace. When the counter is in state INACTIVE or ACTIVE, it is the tstamp_enabled, tstamp_running and tstamp_stopped values that are relevant, and total_time_enabled and total_time_running are determined from them. (tstamp_stopped is only used in INACTIVE state.) The reason for doing it like this is that it means that only counters being enabled or disabled at sched-in and sched-out time need to be updated. There are no new loops that iterate over all counters to update total_time_enabled or total_time_running.

This also keeps separate child_total_time_running and child_total_time_enabled fields that get added in when reporting the totals to userspace. They are separate fields so that they can be atomic. We don't want to use atomics for total_time_running, total_time_enabled etc., because then we would have to use atomic sequences to update them, which are slower than regular arithmetic and memory accesses.

It is possible to measure total_time_running by adding a task_clock counter to each group of counters, and total_time_enabled can be measured approximately with a top-level task_clock counter (though inaccuracies will creep in if you need to disable and enable groups since it is not possible in general to disable/enable the top-level task_clock counter simultaneously with another group). However, that adds extra overhead - I measured around 15% increase in the context switch latency reported by lat_ctx (from lmbench) when a task_clock counter was added to each of 2 groups, and around 25% increase when a task_clock counter was added to each of 4 groups. (In both cases a top-level task-clock counter was also added.)

In contrast, the code added in this commit gives better information with no overhead that I could measure (in fact in some cases I measured lower times with this code, but the differences were all less than one standard deviation).

[ v2: address review comments by Andrew Morton. ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
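As a rough userspace sketch (illustrative only, not part of this patch), a tool consuming the new read format could scale a counting counter's value as follows. It assumes fd refers to a counter created with both PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING set in hw_event.read_format, so that a read() returns three u64 values in the order perf_read_hw() below writes them (count, time enabled, time running):

#include <stdint.h>
#include <unistd.h>

/*
 * Hypothetical helper, not from this patch: read a counting counter's
 * value plus the two new time fields and form the scaled estimate
 * described in the commit message.
 *
 * Returns 0 on success, 1 if the counter never got to run on the cpu
 * (total_time_running == 0), -1 on read error.
 */
static int read_scaled_count(int fd, uint64_t *estimate)
{
	uint64_t values[3];	/* count, total_time_enabled, total_time_running */

	if (read(fd, values, sizeof(values)) != (ssize_t)sizeof(values))
		return -1;

	if (values[2] == 0) {
		/* counter was never scheduled onto the cpu */
		*estimate = 0;
		return 1;
	}

	/*
	 * true_count_estimate = count * total_time_enabled / total_time_running;
	 * a real tool might guard against 64-bit overflow of the product here.
	 */
	*estimate = values[0] * values[1] / values[2];
	return 0;
}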
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/perf_counter.c  157
1 file changed, 136 insertions, 21 deletions
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 95e02575546b..3b862a7988cd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -116,6 +116,7 @@ counter_sched_out(struct perf_counter *counter,
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_stopped = ctx->time_now;
 	counter->hw_ops->disable(counter);
 	counter->oncpu = -1;
 
@@ -252,6 +253,60 @@ retry:
 }
 
 /*
+ * Get the current time for this context.
+ * If this is a task context, we use the task's task clock,
+ * or for a per-cpu context, we use the cpu clock.
+ */
+static u64 get_context_time(struct perf_counter_context *ctx, int update)
+{
+	struct task_struct *curr = ctx->task;
+
+	if (!curr)
+		return cpu_clock(smp_processor_id());
+
+	return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime;
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_counter_context *ctx, int update)
+{
+	ctx->time_now = get_context_time(ctx, update) - ctx->time_lost;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a counter.
+ */
+static void update_counter_times(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	u64 run_end;
+
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		counter->total_time_enabled = ctx->time_now -
+			counter->tstamp_enabled;
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+			run_end = counter->tstamp_stopped;
+		else
+			run_end = ctx->time_now;
+		counter->total_time_running = run_end - counter->tstamp_running;
+	}
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all counters in a group.
+ */
+static void update_group_times(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	update_counter_times(leader);
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		update_counter_times(counter);
+}
+
+/*
  * Cross CPU call to disable a performance counter
  */
 static void __perf_counter_disable(void *info)
@@ -276,6 +331,8 @@ static void __perf_counter_disable(void *info)
 	 * If it is in error state, leave it in error state.
 	 */
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		update_context_time(ctx, 1);
+		update_counter_times(counter);
 		if (counter == counter->group_leader)
 			group_sched_out(counter, cpuctx, ctx);
 		else
@@ -320,8 +377,10 @@ static void perf_counter_disable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+		update_counter_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
+	}
 
 	spin_unlock_irq(&ctx->lock);
 }
@@ -366,6 +425,8 @@ counter_sched_in(struct perf_counter *counter,
 		return -EAGAIN;
 	}
 
+	counter->tstamp_running += ctx->time_now - counter->tstamp_stopped;
+
 	if (!is_software_counter(counter))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -425,6 +486,17 @@ static int group_can_go_on(struct perf_counter *counter,
 	return can_add_hw;
 }
 
+static void add_counter_to_ctx(struct perf_counter *counter,
+			       struct perf_counter_context *ctx)
+{
+	list_add_counter(counter, ctx);
+	ctx->nr_counters++;
+	counter->prev_state = PERF_COUNTER_STATE_OFF;
+	counter->tstamp_enabled = ctx->time_now;
+	counter->tstamp_running = ctx->time_now;
+	counter->tstamp_stopped = ctx->time_now;
+}
+
 /*
  * Cross CPU call to install and enable a performance counter
  */
@@ -449,6 +521,7 @@ static void __perf_install_in_context(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
+	update_context_time(ctx, 1);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -456,9 +529,7 @@ static void __perf_install_in_context(void *info)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_add_counter(counter, ctx);
-	ctx->nr_counters++;
-	counter->prev_state = PERF_COUNTER_STATE_OFF;
+	add_counter_to_ctx(counter, ctx);
 
 	/*
 	 * Don't put the counter on if it is disabled or if
@@ -486,8 +557,10 @@ __perf_install_in_context(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned)
+		if (leader->hw_event.pinned) {
+			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
 	if (!err && !ctx->task && cpuctx->max_pertask)
@@ -548,10 +621,8 @@ retry:
 	 * can add the counter safely, if it the call above did not
 	 * succeed.
 	 */
-	if (list_empty(&counter->list_entry)) {
-		list_add_counter(counter, ctx);
-		ctx->nr_counters++;
-	}
+	if (list_empty(&counter->list_entry))
+		add_counter_to_ctx(counter, ctx);
 	spin_unlock_irq(&ctx->lock);
 }
 
@@ -576,11 +647,13 @@ static void __perf_counter_enable(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
+	update_context_time(ctx, 1);
 
 	counter->prev_state = counter->state;
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled;
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -602,8 +675,10 @@ static void __perf_counter_enable(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned)
+		if (leader->hw_event.pinned) {
+			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
  unlock:
@@ -659,8 +734,11 @@ static void perf_counter_enable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_OFF)
+	if (counter->state == PERF_COUNTER_STATE_OFF) {
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled = ctx->time_now -
+			counter->total_time_enabled;
+	}
  out:
 	spin_unlock_irq(&ctx->lock);
 }
@@ -693,6 +771,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 	ctx->is_active = 0;
 	if (likely(!ctx->nr_counters))
 		goto out;
+	update_context_time(ctx, 0);
 
 	flags = hw_perf_save_disable();
 	if (ctx->nr_active) {
@@ -797,6 +876,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	if (likely(!ctx->nr_counters))
 		goto out;
 
+	/*
+	 * Add any time since the last sched_out to the lost time
+	 * so it doesn't get included in the total_time_enabled and
+	 * total_time_running measures for counters in the context.
+	 */
+	ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now;
+
 	flags = hw_perf_save_disable();
 
 	/*
@@ -817,8 +903,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		 * If this pinned group hasn't been scheduled,
 		 * put it in error state.
 		 */
-		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+			update_group_times(counter);
 			counter->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
@@ -902,8 +990,10 @@ int perf_counter_task_disable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ERROR)
+		if (counter->state != PERF_COUNTER_STATE_ERROR) {
+			update_group_times(counter);
 			counter->state = PERF_COUNTER_STATE_OFF;
+		}
 	}
 
 	hw_perf_restore(perf_flags);
@@ -946,6 +1036,8 @@ int perf_counter_task_enable(void)
 		if (counter->state > PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled = ctx->time_now -
+			counter->total_time_enabled;
 		counter->hw_event.disabled = 0;
 	}
 	hw_perf_restore(perf_flags);
@@ -1009,10 +1101,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 static void __read(void *info)
 {
 	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
 	curr_rq_lock_irq_save(&flags);
+	if (ctx->is_active)
+		update_context_time(ctx, 1);
 	counter->hw_ops->read(counter);
+	update_counter_times(counter);
 	curr_rq_unlock_irq_restore(&flags);
 }
 
@@ -1025,6 +1121,8 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		smp_call_function_single(counter->oncpu,
 					 __read, counter, 1);
+	} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+		update_counter_times(counter);
 	}
 
 	return atomic64_read(&counter->count);
@@ -1137,10 +1235,8 @@ static int perf_release(struct inode *inode, struct file *file)
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 cntval;
-
-	if (count < sizeof(cntval))
-		return -EINVAL;
+	u64 values[3];
+	int n;
 
 	/*
 	 * Return end-of-file for a read on a counter that is in
@@ -1151,10 +1247,24 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 		return 0;
 
 	mutex_lock(&counter->mutex);
-	cntval = perf_counter_read(counter);
+	values[0] = perf_counter_read(counter);
+	n = 1;
+	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
 	mutex_unlock(&counter->mutex);
 
-	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
+	if (count < n * sizeof(u64))
+		return -EINVAL;
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
 }
 
 static ssize_t
@@ -2290,8 +2400,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	 * Link it up in the child's context:
 	 */
 	child_counter->task = child;
-	list_add_counter(child_counter, child_ctx);
-	child_ctx->nr_counters++;
+	add_counter_to_ctx(child_counter, child_ctx);
 
 	child_counter->parent = parent_counter;
 	/*
@@ -2361,6 +2470,10 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	 * Add back the child's count to the parent's count:
 	 */
 	atomic64_add(child_val, &parent_counter->count);
+	atomic64_add(child_counter->total_time_enabled,
+		     &parent_counter->child_total_time_enabled);
+	atomic64_add(child_counter->total_time_running,
+		     &parent_counter->child_total_time_running);
 
 	/*
 	 * Remove this counter from the parent's list
@@ -2395,6 +2508,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	if (child != current) {
 		wait_task_inactive(child, 0);
 		list_del_init(&child_counter->list_entry);
+		update_counter_times(child_counter);
 	} else {
 		struct perf_cpu_context *cpuctx;
 		unsigned long flags;
@@ -2412,6 +2526,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	cpuctx = &__get_cpu_var(perf_cpu_context);
 
 	group_sched_out(child_counter, cpuctx, child_ctx);
+	update_counter_times(child_counter);
 
 	list_del_init(&child_counter->list_entry);
 