author		Paul Mackerras <paulus@samba.org>	2009-01-14 05:00:30 -0500
committer	Paul Mackerras <paulus@samba.org>	2009-01-14 05:00:30 -0500
commit		3b6f9e5cb21964b7ce12bf81076f830885563ec8 (patch)
tree		e9d5ecffafa66cc3aeb259ade15a2611ad795327	/kernel/perf_counter.c
parent		01d0287f068de2934109ba9b989d8807526cccc2 (diff)
perf_counter: Add support for pinned and exclusive counter groups
Impact: New perf_counter features

A pinned counter group is one that the user wants to have on the CPU
whenever possible, i.e. whenever the associated task is running, for a
per-task group, or always for a per-cpu group. If the system cannot
satisfy that, it puts the group into an error state where it is not
scheduled any more and reads from it return EOF (i.e. 0 bytes read).
The group can be released from error state and made readable again
using prctl(PR_TASK_PERF_COUNTERS_ENABLE). When we have finer-grained
enable/disable controls on counters we'll be able to reset the error
state on individual groups.

An exclusive group is one that the user wants to be the only group
using the CPU performance monitor hardware whenever it is on. The
counter group scheduler will not schedule an exclusive group if there
are already other groups on the CPU and will not schedule other groups
onto the CPU if there is an exclusive group scheduled (that statement
does not apply to groups containing only software counters, which can
always go on and which do not prevent an exclusive group from going
on). With an exclusive group, we will be able to let users program PMU
registers at a low level without the concern that those settings will
perturb other measurements.

Along the way this reorganizes things a little:

- is_software_counter() is moved to perf_counter.h.
- cpuctx->active_oncpu now records the number of hardware counters on
  the CPU, i.e. it now excludes software counters. Nothing was reading
  cpuctx->active_oncpu before, so this change is harmless.
- A new cpuctx->exclusive field records whether we currently have an
  exclusive group on the CPU.
- counter_sched_out moves higher up in perf_counter.c and gets called
  from __perf_counter_remove_from_context and __perf_counter_exit_task,
  where we used to have essentially the same code.
- __perf_counter_sched_in now goes through the counter list twice, doing
  the pinned counters in the first loop and the non-pinned counters in
  the second loop, in order to give the pinned counters the best chance
  to be scheduled in.

Note that only a group leader can be exclusive or pinned, and that
attribute applies to the whole group. This avoids some awkwardness in
some corner cases (e.g. where a group leader is closed and the other
group members get added to the context list). If we want to relax that
restriction later, we can, and it is easier to relax a restriction than
to apply a new one.

This doesn't yet handle the case where a pinned counter is inherited
and goes into error state in the child - the error state is not
propagated up to the parent when the child exits, and arguably it
should.

Signed-off-by: Paul Mackerras <paulus@samba.org>
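For illustration only (not part of this commit): a minimal userspace sketch of the pinned-group semantics described above. It assumes the counter file descriptor was obtained from the perf_counter_open syscall of this era with hw_event.pinned set on the group leader, and that PR_TASK_PERF_COUNTERS_ENABLE carries the prctl value used by this patch series; both are assumptions, not definitions taken from this diff.

/*
 * Sketch only: how a user of a pinned counter group would observe the
 * new error-state behaviour.  read() returning 0 (EOF) means the pinned
 * group was put into PERF_COUNTER_STATE_ERROR; prctl with
 * PR_TASK_PERF_COUNTERS_ENABLE makes the task's counters schedulable
 * and readable again.
 */
#include <stdint.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef PR_TASK_PERF_COUNTERS_ENABLE
#define PR_TASK_PERF_COUNTERS_ENABLE	32	/* assumed value for this era */
#endif

/*
 * 'fd' is assumed to come from the perf_counter_open syscall with
 * hw_event.pinned = 1 on the group leader.
 */
static int read_pinned_counter(int fd, uint64_t *value)
{
	ssize_t n = read(fd, value, sizeof(*value));

	if (n == 0) {
		/* EOF: the pinned group went into error state. */
		if (prctl(PR_TASK_PERF_COUNTERS_ENABLE) < 0)
			return -1;
		n = read(fd, value, sizeof(*value));
	}
	return (n == (ssize_t)sizeof(*value)) ? 0 : -1;
}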
Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--	kernel/perf_counter.c	226
1 file changed, 154 insertions(+), 72 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52f2f526248e..faf671b29566 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -93,6 +93,25 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	}
 }
 
+static void
+counter_sched_out(struct perf_counter *counter,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_counter_context *ctx)
+{
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return;
+
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->hw_ops->disable(counter);
+	counter->oncpu = -1;
+
+	if (!is_software_counter(counter))
+		cpuctx->active_oncpu--;
+	ctx->nr_active--;
+	if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
+		cpuctx->exclusive = 0;
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
@@ -118,14 +137,9 @@ static void __perf_counter_remove_from_context(void *info)
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
 
-	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->hw_ops->disable(counter);
-		ctx->nr_active--;
-		cpuctx->active_oncpu--;
-		counter->task = NULL;
-		counter->oncpu = -1;
-	}
+	counter_sched_out(counter, cpuctx, ctx);
+
+	counter->task = NULL;
 	ctx->nr_counters--;
 
 	/*
@@ -207,7 +221,7 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
-	if (counter->state == PERF_COUNTER_STATE_OFF)
+	if (counter->state <= PERF_COUNTER_STATE_OFF)
 		return 0;
 
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
@@ -223,13 +237,64 @@ counter_sched_in(struct perf_counter *counter,
 		return -EAGAIN;
 	}
 
-	cpuctx->active_oncpu++;
+	if (!is_software_counter(counter))
+		cpuctx->active_oncpu++;
 	ctx->nr_active++;
 
+	if (counter->hw_event.exclusive)
+		cpuctx->exclusive = 1;
+
 	return 0;
 }
 
 /*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	if (!is_software_counter(leader))
+		return 0;
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		if (!is_software_counter(counter))
+			return 0;
+	return 1;
+}
+
+/*
+ * Work out whether we can put this counter group on the CPU now.
+ */
+static int group_can_go_on(struct perf_counter *counter,
+			   struct perf_cpu_context *cpuctx,
+			   int can_add_hw)
+{
+	/*
+	 * Groups consisting entirely of software counters can always go on.
+	 */
+	if (is_software_only_group(counter))
+		return 1;
+	/*
+	 * If an exclusive group is already on, no other hardware
+	 * counters can go on.
+	 */
+	if (cpuctx->exclusive)
+		return 0;
+	/*
+	 * If this group is exclusive and there are already
+	 * counters on the CPU, it can't go on.
+	 */
+	if (counter->hw_event.exclusive && cpuctx->active_oncpu)
+		return 0;
+	/*
+	 * Otherwise, try to add it if all previous groups were able
+	 * to go on.
+	 */
+	return can_add_hw;
+}
+
+/*
  * Cross CPU call to install and enable a performance counter
  */
 static void __perf_install_in_context(void *info)
@@ -240,6 +305,7 @@ static void __perf_install_in_context(void *info)
 	int cpu = smp_processor_id();
 	unsigned long flags;
 	u64 perf_flags;
+	int err;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -261,9 +327,21 @@ static void __perf_install_in_context(void *info)
 	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
 
-	counter_sched_in(counter, cpuctx, ctx, cpu);
+	/*
+	 * An exclusive counter can't go on if there are already active
+	 * hardware counters, and no hardware counter can go on if there
+	 * is already an exclusive counter on.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE &&
+	    !group_can_go_on(counter, cpuctx, 1))
+		err = -EEXIST;
+	else
+		err = counter_sched_in(counter, cpuctx, ctx, cpu);
+
+	if (err && counter->hw_event.pinned)
+		counter->state = PERF_COUNTER_STATE_ERROR;
 
-	if (!ctx->task && cpuctx->max_pertask)
+	if (!err && !ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
 	hw_perf_restore(perf_flags);
@@ -327,22 +405,6 @@ retry:
 }
 
 static void
-counter_sched_out(struct perf_counter *counter,
-		  struct perf_cpu_context *cpuctx,
-		  struct perf_counter_context *ctx)
-{
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-		return;
-
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->hw_ops->disable(counter);
-	counter->oncpu = -1;
-
-	cpuctx->active_oncpu--;
-	ctx->nr_active--;
-}
-
-static void
 group_sched_out(struct perf_counter *group_counter,
 		struct perf_cpu_context *cpuctx,
 		struct perf_counter_context *ctx)
@@ -359,6 +421,9 @@ group_sched_out(struct perf_counter *group_counter,
 	 */
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 		counter_sched_out(counter, cpuctx, ctx);
+
+	if (group_counter->hw_event.exclusive)
+		cpuctx->exclusive = 0;
 }
 
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
@@ -455,30 +520,6 @@ group_error:
 	return -EAGAIN;
 }
 
-/*
- * Return 1 for a software counter, 0 for a hardware counter
- */
-static inline int is_software_counter(struct perf_counter *counter)
-{
-	return !counter->hw_event.raw && counter->hw_event.type < 0;
-}
-
-/*
- * Return 1 for a group consisting entirely of software counters,
- * 0 if the group contains any hardware counters.
- */
-static int is_software_only_group(struct perf_counter *leader)
-{
-	struct perf_counter *counter;
-
-	if (!is_software_counter(leader))
-		return 0;
-	list_for_each_entry(counter, &leader->sibling_list, list_entry)
-		if (!is_software_counter(counter))
-			return 0;
-	return 1;
-}
-
 static void
 __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
@@ -492,22 +533,49 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 
 	spin_lock(&ctx->lock);
 	flags = hw_perf_save_disable();
+
+	/*
+	 * First go through the list and put on any pinned groups
+	 * in order to give them the best chance of going on.
+	 */
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state <= PERF_COUNTER_STATE_OFF ||
+		    !counter->hw_event.pinned)
+			continue;
+		if (counter->cpu != -1 && counter->cpu != cpu)
+			continue;
+
+		if (group_can_go_on(counter, cpuctx, 1))
+			group_sched_in(counter, cpuctx, ctx, cpu);
+
+		/*
+		 * If this pinned group hasn't been scheduled,
+		 * put it in error state.
+		 */
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+			counter->state = PERF_COUNTER_STATE_ERROR;
+	}
+
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		/*
+		 * Ignore counters in OFF or ERROR state, and
+		 * ignore pinned counters since we did them already.
+		 */
+		if (counter->state <= PERF_COUNTER_STATE_OFF ||
+		    counter->hw_event.pinned)
+			continue;
+
+		/*
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of counters:
 		 */
		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		/*
-		 * If we scheduled in a group atomically and exclusively,
-		 * or if this group can't go on, don't add any more
-		 * hardware counters.
-		 */
-		if (can_add_hw || is_software_only_group(counter))
+		if (group_can_go_on(counter, cpuctx, can_add_hw)) {
 			if (group_sched_in(counter, cpuctx, ctx, cpu))
 				can_add_hw = 0;
+		}
 	}
 	hw_perf_restore(flags);
 	spin_unlock(&ctx->lock);
@@ -567,8 +635,10 @@ int perf_counter_task_disable(void)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_for_each_entry(counter, &ctx->counter_list, list_entry)
-		counter->state = PERF_COUNTER_STATE_OFF;
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state != PERF_COUNTER_STATE_ERROR)
+			counter->state = PERF_COUNTER_STATE_OFF;
+	}
 
 	hw_perf_restore(perf_flags);
 
@@ -607,7 +677,7 @@ int perf_counter_task_enable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_OFF)
+		if (counter->state > PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		counter->hw_event.disabled = 0;
@@ -849,6 +919,14 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (count != sizeof(cntval))
 		return -EINVAL;
 
+	/*
+	 * Return end-of-file for a read on a counter that is in
+	 * error state (i.e. because it was pinned but it couldn't be
+	 * scheduled on to the CPU at some point).
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ERROR)
+		return 0;
+
 	mutex_lock(&counter->mutex);
 	cntval = perf_counter_read(counter);
 	mutex_unlock(&counter->mutex);
@@ -884,7 +962,7 @@ perf_read_irq_data(struct perf_counter *counter,
 {
 	struct perf_data *irqdata, *usrdata;
 	DECLARE_WAITQUEUE(wait, current);
-	ssize_t res;
+	ssize_t res, res2;
 
 	irqdata = counter->irqdata;
 	usrdata = counter->usrdata;
@@ -905,6 +983,9 @@ perf_read_irq_data(struct perf_counter *counter,
 		if (signal_pending(current))
 			break;
 
+		if (counter->state == PERF_COUNTER_STATE_ERROR)
+			break;
+
 		spin_unlock_irq(&counter->waitq.lock);
 		schedule();
 		spin_lock_irq(&counter->waitq.lock);
@@ -913,7 +994,8 @@ perf_read_irq_data(struct perf_counter *counter,
 	__set_current_state(TASK_RUNNING);
 	spin_unlock_irq(&counter->waitq.lock);
 
-	if (usrdata->len + irqdata->len < count)
+	if (usrdata->len + irqdata->len < count &&
+	    counter->state != PERF_COUNTER_STATE_ERROR)
 		return -ERESTARTSYS;
 read_pending:
 	mutex_lock(&counter->mutex);
@@ -925,11 +1007,12 @@ read_pending:
 
 	/* Switch irq buffer: */
 	usrdata = perf_switch_irq_data(counter);
-	if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
+	res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
+	if (res2 < 0) {
 		if (!res)
 			res = -EFAULT;
 	} else {
-		res = count;
+		res += res2;
 	}
 out:
 	mutex_unlock(&counter->mutex);
@@ -1348,6 +1431,11 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 		 */
 		if (group_leader->ctx != ctx)
 			goto err_put_context;
+		/*
+		 * Only a group leader can be exclusive or pinned
+		 */
+		if (hw_event.exclusive || hw_event.pinned)
+			goto err_put_context;
 	}
 
 	ret = -EINVAL;
@@ -1473,13 +1561,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
 	cpuctx = &__get_cpu_var(perf_cpu_context);
 
-	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-		child_counter->hw_ops->disable(child_counter);
-		cpuctx->active_oncpu--;
-		child_ctx->nr_active--;
-		child_counter->oncpu = -1;
-	}
+	counter_sched_out(child_counter, cpuctx, child_ctx);
 
 	list_del_init(&child_counter->list_entry);
 