diff options
-rw-r--r-- | include/linux/perf_counter.h | 21 | ||||
-rw-r--r-- | kernel/perf_counter.c | 455 |
2 files changed, 415 insertions, 61 deletions
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7ab8e5f96f5b..33ba9fe0a781 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h | |||
@@ -14,6 +14,7 @@ | |||
14 | #define _LINUX_PERF_COUNTER_H | 14 | #define _LINUX_PERF_COUNTER_H |
15 | 15 | ||
16 | #include <asm/atomic.h> | 16 | #include <asm/atomic.h> |
17 | #include <asm/ioctl.h> | ||
17 | 18 | ||
18 | #ifdef CONFIG_PERF_COUNTERS | 19 | #ifdef CONFIG_PERF_COUNTERS |
19 | # include <asm/perf_counter.h> | 20 | # include <asm/perf_counter.h> |
@@ -95,6 +96,12 @@ struct perf_counter_hw_event { | |||
95 | }; | 96 | }; |
96 | 97 | ||
97 | /* | 98 | /* |
99 | * Ioctls that can be done on a perf counter fd: | ||
100 | */ | ||
101 | #define PERF_COUNTER_IOC_ENABLE _IO('$', 0) | ||
102 | #define PERF_COUNTER_IOC_DISABLE _IO('$', 1) | ||
103 | |||
104 | /* | ||
98 | * Kernel-internal data types: | 105 | * Kernel-internal data types: |
99 | */ | 106 | */ |
100 | 107 | ||
@@ -173,8 +180,10 @@ struct perf_counter { | |||
173 | struct file *filp; | 180 | struct file *filp; |
174 | 181 | ||
175 | struct perf_counter *parent; | 182 | struct perf_counter *parent; |
183 | struct list_head child_list; | ||
184 | |||
176 | /* | 185 | /* |
177 | * Protect attach/detach: | 186 | * Protect attach/detach and child_list: |
178 | */ | 187 | */ |
179 | struct mutex mutex; | 188 | struct mutex mutex; |
180 | 189 | ||
@@ -199,13 +208,21 @@ struct perf_counter { | |||
199 | struct perf_counter_context { | 208 | struct perf_counter_context { |
200 | #ifdef CONFIG_PERF_COUNTERS | 209 | #ifdef CONFIG_PERF_COUNTERS |
201 | /* | 210 | /* |
202 | * Protect the list of counters: | 211 | * Protect the states of the counters in the list, |
212 | * nr_active, and the list: | ||
203 | */ | 213 | */ |
204 | spinlock_t lock; | 214 | spinlock_t lock; |
215 | /* | ||
216 | * Protect the list of counters. Locking either mutex or lock | ||
217 | * is sufficient to ensure the list doesn't change; to change | ||
218 | * the list you need to lock both the mutex and the spinlock. | ||
219 | */ | ||
220 | struct mutex mutex; | ||
205 | 221 | ||
206 | struct list_head counter_list; | 222 | struct list_head counter_list; |
207 | int nr_counters; | 223 | int nr_counters; |
208 | int nr_active; | 224 | int nr_active; |
225 | int is_active; | ||
209 | struct task_struct *task; | 226 | struct task_struct *task; |
210 | #endif | 227 | #endif |
211 | }; | 228 | }; |
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index faf671b29566..1ac18daa424f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c | |||
@@ -112,6 +112,28 @@ counter_sched_out(struct perf_counter *counter, | |||
112 | cpuctx->exclusive = 0; | 112 | cpuctx->exclusive = 0; |
113 | } | 113 | } |
114 | 114 | ||
115 | static void | ||
116 | group_sched_out(struct perf_counter *group_counter, | ||
117 | struct perf_cpu_context *cpuctx, | ||
118 | struct perf_counter_context *ctx) | ||
119 | { | ||
120 | struct perf_counter *counter; | ||
121 | |||
122 | if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) | ||
123 | return; | ||
124 | |||
125 | counter_sched_out(group_counter, cpuctx, ctx); | ||
126 | |||
127 | /* | ||
128 | * Schedule out siblings (if any): | ||
129 | */ | ||
130 | list_for_each_entry(counter, &group_counter->sibling_list, list_entry) | ||
131 | counter_sched_out(counter, cpuctx, ctx); | ||
132 | |||
133 | if (group_counter->hw_event.exclusive) | ||
134 | cpuctx->exclusive = 0; | ||
135 | } | ||
136 | |||
115 | /* | 137 | /* |
116 | * Cross CPU call to remove a performance counter | 138 | * Cross CPU call to remove a performance counter |
117 | * | 139 | * |
@@ -168,7 +190,7 @@ static void __perf_counter_remove_from_context(void *info) | |||
168 | /* | 190 | /* |
169 | * Remove the counter from a task's (or a CPU's) list of counters. | 191 | * Remove the counter from a task's (or a CPU's) list of counters. |
170 | * | 192 | * |
171 | * Must be called with counter->mutex held. | 193 | * Must be called with counter->mutex and ctx->mutex held. |
172 | * | 194 | * |
173 | * CPU counters are removed with a smp call. For task counters we only | 195 | * CPU counters are removed with a smp call. For task counters we only |
174 | * call when the task is on a CPU. | 196 | * call when the task is on a CPU. |
@@ -215,6 +237,99 @@ retry: | |||
215 | spin_unlock_irq(&ctx->lock); | 237 | spin_unlock_irq(&ctx->lock); |
216 | } | 238 | } |
217 | 239 | ||
240 | /* | ||
241 | * Cross CPU call to disable a performance counter | ||
242 | */ | ||
243 | static void __perf_counter_disable(void *info) | ||
244 | { | ||
245 | struct perf_counter *counter = info; | ||
246 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
247 | struct perf_counter_context *ctx = counter->ctx; | ||
248 | unsigned long flags; | ||
249 | |||
250 | /* | ||
251 | * If this is a per-task counter, need to check whether this | ||
252 | * counter's task is the current task on this cpu. | ||
253 | */ | ||
254 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
255 | return; | ||
256 | |||
257 | curr_rq_lock_irq_save(&flags); | ||
258 | spin_lock(&ctx->lock); | ||
259 | |||
260 | /* | ||
261 | * If the counter is on, turn it off. | ||
262 | * If it is in error state, leave it in error state. | ||
263 | */ | ||
264 | if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { | ||
265 | if (counter == counter->group_leader) | ||
266 | group_sched_out(counter, cpuctx, ctx); | ||
267 | else | ||
268 | counter_sched_out(counter, cpuctx, ctx); | ||
269 | counter->state = PERF_COUNTER_STATE_OFF; | ||
270 | } | ||
271 | |||
272 | spin_unlock(&ctx->lock); | ||
273 | curr_rq_unlock_irq_restore(&flags); | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * Disable a counter. | ||
278 | */ | ||
279 | static void perf_counter_disable(struct perf_counter *counter) | ||
280 | { | ||
281 | struct perf_counter_context *ctx = counter->ctx; | ||
282 | struct task_struct *task = ctx->task; | ||
283 | |||
284 | if (!task) { | ||
285 | /* | ||
286 | * Disable the counter on the cpu that it's on | ||
287 | */ | ||
288 | smp_call_function_single(counter->cpu, __perf_counter_disable, | ||
289 | counter, 1); | ||
290 | return; | ||
291 | } | ||
292 | |||
293 | retry: | ||
294 | task_oncpu_function_call(task, __perf_counter_disable, counter); | ||
295 | |||
296 | spin_lock_irq(&ctx->lock); | ||
297 | /* | ||
298 | * If the counter is still active, we need to retry the cross-call. | ||
299 | */ | ||
300 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) { | ||
301 | spin_unlock_irq(&ctx->lock); | ||
302 | goto retry; | ||
303 | } | ||
304 | |||
305 | /* | ||
306 | * Since we have the lock this context can't be scheduled | ||
307 | * in, so we can change the state safely. | ||
308 | */ | ||
309 | if (counter->state == PERF_COUNTER_STATE_INACTIVE) | ||
310 | counter->state = PERF_COUNTER_STATE_OFF; | ||
311 | |||
312 | spin_unlock_irq(&ctx->lock); | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * Disable a counter and all its children. | ||
317 | */ | ||
318 | static void perf_counter_disable_family(struct perf_counter *counter) | ||
319 | { | ||
320 | struct perf_counter *child; | ||
321 | |||
322 | perf_counter_disable(counter); | ||
323 | |||
324 | /* | ||
325 | * Lock the mutex to protect the list of children | ||
326 | */ | ||
327 | mutex_lock(&counter->mutex); | ||
328 | list_for_each_entry(child, &counter->child_list, child_list) | ||
329 | perf_counter_disable(child); | ||
330 | mutex_unlock(&counter->mutex); | ||
331 | } | ||
332 | |||
218 | static int | 333 | static int |
219 | counter_sched_in(struct perf_counter *counter, | 334 | counter_sched_in(struct perf_counter *counter, |
220 | struct perf_cpu_context *cpuctx, | 335 | struct perf_cpu_context *cpuctx, |
@@ -302,6 +417,7 @@ static void __perf_install_in_context(void *info) | |||
302 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 417 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
303 | struct perf_counter *counter = info; | 418 | struct perf_counter *counter = info; |
304 | struct perf_counter_context *ctx = counter->ctx; | 419 | struct perf_counter_context *ctx = counter->ctx; |
420 | struct perf_counter *leader = counter->group_leader; | ||
305 | int cpu = smp_processor_id(); | 421 | int cpu = smp_processor_id(); |
306 | unsigned long flags; | 422 | unsigned long flags; |
307 | u64 perf_flags; | 423 | u64 perf_flags; |
@@ -328,22 +444,39 @@ static void __perf_install_in_context(void *info) | |||
328 | ctx->nr_counters++; | 444 | ctx->nr_counters++; |
329 | 445 | ||
330 | /* | 446 | /* |
447 | * Don't put the counter on if it is disabled or if | ||
448 | * it is in a group and the group isn't on. | ||
449 | */ | ||
450 | if (counter->state != PERF_COUNTER_STATE_INACTIVE || | ||
451 | (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) | ||
452 | goto unlock; | ||
453 | |||
454 | /* | ||
331 | * An exclusive counter can't go on if there are already active | 455 | * An exclusive counter can't go on if there are already active |
332 | * hardware counters, and no hardware counter can go on if there | 456 | * hardware counters, and no hardware counter can go on if there |
333 | * is already an exclusive counter on. | 457 | * is already an exclusive counter on. |
334 | */ | 458 | */ |
335 | if (counter->state == PERF_COUNTER_STATE_INACTIVE && | 459 | if (!group_can_go_on(counter, cpuctx, 1)) |
336 | !group_can_go_on(counter, cpuctx, 1)) | ||
337 | err = -EEXIST; | 460 | err = -EEXIST; |
338 | else | 461 | else |
339 | err = counter_sched_in(counter, cpuctx, ctx, cpu); | 462 | err = counter_sched_in(counter, cpuctx, ctx, cpu); |
340 | 463 | ||
341 | if (err && counter->hw_event.pinned) | 464 | if (err) { |
342 | counter->state = PERF_COUNTER_STATE_ERROR; | 465 | /* |
466 | * This counter couldn't go on. If it is in a group | ||
467 | * then we have to pull the whole group off. | ||
468 | * If the counter group is pinned then put it in error state. | ||
469 | */ | ||
470 | if (leader != counter) | ||
471 | group_sched_out(leader, cpuctx, ctx); | ||
472 | if (leader->hw_event.pinned) | ||
473 | leader->state = PERF_COUNTER_STATE_ERROR; | ||
474 | } | ||
343 | 475 | ||
344 | if (!err && !ctx->task && cpuctx->max_pertask) | 476 | if (!err && !ctx->task && cpuctx->max_pertask) |
345 | cpuctx->max_pertask--; | 477 | cpuctx->max_pertask--; |
346 | 478 | ||
479 | unlock: | ||
347 | hw_perf_restore(perf_flags); | 480 | hw_perf_restore(perf_flags); |
348 | 481 | ||
349 | spin_unlock(&ctx->lock); | 482 | spin_unlock(&ctx->lock); |
@@ -359,6 +492,8 @@ static void __perf_install_in_context(void *info) | |||
359 | * If the counter is attached to a task which is on a CPU we use a smp | 492 | * If the counter is attached to a task which is on a CPU we use a smp |
360 | * call to enable it in the task context. The task might have been | 493 | * call to enable it in the task context. The task might have been |
361 | * scheduled away, but we check this in the smp call again. | 494 | * scheduled away, but we check this in the smp call again. |
495 | * | ||
496 | * Must be called with ctx->mutex held. | ||
362 | */ | 497 | */ |
363 | static void | 498 | static void |
364 | perf_install_in_context(struct perf_counter_context *ctx, | 499 | perf_install_in_context(struct perf_counter_context *ctx, |
@@ -387,7 +522,7 @@ retry: | |||
387 | /* | 522 | /* |
388 | * we need to retry the smp call. | 523 | * we need to retry the smp call. |
389 | */ | 524 | */ |
390 | if (ctx->nr_active && list_empty(&counter->list_entry)) { | 525 | if (ctx->is_active && list_empty(&counter->list_entry)) { |
391 | spin_unlock_irq(&ctx->lock); | 526 | spin_unlock_irq(&ctx->lock); |
392 | goto retry; | 527 | goto retry; |
393 | } | 528 | } |
@@ -404,26 +539,131 @@ retry: | |||
404 | spin_unlock_irq(&ctx->lock); | 539 | spin_unlock_irq(&ctx->lock); |
405 | } | 540 | } |
406 | 541 | ||
407 | static void | 542 | /* |
408 | group_sched_out(struct perf_counter *group_counter, | 543 | * Cross CPU call to enable a performance counter |
409 | struct perf_cpu_context *cpuctx, | 544 | */ |
410 | struct perf_counter_context *ctx) | 545 | static void __perf_counter_enable(void *info) |
411 | { | 546 | { |
412 | struct perf_counter *counter; | 547 | struct perf_counter *counter = info; |
548 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
549 | struct perf_counter_context *ctx = counter->ctx; | ||
550 | struct perf_counter *leader = counter->group_leader; | ||
551 | unsigned long flags; | ||
552 | int err; | ||
413 | 553 | ||
414 | if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) | 554 | /* |
555 | * If this is a per-task counter, need to check whether this | ||
556 | * counter's task is the current task on this cpu. | ||
557 | */ | ||
558 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
415 | return; | 559 | return; |
416 | 560 | ||
417 | counter_sched_out(group_counter, cpuctx, ctx); | 561 | curr_rq_lock_irq_save(&flags); |
562 | spin_lock(&ctx->lock); | ||
563 | |||
564 | if (counter->state >= PERF_COUNTER_STATE_INACTIVE) | ||
565 | goto unlock; | ||
566 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
418 | 567 | ||
419 | /* | 568 | /* |
420 | * Schedule out siblings (if any): | 569 | * If the counter is in a group and isn't the group leader, |
570 | * then don't put it on unless the group is on. | ||
421 | */ | 571 | */ |
422 | list_for_each_entry(counter, &group_counter->sibling_list, list_entry) | 572 | if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) |
423 | counter_sched_out(counter, cpuctx, ctx); | 573 | goto unlock; |
424 | 574 | ||
425 | if (group_counter->hw_event.exclusive) | 575 | if (!group_can_go_on(counter, cpuctx, 1)) |
426 | cpuctx->exclusive = 0; | 576 | err = -EEXIST; |
577 | else | ||
578 | err = counter_sched_in(counter, cpuctx, ctx, | ||
579 | smp_processor_id()); | ||
580 | |||
581 | if (err) { | ||
582 | /* | ||
583 | * If this counter can't go on and it's part of a | ||
584 | * group, then the whole group has to come off. | ||
585 | */ | ||
586 | if (leader != counter) | ||
587 | group_sched_out(leader, cpuctx, ctx); | ||
588 | if (leader->hw_event.pinned) | ||
589 | leader->state = PERF_COUNTER_STATE_ERROR; | ||
590 | } | ||
591 | |||
592 | unlock: | ||
593 | spin_unlock(&ctx->lock); | ||
594 | curr_rq_unlock_irq_restore(&flags); | ||
595 | } | ||
596 | |||
597 | /* | ||
598 | * Enable a counter. | ||
599 | */ | ||
600 | static void perf_counter_enable(struct perf_counter *counter) | ||
601 | { | ||
602 | struct perf_counter_context *ctx = counter->ctx; | ||
603 | struct task_struct *task = ctx->task; | ||
604 | |||
605 | if (!task) { | ||
606 | /* | ||
607 | * Enable the counter on the cpu that it's on | ||
608 | */ | ||
609 | smp_call_function_single(counter->cpu, __perf_counter_enable, | ||
610 | counter, 1); | ||
611 | return; | ||
612 | } | ||
613 | |||
614 | spin_lock_irq(&ctx->lock); | ||
615 | if (counter->state >= PERF_COUNTER_STATE_INACTIVE) | ||
616 | goto out; | ||
617 | |||
618 | /* | ||
619 | * If the counter is in error state, clear that first. | ||
620 | * That way, if we see the counter in error state below, we | ||
621 | * know that it has gone back into error state, as distinct | ||
622 | * from the task having been scheduled away before the | ||
623 | * cross-call arrived. | ||
624 | */ | ||
625 | if (counter->state == PERF_COUNTER_STATE_ERROR) | ||
626 | counter->state = PERF_COUNTER_STATE_OFF; | ||
627 | |||
628 | retry: | ||
629 | spin_unlock_irq(&ctx->lock); | ||
630 | task_oncpu_function_call(task, __perf_counter_enable, counter); | ||
631 | |||
632 | spin_lock_irq(&ctx->lock); | ||
633 | |||
634 | /* | ||
635 | * If the context is active and the counter is still off, | ||
636 | * we need to retry the cross-call. | ||
637 | */ | ||
638 | if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) | ||
639 | goto retry; | ||
640 | |||
641 | /* | ||
642 | * Since we have the lock this context can't be scheduled | ||
643 | * in, so we can change the state safely. | ||
644 | */ | ||
645 | if (counter->state == PERF_COUNTER_STATE_OFF) | ||
646 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
647 | out: | ||
648 | spin_unlock_irq(&ctx->lock); | ||
649 | } | ||
650 | |||
651 | /* | ||
652 | * Enable a counter and all its children. | ||
653 | */ | ||
654 | static void perf_counter_enable_family(struct perf_counter *counter) | ||
655 | { | ||
656 | struct perf_counter *child; | ||
657 | |||
658 | perf_counter_enable(counter); | ||
659 | |||
660 | /* | ||
661 | * Lock the mutex to protect the list of children | ||
662 | */ | ||
663 | mutex_lock(&counter->mutex); | ||
664 | list_for_each_entry(child, &counter->child_list, child_list) | ||
665 | perf_counter_enable(child); | ||
666 | mutex_unlock(&counter->mutex); | ||
427 | } | 667 | } |
428 | 668 | ||
429 | void __perf_counter_sched_out(struct perf_counter_context *ctx, | 669 | void __perf_counter_sched_out(struct perf_counter_context *ctx, |
@@ -432,16 +672,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, | |||
432 | struct perf_counter *counter; | 672 | struct perf_counter *counter; |
433 | u64 flags; | 673 | u64 flags; |
434 | 674 | ||
675 | spin_lock(&ctx->lock); | ||
676 | ctx->is_active = 0; | ||
435 | if (likely(!ctx->nr_counters)) | 677 | if (likely(!ctx->nr_counters)) |
436 | return; | 678 | goto out; |
437 | 679 | ||
438 | spin_lock(&ctx->lock); | ||
439 | flags = hw_perf_save_disable(); | 680 | flags = hw_perf_save_disable(); |
440 | if (ctx->nr_active) { | 681 | if (ctx->nr_active) { |
441 | list_for_each_entry(counter, &ctx->counter_list, list_entry) | 682 | list_for_each_entry(counter, &ctx->counter_list, list_entry) |
442 | group_sched_out(counter, cpuctx, ctx); | 683 | group_sched_out(counter, cpuctx, ctx); |
443 | } | 684 | } |
444 | hw_perf_restore(flags); | 685 | hw_perf_restore(flags); |
686 | out: | ||
445 | spin_unlock(&ctx->lock); | 687 | spin_unlock(&ctx->lock); |
446 | } | 688 | } |
447 | 689 | ||
@@ -528,10 +770,11 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, | |||
528 | u64 flags; | 770 | u64 flags; |
529 | int can_add_hw = 1; | 771 | int can_add_hw = 1; |
530 | 772 | ||
773 | spin_lock(&ctx->lock); | ||
774 | ctx->is_active = 1; | ||
531 | if (likely(!ctx->nr_counters)) | 775 | if (likely(!ctx->nr_counters)) |
532 | return; | 776 | goto out; |
533 | 777 | ||
534 | spin_lock(&ctx->lock); | ||
535 | flags = hw_perf_save_disable(); | 778 | flags = hw_perf_save_disable(); |
536 | 779 | ||
537 | /* | 780 | /* |
@@ -578,6 +821,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, | |||
578 | } | 821 | } |
579 | } | 822 | } |
580 | hw_perf_restore(flags); | 823 | hw_perf_restore(flags); |
824 | out: | ||
581 | spin_unlock(&ctx->lock); | 825 | spin_unlock(&ctx->lock); |
582 | } | 826 | } |
583 | 827 | ||
@@ -896,12 +1140,14 @@ static int perf_release(struct inode *inode, struct file *file) | |||
896 | 1140 | ||
897 | file->private_data = NULL; | 1141 | file->private_data = NULL; |
898 | 1142 | ||
1143 | mutex_lock(&ctx->mutex); | ||
899 | mutex_lock(&counter->mutex); | 1144 | mutex_lock(&counter->mutex); |
900 | 1145 | ||
901 | perf_counter_remove_from_context(counter); | 1146 | perf_counter_remove_from_context(counter); |
902 | put_context(ctx); | 1147 | put_context(ctx); |
903 | 1148 | ||
904 | mutex_unlock(&counter->mutex); | 1149 | mutex_unlock(&counter->mutex); |
1150 | mutex_unlock(&ctx->mutex); | ||
905 | 1151 | ||
906 | kfree(counter); | 1152 | kfree(counter); |
907 | 1153 | ||
@@ -1053,10 +1299,30 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
1053 | return events; | 1299 | return events; |
1054 | } | 1300 | } |
1055 | 1301 | ||
1302 | static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
1303 | { | ||
1304 | struct perf_counter *counter = file->private_data; | ||
1305 | int err = 0; | ||
1306 | |||
1307 | switch (cmd) { | ||
1308 | case PERF_COUNTER_IOC_ENABLE: | ||
1309 | perf_counter_enable_family(counter); | ||
1310 | break; | ||
1311 | case PERF_COUNTER_IOC_DISABLE: | ||
1312 | perf_counter_disable_family(counter); | ||
1313 | break; | ||
1314 | default: | ||
1315 | err = -ENOTTY; | ||
1316 | } | ||
1317 | return err; | ||
1318 | } | ||
1319 | |||
1056 | static const struct file_operations perf_fops = { | 1320 | static const struct file_operations perf_fops = { |
1057 | .release = perf_release, | 1321 | .release = perf_release, |
1058 | .read = perf_read, | 1322 | .read = perf_read, |
1059 | .poll = perf_poll, | 1323 | .poll = perf_poll, |
1324 | .unlocked_ioctl = perf_ioctl, | ||
1325 | .compat_ioctl = perf_ioctl, | ||
1060 | }; | 1326 | }; |
1061 | 1327 | ||
1062 | static int cpu_clock_perf_counter_enable(struct perf_counter *counter) | 1328 | static int cpu_clock_perf_counter_enable(struct perf_counter *counter) |
@@ -1348,6 +1614,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, | |||
1348 | INIT_LIST_HEAD(&counter->sibling_list); | 1614 | INIT_LIST_HEAD(&counter->sibling_list); |
1349 | init_waitqueue_head(&counter->waitq); | 1615 | init_waitqueue_head(&counter->waitq); |
1350 | 1616 | ||
1617 | INIT_LIST_HEAD(&counter->child_list); | ||
1618 | |||
1351 | counter->irqdata = &counter->data[0]; | 1619 | counter->irqdata = &counter->data[0]; |
1352 | counter->usrdata = &counter->data[1]; | 1620 | counter->usrdata = &counter->data[1]; |
1353 | counter->cpu = cpu; | 1621 | counter->cpu = cpu; |
@@ -1452,7 +1720,9 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, | |||
1452 | goto err_free_put_context; | 1720 | goto err_free_put_context; |
1453 | 1721 | ||
1454 | counter->filp = counter_file; | 1722 | counter->filp = counter_file; |
1723 | mutex_lock(&ctx->mutex); | ||
1455 | perf_install_in_context(ctx, counter, cpu); | 1724 | perf_install_in_context(ctx, counter, cpu); |
1725 | mutex_unlock(&ctx->mutex); | ||
1456 | 1726 | ||
1457 | fput_light(counter_file, fput_needed2); | 1727 | fput_light(counter_file, fput_needed2); |
1458 | 1728 | ||
@@ -1479,6 +1749,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, | |||
1479 | { | 1749 | { |
1480 | memset(ctx, 0, sizeof(*ctx)); | 1750 | memset(ctx, 0, sizeof(*ctx)); |
1481 | spin_lock_init(&ctx->lock); | 1751 | spin_lock_init(&ctx->lock); |
1752 | mutex_init(&ctx->mutex); | ||
1482 | INIT_LIST_HEAD(&ctx->counter_list); | 1753 | INIT_LIST_HEAD(&ctx->counter_list); |
1483 | ctx->task = task; | 1754 | ctx->task = task; |
1484 | } | 1755 | } |
@@ -1486,20 +1757,30 @@ __perf_counter_init_context(struct perf_counter_context *ctx, | |||
1486 | /* | 1757 | /* |
1487 | * inherit a counter from parent task to child task: | 1758 | * inherit a counter from parent task to child task: |
1488 | */ | 1759 | */ |
1489 | static int | 1760 | static struct perf_counter * |
1490 | inherit_counter(struct perf_counter *parent_counter, | 1761 | inherit_counter(struct perf_counter *parent_counter, |
1491 | struct task_struct *parent, | 1762 | struct task_struct *parent, |
1492 | struct perf_counter_context *parent_ctx, | 1763 | struct perf_counter_context *parent_ctx, |
1493 | struct task_struct *child, | 1764 | struct task_struct *child, |
1765 | struct perf_counter *group_leader, | ||
1494 | struct perf_counter_context *child_ctx) | 1766 | struct perf_counter_context *child_ctx) |
1495 | { | 1767 | { |
1496 | struct perf_counter *child_counter; | 1768 | struct perf_counter *child_counter; |
1497 | 1769 | ||
1770 | /* | ||
1771 | * Instead of creating recursive hierarchies of counters, | ||
1772 | * we link inherited counters back to the original parent, | ||
1773 | * which has a filp for sure, which we use as the reference | ||
1774 | * count: | ||
1775 | */ | ||
1776 | if (parent_counter->parent) | ||
1777 | parent_counter = parent_counter->parent; | ||
1778 | |||
1498 | child_counter = perf_counter_alloc(&parent_counter->hw_event, | 1779 | child_counter = perf_counter_alloc(&parent_counter->hw_event, |
1499 | parent_counter->cpu, NULL, | 1780 | parent_counter->cpu, group_leader, |
1500 | GFP_ATOMIC); | 1781 | GFP_KERNEL); |
1501 | if (!child_counter) | 1782 | if (!child_counter) |
1502 | return -ENOMEM; | 1783 | return NULL; |
1503 | 1784 | ||
1504 | /* | 1785 | /* |
1505 | * Link it up in the child's context: | 1786 | * Link it up in the child's context: |
@@ -1523,16 +1804,82 @@ inherit_counter(struct perf_counter *parent_counter, | |||
1523 | */ | 1804 | */ |
1524 | atomic_long_inc(&parent_counter->filp->f_count); | 1805 | atomic_long_inc(&parent_counter->filp->f_count); |
1525 | 1806 | ||
1807 | /* | ||
1808 | * Link this into the parent counter's child list | ||
1809 | */ | ||
1810 | mutex_lock(&parent_counter->mutex); | ||
1811 | list_add_tail(&child_counter->child_list, &parent_counter->child_list); | ||
1812 | |||
1813 | /* | ||
1814 | * Make the child state follow the state of the parent counter, | ||
1815 | * not its hw_event.disabled bit. We hold the parent's mutex, | ||
1816 | * so we won't race with perf_counter_{en,dis}able_family. | ||
1817 | */ | ||
1818 | if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) | ||
1819 | child_counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
1820 | else | ||
1821 | child_counter->state = PERF_COUNTER_STATE_OFF; | ||
1822 | |||
1823 | mutex_unlock(&parent_counter->mutex); | ||
1824 | |||
1825 | return child_counter; | ||
1826 | } | ||
1827 | |||
1828 | static int inherit_group(struct perf_counter *parent_counter, | ||
1829 | struct task_struct *parent, | ||
1830 | struct perf_counter_context *parent_ctx, | ||
1831 | struct task_struct *child, | ||
1832 | struct perf_counter_context *child_ctx) | ||
1833 | { | ||
1834 | struct perf_counter *leader; | ||
1835 | struct perf_counter *sub; | ||
1836 | |||
1837 | leader = inherit_counter(parent_counter, parent, parent_ctx, | ||
1838 | child, NULL, child_ctx); | ||
1839 | if (!leader) | ||
1840 | return -ENOMEM; | ||
1841 | list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { | ||
1842 | if (!inherit_counter(sub, parent, parent_ctx, | ||
1843 | child, leader, child_ctx)) | ||
1844 | return -ENOMEM; | ||
1845 | } | ||
1526 | return 0; | 1846 | return 0; |
1527 | } | 1847 | } |
1528 | 1848 | ||
1849 | static void sync_child_counter(struct perf_counter *child_counter, | ||
1850 | struct perf_counter *parent_counter) | ||
1851 | { | ||
1852 | u64 parent_val, child_val; | ||
1853 | |||
1854 | parent_val = atomic64_read(&parent_counter->count); | ||
1855 | child_val = atomic64_read(&child_counter->count); | ||
1856 | |||
1857 | /* | ||
1858 | * Add back the child's count to the parent's count: | ||
1859 | */ | ||
1860 | atomic64_add(child_val, &parent_counter->count); | ||
1861 | |||
1862 | /* | ||
1863 | * Remove this counter from the parent's list | ||
1864 | */ | ||
1865 | mutex_lock(&parent_counter->mutex); | ||
1866 | list_del_init(&child_counter->child_list); | ||
1867 | mutex_unlock(&parent_counter->mutex); | ||
1868 | |||
1869 | /* | ||
1870 | * Release the parent counter, if this was the last | ||
1871 | * reference to it. | ||
1872 | */ | ||
1873 | fput(parent_counter->filp); | ||
1874 | } | ||
1875 | |||
1529 | static void | 1876 | static void |
1530 | __perf_counter_exit_task(struct task_struct *child, | 1877 | __perf_counter_exit_task(struct task_struct *child, |
1531 | struct perf_counter *child_counter, | 1878 | struct perf_counter *child_counter, |
1532 | struct perf_counter_context *child_ctx) | 1879 | struct perf_counter_context *child_ctx) |
1533 | { | 1880 | { |
1534 | struct perf_counter *parent_counter; | 1881 | struct perf_counter *parent_counter; |
1535 | u64 parent_val, child_val; | 1882 | struct perf_counter *sub, *tmp; |
1536 | 1883 | ||
1537 | /* | 1884 | /* |
1538 | * If we do not self-reap then we have to wait for the | 1885 | * If we do not self-reap then we have to wait for the |
@@ -1561,7 +1908,7 @@ __perf_counter_exit_task(struct task_struct *child, | |||
1561 | 1908 | ||
1562 | cpuctx = &__get_cpu_var(perf_cpu_context); | 1909 | cpuctx = &__get_cpu_var(perf_cpu_context); |
1563 | 1910 | ||
1564 | counter_sched_out(child_counter, cpuctx, child_ctx); | 1911 | group_sched_out(child_counter, cpuctx, child_ctx); |
1565 | 1912 | ||
1566 | list_del_init(&child_counter->list_entry); | 1913 | list_del_init(&child_counter->list_entry); |
1567 | 1914 | ||
@@ -1577,26 +1924,23 @@ __perf_counter_exit_task(struct task_struct *child, | |||
1577 | * that are still around due to the child reference. These | 1924 | * that are still around due to the child reference. These |
1578 | * counters need to be zapped - but otherwise linger. | 1925 | * counters need to be zapped - but otherwise linger. |
1579 | */ | 1926 | */ |
1580 | if (!parent_counter) | 1927 | if (parent_counter) { |
1581 | return; | 1928 | sync_child_counter(child_counter, parent_counter); |
1582 | 1929 | list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, | |
1583 | parent_val = atomic64_read(&parent_counter->count); | 1930 | list_entry) { |
1584 | child_val = atomic64_read(&child_counter->count); | 1931 | if (sub->parent) |
1585 | 1932 | sync_child_counter(sub, sub->parent); | |
1586 | /* | 1933 | kfree(sub); |
1587 | * Add back the child's count to the parent's count: | 1934 | } |
1588 | */ | 1935 | } |
1589 | atomic64_add(child_val, &parent_counter->count); | ||
1590 | |||
1591 | fput(parent_counter->filp); | ||
1592 | 1936 | ||
1593 | kfree(child_counter); | 1937 | kfree(child_counter); |
1594 | } | 1938 | } |
1595 | 1939 | ||
1596 | /* | 1940 | /* |
1597 | * When a child task exist, feed back counter values to parent counters. | 1941 | * When a child task exits, feed back counter values to parent counters. |
1598 | * | 1942 | * |
1599 | * Note: we are running in child context, but the PID is not hashed | 1943 | * Note: we may be running in child context, but the PID is not hashed |
1600 | * anymore so new counters will not be added. | 1944 | * anymore so new counters will not be added. |
1601 | */ | 1945 | */ |
1602 | void perf_counter_exit_task(struct task_struct *child) | 1946 | void perf_counter_exit_task(struct task_struct *child) |
@@ -1620,9 +1964,8 @@ void perf_counter_exit_task(struct task_struct *child) | |||
1620 | void perf_counter_init_task(struct task_struct *child) | 1964 | void perf_counter_init_task(struct task_struct *child) |
1621 | { | 1965 | { |
1622 | struct perf_counter_context *child_ctx, *parent_ctx; | 1966 | struct perf_counter_context *child_ctx, *parent_ctx; |
1623 | struct perf_counter *counter, *parent_counter; | 1967 | struct perf_counter *counter; |
1624 | struct task_struct *parent = current; | 1968 | struct task_struct *parent = current; |
1625 | unsigned long flags; | ||
1626 | 1969 | ||
1627 | child_ctx = &child->perf_counter_ctx; | 1970 | child_ctx = &child->perf_counter_ctx; |
1628 | parent_ctx = &parent->perf_counter_ctx; | 1971 | parent_ctx = &parent->perf_counter_ctx; |
@@ -1641,32 +1984,22 @@ void perf_counter_init_task(struct task_struct *child) | |||
1641 | * Lock the parent list. No need to lock the child - not PID | 1984 | * Lock the parent list. No need to lock the child - not PID |
1642 | * hashed yet and not running, so nobody can access it. | 1985 | * hashed yet and not running, so nobody can access it. |
1643 | */ | 1986 | */ |
1644 | spin_lock_irqsave(&parent_ctx->lock, flags); | 1987 | mutex_lock(&parent_ctx->mutex); |
1645 | 1988 | ||
1646 | /* | 1989 | /* |
1647 | * We dont have to disable NMIs - we are only looking at | 1990 | * We dont have to disable NMIs - we are only looking at |
1648 | * the list, not manipulating it: | 1991 | * the list, not manipulating it: |
1649 | */ | 1992 | */ |
1650 | list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { | 1993 | list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { |
1651 | if (!counter->hw_event.inherit || counter->group_leader != counter) | 1994 | if (!counter->hw_event.inherit) |
1652 | continue; | 1995 | continue; |
1653 | 1996 | ||
1654 | /* | 1997 | if (inherit_group(counter, parent, |
1655 | * Instead of creating recursive hierarchies of counters, | ||
1656 | * we link inheritd counters back to the original parent, | ||
1657 | * which has a filp for sure, which we use as the reference | ||
1658 | * count: | ||
1659 | */ | ||
1660 | parent_counter = counter; | ||
1661 | if (counter->parent) | ||
1662 | parent_counter = counter->parent; | ||
1663 | |||
1664 | if (inherit_counter(parent_counter, parent, | ||
1665 | parent_ctx, child, child_ctx)) | 1998 | parent_ctx, child, child_ctx)) |
1666 | break; | 1999 | break; |
1667 | } | 2000 | } |
1668 | 2001 | ||
1669 | spin_unlock_irqrestore(&parent_ctx->lock, flags); | 2002 | mutex_unlock(&parent_ctx->mutex); |
1670 | } | 2003 | } |
1671 | 2004 | ||
1672 | static void __cpuinit perf_counter_init_cpu(int cpu) | 2005 | static void __cpuinit perf_counter_init_cpu(int cpu) |
@@ -1692,11 +2025,15 @@ static void __perf_counter_exit_cpu(void *info) | |||
1692 | 2025 | ||
1693 | list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) | 2026 | list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) |
1694 | __perf_counter_remove_from_context(counter); | 2027 | __perf_counter_remove_from_context(counter); |
1695 | |||
1696 | } | 2028 | } |
1697 | static void perf_counter_exit_cpu(int cpu) | 2029 | static void perf_counter_exit_cpu(int cpu) |
1698 | { | 2030 | { |
2031 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
2032 | struct perf_counter_context *ctx = &cpuctx->ctx; | ||
2033 | |||
2034 | mutex_lock(&ctx->mutex); | ||
1699 | smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); | 2035 | smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); |
2036 | mutex_unlock(&ctx->mutex); | ||
1700 | } | 2037 | } |
1701 | #else | 2038 | #else |
1702 | static inline void perf_counter_exit_cpu(int cpu) { } | 2039 | static inline void perf_counter_exit_cpu(int cpu) { } |