author     Paul Mackerras <paulus@samba.org>    2009-01-17 02:10:22 -0500
committer  Paul Mackerras <paulus@samba.org>    2009-01-17 02:10:22 -0500
commit     d859e29fe34cb833071b20aef860ee94fbad9bb2 (patch)
tree       6359fe345851db2b7e8379fa65b7ed6a873d3ee3
parent     3b6f9e5cb21964b7ce12bf81076f830885563ec8 (diff)
perf_counter: Add counter enable/disable ioctls
Impact: New perf_counter features

This primarily adds a way for perf_counter users to enable and disable counters and groups. Enabling or disabling a counter or group also enables or disables all of the child counters that have been cloned from it to monitor children of the task monitored by the top-level counter.

The userspace interface to enable/disable counters is via ioctl on the counter file descriptor.

Along the way this extends the code that handles child counters to handle child counter groups properly. A group with multiple counters will be cloned to child tasks if and only if the group leader has the hw_event.inherit bit set - if it is set the whole group is cloned as a group in the child task.

In order to be able to enable or disable all child counters of a given top-level counter, we need a way to find them all. Hence I have added a child_list field to struct perf_counter, which is the head of the list of children for a top-level counter, or the link in that list for a child counter. That list is protected by the perf_counter.mutex field.

This also adds a mutex to the perf_counter_context struct. Previously the list of counters was protected just by the lock field in the context, which meant that perf_counter_init_task had to take that lock and then take whatever lock/mutex protects the top-level counter's child_list. But the counter enable/disable functions need to take that lock in order to traverse the list, then for each counter take the lock in that counter's context in order to change the counter's state safely, which would lead to a deadlock.

To solve this, we now have both a mutex and a spinlock in the context, and taking either is sufficient to ensure the list of counters can't change - you have to take both before changing the list. Now perf_counter_init_task takes the mutex instead of the lock (which incidentally means that inherit_counter can use GFP_KERNEL instead of GFP_ATOMIC) and thus avoids the possible deadlock. Similarly the new enable/disable functions can take the mutex while traversing the list of child counters without incurring a possible deadlock when the counter manipulation code locks the context for a child counter.

We also had a misfeature that the first counter added to a context would possibly not go on until the next sched-in, because we were using ctx->nr_active to detect if the context was running on a CPU. But nr_active is the number of active counters, and if that was zero (because the context didn't have any counters yet) it would look like the context wasn't running on a cpu and so the retry code in __perf_install_in_context wouldn't retry. So this adds an 'is_active' field that is set when the context is on a CPU, even if it has no counters. The is_active field is only used for task contexts, not for per-cpu contexts.

If we enable a subsidiary counter in a group that is active on a CPU, and the arch code can't enable the counter, then we have to pull the whole group off the CPU. We do this with group_sched_out, which gets moved up in the file so it comes before all its callers. This also adds similar logic to __perf_install_in_context so that the "all on, or none" invariant of groups is preserved when adding a new counter to a group.

Signed-off-by: Paul Mackerras <paulus@samba.org>
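[Editor's note: the following is an illustrative sketch, not part of the commit. It shows the locking rule described above in code form: holding either ctx->mutex or ctx->lock is enough to keep the counter list stable for traversal, while both must be held to modify it. The helper functions here are hypothetical.]

/* Hypothetical helpers illustrating the ctx->mutex / ctx->lock rule. */

/* Read side: holding either the mutex or the spinlock pins the list. */
static int count_counters(struct perf_counter_context *ctx)
{
        struct perf_counter *counter;
        int n = 0;

        mutex_lock(&ctx->mutex);        /* may sleep, so GFP_KERNEL allocations are usable */
        list_for_each_entry(counter, &ctx->counter_list, list_entry)
                n++;
        mutex_unlock(&ctx->mutex);
        return n;
}

/* Write side: take the mutex, then the spinlock, then modify the list. */
static void attach_counter(struct perf_counter_context *ctx,
                           struct perf_counter *counter)
{
        mutex_lock(&ctx->mutex);
        spin_lock_irq(&ctx->lock);
        list_add_tail(&counter->list_entry, &ctx->counter_list);
        ctx->nr_counters++;
        spin_unlock_irq(&ctx->lock);
        mutex_unlock(&ctx->mutex);
}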
-rw-r--r--  include/linux/perf_counter.h |  21
-rw-r--r--  kernel/perf_counter.c        | 455
2 files changed, 415 insertions, 61 deletions
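[Editor's note: a minimal userspace sketch, not part of the commit, showing how the new ioctls would be used. It assumes a counter file descriptor already obtained from the perf_counter_open syscall (invocation not shown) and assumes the perf_counter.h definitions of PERF_COUNTER_IOC_ENABLE/DISABLE are visible to userspace.]

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/perf_counter.h>   /* PERF_COUNTER_IOC_ENABLE / PERF_COUNTER_IOC_DISABLE */

/*
 * 'fd' is assumed to be a counter file descriptor returned by the
 * perf_counter_open syscall (not shown).  Enabling or disabling it also
 * enables or disables all child counters that were cloned from it.
 */
static int set_counter_enabled(int fd, int enable)
{
        unsigned int cmd = enable ? PERF_COUNTER_IOC_ENABLE
                                  : PERF_COUNTER_IOC_DISABLE;

        if (ioctl(fd, cmd, 0) < 0) {
                perror("perf counter ioctl");
                return -1;
        }
        return 0;
}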
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7ab8e5f96f5b..33ba9fe0a781 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -14,6 +14,7 @@
 #define _LINUX_PERF_COUNTER_H
 
 #include <asm/atomic.h>
+#include <asm/ioctl.h>
 
 #ifdef CONFIG_PERF_COUNTERS
 # include <asm/perf_counter.h>
@@ -95,6 +96,12 @@ struct perf_counter_hw_event {
 };
 
 /*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+
+/*
  * Kernel-internal data types:
  */
 
@@ -173,8 +180,10 @@ struct perf_counter {
         struct file             *filp;
 
         struct perf_counter     *parent;
+        struct list_head        child_list;
+
         /*
-         * Protect attach/detach:
+         * Protect attach/detach and child_list:
          */
         struct mutex            mutex;
 
@@ -199,13 +208,21 @@ struct perf_counter {
 struct perf_counter_context {
 #ifdef CONFIG_PERF_COUNTERS
         /*
-         * Protect the list of counters:
+         * Protect the states of the counters in the list,
+         * nr_active, and the list:
          */
         spinlock_t              lock;
+        /*
+         * Protect the list of counters.  Locking either mutex or lock
+         * is sufficient to ensure the list doesn't change; to change
+         * the list you need to lock both the mutex and the spinlock.
+         */
+        struct mutex            mutex;
 
         struct list_head        counter_list;
         int                     nr_counters;
         int                     nr_active;
+        int                     is_active;
         struct task_struct      *task;
 #endif
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index faf671b29566..1ac18daa424f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -112,6 +112,28 @@ counter_sched_out(struct perf_counter *counter,
         cpuctx->exclusive = 0;
 }
 
+static void
+group_sched_out(struct perf_counter *group_counter,
+                struct perf_cpu_context *cpuctx,
+                struct perf_counter_context *ctx)
+{
+        struct perf_counter *counter;
+
+        if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+                return;
+
+        counter_sched_out(group_counter, cpuctx, ctx);
+
+        /*
+         * Schedule out siblings (if any):
+         */
+        list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+                counter_sched_out(counter, cpuctx, ctx);
+
+        if (group_counter->hw_event.exclusive)
+                cpuctx->exclusive = 0;
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
@@ -168,7 +190,7 @@ static void __perf_counter_remove_from_context(void *info)
 /*
  * Remove the counter from a task's (or a CPU's) list of counters.
  *
- * Must be called with counter->mutex held.
+ * Must be called with counter->mutex and ctx->mutex held.
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
@@ -215,6 +237,99 @@ retry:
         spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Cross CPU call to disable a performance counter
+ */
+static void __perf_counter_disable(void *info)
+{
+        struct perf_counter *counter = info;
+        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+        struct perf_counter_context *ctx = counter->ctx;
+        unsigned long flags;
+
+        /*
+         * If this is a per-task counter, need to check whether this
+         * counter's task is the current task on this cpu.
+         */
+        if (ctx->task && cpuctx->task_ctx != ctx)
+                return;
+
+        curr_rq_lock_irq_save(&flags);
+        spin_lock(&ctx->lock);
+
+        /*
+         * If the counter is on, turn it off.
+         * If it is in error state, leave it in error state.
+         */
+        if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+                if (counter == counter->group_leader)
+                        group_sched_out(counter, cpuctx, ctx);
+                else
+                        counter_sched_out(counter, cpuctx, ctx);
+                counter->state = PERF_COUNTER_STATE_OFF;
+        }
+
+        spin_unlock(&ctx->lock);
+        curr_rq_unlock_irq_restore(&flags);
+}
+
+/*
+ * Disable a counter.
+ */
+static void perf_counter_disable(struct perf_counter *counter)
+{
+        struct perf_counter_context *ctx = counter->ctx;
+        struct task_struct *task = ctx->task;
+
+        if (!task) {
+                /*
+                 * Disable the counter on the cpu that it's on
+                 */
+                smp_call_function_single(counter->cpu, __perf_counter_disable,
+                                         counter, 1);
+                return;
+        }
+
+ retry:
+        task_oncpu_function_call(task, __perf_counter_disable, counter);
+
+        spin_lock_irq(&ctx->lock);
+        /*
+         * If the counter is still active, we need to retry the cross-call.
+         */
+        if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+                spin_unlock_irq(&ctx->lock);
+                goto retry;
+        }
+
+        /*
+         * Since we have the lock this context can't be scheduled
+         * in, so we can change the state safely.
+         */
+        if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+                counter->state = PERF_COUNTER_STATE_OFF;
+
+        spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Disable a counter and all its children.
+ */
+static void perf_counter_disable_family(struct perf_counter *counter)
+{
+        struct perf_counter *child;
+
+        perf_counter_disable(counter);
+
+        /*
+         * Lock the mutex to protect the list of children
+         */
+        mutex_lock(&counter->mutex);
+        list_for_each_entry(child, &counter->child_list, child_list)
+                perf_counter_disable(child);
+        mutex_unlock(&counter->mutex);
+}
+
 static int
 counter_sched_in(struct perf_counter *counter,
                  struct perf_cpu_context *cpuctx,
@@ -302,6 +417,7 @@ static void __perf_install_in_context(void *info)
         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_counter *counter = info;
         struct perf_counter_context *ctx = counter->ctx;
+        struct perf_counter *leader = counter->group_leader;
         int cpu = smp_processor_id();
         unsigned long flags;
         u64 perf_flags;
@@ -328,22 +444,39 @@ static void __perf_install_in_context(void *info)
         ctx->nr_counters++;
 
         /*
+         * Don't put the counter on if it is disabled or if
+         * it is in a group and the group isn't on.
+         */
+        if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
+            (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
+                goto unlock;
+
+        /*
          * An exclusive counter can't go on if there are already active
          * hardware counters, and no hardware counter can go on if there
          * is already an exclusive counter on.
          */
-        if (counter->state == PERF_COUNTER_STATE_INACTIVE &&
-            !group_can_go_on(counter, cpuctx, 1))
+        if (!group_can_go_on(counter, cpuctx, 1))
                 err = -EEXIST;
         else
                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
 
-        if (err && counter->hw_event.pinned)
-                counter->state = PERF_COUNTER_STATE_ERROR;
+        if (err) {
+                /*
+                 * This counter couldn't go on.  If it is in a group
+                 * then we have to pull the whole group off.
+                 * If the counter group is pinned then put it in error state.
+                 */
+                if (leader != counter)
+                        group_sched_out(leader, cpuctx, ctx);
+                if (leader->hw_event.pinned)
+                        leader->state = PERF_COUNTER_STATE_ERROR;
+        }
 
         if (!err && !ctx->task && cpuctx->max_pertask)
                 cpuctx->max_pertask--;
 
+ unlock:
         hw_perf_restore(perf_flags);
 
         spin_unlock(&ctx->lock);
@@ -359,6 +492,8 @@ static void __perf_install_in_context(void *info)
  * If the counter is attached to a task which is on a CPU we use a smp
  * call to enable it in the task context. The task might have been
  * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
  */
 static void
 perf_install_in_context(struct perf_counter_context *ctx,
@@ -387,7 +522,7 @@ retry:
         /*
          * we need to retry the smp call.
          */
-        if (ctx->nr_active && list_empty(&counter->list_entry)) {
+        if (ctx->is_active && list_empty(&counter->list_entry)) {
                 spin_unlock_irq(&ctx->lock);
                 goto retry;
         }
@@ -404,26 +539,131 @@ retry:
         spin_unlock_irq(&ctx->lock);
 }
 
-static void
-group_sched_out(struct perf_counter *group_counter,
-                struct perf_cpu_context *cpuctx,
-                struct perf_counter_context *ctx)
+/*
+ * Cross CPU call to enable a performance counter
+ */
+static void __perf_counter_enable(void *info)
 {
-        struct perf_counter *counter;
+        struct perf_counter *counter = info;
+        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+        struct perf_counter_context *ctx = counter->ctx;
+        struct perf_counter *leader = counter->group_leader;
+        unsigned long flags;
+        int err;
 
-        if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+        /*
+         * If this is a per-task counter, need to check whether this
+         * counter's task is the current task on this cpu.
+         */
+        if (ctx->task && cpuctx->task_ctx != ctx)
                 return;
 
-        counter_sched_out(group_counter, cpuctx, ctx);
+        curr_rq_lock_irq_save(&flags);
+        spin_lock(&ctx->lock);
+
+        if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+                goto unlock;
+        counter->state = PERF_COUNTER_STATE_INACTIVE;
 
         /*
-         * Schedule out siblings (if any):
+         * If the counter is in a group and isn't the group leader,
+         * then don't put it on unless the group is on.
          */
-        list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
-                counter_sched_out(counter, cpuctx, ctx);
+        if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
+                goto unlock;
 
-        if (group_counter->hw_event.exclusive)
-                cpuctx->exclusive = 0;
+        if (!group_can_go_on(counter, cpuctx, 1))
+                err = -EEXIST;
+        else
+                err = counter_sched_in(counter, cpuctx, ctx,
+                                       smp_processor_id());
+
+        if (err) {
+                /*
+                 * If this counter can't go on and it's part of a
+                 * group, then the whole group has to come off.
+                 */
+                if (leader != counter)
+                        group_sched_out(leader, cpuctx, ctx);
+                if (leader->hw_event.pinned)
+                        leader->state = PERF_COUNTER_STATE_ERROR;
+        }
+
+ unlock:
+        spin_unlock(&ctx->lock);
+        curr_rq_unlock_irq_restore(&flags);
+}
+
+/*
+ * Enable a counter.
+ */
+static void perf_counter_enable(struct perf_counter *counter)
+{
+        struct perf_counter_context *ctx = counter->ctx;
+        struct task_struct *task = ctx->task;
+
+        if (!task) {
+                /*
+                 * Enable the counter on the cpu that it's on
+                 */
+                smp_call_function_single(counter->cpu, __perf_counter_enable,
+                                         counter, 1);
+                return;
+        }
+
+        spin_lock_irq(&ctx->lock);
+        if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+                goto out;
+
+        /*
+         * If the counter is in error state, clear that first.
+         * That way, if we see the counter in error state below, we
+         * know that it has gone back into error state, as distinct
+         * from the task having been scheduled away before the
+         * cross-call arrived.
+         */
+        if (counter->state == PERF_COUNTER_STATE_ERROR)
+                counter->state = PERF_COUNTER_STATE_OFF;
+
+ retry:
+        spin_unlock_irq(&ctx->lock);
+        task_oncpu_function_call(task, __perf_counter_enable, counter);
+
+        spin_lock_irq(&ctx->lock);
+
+        /*
+         * If the context is active and the counter is still off,
+         * we need to retry the cross-call.
+         */
+        if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
+                goto retry;
+
+        /*
+         * Since we have the lock this context can't be scheduled
+         * in, so we can change the state safely.
+         */
+        if (counter->state == PERF_COUNTER_STATE_OFF)
+                counter->state = PERF_COUNTER_STATE_INACTIVE;
+ out:
+        spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Enable a counter and all its children.
+ */
+static void perf_counter_enable_family(struct perf_counter *counter)
+{
+        struct perf_counter *child;
+
+        perf_counter_enable(counter);
+
+        /*
+         * Lock the mutex to protect the list of children
+         */
+        mutex_lock(&counter->mutex);
+        list_for_each_entry(child, &counter->child_list, child_list)
+                perf_counter_enable(child);
+        mutex_unlock(&counter->mutex);
 }
 
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
@@ -432,16 +672,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
         struct perf_counter *counter;
         u64 flags;
 
+        spin_lock(&ctx->lock);
+        ctx->is_active = 0;
         if (likely(!ctx->nr_counters))
-                return;
+                goto out;
 
-        spin_lock(&ctx->lock);
         flags = hw_perf_save_disable();
         if (ctx->nr_active) {
                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
                         group_sched_out(counter, cpuctx, ctx);
         }
         hw_perf_restore(flags);
+ out:
         spin_unlock(&ctx->lock);
 }
 
@@ -528,10 +770,11 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
         u64 flags;
         int can_add_hw = 1;
 
+        spin_lock(&ctx->lock);
+        ctx->is_active = 1;
         if (likely(!ctx->nr_counters))
-                return;
+                goto out;
 
-        spin_lock(&ctx->lock);
         flags = hw_perf_save_disable();
 
         /*
@@ -578,6 +821,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
                 }
         }
         hw_perf_restore(flags);
+ out:
         spin_unlock(&ctx->lock);
 }
 
@@ -896,12 +1140,14 @@ static int perf_release(struct inode *inode, struct file *file)
 
         file->private_data = NULL;
 
+        mutex_lock(&ctx->mutex);
         mutex_lock(&counter->mutex);
 
         perf_counter_remove_from_context(counter);
         put_context(ctx);
 
         mutex_unlock(&counter->mutex);
+        mutex_unlock(&ctx->mutex);
 
         kfree(counter);
 
@@ -1053,10 +1299,30 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
         return events;
 }
 
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+        struct perf_counter *counter = file->private_data;
+        int err = 0;
+
+        switch (cmd) {
+        case PERF_COUNTER_IOC_ENABLE:
+                perf_counter_enable_family(counter);
+                break;
+        case PERF_COUNTER_IOC_DISABLE:
+                perf_counter_disable_family(counter);
+                break;
+        default:
+                err = -ENOTTY;
+        }
+        return err;
+}
+
 static const struct file_operations perf_fops = {
         .release                = perf_release,
         .read                   = perf_read,
         .poll                   = perf_poll,
+        .unlocked_ioctl         = perf_ioctl,
+        .compat_ioctl           = perf_ioctl,
 };
 
 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
@@ -1348,6 +1614,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
         INIT_LIST_HEAD(&counter->sibling_list);
         init_waitqueue_head(&counter->waitq);
 
+        INIT_LIST_HEAD(&counter->child_list);
+
         counter->irqdata        = &counter->data[0];
         counter->usrdata        = &counter->data[1];
         counter->cpu            = cpu;
@@ -1452,7 +1720,9 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
                 goto err_free_put_context;
 
         counter->filp = counter_file;
+        mutex_lock(&ctx->mutex);
         perf_install_in_context(ctx, counter, cpu);
+        mutex_unlock(&ctx->mutex);
 
         fput_light(counter_file, fput_needed2);
 
@@ -1479,6 +1749,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 {
         memset(ctx, 0, sizeof(*ctx));
         spin_lock_init(&ctx->lock);
+        mutex_init(&ctx->mutex);
         INIT_LIST_HEAD(&ctx->counter_list);
         ctx->task = task;
 }
@@ -1486,20 +1757,30 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 /*
  * inherit a counter from parent task to child task:
  */
-static int
+static struct perf_counter *
 inherit_counter(struct perf_counter *parent_counter,
                 struct task_struct *parent,
                 struct perf_counter_context *parent_ctx,
                 struct task_struct *child,
+                struct perf_counter *group_leader,
                 struct perf_counter_context *child_ctx)
 {
         struct perf_counter *child_counter;
 
+        /*
+         * Instead of creating recursive hierarchies of counters,
+         * we link inherited counters back to the original parent,
+         * which has a filp for sure, which we use as the reference
+         * count:
+         */
+        if (parent_counter->parent)
+                parent_counter = parent_counter->parent;
+
         child_counter = perf_counter_alloc(&parent_counter->hw_event,
-                                           parent_counter->cpu, NULL,
-                                           GFP_ATOMIC);
+                                           parent_counter->cpu, group_leader,
+                                           GFP_KERNEL);
         if (!child_counter)
-                return -ENOMEM;
+                return NULL;
 
         /*
          * Link it up in the child's context:
@@ -1523,16 +1804,82 @@ inherit_counter(struct perf_counter *parent_counter,
          */
         atomic_long_inc(&parent_counter->filp->f_count);
 
+        /*
+         * Link this into the parent counter's child list
+         */
+        mutex_lock(&parent_counter->mutex);
+        list_add_tail(&child_counter->child_list, &parent_counter->child_list);
+
+        /*
+         * Make the child state follow the state of the parent counter,
+         * not its hw_event.disabled bit.  We hold the parent's mutex,
+         * so we won't race with perf_counter_{en,dis}able_family.
+         */
+        if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+                child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+        else
+                child_counter->state = PERF_COUNTER_STATE_OFF;
+
+        mutex_unlock(&parent_counter->mutex);
+
+        return child_counter;
+}
+
+static int inherit_group(struct perf_counter *parent_counter,
+              struct task_struct *parent,
+              struct perf_counter_context *parent_ctx,
+              struct task_struct *child,
+              struct perf_counter_context *child_ctx)
+{
+        struct perf_counter *leader;
+        struct perf_counter *sub;
+
+        leader = inherit_counter(parent_counter, parent, parent_ctx,
+                                 child, NULL, child_ctx);
+        if (!leader)
+                return -ENOMEM;
+        list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
+                if (!inherit_counter(sub, parent, parent_ctx,
+                                     child, leader, child_ctx))
+                        return -ENOMEM;
+        }
         return 0;
 }
 
+static void sync_child_counter(struct perf_counter *child_counter,
+                               struct perf_counter *parent_counter)
+{
+        u64 parent_val, child_val;
+
+        parent_val = atomic64_read(&parent_counter->count);
+        child_val = atomic64_read(&child_counter->count);
+
+        /*
+         * Add back the child's count to the parent's count:
+         */
+        atomic64_add(child_val, &parent_counter->count);
+
+        /*
+         * Remove this counter from the parent's list
+         */
+        mutex_lock(&parent_counter->mutex);
+        list_del_init(&child_counter->child_list);
+        mutex_unlock(&parent_counter->mutex);
+
+        /*
+         * Release the parent counter, if this was the last
+         * reference to it.
+         */
+        fput(parent_counter->filp);
+}
+
 static void
 __perf_counter_exit_task(struct task_struct *child,
                          struct perf_counter *child_counter,
                          struct perf_counter_context *child_ctx)
 {
         struct perf_counter *parent_counter;
-        u64 parent_val, child_val;
+        struct perf_counter *sub, *tmp;
 
         /*
          * If we do not self-reap then we have to wait for the
@@ -1561,7 +1908,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
         cpuctx = &__get_cpu_var(perf_cpu_context);
 
-        counter_sched_out(child_counter, cpuctx, child_ctx);
+        group_sched_out(child_counter, cpuctx, child_ctx);
 
         list_del_init(&child_counter->list_entry);
 
@@ -1577,26 +1924,23 @@ __perf_counter_exit_task(struct task_struct *child,
          * that are still around due to the child reference. These
          * counters need to be zapped - but otherwise linger.
          */
-        if (!parent_counter)
-                return;
-
-        parent_val = atomic64_read(&parent_counter->count);
-        child_val = atomic64_read(&child_counter->count);
-
-        /*
-         * Add back the child's count to the parent's count:
-         */
-        atomic64_add(child_val, &parent_counter->count);
-
-        fput(parent_counter->filp);
+        if (parent_counter) {
+                sync_child_counter(child_counter, parent_counter);
+                list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
+                                         list_entry) {
+                        if (sub->parent)
+                                sync_child_counter(sub, sub->parent);
+                        kfree(sub);
+                }
+        }
 
         kfree(child_counter);
 }
 
 /*
- * When a child task exist, feed back counter values to parent counters.
+ * When a child task exits, feed back counter values to parent counters.
  *
- * Note: we are running in child context, but the PID is not hashed
+ * Note: we may be running in child context, but the PID is not hashed
  * anymore so new counters will not be added.
  */
 void perf_counter_exit_task(struct task_struct *child)
@@ -1620,9 +1964,8 @@ void perf_counter_exit_task(struct task_struct *child)
 void perf_counter_init_task(struct task_struct *child)
 {
         struct perf_counter_context *child_ctx, *parent_ctx;
-        struct perf_counter *counter, *parent_counter;
+        struct perf_counter *counter;
         struct task_struct *parent = current;
-        unsigned long flags;
 
         child_ctx  = &child->perf_counter_ctx;
         parent_ctx = &parent->perf_counter_ctx;
@@ -1641,32 +1984,22 @@ void perf_counter_init_task(struct task_struct *child)
          * Lock the parent list. No need to lock the child - not PID
          * hashed yet and not running, so nobody can access it.
          */
-        spin_lock_irqsave(&parent_ctx->lock, flags);
+        mutex_lock(&parent_ctx->mutex);
 
         /*
          * We dont have to disable NMIs - we are only looking at
          * the list, not manipulating it:
          */
         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
-                if (!counter->hw_event.inherit || counter->group_leader != counter)
+                if (!counter->hw_event.inherit)
                         continue;
 
-                /*
-                 * Instead of creating recursive hierarchies of counters,
-                 * we link inheritd counters back to the original parent,
-                 * which has a filp for sure, which we use as the reference
-                 * count:
-                 */
-                parent_counter = counter;
-                if (counter->parent)
-                        parent_counter = counter->parent;
-
-                if (inherit_counter(parent_counter, parent,
+                if (inherit_group(counter, parent,
                                     parent_ctx, child, child_ctx))
                         break;
         }
 
-        spin_unlock_irqrestore(&parent_ctx->lock, flags);
+        mutex_unlock(&parent_ctx->mutex);
 }
 
 static void __cpuinit perf_counter_init_cpu(int cpu)
@@ -1692,11 +2025,15 @@ static void __perf_counter_exit_cpu(void *info)
 
         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
                 __perf_counter_remove_from_context(counter);
-
 }
 static void perf_counter_exit_cpu(int cpu)
 {
+        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+        struct perf_counter_context *ctx = &cpuctx->ctx;
+
+        mutex_lock(&ctx->mutex);
         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
+        mutex_unlock(&ctx->mutex);
 }
 #else
 static inline void perf_counter_exit_cpu(int cpu) { }