author	Paul Mackerras <paulus@samba.org>	2009-05-29 02:06:20 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-05-29 05:02:46 -0400
commit	ad3a37de81c45f6c20d410ece86004b98f7b6d84 (patch)
tree	6748194aa66d1648232dc9767cba44615d2e14fd /kernel/perf_counter.c
parent	be1ac0d81d0e3ab655f8c8ade31fb860ef6aa186 (diff)
perf_counter: Don't swap contexts containing locked mutex
Peter Zijlstra pointed out that under some circumstances, we can take the mutex in a context or a counter and then swap that context or counter to another task, potentially leading to lock order inversions or the mutexes not protecting what they are supposed to protect.

This fixes the problem by making sure that we never take a mutex in a context or counter which could get swapped to another task. In most of the cases where we take a mutex, it is on a top-level counter or context, i.e. a counter which has an fd associated with it, or a context that contains such a counter. This adds WARN_ON_ONCE statements to verify that.

The two cases where we need to take the mutex on a context that is a clone of another are in perf_counter_exit_task and perf_counter_init_task. The perf_counter_exit_task case is solved by uncloning the context before starting to remove the counters from it. The perf_counter_init_task case is a little trickier; we temporarily disable context swapping for the parent (forking) task by setting its ctx->parent_gen to the all-1s value after locking the context, if it is a cloned context, and restore the ctx->parent_gen value at the end if the context didn't get uncloned in the meantime.

This also moves the increment of the context generation count so that it is within the same critical section, protected by the context mutex, that adds the new counter to the context. That way, taking the mutex is sufficient to ensure that both the counter list and the generation count are stable.

[ Impact: fix hangs, races with inherited and PID counters ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18975.31580.520676.619896@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
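For illustration, here is a minimal sketch of that pinning scheme, factored into hypothetical pin/unpin helpers (the patch itself open-codes the same logic inside perf_counter_init_task, together with an RCU retry loop to locate the parent's current context; these helper names are not in the patch):

	/*
	 * Sketch only, not part of the patch: pin a possibly-cloned
	 * context by writing the impossible all-1s generation value,
	 * so that context_equiv() can never match it and the context
	 * cannot be swapped to another task; unpin restores the saved
	 * value unless the context was uncloned in the meantime.
	 */
	static u64 pin_parent_gen(struct perf_counter_context *ctx)
	{
		u64 saved;

		spin_lock_irq(&ctx->lock);
		saved = ctx->parent_gen;
		if (ctx->parent_ctx)		/* only clones can be swapped */
			ctx->parent_gen = ~0ull;
		spin_unlock_irq(&ctx->lock);
		return saved;
	}

	static void unpin_parent_gen(struct perf_counter_context *ctx, u64 saved)
	{
		spin_lock_irq(&ctx->lock);
		if (ctx->parent_ctx)		/* skip if uncloned meanwhile */
			ctx->parent_gen = saved;
		spin_unlock_irq(&ctx->lock);
	}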
Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--	kernel/perf_counter.c	| 89 +++++++++++++++++++++++++++++++++++++++++++++++++---------
1 files changed, 79 insertions(+), 10 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52e5a15321d8..db843f812a60 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -919,7 +919,8 @@ static int context_equiv(struct perf_counter_context *ctx1,
 				      struct perf_counter_context *ctx2)
 {
 	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen;
+		&& ctx1->parent_gen == ctx2->parent_gen
+		&& ctx1->parent_gen != ~0ull;
 }
 
 /*
@@ -1339,7 +1340,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 		put_ctx(parent_ctx);
 		ctx->parent_ctx = NULL;		/* no longer a clone */
 	}
-	++ctx->generation;
 	/*
 	 * Get an extra reference before dropping the lock so that
 	 * this context won't get freed if the task exits.
@@ -1414,6 +1414,7 @@ static int perf_release(struct inode *inode, struct file *file)
 
 	file->private_data = NULL;
 
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_counter_remove_from_context(counter);
 	mutex_unlock(&ctx->mutex);
@@ -1445,6 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	values[0] = perf_counter_read(counter);
 	n = 1;
@@ -1504,6 +1506,7 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *sibling;
 
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	counter = counter->group_leader;
 
@@ -1524,6 +1527,7 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	func(counter);
 	list_for_each_entry(child, &counter->child_list, child_list)
@@ -1536,6 +1540,7 @@ static void perf_counter_for_each(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	perf_counter_for_each_sibling(counter, func);
 	list_for_each_entry(child, &counter->child_list, child_list)
@@ -1741,6 +1746,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = vma->vm_file->private_data;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
 				      &counter->mmap_mutex)) {
 		struct user_struct *user = current_user();
@@ -1788,6 +1794,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_pgoff != 0)
 		return -EINVAL;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->mmap_mutex);
 	if (atomic_inc_not_zero(&counter->mmap_count)) {
 		if (nr_pages != counter->data->nr_pages)
@@ -3349,8 +3356,10 @@ SYSCALL_DEFINE5(perf_counter_open,
 		goto err_free_put_context;
 
 	counter->filp = counter_file;
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, counter, cpu);
+	++ctx->generation;
 	mutex_unlock(&ctx->mutex);
 
 	counter->owner = current;
@@ -3436,6 +3445,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * Link this into the parent counter's child list
 	 */
+	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
 	mutex_lock(&parent_counter->child_mutex);
 	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
 	mutex_unlock(&parent_counter->child_mutex);
@@ -3485,6 +3495,7 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	/*
 	 * Remove this counter from the parent's list
 	 */
+	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
 	mutex_lock(&parent_counter->child_mutex);
 	list_del_init(&child_counter->child_list);
 	mutex_unlock(&parent_counter->child_mutex);
@@ -3527,12 +3538,17 @@ void perf_counter_exit_task(struct task_struct *child)
 	struct perf_counter_context *child_ctx;
 	unsigned long flags;
 
-	child_ctx = child->perf_counter_ctxp;
-
-	if (likely(!child_ctx))
+	if (likely(!child->perf_counter_ctxp))
 		return;
 
 	local_irq_save(flags);
+	/*
+	 * We can't reschedule here because interrupts are disabled,
+	 * and either child is current or it is a task that can't be
+	 * scheduled, so we are now safe from rescheduling changing
+	 * our context.
+	 */
+	child_ctx = child->perf_counter_ctxp;
 	__perf_counter_task_sched_out(child_ctx);
 
 	/*
@@ -3542,6 +3558,15 @@ void perf_counter_exit_task(struct task_struct *child)
 	 */
 	spin_lock(&child_ctx->lock);
 	child->perf_counter_ctxp = NULL;
+	if (child_ctx->parent_ctx) {
+		/*
+		 * This context is a clone; unclone it so it can't get
+		 * swapped to another process while we're removing all
+		 * the counters from it.
+		 */
+		put_ctx(child_ctx->parent_ctx);
+		child_ctx->parent_ctx = NULL;
+	}
 	spin_unlock(&child_ctx->lock);
 	local_irq_restore(flags);
 
@@ -3571,9 +3596,11 @@ again:
 int perf_counter_init_task(struct task_struct *child)
 {
 	struct perf_counter_context *child_ctx, *parent_ctx;
+	struct perf_counter_context *cloned_ctx;
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
+	u64 cloned_gen;
 	int ret = 0;
 
 	child->perf_counter_ctxp = NULL;
@@ -3581,8 +3608,7 @@ int perf_counter_init_task(struct task_struct *child)
 	mutex_init(&child->perf_counter_mutex);
 	INIT_LIST_HEAD(&child->perf_counter_list);
 
-	parent_ctx = parent->perf_counter_ctxp;
-	if (likely(!parent_ctx || !parent_ctx->nr_counters))
+	if (likely(!parent->perf_counter_ctxp))
 		return 0;
 
 	/*
@@ -3600,6 +3626,34 @@ int perf_counter_init_task(struct task_struct *child)
 	get_task_struct(child);
 
 	/*
+	 * If the parent's context is a clone, temporarily set its
+	 * parent_gen to an impossible value (all 1s) so it won't get
+	 * swapped under us.  The rcu_read_lock makes sure that
+	 * parent_ctx continues to exist even if it gets swapped to
+	 * another process and then freed while we are trying to get
+	 * its lock.
+	 */
+	rcu_read_lock();
+retry:
+	parent_ctx = rcu_dereference(parent->perf_counter_ctxp);
+	/*
+	 * No need to check if parent_ctx != NULL here; since we saw
+	 * it non-NULL earlier, the only reason for it to become NULL
+	 * is if we exit, and since we're currently in the middle of
+	 * a fork we can't be exiting at the same time.
+	 */
+	spin_lock_irq(&parent_ctx->lock);
+	if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) {
+		spin_unlock_irq(&parent_ctx->lock);
+		goto retry;
+	}
+	cloned_gen = parent_ctx->parent_gen;
+	if (parent_ctx->parent_ctx)
+		parent_ctx->parent_gen = ~0ull;
+	spin_unlock_irq(&parent_ctx->lock);
+	rcu_read_unlock();
+
+	/*
 	 * Lock the parent list. No need to lock the child - not PID
 	 * hashed yet and not running, so nobody can access it.
 	 */
@@ -3630,10 +3684,15 @@ int perf_counter_init_task(struct task_struct *child)
 		/*
 		 * Mark the child context as a clone of the parent
 		 * context, or of whatever the parent is a clone of.
+		 * Note that if the parent is a clone, it could get
+		 * uncloned at any point, but that doesn't matter
+		 * because the list of counters and the generation
+		 * count can't have changed since we took the mutex.
 		 */
-	if (parent_ctx->parent_ctx) {
-		child_ctx->parent_ctx = parent_ctx->parent_ctx;
-		child_ctx->parent_gen = parent_ctx->parent_gen;
+	cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+	if (cloned_ctx) {
+		child_ctx->parent_ctx = cloned_ctx;
+		child_ctx->parent_gen = cloned_gen;
 	} else {
 		child_ctx->parent_ctx = parent_ctx;
 		child_ctx->parent_gen = parent_ctx->generation;
@@ -3643,6 +3702,16 @@ int perf_counter_init_task(struct task_struct *child)
 
 	mutex_unlock(&parent_ctx->mutex);
 
+	/*
+	 * Restore the clone status of the parent.
+	 */
+	if (parent_ctx->parent_ctx) {
+		spin_lock_irq(&parent_ctx->lock);
+		if (parent_ctx->parent_ctx)
+			parent_ctx->parent_gen = cloned_gen;
+		spin_unlock_irq(&parent_ctx->lock);
+	}
+
 	return ret;
 }
 
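To see why the all-1s sentinel is enough to stop swapping: as the commit message describes, contexts are only swapped between tasks when they compare equivalent, and the patched context_equiv() (first hunk above) never matches a context whose parent_gen is ~0ull. A small standalone toy demonstrating the check — plain userspace C with a cut-down stand-in struct, not kernel code:

	#include <stdio.h>

	/* Cut-down stand-in for struct perf_counter_context. */
	struct ctx {
		struct ctx *parent_ctx;
		unsigned long long parent_gen;
	};

	/* Mirrors the patched context_equiv() from the first hunk. */
	static int context_equiv(struct ctx *ctx1, struct ctx *ctx2)
	{
		return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
			&& ctx1->parent_gen == ctx2->parent_gen
			&& ctx1->parent_gen != ~0ull;
	}

	int main(void)
	{
		struct ctx parent = { 0, 0 };
		struct ctx a = { &parent, 5 }, b = { &parent, 5 };

		printf("equiv normally:     %d\n", context_equiv(&a, &b)); /* 1 */

		a.parent_gen = ~0ull;	/* pinned, as perf_counter_init_task does */
		b.parent_gen = ~0ull;	/* even a matching sentinel must not swap */
		printf("equiv while pinned: %d\n", context_equiv(&a, &b)); /* 0 */
		return 0;
	}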