diff options
author | Li Zefan <lizefan@huawei.com> | 2013-02-18 05:56:14 -0500 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2013-02-18 12:17:24 -0500 |
commit | 810cbee4fad570ff167132d4ecf247d99c48f71d (patch) | |
tree | 2806f56a033f43b17ad2c547b2153941c0e77f40 /kernel | |
parent | 63f43f55c9bbc14f76b582644019b8a07dc8219a (diff) |
cgroup: fix cgroup_rmdir() vs close(eventfd) race
commit 205a872bd6f9a9a09ef035ef1e90185a8245cc58 ("cgroup: fix lockdep
warning for event_control") solved a deadlock by introducing a new
bug.
Moving cgrp->event_list to a temporary list doesn't mean you can traverse
this list locklessly, because at the same time cgroup_event_wake() can
be called and remove the event from the list. The result of this race
is disastrous.
We adopt the approach the kvm irqfd code uses to implement race-free event
removal, which is now described in the comments in cgroup_event_wake().
v3:
- call eventfd_signal() regardless of whether it is the eventfd close or
the cgroup removal that removes the cgroup event.
Acked-by: Kirill A. Shutemov <kirill@shutemov.name>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 41 |
1 files changed, 25 insertions, 16 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5d4c92ead691..feda81487be4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -3786,8 +3786,13 @@ static void cgroup_event_remove(struct work_struct *work) | |||
3786 | remove); | 3786 | remove); |
3787 | struct cgroup *cgrp = event->cgrp; | 3787 | struct cgroup *cgrp = event->cgrp; |
3788 | 3788 | ||
3789 | remove_wait_queue(event->wqh, &event->wait); | ||
3790 | |||
3789 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | 3791 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); |
3790 | 3792 | ||
3793 | /* Notify userspace the event is going away. */ | ||
3794 | eventfd_signal(event->eventfd, 1); | ||
3795 | |||
3791 | eventfd_ctx_put(event->eventfd); | 3796 | eventfd_ctx_put(event->eventfd); |
3792 | kfree(event); | 3797 | kfree(event); |
3793 | dput(cgrp->dentry); | 3798 | dput(cgrp->dentry); |
@@ -3807,15 +3812,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
3807 | unsigned long flags = (unsigned long)key; | 3812 | unsigned long flags = (unsigned long)key; |
3808 | 3813 | ||
3809 | if (flags & POLLHUP) { | 3814 | if (flags & POLLHUP) { |
3810 | __remove_wait_queue(event->wqh, &event->wait); | ||
3811 | spin_lock(&cgrp->event_list_lock); | ||
3812 | list_del_init(&event->list); | ||
3813 | spin_unlock(&cgrp->event_list_lock); | ||
3814 | /* | 3815 | /* |
3815 | * We are in atomic context, but cgroup_event_remove() may | 3816 | * If the event has been detached at cgroup removal, we |
3816 | * sleep, so we have to call it in workqueue. | 3817 | * can simply return knowing the other side will cleanup |
3818 | * for us. | ||
3819 | * | ||
3820 | * We can't race against event freeing since the other | ||
3821 | * side will require wqh->lock via remove_wait_queue(), | ||
3822 | * which we hold. | ||
3817 | */ | 3823 | */ |
3818 | schedule_work(&event->remove); | 3824 | spin_lock(&cgrp->event_list_lock); |
3825 | if (!list_empty(&event->list)) { | ||
3826 | list_del_init(&event->list); | ||
3827 | /* | ||
3828 | * We are in atomic context, but cgroup_event_remove() | ||
3829 | * may sleep, so we have to call it in workqueue. | ||
3830 | */ | ||
3831 | schedule_work(&event->remove); | ||
3832 | } | ||
3833 | spin_unlock(&cgrp->event_list_lock); | ||
3819 | } | 3834 | } |
3820 | 3835 | ||
3821 | return 0; | 3836 | return 0; |
@@ -4375,20 +4390,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4375 | /* | 4390 | /* |
4376 | * Unregister events and notify userspace. | 4391 | * Unregister events and notify userspace. |
4377 | * Notify userspace about cgroup removing only after rmdir of cgroup | 4392 | * Notify userspace about cgroup removing only after rmdir of cgroup |
4378 | * directory to avoid race between userspace and kernelspace. Use | 4393 | * directory to avoid race between userspace and kernelspace. |
4379 | * a temporary list to avoid a deadlock with cgroup_event_wake(). Since | ||
4380 | * cgroup_event_wake() is called with the wait queue head locked, | ||
4381 | * remove_wait_queue() cannot be called while holding event_list_lock. | ||
4382 | */ | 4394 | */ |
4383 | spin_lock(&cgrp->event_list_lock); | 4395 | spin_lock(&cgrp->event_list_lock); |
4384 | list_splice_init(&cgrp->event_list, &tmp_list); | 4396 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { |
4385 | spin_unlock(&cgrp->event_list_lock); | ||
4386 | list_for_each_entry_safe(event, tmp, &tmp_list, list) { | ||
4387 | list_del_init(&event->list); | 4397 | list_del_init(&event->list); |
4388 | remove_wait_queue(event->wqh, &event->wait); | ||
4389 | eventfd_signal(event->eventfd, 1); | ||
4390 | schedule_work(&event->remove); | 4398 | schedule_work(&event->remove); |
4391 | } | 4399 | } |
4400 | spin_unlock(&cgrp->event_list_lock); | ||
4392 | 4401 | ||
4393 | return 0; | 4402 | return 0; |
4394 | } | 4403 | } |