summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2012-10-16 18:03:14 -0400
committerTejun Heo <tj@kernel.org>2012-10-16 18:03:14 -0400
commit5edee61edeaaebafe584f8fb7074c1ef4658596b (patch)
tree23e6ee3581eb0009b3c2a2686c25fdba538219de
parentddffeb8c4d0331609ef2581d84de4d763607bd37 (diff)
cgroup: cgroup_subsys->fork() should be called after the task is added to css_set
cgroup core has a bug which violates a basic rule about event notifications - when a new entity needs to be added, you add that to the notification list first and then make the new entity conform to the current state. If done in the reverse order, an event happening in between will be lost. cgroup_subsys->fork() is invoked way before the new task is added to the css_set. Currently, cgroup_freezer is the only user of ->fork() and uses it to make new tasks conform to the current state of the freezer. If FROZEN state is requested while fork is in progress between cgroup_fork_callbacks() and cgroup_post_fork(), the child could escape freezing - the cgroup isn't frozen when ->fork() is called and the freezer couldn't see the new task on the css_set. This patch moves cgroup_subsys->fork() invocation to cgroup_post_fork() after the new task is added to the css_set. cgroup_fork_callbacks() is removed. Because now a task may be migrated during cgroup_subsys->fork(), freezer_fork() is updated so that it adheres to the usual RCU locking and the rather pointless comment on why locking can be different there is removed (if it doesn't make anything simpler, why even bother?). Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Rafael J. Wysocki <rjw@sisk.pl> Cc: stable@vger.kernel.org
-rw-r--r--include/linux/cgroup.h1
-rw-r--r--kernel/cgroup.c62
-rw-r--r--kernel/cgroup_freezer.c13
-rw-r--r--kernel/fork.c9
4 files changed, 35 insertions, 50 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f8a030ced0c7..4cd1d0fd2542 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -34,7 +34,6 @@ extern int cgroup_lock_is_held(void);
34extern bool cgroup_lock_live_group(struct cgroup *cgrp); 34extern bool cgroup_lock_live_group(struct cgroup *cgrp);
35extern void cgroup_unlock(void); 35extern void cgroup_unlock(void);
36extern void cgroup_fork(struct task_struct *p); 36extern void cgroup_fork(struct task_struct *p);
37extern void cgroup_fork_callbacks(struct task_struct *p);
38extern void cgroup_post_fork(struct task_struct *p); 37extern void cgroup_post_fork(struct task_struct *p);
39extern void cgroup_exit(struct task_struct *p, int run_callbacks); 38extern void cgroup_exit(struct task_struct *p, int run_callbacks);
40extern int cgroupstats_build(struct cgroupstats *stats, 39extern int cgroupstats_build(struct cgroupstats *stats,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 13774b3b39aa..b7a0171067ea 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4844,44 +4844,19 @@ void cgroup_fork(struct task_struct *child)
4844} 4844}
4845 4845
4846/** 4846/**
4847 * cgroup_fork_callbacks - run fork callbacks
4848 * @child: the new task
4849 *
4850 * Called on a new task very soon before adding it to the
4851 * tasklist. No need to take any locks since no-one can
4852 * be operating on this task.
4853 */
4854void cgroup_fork_callbacks(struct task_struct *child)
4855{
4856 if (need_forkexit_callback) {
4857 int i;
4858 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4859 struct cgroup_subsys *ss = subsys[i];
4860
4861 /*
4862 * forkexit callbacks are only supported for
4863 * builtin subsystems.
4864 */
4865 if (!ss || ss->module)
4866 continue;
4867
4868 if (ss->fork)
4869 ss->fork(child);
4870 }
4871 }
4872}
4873
4874/**
4875 * cgroup_post_fork - called on a new task after adding it to the task list 4847 * cgroup_post_fork - called on a new task after adding it to the task list
4876 * @child: the task in question 4848 * @child: the task in question
4877 * 4849 *
4878 * Adds the task to the list running through its css_set if necessary. 4850 * Adds the task to the list running through its css_set if necessary and
4879 * Has to be after the task is visible on the task list in case we race 4851 * call the subsystem fork() callbacks. Has to be after the task is
4880 * with the first call to cgroup_iter_start() - to guarantee that the 4852 * visible on the task list in case we race with the first call to
4881 * new task ends up on its list. 4853 * cgroup_iter_start() - to guarantee that the new task ends up on its
4854 * list.
4882 */ 4855 */
4883void cgroup_post_fork(struct task_struct *child) 4856void cgroup_post_fork(struct task_struct *child)
4884{ 4857{
4858 int i;
4859
4885 /* 4860 /*
4886 * use_task_css_set_links is set to 1 before we walk the tasklist 4861 * use_task_css_set_links is set to 1 before we walk the tasklist
4887 * under the tasklist_lock and we read it here after we added the child 4862 * under the tasklist_lock and we read it here after we added the child
@@ -4910,7 +4885,30 @@ void cgroup_post_fork(struct task_struct *child)
4910 } 4885 }
4911 write_unlock(&css_set_lock); 4886 write_unlock(&css_set_lock);
4912 } 4887 }
4888
4889 /*
4890 * Call ss->fork(). This must happen after @child is linked on
4891 * css_set; otherwise, @child might change state between ->fork()
4892 * and addition to css_set.
4893 */
4894 if (need_forkexit_callback) {
4895 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4896 struct cgroup_subsys *ss = subsys[i];
4897
4898 /*
4899 * fork/exit callbacks are supported only for
4900 * builtin subsystems and we don't need further
4901 * synchronization as they never go away.
4902 */
4903 if (!ss || ss->module)
4904 continue;
4905
4906 if (ss->fork)
4907 ss->fork(child);
4908 }
4909 }
4913} 4910}
4911
4914/** 4912/**
4915 * cgroup_exit - detach cgroup from exiting task 4913 * cgroup_exit - detach cgroup from exiting task
4916 * @tsk: pointer to task_struct of exiting process 4914 * @tsk: pointer to task_struct of exiting process
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index b1724ce98981..12bfedb598c2 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -186,23 +186,15 @@ static void freezer_fork(struct task_struct *task)
186{ 186{
187 struct freezer *freezer; 187 struct freezer *freezer;
188 188
189 /*
190 * No lock is needed, since the task isn't on tasklist yet,
191 * so it can't be moved to another cgroup, which means the
192 * freezer won't be removed and will be valid during this
193 * function call. Nevertheless, apply RCU read-side critical
194 * section to suppress RCU lockdep false positives.
195 */
196 rcu_read_lock(); 189 rcu_read_lock();
197 freezer = task_freezer(task); 190 freezer = task_freezer(task);
198 rcu_read_unlock();
199 191
200 /* 192 /*
201 * The root cgroup is non-freezable, so we can skip the 193 * The root cgroup is non-freezable, so we can skip the
202 * following check. 194 * following check.
203 */ 195 */
204 if (!freezer->css.cgroup->parent) 196 if (!freezer->css.cgroup->parent)
205 return; 197 goto out;
206 198
207 spin_lock_irq(&freezer->lock); 199 spin_lock_irq(&freezer->lock);
208 BUG_ON(freezer->state == CGROUP_FROZEN); 200 BUG_ON(freezer->state == CGROUP_FROZEN);
@@ -210,7 +202,10 @@ static void freezer_fork(struct task_struct *task)
210 /* Locking avoids race with FREEZING -> THAWED transitions. */ 202 /* Locking avoids race with FREEZING -> THAWED transitions. */
211 if (freezer->state == CGROUP_FREEZING) 203 if (freezer->state == CGROUP_FREEZING)
212 freeze_task(task); 204 freeze_task(task);
205
213 spin_unlock_irq(&freezer->lock); 206 spin_unlock_irq(&freezer->lock);
207out:
208 rcu_read_unlock();
214} 209}
215 210
216/* 211/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7d3aa2..acc4cb62f32f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1135,7 +1135,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1135{ 1135{
1136 int retval; 1136 int retval;
1137 struct task_struct *p; 1137 struct task_struct *p;
1138 int cgroup_callbacks_done = 0;
1139 1138
1140 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1139 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1141 return ERR_PTR(-EINVAL); 1140 return ERR_PTR(-EINVAL);
@@ -1393,12 +1392,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1393 INIT_LIST_HEAD(&p->thread_group); 1392 INIT_LIST_HEAD(&p->thread_group);
1394 p->task_works = NULL; 1393 p->task_works = NULL;
1395 1394
1396 /* Now that the task is set up, run cgroup callbacks if
1397 * necessary. We need to run them before the task is visible
1398 * on the tasklist. */
1399 cgroup_fork_callbacks(p);
1400 cgroup_callbacks_done = 1;
1401
1402 /* Need tasklist lock for parent etc handling! */ 1395 /* Need tasklist lock for parent etc handling! */
1403 write_lock_irq(&tasklist_lock); 1396 write_lock_irq(&tasklist_lock);
1404 1397
@@ -1503,7 +1496,7 @@ bad_fork_cleanup_cgroup:
1503#endif 1496#endif
1504 if (clone_flags & CLONE_THREAD) 1497 if (clone_flags & CLONE_THREAD)
1505 threadgroup_change_end(current); 1498 threadgroup_change_end(current);
1506 cgroup_exit(p, cgroup_callbacks_done); 1499 cgroup_exit(p, 0);
1507 delayacct_tsk_free(p); 1500 delayacct_tsk_free(p);
1508 module_put(task_thread_info(p)->exec_domain->module); 1501 module_put(task_thread_info(p)->exec_domain->module);
1509bad_fork_cleanup_count: 1502bad_fork_cleanup_count: