aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> 2007-11-09 16:39:39 -0500
committer Ingo Molnar <mingo@elte.hu> 2007-11-09 16:39:39 -0500
commit 3c90e6e99b08f01d5684a3a07cceae6a543e4fa8 (patch)
tree 2432814cfe4891e4c99945fbe09e6b59d6df49f3
parent 502d26b524d8980f3ed80d9aec398e85671a8160 (diff)
sched: fix copy_namespace() <-> sched_fork() dependency in do_fork
Sukadev Bhattiprolu reported a kernel crash with control groups. There are a couple of problems discovered by Suka's test: - The test requires the cgroup filesystem to be mounted with at least the cpu and ns options (i.e. both namespace and cpu controllers are active in the same hierarchy). # mkdir /dev/cpuctl # mount -t cgroup -ocpu,ns none cpuctl (or simply) # mount -t cgroup none cpuctl -> Will activate all controllers in the same hierarchy. - The test invokes clone() with CLONE_NEWNS set. This causes a new child to be created, and also a new group (do_fork->copy_namespaces->ns_cgroup_clone-> cgroup_clone), and the child is attached to the new group (cgroup_clone-> attach_task->sched_move_task). At this point in time, the child's scheduler-related fields are uninitialized (including its on_rq field, which it has inherited from the parent). As a result, sched_move_task thinks it is on the runqueue when it isn't. As a solution to this problem, I moved the sched_fork() call, which initializes scheduler-related fields on a new task, before copy_namespaces(). I am not sure, though, whether moving it up will cause other side-effects. Do you see any issue? - The second problem exposed by this test is that task_new_fair() assumes that parent and child will be part of the same group (which needn't be the case, as this test shows). As a result, cfs_rq->curr can be NULL for the child. The solution is to test for the curr pointer being NULL in task_new_fair(). With the patch below, I could run ns_exec() fine w/o a crash. Reported-by: Sukadev Bhattiprolu <sukadev@us.ibm.com> Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- kernel/fork.c 6
-rw-r--r-- kernel/sched_fair.c 3
2 files changed, 5 insertions, 4 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 28a740151988..8ca1a14cdc8c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1123,6 +1123,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1123 p->blocked_on = NULL; /* not blocked yet */ 1123 p->blocked_on = NULL; /* not blocked yet */
1124#endif 1124#endif
1125 1125
1126 /* Perform scheduler related setup. Assign this task to a CPU. */
1127 sched_fork(p, clone_flags);
1128
1126 if ((retval = security_task_alloc(p))) 1129 if ((retval = security_task_alloc(p)))
1127 goto bad_fork_cleanup_policy; 1130 goto bad_fork_cleanup_policy;
1128 if ((retval = audit_alloc(p))) 1131 if ((retval = audit_alloc(p)))
@@ -1212,9 +1215,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1212 INIT_LIST_HEAD(&p->ptrace_children); 1215 INIT_LIST_HEAD(&p->ptrace_children);
1213 INIT_LIST_HEAD(&p->ptrace_list); 1216 INIT_LIST_HEAD(&p->ptrace_list);
1214 1217
1215 /* Perform scheduler related setup. Assign this task to a CPU. */
1216 sched_fork(p, clone_flags);
1217
1218 /* Now that the task is set up, run cgroup callbacks if 1218 /* Now that the task is set up, run cgroup callbacks if
1219 * necessary. We need to run them before the task is visible 1219 * necessary. We need to run them before the task is visible
1220 * on the tasklist. */ 1220 * on the tasklist. */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6c361472cc74..d3c03070872d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1067,8 +1067,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1067 update_curr(cfs_rq); 1067 update_curr(cfs_rq);
1068 place_entity(cfs_rq, se, 1); 1068 place_entity(cfs_rq, se, 1);
1069 1069
1070 /* 'curr' will be NULL if the child belongs to a different group */
1070 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1071 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1071 curr->vruntime < se->vruntime) { 1072 curr && curr->vruntime < se->vruntime) {
1072 /* 1073 /*
1073 * Upon rescheduling, sched_class::put_prev_task() will place 1074 * Upon rescheduling, sched_class::put_prev_task() will place
1074 * 'current' within the tree based on its new key value. 1075 * 'current' within the tree based on its new key value.