summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2015-10-15 16:41:53 -0400
committerTejun Heo <tj@kernel.org>2015-10-15 16:41:53 -0400
commit2e91fa7f6d451e3ea9fec999065d2fd199691f9d (patch)
tree9869ef4bca569d824ba3266e5b2373302356bea3
parentf0d9a5f175753a371bc7fdff0d584a8d9cd72bb0 (diff)
cgroup: keep zombies associated with their original cgroups
cgroup_exit() is called when a task exits and disassociates the exiting task from its cgroups and half-attaches it to the root cgroup. This is unnecessary and undesirable. No controller actually needs an exiting task to be disassociated from non-root cgroups. Both cpu and perf_event controllers update the association to the root cgroup from their exit callbacks just to keep consistent with the cgroup core behavior. Also, this disassociation makes it difficult to track resources held by zombies or determine where the zombies came from. Currently, pids controller is completely broken as it uncharges on exit and zombies always escape the resource restriction. With cgroup association being reset on exit, fixing it is pretty painful. There's no reason to reset cgroup membership on exit. The zombie can be removed from its css_set so that it doesn't show up on "cgroup.procs" and thus can't be migrated or interfere with cgroup removal. It can still pin and point to the css_set so that its cgroup membership is maintained. This patch makes cgroup core keep zombies associated with their cgroups at the time of exit. * Previous patches decoupled populated_cnt tracking from css_set lifetime, so a dying task can be simply unlinked from its css_set while pinning and pointing to the css_set. This keeps css_set association from task side alive while hiding it from "cgroup.procs" and populated_cnt tracking. The css_set reference is dropped when the task_struct is freed. * ->exit() callback no longer needs the css arguments as the associated css never changes once PF_EXITING is set. Removed. * cpu and perf_events controllers no longer need ->exit() callbacks. There's no reason to explicitly switch away on exit. The final schedule out is enough. The callbacks are removed. * On traditional hierarchies, nothing changes. "/proc/PID/cgroup" still reports "/" for all zombies. On the default hierarchy, "/proc/PID/cgroup" keeps reporting the cgroup that the task belonged to at the time of exit. 
If the cgroup gets removed before the task is reaped, " (deleted)" is appended. v2: Build breakage due to missing dummy cgroup_free() when !CONFIG_CGROUPS fixed. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
-rw-r--r--Documentation/cgroups/unified-hierarchy.txt4
-rw-r--r--include/linux/cgroup-defs.h4
-rw-r--r--include/linux/cgroup.h2
-rw-r--r--kernel/cgroup.c51
-rw-r--r--kernel/cgroup_pids.c6
-rw-r--r--kernel/events/core.c16
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/sched/core.c16
8 files changed, 44 insertions, 56 deletions
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 176b940f8327..6932453d37a2 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -374,6 +374,10 @@ supported and the interface files "release_agent" and
374 374
375- The "cgroup.clone_children" file is removed. 375- The "cgroup.clone_children" file is removed.
376 376
377- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
378 to before exiting. If the cgroup is removed before the zombie is
379 reaped, " (deleted)" is appended to the path.
380
377 381
3785-3. Controller File Conventions 3825-3. Controller File Conventions
379 383
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 62413c3e2f4b..6a1ab64ee5f9 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -435,9 +435,7 @@ struct cgroup_subsys {
435 int (*can_fork)(struct task_struct *task, void **priv_p); 435 int (*can_fork)(struct task_struct *task, void **priv_p);
436 void (*cancel_fork)(struct task_struct *task, void *priv); 436 void (*cancel_fork)(struct task_struct *task, void *priv);
437 void (*fork)(struct task_struct *task, void *priv); 437 void (*fork)(struct task_struct *task, void *priv);
438 void (*exit)(struct cgroup_subsys_state *css, 438 void (*exit)(struct task_struct *task);
439 struct cgroup_subsys_state *old_css,
440 struct task_struct *task);
441 void (*bind)(struct cgroup_subsys_state *root_css); 439 void (*bind)(struct cgroup_subsys_state *root_css);
442 440
443 int early_init; 441 int early_init;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 46020735bcbb..22e3754f89c5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -102,6 +102,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
102extern void cgroup_post_fork(struct task_struct *p, 102extern void cgroup_post_fork(struct task_struct *p,
103 void *old_ss_priv[CGROUP_CANFORK_COUNT]); 103 void *old_ss_priv[CGROUP_CANFORK_COUNT]);
104void cgroup_exit(struct task_struct *p); 104void cgroup_exit(struct task_struct *p);
105void cgroup_free(struct task_struct *p);
105 106
106int cgroup_init_early(void); 107int cgroup_init_early(void);
107int cgroup_init(void); 108int cgroup_init(void);
@@ -547,6 +548,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
547static inline void cgroup_post_fork(struct task_struct *p, 548static inline void cgroup_post_fork(struct task_struct *p,
548 void *ss_priv[CGROUP_CANFORK_COUNT]) {} 549 void *ss_priv[CGROUP_CANFORK_COUNT]) {}
549static inline void cgroup_exit(struct task_struct *p) {} 550static inline void cgroup_exit(struct task_struct *p) {}
551static inline void cgroup_free(struct task_struct *p) {}
550 552
551static inline int cgroup_init_early(void) { return 0; } 553static inline int cgroup_init_early(void) { return 0; }
552static inline int cgroup_init(void) { return 0; } 554static inline int cgroup_init(void) { return 0; }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ba7b3284c2e4..918658497625 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5379,14 +5379,34 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5379 seq_printf(m, "%sname=%s", count ? "," : "", 5379 seq_printf(m, "%sname=%s", count ? "," : "",
5380 root->name); 5380 root->name);
5381 seq_putc(m, ':'); 5381 seq_putc(m, ':');
5382
5382 cgrp = task_cgroup_from_root(tsk, root); 5383 cgrp = task_cgroup_from_root(tsk, root);
5383 path = cgroup_path(cgrp, buf, PATH_MAX); 5384
5384 if (!path) { 5385 /*
5385 retval = -ENAMETOOLONG; 5386 * On traditional hierarchies, all zombie tasks show up as
5386 goto out_unlock; 5387 * belonging to the root cgroup. On the default hierarchy,
5388 * while a zombie doesn't show up in "cgroup.procs" and
5389 * thus can't be migrated, its /proc/PID/cgroup keeps
5390 * reporting the cgroup it belonged to before exiting. If
5391 * the cgroup is removed before the zombie is reaped,
5392 * " (deleted)" is appended to the cgroup path.
5393 */
5394 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5395 path = cgroup_path(cgrp, buf, PATH_MAX);
5396 if (!path) {
5397 retval = -ENAMETOOLONG;
5398 goto out_unlock;
5399 }
5400 } else {
5401 path = "/";
5387 } 5402 }
5403
5388 seq_puts(m, path); 5404 seq_puts(m, path);
5389 seq_putc(m, '\n'); 5405
5406 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5407 seq_puts(m, " (deleted)\n");
5408 else
5409 seq_putc(m, '\n');
5390 } 5410 }
5391 5411
5392 retval = 0; 5412 retval = 0;
@@ -5593,7 +5613,6 @@ void cgroup_exit(struct task_struct *tsk)
5593{ 5613{
5594 struct cgroup_subsys *ss; 5614 struct cgroup_subsys *ss;
5595 struct css_set *cset; 5615 struct css_set *cset;
5596 bool put_cset = false;
5597 int i; 5616 int i;
5598 5617
5599 /* 5618 /*
@@ -5606,22 +5625,20 @@ void cgroup_exit(struct task_struct *tsk)
5606 spin_lock_bh(&css_set_lock); 5625 spin_lock_bh(&css_set_lock);
5607 css_set_move_task(tsk, cset, NULL, false); 5626 css_set_move_task(tsk, cset, NULL, false);
5608 spin_unlock_bh(&css_set_lock); 5627 spin_unlock_bh(&css_set_lock);
5609 put_cset = true; 5628 } else {
5629 get_css_set(cset);
5610 } 5630 }
5611 5631
5612 /* Reassign the task to the init_css_set. */
5613 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5614
5615 /* see cgroup_post_fork() for details */ 5632 /* see cgroup_post_fork() for details */
5616 for_each_subsys_which(ss, i, &have_exit_callback) { 5633 for_each_subsys_which(ss, i, &have_exit_callback)
5617 struct cgroup_subsys_state *old_css = cset->subsys[i]; 5634 ss->exit(tsk);
5618 struct cgroup_subsys_state *css = task_css(tsk, i); 5635}
5619 5636
5620 ss->exit(css, old_css, tsk); 5637void cgroup_free(struct task_struct *task)
5621 } 5638{
5639 struct css_set *cset = task_css_set(task);
5622 5640
5623 if (put_cset) 5641 put_css_set(cset);
5624 put_css_set(cset);
5625} 5642}
5626 5643
5627static void check_for_release(struct cgroup *cgrp) 5644static void check_for_release(struct cgroup *cgrp)
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 806cd7693ac8..45f0856a61fe 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv)
266 css_put(old_css); 266 css_put(old_css);
267} 267}
268 268
269static void pids_exit(struct cgroup_subsys_state *css, 269static void pids_exit(struct task_struct *task)
270 struct cgroup_subsys_state *old_css,
271 struct task_struct *task)
272{ 270{
273 struct pids_cgroup *pids = css_pids(old_css); 271 struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
274 272
275 pids_uncharge(pids, 1); 273 pids_uncharge(pids, 1);
276} 274}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f548f69c4299..e9874949c787 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9293,25 +9293,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
9293 task_function_call(task, __perf_cgroup_move, task); 9293 task_function_call(task, __perf_cgroup_move, task);
9294} 9294}
9295 9295
9296static void perf_cgroup_exit(struct cgroup_subsys_state *css,
9297 struct cgroup_subsys_state *old_css,
9298 struct task_struct *task)
9299{
9300 /*
9301 * cgroup_exit() is called in the copy_process() failure path.
9302 * Ignore this case since the task hasn't ran yet, this avoids
9303 * trying to poke a half freed task state from generic code.
9304 */
9305 if (!(task->flags & PF_EXITING))
9306 return;
9307
9308 task_function_call(task, __perf_cgroup_move, task);
9309}
9310
9311struct cgroup_subsys perf_event_cgrp_subsys = { 9296struct cgroup_subsys perf_event_cgrp_subsys = {
9312 .css_alloc = perf_cgroup_css_alloc, 9297 .css_alloc = perf_cgroup_css_alloc,
9313 .css_free = perf_cgroup_css_free, 9298 .css_free = perf_cgroup_css_free,
9314 .exit = perf_cgroup_exit,
9315 .attach = perf_cgroup_attach, 9299 .attach = perf_cgroup_attach,
9316}; 9300};
9317#endif /* CONFIG_CGROUP_PERF */ 9301#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d5f0f118a63..118743bb5964 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk)
251 WARN_ON(atomic_read(&tsk->usage)); 251 WARN_ON(atomic_read(&tsk->usage));
252 WARN_ON(tsk == current); 252 WARN_ON(tsk == current);
253 253
254 cgroup_free(tsk);
254 task_numa_free(tsk); 255 task_numa_free(tsk);
255 security_task_free(tsk); 256 security_task_free(tsk);
256 exit_creds(tsk); 257 exit_creds(tsk);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3595403921bd..2cad9ba91036 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8163,21 +8163,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
8163 sched_move_task(task); 8163 sched_move_task(task);
8164} 8164}
8165 8165
8166static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
8167 struct cgroup_subsys_state *old_css,
8168 struct task_struct *task)
8169{
8170 /*
8171 * cgroup_exit() is called in the copy_process() failure path.
8172 * Ignore this case since the task hasn't ran yet, this avoids
8173 * trying to poke a half freed task state from generic code.
8174 */
8175 if (!(task->flags & PF_EXITING))
8176 return;
8177
8178 sched_move_task(task);
8179}
8180
8181#ifdef CONFIG_FAIR_GROUP_SCHED 8166#ifdef CONFIG_FAIR_GROUP_SCHED
8182static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 8167static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8183 struct cftype *cftype, u64 shareval) 8168 struct cftype *cftype, u64 shareval)
@@ -8509,7 +8494,6 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8509 .fork = cpu_cgroup_fork, 8494 .fork = cpu_cgroup_fork,
8510 .can_attach = cpu_cgroup_can_attach, 8495 .can_attach = cpu_cgroup_can_attach,
8511 .attach = cpu_cgroup_attach, 8496 .attach = cpu_cgroup_attach,
8512 .exit = cpu_cgroup_exit,
8513 .legacy_cftypes = cpu_files, 8497 .legacy_cftypes = cpu_files,
8514 .early_init = 1, 8498 .early_init = 1,
8515}; 8499};