diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 1297 |
1 files changed, 725 insertions, 572 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2c9eae6ad970..b9d0cce3f9ce 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -45,7 +45,7 @@ | |||
45 | #include <linux/sched.h> | 45 | #include <linux/sched.h> |
46 | #include <linux/slab.h> | 46 | #include <linux/slab.h> |
47 | #include <linux/spinlock.h> | 47 | #include <linux/spinlock.h> |
48 | #include <linux/rwsem.h> | 48 | #include <linux/percpu-rwsem.h> |
49 | #include <linux/string.h> | 49 | #include <linux/string.h> |
50 | #include <linux/sort.h> | 50 | #include <linux/sort.h> |
51 | #include <linux/kmod.h> | 51 | #include <linux/kmod.h> |
@@ -75,7 +75,7 @@ | |||
75 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 75 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
76 | * hierarchy must be performed while holding it. | 76 | * hierarchy must be performed while holding it. |
77 | * | 77 | * |
78 | * css_set_rwsem protects task->cgroups pointer, the list of css_set | 78 | * css_set_lock protects task->cgroups pointer, the list of css_set |
79 | * objects, and the chain of tasks off each css_set. | 79 | * objects, and the chain of tasks off each css_set. |
80 | * | 80 | * |
81 | * These locks are exported if CONFIG_PROVE_RCU so that accessors in | 81 | * These locks are exported if CONFIG_PROVE_RCU so that accessors in |
@@ -83,12 +83,12 @@ | |||
83 | */ | 83 | */ |
84 | #ifdef CONFIG_PROVE_RCU | 84 | #ifdef CONFIG_PROVE_RCU |
85 | DEFINE_MUTEX(cgroup_mutex); | 85 | DEFINE_MUTEX(cgroup_mutex); |
86 | DECLARE_RWSEM(css_set_rwsem); | 86 | DEFINE_SPINLOCK(css_set_lock); |
87 | EXPORT_SYMBOL_GPL(cgroup_mutex); | 87 | EXPORT_SYMBOL_GPL(cgroup_mutex); |
88 | EXPORT_SYMBOL_GPL(css_set_rwsem); | 88 | EXPORT_SYMBOL_GPL(css_set_lock); |
89 | #else | 89 | #else |
90 | static DEFINE_MUTEX(cgroup_mutex); | 90 | static DEFINE_MUTEX(cgroup_mutex); |
91 | static DECLARE_RWSEM(css_set_rwsem); | 91 | static DEFINE_SPINLOCK(css_set_lock); |
92 | #endif | 92 | #endif |
93 | 93 | ||
94 | /* | 94 | /* |
@@ -103,6 +103,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); | |||
103 | */ | 103 | */ |
104 | static DEFINE_SPINLOCK(release_agent_path_lock); | 104 | static DEFINE_SPINLOCK(release_agent_path_lock); |
105 | 105 | ||
106 | struct percpu_rw_semaphore cgroup_threadgroup_rwsem; | ||
107 | |||
106 | #define cgroup_assert_mutex_or_rcu_locked() \ | 108 | #define cgroup_assert_mutex_or_rcu_locked() \ |
107 | RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ | 109 | RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ |
108 | !lockdep_is_held(&cgroup_mutex), \ | 110 | !lockdep_is_held(&cgroup_mutex), \ |
@@ -136,6 +138,27 @@ static const char *cgroup_subsys_name[] = { | |||
136 | }; | 138 | }; |
137 | #undef SUBSYS | 139 | #undef SUBSYS |
138 | 140 | ||
141 | /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ | ||
142 | #define SUBSYS(_x) \ | ||
143 | DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ | ||
144 | DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ | ||
145 | EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ | ||
146 | EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); | ||
147 | #include <linux/cgroup_subsys.h> | ||
148 | #undef SUBSYS | ||
149 | |||
150 | #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, | ||
151 | static struct static_key_true *cgroup_subsys_enabled_key[] = { | ||
152 | #include <linux/cgroup_subsys.h> | ||
153 | }; | ||
154 | #undef SUBSYS | ||
155 | |||
156 | #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, | ||
157 | static struct static_key_true *cgroup_subsys_on_dfl_key[] = { | ||
158 | #include <linux/cgroup_subsys.h> | ||
159 | }; | ||
160 | #undef SUBSYS | ||
161 | |||
139 | /* | 162 | /* |
140 | * The default hierarchy, reserved for the subsystems that are otherwise | 163 | * The default hierarchy, reserved for the subsystems that are otherwise |
141 | * unattached - it never has more than a single cgroup, and all tasks are | 164 | * unattached - it never has more than a single cgroup, and all tasks are |
@@ -150,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); | |||
150 | */ | 173 | */ |
151 | static bool cgrp_dfl_root_visible; | 174 | static bool cgrp_dfl_root_visible; |
152 | 175 | ||
153 | /* | ||
154 | * Set by the boot param of the same name and makes subsystems with NULL | ||
155 | * ->dfl_files to use ->legacy_files on the default hierarchy. | ||
156 | */ | ||
157 | static bool cgroup_legacy_files_on_dfl; | ||
158 | |||
159 | /* some controllers are not supported in the default hierarchy */ | 176 | /* some controllers are not supported in the default hierarchy */ |
160 | static unsigned long cgrp_dfl_root_inhibit_ss_mask; | 177 | static unsigned long cgrp_dfl_root_inhibit_ss_mask; |
161 | 178 | ||
@@ -183,6 +200,7 @@ static u64 css_serial_nr_next = 1; | |||
183 | */ | 200 | */ |
184 | static unsigned long have_fork_callback __read_mostly; | 201 | static unsigned long have_fork_callback __read_mostly; |
185 | static unsigned long have_exit_callback __read_mostly; | 202 | static unsigned long have_exit_callback __read_mostly; |
203 | static unsigned long have_free_callback __read_mostly; | ||
186 | 204 | ||
187 | /* Ditto for the can_fork callback. */ | 205 | /* Ditto for the can_fork callback. */ |
188 | static unsigned long have_canfork_callback __read_mostly; | 206 | static unsigned long have_canfork_callback __read_mostly; |
@@ -192,14 +210,87 @@ static struct cftype cgroup_legacy_base_files[]; | |||
192 | 210 | ||
193 | static int rebind_subsystems(struct cgroup_root *dst_root, | 211 | static int rebind_subsystems(struct cgroup_root *dst_root, |
194 | unsigned long ss_mask); | 212 | unsigned long ss_mask); |
213 | static void css_task_iter_advance(struct css_task_iter *it); | ||
195 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 214 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
196 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | 215 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, |
197 | bool visible); | 216 | bool visible); |
198 | static void css_release(struct percpu_ref *ref); | 217 | static void css_release(struct percpu_ref *ref); |
199 | static void kill_css(struct cgroup_subsys_state *css); | 218 | static void kill_css(struct cgroup_subsys_state *css); |
200 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 219 | static int cgroup_addrm_files(struct cgroup_subsys_state *css, |
220 | struct cgroup *cgrp, struct cftype cfts[], | ||
201 | bool is_add); | 221 | bool is_add); |
202 | 222 | ||
223 | /** | ||
224 | * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID | ||
225 | * @ssid: subsys ID of interest | ||
226 | * | ||
227 | * cgroup_subsys_enabled() can only be used with literal subsys names which | ||
228 | * is fine for individual subsystems but unsuitable for cgroup core. This | ||
229 | * is slower static_key_enabled() based test indexed by @ssid. | ||
230 | */ | ||
231 | static bool cgroup_ssid_enabled(int ssid) | ||
232 | { | ||
233 | return static_key_enabled(cgroup_subsys_enabled_key[ssid]); | ||
234 | } | ||
235 | |||
236 | /** | ||
237 | * cgroup_on_dfl - test whether a cgroup is on the default hierarchy | ||
238 | * @cgrp: the cgroup of interest | ||
239 | * | ||
240 | * The default hierarchy is the v2 interface of cgroup and this function | ||
241 | * can be used to test whether a cgroup is on the default hierarchy for | ||
242 | * cases where a subsystem should behave differnetly depending on the | ||
243 | * interface version. | ||
244 | * | ||
245 | * The set of behaviors which change on the default hierarchy are still | ||
246 | * being determined and the mount option is prefixed with __DEVEL__. | ||
247 | * | ||
248 | * List of changed behaviors: | ||
249 | * | ||
250 | * - Mount options "noprefix", "xattr", "clone_children", "release_agent" | ||
251 | * and "name" are disallowed. | ||
252 | * | ||
253 | * - When mounting an existing superblock, mount options should match. | ||
254 | * | ||
255 | * - Remount is disallowed. | ||
256 | * | ||
257 | * - rename(2) is disallowed. | ||
258 | * | ||
259 | * - "tasks" is removed. Everything should be at process granularity. Use | ||
260 | * "cgroup.procs" instead. | ||
261 | * | ||
262 | * - "cgroup.procs" is not sorted. pids will be unique unless they got | ||
263 | * recycled inbetween reads. | ||
264 | * | ||
265 | * - "release_agent" and "notify_on_release" are removed. Replacement | ||
266 | * notification mechanism will be implemented. | ||
267 | * | ||
268 | * - "cgroup.clone_children" is removed. | ||
269 | * | ||
270 | * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup | ||
271 | * and its descendants contain no task; otherwise, 1. The file also | ||
272 | * generates kernfs notification which can be monitored through poll and | ||
273 | * [di]notify when the value of the file changes. | ||
274 | * | ||
275 | * - cpuset: tasks will be kept in empty cpusets when hotplug happens and | ||
276 | * take masks of ancestors with non-empty cpus/mems, instead of being | ||
277 | * moved to an ancestor. | ||
278 | * | ||
279 | * - cpuset: a task can be moved into an empty cpuset, and again it takes | ||
280 | * masks of ancestors. | ||
281 | * | ||
282 | * - memcg: use_hierarchy is on by default and the cgroup file for the flag | ||
283 | * is not created. | ||
284 | * | ||
285 | * - blkcg: blk-throttle becomes properly hierarchical. | ||
286 | * | ||
287 | * - debug: disallowed on the default hierarchy. | ||
288 | */ | ||
289 | static bool cgroup_on_dfl(const struct cgroup *cgrp) | ||
290 | { | ||
291 | return cgrp->root == &cgrp_dfl_root; | ||
292 | } | ||
293 | |||
203 | /* IDR wrappers which synchronize using cgroup_idr_lock */ | 294 | /* IDR wrappers which synchronize using cgroup_idr_lock */ |
204 | static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, | 295 | static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, |
205 | gfp_t gfp_mask) | 296 | gfp_t gfp_mask) |
@@ -332,6 +423,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) | |||
332 | return !(cgrp->self.flags & CSS_ONLINE); | 423 | return !(cgrp->self.flags & CSS_ONLINE); |
333 | } | 424 | } |
334 | 425 | ||
426 | static void cgroup_get(struct cgroup *cgrp) | ||
427 | { | ||
428 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); | ||
429 | css_get(&cgrp->self); | ||
430 | } | ||
431 | |||
432 | static bool cgroup_tryget(struct cgroup *cgrp) | ||
433 | { | ||
434 | return css_tryget(&cgrp->self); | ||
435 | } | ||
436 | |||
437 | static void cgroup_put(struct cgroup *cgrp) | ||
438 | { | ||
439 | css_put(&cgrp->self); | ||
440 | } | ||
441 | |||
335 | struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) | 442 | struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) |
336 | { | 443 | { |
337 | struct cgroup *cgrp = of->kn->parent->priv; | 444 | struct cgroup *cgrp = of->kn->parent->priv; |
@@ -481,19 +588,31 @@ struct css_set init_css_set = { | |||
481 | .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), | 588 | .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), |
482 | .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), | 589 | .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), |
483 | .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), | 590 | .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), |
591 | .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), | ||
484 | }; | 592 | }; |
485 | 593 | ||
486 | static int css_set_count = 1; /* 1 for init_css_set */ | 594 | static int css_set_count = 1; /* 1 for init_css_set */ |
487 | 595 | ||
488 | /** | 596 | /** |
597 | * css_set_populated - does a css_set contain any tasks? | ||
598 | * @cset: target css_set | ||
599 | */ | ||
600 | static bool css_set_populated(struct css_set *cset) | ||
601 | { | ||
602 | lockdep_assert_held(&css_set_lock); | ||
603 | |||
604 | return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); | ||
605 | } | ||
606 | |||
607 | /** | ||
489 | * cgroup_update_populated - updated populated count of a cgroup | 608 | * cgroup_update_populated - updated populated count of a cgroup |
490 | * @cgrp: the target cgroup | 609 | * @cgrp: the target cgroup |
491 | * @populated: inc or dec populated count | 610 | * @populated: inc or dec populated count |
492 | * | 611 | * |
493 | * @cgrp is either getting the first task (css_set) or losing the last. | 612 | * One of the css_sets associated with @cgrp is either getting its first |
494 | * Update @cgrp->populated_cnt accordingly. The count is propagated | 613 | * task or losing the last. Update @cgrp->populated_cnt accordingly. The |
495 | * towards root so that a given cgroup's populated_cnt is zero iff the | 614 | * count is propagated towards root so that a given cgroup's populated_cnt |
496 | * cgroup and all its descendants are empty. | 615 | * is zero iff the cgroup and all its descendants don't contain any tasks. |
497 | * | 616 | * |
498 | * @cgrp's interface file "cgroup.populated" is zero if | 617 | * @cgrp's interface file "cgroup.populated" is zero if |
499 | * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt | 618 | * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt |
@@ -503,7 +622,7 @@ static int css_set_count = 1; /* 1 for init_css_set */ | |||
503 | */ | 622 | */ |
504 | static void cgroup_update_populated(struct cgroup *cgrp, bool populated) | 623 | static void cgroup_update_populated(struct cgroup *cgrp, bool populated) |
505 | { | 624 | { |
506 | lockdep_assert_held(&css_set_rwsem); | 625 | lockdep_assert_held(&css_set_lock); |
507 | 626 | ||
508 | do { | 627 | do { |
509 | bool trigger; | 628 | bool trigger; |
@@ -516,12 +635,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) | |||
516 | if (!trigger) | 635 | if (!trigger) |
517 | break; | 636 | break; |
518 | 637 | ||
519 | if (cgrp->populated_kn) | 638 | check_for_release(cgrp); |
520 | kernfs_notify(cgrp->populated_kn); | 639 | cgroup_file_notify(&cgrp->events_file); |
640 | |||
521 | cgrp = cgroup_parent(cgrp); | 641 | cgrp = cgroup_parent(cgrp); |
522 | } while (cgrp); | 642 | } while (cgrp); |
523 | } | 643 | } |
524 | 644 | ||
645 | /** | ||
646 | * css_set_update_populated - update populated state of a css_set | ||
647 | * @cset: target css_set | ||
648 | * @populated: whether @cset is populated or depopulated | ||
649 | * | ||
650 | * @cset is either getting the first task or losing the last. Update the | ||
651 | * ->populated_cnt of all associated cgroups accordingly. | ||
652 | */ | ||
653 | static void css_set_update_populated(struct css_set *cset, bool populated) | ||
654 | { | ||
655 | struct cgrp_cset_link *link; | ||
656 | |||
657 | lockdep_assert_held(&css_set_lock); | ||
658 | |||
659 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) | ||
660 | cgroup_update_populated(link->cgrp, populated); | ||
661 | } | ||
662 | |||
663 | /** | ||
664 | * css_set_move_task - move a task from one css_set to another | ||
665 | * @task: task being moved | ||
666 | * @from_cset: css_set @task currently belongs to (may be NULL) | ||
667 | * @to_cset: new css_set @task is being moved to (may be NULL) | ||
668 | * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks | ||
669 | * | ||
670 | * Move @task from @from_cset to @to_cset. If @task didn't belong to any | ||
671 | * css_set, @from_cset can be NULL. If @task is being disassociated | ||
672 | * instead of moved, @to_cset can be NULL. | ||
673 | * | ||
674 | * This function automatically handles populated_cnt updates and | ||
675 | * css_task_iter adjustments but the caller is responsible for managing | ||
676 | * @from_cset and @to_cset's reference counts. | ||
677 | */ | ||
678 | static void css_set_move_task(struct task_struct *task, | ||
679 | struct css_set *from_cset, struct css_set *to_cset, | ||
680 | bool use_mg_tasks) | ||
681 | { | ||
682 | lockdep_assert_held(&css_set_lock); | ||
683 | |||
684 | if (from_cset) { | ||
685 | struct css_task_iter *it, *pos; | ||
686 | |||
687 | WARN_ON_ONCE(list_empty(&task->cg_list)); | ||
688 | |||
689 | /* | ||
690 | * @task is leaving, advance task iterators which are | ||
691 | * pointing to it so that they can resume at the next | ||
692 | * position. Advancing an iterator might remove it from | ||
693 | * the list, use safe walk. See css_task_iter_advance*() | ||
694 | * for details. | ||
695 | */ | ||
696 | list_for_each_entry_safe(it, pos, &from_cset->task_iters, | ||
697 | iters_node) | ||
698 | if (it->task_pos == &task->cg_list) | ||
699 | css_task_iter_advance(it); | ||
700 | |||
701 | list_del_init(&task->cg_list); | ||
702 | if (!css_set_populated(from_cset)) | ||
703 | css_set_update_populated(from_cset, false); | ||
704 | } else { | ||
705 | WARN_ON_ONCE(!list_empty(&task->cg_list)); | ||
706 | } | ||
707 | |||
708 | if (to_cset) { | ||
709 | /* | ||
710 | * We are synchronized through cgroup_threadgroup_rwsem | ||
711 | * against PF_EXITING setting such that we can't race | ||
712 | * against cgroup_exit() changing the css_set to | ||
713 | * init_css_set and dropping the old one. | ||
714 | */ | ||
715 | WARN_ON_ONCE(task->flags & PF_EXITING); | ||
716 | |||
717 | if (!css_set_populated(to_cset)) | ||
718 | css_set_update_populated(to_cset, true); | ||
719 | rcu_assign_pointer(task->cgroups, to_cset); | ||
720 | list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : | ||
721 | &to_cset->tasks); | ||
722 | } | ||
723 | } | ||
724 | |||
525 | /* | 725 | /* |
526 | * hash table for cgroup groups. This improves the performance to find | 726 | * hash table for cgroup groups. This improves the performance to find |
527 | * an existing css_set. This hash doesn't (currently) take into | 727 | * an existing css_set. This hash doesn't (currently) take into |
@@ -549,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset) | |||
549 | struct cgroup_subsys *ss; | 749 | struct cgroup_subsys *ss; |
550 | int ssid; | 750 | int ssid; |
551 | 751 | ||
552 | lockdep_assert_held(&css_set_rwsem); | 752 | lockdep_assert_held(&css_set_lock); |
553 | 753 | ||
554 | if (!atomic_dec_and_test(&cset->refcount)) | 754 | if (!atomic_dec_and_test(&cset->refcount)) |
555 | return; | 755 | return; |
@@ -561,17 +761,10 @@ static void put_css_set_locked(struct css_set *cset) | |||
561 | css_set_count--; | 761 | css_set_count--; |
562 | 762 | ||
563 | list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { | 763 | list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { |
564 | struct cgroup *cgrp = link->cgrp; | ||
565 | |||
566 | list_del(&link->cset_link); | 764 | list_del(&link->cset_link); |
567 | list_del(&link->cgrp_link); | 765 | list_del(&link->cgrp_link); |
568 | 766 | if (cgroup_parent(link->cgrp)) | |
569 | /* @cgrp can't go away while we're holding css_set_rwsem */ | 767 | cgroup_put(link->cgrp); |
570 | if (list_empty(&cgrp->cset_links)) { | ||
571 | cgroup_update_populated(cgrp, false); | ||
572 | check_for_release(cgrp); | ||
573 | } | ||
574 | |||
575 | kfree(link); | 768 | kfree(link); |
576 | } | 769 | } |
577 | 770 | ||
@@ -588,9 +781,9 @@ static void put_css_set(struct css_set *cset) | |||
588 | if (atomic_add_unless(&cset->refcount, -1, 1)) | 781 | if (atomic_add_unless(&cset->refcount, -1, 1)) |
589 | return; | 782 | return; |
590 | 783 | ||
591 | down_write(&css_set_rwsem); | 784 | spin_lock_bh(&css_set_lock); |
592 | put_css_set_locked(cset); | 785 | put_css_set_locked(cset); |
593 | up_write(&css_set_rwsem); | 786 | spin_unlock_bh(&css_set_lock); |
594 | } | 787 | } |
595 | 788 | ||
596 | /* | 789 | /* |
@@ -779,15 +972,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, | |||
779 | link->cset = cset; | 972 | link->cset = cset; |
780 | link->cgrp = cgrp; | 973 | link->cgrp = cgrp; |
781 | 974 | ||
782 | if (list_empty(&cgrp->cset_links)) | ||
783 | cgroup_update_populated(cgrp, true); | ||
784 | list_move(&link->cset_link, &cgrp->cset_links); | ||
785 | |||
786 | /* | 975 | /* |
787 | * Always add links to the tail of the list so that the list | 976 | * Always add links to the tail of the lists so that the lists are |
788 | * is sorted by order of hierarchy creation | 977 | * in choronological order. |
789 | */ | 978 | */ |
979 | list_move_tail(&link->cset_link, &cgrp->cset_links); | ||
790 | list_add_tail(&link->cgrp_link, &cset->cgrp_links); | 980 | list_add_tail(&link->cgrp_link, &cset->cgrp_links); |
981 | |||
982 | if (cgroup_parent(cgrp)) | ||
983 | cgroup_get(cgrp); | ||
791 | } | 984 | } |
792 | 985 | ||
793 | /** | 986 | /** |
@@ -813,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
813 | 1006 | ||
814 | /* First see if we already have a cgroup group that matches | 1007 | /* First see if we already have a cgroup group that matches |
815 | * the desired set */ | 1008 | * the desired set */ |
816 | down_read(&css_set_rwsem); | 1009 | spin_lock_bh(&css_set_lock); |
817 | cset = find_existing_css_set(old_cset, cgrp, template); | 1010 | cset = find_existing_css_set(old_cset, cgrp, template); |
818 | if (cset) | 1011 | if (cset) |
819 | get_css_set(cset); | 1012 | get_css_set(cset); |
820 | up_read(&css_set_rwsem); | 1013 | spin_unlock_bh(&css_set_lock); |
821 | 1014 | ||
822 | if (cset) | 1015 | if (cset) |
823 | return cset; | 1016 | return cset; |
@@ -838,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
838 | INIT_LIST_HEAD(&cset->mg_tasks); | 1031 | INIT_LIST_HEAD(&cset->mg_tasks); |
839 | INIT_LIST_HEAD(&cset->mg_preload_node); | 1032 | INIT_LIST_HEAD(&cset->mg_preload_node); |
840 | INIT_LIST_HEAD(&cset->mg_node); | 1033 | INIT_LIST_HEAD(&cset->mg_node); |
1034 | INIT_LIST_HEAD(&cset->task_iters); | ||
841 | INIT_HLIST_NODE(&cset->hlist); | 1035 | INIT_HLIST_NODE(&cset->hlist); |
842 | 1036 | ||
843 | /* Copy the set of subsystem state objects generated in | 1037 | /* Copy the set of subsystem state objects generated in |
844 | * find_existing_css_set() */ | 1038 | * find_existing_css_set() */ |
845 | memcpy(cset->subsys, template, sizeof(cset->subsys)); | 1039 | memcpy(cset->subsys, template, sizeof(cset->subsys)); |
846 | 1040 | ||
847 | down_write(&css_set_rwsem); | 1041 | spin_lock_bh(&css_set_lock); |
848 | /* Add reference counts and links from the new css_set. */ | 1042 | /* Add reference counts and links from the new css_set. */ |
849 | list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { | 1043 | list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { |
850 | struct cgroup *c = link->cgrp; | 1044 | struct cgroup *c = link->cgrp; |
@@ -866,53 +1060,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
866 | list_add_tail(&cset->e_cset_node[ssid], | 1060 | list_add_tail(&cset->e_cset_node[ssid], |
867 | &cset->subsys[ssid]->cgroup->e_csets[ssid]); | 1061 | &cset->subsys[ssid]->cgroup->e_csets[ssid]); |
868 | 1062 | ||
869 | up_write(&css_set_rwsem); | 1063 | spin_unlock_bh(&css_set_lock); |
870 | 1064 | ||
871 | return cset; | 1065 | return cset; |
872 | } | 1066 | } |
873 | 1067 | ||
874 | void cgroup_threadgroup_change_begin(struct task_struct *tsk) | ||
875 | { | ||
876 | down_read(&tsk->signal->group_rwsem); | ||
877 | } | ||
878 | |||
879 | void cgroup_threadgroup_change_end(struct task_struct *tsk) | ||
880 | { | ||
881 | up_read(&tsk->signal->group_rwsem); | ||
882 | } | ||
883 | |||
884 | /** | ||
885 | * threadgroup_lock - lock threadgroup | ||
886 | * @tsk: member task of the threadgroup to lock | ||
887 | * | ||
888 | * Lock the threadgroup @tsk belongs to. No new task is allowed to enter | ||
889 | * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or | ||
890 | * change ->group_leader/pid. This is useful for cases where the threadgroup | ||
891 | * needs to stay stable across blockable operations. | ||
892 | * | ||
893 | * fork and exit explicitly call threadgroup_change_{begin|end}() for | ||
894 | * synchronization. While held, no new task will be added to threadgroup | ||
895 | * and no existing live task will have its PF_EXITING set. | ||
896 | * | ||
897 | * de_thread() does threadgroup_change_{begin|end}() when a non-leader | ||
898 | * sub-thread becomes a new leader. | ||
899 | */ | ||
900 | static void threadgroup_lock(struct task_struct *tsk) | ||
901 | { | ||
902 | down_write(&tsk->signal->group_rwsem); | ||
903 | } | ||
904 | |||
905 | /** | ||
906 | * threadgroup_unlock - unlock threadgroup | ||
907 | * @tsk: member task of the threadgroup to unlock | ||
908 | * | ||
909 | * Reverse threadgroup_lock(). | ||
910 | */ | ||
911 | static inline void threadgroup_unlock(struct task_struct *tsk) | ||
912 | { | ||
913 | up_write(&tsk->signal->group_rwsem); | ||
914 | } | ||
915 | |||
916 | static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) | 1068 | static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) |
917 | { | 1069 | { |
918 | struct cgroup *root_cgrp = kf_root->kn->priv; | 1070 | struct cgroup *root_cgrp = kf_root->kn->priv; |
@@ -972,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
972 | * Release all the links from cset_links to this hierarchy's | 1124 | * Release all the links from cset_links to this hierarchy's |
973 | * root cgroup | 1125 | * root cgroup |
974 | */ | 1126 | */ |
975 | down_write(&css_set_rwsem); | 1127 | spin_lock_bh(&css_set_lock); |
976 | 1128 | ||
977 | list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { | 1129 | list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { |
978 | list_del(&link->cset_link); | 1130 | list_del(&link->cset_link); |
979 | list_del(&link->cgrp_link); | 1131 | list_del(&link->cgrp_link); |
980 | kfree(link); | 1132 | kfree(link); |
981 | } | 1133 | } |
982 | up_write(&css_set_rwsem); | 1134 | |
1135 | spin_unlock_bh(&css_set_lock); | ||
983 | 1136 | ||
984 | if (!list_empty(&root->root_list)) { | 1137 | if (!list_empty(&root->root_list)) { |
985 | list_del(&root->root_list); | 1138 | list_del(&root->root_list); |
@@ -1001,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, | |||
1001 | struct cgroup *res = NULL; | 1154 | struct cgroup *res = NULL; |
1002 | 1155 | ||
1003 | lockdep_assert_held(&cgroup_mutex); | 1156 | lockdep_assert_held(&cgroup_mutex); |
1004 | lockdep_assert_held(&css_set_rwsem); | 1157 | lockdep_assert_held(&css_set_lock); |
1005 | 1158 | ||
1006 | if (cset == &init_css_set) { | 1159 | if (cset == &init_css_set) { |
1007 | res = &root->cgrp; | 1160 | res = &root->cgrp; |
@@ -1024,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, | |||
1024 | 1177 | ||
1025 | /* | 1178 | /* |
1026 | * Return the cgroup for "task" from the given hierarchy. Must be | 1179 | * Return the cgroup for "task" from the given hierarchy. Must be |
1027 | * called with cgroup_mutex and css_set_rwsem held. | 1180 | * called with cgroup_mutex and css_set_lock held. |
1028 | */ | 1181 | */ |
1029 | static struct cgroup *task_cgroup_from_root(struct task_struct *task, | 1182 | static struct cgroup *task_cgroup_from_root(struct task_struct *task, |
1030 | struct cgroup_root *root) | 1183 | struct cgroup_root *root) |
@@ -1063,7 +1216,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
1063 | * update of a tasks cgroup pointer by cgroup_attach_task() | 1216 | * update of a tasks cgroup pointer by cgroup_attach_task() |
1064 | */ | 1217 | */ |
1065 | 1218 | ||
1066 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); | ||
1067 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; | 1219 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; |
1068 | static const struct file_operations proc_cgroupstats_operations; | 1220 | static const struct file_operations proc_cgroupstats_operations; |
1069 | 1221 | ||
@@ -1086,43 +1238,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, | |||
1086 | * cgroup_file_mode - deduce file mode of a control file | 1238 | * cgroup_file_mode - deduce file mode of a control file |
1087 | * @cft: the control file in question | 1239 | * @cft: the control file in question |
1088 | * | 1240 | * |
1089 | * returns cft->mode if ->mode is not 0 | 1241 | * S_IRUGO for read, S_IWUSR for write. |
1090 | * returns S_IRUGO|S_IWUSR if it has both a read and a write handler | ||
1091 | * returns S_IRUGO if it has only a read handler | ||
1092 | * returns S_IWUSR if it has only a write hander | ||
1093 | */ | 1242 | */ |
1094 | static umode_t cgroup_file_mode(const struct cftype *cft) | 1243 | static umode_t cgroup_file_mode(const struct cftype *cft) |
1095 | { | 1244 | { |
1096 | umode_t mode = 0; | 1245 | umode_t mode = 0; |
1097 | 1246 | ||
1098 | if (cft->mode) | ||
1099 | return cft->mode; | ||
1100 | |||
1101 | if (cft->read_u64 || cft->read_s64 || cft->seq_show) | 1247 | if (cft->read_u64 || cft->read_s64 || cft->seq_show) |
1102 | mode |= S_IRUGO; | 1248 | mode |= S_IRUGO; |
1103 | 1249 | ||
1104 | if (cft->write_u64 || cft->write_s64 || cft->write) | 1250 | if (cft->write_u64 || cft->write_s64 || cft->write) { |
1105 | mode |= S_IWUSR; | 1251 | if (cft->flags & CFTYPE_WORLD_WRITABLE) |
1252 | mode |= S_IWUGO; | ||
1253 | else | ||
1254 | mode |= S_IWUSR; | ||
1255 | } | ||
1106 | 1256 | ||
1107 | return mode; | 1257 | return mode; |
1108 | } | 1258 | } |
1109 | 1259 | ||
1110 | static void cgroup_get(struct cgroup *cgrp) | ||
1111 | { | ||
1112 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); | ||
1113 | css_get(&cgrp->self); | ||
1114 | } | ||
1115 | |||
1116 | static bool cgroup_tryget(struct cgroup *cgrp) | ||
1117 | { | ||
1118 | return css_tryget(&cgrp->self); | ||
1119 | } | ||
1120 | |||
1121 | static void cgroup_put(struct cgroup *cgrp) | ||
1122 | { | ||
1123 | css_put(&cgrp->self); | ||
1124 | } | ||
1125 | |||
1126 | /** | 1260 | /** |
1127 | * cgroup_calc_child_subsys_mask - calculate child_subsys_mask | 1261 | * cgroup_calc_child_subsys_mask - calculate child_subsys_mask |
1128 | * @cgrp: the target cgroup | 1262 | * @cgrp: the target cgroup |
@@ -1263,28 +1397,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
1263 | } | 1397 | } |
1264 | 1398 | ||
1265 | /** | 1399 | /** |
1266 | * cgroup_clear_dir - remove subsys files in a cgroup directory | 1400 | * css_clear_dir - remove subsys files in a cgroup directory |
1267 | * @cgrp: target cgroup | 1401 | * @css: target css |
1268 | * @subsys_mask: mask of the subsystem ids whose files should be removed | 1402 | * @cgrp_override: specify if target cgroup is different from css->cgroup |
1269 | */ | 1403 | */ |
1270 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) | 1404 | static void css_clear_dir(struct cgroup_subsys_state *css, |
1405 | struct cgroup *cgrp_override) | ||
1271 | { | 1406 | { |
1272 | struct cgroup_subsys *ss; | 1407 | struct cgroup *cgrp = cgrp_override ?: css->cgroup; |
1273 | int i; | 1408 | struct cftype *cfts; |
1274 | 1409 | ||
1275 | for_each_subsys(ss, i) { | 1410 | list_for_each_entry(cfts, &css->ss->cfts, node) |
1276 | struct cftype *cfts; | 1411 | cgroup_addrm_files(css, cgrp, cfts, false); |
1412 | } | ||
1277 | 1413 | ||
1278 | if (!(subsys_mask & (1 << i))) | 1414 | /** |
1279 | continue; | 1415 | * css_populate_dir - create subsys files in a cgroup directory |
1280 | list_for_each_entry(cfts, &ss->cfts, node) | 1416 | * @css: target css |
1281 | cgroup_addrm_files(cgrp, cfts, false); | 1417 | * @cgrp_overried: specify if target cgroup is different from css->cgroup |
1418 | * | ||
1419 | * On failure, no file is added. | ||
1420 | */ | ||
1421 | static int css_populate_dir(struct cgroup_subsys_state *css, | ||
1422 | struct cgroup *cgrp_override) | ||
1423 | { | ||
1424 | struct cgroup *cgrp = cgrp_override ?: css->cgroup; | ||
1425 | struct cftype *cfts, *failed_cfts; | ||
1426 | int ret; | ||
1427 | |||
1428 | if (!css->ss) { | ||
1429 | if (cgroup_on_dfl(cgrp)) | ||
1430 | cfts = cgroup_dfl_base_files; | ||
1431 | else | ||
1432 | cfts = cgroup_legacy_base_files; | ||
1433 | |||
1434 | return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); | ||
1435 | } | ||
1436 | |||
1437 | list_for_each_entry(cfts, &css->ss->cfts, node) { | ||
1438 | ret = cgroup_addrm_files(css, cgrp, cfts, true); | ||
1439 | if (ret < 0) { | ||
1440 | failed_cfts = cfts; | ||
1441 | goto err; | ||
1442 | } | ||
1282 | } | 1443 | } |
1444 | return 0; | ||
1445 | err: | ||
1446 | list_for_each_entry(cfts, &css->ss->cfts, node) { | ||
1447 | if (cfts == failed_cfts) | ||
1448 | break; | ||
1449 | cgroup_addrm_files(css, cgrp, cfts, false); | ||
1450 | } | ||
1451 | return ret; | ||
1283 | } | 1452 | } |
1284 | 1453 | ||
1285 | static int rebind_subsystems(struct cgroup_root *dst_root, | 1454 | static int rebind_subsystems(struct cgroup_root *dst_root, |
1286 | unsigned long ss_mask) | 1455 | unsigned long ss_mask) |
1287 | { | 1456 | { |
1457 | struct cgroup *dcgrp = &dst_root->cgrp; | ||
1288 | struct cgroup_subsys *ss; | 1458 | struct cgroup_subsys *ss; |
1289 | unsigned long tmp_ss_mask; | 1459 | unsigned long tmp_ss_mask; |
1290 | int ssid, i, ret; | 1460 | int ssid, i, ret; |
@@ -1306,10 +1476,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root, | |||
1306 | if (dst_root == &cgrp_dfl_root) | 1476 | if (dst_root == &cgrp_dfl_root) |
1307 | tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; | 1477 | tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; |
1308 | 1478 | ||
1309 | ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); | 1479 | for_each_subsys_which(ss, ssid, &tmp_ss_mask) { |
1310 | if (ret) { | 1480 | struct cgroup *scgrp = &ss->root->cgrp; |
1311 | if (dst_root != &cgrp_dfl_root) | 1481 | int tssid; |
1312 | return ret; | 1482 | |
1483 | ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); | ||
1484 | if (!ret) | ||
1485 | continue; | ||
1313 | 1486 | ||
1314 | /* | 1487 | /* |
1315 | * Rebinding back to the default root is not allowed to | 1488 | * Rebinding back to the default root is not allowed to |
@@ -1317,57 +1490,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root, | |||
1317 | * be rare. Moving subsystems back and forth even more so. | 1490 | * be rare. Moving subsystems back and forth even more so. |
1318 | * Just warn about it and continue. | 1491 | * Just warn about it and continue. |
1319 | */ | 1492 | */ |
1320 | if (cgrp_dfl_root_visible) { | 1493 | if (dst_root == &cgrp_dfl_root) { |
1321 | pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", | 1494 | if (cgrp_dfl_root_visible) { |
1322 | ret, ss_mask); | 1495 | pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", |
1323 | pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); | 1496 | ret, ss_mask); |
1497 | pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); | ||
1498 | } | ||
1499 | continue; | ||
1500 | } | ||
1501 | |||
1502 | for_each_subsys_which(ss, tssid, &tmp_ss_mask) { | ||
1503 | if (tssid == ssid) | ||
1504 | break; | ||
1505 | css_clear_dir(cgroup_css(scgrp, ss), dcgrp); | ||
1324 | } | 1506 | } |
1507 | return ret; | ||
1325 | } | 1508 | } |
1326 | 1509 | ||
1327 | /* | 1510 | /* |
1328 | * Nothing can fail from this point on. Remove files for the | 1511 | * Nothing can fail from this point on. Remove files for the |
1329 | * removed subsystems and rebind each subsystem. | 1512 | * removed subsystems and rebind each subsystem. |
1330 | */ | 1513 | */ |
1331 | for_each_subsys_which(ss, ssid, &ss_mask) | ||
1332 | cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); | ||
1333 | |||
1334 | for_each_subsys_which(ss, ssid, &ss_mask) { | 1514 | for_each_subsys_which(ss, ssid, &ss_mask) { |
1335 | struct cgroup_root *src_root; | 1515 | struct cgroup_root *src_root = ss->root; |
1336 | struct cgroup_subsys_state *css; | 1516 | struct cgroup *scgrp = &src_root->cgrp; |
1517 | struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); | ||
1337 | struct css_set *cset; | 1518 | struct css_set *cset; |
1338 | 1519 | ||
1339 | src_root = ss->root; | 1520 | WARN_ON(!css || cgroup_css(dcgrp, ss)); |
1340 | css = cgroup_css(&src_root->cgrp, ss); | ||
1341 | 1521 | ||
1342 | WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); | 1522 | css_clear_dir(css, NULL); |
1343 | 1523 | ||
1344 | RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); | 1524 | RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); |
1345 | rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); | 1525 | rcu_assign_pointer(dcgrp->subsys[ssid], css); |
1346 | ss->root = dst_root; | 1526 | ss->root = dst_root; |
1347 | css->cgroup = &dst_root->cgrp; | 1527 | css->cgroup = dcgrp; |
1348 | 1528 | ||
1349 | down_write(&css_set_rwsem); | 1529 | spin_lock_bh(&css_set_lock); |
1350 | hash_for_each(css_set_table, i, cset, hlist) | 1530 | hash_for_each(css_set_table, i, cset, hlist) |
1351 | list_move_tail(&cset->e_cset_node[ss->id], | 1531 | list_move_tail(&cset->e_cset_node[ss->id], |
1352 | &dst_root->cgrp.e_csets[ss->id]); | 1532 | &dcgrp->e_csets[ss->id]); |
1353 | up_write(&css_set_rwsem); | 1533 | spin_unlock_bh(&css_set_lock); |
1354 | 1534 | ||
1355 | src_root->subsys_mask &= ~(1 << ssid); | 1535 | src_root->subsys_mask &= ~(1 << ssid); |
1356 | src_root->cgrp.subtree_control &= ~(1 << ssid); | 1536 | scgrp->subtree_control &= ~(1 << ssid); |
1357 | cgroup_refresh_child_subsys_mask(&src_root->cgrp); | 1537 | cgroup_refresh_child_subsys_mask(scgrp); |
1358 | 1538 | ||
1359 | /* default hierarchy doesn't enable controllers by default */ | 1539 | /* default hierarchy doesn't enable controllers by default */ |
1360 | dst_root->subsys_mask |= 1 << ssid; | 1540 | dst_root->subsys_mask |= 1 << ssid; |
1361 | if (dst_root != &cgrp_dfl_root) { | 1541 | if (dst_root == &cgrp_dfl_root) { |
1362 | dst_root->cgrp.subtree_control |= 1 << ssid; | 1542 | static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); |
1363 | cgroup_refresh_child_subsys_mask(&dst_root->cgrp); | 1543 | } else { |
1544 | dcgrp->subtree_control |= 1 << ssid; | ||
1545 | cgroup_refresh_child_subsys_mask(dcgrp); | ||
1546 | static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); | ||
1364 | } | 1547 | } |
1365 | 1548 | ||
1366 | if (ss->bind) | 1549 | if (ss->bind) |
1367 | ss->bind(css); | 1550 | ss->bind(css); |
1368 | } | 1551 | } |
1369 | 1552 | ||
1370 | kernfs_activate(dst_root->cgrp.kn); | 1553 | kernfs_activate(dcgrp->kn); |
1371 | return 0; | 1554 | return 0; |
1372 | } | 1555 | } |
1373 | 1556 | ||
@@ -1497,7 +1680,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1497 | for_each_subsys(ss, i) { | 1680 | for_each_subsys(ss, i) { |
1498 | if (strcmp(token, ss->legacy_name)) | 1681 | if (strcmp(token, ss->legacy_name)) |
1499 | continue; | 1682 | continue; |
1500 | if (ss->disabled) | 1683 | if (!cgroup_ssid_enabled(i)) |
1501 | continue; | 1684 | continue; |
1502 | 1685 | ||
1503 | /* Mutually exclusive option 'all' + subsystem name */ | 1686 | /* Mutually exclusive option 'all' + subsystem name */ |
@@ -1528,7 +1711,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1528 | */ | 1711 | */ |
1529 | if (all_ss || (!one_ss && !opts->none && !opts->name)) | 1712 | if (all_ss || (!one_ss && !opts->none && !opts->name)) |
1530 | for_each_subsys(ss, i) | 1713 | for_each_subsys(ss, i) |
1531 | if (!ss->disabled) | 1714 | if (cgroup_ssid_enabled(i)) |
1532 | opts->subsys_mask |= (1 << i); | 1715 | opts->subsys_mask |= (1 << i); |
1533 | 1716 | ||
1534 | /* | 1717 | /* |
@@ -1624,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void) | |||
1624 | { | 1807 | { |
1625 | struct task_struct *p, *g; | 1808 | struct task_struct *p, *g; |
1626 | 1809 | ||
1627 | down_write(&css_set_rwsem); | 1810 | spin_lock_bh(&css_set_lock); |
1628 | 1811 | ||
1629 | if (use_task_css_set_links) | 1812 | if (use_task_css_set_links) |
1630 | goto out_unlock; | 1813 | goto out_unlock; |
@@ -1654,14 +1837,16 @@ static void cgroup_enable_task_cg_lists(void) | |||
1654 | if (!(p->flags & PF_EXITING)) { | 1837 | if (!(p->flags & PF_EXITING)) { |
1655 | struct css_set *cset = task_css_set(p); | 1838 | struct css_set *cset = task_css_set(p); |
1656 | 1839 | ||
1657 | list_add(&p->cg_list, &cset->tasks); | 1840 | if (!css_set_populated(cset)) |
1841 | css_set_update_populated(cset, true); | ||
1842 | list_add_tail(&p->cg_list, &cset->tasks); | ||
1658 | get_css_set(cset); | 1843 | get_css_set(cset); |
1659 | } | 1844 | } |
1660 | spin_unlock_irq(&p->sighand->siglock); | 1845 | spin_unlock_irq(&p->sighand->siglock); |
1661 | } while_each_thread(g, p); | 1846 | } while_each_thread(g, p); |
1662 | read_unlock(&tasklist_lock); | 1847 | read_unlock(&tasklist_lock); |
1663 | out_unlock: | 1848 | out_unlock: |
1664 | up_write(&css_set_rwsem); | 1849 | spin_unlock_bh(&css_set_lock); |
1665 | } | 1850 | } |
1666 | 1851 | ||
1667 | static void init_cgroup_housekeeping(struct cgroup *cgrp) | 1852 | static void init_cgroup_housekeeping(struct cgroup *cgrp) |
@@ -1671,6 +1856,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1671 | 1856 | ||
1672 | INIT_LIST_HEAD(&cgrp->self.sibling); | 1857 | INIT_LIST_HEAD(&cgrp->self.sibling); |
1673 | INIT_LIST_HEAD(&cgrp->self.children); | 1858 | INIT_LIST_HEAD(&cgrp->self.children); |
1859 | INIT_LIST_HEAD(&cgrp->self.files); | ||
1674 | INIT_LIST_HEAD(&cgrp->cset_links); | 1860 | INIT_LIST_HEAD(&cgrp->cset_links); |
1675 | INIT_LIST_HEAD(&cgrp->pidlists); | 1861 | INIT_LIST_HEAD(&cgrp->pidlists); |
1676 | mutex_init(&cgrp->pidlist_mutex); | 1862 | mutex_init(&cgrp->pidlist_mutex); |
@@ -1708,7 +1894,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1708 | { | 1894 | { |
1709 | LIST_HEAD(tmp_links); | 1895 | LIST_HEAD(tmp_links); |
1710 | struct cgroup *root_cgrp = &root->cgrp; | 1896 | struct cgroup *root_cgrp = &root->cgrp; |
1711 | struct cftype *base_files; | ||
1712 | struct css_set *cset; | 1897 | struct css_set *cset; |
1713 | int i, ret; | 1898 | int i, ret; |
1714 | 1899 | ||
@@ -1725,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1725 | goto out; | 1910 | goto out; |
1726 | 1911 | ||
1727 | /* | 1912 | /* |
1728 | * We're accessing css_set_count without locking css_set_rwsem here, | 1913 | * We're accessing css_set_count without locking css_set_lock here, |
1729 | * but that's OK - it can only be increased by someone holding | 1914 | * but that's OK - it can only be increased by someone holding |
1730 | * cgroup_lock, and that's us. The worst that can happen is that we | 1915 | * cgroup_lock, and that's us. The worst that can happen is that we |
1731 | * have some link structures left over | 1916 | * have some link structures left over |
@@ -1747,12 +1932,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1747 | } | 1932 | } |
1748 | root_cgrp->kn = root->kf_root->kn; | 1933 | root_cgrp->kn = root->kf_root->kn; |
1749 | 1934 | ||
1750 | if (root == &cgrp_dfl_root) | 1935 | ret = css_populate_dir(&root_cgrp->self, NULL); |
1751 | base_files = cgroup_dfl_base_files; | ||
1752 | else | ||
1753 | base_files = cgroup_legacy_base_files; | ||
1754 | |||
1755 | ret = cgroup_addrm_files(root_cgrp, base_files, true); | ||
1756 | if (ret) | 1936 | if (ret) |
1757 | goto destroy_root; | 1937 | goto destroy_root; |
1758 | 1938 | ||
@@ -1772,10 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1772 | * Link the root cgroup in this hierarchy into all the css_set | 1952 | * Link the root cgroup in this hierarchy into all the css_set |
1773 | * objects. | 1953 | * objects. |
1774 | */ | 1954 | */ |
1775 | down_write(&css_set_rwsem); | 1955 | spin_lock_bh(&css_set_lock); |
1776 | hash_for_each(css_set_table, i, cset, hlist) | 1956 | hash_for_each(css_set_table, i, cset, hlist) { |
1777 | link_css_set(&tmp_links, cset, root_cgrp); | 1957 | link_css_set(&tmp_links, cset, root_cgrp); |
1778 | up_write(&css_set_rwsem); | 1958 | if (css_set_populated(cset)) |
1959 | cgroup_update_populated(root_cgrp, true); | ||
1960 | } | ||
1961 | spin_unlock_bh(&css_set_lock); | ||
1779 | 1962 | ||
1780 | BUG_ON(!list_empty(&root_cgrp->self.children)); | 1963 | BUG_ON(!list_empty(&root_cgrp->self.children)); |
1781 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); | 1964 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); |
@@ -2008,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | |||
2008 | char *path = NULL; | 2191 | char *path = NULL; |
2009 | 2192 | ||
2010 | mutex_lock(&cgroup_mutex); | 2193 | mutex_lock(&cgroup_mutex); |
2011 | down_read(&css_set_rwsem); | 2194 | spin_lock_bh(&css_set_lock); |
2012 | 2195 | ||
2013 | root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); | 2196 | root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); |
2014 | 2197 | ||
@@ -2021,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | |||
2021 | path = buf; | 2204 | path = buf; |
2022 | } | 2205 | } |
2023 | 2206 | ||
2024 | up_read(&css_set_rwsem); | 2207 | spin_unlock_bh(&css_set_lock); |
2025 | mutex_unlock(&cgroup_mutex); | 2208 | mutex_unlock(&cgroup_mutex); |
2026 | return path; | 2209 | return path; |
2027 | } | 2210 | } |
@@ -2049,6 +2232,49 @@ struct cgroup_taskset { | |||
2049 | struct task_struct *cur_task; | 2232 | struct task_struct *cur_task; |
2050 | }; | 2233 | }; |
2051 | 2234 | ||
2235 | #define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ | ||
2236 | .src_csets = LIST_HEAD_INIT(tset.src_csets), \ | ||
2237 | .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ | ||
2238 | .csets = &tset.src_csets, \ | ||
2239 | } | ||
2240 | |||
2241 | /** | ||
2242 | * cgroup_taskset_add - try to add a migration target task to a taskset | ||
2243 | * @task: target task | ||
2244 | * @tset: target taskset | ||
2245 | * | ||
2246 | * Add @task, which is a migration target, to @tset. This function becomes | ||
2247 | * noop if @task doesn't need to be migrated. @task's css_set should have | ||
2248 | * been added as a migration source and @task->cg_list will be moved from | ||
2249 | * the css_set's tasks list to mg_tasks one. | ||
2250 | */ | ||
2251 | static void cgroup_taskset_add(struct task_struct *task, | ||
2252 | struct cgroup_taskset *tset) | ||
2253 | { | ||
2254 | struct css_set *cset; | ||
2255 | |||
2256 | lockdep_assert_held(&css_set_lock); | ||
2257 | |||
2258 | /* @task either already exited or can't exit until the end */ | ||
2259 | if (task->flags & PF_EXITING) | ||
2260 | return; | ||
2261 | |||
2262 | /* leave @task alone if post_fork() hasn't linked it yet */ | ||
2263 | if (list_empty(&task->cg_list)) | ||
2264 | return; | ||
2265 | |||
2266 | cset = task_css_set(task); | ||
2267 | if (!cset->mg_src_cgrp) | ||
2268 | return; | ||
2269 | |||
2270 | list_move_tail(&task->cg_list, &cset->mg_tasks); | ||
2271 | if (list_empty(&cset->mg_node)) | ||
2272 | list_add_tail(&cset->mg_node, &tset->src_csets); | ||
2273 | if (list_empty(&cset->mg_dst_cset->mg_node)) | ||
2274 | list_move_tail(&cset->mg_dst_cset->mg_node, | ||
2275 | &tset->dst_csets); | ||
2276 | } | ||
2277 | |||
2052 | /** | 2278 | /** |
2053 | * cgroup_taskset_first - reset taskset and return the first task | 2279 | * cgroup_taskset_first - reset taskset and return the first task |
2054 | * @tset: taskset of interest | 2280 | * @tset: taskset of interest |
@@ -2096,47 +2322,86 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) | |||
2096 | } | 2322 | } |
2097 | 2323 | ||
2098 | /** | 2324 | /** |
2099 | * cgroup_task_migrate - move a task from one cgroup to another. | 2325 | * cgroup_taskset_migrate - migrate a taskset to a cgroup |
2100 | * @old_cgrp: the cgroup @tsk is being migrated from | 2326 | * @tset: target taskset |
2101 | * @tsk: the task being migrated | 2327 | * @dst_cgrp: destination cgroup |
2102 | * @new_cset: the new css_set @tsk is being attached to | ||
2103 | * | 2328 | * |
2104 | * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. | 2329 | * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the |
2330 | * ->can_attach callbacks fails and guarantees that either all or none of | ||
2331 | * the tasks in @tset are migrated. @tset is consumed regardless of | ||
2332 | * success. | ||
2105 | */ | 2333 | */ |
2106 | static void cgroup_task_migrate(struct cgroup *old_cgrp, | 2334 | static int cgroup_taskset_migrate(struct cgroup_taskset *tset, |
2107 | struct task_struct *tsk, | 2335 | struct cgroup *dst_cgrp) |
2108 | struct css_set *new_cset) | ||
2109 | { | 2336 | { |
2110 | struct css_set *old_cset; | 2337 | struct cgroup_subsys_state *css, *failed_css = NULL; |
2111 | 2338 | struct task_struct *task, *tmp_task; | |
2112 | lockdep_assert_held(&cgroup_mutex); | 2339 | struct css_set *cset, *tmp_cset; |
2113 | lockdep_assert_held(&css_set_rwsem); | 2340 | int i, ret; |
2114 | 2341 | ||
2115 | /* | 2342 | /* methods shouldn't be called if no task is actually migrating */ |
2116 | * We are synchronized through threadgroup_lock() against PF_EXITING | 2343 | if (list_empty(&tset->src_csets)) |
2117 | * setting such that we can't race against cgroup_exit() changing the | 2344 | return 0; |
2118 | * css_set to init_css_set and dropping the old one. | ||
2119 | */ | ||
2120 | WARN_ON_ONCE(tsk->flags & PF_EXITING); | ||
2121 | old_cset = task_css_set(tsk); | ||
2122 | 2345 | ||
2123 | get_css_set(new_cset); | 2346 | /* check that we can legitimately attach to the cgroup */ |
2124 | rcu_assign_pointer(tsk->cgroups, new_cset); | 2347 | for_each_e_css(css, i, dst_cgrp) { |
2348 | if (css->ss->can_attach) { | ||
2349 | ret = css->ss->can_attach(css, tset); | ||
2350 | if (ret) { | ||
2351 | failed_css = css; | ||
2352 | goto out_cancel_attach; | ||
2353 | } | ||
2354 | } | ||
2355 | } | ||
2125 | 2356 | ||
2126 | /* | 2357 | /* |
2127 | * Use move_tail so that cgroup_taskset_first() still returns the | 2358 | * Now that we're guaranteed success, proceed to move all tasks to |
2128 | * leader after migration. This works because cgroup_migrate() | 2359 | * the new cgroup. There are no failure cases after here, so this |
2129 | * ensures that the dst_cset of the leader is the first on the | 2360 | * is the commit point. |
2130 | * tset's dst_csets list. | ||
2131 | */ | 2361 | */ |
2132 | list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); | 2362 | spin_lock_bh(&css_set_lock); |
2363 | list_for_each_entry(cset, &tset->src_csets, mg_node) { | ||
2364 | list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { | ||
2365 | struct css_set *from_cset = task_css_set(task); | ||
2366 | struct css_set *to_cset = cset->mg_dst_cset; | ||
2367 | |||
2368 | get_css_set(to_cset); | ||
2369 | css_set_move_task(task, from_cset, to_cset, true); | ||
2370 | put_css_set_locked(from_cset); | ||
2371 | } | ||
2372 | } | ||
2373 | spin_unlock_bh(&css_set_lock); | ||
2133 | 2374 | ||
2134 | /* | 2375 | /* |
2135 | * We just gained a reference on old_cset by taking it from the | 2376 | * Migration is committed, all target tasks are now on dst_csets. |
2136 | * task. As trading it for new_cset is protected by cgroup_mutex, | 2377 | * Nothing is sensitive to fork() after this point. Notify |
2137 | * we're safe to drop it here; it will be freed under RCU. | 2378 | * controllers that migration is complete. |
2138 | */ | 2379 | */ |
2139 | put_css_set_locked(old_cset); | 2380 | tset->csets = &tset->dst_csets; |
2381 | |||
2382 | for_each_e_css(css, i, dst_cgrp) | ||
2383 | if (css->ss->attach) | ||
2384 | css->ss->attach(css, tset); | ||
2385 | |||
2386 | ret = 0; | ||
2387 | goto out_release_tset; | ||
2388 | |||
2389 | out_cancel_attach: | ||
2390 | for_each_e_css(css, i, dst_cgrp) { | ||
2391 | if (css == failed_css) | ||
2392 | break; | ||
2393 | if (css->ss->cancel_attach) | ||
2394 | css->ss->cancel_attach(css, tset); | ||
2395 | } | ||
2396 | out_release_tset: | ||
2397 | spin_lock_bh(&css_set_lock); | ||
2398 | list_splice_init(&tset->dst_csets, &tset->src_csets); | ||
2399 | list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { | ||
2400 | list_splice_tail_init(&cset->mg_tasks, &cset->tasks); | ||
2401 | list_del_init(&cset->mg_node); | ||
2402 | } | ||
2403 | spin_unlock_bh(&css_set_lock); | ||
2404 | return ret; | ||
2140 | } | 2405 | } |
2141 | 2406 | ||
2142 | /** | 2407 | /** |
@@ -2152,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) | |||
2152 | 2417 | ||
2153 | lockdep_assert_held(&cgroup_mutex); | 2418 | lockdep_assert_held(&cgroup_mutex); |
2154 | 2419 | ||
2155 | down_write(&css_set_rwsem); | 2420 | spin_lock_bh(&css_set_lock); |
2156 | list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { | 2421 | list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { |
2157 | cset->mg_src_cgrp = NULL; | 2422 | cset->mg_src_cgrp = NULL; |
2158 | cset->mg_dst_cset = NULL; | 2423 | cset->mg_dst_cset = NULL; |
2159 | list_del_init(&cset->mg_preload_node); | 2424 | list_del_init(&cset->mg_preload_node); |
2160 | put_css_set_locked(cset); | 2425 | put_css_set_locked(cset); |
2161 | } | 2426 | } |
2162 | up_write(&css_set_rwsem); | 2427 | spin_unlock_bh(&css_set_lock); |
2163 | } | 2428 | } |
2164 | 2429 | ||
2165 | /** | 2430 | /** |
@@ -2172,10 +2437,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) | |||
2172 | * @src_cset and add it to @preloaded_csets, which should later be cleaned | 2437 | * @src_cset and add it to @preloaded_csets, which should later be cleaned |
2173 | * up by cgroup_migrate_finish(). | 2438 | * up by cgroup_migrate_finish(). |
2174 | * | 2439 | * |
2175 | * This function may be called without holding threadgroup_lock even if the | 2440 | * This function may be called without holding cgroup_threadgroup_rwsem |
2176 | * target is a process. Threads may be created and destroyed but as long | 2441 | * even if the target is a process. Threads may be created and destroyed |
2177 | * as cgroup_mutex is not dropped, no new css_set can be put into play and | 2442 | * but as long as cgroup_mutex is not dropped, no new css_set can be put |
2178 | * the preloaded css_sets are guaranteed to cover all migrations. | 2443 | * into play and the preloaded css_sets are guaranteed to cover all |
2444 | * migrations. | ||
2179 | */ | 2445 | */ |
2180 | static void cgroup_migrate_add_src(struct css_set *src_cset, | 2446 | static void cgroup_migrate_add_src(struct css_set *src_cset, |
2181 | struct cgroup *dst_cgrp, | 2447 | struct cgroup *dst_cgrp, |
@@ -2184,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, | |||
2184 | struct cgroup *src_cgrp; | 2450 | struct cgroup *src_cgrp; |
2185 | 2451 | ||
2186 | lockdep_assert_held(&cgroup_mutex); | 2452 | lockdep_assert_held(&cgroup_mutex); |
2187 | lockdep_assert_held(&css_set_rwsem); | 2453 | lockdep_assert_held(&css_set_lock); |
2188 | 2454 | ||
2189 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); | 2455 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); |
2190 | 2456 | ||
@@ -2273,12 +2539,12 @@ err: | |||
2273 | 2539 | ||
2274 | /** | 2540 | /** |
2275 | * cgroup_migrate - migrate a process or task to a cgroup | 2541 | * cgroup_migrate - migrate a process or task to a cgroup |
2276 | * @cgrp: the destination cgroup | ||
2277 | * @leader: the leader of the process or the task to migrate | 2542 | * @leader: the leader of the process or the task to migrate |
2278 | * @threadgroup: whether @leader points to the whole process or a single task | 2543 | * @threadgroup: whether @leader points to the whole process or a single task |
2544 | * @cgrp: the destination cgroup | ||
2279 | * | 2545 | * |
2280 | * Migrate a process or task denoted by @leader to @cgrp. If migrating a | 2546 | * Migrate a process or task denoted by @leader to @cgrp. If migrating a |
2281 | * process, the caller must be holding threadgroup_lock of @leader. The | 2547 | * process, the caller must be holding cgroup_threadgroup_rwsem. The |
2282 | * caller is also responsible for invoking cgroup_migrate_add_src() and | 2548 | * caller is also responsible for invoking cgroup_migrate_add_src() and |
2283 | * cgroup_migrate_prepare_dst() on the targets before invoking this | 2549 | * cgroup_migrate_prepare_dst() on the targets before invoking this |
2284 | * function and following up with cgroup_migrate_finish(). | 2550 | * function and following up with cgroup_migrate_finish(). |
@@ -2289,115 +2555,29 @@ err: | |||
2289 | * decided for all targets by invoking group_migrate_prepare_dst() before | 2555 | * decided for all targets by invoking group_migrate_prepare_dst() before |
2290 | * actually starting migrating. | 2556 | * actually starting migrating. |
2291 | */ | 2557 | */ |
2292 | static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | 2558 | static int cgroup_migrate(struct task_struct *leader, bool threadgroup, |
2293 | bool threadgroup) | 2559 | struct cgroup *cgrp) |
2294 | { | 2560 | { |
2295 | struct cgroup_taskset tset = { | 2561 | struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); |
2296 | .src_csets = LIST_HEAD_INIT(tset.src_csets), | 2562 | struct task_struct *task; |
2297 | .dst_csets = LIST_HEAD_INIT(tset.dst_csets), | ||
2298 | .csets = &tset.src_csets, | ||
2299 | }; | ||
2300 | struct cgroup_subsys_state *css, *failed_css = NULL; | ||
2301 | struct css_set *cset, *tmp_cset; | ||
2302 | struct task_struct *task, *tmp_task; | ||
2303 | int i, ret; | ||
2304 | 2563 | ||
2305 | /* | 2564 | /* |
2306 | * Prevent freeing of tasks while we take a snapshot. Tasks that are | 2565 | * Prevent freeing of tasks while we take a snapshot. Tasks that are |
2307 | * already PF_EXITING could be freed from underneath us unless we | 2566 | * already PF_EXITING could be freed from underneath us unless we |
2308 | * take an rcu_read_lock. | 2567 | * take an rcu_read_lock. |
2309 | */ | 2568 | */ |
2310 | down_write(&css_set_rwsem); | 2569 | spin_lock_bh(&css_set_lock); |
2311 | rcu_read_lock(); | 2570 | rcu_read_lock(); |
2312 | task = leader; | 2571 | task = leader; |
2313 | do { | 2572 | do { |
2314 | /* @task either already exited or can't exit until the end */ | 2573 | cgroup_taskset_add(task, &tset); |
2315 | if (task->flags & PF_EXITING) | ||
2316 | goto next; | ||
2317 | |||
2318 | /* leave @task alone if post_fork() hasn't linked it yet */ | ||
2319 | if (list_empty(&task->cg_list)) | ||
2320 | goto next; | ||
2321 | |||
2322 | cset = task_css_set(task); | ||
2323 | if (!cset->mg_src_cgrp) | ||
2324 | goto next; | ||
2325 | |||
2326 | /* | ||
2327 | * cgroup_taskset_first() must always return the leader. | ||
2328 | * Take care to avoid disturbing the ordering. | ||
2329 | */ | ||
2330 | list_move_tail(&task->cg_list, &cset->mg_tasks); | ||
2331 | if (list_empty(&cset->mg_node)) | ||
2332 | list_add_tail(&cset->mg_node, &tset.src_csets); | ||
2333 | if (list_empty(&cset->mg_dst_cset->mg_node)) | ||
2334 | list_move_tail(&cset->mg_dst_cset->mg_node, | ||
2335 | &tset.dst_csets); | ||
2336 | next: | ||
2337 | if (!threadgroup) | 2574 | if (!threadgroup) |
2338 | break; | 2575 | break; |
2339 | } while_each_thread(leader, task); | 2576 | } while_each_thread(leader, task); |
2340 | rcu_read_unlock(); | 2577 | rcu_read_unlock(); |
2341 | up_write(&css_set_rwsem); | 2578 | spin_unlock_bh(&css_set_lock); |
2342 | |||
2343 | /* methods shouldn't be called if no task is actually migrating */ | ||
2344 | if (list_empty(&tset.src_csets)) | ||
2345 | return 0; | ||
2346 | |||
2347 | /* check that we can legitimately attach to the cgroup */ | ||
2348 | for_each_e_css(css, i, cgrp) { | ||
2349 | if (css->ss->can_attach) { | ||
2350 | ret = css->ss->can_attach(css, &tset); | ||
2351 | if (ret) { | ||
2352 | failed_css = css; | ||
2353 | goto out_cancel_attach; | ||
2354 | } | ||
2355 | } | ||
2356 | } | ||
2357 | |||
2358 | /* | ||
2359 | * Now that we're guaranteed success, proceed to move all tasks to | ||
2360 | * the new cgroup. There are no failure cases after here, so this | ||
2361 | * is the commit point. | ||
2362 | */ | ||
2363 | down_write(&css_set_rwsem); | ||
2364 | list_for_each_entry(cset, &tset.src_csets, mg_node) { | ||
2365 | list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) | ||
2366 | cgroup_task_migrate(cset->mg_src_cgrp, task, | ||
2367 | cset->mg_dst_cset); | ||
2368 | } | ||
2369 | up_write(&css_set_rwsem); | ||
2370 | |||
2371 | /* | ||
2372 | * Migration is committed, all target tasks are now on dst_csets. | ||
2373 | * Nothing is sensitive to fork() after this point. Notify | ||
2374 | * controllers that migration is complete. | ||
2375 | */ | ||
2376 | tset.csets = &tset.dst_csets; | ||
2377 | |||
2378 | for_each_e_css(css, i, cgrp) | ||
2379 | if (css->ss->attach) | ||
2380 | css->ss->attach(css, &tset); | ||
2381 | |||
2382 | ret = 0; | ||
2383 | goto out_release_tset; | ||
2384 | 2579 | ||
2385 | out_cancel_attach: | 2580 | return cgroup_taskset_migrate(&tset, cgrp); |
2386 | for_each_e_css(css, i, cgrp) { | ||
2387 | if (css == failed_css) | ||
2388 | break; | ||
2389 | if (css->ss->cancel_attach) | ||
2390 | css->ss->cancel_attach(css, &tset); | ||
2391 | } | ||
2392 | out_release_tset: | ||
2393 | down_write(&css_set_rwsem); | ||
2394 | list_splice_init(&tset.dst_csets, &tset.src_csets); | ||
2395 | list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { | ||
2396 | list_splice_tail_init(&cset->mg_tasks, &cset->tasks); | ||
2397 | list_del_init(&cset->mg_node); | ||
2398 | } | ||
2399 | up_write(&css_set_rwsem); | ||
2400 | return ret; | ||
2401 | } | 2581 | } |
2402 | 2582 | ||
2403 | /** | 2583 | /** |
@@ -2406,7 +2586,7 @@ out_release_tset: | |||
2406 | * @leader: the task or the leader of the threadgroup to be attached | 2586 | * @leader: the task or the leader of the threadgroup to be attached |
2407 | * @threadgroup: attach the whole threadgroup? | 2587 | * @threadgroup: attach the whole threadgroup? |
2408 | * | 2588 | * |
2409 | * Call holding cgroup_mutex and threadgroup_lock of @leader. | 2589 | * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. |
2410 | */ | 2590 | */ |
2411 | static int cgroup_attach_task(struct cgroup *dst_cgrp, | 2591 | static int cgroup_attach_task(struct cgroup *dst_cgrp, |
2412 | struct task_struct *leader, bool threadgroup) | 2592 | struct task_struct *leader, bool threadgroup) |
@@ -2416,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
2416 | int ret; | 2596 | int ret; |
2417 | 2597 | ||
2418 | /* look up all src csets */ | 2598 | /* look up all src csets */ |
2419 | down_read(&css_set_rwsem); | 2599 | spin_lock_bh(&css_set_lock); |
2420 | rcu_read_lock(); | 2600 | rcu_read_lock(); |
2421 | task = leader; | 2601 | task = leader; |
2422 | do { | 2602 | do { |
@@ -2426,12 +2606,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
2426 | break; | 2606 | break; |
2427 | } while_each_thread(leader, task); | 2607 | } while_each_thread(leader, task); |
2428 | rcu_read_unlock(); | 2608 | rcu_read_unlock(); |
2429 | up_read(&css_set_rwsem); | 2609 | spin_unlock_bh(&css_set_lock); |
2430 | 2610 | ||
2431 | /* prepare dst csets and commit */ | 2611 | /* prepare dst csets and commit */ |
2432 | ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); | 2612 | ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); |
2433 | if (!ret) | 2613 | if (!ret) |
2434 | ret = cgroup_migrate(dst_cgrp, leader, threadgroup); | 2614 | ret = cgroup_migrate(leader, threadgroup, dst_cgrp); |
2435 | 2615 | ||
2436 | cgroup_migrate_finish(&preloaded_csets); | 2616 | cgroup_migrate_finish(&preloaded_csets); |
2437 | return ret; | 2617 | return ret; |
@@ -2459,15 +2639,15 @@ static int cgroup_procs_write_permission(struct task_struct *task, | |||
2459 | struct cgroup *cgrp; | 2639 | struct cgroup *cgrp; |
2460 | struct inode *inode; | 2640 | struct inode *inode; |
2461 | 2641 | ||
2462 | down_read(&css_set_rwsem); | 2642 | spin_lock_bh(&css_set_lock); |
2463 | cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); | 2643 | cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); |
2464 | up_read(&css_set_rwsem); | 2644 | spin_unlock_bh(&css_set_lock); |
2465 | 2645 | ||
2466 | while (!cgroup_is_descendant(dst_cgrp, cgrp)) | 2646 | while (!cgroup_is_descendant(dst_cgrp, cgrp)) |
2467 | cgrp = cgroup_parent(cgrp); | 2647 | cgrp = cgroup_parent(cgrp); |
2468 | 2648 | ||
2469 | ret = -ENOMEM; | 2649 | ret = -ENOMEM; |
2470 | inode = kernfs_get_inode(sb, cgrp->procs_kn); | 2650 | inode = kernfs_get_inode(sb, cgrp->procs_file.kn); |
2471 | if (inode) { | 2651 | if (inode) { |
2472 | ret = inode_permission(inode, MAY_WRITE); | 2652 | ret = inode_permission(inode, MAY_WRITE); |
2473 | iput(inode); | 2653 | iput(inode); |
@@ -2498,14 +2678,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, | |||
2498 | if (!cgrp) | 2678 | if (!cgrp) |
2499 | return -ENODEV; | 2679 | return -ENODEV; |
2500 | 2680 | ||
2501 | retry_find_task: | 2681 | percpu_down_write(&cgroup_threadgroup_rwsem); |
2502 | rcu_read_lock(); | 2682 | rcu_read_lock(); |
2503 | if (pid) { | 2683 | if (pid) { |
2504 | tsk = find_task_by_vpid(pid); | 2684 | tsk = find_task_by_vpid(pid); |
2505 | if (!tsk) { | 2685 | if (!tsk) { |
2506 | rcu_read_unlock(); | ||
2507 | ret = -ESRCH; | 2686 | ret = -ESRCH; |
2508 | goto out_unlock_cgroup; | 2687 | goto out_unlock_rcu; |
2509 | } | 2688 | } |
2510 | } else { | 2689 | } else { |
2511 | tsk = current; | 2690 | tsk = current; |
@@ -2521,37 +2700,23 @@ retry_find_task: | |||
2521 | */ | 2700 | */ |
2522 | if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { | 2701 | if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { |
2523 | ret = -EINVAL; | 2702 | ret = -EINVAL; |
2524 | rcu_read_unlock(); | 2703 | goto out_unlock_rcu; |
2525 | goto out_unlock_cgroup; | ||
2526 | } | 2704 | } |
2527 | 2705 | ||
2528 | get_task_struct(tsk); | 2706 | get_task_struct(tsk); |
2529 | rcu_read_unlock(); | 2707 | rcu_read_unlock(); |
2530 | 2708 | ||
2531 | threadgroup_lock(tsk); | ||
2532 | if (threadgroup) { | ||
2533 | if (!thread_group_leader(tsk)) { | ||
2534 | /* | ||
2535 | * a race with de_thread from another thread's exec() | ||
2536 | * may strip us of our leadership, if this happens, | ||
2537 | * there is no choice but to throw this task away and | ||
2538 | * try again; this is | ||
2539 | * "double-double-toil-and-trouble-check locking". | ||
2540 | */ | ||
2541 | threadgroup_unlock(tsk); | ||
2542 | put_task_struct(tsk); | ||
2543 | goto retry_find_task; | ||
2544 | } | ||
2545 | } | ||
2546 | |||
2547 | ret = cgroup_procs_write_permission(tsk, cgrp, of); | 2709 | ret = cgroup_procs_write_permission(tsk, cgrp, of); |
2548 | if (!ret) | 2710 | if (!ret) |
2549 | ret = cgroup_attach_task(cgrp, tsk, threadgroup); | 2711 | ret = cgroup_attach_task(cgrp, tsk, threadgroup); |
2550 | 2712 | ||
2551 | threadgroup_unlock(tsk); | ||
2552 | |||
2553 | put_task_struct(tsk); | 2713 | put_task_struct(tsk); |
2554 | out_unlock_cgroup: | 2714 | goto out_unlock_threadgroup; |
2715 | |||
2716 | out_unlock_rcu: | ||
2717 | rcu_read_unlock(); | ||
2718 | out_unlock_threadgroup: | ||
2719 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
2555 | cgroup_kn_unlock(of->kn); | 2720 | cgroup_kn_unlock(of->kn); |
2556 | return ret ?: nbytes; | 2721 | return ret ?: nbytes; |
2557 | } | 2722 | } |
@@ -2573,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
2573 | if (root == &cgrp_dfl_root) | 2738 | if (root == &cgrp_dfl_root) |
2574 | continue; | 2739 | continue; |
2575 | 2740 | ||
2576 | down_read(&css_set_rwsem); | 2741 | spin_lock_bh(&css_set_lock); |
2577 | from_cgrp = task_cgroup_from_root(from, root); | 2742 | from_cgrp = task_cgroup_from_root(from, root); |
2578 | up_read(&css_set_rwsem); | 2743 | spin_unlock_bh(&css_set_lock); |
2579 | 2744 | ||
2580 | retval = cgroup_attach_task(from_cgrp, tsk, false); | 2745 | retval = cgroup_attach_task(from_cgrp, tsk, false); |
2581 | if (retval) | 2746 | if (retval) |
@@ -2690,14 +2855,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) | |||
2690 | static int cgroup_update_dfl_csses(struct cgroup *cgrp) | 2855 | static int cgroup_update_dfl_csses(struct cgroup *cgrp) |
2691 | { | 2856 | { |
2692 | LIST_HEAD(preloaded_csets); | 2857 | LIST_HEAD(preloaded_csets); |
2858 | struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); | ||
2693 | struct cgroup_subsys_state *css; | 2859 | struct cgroup_subsys_state *css; |
2694 | struct css_set *src_cset; | 2860 | struct css_set *src_cset; |
2695 | int ret; | 2861 | int ret; |
2696 | 2862 | ||
2697 | lockdep_assert_held(&cgroup_mutex); | 2863 | lockdep_assert_held(&cgroup_mutex); |
2698 | 2864 | ||
2865 | percpu_down_write(&cgroup_threadgroup_rwsem); | ||
2866 | |||
2699 | /* look up all csses currently attached to @cgrp's subtree */ | 2867 | /* look up all csses currently attached to @cgrp's subtree */ |
2700 | down_read(&css_set_rwsem); | 2868 | spin_lock_bh(&css_set_lock); |
2701 | css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { | 2869 | css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { |
2702 | struct cgrp_cset_link *link; | 2870 | struct cgrp_cset_link *link; |
2703 | 2871 | ||
@@ -2709,68 +2877,31 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) | |||
2709 | cgroup_migrate_add_src(link->cset, cgrp, | 2877 | cgroup_migrate_add_src(link->cset, cgrp, |
2710 | &preloaded_csets); | 2878 | &preloaded_csets); |
2711 | } | 2879 | } |
2712 | up_read(&css_set_rwsem); | 2880 | spin_unlock_bh(&css_set_lock); |
2713 | 2881 | ||
2714 | /* NULL dst indicates self on default hierarchy */ | 2882 | /* NULL dst indicates self on default hierarchy */ |
2715 | ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); | 2883 | ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); |
2716 | if (ret) | 2884 | if (ret) |
2717 | goto out_finish; | 2885 | goto out_finish; |
2718 | 2886 | ||
2887 | spin_lock_bh(&css_set_lock); | ||
2719 | list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { | 2888 | list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { |
2720 | struct task_struct *last_task = NULL, *task; | 2889 | struct task_struct *task, *ntask; |
2721 | 2890 | ||
2722 | /* src_csets precede dst_csets, break on the first dst_cset */ | 2891 | /* src_csets precede dst_csets, break on the first dst_cset */ |
2723 | if (!src_cset->mg_src_cgrp) | 2892 | if (!src_cset->mg_src_cgrp) |
2724 | break; | 2893 | break; |
2725 | 2894 | ||
2726 | /* | 2895 | /* all tasks in src_csets need to be migrated */ |
2727 | * All tasks in src_cset need to be migrated to the | 2896 | list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) |
2728 | * matching dst_cset. Empty it process by process. We | 2897 | cgroup_taskset_add(task, &tset); |
2729 | * walk tasks but migrate processes. The leader might even | ||
2730 | * belong to a different cset but such src_cset would also | ||
2731 | * be among the target src_csets because the default | ||
2732 | * hierarchy enforces per-process membership. | ||
2733 | */ | ||
2734 | while (true) { | ||
2735 | down_read(&css_set_rwsem); | ||
2736 | task = list_first_entry_or_null(&src_cset->tasks, | ||
2737 | struct task_struct, cg_list); | ||
2738 | if (task) { | ||
2739 | task = task->group_leader; | ||
2740 | WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); | ||
2741 | get_task_struct(task); | ||
2742 | } | ||
2743 | up_read(&css_set_rwsem); | ||
2744 | |||
2745 | if (!task) | ||
2746 | break; | ||
2747 | |||
2748 | /* guard against possible infinite loop */ | ||
2749 | if (WARN(last_task == task, | ||
2750 | "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) | ||
2751 | goto out_finish; | ||
2752 | last_task = task; | ||
2753 | |||
2754 | threadgroup_lock(task); | ||
2755 | /* raced against de_thread() from another thread? */ | ||
2756 | if (!thread_group_leader(task)) { | ||
2757 | threadgroup_unlock(task); | ||
2758 | put_task_struct(task); | ||
2759 | continue; | ||
2760 | } | ||
2761 | |||
2762 | ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); | ||
2763 | |||
2764 | threadgroup_unlock(task); | ||
2765 | put_task_struct(task); | ||
2766 | |||
2767 | if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) | ||
2768 | goto out_finish; | ||
2769 | } | ||
2770 | } | 2898 | } |
2899 | spin_unlock_bh(&css_set_lock); | ||
2771 | 2900 | ||
2901 | ret = cgroup_taskset_migrate(&tset, cgrp); | ||
2772 | out_finish: | 2902 | out_finish: |
2773 | cgroup_migrate_finish(&preloaded_csets); | 2903 | cgroup_migrate_finish(&preloaded_csets); |
2904 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
2774 | return ret; | 2905 | return ret; |
2775 | } | 2906 | } |
2776 | 2907 | ||
@@ -2797,7 +2928,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2797 | if (tok[0] == '\0') | 2928 | if (tok[0] == '\0') |
2798 | continue; | 2929 | continue; |
2799 | for_each_subsys_which(ss, ssid, &tmp_ss_mask) { | 2930 | for_each_subsys_which(ss, ssid, &tmp_ss_mask) { |
2800 | if (ss->disabled || strcmp(tok + 1, ss->name)) | 2931 | if (!cgroup_ssid_enabled(ssid) || |
2932 | strcmp(tok + 1, ss->name)) | ||
2801 | continue; | 2933 | continue; |
2802 | 2934 | ||
2803 | if (*tok == '+') { | 2935 | if (*tok == '+') { |
@@ -2921,7 +3053,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2921 | ret = create_css(child, ss, | 3053 | ret = create_css(child, ss, |
2922 | cgrp->subtree_control & (1 << ssid)); | 3054 | cgrp->subtree_control & (1 << ssid)); |
2923 | else | 3055 | else |
2924 | ret = cgroup_populate_dir(child, 1 << ssid); | 3056 | ret = css_populate_dir(cgroup_css(child, ss), |
3057 | NULL); | ||
2925 | if (ret) | 3058 | if (ret) |
2926 | goto err_undo_css; | 3059 | goto err_undo_css; |
2927 | } | 3060 | } |
@@ -2954,7 +3087,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2954 | if (css_disable & (1 << ssid)) { | 3087 | if (css_disable & (1 << ssid)) { |
2955 | kill_css(css); | 3088 | kill_css(css); |
2956 | } else { | 3089 | } else { |
2957 | cgroup_clear_dir(child, 1 << ssid); | 3090 | css_clear_dir(css, NULL); |
2958 | if (ss->css_reset) | 3091 | if (ss->css_reset) |
2959 | ss->css_reset(css); | 3092 | ss->css_reset(css); |
2960 | } | 3093 | } |
@@ -3002,15 +3135,16 @@ err_undo_css: | |||
3002 | if (css_enable & (1 << ssid)) | 3135 | if (css_enable & (1 << ssid)) |
3003 | kill_css(css); | 3136 | kill_css(css); |
3004 | else | 3137 | else |
3005 | cgroup_clear_dir(child, 1 << ssid); | 3138 | css_clear_dir(css, NULL); |
3006 | } | 3139 | } |
3007 | } | 3140 | } |
3008 | goto out_unlock; | 3141 | goto out_unlock; |
3009 | } | 3142 | } |
3010 | 3143 | ||
3011 | static int cgroup_populated_show(struct seq_file *seq, void *v) | 3144 | static int cgroup_events_show(struct seq_file *seq, void *v) |
3012 | { | 3145 | { |
3013 | seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); | 3146 | seq_printf(seq, "populated %d\n", |
3147 | cgroup_is_populated(seq_css(seq)->cgroup)); | ||
3014 | return 0; | 3148 | return 0; |
3015 | } | 3149 | } |
3016 | 3150 | ||
@@ -3153,7 +3287,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) | |||
3153 | return kernfs_setattr(kn, &iattr); | 3287 | return kernfs_setattr(kn, &iattr); |
3154 | } | 3288 | } |
3155 | 3289 | ||
3156 | static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) | 3290 | static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, |
3291 | struct cftype *cft) | ||
3157 | { | 3292 | { |
3158 | char name[CGROUP_FILE_NAME_MAX]; | 3293 | char name[CGROUP_FILE_NAME_MAX]; |
3159 | struct kernfs_node *kn; | 3294 | struct kernfs_node *kn; |
@@ -3175,33 +3310,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) | |||
3175 | return ret; | 3310 | return ret; |
3176 | } | 3311 | } |
3177 | 3312 | ||
3178 | if (cft->write == cgroup_procs_write) | 3313 | if (cft->file_offset) { |
3179 | cgrp->procs_kn = kn; | 3314 | struct cgroup_file *cfile = (void *)css + cft->file_offset; |
3180 | else if (cft->seq_show == cgroup_populated_show) | 3315 | |
3181 | cgrp->populated_kn = kn; | 3316 | kernfs_get(kn); |
3317 | cfile->kn = kn; | ||
3318 | list_add(&cfile->node, &css->files); | ||
3319 | } | ||
3320 | |||
3182 | return 0; | 3321 | return 0; |
3183 | } | 3322 | } |
3184 | 3323 | ||
3185 | /** | 3324 | /** |
3186 | * cgroup_addrm_files - add or remove files to a cgroup directory | 3325 | * cgroup_addrm_files - add or remove files to a cgroup directory |
3187 | * @cgrp: the target cgroup | 3326 | * @css: the target css |
3327 | * @cgrp: the target cgroup (usually css->cgroup) | ||
3188 | * @cfts: array of cftypes to be added | 3328 | * @cfts: array of cftypes to be added |
3189 | * @is_add: whether to add or remove | 3329 | * @is_add: whether to add or remove |
3190 | * | 3330 | * |
3191 | * Depending on @is_add, add or remove files defined by @cfts on @cgrp. | 3331 | * Depending on @is_add, add or remove files defined by @cfts on @cgrp. |
3192 | * For removals, this function never fails. If addition fails, this | 3332 | * For removals, this function never fails. |
3193 | * function doesn't remove files already added. The caller is responsible | ||
3194 | * for cleaning up. | ||
3195 | */ | 3333 | */ |
3196 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 3334 | static int cgroup_addrm_files(struct cgroup_subsys_state *css, |
3335 | struct cgroup *cgrp, struct cftype cfts[], | ||
3197 | bool is_add) | 3336 | bool is_add) |
3198 | { | 3337 | { |
3199 | struct cftype *cft; | 3338 | struct cftype *cft, *cft_end = NULL; |
3200 | int ret; | 3339 | int ret; |
3201 | 3340 | ||
3202 | lockdep_assert_held(&cgroup_mutex); | 3341 | lockdep_assert_held(&cgroup_mutex); |
3203 | 3342 | ||
3204 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 3343 | restart: |
3344 | for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { | ||
3205 | /* does cft->flags tell us to skip this file on @cgrp? */ | 3345 | /* does cft->flags tell us to skip this file on @cgrp? */ |
3206 | if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) | 3346 | if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) |
3207 | continue; | 3347 | continue; |
@@ -3213,11 +3353,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | |||
3213 | continue; | 3353 | continue; |
3214 | 3354 | ||
3215 | if (is_add) { | 3355 | if (is_add) { |
3216 | ret = cgroup_add_file(cgrp, cft); | 3356 | ret = cgroup_add_file(css, cgrp, cft); |
3217 | if (ret) { | 3357 | if (ret) { |
3218 | pr_warn("%s: failed to add %s, err=%d\n", | 3358 | pr_warn("%s: failed to add %s, err=%d\n", |
3219 | __func__, cft->name, ret); | 3359 | __func__, cft->name, ret); |
3220 | return ret; | 3360 | cft_end = cft; |
3361 | is_add = false; | ||
3362 | goto restart; | ||
3221 | } | 3363 | } |
3222 | } else { | 3364 | } else { |
3223 | cgroup_rm_file(cgrp, cft); | 3365 | cgroup_rm_file(cgrp, cft); |
@@ -3243,7 +3385,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) | |||
3243 | if (cgroup_is_dead(cgrp)) | 3385 | if (cgroup_is_dead(cgrp)) |
3244 | continue; | 3386 | continue; |
3245 | 3387 | ||
3246 | ret = cgroup_addrm_files(cgrp, cfts, is_add); | 3388 | ret = cgroup_addrm_files(css, cgrp, cfts, is_add); |
3247 | if (ret) | 3389 | if (ret) |
3248 | break; | 3390 | break; |
3249 | } | 3391 | } |
@@ -3355,7 +3497,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
3355 | { | 3497 | { |
3356 | int ret; | 3498 | int ret; |
3357 | 3499 | ||
3358 | if (ss->disabled) | 3500 | if (!cgroup_ssid_enabled(ss->id)) |
3359 | return 0; | 3501 | return 0; |
3360 | 3502 | ||
3361 | if (!cfts || cfts[0].name[0] == '\0') | 3503 | if (!cfts || cfts[0].name[0] == '\0') |
@@ -3405,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
3405 | { | 3547 | { |
3406 | struct cftype *cft; | 3548 | struct cftype *cft; |
3407 | 3549 | ||
3408 | /* | 3550 | for (cft = cfts; cft && cft->name[0] != '\0'; cft++) |
3409 | * If legacy_flies_on_dfl, we want to show the legacy files on the | 3551 | cft->flags |= __CFTYPE_NOT_ON_DFL; |
3410 | * dfl hierarchy but iff the target subsystem hasn't been updated | ||
3411 | * for the dfl hierarchy yet. | ||
3412 | */ | ||
3413 | if (!cgroup_legacy_files_on_dfl || | ||
3414 | ss->dfl_cftypes != ss->legacy_cftypes) { | ||
3415 | for (cft = cfts; cft && cft->name[0] != '\0'; cft++) | ||
3416 | cft->flags |= __CFTYPE_NOT_ON_DFL; | ||
3417 | } | ||
3418 | |||
3419 | return cgroup_add_cftypes(ss, cfts); | 3552 | return cgroup_add_cftypes(ss, cfts); |
3420 | } | 3553 | } |
3421 | 3554 | ||
@@ -3430,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) | |||
3430 | int count = 0; | 3563 | int count = 0; |
3431 | struct cgrp_cset_link *link; | 3564 | struct cgrp_cset_link *link; |
3432 | 3565 | ||
3433 | down_read(&css_set_rwsem); | 3566 | spin_lock_bh(&css_set_lock); |
3434 | list_for_each_entry(link, &cgrp->cset_links, cset_link) | 3567 | list_for_each_entry(link, &cgrp->cset_links, cset_link) |
3435 | count += atomic_read(&link->cset->refcount); | 3568 | count += atomic_read(&link->cset->refcount); |
3436 | up_read(&css_set_rwsem); | 3569 | spin_unlock_bh(&css_set_lock); |
3437 | return count; | 3570 | return count; |
3438 | } | 3571 | } |
3439 | 3572 | ||
@@ -3665,22 +3798,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css) | |||
3665 | } | 3798 | } |
3666 | 3799 | ||
3667 | /** | 3800 | /** |
3668 | * css_advance_task_iter - advance a task itererator to the next css_set | 3801 | * css_task_iter_advance_css_set - advance a task itererator to the next css_set |
3669 | * @it: the iterator to advance | 3802 | * @it: the iterator to advance |
3670 | * | 3803 | * |
3671 | * Advance @it to the next css_set to walk. | 3804 | * Advance @it to the next css_set to walk. |
3672 | */ | 3805 | */ |
3673 | static void css_advance_task_iter(struct css_task_iter *it) | 3806 | static void css_task_iter_advance_css_set(struct css_task_iter *it) |
3674 | { | 3807 | { |
3675 | struct list_head *l = it->cset_pos; | 3808 | struct list_head *l = it->cset_pos; |
3676 | struct cgrp_cset_link *link; | 3809 | struct cgrp_cset_link *link; |
3677 | struct css_set *cset; | 3810 | struct css_set *cset; |
3678 | 3811 | ||
3812 | lockdep_assert_held(&css_set_lock); | ||
3813 | |||
3679 | /* Advance to the next non-empty css_set */ | 3814 | /* Advance to the next non-empty css_set */ |
3680 | do { | 3815 | do { |
3681 | l = l->next; | 3816 | l = l->next; |
3682 | if (l == it->cset_head) { | 3817 | if (l == it->cset_head) { |
3683 | it->cset_pos = NULL; | 3818 | it->cset_pos = NULL; |
3819 | it->task_pos = NULL; | ||
3684 | return; | 3820 | return; |
3685 | } | 3821 | } |
3686 | 3822 | ||
@@ -3691,7 +3827,7 @@ static void css_advance_task_iter(struct css_task_iter *it) | |||
3691 | link = list_entry(l, struct cgrp_cset_link, cset_link); | 3827 | link = list_entry(l, struct cgrp_cset_link, cset_link); |
3692 | cset = link->cset; | 3828 | cset = link->cset; |
3693 | } | 3829 | } |
3694 | } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); | 3830 | } while (!css_set_populated(cset)); |
3695 | 3831 | ||
3696 | it->cset_pos = l; | 3832 | it->cset_pos = l; |
3697 | 3833 | ||
@@ -3702,6 +3838,52 @@ static void css_advance_task_iter(struct css_task_iter *it) | |||
3702 | 3838 | ||
3703 | it->tasks_head = &cset->tasks; | 3839 | it->tasks_head = &cset->tasks; |
3704 | it->mg_tasks_head = &cset->mg_tasks; | 3840 | it->mg_tasks_head = &cset->mg_tasks; |
3841 | |||
3842 | /* | ||
3843 | * We don't keep css_sets locked across iteration steps and thus | ||
3844 | * need to take steps to ensure that iteration can be resumed after | ||
3845 | * the lock is re-acquired. Iteration is performed at two levels - | ||
3846 | * css_sets and tasks in them. | ||
3847 | * | ||
3848 | * Once created, a css_set never leaves its cgroup lists, so a | ||
3849 | * pinned css_set is guaranteed to stay put and we can resume | ||
3850 | * iteration afterwards. | ||
3851 | * | ||
3852 | * Tasks may leave @cset across iteration steps. This is resolved | ||
3853 | * by registering each iterator with the css_set currently being | ||
3854 | * walked and making css_set_move_task() advance iterators whose | ||
3855 | * next task is leaving. | ||
3856 | */ | ||
3857 | if (it->cur_cset) { | ||
3858 | list_del(&it->iters_node); | ||
3859 | put_css_set_locked(it->cur_cset); | ||
3860 | } | ||
3861 | get_css_set(cset); | ||
3862 | it->cur_cset = cset; | ||
3863 | list_add(&it->iters_node, &cset->task_iters); | ||
3864 | } | ||
3865 | |||
3866 | static void css_task_iter_advance(struct css_task_iter *it) | ||
3867 | { | ||
3868 | struct list_head *l = it->task_pos; | ||
3869 | |||
3870 | lockdep_assert_held(&css_set_lock); | ||
3871 | WARN_ON_ONCE(!l); | ||
3872 | |||
3873 | /* | ||
3874 | * Advance iterator to find next entry. cset->tasks is consumed | ||
3875 | * first and then ->mg_tasks. After ->mg_tasks, we move onto the | ||
3876 | * next cset. | ||
3877 | */ | ||
3878 | l = l->next; | ||
3879 | |||
3880 | if (l == it->tasks_head) | ||
3881 | l = it->mg_tasks_head->next; | ||
3882 | |||
3883 | if (l == it->mg_tasks_head) | ||
3884 | css_task_iter_advance_css_set(it); | ||
3885 | else | ||
3886 | it->task_pos = l; | ||
3705 | } | 3887 | } |
3706 | 3888 | ||
3707 | /** | 3889 | /** |
@@ -3713,19 +3895,16 @@ static void css_advance_task_iter(struct css_task_iter *it) | |||
3713 | * css_task_iter_next() to walk through the tasks until the function | 3895 | * css_task_iter_next() to walk through the tasks until the function |
3714 | * returns NULL. On completion of iteration, css_task_iter_end() must be | 3896 | * returns NULL. On completion of iteration, css_task_iter_end() must be |
3715 | * called. | 3897 | * called. |
3716 | * | ||
3717 | * Note that this function acquires a lock which is released when the | ||
3718 | * iteration finishes. The caller can't sleep while iteration is in | ||
3719 | * progress. | ||
3720 | */ | 3898 | */ |
3721 | void css_task_iter_start(struct cgroup_subsys_state *css, | 3899 | void css_task_iter_start(struct cgroup_subsys_state *css, |
3722 | struct css_task_iter *it) | 3900 | struct css_task_iter *it) |
3723 | __acquires(css_set_rwsem) | ||
3724 | { | 3901 | { |
3725 | /* no one should try to iterate before mounting cgroups */ | 3902 | /* no one should try to iterate before mounting cgroups */ |
3726 | WARN_ON_ONCE(!use_task_css_set_links); | 3903 | WARN_ON_ONCE(!use_task_css_set_links); |
3727 | 3904 | ||
3728 | down_read(&css_set_rwsem); | 3905 | memset(it, 0, sizeof(*it)); |
3906 | |||
3907 | spin_lock_bh(&css_set_lock); | ||
3729 | 3908 | ||
3730 | it->ss = css->ss; | 3909 | it->ss = css->ss; |
3731 | 3910 | ||
@@ -3736,7 +3915,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
3736 | 3915 | ||
3737 | it->cset_head = it->cset_pos; | 3916 | it->cset_head = it->cset_pos; |
3738 | 3917 | ||
3739 | css_advance_task_iter(it); | 3918 | css_task_iter_advance_css_set(it); |
3919 | |||
3920 | spin_unlock_bh(&css_set_lock); | ||
3740 | } | 3921 | } |
3741 | 3922 | ||
3742 | /** | 3923 | /** |
@@ -3749,30 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
3749 | */ | 3930 | */ |
3750 | struct task_struct *css_task_iter_next(struct css_task_iter *it) | 3931 | struct task_struct *css_task_iter_next(struct css_task_iter *it) |
3751 | { | 3932 | { |
3752 | struct task_struct *res; | 3933 | if (it->cur_task) { |
3753 | struct list_head *l = it->task_pos; | 3934 | put_task_struct(it->cur_task); |
3935 | it->cur_task = NULL; | ||
3936 | } | ||
3754 | 3937 | ||
3755 | /* If the iterator cg is NULL, we have no tasks */ | 3938 | spin_lock_bh(&css_set_lock); |
3756 | if (!it->cset_pos) | ||
3757 | return NULL; | ||
3758 | res = list_entry(l, struct task_struct, cg_list); | ||
3759 | 3939 | ||
3760 | /* | 3940 | if (it->task_pos) { |
3761 | * Advance iterator to find next entry. cset->tasks is consumed | 3941 | it->cur_task = list_entry(it->task_pos, struct task_struct, |
3762 | * first and then ->mg_tasks. After ->mg_tasks, we move onto the | 3942 | cg_list); |
3763 | * next cset. | 3943 | get_task_struct(it->cur_task); |
3764 | */ | 3944 | css_task_iter_advance(it); |
3765 | l = l->next; | 3945 | } |
3766 | 3946 | ||
3767 | if (l == it->tasks_head) | 3947 | spin_unlock_bh(&css_set_lock); |
3768 | l = it->mg_tasks_head->next; | ||
3769 | 3948 | ||
3770 | if (l == it->mg_tasks_head) | 3949 | return it->cur_task; |
3771 | css_advance_task_iter(it); | ||
3772 | else | ||
3773 | it->task_pos = l; | ||
3774 | |||
3775 | return res; | ||
3776 | } | 3950 | } |
3777 | 3951 | ||
3778 | /** | 3952 | /** |
@@ -3782,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) | |||
3782 | * Finish task iteration started by css_task_iter_start(). | 3956 | * Finish task iteration started by css_task_iter_start(). |
3783 | */ | 3957 | */ |
3784 | void css_task_iter_end(struct css_task_iter *it) | 3958 | void css_task_iter_end(struct css_task_iter *it) |
3785 | __releases(css_set_rwsem) | ||
3786 | { | 3959 | { |
3787 | up_read(&css_set_rwsem); | 3960 | if (it->cur_cset) { |
3961 | spin_lock_bh(&css_set_lock); | ||
3962 | list_del(&it->iters_node); | ||
3963 | put_css_set_locked(it->cur_cset); | ||
3964 | spin_unlock_bh(&css_set_lock); | ||
3965 | } | ||
3966 | |||
3967 | if (it->cur_task) | ||
3968 | put_task_struct(it->cur_task); | ||
3788 | } | 3969 | } |
3789 | 3970 | ||
3790 | /** | 3971 | /** |
@@ -3809,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
3809 | mutex_lock(&cgroup_mutex); | 3990 | mutex_lock(&cgroup_mutex); |
3810 | 3991 | ||
3811 | /* all tasks in @from are being moved, all csets are source */ | 3992 | /* all tasks in @from are being moved, all csets are source */ |
3812 | down_read(&css_set_rwsem); | 3993 | spin_lock_bh(&css_set_lock); |
3813 | list_for_each_entry(link, &from->cset_links, cset_link) | 3994 | list_for_each_entry(link, &from->cset_links, cset_link) |
3814 | cgroup_migrate_add_src(link->cset, to, &preloaded_csets); | 3995 | cgroup_migrate_add_src(link->cset, to, &preloaded_csets); |
3815 | up_read(&css_set_rwsem); | 3996 | spin_unlock_bh(&css_set_lock); |
3816 | 3997 | ||
3817 | ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); | 3998 | ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); |
3818 | if (ret) | 3999 | if (ret) |
@@ -3830,7 +4011,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
3830 | css_task_iter_end(&it); | 4011 | css_task_iter_end(&it); |
3831 | 4012 | ||
3832 | if (task) { | 4013 | if (task) { |
3833 | ret = cgroup_migrate(to, task, false); | 4014 | ret = cgroup_migrate(task, false, to); |
3834 | put_task_struct(task); | 4015 | put_task_struct(task); |
3835 | } | 4016 | } |
3836 | } while (task && !ret); | 4017 | } while (task && !ret); |
@@ -4327,13 +4508,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, | |||
4327 | static struct cftype cgroup_dfl_base_files[] = { | 4508 | static struct cftype cgroup_dfl_base_files[] = { |
4328 | { | 4509 | { |
4329 | .name = "cgroup.procs", | 4510 | .name = "cgroup.procs", |
4511 | .file_offset = offsetof(struct cgroup, procs_file), | ||
4330 | .seq_start = cgroup_pidlist_start, | 4512 | .seq_start = cgroup_pidlist_start, |
4331 | .seq_next = cgroup_pidlist_next, | 4513 | .seq_next = cgroup_pidlist_next, |
4332 | .seq_stop = cgroup_pidlist_stop, | 4514 | .seq_stop = cgroup_pidlist_stop, |
4333 | .seq_show = cgroup_pidlist_show, | 4515 | .seq_show = cgroup_pidlist_show, |
4334 | .private = CGROUP_FILE_PROCS, | 4516 | .private = CGROUP_FILE_PROCS, |
4335 | .write = cgroup_procs_write, | 4517 | .write = cgroup_procs_write, |
4336 | .mode = S_IRUGO | S_IWUSR, | ||
4337 | }, | 4518 | }, |
4338 | { | 4519 | { |
4339 | .name = "cgroup.controllers", | 4520 | .name = "cgroup.controllers", |
@@ -4351,9 +4532,10 @@ static struct cftype cgroup_dfl_base_files[] = { | |||
4351 | .write = cgroup_subtree_control_write, | 4532 | .write = cgroup_subtree_control_write, |
4352 | }, | 4533 | }, |
4353 | { | 4534 | { |
4354 | .name = "cgroup.populated", | 4535 | .name = "cgroup.events", |
4355 | .flags = CFTYPE_NOT_ON_ROOT, | 4536 | .flags = CFTYPE_NOT_ON_ROOT, |
4356 | .seq_show = cgroup_populated_show, | 4537 | .file_offset = offsetof(struct cgroup, events_file), |
4538 | .seq_show = cgroup_events_show, | ||
4357 | }, | 4539 | }, |
4358 | { } /* terminate */ | 4540 | { } /* terminate */ |
4359 | }; | 4541 | }; |
@@ -4368,7 +4550,6 @@ static struct cftype cgroup_legacy_base_files[] = { | |||
4368 | .seq_show = cgroup_pidlist_show, | 4550 | .seq_show = cgroup_pidlist_show, |
4369 | .private = CGROUP_FILE_PROCS, | 4551 | .private = CGROUP_FILE_PROCS, |
4370 | .write = cgroup_procs_write, | 4552 | .write = cgroup_procs_write, |
4371 | .mode = S_IRUGO | S_IWUSR, | ||
4372 | }, | 4553 | }, |
4373 | { | 4554 | { |
4374 | .name = "cgroup.clone_children", | 4555 | .name = "cgroup.clone_children", |
@@ -4388,7 +4569,6 @@ static struct cftype cgroup_legacy_base_files[] = { | |||
4388 | .seq_show = cgroup_pidlist_show, | 4569 | .seq_show = cgroup_pidlist_show, |
4389 | .private = CGROUP_FILE_TASKS, | 4570 | .private = CGROUP_FILE_TASKS, |
4390 | .write = cgroup_tasks_write, | 4571 | .write = cgroup_tasks_write, |
4391 | .mode = S_IRUGO | S_IWUSR, | ||
4392 | }, | 4572 | }, |
4393 | { | 4573 | { |
4394 | .name = "notify_on_release", | 4574 | .name = "notify_on_release", |
@@ -4405,37 +4585,6 @@ static struct cftype cgroup_legacy_base_files[] = { | |||
4405 | { } /* terminate */ | 4585 | { } /* terminate */ |
4406 | }; | 4586 | }; |
4407 | 4587 | ||
4408 | /** | ||
4409 | * cgroup_populate_dir - create subsys files in a cgroup directory | ||
4410 | * @cgrp: target cgroup | ||
4411 | * @subsys_mask: mask of the subsystem ids whose files should be added | ||
4412 | * | ||
4413 | * On failure, no file is added. | ||
4414 | */ | ||
4415 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) | ||
4416 | { | ||
4417 | struct cgroup_subsys *ss; | ||
4418 | int i, ret = 0; | ||
4419 | |||
4420 | /* process cftsets of each subsystem */ | ||
4421 | for_each_subsys(ss, i) { | ||
4422 | struct cftype *cfts; | ||
4423 | |||
4424 | if (!(subsys_mask & (1 << i))) | ||
4425 | continue; | ||
4426 | |||
4427 | list_for_each_entry(cfts, &ss->cfts, node) { | ||
4428 | ret = cgroup_addrm_files(cgrp, cfts, true); | ||
4429 | if (ret < 0) | ||
4430 | goto err; | ||
4431 | } | ||
4432 | } | ||
4433 | return 0; | ||
4434 | err: | ||
4435 | cgroup_clear_dir(cgrp, subsys_mask); | ||
4436 | return ret; | ||
4437 | } | ||
4438 | |||
4439 | /* | 4588 | /* |
4440 | * css destruction is four-stage process. | 4589 | * css destruction is four-stage process. |
4441 | * | 4590 | * |
@@ -4464,9 +4613,13 @@ static void css_free_work_fn(struct work_struct *work) | |||
4464 | container_of(work, struct cgroup_subsys_state, destroy_work); | 4613 | container_of(work, struct cgroup_subsys_state, destroy_work); |
4465 | struct cgroup_subsys *ss = css->ss; | 4614 | struct cgroup_subsys *ss = css->ss; |
4466 | struct cgroup *cgrp = css->cgroup; | 4615 | struct cgroup *cgrp = css->cgroup; |
4616 | struct cgroup_file *cfile; | ||
4467 | 4617 | ||
4468 | percpu_ref_exit(&css->refcnt); | 4618 | percpu_ref_exit(&css->refcnt); |
4469 | 4619 | ||
4620 | list_for_each_entry(cfile, &css->files, node) | ||
4621 | kernfs_put(cfile->kn); | ||
4622 | |||
4470 | if (ss) { | 4623 | if (ss) { |
4471 | /* css free path */ | 4624 | /* css free path */ |
4472 | int id = css->id; | 4625 | int id = css->id; |
@@ -4571,6 +4724,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, | |||
4571 | css->ss = ss; | 4724 | css->ss = ss; |
4572 | INIT_LIST_HEAD(&css->sibling); | 4725 | INIT_LIST_HEAD(&css->sibling); |
4573 | INIT_LIST_HEAD(&css->children); | 4726 | INIT_LIST_HEAD(&css->children); |
4727 | INIT_LIST_HEAD(&css->files); | ||
4574 | css->serial_nr = css_serial_nr_next++; | 4728 | css->serial_nr = css_serial_nr_next++; |
4575 | 4729 | ||
4576 | if (cgroup_parent(cgrp)) { | 4730 | if (cgroup_parent(cgrp)) { |
@@ -4653,7 +4807,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | |||
4653 | css->id = err; | 4807 | css->id = err; |
4654 | 4808 | ||
4655 | if (visible) { | 4809 | if (visible) { |
4656 | err = cgroup_populate_dir(cgrp, 1 << ss->id); | 4810 | err = css_populate_dir(css, NULL); |
4657 | if (err) | 4811 | if (err) |
4658 | goto err_free_id; | 4812 | goto err_free_id; |
4659 | } | 4813 | } |
@@ -4679,7 +4833,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | |||
4679 | 4833 | ||
4680 | err_list_del: | 4834 | err_list_del: |
4681 | list_del_rcu(&css->sibling); | 4835 | list_del_rcu(&css->sibling); |
4682 | cgroup_clear_dir(css->cgroup, 1 << css->ss->id); | 4836 | css_clear_dir(css, NULL); |
4683 | err_free_id: | 4837 | err_free_id: |
4684 | cgroup_idr_remove(&ss->css_idr, css->id); | 4838 | cgroup_idr_remove(&ss->css_idr, css->id); |
4685 | err_free_percpu_ref: | 4839 | err_free_percpu_ref: |
@@ -4696,7 +4850,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4696 | struct cgroup_root *root; | 4850 | struct cgroup_root *root; |
4697 | struct cgroup_subsys *ss; | 4851 | struct cgroup_subsys *ss; |
4698 | struct kernfs_node *kn; | 4852 | struct kernfs_node *kn; |
4699 | struct cftype *base_files; | ||
4700 | int ssid, ret; | 4853 | int ssid, ret; |
4701 | 4854 | ||
4702 | /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. | 4855 | /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. |
@@ -4772,12 +4925,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4772 | if (ret) | 4925 | if (ret) |
4773 | goto out_destroy; | 4926 | goto out_destroy; |
4774 | 4927 | ||
4775 | if (cgroup_on_dfl(cgrp)) | 4928 | ret = css_populate_dir(&cgrp->self, NULL); |
4776 | base_files = cgroup_dfl_base_files; | ||
4777 | else | ||
4778 | base_files = cgroup_legacy_base_files; | ||
4779 | |||
4780 | ret = cgroup_addrm_files(cgrp, base_files, true); | ||
4781 | if (ret) | 4929 | if (ret) |
4782 | goto out_destroy; | 4930 | goto out_destroy; |
4783 | 4931 | ||
@@ -4864,7 +5012,7 @@ static void kill_css(struct cgroup_subsys_state *css) | |||
4864 | * This must happen before css is disassociated with its cgroup. | 5012 | * This must happen before css is disassociated with its cgroup. |
4865 | * See seq_css() for details. | 5013 | * See seq_css() for details. |
4866 | */ | 5014 | */ |
4867 | cgroup_clear_dir(css->cgroup, 1 << css->ss->id); | 5015 | css_clear_dir(css, NULL); |
4868 | 5016 | ||
4869 | /* | 5017 | /* |
4870 | * Killing would put the base ref, but we need to keep it alive | 5018 | * Killing would put the base ref, but we need to keep it alive |
@@ -4913,19 +5061,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4913 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 5061 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4914 | { | 5062 | { |
4915 | struct cgroup_subsys_state *css; | 5063 | struct cgroup_subsys_state *css; |
4916 | bool empty; | ||
4917 | int ssid; | 5064 | int ssid; |
4918 | 5065 | ||
4919 | lockdep_assert_held(&cgroup_mutex); | 5066 | lockdep_assert_held(&cgroup_mutex); |
4920 | 5067 | ||
4921 | /* | 5068 | /* |
4922 | * css_set_rwsem synchronizes access to ->cset_links and prevents | 5069 | * Only migration can raise populated from zero and we're already |
4923 | * @cgrp from being removed while put_css_set() is in progress. | 5070 | * holding cgroup_mutex. |
4924 | */ | 5071 | */ |
4925 | down_read(&css_set_rwsem); | 5072 | if (cgroup_is_populated(cgrp)) |
4926 | empty = list_empty(&cgrp->cset_links); | ||
4927 | up_read(&css_set_rwsem); | ||
4928 | if (!empty) | ||
4929 | return -EBUSY; | 5073 | return -EBUSY; |
4930 | 5074 | ||
4931 | /* | 5075 | /* |
@@ -5023,6 +5167,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) | |||
5023 | 5167 | ||
5024 | have_fork_callback |= (bool)ss->fork << ss->id; | 5168 | have_fork_callback |= (bool)ss->fork << ss->id; |
5025 | have_exit_callback |= (bool)ss->exit << ss->id; | 5169 | have_exit_callback |= (bool)ss->exit << ss->id; |
5170 | have_free_callback |= (bool)ss->free << ss->id; | ||
5026 | have_canfork_callback |= (bool)ss->can_fork << ss->id; | 5171 | have_canfork_callback |= (bool)ss->can_fork << ss->id; |
5027 | 5172 | ||
5028 | /* At system boot, before all subsystems have been | 5173 | /* At system boot, before all subsystems have been |
@@ -5071,6 +5216,8 @@ int __init cgroup_init_early(void) | |||
5071 | return 0; | 5216 | return 0; |
5072 | } | 5217 | } |
5073 | 5218 | ||
5219 | static unsigned long cgroup_disable_mask __initdata; | ||
5220 | |||
5074 | /** | 5221 | /** |
5075 | * cgroup_init - cgroup initialization | 5222 | * cgroup_init - cgroup initialization |
5076 | * | 5223 | * |
@@ -5081,8 +5228,9 @@ int __init cgroup_init(void) | |||
5081 | { | 5228 | { |
5082 | struct cgroup_subsys *ss; | 5229 | struct cgroup_subsys *ss; |
5083 | unsigned long key; | 5230 | unsigned long key; |
5084 | int ssid, err; | 5231 | int ssid; |
5085 | 5232 | ||
5233 | BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); | ||
5086 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); | 5234 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); |
5087 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); | 5235 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); |
5088 | 5236 | ||
@@ -5116,14 +5264,15 @@ int __init cgroup_init(void) | |||
5116 | * disabled flag and cftype registration needs kmalloc, | 5264 | * disabled flag and cftype registration needs kmalloc, |
5117 | * both of which aren't available during early_init. | 5265 | * both of which aren't available during early_init. |
5118 | */ | 5266 | */ |
5119 | if (ss->disabled) | 5267 | if (cgroup_disable_mask & (1 << ssid)) { |
5268 | static_branch_disable(cgroup_subsys_enabled_key[ssid]); | ||
5269 | printk(KERN_INFO "Disabling %s control group subsystem\n", | ||
5270 | ss->name); | ||
5120 | continue; | 5271 | continue; |
5272 | } | ||
5121 | 5273 | ||
5122 | cgrp_dfl_root.subsys_mask |= 1 << ss->id; | 5274 | cgrp_dfl_root.subsys_mask |= 1 << ss->id; |
5123 | 5275 | ||
5124 | if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) | ||
5125 | ss->dfl_cftypes = ss->legacy_cftypes; | ||
5126 | |||
5127 | if (!ss->dfl_cftypes) | 5276 | if (!ss->dfl_cftypes) |
5128 | cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; | 5277 | cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; |
5129 | 5278 | ||
@@ -5138,17 +5287,10 @@ int __init cgroup_init(void) | |||
5138 | ss->bind(init_css_set.subsys[ssid]); | 5287 | ss->bind(init_css_set.subsys[ssid]); |
5139 | } | 5288 | } |
5140 | 5289 | ||
5141 | err = sysfs_create_mount_point(fs_kobj, "cgroup"); | 5290 | WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); |
5142 | if (err) | 5291 | WARN_ON(register_filesystem(&cgroup_fs_type)); |
5143 | return err; | 5292 | WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); |
5144 | 5293 | ||
5145 | err = register_filesystem(&cgroup_fs_type); | ||
5146 | if (err < 0) { | ||
5147 | sysfs_remove_mount_point(fs_kobj, "cgroup"); | ||
5148 | return err; | ||
5149 | } | ||
5150 | |||
5151 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); | ||
5152 | return 0; | 5294 | return 0; |
5153 | } | 5295 | } |
5154 | 5296 | ||
@@ -5195,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
5195 | goto out; | 5337 | goto out; |
5196 | 5338 | ||
5197 | mutex_lock(&cgroup_mutex); | 5339 | mutex_lock(&cgroup_mutex); |
5198 | down_read(&css_set_rwsem); | 5340 | spin_lock_bh(&css_set_lock); |
5199 | 5341 | ||
5200 | for_each_root(root) { | 5342 | for_each_root(root) { |
5201 | struct cgroup_subsys *ss; | 5343 | struct cgroup_subsys *ss; |
@@ -5215,19 +5357,39 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
5215 | seq_printf(m, "%sname=%s", count ? "," : "", | 5357 | seq_printf(m, "%sname=%s", count ? "," : "", |
5216 | root->name); | 5358 | root->name); |
5217 | seq_putc(m, ':'); | 5359 | seq_putc(m, ':'); |
5360 | |||
5218 | cgrp = task_cgroup_from_root(tsk, root); | 5361 | cgrp = task_cgroup_from_root(tsk, root); |
5219 | path = cgroup_path(cgrp, buf, PATH_MAX); | 5362 | |
5220 | if (!path) { | 5363 | /* |
5221 | retval = -ENAMETOOLONG; | 5364 | * On traditional hierarchies, all zombie tasks show up as |
5222 | goto out_unlock; | 5365 | * belonging to the root cgroup. On the default hierarchy, |
5366 | * while a zombie doesn't show up in "cgroup.procs" and | ||
5367 | * thus can't be migrated, its /proc/PID/cgroup keeps | ||
5368 | * reporting the cgroup it belonged to before exiting. If | ||
5369 | * the cgroup is removed before the zombie is reaped, | ||
5370 | * " (deleted)" is appended to the cgroup path. | ||
5371 | */ | ||
5372 | if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { | ||
5373 | path = cgroup_path(cgrp, buf, PATH_MAX); | ||
5374 | if (!path) { | ||
5375 | retval = -ENAMETOOLONG; | ||
5376 | goto out_unlock; | ||
5377 | } | ||
5378 | } else { | ||
5379 | path = "/"; | ||
5223 | } | 5380 | } |
5381 | |||
5224 | seq_puts(m, path); | 5382 | seq_puts(m, path); |
5225 | seq_putc(m, '\n'); | 5383 | |
5384 | if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) | ||
5385 | seq_puts(m, " (deleted)\n"); | ||
5386 | else | ||
5387 | seq_putc(m, '\n'); | ||
5226 | } | 5388 | } |
5227 | 5389 | ||
5228 | retval = 0; | 5390 | retval = 0; |
5229 | out_unlock: | 5391 | out_unlock: |
5230 | up_read(&css_set_rwsem); | 5392 | spin_unlock_bh(&css_set_lock); |
5231 | mutex_unlock(&cgroup_mutex); | 5393 | mutex_unlock(&cgroup_mutex); |
5232 | kfree(buf); | 5394 | kfree(buf); |
5233 | out: | 5395 | out: |
@@ -5251,7 +5413,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
5251 | for_each_subsys(ss, i) | 5413 | for_each_subsys(ss, i) |
5252 | seq_printf(m, "%s\t%d\t%d\t%d\n", | 5414 | seq_printf(m, "%s\t%d\t%d\t%d\n", |
5253 | ss->legacy_name, ss->root->hierarchy_id, | 5415 | ss->legacy_name, ss->root->hierarchy_id, |
5254 | atomic_read(&ss->root->nr_cgrps), !ss->disabled); | 5416 | atomic_read(&ss->root->nr_cgrps), |
5417 | cgroup_ssid_enabled(i)); | ||
5255 | 5418 | ||
5256 | mutex_unlock(&cgroup_mutex); | 5419 | mutex_unlock(&cgroup_mutex); |
5257 | return 0; | 5420 | return 0; |
@@ -5372,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child, | |||
5372 | * @child during its iteration. | 5535 | * @child during its iteration. |
5373 | * | 5536 | * |
5374 | * If we won the race, @child is associated with %current's | 5537 | * If we won the race, @child is associated with %current's |
5375 | * css_set. Grabbing css_set_rwsem guarantees both that the | 5538 | * css_set. Grabbing css_set_lock guarantees both that the |
5376 | * association is stable, and, on completion of the parent's | 5539 | * association is stable, and, on completion of the parent's |
5377 | * migration, @child is visible in the source of migration or | 5540 | * migration, @child is visible in the source of migration or |
5378 | * already in the destination cgroup. This guarantee is necessary | 5541 | * already in the destination cgroup. This guarantee is necessary |
@@ -5387,14 +5550,13 @@ void cgroup_post_fork(struct task_struct *child, | |||
5387 | if (use_task_css_set_links) { | 5550 | if (use_task_css_set_links) { |
5388 | struct css_set *cset; | 5551 | struct css_set *cset; |
5389 | 5552 | ||
5390 | down_write(&css_set_rwsem); | 5553 | spin_lock_bh(&css_set_lock); |
5391 | cset = task_css_set(current); | 5554 | cset = task_css_set(current); |
5392 | if (list_empty(&child->cg_list)) { | 5555 | if (list_empty(&child->cg_list)) { |
5393 | rcu_assign_pointer(child->cgroups, cset); | ||
5394 | list_add(&child->cg_list, &cset->tasks); | ||
5395 | get_css_set(cset); | 5556 | get_css_set(cset); |
5557 | css_set_move_task(child, NULL, cset, false); | ||
5396 | } | 5558 | } |
5397 | up_write(&css_set_rwsem); | 5559 | spin_unlock_bh(&css_set_lock); |
5398 | } | 5560 | } |
5399 | 5561 | ||
5400 | /* | 5562 | /* |
@@ -5429,39 +5591,42 @@ void cgroup_exit(struct task_struct *tsk) | |||
5429 | { | 5591 | { |
5430 | struct cgroup_subsys *ss; | 5592 | struct cgroup_subsys *ss; |
5431 | struct css_set *cset; | 5593 | struct css_set *cset; |
5432 | bool put_cset = false; | ||
5433 | int i; | 5594 | int i; |
5434 | 5595 | ||
5435 | /* | 5596 | /* |
5436 | * Unlink from @tsk from its css_set. As migration path can't race | 5597 | * Unlink from @tsk from its css_set. As migration path can't race |
5437 | * with us, we can check cg_list without grabbing css_set_rwsem. | 5598 | * with us, we can check css_set and cg_list without synchronization. |
5438 | */ | 5599 | */ |
5600 | cset = task_css_set(tsk); | ||
5601 | |||
5439 | if (!list_empty(&tsk->cg_list)) { | 5602 | if (!list_empty(&tsk->cg_list)) { |
5440 | down_write(&css_set_rwsem); | 5603 | spin_lock_bh(&css_set_lock); |
5441 | list_del_init(&tsk->cg_list); | 5604 | css_set_move_task(tsk, cset, NULL, false); |
5442 | up_write(&css_set_rwsem); | 5605 | spin_unlock_bh(&css_set_lock); |
5443 | put_cset = true; | 5606 | } else { |
5607 | get_css_set(cset); | ||
5444 | } | 5608 | } |
5445 | 5609 | ||
5446 | /* Reassign the task to the init_css_set. */ | ||
5447 | cset = task_css_set(tsk); | ||
5448 | RCU_INIT_POINTER(tsk->cgroups, &init_css_set); | ||
5449 | |||
5450 | /* see cgroup_post_fork() for details */ | 5610 | /* see cgroup_post_fork() for details */ |
5451 | for_each_subsys_which(ss, i, &have_exit_callback) { | 5611 | for_each_subsys_which(ss, i, &have_exit_callback) |
5452 | struct cgroup_subsys_state *old_css = cset->subsys[i]; | 5612 | ss->exit(tsk); |
5453 | struct cgroup_subsys_state *css = task_css(tsk, i); | 5613 | } |
5454 | 5614 | ||
5455 | ss->exit(css, old_css, tsk); | 5615 | void cgroup_free(struct task_struct *task) |
5456 | } | 5616 | { |
5617 | struct css_set *cset = task_css_set(task); | ||
5618 | struct cgroup_subsys *ss; | ||
5619 | int ssid; | ||
5457 | 5620 | ||
5458 | if (put_cset) | 5621 | for_each_subsys_which(ss, ssid, &have_free_callback) |
5459 | put_css_set(cset); | 5622 | ss->free(task); |
5623 | |||
5624 | put_css_set(cset); | ||
5460 | } | 5625 | } |
5461 | 5626 | ||
5462 | static void check_for_release(struct cgroup *cgrp) | 5627 | static void check_for_release(struct cgroup *cgrp) |
5463 | { | 5628 | { |
5464 | if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && | 5629 | if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && |
5465 | !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) | 5630 | !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) |
5466 | schedule_work(&cgrp->release_agent_work); | 5631 | schedule_work(&cgrp->release_agent_work); |
5467 | } | 5632 | } |
@@ -5540,25 +5705,13 @@ static int __init cgroup_disable(char *str) | |||
5540 | if (strcmp(token, ss->name) && | 5705 | if (strcmp(token, ss->name) && |
5541 | strcmp(token, ss->legacy_name)) | 5706 | strcmp(token, ss->legacy_name)) |
5542 | continue; | 5707 | continue; |
5543 | 5708 | cgroup_disable_mask |= 1 << i; | |
5544 | ss->disabled = 1; | ||
5545 | printk(KERN_INFO "Disabling %s control group subsystem\n", | ||
5546 | ss->name); | ||
5547 | break; | ||
5548 | } | 5709 | } |
5549 | } | 5710 | } |
5550 | return 1; | 5711 | return 1; |
5551 | } | 5712 | } |
5552 | __setup("cgroup_disable=", cgroup_disable); | 5713 | __setup("cgroup_disable=", cgroup_disable); |
5553 | 5714 | ||
5554 | static int __init cgroup_set_legacy_files_on_dfl(char *str) | ||
5555 | { | ||
5556 | printk("cgroup: using legacy files on the default hierarchy\n"); | ||
5557 | cgroup_legacy_files_on_dfl = true; | ||
5558 | return 0; | ||
5559 | } | ||
5560 | __setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); | ||
5561 | |||
5562 | /** | 5715 | /** |
5563 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry | 5716 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry |
5564 | * @dentry: directory dentry of interest | 5717 | * @dentry: directory dentry of interest |
@@ -5662,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | |||
5662 | if (!name_buf) | 5815 | if (!name_buf) |
5663 | return -ENOMEM; | 5816 | return -ENOMEM; |
5664 | 5817 | ||
5665 | down_read(&css_set_rwsem); | 5818 | spin_lock_bh(&css_set_lock); |
5666 | rcu_read_lock(); | 5819 | rcu_read_lock(); |
5667 | cset = rcu_dereference(current->cgroups); | 5820 | cset = rcu_dereference(current->cgroups); |
5668 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { | 5821 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { |
@@ -5673,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | |||
5673 | c->root->hierarchy_id, name_buf); | 5826 | c->root->hierarchy_id, name_buf); |
5674 | } | 5827 | } |
5675 | rcu_read_unlock(); | 5828 | rcu_read_unlock(); |
5676 | up_read(&css_set_rwsem); | 5829 | spin_unlock_bh(&css_set_lock); |
5677 | kfree(name_buf); | 5830 | kfree(name_buf); |
5678 | return 0; | 5831 | return 0; |
5679 | } | 5832 | } |
@@ -5684,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) | |||
5684 | struct cgroup_subsys_state *css = seq_css(seq); | 5837 | struct cgroup_subsys_state *css = seq_css(seq); |
5685 | struct cgrp_cset_link *link; | 5838 | struct cgrp_cset_link *link; |
5686 | 5839 | ||
5687 | down_read(&css_set_rwsem); | 5840 | spin_lock_bh(&css_set_lock); |
5688 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { | 5841 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { |
5689 | struct css_set *cset = link->cset; | 5842 | struct css_set *cset = link->cset; |
5690 | struct task_struct *task; | 5843 | struct task_struct *task; |
@@ -5707,13 +5860,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) | |||
5707 | overflow: | 5860 | overflow: |
5708 | seq_puts(seq, " ...\n"); | 5861 | seq_puts(seq, " ...\n"); |
5709 | } | 5862 | } |
5710 | up_read(&css_set_rwsem); | 5863 | spin_unlock_bh(&css_set_lock); |
5711 | return 0; | 5864 | return 0; |
5712 | } | 5865 | } |
5713 | 5866 | ||
5714 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) | 5867 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) |
5715 | { | 5868 | { |
5716 | return (!cgroup_has_tasks(css->cgroup) && | 5869 | return (!cgroup_is_populated(css->cgroup) && |
5717 | !css_has_online_children(&css->cgroup->self)); | 5870 | !css_has_online_children(&css->cgroup->self)); |
5718 | } | 5871 | } |
5719 | 5872 | ||