summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-09-03 21:25:03 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-09-03 21:25:03 -0400
commit32dad03d164206ea886885d0740284ba215b0970 (patch)
tree5fd89fe27295bfbe47dce5f274aa645099741a71 /kernel
parent357397a14117f0c2eeafcac06a1f8412a02aa6af (diff)
parentd1625964da51bda61306ad3ec45307a799c21f08 (diff)
Merge branch 'for-3.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: "A lot of activities on the cgroup front. Most changes aren't visible to userland at all at this point and are laying foundation for the planned unified hierarchy. - The biggest change is decoupling the lifetime management of css (cgroup_subsys_state) from that of cgroup's. Because controllers (cpu, memory, block and so on) will need to be dynamically enabled and disabled, css which is the association point between a cgroup and a controller may come and go dynamically across the lifetime of a cgroup. Till now, css's were created when the associated cgroup was created and stayed till the cgroup got destroyed. Assumptions around this tight coupling permeated through cgroup core and controllers. These assumptions are gradually removed, which consists bulk of patches, and css destruction path is completely decoupled from cgroup destruction path. Note that decoupling of creation path is relatively easy on top of these changes and the patchset is pending for the next window. - cgroup has its own event mechanism cgroup.event_control, which is only used by memcg. It is overly complex trying to achieve high flexibility whose benefits seem dubious at best. Going forward, new events will simply generate file modified event and the existing mechanism is being made specific to memcg. This pull request contains prepatory patches for such change. - Various fixes and cleanups" Fixed up conflict in kernel/cgroup.c as per Tejun. * 'for-3.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (69 commits) cgroup: fix cgroup_css() invocation in css_from_id() cgroup: make cgroup_write_event_control() use css_from_dir() instead of __d_cgrp() cgroup: make cgroup_event hold onto cgroup_subsys_state instead of cgroup cgroup: implement CFTYPE_NO_PREFIX cgroup: make cgroup_css() take cgroup_subsys * instead and allow NULL subsys cgroup: rename cgroup_css_from_dir() to css_from_dir() and update its syntax cgroup: fix cgroup_write_event_control() cgroup: fix subsystem file accesses on the root cgroup cgroup: change cgroup_from_id() to css_from_id() cgroup: use css_get() in cgroup_create() to check CSS_ROOT cpuset: remove an unncessary forward declaration cgroup: RCU protect each cgroup_subsys_state release cgroup: move subsys file removal to kill_css() cgroup: factor out kill_css() cgroup: decouple cgroup_subsys_state destruction from cgroup destruction cgroup: replace cgroup->css_kill_cnt with ->nr_css cgroup: bounce cgroup_subsys_state ref kill confirmation to a work item cgroup: move cgroup->subsys[] assignment to online_css() cgroup: reorganize css init / exit paths cgroup: add __rcu modifier to cgroup->subsys[] ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c1643
-rw-r--r--kernel/cgroup_freezer.c155
-rw-r--r--kernel/cpuset.c317
-rw-r--r--kernel/events/core.c27
-rw-r--r--kernel/sched/core.c113
-rw-r--r--kernel/sched/cpuacct.c51
-rw-r--r--kernel/sched/sched.h6
7 files changed, 1246 insertions, 1066 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e91963302c0d..e0aeb32415ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -81,7 +81,7 @@
81 */ 81 */
82#ifdef CONFIG_PROVE_RCU 82#ifdef CONFIG_PROVE_RCU
83DEFINE_MUTEX(cgroup_mutex); 83DEFINE_MUTEX(cgroup_mutex);
84EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ 84EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
85#else 85#else
86static DEFINE_MUTEX(cgroup_mutex); 86static DEFINE_MUTEX(cgroup_mutex);
87#endif 87#endif
@@ -117,6 +117,7 @@ struct cfent {
117 struct list_head node; 117 struct list_head node;
118 struct dentry *dentry; 118 struct dentry *dentry;
119 struct cftype *type; 119 struct cftype *type;
120 struct cgroup_subsys_state *css;
120 121
121 /* file xattrs */ 122 /* file xattrs */
122 struct simple_xattrs xattrs; 123 struct simple_xattrs xattrs;
@@ -159,9 +160,9 @@ struct css_id {
159 */ 160 */
160struct cgroup_event { 161struct cgroup_event {
161 /* 162 /*
162 * Cgroup which the event belongs to. 163 * css which the event belongs to.
163 */ 164 */
164 struct cgroup *cgrp; 165 struct cgroup_subsys_state *css;
165 /* 166 /*
166 * Control file which the event associated. 167 * Control file which the event associated.
167 */ 168 */
@@ -215,10 +216,33 @@ static u64 cgroup_serial_nr_next = 1;
215 */ 216 */
216static int need_forkexit_callback __read_mostly; 217static int need_forkexit_callback __read_mostly;
217 218
218static void cgroup_offline_fn(struct work_struct *work); 219static struct cftype cgroup_base_files[];
220
221static void cgroup_destroy_css_killed(struct cgroup *cgrp);
219static int cgroup_destroy_locked(struct cgroup *cgrp); 222static int cgroup_destroy_locked(struct cgroup *cgrp);
220static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 223static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
221 struct cftype cfts[], bool is_add); 224 bool is_add);
225
226/**
227 * cgroup_css - obtain a cgroup's css for the specified subsystem
228 * @cgrp: the cgroup of interest
229 * @ss: the subsystem of interest (%NULL returns the dummy_css)
230 *
231 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
232 * function must be called either under cgroup_mutex or rcu_read_lock() and
233 * the caller is responsible for pinning the returned css if it wants to
234 * keep accessing it outside the said locks. This function may return
235 * %NULL if @cgrp doesn't have @subsys_id enabled.
236 */
237static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
238 struct cgroup_subsys *ss)
239{
240 if (ss)
241 return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
242 lockdep_is_held(&cgroup_mutex));
243 else
244 return &cgrp->dummy_css;
245}
222 246
223/* convenient tests for these bits */ 247/* convenient tests for these bits */
224static inline bool cgroup_is_dead(const struct cgroup *cgrp) 248static inline bool cgroup_is_dead(const struct cgroup *cgrp)
@@ -365,9 +389,11 @@ static struct cgrp_cset_link init_cgrp_cset_link;
365static int cgroup_init_idr(struct cgroup_subsys *ss, 389static int cgroup_init_idr(struct cgroup_subsys *ss,
366 struct cgroup_subsys_state *css); 390 struct cgroup_subsys_state *css);
367 391
368/* css_set_lock protects the list of css_set objects, and the 392/*
369 * chain of tasks off each css_set. Nests outside task->alloc_lock 393 * css_set_lock protects the list of css_set objects, and the chain of
370 * due to cgroup_iter_start() */ 394 * tasks off each css_set. Nests outside task->alloc_lock due to
395 * css_task_iter_start().
396 */
371static DEFINE_RWLOCK(css_set_lock); 397static DEFINE_RWLOCK(css_set_lock);
372static int css_set_count; 398static int css_set_count;
373 399
@@ -392,10 +418,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
392 return key; 418 return key;
393} 419}
394 420
395/* We don't maintain the lists running through each css_set to its 421/*
396 * task until after the first call to cgroup_iter_start(). This 422 * We don't maintain the lists running through each css_set to its task
397 * reduces the fork()/exit() overhead for people who have cgroups 423 * until after the first call to css_task_iter_start(). This reduces the
398 * compiled into their kernel but not actually in use */ 424 * fork()/exit() overhead for people who have cgroups compiled into their
425 * kernel but not actually in use.
426 */
399static int use_task_css_set_links __read_mostly; 427static int use_task_css_set_links __read_mostly;
400 428
401static void __put_css_set(struct css_set *cset, int taskexit) 429static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +492,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
464 * @new_cgrp: cgroup that's being entered by the task 492 * @new_cgrp: cgroup that's being entered by the task
465 * @template: desired set of css pointers in css_set (pre-calculated) 493 * @template: desired set of css pointers in css_set (pre-calculated)
466 * 494 *
467 * Returns true if "cg" matches "old_cg" except for the hierarchy 495 * Returns true if "cset" matches "old_cset" except for the hierarchy
468 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 496 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
469 */ 497 */
470static bool compare_css_sets(struct css_set *cset, 498static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
555 /* Subsystem is in this hierarchy. So we want 583 /* Subsystem is in this hierarchy. So we want
556 * the subsystem state from the new 584 * the subsystem state from the new
557 * cgroup */ 585 * cgroup */
558 template[i] = cgrp->subsys[i]; 586 template[i] = cgroup_css(cgrp, ss);
559 } else { 587 } else {
560 /* Subsystem is not in this hierarchy, so we 588 /* Subsystem is not in this hierarchy, so we
561 * don't want to change the subsystem state */ 589 * don't want to change the subsystem state */
@@ -803,8 +831,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
803 831
804static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 832static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
805static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 833static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
806static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 834static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
807 unsigned long subsys_mask);
808static const struct inode_operations cgroup_dir_inode_operations; 835static const struct inode_operations cgroup_dir_inode_operations;
809static const struct file_operations proc_cgroupstats_operations; 836static const struct file_operations proc_cgroupstats_operations;
810 837
@@ -813,8 +840,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
813 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 840 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
814}; 841};
815 842
816static int alloc_css_id(struct cgroup_subsys *ss, 843static int alloc_css_id(struct cgroup_subsys_state *child_css);
817 struct cgroup *parent, struct cgroup *child);
818 844
819static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 845static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
820{ 846{
@@ -845,15 +871,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
845static void cgroup_free_fn(struct work_struct *work) 871static void cgroup_free_fn(struct work_struct *work)
846{ 872{
847 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 873 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
848 struct cgroup_subsys *ss;
849 874
850 mutex_lock(&cgroup_mutex); 875 mutex_lock(&cgroup_mutex);
851 /*
852 * Release the subsystem state objects.
853 */
854 for_each_root_subsys(cgrp->root, ss)
855 ss->css_free(cgrp);
856
857 cgrp->root->number_of_cgroups--; 876 cgrp->root->number_of_cgroups--;
858 mutex_unlock(&cgroup_mutex); 877 mutex_unlock(&cgroup_mutex);
859 878
@@ -864,8 +883,6 @@ static void cgroup_free_fn(struct work_struct *work)
864 */ 883 */
865 dput(cgrp->parent->dentry); 884 dput(cgrp->parent->dentry);
866 885
867 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
868
869 /* 886 /*
870 * Drop the active superblock reference that we took when we 887 * Drop the active superblock reference that we took when we
871 * created the cgroup. This will free cgrp->root, if we are 888 * created the cgroup. This will free cgrp->root, if we are
@@ -956,27 +973,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
956} 973}
957 974
958/** 975/**
959 * cgroup_clear_directory - selective removal of base and subsystem files 976 * cgroup_clear_dir - remove subsys files in a cgroup directory
960 * @dir: directory containing the files 977 * @cgrp: target cgroup
961 * @base_files: true if the base files should be removed
962 * @subsys_mask: mask of the subsystem ids whose files should be removed 978 * @subsys_mask: mask of the subsystem ids whose files should be removed
963 */ 979 */
964static void cgroup_clear_directory(struct dentry *dir, bool base_files, 980static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
965 unsigned long subsys_mask)
966{ 981{
967 struct cgroup *cgrp = __d_cgrp(dir);
968 struct cgroup_subsys *ss; 982 struct cgroup_subsys *ss;
983 int i;
969 984
970 for_each_root_subsys(cgrp->root, ss) { 985 for_each_subsys(ss, i) {
971 struct cftype_set *set; 986 struct cftype_set *set;
972 if (!test_bit(ss->subsys_id, &subsys_mask)) 987
988 if (!test_bit(i, &subsys_mask))
973 continue; 989 continue;
974 list_for_each_entry(set, &ss->cftsets, node) 990 list_for_each_entry(set, &ss->cftsets, node)
975 cgroup_addrm_files(cgrp, NULL, set->cfts, false); 991 cgroup_addrm_files(cgrp, set->cfts, false);
976 }
977 if (base_files) {
978 while (!list_empty(&cgrp->files))
979 cgroup_rm_file(cgrp, NULL);
980 } 992 }
981} 993}
982 994
@@ -986,9 +998,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
986static void cgroup_d_remove_dir(struct dentry *dentry) 998static void cgroup_d_remove_dir(struct dentry *dentry)
987{ 999{
988 struct dentry *parent; 1000 struct dentry *parent;
989 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
990
991 cgroup_clear_directory(dentry, true, root->subsys_mask);
992 1001
993 parent = dentry->d_parent; 1002 parent = dentry->d_parent;
994 spin_lock(&parent->d_lock); 1003 spin_lock(&parent->d_lock);
@@ -1009,79 +1018,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1009{ 1018{
1010 struct cgroup *cgrp = &root->top_cgroup; 1019 struct cgroup *cgrp = &root->top_cgroup;
1011 struct cgroup_subsys *ss; 1020 struct cgroup_subsys *ss;
1012 int i; 1021 unsigned long pinned = 0;
1022 int i, ret;
1013 1023
1014 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1024 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1015 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1025 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1016 1026
1017 /* Check that any added subsystems are currently free */ 1027 /* Check that any added subsystems are currently free */
1018 for_each_subsys(ss, i) { 1028 for_each_subsys(ss, i) {
1019 unsigned long bit = 1UL << i; 1029 if (!(added_mask & (1 << i)))
1020
1021 if (!(bit & added_mask))
1022 continue; 1030 continue;
1023 1031
1032 /* is the subsystem mounted elsewhere? */
1024 if (ss->root != &cgroup_dummy_root) { 1033 if (ss->root != &cgroup_dummy_root) {
1025 /* Subsystem isn't free */ 1034 ret = -EBUSY;
1026 return -EBUSY; 1035 goto out_put;
1036 }
1037
1038 /* pin the module */
1039 if (!try_module_get(ss->module)) {
1040 ret = -ENOENT;
1041 goto out_put;
1027 } 1042 }
1043 pinned |= 1 << i;
1028 } 1044 }
1029 1045
1030 /* Currently we don't handle adding/removing subsystems when 1046 /* subsys could be missing if unloaded between parsing and here */
1031 * any child cgroups exist. This is theoretically supportable 1047 if (added_mask != pinned) {
1032 * but involves complex error handling, so it's being left until 1048 ret = -ENOENT;
1033 * later */ 1049 goto out_put;
1034 if (root->number_of_cgroups > 1) 1050 }
1035 return -EBUSY; 1051
1052 ret = cgroup_populate_dir(cgrp, added_mask);
1053 if (ret)
1054 goto out_put;
1055
1056 /*
1057 * Nothing can fail from this point on. Remove files for the
1058 * removed subsystems and rebind each subsystem.
1059 */
1060 cgroup_clear_dir(cgrp, removed_mask);
1036 1061
1037 /* Process each subsystem */
1038 for_each_subsys(ss, i) { 1062 for_each_subsys(ss, i) {
1039 unsigned long bit = 1UL << i; 1063 unsigned long bit = 1UL << i;
1040 1064
1041 if (bit & added_mask) { 1065 if (bit & added_mask) {
1042 /* We're binding this subsystem to this hierarchy */ 1066 /* We're binding this subsystem to this hierarchy */
1043 BUG_ON(cgrp->subsys[i]); 1067 BUG_ON(cgroup_css(cgrp, ss));
1044 BUG_ON(!cgroup_dummy_top->subsys[i]); 1068 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1045 BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); 1069 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1070
1071 rcu_assign_pointer(cgrp->subsys[i],
1072 cgroup_css(cgroup_dummy_top, ss));
1073 cgroup_css(cgrp, ss)->cgroup = cgrp;
1046 1074
1047 cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
1048 cgrp->subsys[i]->cgroup = cgrp;
1049 list_move(&ss->sibling, &root->subsys_list); 1075 list_move(&ss->sibling, &root->subsys_list);
1050 ss->root = root; 1076 ss->root = root;
1051 if (ss->bind) 1077 if (ss->bind)
1052 ss->bind(cgrp); 1078 ss->bind(cgroup_css(cgrp, ss));
1053 1079
1054 /* refcount was already taken, and we're keeping it */ 1080 /* refcount was already taken, and we're keeping it */
1055 root->subsys_mask |= bit; 1081 root->subsys_mask |= bit;
1056 } else if (bit & removed_mask) { 1082 } else if (bit & removed_mask) {
1057 /* We're removing this subsystem */ 1083 /* We're removing this subsystem */
1058 BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); 1084 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1059 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1085 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1060 1086
1061 if (ss->bind) 1087 if (ss->bind)
1062 ss->bind(cgroup_dummy_top); 1088 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1063 cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; 1089
1064 cgrp->subsys[i] = NULL; 1090 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
1091 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1092
1065 cgroup_subsys[i]->root = &cgroup_dummy_root; 1093 cgroup_subsys[i]->root = &cgroup_dummy_root;
1066 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); 1094 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1067 1095
1068 /* subsystem is now free - drop reference on module */ 1096 /* subsystem is now free - drop reference on module */
1069 module_put(ss->module); 1097 module_put(ss->module);
1070 root->subsys_mask &= ~bit; 1098 root->subsys_mask &= ~bit;
1071 } else if (bit & root->subsys_mask) {
1072 /* Subsystem state should already exist */
1073 BUG_ON(!cgrp->subsys[i]);
1074 /*
1075 * a refcount was taken, but we already had one, so
1076 * drop the extra reference.
1077 */
1078 module_put(ss->module);
1079#ifdef CONFIG_MODULE_UNLOAD
1080 BUG_ON(ss->module && !module_refcount(ss->module));
1081#endif
1082 } else {
1083 /* Subsystem state shouldn't exist */
1084 BUG_ON(cgrp->subsys[i]);
1085 } 1099 }
1086 } 1100 }
1087 1101
@@ -1092,6 +1106,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1092 root->flags |= CGRP_ROOT_SUBSYS_BOUND; 1106 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1093 1107
1094 return 0; 1108 return 0;
1109
1110out_put:
1111 for_each_subsys(ss, i)
1112 if (pinned & (1 << i))
1113 module_put(ss->module);
1114 return ret;
1095} 1115}
1096 1116
1097static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1117static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1142,7 +1162,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1142 char *token, *o = data; 1162 char *token, *o = data;
1143 bool all_ss = false, one_ss = false; 1163 bool all_ss = false, one_ss = false;
1144 unsigned long mask = (unsigned long)-1; 1164 unsigned long mask = (unsigned long)-1;
1145 bool module_pin_failed = false;
1146 struct cgroup_subsys *ss; 1165 struct cgroup_subsys *ss;
1147 int i; 1166 int i;
1148 1167
@@ -1285,52 +1304,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1285 if (!opts->subsys_mask && !opts->name) 1304 if (!opts->subsys_mask && !opts->name)
1286 return -EINVAL; 1305 return -EINVAL;
1287 1306
1288 /*
1289 * Grab references on all the modules we'll need, so the subsystems
1290 * don't dance around before rebind_subsystems attaches them. This may
1291 * take duplicate reference counts on a subsystem that's already used,
1292 * but rebind_subsystems handles this case.
1293 */
1294 for_each_subsys(ss, i) {
1295 if (!(opts->subsys_mask & (1UL << i)))
1296 continue;
1297 if (!try_module_get(cgroup_subsys[i]->module)) {
1298 module_pin_failed = true;
1299 break;
1300 }
1301 }
1302 if (module_pin_failed) {
1303 /*
1304 * oops, one of the modules was going away. this means that we
1305 * raced with a module_delete call, and to the user this is
1306 * essentially a "subsystem doesn't exist" case.
1307 */
1308 for (i--; i >= 0; i--) {
1309 /* drop refcounts only on the ones we took */
1310 unsigned long bit = 1UL << i;
1311
1312 if (!(bit & opts->subsys_mask))
1313 continue;
1314 module_put(cgroup_subsys[i]->module);
1315 }
1316 return -ENOENT;
1317 }
1318
1319 return 0; 1307 return 0;
1320} 1308}
1321 1309
1322static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1323{
1324 struct cgroup_subsys *ss;
1325 int i;
1326
1327 mutex_lock(&cgroup_mutex);
1328 for_each_subsys(ss, i)
1329 if (subsys_mask & (1UL << i))
1330 module_put(cgroup_subsys[i]->module);
1331 mutex_unlock(&cgroup_mutex);
1332}
1333
1334static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1310static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1335{ 1311{
1336 int ret = 0; 1312 int ret = 0;
@@ -1370,22 +1346,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1370 goto out_unlock; 1346 goto out_unlock;
1371 } 1347 }
1372 1348
1373 /* 1349 /* remounting is not allowed for populated hierarchies */
1374 * Clear out the files of subsystems that should be removed, do 1350 if (root->number_of_cgroups > 1) {
1375 * this before rebind_subsystems, since rebind_subsystems may 1351 ret = -EBUSY;
1376 * change this hierarchy's subsys_list.
1377 */
1378 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1379
1380 ret = rebind_subsystems(root, added_mask, removed_mask);
1381 if (ret) {
1382 /* rebind_subsystems failed, re-populate the removed files */
1383 cgroup_populate_dir(cgrp, false, removed_mask);
1384 goto out_unlock; 1352 goto out_unlock;
1385 } 1353 }
1386 1354
1387 /* re-populate subsystem files */ 1355 ret = rebind_subsystems(root, added_mask, removed_mask);
1388 cgroup_populate_dir(cgrp, false, added_mask); 1356 if (ret)
1357 goto out_unlock;
1389 1358
1390 if (opts.release_agent) 1359 if (opts.release_agent)
1391 strcpy(root->release_agent_path, opts.release_agent); 1360 strcpy(root->release_agent_path, opts.release_agent);
@@ -1395,8 +1364,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1395 mutex_unlock(&cgroup_root_mutex); 1364 mutex_unlock(&cgroup_root_mutex);
1396 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1397 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1366 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1398 if (ret)
1399 drop_parsed_module_refcounts(opts.subsys_mask);
1400 return ret; 1367 return ret;
1401} 1368}
1402 1369
@@ -1416,6 +1383,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1416 INIT_LIST_HEAD(&cgrp->release_list); 1383 INIT_LIST_HEAD(&cgrp->release_list);
1417 INIT_LIST_HEAD(&cgrp->pidlists); 1384 INIT_LIST_HEAD(&cgrp->pidlists);
1418 mutex_init(&cgrp->pidlist_mutex); 1385 mutex_init(&cgrp->pidlist_mutex);
1386 cgrp->dummy_css.cgroup = cgrp;
1419 INIT_LIST_HEAD(&cgrp->event_list); 1387 INIT_LIST_HEAD(&cgrp->event_list);
1420 spin_lock_init(&cgrp->event_list_lock); 1388 spin_lock_init(&cgrp->event_list_lock);
1421 simple_xattrs_init(&cgrp->xattrs); 1389 simple_xattrs_init(&cgrp->xattrs);
@@ -1431,6 +1399,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1431 cgrp->root = root; 1399 cgrp->root = root;
1432 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); 1400 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1433 init_cgroup_housekeeping(cgrp); 1401 init_cgroup_housekeeping(cgrp);
1402 idr_init(&root->cgroup_idr);
1434} 1403}
1435 1404
1436static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) 1405static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1503,7 +1472,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1503 */ 1472 */
1504 root->subsys_mask = opts->subsys_mask; 1473 root->subsys_mask = opts->subsys_mask;
1505 root->flags = opts->flags; 1474 root->flags = opts->flags;
1506 ida_init(&root->cgroup_ida);
1507 if (opts->release_agent) 1475 if (opts->release_agent)
1508 strcpy(root->release_agent_path, opts->release_agent); 1476 strcpy(root->release_agent_path, opts->release_agent);
1509 if (opts->name) 1477 if (opts->name)
@@ -1519,7 +1487,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
1519 /* hierarhcy ID shoulid already have been released */ 1487 /* hierarhcy ID shoulid already have been released */
1520 WARN_ON_ONCE(root->hierarchy_id); 1488 WARN_ON_ONCE(root->hierarchy_id);
1521 1489
1522 ida_destroy(&root->cgroup_ida); 1490 idr_destroy(&root->cgroup_idr);
1523 kfree(root); 1491 kfree(root);
1524 } 1492 }
1525} 1493}
@@ -1584,7 +1552,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1584 int ret = 0; 1552 int ret = 0;
1585 struct super_block *sb; 1553 struct super_block *sb;
1586 struct cgroupfs_root *new_root; 1554 struct cgroupfs_root *new_root;
1555 struct list_head tmp_links;
1587 struct inode *inode; 1556 struct inode *inode;
1557 const struct cred *cred;
1588 1558
1589 /* First find the desired set of subsystems */ 1559 /* First find the desired set of subsystems */
1590 mutex_lock(&cgroup_mutex); 1560 mutex_lock(&cgroup_mutex);
@@ -1600,7 +1570,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1600 new_root = cgroup_root_from_opts(&opts); 1570 new_root = cgroup_root_from_opts(&opts);
1601 if (IS_ERR(new_root)) { 1571 if (IS_ERR(new_root)) {
1602 ret = PTR_ERR(new_root); 1572 ret = PTR_ERR(new_root);
1603 goto drop_modules; 1573 goto out_err;
1604 } 1574 }
1605 opts.new_root = new_root; 1575 opts.new_root = new_root;
1606 1576
@@ -1609,17 +1579,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1609 if (IS_ERR(sb)) { 1579 if (IS_ERR(sb)) {
1610 ret = PTR_ERR(sb); 1580 ret = PTR_ERR(sb);
1611 cgroup_free_root(opts.new_root); 1581 cgroup_free_root(opts.new_root);
1612 goto drop_modules; 1582 goto out_err;
1613 } 1583 }
1614 1584
1615 root = sb->s_fs_info; 1585 root = sb->s_fs_info;
1616 BUG_ON(!root); 1586 BUG_ON(!root);
1617 if (root == opts.new_root) { 1587 if (root == opts.new_root) {
1618 /* We used the new root structure, so this is a new hierarchy */ 1588 /* We used the new root structure, so this is a new hierarchy */
1619 struct list_head tmp_links;
1620 struct cgroup *root_cgrp = &root->top_cgroup; 1589 struct cgroup *root_cgrp = &root->top_cgroup;
1621 struct cgroupfs_root *existing_root; 1590 struct cgroupfs_root *existing_root;
1622 const struct cred *cred;
1623 int i; 1591 int i;
1624 struct css_set *cset; 1592 struct css_set *cset;
1625 1593
@@ -1634,6 +1602,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1634 mutex_lock(&cgroup_mutex); 1602 mutex_lock(&cgroup_mutex);
1635 mutex_lock(&cgroup_root_mutex); 1603 mutex_lock(&cgroup_root_mutex);
1636 1604
1605 root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
1606 0, 1, GFP_KERNEL);
1607 if (root_cgrp->id < 0)
1608 goto unlock_drop;
1609
1637 /* Check for name clashes with existing mounts */ 1610 /* Check for name clashes with existing mounts */
1638 ret = -EBUSY; 1611 ret = -EBUSY;
1639 if (strlen(root->name)) 1612 if (strlen(root->name))
@@ -1657,26 +1630,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1657 if (ret) 1630 if (ret)
1658 goto unlock_drop; 1631 goto unlock_drop;
1659 1632
1633 sb->s_root->d_fsdata = root_cgrp;
1634 root_cgrp->dentry = sb->s_root;
1635
1636 /*
1637 * We're inside get_sb() and will call lookup_one_len() to
1638 * create the root files, which doesn't work if SELinux is
1639 * in use. The following cred dancing somehow works around
1640 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1641 * populating new cgroupfs mount") for more details.
1642 */
1643 cred = override_creds(&init_cred);
1644
1645 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1646 if (ret)
1647 goto rm_base_files;
1648
1660 ret = rebind_subsystems(root, root->subsys_mask, 0); 1649 ret = rebind_subsystems(root, root->subsys_mask, 0);
1661 if (ret == -EBUSY) { 1650 if (ret)
1662 free_cgrp_cset_links(&tmp_links); 1651 goto rm_base_files;
1663 goto unlock_drop; 1652
1664 } 1653 revert_creds(cred);
1654
1665 /* 1655 /*
1666 * There must be no failure case after here, since rebinding 1656 * There must be no failure case after here, since rebinding
1667 * takes care of subsystems' refcounts, which are explicitly 1657 * takes care of subsystems' refcounts, which are explicitly
1668 * dropped in the failure exit path. 1658 * dropped in the failure exit path.
1669 */ 1659 */
1670 1660
1671 /* EBUSY should be the only error here */
1672 BUG_ON(ret);
1673
1674 list_add(&root->root_list, &cgroup_roots); 1661 list_add(&root->root_list, &cgroup_roots);
1675 cgroup_root_count++; 1662 cgroup_root_count++;
1676 1663
1677 sb->s_root->d_fsdata = root_cgrp;
1678 root->top_cgroup.dentry = sb->s_root;
1679
1680 /* Link the top cgroup in this hierarchy into all 1664 /* Link the top cgroup in this hierarchy into all
1681 * the css_set objects */ 1665 * the css_set objects */
1682 write_lock(&css_set_lock); 1666 write_lock(&css_set_lock);
@@ -1689,9 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1689 BUG_ON(!list_empty(&root_cgrp->children)); 1673 BUG_ON(!list_empty(&root_cgrp->children));
1690 BUG_ON(root->number_of_cgroups != 1); 1674 BUG_ON(root->number_of_cgroups != 1);
1691 1675
1692 cred = override_creds(&init_cred);
1693 cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1694 revert_creds(cred);
1695 mutex_unlock(&cgroup_root_mutex); 1676 mutex_unlock(&cgroup_root_mutex);
1696 mutex_unlock(&cgroup_mutex); 1677 mutex_unlock(&cgroup_mutex);
1697 mutex_unlock(&inode->i_mutex); 1678 mutex_unlock(&inode->i_mutex);
@@ -1711,15 +1692,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1711 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1692 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1712 } 1693 }
1713 } 1694 }
1714
1715 /* no subsys rebinding, so refcounts don't change */
1716 drop_parsed_module_refcounts(opts.subsys_mask);
1717 } 1695 }
1718 1696
1719 kfree(opts.release_agent); 1697 kfree(opts.release_agent);
1720 kfree(opts.name); 1698 kfree(opts.name);
1721 return dget(sb->s_root); 1699 return dget(sb->s_root);
1722 1700
1701 rm_base_files:
1702 free_cgrp_cset_links(&tmp_links);
1703 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1704 revert_creds(cred);
1723 unlock_drop: 1705 unlock_drop:
1724 cgroup_exit_root_id(root); 1706 cgroup_exit_root_id(root);
1725 mutex_unlock(&cgroup_root_mutex); 1707 mutex_unlock(&cgroup_root_mutex);
@@ -1727,8 +1709,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1727 mutex_unlock(&inode->i_mutex); 1709 mutex_unlock(&inode->i_mutex);
1728 drop_new_super: 1710 drop_new_super:
1729 deactivate_locked_super(sb); 1711 deactivate_locked_super(sb);
1730 drop_modules:
1731 drop_parsed_module_refcounts(opts.subsys_mask);
1732 out_err: 1712 out_err:
1733 kfree(opts.release_agent); 1713 kfree(opts.release_agent);
1734 kfree(opts.name); 1714 kfree(opts.name);
@@ -1746,6 +1726,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1746 BUG_ON(root->number_of_cgroups != 1); 1726 BUG_ON(root->number_of_cgroups != 1);
1747 BUG_ON(!list_empty(&cgrp->children)); 1727 BUG_ON(!list_empty(&cgrp->children));
1748 1728
1729 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1749 mutex_lock(&cgroup_mutex); 1730 mutex_lock(&cgroup_mutex);
1750 mutex_lock(&cgroup_root_mutex); 1731 mutex_lock(&cgroup_root_mutex);
1751 1732
@@ -1778,6 +1759,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1778 1759
1779 mutex_unlock(&cgroup_root_mutex); 1760 mutex_unlock(&cgroup_root_mutex);
1780 mutex_unlock(&cgroup_mutex); 1761 mutex_unlock(&cgroup_mutex);
1762 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1781 1763
1782 simple_xattrs_free(&cgrp->xattrs); 1764 simple_xattrs_free(&cgrp->xattrs);
1783 1765
@@ -1889,7 +1871,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
1889struct task_and_cgroup { 1871struct task_and_cgroup {
1890 struct task_struct *task; 1872 struct task_struct *task;
1891 struct cgroup *cgrp; 1873 struct cgroup *cgrp;
1892 struct css_set *cg; 1874 struct css_set *cset;
1893}; 1875};
1894 1876
1895struct cgroup_taskset { 1877struct cgroup_taskset {
@@ -1939,18 +1921,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1939EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1921EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1940 1922
1941/** 1923/**
1942 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task 1924 * cgroup_taskset_cur_css - return the matching css for the current task
1943 * @tset: taskset of interest 1925 * @tset: taskset of interest
1926 * @subsys_id: the ID of the target subsystem
1944 * 1927 *
1945 * Return the cgroup for the current (last returned) task of @tset. This 1928 * Return the css for the current (last returned) task of @tset for
1946 * function must be preceded by either cgroup_taskset_first() or 1929 * subsystem specified by @subsys_id. This function must be preceded by
1947 * cgroup_taskset_next(). 1930 * either cgroup_taskset_first() or cgroup_taskset_next().
1948 */ 1931 */
1949struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) 1932struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1933 int subsys_id)
1950{ 1934{
1951 return tset->cur_cgrp; 1935 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1952} 1936}
1953EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); 1937EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1954 1938
1955/** 1939/**
1956 * cgroup_taskset_size - return the number of tasks in taskset 1940 * cgroup_taskset_size - return the number of tasks in taskset
@@ -2089,8 +2073,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2089 * step 1: check that we can legitimately attach to the cgroup. 2073 * step 1: check that we can legitimately attach to the cgroup.
2090 */ 2074 */
2091 for_each_root_subsys(root, ss) { 2075 for_each_root_subsys(root, ss) {
2076 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2077
2092 if (ss->can_attach) { 2078 if (ss->can_attach) {
2093 retval = ss->can_attach(cgrp, &tset); 2079 retval = ss->can_attach(css, &tset);
2094 if (retval) { 2080 if (retval) {
2095 failed_ss = ss; 2081 failed_ss = ss;
2096 goto out_cancel_attach; 2082 goto out_cancel_attach;
@@ -2107,8 +2093,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2107 2093
2108 tc = flex_array_get(group, i); 2094 tc = flex_array_get(group, i);
2109 old_cset = task_css_set(tc->task); 2095 old_cset = task_css_set(tc->task);
2110 tc->cg = find_css_set(old_cset, cgrp); 2096 tc->cset = find_css_set(old_cset, cgrp);
2111 if (!tc->cg) { 2097 if (!tc->cset) {
2112 retval = -ENOMEM; 2098 retval = -ENOMEM;
2113 goto out_put_css_set_refs; 2099 goto out_put_css_set_refs;
2114 } 2100 }
@@ -2121,7 +2107,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2121 */ 2107 */
2122 for (i = 0; i < group_size; i++) { 2108 for (i = 0; i < group_size; i++) {
2123 tc = flex_array_get(group, i); 2109 tc = flex_array_get(group, i);
2124 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); 2110 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2125 } 2111 }
2126 /* nothing is sensitive to fork() after this point. */ 2112 /* nothing is sensitive to fork() after this point. */
2127 2113
@@ -2129,8 +2115,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2129 * step 4: do subsystem attach callbacks. 2115 * step 4: do subsystem attach callbacks.
2130 */ 2116 */
2131 for_each_root_subsys(root, ss) { 2117 for_each_root_subsys(root, ss) {
2118 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2119
2132 if (ss->attach) 2120 if (ss->attach)
2133 ss->attach(cgrp, &tset); 2121 ss->attach(css, &tset);
2134 } 2122 }
2135 2123
2136 /* 2124 /*
@@ -2141,18 +2129,20 @@ out_put_css_set_refs:
2141 if (retval) { 2129 if (retval) {
2142 for (i = 0; i < group_size; i++) { 2130 for (i = 0; i < group_size; i++) {
2143 tc = flex_array_get(group, i); 2131 tc = flex_array_get(group, i);
2144 if (!tc->cg) 2132 if (!tc->cset)
2145 break; 2133 break;
2146 put_css_set(tc->cg); 2134 put_css_set(tc->cset);
2147 } 2135 }
2148 } 2136 }
2149out_cancel_attach: 2137out_cancel_attach:
2150 if (retval) { 2138 if (retval) {
2151 for_each_root_subsys(root, ss) { 2139 for_each_root_subsys(root, ss) {
2140 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2141
2152 if (ss == failed_ss) 2142 if (ss == failed_ss)
2153 break; 2143 break;
2154 if (ss->cancel_attach) 2144 if (ss->cancel_attach)
2155 ss->cancel_attach(cgrp, &tset); 2145 ss->cancel_attach(css, &tset);
2156 } 2146 }
2157 } 2147 }
2158out_free_group_list: 2148out_free_group_list:
@@ -2253,9 +2243,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2253 2243
2254 mutex_lock(&cgroup_mutex); 2244 mutex_lock(&cgroup_mutex);
2255 for_each_active_root(root) { 2245 for_each_active_root(root) {
2256 struct cgroup *from_cg = task_cgroup_from_root(from, root); 2246 struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
2257 2247
2258 retval = cgroup_attach_task(from_cg, tsk, false); 2248 retval = cgroup_attach_task(from_cgrp, tsk, false);
2259 if (retval) 2249 if (retval)
2260 break; 2250 break;
2261 } 2251 }
@@ -2265,34 +2255,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2265} 2255}
2266EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2256EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2267 2257
2268static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2258static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2259 struct cftype *cft, u64 pid)
2269{ 2260{
2270 return attach_task_by_pid(cgrp, pid, false); 2261 return attach_task_by_pid(css->cgroup, pid, false);
2271} 2262}
2272 2263
2273static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2264static int cgroup_procs_write(struct cgroup_subsys_state *css,
2265 struct cftype *cft, u64 tgid)
2274{ 2266{
2275 return attach_task_by_pid(cgrp, tgid, true); 2267 return attach_task_by_pid(css->cgroup, tgid, true);
2276} 2268}
2277 2269
2278static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2270static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2279 const char *buffer) 2271 struct cftype *cft, const char *buffer)
2280{ 2272{
2281 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2273 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
2282 if (strlen(buffer) >= PATH_MAX) 2274 if (strlen(buffer) >= PATH_MAX)
2283 return -EINVAL; 2275 return -EINVAL;
2284 if (!cgroup_lock_live_group(cgrp)) 2276 if (!cgroup_lock_live_group(css->cgroup))
2285 return -ENODEV; 2277 return -ENODEV;
2286 mutex_lock(&cgroup_root_mutex); 2278 mutex_lock(&cgroup_root_mutex);
2287 strcpy(cgrp->root->release_agent_path, buffer); 2279 strcpy(css->cgroup->root->release_agent_path, buffer);
2288 mutex_unlock(&cgroup_root_mutex); 2280 mutex_unlock(&cgroup_root_mutex);
2289 mutex_unlock(&cgroup_mutex); 2281 mutex_unlock(&cgroup_mutex);
2290 return 0; 2282 return 0;
2291} 2283}
2292 2284
2293static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, 2285static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2294 struct seq_file *seq) 2286 struct cftype *cft, struct seq_file *seq)
2295{ 2287{
2288 struct cgroup *cgrp = css->cgroup;
2289
2296 if (!cgroup_lock_live_group(cgrp)) 2290 if (!cgroup_lock_live_group(cgrp))
2297 return -ENODEV; 2291 return -ENODEV;
2298 seq_puts(seq, cgrp->root->release_agent_path); 2292 seq_puts(seq, cgrp->root->release_agent_path);
@@ -2301,20 +2295,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2301 return 0; 2295 return 0;
2302} 2296}
2303 2297
2304static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, 2298static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
2305 struct seq_file *seq) 2299 struct cftype *cft, struct seq_file *seq)
2306{ 2300{
2307 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2301 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
2308 return 0; 2302 return 0;
2309} 2303}
2310 2304
2311/* A buffer size big enough for numbers or short strings */ 2305/* A buffer size big enough for numbers or short strings */
2312#define CGROUP_LOCAL_BUFFER_SIZE 64 2306#define CGROUP_LOCAL_BUFFER_SIZE 64
2313 2307
2314static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 2308static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
2315 struct file *file, 2309 struct cftype *cft, struct file *file,
2316 const char __user *userbuf, 2310 const char __user *userbuf, size_t nbytes,
2317 size_t nbytes, loff_t *unused_ppos) 2311 loff_t *unused_ppos)
2318{ 2312{
2319 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2313 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2320 int retval = 0; 2314 int retval = 0;
@@ -2332,22 +2326,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2332 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2326 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2333 if (*end) 2327 if (*end)
2334 return -EINVAL; 2328 return -EINVAL;
2335 retval = cft->write_u64(cgrp, cft, val); 2329 retval = cft->write_u64(css, cft, val);
2336 } else { 2330 } else {
2337 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2331 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2338 if (*end) 2332 if (*end)
2339 return -EINVAL; 2333 return -EINVAL;
2340 retval = cft->write_s64(cgrp, cft, val); 2334 retval = cft->write_s64(css, cft, val);
2341 } 2335 }
2342 if (!retval) 2336 if (!retval)
2343 retval = nbytes; 2337 retval = nbytes;
2344 return retval; 2338 return retval;
2345} 2339}
2346 2340
2347static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, 2341static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
2348 struct file *file, 2342 struct cftype *cft, struct file *file,
2349 const char __user *userbuf, 2343 const char __user *userbuf, size_t nbytes,
2350 size_t nbytes, loff_t *unused_ppos) 2344 loff_t *unused_ppos)
2351{ 2345{
2352 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2346 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2353 int retval = 0; 2347 int retval = 0;
@@ -2370,7 +2364,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2370 } 2364 }
2371 2365
2372 buffer[nbytes] = 0; /* nul-terminate */ 2366 buffer[nbytes] = 0; /* nul-terminate */
2373 retval = cft->write_string(cgrp, cft, strstrip(buffer)); 2367 retval = cft->write_string(css, cft, strstrip(buffer));
2374 if (!retval) 2368 if (!retval)
2375 retval = nbytes; 2369 retval = nbytes;
2376out: 2370out:
@@ -2380,65 +2374,60 @@ out:
2380} 2374}
2381 2375
2382static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2376static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2383 size_t nbytes, loff_t *ppos) 2377 size_t nbytes, loff_t *ppos)
2384{ 2378{
2379 struct cfent *cfe = __d_cfe(file->f_dentry);
2385 struct cftype *cft = __d_cft(file->f_dentry); 2380 struct cftype *cft = __d_cft(file->f_dentry);
2386 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2381 struct cgroup_subsys_state *css = cfe->css;
2387 2382
2388 if (cgroup_is_dead(cgrp))
2389 return -ENODEV;
2390 if (cft->write) 2383 if (cft->write)
2391 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2384 return cft->write(css, cft, file, buf, nbytes, ppos);
2392 if (cft->write_u64 || cft->write_s64) 2385 if (cft->write_u64 || cft->write_s64)
2393 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 2386 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
2394 if (cft->write_string) 2387 if (cft->write_string)
2395 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); 2388 return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
2396 if (cft->trigger) { 2389 if (cft->trigger) {
2397 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 2390 int ret = cft->trigger(css, (unsigned int)cft->private);
2398 return ret ? ret : nbytes; 2391 return ret ? ret : nbytes;
2399 } 2392 }
2400 return -EINVAL; 2393 return -EINVAL;
2401} 2394}
2402 2395
2403static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 2396static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
2404 struct file *file, 2397 struct cftype *cft, struct file *file,
2405 char __user *buf, size_t nbytes, 2398 char __user *buf, size_t nbytes, loff_t *ppos)
2406 loff_t *ppos)
2407{ 2399{
2408 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2400 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2409 u64 val = cft->read_u64(cgrp, cft); 2401 u64 val = cft->read_u64(css, cft);
2410 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 2402 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2411 2403
2412 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2404 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2413} 2405}
2414 2406
2415static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, 2407static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
2416 struct file *file, 2408 struct cftype *cft, struct file *file,
2417 char __user *buf, size_t nbytes, 2409 char __user *buf, size_t nbytes, loff_t *ppos)
2418 loff_t *ppos)
2419{ 2410{
2420 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2411 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2421 s64 val = cft->read_s64(cgrp, cft); 2412 s64 val = cft->read_s64(css, cft);
2422 int len = sprintf(tmp, "%lld\n", (long long) val); 2413 int len = sprintf(tmp, "%lld\n", (long long) val);
2423 2414
2424 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2415 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2425} 2416}
2426 2417
2427static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2418static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2428 size_t nbytes, loff_t *ppos) 2419 size_t nbytes, loff_t *ppos)
2429{ 2420{
2421 struct cfent *cfe = __d_cfe(file->f_dentry);
2430 struct cftype *cft = __d_cft(file->f_dentry); 2422 struct cftype *cft = __d_cft(file->f_dentry);
2431 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2423 struct cgroup_subsys_state *css = cfe->css;
2432
2433 if (cgroup_is_dead(cgrp))
2434 return -ENODEV;
2435 2424
2436 if (cft->read) 2425 if (cft->read)
2437 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 2426 return cft->read(css, cft, file, buf, nbytes, ppos);
2438 if (cft->read_u64) 2427 if (cft->read_u64)
2439 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 2428 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
2440 if (cft->read_s64) 2429 if (cft->read_s64)
2441 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); 2430 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
2442 return -EINVAL; 2431 return -EINVAL;
2443} 2432}
2444 2433
@@ -2447,11 +2436,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2447 * supports string->u64 maps, but can be extended in future. 2436 * supports string->u64 maps, but can be extended in future.
2448 */ 2437 */
2449 2438
2450struct cgroup_seqfile_state {
2451 struct cftype *cft;
2452 struct cgroup *cgroup;
2453};
2454
2455static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 2439static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2456{ 2440{
2457 struct seq_file *sf = cb->state; 2441 struct seq_file *sf = cb->state;
@@ -2460,69 +2444,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2460 2444
2461static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2445static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2462{ 2446{
2463 struct cgroup_seqfile_state *state = m->private; 2447 struct cfent *cfe = m->private;
2464 struct cftype *cft = state->cft; 2448 struct cftype *cft = cfe->type;
2449 struct cgroup_subsys_state *css = cfe->css;
2450
2465 if (cft->read_map) { 2451 if (cft->read_map) {
2466 struct cgroup_map_cb cb = { 2452 struct cgroup_map_cb cb = {
2467 .fill = cgroup_map_add, 2453 .fill = cgroup_map_add,
2468 .state = m, 2454 .state = m,
2469 }; 2455 };
2470 return cft->read_map(state->cgroup, cft, &cb); 2456 return cft->read_map(css, cft, &cb);
2471 } 2457 }
2472 return cft->read_seq_string(state->cgroup, cft, m); 2458 return cft->read_seq_string(css, cft, m);
2473}
2474
2475static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2476{
2477 struct seq_file *seq = file->private_data;
2478 kfree(seq->private);
2479 return single_release(inode, file);
2480} 2459}
2481 2460
2482static const struct file_operations cgroup_seqfile_operations = { 2461static const struct file_operations cgroup_seqfile_operations = {
2483 .read = seq_read, 2462 .read = seq_read,
2484 .write = cgroup_file_write, 2463 .write = cgroup_file_write,
2485 .llseek = seq_lseek, 2464 .llseek = seq_lseek,
2486 .release = cgroup_seqfile_release, 2465 .release = single_release,
2487}; 2466};
2488 2467
2489static int cgroup_file_open(struct inode *inode, struct file *file) 2468static int cgroup_file_open(struct inode *inode, struct file *file)
2490{ 2469{
2470 struct cfent *cfe = __d_cfe(file->f_dentry);
2471 struct cftype *cft = __d_cft(file->f_dentry);
2472 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2473 struct cgroup_subsys_state *css;
2491 int err; 2474 int err;
2492 struct cftype *cft;
2493 2475
2494 err = generic_file_open(inode, file); 2476 err = generic_file_open(inode, file);
2495 if (err) 2477 if (err)
2496 return err; 2478 return err;
2497 cft = __d_cft(file->f_dentry);
2498 2479
2499 if (cft->read_map || cft->read_seq_string) { 2480 /*
2500 struct cgroup_seqfile_state *state; 2481 * If the file belongs to a subsystem, pin the css. Will be
2482 * unpinned either on open failure or release. This ensures that
2483 * @css stays alive for all file operations.
2484 */
2485 rcu_read_lock();
2486 css = cgroup_css(cgrp, cft->ss);
2487 if (cft->ss && !css_tryget(css))
2488 css = NULL;
2489 rcu_read_unlock();
2501 2490
2502 state = kzalloc(sizeof(*state), GFP_USER); 2491 if (!css)
2503 if (!state) 2492 return -ENODEV;
2504 return -ENOMEM; 2493
2494 /*
2495 * @cfe->css is used by read/write/close to determine the
2496 * associated css. @file->private_data would be a better place but
2497 * that's already used by seqfile. Multiple accessors may use it
2498 * simultaneously which is okay as the association never changes.
2499 */
2500 WARN_ON_ONCE(cfe->css && cfe->css != css);
2501 cfe->css = css;
2505 2502
2506 state->cft = cft; 2503 if (cft->read_map || cft->read_seq_string) {
2507 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2508 file->f_op = &cgroup_seqfile_operations; 2504 file->f_op = &cgroup_seqfile_operations;
2509 err = single_open(file, cgroup_seqfile_show, state); 2505 err = single_open(file, cgroup_seqfile_show, cfe);
2510 if (err < 0) 2506 } else if (cft->open) {
2511 kfree(state);
2512 } else if (cft->open)
2513 err = cft->open(inode, file); 2507 err = cft->open(inode, file);
2514 else 2508 }
2515 err = 0;
2516 2509
2510 if (css->ss && err)
2511 css_put(css);
2517 return err; 2512 return err;
2518} 2513}
2519 2514
2520static int cgroup_file_release(struct inode *inode, struct file *file) 2515static int cgroup_file_release(struct inode *inode, struct file *file)
2521{ 2516{
2517 struct cfent *cfe = __d_cfe(file->f_dentry);
2522 struct cftype *cft = __d_cft(file->f_dentry); 2518 struct cftype *cft = __d_cft(file->f_dentry);
2519 struct cgroup_subsys_state *css = cfe->css;
2520 int ret = 0;
2521
2523 if (cft->release) 2522 if (cft->release)
2524 return cft->release(inode, file); 2523 ret = cft->release(inode, file);
2525 return 0; 2524 if (css->ss)
2525 css_put(css);
2526 return ret;
2526} 2527}
2527 2528
2528/* 2529/*
@@ -2736,8 +2737,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2736 return mode; 2737 return mode;
2737} 2738}
2738 2739
2739static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2740static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2740 struct cftype *cft)
2741{ 2741{
2742 struct dentry *dir = cgrp->dentry; 2742 struct dentry *dir = cgrp->dentry;
2743 struct cgroup *parent = __d_cgrp(dir); 2743 struct cgroup *parent = __d_cgrp(dir);
@@ -2747,8 +2747,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 umode_t mode; 2747 umode_t mode;
2748 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2748 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2749 2749
2750 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { 2750 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2751 strcpy(name, subsys->name); 2751 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2752 strcpy(name, cft->ss->name);
2752 strcat(name, "."); 2753 strcat(name, ".");
2753 } 2754 }
2754 strcat(name, cft->name); 2755 strcat(name, cft->name);
@@ -2782,11 +2783,25 @@ out:
2782 return error; 2783 return error;
2783} 2784}
2784 2785
2785static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2786/**
2786 struct cftype cfts[], bool is_add) 2787 * cgroup_addrm_files - add or remove files to a cgroup directory
2788 * @cgrp: the target cgroup
2789 * @cfts: array of cftypes to be added
2790 * @is_add: whether to add or remove
2791 *
2792 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2793 * For removals, this function never fails. If addition fails, this
2794 * function doesn't remove files already added. The caller is responsible
2795 * for cleaning up.
2796 */
2797static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2798 bool is_add)
2787{ 2799{
2788 struct cftype *cft; 2800 struct cftype *cft;
2789 int err, ret = 0; 2801 int ret;
2802
2803 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
2804 lockdep_assert_held(&cgroup_mutex);
2790 2805
2791 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2806 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2792 /* does cft->flags tell us to skip this file on @cgrp? */ 2807 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2798,16 +2813,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2798 continue; 2813 continue;
2799 2814
2800 if (is_add) { 2815 if (is_add) {
2801 err = cgroup_add_file(cgrp, subsys, cft); 2816 ret = cgroup_add_file(cgrp, cft);
2802 if (err) 2817 if (ret) {
2803 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2818 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2804 cft->name, err); 2819 cft->name, ret);
2805 ret = err; 2820 return ret;
2821 }
2806 } else { 2822 } else {
2807 cgroup_rm_file(cgrp, cft); 2823 cgroup_rm_file(cgrp, cft);
2808 } 2824 }
2809 } 2825 }
2810 return ret; 2826 return 0;
2811} 2827}
2812 2828
2813static void cgroup_cfts_prepare(void) 2829static void cgroup_cfts_prepare(void)
@@ -2816,28 +2832,30 @@ static void cgroup_cfts_prepare(void)
2816 /* 2832 /*
2817 * Thanks to the entanglement with vfs inode locking, we can't walk 2833 * Thanks to the entanglement with vfs inode locking, we can't walk
2818 * the existing cgroups under cgroup_mutex and create files. 2834 * the existing cgroups under cgroup_mutex and create files.
2819 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU 2835 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2820 * read lock before calling cgroup_addrm_files(). 2836 * lock before calling cgroup_addrm_files().
2821 */ 2837 */
2822 mutex_lock(&cgroup_mutex); 2838 mutex_lock(&cgroup_mutex);
2823} 2839}
2824 2840
2825static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2841static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2826 struct cftype *cfts, bool is_add)
2827 __releases(&cgroup_mutex) 2842 __releases(&cgroup_mutex)
2828{ 2843{
2829 LIST_HEAD(pending); 2844 LIST_HEAD(pending);
2830 struct cgroup *cgrp, *root = &ss->root->top_cgroup; 2845 struct cgroup_subsys *ss = cfts[0].ss;
2846 struct cgroup *root = &ss->root->top_cgroup;
2831 struct super_block *sb = ss->root->sb; 2847 struct super_block *sb = ss->root->sb;
2832 struct dentry *prev = NULL; 2848 struct dentry *prev = NULL;
2833 struct inode *inode; 2849 struct inode *inode;
2850 struct cgroup_subsys_state *css;
2834 u64 update_before; 2851 u64 update_before;
2852 int ret = 0;
2835 2853
2836 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2854 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2837 if (!cfts || ss->root == &cgroup_dummy_root || 2855 if (!cfts || ss->root == &cgroup_dummy_root ||
2838 !atomic_inc_not_zero(&sb->s_active)) { 2856 !atomic_inc_not_zero(&sb->s_active)) {
2839 mutex_unlock(&cgroup_mutex); 2857 mutex_unlock(&cgroup_mutex);
2840 return; 2858 return 0;
2841 } 2859 }
2842 2860
2843 /* 2861 /*
@@ -2849,17 +2867,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2849 2867
2850 mutex_unlock(&cgroup_mutex); 2868 mutex_unlock(&cgroup_mutex);
2851 2869
2852 /* @root always needs to be updated */
2853 inode = root->dentry->d_inode;
2854 mutex_lock(&inode->i_mutex);
2855 mutex_lock(&cgroup_mutex);
2856 cgroup_addrm_files(root, ss, cfts, is_add);
2857 mutex_unlock(&cgroup_mutex);
2858 mutex_unlock(&inode->i_mutex);
2859
2860 /* add/rm files for all cgroups created before */ 2870 /* add/rm files for all cgroups created before */
2861 rcu_read_lock(); 2871 rcu_read_lock();
2862 cgroup_for_each_descendant_pre(cgrp, root) { 2872 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2873 struct cgroup *cgrp = css->cgroup;
2874
2863 if (cgroup_is_dead(cgrp)) 2875 if (cgroup_is_dead(cgrp))
2864 continue; 2876 continue;
2865 2877
@@ -2873,15 +2885,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2873 mutex_lock(&inode->i_mutex); 2885 mutex_lock(&inode->i_mutex);
2874 mutex_lock(&cgroup_mutex); 2886 mutex_lock(&cgroup_mutex);
2875 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) 2887 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2876 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2888 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2877 mutex_unlock(&cgroup_mutex); 2889 mutex_unlock(&cgroup_mutex);
2878 mutex_unlock(&inode->i_mutex); 2890 mutex_unlock(&inode->i_mutex);
2879 2891
2880 rcu_read_lock(); 2892 rcu_read_lock();
2893 if (ret)
2894 break;
2881 } 2895 }
2882 rcu_read_unlock(); 2896 rcu_read_unlock();
2883 dput(prev); 2897 dput(prev);
2884 deactivate_super(sb); 2898 deactivate_super(sb);
2899 return ret;
2885} 2900}
2886 2901
2887/** 2902/**
@@ -2901,49 +2916,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2901int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2916int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2902{ 2917{
2903 struct cftype_set *set; 2918 struct cftype_set *set;
2919 struct cftype *cft;
2920 int ret;
2904 2921
2905 set = kzalloc(sizeof(*set), GFP_KERNEL); 2922 set = kzalloc(sizeof(*set), GFP_KERNEL);
2906 if (!set) 2923 if (!set)
2907 return -ENOMEM; 2924 return -ENOMEM;
2908 2925
2926 for (cft = cfts; cft->name[0] != '\0'; cft++)
2927 cft->ss = ss;
2928
2909 cgroup_cfts_prepare(); 2929 cgroup_cfts_prepare();
2910 set->cfts = cfts; 2930 set->cfts = cfts;
2911 list_add_tail(&set->node, &ss->cftsets); 2931 list_add_tail(&set->node, &ss->cftsets);
2912 cgroup_cfts_commit(ss, cfts, true); 2932 ret = cgroup_cfts_commit(cfts, true);
2913 2933 if (ret)
2914 return 0; 2934 cgroup_rm_cftypes(cfts);
2935 return ret;
2915} 2936}
2916EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2937EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
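A hedged sketch of the controller-side view after this change, assuming the css-based handler signatures this series converts cftype methods to; the "foo" controller, foo_subsys and the file shown are purely illustrative:

	static u64 foo_weight_read(struct cgroup_subsys_state *css,
				   struct cftype *cft)
	{
		return 100;				/* placeholder value */
	}

	static struct cftype foo_files[] = {
		{
			.name = "weight",
			.read_u64 = foo_weight_read,
		},
		{ }	/* zero-length name terminates the array */
	};

	/* somewhere in the controller's init path */
	ret = cgroup_add_cftypes(&foo_subsys, foo_files);

Unregistration becomes symmetric: cgroup_rm_cftypes(foo_files) locates the owning subsystem through the ->ss back-pointer stamped at add time, so no subsys argument is needed anymore.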
2917 2938
2918/** 2939/**
2919 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2940 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2920 * @ss: target cgroup subsystem
2921 * @cfts: zero-length name terminated array of cftypes 2941 * @cfts: zero-length name terminated array of cftypes
2922 * 2942 *
2923 * Unregister @cfts from @ss. Files described by @cfts are removed from 2943 * Unregister @cfts. Files described by @cfts are removed from all
2924 * all existing cgroups to which @ss is attached and all future cgroups 2944 * existing cgroups and all future cgroups won't have them either. This
2925 * won't have them either. This function can be called anytime whether @ss 2945 * function can be called anytime whether @cfts' subsys is attached or not.
2926 * is attached or not.
2927 * 2946 *
2928 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2947 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2929 * registered with @ss. 2948 * registered.
2930 */ 2949 */
2931int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2950int cgroup_rm_cftypes(struct cftype *cfts)
2932{ 2951{
2933 struct cftype_set *set; 2952 struct cftype_set *set;
2934 2953
2954 if (!cfts || !cfts[0].ss)
2955 return -ENOENT;
2956
2935 cgroup_cfts_prepare(); 2957 cgroup_cfts_prepare();
2936 2958
2937 list_for_each_entry(set, &ss->cftsets, node) { 2959 list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2938 if (set->cfts == cfts) { 2960 if (set->cfts == cfts) {
2939 list_del(&set->node); 2961 list_del(&set->node);
2940 kfree(set); 2962 kfree(set);
2941 cgroup_cfts_commit(ss, cfts, false); 2963 cgroup_cfts_commit(cfts, false);
2942 return 0; 2964 return 0;
2943 } 2965 }
2944 } 2966 }
2945 2967
2946 cgroup_cfts_commit(ss, NULL, false); 2968 cgroup_cfts_commit(NULL, false);
2947 return -ENOENT; 2969 return -ENOENT;
2948} 2970}
2949 2971
@@ -2966,34 +2988,10 @@ int cgroup_task_count(const struct cgroup *cgrp)
2966} 2988}
2967 2989
2968/* 2990/*
2969 * Advance a list_head iterator. The iterator should be positioned at 2991 * To reduce the fork() overhead for systems that are not actually using
2970 * the start of a css_set 2992 * their cgroups capability, we don't maintain the lists running through
2971 */ 2993 * each css_set to its tasks until we see the list actually used - in other
2972static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) 2994 * words after the first call to css_task_iter_start().
2973{
2974 struct list_head *l = it->cset_link;
2975 struct cgrp_cset_link *link;
2976 struct css_set *cset;
2977
2978 /* Advance to the next non-empty css_set */
2979 do {
2980 l = l->next;
2981 if (l == &cgrp->cset_links) {
2982 it->cset_link = NULL;
2983 return;
2984 }
2985 link = list_entry(l, struct cgrp_cset_link, cset_link);
2986 cset = link->cset;
2987 } while (list_empty(&cset->tasks));
2988 it->cset_link = l;
2989 it->task = cset->tasks.next;
2990}
2991
2992/*
2993 * To reduce the fork() overhead for systems that are not actually
2994 * using their cgroups capability, we don't maintain the lists running
2995 * through each css_set to its tasks until we see the list actually
2996 * used - in other words after the first call to cgroup_iter_start().
2997 */ 2995 */
2998static void cgroup_enable_task_cg_lists(void) 2996static void cgroup_enable_task_cg_lists(void)
2999{ 2997{
@@ -3024,16 +3022,21 @@ static void cgroup_enable_task_cg_lists(void)
3024} 3022}
3025 3023
3026/** 3024/**
3027 * cgroup_next_sibling - find the next sibling of a given cgroup 3025 * css_next_child - find the next child of a given css
3028 * @pos: the current cgroup 3026 * @pos_css: the current position (%NULL to initiate traversal)
3027 * @parent_css: css whose children to walk
3029 * 3028 *
3030 * This function returns the next sibling of @pos and should be called 3029 * This function returns the next child of @parent_css and should be called
3031 * under RCU read lock. The only requirement is that @pos is accessible. 3030 * under RCU read lock. The only requirement is that @parent_css and
3032 * The next sibling is guaranteed to be returned regardless of @pos's 3031 * @pos_css are accessible. The next sibling is guaranteed to be returned
3033 * state. 3032 * regardless of their states.
3034 */ 3033 */
3035struct cgroup *cgroup_next_sibling(struct cgroup *pos) 3034struct cgroup_subsys_state *
3035css_next_child(struct cgroup_subsys_state *pos_css,
3036 struct cgroup_subsys_state *parent_css)
3036{ 3037{
3038 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3039 struct cgroup *cgrp = parent_css->cgroup;
3037 struct cgroup *next; 3040 struct cgroup *next;
3038 3041
3039 WARN_ON_ONCE(!rcu_read_lock_held()); 3042 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -3048,78 +3051,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3048 * safe to dereference from this RCU critical section. If 3051 * safe to dereference from this RCU critical section. If
3049 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3052 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3050 * to be visible as %true here. 3053 * to be visible as %true here.
3054 *
3055 * If @pos is dead, its next pointer can't be dereferenced;
3056 * however, as each cgroup is given a monotonically increasing
3057 * unique serial number and always appended to the sibling list,
3058 * the next one can be found by walking the parent's children until
3059 * we see a cgroup with higher serial number than @pos's. While
3060 * this path can be slower, it's taken only when either the current
3061 * cgroup is removed or iteration and removal race.
3051 */ 3062 */
3052 if (likely(!cgroup_is_dead(pos))) { 3063 if (!pos) {
3064 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3065 } else if (likely(!cgroup_is_dead(pos))) {
3053 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3066 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3054 if (&next->sibling != &pos->parent->children) 3067 } else {
3055 return next; 3068 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3056 return NULL; 3069 if (next->serial_nr > pos->serial_nr)
3070 break;
3057 } 3071 }
3058 3072
3059 /* 3073 if (&next->sibling == &cgrp->children)
3060 * Can't dereference the next pointer. Each cgroup is given a 3074 return NULL;
3061 * monotonically increasing unique serial number and always 3075
3062 * appended to the sibling list, so the next one can be found by 3076 return cgroup_css(next, parent_css->ss);
3063 * walking the parent's children until we see a cgroup with higher
3064 * serial number than @pos's.
3065 *
3066 * While this path can be slow, it's taken only when either the
3067 * current cgroup is removed or iteration and removal race.
3068 */
3069 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3070 if (next->serial_nr > pos->serial_nr)
3071 return next;
3072 return NULL;
3073} 3077}
3074EXPORT_SYMBOL_GPL(cgroup_next_sibling); 3078EXPORT_SYMBOL_GPL(css_next_child);
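A minimal sketch of walking the children of a css, assuming css_for_each_child() (used by css_rightmost_descendant() below) wraps repeated css_next_child() calls:

	struct cgroup_subsys_state *child;

	rcu_read_lock();
	css_for_each_child(child, parent_css) {
		/* @child may already be dying; css_tryget() it before real use */
	}
	rcu_read_unlock();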
3075 3079
3076/** 3080/**
3077 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3081 * css_next_descendant_pre - find the next descendant for pre-order walk
3078 * @pos: the current position (%NULL to initiate traversal) 3082 * @pos: the current position (%NULL to initiate traversal)
3079 * @cgroup: cgroup whose descendants to walk 3083 * @root: css whose descendants to walk
3080 * 3084 *
3081 * To be used by cgroup_for_each_descendant_pre(). Find the next 3085 * To be used by css_for_each_descendant_pre(). Find the next descendant
3082 * descendant to visit for pre-order traversal of @cgroup's descendants. 3086 * to visit for pre-order traversal of @root's descendants. @root is
3087 * included in the iteration and the first node to be visited.
3083 * 3088 *
3084 * While this function requires RCU read locking, it doesn't require the 3089 * While this function requires RCU read locking, it doesn't require the
3085 * whole traversal to be contained in a single RCU critical section. This 3090 * whole traversal to be contained in a single RCU critical section. This
3086 * function will return the correct next descendant as long as both @pos 3091 * function will return the correct next descendant as long as both @pos
3087 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3092 * and @root are accessible and @pos is a descendant of @root.
3088 */ 3093 */
3089struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3094struct cgroup_subsys_state *
3090 struct cgroup *cgroup) 3095css_next_descendant_pre(struct cgroup_subsys_state *pos,
3096 struct cgroup_subsys_state *root)
3091{ 3097{
3092 struct cgroup *next; 3098 struct cgroup_subsys_state *next;
3093 3099
3094 WARN_ON_ONCE(!rcu_read_lock_held()); 3100 WARN_ON_ONCE(!rcu_read_lock_held());
3095 3101
3096 /* if first iteration, pretend we just visited @cgroup */ 3102 /* if first iteration, visit @root */
3097 if (!pos) 3103 if (!pos)
3098 pos = cgroup; 3104 return root;
3099 3105
3100 /* visit the first child if exists */ 3106 /* visit the first child if exists */
3101 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 3107 next = css_next_child(NULL, pos);
3102 if (next) 3108 if (next)
3103 return next; 3109 return next;
3104 3110
3105 /* no child, visit my or the closest ancestor's next sibling */ 3111 /* no child, visit my or the closest ancestor's next sibling */
3106 while (pos != cgroup) { 3112 while (pos != root) {
3107 next = cgroup_next_sibling(pos); 3113 next = css_next_child(pos, css_parent(pos));
3108 if (next) 3114 if (next)
3109 return next; 3115 return next;
3110 pos = pos->parent; 3116 pos = css_parent(pos);
3111 } 3117 }
3112 3118
3113 return NULL; 3119 return NULL;
3114} 3120}
3115EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3121EXPORT_SYMBOL_GPL(css_next_descendant_pre);
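Sketch of a pre-order walk through the css_for_each_descendant_pre() helper this function backs (same signature as used by cgroup_cfts_commit() above); root_css is an assumed local:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root_css) {
		/* @root_css is visited first, then its descendants top-down */
	}
	rcu_read_unlock();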
3116 3122
3117/** 3123/**
3118 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup 3124 * css_rightmost_descendant - return the rightmost descendant of a css
3119 * @pos: cgroup of interest 3125 * @pos: css of interest
3120 * 3126 *
3121 * Return the rightmost descendant of @pos. If there's no descendant, 3127 * Return the rightmost descendant of @pos. If there's no descendant, @pos
3122 * @pos is returned. This can be used during pre-order traversal to skip 3128 * is returned. This can be used during pre-order traversal to skip
3123 * subtree of @pos. 3129 * subtree of @pos.
3124 * 3130 *
3125 * While this function requires RCU read locking, it doesn't require the 3131 * While this function requires RCU read locking, it doesn't require the
@@ -3127,9 +3133,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3127 * function will return the correct rightmost descendant as long as @pos is 3133 * function will return the correct rightmost descendant as long as @pos is
3128 * accessible. 3134 * accessible.
3129 */ 3135 */
3130struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3136struct cgroup_subsys_state *
3137css_rightmost_descendant(struct cgroup_subsys_state *pos)
3131{ 3138{
3132 struct cgroup *last, *tmp; 3139 struct cgroup_subsys_state *last, *tmp;
3133 3140
3134 WARN_ON_ONCE(!rcu_read_lock_held()); 3141 WARN_ON_ONCE(!rcu_read_lock_held());
3135 3142
@@ -3137,82 +3144,138 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3137 last = pos; 3144 last = pos;
3138 /* ->prev isn't RCU safe, walk ->next till the end */ 3145 /* ->prev isn't RCU safe, walk ->next till the end */
3139 pos = NULL; 3146 pos = NULL;
3140 list_for_each_entry_rcu(tmp, &last->children, sibling) 3147 css_for_each_child(tmp, last)
3141 pos = tmp; 3148 pos = tmp;
3142 } while (pos); 3149 } while (pos);
3143 3150
3144 return last; 3151 return last;
3145} 3152}
3146EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); 3153EXPORT_SYMBOL_GPL(css_rightmost_descendant);
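The usual application, sketched under the same assumptions as above, is pruning a subtree in the middle of a pre-order walk; should_visit() is a hypothetical predicate:

	css_for_each_descendant_pre(pos, root_css) {
		if (!should_visit(pos))
			pos = css_rightmost_descendant(pos);	/* skip subtree */
	}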
3147 3154
3148static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3155static struct cgroup_subsys_state *
3156css_leftmost_descendant(struct cgroup_subsys_state *pos)
3149{ 3157{
3150 struct cgroup *last; 3158 struct cgroup_subsys_state *last;
3151 3159
3152 do { 3160 do {
3153 last = pos; 3161 last = pos;
3154 pos = list_first_or_null_rcu(&pos->children, struct cgroup, 3162 pos = css_next_child(NULL, pos);
3155 sibling);
3156 } while (pos); 3163 } while (pos);
3157 3164
3158 return last; 3165 return last;
3159} 3166}
3160 3167
3161/** 3168/**
3162 * cgroup_next_descendant_post - find the next descendant for post-order walk 3169 * css_next_descendant_post - find the next descendant for post-order walk
3163 * @pos: the current position (%NULL to initiate traversal) 3170 * @pos: the current position (%NULL to initiate traversal)
3164 * @cgroup: cgroup whose descendants to walk 3171 * @root: css whose descendants to walk
3165 * 3172 *
3166 * To be used by cgroup_for_each_descendant_post(). Find the next 3173 * To be used by css_for_each_descendant_post(). Find the next descendant
3167 * descendant to visit for post-order traversal of @cgroup's descendants. 3174 * to visit for post-order traversal of @root's descendants. @root is
3175 * included in the iteration and the last node to be visited.
3168 * 3176 *
3169 * While this function requires RCU read locking, it doesn't require the 3177 * While this function requires RCU read locking, it doesn't require the
3170 * whole traversal to be contained in a single RCU critical section. This 3178 * whole traversal to be contained in a single RCU critical section. This
3171 * function will return the correct next descendant as long as both @pos 3179 * function will return the correct next descendant as long as both @pos
3172 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3180 * and @cgroup are accessible and @pos is a descendant of @cgroup.
3173 */ 3181 */
3174struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3182struct cgroup_subsys_state *
3175 struct cgroup *cgroup) 3183css_next_descendant_post(struct cgroup_subsys_state *pos,
3184 struct cgroup_subsys_state *root)
3176{ 3185{
3177 struct cgroup *next; 3186 struct cgroup_subsys_state *next;
3178 3187
3179 WARN_ON_ONCE(!rcu_read_lock_held()); 3188 WARN_ON_ONCE(!rcu_read_lock_held());
3180 3189
3181 /* if first iteration, visit the leftmost descendant */ 3190 /* if first iteration, visit the leftmost descendant */
3182 if (!pos) { 3191 if (!pos) {
3183 next = cgroup_leftmost_descendant(cgroup); 3192 next = css_leftmost_descendant(root);
3184 return next != cgroup ? next : NULL; 3193 return next != root ? next : NULL;
3185 } 3194 }
3186 3195
3196 /* if we visited @root, we're done */
3197 if (pos == root)
3198 return NULL;
3199
3187 /* if there's an unvisited sibling, visit its leftmost descendant */ 3200 /* if there's an unvisited sibling, visit its leftmost descendant */
3188 next = cgroup_next_sibling(pos); 3201 next = css_next_child(pos, css_parent(pos));
3189 if (next) 3202 if (next)
3190 return cgroup_leftmost_descendant(next); 3203 return css_leftmost_descendant(next);
3191 3204
3192 /* no sibling left, visit parent */ 3205 /* no sibling left, visit parent */
3193 next = pos->parent; 3206 return css_parent(pos);
3194 return next != cgroup ? next : NULL;
3195} 3207}
3196EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); 3208EXPORT_SYMBOL_GPL(css_next_descendant_post);
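The post-order counterpart, again a sketch assuming the css_for_each_descendant_post() wrapper named in the comment above:

	rcu_read_lock();
	css_for_each_descendant_post(pos, root_css) {
		/* children are visited before their parent; @root_css comes last */
	}
	rcu_read_unlock();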
3197 3209
3198void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3210/**
3211 * css_advance_task_iter - advance a task itererator to the next css_set
3212 * @it: the iterator to advance
3213 *
3214 * Advance @it to the next css_set to walk.
3215 */
3216static void css_advance_task_iter(struct css_task_iter *it)
3217{
3218 struct list_head *l = it->cset_link;
3219 struct cgrp_cset_link *link;
3220 struct css_set *cset;
3221
3222 /* Advance to the next non-empty css_set */
3223 do {
3224 l = l->next;
3225 if (l == &it->origin_css->cgroup->cset_links) {
3226 it->cset_link = NULL;
3227 return;
3228 }
3229 link = list_entry(l, struct cgrp_cset_link, cset_link);
3230 cset = link->cset;
3231 } while (list_empty(&cset->tasks));
3232 it->cset_link = l;
3233 it->task = cset->tasks.next;
3234}
3235
3236/**
3237 * css_task_iter_start - initiate task iteration
3238 * @css: the css to walk tasks of
3239 * @it: the task iterator to use
3240 *
3241 * Initiate iteration through the tasks of @css. The caller can call
3242 * css_task_iter_next() to walk through the tasks until the function
3243 * returns NULL. On completion of iteration, css_task_iter_end() must be
3244 * called.
3245 *
3246 * Note that this function acquires a lock which is released when the
3247 * iteration finishes. The caller can't sleep while iteration is in
3248 * progress.
3249 */
3250void css_task_iter_start(struct cgroup_subsys_state *css,
3251 struct css_task_iter *it)
3199 __acquires(css_set_lock) 3252 __acquires(css_set_lock)
3200{ 3253{
3201 /* 3254 /*
3202 * The first time anyone tries to iterate across a cgroup, 3255 * The first time anyone tries to iterate across a css, we need to
3203 * we need to enable the list linking each css_set to its 3256 * enable the list linking each css_set to its tasks, and fix up
3204 * tasks, and fix up all existing tasks. 3257 * all existing tasks.
3205 */ 3258 */
3206 if (!use_task_css_set_links) 3259 if (!use_task_css_set_links)
3207 cgroup_enable_task_cg_lists(); 3260 cgroup_enable_task_cg_lists();
3208 3261
3209 read_lock(&css_set_lock); 3262 read_lock(&css_set_lock);
3210 it->cset_link = &cgrp->cset_links; 3263
3211 cgroup_advance_iter(cgrp, it); 3264 it->origin_css = css;
3265 it->cset_link = &css->cgroup->cset_links;
3266
3267 css_advance_task_iter(it);
3212} 3268}
3213 3269
3214struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 3270/**
3215 struct cgroup_iter *it) 3271 * css_task_iter_next - return the next task for the iterator
3272 * @it: the task iterator being iterated
3273 *
3274 * The "next" function for task iteration. @it should have been
3275 * initialized via css_task_iter_start(). Returns NULL when the iteration
3276 * reaches the end.
3277 */
3278struct task_struct *css_task_iter_next(struct css_task_iter *it)
3216{ 3279{
3217 struct task_struct *res; 3280 struct task_struct *res;
3218 struct list_head *l = it->task; 3281 struct list_head *l = it->task;
@@ -3226,16 +3289,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3226 l = l->next; 3289 l = l->next;
3227 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 3290 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3228 if (l == &link->cset->tasks) { 3291 if (l == &link->cset->tasks) {
3229 /* We reached the end of this task list - move on to 3292 /*
3230 * the next cg_cgroup_link */ 3293 * We reached the end of this task list - move on to the
3231 cgroup_advance_iter(cgrp, it); 3294 * next cgrp_cset_link.
3295 */
3296 css_advance_task_iter(it);
3232 } else { 3297 } else {
3233 it->task = l; 3298 it->task = l;
3234 } 3299 }
3235 return res; 3300 return res;
3236} 3301}
3237 3302
3238void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 3303/**
3304 * css_task_iter_end - finish task iteration
3305 * @it: the task iterator to finish
3306 *
3307 * Finish task iteration started by css_task_iter_start().
3308 */
3309void css_task_iter_end(struct css_task_iter *it)
3239 __releases(css_set_lock) 3310 __releases(css_set_lock)
3240{ 3311{
3241 read_unlock(&css_set_lock); 3312 read_unlock(&css_set_lock);
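Taken together, a minimal illustrative use of the new task iterator, e.g. counting the tasks attached to a css:

	struct css_task_iter it;
	struct task_struct *task;
	int nr = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		nr++;		/* css_set_lock is read-held here; do not sleep */
	css_task_iter_end(&it);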
@@ -3276,46 +3347,49 @@ static inline int started_after(void *p1, void *p2)
3276} 3347}
3277 3348
3278/** 3349/**
3279 * cgroup_scan_tasks - iterate though all the tasks in a cgroup 3350 * css_scan_tasks - iterate though all the tasks in a css
3280 * @scan: struct cgroup_scanner containing arguments for the scan 3351 * @css: the css to iterate tasks of
3352 * @test: optional test callback
3353 * @process: process callback
3354 * @data: data passed to @test and @process
3355 * @heap: optional pre-allocated heap used for task iteration
3356 *
3357 * Iterate through all the tasks in @css, calling @test for each, and if it
3358 * returns %true, call @process for it also.
3359 *
3360 * @test may be NULL, meaning always true (select all tasks), which
3361 * effectively duplicates css_task_iter_{start,next,end}() but does not
3362 * lock css_set_lock for the call to @process.
3363 *
3364 * It is guaranteed that @process will act on every task that is a member
3365 * of @css for the duration of this call. This function may or may not
3366 * call @process for tasks that exit or move to a different css during the
3367 * call, or are forked or move into the css during the call.
3281 * 3368 *
3282 * Arguments include pointers to callback functions test_task() and 3369 * Note that @test may be called with locks held, and may in some
3283 * process_task(). 3370 * situations be called multiple times for the same task, so it should be
3284 * Iterate through all the tasks in a cgroup, calling test_task() for each, 3371 * cheap.
3285 * and if it returns true, call process_task() for it also.
3286 * The test_task pointer may be NULL, meaning always true (select all tasks).
3287 * Effectively duplicates cgroup_iter_{start,next,end}()
3288 * but does not lock css_set_lock for the call to process_task().
3289 * The struct cgroup_scanner may be embedded in any structure of the caller's
3290 * creation.
3291 * It is guaranteed that process_task() will act on every task that
3292 * is a member of the cgroup for the duration of this call. This
3293 * function may or may not call process_task() for tasks that exit
3294 * or move to a different cgroup during the call, or are forked or
3295 * move into the cgroup during the call.
3296 * 3372 *
3297 * Note that test_task() may be called with locks held, and may in some 3373 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3298 * situations be called multiple times for the same task, so it should 3374 * heap operations (and its "gt" member will be overwritten), else a
3299 * be cheap. 3375 * temporary heap will be used (allocation of which may cause this function
3300 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 3376 * to fail).
3301 * pre-allocated and will be used for heap operations (and its "gt" member will
3302 * be overwritten), else a temporary heap will be used (allocation of which
3303 * may cause this function to fail).
3304 */ 3377 */
3305int cgroup_scan_tasks(struct cgroup_scanner *scan) 3378int css_scan_tasks(struct cgroup_subsys_state *css,
3379 bool (*test)(struct task_struct *, void *),
3380 void (*process)(struct task_struct *, void *),
3381 void *data, struct ptr_heap *heap)
3306{ 3382{
3307 int retval, i; 3383 int retval, i;
3308 struct cgroup_iter it; 3384 struct css_task_iter it;
3309 struct task_struct *p, *dropped; 3385 struct task_struct *p, *dropped;
3310 /* Never dereference latest_task, since it's not refcounted */ 3386 /* Never dereference latest_task, since it's not refcounted */
3311 struct task_struct *latest_task = NULL; 3387 struct task_struct *latest_task = NULL;
3312 struct ptr_heap tmp_heap; 3388 struct ptr_heap tmp_heap;
3313 struct ptr_heap *heap;
3314 struct timespec latest_time = { 0, 0 }; 3389 struct timespec latest_time = { 0, 0 };
3315 3390
3316 if (scan->heap) { 3391 if (heap) {
3317 /* The caller supplied our heap and pre-allocated its memory */ 3392 /* The caller supplied our heap and pre-allocated its memory */
3318 heap = scan->heap;
3319 heap->gt = &started_after; 3393 heap->gt = &started_after;
3320 } else { 3394 } else {
3321 /* We need to allocate our own heap memory */ 3395 /* We need to allocate our own heap memory */
@@ -3328,25 +3402,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3328 3402
3329 again: 3403 again:
3330 /* 3404 /*
3331 * Scan tasks in the cgroup, using the scanner's "test_task" callback 3405 * Scan tasks in the css, using the @test callback to determine
3332 * to determine which are of interest, and using the scanner's 3406 * which are of interest, and invoking @process callback on the
3333 * "process_task" callback to process any of them that need an update. 3407 * ones which need an update. Since we don't want to hold any
3334 * Since we don't want to hold any locks during the task updates, 3408 * locks during the task updates, gather tasks to be processed in a
3335 * gather tasks to be processed in a heap structure. 3409 * heap structure. The heap is sorted by descending task start
3336 * The heap is sorted by descending task start time. 3410 * time. If the statically-sized heap fills up, we overflow tasks
3337 * If the statically-sized heap fills up, we overflow tasks that 3411 * that started later, and in future iterations only consider tasks
3338 * started later, and in future iterations only consider tasks that 3412 * that started after the latest task in the previous pass. This
3339 * started after the latest task in the previous pass. This
3340 * guarantees forward progress and that we don't miss any tasks. 3413 * guarantees forward progress and that we don't miss any tasks.
3341 */ 3414 */
3342 heap->size = 0; 3415 heap->size = 0;
3343 cgroup_iter_start(scan->cg, &it); 3416 css_task_iter_start(css, &it);
3344 while ((p = cgroup_iter_next(scan->cg, &it))) { 3417 while ((p = css_task_iter_next(&it))) {
3345 /* 3418 /*
3346 * Only affect tasks that qualify per the caller's callback, 3419 * Only affect tasks that qualify per the caller's callback,
3347 * if he provided one 3420 * if he provided one
3348 */ 3421 */
3349 if (scan->test_task && !scan->test_task(p, scan)) 3422 if (test && !test(p, data))
3350 continue; 3423 continue;
3351 /* 3424 /*
3352 * Only process tasks that started after the last task 3425 * Only process tasks that started after the last task
@@ -3374,7 +3447,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3374 * the heap and wasn't inserted 3447 * the heap and wasn't inserted
3375 */ 3448 */
3376 } 3449 }
3377 cgroup_iter_end(scan->cg, &it); 3450 css_task_iter_end(&it);
3378 3451
3379 if (heap->size) { 3452 if (heap->size) {
3380 for (i = 0; i < heap->size; i++) { 3453 for (i = 0; i < heap->size; i++) {
@@ -3384,7 +3457,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3384 latest_task = q; 3457 latest_task = q;
3385 } 3458 }
3386 /* Process the task per the caller's callback */ 3459 /* Process the task per the caller's callback */
3387 scan->process_task(q, scan); 3460 process(q, data);
3388 put_task_struct(q); 3461 put_task_struct(q);
3389 } 3462 }
3390 /* 3463 /*
@@ -3401,10 +3474,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3401 return 0; 3474 return 0;
3402} 3475}
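For illustration, hypothetical callbacks as a caller other than cgroup_transfer_tasks() below might supply them; the kernel-thread filter is only an example:

	static bool foo_test(struct task_struct *task, void *data)
	{
		return !(task->flags & PF_KTHREAD);	/* skip kernel threads */
	}

	static void foo_process(struct task_struct *task, void *data)
	{
		/* act on @task; called without css_set_lock held */
	}

	ret = css_scan_tasks(css, foo_test, foo_process, NULL, NULL);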
3403 3476
3404static void cgroup_transfer_one_task(struct task_struct *task, 3477static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3405 struct cgroup_scanner *scan)
3406{ 3478{
3407 struct cgroup *new_cgroup = scan->data; 3479 struct cgroup *new_cgroup = data;
3408 3480
3409 mutex_lock(&cgroup_mutex); 3481 mutex_lock(&cgroup_mutex);
3410 cgroup_attach_task(new_cgroup, task, false); 3482 cgroup_attach_task(new_cgroup, task, false);
@@ -3418,15 +3490,8 @@ static void cgroup_transfer_one_task(struct task_struct *task,
3418 */ 3490 */
3419int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 3491int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3420{ 3492{
3421 struct cgroup_scanner scan; 3493 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3422 3494 to, NULL);
3423 scan.cg = from;
3424 scan.test_task = NULL; /* select all tasks in cgroup */
3425 scan.process_task = cgroup_transfer_one_task;
3426 scan.heap = NULL;
3427 scan.data = to;
3428
3429 return cgroup_scan_tasks(&scan);
3430} 3495}
3431 3496
3432/* 3497/*
@@ -3468,7 +3533,7 @@ struct cgroup_pidlist {
3468 /* pointer to the cgroup we belong to, for list removal purposes */ 3533 /* pointer to the cgroup we belong to, for list removal purposes */
3469 struct cgroup *owner; 3534 struct cgroup *owner;
3470 /* protects the other fields */ 3535 /* protects the other fields */
3471 struct rw_semaphore mutex; 3536 struct rw_semaphore rwsem;
3472}; 3537};
3473 3538
3474/* 3539/*
@@ -3541,7 +3606,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3541 struct pid_namespace *ns = task_active_pid_ns(current); 3606 struct pid_namespace *ns = task_active_pid_ns(current);
3542 3607
3543 /* 3608 /*
3544 * We can't drop the pidlist_mutex before taking the l->mutex in case 3609 * We can't drop the pidlist_mutex before taking the l->rwsem in case
3545 * the last ref-holder is trying to remove l from the list at the same 3610 * the last ref-holder is trying to remove l from the list at the same
3546 * time. Holding the pidlist_mutex precludes somebody taking whichever 3611 * time. Holding the pidlist_mutex precludes somebody taking whichever
3547 * list we find out from under us - compare release_pid_array(). 3612 * list we find out from under us - compare release_pid_array().
@@ -3550,7 +3615,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3550 list_for_each_entry(l, &cgrp->pidlists, links) { 3615 list_for_each_entry(l, &cgrp->pidlists, links) {
3551 if (l->key.type == type && l->key.ns == ns) { 3616 if (l->key.type == type && l->key.ns == ns) {
3552 /* make sure l doesn't vanish out from under us */ 3617 /* make sure l doesn't vanish out from under us */
3553 down_write(&l->mutex); 3618 down_write(&l->rwsem);
3554 mutex_unlock(&cgrp->pidlist_mutex); 3619 mutex_unlock(&cgrp->pidlist_mutex);
3555 return l; 3620 return l;
3556 } 3621 }
@@ -3561,8 +3626,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3561 mutex_unlock(&cgrp->pidlist_mutex); 3626 mutex_unlock(&cgrp->pidlist_mutex);
3562 return l; 3627 return l;
3563 } 3628 }
3564 init_rwsem(&l->mutex); 3629 init_rwsem(&l->rwsem);
3565 down_write(&l->mutex); 3630 down_write(&l->rwsem);
3566 l->key.type = type; 3631 l->key.type = type;
3567 l->key.ns = get_pid_ns(ns); 3632 l->key.ns = get_pid_ns(ns);
3568 l->owner = cgrp; 3633 l->owner = cgrp;
@@ -3580,7 +3645,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3580 pid_t *array; 3645 pid_t *array;
3581 int length; 3646 int length;
3582 int pid, n = 0; /* used for populating the array */ 3647 int pid, n = 0; /* used for populating the array */
3583 struct cgroup_iter it; 3648 struct css_task_iter it;
3584 struct task_struct *tsk; 3649 struct task_struct *tsk;
3585 struct cgroup_pidlist *l; 3650 struct cgroup_pidlist *l;
3586 3651
@@ -3595,8 +3660,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3595 if (!array) 3660 if (!array)
3596 return -ENOMEM; 3661 return -ENOMEM;
3597 /* now, populate the array */ 3662 /* now, populate the array */
3598 cgroup_iter_start(cgrp, &it); 3663 css_task_iter_start(&cgrp->dummy_css, &it);
3599 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3664 while ((tsk = css_task_iter_next(&it))) {
3600 if (unlikely(n == length)) 3665 if (unlikely(n == length))
3601 break; 3666 break;
3602 /* get tgid or pid for procs or tasks file respectively */ 3667 /* get tgid or pid for procs or tasks file respectively */
@@ -3607,7 +3672,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3607 if (pid > 0) /* make sure to only use valid results */ 3672 if (pid > 0) /* make sure to only use valid results */
3608 array[n++] = pid; 3673 array[n++] = pid;
3609 } 3674 }
3610 cgroup_iter_end(cgrp, &it); 3675 css_task_iter_end(&it);
3611 length = n; 3676 length = n;
3612 /* now sort & (if procs) strip out duplicates */ 3677 /* now sort & (if procs) strip out duplicates */
3613 sort(array, length, sizeof(pid_t), cmppid, NULL); 3678 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3623,7 +3688,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3623 l->list = array; 3688 l->list = array;
3624 l->length = length; 3689 l->length = length;
3625 l->use_count++; 3690 l->use_count++;
3626 up_write(&l->mutex); 3691 up_write(&l->rwsem);
3627 *lp = l; 3692 *lp = l;
3628 return 0; 3693 return 0;
3629} 3694}
@@ -3641,7 +3706,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3641{ 3706{
3642 int ret = -EINVAL; 3707 int ret = -EINVAL;
3643 struct cgroup *cgrp; 3708 struct cgroup *cgrp;
3644 struct cgroup_iter it; 3709 struct css_task_iter it;
3645 struct task_struct *tsk; 3710 struct task_struct *tsk;
3646 3711
3647 /* 3712 /*
@@ -3655,8 +3720,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3655 ret = 0; 3720 ret = 0;
3656 cgrp = dentry->d_fsdata; 3721 cgrp = dentry->d_fsdata;
3657 3722
3658 cgroup_iter_start(cgrp, &it); 3723 css_task_iter_start(&cgrp->dummy_css, &it);
3659 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3724 while ((tsk = css_task_iter_next(&it))) {
3660 switch (tsk->state) { 3725 switch (tsk->state) {
3661 case TASK_RUNNING: 3726 case TASK_RUNNING:
3662 stats->nr_running++; 3727 stats->nr_running++;
@@ -3676,7 +3741,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3676 break; 3741 break;
3677 } 3742 }
3678 } 3743 }
3679 cgroup_iter_end(cgrp, &it); 3744 css_task_iter_end(&it);
3680 3745
3681err: 3746err:
3682 return ret; 3747 return ret;
@@ -3701,7 +3766,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3701 int index = 0, pid = *pos; 3766 int index = 0, pid = *pos;
3702 int *iter; 3767 int *iter;
3703 3768
3704 down_read(&l->mutex); 3769 down_read(&l->rwsem);
3705 if (pid) { 3770 if (pid) {
3706 int end = l->length; 3771 int end = l->length;
3707 3772
@@ -3728,7 +3793,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3728static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3793static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3729{ 3794{
3730 struct cgroup_pidlist *l = s->private; 3795 struct cgroup_pidlist *l = s->private;
3731 up_read(&l->mutex); 3796 up_read(&l->rwsem);
3732} 3797}
3733 3798
3734static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3799static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3774,7 +3839,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3774 * pidlist_mutex, we have to take pidlist_mutex first. 3839 * pidlist_mutex, we have to take pidlist_mutex first.
3775 */ 3840 */
3776 mutex_lock(&l->owner->pidlist_mutex); 3841 mutex_lock(&l->owner->pidlist_mutex);
3777 down_write(&l->mutex); 3842 down_write(&l->rwsem);
3778 BUG_ON(!l->use_count); 3843 BUG_ON(!l->use_count);
3779 if (!--l->use_count) { 3844 if (!--l->use_count) {
3780 /* we're the last user if refcount is 0; remove and free */ 3845 /* we're the last user if refcount is 0; remove and free */
@@ -3782,12 +3847,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3782 mutex_unlock(&l->owner->pidlist_mutex); 3847 mutex_unlock(&l->owner->pidlist_mutex);
3783 pidlist_free(l->list); 3848 pidlist_free(l->list);
3784 put_pid_ns(l->key.ns); 3849 put_pid_ns(l->key.ns);
3785 up_write(&l->mutex); 3850 up_write(&l->rwsem);
3786 kfree(l); 3851 kfree(l);
3787 return; 3852 return;
3788 } 3853 }
3789 mutex_unlock(&l->owner->pidlist_mutex); 3854 mutex_unlock(&l->owner->pidlist_mutex);
3790 up_write(&l->mutex); 3855 up_write(&l->rwsem);
3791} 3856}
3792 3857
3793static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3858static int cgroup_pidlist_release(struct inode *inode, struct file *file)
@@ -3851,21 +3916,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file)
3851 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3916 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3852} 3917}
3853 3918
3854static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3919static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3855 struct cftype *cft) 3920 struct cftype *cft)
3856{ 3921{
3857 return notify_on_release(cgrp); 3922 return notify_on_release(css->cgroup);
3858} 3923}
3859 3924
3860static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3925static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 struct cftype *cft, 3926 struct cftype *cft, u64 val)
3862 u64 val)
3863{ 3927{
3864 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3928 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3865 if (val) 3929 if (val)
3866 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3930 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3867 else 3931 else
3868 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3932 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3869 return 0; 3933 return 0;
3870} 3934}
3871 3935
@@ -3895,18 +3959,18 @@ static void cgroup_event_remove(struct work_struct *work)
3895{ 3959{
3896 struct cgroup_event *event = container_of(work, struct cgroup_event, 3960 struct cgroup_event *event = container_of(work, struct cgroup_event,
3897 remove); 3961 remove);
3898 struct cgroup *cgrp = event->cgrp; 3962 struct cgroup_subsys_state *css = event->css;
3899 3963
3900 remove_wait_queue(event->wqh, &event->wait); 3964 remove_wait_queue(event->wqh, &event->wait);
3901 3965
3902 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3966 event->cft->unregister_event(css, event->cft, event->eventfd);
3903 3967
3904 /* Notify userspace the event is going away. */ 3968 /* Notify userspace the event is going away. */
3905 eventfd_signal(event->eventfd, 1); 3969 eventfd_signal(event->eventfd, 1);
3906 3970
3907 eventfd_ctx_put(event->eventfd); 3971 eventfd_ctx_put(event->eventfd);
3908 kfree(event); 3972 kfree(event);
3909 cgroup_dput(cgrp); 3973 css_put(css);
3910} 3974}
3911 3975
3912/* 3976/*
@@ -3919,7 +3983,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3919{ 3983{
3920 struct cgroup_event *event = container_of(wait, 3984 struct cgroup_event *event = container_of(wait,
3921 struct cgroup_event, wait); 3985 struct cgroup_event, wait);
3922 struct cgroup *cgrp = event->cgrp; 3986 struct cgroup *cgrp = event->css->cgroup;
3923 unsigned long flags = (unsigned long)key; 3987 unsigned long flags = (unsigned long)key;
3924 3988
3925 if (flags & POLLHUP) { 3989 if (flags & POLLHUP) {
@@ -3963,14 +4027,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
3963 * Input must be in format '<event_fd> <control_fd> <args>'. 4027 * Input must be in format '<event_fd> <control_fd> <args>'.
3964 * Interpretation of args is defined by control file implementation. 4028 * Interpretation of args is defined by control file implementation.
3965 */ 4029 */
3966static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 4030static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3967 const char *buffer) 4031 struct cftype *cft, const char *buffer)
3968{ 4032{
3969 struct cgroup_event *event = NULL; 4033 struct cgroup *cgrp = dummy_css->cgroup;
3970 struct cgroup *cgrp_cfile; 4034 struct cgroup_event *event;
4035 struct cgroup_subsys_state *cfile_css;
3971 unsigned int efd, cfd; 4036 unsigned int efd, cfd;
3972 struct file *efile = NULL; 4037 struct file *efile;
3973 struct file *cfile = NULL; 4038 struct file *cfile;
3974 char *endp; 4039 char *endp;
3975 int ret; 4040 int ret;
3976 4041
@@ -3987,7 +4052,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3987 event = kzalloc(sizeof(*event), GFP_KERNEL); 4052 event = kzalloc(sizeof(*event), GFP_KERNEL);
3988 if (!event) 4053 if (!event)
3989 return -ENOMEM; 4054 return -ENOMEM;
3990 event->cgrp = cgrp; 4055
3991 INIT_LIST_HEAD(&event->list); 4056 INIT_LIST_HEAD(&event->list);
3992 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 4057 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3993 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 4058 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
@@ -3996,62 +4061,68 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3996 efile = eventfd_fget(efd); 4061 efile = eventfd_fget(efd);
3997 if (IS_ERR(efile)) { 4062 if (IS_ERR(efile)) {
3998 ret = PTR_ERR(efile); 4063 ret = PTR_ERR(efile);
3999 goto fail; 4064 goto out_kfree;
4000 } 4065 }
4001 4066
4002 event->eventfd = eventfd_ctx_fileget(efile); 4067 event->eventfd = eventfd_ctx_fileget(efile);
4003 if (IS_ERR(event->eventfd)) { 4068 if (IS_ERR(event->eventfd)) {
4004 ret = PTR_ERR(event->eventfd); 4069 ret = PTR_ERR(event->eventfd);
4005 goto fail; 4070 goto out_put_efile;
4006 } 4071 }
4007 4072
4008 cfile = fget(cfd); 4073 cfile = fget(cfd);
4009 if (!cfile) { 4074 if (!cfile) {
4010 ret = -EBADF; 4075 ret = -EBADF;
4011 goto fail; 4076 goto out_put_eventfd;
4012 } 4077 }
4013 4078
4014 /* the process need read permission on control file */ 4079 /* the process need read permission on control file */
4015 /* AV: shouldn't we check that it's been opened for read instead? */ 4080 /* AV: shouldn't we check that it's been opened for read instead? */
4016 ret = inode_permission(file_inode(cfile), MAY_READ); 4081 ret = inode_permission(file_inode(cfile), MAY_READ);
4017 if (ret < 0) 4082 if (ret < 0)
4018 goto fail; 4083 goto out_put_cfile;
4019 4084
4020 event->cft = __file_cft(cfile); 4085 event->cft = __file_cft(cfile);
4021 if (IS_ERR(event->cft)) { 4086 if (IS_ERR(event->cft)) {
4022 ret = PTR_ERR(event->cft); 4087 ret = PTR_ERR(event->cft);
4023 goto fail; 4088 goto out_put_cfile;
4089 }
4090
4091 if (!event->cft->ss) {
4092 ret = -EBADF;
4093 goto out_put_cfile;
4024 } 4094 }
4025 4095
4026 /* 4096 /*
4027 * The file to be monitored must be in the same cgroup as 4097 * Determine the css of @cfile, verify it belongs to the same
4028 * cgroup.event_control is. 4098 * cgroup as cgroup.event_control, and associate @event with it.
4099 * Remaining events are automatically removed on cgroup destruction
4100 * but the removal is asynchronous, so take an extra ref.
4029 */ 4101 */
4030 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); 4102 rcu_read_lock();
4031 if (cgrp_cfile != cgrp) { 4103
4032 ret = -EINVAL; 4104 ret = -EINVAL;
4033 goto fail; 4105 event->css = cgroup_css(cgrp, event->cft->ss);
4034 } 4106 cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss);
4107 if (event->css && event->css == cfile_css && css_tryget(event->css))
4108 ret = 0;
4109
4110 rcu_read_unlock();
4111 if (ret)
4112 goto out_put_cfile;
4035 4113
4036 if (!event->cft->register_event || !event->cft->unregister_event) { 4114 if (!event->cft->register_event || !event->cft->unregister_event) {
4037 ret = -EINVAL; 4115 ret = -EINVAL;
4038 goto fail; 4116 goto out_put_css;
4039 } 4117 }
4040 4118
4041 ret = event->cft->register_event(cgrp, event->cft, 4119 ret = event->cft->register_event(event->css, event->cft,
4042 event->eventfd, buffer); 4120 event->eventfd, buffer);
4043 if (ret) 4121 if (ret)
4044 goto fail; 4122 goto out_put_css;
4045 4123
4046 efile->f_op->poll(efile, &event->pt); 4124 efile->f_op->poll(efile, &event->pt);
4047 4125
4048 /*
4049 * Events should be removed after rmdir of cgroup directory, but before
4050 * destroying subsystem state objects. Let's take reference to cgroup
4051 * directory dentry to do that.
4052 */
4053 dget(cgrp->dentry);
4054
4055 spin_lock(&cgrp->event_list_lock); 4126 spin_lock(&cgrp->event_list_lock);
4056 list_add(&event->list, &cgrp->event_list); 4127 list_add(&event->list, &cgrp->event_list);
4057 spin_unlock(&cgrp->event_list_lock); 4128 spin_unlock(&cgrp->event_list_lock);
@@ -4061,35 +4132,33 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
4061 4132
4062 return 0; 4133 return 0;
4063 4134
4064fail: 4135out_put_css:
4065 if (cfile) 4136 css_put(event->css);
4066 fput(cfile); 4137out_put_cfile:
4067 4138 fput(cfile);
4068 if (event && event->eventfd && !IS_ERR(event->eventfd)) 4139out_put_eventfd:
4069 eventfd_ctx_put(event->eventfd); 4140 eventfd_ctx_put(event->eventfd);
4070 4141out_put_efile:
4071 if (!IS_ERR_OR_NULL(efile)) 4142 fput(efile);
4072 fput(efile); 4143out_kfree:
4073
4074 kfree(event); 4144 kfree(event);
4075 4145
4076 return ret; 4146 return ret;
4077} 4147}
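Userspace side of the "<event_fd> <control_fd> <args>" protocol parsed above, sketched against memcg's usage threshold; the mount path, cgroup name and 100M value are assumptions, not part of this patch:

	#include <sys/eventfd.h>
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		int efd = eventfd(0, 0);
		int cfd = open("/sys/fs/cgroup/memory/foo/memory.usage_in_bytes",
			       O_RDONLY);
		int ctl = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
			       O_WRONLY);
		char buf[64];
		uint64_t count;

		snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 100ULL << 20);
		write(ctl, buf, strlen(buf));		/* arm the threshold */
		read(efd, &count, sizeof(count));	/* blocks until it fires */
		return 0;
	}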
4078 4148
4079static u64 cgroup_clone_children_read(struct cgroup *cgrp, 4149static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4080 struct cftype *cft) 4150 struct cftype *cft)
4081{ 4151{
4082 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4152 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4083} 4153}
4084 4154
4085static int cgroup_clone_children_write(struct cgroup *cgrp, 4155static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4086 struct cftype *cft, 4156 struct cftype *cft, u64 val)
4087 u64 val)
4088{ 4157{
4089 if (val) 4158 if (val)
4090 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4159 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4091 else 4160 else
4092 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4161 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4093 return 0; 4162 return 0;
4094} 4163}
4095 4164
@@ -4148,36 +4217,34 @@ static struct cftype cgroup_base_files[] = {
4148}; 4217};
4149 4218
4150/** 4219/**
4151 * cgroup_populate_dir - selectively creation of files in a directory 4220 * cgroup_populate_dir - create subsys files in a cgroup directory
4152 * @cgrp: target cgroup 4221 * @cgrp: target cgroup
4153 * @base_files: true if the base files should be added
4154 * @subsys_mask: mask of the subsystem ids whose files should be added 4222 * @subsys_mask: mask of the subsystem ids whose files should be added
4223 *
4224 * On failure, no file is added.
4155 */ 4225 */
4156static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 4226static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4157 unsigned long subsys_mask)
4158{ 4227{
4159 int err;
4160 struct cgroup_subsys *ss; 4228 struct cgroup_subsys *ss;
4161 4229 int i, ret = 0;
4162 if (base_files) {
4163 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4164 if (err < 0)
4165 return err;
4166 }
4167 4230
4168 /* process cftsets of each subsystem */ 4231 /* process cftsets of each subsystem */
4169 for_each_root_subsys(cgrp->root, ss) { 4232 for_each_subsys(ss, i) {
4170 struct cftype_set *set; 4233 struct cftype_set *set;
4171 if (!test_bit(ss->subsys_id, &subsys_mask)) 4234
4235 if (!test_bit(i, &subsys_mask))
4172 continue; 4236 continue;
4173 4237
4174 list_for_each_entry(set, &ss->cftsets, node) 4238 list_for_each_entry(set, &ss->cftsets, node) {
4175 cgroup_addrm_files(cgrp, ss, set->cfts, true); 4239 ret = cgroup_addrm_files(cgrp, set->cfts, true);
4240 if (ret < 0)
4241 goto err;
4242 }
4176 } 4243 }
4177 4244
4178 /* This cgroup is ready now */ 4245 /* This cgroup is ready now */
4179 for_each_root_subsys(cgrp->root, ss) { 4246 for_each_root_subsys(cgrp->root, ss) {
4180 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4247 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4181 struct css_id *id = rcu_dereference_protected(css->id, true); 4248 struct css_id *id = rcu_dereference_protected(css->id, true);
4182 4249
4183 /* 4250 /*
@@ -4190,14 +4257,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4190 } 4257 }
4191 4258
4192 return 0; 4259 return 0;
4260err:
4261 cgroup_clear_dir(cgrp, subsys_mask);
4262 return ret;
4193} 4263}
4194 4264
4195static void css_dput_fn(struct work_struct *work) 4265/*
4266 * css destruction is four-stage process.
4267 *
4268 * 1. Destruction starts. Killing of the percpu_ref is initiated.
4269 * Implemented in kill_css().
4270 *
4271 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4272 * and thus css_tryget() is guaranteed to fail, the css can be offlined
4273 * by invoking offline_css(). After offlining, the base ref is put.
4274 * Implemented in css_killed_work_fn().
4275 *
4276 * 3. When the percpu_ref reaches zero, the only possible remaining
4277 * accessors are inside RCU read sections. css_release() schedules the
4278 * RCU callback.
4279 *
4280 * 4. After the grace period, the css can be freed. Implemented in
4281 * css_free_work_fn().
4282 *
4283 * It is actually hairier because both step 2 and 4 require process context
4284 * and thus involve punting to css->destroy_work adding two additional
4285 * steps to the already complex sequence.
4286 */
4287static void css_free_work_fn(struct work_struct *work)
4196{ 4288{
4197 struct cgroup_subsys_state *css = 4289 struct cgroup_subsys_state *css =
4198 container_of(work, struct cgroup_subsys_state, dput_work); 4290 container_of(work, struct cgroup_subsys_state, destroy_work);
4291 struct cgroup *cgrp = css->cgroup;
4199 4292
4200 cgroup_dput(css->cgroup); 4293 if (css->parent)
4294 css_put(css->parent);
4295
4296 css->ss->css_free(css);
4297 cgroup_dput(cgrp);
4298}
4299
4300static void css_free_rcu_fn(struct rcu_head *rcu_head)
4301{
4302 struct cgroup_subsys_state *css =
4303 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4304
4305 /*
4306 * css holds an extra ref to @cgrp->dentry which is put on the last
4307 * css_put(). dput() requires process context which we don't have.
4308 */
4309 INIT_WORK(&css->destroy_work, css_free_work_fn);
4310 schedule_work(&css->destroy_work);
4201} 4311}
4202 4312
4203static void css_release(struct percpu_ref *ref) 4313static void css_release(struct percpu_ref *ref)
@@ -4205,49 +4315,47 @@ static void css_release(struct percpu_ref *ref)
4205 struct cgroup_subsys_state *css = 4315 struct cgroup_subsys_state *css =
4206 container_of(ref, struct cgroup_subsys_state, refcnt); 4316 container_of(ref, struct cgroup_subsys_state, refcnt);
4207 4317
4208 schedule_work(&css->dput_work); 4318 call_rcu(&css->rcu_head, css_free_rcu_fn);
4209} 4319}
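The reader-side discipline this teardown sequence enables mirrors what cgroup_write_event_control() does above: look the css up under RCU and pin it with css_tryget() before leaving the critical section. A minimal sketch (cgrp and ss assumed in scope):

	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);
	if (css && css_tryget(css)) {
		rcu_read_unlock();
		/* the ref keeps @css from being freed until css_put() */
		css_put(css);
	} else {
		rcu_read_unlock();
	}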
4210 4320
4211static void init_cgroup_css(struct cgroup_subsys_state *css, 4321static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4212 struct cgroup_subsys *ss, 4322 struct cgroup *cgrp)
4213 struct cgroup *cgrp)
4214{ 4323{
4215 css->cgroup = cgrp; 4324 css->cgroup = cgrp;
4325 css->ss = ss;
4216 css->flags = 0; 4326 css->flags = 0;
4217 css->id = NULL; 4327 css->id = NULL;
4218 if (cgrp == cgroup_dummy_top) 4328
4329 if (cgrp->parent)
4330 css->parent = cgroup_css(cgrp->parent, ss);
4331 else
4219 css->flags |= CSS_ROOT; 4332 css->flags |= CSS_ROOT;
4220 BUG_ON(cgrp->subsys[ss->subsys_id]);
4221 cgrp->subsys[ss->subsys_id] = css;
4222 4333
4223 /* 4334 BUG_ON(cgroup_css(cgrp, ss));
4224 * css holds an extra ref to @cgrp->dentry which is put on the last
4225 * css_put(). dput() requires process context, which css_put() may
4226 * be called without. @css->dput_work will be used to invoke
4227 * dput() asynchronously from css_put().
4228 */
4229 INIT_WORK(&css->dput_work, css_dput_fn);
4230} 4335}
4231 4336
4232/* invoke ->post_create() on a new CSS and mark it online if successful */ 4337/* invoke ->css_online() on a new CSS and mark it online if successful */
4233static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4338static int online_css(struct cgroup_subsys_state *css)
4234{ 4339{
4340 struct cgroup_subsys *ss = css->ss;
4235 int ret = 0; 4341 int ret = 0;
4236 4342
4237 lockdep_assert_held(&cgroup_mutex); 4343 lockdep_assert_held(&cgroup_mutex);
4238 4344
4239 if (ss->css_online) 4345 if (ss->css_online)
4240 ret = ss->css_online(cgrp); 4346 ret = ss->css_online(css);
4241 if (!ret) 4347 if (!ret) {
4242 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 4348 css->flags |= CSS_ONLINE;
4349 css->cgroup->nr_css++;
4350 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4351 }
4243 return ret; 4352 return ret;
4244} 4353}
4245 4354
4246/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ 4355/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4247static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4356static void offline_css(struct cgroup_subsys_state *css)
4248 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4249{ 4357{
4250 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4358 struct cgroup_subsys *ss = css->ss;
4251 4359
4252 lockdep_assert_held(&cgroup_mutex); 4360 lockdep_assert_held(&cgroup_mutex);
4253 4361
@@ -4255,9 +4363,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4255 return; 4363 return;
4256 4364
4257 if (ss->css_offline) 4365 if (ss->css_offline)
4258 ss->css_offline(cgrp); 4366 ss->css_offline(css);
4259 4367
4260 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4368 css->flags &= ~CSS_ONLINE;
4369 css->cgroup->nr_css--;
4370 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4261} 4371}
4262 4372
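For orientation, the conversions below all target the new css-based controller callbacks: css_alloc/css_online/css_offline/css_free now deal in struct cgroup_subsys_state rather than struct cgroup. A minimal, hypothetical controller skeleton against those signatures might look as follows; the foo_* names are illustrative only and not part of this patch:

    /* sketch only; assumes linux/cgroup.h, linux/slab.h, linux/err.h */
    struct foo_state {
            struct cgroup_subsys_state css;     /* embedded css, as in freezer/cpuset */
            int some_setting;
    };

    static inline struct foo_state *css_foo(struct cgroup_subsys_state *css)
    {
            return css ? container_of(css, struct foo_state, css) : NULL;
    }

    /* ->css_alloc() now receives the parent css instead of a cgroup */
    static struct cgroup_subsys_state *
    foo_css_alloc(struct cgroup_subsys_state *parent_css)
    {
            struct foo_state *fs = kzalloc(sizeof(*fs), GFP_KERNEL);

            if (!fs)
                    return ERR_PTR(-ENOMEM);
            return &fs->css;
    }

    static int foo_css_online(struct cgroup_subsys_state *css)
    {
            /* inherit state from css_parent(css) here if needed */
            return 0;
    }

    static void foo_css_offline(struct cgroup_subsys_state *css)
    {
            /* undo whatever ->css_online() set up */
    }

    static void foo_css_free(struct cgroup_subsys_state *css)
    {
            kfree(css_foo(css));
    }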
4263/* 4373/*
@@ -4271,6 +4381,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4271static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4381static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4272 umode_t mode) 4382 umode_t mode)
4273{ 4383{
4384 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4274 struct cgroup *cgrp; 4385 struct cgroup *cgrp;
4275 struct cgroup_name *name; 4386 struct cgroup_name *name;
4276 struct cgroupfs_root *root = parent->root; 4387 struct cgroupfs_root *root = parent->root;
@@ -4288,7 +4399,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4288 goto err_free_cgrp; 4399 goto err_free_cgrp;
4289 rcu_assign_pointer(cgrp->name, name); 4400 rcu_assign_pointer(cgrp->name, name);
4290 4401
4291 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4402 /*
4403 * Temporarily set the pointer to NULL, so idr_find() won't return
4404 * a half-baked cgroup.
4405 */
4406 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4292 if (cgrp->id < 0) 4407 if (cgrp->id < 0)
4293 goto err_free_name; 4408 goto err_free_name;
4294 4409
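The idr conversion above follows the usual two-step publication pattern: reserve the ID with a NULL pointer so that a concurrent idr_find() can never observe a half-initialized object, and install the real pointer with idr_replace() (done later in cgroup_create()) only once setup is complete. A generic sketch of that pattern, with my_idr and obj as illustrative names:

    int id;

    /* reserve an ID but leave the slot NULL while @obj is being set up */
    id = idr_alloc(&my_idr, NULL, 1, 0, GFP_KERNEL);
    if (id < 0)
            return id;
    obj->id = id;

    /* ... finish initializing @obj ... */

    /* publish: from here on idr_find(&my_idr, id) returns @obj */
    idr_replace(&my_idr, obj, id);

    /* teardown path: unpublish before the object can be freed */
    idr_remove(&my_idr, obj->id);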
@@ -4317,6 +4432,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4317 cgrp->dentry = dentry; 4432 cgrp->dentry = dentry;
4318 4433
4319 cgrp->parent = parent; 4434 cgrp->parent = parent;
4435 cgrp->dummy_css.parent = &parent->dummy_css;
4320 cgrp->root = parent->root; 4436 cgrp->root = parent->root;
4321 4437
4322 if (notify_on_release(parent)) 4438 if (notify_on_release(parent))
@@ -4328,22 +4444,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4328 for_each_root_subsys(root, ss) { 4444 for_each_root_subsys(root, ss) {
4329 struct cgroup_subsys_state *css; 4445 struct cgroup_subsys_state *css;
4330 4446
4331 css = ss->css_alloc(cgrp); 4447 css = ss->css_alloc(cgroup_css(parent, ss));
4332 if (IS_ERR(css)) { 4448 if (IS_ERR(css)) {
4333 err = PTR_ERR(css); 4449 err = PTR_ERR(css);
4334 goto err_free_all; 4450 goto err_free_all;
4335 } 4451 }
4452 css_ar[ss->subsys_id] = css;
4336 4453
4337 err = percpu_ref_init(&css->refcnt, css_release); 4454 err = percpu_ref_init(&css->refcnt, css_release);
4338 if (err) { 4455 if (err)
4339 ss->css_free(cgrp);
4340 goto err_free_all; 4456 goto err_free_all;
4341 }
4342 4457
4343 init_cgroup_css(css, ss, cgrp); 4458 init_css(css, ss, cgrp);
4344 4459
4345 if (ss->use_id) { 4460 if (ss->use_id) {
4346 err = alloc_css_id(ss, parent, cgrp); 4461 err = alloc_css_id(css);
4347 if (err) 4462 if (err)
4348 goto err_free_all; 4463 goto err_free_all;
4349 } 4464 }
@@ -4365,16 +4480,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4480 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4366 root->number_of_cgroups++; 4481 root->number_of_cgroups++;
4367 4482
4368 /* each css holds a ref to the cgroup's dentry */ 4483 /* each css holds a ref to the cgroup's dentry and the parent css */
4369 for_each_root_subsys(root, ss) 4484 for_each_root_subsys(root, ss) {
4485 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4486
4370 dget(dentry); 4487 dget(dentry);
4488 css_get(css->parent);
4489 }
4371 4490
4372 /* hold a ref to the parent's dentry */ 4491 /* hold a ref to the parent's dentry */
4373 dget(parent->dentry); 4492 dget(parent->dentry);
4374 4493
4375 /* creation succeeded, notify subsystems */ 4494 /* creation succeeded, notify subsystems */
4376 for_each_root_subsys(root, ss) { 4495 for_each_root_subsys(root, ss) {
4377 err = online_css(ss, cgrp); 4496 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4497
4498 err = online_css(css);
4378 if (err) 4499 if (err)
4379 goto err_destroy; 4500 goto err_destroy;
4380 4501
@@ -4388,7 +4509,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4388 } 4509 }
4389 } 4510 }
4390 4511
4391 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4512 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4513
4514 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4515 if (err)
4516 goto err_destroy;
4517
4518 err = cgroup_populate_dir(cgrp, root->subsys_mask);
4392 if (err) 4519 if (err)
4393 goto err_destroy; 4520 goto err_destroy;
4394 4521
@@ -4399,18 +4526,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4399 4526
4400err_free_all: 4527err_free_all:
4401 for_each_root_subsys(root, ss) { 4528 for_each_root_subsys(root, ss) {
4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4529 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4403 4530
4404 if (css) { 4531 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt); 4532 percpu_ref_cancel_init(&css->refcnt);
4406 ss->css_free(cgrp); 4533 ss->css_free(css);
4407 } 4534 }
4408 } 4535 }
4409 mutex_unlock(&cgroup_mutex); 4536 mutex_unlock(&cgroup_mutex);
4410 /* Release the reference count that we took on the superblock */ 4537 /* Release the reference count that we took on the superblock */
4411 deactivate_super(sb); 4538 deactivate_super(sb);
4412err_free_id: 4539err_free_id:
4413 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4540 idr_remove(&root->cgroup_idr, cgrp->id);
4414err_free_name: 4541err_free_name:
4415 kfree(rcu_dereference_raw(cgrp->name)); 4542 kfree(rcu_dereference_raw(cgrp->name));
4416err_free_cgrp: 4543err_free_cgrp:
@@ -4432,22 +4559,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4559 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4433} 4560}
4434 4561
4435static void cgroup_css_killed(struct cgroup *cgrp) 4562/*
4563 * This is called when the refcnt of a css is confirmed to be killed.
4564 * css_tryget() is now guaranteed to fail.
4565 */
4566static void css_killed_work_fn(struct work_struct *work)
4436{ 4567{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) 4568 struct cgroup_subsys_state *css =
4438 return; 4569 container_of(work, struct cgroup_subsys_state, destroy_work);
4570 struct cgroup *cgrp = css->cgroup;
4439 4571
4440 /* percpu ref's of all css's are killed, kick off the next step */ 4572 mutex_lock(&cgroup_mutex);
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); 4573
4442 schedule_work(&cgrp->destroy_work); 4574 /*
4575 * css_tryget() is guaranteed to fail now. Tell subsystems to
 4576	 * initiate destruction.
4577 */
4578 offline_css(css);
4579
4580 /*
4581 * If @cgrp is marked dead, it's waiting for refs of all css's to
4582 * be disabled before proceeding to the second phase of cgroup
4583 * destruction. If we are the last one, kick it off.
4584 */
4585 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4586 cgroup_destroy_css_killed(cgrp);
4587
4588 mutex_unlock(&cgroup_mutex);
4589
4590 /*
4591 * Put the css refs from kill_css(). Each css holds an extra
4592 * reference to the cgroup's dentry and cgroup removal proceeds
4593 * regardless of css refs. On the last put of each css, whenever
4594 * that may be, the extra dentry ref is put so that dentry
4595 * destruction happens only after all css's are released.
4596 */
4597 css_put(css);
4443} 4598}
4444 4599
4445static void css_ref_killed_fn(struct percpu_ref *ref) 4600/* css kill confirmation processing requires process context, bounce */
4601static void css_killed_ref_fn(struct percpu_ref *ref)
4446{ 4602{
4447 struct cgroup_subsys_state *css = 4603 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt); 4604 container_of(ref, struct cgroup_subsys_state, refcnt);
4449 4605
4450 cgroup_css_killed(css->cgroup); 4606 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4607 schedule_work(&css->destroy_work);
4608}
4609
4610/**
4611 * kill_css - destroy a css
4612 * @css: css to destroy
4613 *
4614 * This function initiates destruction of @css by removing cgroup interface
4615 * files and putting its base reference. ->css_offline() will be invoked
4616 * asynchronously once css_tryget() is guaranteed to fail and when the
4617 * reference count reaches zero, @css will be released.
4618 */
4619static void kill_css(struct cgroup_subsys_state *css)
4620{
4621 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4622
4623 /*
4624 * Killing would put the base ref, but we need to keep it alive
4625 * until after ->css_offline().
4626 */
4627 css_get(css);
4628
4629 /*
4630 * cgroup core guarantees that, by the time ->css_offline() is
4631 * invoked, no new css reference will be given out via
4632 * css_tryget(). We can't simply call percpu_ref_kill() and
4633 * proceed to offlining css's because percpu_ref_kill() doesn't
4634 * guarantee that the ref is seen as killed on all CPUs on return.
4635 *
4636 * Use percpu_ref_kill_and_confirm() to get notifications as each
4637 * css is confirmed to be seen as killed on all CPUs.
4638 */
4639 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4451} 4640}
4452 4641
4453/** 4642/**
@@ -4513,41 +4702,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4513 return -EBUSY; 4702 return -EBUSY;
4514 4703
4515 /* 4704 /*
4516 * Block new css_tryget() by killing css refcnts. cgroup core 4705 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4517 * guarantees that, by the time ->css_offline() is invoked, no new 4706 * will be invoked to perform the rest of destruction once the
4518 * css reference will be given out via css_tryget(). We can't 4707 * percpu refs of all css's are confirmed to be killed.
4519 * simply call percpu_ref_kill() and proceed to offlining css's
4520 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4521 * as killed on all CPUs on return.
4522 *
4523 * Use percpu_ref_kill_and_confirm() to get notifications as each
4524 * css is confirmed to be seen as killed on all CPUs. The
4525 * notification callback keeps track of the number of css's to be
4526 * killed and schedules cgroup_offline_fn() to perform the rest of
4527 * destruction once the percpu refs of all css's are confirmed to
4528 * be killed.
4529 */ 4708 */
4530 atomic_set(&cgrp->css_kill_cnt, 1); 4709 for_each_root_subsys(cgrp->root, ss)
4531 for_each_root_subsys(cgrp->root, ss) { 4710 kill_css(cgroup_css(cgrp, ss));
4532 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4533
4534 /*
4535 * Killing would put the base ref, but we need to keep it
4536 * alive until after ->css_offline.
4537 */
4538 percpu_ref_get(&css->refcnt);
4539
4540 atomic_inc(&cgrp->css_kill_cnt);
4541 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4542 }
4543 cgroup_css_killed(cgrp);
4544 4711
4545 /* 4712 /*
4546 * Mark @cgrp dead. This prevents further task migration and child 4713 * Mark @cgrp dead. This prevents further task migration and child
4547 * creation by disabling cgroup_lock_live_group(). Note that 4714 * creation by disabling cgroup_lock_live_group(). Note that
4548 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to 4715 * CGRP_DEAD assertion is depended upon by css_next_child() to
4549 * resume iteration after dropping RCU read lock. See 4716 * resume iteration after dropping RCU read lock. See
4550 * cgroup_next_sibling() for details. 4717 * css_next_child() for details.
4551 */ 4718 */
4552 set_bit(CGRP_DEAD, &cgrp->flags); 4719 set_bit(CGRP_DEAD, &cgrp->flags);
4553 4720
@@ -4558,9 +4725,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4558 raw_spin_unlock(&release_list_lock); 4725 raw_spin_unlock(&release_list_lock);
4559 4726
4560 /* 4727 /*
4561 * Remove @cgrp directory. The removal puts the base ref but we 4728 * If @cgrp has css's attached, the second stage of cgroup
4562 * aren't quite done with @cgrp yet, so hold onto it. 4729 * destruction is kicked off from css_killed_work_fn() after the
4730 * refs of all attached css's are killed. If @cgrp doesn't have
4731 * any css, we kick it off here.
4732 */
4733 if (!cgrp->nr_css)
4734 cgroup_destroy_css_killed(cgrp);
4735
4736 /*
4737 * Clear the base files and remove @cgrp directory. The removal
4738 * puts the base ref but we aren't quite done with @cgrp yet, so
4739 * hold onto it.
4563 */ 4740 */
4741 cgroup_addrm_files(cgrp, cgroup_base_files, false);
4564 dget(d); 4742 dget(d);
4565 cgroup_d_remove_dir(d); 4743 cgroup_d_remove_dir(d);
4566 4744
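Pulling the pieces above together, the destruction path after this series runs roughly in the following order. This is a summary sketch, not text from the patch itself:

    /*
     * cgroup_destroy_locked()
     *     kill_css() for each attached css
     *         -> percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn)
     *     set_bit(CGRP_DEAD, &cgrp->flags)
     *     if (!cgrp->nr_css) -> cgroup_destroy_css_killed(cgrp)
     *
     * css_killed_ref_fn()            - per css, atomic context
     *     schedule_work(&css->destroy_work) -> css_killed_work_fn()
     *
     * css_killed_work_fn()           - per css, process context
     *     offline_css(css)
     *     if (!cgrp->nr_css && cgroup_is_dead(cgrp))
     *         cgroup_destroy_css_killed(cgrp)
     *     css_put(css)               - drops the ref taken in kill_css()
     *
     * css_release() (on the last css_put)
     *     call_rcu() -> css_free_rcu_fn() -> css_free_work_fn()
     *         -> ss->css_free() and release of the dentry/parent css refs
     */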
@@ -4580,50 +4758,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4580}; 4758};
4581 4759
4582/** 4760/**
4583 * cgroup_offline_fn - the second step of cgroup destruction 4761 * cgroup_destroy_css_killed - the second step of cgroup destruction
4584 * @work: cgroup->destroy_free_work 4762 * @work: cgroup->destroy_free_work
4585 * 4763 *
4586 * This function is invoked from a work item for a cgroup which is being 4764 * This function is invoked from a work item for a cgroup which is being
4587 * destroyed after the percpu refcnts of all css's are guaranteed to be 4765 * destroyed after all css's are offlined and performs the rest of
4588 * seen as killed on all CPUs, and performs the rest of destruction. This 4766 * destruction. This is the second step of destruction described in the
4589 * is the second step of destruction described in the comment above 4767 * comment above cgroup_destroy_locked().
4590 * cgroup_destroy_locked().
4591 */ 4768 */
4592static void cgroup_offline_fn(struct work_struct *work) 4769static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4593{ 4770{
4594 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4595 struct cgroup *parent = cgrp->parent; 4771 struct cgroup *parent = cgrp->parent;
4596 struct dentry *d = cgrp->dentry; 4772 struct dentry *d = cgrp->dentry;
4597 struct cgroup_subsys *ss;
4598 4773
4599 mutex_lock(&cgroup_mutex); 4774 lockdep_assert_held(&cgroup_mutex);
4600 4775
4601 /* 4776 /* delete this cgroup from parent->children */
4602 * css_tryget() is guaranteed to fail now. Tell subsystems to 4777 list_del_rcu(&cgrp->sibling);
4603 * initate destruction.
4604 */
4605 for_each_root_subsys(cgrp->root, ss)
4606 offline_css(ss, cgrp);
4607 4778
4608 /* 4779 /*
4609 * Put the css refs from cgroup_destroy_locked(). Each css holds 4780 * We should remove the cgroup object from idr before its grace
4610 * an extra reference to the cgroup's dentry and cgroup removal 4781 * period starts, so we won't be looking up a cgroup while the
4611 * proceeds regardless of css refs. On the last put of each css, 4782 * cgroup is being freed.
4612 * whenever that may be, the extra dentry ref is put so that dentry
4613 * destruction happens only after all css's are released.
4614 */ 4783 */
4615 for_each_root_subsys(cgrp->root, ss) 4784 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4616 css_put(cgrp->subsys[ss->subsys_id]); 4785 cgrp->id = -1;
4617
4618 /* delete this cgroup from parent->children */
4619 list_del_rcu(&cgrp->sibling);
4620 4786
4621 dput(d); 4787 dput(d);
4622 4788
4623 set_bit(CGRP_RELEASABLE, &parent->flags); 4789 set_bit(CGRP_RELEASABLE, &parent->flags);
4624 check_for_release(parent); 4790 check_for_release(parent);
4625
4626 mutex_unlock(&cgroup_mutex);
4627} 4791}
4628 4792
4629static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4793static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4646,6 +4810,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4646 * deregistration. 4810 * deregistration.
4647 */ 4811 */
4648 if (ss->base_cftypes) { 4812 if (ss->base_cftypes) {
4813 struct cftype *cft;
4814
4815 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
4816 cft->ss = ss;
4817
4649 ss->base_cftset.cfts = ss->base_cftypes; 4818 ss->base_cftset.cfts = ss->base_cftypes;
4650 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4819 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4651 } 4820 }
@@ -4665,10 +4834,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4665 /* Create the top cgroup state for this subsystem */ 4834 /* Create the top cgroup state for this subsystem */
4666 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4835 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4667 ss->root = &cgroup_dummy_root; 4836 ss->root = &cgroup_dummy_root;
4668 css = ss->css_alloc(cgroup_dummy_top); 4837 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4669 /* We don't handle early failures gracefully */ 4838 /* We don't handle early failures gracefully */
4670 BUG_ON(IS_ERR(css)); 4839 BUG_ON(IS_ERR(css));
4671 init_cgroup_css(css, ss, cgroup_dummy_top); 4840 init_css(css, ss, cgroup_dummy_top);
4672 4841
4673 /* Update the init_css_set to contain a subsys 4842 /* Update the init_css_set to contain a subsys
4674 * pointer to this state - since the subsystem is 4843 * pointer to this state - since the subsystem is
@@ -4683,7 +4852,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4683 * need to invoke fork callbacks here. */ 4852 * need to invoke fork callbacks here. */
4684 BUG_ON(!list_empty(&init_task.tasks)); 4853 BUG_ON(!list_empty(&init_task.tasks));
4685 4854
4686 BUG_ON(online_css(ss, cgroup_dummy_top)); 4855 BUG_ON(online_css(css));
4687 4856
4688 mutex_unlock(&cgroup_mutex); 4857 mutex_unlock(&cgroup_mutex);
4689 4858
@@ -4744,7 +4913,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4744 * struct, so this can happen first (i.e. before the dummy root 4913 * struct, so this can happen first (i.e. before the dummy root
4745 * attachment). 4914 * attachment).
4746 */ 4915 */
4747 css = ss->css_alloc(cgroup_dummy_top); 4916 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4748 if (IS_ERR(css)) { 4917 if (IS_ERR(css)) {
4749 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4918 /* failure case - need to deassign the cgroup_subsys[] slot. */
4750 cgroup_subsys[ss->subsys_id] = NULL; 4919 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4756,8 +4925,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4756 ss->root = &cgroup_dummy_root; 4925 ss->root = &cgroup_dummy_root;
4757 4926
4758 /* our new subsystem will be attached to the dummy hierarchy. */ 4927 /* our new subsystem will be attached to the dummy hierarchy. */
4759 init_cgroup_css(css, ss, cgroup_dummy_top); 4928 init_css(css, ss, cgroup_dummy_top);
4760 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4929 /* init_idr must be after init_css() because it sets css->id. */
4761 if (ss->use_id) { 4930 if (ss->use_id) {
4762 ret = cgroup_init_idr(ss, css); 4931 ret = cgroup_init_idr(ss, css);
4763 if (ret) 4932 if (ret)
@@ -4787,7 +4956,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4787 } 4956 }
4788 write_unlock(&css_set_lock); 4957 write_unlock(&css_set_lock);
4789 4958
4790 ret = online_css(ss, cgroup_dummy_top); 4959 ret = online_css(css);
4791 if (ret) 4960 if (ret)
4792 goto err_unload; 4961 goto err_unload;
4793 4962
@@ -4819,14 +4988,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4819 4988
4820 /* 4989 /*
4821 * we shouldn't be called if the subsystem is in use, and the use of 4990 * we shouldn't be called if the subsystem is in use, and the use of
4822 * try_module_get in parse_cgroupfs_options should ensure that it 4991 * try_module_get() in rebind_subsystems() should ensure that it
4823 * doesn't start being used while we're killing it off. 4992 * doesn't start being used while we're killing it off.
4824 */ 4993 */
4825 BUG_ON(ss->root != &cgroup_dummy_root); 4994 BUG_ON(ss->root != &cgroup_dummy_root);
4826 4995
4827 mutex_lock(&cgroup_mutex); 4996 mutex_lock(&cgroup_mutex);
4828 4997
4829 offline_css(ss, cgroup_dummy_top); 4998 offline_css(cgroup_css(cgroup_dummy_top, ss));
4830 4999
4831 if (ss->use_id) 5000 if (ss->use_id)
4832 idr_destroy(&ss->idr); 5001 idr_destroy(&ss->idr);
@@ -4860,8 +5029,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4860 * the cgrp->subsys pointer to find their state. note that this 5029 * the cgrp->subsys pointer to find their state. note that this
4861 * also takes care of freeing the css_id. 5030 * also takes care of freeing the css_id.
4862 */ 5031 */
4863 ss->css_free(cgroup_dummy_top); 5032 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
4864 cgroup_dummy_top->subsys[ss->subsys_id] = NULL; 5033 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4865 5034
4866 mutex_unlock(&cgroup_mutex); 5035 mutex_unlock(&cgroup_mutex);
4867} 5036}
@@ -4943,6 +5112,10 @@ int __init cgroup_init(void)
4943 5112
4944 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 5113 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4945 5114
5115 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
5116 0, 1, GFP_KERNEL);
5117 BUG_ON(err < 0);
5118
4946 mutex_unlock(&cgroup_root_mutex); 5119 mutex_unlock(&cgroup_root_mutex);
4947 mutex_unlock(&cgroup_mutex); 5120 mutex_unlock(&cgroup_mutex);
4948 5121
@@ -5099,7 +5272,7 @@ void cgroup_fork(struct task_struct *child)
5099 * Adds the task to the list running through its css_set if necessary and 5272 * Adds the task to the list running through its css_set if necessary and
5100 * call the subsystem fork() callbacks. Has to be after the task is 5273 * call the subsystem fork() callbacks. Has to be after the task is
5101 * visible on the task list in case we race with the first call to 5274 * visible on the task list in case we race with the first call to
5102 * cgroup_iter_start() - to guarantee that the new task ends up on its 5275 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5103 * list. 5276 * list.
5104 */ 5277 */
5105void cgroup_post_fork(struct task_struct *child) 5278void cgroup_post_fork(struct task_struct *child)
@@ -5212,10 +5385,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5212 */ 5385 */
5213 for_each_builtin_subsys(ss, i) { 5386 for_each_builtin_subsys(ss, i) {
5214 if (ss->exit) { 5387 if (ss->exit) {
5215 struct cgroup *old_cgrp = cset->subsys[i]->cgroup; 5388 struct cgroup_subsys_state *old_css = cset->subsys[i];
5216 struct cgroup *cgrp = task_cgroup(tsk, i); 5389 struct cgroup_subsys_state *css = task_css(tsk, i);
5217 5390
5218 ss->exit(cgrp, old_cgrp, tsk); 5391 ss->exit(css, old_css, tsk);
5219 } 5392 }
5220 } 5393 }
5221 } 5394 }
@@ -5474,20 +5647,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5474 return 0; 5647 return 0;
5475} 5648}
5476 5649
5477static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, 5650static int alloc_css_id(struct cgroup_subsys_state *child_css)
5478 struct cgroup *child)
5479{ 5651{
5480 int subsys_id, i, depth = 0; 5652 struct cgroup_subsys_state *parent_css = css_parent(child_css);
5481 struct cgroup_subsys_state *parent_css, *child_css;
5482 struct css_id *child_id, *parent_id; 5653 struct css_id *child_id, *parent_id;
5654 int i, depth;
5483 5655
5484 subsys_id = ss->subsys_id;
5485 parent_css = parent->subsys[subsys_id];
5486 child_css = child->subsys[subsys_id];
5487 parent_id = rcu_dereference_protected(parent_css->id, true); 5656 parent_id = rcu_dereference_protected(parent_css->id, true);
5488 depth = parent_id->depth + 1; 5657 depth = parent_id->depth + 1;
5489 5658
5490 child_id = get_new_cssid(ss, depth); 5659 child_id = get_new_cssid(child_css->ss, depth);
5491 if (IS_ERR(child_id)) 5660 if (IS_ERR(child_id))
5492 return PTR_ERR(child_id); 5661 return PTR_ERR(child_id);
5493 5662
@@ -5525,31 +5694,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5525} 5694}
5526EXPORT_SYMBOL_GPL(css_lookup); 5695EXPORT_SYMBOL_GPL(css_lookup);
5527 5696
5528/* 5697/**
5529 * get corresponding css from file open on cgroupfs directory 5698 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5699 * @dentry: directory dentry of interest
5700 * @ss: subsystem of interest
5701 *
5702 * Must be called under RCU read lock. The caller is responsible for
5703 * pinning the returned css if it needs to be accessed outside the RCU
5704 * critical section.
5530 */ 5705 */
5531struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) 5706struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5707 struct cgroup_subsys *ss)
5532{ 5708{
5533 struct cgroup *cgrp; 5709 struct cgroup *cgrp;
5534 struct inode *inode;
5535 struct cgroup_subsys_state *css;
5536 5710
5537 inode = file_inode(f); 5711 WARN_ON_ONCE(!rcu_read_lock_held());
5538 /* check in cgroup filesystem dir */ 5712
5539 if (inode->i_op != &cgroup_dir_inode_operations) 5713 /* is @dentry a cgroup dir? */
5714 if (!dentry->d_inode ||
5715 dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5540 return ERR_PTR(-EBADF); 5716 return ERR_PTR(-EBADF);
5541 5717
5542 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5718 cgrp = __d_cgrp(dentry);
5543 return ERR_PTR(-EINVAL); 5719 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5720}
5544 5721
5545 /* get cgroup */ 5722/**
5546 cgrp = __d_cgrp(f->f_dentry); 5723 * css_from_id - lookup css by id
5547 css = cgrp->subsys[id]; 5724 * @id: the cgroup id
5548 return css ? css : ERR_PTR(-ENOENT); 5725 * @ss: cgroup subsys to be looked into
5726 *
5727 * Returns the css if there's valid one with @id, otherwise returns NULL.
5728 * Should be called under rcu_read_lock().
5729 */
5730struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5731{
5732 struct cgroup *cgrp;
5733
5734 rcu_lockdep_assert(rcu_read_lock_held() ||
5735 lockdep_is_held(&cgroup_mutex),
5736 "css_from_id() needs proper protection");
5737
5738 cgrp = idr_find(&ss->root->cgroup_idr, id);
5739 if (cgrp)
5740 return cgroup_css(cgrp, ss);
5741 return NULL;
5549} 5742}
5550 5743
5551#ifdef CONFIG_CGROUP_DEBUG 5744#ifdef CONFIG_CGROUP_DEBUG
5552static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) 5745static struct cgroup_subsys_state *
5746debug_css_alloc(struct cgroup_subsys_state *parent_css)
5553{ 5747{
5554 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5748 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5555 5749
@@ -5559,22 +5753,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5559 return css; 5753 return css;
5560} 5754}
5561 5755
5562static void debug_css_free(struct cgroup *cgrp) 5756static void debug_css_free(struct cgroup_subsys_state *css)
5563{ 5757{
5564 kfree(cgrp->subsys[debug_subsys_id]); 5758 kfree(css);
5565} 5759}
5566 5760
5567static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) 5761static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5762 struct cftype *cft)
5568{ 5763{
5569 return cgroup_task_count(cgrp); 5764 return cgroup_task_count(css->cgroup);
5570} 5765}
5571 5766
5572static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) 5767static u64 current_css_set_read(struct cgroup_subsys_state *css,
5768 struct cftype *cft)
5573{ 5769{
5574 return (u64)(unsigned long)current->cgroups; 5770 return (u64)(unsigned long)current->cgroups;
5575} 5771}
5576 5772
5577static u64 current_css_set_refcount_read(struct cgroup *cgrp, 5773static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5578 struct cftype *cft) 5774 struct cftype *cft)
5579{ 5775{
5580 u64 count; 5776 u64 count;
@@ -5585,7 +5781,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5585 return count; 5781 return count;
5586} 5782}
5587 5783
5588static int current_css_set_cg_links_read(struct cgroup *cgrp, 5784static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5589 struct cftype *cft, 5785 struct cftype *cft,
5590 struct seq_file *seq) 5786 struct seq_file *seq)
5591{ 5787{
@@ -5612,14 +5808,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
5612} 5808}
5613 5809
5614#define MAX_TASKS_SHOWN_PER_CSS 25 5810#define MAX_TASKS_SHOWN_PER_CSS 25
5615static int cgroup_css_links_read(struct cgroup *cgrp, 5811static int cgroup_css_links_read(struct cgroup_subsys_state *css,
5616 struct cftype *cft, 5812 struct cftype *cft, struct seq_file *seq)
5617 struct seq_file *seq)
5618{ 5813{
5619 struct cgrp_cset_link *link; 5814 struct cgrp_cset_link *link;
5620 5815
5621 read_lock(&css_set_lock); 5816 read_lock(&css_set_lock);
5622 list_for_each_entry(link, &cgrp->cset_links, cset_link) { 5817 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5623 struct css_set *cset = link->cset; 5818 struct css_set *cset = link->cset;
5624 struct task_struct *task; 5819 struct task_struct *task;
5625 int count = 0; 5820 int count = 0;
@@ -5638,9 +5833,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp,
5638 return 0; 5833 return 0;
5639} 5834}
5640 5835
5641static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5836static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5642{ 5837{
5643 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5838 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5644} 5839}
5645 5840
5646static struct cftype debug_files[] = { 5841static struct cftype debug_files[] = {
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 75dda1ea5026..f0ff64d0ebaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -45,25 +45,19 @@ struct freezer {
45 spinlock_t lock; 45 spinlock_t lock;
46}; 46};
47 47
48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) 48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 49{
50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), 50 return css ? container_of(css, struct freezer, css) : NULL;
51 struct freezer, css);
52} 51}
53 52
54static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
55{ 54{
56 return container_of(task_subsys_state(task, freezer_subsys_id), 55 return css_freezer(task_css(task, freezer_subsys_id));
57 struct freezer, css);
58} 56}
59 57
60static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
61{ 59{
62 struct cgroup *pcg = freezer->css.cgroup->parent; 60 return css_freezer(css_parent(&freezer->css));
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67} 61}
68 62
69bool cgroup_freezing(struct task_struct *task) 63bool cgroup_freezing(struct task_struct *task)
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state)
92 86
93struct cgroup_subsys freezer_subsys; 87struct cgroup_subsys freezer_subsys;
94 88
95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) 89static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css)
96{ 91{
97 struct freezer *freezer; 92 struct freezer *freezer;
98 93
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
105} 100}
106 101
107/** 102/**
108 * freezer_css_online - commit creation of a freezer cgroup 103 * freezer_css_online - commit creation of a freezer css
109 * @cgroup: cgroup being created 104 * @css: css being created
110 * 105 *
111 * We're committing to creation of @cgroup. Mark it online and inherit 106 * We're committing to creation of @css. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our 107 * parent's freezing state while holding both parent's and our
113 * freezer->lock. 108 * freezer->lock.
114 */ 109 */
115static int freezer_css_online(struct cgroup *cgroup) 110static int freezer_css_online(struct cgroup_subsys_state *css)
116{ 111{
117 struct freezer *freezer = cgroup_freezer(cgroup); 112 struct freezer *freezer = css_freezer(css);
118 struct freezer *parent = parent_freezer(freezer); 113 struct freezer *parent = parent_freezer(freezer);
119 114
120 /* 115 /*
121 * The following double locking and freezing state inheritance 116 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing 117 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details. 118 * states. See css_for_each_descendant_pre() for details.
124 */ 119 */
125 if (parent) 120 if (parent)
126 spin_lock_irq(&parent->lock); 121 spin_lock_irq(&parent->lock);
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup)
141} 136}
142 137
143/** 138/**
144 * freezer_css_offline - initiate destruction of @cgroup 139 * freezer_css_offline - initiate destruction of a freezer css
145 * @cgroup: cgroup being destroyed 140 * @css: css being destroyed
146 * 141 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count 142 * @css is going away. Mark it dead and decrement system_freezing_count if
148 * if it was holding one. 143 * it was holding one.
149 */ 144 */
150static void freezer_css_offline(struct cgroup *cgroup) 145static void freezer_css_offline(struct cgroup_subsys_state *css)
151{ 146{
152 struct freezer *freezer = cgroup_freezer(cgroup); 147 struct freezer *freezer = css_freezer(css);
153 148
154 spin_lock_irq(&freezer->lock); 149 spin_lock_irq(&freezer->lock);
155 150
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup)
161 spin_unlock_irq(&freezer->lock); 156 spin_unlock_irq(&freezer->lock);
162} 157}
163 158
164static void freezer_css_free(struct cgroup *cgroup) 159static void freezer_css_free(struct cgroup_subsys_state *css)
165{ 160{
166 kfree(cgroup_freezer(cgroup)); 161 kfree(css_freezer(css));
167} 162}
168 163
169/* 164/*
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup)
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the 170 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks. 171 * current state and all following state changes can see the new tasks.
177 */ 172 */
178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) 173static void freezer_attach(struct cgroup_subsys_state *new_css,
174 struct cgroup_taskset *tset)
179{ 175{
180 struct freezer *freezer = cgroup_freezer(new_cgrp); 176 struct freezer *freezer = css_freezer(new_css);
181 struct task_struct *task; 177 struct task_struct *task;
182 bool clear_frozen = false; 178 bool clear_frozen = false;
183 179
184 spin_lock_irq(&freezer->lock); 180 spin_lock_irq(&freezer->lock);
185 181
186 /* 182 /*
187 * Make the new tasks conform to the current state of @new_cgrp. 183 * Make the new tasks conform to the current state of @new_css.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we 184 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the 185 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later. 186 * correct state later.
191 * 187 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its 188 * Tasks in @tset are on @new_css but may not conform to its
193 * current state before executing the following - !frozen tasks may 189 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
195 */ 191 */
196 cgroup_taskset_for_each(task, new_cgrp, tset) { 192 cgroup_taskset_for_each(task, new_css, tset) {
197 if (!(freezer->state & CGROUP_FREEZING)) { 193 if (!(freezer->state & CGROUP_FREEZING)) {
198 __thaw_task(task); 194 __thaw_task(task);
199 } else { 195 } else {
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task)
231 * The root cgroup is non-freezable, so we can skip the 227 * The root cgroup is non-freezable, so we can skip the
232 * following check. 228 * following check.
233 */ 229 */
234 if (!freezer->css.cgroup->parent) 230 if (!parent_freezer(freezer))
235 goto out; 231 goto out;
236 232
237 spin_lock_irq(&freezer->lock); 233 spin_lock_irq(&freezer->lock);
@@ -244,7 +240,7 @@ out:
244 240
245/** 241/**
246 * update_if_frozen - update whether a cgroup finished freezing 242 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest 243 * @css: css of interest
248 * 244 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by 245 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN, 246 * calling this function. If the current state is FREEZING but not FROZEN,
@@ -255,14 +251,14 @@ out:
255 * update_if_frozen() on all descendants prior to invoking this function. 251 * update_if_frozen() on all descendants prior to invoking this function.
256 * 252 *
257 * Task states and freezer state might disagree while tasks are being 253 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against 254 * migrated into or out of @css, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details. 255 * @freezer state here. See freezer_attach() for details.
260 */ 256 */
261static void update_if_frozen(struct cgroup *cgroup) 257static void update_if_frozen(struct cgroup_subsys_state *css)
262{ 258{
263 struct freezer *freezer = cgroup_freezer(cgroup); 259 struct freezer *freezer = css_freezer(css);
264 struct cgroup *pos; 260 struct cgroup_subsys_state *pos;
265 struct cgroup_iter it; 261 struct css_task_iter it;
266 struct task_struct *task; 262 struct task_struct *task;
267 263
268 WARN_ON_ONCE(!rcu_read_lock_held()); 264 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup)
274 goto out_unlock; 270 goto out_unlock;
275 271
276 /* are all (live) children frozen? */ 272 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) { 273 css_for_each_child(pos, css) {
278 struct freezer *child = cgroup_freezer(pos); 274 struct freezer *child = css_freezer(pos);
279 275
280 if ((child->state & CGROUP_FREEZER_ONLINE) && 276 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN)) 277 !(child->state & CGROUP_FROZEN))
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup)
283 } 279 }
284 280
285 /* are all tasks frozen? */ 281 /* are all tasks frozen? */
286 cgroup_iter_start(cgroup, &it); 282 css_task_iter_start(css, &it);
287 283
288 while ((task = cgroup_iter_next(cgroup, &it))) { 284 while ((task = css_task_iter_next(&it))) {
289 if (freezing(task)) { 285 if (freezing(task)) {
290 /* 286 /*
291 * freezer_should_skip() indicates that the task 287 * freezer_should_skip() indicates that the task
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup)
300 296
301 freezer->state |= CGROUP_FROZEN; 297 freezer->state |= CGROUP_FROZEN;
302out_iter_end: 298out_iter_end:
303 cgroup_iter_end(cgroup, &it); 299 css_task_iter_end(&it);
304out_unlock: 300out_unlock:
305 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
306} 302}
307 303
308static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft,
309 struct seq_file *m) 305 struct seq_file *m)
310{ 306{
311 struct cgroup *pos; 307 struct cgroup_subsys_state *pos;
312 308
313 rcu_read_lock(); 309 rcu_read_lock();
314 310
315 /* update states bottom-up */ 311 /* update states bottom-up */
316 cgroup_for_each_descendant_post(pos, cgroup) 312 css_for_each_descendant_post(pos, css)
317 update_if_frozen(pos); 313 update_if_frozen(pos);
318 update_if_frozen(cgroup);
319 314
320 rcu_read_unlock(); 315 rcu_read_unlock();
321 316
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); 317 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
323 seq_putc(m, '\n'); 318 seq_putc(m, '\n');
324 return 0; 319 return 0;
325} 320}
326 321
327static void freeze_cgroup(struct freezer *freezer) 322static void freeze_cgroup(struct freezer *freezer)
328{ 323{
329 struct cgroup *cgroup = freezer->css.cgroup; 324 struct css_task_iter it;
330 struct cgroup_iter it;
331 struct task_struct *task; 325 struct task_struct *task;
332 326
333 cgroup_iter_start(cgroup, &it); 327 css_task_iter_start(&freezer->css, &it);
334 while ((task = cgroup_iter_next(cgroup, &it))) 328 while ((task = css_task_iter_next(&it)))
335 freeze_task(task); 329 freeze_task(task);
336 cgroup_iter_end(cgroup, &it); 330 css_task_iter_end(&it);
337} 331}
338 332
339static void unfreeze_cgroup(struct freezer *freezer) 333static void unfreeze_cgroup(struct freezer *freezer)
340{ 334{
341 struct cgroup *cgroup = freezer->css.cgroup; 335 struct css_task_iter it;
342 struct cgroup_iter it;
343 struct task_struct *task; 336 struct task_struct *task;
344 337
345 cgroup_iter_start(cgroup, &it); 338 css_task_iter_start(&freezer->css, &it);
346 while ((task = cgroup_iter_next(cgroup, &it))) 339 while ((task = css_task_iter_next(&it)))
347 __thaw_task(task); 340 __thaw_task(task);
348 cgroup_iter_end(cgroup, &it); 341 css_task_iter_end(&it);
349} 342}
350 343
351/** 344/**
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
395 */ 388 */
396static void freezer_change_state(struct freezer *freezer, bool freeze) 389static void freezer_change_state(struct freezer *freezer, bool freeze)
397{ 390{
398 struct cgroup *pos; 391 struct cgroup_subsys_state *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
403 spin_unlock_irq(&freezer->lock);
404 392
405 /* 393 /*
406 * Update all its descendants in pre-order traversal. Each 394 * Update all its descendants in pre-order traversal. Each
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
408 * CGROUP_FREEZING_PARENT. 396 * CGROUP_FREEZING_PARENT.
409 */ 397 */
410 rcu_read_lock(); 398 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
412 struct freezer *pos_f = cgroup_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
414 402
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock); 403 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, 404
422 CGROUP_FREEZING_PARENT); 405 if (pos_f == freezer) {
406 freezer_apply_state(pos_f, freeze,
407 CGROUP_FREEZING_SELF);
408 } else {
409 /*
410 * Our update to @parent->state is already visible
411 * which is all we need. No need to lock @parent.
412 * For more info on synchronization, see
413 * freezer_post_create().
414 */
415 freezer_apply_state(pos_f,
416 parent->state & CGROUP_FREEZING,
417 CGROUP_FREEZING_PARENT);
418 }
419
423 spin_unlock_irq(&pos_f->lock); 420 spin_unlock_irq(&pos_f->lock);
424 } 421 }
425 rcu_read_unlock(); 422 rcu_read_unlock();
426} 423}
427 424
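Note that css_for_each_descendant_pre() visits the root css itself as the first position, which is why freezer_change_state() now folds the self-update into the walk and why the cpuset conversions further down add explicit cp == root_cs checks. The resulting idiom, sketched generically:

    struct cgroup_subsys_state *pos;

    rcu_read_lock();
    css_for_each_descendant_pre(pos, root_css) {
            if (pos == root_css) {
                    /* the walk now starts at the root; handle self here */
                    continue;
            }
            /* ... update the descendant @pos ... */
    }
    rcu_read_unlock();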
428static int freezer_write(struct cgroup *cgroup, struct cftype *cft, 425static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
429 const char *buffer) 426 const char *buffer)
430{ 427{
431 bool freeze; 428 bool freeze;
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
437 else 434 else
438 return -EINVAL; 435 return -EINVAL;
439 436
440 freezer_change_state(cgroup_freezer(cgroup), freeze); 437 freezer_change_state(css_freezer(css), freeze);
441 return 0; 438 return 0;
442} 439}
443 440
444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) 441static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
442 struct cftype *cft)
445{ 443{
446 struct freezer *freezer = cgroup_freezer(cgroup); 444 struct freezer *freezer = css_freezer(css);
447 445
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF); 446 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449} 447}
450 448
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) 449static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
450 struct cftype *cft)
452{ 451{
453 struct freezer *freezer = cgroup_freezer(cgroup); 452 struct freezer *freezer = css_freezer(css);
454 453
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT); 454 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
456} 455}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ea1966db34f2..6bf981e13c43 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -68,10 +68,6 @@
68 */ 68 */
69int number_of_cpusets __read_mostly; 69int number_of_cpusets __read_mostly;
70 70
71/* Forward declare cgroup structures */
72struct cgroup_subsys cpuset_subsys;
73struct cpuset;
74
75/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
76 72
77struct fmeter { 73struct fmeter {
@@ -115,27 +111,20 @@ struct cpuset {
115 int relax_domain_level; 111 int relax_domain_level;
116}; 112};
117 113
118/* Retrieve the cpuset for a cgroup */ 114static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
120{ 115{
121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), 116 return css ? container_of(css, struct cpuset, css) : NULL;
122 struct cpuset, css);
123} 117}
124 118
125/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
126static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
127{ 121{
128 return container_of(task_subsys_state(task, cpuset_subsys_id), 122 return css_cs(task_css(task, cpuset_subsys_id));
129 struct cpuset, css);
130} 123}
131 124
132static inline struct cpuset *parent_cs(const struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
133{ 126{
134 struct cgroup *pcgrp = cs->css.cgroup->parent; 127 return css_cs(css_parent(&cs->css));
135
136 if (pcgrp)
137 return cgroup_cs(pcgrp);
138 return NULL;
139} 128}
140 129
141#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = {
212/** 201/**
213 * cpuset_for_each_child - traverse online children of a cpuset 202 * cpuset_for_each_child - traverse online children of a cpuset
214 * @child_cs: loop cursor pointing to the current child 203 * @child_cs: loop cursor pointing to the current child
215 * @pos_cgrp: used for iteration 204 * @pos_css: used for iteration
216 * @parent_cs: target cpuset to walk children of 205 * @parent_cs: target cpuset to walk children of
217 * 206 *
218 * Walk @child_cs through the online children of @parent_cs. Must be used 207 * Walk @child_cs through the online children of @parent_cs. Must be used
219 * with RCU read locked. 208 * with RCU read locked.
220 */ 209 */
221#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ 210#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
222 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ 211 css_for_each_child((pos_css), &(parent_cs)->css) \
223 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) 212 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
224 213
225/** 214/**
226 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants 215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
227 * @des_cs: loop cursor pointing to the current descendant 216 * @des_cs: loop cursor pointing to the current descendant
228 * @pos_cgrp: used for iteration 217 * @pos_css: used for iteration
229 * @root_cs: target cpuset to walk ancestor of 218 * @root_cs: target cpuset to walk ancestor of
230 * 219 *
231 * Walk @des_cs through the online descendants of @root_cs. Must be used 220 * Walk @des_cs through the online descendants of @root_cs. Must be used
232 * with RCU read locked. The caller may modify @pos_cgrp by calling 221 * with RCU read locked. The caller may modify @pos_css by calling
233 * cgroup_rightmost_descendant() to skip subtree. 222 * css_rightmost_descendant() to skip subtree. @root_cs is included in the
 223 * iteration and is the first node to be visited.
234 */ 224 */
235#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ 225#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
236 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ 226 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
237 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) 227 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
238 228
239/* 229/*
240 * There are two global mutexes guarding cpuset structures - cpuset_mutex 230 * There are two global mutexes guarding cpuset structures - cpuset_mutex
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = {
320 * 310 *
321 * Call with callback_mutex held. 311 * Call with callback_mutex held.
322 */ 312 */
323static void guarantee_online_cpus(const struct cpuset *cs, 313static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
324 struct cpumask *pmask)
325{ 314{
326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 315 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
327 cs = parent_cs(cs); 316 cs = parent_cs(cs);
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
339 * 328 *
340 * Call with callback_mutex held. 329 * Call with callback_mutex held.
341 */ 330 */
342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 331static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
343{ 332{
344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 333 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
345 cs = parent_cs(cs); 334 cs = parent_cs(cs);
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
384 * alloc_trial_cpuset - allocate a trial cpuset 373 * alloc_trial_cpuset - allocate a trial cpuset
385 * @cs: the cpuset that the trial cpuset duplicates 374 * @cs: the cpuset that the trial cpuset duplicates
386 */ 375 */
387static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) 376static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
388{ 377{
389 struct cpuset *trial; 378 struct cpuset *trial;
390 379
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial)
431 * Return 0 if valid, -errno if not. 420 * Return 0 if valid, -errno if not.
432 */ 421 */
433 422
434static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 423static int validate_change(struct cpuset *cur, struct cpuset *trial)
435{ 424{
436 struct cgroup *cgrp; 425 struct cgroup_subsys_state *css;
437 struct cpuset *c, *par; 426 struct cpuset *c, *par;
438 int ret; 427 int ret;
439 428
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
441 430
442 /* Each of our child cpusets must be a subset of us */ 431 /* Each of our child cpusets must be a subset of us */
443 ret = -EBUSY; 432 ret = -EBUSY;
444 cpuset_for_each_child(c, cgrp, cur) 433 cpuset_for_each_child(c, css, cur)
445 if (!is_cpuset_subset(c, trial)) 434 if (!is_cpuset_subset(c, trial))
446 goto out; 435 goto out;
447 436
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
462 * overlap 451 * overlap
463 */ 452 */
464 ret = -EINVAL; 453 ret = -EINVAL;
465 cpuset_for_each_child(c, cgrp, par) { 454 cpuset_for_each_child(c, css, par) {
466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 455 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
467 c != cur && 456 c != cur &&
468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 457 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -515,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
515 struct cpuset *root_cs) 504 struct cpuset *root_cs)
516{ 505{
517 struct cpuset *cp; 506 struct cpuset *cp;
518 struct cgroup *pos_cgrp; 507 struct cgroup_subsys_state *pos_css;
519 508
520 rcu_read_lock(); 509 rcu_read_lock();
521 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 510 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
511 if (cp == root_cs)
512 continue;
513
522 /* skip the whole subtree if @cp doesn't have any CPU */ 514 /* skip the whole subtree if @cp doesn't have any CPU */
523 if (cpumask_empty(cp->cpus_allowed)) { 515 if (cpumask_empty(cp->cpus_allowed)) {
524 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 516 pos_css = css_rightmost_descendant(pos_css);
525 continue; 517 continue;
526 } 518 }
527 519
@@ -596,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
596 struct sched_domain_attr *dattr; /* attributes for custom domains */ 588 struct sched_domain_attr *dattr; /* attributes for custom domains */
597 int ndoms = 0; /* number of sched domains in result */ 589 int ndoms = 0; /* number of sched domains in result */
598 int nslot; /* next empty doms[] struct cpumask slot */ 590 int nslot; /* next empty doms[] struct cpumask slot */
599 struct cgroup *pos_cgrp; 591 struct cgroup_subsys_state *pos_css;
600 592
601 doms = NULL; 593 doms = NULL;
602 dattr = NULL; 594 dattr = NULL;
@@ -625,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
625 csn = 0; 617 csn = 0;
626 618
627 rcu_read_lock(); 619 rcu_read_lock();
628 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { 620 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
621 if (cp == &top_cpuset)
622 continue;
629 /* 623 /*
630 * Continue traversing beyond @cp iff @cp has some CPUs and 624 * Continue traversing beyond @cp iff @cp has some CPUs and
631 * isn't load balancing. The former is obvious. The 625 * isn't load balancing. The former is obvious. The
@@ -642,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
642 csa[csn++] = cp; 636 csa[csn++] = cp;
643 637
644 /* skip @cp's subtree */ 638 /* skip @cp's subtree */
645 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 639 pos_css = css_rightmost_descendant(pos_css);
646 } 640 }
647 rcu_read_unlock(); 641 rcu_read_unlock();
648 642
@@ -837,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
837/** 831/**
838 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's 832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
839 * @tsk: task to test 833 * @tsk: task to test
840 * @scan: struct cgroup_scanner containing the cgroup of the task 834 * @data: cpuset to @tsk belongs to
841 * 835 *
842 * Called by cgroup_scan_tasks() for each task in a cgroup whose 836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
843 * cpus_allowed mask needs to be changed. 837 * mask needs to be changed.
844 * 838 *
845 * We don't need to re-check for the cgroup/cpuset membership, since we're 839 * We don't need to re-check for the cgroup/cpuset membership, since we're
846 * holding cpuset_mutex at this point. 840 * holding cpuset_mutex at this point.
847 */ 841 */
848static void cpuset_change_cpumask(struct task_struct *tsk, 842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
849 struct cgroup_scanner *scan)
850{ 843{
851 struct cpuset *cpus_cs; 844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
852 846
853 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
854 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); 847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
855} 848}
856 849
857/** 850/**
858 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
859 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
860 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
861 * 854 *
862 * Called with cpuset_mutex held 855 * Called with cpuset_mutex held
863 * 856 *
864 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 857 * The css_scan_tasks() function will scan all the tasks in a cgroup,
865 * calling callback functions for each. 858 * calling callback functions for each.
866 * 859 *
867 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
868 * if @heap != NULL. 861 * if @heap != NULL.
869 */ 862 */
870static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
871{ 864{
872 struct cgroup_scanner scan; 865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);
873
874 scan.cg = cs->css.cgroup;
875 scan.test_task = NULL;
876 scan.process_task = cpuset_change_cpumask;
877 scan.heap = heap;
878 cgroup_scan_tasks(&scan);
879} 866}
880 867
881/* 868/*
882 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
883 * @root_cs: the root cpuset of the hierarchy 870 * @root_cs: the root cpuset of the hierarchy
884 * @update_root: update root cpuset or not? 871 * @update_root: update root cpuset or not?
885 * @heap: the heap used by cgroup_scan_tasks() 872 * @heap: the heap used by css_scan_tasks()
886 * 873 *
887 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
888 * which take on cpumask of @root_cs. 875 * which take on cpumask of @root_cs.
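The hunk above is the heart of the scanner conversion: instead of filling in a struct cgroup_scanner and handing it to cgroup_scan_tasks(), callers pass the css, the callbacks, an opaque data pointer and the optional heap straight to css_scan_tasks(). Roughly, as a before/after sketch (the two fragments obviously never coexist in one build):

    /* before: describe the scan in a cgroup_scanner */
    struct cgroup_scanner scan;

    scan.cg = cs->css.cgroup;
    scan.test_task = NULL;                  /* no per-task filter */
    scan.process_task = cpuset_change_cpumask;
    scan.heap = heap;
    cgroup_scan_tasks(&scan);

    /* after: one call; the callback receives a void *data argument */
    css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);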
@@ -893,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
893 bool update_root, struct ptr_heap *heap) 880 bool update_root, struct ptr_heap *heap)
894{ 881{
895 struct cpuset *cp; 882 struct cpuset *cp;
896 struct cgroup *pos_cgrp; 883 struct cgroup_subsys_state *pos_css;
897
898 if (update_root)
899 update_tasks_cpumask(root_cs, heap);
900 884
901 rcu_read_lock(); 885 rcu_read_lock();
902 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 886 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
903 /* skip the whole subtree if @cp have some CPU */ 887 if (cp == root_cs) {
904 if (!cpumask_empty(cp->cpus_allowed)) { 888 if (!update_root)
905 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 889 continue;
906 continue; 890 } else {
891 /* skip the whole subtree if @cp have some CPU */
892 if (!cpumask_empty(cp->cpus_allowed)) {
893 pos_css = css_rightmost_descendant(pos_css);
894 continue;
895 }
907 } 896 }
908 if (!css_tryget(&cp->css)) 897 if (!css_tryget(&cp->css))
909 continue; 898 continue;
@@ -1059,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1059 task_unlock(tsk); 1048 task_unlock(tsk);
1060} 1049}
1061 1050
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1062/* 1056/*
1063 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1064 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1065 * memory_migrate flag is set. Called with cpuset_mutex held. 1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1066 */ 1060 */
1067static void cpuset_change_nodemask(struct task_struct *p, 1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1068 struct cgroup_scanner *scan)
1069{ 1062{
1070 struct cpuset *cs = cgroup_cs(scan->cg); 1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1071 struct mm_struct *mm; 1065 struct mm_struct *mm;
1072 int migrate; 1066 int migrate;
1073 nodemask_t *newmems = scan->data;
1074 1067
1075 cpuset_change_task_nodemask(p, newmems); 1068 cpuset_change_task_nodemask(p, arg->newmems);
1076 1069
1077 mm = get_task_mm(p); 1070 mm = get_task_mm(p);
1078 if (!mm) 1071 if (!mm)
@@ -1082,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1082 1075
1083 mpol_rebind_mm(mm, &cs->mems_allowed); 1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1084 if (migrate) 1077 if (migrate)
1085 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); 1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1086 mmput(mm); 1079 mmput(mm);
1087} 1080}
1088 1081
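Where the old scanner carried extra state in scan->data, a css_scan_tasks() callback only gets one void * argument, so multiple parameters are bundled into a small on-stack struct. A sketch of the shape used by the nodemask update, condensed from the hunks above:

    /* caller side: bundle the parameters on the stack */
    struct cpuset_change_nodemask_arg arg = { .cs = cs, .newmems = &newmems };

    css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);

    /* callback side: unpack the same struct from the void * argument */
    static void cpuset_change_nodemask(struct task_struct *p, void *data)
    {
            struct cpuset_change_nodemask_arg *arg = data;

            cpuset_change_task_nodemask(p, arg->newmems);
            /* rebind @p's mempolicy / migrate pages against arg->cs ... */
    }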
@@ -1091,28 +1084,22 @@ static void *cpuset_being_rebound;
1091/** 1084/**
1092 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1093 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1094 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1095 * 1088 *
1096 * Called with cpuset_mutex held 1089 * Called with cpuset_mutex held. No return value. It's guaranteed that
1097 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1090 * css_scan_tasks() always returns 0 if @heap != NULL.
1098 * if @heap != NULL.
1099 */ 1091 */
1100static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1101{ 1093{
1102 static nodemask_t newmems; /* protected by cpuset_mutex */ 1094 static nodemask_t newmems; /* protected by cpuset_mutex */
1103 struct cgroup_scanner scan;
1104 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs,
1097 .newmems = &newmems };
1105 1098
1106 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1107 1100
1108 guarantee_online_mems(mems_cs, &newmems); 1101 guarantee_online_mems(mems_cs, &newmems);
1109 1102
1110 scan.cg = cs->css.cgroup;
1111 scan.test_task = NULL;
1112 scan.process_task = cpuset_change_nodemask;
1113 scan.heap = heap;
1114 scan.data = &newmems;
1115
1116 /* 1103 /*
1117 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1104 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1118 * take while holding tasklist_lock. Forks can happen - the 1105 * take while holding tasklist_lock. Forks can happen - the
@@ -1123,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1123 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1124 * is idempotent. Also migrate pages in each mm to new nodes. 1111 * is idempotent. Also migrate pages in each mm to new nodes.
1125 */ 1112 */
1126 cgroup_scan_tasks(&scan); 1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);
1127 1114
1128 /* 1115 /*
1129 * All the tasks' nodemasks have been updated, update 1116 * All the tasks' nodemasks have been updated, update
@@ -1139,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1139 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1140 * @cs: the root cpuset of the hierarchy 1127 * @cs: the root cpuset of the hierarchy
1141 * @update_root: update the root cpuset or not? 1128 * @update_root: update the root cpuset or not?
1142 * @heap: the heap used by cgroup_scan_tasks() 1129 * @heap: the heap used by css_scan_tasks()
1143 * 1130 *
1144 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1145 * which take on nodemask of @root_cs. 1132 * which take on nodemask of @root_cs.
@@ -1150,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1150 bool update_root, struct ptr_heap *heap) 1137 bool update_root, struct ptr_heap *heap)
1151{ 1138{
1152 struct cpuset *cp; 1139 struct cpuset *cp;
1153 struct cgroup *pos_cgrp; 1140 struct cgroup_subsys_state *pos_css;
1154
1155 if (update_root)
1156 update_tasks_nodemask(root_cs, heap);
1157 1141
1158 rcu_read_lock(); 1142 rcu_read_lock();
1159 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 1143 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
1160 /* skip the whole subtree if @cp have some CPU */ 1144 if (cp == root_cs) {
1161 if (!nodes_empty(cp->mems_allowed)) { 1145 if (!update_root)
1162 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 1146 continue;
1163 continue; 1147 } else {
1148 /* skip the whole subtree if @cp have some CPU */
1149 if (!nodes_empty(cp->mems_allowed)) {
1150 pos_css = css_rightmost_descendant(pos_css);
1151 continue;
1152 }
1164 } 1153 }
1165 if (!css_tryget(&cp->css)) 1154 if (!css_tryget(&cp->css))
1166 continue; 1155 continue;
@@ -1267,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1267 return 0; 1256 return 0;
1268} 1257}
1269 1258
1270/* 1259/**
1271 * cpuset_change_flag - make a task's spread flags the same as its cpuset's 1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1272 * @tsk: task to be updated 1261 * @tsk: task to be updated
1273 * @scan: struct cgroup_scanner containing the cgroup of the task 1262 * @data: cpuset @tsk belongs to
1274 * 1263 *
1275 * Called by cgroup_scan_tasks() for each task in a cgroup. 1264 * Called by css_scan_tasks() for each task in a cgroup.
1276 * 1265 *
1277 * We don't need to re-check for the cgroup/cpuset membership, since we're 1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1278 * holding cpuset_mutex at this point. 1267 * holding cpuset_mutex at this point.
1279 */ 1268 */
1280static void cpuset_change_flag(struct task_struct *tsk, 1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1281 struct cgroup_scanner *scan)
1282{ 1270{
1283 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); 1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1284} 1274}
1285 1275
1286/* 1276/**
1287 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1277 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1288 * @cs: the cpuset in which each task's spread flags needs to be changed 1278 * @cs: the cpuset in which each task's spread flags needs to be changed
1289 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1290 * 1280 *
1291 * Called with cpuset_mutex held 1281 * Called with cpuset_mutex held
1292 * 1282 *
1293 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1283 * The css_scan_tasks() function will scan all the tasks in a cgroup,
1294 * calling callback functions for each. 1284 * calling callback functions for each.
1295 * 1285 *
1296 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1297 * if @heap != NULL. 1287 * if @heap != NULL.
1298 */ 1288 */
1299static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1300{ 1290{
1301 struct cgroup_scanner scan; 1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap);
1302
1303 scan.cg = cs->css.cgroup;
1304 scan.test_task = NULL;
1305 scan.process_task = cpuset_change_flag;
1306 scan.heap = heap;
1307 cgroup_scan_tasks(&scan);
1308} 1292}
1309 1293
1310/* 1294/*
@@ -1462,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1462} 1446}
1463 1447
1464/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1465static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1449static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset)
1466{ 1451{
1467 struct cpuset *cs = cgroup_cs(cgrp); 1452 struct cpuset *cs = css_cs(css);
1468 struct task_struct *task; 1453 struct task_struct *task;
1469 int ret; 1454 int ret;
1470 1455
@@ -1475,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1475 * flag is set. 1460 * flag is set.
1476 */ 1461 */
1477 ret = -ENOSPC; 1462 ret = -ENOSPC;
1478 if (!cgroup_sane_behavior(cgrp) && 1463 if (!cgroup_sane_behavior(css->cgroup) &&
1479 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1480 goto out_unlock; 1465 goto out_unlock;
1481 1466
1482 cgroup_taskset_for_each(task, cgrp, tset) { 1467 cgroup_taskset_for_each(task, css, tset) {
1483 /* 1468 /*
1484 * Kthreads which disallow setaffinity shouldn't be moved 1469 * Kthreads which disallow setaffinity shouldn't be moved
1485 * to a new cpuset; we don't want to change their cpu 1470 * to a new cpuset; we don't want to change their cpu
@@ -1508,11 +1493,11 @@ out_unlock:
1508 return ret; 1493 return ret;
1509} 1494}
1510 1495
1511static void cpuset_cancel_attach(struct cgroup *cgrp, 1496static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset) 1497 struct cgroup_taskset *tset)
1513{ 1498{
1514 mutex_lock(&cpuset_mutex); 1499 mutex_lock(&cpuset_mutex);
1515 cgroup_cs(cgrp)->attach_in_progress--; 1500 css_cs(css)->attach_in_progress--;
1516 mutex_unlock(&cpuset_mutex); 1501 mutex_unlock(&cpuset_mutex);
1517} 1502}
1518 1503
@@ -1523,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp,
1523 */ 1508 */
1524static cpumask_var_t cpus_attach; 1509static cpumask_var_t cpus_attach;
1525 1510
1526static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1511static void cpuset_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset)
1527{ 1513{
1528 /* static buf protected by cpuset_mutex */ 1514 /* static buf protected by cpuset_mutex */
1529 static nodemask_t cpuset_attach_nodemask_to; 1515 static nodemask_t cpuset_attach_nodemask_to;
1530 struct mm_struct *mm; 1516 struct mm_struct *mm;
1531 struct task_struct *task; 1517 struct task_struct *task;
1532 struct task_struct *leader = cgroup_taskset_first(tset); 1518 struct task_struct *leader = cgroup_taskset_first(tset);
1533 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1534 struct cpuset *cs = cgroup_cs(cgrp); 1520 cpuset_subsys_id);
1535 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1521 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss);
1536 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1537 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1538 1525
@@ -1546,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1546 1533
1547 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1548 1535
1549 cgroup_taskset_for_each(task, cgrp, tset) { 1536 cgroup_taskset_for_each(task, css, tset) {
1550 /* 1537 /*
1551 * can_attach beforehand should guarantee that this doesn't 1538 * can_attach beforehand should guarantee that this doesn't
1552 * fail. TODO: have a better way to handle failure here 1539 * fail. TODO: have a better way to handle failure here
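As with the file handlers further down, the attach methods now receive the destination css directly, recover the cpuset with css_cs(), and look up the previous cpuset through cgroup_taskset_cur_css(). A stripped-down sketch of the new signatures (locking and the per-task work elided):

    static int cpuset_can_attach(struct cgroup_subsys_state *css,
                                 struct cgroup_taskset *tset)
    {
            struct cpuset *cs = css_cs(css);
            struct task_struct *task;

            /* on the legacy hierarchy an empty cpuset cannot accept tasks */
            if (!cgroup_sane_behavior(css->cgroup) &&
                (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                    return -ENOSPC;

            cgroup_taskset_for_each(task, css, tset) {
                    /* reject kthreads that disallow setaffinity, etc. */
            }
            return 0;
    }

    static void cpuset_attach(struct cgroup_subsys_state *css,
                              struct cgroup_taskset *tset)
    {
            struct cpuset *cs = css_cs(css);
            struct cpuset *oldcs = css_cs(cgroup_taskset_cur_css(tset,
                                                    cpuset_subsys_id));

            /* move each task's cpumask/nodemask from @oldcs to @cs ... */
    }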
@@ -1608,9 +1595,10 @@ typedef enum {
1608 FILE_SPREAD_SLAB, 1595 FILE_SPREAD_SLAB,
1609} cpuset_filetype_t; 1596} cpuset_filetype_t;
1610 1597
1611static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1598static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1599 u64 val)
1612{ 1600{
1613 struct cpuset *cs = cgroup_cs(cgrp); 1601 struct cpuset *cs = css_cs(css);
1614 cpuset_filetype_t type = cft->private; 1602 cpuset_filetype_t type = cft->private;
1615 int retval = 0; 1603 int retval = 0;
1616 1604
@@ -1657,9 +1645,10 @@ out_unlock:
1657 return retval; 1645 return retval;
1658} 1646}
1659 1647
1660static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1648static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1649 s64 val)
1661{ 1650{
1662 struct cpuset *cs = cgroup_cs(cgrp); 1651 struct cpuset *cs = css_cs(css);
1663 cpuset_filetype_t type = cft->private; 1652 cpuset_filetype_t type = cft->private;
1664 int retval = -ENODEV; 1653 int retval = -ENODEV;
1665 1654
@@ -1683,10 +1672,10 @@ out_unlock:
1683/* 1672/*
1684 * Common handling for a write to a "cpus" or "mems" file. 1673 * Common handling for a write to a "cpus" or "mems" file.
1685 */ 1674 */
1686static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1675static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1687 const char *buf) 1676 struct cftype *cft, const char *buf)
1688{ 1677{
1689 struct cpuset *cs = cgroup_cs(cgrp); 1678 struct cpuset *cs = css_cs(css);
1690 struct cpuset *trialcs; 1679 struct cpuset *trialcs;
1691 int retval = -ENODEV; 1680 int retval = -ENODEV;
1692 1681
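Every cftype handler gets the same treatment: the first argument becomes the css and the cpuset is recovered with css_cs(). A minimal sketch of the read side (the real handlers also take cpuset_mutex and dispatch on cft->private):

    static u64 cpuset_read_u64(struct cgroup_subsys_state *css,
                               struct cftype *cft)
    {
            struct cpuset *cs = css_cs(css);

            /* cft->private selects the flag; e.g. for FILE_CPU_EXCLUSIVE: */
            return is_cpu_exclusive(cs);
    }

The u64/s64 write handlers and cpuset_write_resmask() mirror this shape: css in, css_cs() out, with the actual update done under cpuset_mutex.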
@@ -1765,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1765 return count; 1754 return count;
1766} 1755}
1767 1756
1768static ssize_t cpuset_common_file_read(struct cgroup *cgrp, 1757static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1769 struct cftype *cft, 1758 struct cftype *cft, struct file *file,
1770 struct file *file, 1759 char __user *buf, size_t nbytes,
1771 char __user *buf, 1760 loff_t *ppos)
1772 size_t nbytes, loff_t *ppos)
1773{ 1761{
1774 struct cpuset *cs = cgroup_cs(cgrp); 1762 struct cpuset *cs = css_cs(css);
1775 cpuset_filetype_t type = cft->private; 1763 cpuset_filetype_t type = cft->private;
1776 char *page; 1764 char *page;
1777 ssize_t retval = 0; 1765 ssize_t retval = 0;
@@ -1801,9 +1789,9 @@ out:
1801 return retval; 1789 return retval;
1802} 1790}
1803 1791
1804static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) 1792static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1805{ 1793{
1806 struct cpuset *cs = cgroup_cs(cgrp); 1794 struct cpuset *cs = css_cs(css);
1807 cpuset_filetype_t type = cft->private; 1795 cpuset_filetype_t type = cft->private;
1808 switch (type) { 1796 switch (type) {
1809 case FILE_CPU_EXCLUSIVE: 1797 case FILE_CPU_EXCLUSIVE:
@@ -1832,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1832 return 0; 1820 return 0;
1833} 1821}
1834 1822
1835static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) 1823static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1836{ 1824{
1837 struct cpuset *cs = cgroup_cs(cgrp); 1825 struct cpuset *cs = css_cs(css);
1838 cpuset_filetype_t type = cft->private; 1826 cpuset_filetype_t type = cft->private;
1839 switch (type) { 1827 switch (type) {
1840 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1828 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1949,11 +1937,12 @@ static struct cftype files[] = {
1949 * cgrp: control group that the new cpuset will be part of 1937 * cgrp: control group that the new cpuset will be part of
1950 */ 1938 */
1951 1939
1952static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) 1940static struct cgroup_subsys_state *
1941cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1953{ 1942{
1954 struct cpuset *cs; 1943 struct cpuset *cs;
1955 1944
1956 if (!cgrp->parent) 1945 if (!parent_css)
1957 return &top_cpuset.css; 1946 return &top_cpuset.css;
1958 1947
1959 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1948 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1973,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1973 return &cs->css; 1962 return &cs->css;
1974} 1963}
1975 1964
1976static int cpuset_css_online(struct cgroup *cgrp) 1965static int cpuset_css_online(struct cgroup_subsys_state *css)
1977{ 1966{
1978 struct cpuset *cs = cgroup_cs(cgrp); 1967 struct cpuset *cs = css_cs(css);
1979 struct cpuset *parent = parent_cs(cs); 1968 struct cpuset *parent = parent_cs(cs);
1980 struct cpuset *tmp_cs; 1969 struct cpuset *tmp_cs;
1981 struct cgroup *pos_cg; 1970 struct cgroup_subsys_state *pos_css;
1982 1971
1983 if (!parent) 1972 if (!parent)
1984 return 0; 1973 return 0;
@@ -1993,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
1993 1982
1994 number_of_cpusets++; 1983 number_of_cpusets++;
1995 1984
1996 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) 1985 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1997 goto out_unlock; 1986 goto out_unlock;
1998 1987
1999 /* 1988 /*
@@ -2010,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
2010 * (and likewise for mems) to the new cgroup. 1999 * (and likewise for mems) to the new cgroup.
2011 */ 2000 */
2012 rcu_read_lock(); 2001 rcu_read_lock();
2013 cpuset_for_each_child(tmp_cs, pos_cg, parent) { 2002 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2014 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 2003 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2015 rcu_read_unlock(); 2004 rcu_read_unlock();
2016 goto out_unlock; 2005 goto out_unlock;
@@ -2027,9 +2016,15 @@ out_unlock:
2027 return 0; 2016 return 0;
2028} 2017}
2029 2018
2030static void cpuset_css_offline(struct cgroup *cgrp) 2019/*
2020 * If the cpuset being removed has its flag 'sched_load_balance'
2021 * enabled, then simulate turning sched_load_balance off, which
2022 * will call rebuild_sched_domains_locked().
2023 */
2024
2025static void cpuset_css_offline(struct cgroup_subsys_state *css)
2031{ 2026{
2032 struct cpuset *cs = cgroup_cs(cgrp); 2027 struct cpuset *cs = css_cs(css);
2033 2028
2034 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2035 2030
@@ -2042,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
2042 mutex_unlock(&cpuset_mutex); 2037 mutex_unlock(&cpuset_mutex);
2043} 2038}
2044 2039
2045/* 2040static void cpuset_css_free(struct cgroup_subsys_state *css)
2046 * If the cpuset being removed has its flag 'sched_load_balance'
2047 * enabled, then simulate turning sched_load_balance off, which
2048 * will call rebuild_sched_domains_locked().
2049 */
2050
2051static void cpuset_css_free(struct cgroup *cgrp)
2052{ 2041{
2053 struct cpuset *cs = cgroup_cs(cgrp); 2042 struct cpuset *cs = css_cs(css);
2054 2043
2055 free_cpumask_var(cs->cpus_allowed); 2044 free_cpumask_var(cs->cpus_allowed);
2056 kfree(cs); 2045 kfree(cs);
@@ -2257,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2257 /* if cpus or mems changed, we need to propagate to descendants */ 2246 /* if cpus or mems changed, we need to propagate to descendants */
2258 if (cpus_updated || mems_updated) { 2247 if (cpus_updated || mems_updated) {
2259 struct cpuset *cs; 2248 struct cpuset *cs;
2260 struct cgroup *pos_cgrp; 2249 struct cgroup_subsys_state *pos_css;
2261 2250
2262 rcu_read_lock(); 2251 rcu_read_lock();
2263 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { 2252 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2264 if (!css_tryget(&cs->css)) 2253 if (cs == &top_cpuset || !css_tryget(&cs->css))
2265 continue; 2254 continue;
2266 rcu_read_unlock(); 2255 rcu_read_unlock();
2267 2256
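The hotplug path shows the companion idiom for doing sleeping work while walking descendants under rcu_read_lock(): the now-root-inclusive walk skips top_cpuset, pins each cpuset with css_tryget(), drops the RCU lock for the heavy lifting, then re-acquires it and drops the reference. Sketched, with the actual per-cpuset work reduced to a comment:

    struct cpuset *cs;
    struct cgroup_subsys_state *pos_css;

    rcu_read_lock();
    cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
            if (cs == &top_cpuset || !css_tryget(&cs->css))
                    continue;
            rcu_read_unlock();

            /* may sleep: propagate the new cpu/memory masks into @cs */

            rcu_read_lock();
            css_put(&cs->css);
    }
    rcu_read_unlock();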
@@ -2350,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2350 2339
2351void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2340void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2352{ 2341{
2353 const struct cpuset *cpus_cs; 2342 struct cpuset *cpus_cs;
2354 2343
2355 rcu_read_lock(); 2344 rcu_read_lock();
2356 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2345 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
@@ -2423,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2423 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2412 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
2424 * (an unusual configuration), then returns the root cpuset. 2413 * (an unusual configuration), then returns the root cpuset.
2425 */ 2414 */
2426static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2415static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2427{ 2416{
2428 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 2417 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2429 cs = parent_cs(cs); 2418 cs = parent_cs(cs);
@@ -2493,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2493 */ 2482 */
2494int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2483int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2495{ 2484{
2496 const struct cpuset *cs; /* current cpuset ancestors */ 2485 struct cpuset *cs; /* current cpuset ancestors */
2497 int allowed; /* is allocation in zone z allowed? */ 2486 int allowed; /* is allocation in zone z allowed? */
2498 2487
2499 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2488 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2731,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2731 goto out_free; 2720 goto out_free;
2732 2721
2733 rcu_read_lock(); 2722 rcu_read_lock();
2734 css = task_subsys_state(tsk, cpuset_subsys_id); 2723 css = task_css(tsk, cpuset_subsys_id);
2735 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2724 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2736 rcu_read_unlock(); 2725 rcu_read_unlock();
2737 if (retval < 0) 2726 if (retval < 0)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f86599e8c123..9300f5226077 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -340,8 +340,8 @@ struct perf_cgroup {
340static inline struct perf_cgroup * 340static inline struct perf_cgroup *
341perf_cgroup_from_task(struct task_struct *task) 341perf_cgroup_from_task(struct task_struct *task)
342{ 342{
343 return container_of(task_subsys_state(task, perf_subsys_id), 343 return container_of(task_css(task, perf_subsys_id),
344 struct perf_cgroup, css); 344 struct perf_cgroup, css);
345} 345}
346 346
347static inline bool 347static inline bool
@@ -591,7 +591,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
591 if (!f.file) 591 if (!f.file)
592 return -EBADF; 592 return -EBADF;
593 593
594 css = cgroup_css_from_dir(f.file, perf_subsys_id); 594 rcu_read_lock();
595
596 css = css_from_dir(f.file->f_dentry, &perf_subsys);
595 if (IS_ERR(css)) { 597 if (IS_ERR(css)) {
596 ret = PTR_ERR(css); 598 ret = PTR_ERR(css);
597 goto out; 599 goto out;
@@ -617,6 +619,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
617 ret = -EINVAL; 619 ret = -EINVAL;
618 } 620 }
619out: 621out:
622 rcu_read_unlock();
620 fdput(f); 623 fdput(f);
621 return ret; 624 return ret;
622} 625}
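On the perf side, cgroup_css_from_dir() is replaced by css_from_dir(), which under this series returns an RCU-protected css rather than a pinned one, so the lookup and everything that dereferences the result now sit inside rcu_read_lock(). A condensed sketch of the fragment above:

    struct cgroup_subsys_state *css;
    int ret = 0;

    /* f is the struct fd obtained with fdget(fd) earlier in the function */
    rcu_read_lock();

    css = css_from_dir(f.file->f_dentry, &perf_subsys);
    if (IS_ERR(css)) {
            ret = PTR_ERR(css);
            goto out;
    }

    /* ... resolve the perf_cgroup from @css and attach it to the event ... */
out:
    rcu_read_unlock();
    fdput(f);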
@@ -7798,7 +7801,8 @@ unlock:
7798device_initcall(perf_event_sysfs_init); 7801device_initcall(perf_event_sysfs_init);
7799 7802
7800#ifdef CONFIG_CGROUP_PERF 7803#ifdef CONFIG_CGROUP_PERF
7801static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) 7804static struct cgroup_subsys_state *
7805perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7802{ 7806{
7803 struct perf_cgroup *jc; 7807 struct perf_cgroup *jc;
7804 7808
@@ -7815,11 +7819,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7815 return &jc->css; 7819 return &jc->css;
7816} 7820}
7817 7821
7818static void perf_cgroup_css_free(struct cgroup *cont) 7822static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
7819{ 7823{
7820 struct perf_cgroup *jc; 7824 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
7821 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7825
7822 struct perf_cgroup, css);
7823 free_percpu(jc->info); 7826 free_percpu(jc->info);
7824 kfree(jc); 7827 kfree(jc);
7825} 7828}
@@ -7831,15 +7834,17 @@ static int __perf_cgroup_move(void *info)
7831 return 0; 7834 return 0;
7832} 7835}
7833 7836
7834static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 7837static void perf_cgroup_attach(struct cgroup_subsys_state *css,
7838 struct cgroup_taskset *tset)
7835{ 7839{
7836 struct task_struct *task; 7840 struct task_struct *task;
7837 7841
7838 cgroup_taskset_for_each(task, cgrp, tset) 7842 cgroup_taskset_for_each(task, css, tset)
7839 task_function_call(task, __perf_cgroup_move, task); 7843 task_function_call(task, __perf_cgroup_move, task);
7840} 7844}
7841 7845
7842static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7846static void perf_cgroup_exit(struct cgroup_subsys_state *css,
7847 struct cgroup_subsys_state *old_css,
7843 struct task_struct *task) 7848 struct task_struct *task)
7844{ 7849{
7845 /* 7850 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 05c39f030314..e53bda3ff2f1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6815,7 +6815,7 @@ void sched_move_task(struct task_struct *tsk)
6815 if (unlikely(running)) 6815 if (unlikely(running))
6816 tsk->sched_class->put_prev_task(rq, tsk); 6816 tsk->sched_class->put_prev_task(rq, tsk);
6817 6817
6818 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 6818 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
6819 lockdep_is_held(&tsk->sighand->siglock)), 6819 lockdep_is_held(&tsk->sighand->siglock)),
6820 struct task_group, css); 6820 struct task_group, css);
6821 tg = autogroup_task_group(tsk, tg); 6821 tg = autogroup_task_group(tsk, tg);
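task_subsys_state() and task_subsys_state_check() become task_css() and task_css_check(); the checked variant still takes a lockdep expression naming the lock that makes the unlocked dereference safe. As used just above (a sketch; tsk->sighand->siglock is held by the caller there):

    struct task_group *tg;

    tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
                                     lockdep_is_held(&tsk->sighand->siglock)),
                      struct task_group, css);
    tg = autogroup_task_group(tsk, tg);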
@@ -7137,23 +7137,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
7137 7137
7138#ifdef CONFIG_CGROUP_SCHED 7138#ifdef CONFIG_CGROUP_SCHED
7139 7139
7140/* return corresponding task_group object of a cgroup */ 7140static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7141static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7142{ 7141{
7143 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7142 return css ? container_of(css, struct task_group, css) : NULL;
7144 struct task_group, css);
7145} 7143}
7146 7144
7147static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) 7145static struct cgroup_subsys_state *
7146cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7148{ 7147{
7149 struct task_group *tg, *parent; 7148 struct task_group *parent = css_tg(parent_css);
7149 struct task_group *tg;
7150 7150
7151 if (!cgrp->parent) { 7151 if (!parent) {
7152 /* This is early initialization for the top cgroup */ 7152 /* This is early initialization for the top cgroup */
7153 return &root_task_group.css; 7153 return &root_task_group.css;
7154 } 7154 }
7155 7155
7156 parent = cgroup_tg(cgrp->parent);
7157 tg = sched_create_group(parent); 7156 tg = sched_create_group(parent);
7158 if (IS_ERR(tg)) 7157 if (IS_ERR(tg))
7159 return ERR_PTR(-ENOMEM); 7158 return ERR_PTR(-ENOMEM);
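The cgroup_tg()-style converters are rewritten as css-based helpers that tolerate a NULL css; that is what lets ->css_alloc() and ->css_online() treat "no parent css" as the root case instead of peeking at cgrp->parent. A sketch of the idiom (css_parent() is part of the css API this series converts to):

    static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
    {
            return css ? container_of(css, struct task_group, css) : NULL;
    }

    static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
    {
            struct task_group *parent = css_tg(css_parent(css));

            if (parent)     /* NULL parent: this is the root task_group */
                    sched_online_group(css_tg(css), parent);
            return 0;
    }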
@@ -7161,41 +7160,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7161 return &tg->css; 7160 return &tg->css;
7162} 7161}
7163 7162
7164static int cpu_cgroup_css_online(struct cgroup *cgrp) 7163static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7165{ 7164{
7166 struct task_group *tg = cgroup_tg(cgrp); 7165 struct task_group *tg = css_tg(css);
7167 struct task_group *parent; 7166 struct task_group *parent = css_tg(css_parent(css));
7168
7169 if (!cgrp->parent)
7170 return 0;
7171 7167
7172 parent = cgroup_tg(cgrp->parent); 7168 if (parent)
7173 sched_online_group(tg, parent); 7169 sched_online_group(tg, parent);
7174 return 0; 7170 return 0;
7175} 7171}
7176 7172
7177static void cpu_cgroup_css_free(struct cgroup *cgrp) 7173static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7178{ 7174{
7179 struct task_group *tg = cgroup_tg(cgrp); 7175 struct task_group *tg = css_tg(css);
7180 7176
7181 sched_destroy_group(tg); 7177 sched_destroy_group(tg);
7182} 7178}
7183 7179
7184static void cpu_cgroup_css_offline(struct cgroup *cgrp) 7180static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7185{ 7181{
7186 struct task_group *tg = cgroup_tg(cgrp); 7182 struct task_group *tg = css_tg(css);
7187 7183
7188 sched_offline_group(tg); 7184 sched_offline_group(tg);
7189} 7185}
7190 7186
7191static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7187static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7192 struct cgroup_taskset *tset) 7188 struct cgroup_taskset *tset)
7193{ 7189{
7194 struct task_struct *task; 7190 struct task_struct *task;
7195 7191
7196 cgroup_taskset_for_each(task, cgrp, tset) { 7192 cgroup_taskset_for_each(task, css, tset) {
7197#ifdef CONFIG_RT_GROUP_SCHED 7193#ifdef CONFIG_RT_GROUP_SCHED
7198 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7194 if (!sched_rt_can_attach(css_tg(css), task))
7199 return -EINVAL; 7195 return -EINVAL;
7200#else 7196#else
7201 /* We don't support RT-tasks being in separate groups */ 7197 /* We don't support RT-tasks being in separate groups */
@@ -7206,18 +7202,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7206 return 0; 7202 return 0;
7207} 7203}
7208 7204
7209static void cpu_cgroup_attach(struct cgroup *cgrp, 7205static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7210 struct cgroup_taskset *tset) 7206 struct cgroup_taskset *tset)
7211{ 7207{
7212 struct task_struct *task; 7208 struct task_struct *task;
7213 7209
7214 cgroup_taskset_for_each(task, cgrp, tset) 7210 cgroup_taskset_for_each(task, css, tset)
7215 sched_move_task(task); 7211 sched_move_task(task);
7216} 7212}
7217 7213
7218static void 7214static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7219cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7215 struct cgroup_subsys_state *old_css,
7220 struct task_struct *task) 7216 struct task_struct *task)
7221{ 7217{
7222 /* 7218 /*
7223 * cgroup_exit() is called in the copy_process() failure path. 7219 * cgroup_exit() is called in the copy_process() failure path.
@@ -7231,15 +7227,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7231} 7227}
7232 7228
7233#ifdef CONFIG_FAIR_GROUP_SCHED 7229#ifdef CONFIG_FAIR_GROUP_SCHED
7234static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7230static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7235 u64 shareval) 7231 struct cftype *cftype, u64 shareval)
7236{ 7232{
7237 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7233 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7238} 7234}
7239 7235
7240static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7236static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7237 struct cftype *cft)
7241{ 7238{
7242 struct task_group *tg = cgroup_tg(cgrp); 7239 struct task_group *tg = css_tg(css);
7243 7240
7244 return (u64) scale_load_down(tg->shares); 7241 return (u64) scale_load_down(tg->shares);
7245} 7242}
@@ -7361,26 +7358,28 @@ long tg_get_cfs_period(struct task_group *tg)
7361 return cfs_period_us; 7358 return cfs_period_us;
7362} 7359}
7363 7360
7364static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7361static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7362 struct cftype *cft)
7365{ 7363{
7366 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7364 return tg_get_cfs_quota(css_tg(css));
7367} 7365}
7368 7366
7369static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7367static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7370 s64 cfs_quota_us) 7368 struct cftype *cftype, s64 cfs_quota_us)
7371{ 7369{
7372 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7370 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7373} 7371}
7374 7372
7375static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7373static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7374 struct cftype *cft)
7376{ 7375{
7377 return tg_get_cfs_period(cgroup_tg(cgrp)); 7376 return tg_get_cfs_period(css_tg(css));
7378} 7377}
7379 7378
7380static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7379static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7381 u64 cfs_period_us) 7380 struct cftype *cftype, u64 cfs_period_us)
7382{ 7381{
7383 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7382 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7384} 7383}
7385 7384
7386struct cfs_schedulable_data { 7385struct cfs_schedulable_data {
@@ -7461,10 +7460,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7461 return ret; 7460 return ret;
7462} 7461}
7463 7462
7464static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7463static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
7465 struct cgroup_map_cb *cb) 7464 struct cgroup_map_cb *cb)
7466{ 7465{
7467 struct task_group *tg = cgroup_tg(cgrp); 7466 struct task_group *tg = css_tg(css);
7468 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7467 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7469 7468
7470 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7469 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7477,26 +7476,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7477#endif /* CONFIG_FAIR_GROUP_SCHED */ 7476#endif /* CONFIG_FAIR_GROUP_SCHED */
7478 7477
7479#ifdef CONFIG_RT_GROUP_SCHED 7478#ifdef CONFIG_RT_GROUP_SCHED
7480static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7479static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7481 s64 val) 7480 struct cftype *cft, s64 val)
7482{ 7481{
7483 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7482 return sched_group_set_rt_runtime(css_tg(css), val);
7484} 7483}
7485 7484
7486static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7485static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7486 struct cftype *cft)
7487{ 7487{
7488 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7488 return sched_group_rt_runtime(css_tg(css));
7489} 7489}
7490 7490
7491static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7491static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7492 u64 rt_period_us) 7492 struct cftype *cftype, u64 rt_period_us)
7493{ 7493{
7494 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7494 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7495} 7495}
7496 7496
7497static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7497static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7498 struct cftype *cft)
7498{ 7499{
7499 return sched_group_rt_period(cgroup_tg(cgrp)); 7500 return sched_group_rt_period(css_tg(css));
7500} 7501}
7501#endif /* CONFIG_RT_GROUP_SCHED */ 7502#endif /* CONFIG_RT_GROUP_SCHED */
7502 7503
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2cd95eb..f64722ff0299 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
33 struct kernel_cpustat __percpu *cpustat; 33 struct kernel_cpustat __percpu *cpustat;
34}; 34};
35 35
36/* return cpu accounting group corresponding to this container */ 36static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{ 37{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 38 return css ? container_of(css, struct cpuacct, css) : NULL;
40 struct cpuacct, css);
41} 39}
42 40
43/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{ 43{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 44 return css_ca(task_css(tsk, cpuacct_subsys_id));
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53} 45}
54 46
55static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{ 48{
57 if (!ca->css.cgroup->parent) 49 return css_ca(css_parent(&ca->css));
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60} 50}
61 51
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
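cpuacct gets the same pair of helpers: with css_ca() accepting NULL, parent_ca() collapses to a single expression, the root is simply the group whose parent css is NULL, and the unchecked __parent_ca() can go away. Restated as a sketch:

    static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
    {
            return css ? container_of(css, struct cpuacct, css) : NULL;
    }

    /* returns NULL for the root group */
    static inline struct cpuacct *parent_ca(struct cpuacct *ca)
    {
            return css_ca(css_parent(&ca->css));
    }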
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
66}; 56};
67 57
68/* create a new cpu accounting group */ 58/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) 59static struct cgroup_subsys_state *
60cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
70{ 61{
71 struct cpuacct *ca; 62 struct cpuacct *ca;
72 63
73 if (!cgrp->parent) 64 if (!parent_css)
74 return &root_cpuacct.css; 65 return &root_cpuacct.css;
75 66
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 67 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
96} 87}
97 88
98/* destroy an existing cpu accounting group */ 89/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp) 90static void cpuacct_css_free(struct cgroup_subsys_state *css)
100{ 91{
101 struct cpuacct *ca = cgroup_ca(cgrp); 92 struct cpuacct *ca = css_ca(css);
102 93
103 free_percpu(ca->cpustat); 94 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage); 95 free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
141} 132}
142 133
143/* return total cpu usage (in nanoseconds) of a group */ 134/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 135static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
145{ 136{
146 struct cpuacct *ca = cgroup_ca(cgrp); 137 struct cpuacct *ca = css_ca(css);
147 u64 totalcpuusage = 0; 138 u64 totalcpuusage = 0;
148 int i; 139 int i;
149 140
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
153 return totalcpuusage; 144 return totalcpuusage;
154} 145}
155 146
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 147static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
157 u64 reset) 148 u64 reset)
158{ 149{
159 struct cpuacct *ca = cgroup_ca(cgrp); 150 struct cpuacct *ca = css_ca(css);
160 int err = 0; 151 int err = 0;
161 int i; 152 int i;
162 153
@@ -172,10 +163,10 @@ out:
172 return err; 163 return err;
173} 164}
174 165
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
176 struct seq_file *m) 167 struct cftype *cft, struct seq_file *m)
177{ 168{
178 struct cpuacct *ca = cgroup_ca(cgroup); 169 struct cpuacct *ca = css_ca(css);
179 u64 percpu; 170 u64 percpu;
180 int i; 171 int i;
181 172
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
192 [CPUACCT_STAT_SYSTEM] = "system", 183 [CPUACCT_STAT_SYSTEM] = "system",
193}; 184};
194 185
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 186static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 struct cgroup_map_cb *cb) 187 struct cftype *cft, struct cgroup_map_cb *cb)
197{ 188{
198 struct cpuacct *ca = cgroup_ca(cgrp); 189 struct cpuacct *ca = css_ca(css);
199 int cpu; 190 int cpu;
200 s64 val = 0; 191 s64 val = 0;
201 192
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
281 while (ca != &root_cpuacct) { 272 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat); 273 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val; 274 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca); 275 ca = parent_ca(ca);
285 } 276 }
286 rcu_read_unlock(); 277 rcu_read_unlock();
287} 278}
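With __parent_ca() gone, the charging loop above relies on stopping at &root_cpuacct before parent_ca() could ever return NULL. The walk, condensed from the hunk above into a for loop:

    struct cpuacct *ca;

    rcu_read_lock();
    for (ca = task_ca(p); ca != &root_cpuacct; ca = parent_ca(ca))
            this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
    rcu_read_unlock();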
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..471a56db05ea 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg);
665/* 665/*
666 * Return the group to which this tasks belongs. 666 * Return the group to which this tasks belongs.
667 * 667 *
668 * We cannot use task_subsys_state() and friends because the cgroup 668 * We cannot use task_css() and friends because the cgroup subsystem
669 * subsystem changes that value before the cgroup_subsys::attach() method 669 * changes that value before the cgroup_subsys::attach() method is called,
670 * is called, therefore we cannot pin it and might observe the wrong value. 670 * therefore we cannot pin it and might observe the wrong value.
671 * 671 *
672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup 672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
673 * core changes this before calling sched_move_task(). 673 * core changes this before calling sched_move_task().