Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--	kernel/cgroup.c	1666
1 file changed, 931 insertions(+), 735 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e91963302c0d..2418b6e71a85 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
+#include <linux/file.h>
 
 #include <linux/atomic.h>
 
@@ -81,7 +82,7 @@
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for task_subsys_state_check() */
+EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for lockdep */
 #else
 static DEFINE_MUTEX(cgroup_mutex);
 #endif
@@ -117,6 +118,7 @@ struct cfent {
	struct list_head		node;
	struct dentry			*dentry;
	struct cftype			*type;
+	struct cgroup_subsys_state	*css;
 
	/* file xattrs */
	struct simple_xattrs		xattrs;
@@ -159,9 +161,9 @@ struct css_id {
  */
 struct cgroup_event {
 	/*
-	 * Cgroup which the event belongs to.
+	 * css which the event belongs to.
 	 */
-	struct cgroup *cgrp;
+	struct cgroup_subsys_state *css;
 	/*
 	 * Control file which the event associated.
 	 */
@@ -215,10 +217,33 @@ static u64 cgroup_serial_nr_next = 1;
  */
 static int need_forkexit_callback __read_mostly;
 
-static void cgroup_offline_fn(struct work_struct *work);
+static struct cftype cgroup_base_files[];
+
+static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			      struct cftype cfts[], bool is_add);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+			      bool is_add);
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns the dummy_css)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks.  This function may return
+ * %NULL if @cgrp doesn't have @ss enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+					      struct cgroup_subsys *ss)
+{
+	if (ss)
+		return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
+					     lockdep_is_held(&cgroup_mutex));
+	else
+		return &cgrp->dummy_css;
+}
 
 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
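The new cgroup_css() accessor above is the pivot of this refactor: every open-coded cgrp->subsys[] dereference in the rest of the file is funneled through it. As a minimal caller-side sketch of the pinning rule its comment spells out (the helper name is hypothetical; the pattern mirrors what cgroup_file_open() does later in this same diff):

/* illustrative only: pin a css so it can be used outside the locks */
static struct cgroup_subsys_state *
demo_get_pinned_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);
	if (css && !css_tryget(css))	/* raced with css offlining */
		css = NULL;
	rcu_read_unlock();

	return css;	/* caller must css_put() when done */
}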
@@ -365,9 +390,11 @@ static struct cgrp_cset_link init_cgrp_cset_link;
 static int cgroup_init_idr(struct cgroup_subsys *ss,
 			   struct cgroup_subsys_state *css);
 
-/* css_set_lock protects the list of css_set objects, and the
- * chain of tasks off each css_set. Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
+/*
+ * css_set_lock protects the list of css_set objects, and the chain of
+ * tasks off each css_set.  Nests outside task->alloc_lock due to
+ * css_task_iter_start().
+ */
 static DEFINE_RWLOCK(css_set_lock);
 static int css_set_count;
 
@@ -392,10 +419,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 	return key;
 }
 
-/* We don't maintain the lists running through each css_set to its
- * task until after the first call to cgroup_iter_start(). This
- * reduces the fork()/exit() overhead for people who have cgroups
- * compiled into their kernel but not actually in use */
+/*
+ * We don't maintain the lists running through each css_set to its task
+ * until after the first call to css_task_iter_start().  This reduces the
+ * fork()/exit() overhead for people who have cgroups compiled into their
+ * kernel but not actually in use.
+ */
 static int use_task_css_set_links __read_mostly;
 
 static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +493,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
  * @new_cgrp: cgroup that's being entered by the task
  * @template: desired set of css pointers in css_set (pre-calculated)
  *
- * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
  */
 static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +584,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
 			/* Subsystem is in this hierarchy. So we want
 			 * the subsystem state from the new
 			 * cgroup */
-			template[i] = cgrp->subsys[i];
+			template[i] = cgroup_css(cgrp, ss);
 		} else {
 			/* Subsystem is not in this hierarchy, so we
 			 * don't want to change the subsystem state */
@@ -803,8 +832,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
-			       unsigned long subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static const struct inode_operations cgroup_dir_inode_operations;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -813,8 +841,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
-static int alloc_css_id(struct cgroup_subsys *ss,
-			struct cgroup *parent, struct cgroup *child);
+static int alloc_css_id(struct cgroup_subsys_state *child_css);
 
 static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 {
@@ -845,15 +872,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 static void cgroup_free_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
-	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
-	/*
-	 * Release the subsystem state objects.
-	 */
-	for_each_root_subsys(cgrp->root, ss)
-		ss->css_free(cgrp);
-
 	cgrp->root->number_of_cgroups--;
 	mutex_unlock(&cgroup_mutex);
 
@@ -864,8 +884,6 @@ static void cgroup_free_fn(struct work_struct *work)
 	 */
 	dput(cgrp->parent->dentry);
 
-	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-
 	/*
 	 * Drop the active superblock reference that we took when we
 	 * created the cgroup. This will free cgrp->root, if we are
@@ -956,27 +974,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 }
 
 /**
- * cgroup_clear_directory - selective removal of base and subsystem files
- * @dir: directory containing the files
- * @base_files: true if the base files should be removed
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
+ * @cgrp: target cgroup
  * @subsys_mask: mask of the subsystem ids whose files should be removed
  */
-static void cgroup_clear_directory(struct dentry *dir, bool base_files,
-				   unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
-	struct cgroup *cgrp = __d_cgrp(dir);
 	struct cgroup_subsys *ss;
+	int i;
 
-	for_each_root_subsys(cgrp->root, ss) {
+	for_each_subsys(ss, i) {
 		struct cftype_set *set;
-		if (!test_bit(ss->subsys_id, &subsys_mask))
+
+		if (!test_bit(i, &subsys_mask))
 			continue;
 		list_for_each_entry(set, &ss->cftsets, node)
-			cgroup_addrm_files(cgrp, NULL, set->cfts, false);
-	}
-	if (base_files) {
-		while (!list_empty(&cgrp->files))
-			cgroup_rm_file(cgrp, NULL);
+			cgroup_addrm_files(cgrp, set->cfts, false);
 	}
 }
 
@@ -986,9 +999,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
 	struct dentry *parent;
-	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
-
-	cgroup_clear_directory(dentry, true, root->subsys_mask);
 
 	parent = dentry->d_parent;
 	spin_lock(&parent->d_lock);
@@ -1009,79 +1019,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 {
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_subsys *ss;
-	int i;
+	unsigned long pinned = 0;
+	int i, ret;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
 	/* Check that any added subsystems are currently free */
 	for_each_subsys(ss, i) {
-		unsigned long bit = 1UL << i;
-
-		if (!(bit & added_mask))
+		if (!(added_mask & (1 << i)))
 			continue;
 
+		/* is the subsystem mounted elsewhere? */
 		if (ss->root != &cgroup_dummy_root) {
-			/* Subsystem isn't free */
-			return -EBUSY;
+			ret = -EBUSY;
+			goto out_put;
+		}
+
+		/* pin the module */
+		if (!try_module_get(ss->module)) {
+			ret = -ENOENT;
+			goto out_put;
 		}
+		pinned |= 1 << i;
 	}
 
-	/* Currently we don't handle adding/removing subsystems when
-	 * any child cgroups exist. This is theoretically supportable
-	 * but involves complex error handling, so it's being left until
-	 * later */
-	if (root->number_of_cgroups > 1)
-		return -EBUSY;
+	/* subsys could be missing if unloaded between parsing and here */
+	if (added_mask != pinned) {
+		ret = -ENOENT;
+		goto out_put;
+	}
+
+	ret = cgroup_populate_dir(cgrp, added_mask);
+	if (ret)
+		goto out_put;
+
+	/*
+	 * Nothing can fail from this point on.  Remove files for the
+	 * removed subsystems and rebind each subsystem.
+	 */
+	cgroup_clear_dir(cgrp, removed_mask);
 
-	/* Process each subsystem */
 	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
 
 		if (bit & added_mask) {
 			/* We're binding this subsystem to this hierarchy */
-			BUG_ON(cgrp->subsys[i]);
-			BUG_ON(!cgroup_dummy_top->subsys[i]);
-			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+			BUG_ON(cgroup_css(cgrp, ss));
+			BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
+			BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
+
+			rcu_assign_pointer(cgrp->subsys[i],
+					   cgroup_css(cgroup_dummy_top, ss));
+			cgroup_css(cgrp, ss)->cgroup = cgrp;
 
-			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
-			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
-				ss->bind(cgrp);
+				ss->bind(cgroup_css(cgrp, ss));
 
 			/* refcount was already taken, and we're keeping it */
 			root->subsys_mask |= bit;
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
-			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
-			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+			BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
+			BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
 
 			if (ss->bind)
-				ss->bind(cgroup_dummy_top);
-			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
-			cgrp->subsys[i] = NULL;
+				ss->bind(cgroup_css(cgroup_dummy_top, ss));
+
+			cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
+			RCU_INIT_POINTER(cgrp->subsys[i], NULL);
+
 			cgroup_subsys[i]->root = &cgroup_dummy_root;
 			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
 
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
 			root->subsys_mask &= ~bit;
-		} else if (bit & root->subsys_mask) {
-			/* Subsystem state should already exist */
-			BUG_ON(!cgrp->subsys[i]);
-			/*
-			 * a refcount was taken, but we already had one, so
-			 * drop the extra reference.
-			 */
-			module_put(ss->module);
-#ifdef CONFIG_MODULE_UNLOAD
-			BUG_ON(ss->module && !module_refcount(ss->module));
-#endif
-		} else {
-			/* Subsystem state shouldn't exist */
-			BUG_ON(cgrp->subsys[i]);
 		}
 	}
 
@@ -1092,6 +1107,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
 
 	return 0;
+
+out_put:
+	for_each_subsys(ss, i)
+		if (pinned & (1 << i))
+			module_put(ss->module);
+	return ret;
 }
 
 static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
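The new out_put error path replaces the module-pinning logic that previously lived in parse_cgroupfs_options() (deleted below). Reduced to its shape, the idiom is: record each successful try_module_get() in a local bitmask, and on any failure drop exactly the references that were taken. A self-contained sketch of that idiom (the helper name is hypothetical, for illustration only):

static int demo_pin_subsystems(unsigned long mask)
{
	unsigned long pinned = 0;	/* bits we actually pinned */
	struct cgroup_subsys *ss;
	int i, ret = 0;

	for_each_subsys(ss, i) {
		if (!(mask & (1 << i)))
			continue;
		if (!try_module_get(ss->module)) {
			ret = -ENOENT;
			goto out_put;
		}
		pinned |= 1 << i;
	}
	return 0;

out_put:
	for_each_subsys(ss, i)
		if (pinned & (1 << i))
			module_put(ss->module);	/* only what we took */
	return ret;
}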
@@ -1142,7 +1163,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
 	unsigned long mask = (unsigned long)-1;
-	bool module_pin_failed = false;
 	struct cgroup_subsys *ss;
 	int i;
 
@@ -1285,52 +1305,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	if (!opts->subsys_mask && !opts->name)
 		return -EINVAL;
 
-	/*
-	 * Grab references on all the modules we'll need, so the subsystems
-	 * don't dance around before rebind_subsystems attaches them. This may
-	 * take duplicate reference counts on a subsystem that's already used,
-	 * but rebind_subsystems handles this case.
-	 */
-	for_each_subsys(ss, i) {
-		if (!(opts->subsys_mask & (1UL << i)))
-			continue;
-		if (!try_module_get(cgroup_subsys[i]->module)) {
-			module_pin_failed = true;
-			break;
-		}
-	}
-	if (module_pin_failed) {
-		/*
-		 * oops, one of the modules was going away. this means that we
-		 * raced with a module_delete call, and to the user this is
-		 * essentially a "subsystem doesn't exist" case.
-		 */
-		for (i--; i >= 0; i--) {
-			/* drop refcounts only on the ones we took */
-			unsigned long bit = 1UL << i;
-
-			if (!(bit & opts->subsys_mask))
-				continue;
-			module_put(cgroup_subsys[i]->module);
-		}
-		return -ENOENT;
-	}
-
 	return 0;
 }
 
-static void drop_parsed_module_refcounts(unsigned long subsys_mask)
-{
-	struct cgroup_subsys *ss;
-	int i;
-
-	mutex_lock(&cgroup_mutex);
-	for_each_subsys(ss, i)
-		if (subsys_mask & (1UL << i))
-			module_put(cgroup_subsys[i]->module);
-	mutex_unlock(&cgroup_mutex);
-}
-
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
 	int ret = 0;
@@ -1370,22 +1347,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
-	/*
-	 * Clear out the files of subsystems that should be removed, do
-	 * this before rebind_subsystems, since rebind_subsystems may
-	 * change this hierarchy's subsys_list.
-	 */
-	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
-
-	ret = rebind_subsystems(root, added_mask, removed_mask);
-	if (ret) {
-		/* rebind_subsystems failed, re-populate the removed files */
-		cgroup_populate_dir(cgrp, false, removed_mask);
+	/* remounting is not allowed for populated hierarchies */
+	if (root->number_of_cgroups > 1) {
+		ret = -EBUSY;
 		goto out_unlock;
 	}
 
-	/* re-populate subsystem files */
-	cgroup_populate_dir(cgrp, false, added_mask);
+	ret = rebind_subsystems(root, added_mask, removed_mask);
+	if (ret)
+		goto out_unlock;
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
@@ -1395,8 +1365,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
-	if (ret)
-		drop_parsed_module_refcounts(opts.subsys_mask);
 	return ret;
 }
 
@@ -1416,6 +1384,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
+	cgrp->dummy_css.cgroup = cgrp;
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
 	simple_xattrs_init(&cgrp->xattrs);
@@ -1431,6 +1400,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	cgrp->root = root;
 	RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
 	init_cgroup_housekeeping(cgrp);
+	idr_init(&root->cgroup_idr);
 }
 
 static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1503,7 +1473,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	 */
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
-	ida_init(&root->cgroup_ida);
 	if (opts->release_agent)
 		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
@@ -1519,7 +1488,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
 		/* hierarchy ID should already have been released */
 		WARN_ON_ONCE(root->hierarchy_id);
 
-		ida_destroy(&root->cgroup_ida);
+		idr_destroy(&root->cgroup_idr);
 		kfree(root);
 	}
 }
@@ -1584,7 +1553,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	int ret = 0;
 	struct super_block *sb;
 	struct cgroupfs_root *new_root;
+	struct list_head tmp_links;
 	struct inode *inode;
+	const struct cred *cred;
 
 	/* First find the desired set of subsystems */
 	mutex_lock(&cgroup_mutex);
@@ -1600,7 +1571,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	new_root = cgroup_root_from_opts(&opts);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
-		goto drop_modules;
+		goto out_err;
 	}
 	opts.new_root = new_root;
 
@@ -1609,17 +1580,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		cgroup_free_root(opts.new_root);
-		goto drop_modules;
+		goto out_err;
 	}
 
 	root = sb->s_fs_info;
 	BUG_ON(!root);
 	if (root == opts.new_root) {
 		/* We used the new root structure, so this is a new hierarchy */
-		struct list_head tmp_links;
 		struct cgroup *root_cgrp = &root->top_cgroup;
 		struct cgroupfs_root *existing_root;
-		const struct cred *cred;
 		int i;
 		struct css_set *cset;
 
@@ -1634,6 +1603,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		mutex_lock(&cgroup_mutex);
 		mutex_lock(&cgroup_root_mutex);
 
+		root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
+					  0, 1, GFP_KERNEL);
+		if (root_cgrp->id < 0)
+			goto unlock_drop;
+
 		/* Check for name clashes with existing mounts */
 		ret = -EBUSY;
 		if (strlen(root->name))
@@ -1657,26 +1631,37 @@
 		if (ret)
 			goto unlock_drop;
 
+		sb->s_root->d_fsdata = root_cgrp;
+		root_cgrp->dentry = sb->s_root;
+
+		/*
+		 * We're inside get_sb() and will call lookup_one_len() to
+		 * create the root files, which doesn't work if SELinux is
+		 * in use.  The following cred dancing somehow works around
+		 * it.  See 2ce9738ba ("cgroupfs: use init_cred when
+		 * populating new cgroupfs mount") for more details.
+		 */
+		cred = override_creds(&init_cred);
+
+		ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+		if (ret)
+			goto rm_base_files;
+
 		ret = rebind_subsystems(root, root->subsys_mask, 0);
-		if (ret == -EBUSY) {
-			free_cgrp_cset_links(&tmp_links);
-			goto unlock_drop;
-		}
+		if (ret)
+			goto rm_base_files;
+
+		revert_creds(cred);
+
 		/*
 		 * There must be no failure case after here, since rebinding
 		 * takes care of subsystems' refcounts, which are explicitly
 		 * dropped in the failure exit path.
 		 */
 
-		/* EBUSY should be the only error here */
-		BUG_ON(ret);
-
 		list_add(&root->root_list, &cgroup_roots);
 		cgroup_root_count++;
 
-		sb->s_root->d_fsdata = root_cgrp;
-		root->top_cgroup.dentry = sb->s_root;
-
 		/* Link the top cgroup in this hierarchy into all
 		 * the css_set objects */
 		write_lock(&css_set_lock);
@@ -1689,9 +1674,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);
 
-		cred = override_creds(&init_cred);
-		cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
-		revert_creds(cred);
 		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
@@ -1711,15 +1693,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
 			}
 		}
-
-		/* no subsys rebinding, so refcounts don't change */
-		drop_parsed_module_refcounts(opts.subsys_mask);
 	}
 
 	kfree(opts.release_agent);
 	kfree(opts.name);
 	return dget(sb->s_root);
 
+ rm_base_files:
+	free_cgrp_cset_links(&tmp_links);
+	cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
+	revert_creds(cred);
  unlock_drop:
 	cgroup_exit_root_id(root);
 	mutex_unlock(&cgroup_root_mutex);
@@ -1727,8 +1710,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	mutex_unlock(&inode->i_mutex);
  drop_new_super:
 	deactivate_locked_super(sb);
- drop_modules:
-	drop_parsed_module_refcounts(opts.subsys_mask);
  out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1746,6 +1727,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	BUG_ON(root->number_of_cgroups != 1);
 	BUG_ON(!list_empty(&cgrp->children));
 
+	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
 
@@ -1778,6 +1760,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 
 	simple_xattrs_free(&cgrp->xattrs);
 
@@ -1889,7 +1872,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
 struct task_and_cgroup {
 	struct task_struct	*task;
 	struct cgroup		*cgrp;
-	struct css_set		*cg;
+	struct css_set		*cset;
 };
 
 struct cgroup_taskset {
@@ -1939,18 +1922,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
 EXPORT_SYMBOL_GPL(cgroup_taskset_next);
 
 /**
- * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
+ * cgroup_taskset_cur_css - return the matching css for the current task
  * @tset: taskset of interest
+ * @subsys_id: the ID of the target subsystem
  *
- * Return the cgroup for the current (last returned) task of @tset. This
- * function must be preceded by either cgroup_taskset_first() or
- * cgroup_taskset_next().
+ * Return the css for the current (last returned) task of @tset for
+ * subsystem specified by @subsys_id.  This function must be preceded by
+ * either cgroup_taskset_first() or cgroup_taskset_next().
  */
-struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
+struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
+						   int subsys_id)
 {
-	return tset->cur_cgrp;
+	return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
 }
-EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
 
 /**
  * cgroup_taskset_size - return the number of tasks in taskset
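With this change, the attach-path methods (can_attach, attach, cancel_attach in the hunks below) receive the css directly instead of the cgroup. A hypothetical subsystem callback under the new convention, walking the taskset with the iterators named above, might look like:

/* illustrative only: a can_attach method in the css-based convention */
static int demo_can_attach(struct cgroup_subsys_state *css,
			   struct cgroup_taskset *tset)
{
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset)) {
		if (task == current)	/* arbitrary example policy */
			return -EPERM;
	}
	return 0;
}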
@@ -2089,8 +2074,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 * step 1: check that we can legitimately attach to the cgroup.
 	 */
 	for_each_root_subsys(root, ss) {
+		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
 		if (ss->can_attach) {
-			retval = ss->can_attach(cgrp, &tset);
+			retval = ss->can_attach(css, &tset);
 			if (retval) {
 				failed_ss = ss;
 				goto out_cancel_attach;
@@ -2107,8 +2094,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 
 		tc = flex_array_get(group, i);
 		old_cset = task_css_set(tc->task);
-		tc->cg = find_css_set(old_cset, cgrp);
-		if (!tc->cg) {
+		tc->cset = find_css_set(old_cset, cgrp);
+		if (!tc->cset) {
 			retval = -ENOMEM;
 			goto out_put_css_set_refs;
 		}
@@ -2121,7 +2108,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 */
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
+		cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
 	}
 	/* nothing is sensitive to fork() after this point. */
 
@@ -2129,8 +2116,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 * step 4: do subsystem attach callbacks.
 	 */
 	for_each_root_subsys(root, ss) {
+		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
 		if (ss->attach)
-			ss->attach(cgrp, &tset);
+			ss->attach(css, &tset);
 	}
 
 	/*
@@ -2141,18 +2130,20 @@ out_put_css_set_refs:
 	if (retval) {
 		for (i = 0; i < group_size; i++) {
 			tc = flex_array_get(group, i);
-			if (!tc->cg)
+			if (!tc->cset)
 				break;
-			put_css_set(tc->cg);
+			put_css_set(tc->cset);
 		}
 	}
 out_cancel_attach:
 	if (retval) {
 		for_each_root_subsys(root, ss) {
+			struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
 			if (ss == failed_ss)
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(cgrp, &tset);
+				ss->cancel_attach(css, &tset);
 		}
 	}
 out_free_group_list:
@@ -2253,9 +2244,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
-		struct cgroup *from_cg = task_cgroup_from_root(from, root);
+		struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
 
-		retval = cgroup_attach_task(from_cg, tsk, false);
+		retval = cgroup_attach_task(from_cgrp, tsk, false);
 		if (retval)
 			break;
 	}
@@ -2265,34 +2256,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
-static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+static int cgroup_tasks_write(struct cgroup_subsys_state *css,
+			      struct cftype *cft, u64 pid)
 {
-	return attach_task_by_pid(cgrp, pid, false);
+	return attach_task_by_pid(css->cgroup, pid, false);
 }
 
-static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+static int cgroup_procs_write(struct cgroup_subsys_state *css,
+			      struct cftype *cft, u64 tgid)
 {
-	return attach_task_by_pid(cgrp, tgid, true);
+	return attach_task_by_pid(css->cgroup, tgid, true);
 }
 
-static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
-				      const char *buffer)
+static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
+				      struct cftype *cft, const char *buffer)
 {
-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+	BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
 	if (strlen(buffer) >= PATH_MAX)
 		return -EINVAL;
-	if (!cgroup_lock_live_group(cgrp))
+	if (!cgroup_lock_live_group(css->cgroup))
 		return -ENODEV;
 	mutex_lock(&cgroup_root_mutex);
-	strcpy(cgrp->root->release_agent_path, buffer);
+	strcpy(css->cgroup->root->release_agent_path, buffer);
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 
-static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *seq)
+static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
+				     struct cftype *cft, struct seq_file *seq)
 {
+	struct cgroup *cgrp = css->cgroup;
+
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 	seq_puts(seq, cgrp->root->release_agent_path);
@@ -2301,20 +2296,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
-static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *seq)
+static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
+				     struct cftype *cft, struct seq_file *seq)
 {
-	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+	seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
 	return 0;
 }
 
 /* A buffer size big enough for numbers or short strings */
 #define CGROUP_LOCAL_BUFFER_SIZE 64
 
-static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
-				struct file *file,
-				const char __user *userbuf,
-				size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
+				struct cftype *cft, struct file *file,
+				const char __user *userbuf, size_t nbytes,
+				loff_t *unused_ppos)
 {
 	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
 	int retval = 0;
@@ -2332,22 +2327,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
-		retval = cft->write_u64(cgrp, cft, val);
+		retval = cft->write_u64(css, cft, val);
 	} else {
 		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
-		retval = cft->write_s64(cgrp, cft, val);
+		retval = cft->write_s64(css, cft, val);
 	}
 	if (!retval)
 		retval = nbytes;
 	return retval;
 }
 
-static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
-				   struct file *file,
-				   const char __user *userbuf,
-				   size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
+				   struct cftype *cft, struct file *file,
+				   const char __user *userbuf, size_t nbytes,
+				   loff_t *unused_ppos)
 {
 	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
 	int retval = 0;
@@ -2370,7 +2365,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
 	}
 
 	buffer[nbytes] = 0;	/* nul-terminate */
-	retval = cft->write_string(cgrp, cft, strstrip(buffer));
+	retval = cft->write_string(css, cft, strstrip(buffer));
 	if (!retval)
 		retval = nbytes;
 out:
@@ -2380,65 +2375,60 @@ out:
 }
 
 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 				 size_t nbytes, loff_t *ppos)
 {
+	struct cfent *cfe = __d_cfe(file->f_dentry);
 	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+	struct cgroup_subsys_state *css = cfe->css;
 
-	if (cgroup_is_dead(cgrp))
-		return -ENODEV;
 	if (cft->write)
-		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
+		return cft->write(css, cft, file, buf, nbytes, ppos);
 	if (cft->write_u64 || cft->write_s64)
-		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
 	if (cft->write_string)
-		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
 	if (cft->trigger) {
-		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
+		int ret = cft->trigger(css, (unsigned int)cft->private);
 		return ret ? ret : nbytes;
 	}
 	return -EINVAL;
 }
 
-static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
-			       struct file *file,
-			       char __user *buf, size_t nbytes,
-			       loff_t *ppos)
+static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft, struct file *file,
+			       char __user *buf, size_t nbytes, loff_t *ppos)
 {
 	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-	u64 val = cft->read_u64(cgrp, cft);
+	u64 val = cft->read_u64(css, cft);
 	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
 
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
-static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
-			       struct file *file,
-			       char __user *buf, size_t nbytes,
-			       loff_t *ppos)
+static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
+			       struct cftype *cft, struct file *file,
+			       char __user *buf, size_t nbytes, loff_t *ppos)
 {
 	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-	s64 val = cft->read_s64(cgrp, cft);
+	s64 val = cft->read_s64(css, cft);
 	int len = sprintf(tmp, "%lld\n", (long long) val);
 
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 				size_t nbytes, loff_t *ppos)
 {
+	struct cfent *cfe = __d_cfe(file->f_dentry);
 	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-
-	if (cgroup_is_dead(cgrp))
-		return -ENODEV;
+	struct cgroup_subsys_state *css = cfe->css;
 
 	if (cft->read)
-		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
+		return cft->read(css, cft, file, buf, nbytes, ppos);
 	if (cft->read_u64)
-		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
 	if (cft->read_s64)
-		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
 	return -EINVAL;
 }
 
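All cftype read/write methods now take the css as their first argument, and the vfs entry points resolve it from cfe->css, which cgroup_file_open() pins below. A hypothetical handler pair in the new convention (the demo names are not part of the patch):

static u64 demo_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	/* derive the cgroup from the css only where actually needed */
	return css->cgroup->serial_nr;
}

static int demo_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			  u64 val)
{
	return val ? 0 : -EINVAL;	/* trivial example validation */
}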
@@ -2447,11 +2437,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
  * supports string->u64 maps, but can be extended in future.
  */
 
-struct cgroup_seqfile_state {
-	struct cftype *cft;
-	struct cgroup *cgroup;
-};
-
 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
 {
 	struct seq_file *sf = cb->state;
2460 2445
2461static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2446static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2462{ 2447{
2463 struct cgroup_seqfile_state *state = m->private; 2448 struct cfent *cfe = m->private;
2464 struct cftype *cft = state->cft; 2449 struct cftype *cft = cfe->type;
2450 struct cgroup_subsys_state *css = cfe->css;
2451
2465 if (cft->read_map) { 2452 if (cft->read_map) {
2466 struct cgroup_map_cb cb = { 2453 struct cgroup_map_cb cb = {
2467 .fill = cgroup_map_add, 2454 .fill = cgroup_map_add,
2468 .state = m, 2455 .state = m,
2469 }; 2456 };
2470 return cft->read_map(state->cgroup, cft, &cb); 2457 return cft->read_map(css, cft, &cb);
2471 } 2458 }
2472 return cft->read_seq_string(state->cgroup, cft, m); 2459 return cft->read_seq_string(css, cft, m);
2473}
2474
2475static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2476{
2477 struct seq_file *seq = file->private_data;
2478 kfree(seq->private);
2479 return single_release(inode, file);
2480} 2460}
2481 2461
2482static const struct file_operations cgroup_seqfile_operations = { 2462static const struct file_operations cgroup_seqfile_operations = {
2483 .read = seq_read, 2463 .read = seq_read,
2484 .write = cgroup_file_write, 2464 .write = cgroup_file_write,
2485 .llseek = seq_lseek, 2465 .llseek = seq_lseek,
2486 .release = cgroup_seqfile_release, 2466 .release = single_release,
2487}; 2467};
2488 2468
2489static int cgroup_file_open(struct inode *inode, struct file *file) 2469static int cgroup_file_open(struct inode *inode, struct file *file)
2490{ 2470{
2471 struct cfent *cfe = __d_cfe(file->f_dentry);
2472 struct cftype *cft = __d_cft(file->f_dentry);
2473 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2474 struct cgroup_subsys_state *css;
2491 int err; 2475 int err;
2492 struct cftype *cft;
2493 2476
2494 err = generic_file_open(inode, file); 2477 err = generic_file_open(inode, file);
2495 if (err) 2478 if (err)
2496 return err; 2479 return err;
2497 cft = __d_cft(file->f_dentry);
2498 2480
2499 if (cft->read_map || cft->read_seq_string) { 2481 /*
2500 struct cgroup_seqfile_state *state; 2482 * If the file belongs to a subsystem, pin the css. Will be
2483 * unpinned either on open failure or release. This ensures that
2484 * @css stays alive for all file operations.
2485 */
2486 rcu_read_lock();
2487 css = cgroup_css(cgrp, cft->ss);
2488 if (cft->ss && !css_tryget(css))
2489 css = NULL;
2490 rcu_read_unlock();
2501 2491
2502 state = kzalloc(sizeof(*state), GFP_USER); 2492 if (!css)
2503 if (!state) 2493 return -ENODEV;
2504 return -ENOMEM; 2494
2495 /*
2496 * @cfe->css is used by read/write/close to determine the
2497 * associated css. @file->private_data would be a better place but
2498 * that's already used by seqfile. Multiple accessors may use it
2499 * simultaneously which is okay as the association never changes.
2500 */
2501 WARN_ON_ONCE(cfe->css && cfe->css != css);
2502 cfe->css = css;
2505 2503
2506 state->cft = cft; 2504 if (cft->read_map || cft->read_seq_string) {
2507 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2508 file->f_op = &cgroup_seqfile_operations; 2505 file->f_op = &cgroup_seqfile_operations;
2509 err = single_open(file, cgroup_seqfile_show, state); 2506 err = single_open(file, cgroup_seqfile_show, cfe);
2510 if (err < 0) 2507 } else if (cft->open) {
2511 kfree(state);
2512 } else if (cft->open)
2513 err = cft->open(inode, file); 2508 err = cft->open(inode, file);
2514 else 2509 }
2515 err = 0;
2516 2510
2511 if (css->ss && err)
2512 css_put(css);
2517 return err; 2513 return err;
2518} 2514}
2519 2515
2520static int cgroup_file_release(struct inode *inode, struct file *file) 2516static int cgroup_file_release(struct inode *inode, struct file *file)
2521{ 2517{
2518 struct cfent *cfe = __d_cfe(file->f_dentry);
2522 struct cftype *cft = __d_cft(file->f_dentry); 2519 struct cftype *cft = __d_cft(file->f_dentry);
2520 struct cgroup_subsys_state *css = cfe->css;
2521 int ret = 0;
2522
2523 if (cft->release) 2523 if (cft->release)
2524 return cft->release(inode, file); 2524 ret = cft->release(inode, file);
2525 return 0; 2525 if (css->ss)
2526 css_put(css);
2527 return ret;
2526} 2528}
2527 2529
2528/* 2530/*
@@ -2736,8 +2738,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	return mode;
 }
 
-static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			   struct cftype *cft)
+static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
 	struct cgroup *parent = __d_cgrp(dir);
@@ -2747,8 +2748,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	umode_t mode;
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 
-	if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
-		strcpy(name, subsys->name);
+	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+		strcpy(name, cft->ss->name);
 		strcat(name, ".");
 	}
 	strcat(name, cft->name);
@@ -2782,11 +2784,25 @@ out:
 	return error;
 }
 
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			      struct cftype cfts[], bool is_add)
+/**
+ * cgroup_addrm_files - add or remove files to a cgroup directory
+ * @cgrp: the target cgroup
+ * @cfts: array of cftypes to be added
+ * @is_add: whether to add or remove
+ *
+ * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
+ * For removals, this function never fails.  If addition fails, this
+ * function doesn't remove files already added.  The caller is responsible
+ * for cleaning up.
+ */
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+			      bool is_add)
 {
 	struct cftype *cft;
-	int err, ret = 0;
+	int ret;
+
+	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
@@ -2798,16 +2814,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			continue;
 
 		if (is_add) {
-			err = cgroup_add_file(cgrp, subsys, cft);
-			if (err)
+			ret = cgroup_add_file(cgrp, cft);
+			if (ret) {
 				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
-					cft->name, err);
-			ret = err;
+					cft->name, ret);
+				return ret;
+			}
 		} else {
 			cgroup_rm_file(cgrp, cft);
 		}
 	}
-	return ret;
+	return 0;
 }
 
 static void cgroup_cfts_prepare(void)
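Note the contract change spelled out in the new doc comment: cgroup_addrm_files() now stops at the first failed addition and returns the error, leaving already-created files in place for the caller to clean up. Caller-side, that pairs an add with a removal on failure, as cgroup_mount() above does with cgroup_base_files. A sketch of the pattern (hypothetical wrapper, for illustration only):

static int demo_populate(struct cgroup *cgrp, struct cftype cfts[])
{
	int ret = cgroup_addrm_files(cgrp, cfts, true);

	if (ret)	/* roll back whatever was added before the failure */
		cgroup_addrm_files(cgrp, cfts, false);
	return ret;
}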
@@ -2816,28 +2833,30 @@ static void cgroup_cfts_prepare(void)
 	/*
 	 * Thanks to the entanglement with vfs inode locking, we can't walk
 	 * the existing cgroups under cgroup_mutex and create files.
-	 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
-	 * read lock before calling cgroup_addrm_files().
+	 * Instead, we use css_for_each_descendant_pre() and drop RCU read
+	 * lock before calling cgroup_addrm_files().
 	 */
 	mutex_lock(&cgroup_mutex);
 }
 
-static void cgroup_cfts_commit(struct cgroup_subsys *ss,
-			       struct cftype *cfts, bool is_add)
+static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
 	__releases(&cgroup_mutex)
 {
 	LIST_HEAD(pending);
-	struct cgroup *cgrp, *root = &ss->root->top_cgroup;
+	struct cgroup_subsys *ss = cfts[0].ss;
+	struct cgroup *root = &ss->root->top_cgroup;
 	struct super_block *sb = ss->root->sb;
 	struct dentry *prev = NULL;
 	struct inode *inode;
+	struct cgroup_subsys_state *css;
 	u64 update_before;
+	int ret = 0;
 
 	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
 	if (!cfts || ss->root == &cgroup_dummy_root ||
 	    !atomic_inc_not_zero(&sb->s_active)) {
 		mutex_unlock(&cgroup_mutex);
-		return;
+		return 0;
 	}
 
 	/*
@@ -2849,17 +2868,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 
 	mutex_unlock(&cgroup_mutex);
 
-	/* @root always needs to be updated */
-	inode = root->dentry->d_inode;
-	mutex_lock(&inode->i_mutex);
-	mutex_lock(&cgroup_mutex);
-	cgroup_addrm_files(root, ss, cfts, is_add);
-	mutex_unlock(&cgroup_mutex);
-	mutex_unlock(&inode->i_mutex);
-
 	/* add/rm files for all cgroups created before */
 	rcu_read_lock();
-	cgroup_for_each_descendant_pre(cgrp, root) {
+	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
+		struct cgroup *cgrp = css->cgroup;
+
 		if (cgroup_is_dead(cgrp))
 			continue;
 
@@ -2873,15 +2886,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
-			cgroup_addrm_files(cgrp, ss, cfts, is_add);
+			ret = cgroup_addrm_files(cgrp, cfts, is_add);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
 
 		rcu_read_lock();
+		if (ret)
+			break;
 	}
 	rcu_read_unlock();
 	dput(prev);
 	deactivate_super(sb);
+	return ret;
 }
 
 /**
@@ -2901,49 +2917,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2901int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2917int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2902{ 2918{
2903 struct cftype_set *set; 2919 struct cftype_set *set;
2920 struct cftype *cft;
2921 int ret;
2904 2922
2905 set = kzalloc(sizeof(*set), GFP_KERNEL); 2923 set = kzalloc(sizeof(*set), GFP_KERNEL);
2906 if (!set) 2924 if (!set)
2907 return -ENOMEM; 2925 return -ENOMEM;
2908 2926
2927 for (cft = cfts; cft->name[0] != '\0'; cft++)
2928 cft->ss = ss;
2929
2909 cgroup_cfts_prepare(); 2930 cgroup_cfts_prepare();
2910 set->cfts = cfts; 2931 set->cfts = cfts;
2911 list_add_tail(&set->node, &ss->cftsets); 2932 list_add_tail(&set->node, &ss->cftsets);
2912 cgroup_cfts_commit(ss, cfts, true); 2933 ret = cgroup_cfts_commit(cfts, true);
2913 2934 if (ret)
2914 return 0; 2935 cgroup_rm_cftypes(cfts);
2936 return ret;
2915} 2937}
2916EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2938EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2917 2939
2918/** 2940/**
2919 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2941 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2920 * @ss: target cgroup subsystem
2921 * @cfts: zero-length name terminated array of cftypes 2942 * @cfts: zero-length name terminated array of cftypes
2922 * 2943 *
2923 * Unregister @cfts from @ss. Files described by @cfts are removed from 2944 * Unregister @cfts. Files described by @cfts are removed from all
2924 * all existing cgroups to which @ss is attached and all future cgroups 2945 * existing cgroups and all future cgroups won't have them either. This
2925 * won't have them either. This function can be called anytime whether @ss 2946 * function can be called anytime whether @cfts' subsys is attached or not.
2926 * is attached or not.
2927 * 2947 *
2928 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2948 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2929 * registered with @ss. 2949 * registered.
2930 */ 2950 */
2931int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2951int cgroup_rm_cftypes(struct cftype *cfts)
2932{ 2952{
2933 struct cftype_set *set; 2953 struct cftype_set *set;
2934 2954
2955 if (!cfts || !cfts[0].ss)
2956 return -ENOENT;
2957
2935 cgroup_cfts_prepare(); 2958 cgroup_cfts_prepare();
2936 2959
2937 list_for_each_entry(set, &ss->cftsets, node) { 2960 list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2938 if (set->cfts == cfts) { 2961 if (set->cfts == cfts) {
2939 list_del(&set->node); 2962 list_del(&set->node);
2940 kfree(set); 2963 kfree(set);
2941 cgroup_cfts_commit(ss, cfts, false); 2964 cgroup_cfts_commit(cfts, false);
2942 return 0; 2965 return 0;
2943 } 2966 }
2944 } 2967 }
2945 2968
2946 cgroup_cfts_commit(ss, NULL, false); 2969 cgroup_cfts_commit(NULL, false);
2947 return -ENOENT; 2970 return -ENOENT;
2948} 2971}
2949 2972
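A usage sketch of the reworked registration API: because cgroup_add_cftypes() now stamps the owning subsystem into each entry via cft->ss, removal needs only the array. The demo_* names below are hypothetical and not part of this patch.

	/* handlers now take a css rather than a cgroup */
	static u64 demo_read_u64(struct cgroup_subsys_state *css,
				 struct cftype *cft)
	{
		return 0;
	}

	static struct cftype demo_files[] = {
		{
			.name = "demo.value",
			.read_u64 = demo_read_u64,
		},
		{ }	/* zero-length name terminates the array */
	};

	static int __init demo_register(void)
	{
		/* stamps cft->ss; rolls back via cgroup_rm_cftypes() on error */
		return cgroup_add_cftypes(&demo_subsys, demo_files);
	}

	static void demo_unregister(void)
	{
		/* note: no subsystem argument needed anymore */
		cgroup_rm_cftypes(demo_files);
	}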
@@ -2966,34 +2989,10 @@ int cgroup_task_count(const struct cgroup *cgrp)
2966} 2989}
2967 2990
2968/* 2991/*
2969 * Advance a list_head iterator. The iterator should be positioned at 2992 * To reduce the fork() overhead for systems that are not actually using
2970 * the start of a css_set 2993 * their cgroups capability, we don't maintain the lists running through
2971 */ 2994 * each css_set to its tasks until we see the list actually used - in other
2972static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) 2995 * words after the first call to css_task_iter_start().
2973{
2974 struct list_head *l = it->cset_link;
2975 struct cgrp_cset_link *link;
2976 struct css_set *cset;
2977
2978 /* Advance to the next non-empty css_set */
2979 do {
2980 l = l->next;
2981 if (l == &cgrp->cset_links) {
2982 it->cset_link = NULL;
2983 return;
2984 }
2985 link = list_entry(l, struct cgrp_cset_link, cset_link);
2986 cset = link->cset;
2987 } while (list_empty(&cset->tasks));
2988 it->cset_link = l;
2989 it->task = cset->tasks.next;
2990}
2991
2992/*
2993 * To reduce the fork() overhead for systems that are not actually
2994 * using their cgroups capability, we don't maintain the lists running
2995 * through each css_set to its tasks until we see the list actually
2996 * used - in other words after the first call to cgroup_iter_start().
2997 */ 2996 */
2998static void cgroup_enable_task_cg_lists(void) 2997static void cgroup_enable_task_cg_lists(void)
2999{ 2998{
@@ -3024,16 +3023,21 @@ static void cgroup_enable_task_cg_lists(void)
3024} 3023}
3025 3024
3026/** 3025/**
3027 * cgroup_next_sibling - find the next sibling of a given cgroup 3026 * css_next_child - find the next child of a given css
3028 * @pos: the current cgroup 3027 * @pos_css: the current position (%NULL to initiate traversal)
3028 * @parent_css: css whose children to walk
3029 * 3029 *
3030 * This function returns the next sibling of @pos and should be called 3030 * This function returns the next child of @parent_css and should be called
3031 * under RCU read lock. The only requirement is that @pos is accessible. 3031 * under RCU read lock. The only requirement is that @parent_css and
3032 * The next sibling is guaranteed to be returned regardless of @pos's 3032 * @pos_css are accessible. The next sibling is guaranteed to be returned
3033 * state. 3033 * regardless of their states.
3034 */ 3034 */
3035struct cgroup *cgroup_next_sibling(struct cgroup *pos) 3035struct cgroup_subsys_state *
3036css_next_child(struct cgroup_subsys_state *pos_css,
3037 struct cgroup_subsys_state *parent_css)
3036{ 3038{
3039 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3040 struct cgroup *cgrp = parent_css->cgroup;
3037 struct cgroup *next; 3041 struct cgroup *next;
3038 3042
3039 WARN_ON_ONCE(!rcu_read_lock_held()); 3043 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -3048,78 +3052,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3048 * safe to dereference from this RCU critical section. If 3052 * safe to dereference from this RCU critical section. If
3049 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3053 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3050 * to be visible as %true here. 3054 * to be visible as %true here.
3055 *
3056 * If @pos is dead, its next pointer can't be dereferenced;
3057 * however, as each cgroup is given a monotonically increasing
3058 * unique serial number and always appended to the sibling list,
3059 * the next one can be found by walking the parent's children until
3060 * we see a cgroup with higher serial number than @pos's. While
3061 * this path can be slower, it's taken only when either the current
3062 * cgroup is removed or iteration and removal race.
3051 */ 3063 */
3052 if (likely(!cgroup_is_dead(pos))) { 3064 if (!pos) {
3065 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3066 } else if (likely(!cgroup_is_dead(pos))) {
3053 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3067 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3054 if (&next->sibling != &pos->parent->children) 3068 } else {
3055 return next; 3069 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3056 return NULL; 3070 if (next->serial_nr > pos->serial_nr)
3071 break;
3057 } 3072 }
3058 3073
3059 /* 3074 if (&next->sibling == &cgrp->children)
3060 * Can't dereference the next pointer. Each cgroup is given a 3075 return NULL;
3061 * monotonically increasing unique serial number and always 3076
3062 * appended to the sibling list, so the next one can be found by 3077 return cgroup_css(next, parent_css->ss);
3063 * walking the parent's children until we see a cgroup with higher
3064 * serial number than @pos's.
3065 *
3066 * While this path can be slow, it's taken only when either the
3067 * current cgroup is removed or iteration and removal race.
3068 */
3069 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3070 if (next->serial_nr > pos->serial_nr)
3071 return next;
3072 return NULL;
3073} 3078}
3074EXPORT_SYMBOL_GPL(cgroup_next_sibling); 3079EXPORT_SYMBOL_GPL(css_next_child);
3075 3080
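A caller-side sketch of the new child iterator (process_child() is a hypothetical helper); the only requirement is that the RCU read lock is held across each step:

	static void demo_walk_children(struct cgroup_subsys_state *parent_css)
	{
		struct cgroup_subsys_state *child;

		rcu_read_lock();
		for (child = css_next_child(NULL, parent_css); child;
		     child = css_next_child(child, parent_css)) {
			/*
			 * The serial-number fallback keeps this walk correct
			 * even if @child is removed between iterations.
			 */
			process_child(child);
		}
		rcu_read_unlock();
	}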
3076/** 3081/**
3077 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3082 * css_next_descendant_pre - find the next descendant for pre-order walk
3078 * @pos: the current position (%NULL to initiate traversal) 3083 * @pos: the current position (%NULL to initiate traversal)
3079 * @cgroup: cgroup whose descendants to walk 3084 * @root: css whose descendants to walk
3080 * 3085 *
3081 * To be used by cgroup_for_each_descendant_pre(). Find the next 3086 * To be used by css_for_each_descendant_pre(). Find the next descendant
3082 * descendant to visit for pre-order traversal of @cgroup's descendants. 3087 * to visit for pre-order traversal of @root's descendants. @root is
3088 * included in the iteration and the first node to be visited.
3083 * 3089 *
3084 * While this function requires RCU read locking, it doesn't require the 3090 * While this function requires RCU read locking, it doesn't require the
3085 * whole traversal to be contained in a single RCU critical section. This 3091 * whole traversal to be contained in a single RCU critical section. This
3086 * function will return the correct next descendant as long as both @pos 3092 * function will return the correct next descendant as long as both @pos
3087 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3093 * and @root are accessible and @pos is a descendant of @root.
3088 */ 3094 */
3089struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3095struct cgroup_subsys_state *
3090 struct cgroup *cgroup) 3096css_next_descendant_pre(struct cgroup_subsys_state *pos,
3097 struct cgroup_subsys_state *root)
3091{ 3098{
3092 struct cgroup *next; 3099 struct cgroup_subsys_state *next;
3093 3100
3094 WARN_ON_ONCE(!rcu_read_lock_held()); 3101 WARN_ON_ONCE(!rcu_read_lock_held());
3095 3102
3096 /* if first iteration, pretend we just visited @cgroup */ 3103 /* if first iteration, visit @root */
3097 if (!pos) 3104 if (!pos)
3098 pos = cgroup; 3105 return root;
3099 3106
3100 /* visit the first child if exists */ 3107 /* visit the first child if exists */
3101 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 3108 next = css_next_child(NULL, pos);
3102 if (next) 3109 if (next)
3103 return next; 3110 return next;
3104 3111
3105 /* no child, visit my or the closest ancestor's next sibling */ 3112 /* no child, visit my or the closest ancestor's next sibling */
3106 while (pos != cgroup) { 3113 while (pos != root) {
3107 next = cgroup_next_sibling(pos); 3114 next = css_next_child(pos, css_parent(pos));
3108 if (next) 3115 if (next)
3109 return next; 3116 return next;
3110 pos = pos->parent; 3117 pos = css_parent(pos);
3111 } 3118 }
3112 3119
3113 return NULL; 3120 return NULL;
3114} 3121}
3115EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3122EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3116 3123
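The corresponding pre-order walk, with a hypothetical visit() callback; note that @root_css itself is now the first position returned:

	static void demo_walk_subtree(struct cgroup_subsys_state *root_css)
	{
		struct cgroup_subsys_state *pos;

		rcu_read_lock();
		for (pos = css_next_descendant_pre(NULL, root_css); pos;
		     pos = css_next_descendant_pre(pos, root_css)) {
			visit(pos);	/* @root_css is visited first */
		}
		rcu_read_unlock();
	}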
3117/** 3124/**
3118 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup 3125 * css_rightmost_descendant - return the rightmost descendant of a css
3119 * @pos: cgroup of interest 3126 * @pos: css of interest
3120 * 3127 *
3121 * Return the rightmost descendant of @pos. If there's no descendant, 3128 * Return the rightmost descendant of @pos. If there's no descendant, @pos
3122 * @pos is returned. This can be used during pre-order traversal to skip 3129 * is returned. This can be used during pre-order traversal to skip
3123 * subtree of @pos. 3130 * subtree of @pos.
3124 * 3131 *
3125 * While this function requires RCU read locking, it doesn't require the 3132 * While this function requires RCU read locking, it doesn't require the
@@ -3127,9 +3134,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3127 * function will return the correct rightmost descendant as long as @pos is 3134 * function will return the correct rightmost descendant as long as @pos is
3128 * accessible. 3135 * accessible.
3129 */ 3136 */
3130struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3137struct cgroup_subsys_state *
3138css_rightmost_descendant(struct cgroup_subsys_state *pos)
3131{ 3139{
3132 struct cgroup *last, *tmp; 3140 struct cgroup_subsys_state *last, *tmp;
3133 3141
3134 WARN_ON_ONCE(!rcu_read_lock_held()); 3142 WARN_ON_ONCE(!rcu_read_lock_held());
3135 3143
@@ -3137,82 +3145,138 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3137 last = pos; 3145 last = pos;
3138 /* ->prev isn't RCU safe, walk ->next till the end */ 3146 /* ->prev isn't RCU safe, walk ->next till the end */
3139 pos = NULL; 3147 pos = NULL;
3140 list_for_each_entry_rcu(tmp, &last->children, sibling) 3148 css_for_each_child(tmp, last)
3141 pos = tmp; 3149 pos = tmp;
3142 } while (pos); 3150 } while (pos);
3143 3151
3144 return last; 3152 return last;
3145} 3153}
3146EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); 3154EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3147 3155
3148static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3156static struct cgroup_subsys_state *
3157css_leftmost_descendant(struct cgroup_subsys_state *pos)
3149{ 3158{
3150 struct cgroup *last; 3159 struct cgroup_subsys_state *last;
3151 3160
3152 do { 3161 do {
3153 last = pos; 3162 last = pos;
3154 pos = list_first_or_null_rcu(&pos->children, struct cgroup, 3163 pos = css_next_child(NULL, pos);
3155 sibling);
3156 } while (pos); 3164 } while (pos);
3157 3165
3158 return last; 3166 return last;
3159} 3167}
3160 3168
3161/** 3169/**
3162 * cgroup_next_descendant_post - find the next descendant for post-order walk 3170 * css_next_descendant_post - find the next descendant for post-order walk
3163 * @pos: the current position (%NULL to initiate traversal) 3171 * @pos: the current position (%NULL to initiate traversal)
3164 * @cgroup: cgroup whose descendants to walk 3172 * @root: css whose descendants to walk
3165 * 3173 *
3166 * To be used by cgroup_for_each_descendant_post(). Find the next 3174 * To be used by css_for_each_descendant_post(). Find the next descendant
3167 * descendant to visit for post-order traversal of @cgroup's descendants. 3175 * to visit for post-order traversal of @root's descendants. @root is
3176 * included in the iteration and the last node to be visited.
3168 * 3177 *
3169 * While this function requires RCU read locking, it doesn't require the 3178 * While this function requires RCU read locking, it doesn't require the
3170 * whole traversal to be contained in a single RCU critical section. This 3179 * whole traversal to be contained in a single RCU critical section. This
3171 * function will return the correct next descendant as long as both @pos 3180 * function will return the correct next descendant as long as both @pos
3172 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3181 * and @root are accessible and @pos is a descendant of @root.
3173 */ 3182 */
3174struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3183struct cgroup_subsys_state *
3175 struct cgroup *cgroup) 3184css_next_descendant_post(struct cgroup_subsys_state *pos,
3185 struct cgroup_subsys_state *root)
3176{ 3186{
3177 struct cgroup *next; 3187 struct cgroup_subsys_state *next;
3178 3188
3179 WARN_ON_ONCE(!rcu_read_lock_held()); 3189 WARN_ON_ONCE(!rcu_read_lock_held());
3180 3190
3181 /* if first iteration, visit the leftmost descendant */ 3191 /* if first iteration, visit the leftmost descendant */
3182 if (!pos) { 3192 if (!pos) {
3183 next = cgroup_leftmost_descendant(cgroup); 3193 next = css_leftmost_descendant(root);
3184 return next != cgroup ? next : NULL; 3194 return next != root ? next : NULL;
3185 } 3195 }
3186 3196
3197 /* if we visited @root, we're done */
3198 if (pos == root)
3199 return NULL;
3200
3187 /* if there's an unvisited sibling, visit its leftmost descendant */ 3201 /* if there's an unvisited sibling, visit its leftmost descendant */
3188 next = cgroup_next_sibling(pos); 3202 next = css_next_child(pos, css_parent(pos));
3189 if (next) 3203 if (next)
3190 return cgroup_leftmost_descendant(next); 3204 return css_leftmost_descendant(next);
3191 3205
3192 /* no sibling left, visit parent */ 3206 /* no sibling left, visit parent */
3193 next = pos->parent; 3207 return css_parent(pos);
3194 return next != cgroup ? next : NULL;
3195} 3208}
3196EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); 3209EXPORT_SYMBOL_GPL(css_next_descendant_post);
3197 3210
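Post-order is the natural shape for teardown since every css is returned only after all of its descendants; a sketch with a hypothetical tear_down() helper:

	static void demo_teardown_subtree(struct cgroup_subsys_state *root_css)
	{
		struct cgroup_subsys_state *pos;

		rcu_read_lock();
		for (pos = css_next_descendant_post(NULL, root_css); pos;
		     pos = css_next_descendant_post(pos, root_css)) {
			/* children before parent; must not sleep under RCU */
			tear_down(pos);
		}
		rcu_read_unlock();
	}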
3198void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3211/**
3212 * css_advance_task_iter - advance a task iterator to the next css_set
3213 * @it: the iterator to advance
3214 *
3215 * Advance @it to the next css_set to walk.
3216 */
3217static void css_advance_task_iter(struct css_task_iter *it)
3218{
3219 struct list_head *l = it->cset_link;
3220 struct cgrp_cset_link *link;
3221 struct css_set *cset;
3222
3223 /* Advance to the next non-empty css_set */
3224 do {
3225 l = l->next;
3226 if (l == &it->origin_css->cgroup->cset_links) {
3227 it->cset_link = NULL;
3228 return;
3229 }
3230 link = list_entry(l, struct cgrp_cset_link, cset_link);
3231 cset = link->cset;
3232 } while (list_empty(&cset->tasks));
3233 it->cset_link = l;
3234 it->task = cset->tasks.next;
3235}
3236
3237/**
3238 * css_task_iter_start - initiate task iteration
3239 * @css: the css to walk tasks of
3240 * @it: the task iterator to use
3241 *
3242 * Initiate iteration through the tasks of @css. The caller can call
3243 * css_task_iter_next() to walk through the tasks until the function
3244 * returns NULL. On completion of iteration, css_task_iter_end() must be
3245 * called.
3246 *
3247 * Note that this function acquires a lock which is released when the
3248 * iteration finishes. The caller can't sleep while iteration is in
3249 * progress.
3250 */
3251void css_task_iter_start(struct cgroup_subsys_state *css,
3252 struct css_task_iter *it)
3199 __acquires(css_set_lock) 3253 __acquires(css_set_lock)
3200{ 3254{
3201 /* 3255 /*
3202 * The first time anyone tries to iterate across a cgroup, 3256 * The first time anyone tries to iterate across a css, we need to
3203 * we need to enable the list linking each css_set to its 3257 * enable the list linking each css_set to its tasks, and fix up
3204 * tasks, and fix up all existing tasks. 3258 * all existing tasks.
3205 */ 3259 */
3206 if (!use_task_css_set_links) 3260 if (!use_task_css_set_links)
3207 cgroup_enable_task_cg_lists(); 3261 cgroup_enable_task_cg_lists();
3208 3262
3209 read_lock(&css_set_lock); 3263 read_lock(&css_set_lock);
3210 it->cset_link = &cgrp->cset_links; 3264
3211 cgroup_advance_iter(cgrp, it); 3265 it->origin_css = css;
3266 it->cset_link = &css->cgroup->cset_links;
3267
3268 css_advance_task_iter(it);
3212} 3269}
3213 3270
3214struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 3271/**
3215 struct cgroup_iter *it) 3272 * css_task_iter_next - return the next task for the iterator
3273 * @it: the task iterator being iterated
3274 *
3275 * The "next" function for task iteration. @it should have been
3276 * initialized via css_task_iter_start(). Returns NULL when the iteration
3277 * reaches the end.
3278 */
3279struct task_struct *css_task_iter_next(struct css_task_iter *it)
3216{ 3280{
3217 struct task_struct *res; 3281 struct task_struct *res;
3218 struct list_head *l = it->task; 3282 struct list_head *l = it->task;
@@ -3226,16 +3290,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3226 l = l->next; 3290 l = l->next;
3227 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 3291 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3228 if (l == &link->cset->tasks) { 3292 if (l == &link->cset->tasks) {
3229 /* We reached the end of this task list - move on to 3293 /*
3230 * the next cg_cgroup_link */ 3294 * We reached the end of this task list - move on to the
3231 cgroup_advance_iter(cgrp, it); 3295 * next cgrp_cset_link.
3296 */
3297 css_advance_task_iter(it);
3232 } else { 3298 } else {
3233 it->task = l; 3299 it->task = l;
3234 } 3300 }
3235 return res; 3301 return res;
3236} 3302}
3237 3303
3238void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 3304/**
3305 * css_task_iter_end - finish task iteration
3306 * @it: the task iterator to finish
3307 *
3308 * Finish task iteration started by css_task_iter_start().
3309 */
3310void css_task_iter_end(struct css_task_iter *it)
3239 __releases(css_set_lock) 3311 __releases(css_set_lock)
3240{ 3312{
3241 read_unlock(&css_set_lock); 3313 read_unlock(&css_set_lock);
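A minimal consumer of the renamed task iterator, counting the tasks attached to a given css:

	static int demo_count_tasks(struct cgroup_subsys_state *css)
	{
		struct css_task_iter it;
		struct task_struct *task;
		int count = 0;

		css_task_iter_start(css, &it);
		while ((task = css_task_iter_next(&it)))
			count++;	/* must not sleep: css_set_lock is read-held */
		css_task_iter_end(&it);
		return count;
	}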
@@ -3276,46 +3348,49 @@ static inline int started_after(void *p1, void *p2)
3276} 3348}
3277 3349
3278/** 3350/**
3279 * cgroup_scan_tasks - iterate though all the tasks in a cgroup 3351 * css_scan_tasks - iterate though all the tasks in a css
3280 * @scan: struct cgroup_scanner containing arguments for the scan 3352 * @css: the css to iterate tasks of
3353 * @test: optional test callback
3354 * @process: process callback
3355 * @data: data passed to @test and @process
3356 * @heap: optional pre-allocated heap used for task iteration
3357 *
3358 * Iterate through all the tasks in @css, calling @test for each, and if it
3359 * returns %true, call @process for it also.
3360 *
3361 * @test may be NULL, meaning always true (select all tasks), which
3362 * effectively duplicates css_task_iter_{start,next,end}() but does not
3363 * lock css_set_lock for the call to @process.
3364 *
3365 * It is guaranteed that @process will act on every task that is a member
3366 * of @css for the duration of this call. This function may or may not
3367 * call @process for tasks that exit or move to a different css during the
3368 * call, or are forked or move into the css during the call.
3281 * 3369 *
3282 * Arguments include pointers to callback functions test_task() and 3370 * Note that @test may be called with locks held, and may in some
3283 * process_task(). 3371 * situations be called multiple times for the same task, so it should be
3284 * Iterate through all the tasks in a cgroup, calling test_task() for each, 3372 * cheap.
3285 * and if it returns true, call process_task() for it also.
3286 * The test_task pointer may be NULL, meaning always true (select all tasks).
3287 * Effectively duplicates cgroup_iter_{start,next,end}()
3288 * but does not lock css_set_lock for the call to process_task().
3289 * The struct cgroup_scanner may be embedded in any structure of the caller's
3290 * creation.
3291 * It is guaranteed that process_task() will act on every task that
3292 * is a member of the cgroup for the duration of this call. This
3293 * function may or may not call process_task() for tasks that exit
3294 * or move to a different cgroup during the call, or are forked or
3295 * move into the cgroup during the call.
3296 * 3373 *
3297 * Note that test_task() may be called with locks held, and may in some 3374 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3298 * situations be called multiple times for the same task, so it should 3375 * heap operations (and its "gt" member will be overwritten), else a
3299 * be cheap. 3376 * temporary heap will be used (allocation of which may cause this function
3300 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 3377 * to fail).
3301 * pre-allocated and will be used for heap operations (and its "gt" member will
3302 * be overwritten), else a temporary heap will be used (allocation of which
3303 * may cause this function to fail).
3304 */ 3378 */
3305int cgroup_scan_tasks(struct cgroup_scanner *scan) 3379int css_scan_tasks(struct cgroup_subsys_state *css,
3380 bool (*test)(struct task_struct *, void *),
3381 void (*process)(struct task_struct *, void *),
3382 void *data, struct ptr_heap *heap)
3306{ 3383{
3307 int retval, i; 3384 int retval, i;
3308 struct cgroup_iter it; 3385 struct css_task_iter it;
3309 struct task_struct *p, *dropped; 3386 struct task_struct *p, *dropped;
3310 /* Never dereference latest_task, since it's not refcounted */ 3387 /* Never dereference latest_task, since it's not refcounted */
3311 struct task_struct *latest_task = NULL; 3388 struct task_struct *latest_task = NULL;
3312 struct ptr_heap tmp_heap; 3389 struct ptr_heap tmp_heap;
3313 struct ptr_heap *heap;
3314 struct timespec latest_time = { 0, 0 }; 3390 struct timespec latest_time = { 0, 0 };
3315 3391
3316 if (scan->heap) { 3392 if (heap) {
3317 /* The caller supplied our heap and pre-allocated its memory */ 3393 /* The caller supplied our heap and pre-allocated its memory */
3318 heap = scan->heap;
3319 heap->gt = &started_after; 3394 heap->gt = &started_after;
3320 } else { 3395 } else {
3321 /* We need to allocate our own heap memory */ 3396 /* We need to allocate our own heap memory */
@@ -3328,25 +3403,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3328 3403
3329 again: 3404 again:
3330 /* 3405 /*
3331 * Scan tasks in the cgroup, using the scanner's "test_task" callback 3406 * Scan tasks in the css, using the @test callback to determine
3332 * to determine which are of interest, and using the scanner's 3407 * which are of interest, and invoking @process callback on the
3333 * "process_task" callback to process any of them that need an update. 3408 * ones which need an update. Since we don't want to hold any
3334 * Since we don't want to hold any locks during the task updates, 3409 * locks during the task updates, gather tasks to be processed in a
3335 * gather tasks to be processed in a heap structure. 3410 * heap structure. The heap is sorted by descending task start
3336 * The heap is sorted by descending task start time. 3411 * time. If the statically-sized heap fills up, we overflow tasks
3337 * If the statically-sized heap fills up, we overflow tasks that 3412 * that started later, and in future iterations only consider tasks
3338 * started later, and in future iterations only consider tasks that 3413 * that started after the latest task in the previous pass. This
3339 * started after the latest task in the previous pass. This
3340 * guarantees forward progress and that we don't miss any tasks. 3414 * guarantees forward progress and that we don't miss any tasks.
3341 */ 3415 */
3342 heap->size = 0; 3416 heap->size = 0;
3343 cgroup_iter_start(scan->cg, &it); 3417 css_task_iter_start(css, &it);
3344 while ((p = cgroup_iter_next(scan->cg, &it))) { 3418 while ((p = css_task_iter_next(&it))) {
3345 /* 3419 /*
3346 * Only affect tasks that qualify per the caller's callback, 3420 * Only affect tasks that qualify per the caller's callback,
3347 * if he provided one 3421 * if he provided one
3348 */ 3422 */
3349 if (scan->test_task && !scan->test_task(p, scan)) 3423 if (test && !test(p, data))
3350 continue; 3424 continue;
3351 /* 3425 /*
3352 * Only process tasks that started after the last task 3426 * Only process tasks that started after the last task
@@ -3374,7 +3448,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3374 * the heap and wasn't inserted 3448 * the heap and wasn't inserted
3375 */ 3449 */
3376 } 3450 }
3377 cgroup_iter_end(scan->cg, &it); 3451 css_task_iter_end(&it);
3378 3452
3379 if (heap->size) { 3453 if (heap->size) {
3380 for (i = 0; i < heap->size; i++) { 3454 for (i = 0; i < heap->size; i++) {
@@ -3384,7 +3458,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3384 latest_task = q; 3458 latest_task = q;
3385 } 3459 }
3386 /* Process the task per the caller's callback */ 3460 /* Process the task per the caller's callback */
3387 scan->process_task(q, scan); 3461 process(q, data);
3388 put_task_struct(q); 3462 put_task_struct(q);
3389 } 3463 }
3390 /* 3464 /*
@@ -3401,10 +3475,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3401 return 0; 3475 return 0;
3402} 3476}
3403 3477
3404static void cgroup_transfer_one_task(struct task_struct *task, 3478static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3405 struct cgroup_scanner *scan)
3406{ 3479{
3407 struct cgroup *new_cgroup = scan->data; 3480 struct cgroup *new_cgroup = data;
3408 3481
3409 mutex_lock(&cgroup_mutex); 3482 mutex_lock(&cgroup_mutex);
3410 cgroup_attach_task(new_cgroup, task, false); 3483 cgroup_attach_task(new_cgroup, task, false);
@@ -3418,15 +3491,8 @@ static void cgroup_transfer_one_task(struct task_struct *task,
3418 */ 3491 */
3419int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 3492int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3420{ 3493{
3421 struct cgroup_scanner scan; 3494 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3422 3495 to, NULL);
3423 scan.cg = from;
3424 scan.test_task = NULL; /* select all tasks in cgroup */
3425 scan.process_task = cgroup_transfer_one_task;
3426 scan.heap = NULL;
3427 scan.data = to;
3428
3429 return cgroup_scan_tasks(&scan);
3430} 3496}
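cgroup_transfer_tasks() above passes a NULL @test and so selects every task; a sketch of a selective scan (the demo_* callbacks are hypothetical):

	static bool demo_test(struct task_struct *task, void *data)
	{
		/* cheap check only; may run with locks held and more than once */
		return task_nice(task) > 0;
	}

	static void demo_process(struct task_struct *task, void *data)
	{
		/* runs without css_set_lock, so heavier work is allowed here */
	}

	static int demo_scan(struct cgroup_subsys_state *css)
	{
		/* NULL heap: css_scan_tasks() allocates a temporary one */
		return css_scan_tasks(css, demo_test, demo_process, NULL, NULL);
	}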
3431 3497
3432/* 3498/*
@@ -3468,7 +3534,7 @@ struct cgroup_pidlist {
3468 /* pointer to the cgroup we belong to, for list removal purposes */ 3534 /* pointer to the cgroup we belong to, for list removal purposes */
3469 struct cgroup *owner; 3535 struct cgroup *owner;
3470 /* protects the other fields */ 3536 /* protects the other fields */
3471 struct rw_semaphore mutex; 3537 struct rw_semaphore rwsem;
3472}; 3538};
3473 3539
3474/* 3540/*
@@ -3541,7 +3607,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3541 struct pid_namespace *ns = task_active_pid_ns(current); 3607 struct pid_namespace *ns = task_active_pid_ns(current);
3542 3608
3543 /* 3609 /*
3544 * We can't drop the pidlist_mutex before taking the l->mutex in case 3610 * We can't drop the pidlist_mutex before taking the l->rwsem in case
3545 * the last ref-holder is trying to remove l from the list at the same 3611 * the last ref-holder is trying to remove l from the list at the same
3546 * time. Holding the pidlist_mutex precludes somebody taking whichever 3612 * time. Holding the pidlist_mutex precludes somebody taking whichever
3547 * list we find out from under us - compare release_pid_array(). 3613 * list we find out from under us - compare release_pid_array().
@@ -3550,7 +3616,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3550 list_for_each_entry(l, &cgrp->pidlists, links) { 3616 list_for_each_entry(l, &cgrp->pidlists, links) {
3551 if (l->key.type == type && l->key.ns == ns) { 3617 if (l->key.type == type && l->key.ns == ns) {
3552 /* make sure l doesn't vanish out from under us */ 3618 /* make sure l doesn't vanish out from under us */
3553 down_write(&l->mutex); 3619 down_write(&l->rwsem);
3554 mutex_unlock(&cgrp->pidlist_mutex); 3620 mutex_unlock(&cgrp->pidlist_mutex);
3555 return l; 3621 return l;
3556 } 3622 }
@@ -3561,8 +3627,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3561 mutex_unlock(&cgrp->pidlist_mutex); 3627 mutex_unlock(&cgrp->pidlist_mutex);
3562 return l; 3628 return l;
3563 } 3629 }
3564 init_rwsem(&l->mutex); 3630 init_rwsem(&l->rwsem);
3565 down_write(&l->mutex); 3631 down_write(&l->rwsem);
3566 l->key.type = type; 3632 l->key.type = type;
3567 l->key.ns = get_pid_ns(ns); 3633 l->key.ns = get_pid_ns(ns);
3568 l->owner = cgrp; 3634 l->owner = cgrp;
@@ -3580,7 +3646,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3580 pid_t *array; 3646 pid_t *array;
3581 int length; 3647 int length;
3582 int pid, n = 0; /* used for populating the array */ 3648 int pid, n = 0; /* used for populating the array */
3583 struct cgroup_iter it; 3649 struct css_task_iter it;
3584 struct task_struct *tsk; 3650 struct task_struct *tsk;
3585 struct cgroup_pidlist *l; 3651 struct cgroup_pidlist *l;
3586 3652
@@ -3595,8 +3661,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3595 if (!array) 3661 if (!array)
3596 return -ENOMEM; 3662 return -ENOMEM;
3597 /* now, populate the array */ 3663 /* now, populate the array */
3598 cgroup_iter_start(cgrp, &it); 3664 css_task_iter_start(&cgrp->dummy_css, &it);
3599 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3665 while ((tsk = css_task_iter_next(&it))) {
3600 if (unlikely(n == length)) 3666 if (unlikely(n == length))
3601 break; 3667 break;
3602 /* get tgid or pid for procs or tasks file respectively */ 3668 /* get tgid or pid for procs or tasks file respectively */
@@ -3607,7 +3673,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3607 if (pid > 0) /* make sure to only use valid results */ 3673 if (pid > 0) /* make sure to only use valid results */
3608 array[n++] = pid; 3674 array[n++] = pid;
3609 } 3675 }
3610 cgroup_iter_end(cgrp, &it); 3676 css_task_iter_end(&it);
3611 length = n; 3677 length = n;
3612 /* now sort & (if procs) strip out duplicates */ 3678 /* now sort & (if procs) strip out duplicates */
3613 sort(array, length, sizeof(pid_t), cmppid, NULL); 3679 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3623,7 +3689,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3623 l->list = array; 3689 l->list = array;
3624 l->length = length; 3690 l->length = length;
3625 l->use_count++; 3691 l->use_count++;
3626 up_write(&l->mutex); 3692 up_write(&l->rwsem);
3627 *lp = l; 3693 *lp = l;
3628 return 0; 3694 return 0;
3629} 3695}
@@ -3641,7 +3707,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3641{ 3707{
3642 int ret = -EINVAL; 3708 int ret = -EINVAL;
3643 struct cgroup *cgrp; 3709 struct cgroup *cgrp;
3644 struct cgroup_iter it; 3710 struct css_task_iter it;
3645 struct task_struct *tsk; 3711 struct task_struct *tsk;
3646 3712
3647 /* 3713 /*
@@ -3655,8 +3721,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3655 ret = 0; 3721 ret = 0;
3656 cgrp = dentry->d_fsdata; 3722 cgrp = dentry->d_fsdata;
3657 3723
3658 cgroup_iter_start(cgrp, &it); 3724 css_task_iter_start(&cgrp->dummy_css, &it);
3659 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3725 while ((tsk = css_task_iter_next(&it))) {
3660 switch (tsk->state) { 3726 switch (tsk->state) {
3661 case TASK_RUNNING: 3727 case TASK_RUNNING:
3662 stats->nr_running++; 3728 stats->nr_running++;
@@ -3676,7 +3742,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3676 break; 3742 break;
3677 } 3743 }
3678 } 3744 }
3679 cgroup_iter_end(cgrp, &it); 3745 css_task_iter_end(&it);
3680 3746
3681err: 3747err:
3682 return ret; 3748 return ret;
@@ -3701,7 +3767,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3701 int index = 0, pid = *pos; 3767 int index = 0, pid = *pos;
3702 int *iter; 3768 int *iter;
3703 3769
3704 down_read(&l->mutex); 3770 down_read(&l->rwsem);
3705 if (pid) { 3771 if (pid) {
3706 int end = l->length; 3772 int end = l->length;
3707 3773
@@ -3728,7 +3794,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3728static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3794static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3729{ 3795{
3730 struct cgroup_pidlist *l = s->private; 3796 struct cgroup_pidlist *l = s->private;
3731 up_read(&l->mutex); 3797 up_read(&l->rwsem);
3732} 3798}
3733 3799
3734static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3800static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3774,7 +3840,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3774 * pidlist_mutex, we have to take pidlist_mutex first. 3840 * pidlist_mutex, we have to take pidlist_mutex first.
3775 */ 3841 */
3776 mutex_lock(&l->owner->pidlist_mutex); 3842 mutex_lock(&l->owner->pidlist_mutex);
3777 down_write(&l->mutex); 3843 down_write(&l->rwsem);
3778 BUG_ON(!l->use_count); 3844 BUG_ON(!l->use_count);
3779 if (!--l->use_count) { 3845 if (!--l->use_count) {
3780 /* we're the last user if refcount is 0; remove and free */ 3846 /* we're the last user if refcount is 0; remove and free */
@@ -3782,12 +3848,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3782 mutex_unlock(&l->owner->pidlist_mutex); 3848 mutex_unlock(&l->owner->pidlist_mutex);
3783 pidlist_free(l->list); 3849 pidlist_free(l->list);
3784 put_pid_ns(l->key.ns); 3850 put_pid_ns(l->key.ns);
3785 up_write(&l->mutex); 3851 up_write(&l->rwsem);
3786 kfree(l); 3852 kfree(l);
3787 return; 3853 return;
3788 } 3854 }
3789 mutex_unlock(&l->owner->pidlist_mutex); 3855 mutex_unlock(&l->owner->pidlist_mutex);
3790 up_write(&l->mutex); 3856 up_write(&l->rwsem);
3791} 3857}
3792 3858
3793static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3859static int cgroup_pidlist_release(struct inode *inode, struct file *file)
@@ -3851,21 +3917,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file)
3851 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3917 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3852} 3918}
3853 3919
3854static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3920static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3855 struct cftype *cft) 3921 struct cftype *cft)
3856{ 3922{
3857 return notify_on_release(cgrp); 3923 return notify_on_release(css->cgroup);
3858} 3924}
3859 3925
3860static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3926static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 struct cftype *cft, 3927 struct cftype *cft, u64 val)
3862 u64 val)
3863{ 3928{
3864 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3929 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3865 if (val) 3930 if (val)
3866 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3931 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3867 else 3932 else
3868 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3933 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3869 return 0; 3934 return 0;
3870} 3935}
3871 3936
@@ -3895,18 +3960,18 @@ static void cgroup_event_remove(struct work_struct *work)
3895{ 3960{
3896 struct cgroup_event *event = container_of(work, struct cgroup_event, 3961 struct cgroup_event *event = container_of(work, struct cgroup_event,
3897 remove); 3962 remove);
3898 struct cgroup *cgrp = event->cgrp; 3963 struct cgroup_subsys_state *css = event->css;
3899 3964
3900 remove_wait_queue(event->wqh, &event->wait); 3965 remove_wait_queue(event->wqh, &event->wait);
3901 3966
3902 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3967 event->cft->unregister_event(css, event->cft, event->eventfd);
3903 3968
3904 /* Notify userspace the event is going away. */ 3969 /* Notify userspace the event is going away. */
3905 eventfd_signal(event->eventfd, 1); 3970 eventfd_signal(event->eventfd, 1);
3906 3971
3907 eventfd_ctx_put(event->eventfd); 3972 eventfd_ctx_put(event->eventfd);
3908 kfree(event); 3973 kfree(event);
3909 cgroup_dput(cgrp); 3974 css_put(css);
3910} 3975}
3911 3976
3912/* 3977/*
@@ -3919,7 +3984,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3919{ 3984{
3920 struct cgroup_event *event = container_of(wait, 3985 struct cgroup_event *event = container_of(wait,
3921 struct cgroup_event, wait); 3986 struct cgroup_event, wait);
3922 struct cgroup *cgrp = event->cgrp; 3987 struct cgroup *cgrp = event->css->cgroup;
3923 unsigned long flags = (unsigned long)key; 3988 unsigned long flags = (unsigned long)key;
3924 3989
3925 if (flags & POLLHUP) { 3990 if (flags & POLLHUP) {
@@ -3963,14 +4028,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
3963 * Input must be in format '<event_fd> <control_fd> <args>'. 4028 * Input must be in format '<event_fd> <control_fd> <args>'.
3964 * Interpretation of args is defined by control file implementation. 4029 * Interpretation of args is defined by control file implementation.
3965 */ 4030 */
3966static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 4031static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3967 const char *buffer) 4032 struct cftype *cft, const char *buffer)
3968{ 4033{
3969 struct cgroup_event *event = NULL; 4034 struct cgroup *cgrp = dummy_css->cgroup;
3970 struct cgroup *cgrp_cfile; 4035 struct cgroup_event *event;
4036 struct cgroup_subsys_state *cfile_css;
3971 unsigned int efd, cfd; 4037 unsigned int efd, cfd;
3972 struct file *efile = NULL; 4038 struct fd efile;
3973 struct file *cfile = NULL; 4039 struct fd cfile;
3974 char *endp; 4040 char *endp;
3975 int ret; 4041 int ret;
3976 4042
@@ -3987,109 +4053,113 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3987 event = kzalloc(sizeof(*event), GFP_KERNEL); 4053 event = kzalloc(sizeof(*event), GFP_KERNEL);
3988 if (!event) 4054 if (!event)
3989 return -ENOMEM; 4055 return -ENOMEM;
3990 event->cgrp = cgrp; 4056
3991 INIT_LIST_HEAD(&event->list); 4057 INIT_LIST_HEAD(&event->list);
3992 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 4058 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3993 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 4059 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3994 INIT_WORK(&event->remove, cgroup_event_remove); 4060 INIT_WORK(&event->remove, cgroup_event_remove);
3995 4061
3996 efile = eventfd_fget(efd); 4062 efile = fdget(efd);
3997 if (IS_ERR(efile)) { 4063 if (!efile.file) {
3998 ret = PTR_ERR(efile); 4064 ret = -EBADF;
3999 goto fail; 4065 goto out_kfree;
4000 } 4066 }
4001 4067
4002 event->eventfd = eventfd_ctx_fileget(efile); 4068 event->eventfd = eventfd_ctx_fileget(efile.file);
4003 if (IS_ERR(event->eventfd)) { 4069 if (IS_ERR(event->eventfd)) {
4004 ret = PTR_ERR(event->eventfd); 4070 ret = PTR_ERR(event->eventfd);
4005 goto fail; 4071 goto out_put_efile;
4006 } 4072 }
4007 4073
4008 cfile = fget(cfd); 4074 cfile = fdget(cfd);
4009 if (!cfile) { 4075 if (!cfile.file) {
4010 ret = -EBADF; 4076 ret = -EBADF;
4011 goto fail; 4077 goto out_put_eventfd;
4012 } 4078 }
4013 4079
4014 /* the process needs read permission on control file */ 4080 /* the process needs read permission on control file */
4015 /* AV: shouldn't we check that it's been opened for read instead? */ 4081 /* AV: shouldn't we check that it's been opened for read instead? */
4016 ret = inode_permission(file_inode(cfile), MAY_READ); 4082 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4017 if (ret < 0) 4083 if (ret < 0)
4018 goto fail; 4084 goto out_put_cfile;
4019 4085
4020 event->cft = __file_cft(cfile); 4086 event->cft = __file_cft(cfile.file);
4021 if (IS_ERR(event->cft)) { 4087 if (IS_ERR(event->cft)) {
4022 ret = PTR_ERR(event->cft); 4088 ret = PTR_ERR(event->cft);
4023 goto fail; 4089 goto out_put_cfile;
4090 }
4091
4092 if (!event->cft->ss) {
4093 ret = -EBADF;
4094 goto out_put_cfile;
4024 } 4095 }
4025 4096
4026 /* 4097 /*
4027 * The file to be monitored must be in the same cgroup as 4098 * Determine the css of @cfile, verify it belongs to the same
4028 * cgroup.event_control is. 4099 * cgroup as cgroup.event_control, and associate @event with it.
4100 * Remaining events are automatically removed on cgroup destruction
4101 * but the removal is asynchronous, so take an extra ref.
4029 */ 4102 */
4030 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); 4103 rcu_read_lock();
4031 if (cgrp_cfile != cgrp) { 4104
4032 ret = -EINVAL; 4105 ret = -EINVAL;
4033 goto fail; 4106 event->css = cgroup_css(cgrp, event->cft->ss);
4034 } 4107 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4108 if (event->css && event->css == cfile_css && css_tryget(event->css))
4109 ret = 0;
4110
4111 rcu_read_unlock();
4112 if (ret)
4113 goto out_put_cfile;
4035 4114
4036 if (!event->cft->register_event || !event->cft->unregister_event) { 4115 if (!event->cft->register_event || !event->cft->unregister_event) {
4037 ret = -EINVAL; 4116 ret = -EINVAL;
4038 goto fail; 4117 goto out_put_css;
4039 } 4118 }
4040 4119
4041 ret = event->cft->register_event(cgrp, event->cft, 4120 ret = event->cft->register_event(event->css, event->cft,
4042 event->eventfd, buffer); 4121 event->eventfd, buffer);
4043 if (ret) 4122 if (ret)
4044 goto fail; 4123 goto out_put_css;
4045 4124
4046 efile->f_op->poll(efile, &event->pt); 4125 efile.file->f_op->poll(efile.file, &event->pt);
4047
4048 /*
4049 * Events should be removed after rmdir of cgroup directory, but before
4050 * destroying subsystem state objects. Let's take reference to cgroup
4051 * directory dentry to do that.
4052 */
4053 dget(cgrp->dentry);
4054 4126
4055 spin_lock(&cgrp->event_list_lock); 4127 spin_lock(&cgrp->event_list_lock);
4056 list_add(&event->list, &cgrp->event_list); 4128 list_add(&event->list, &cgrp->event_list);
4057 spin_unlock(&cgrp->event_list_lock); 4129 spin_unlock(&cgrp->event_list_lock);
4058 4130
4059 fput(cfile); 4131 fdput(cfile);
4060 fput(efile); 4132 fdput(efile);
4061 4133
4062 return 0; 4134 return 0;
4063 4135
4064fail: 4136out_put_css:
4065 if (cfile) 4137 css_put(event->css);
4066 fput(cfile); 4138out_put_cfile:
4067 4139 fdput(cfile);
4068 if (event && event->eventfd && !IS_ERR(event->eventfd)) 4140out_put_eventfd:
4069 eventfd_ctx_put(event->eventfd); 4141 eventfd_ctx_put(event->eventfd);
4070 4142out_put_efile:
4071 if (!IS_ERR_OR_NULL(efile)) 4143 fdput(efile);
4072 fput(efile); 4144out_kfree:
4073
4074 kfree(event); 4145 kfree(event);
4075 4146
4076 return ret; 4147 return ret;
4077} 4148}
4078 4149
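Seen from userspace, the "<event_fd> <control_fd> <args>" protocol is unchanged by this rework. A rough sketch against the memcg threshold file, the canonical user of register_event/unregister_event; the paths are illustrative and error handling is omitted:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		uint64_t ticks;
		char buf[64];
		int efd = eventfd(0, 0);
		int cfd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes",
			       O_RDONLY);
		int ctl = open("/sys/fs/cgroup/memory/demo/cgroup.event_control",
			       O_WRONLY);

		/* "<event_fd> <control_fd> <args>"; args is a byte threshold here */
		snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd,
			 4ULL * 1024 * 1024);
		write(ctl, buf, strlen(buf));

		read(efd, &ticks, sizeof(ticks));	/* blocks until threshold crossed */
		return 0;
	}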
4079static u64 cgroup_clone_children_read(struct cgroup *cgrp, 4150static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4080 struct cftype *cft) 4151 struct cftype *cft)
4081{ 4152{
4082 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4153 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4083} 4154}
4084 4155
4085static int cgroup_clone_children_write(struct cgroup *cgrp, 4156static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4086 struct cftype *cft, 4157 struct cftype *cft, u64 val)
4087 u64 val)
4088{ 4158{
4089 if (val) 4159 if (val)
4090 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4160 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4091 else 4161 else
4092 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4162 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4093 return 0; 4163 return 0;
4094} 4164}
4095 4165
@@ -4148,36 +4218,34 @@ static struct cftype cgroup_base_files[] = {
4148}; 4218};
4149 4219
4150/** 4220/**
4151 * cgroup_populate_dir - selectively creation of files in a directory 4221 * cgroup_populate_dir - create subsys files in a cgroup directory
4152 * @cgrp: target cgroup 4222 * @cgrp: target cgroup
4153 * @base_files: true if the base files should be added
4154 * @subsys_mask: mask of the subsystem ids whose files should be added 4223 * @subsys_mask: mask of the subsystem ids whose files should be added
4224 *
4225 * On failure, no file is added.
4155 */ 4226 */
4156static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 4227static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4157 unsigned long subsys_mask)
4158{ 4228{
4159 int err;
4160 struct cgroup_subsys *ss; 4229 struct cgroup_subsys *ss;
4161 4230 int i, ret = 0;
4162 if (base_files) {
4163 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4164 if (err < 0)
4165 return err;
4166 }
4167 4231
4168 /* process cftsets of each subsystem */ 4232 /* process cftsets of each subsystem */
4169 for_each_root_subsys(cgrp->root, ss) { 4233 for_each_subsys(ss, i) {
4170 struct cftype_set *set; 4234 struct cftype_set *set;
4171 if (!test_bit(ss->subsys_id, &subsys_mask)) 4235
4236 if (!test_bit(i, &subsys_mask))
4172 continue; 4237 continue;
4173 4238
4174 list_for_each_entry(set, &ss->cftsets, node) 4239 list_for_each_entry(set, &ss->cftsets, node) {
4175 cgroup_addrm_files(cgrp, ss, set->cfts, true); 4240 ret = cgroup_addrm_files(cgrp, set->cfts, true);
4241 if (ret < 0)
4242 goto err;
4243 }
4176 } 4244 }
4177 4245
4178 /* This cgroup is ready now */ 4246 /* This cgroup is ready now */
4179 for_each_root_subsys(cgrp->root, ss) { 4247 for_each_root_subsys(cgrp->root, ss) {
4180 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4248 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4181 struct css_id *id = rcu_dereference_protected(css->id, true); 4249 struct css_id *id = rcu_dereference_protected(css->id, true);
4182 4250
4183 /* 4251 /*
@@ -4190,14 +4258,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4190 } 4258 }
4191 4259
4192 return 0; 4260 return 0;
4261err:
4262 cgroup_clear_dir(cgrp, subsys_mask);
4263 return ret;
4264}
4265
4266/*
4267 * css destruction is four-stage process.
4268 *
4269 * 1. Destruction starts. Killing of the percpu_ref is initiated.
4270 * Implemented in kill_css().
4271 *
4272 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4273 * and thus css_tryget() is guaranteed to fail, the css can be offlined
4274 * by invoking offline_css(). After offlining, the base ref is put.
4275 * Implemented in css_killed_work_fn().
4276 *
4277 * 3. When the percpu_ref reaches zero, the only possible remaining
4278 * accessors are inside RCU read sections. css_release() schedules the
4279 * RCU callback.
4280 *
4281 * 4. After the grace period, the css can be freed. Implemented in
4282 * css_free_work_fn().
4283 *
4284 * It is actually hairier because both steps 2 and 4 require process
4285 * context and thus involve punting to css->destroy_work, adding two
4286 * additional steps to the already complex sequence.
4287 */
4288static void css_free_work_fn(struct work_struct *work)
4289{
4290 struct cgroup_subsys_state *css =
4291 container_of(work, struct cgroup_subsys_state, destroy_work);
4292 struct cgroup *cgrp = css->cgroup;
4293
4294 if (css->parent)
4295 css_put(css->parent);
4296
4297 css->ss->css_free(css);
4298 cgroup_dput(cgrp);
4193} 4299}
4194 4300
4195static void css_dput_fn(struct work_struct *work) 4301static void css_free_rcu_fn(struct rcu_head *rcu_head)
4196{ 4302{
4197 struct cgroup_subsys_state *css = 4303 struct cgroup_subsys_state *css =
4198 container_of(work, struct cgroup_subsys_state, dput_work); 4304 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4199 4305
4200 cgroup_dput(css->cgroup); 4306 /*
4307 * css holds an extra ref to @cgrp->dentry which is put on the last
4308 * css_put(). dput() requires process context which we don't have.
4309 */
4310 INIT_WORK(&css->destroy_work, css_free_work_fn);
4311 schedule_work(&css->destroy_work);
4201} 4312}
4202 4313
4203static void css_release(struct percpu_ref *ref) 4314static void css_release(struct percpu_ref *ref)
@@ -4205,49 +4316,47 @@ static void css_release(struct percpu_ref *ref)
4205 struct cgroup_subsys_state *css = 4316 struct cgroup_subsys_state *css =
4206 container_of(ref, struct cgroup_subsys_state, refcnt); 4317 container_of(ref, struct cgroup_subsys_state, refcnt);
4207 4318
4208 schedule_work(&css->dput_work); 4319 call_rcu(&css->rcu_head, css_free_rcu_fn);
4209} 4320}
4210 4321
4211static void init_cgroup_css(struct cgroup_subsys_state *css, 4322static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4212 struct cgroup_subsys *ss, 4323 struct cgroup *cgrp)
4213 struct cgroup *cgrp)
4214{ 4324{
4215 css->cgroup = cgrp; 4325 css->cgroup = cgrp;
4326 css->ss = ss;
4216 css->flags = 0; 4327 css->flags = 0;
4217 css->id = NULL; 4328 css->id = NULL;
4218 if (cgrp == cgroup_dummy_top) 4329
4330 if (cgrp->parent)
4331 css->parent = cgroup_css(cgrp->parent, ss);
4332 else
4219 css->flags |= CSS_ROOT; 4333 css->flags |= CSS_ROOT;
4220 BUG_ON(cgrp->subsys[ss->subsys_id]);
4221 cgrp->subsys[ss->subsys_id] = css;
4222 4334
4223 /* 4335 BUG_ON(cgroup_css(cgrp, ss));
4224 * css holds an extra ref to @cgrp->dentry which is put on the last
4225 * css_put(). dput() requires process context, which css_put() may
4226 * be called without. @css->dput_work will be used to invoke
4227 * dput() asynchronously from css_put().
4228 */
4229 INIT_WORK(&css->dput_work, css_dput_fn);
4230} 4336}
4231 4337
4232/* invoke ->post_create() on a new CSS and mark it online if successful */ 4338/* invoke ->css_online() on a new CSS and mark it online if successful */
4233static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4339static int online_css(struct cgroup_subsys_state *css)
4234{ 4340{
4341 struct cgroup_subsys *ss = css->ss;
4235 int ret = 0; 4342 int ret = 0;
4236 4343
4237 lockdep_assert_held(&cgroup_mutex); 4344 lockdep_assert_held(&cgroup_mutex);
4238 4345
4239 if (ss->css_online) 4346 if (ss->css_online)
4240 ret = ss->css_online(cgrp); 4347 ret = ss->css_online(css);
4241 if (!ret) 4348 if (!ret) {
4242 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 4349 css->flags |= CSS_ONLINE;
4350 css->cgroup->nr_css++;
4351 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4352 }
4243 return ret; 4353 return ret;
4244} 4354}
4245 4355
4246/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ 4356/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4247static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4357static void offline_css(struct cgroup_subsys_state *css)
4248 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4249{ 4358{
4250 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4359 struct cgroup_subsys *ss = css->ss;
4251 4360
4252 lockdep_assert_held(&cgroup_mutex); 4361 lockdep_assert_held(&cgroup_mutex);
4253 4362
@@ -4255,9 +4364,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4255 return; 4364 return;
4256 4365
4257 if (ss->css_offline) 4366 if (ss->css_offline)
4258 ss->css_offline(cgrp); 4367 ss->css_offline(css);
4259 4368
4260 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4369 css->flags &= ~CSS_ONLINE;
4370 css->cgroup->nr_css--;
4371 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4261} 4372}
4262 4373
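For contrast, a controller adapted to the css-based signatures would now look roughly like this; struct demo_css and the demo_* functions are illustrative only:

	struct demo_css {
		struct cgroup_subsys_state css;
		/* controller-private state */
	};

	static struct cgroup_subsys_state *
	demo_css_alloc(struct cgroup_subsys_state *parent_css)
	{
		struct demo_css *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

		return dc ? &dc->css : ERR_PTR(-ENOMEM);
	}

	static int demo_css_online(struct cgroup_subsys_state *css)
	{
		/* css->cgroup, css->ss and css->parent were set by init_css() */
		return 0;
	}

	static void demo_css_free(struct cgroup_subsys_state *css)
	{
		kfree(container_of(css, struct demo_css, css));
	}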
4263/* 4374/*
@@ -4271,6 +4382,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4271static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4382static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4272 umode_t mode) 4383 umode_t mode)
4273{ 4384{
4385 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4274 struct cgroup *cgrp; 4386 struct cgroup *cgrp;
4275 struct cgroup_name *name; 4387 struct cgroup_name *name;
4276 struct cgroupfs_root *root = parent->root; 4388 struct cgroupfs_root *root = parent->root;
@@ -4288,7 +4400,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4288 goto err_free_cgrp; 4400 goto err_free_cgrp;
4289 rcu_assign_pointer(cgrp->name, name); 4401 rcu_assign_pointer(cgrp->name, name);
4290 4402
4291 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4403 /*
4404 * Temporarily set the pointer to NULL, so idr_find() won't return
4405 * a half-baked cgroup.
4406 */
4407 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4292 if (cgrp->id < 0) 4408 if (cgrp->id < 0)
4293 goto err_free_name; 4409 goto err_free_name;
4294 4410
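The NULL placeholder matters to lookup paths that can race with creation; a reader-side sketch:

	struct cgroup *cgrp;

	rcu_read_lock();
	cgrp = idr_find(&root->cgroup_idr, id);
	if (cgrp) {
		/* safe to inspect @cgrp while the RCU read lock is held */
	}
	/* NULL covers both unallocated ids and half-created groups */
	rcu_read_unlock();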
@@ -4317,6 +4433,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4317 cgrp->dentry = dentry; 4433 cgrp->dentry = dentry;
4318 4434
4319 cgrp->parent = parent; 4435 cgrp->parent = parent;
4436 cgrp->dummy_css.parent = &parent->dummy_css;
4320 cgrp->root = parent->root; 4437 cgrp->root = parent->root;
4321 4438
4322 if (notify_on_release(parent)) 4439 if (notify_on_release(parent))
@@ -4328,22 +4445,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4328 for_each_root_subsys(root, ss) { 4445 for_each_root_subsys(root, ss) {
4329 struct cgroup_subsys_state *css; 4446 struct cgroup_subsys_state *css;
4330 4447
4331 css = ss->css_alloc(cgrp); 4448 css = ss->css_alloc(cgroup_css(parent, ss));
4332 if (IS_ERR(css)) { 4449 if (IS_ERR(css)) {
4333 err = PTR_ERR(css); 4450 err = PTR_ERR(css);
4334 goto err_free_all; 4451 goto err_free_all;
4335 } 4452 }
4453 css_ar[ss->subsys_id] = css;
4336 4454
4337 err = percpu_ref_init(&css->refcnt, css_release); 4455 err = percpu_ref_init(&css->refcnt, css_release);
4338 if (err) { 4456 if (err)
4339 ss->css_free(cgrp);
4340 goto err_free_all; 4457 goto err_free_all;
4341 }
4342 4458
4343 init_cgroup_css(css, ss, cgrp); 4459 init_css(css, ss, cgrp);
4344 4460
4345 if (ss->use_id) { 4461 if (ss->use_id) {
4346 err = alloc_css_id(ss, parent, cgrp); 4462 err = alloc_css_id(css);
4347 if (err) 4463 if (err)
4348 goto err_free_all; 4464 goto err_free_all;
4349 } 4465 }
@@ -4365,16 +4481,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4481 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4366 root->number_of_cgroups++; 4482 root->number_of_cgroups++;
4367 4483
4368 /* each css holds a ref to the cgroup's dentry */ 4484 /* each css holds a ref to the cgroup's dentry and the parent css */
4369 for_each_root_subsys(root, ss) 4485 for_each_root_subsys(root, ss) {
4486 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4487
4370 dget(dentry); 4488 dget(dentry);
4489 css_get(css->parent);
4490 }
4371 4491
4372 /* hold a ref to the parent's dentry */ 4492 /* hold a ref to the parent's dentry */
4373 dget(parent->dentry); 4493 dget(parent->dentry);
4374 4494
4375 /* creation succeeded, notify subsystems */ 4495 /* creation succeeded, notify subsystems */
4376 for_each_root_subsys(root, ss) { 4496 for_each_root_subsys(root, ss) {
4377 err = online_css(ss, cgrp); 4497 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4498
4499 err = online_css(css);
4378 if (err) 4500 if (err)
4379 goto err_destroy; 4501 goto err_destroy;
4380 4502
@@ -4388,7 +4510,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4388 } 4510 }
4389 } 4511 }
4390 4512
4391 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4513 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4514
4515 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4516 if (err)
4517 goto err_destroy;
4518
4519 err = cgroup_populate_dir(cgrp, root->subsys_mask);
4392 if (err) 4520 if (err)
4393 goto err_destroy; 4521 goto err_destroy;
4394 4522
@@ -4399,18 +4527,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4399 4527
4400err_free_all: 4528err_free_all:
4401 for_each_root_subsys(root, ss) { 4529 for_each_root_subsys(root, ss) {
4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4530 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4403 4531
4404 if (css) { 4532 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt); 4533 percpu_ref_cancel_init(&css->refcnt);
4406 ss->css_free(cgrp); 4534 ss->css_free(css);
4407 } 4535 }
4408 } 4536 }
4409 mutex_unlock(&cgroup_mutex); 4537 mutex_unlock(&cgroup_mutex);
4410 /* Release the reference count that we took on the superblock */ 4538 /* Release the reference count that we took on the superblock */
4411 deactivate_super(sb); 4539 deactivate_super(sb);
4412err_free_id: 4540err_free_id:
4413 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4541 idr_remove(&root->cgroup_idr, cgrp->id);
4414err_free_name: 4542err_free_name:
4415 kfree(rcu_dereference_raw(cgrp->name)); 4543 kfree(rcu_dereference_raw(cgrp->name));
4416err_free_cgrp: 4544err_free_cgrp:
@@ -4432,22 +4560,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4560 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4433} 4561}
4434 4562
4435static void cgroup_css_killed(struct cgroup *cgrp) 4563/*
4564 * This is called when the refcnt of a css is confirmed to be killed.
4565 * css_tryget() is now guaranteed to fail.
4566 */
4567static void css_killed_work_fn(struct work_struct *work)
4436{ 4568{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) 4569 struct cgroup_subsys_state *css =
4438 return; 4570 container_of(work, struct cgroup_subsys_state, destroy_work);
4571 struct cgroup *cgrp = css->cgroup;
4439 4572
4440 /* percpu ref's of all css's are killed, kick off the next step */ 4573 mutex_lock(&cgroup_mutex);
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); 4574
4442 schedule_work(&cgrp->destroy_work); 4575 /*
4576 * css_tryget() is guaranteed to fail now. Tell subsystems to
4577 * initiate destruction.
4578 */
4579 offline_css(css);
4580
4581 /*
4582 * If @cgrp is marked dead, it's waiting for refs of all css's to
4583 * be disabled before proceeding to the second phase of cgroup
4584 * destruction. If we are the last one, kick it off.
4585 */
4586 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4587 cgroup_destroy_css_killed(cgrp);
4588
4589 mutex_unlock(&cgroup_mutex);
4590
4591 /*
4592 * Put the css refs from kill_css(). Each css holds an extra
4593 * reference to the cgroup's dentry and cgroup removal proceeds
4594 * regardless of css refs. On the last put of each css, whenever
4595 * that may be, the extra dentry ref is put so that dentry
4596 * destruction happens only after all css's are released.
4597 */
4598 css_put(css);
4443} 4599}
4444 4600
4445static void css_ref_killed_fn(struct percpu_ref *ref) 4601/* css kill confirmation processing requires process context, bounce */
4602static void css_killed_ref_fn(struct percpu_ref *ref)
4446{ 4603{
4447 struct cgroup_subsys_state *css = 4604 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt); 4605 container_of(ref, struct cgroup_subsys_state, refcnt);
4449 4606
4450 cgroup_css_killed(css->cgroup); 4607 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4608 schedule_work(&css->destroy_work);
4609}
4610
4611/**
4612 * kill_css - destroy a css
4613 * @css: css to destroy
4614 *
4615 * This function initiates destruction of @css by removing cgroup interface
4616 * files and putting its base reference. ->css_offline() will be invoked
4617 * asynchronously once css_tryget() is guaranteed to fail; when the
4618 * reference count reaches zero, @css will be released.
4619 */
4620static void kill_css(struct cgroup_subsys_state *css)
4621{
4622 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4623
4624 /*
4625 * Killing would put the base ref, but we need to keep it alive
4626 * until after ->css_offline().
4627 */
4628 css_get(css);
4629
4630 /*
4631 * cgroup core guarantees that, by the time ->css_offline() is
4632 * invoked, no new css reference will be given out via
4633 * css_tryget(). We can't simply call percpu_ref_kill() and
4634 * proceed to offlining css's because percpu_ref_kill() doesn't
4635 * guarantee that the ref is seen as killed on all CPUs on return.
4636 *
4637 * Use percpu_ref_kill_and_confirm() to get notifications as each
4638 * css is confirmed to be seen as killed on all CPUs.
4639 */
4640 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4451} 4641}
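/*
 * Condensed sketch of the kill/confirm sequencing that kill_css()
 * builds on; my_release and my_confirm stand in for css_release and
 * css_killed_ref_fn.
 */
#include <linux/percpu-refcount.h>

static void my_release(struct percpu_ref *ref)
{
	/* last reference dropped; free the enclosing object */
}

static void my_confirm(struct percpu_ref *ref)
{
	/* runs once @ref is seen as killed on all CPUs:
	 * percpu_ref_tryget() can no longer succeed */
}

static int my_setup(struct percpu_ref *ref)
{
	return percpu_ref_init(ref, my_release);
}

static void my_teardown(struct percpu_ref *ref)
{
	/* percpu_ref_kill() alone wouldn't say *when* all CPUs see
	 * the kill; the confirmation callback closes that window */
	percpu_ref_kill_and_confirm(ref, my_confirm);
}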
4452 4642
4453/** 4643/**
@@ -4513,41 +4703,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4513 return -EBUSY; 4703 return -EBUSY;
4514 4704
4515 /* 4705 /*
4516 * Block new css_tryget() by killing css refcnts. cgroup core 4706 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4517 * guarantees that, by the time ->css_offline() is invoked, no new 4707 * will be invoked to perform the rest of destruction once the
4518 * css reference will be given out via css_tryget(). We can't 4708 * percpu refs of all css's are confirmed to be killed.
4519 * simply call percpu_ref_kill() and proceed to offlining css's
4520 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4521 * as killed on all CPUs on return.
4522 *
4523 * Use percpu_ref_kill_and_confirm() to get notifications as each
4524 * css is confirmed to be seen as killed on all CPUs. The
4525 * notification callback keeps track of the number of css's to be
4526 * killed and schedules cgroup_offline_fn() to perform the rest of
4527 * destruction once the percpu refs of all css's are confirmed to
4528 * be killed.
4529 */ 4709 */
4530 atomic_set(&cgrp->css_kill_cnt, 1); 4710 for_each_root_subsys(cgrp->root, ss)
4531 for_each_root_subsys(cgrp->root, ss) { 4711 kill_css(cgroup_css(cgrp, ss));
4532 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4533
4534 /*
4535 * Killing would put the base ref, but we need to keep it
4536 * alive until after ->css_offline.
4537 */
4538 percpu_ref_get(&css->refcnt);
4539
4540 atomic_inc(&cgrp->css_kill_cnt);
4541 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4542 }
4543 cgroup_css_killed(cgrp);
4544 4712
4545 /* 4713 /*
4546 * Mark @cgrp dead. This prevents further task migration and child 4714 * Mark @cgrp dead. This prevents further task migration and child
4547 * creation by disabling cgroup_lock_live_group(). Note that 4715 * creation by disabling cgroup_lock_live_group(). Note that
4548 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to 4716 * CGRP_DEAD assertion is depended upon by css_next_child() to
4549 * resume iteration after dropping RCU read lock. See 4717 * resume iteration after dropping RCU read lock. See
4550 * cgroup_next_sibling() for details. 4718 * css_next_child() for details.
4551 */ 4719 */
4552 set_bit(CGRP_DEAD, &cgrp->flags); 4720 set_bit(CGRP_DEAD, &cgrp->flags);
4553 4721
@@ -4558,9 +4726,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4558 raw_spin_unlock(&release_list_lock); 4726 raw_spin_unlock(&release_list_lock);
4559 4727
4560 /* 4728 /*
4561 * Remove @cgrp directory. The removal puts the base ref but we 4729 * If @cgrp has css's attached, the second stage of cgroup
4562 * aren't quite done with @cgrp yet, so hold onto it. 4730 * destruction is kicked off from css_killed_work_fn() after the
4731 * refs of all attached css's are killed. If @cgrp doesn't have
4732 * any css, we kick it off here.
4733 */
4734 if (!cgrp->nr_css)
4735 cgroup_destroy_css_killed(cgrp);
4736
4737 /*
4738 * Clear the base files and remove @cgrp directory. The removal
4739 * puts the base ref but we aren't quite done with @cgrp yet, so
4740 * hold onto it.
4563 */ 4741 */
4742 cgroup_addrm_files(cgrp, cgroup_base_files, false);
4564 dget(d); 4743 dget(d);
4565 cgroup_d_remove_dir(d); 4744 cgroup_d_remove_dir(d);
4566 4745
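/*
 * The restructured destruction path, condensed from the hunks above:
 *
 *   cgroup_destroy_locked()
 *     kill_css() on each css   -> percpu_ref_kill_and_confirm()
 *     set_bit(CGRP_DEAD, ...)
 *     if (!cgrp->nr_css)       -> cgroup_destroy_css_killed() directly
 *
 *   css_killed_ref_fn()        -> bounces to css_killed_work_fn()
 *   css_killed_work_fn()
 *     offline_css()
 *     if (!cgrp->nr_css && cgroup_is_dead(cgrp))
 *                              -> cgroup_destroy_css_killed()
 */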
@@ -4580,50 +4759,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4580}; 4759};
4581 4760
4582/** 4761/**
4583 * cgroup_offline_fn - the second step of cgroup destruction 4762 * cgroup_destroy_css_killed - the second step of cgroup destruction
4584 * @work: cgroup->destroy_free_work 4763 * @cgrp: the cgroup being destroyed
4585 * 4764 *
4586 * This function is invoked from a work item for a cgroup which is being 4765 * This function is invoked from a work item for a cgroup which is being
4587 * destroyed after the percpu refcnts of all css's are guaranteed to be 4766 * destroyed after all css's are offlined and performs the rest of
4588 * seen as killed on all CPUs, and performs the rest of destruction. This 4767 * destruction. This is the second step of destruction described in the
4589 * is the second step of destruction described in the comment above 4768 * comment above cgroup_destroy_locked().
4590 * cgroup_destroy_locked().
4591 */ 4769 */
4592static void cgroup_offline_fn(struct work_struct *work) 4770static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4593{ 4771{
4594 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4595 struct cgroup *parent = cgrp->parent; 4772 struct cgroup *parent = cgrp->parent;
4596 struct dentry *d = cgrp->dentry; 4773 struct dentry *d = cgrp->dentry;
4597 struct cgroup_subsys *ss;
4598 4774
4599 mutex_lock(&cgroup_mutex); 4775 lockdep_assert_held(&cgroup_mutex);
4600 4776
4601 /* 4777 /* delete this cgroup from parent->children */
4602 * css_tryget() is guaranteed to fail now. Tell subsystems to 4778 list_del_rcu(&cgrp->sibling);
4603 * initiate destruction.
4604 */
4605 for_each_root_subsys(cgrp->root, ss)
4606 offline_css(ss, cgrp);
4607 4779
4608 /* 4780 /*
4609 * Put the css refs from cgroup_destroy_locked(). Each css holds 4781 * We should remove the cgroup object from idr before its grace
4610 * an extra reference to the cgroup's dentry and cgroup removal 4782 * period starts, so we won't be looking up a cgroup while the
4611 * proceeds regardless of css refs. On the last put of each css, 4783 * cgroup is being freed.
4612 * whenever that may be, the extra dentry ref is put so that dentry
4613 * destruction happens only after all css's are released.
4614 */ 4784 */
4615 for_each_root_subsys(cgrp->root, ss) 4785 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4616 css_put(cgrp->subsys[ss->subsys_id]); 4786 cgrp->id = -1;
4617
4618 /* delete this cgroup from parent->children */
4619 list_del_rcu(&cgrp->sibling);
4620 4787
4621 dput(d); 4788 dput(d);
4622 4789
4623 set_bit(CGRP_RELEASABLE, &parent->flags); 4790 set_bit(CGRP_RELEASABLE, &parent->flags);
4624 check_for_release(parent); 4791 check_for_release(parent);
4625
4626 mutex_unlock(&cgroup_mutex);
4627} 4792}
4628 4793
4629static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4794static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4646,6 +4811,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4646 * deregistration. 4811 * deregistration.
4647 */ 4812 */
4648 if (ss->base_cftypes) { 4813 if (ss->base_cftypes) {
4814 struct cftype *cft;
4815
4816 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
4817 cft->ss = ss;
4818
4649 ss->base_cftset.cfts = ss->base_cftypes; 4819 ss->base_cftset.cfts = ss->base_cftypes;
4650 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4820 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4651 } 4821 }
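/*
 * Sketch of the convention the loop above depends on: base_cftypes is
 * a sentinel-terminated array whose last entry has an empty name.  The
 * file name and handler below are hypothetical.
 */
static u64 my_stat_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return 0;	/* hypothetical value */
}

static struct cftype my_base_files[] = {
	{
		.name = "my.stat",
		.read_u64 = my_stat_read,
	},
	{ }	/* terminator: name[0] == '\0' ends the iteration */
};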
@@ -4665,10 +4835,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4665 /* Create the top cgroup state for this subsystem */ 4835 /* Create the top cgroup state for this subsystem */
4666 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4836 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4667 ss->root = &cgroup_dummy_root; 4837 ss->root = &cgroup_dummy_root;
4668 css = ss->css_alloc(cgroup_dummy_top); 4838 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4669 /* We don't handle early failures gracefully */ 4839 /* We don't handle early failures gracefully */
4670 BUG_ON(IS_ERR(css)); 4840 BUG_ON(IS_ERR(css));
4671 init_cgroup_css(css, ss, cgroup_dummy_top); 4841 init_css(css, ss, cgroup_dummy_top);
4672 4842
4673 /* Update the init_css_set to contain a subsys 4843 /* Update the init_css_set to contain a subsys
4674 * pointer to this state - since the subsystem is 4844 * pointer to this state - since the subsystem is
@@ -4683,7 +4853,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4683 * need to invoke fork callbacks here. */ 4853 * need to invoke fork callbacks here. */
4684 BUG_ON(!list_empty(&init_task.tasks)); 4854 BUG_ON(!list_empty(&init_task.tasks));
4685 4855
4686 BUG_ON(online_css(ss, cgroup_dummy_top)); 4856 BUG_ON(online_css(css));
4687 4857
4688 mutex_unlock(&cgroup_mutex); 4858 mutex_unlock(&cgroup_mutex);
4689 4859
@@ -4744,7 +4914,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4744 * struct, so this can happen first (i.e. before the dummy root 4914 * struct, so this can happen first (i.e. before the dummy root
4745 * attachment). 4915 * attachment).
4746 */ 4916 */
4747 css = ss->css_alloc(cgroup_dummy_top); 4917 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4748 if (IS_ERR(css)) { 4918 if (IS_ERR(css)) {
4749 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4919 /* failure case - need to deassign the cgroup_subsys[] slot. */
4750 cgroup_subsys[ss->subsys_id] = NULL; 4920 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4756,8 +4926,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4756 ss->root = &cgroup_dummy_root; 4926 ss->root = &cgroup_dummy_root;
4757 4927
4758 /* our new subsystem will be attached to the dummy hierarchy. */ 4928 /* our new subsystem will be attached to the dummy hierarchy. */
4759 init_cgroup_css(css, ss, cgroup_dummy_top); 4929 init_css(css, ss, cgroup_dummy_top);
4760 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4930 /* init_idr must be after init_css() because it sets css->id. */
4761 if (ss->use_id) { 4931 if (ss->use_id) {
4762 ret = cgroup_init_idr(ss, css); 4932 ret = cgroup_init_idr(ss, css);
4763 if (ret) 4933 if (ret)
@@ -4787,7 +4957,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4787 } 4957 }
4788 write_unlock(&css_set_lock); 4958 write_unlock(&css_set_lock);
4789 4959
4790 ret = online_css(ss, cgroup_dummy_top); 4960 ret = online_css(css);
4791 if (ret) 4961 if (ret)
4792 goto err_unload; 4962 goto err_unload;
4793 4963
@@ -4819,14 +4989,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4819 4989
4820 /* 4990 /*
4821 * we shouldn't be called if the subsystem is in use, and the use of 4991 * we shouldn't be called if the subsystem is in use, and the use of
4822 * try_module_get in parse_cgroupfs_options should ensure that it 4992 * try_module_get() in rebind_subsystems() should ensure that it
4823 * doesn't start being used while we're killing it off. 4993 * doesn't start being used while we're killing it off.
4824 */ 4994 */
4825 BUG_ON(ss->root != &cgroup_dummy_root); 4995 BUG_ON(ss->root != &cgroup_dummy_root);
4826 4996
4827 mutex_lock(&cgroup_mutex); 4997 mutex_lock(&cgroup_mutex);
4828 4998
4829 offline_css(ss, cgroup_dummy_top); 4999 offline_css(cgroup_css(cgroup_dummy_top, ss));
4830 5000
4831 if (ss->use_id) 5001 if (ss->use_id)
4832 idr_destroy(&ss->idr); 5002 idr_destroy(&ss->idr);
@@ -4860,8 +5030,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4860 * the cgrp->subsys pointer to find their state. note that this 5030 * the cgrp->subsys pointer to find their state. note that this
4861 * also takes care of freeing the css_id. 5031 * also takes care of freeing the css_id.
4862 */ 5032 */
4863 ss->css_free(cgroup_dummy_top); 5033 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
4864 cgroup_dummy_top->subsys[ss->subsys_id] = NULL; 5034 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4865 5035
4866 mutex_unlock(&cgroup_mutex); 5036 mutex_unlock(&cgroup_mutex);
4867} 5037}
@@ -4943,6 +5113,10 @@ int __init cgroup_init(void)
4943 5113
4944 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 5114 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4945 5115
5116 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
5117 0, 1, GFP_KERNEL);
5118 BUG_ON(err < 0);
5119
4946 mutex_unlock(&cgroup_root_mutex); 5120 mutex_unlock(&cgroup_root_mutex);
4947 mutex_unlock(&cgroup_mutex); 5121 mutex_unlock(&cgroup_mutex);
4948 5122
@@ -5099,7 +5273,7 @@ void cgroup_fork(struct task_struct *child)
5099 * Adds the task to the list running through its css_set if necessary and 5273 * Adds the task to the list running through its css_set if necessary and
5100 * call the subsystem fork() callbacks. Has to be after the task is 5274 * call the subsystem fork() callbacks. Has to be after the task is
5101 * visible on the task list in case we race with the first call to 5275 * visible on the task list in case we race with the first call to
5102 * cgroup_iter_start() - to guarantee that the new task ends up on its 5276 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5103 * list. 5277 * list.
5104 */ 5278 */
5105void cgroup_post_fork(struct task_struct *child) 5279void cgroup_post_fork(struct task_struct *child)
@@ -5212,10 +5386,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5212 */ 5386 */
5213 for_each_builtin_subsys(ss, i) { 5387 for_each_builtin_subsys(ss, i) {
5214 if (ss->exit) { 5388 if (ss->exit) {
5215 struct cgroup *old_cgrp = cset->subsys[i]->cgroup; 5389 struct cgroup_subsys_state *old_css = cset->subsys[i];
5216 struct cgroup *cgrp = task_cgroup(tsk, i); 5390 struct cgroup_subsys_state *css = task_css(tsk, i);
5217 5391
5218 ss->exit(cgrp, old_cgrp, tsk); 5392 ss->exit(css, old_css, tsk);
5219 } 5393 }
5220 } 5394 }
5221 } 5395 }
@@ -5474,20 +5648,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5474 return 0; 5648 return 0;
5475} 5649}
5476 5650
5477static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, 5651static int alloc_css_id(struct cgroup_subsys_state *child_css)
5478 struct cgroup *child)
5479{ 5652{
5480 int subsys_id, i, depth = 0; 5653 struct cgroup_subsys_state *parent_css = css_parent(child_css);
5481 struct cgroup_subsys_state *parent_css, *child_css;
5482 struct css_id *child_id, *parent_id; 5654 struct css_id *child_id, *parent_id;
5655 int i, depth;
5483 5656
5484 subsys_id = ss->subsys_id;
5485 parent_css = parent->subsys[subsys_id];
5486 child_css = child->subsys[subsys_id];
5487 parent_id = rcu_dereference_protected(parent_css->id, true); 5657 parent_id = rcu_dereference_protected(parent_css->id, true);
5488 depth = parent_id->depth + 1; 5658 depth = parent_id->depth + 1;
5489 5659
5490 child_id = get_new_cssid(ss, depth); 5660 child_id = get_new_cssid(child_css->ss, depth);
5491 if (IS_ERR(child_id)) 5661 if (IS_ERR(child_id))
5492 return PTR_ERR(child_id); 5662 return PTR_ERR(child_id);
5493 5663
@@ -5525,31 +5695,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5525} 5695}
5526EXPORT_SYMBOL_GPL(css_lookup); 5696EXPORT_SYMBOL_GPL(css_lookup);
5527 5697
5528/* 5698/**
5529 * get corresponding css from file open on cgroupfs directory 5699 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5700 * @dentry: directory dentry of interest
5701 * @ss: subsystem of interest
5702 *
5703 * Must be called under RCU read lock. The caller is responsible for
5704 * pinning the returned css if it needs to be accessed outside the RCU
5705 * critical section.
5530 */ 5706 */
5531struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) 5707struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5708 struct cgroup_subsys *ss)
5532{ 5709{
5533 struct cgroup *cgrp; 5710 struct cgroup *cgrp;
5534 struct inode *inode;
5535 struct cgroup_subsys_state *css;
5536 5711
5537 inode = file_inode(f); 5712 WARN_ON_ONCE(!rcu_read_lock_held());
5538 /* check in cgroup filesystem dir */ 5713
5539 if (inode->i_op != &cgroup_dir_inode_operations) 5714 /* is @dentry a cgroup dir? */
5715 if (!dentry->d_inode ||
5716 dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5540 return ERR_PTR(-EBADF); 5717 return ERR_PTR(-EBADF);
5541 5718
5542 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5719 cgrp = __d_cgrp(dentry);
5543 return ERR_PTR(-EINVAL); 5720 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5721}
5544 5722
5545 /* get cgroup */ 5723/**
5546 cgrp = __d_cgrp(f->f_dentry); 5724 * css_from_id - lookup css by id
5547 css = cgrp->subsys[id]; 5725 * @id: the cgroup id
5548 return css ? css : ERR_PTR(-ENOENT); 5726 * @ss: cgroup subsys to be looked into
5727 *
5728 * Returns the css if there's valid one with @id, otherwise returns NULL.
5729 * Should be called under rcu_read_lock().
5730 */
5731struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5732{
5733 struct cgroup *cgrp;
5734
5735 rcu_lockdep_assert(rcu_read_lock_held() ||
5736 lockdep_is_held(&cgroup_mutex),
5737 "css_from_id() needs proper protection");
5738
5739 cgrp = idr_find(&ss->root->cgroup_idr, id);
5740 if (cgrp)
5741 return cgroup_css(cgrp, ss);
5742 return NULL;
5549} 5743}
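/*
 * Illustrative caller of the new css_from_id(); pin_css() and its
 * arguments are assumptions.  The returned css must be pinned before
 * leaving the RCU critical section.
 */
static struct cgroup_subsys_state *pin_css(int id, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);
	if (css && !css_tryget(css))
		css = NULL;	/* raced with destruction */
	rcu_read_unlock();

	return css;		/* caller drops with css_put() when done */
}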
5550 5744
5551#ifdef CONFIG_CGROUP_DEBUG 5745#ifdef CONFIG_CGROUP_DEBUG
5552static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) 5746static struct cgroup_subsys_state *
5747debug_css_alloc(struct cgroup_subsys_state *parent_css)
5553{ 5748{
5554 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5749 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5555 5750
@@ -5559,22 +5754,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5559 return css; 5754 return css;
5560} 5755}
5561 5756
5562static void debug_css_free(struct cgroup *cgrp) 5757static void debug_css_free(struct cgroup_subsys_state *css)
5563{ 5758{
5564 kfree(cgrp->subsys[debug_subsys_id]); 5759 kfree(css);
5565} 5760}
5566 5761
5567static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) 5762static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5763 struct cftype *cft)
5568{ 5764{
5569 return cgroup_task_count(cgrp); 5765 return cgroup_task_count(css->cgroup);
5570} 5766}
5571 5767
5572static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) 5768static u64 current_css_set_read(struct cgroup_subsys_state *css,
5769 struct cftype *cft)
5573{ 5770{
5574 return (u64)(unsigned long)current->cgroups; 5771 return (u64)(unsigned long)current->cgroups;
5575} 5772}
5576 5773
5577static u64 current_css_set_refcount_read(struct cgroup *cgrp, 5774static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5578 struct cftype *cft) 5775 struct cftype *cft)
5579{ 5776{
5580 u64 count; 5777 u64 count;
@@ -5585,7 +5782,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5585 return count; 5782 return count;
5586} 5783}
5587 5784
5588static int current_css_set_cg_links_read(struct cgroup *cgrp, 5785static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5589 struct cftype *cft, 5786 struct cftype *cft,
5590 struct seq_file *seq) 5787 struct seq_file *seq)
5591{ 5788{
@@ -5612,14 +5809,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
5612} 5809}
5613 5810
5614#define MAX_TASKS_SHOWN_PER_CSS 25 5811#define MAX_TASKS_SHOWN_PER_CSS 25
5615static int cgroup_css_links_read(struct cgroup *cgrp, 5812static int cgroup_css_links_read(struct cgroup_subsys_state *css,
5616 struct cftype *cft, 5813 struct cftype *cft, struct seq_file *seq)
5617 struct seq_file *seq)
5618{ 5814{
5619 struct cgrp_cset_link *link; 5815 struct cgrp_cset_link *link;
5620 5816
5621 read_lock(&css_set_lock); 5817 read_lock(&css_set_lock);
5622 list_for_each_entry(link, &cgrp->cset_links, cset_link) { 5818 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5623 struct css_set *cset = link->cset; 5819 struct css_set *cset = link->cset;
5624 struct task_struct *task; 5820 struct task_struct *task;
5625 int count = 0; 5821 int count = 0;
@@ -5638,9 +5834,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp,
5638 return 0; 5834 return 0;
5639} 5835}
5640 5836
5641static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5837static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5642{ 5838{
5643 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5839 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5644} 5840}
5645 5841
5646static struct cftype debug_files[] = { 5842static struct cftype debug_files[] = {