aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c158
1 files changed, 99 insertions, 59 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8ba680985335..14efffed72c8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -49,6 +49,7 @@
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/smp_lock.h> 50#include <linux/smp_lock.h>
51#include <linux/pid_namespace.h> 51#include <linux/pid_namespace.h>
52#include <linux/idr.h>
52 53
53#include <asm/atomic.h> 54#include <asm/atomic.h>
54 55
@@ -77,6 +78,9 @@ struct cgroupfs_root {
77 */ 78 */
78 unsigned long subsys_bits; 79 unsigned long subsys_bits;
79 80
81 /* Unique id for this hierarchy. */
82 int hierarchy_id;
83
80 /* The bitmask of subsystems currently attached to this hierarchy */ 84 /* The bitmask of subsystems currently attached to this hierarchy */
81 unsigned long actual_subsys_bits; 85 unsigned long actual_subsys_bits;
82 86
@@ -147,6 +151,10 @@ struct css_id {
147static LIST_HEAD(roots); 151static LIST_HEAD(roots);
148static int root_count; 152static int root_count;
149 153
154static DEFINE_IDA(hierarchy_ida);
155static int next_hierarchy_id;
156static DEFINE_SPINLOCK(hierarchy_id_lock);
157
150/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 158/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
151#define dummytop (&rootnode.top_cgroup) 159#define dummytop (&rootnode.top_cgroup)
152 160
@@ -264,42 +272,10 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
264 * compiled into their kernel but not actually in use */ 272 * compiled into their kernel but not actually in use */
265static int use_task_css_set_links __read_mostly; 273static int use_task_css_set_links __read_mostly;
266 274
267/* When we create or destroy a css_set, the operation simply 275static void __put_css_set(struct css_set *cg, int taskexit)
268 * takes/releases a reference count on all the cgroups referenced
269 * by subsystems in this css_set. This can end up multiple-counting
270 * some cgroups, but that's OK - the ref-count is just a
271 * busy/not-busy indicator; ensuring that we only count each cgroup
272 * once would require taking a global lock to ensure that no
273 * subsystems moved between hierarchies while we were doing so.
274 *
275 * Possible TODO: decide at boot time based on the number of
276 * registered subsystems and the number of CPUs or NUMA nodes whether
277 * it's better for performance to ref-count every subsystem, or to
278 * take a global lock and only add one ref count to each hierarchy.
279 */
280
281/*
282 * unlink a css_set from the list and free it
283 */
284static void unlink_css_set(struct css_set *cg)
285{ 276{
286 struct cg_cgroup_link *link; 277 struct cg_cgroup_link *link;
287 struct cg_cgroup_link *saved_link; 278 struct cg_cgroup_link *saved_link;
288
289 hlist_del(&cg->hlist);
290 css_set_count--;
291
292 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
293 cg_link_list) {
294 list_del(&link->cg_link_list);
295 list_del(&link->cgrp_link_list);
296 kfree(link);
297 }
298}
299
300static void __put_css_set(struct css_set *cg, int taskexit)
301{
302 int i;
303 /* 279 /*
304 * Ensure that the refcount doesn't hit zero while any readers 280 * Ensure that the refcount doesn't hit zero while any readers
305 * can see it. Similar to atomic_dec_and_lock(), but for an 281 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -312,20 +288,27 @@ static void __put_css_set(struct css_set *cg, int taskexit)
312 write_unlock(&css_set_lock); 288 write_unlock(&css_set_lock);
313 return; 289 return;
314 } 290 }
315 unlink_css_set(cg);
316 write_unlock(&css_set_lock);
317 291
318 rcu_read_lock(); 292 /* This css_set is dead. unlink it and release cgroup refcounts */
319 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 293 hlist_del(&cg->hlist);
320 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); 294 css_set_count--;
295
296 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
297 cg_link_list) {
298 struct cgroup *cgrp = link->cgrp;
299 list_del(&link->cg_link_list);
300 list_del(&link->cgrp_link_list);
321 if (atomic_dec_and_test(&cgrp->count) && 301 if (atomic_dec_and_test(&cgrp->count) &&
322 notify_on_release(cgrp)) { 302 notify_on_release(cgrp)) {
323 if (taskexit) 303 if (taskexit)
324 set_bit(CGRP_RELEASABLE, &cgrp->flags); 304 set_bit(CGRP_RELEASABLE, &cgrp->flags);
325 check_for_release(cgrp); 305 check_for_release(cgrp);
326 } 306 }
307
308 kfree(link);
327 } 309 }
328 rcu_read_unlock(); 310
311 write_unlock(&css_set_lock);
329 kfree(cg); 312 kfree(cg);
330} 313}
331 314
@@ -519,6 +502,7 @@ static void link_css_set(struct list_head *tmp_cg_links,
519 cgrp_link_list); 502 cgrp_link_list);
520 link->cg = cg; 503 link->cg = cg;
521 link->cgrp = cgrp; 504 link->cgrp = cgrp;
505 atomic_inc(&cgrp->count);
522 list_move(&link->cgrp_link_list, &cgrp->css_sets); 506 list_move(&link->cgrp_link_list, &cgrp->css_sets);
523 /* 507 /*
524 * Always add links to the tail of the list so that the list 508 * Always add links to the tail of the list so that the list
@@ -539,7 +523,6 @@ static struct css_set *find_css_set(
539{ 523{
540 struct css_set *res; 524 struct css_set *res;
541 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 525 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
542 int i;
543 526
544 struct list_head tmp_cg_links; 527 struct list_head tmp_cg_links;
545 528
@@ -578,10 +561,6 @@ static struct css_set *find_css_set(
578 561
579 write_lock(&css_set_lock); 562 write_lock(&css_set_lock);
580 /* Add reference counts and links from the new css_set. */ 563 /* Add reference counts and links from the new css_set. */
581 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
582 struct cgroup *cgrp = res->subsys[i]->cgroup;
583 atomic_inc(&cgrp->count);
584 }
585 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { 564 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
586 struct cgroup *c = link->cgrp; 565 struct cgroup *c = link->cgrp;
587 if (c->root == cgrp->root) 566 if (c->root == cgrp->root)
@@ -972,8 +951,11 @@ struct cgroup_sb_opts {
972 unsigned long flags; 951 unsigned long flags;
973 char *release_agent; 952 char *release_agent;
974 char *name; 953 char *name;
954 /* User explicitly requested empty subsystem */
955 bool none;
975 956
976 struct cgroupfs_root *new_root; 957 struct cgroupfs_root *new_root;
958
977}; 959};
978 960
979/* Convert a hierarchy specifier into a bitmask of subsystems and 961/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -1002,6 +984,9 @@ static int parse_cgroupfs_options(char *data,
1002 if (!ss->disabled) 984 if (!ss->disabled)
1003 opts->subsys_bits |= 1ul << i; 985 opts->subsys_bits |= 1ul << i;
1004 } 986 }
987 } else if (!strcmp(token, "none")) {
988 /* Explicitly have no subsystems */
989 opts->none = true;
1005 } else if (!strcmp(token, "noprefix")) { 990 } else if (!strcmp(token, "noprefix")) {
1006 set_bit(ROOT_NOPREFIX, &opts->flags); 991 set_bit(ROOT_NOPREFIX, &opts->flags);
1007 } else if (!strncmp(token, "release_agent=", 14)) { 992 } else if (!strncmp(token, "release_agent=", 14)) {
@@ -1051,6 +1036,8 @@ static int parse_cgroupfs_options(char *data,
1051 } 1036 }
1052 } 1037 }
1053 1038
1039 /* Consistency checks */
1040
1054 /* 1041 /*
1055 * Option noprefix was introduced just for backward compatibility 1042 * Option noprefix was introduced just for backward compatibility
1056 * with the old cpuset, so we allow noprefix only if mounting just 1043 * with the old cpuset, so we allow noprefix only if mounting just
@@ -1060,7 +1047,15 @@ static int parse_cgroupfs_options(char *data,
1060 (opts->subsys_bits & mask)) 1047 (opts->subsys_bits & mask))
1061 return -EINVAL; 1048 return -EINVAL;
1062 1049
1063 /* We can't have an empty hierarchy */ 1050
1051 /* Can't specify "none" and some subsystems */
1052 if (opts->subsys_bits && opts->none)
1053 return -EINVAL;
1054
1055 /*
1056 * We either have to specify by name or by subsystems. (So all
1057 * empty hierarchies must have a name).
1058 */
1064 if (!opts->subsys_bits && !opts->name) 1059 if (!opts->subsys_bits && !opts->name)
1065 return -EINVAL; 1060 return -EINVAL;
1066 1061
@@ -1141,6 +1136,31 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1141 init_cgroup_housekeeping(cgrp); 1136 init_cgroup_housekeeping(cgrp);
1142} 1137}
1143 1138
1139static bool init_root_id(struct cgroupfs_root *root)
1140{
1141 int ret = 0;
1142
1143 do {
1144 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1145 return false;
1146 spin_lock(&hierarchy_id_lock);
1147 /* Try to allocate the next unused ID */
1148 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1149 &root->hierarchy_id);
1150 if (ret == -ENOSPC)
1151 /* Try again starting from 0 */
1152 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1153 if (!ret) {
1154 next_hierarchy_id = root->hierarchy_id + 1;
1155 } else if (ret != -EAGAIN) {
1156 /* Can only get here if the 31-bit IDR is full ... */
1157 BUG_ON(ret);
1158 }
1159 spin_unlock(&hierarchy_id_lock);
1160 } while (ret);
1161 return true;
1162}
1163
1144static int cgroup_test_super(struct super_block *sb, void *data) 1164static int cgroup_test_super(struct super_block *sb, void *data)
1145{ 1165{
1146 struct cgroup_sb_opts *opts = data; 1166 struct cgroup_sb_opts *opts = data;
@@ -1150,8 +1170,12 @@ static int cgroup_test_super(struct super_block *sb, void *data)
1150 if (opts->name && strcmp(opts->name, root->name)) 1170 if (opts->name && strcmp(opts->name, root->name))
1151 return 0; 1171 return 0;
1152 1172
1153 /* If we asked for subsystems then they must match */ 1173 /*
1154 if (opts->subsys_bits && (opts->subsys_bits != root->subsys_bits)) 1174 * If we asked for subsystems (or explicitly for no
1175 * subsystems) then they must match
1176 */
1177 if ((opts->subsys_bits || opts->none)
1178 && (opts->subsys_bits != root->subsys_bits))
1155 return 0; 1179 return 0;
1156 1180
1157 return 1; 1181 return 1;
@@ -1161,15 +1185,19 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1161{ 1185{
1162 struct cgroupfs_root *root; 1186 struct cgroupfs_root *root;
1163 1187
1164 /* Empty hierarchies aren't supported */ 1188 if (!opts->subsys_bits && !opts->none)
1165 if (!opts->subsys_bits)
1166 return NULL; 1189 return NULL;
1167 1190
1168 root = kzalloc(sizeof(*root), GFP_KERNEL); 1191 root = kzalloc(sizeof(*root), GFP_KERNEL);
1169 if (!root) 1192 if (!root)
1170 return ERR_PTR(-ENOMEM); 1193 return ERR_PTR(-ENOMEM);
1171 1194
1195 if (!init_root_id(root)) {
1196 kfree(root);
1197 return ERR_PTR(-ENOMEM);
1198 }
1172 init_cgroup_root(root); 1199 init_cgroup_root(root);
1200
1173 root->subsys_bits = opts->subsys_bits; 1201 root->subsys_bits = opts->subsys_bits;
1174 root->flags = opts->flags; 1202 root->flags = opts->flags;
1175 if (opts->release_agent) 1203 if (opts->release_agent)
@@ -1179,6 +1207,18 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1179 return root; 1207 return root;
1180} 1208}
1181 1209
1210static void cgroup_drop_root(struct cgroupfs_root *root)
1211{
1212 if (!root)
1213 return;
1214
1215 BUG_ON(!root->hierarchy_id);
1216 spin_lock(&hierarchy_id_lock);
1217 ida_remove(&hierarchy_ida, root->hierarchy_id);
1218 spin_unlock(&hierarchy_id_lock);
1219 kfree(root);
1220}
1221
1182static int cgroup_set_super(struct super_block *sb, void *data) 1222static int cgroup_set_super(struct super_block *sb, void *data)
1183{ 1223{
1184 int ret; 1224 int ret;
@@ -1188,7 +1228,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1188 if (!opts->new_root) 1228 if (!opts->new_root)
1189 return -EINVAL; 1229 return -EINVAL;
1190 1230
1191 BUG_ON(!opts->subsys_bits); 1231 BUG_ON(!opts->subsys_bits && !opts->none);
1192 1232
1193 ret = set_anon_super(sb, NULL); 1233 ret = set_anon_super(sb, NULL);
1194 if (ret) 1234 if (ret)
@@ -1257,7 +1297,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1257 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); 1297 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1258 if (IS_ERR(sb)) { 1298 if (IS_ERR(sb)) {
1259 ret = PTR_ERR(sb); 1299 ret = PTR_ERR(sb);
1260 kfree(opts.new_root); 1300 cgroup_drop_root(opts.new_root);
1261 goto out_err; 1301 goto out_err;
1262 } 1302 }
1263 1303
@@ -1351,7 +1391,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1351 * We re-used an existing hierarchy - the new root (if 1391 * We re-used an existing hierarchy - the new root (if
1352 * any) is not needed 1392 * any) is not needed
1353 */ 1393 */
1354 kfree(opts.new_root); 1394 cgroup_drop_root(opts.new_root);
1355 } 1395 }
1356 1396
1357 simple_set_mnt(mnt, sb); 1397 simple_set_mnt(mnt, sb);
@@ -1410,7 +1450,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1410 mutex_unlock(&cgroup_mutex); 1450 mutex_unlock(&cgroup_mutex);
1411 1451
1412 kill_litter_super(sb); 1452 kill_litter_super(sb);
1413 kfree(root); 1453 cgroup_drop_root(root);
1414} 1454}
1415 1455
1416static struct file_system_type cgroup_fs_type = { 1456static struct file_system_type cgroup_fs_type = {
@@ -3109,7 +3149,7 @@ int __init cgroup_init(void)
3109 /* Add init_css_set to the hash table */ 3149 /* Add init_css_set to the hash table */
3110 hhead = css_set_hash(init_css_set.subsys); 3150 hhead = css_set_hash(init_css_set.subsys);
3111 hlist_add_head(&init_css_set.hlist, hhead); 3151 hlist_add_head(&init_css_set.hlist, hhead);
3112 3152 BUG_ON(!init_root_id(&rootnode));
3113 err = register_filesystem(&cgroup_fs_type); 3153 err = register_filesystem(&cgroup_fs_type);
3114 if (err < 0) 3154 if (err < 0)
3115 goto out; 3155 goto out;
@@ -3164,7 +3204,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
3164 struct cgroup *cgrp; 3204 struct cgroup *cgrp;
3165 int count = 0; 3205 int count = 0;
3166 3206
3167 seq_printf(m, "%lu:", root->subsys_bits); 3207 seq_printf(m, "%d:", root->hierarchy_id);
3168 for_each_subsys(root, ss) 3208 for_each_subsys(root, ss)
3169 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 3209 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
3170 if (strlen(root->name)) 3210 if (strlen(root->name))
@@ -3210,8 +3250,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3210 mutex_lock(&cgroup_mutex); 3250 mutex_lock(&cgroup_mutex);
3211 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3251 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3212 struct cgroup_subsys *ss = subsys[i]; 3252 struct cgroup_subsys *ss = subsys[i];
3213 seq_printf(m, "%s\t%lu\t%d\t%d\n", 3253 seq_printf(m, "%s\t%d\t%d\t%d\n",
3214 ss->name, ss->root->subsys_bits, 3254 ss->name, ss->root->hierarchy_id,
3215 ss->root->number_of_cgroups, !ss->disabled); 3255 ss->root->number_of_cgroups, !ss->disabled);
3216 } 3256 }
3217 mutex_unlock(&cgroup_mutex); 3257 mutex_unlock(&cgroup_mutex);
@@ -3929,8 +3969,8 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
3929 name = c->dentry->d_name.name; 3969 name = c->dentry->d_name.name;
3930 else 3970 else
3931 name = "?"; 3971 name = "?";
3932 seq_printf(seq, "Root %lu group %s\n", 3972 seq_printf(seq, "Root %d group %s\n",
3933 c->root->subsys_bits, name); 3973 c->root->hierarchy_id, name);
3934 } 3974 }
3935 rcu_read_unlock(); 3975 rcu_read_unlock();
3936 read_unlock(&css_set_lock); 3976 read_unlock(&css_set_lock);