aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author: Paul Menage <menage@google.com> 2009-09-23 18:56:23 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-09-24 10:20:58 -0400
commit: 2c6ab6d200827e1c41dc71fff3a2ac7473f51777 (patch)
tree: 1ea1e6b46356a0c350c6bc3b39cb852628263fd9 /kernel
parent: 7717f7ba92de485bce8293419a20ffef130f4286 (diff)
cgroups: allow cgroup hierarchies to be created with no bound subsystems
This patch removes the restriction that a cgroup hierarchy must have at least one bound subsystem. The mount option "none" is treated as an explicit request for no bound subsystems.

A hierarchy with no subsystems can be useful for plain task tracking, and is also a step towards the support for multiply-bindable subsystems.

As part of this change, the hierarchy id is no longer calculated from the bitmask of subsystems in the hierarchy (since this is not guaranteed to be unique) but is allocated via an ida. Reference counts on cgroups from css_set objects are now taken explicitly one per hierarchy, rather than one per subsystem.

Example usage:

    mount -t cgroup -o none,name=foo cgroup /mnt/cgroup

Based on the "no-op"/"none" subsystem concept proposed by kamezawa.hiroyu@jp.fujitsu.com

Signed-off-by: Paul Menage <menage@google.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cgroup.c 158
1 files changed, 99 insertions, 59 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8ba680985335..14efffed72c8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -49,6 +49,7 @@
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/smp_lock.h> 50#include <linux/smp_lock.h>
51#include <linux/pid_namespace.h> 51#include <linux/pid_namespace.h>
52#include <linux/idr.h>
52 53
53#include <asm/atomic.h> 54#include <asm/atomic.h>
54 55
@@ -77,6 +78,9 @@ struct cgroupfs_root {
77 */ 78 */
78 unsigned long subsys_bits; 79 unsigned long subsys_bits;
79 80
81 /* Unique id for this hierarchy. */
82 int hierarchy_id;
83
80 /* The bitmask of subsystems currently attached to this hierarchy */ 84 /* The bitmask of subsystems currently attached to this hierarchy */
81 unsigned long actual_subsys_bits; 85 unsigned long actual_subsys_bits;
82 86
@@ -147,6 +151,10 @@ struct css_id {
147static LIST_HEAD(roots); 151static LIST_HEAD(roots);
148static int root_count; 152static int root_count;
149 153
154static DEFINE_IDA(hierarchy_ida);
155static int next_hierarchy_id;
156static DEFINE_SPINLOCK(hierarchy_id_lock);
157
150/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 158/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
151#define dummytop (&rootnode.top_cgroup) 159#define dummytop (&rootnode.top_cgroup)
152 160
@@ -264,42 +272,10 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
264 * compiled into their kernel but not actually in use */ 272 * compiled into their kernel but not actually in use */
265static int use_task_css_set_links __read_mostly; 273static int use_task_css_set_links __read_mostly;
266 274
267/* When we create or destroy a css_set, the operation simply 275static void __put_css_set(struct css_set *cg, int taskexit)
268 * takes/releases a reference count on all the cgroups referenced
269 * by subsystems in this css_set. This can end up multiple-counting
270 * some cgroups, but that's OK - the ref-count is just a
271 * busy/not-busy indicator; ensuring that we only count each cgroup
272 * once would require taking a global lock to ensure that no
273 * subsystems moved between hierarchies while we were doing so.
274 *
275 * Possible TODO: decide at boot time based on the number of
276 * registered subsystems and the number of CPUs or NUMA nodes whether
277 * it's better for performance to ref-count every subsystem, or to
278 * take a global lock and only add one ref count to each hierarchy.
279 */
280
281/*
282 * unlink a css_set from the list and free it
283 */
284static void unlink_css_set(struct css_set *cg)
285{ 276{
286 struct cg_cgroup_link *link; 277 struct cg_cgroup_link *link;
287 struct cg_cgroup_link *saved_link; 278 struct cg_cgroup_link *saved_link;
288
289 hlist_del(&cg->hlist);
290 css_set_count--;
291
292 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
293 cg_link_list) {
294 list_del(&link->cg_link_list);
295 list_del(&link->cgrp_link_list);
296 kfree(link);
297 }
298}
299
300static void __put_css_set(struct css_set *cg, int taskexit)
301{
302 int i;
303 /* 279 /*
304 * Ensure that the refcount doesn't hit zero while any readers 280 * Ensure that the refcount doesn't hit zero while any readers
305 * can see it. Similar to atomic_dec_and_lock(), but for an 281 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -312,20 +288,27 @@ static void __put_css_set(struct css_set *cg, int taskexit)
312 write_unlock(&css_set_lock); 288 write_unlock(&css_set_lock);
313 return; 289 return;
314 } 290 }
315 unlink_css_set(cg);
316 write_unlock(&css_set_lock);
317 291
318 rcu_read_lock(); 292 /* This css_set is dead. unlink it and release cgroup refcounts */
319 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 293 hlist_del(&cg->hlist);
320 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); 294 css_set_count--;
295
296 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
297 cg_link_list) {
298 struct cgroup *cgrp = link->cgrp;
299 list_del(&link->cg_link_list);
300 list_del(&link->cgrp_link_list);
321 if (atomic_dec_and_test(&cgrp->count) && 301 if (atomic_dec_and_test(&cgrp->count) &&
322 notify_on_release(cgrp)) { 302 notify_on_release(cgrp)) {
323 if (taskexit) 303 if (taskexit)
324 set_bit(CGRP_RELEASABLE, &cgrp->flags); 304 set_bit(CGRP_RELEASABLE, &cgrp->flags);
325 check_for_release(cgrp); 305 check_for_release(cgrp);
326 } 306 }
307
308 kfree(link);
327 } 309 }
328 rcu_read_unlock(); 310
311 write_unlock(&css_set_lock);
329 kfree(cg); 312 kfree(cg);
330} 313}
331 314
@@ -519,6 +502,7 @@ static void link_css_set(struct list_head *tmp_cg_links,
519 cgrp_link_list); 502 cgrp_link_list);
520 link->cg = cg; 503 link->cg = cg;
521 link->cgrp = cgrp; 504 link->cgrp = cgrp;
505 atomic_inc(&cgrp->count);
522 list_move(&link->cgrp_link_list, &cgrp->css_sets); 506 list_move(&link->cgrp_link_list, &cgrp->css_sets);
523 /* 507 /*
524 * Always add links to the tail of the list so that the list 508 * Always add links to the tail of the list so that the list
@@ -539,7 +523,6 @@ static struct css_set *find_css_set(
539{ 523{
540 struct css_set *res; 524 struct css_set *res;
541 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 525 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
542 int i;
543 526
544 struct list_head tmp_cg_links; 527 struct list_head tmp_cg_links;
545 528
@@ -578,10 +561,6 @@ static struct css_set *find_css_set(
578 561
579 write_lock(&css_set_lock); 562 write_lock(&css_set_lock);
580 /* Add reference counts and links from the new css_set. */ 563 /* Add reference counts and links from the new css_set. */
581 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
582 struct cgroup *cgrp = res->subsys[i]->cgroup;
583 atomic_inc(&cgrp->count);
584 }
585 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { 564 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
586 struct cgroup *c = link->cgrp; 565 struct cgroup *c = link->cgrp;
587 if (c->root == cgrp->root) 566 if (c->root == cgrp->root)
@@ -972,8 +951,11 @@ struct cgroup_sb_opts {
972 unsigned long flags; 951 unsigned long flags;
973 char *release_agent; 952 char *release_agent;
974 char *name; 953 char *name;
954 /* User explicitly requested empty subsystem */
955 bool none;
975 956
976 struct cgroupfs_root *new_root; 957 struct cgroupfs_root *new_root;
958
977}; 959};
978 960
979/* Convert a hierarchy specifier into a bitmask of subsystems and 961/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -1002,6 +984,9 @@ static int parse_cgroupfs_options(char *data,
1002 if (!ss->disabled) 984 if (!ss->disabled)
1003 opts->subsys_bits |= 1ul << i; 985 opts->subsys_bits |= 1ul << i;
1004 } 986 }
987 } else if (!strcmp(token, "none")) {
988 /* Explicitly have no subsystems */
989 opts->none = true;
1005 } else if (!strcmp(token, "noprefix")) { 990 } else if (!strcmp(token, "noprefix")) {
1006 set_bit(ROOT_NOPREFIX, &opts->flags); 991 set_bit(ROOT_NOPREFIX, &opts->flags);
1007 } else if (!strncmp(token, "release_agent=", 14)) { 992 } else if (!strncmp(token, "release_agent=", 14)) {
@@ -1051,6 +1036,8 @@ static int parse_cgroupfs_options(char *data,
1051 } 1036 }
1052 } 1037 }
1053 1038
1039 /* Consistency checks */
1040
1054 /* 1041 /*
1055 * Option noprefix was introduced just for backward compatibility 1042 * Option noprefix was introduced just for backward compatibility
1056 * with the old cpuset, so we allow noprefix only if mounting just 1043 * with the old cpuset, so we allow noprefix only if mounting just
@@ -1060,7 +1047,15 @@ static int parse_cgroupfs_options(char *data,
1060 (opts->subsys_bits & mask)) 1047 (opts->subsys_bits & mask))
1061 return -EINVAL; 1048 return -EINVAL;
1062 1049
1063 /* We can't have an empty hierarchy */ 1050
1051 /* Can't specify "none" and some subsystems */
1052 if (opts->subsys_bits && opts->none)
1053 return -EINVAL;
1054
1055 /*
1056 * We either have to specify by name or by subsystems. (So all
1057 * empty hierarchies must have a name).
1058 */
1064 if (!opts->subsys_bits && !opts->name) 1059 if (!opts->subsys_bits && !opts->name)
1065 return -EINVAL; 1060 return -EINVAL;
1066 1061
@@ -1141,6 +1136,31 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1141 init_cgroup_housekeeping(cgrp); 1136 init_cgroup_housekeeping(cgrp);
1142} 1137}
1143 1138
1139static bool init_root_id(struct cgroupfs_root *root)
1140{
1141 int ret = 0;
1142
1143 do {
1144 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1145 return false;
1146 spin_lock(&hierarchy_id_lock);
1147 /* Try to allocate the next unused ID */
1148 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1149 &root->hierarchy_id);
1150 if (ret == -ENOSPC)
1151 /* Try again starting from 0 */
1152 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1153 if (!ret) {
1154 next_hierarchy_id = root->hierarchy_id + 1;
1155 } else if (ret != -EAGAIN) {
1156 /* Can only get here if the 31-bit IDR is full ... */
1157 BUG_ON(ret);
1158 }
1159 spin_unlock(&hierarchy_id_lock);
1160 } while (ret);
1161 return true;
1162}
1163
1144static int cgroup_test_super(struct super_block *sb, void *data) 1164static int cgroup_test_super(struct super_block *sb, void *data)
1145{ 1165{
1146 struct cgroup_sb_opts *opts = data; 1166 struct cgroup_sb_opts *opts = data;
@@ -1150,8 +1170,12 @@ static int cgroup_test_super(struct super_block *sb, void *data)
1150 if (opts->name && strcmp(opts->name, root->name)) 1170 if (opts->name && strcmp(opts->name, root->name))
1151 return 0; 1171 return 0;
1152 1172
1153 /* If we asked for subsystems then they must match */ 1173 /*
1154 if (opts->subsys_bits && (opts->subsys_bits != root->subsys_bits)) 1174 * If we asked for subsystems (or explicitly for no
1175 * subsystems) then they must match
1176 */
1177 if ((opts->subsys_bits || opts->none)
1178 && (opts->subsys_bits != root->subsys_bits))
1155 return 0; 1179 return 0;
1156 1180
1157 return 1; 1181 return 1;
@@ -1161,15 +1185,19 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1161{ 1185{
1162 struct cgroupfs_root *root; 1186 struct cgroupfs_root *root;
1163 1187
1164 /* Empty hierarchies aren't supported */ 1188 if (!opts->subsys_bits && !opts->none)
1165 if (!opts->subsys_bits)
1166 return NULL; 1189 return NULL;
1167 1190
1168 root = kzalloc(sizeof(*root), GFP_KERNEL); 1191 root = kzalloc(sizeof(*root), GFP_KERNEL);
1169 if (!root) 1192 if (!root)
1170 return ERR_PTR(-ENOMEM); 1193 return ERR_PTR(-ENOMEM);
1171 1194
1195 if (!init_root_id(root)) {
1196 kfree(root);
1197 return ERR_PTR(-ENOMEM);
1198 }
1172 init_cgroup_root(root); 1199 init_cgroup_root(root);
1200
1173 root->subsys_bits = opts->subsys_bits; 1201 root->subsys_bits = opts->subsys_bits;
1174 root->flags = opts->flags; 1202 root->flags = opts->flags;
1175 if (opts->release_agent) 1203 if (opts->release_agent)
@@ -1179,6 +1207,18 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1179 return root; 1207 return root;
1180} 1208}
1181 1209
1210static void cgroup_drop_root(struct cgroupfs_root *root)
1211{
1212 if (!root)
1213 return;
1214
1215 BUG_ON(!root->hierarchy_id);
1216 spin_lock(&hierarchy_id_lock);
1217 ida_remove(&hierarchy_ida, root->hierarchy_id);
1218 spin_unlock(&hierarchy_id_lock);
1219 kfree(root);
1220}
1221
1182static int cgroup_set_super(struct super_block *sb, void *data) 1222static int cgroup_set_super(struct super_block *sb, void *data)
1183{ 1223{
1184 int ret; 1224 int ret;
@@ -1188,7 +1228,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1188 if (!opts->new_root) 1228 if (!opts->new_root)
1189 return -EINVAL; 1229 return -EINVAL;
1190 1230
1191 BUG_ON(!opts->subsys_bits); 1231 BUG_ON(!opts->subsys_bits && !opts->none);
1192 1232
1193 ret = set_anon_super(sb, NULL); 1233 ret = set_anon_super(sb, NULL);
1194 if (ret) 1234 if (ret)
@@ -1257,7 +1297,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1257 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); 1297 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1258 if (IS_ERR(sb)) { 1298 if (IS_ERR(sb)) {
1259 ret = PTR_ERR(sb); 1299 ret = PTR_ERR(sb);
1260 kfree(opts.new_root); 1300 cgroup_drop_root(opts.new_root);
1261 goto out_err; 1301 goto out_err;
1262 } 1302 }
1263 1303
@@ -1351,7 +1391,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1351 * We re-used an existing hierarchy - the new root (if 1391 * We re-used an existing hierarchy - the new root (if
1352 * any) is not needed 1392 * any) is not needed
1353 */ 1393 */
1354 kfree(opts.new_root); 1394 cgroup_drop_root(opts.new_root);
1355 } 1395 }
1356 1396
1357 simple_set_mnt(mnt, sb); 1397 simple_set_mnt(mnt, sb);
@@ -1410,7 +1450,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1410 mutex_unlock(&cgroup_mutex); 1450 mutex_unlock(&cgroup_mutex);
1411 1451
1412 kill_litter_super(sb); 1452 kill_litter_super(sb);
1413 kfree(root); 1453 cgroup_drop_root(root);
1414} 1454}
1415 1455
1416static struct file_system_type cgroup_fs_type = { 1456static struct file_system_type cgroup_fs_type = {
@@ -3109,7 +3149,7 @@ int __init cgroup_init(void)
3109 /* Add init_css_set to the hash table */ 3149 /* Add init_css_set to the hash table */
3110 hhead = css_set_hash(init_css_set.subsys); 3150 hhead = css_set_hash(init_css_set.subsys);
3111 hlist_add_head(&init_css_set.hlist, hhead); 3151 hlist_add_head(&init_css_set.hlist, hhead);
3112 3152 BUG_ON(!init_root_id(&rootnode));
3113 err = register_filesystem(&cgroup_fs_type); 3153 err = register_filesystem(&cgroup_fs_type);
3114 if (err < 0) 3154 if (err < 0)
3115 goto out; 3155 goto out;
@@ -3164,7 +3204,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
3164 struct cgroup *cgrp; 3204 struct cgroup *cgrp;
3165 int count = 0; 3205 int count = 0;
3166 3206
3167 seq_printf(m, "%lu:", root->subsys_bits); 3207 seq_printf(m, "%d:", root->hierarchy_id);
3168 for_each_subsys(root, ss) 3208 for_each_subsys(root, ss)
3169 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 3209 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
3170 if (strlen(root->name)) 3210 if (strlen(root->name))
@@ -3210,8 +3250,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3210 mutex_lock(&cgroup_mutex); 3250 mutex_lock(&cgroup_mutex);
3211 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3251 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3212 struct cgroup_subsys *ss = subsys[i]; 3252 struct cgroup_subsys *ss = subsys[i];
3213 seq_printf(m, "%s\t%lu\t%d\t%d\n", 3253 seq_printf(m, "%s\t%d\t%d\t%d\n",
3214 ss->name, ss->root->subsys_bits, 3254 ss->name, ss->root->hierarchy_id,
3215 ss->root->number_of_cgroups, !ss->disabled); 3255 ss->root->number_of_cgroups, !ss->disabled);
3216 } 3256 }
3217 mutex_unlock(&cgroup_mutex); 3257 mutex_unlock(&cgroup_mutex);
@@ -3929,8 +3969,8 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
3929 name = c->dentry->d_name.name; 3969 name = c->dentry->d_name.name;
3930 else 3970 else
3931 name = "?"; 3971 name = "?";
3932 seq_printf(seq, "Root %lu group %s\n", 3972 seq_printf(seq, "Root %d group %s\n",
3933 c->root->subsys_bits, name); 3973 c->root->hierarchy_id, name);
3934 } 3974 }
3935 rcu_read_unlock(); 3975 rcu_read_unlock();
3936 read_unlock(&css_set_lock); 3976 read_unlock(&css_set_lock);