diff options
-rw-r--r-- | kernel/cgroup.c | 158 |
1 files changed, 99 insertions, 59 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8ba680985335..14efffed72c8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/namei.h> | 49 | #include <linux/namei.h> |
50 | #include <linux/smp_lock.h> | 50 | #include <linux/smp_lock.h> |
51 | #include <linux/pid_namespace.h> | 51 | #include <linux/pid_namespace.h> |
52 | #include <linux/idr.h> | ||
52 | 53 | ||
53 | #include <asm/atomic.h> | 54 | #include <asm/atomic.h> |
54 | 55 | ||
@@ -77,6 +78,9 @@ struct cgroupfs_root { | |||
77 | */ | 78 | */ |
78 | unsigned long subsys_bits; | 79 | unsigned long subsys_bits; |
79 | 80 | ||
81 | /* Unique id for this hierarchy. */ | ||
82 | int hierarchy_id; | ||
83 | |||
80 | /* The bitmask of subsystems currently attached to this hierarchy */ | 84 | /* The bitmask of subsystems currently attached to this hierarchy */ |
81 | unsigned long actual_subsys_bits; | 85 | unsigned long actual_subsys_bits; |
82 | 86 | ||
@@ -147,6 +151,10 @@ struct css_id { | |||
147 | static LIST_HEAD(roots); | 151 | static LIST_HEAD(roots); |
148 | static int root_count; | 152 | static int root_count; |
149 | 153 | ||
154 | static DEFINE_IDA(hierarchy_ida); | ||
155 | static int next_hierarchy_id; | ||
156 | static DEFINE_SPINLOCK(hierarchy_id_lock); | ||
157 | |||
150 | /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ | 158 | /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ |
151 | #define dummytop (&rootnode.top_cgroup) | 159 | #define dummytop (&rootnode.top_cgroup) |
152 | 160 | ||
@@ -264,42 +272,10 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) | |||
264 | * compiled into their kernel but not actually in use */ | 272 | * compiled into their kernel but not actually in use */ |
265 | static int use_task_css_set_links __read_mostly; | 273 | static int use_task_css_set_links __read_mostly; |
266 | 274 | ||
267 | /* When we create or destroy a css_set, the operation simply | 275 | static void __put_css_set(struct css_set *cg, int taskexit) |
268 | * takes/releases a reference count on all the cgroups referenced | ||
269 | * by subsystems in this css_set. This can end up multiple-counting | ||
270 | * some cgroups, but that's OK - the ref-count is just a | ||
271 | * busy/not-busy indicator; ensuring that we only count each cgroup | ||
272 | * once would require taking a global lock to ensure that no | ||
273 | * subsystems moved between hierarchies while we were doing so. | ||
274 | * | ||
275 | * Possible TODO: decide at boot time based on the number of | ||
276 | * registered subsystems and the number of CPUs or NUMA nodes whether | ||
277 | * it's better for performance to ref-count every subsystem, or to | ||
278 | * take a global lock and only add one ref count to each hierarchy. | ||
279 | */ | ||
280 | |||
281 | /* | ||
282 | * unlink a css_set from the list and free it | ||
283 | */ | ||
284 | static void unlink_css_set(struct css_set *cg) | ||
285 | { | 276 | { |
286 | struct cg_cgroup_link *link; | 277 | struct cg_cgroup_link *link; |
287 | struct cg_cgroup_link *saved_link; | 278 | struct cg_cgroup_link *saved_link; |
288 | |||
289 | hlist_del(&cg->hlist); | ||
290 | css_set_count--; | ||
291 | |||
292 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, | ||
293 | cg_link_list) { | ||
294 | list_del(&link->cg_link_list); | ||
295 | list_del(&link->cgrp_link_list); | ||
296 | kfree(link); | ||
297 | } | ||
298 | } | ||
299 | |||
300 | static void __put_css_set(struct css_set *cg, int taskexit) | ||
301 | { | ||
302 | int i; | ||
303 | /* | 279 | /* |
304 | * Ensure that the refcount doesn't hit zero while any readers | 280 | * Ensure that the refcount doesn't hit zero while any readers |
305 | * can see it. Similar to atomic_dec_and_lock(), but for an | 281 | * can see it. Similar to atomic_dec_and_lock(), but for an |
@@ -312,20 +288,27 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
312 | write_unlock(&css_set_lock); | 288 | write_unlock(&css_set_lock); |
313 | return; | 289 | return; |
314 | } | 290 | } |
315 | unlink_css_set(cg); | ||
316 | write_unlock(&css_set_lock); | ||
317 | 291 | ||
318 | rcu_read_lock(); | 292 | /* This css_set is dead. unlink it and release cgroup refcounts */ |
319 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 293 | hlist_del(&cg->hlist); |
320 | struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); | 294 | css_set_count--; |
295 | |||
296 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, | ||
297 | cg_link_list) { | ||
298 | struct cgroup *cgrp = link->cgrp; | ||
299 | list_del(&link->cg_link_list); | ||
300 | list_del(&link->cgrp_link_list); | ||
321 | if (atomic_dec_and_test(&cgrp->count) && | 301 | if (atomic_dec_and_test(&cgrp->count) && |
322 | notify_on_release(cgrp)) { | 302 | notify_on_release(cgrp)) { |
323 | if (taskexit) | 303 | if (taskexit) |
324 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 304 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
325 | check_for_release(cgrp); | 305 | check_for_release(cgrp); |
326 | } | 306 | } |
307 | |||
308 | kfree(link); | ||
327 | } | 309 | } |
328 | rcu_read_unlock(); | 310 | |
311 | write_unlock(&css_set_lock); | ||
329 | kfree(cg); | 312 | kfree(cg); |
330 | } | 313 | } |
331 | 314 | ||
@@ -519,6 +502,7 @@ static void link_css_set(struct list_head *tmp_cg_links, | |||
519 | cgrp_link_list); | 502 | cgrp_link_list); |
520 | link->cg = cg; | 503 | link->cg = cg; |
521 | link->cgrp = cgrp; | 504 | link->cgrp = cgrp; |
505 | atomic_inc(&cgrp->count); | ||
522 | list_move(&link->cgrp_link_list, &cgrp->css_sets); | 506 | list_move(&link->cgrp_link_list, &cgrp->css_sets); |
523 | /* | 507 | /* |
524 | * Always add links to the tail of the list so that the list | 508 | * Always add links to the tail of the list so that the list |
@@ -539,7 +523,6 @@ static struct css_set *find_css_set( | |||
539 | { | 523 | { |
540 | struct css_set *res; | 524 | struct css_set *res; |
541 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; | 525 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; |
542 | int i; | ||
543 | 526 | ||
544 | struct list_head tmp_cg_links; | 527 | struct list_head tmp_cg_links; |
545 | 528 | ||
@@ -578,10 +561,6 @@ static struct css_set *find_css_set( | |||
578 | 561 | ||
579 | write_lock(&css_set_lock); | 562 | write_lock(&css_set_lock); |
580 | /* Add reference counts and links from the new css_set. */ | 563 | /* Add reference counts and links from the new css_set. */ |
581 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
582 | struct cgroup *cgrp = res->subsys[i]->cgroup; | ||
583 | atomic_inc(&cgrp->count); | ||
584 | } | ||
585 | list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { | 564 | list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { |
586 | struct cgroup *c = link->cgrp; | 565 | struct cgroup *c = link->cgrp; |
587 | if (c->root == cgrp->root) | 566 | if (c->root == cgrp->root) |
@@ -972,8 +951,11 @@ struct cgroup_sb_opts { | |||
972 | unsigned long flags; | 951 | unsigned long flags; |
973 | char *release_agent; | 952 | char *release_agent; |
974 | char *name; | 953 | char *name; |
954 | /* User explicitly requested empty subsystem */ | ||
955 | bool none; | ||
975 | 956 | ||
976 | struct cgroupfs_root *new_root; | 957 | struct cgroupfs_root *new_root; |
958 | |||
977 | }; | 959 | }; |
978 | 960 | ||
979 | /* Convert a hierarchy specifier into a bitmask of subsystems and | 961 | /* Convert a hierarchy specifier into a bitmask of subsystems and |
@@ -1002,6 +984,9 @@ static int parse_cgroupfs_options(char *data, | |||
1002 | if (!ss->disabled) | 984 | if (!ss->disabled) |
1003 | opts->subsys_bits |= 1ul << i; | 985 | opts->subsys_bits |= 1ul << i; |
1004 | } | 986 | } |
987 | } else if (!strcmp(token, "none")) { | ||
988 | /* Explicitly have no subsystems */ | ||
989 | opts->none = true; | ||
1005 | } else if (!strcmp(token, "noprefix")) { | 990 | } else if (!strcmp(token, "noprefix")) { |
1006 | set_bit(ROOT_NOPREFIX, &opts->flags); | 991 | set_bit(ROOT_NOPREFIX, &opts->flags); |
1007 | } else if (!strncmp(token, "release_agent=", 14)) { | 992 | } else if (!strncmp(token, "release_agent=", 14)) { |
@@ -1051,6 +1036,8 @@ static int parse_cgroupfs_options(char *data, | |||
1051 | } | 1036 | } |
1052 | } | 1037 | } |
1053 | 1038 | ||
1039 | /* Consistency checks */ | ||
1040 | |||
1054 | /* | 1041 | /* |
1055 | * Option noprefix was introduced just for backward compatibility | 1042 | * Option noprefix was introduced just for backward compatibility |
1056 | * with the old cpuset, so we allow noprefix only if mounting just | 1043 | * with the old cpuset, so we allow noprefix only if mounting just |
@@ -1060,7 +1047,15 @@ static int parse_cgroupfs_options(char *data, | |||
1060 | (opts->subsys_bits & mask)) | 1047 | (opts->subsys_bits & mask)) |
1061 | return -EINVAL; | 1048 | return -EINVAL; |
1062 | 1049 | ||
1063 | /* We can't have an empty hierarchy */ | 1050 | |
1051 | /* Can't specify "none" and some subsystems */ | ||
1052 | if (opts->subsys_bits && opts->none) | ||
1053 | return -EINVAL; | ||
1054 | |||
1055 | /* | ||
1056 | * We either have to specify by name or by subsystems. (So all | ||
1057 | * empty hierarchies must have a name). | ||
1058 | */ | ||
1064 | if (!opts->subsys_bits && !opts->name) | 1059 | if (!opts->subsys_bits && !opts->name) |
1065 | return -EINVAL; | 1060 | return -EINVAL; |
1066 | 1061 | ||
@@ -1141,6 +1136,31 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1141 | init_cgroup_housekeeping(cgrp); | 1136 | init_cgroup_housekeeping(cgrp); |
1142 | } | 1137 | } |
1143 | 1138 | ||
1139 | static bool init_root_id(struct cgroupfs_root *root) | ||
1140 | { | ||
1141 | int ret = 0; | ||
1142 | |||
1143 | do { | ||
1144 | if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) | ||
1145 | return false; | ||
1146 | spin_lock(&hierarchy_id_lock); | ||
1147 | /* Try to allocate the next unused ID */ | ||
1148 | ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, | ||
1149 | &root->hierarchy_id); | ||
1150 | if (ret == -ENOSPC) | ||
1151 | /* Try again starting from 0 */ | ||
1152 | ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); | ||
1153 | if (!ret) { | ||
1154 | next_hierarchy_id = root->hierarchy_id + 1; | ||
1155 | } else if (ret != -EAGAIN) { | ||
1156 | /* Can only get here if the 31-bit IDR is full ... */ | ||
1157 | BUG_ON(ret); | ||
1158 | } | ||
1159 | spin_unlock(&hierarchy_id_lock); | ||
1160 | } while (ret); | ||
1161 | return true; | ||
1162 | } | ||
1163 | |||
1144 | static int cgroup_test_super(struct super_block *sb, void *data) | 1164 | static int cgroup_test_super(struct super_block *sb, void *data) |
1145 | { | 1165 | { |
1146 | struct cgroup_sb_opts *opts = data; | 1166 | struct cgroup_sb_opts *opts = data; |
@@ -1150,8 +1170,12 @@ static int cgroup_test_super(struct super_block *sb, void *data) | |||
1150 | if (opts->name && strcmp(opts->name, root->name)) | 1170 | if (opts->name && strcmp(opts->name, root->name)) |
1151 | return 0; | 1171 | return 0; |
1152 | 1172 | ||
1153 | /* If we asked for subsystems then they must match */ | 1173 | /* |
1154 | if (opts->subsys_bits && (opts->subsys_bits != root->subsys_bits)) | 1174 | * If we asked for subsystems (or explicitly for no |
1175 | * subsystems) then they must match | ||
1176 | */ | ||
1177 | if ((opts->subsys_bits || opts->none) | ||
1178 | && (opts->subsys_bits != root->subsys_bits)) | ||
1155 | return 0; | 1179 | return 0; |
1156 | 1180 | ||
1157 | return 1; | 1181 | return 1; |
@@ -1161,15 +1185,19 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1161 | { | 1185 | { |
1162 | struct cgroupfs_root *root; | 1186 | struct cgroupfs_root *root; |
1163 | 1187 | ||
1164 | /* Empty hierarchies aren't supported */ | 1188 | if (!opts->subsys_bits && !opts->none) |
1165 | if (!opts->subsys_bits) | ||
1166 | return NULL; | 1189 | return NULL; |
1167 | 1190 | ||
1168 | root = kzalloc(sizeof(*root), GFP_KERNEL); | 1191 | root = kzalloc(sizeof(*root), GFP_KERNEL); |
1169 | if (!root) | 1192 | if (!root) |
1170 | return ERR_PTR(-ENOMEM); | 1193 | return ERR_PTR(-ENOMEM); |
1171 | 1194 | ||
1195 | if (!init_root_id(root)) { | ||
1196 | kfree(root); | ||
1197 | return ERR_PTR(-ENOMEM); | ||
1198 | } | ||
1172 | init_cgroup_root(root); | 1199 | init_cgroup_root(root); |
1200 | |||
1173 | root->subsys_bits = opts->subsys_bits; | 1201 | root->subsys_bits = opts->subsys_bits; |
1174 | root->flags = opts->flags; | 1202 | root->flags = opts->flags; |
1175 | if (opts->release_agent) | 1203 | if (opts->release_agent) |
@@ -1179,6 +1207,18 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1179 | return root; | 1207 | return root; |
1180 | } | 1208 | } |
1181 | 1209 | ||
1210 | static void cgroup_drop_root(struct cgroupfs_root *root) | ||
1211 | { | ||
1212 | if (!root) | ||
1213 | return; | ||
1214 | |||
1215 | BUG_ON(!root->hierarchy_id); | ||
1216 | spin_lock(&hierarchy_id_lock); | ||
1217 | ida_remove(&hierarchy_ida, root->hierarchy_id); | ||
1218 | spin_unlock(&hierarchy_id_lock); | ||
1219 | kfree(root); | ||
1220 | } | ||
1221 | |||
1182 | static int cgroup_set_super(struct super_block *sb, void *data) | 1222 | static int cgroup_set_super(struct super_block *sb, void *data) |
1183 | { | 1223 | { |
1184 | int ret; | 1224 | int ret; |
@@ -1188,7 +1228,7 @@ static int cgroup_set_super(struct super_block *sb, void *data) | |||
1188 | if (!opts->new_root) | 1228 | if (!opts->new_root) |
1189 | return -EINVAL; | 1229 | return -EINVAL; |
1190 | 1230 | ||
1191 | BUG_ON(!opts->subsys_bits); | 1231 | BUG_ON(!opts->subsys_bits && !opts->none); |
1192 | 1232 | ||
1193 | ret = set_anon_super(sb, NULL); | 1233 | ret = set_anon_super(sb, NULL); |
1194 | if (ret) | 1234 | if (ret) |
@@ -1257,7 +1297,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1257 | sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); | 1297 | sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); |
1258 | if (IS_ERR(sb)) { | 1298 | if (IS_ERR(sb)) { |
1259 | ret = PTR_ERR(sb); | 1299 | ret = PTR_ERR(sb); |
1260 | kfree(opts.new_root); | 1300 | cgroup_drop_root(opts.new_root); |
1261 | goto out_err; | 1301 | goto out_err; |
1262 | } | 1302 | } |
1263 | 1303 | ||
@@ -1351,7 +1391,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1351 | * We re-used an existing hierarchy - the new root (if | 1391 | * We re-used an existing hierarchy - the new root (if |
1352 | * any) is not needed | 1392 | * any) is not needed |
1353 | */ | 1393 | */ |
1354 | kfree(opts.new_root); | 1394 | cgroup_drop_root(opts.new_root); |
1355 | } | 1395 | } |
1356 | 1396 | ||
1357 | simple_set_mnt(mnt, sb); | 1397 | simple_set_mnt(mnt, sb); |
@@ -1410,7 +1450,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1410 | mutex_unlock(&cgroup_mutex); | 1450 | mutex_unlock(&cgroup_mutex); |
1411 | 1451 | ||
1412 | kill_litter_super(sb); | 1452 | kill_litter_super(sb); |
1413 | kfree(root); | 1453 | cgroup_drop_root(root); |
1414 | } | 1454 | } |
1415 | 1455 | ||
1416 | static struct file_system_type cgroup_fs_type = { | 1456 | static struct file_system_type cgroup_fs_type = { |
@@ -3109,7 +3149,7 @@ int __init cgroup_init(void) | |||
3109 | /* Add init_css_set to the hash table */ | 3149 | /* Add init_css_set to the hash table */ |
3110 | hhead = css_set_hash(init_css_set.subsys); | 3150 | hhead = css_set_hash(init_css_set.subsys); |
3111 | hlist_add_head(&init_css_set.hlist, hhead); | 3151 | hlist_add_head(&init_css_set.hlist, hhead); |
3112 | 3152 | BUG_ON(!init_root_id(&rootnode)); | |
3113 | err = register_filesystem(&cgroup_fs_type); | 3153 | err = register_filesystem(&cgroup_fs_type); |
3114 | if (err < 0) | 3154 | if (err < 0) |
3115 | goto out; | 3155 | goto out; |
@@ -3164,7 +3204,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v) | |||
3164 | struct cgroup *cgrp; | 3204 | struct cgroup *cgrp; |
3165 | int count = 0; | 3205 | int count = 0; |
3166 | 3206 | ||
3167 | seq_printf(m, "%lu:", root->subsys_bits); | 3207 | seq_printf(m, "%d:", root->hierarchy_id); |
3168 | for_each_subsys(root, ss) | 3208 | for_each_subsys(root, ss) |
3169 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 3209 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); |
3170 | if (strlen(root->name)) | 3210 | if (strlen(root->name)) |
@@ -3210,8 +3250,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
3210 | mutex_lock(&cgroup_mutex); | 3250 | mutex_lock(&cgroup_mutex); |
3211 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3251 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3212 | struct cgroup_subsys *ss = subsys[i]; | 3252 | struct cgroup_subsys *ss = subsys[i]; |
3213 | seq_printf(m, "%s\t%lu\t%d\t%d\n", | 3253 | seq_printf(m, "%s\t%d\t%d\t%d\n", |
3214 | ss->name, ss->root->subsys_bits, | 3254 | ss->name, ss->root->hierarchy_id, |
3215 | ss->root->number_of_cgroups, !ss->disabled); | 3255 | ss->root->number_of_cgroups, !ss->disabled); |
3216 | } | 3256 | } |
3217 | mutex_unlock(&cgroup_mutex); | 3257 | mutex_unlock(&cgroup_mutex); |
@@ -3929,8 +3969,8 @@ static int current_css_set_cg_links_read(struct cgroup *cont, | |||
3929 | name = c->dentry->d_name.name; | 3969 | name = c->dentry->d_name.name; |
3930 | else | 3970 | else |
3931 | name = "?"; | 3971 | name = "?"; |
3932 | seq_printf(seq, "Root %lu group %s\n", | 3972 | seq_printf(seq, "Root %d group %s\n", |
3933 | c->root->subsys_bits, name); | 3973 | c->root->hierarchy_id, name); |
3934 | } | 3974 | } |
3935 | rcu_read_unlock(); | 3975 | rcu_read_unlock(); |
3936 | read_unlock(&css_set_lock); | 3976 | read_unlock(&css_set_lock); |