about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Documentation/cgroups/cgroups.txt 5
-rw-r--r-- include/linux/cgroup.h 4
-rw-r--r-- kernel/cgroup.c 167
3 files changed, 148 insertions, 28 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index ae8a037a761e..764007b63921 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -489,8 +489,9 @@ Each subsystem should:
489- define a cgroup_subsys object called <name>_subsys 489- define a cgroup_subsys object called <name>_subsys
490 490
491If a subsystem can be compiled as a module, it should also have in its 491If a subsystem can be compiled as a module, it should also have in its
492module initcall a call to cgroup_load_subsys(&its_subsys_struct). It 492module initcall a call to cgroup_load_subsys(), and in its exitcall a
493should also set its_subsys.module = THIS_MODULE in its .c file. 493call to cgroup_unload_subsys(). It should also set its_subsys.module =
494THIS_MODULE in its .c file.
494 495
495Each subsystem may export the following methods. The only mandatory 496Each subsystem may export the following methods. The only mandatory
496methods are create/destroy. Any others that are null are presumed to 497methods are create/destroy. Any others that are null are presumed to
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 402ce477c47e..2a59d3101e5d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -38,6 +38,7 @@ extern void cgroup_exit(struct task_struct *p, int run_callbacks);
38extern int cgroupstats_build(struct cgroupstats *stats, 38extern int cgroupstats_build(struct cgroupstats *stats,
39 struct dentry *dentry); 39 struct dentry *dentry);
40extern int cgroup_load_subsys(struct cgroup_subsys *ss); 40extern int cgroup_load_subsys(struct cgroup_subsys *ss);
41extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
41 42
42extern const struct file_operations proc_cgroup_operations; 43extern const struct file_operations proc_cgroup_operations;
43 44
@@ -271,7 +272,8 @@ struct css_set {
271 /* 272 /*
272 * Set of subsystem states, one for each subsystem. This array 273 * Set of subsystem states, one for each subsystem. This array
273 * is immutable after creation apart from the init_css_set 274 * is immutable after creation apart from the init_css_set
274 * during subsystem registration (at boot time). 275 * during subsystem registration (at boot time) and modular subsystem
276 * loading/unloading.
275 */ 277 */
276 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 278 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
277 279
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2cae38e64c59..aa889c96cc74 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -894,7 +894,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
894} 894}
895 895
896/* 896/*
897 * Call with cgroup_mutex held. 897 * Call with cgroup_mutex held. Drops reference counts on modules, including
898 * any duplicate ones that parse_cgroupfs_options took. If this function
899 * returns an error, no reference counts are touched.
898 */ 900 */
899static int rebind_subsystems(struct cgroupfs_root *root, 901static int rebind_subsystems(struct cgroupfs_root *root,
900 unsigned long final_bits) 902 unsigned long final_bits)
@@ -950,6 +952,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
950 if (ss->bind) 952 if (ss->bind)
951 ss->bind(ss, cgrp); 953 ss->bind(ss, cgrp);
952 mutex_unlock(&ss->hierarchy_mutex); 954 mutex_unlock(&ss->hierarchy_mutex);
955 /* refcount was already taken, and we're keeping it */
953 } else if (bit & removed_bits) { 956 } else if (bit & removed_bits) {
954 /* We're removing this subsystem */ 957 /* We're removing this subsystem */
955 BUG_ON(ss == NULL); 958 BUG_ON(ss == NULL);
@@ -963,10 +966,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
963 subsys[i]->root = &rootnode; 966 subsys[i]->root = &rootnode;
964 list_move(&ss->sibling, &rootnode.subsys_list); 967 list_move(&ss->sibling, &rootnode.subsys_list);
965 mutex_unlock(&ss->hierarchy_mutex); 968 mutex_unlock(&ss->hierarchy_mutex);
969 /* subsystem is now free - drop reference on module */
970 module_put(ss->module);
966 } else if (bit & final_bits) { 971 } else if (bit & final_bits) {
967 /* Subsystem state should already exist */ 972 /* Subsystem state should already exist */
968 BUG_ON(ss == NULL); 973 BUG_ON(ss == NULL);
969 BUG_ON(!cgrp->subsys[i]); 974 BUG_ON(!cgrp->subsys[i]);
975 /*
976 * a refcount was taken, but we already had one, so
977 * drop the extra reference.
978 */
979 module_put(ss->module);
980#ifdef CONFIG_MODULE_UNLOAD
981 BUG_ON(ss->module && !module_refcount(ss->module));
982#endif
970 } else { 983 } else {
971 /* Subsystem state shouldn't exist */ 984 /* Subsystem state shouldn't exist */
972 BUG_ON(cgrp->subsys[i]); 985 BUG_ON(cgrp->subsys[i]);
@@ -1010,13 +1023,16 @@ struct cgroup_sb_opts {
1010 1023
1011/* 1024/*
1012 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call 1025 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
1013 * with cgroup_mutex held to protect the subsys[] array. 1026 * with cgroup_mutex held to protect the subsys[] array. This function takes
1027 * refcounts on subsystems to be used, unless it returns error, in which case
1028 * no refcounts are taken.
1014 */ 1029 */
1015static int parse_cgroupfs_options(char *data, 1030static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1016 struct cgroup_sb_opts *opts)
1017{ 1031{
1018 char *token, *o = data ?: "all"; 1032 char *token, *o = data ?: "all";
1019 unsigned long mask = (unsigned long)-1; 1033 unsigned long mask = (unsigned long)-1;
1034 int i;
1035 bool module_pin_failed = false;
1020 1036
1021 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1037 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1022 1038
@@ -1031,7 +1047,6 @@ static int parse_cgroupfs_options(char *data,
1031 return -EINVAL; 1047 return -EINVAL;
1032 if (!strcmp(token, "all")) { 1048 if (!strcmp(token, "all")) {
1033 /* Add all non-disabled subsystems */ 1049 /* Add all non-disabled subsystems */
1034 int i;
1035 opts->subsys_bits = 0; 1050 opts->subsys_bits = 0;
1036 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1051 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1037 struct cgroup_subsys *ss = subsys[i]; 1052 struct cgroup_subsys *ss = subsys[i];
@@ -1054,7 +1069,6 @@ static int parse_cgroupfs_options(char *data,
1054 if (!opts->release_agent) 1069 if (!opts->release_agent)
1055 return -ENOMEM; 1070 return -ENOMEM;
1056 } else if (!strncmp(token, "name=", 5)) { 1071 } else if (!strncmp(token, "name=", 5)) {
1057 int i;
1058 const char *name = token + 5; 1072 const char *name = token + 5;
1059 /* Can't specify an empty name */ 1073 /* Can't specify an empty name */
1060 if (!strlen(name)) 1074 if (!strlen(name))
@@ -1078,7 +1092,6 @@ static int parse_cgroupfs_options(char *data,
1078 return -ENOMEM; 1092 return -ENOMEM;
1079 } else { 1093 } else {
1080 struct cgroup_subsys *ss; 1094 struct cgroup_subsys *ss;
1081 int i;
1082 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1095 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1083 ss = subsys[i]; 1096 ss = subsys[i];
1084 if (ss == NULL) 1097 if (ss == NULL)
@@ -1117,9 +1130,54 @@ static int parse_cgroupfs_options(char *data,
1117 if (!opts->subsys_bits && !opts->name) 1130 if (!opts->subsys_bits && !opts->name)
1118 return -EINVAL; 1131 return -EINVAL;
1119 1132
1133 /*
1134 * Grab references on all the modules we'll need, so the subsystems
1135 * don't dance around before rebind_subsystems attaches them. This may
1136 * take duplicate reference counts on a subsystem that's already used,
1137 * but rebind_subsystems handles this case.
1138 */
1139 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1140 unsigned long bit = 1UL << i;
1141
1142 if (!(bit & opts->subsys_bits))
1143 continue;
1144 if (!try_module_get(subsys[i]->module)) {
1145 module_pin_failed = true;
1146 break;
1147 }
1148 }
1149 if (module_pin_failed) {
1150 /*
1151 * oops, one of the modules was going away. this means that we
1152 * raced with a delete_module call, and to the user this is
1153 * essentially a "subsystem doesn't exist" case.
1154 */
1155 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1156 /* drop refcounts only on the ones we took */
1157 unsigned long bit = 1UL << i;
1158
1159 if (!(bit & opts->subsys_bits))
1160 continue;
1161 module_put(subsys[i]->module);
1162 }
1163 return -ENOENT;
1164 }
1165
1120 return 0; 1166 return 0;
1121} 1167}
1122 1168
1169static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1170{
1171 int i;
1172 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1173 unsigned long bit = 1UL << i;
1174
1175 if (!(bit & subsys_bits))
1176 continue;
1177 module_put(subsys[i]->module);
1178 }
1179}
1180
1123static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1181static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1124{ 1182{
1125 int ret = 0; 1183 int ret = 0;
@@ -1136,21 +1194,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1136 if (ret) 1194 if (ret)
1137 goto out_unlock; 1195 goto out_unlock;
1138 1196
1139 /* Don't allow flags to change at remount */ 1197 /* Don't allow flags or name to change at remount */
1140 if (opts.flags != root->flags) { 1198 if (opts.flags != root->flags ||
1141 ret = -EINVAL; 1199 (opts.name && strcmp(opts.name, root->name))) {
1142 goto out_unlock;
1143 }
1144
1145 /* Don't allow name to change at remount */
1146 if (opts.name && strcmp(opts.name, root->name)) {
1147 ret = -EINVAL; 1200 ret = -EINVAL;
1201 drop_parsed_module_refcounts(opts.subsys_bits);
1148 goto out_unlock; 1202 goto out_unlock;
1149 } 1203 }
1150 1204
1151 ret = rebind_subsystems(root, opts.subsys_bits); 1205 ret = rebind_subsystems(root, opts.subsys_bits);
1152 if (ret) 1206 if (ret) {
1207 drop_parsed_module_refcounts(opts.subsys_bits);
1153 goto out_unlock; 1208 goto out_unlock;
1209 }
1154 1210
1155 /* (re)populate subsystem files */ 1211 /* (re)populate subsystem files */
1156 cgroup_populate_dir(cgrp); 1212 cgroup_populate_dir(cgrp);
@@ -1349,7 +1405,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1349 new_root = cgroup_root_from_opts(&opts); 1405 new_root = cgroup_root_from_opts(&opts);
1350 if (IS_ERR(new_root)) { 1406 if (IS_ERR(new_root)) {
1351 ret = PTR_ERR(new_root); 1407 ret = PTR_ERR(new_root);
1352 goto out_err; 1408 goto drop_modules;
1353 } 1409 }
1354 opts.new_root = new_root; 1410 opts.new_root = new_root;
1355 1411
@@ -1358,7 +1414,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1358 if (IS_ERR(sb)) { 1414 if (IS_ERR(sb)) {
1359 ret = PTR_ERR(sb); 1415 ret = PTR_ERR(sb);
1360 cgroup_drop_root(opts.new_root); 1416 cgroup_drop_root(opts.new_root);
1361 goto out_err; 1417 goto drop_modules;
1362 } 1418 }
1363 1419
1364 root = sb->s_fs_info; 1420 root = sb->s_fs_info;
@@ -1414,6 +1470,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1414 free_cg_links(&tmp_cg_links); 1470 free_cg_links(&tmp_cg_links);
1415 goto drop_new_super; 1471 goto drop_new_super;
1416 } 1472 }
1473 /*
1474 * There must be no failure case after here, since rebinding
1475 * takes care of subsystems' refcounts, which are explicitly
1476 * dropped in the failure exit path.
1477 */
1417 1478
1418 /* EBUSY should be the only error here */ 1479 /* EBUSY should be the only error here */
1419 BUG_ON(ret); 1480 BUG_ON(ret);
@@ -1452,6 +1513,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1452 * any) is not needed 1513 * any) is not needed
1453 */ 1514 */
1454 cgroup_drop_root(opts.new_root); 1515 cgroup_drop_root(opts.new_root);
1516 /* no subsys rebinding, so refcounts don't change */
1517 drop_parsed_module_refcounts(opts.subsys_bits);
1455 } 1518 }
1456 1519
1457 simple_set_mnt(mnt, sb); 1520 simple_set_mnt(mnt, sb);
@@ -1461,6 +1524,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1461 1524
1462 drop_new_super: 1525 drop_new_super:
1463 deactivate_locked_super(sb); 1526 deactivate_locked_super(sb);
1527 drop_modules:
1528 drop_parsed_module_refcounts(opts.subsys_bits);
1464 out_err: 1529 out_err:
1465 kfree(opts.release_agent); 1530 kfree(opts.release_agent);
1466 kfree(opts.name); 1531 kfree(opts.name);
@@ -3422,13 +3487,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3422 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3487 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3423 ss->active = 1; 3488 ss->active = 1;
3424 3489
3425 /*
3426 * pin the subsystem's module so it doesn't go away. this shouldn't
3427 * fail, since the module's initcall calls us.
3428 * TODO: with module unloading, move this elsewhere
3429 */
3430 BUG_ON(!try_module_get(ss->module));
3431
3432 /* success! */ 3490 /* success! */
3433 mutex_unlock(&cgroup_mutex); 3491 mutex_unlock(&cgroup_mutex);
3434 return 0; 3492 return 0;
@@ -3436,6 +3494,65 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3436EXPORT_SYMBOL_GPL(cgroup_load_subsys); 3494EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3437 3495
3438/** 3496/**
3497 * cgroup_unload_subsys: unload a modular subsystem
3498 * @ss: the subsystem to unload
3499 *
3500 * This function should be called in a modular subsystem's exitcall. When this
3501 * function is invoked, the refcount on the subsystem's module will be 0, so
3502 * the subsystem will not be attached to any hierarchy.
3503 */
3504void cgroup_unload_subsys(struct cgroup_subsys *ss)
3505{
3506 struct cg_cgroup_link *link;
3507 struct hlist_head *hhead;
3508
 /* callers must pass a subsystem whose ->module is set */
3509 BUG_ON(ss->module == NULL);
3510
3511 /*
3512 * we shouldn't be called if the subsystem is in use, and the use of
3513 * try_module_get in parse_cgroupfs_options should ensure that it
3514 * doesn't start being used while we're killing it off.
3515 */
 /* NOTE(review): this check runs before cgroup_mutex is taken below */
3516 BUG_ON(ss->root != &rootnode);
3517
3518 mutex_lock(&cgroup_mutex);
3519 /* deassign the subsys_id */
 /* ids below CGROUP_BUILTIN_SUBSYS_COUNT must never reach this point */
3520 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3521 subsys[ss->subsys_id] = NULL;
3522
3523 /* remove subsystem from rootnode's list of subsystems */
3524 list_del(&ss->sibling);
3525
3526 /*
3527 * disentangle the css from all css_sets attached to the dummytop. as
3528 * in loading, we need to pay our respects to the hashtable gods.
3529 */
3530 write_lock(&css_set_lock);
3531 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3532 struct css_set *cg = link->cg;
3533
 /* unhash before editing: the bucket is derived from cg->subsys[] */
3534 hlist_del(&cg->hlist);
3535 BUG_ON(!cg->subsys[ss->subsys_id]);
3536 cg->subsys[ss->subsys_id] = NULL;
 /* re-insert under the hash of the updated subsys[] array */
3537 hhead = css_set_hash(cg->subsys);
3538 hlist_add_head(&cg->hlist, hhead);
3539 }
3540 write_unlock(&css_set_lock);
3541
3542 /*
3543 * remove subsystem's css from the dummytop and free it - need to free
3544 * before marking as null because ss->destroy needs the cgrp->subsys
3545 * pointer to find their state. note that this also takes care of
3546 * freeing the css_id.
3547 */
3548 ss->destroy(ss, dummytop);
3549 dummytop->subsys[ss->subsys_id] = NULL;
3550
3551 mutex_unlock(&cgroup_mutex);
3552}
3553EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
3554
3555/**
3439 * cgroup_init_early - cgroup initialization at system boot 3556 * cgroup_init_early - cgroup initialization at system boot
3440 * 3557 *
3441 * Initialize cgroups at system boot, and initialize any 3558 * Initialize cgroups at system boot, and initialize any