diff options
author | Ben Blum <bblum@andrew.cmu.edu> | 2010-03-10 18:22:09 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-03-12 18:52:36 -0500 |
commit | cf5d5941fda647fe3d2f2d00cf9e0245236a5f08 (patch) | |
tree | deee6501f2f08089a2cd62732c3848a59a6f6a93 /kernel | |
parent | e6a1105ba08b265023dd71a4174fb4a29ebc7083 (diff) |
cgroups: subsystem module unloading
Provides support for unloading modular subsystems.
This patch adds a new function, cgroup_unload_subsys(), which is to be used
for removing a loaded subsystem during module deletion. Reference
counting of the subsystems' modules is moved from once (at load time) to
once per attached hierarchy (taken in parse_cgroupfs_options() and adjusted
in rebind_subsystems()); i.e., the count held on a module is either 0 or 1.
Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 167 |
1 files changed, 142 insertions, 25 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cae38e64c59..aa889c96cc74 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -894,7 +894,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | |||
894 | } | 894 | } |
895 | 895 | ||
896 | /* | 896 | /* |
897 | * Call with cgroup_mutex held. | 897 | * Call with cgroup_mutex held. Drops reference counts on modules, including |
898 | * any duplicate ones that parse_cgroupfs_options took. If this function | ||
899 | * returns an error, no reference counts are touched. | ||
898 | */ | 900 | */ |
899 | static int rebind_subsystems(struct cgroupfs_root *root, | 901 | static int rebind_subsystems(struct cgroupfs_root *root, |
900 | unsigned long final_bits) | 902 | unsigned long final_bits) |
@@ -950,6 +952,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
950 | if (ss->bind) | 952 | if (ss->bind) |
951 | ss->bind(ss, cgrp); | 953 | ss->bind(ss, cgrp); |
952 | mutex_unlock(&ss->hierarchy_mutex); | 954 | mutex_unlock(&ss->hierarchy_mutex); |
955 | /* refcount was already taken, and we're keeping it */ | ||
953 | } else if (bit & removed_bits) { | 956 | } else if (bit & removed_bits) { |
954 | /* We're removing this subsystem */ | 957 | /* We're removing this subsystem */ |
955 | BUG_ON(ss == NULL); | 958 | BUG_ON(ss == NULL); |
@@ -963,10 +966,20 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
963 | subsys[i]->root = &rootnode; | 966 | subsys[i]->root = &rootnode; |
964 | list_move(&ss->sibling, &rootnode.subsys_list); | 967 | list_move(&ss->sibling, &rootnode.subsys_list); |
965 | mutex_unlock(&ss->hierarchy_mutex); | 968 | mutex_unlock(&ss->hierarchy_mutex); |
969 | /* subsystem is now free - drop reference on module */ | ||
970 | module_put(ss->module); | ||
966 | } else if (bit & final_bits) { | 971 | } else if (bit & final_bits) { |
967 | /* Subsystem state should already exist */ | 972 | /* Subsystem state should already exist */ |
968 | BUG_ON(ss == NULL); | 973 | BUG_ON(ss == NULL); |
969 | BUG_ON(!cgrp->subsys[i]); | 974 | BUG_ON(!cgrp->subsys[i]); |
975 | /* | ||
976 | * a refcount was taken, but we already had one, so | ||
977 | * drop the extra reference. | ||
978 | */ | ||
979 | module_put(ss->module); | ||
980 | #ifdef CONFIG_MODULE_UNLOAD | ||
981 | BUG_ON(ss->module && !module_refcount(ss->module)); | ||
982 | #endif | ||
970 | } else { | 983 | } else { |
971 | /* Subsystem state shouldn't exist */ | 984 | /* Subsystem state shouldn't exist */ |
972 | BUG_ON(cgrp->subsys[i]); | 985 | BUG_ON(cgrp->subsys[i]); |
@@ -1010,13 +1023,16 @@ struct cgroup_sb_opts { | |||
1010 | 1023 | ||
1011 | /* | 1024 | /* |
1012 | * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call | 1025 | * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call |
1013 | * with cgroup_mutex held to protect the subsys[] array. | 1026 | * with cgroup_mutex held to protect the subsys[] array. This function takes |
1027 | * refcounts on subsystems to be used, unless it returns error, in which case | ||
1028 | * no refcounts are taken. | ||
1014 | */ | 1029 | */ |
1015 | static int parse_cgroupfs_options(char *data, | 1030 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) |
1016 | struct cgroup_sb_opts *opts) | ||
1017 | { | 1031 | { |
1018 | char *token, *o = data ?: "all"; | 1032 | char *token, *o = data ?: "all"; |
1019 | unsigned long mask = (unsigned long)-1; | 1033 | unsigned long mask = (unsigned long)-1; |
1034 | int i; | ||
1035 | bool module_pin_failed = false; | ||
1020 | 1036 | ||
1021 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 1037 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
1022 | 1038 | ||
@@ -1031,7 +1047,6 @@ static int parse_cgroupfs_options(char *data, | |||
1031 | return -EINVAL; | 1047 | return -EINVAL; |
1032 | if (!strcmp(token, "all")) { | 1048 | if (!strcmp(token, "all")) { |
1033 | /* Add all non-disabled subsystems */ | 1049 | /* Add all non-disabled subsystems */ |
1034 | int i; | ||
1035 | opts->subsys_bits = 0; | 1050 | opts->subsys_bits = 0; |
1036 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1051 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1037 | struct cgroup_subsys *ss = subsys[i]; | 1052 | struct cgroup_subsys *ss = subsys[i]; |
@@ -1054,7 +1069,6 @@ static int parse_cgroupfs_options(char *data, | |||
1054 | if (!opts->release_agent) | 1069 | if (!opts->release_agent) |
1055 | return -ENOMEM; | 1070 | return -ENOMEM; |
1056 | } else if (!strncmp(token, "name=", 5)) { | 1071 | } else if (!strncmp(token, "name=", 5)) { |
1057 | int i; | ||
1058 | const char *name = token + 5; | 1072 | const char *name = token + 5; |
1059 | /* Can't specify an empty name */ | 1073 | /* Can't specify an empty name */ |
1060 | if (!strlen(name)) | 1074 | if (!strlen(name)) |
@@ -1078,7 +1092,6 @@ static int parse_cgroupfs_options(char *data, | |||
1078 | return -ENOMEM; | 1092 | return -ENOMEM; |
1079 | } else { | 1093 | } else { |
1080 | struct cgroup_subsys *ss; | 1094 | struct cgroup_subsys *ss; |
1081 | int i; | ||
1082 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1095 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1083 | ss = subsys[i]; | 1096 | ss = subsys[i]; |
1084 | if (ss == NULL) | 1097 | if (ss == NULL) |
@@ -1117,9 +1130,54 @@ static int parse_cgroupfs_options(char *data, | |||
1117 | if (!opts->subsys_bits && !opts->name) | 1130 | if (!opts->subsys_bits && !opts->name) |
1118 | return -EINVAL; | 1131 | return -EINVAL; |
1119 | 1132 | ||
1133 | /* | ||
1134 | * Grab references on all the modules we'll need, so the subsystems | ||
1135 | * don't dance around before rebind_subsystems attaches them. This may | ||
1136 | * take duplicate reference counts on a subsystem that's already used, | ||
1137 | * but rebind_subsystems handles this case. | ||
1138 | */ | ||
1139 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | ||
1140 | unsigned long bit = 1UL << i; | ||
1141 | |||
1142 | if (!(bit & opts->subsys_bits)) | ||
1143 | continue; | ||
1144 | if (!try_module_get(subsys[i]->module)) { | ||
1145 | module_pin_failed = true; | ||
1146 | break; | ||
1147 | } | ||
1148 | } | ||
1149 | if (module_pin_failed) { | ||
1150 | /* | ||
1151 | * oops, one of the modules was going away. this means that we | ||
1152 | * raced with a module_delete call, and to the user this is | ||
1153 | * essentially a "subsystem doesn't exist" case. | ||
1154 | */ | ||
1155 | for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { | ||
1156 | /* drop refcounts only on the ones we took */ | ||
1157 | unsigned long bit = 1UL << i; | ||
1158 | |||
1159 | if (!(bit & opts->subsys_bits)) | ||
1160 | continue; | ||
1161 | module_put(subsys[i]->module); | ||
1162 | } | ||
1163 | return -ENOENT; | ||
1164 | } | ||
1165 | |||
1120 | return 0; | 1166 | return 0; |
1121 | } | 1167 | } |
1122 | 1168 | ||
1169 | static void drop_parsed_module_refcounts(unsigned long subsys_bits) | ||
1170 | { | ||
1171 | int i; | ||
1172 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | ||
1173 | unsigned long bit = 1UL << i; | ||
1174 | |||
1175 | if (!(bit & subsys_bits)) | ||
1176 | continue; | ||
1177 | module_put(subsys[i]->module); | ||
1178 | } | ||
1179 | } | ||
1180 | |||
1123 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) | 1181 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) |
1124 | { | 1182 | { |
1125 | int ret = 0; | 1183 | int ret = 0; |
@@ -1136,21 +1194,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1136 | if (ret) | 1194 | if (ret) |
1137 | goto out_unlock; | 1195 | goto out_unlock; |
1138 | 1196 | ||
1139 | /* Don't allow flags to change at remount */ | 1197 | /* Don't allow flags or name to change at remount */ |
1140 | if (opts.flags != root->flags) { | 1198 | if (opts.flags != root->flags || |
1141 | ret = -EINVAL; | 1199 | (opts.name && strcmp(opts.name, root->name))) { |
1142 | goto out_unlock; | ||
1143 | } | ||
1144 | |||
1145 | /* Don't allow name to change at remount */ | ||
1146 | if (opts.name && strcmp(opts.name, root->name)) { | ||
1147 | ret = -EINVAL; | 1200 | ret = -EINVAL; |
1201 | drop_parsed_module_refcounts(opts.subsys_bits); | ||
1148 | goto out_unlock; | 1202 | goto out_unlock; |
1149 | } | 1203 | } |
1150 | 1204 | ||
1151 | ret = rebind_subsystems(root, opts.subsys_bits); | 1205 | ret = rebind_subsystems(root, opts.subsys_bits); |
1152 | if (ret) | 1206 | if (ret) { |
1207 | drop_parsed_module_refcounts(opts.subsys_bits); | ||
1153 | goto out_unlock; | 1208 | goto out_unlock; |
1209 | } | ||
1154 | 1210 | ||
1155 | /* (re)populate subsystem files */ | 1211 | /* (re)populate subsystem files */ |
1156 | cgroup_populate_dir(cgrp); | 1212 | cgroup_populate_dir(cgrp); |
@@ -1349,7 +1405,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1349 | new_root = cgroup_root_from_opts(&opts); | 1405 | new_root = cgroup_root_from_opts(&opts); |
1350 | if (IS_ERR(new_root)) { | 1406 | if (IS_ERR(new_root)) { |
1351 | ret = PTR_ERR(new_root); | 1407 | ret = PTR_ERR(new_root); |
1352 | goto out_err; | 1408 | goto drop_modules; |
1353 | } | 1409 | } |
1354 | opts.new_root = new_root; | 1410 | opts.new_root = new_root; |
1355 | 1411 | ||
@@ -1358,7 +1414,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1358 | if (IS_ERR(sb)) { | 1414 | if (IS_ERR(sb)) { |
1359 | ret = PTR_ERR(sb); | 1415 | ret = PTR_ERR(sb); |
1360 | cgroup_drop_root(opts.new_root); | 1416 | cgroup_drop_root(opts.new_root); |
1361 | goto out_err; | 1417 | goto drop_modules; |
1362 | } | 1418 | } |
1363 | 1419 | ||
1364 | root = sb->s_fs_info; | 1420 | root = sb->s_fs_info; |
@@ -1414,6 +1470,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1414 | free_cg_links(&tmp_cg_links); | 1470 | free_cg_links(&tmp_cg_links); |
1415 | goto drop_new_super; | 1471 | goto drop_new_super; |
1416 | } | 1472 | } |
1473 | /* | ||
1474 | * There must be no failure case after here, since rebinding | ||
1475 | * takes care of subsystems' refcounts, which are explicitly | ||
1476 | * dropped in the failure exit path. | ||
1477 | */ | ||
1417 | 1478 | ||
1418 | /* EBUSY should be the only error here */ | 1479 | /* EBUSY should be the only error here */ |
1419 | BUG_ON(ret); | 1480 | BUG_ON(ret); |
@@ -1452,6 +1513,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1452 | * any) is not needed | 1513 | * any) is not needed |
1453 | */ | 1514 | */ |
1454 | cgroup_drop_root(opts.new_root); | 1515 | cgroup_drop_root(opts.new_root); |
1516 | /* no subsys rebinding, so refcounts don't change */ | ||
1517 | drop_parsed_module_refcounts(opts.subsys_bits); | ||
1455 | } | 1518 | } |
1456 | 1519 | ||
1457 | simple_set_mnt(mnt, sb); | 1520 | simple_set_mnt(mnt, sb); |
@@ -1461,6 +1524,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1461 | 1524 | ||
1462 | drop_new_super: | 1525 | drop_new_super: |
1463 | deactivate_locked_super(sb); | 1526 | deactivate_locked_super(sb); |
1527 | drop_modules: | ||
1528 | drop_parsed_module_refcounts(opts.subsys_bits); | ||
1464 | out_err: | 1529 | out_err: |
1465 | kfree(opts.release_agent); | 1530 | kfree(opts.release_agent); |
1466 | kfree(opts.name); | 1531 | kfree(opts.name); |
@@ -3422,13 +3487,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
3422 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); | 3487 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); |
3423 | ss->active = 1; | 3488 | ss->active = 1; |
3424 | 3489 | ||
3425 | /* | ||
3426 | * pin the subsystem's module so it doesn't go away. this shouldn't | ||
3427 | * fail, since the module's initcall calls us. | ||
3428 | * TODO: with module unloading, move this elsewhere | ||
3429 | */ | ||
3430 | BUG_ON(!try_module_get(ss->module)); | ||
3431 | |||
3432 | /* success! */ | 3490 | /* success! */ |
3433 | mutex_unlock(&cgroup_mutex); | 3491 | mutex_unlock(&cgroup_mutex); |
3434 | return 0; | 3492 | return 0; |
@@ -3436,6 +3494,65 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
3436 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); | 3494 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); |
3437 | 3495 | ||
3438 | /** | 3496 | /** |
3497 | * cgroup_unload_subsys: unload a modular subsystem | ||
3498 | * @ss: the subsystem to unload | ||
3499 | * | ||
3500 | * This function should be called in a modular subsystem's exitcall. When this | ||
3501 | * function is invoked, the refcount on the subsystem's module will be 0, so | ||
3502 | * the subsystem will not be attached to any hierarchy. | ||
3503 | */ | ||
3504 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | ||
3505 | { | ||
3506 | struct cg_cgroup_link *link; | ||
3507 | struct hlist_head *hhead; | ||
3508 | |||
3509 | BUG_ON(ss->module == NULL); | ||
3510 | |||
3511 | /* | ||
3512 | * we shouldn't be called if the subsystem is in use, and the use of | ||
3513 | * try_module_get in parse_cgroupfs_options should ensure that it | ||
3514 | * doesn't start being used while we're killing it off. | ||
3515 | */ | ||
3516 | BUG_ON(ss->root != &rootnode); | ||
3517 | |||
3518 | mutex_lock(&cgroup_mutex); | ||
3519 | /* deassign the subsys_id */ | ||
3520 | BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); | ||
3521 | subsys[ss->subsys_id] = NULL; | ||
3522 | |||
3523 | /* remove subsystem from rootnode's list of subsystems */ | ||
3524 | list_del(&ss->sibling); | ||
3525 | |||
3526 | /* | ||
3527 | * disentangle the css from all css_sets attached to the dummytop. as | ||
3528 | * in loading, we need to pay our respects to the hashtable gods. | ||
3529 | */ | ||
3530 | write_lock(&css_set_lock); | ||
3531 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { | ||
3532 | struct css_set *cg = link->cg; | ||
3533 | |||
3534 | hlist_del(&cg->hlist); | ||
3535 | BUG_ON(!cg->subsys[ss->subsys_id]); | ||
3536 | cg->subsys[ss->subsys_id] = NULL; | ||
3537 | hhead = css_set_hash(cg->subsys); | ||
3538 | hlist_add_head(&cg->hlist, hhead); | ||
3539 | } | ||
3540 | write_unlock(&css_set_lock); | ||
3541 | |||
3542 | /* | ||
3543 | * remove subsystem's css from the dummytop and free it - need to free | ||
3544 | * before marking as null because ss->destroy needs the cgrp->subsys | ||
3545 | * pointer to find their state. note that this also takes care of | ||
3546 | * freeing the css_id. | ||
3547 | */ | ||
3548 | ss->destroy(ss, dummytop); | ||
3549 | dummytop->subsys[ss->subsys_id] = NULL; | ||
3550 | |||
3551 | mutex_unlock(&cgroup_mutex); | ||
3552 | } | ||
3553 | EXPORT_SYMBOL_GPL(cgroup_unload_subsys); | ||
3554 | |||
3555 | /** | ||
3439 | * cgroup_init_early - cgroup initialization at system boot | 3556 | * cgroup_init_early - cgroup initialization at system boot |
3440 | * | 3557 | * |
3441 | * Initialize cgroups at system boot, and initialize any | 3558 | * Initialize cgroups at system boot, and initialize any |