author     Ben Blum <bblum@andrew.cmu.edu>                  2010-03-10 18:22:07 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2010-03-12 18:52:36 -0500
commit     aae8aab40367036931608fdaf9e2dc568b516f19 (patch)
tree       b2a06ee21042eb3972ecd9e4153d61a8f6ed53cb /kernel
parent     d7b9fff711d5e8db8c844161c684017e556c38a0 (diff)
cgroups: revamp subsys array
This patch series provides the ability for cgroup subsystems to be
compiled as modules both within and outside the kernel tree. This is
mainly useful for classifiers and subsystems that hook into components
that are already modules. cls_cgroup and blkio-cgroup serve as the
example use cases for this feature.
It provides an interface, cgroup_load_subsys() and cgroup_unload_subsys(),
which modular subsystems can use to register and unregister themselves at
runtime.
The net_cls classifier subsystem serves as the example for a subsystem
which can be converted into a module using these changes.
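As a rough sketch of how that interface is meant to be used (the "foo"
subsystem below is hypothetical, and the exact callback signatures and the
.module field come from patches 2 and 3 rather than this patch, so treat the
details as illustrative rather than the series' literal API):

/* Hypothetical modular subsystem "foo" -- a sketch only. */
#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>

struct foo_state {
	struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *foo_create(struct cgroup_subsys *ss,
					      struct cgroup *cgrp)
{
	/* allocate this subsystem's per-cgroup state */
	struct foo_state *fs = kzalloc(sizeof(*fs), GFP_KERNEL);

	if (!fs)
		return ERR_PTR(-ENOMEM);
	return &fs->css;
}

static void foo_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	kfree(container_of(cgrp->subsys[ss->subsys_id],
			   struct foo_state, css));
}

struct cgroup_subsys foo_subsys = {
	.name		= "foo",
	.create		= foo_create,
	.destroy	= foo_destroy,
	.module		= THIS_MODULE,	/* field added by patch #2 */
};

static int __init foo_module_init(void)
{
	/* registers the subsystem and assigns it a dynamic subsys_id */
	return cgroup_load_subsys(&foo_subsys);	/* patch #2 */
}

static void __exit foo_module_exit(void)
{
	cgroup_unload_subsys(&foo_subsys);	/* patch #3 */
}

module_init(foo_module_init);
module_exit(foo_module_exit);
MODULE_LICENSE("GPL");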
Patch #1 sets up the subsys[] array so its contents can be dynamic as modules
appear and (eventually) disappear. Iterations over the array are modified to
handle entries that may be absent, and the dynamic section of the array is
protected by cgroup_mutex. (A sketch of the resulting array layout follows
the patch summaries below.)
Patch #2 implements an interface for modules to load subsystems, called
cgroup_load_subsys, similar to cgroup_init_subsys, and adds a module
pointer in struct cgroup_subsys.
Patch #3 adds a mechanism for unloading modular subsystems, which includes
a more advanced rework of the rudimentary reference counting introduced in
patch 2.
Patch #4 modifies the net_cls subsystem, which already had some module
declarations, to be configurable as a module, which also serves as a
simple proof-of-concept.
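(The diffstat below is limited to kernel/, so the header change that
introduces CGROUP_BUILTIN_SUBSYS_COUNT is not shown. Roughly, the split the
series relies on looks like the following sketch; the exact definitions live
in include/linux/cgroup.h and may differ in detail.)

/* Sketch of the builtin/modular split (not the literal header hunk). */
#define SUBSYS(_x) _x ## _subsys_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
	CGROUP_BUILTIN_SUBSYS_COUNT		/* end of the immutable section */
};
#undef SUBSYS

/*
 * Upper bound on loaded subsystems, builtin plus modular; bounded because
 * each hierarchy tracks its subsystems in an unsigned long bitmask.
 */
#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE * sizeof(unsigned long))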
Part of implementing patches 2 and 4 involved updating css pointers in
each css_set when the module appears or leaves. In doing this, it was
discovered that css_sets always remain linked to the dummy cgroup,
regardless of whether or not any subsystems are actually bound to it
(i.e., not mounted on an actual hierarchy). The subsystem loading and
unloading code therefore should keep in mind the special cases where the
added subsystem is the only one in the dummy cgroup (and therefore all
css_sets need to be linked back into it) and where the removed subsys was
the only one in the dummy cgroup (and therefore all css_sets should be
unlinked from it) - however, as all css_sets always stay attached to the
dummy cgroup anyway, these cases are ignored. Any fix that addresses this
issue should also make sure these cases are addressed in the subsystem
loading and unloading code.
This patch:
Make subsys[] able to be dynamically populated to support modular
subsystems
This patch reworks the way the subsys[] array is used so that subsystems can
register themselves after boot time, and enables the cgroups internals to
handle subsystems that are not present or that may appear and disappear at
runtime. The resulting iteration pattern is sketched below.
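A minimal sketch of that iteration pattern, condensed from the hunks below
(the wrapper function name is invented purely for illustration; subsys[] and
cgroup_mutex are the file-static objects in kernel/cgroup.c):

/* Condensed illustration of the walk-and-skip pattern used below. */
static void for_each_present_subsys_example(void)
{
	int i;

	mutex_lock(&cgroup_mutex);	/* protects the mutable, modular tail */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

		if (ss == NULL)		/* modular slot, nothing loaded (yet) */
			continue;
		/* ... use ss ... */
	}
	mutex_unlock(&cgroup_mutex);

	/*
	 * Paths that only care about builtin subsystems (fork/exit callbacks,
	 * early init, cgroup_disable=) instead stop at
	 * CGROUP_BUILTIN_SUBSYS_COUNT and need no locking, since the builtin
	 * section of the array never changes.
	 */
}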
Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c  96
1 files changed, 80 insertions, 16 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cace83ddbcdc..c92fb9549358 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,10 +57,14 @@
 
 static DEFINE_MUTEX(cgroup_mutex);
 
-/* Generate an array of cgroup subsystem pointers */
+/*
+ * Generate an array of cgroup subsystem pointers. At boot time, this is
+ * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * registered after that. The mutable section of this array is protected by
+ * cgroup_mutex.
+ */
 #define SUBSYS(_x) &_x ## _subsys,
-
-static struct cgroup_subsys *subsys[] = {
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
@@ -448,8 +452,11 @@ static struct css_set *find_existing_css_set(
 	struct hlist_node *node;
 	struct css_set *cg;
 
-	/* Built the set of subsystem state objects that we want to
-	 * see in the new css_set */
+	/*
+	 * Build the set of subsystem state objects that we want to see in the
+	 * new css_set. while subsystems can change globally, the entries here
+	 * won't change, so no need for locking.
+	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		if (root->subsys_bits & (1UL << i)) {
 			/* Subsystem is in this hierarchy. So we want
@@ -884,7 +891,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
 	css_put(css);
 }
 
-
+/*
+ * Call with cgroup_mutex held.
+ */
 static int rebind_subsystems(struct cgroupfs_root *root,
 			      unsigned long final_bits)
 {
@@ -892,6 +901,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	struct cgroup *cgrp = &root->top_cgroup;
 	int i;
 
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+
 	removed_bits = root->actual_subsys_bits & ~final_bits;
 	added_bits = final_bits & ~root->actual_subsys_bits;
 	/* Check that any added subsystems are currently free */
@@ -900,6 +911,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		struct cgroup_subsys *ss = subsys[i];
 		if (!(bit & added_bits))
 			continue;
+		/*
+		 * Nobody should tell us to do a subsys that doesn't exist:
+		 * parse_cgroupfs_options should catch that case and refcounts
+		 * ensure that subsystems won't disappear once selected.
+		 */
+		BUG_ON(ss == NULL);
 		if (ss->root != &rootnode) {
 			/* Subsystem isn't free */
 			return -EBUSY;
@@ -919,6 +936,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		unsigned long bit = 1UL << i;
 		if (bit & added_bits) {
 			/* We're binding this subsystem to this hierarchy */
+			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i]);
 			BUG_ON(!dummytop->subsys[i]);
 			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -932,6 +950,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			mutex_unlock(&ss->hierarchy_mutex);
 		} else if (bit & removed_bits) {
 			/* We're removing this subsystem */
+			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
 			mutex_lock(&ss->hierarchy_mutex);
@@ -944,6 +963,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			mutex_unlock(&ss->hierarchy_mutex);
 		} else if (bit & final_bits) {
 			/* Subsystem state should already exist */
+			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
 		} else {
 			/* Subsystem state shouldn't exist */
@@ -986,14 +1006,18 @@ struct cgroup_sb_opts {
 
 };
 
-/* Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. */
+/*
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
+ * with cgroup_mutex held to protect the subsys[] array.
+ */
 static int parse_cgroupfs_options(char *data,
 				     struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data ?: "all";
 	unsigned long mask = (unsigned long)-1;
 
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+
 #ifdef CONFIG_CPUSETS
 	mask = ~(1UL << cpuset_subsys_id);
 #endif
@@ -1009,6 +1033,8 @@ static int parse_cgroupfs_options(char *data,
 			opts->subsys_bits = 0;
 			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 				struct cgroup_subsys *ss = subsys[i];
+				if (ss == NULL)
+					continue;
 				if (!ss->disabled)
 					opts->subsys_bits |= 1ul << i;
 			}
@@ -1053,6 +1079,8 @@ static int parse_cgroupfs_options(char *data,
 			int i;
 			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 				ss = subsys[i];
+				if (ss == NULL)
+					continue;
 				if (!strcmp(token, ss->name)) {
 					if (!ss->disabled)
 						set_bit(i, &opts->subsys_bits);
@@ -1306,7 +1334,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	struct cgroupfs_root *new_root;
 
 	/* First find the desired set of subsystems */
+	mutex_lock(&cgroup_mutex);
 	ret = parse_cgroupfs_options(data, &opts);
+	mutex_unlock(&cgroup_mutex);
 	if (ret)
 		goto out_err;
 
@@ -2918,8 +2948,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
 	/* We need to take each hierarchy_mutex in a consistent order */
 	int i;
 
+	/*
+	 * No worry about a race with rebind_subsystems that might mess up the
+	 * locking order, since both parties are under cgroup_mutex.
+	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+		if (ss == NULL)
+			continue;
 		if (ss->root == root)
 			mutex_lock(&ss->hierarchy_mutex);
 	}
@@ -2931,6 +2967,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
 
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+		if (ss == NULL)
+			continue;
 		if (ss->root == root)
 			mutex_unlock(&ss->hierarchy_mutex);
 	}
@@ -3054,11 +3092,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	 * synchronization other than RCU, and the subsystem linked
 	 * list isn't RCU-safe */
 	int i;
+	/*
+	 * We won't need to lock the subsys array, because the subsystems
+	 * we're concerned about aren't going anywhere since our cgroup root
+	 * has a reference on them.
+	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		struct cgroup_subsys_state *css;
-		/* Skip subsystems not in this hierarchy */
-		if (ss->root != cgrp->root)
+		/* Skip subsystems not present or not in this hierarchy */
+		if (ss == NULL || ss->root != cgrp->root)
 			continue;
 		css = cgrp->subsys[ss->subsys_id];
 		/* When called from check_for_release() it's possible
@@ -3279,7 +3322,8 @@ int __init cgroup_init_early(void)
 	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
 		INIT_HLIST_HEAD(&css_set_table[i]);
 
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	/* at bootup time, we don't worry about modular subsystems */
+	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 
 		BUG_ON(!ss->name);
@@ -3314,7 +3358,8 @@ int __init cgroup_init(void)
 	if (err)
 		return err;
 
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	/* at bootup time, we don't worry about modular subsystems */
+	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		if (!ss->early_init)
 			cgroup_init_subsys(ss);
@@ -3423,9 +3468,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	int i;
 
 	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+	/*
+	 * ideally we don't want subsystems moving around while we do this.
+	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+	 * subsys/hierarchy state.
+	 */
 	mutex_lock(&cgroup_mutex);
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+		if (ss == NULL)
+			continue;
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->name, ss->root->hierarchy_id,
 			   ss->root->number_of_cgroups, !ss->disabled);
@@ -3483,7 +3535,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
 {
 	if (need_forkexit_callback) {
 		int i;
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		/*
+		 * forkexit callbacks are only supported for builtin
+		 * subsystems, and the builtin section of the subsys array is
+		 * immutable, so we don't need to lock the subsys array here.
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			if (ss->fork)
 				ss->fork(ss, child);
@@ -3552,7 +3609,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	struct css_set *cg;
 
 	if (run_callbacks && need_forkexit_callback) {
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		/*
+		 * modular subsystems can't use callbacks, so no need to lock
+		 * the subsys array
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			if (ss->exit)
 				ss->exit(ss, tsk);
@@ -3844,8 +3905,11 @@ static int __init cgroup_disable(char *str)
 	while ((token = strsep(&str, ",")) != NULL) {
 		if (!*token)
 			continue;
-
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		/*
+		 * cgroup_disable, being at boot time, can't know about module
+		 * subsystems, so we don't worry about them.
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 
 			if (!strcmp(token, ss->name)) {