aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ben Blum <bblum@andrew.cmu.edu> 2010-03-10 18:22:07 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-03-12 18:52:36 -0500
commit aae8aab40367036931608fdaf9e2dc568b516f19 (patch)
tree b2a06ee21042eb3972ecd9e4153d61a8f6ed53cb
parent d7b9fff711d5e8db8c844161c684017e556c38a0 (diff)
cgroups: revamp subsys array
This patch series provides the ability for cgroup subsystems to be compiled as modules both within and outside the kernel tree. This is mainly useful for classifiers and subsystems that hook into components that are already modules. cls_cgroup and blkio-cgroup serve as the example use cases for this feature.

It provides an interface cgroup_load_subsys() and cgroup_unload_subsys() which modular subsystems can use to register and depart during runtime. The net_cls classifier subsystem serves as the example for a subsystem which can be converted into a module using these changes.

Patch #1 sets up the subsys[] array so its contents can be dynamic as modules appear and (eventually) disappear. Iterations over the array are modified to handle when subsystems are absent, and the dynamic section of the array is protected by cgroup_mutex.

Patch #2 implements an interface for modules to load subsystems, called cgroup_load_subsys, similar to cgroup_init_subsys, and adds a module pointer in struct cgroup_subsys.

Patch #3 adds a mechanism for unloading modular subsystems, which includes a more advanced rework of the rudimentary reference counting introduced in patch 2.

Patch #4 modifies the net_cls subsystem, which already had some module declarations, to be configurable as a module, which also serves as a simple proof-of-concept.

Part of implementing patches 2 and 4 involved updating css pointers in each css_set when the module appears or leaves. In doing this, it was discovered that css_sets always remain linked to the dummy cgroup, regardless of whether or not any subsystems are actually bound to it (i.e., not mounted on an actual hierarchy). The subsystem loading and unloading code therefore should keep in mind the special cases where the added subsystem is the only one in the dummy cgroup (and therefore all css_sets need to be linked back into it) and where the removed subsys was the only one in the dummy cgroup (and therefore all css_sets should be unlinked from it) - however, as all css_sets always stay attached to the dummy cgroup anyway, these cases are ignored. Any fix that addresses this issue should also make sure these cases are addressed in the subsystem loading and unloading code.

This patch:

Make subsys[] able to be dynamically populated to support modular subsystems

This patch reworks the way the subsys[] array is used so that subsystems can register themselves after boot time, and enables the internals of cgroups to be able to handle when subsystems are not present or may appear/disappear.

Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/cgroup.h 10
-rw-r--r-- kernel/cgroup.c 96
2 files changed, 88 insertions, 18 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 14160b5b693f..28319a9fe569 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -40,13 +40,19 @@ extern int cgroupstats_build(struct cgroupstats *stats,
40 40
41extern const struct file_operations proc_cgroup_operations; 41extern const struct file_operations proc_cgroup_operations;
42 42
43/* Define the enumeration of all cgroup subsystems */ 43/* Define the enumeration of all builtin cgroup subsystems */
44#define SUBSYS(_x) _x ## _subsys_id, 44#define SUBSYS(_x) _x ## _subsys_id,
45enum cgroup_subsys_id { 45enum cgroup_subsys_id {
46#include <linux/cgroup_subsys.h> 46#include <linux/cgroup_subsys.h>
47 CGROUP_SUBSYS_COUNT 47 CGROUP_BUILTIN_SUBSYS_COUNT
48}; 48};
49#undef SUBSYS 49#undef SUBSYS
50/*
51 * This define indicates the maximum number of subsystems that can be loaded
52 * at once. We limit to this many since cgroupfs_root has subsys_bits to keep
53 * track of all of them.
54 */
55#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long))
50 56
51/* Per-subsystem/per-cgroup state maintained by the system. */ 57/* Per-subsystem/per-cgroup state maintained by the system. */
52struct cgroup_subsys_state { 58struct cgroup_subsys_state {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cace83ddbcdc..c92fb9549358 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,10 +57,14 @@
57 57
58static DEFINE_MUTEX(cgroup_mutex); 58static DEFINE_MUTEX(cgroup_mutex);
59 59
60/* Generate an array of cgroup subsystem pointers */ 60/*
61 * Generate an array of cgroup subsystem pointers. At boot time, this is
62 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
63 * registered after that. The mutable section of this array is protected by
64 * cgroup_mutex.
65 */
61#define SUBSYS(_x) &_x ## _subsys, 66#define SUBSYS(_x) &_x ## _subsys,
62 67static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
63static struct cgroup_subsys *subsys[] = {
64#include <linux/cgroup_subsys.h> 68#include <linux/cgroup_subsys.h>
65}; 69};
66 70
@@ -448,8 +452,11 @@ static struct css_set *find_existing_css_set(
448 struct hlist_node *node; 452 struct hlist_node *node;
449 struct css_set *cg; 453 struct css_set *cg;
450 454
451 /* Built the set of subsystem state objects that we want to 455 /*
452 * see in the new css_set */ 456 * Build the set of subsystem state objects that we want to see in the
457 * new css_set. while subsystems can change globally, the entries here
458 * won't change, so no need for locking.
459 */
453 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 460 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
454 if (root->subsys_bits & (1UL << i)) { 461 if (root->subsys_bits & (1UL << i)) {
455 /* Subsystem is in this hierarchy. So we want 462 /* Subsystem is in this hierarchy. So we want
@@ -884,7 +891,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
884 css_put(css); 891 css_put(css);
885} 892}
886 893
887 894/*
895 * Call with cgroup_mutex held.
896 */
888static int rebind_subsystems(struct cgroupfs_root *root, 897static int rebind_subsystems(struct cgroupfs_root *root,
889 unsigned long final_bits) 898 unsigned long final_bits)
890{ 899{
@@ -892,6 +901,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
892 struct cgroup *cgrp = &root->top_cgroup; 901 struct cgroup *cgrp = &root->top_cgroup;
893 int i; 902 int i;
894 903
904 BUG_ON(!mutex_is_locked(&cgroup_mutex));
905
895 removed_bits = root->actual_subsys_bits & ~final_bits; 906 removed_bits = root->actual_subsys_bits & ~final_bits;
896 added_bits = final_bits & ~root->actual_subsys_bits; 907 added_bits = final_bits & ~root->actual_subsys_bits;
897 /* Check that any added subsystems are currently free */ 908 /* Check that any added subsystems are currently free */
@@ -900,6 +911,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
900 struct cgroup_subsys *ss = subsys[i]; 911 struct cgroup_subsys *ss = subsys[i];
901 if (!(bit & added_bits)) 912 if (!(bit & added_bits))
902 continue; 913 continue;
914 /*
915 * Nobody should tell us to do a subsys that doesn't exist:
916 * parse_cgroupfs_options should catch that case and refcounts
917 * ensure that subsystems won't disappear once selected.
918 */
919 BUG_ON(ss == NULL);
903 if (ss->root != &rootnode) { 920 if (ss->root != &rootnode) {
904 /* Subsystem isn't free */ 921 /* Subsystem isn't free */
905 return -EBUSY; 922 return -EBUSY;
@@ -919,6 +936,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
919 unsigned long bit = 1UL << i; 936 unsigned long bit = 1UL << i;
920 if (bit & added_bits) { 937 if (bit & added_bits) {
921 /* We're binding this subsystem to this hierarchy */ 938 /* We're binding this subsystem to this hierarchy */
939 BUG_ON(ss == NULL);
922 BUG_ON(cgrp->subsys[i]); 940 BUG_ON(cgrp->subsys[i]);
923 BUG_ON(!dummytop->subsys[i]); 941 BUG_ON(!dummytop->subsys[i]);
924 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 942 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -932,6 +950,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
932 mutex_unlock(&ss->hierarchy_mutex); 950 mutex_unlock(&ss->hierarchy_mutex);
933 } else if (bit & removed_bits) { 951 } else if (bit & removed_bits) {
934 /* We're removing this subsystem */ 952 /* We're removing this subsystem */
953 BUG_ON(ss == NULL);
935 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 954 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
936 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 955 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
937 mutex_lock(&ss->hierarchy_mutex); 956 mutex_lock(&ss->hierarchy_mutex);
@@ -944,6 +963,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
944 mutex_unlock(&ss->hierarchy_mutex); 963 mutex_unlock(&ss->hierarchy_mutex);
945 } else if (bit & final_bits) { 964 } else if (bit & final_bits) {
946 /* Subsystem state should already exist */ 965 /* Subsystem state should already exist */
966 BUG_ON(ss == NULL);
947 BUG_ON(!cgrp->subsys[i]); 967 BUG_ON(!cgrp->subsys[i]);
948 } else { 968 } else {
949 /* Subsystem state shouldn't exist */ 969 /* Subsystem state shouldn't exist */
@@ -986,14 +1006,18 @@ struct cgroup_sb_opts {
986 1006
987}; 1007};
988 1008
989/* Convert a hierarchy specifier into a bitmask of subsystems and 1009/*
990 * flags. */ 1010 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
1011 * with cgroup_mutex held to protect the subsys[] array.
1012 */
991static int parse_cgroupfs_options(char *data, 1013static int parse_cgroupfs_options(char *data,
992 struct cgroup_sb_opts *opts) 1014 struct cgroup_sb_opts *opts)
993{ 1015{
994 char *token, *o = data ?: "all"; 1016 char *token, *o = data ?: "all";
995 unsigned long mask = (unsigned long)-1; 1017 unsigned long mask = (unsigned long)-1;
996 1018
1019 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1020
997#ifdef CONFIG_CPUSETS 1021#ifdef CONFIG_CPUSETS
998 mask = ~(1UL << cpuset_subsys_id); 1022 mask = ~(1UL << cpuset_subsys_id);
999#endif 1023#endif
@@ -1009,6 +1033,8 @@ static int parse_cgroupfs_options(char *data,
1009 opts->subsys_bits = 0; 1033 opts->subsys_bits = 0;
1010 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1034 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1011 struct cgroup_subsys *ss = subsys[i]; 1035 struct cgroup_subsys *ss = subsys[i];
1036 if (ss == NULL)
1037 continue;
1012 if (!ss->disabled) 1038 if (!ss->disabled)
1013 opts->subsys_bits |= 1ul << i; 1039 opts->subsys_bits |= 1ul << i;
1014 } 1040 }
@@ -1053,6 +1079,8 @@ static int parse_cgroupfs_options(char *data,
1053 int i; 1079 int i;
1054 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1080 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1055 ss = subsys[i]; 1081 ss = subsys[i];
1082 if (ss == NULL)
1083 continue;
1056 if (!strcmp(token, ss->name)) { 1084 if (!strcmp(token, ss->name)) {
1057 if (!ss->disabled) 1085 if (!ss->disabled)
1058 set_bit(i, &opts->subsys_bits); 1086 set_bit(i, &opts->subsys_bits);
@@ -1306,7 +1334,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1306 struct cgroupfs_root *new_root; 1334 struct cgroupfs_root *new_root;
1307 1335
1308 /* First find the desired set of subsystems */ 1336 /* First find the desired set of subsystems */
1337 mutex_lock(&cgroup_mutex);
1309 ret = parse_cgroupfs_options(data, &opts); 1338 ret = parse_cgroupfs_options(data, &opts);
1339 mutex_unlock(&cgroup_mutex);
1310 if (ret) 1340 if (ret)
1311 goto out_err; 1341 goto out_err;
1312 1342
@@ -2918,8 +2948,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2918 /* We need to take each hierarchy_mutex in a consistent order */ 2948 /* We need to take each hierarchy_mutex in a consistent order */
2919 int i; 2949 int i;
2920 2950
2951 /*
2952 * No worry about a race with rebind_subsystems that might mess up the
2953 * locking order, since both parties are under cgroup_mutex.
2954 */
2921 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2955 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2922 struct cgroup_subsys *ss = subsys[i]; 2956 struct cgroup_subsys *ss = subsys[i];
2957 if (ss == NULL)
2958 continue;
2923 if (ss->root == root) 2959 if (ss->root == root)
2924 mutex_lock(&ss->hierarchy_mutex); 2960 mutex_lock(&ss->hierarchy_mutex);
2925 } 2961 }
@@ -2931,6 +2967,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2931 2967
2932 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2968 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2933 struct cgroup_subsys *ss = subsys[i]; 2969 struct cgroup_subsys *ss = subsys[i];
2970 if (ss == NULL)
2971 continue;
2934 if (ss->root == root) 2972 if (ss->root == root)
2935 mutex_unlock(&ss->hierarchy_mutex); 2973 mutex_unlock(&ss->hierarchy_mutex);
2936 } 2974 }
@@ -3054,11 +3092,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3054 * synchronization other than RCU, and the subsystem linked 3092 * synchronization other than RCU, and the subsystem linked
3055 * list isn't RCU-safe */ 3093 * list isn't RCU-safe */
3056 int i; 3094 int i;
3095 /*
3096 * We won't need to lock the subsys array, because the subsystems
3097 * we're concerned about aren't going anywhere since our cgroup root
3098 * has a reference on them.
3099 */
3057 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3100 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3058 struct cgroup_subsys *ss = subsys[i]; 3101 struct cgroup_subsys *ss = subsys[i];
3059 struct cgroup_subsys_state *css; 3102 struct cgroup_subsys_state *css;
3060 /* Skip subsystems not in this hierarchy */ 3103 /* Skip subsystems not present or not in this hierarchy */
3061 if (ss->root != cgrp->root) 3104 if (ss == NULL || ss->root != cgrp->root)
3062 continue; 3105 continue;
3063 css = cgrp->subsys[ss->subsys_id]; 3106 css = cgrp->subsys[ss->subsys_id];
3064 /* When called from check_for_release() it's possible 3107 /* When called from check_for_release() it's possible
@@ -3279,7 +3322,8 @@ int __init cgroup_init_early(void)
3279 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3322 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3280 INIT_HLIST_HEAD(&css_set_table[i]); 3323 INIT_HLIST_HEAD(&css_set_table[i]);
3281 3324
3282 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3325 /* at bootup time, we don't worry about modular subsystems */
3326 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3283 struct cgroup_subsys *ss = subsys[i]; 3327 struct cgroup_subsys *ss = subsys[i];
3284 3328
3285 BUG_ON(!ss->name); 3329 BUG_ON(!ss->name);
@@ -3314,7 +3358,8 @@ int __init cgroup_init(void)
3314 if (err) 3358 if (err)
3315 return err; 3359 return err;
3316 3360
3317 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3361 /* at bootup time, we don't worry about modular subsystems */
3362 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3318 struct cgroup_subsys *ss = subsys[i]; 3363 struct cgroup_subsys *ss = subsys[i];
3319 if (!ss->early_init) 3364 if (!ss->early_init)
3320 cgroup_init_subsys(ss); 3365 cgroup_init_subsys(ss);
@@ -3423,9 +3468,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3423 int i; 3468 int i;
3424 3469
3425 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3470 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3471 /*
3472 * ideally we don't want subsystems moving around while we do this.
3473 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3474 * subsys/hierarchy state.
3475 */
3426 mutex_lock(&cgroup_mutex); 3476 mutex_lock(&cgroup_mutex);
3427 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3477 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3428 struct cgroup_subsys *ss = subsys[i]; 3478 struct cgroup_subsys *ss = subsys[i];
3479 if (ss == NULL)
3480 continue;
3429 seq_printf(m, "%s\t%d\t%d\t%d\n", 3481 seq_printf(m, "%s\t%d\t%d\t%d\n",
3430 ss->name, ss->root->hierarchy_id, 3482 ss->name, ss->root->hierarchy_id,
3431 ss->root->number_of_cgroups, !ss->disabled); 3483 ss->root->number_of_cgroups, !ss->disabled);
@@ -3483,7 +3535,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3483{ 3535{
3484 if (need_forkexit_callback) { 3536 if (need_forkexit_callback) {
3485 int i; 3537 int i;
3486 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3538 /*
3539 * forkexit callbacks are only supported for builtin
3540 * subsystems, and the builtin section of the subsys array is
3541 * immutable, so we don't need to lock the subsys array here.
3542 */
3543 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3487 struct cgroup_subsys *ss = subsys[i]; 3544 struct cgroup_subsys *ss = subsys[i];
3488 if (ss->fork) 3545 if (ss->fork)
3489 ss->fork(ss, child); 3546 ss->fork(ss, child);
@@ -3552,7 +3609,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3552 struct css_set *cg; 3609 struct css_set *cg;
3553 3610
3554 if (run_callbacks && need_forkexit_callback) { 3611 if (run_callbacks && need_forkexit_callback) {
3555 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3612 /*
3613 * modular subsystems can't use callbacks, so no need to lock
3614 * the subsys array
3615 */
3616 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3556 struct cgroup_subsys *ss = subsys[i]; 3617 struct cgroup_subsys *ss = subsys[i];
3557 if (ss->exit) 3618 if (ss->exit)
3558 ss->exit(ss, tsk); 3619 ss->exit(ss, tsk);
@@ -3844,8 +3905,11 @@ static int __init cgroup_disable(char *str)
3844 while ((token = strsep(&str, ",")) != NULL) { 3905 while ((token = strsep(&str, ",")) != NULL) {
3845 if (!*token) 3906 if (!*token)
3846 continue; 3907 continue;
3847 3908 /*
3848 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3909 * cgroup_disable, being at boot time, can't know about module
3910 * subsystems, so we don't worry about them.
3911 */
3912 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3849 struct cgroup_subsys *ss = subsys[i]; 3913 struct cgroup_subsys *ss = subsys[i];
3850 3914
3851 if (!strcmp(token, ss->name)) { 3915 if (!strcmp(token, ss->name)) {