Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c  730
1 file changed, 672 insertions, 58 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1fbcc748044a..3a53c771e503 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -43,6 +47,7 @@
43#include <linux/string.h> 47#include <linux/string.h>
44#include <linux/sort.h> 48#include <linux/sort.h>
45#include <linux/kmod.h> 49#include <linux/kmod.h>
50#include <linux/module.h>
46#include <linux/delayacct.h> 51#include <linux/delayacct.h>
47#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
48#include <linux/hash.h> 53#include <linux/hash.h>
@@ -51,15 +56,21 @@
51#include <linux/pid_namespace.h> 56#include <linux/pid_namespace.h>
52#include <linux/idr.h> 57#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
54 61
55#include <asm/atomic.h> 62#include <asm/atomic.h>
56 63
57static DEFINE_MUTEX(cgroup_mutex); 64static DEFINE_MUTEX(cgroup_mutex);
58 65
59/* Generate an array of cgroup subsystem pointers */ 66/*
67 * Generate an array of cgroup subsystem pointers. At boot time, this is
68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
69 * registered after that. The mutable section of this array is protected by
70 * cgroup_mutex.
71 */
60#define SUBSYS(_x) &_x ## _subsys, 72#define SUBSYS(_x) &_x ## _subsys,
61 73static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
62static struct cgroup_subsys *subsys[] = {
63#include <linux/cgroup_subsys.h> 74#include <linux/cgroup_subsys.h>
64}; 75};
65 76
@@ -146,6 +157,35 @@ struct css_id {
146 unsigned short stack[0]; /* Array of Length (depth+1) */ 157 unsigned short stack[0]; /* Array of Length (depth+1) */
147}; 158};
148 159
160/*
 161 * cgroup_event represents events which userspace wants to receive.
162 */
163struct cgroup_event {
164 /*
165 * Cgroup which the event belongs to.
166 */
167 struct cgroup *cgrp;
168 /*
 170 * Control file with which the event is associated.
170 */
171 struct cftype *cft;
172 /*
173 * eventfd to signal userspace about the event.
174 */
175 struct eventfd_ctx *eventfd;
176 /*
 178 * Each of these is stored in a list by the cgroup.
178 */
179 struct list_head list;
180 /*
 181 * All fields below are needed to unregister the event when
 182 * userspace closes the eventfd.
183 */
184 poll_table pt;
185 wait_queue_head_t *wqh;
186 wait_queue_t wait;
187 struct work_struct remove;
188};
149 189
150/* The list of hierarchy roots */ 190/* The list of hierarchy roots */
151 191
@@ -166,6 +206,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 206 */
167static int need_forkexit_callback __read_mostly; 207static int need_forkexit_callback __read_mostly;
168 208
209#ifdef CONFIG_PROVE_LOCKING
210int cgroup_lock_is_held(void)
211{
212 return lockdep_is_held(&cgroup_mutex);
213}
214#else /* #ifdef CONFIG_PROVE_LOCKING */
215int cgroup_lock_is_held(void)
216{
217 return mutex_is_locked(&cgroup_mutex);
218}
219#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
220
221EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
222
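The helper above lets lockdep-aware code assert "rcu_read_lock() or cgroup_mutex held" when dereferencing RCU-protected cgroup pointers. A minimal caller-side sketch, mirroring the cgroup_path() change later in this patch (cgrp is assumed to be a valid struct cgroup *):

	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
						      rcu_read_lock_held() ||
						      cgroup_lock_is_held());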
169/* convenient tests for these bits */ 223/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 224inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 225{
@@ -235,7 +289,8 @@ struct cg_cgroup_link {
235static struct css_set init_css_set; 289static struct css_set init_css_set;
236static struct cg_cgroup_link init_css_set_link; 290static struct cg_cgroup_link init_css_set_link;
237 291
238static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); 292static int cgroup_init_idr(struct cgroup_subsys *ss,
293 struct cgroup_subsys_state *css);
239 294
240/* css_set_lock protects the list of css_set objects, and the 295/* css_set_lock protects the list of css_set objects, and the
241 * chain of tasks off each css_set. Nests outside task->alloc_lock 296 * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -433,8 +488,11 @@ static struct css_set *find_existing_css_set(
433 struct hlist_node *node; 488 struct hlist_node *node;
434 struct css_set *cg; 489 struct css_set *cg;
435 490
436 /* Built the set of subsystem state objects that we want to 491 /*
437 * see in the new css_set */ 492 * Build the set of subsystem state objects that we want to see in the
 493 * new css_set. While subsystems can change globally, the entries here
494 * won't change, so no need for locking.
495 */
438 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 496 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
439 if (root->subsys_bits & (1UL << i)) { 497 if (root->subsys_bits & (1UL << i)) {
440 /* Subsystem is in this hierarchy. So we want 498 /* Subsystem is in this hierarchy. So we want
@@ -681,6 +739,7 @@ void cgroup_lock(void)
681{ 739{
682 mutex_lock(&cgroup_mutex); 740 mutex_lock(&cgroup_mutex);
683} 741}
742EXPORT_SYMBOL_GPL(cgroup_lock);
684 743
685/** 744/**
686 * cgroup_unlock - release lock on cgroup changes 745 * cgroup_unlock - release lock on cgroup changes
@@ -691,6 +750,7 @@ void cgroup_unlock(void)
691{ 750{
692 mutex_unlock(&cgroup_mutex); 751 mutex_unlock(&cgroup_mutex);
693} 752}
753EXPORT_SYMBOL_GPL(cgroup_unlock);
694 754
695/* 755/*
696 * A couple of forward declarations required, due to cyclic reference loop: 756 * A couple of forward declarations required, due to cyclic reference loop:
@@ -742,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
742 if (ret) 802 if (ret)
743 break; 803 break;
744 } 804 }
805
745 return ret; 806 return ret;
746} 807}
747 808
@@ -869,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
869 css_put(css); 930 css_put(css);
870} 931}
871 932
872 933/*
934 * Call with cgroup_mutex held. Drops reference counts on modules, including
935 * any duplicate ones that parse_cgroupfs_options took. If this function
936 * returns an error, no reference counts are touched.
937 */
873static int rebind_subsystems(struct cgroupfs_root *root, 938static int rebind_subsystems(struct cgroupfs_root *root,
874 unsigned long final_bits) 939 unsigned long final_bits)
875{ 940{
@@ -877,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
877 struct cgroup *cgrp = &root->top_cgroup; 942 struct cgroup *cgrp = &root->top_cgroup;
878 int i; 943 int i;
879 944
945 BUG_ON(!mutex_is_locked(&cgroup_mutex));
946
880 removed_bits = root->actual_subsys_bits & ~final_bits; 947 removed_bits = root->actual_subsys_bits & ~final_bits;
881 added_bits = final_bits & ~root->actual_subsys_bits; 948 added_bits = final_bits & ~root->actual_subsys_bits;
882 /* Check that any added subsystems are currently free */ 949 /* Check that any added subsystems are currently free */
@@ -885,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
885 struct cgroup_subsys *ss = subsys[i]; 952 struct cgroup_subsys *ss = subsys[i];
886 if (!(bit & added_bits)) 953 if (!(bit & added_bits))
887 continue; 954 continue;
955 /*
956 * Nobody should tell us to do a subsys that doesn't exist:
957 * parse_cgroupfs_options should catch that case and refcounts
958 * ensure that subsystems won't disappear once selected.
959 */
960 BUG_ON(ss == NULL);
888 if (ss->root != &rootnode) { 961 if (ss->root != &rootnode) {
889 /* Subsystem isn't free */ 962 /* Subsystem isn't free */
890 return -EBUSY; 963 return -EBUSY;
@@ -904,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
904 unsigned long bit = 1UL << i; 977 unsigned long bit = 1UL << i;
905 if (bit & added_bits) { 978 if (bit & added_bits) {
906 /* We're binding this subsystem to this hierarchy */ 979 /* We're binding this subsystem to this hierarchy */
980 BUG_ON(ss == NULL);
907 BUG_ON(cgrp->subsys[i]); 981 BUG_ON(cgrp->subsys[i]);
908 BUG_ON(!dummytop->subsys[i]); 982 BUG_ON(!dummytop->subsys[i]);
909 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 983 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -915,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
915 if (ss->bind) 989 if (ss->bind)
916 ss->bind(ss, cgrp); 990 ss->bind(ss, cgrp);
917 mutex_unlock(&ss->hierarchy_mutex); 991 mutex_unlock(&ss->hierarchy_mutex);
992 /* refcount was already taken, and we're keeping it */
918 } else if (bit & removed_bits) { 993 } else if (bit & removed_bits) {
919 /* We're removing this subsystem */ 994 /* We're removing this subsystem */
995 BUG_ON(ss == NULL);
920 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 996 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
921 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 997 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
922 mutex_lock(&ss->hierarchy_mutex); 998 mutex_lock(&ss->hierarchy_mutex);
@@ -927,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
927 subsys[i]->root = &rootnode; 1003 subsys[i]->root = &rootnode;
928 list_move(&ss->sibling, &rootnode.subsys_list); 1004 list_move(&ss->sibling, &rootnode.subsys_list);
929 mutex_unlock(&ss->hierarchy_mutex); 1005 mutex_unlock(&ss->hierarchy_mutex);
1006 /* subsystem is now free - drop reference on module */
1007 module_put(ss->module);
930 } else if (bit & final_bits) { 1008 } else if (bit & final_bits) {
931 /* Subsystem state should already exist */ 1009 /* Subsystem state should already exist */
1010 BUG_ON(ss == NULL);
932 BUG_ON(!cgrp->subsys[i]); 1011 BUG_ON(!cgrp->subsys[i]);
1012 /*
1013 * a refcount was taken, but we already had one, so
1014 * drop the extra reference.
1015 */
1016 module_put(ss->module);
1017#ifdef CONFIG_MODULE_UNLOAD
1018 BUG_ON(ss->module && !module_refcount(ss->module));
1019#endif
933 } else { 1020 } else {
934 /* Subsystem state shouldn't exist */ 1021 /* Subsystem state shouldn't exist */
935 BUG_ON(cgrp->subsys[i]); 1022 BUG_ON(cgrp->subsys[i]);
@@ -971,13 +1058,20 @@ struct cgroup_sb_opts {
971 1058
972}; 1059};
973 1060
974/* Convert a hierarchy specifier into a bitmask of subsystems and 1061/*
975 * flags. */ 1062 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
976static int parse_cgroupfs_options(char *data, 1063 * with cgroup_mutex held to protect the subsys[] array. This function takes
977 struct cgroup_sb_opts *opts) 1064 * refcounts on subsystems to be used, unless it returns error, in which case
1065 * no refcounts are taken.
1066 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
978{ 1068{
979 char *token, *o = data ?: "all"; 1069 char *token, *o = data ?: "all";
980 unsigned long mask = (unsigned long)-1; 1070 unsigned long mask = (unsigned long)-1;
1071 int i;
1072 bool module_pin_failed = false;
1073
1074 BUG_ON(!mutex_is_locked(&cgroup_mutex));
981 1075
982#ifdef CONFIG_CPUSETS 1076#ifdef CONFIG_CPUSETS
983 mask = ~(1UL << cpuset_subsys_id); 1077 mask = ~(1UL << cpuset_subsys_id);
@@ -990,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
990 return -EINVAL; 1084 return -EINVAL;
991 if (!strcmp(token, "all")) { 1085 if (!strcmp(token, "all")) {
992 /* Add all non-disabled subsystems */ 1086 /* Add all non-disabled subsystems */
993 int i;
994 opts->subsys_bits = 0; 1087 opts->subsys_bits = 0;
995 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
996 struct cgroup_subsys *ss = subsys[i]; 1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
997 if (!ss->disabled) 1092 if (!ss->disabled)
998 opts->subsys_bits |= 1ul << i; 1093 opts->subsys_bits |= 1ul << i;
999 } 1094 }
@@ -1011,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
1011 if (!opts->release_agent) 1106 if (!opts->release_agent)
1012 return -ENOMEM; 1107 return -ENOMEM;
1013 } else if (!strncmp(token, "name=", 5)) { 1108 } else if (!strncmp(token, "name=", 5)) {
1014 int i;
1015 const char *name = token + 5; 1109 const char *name = token + 5;
1016 /* Can't specify an empty name */ 1110 /* Can't specify an empty name */
1017 if (!strlen(name)) 1111 if (!strlen(name))
@@ -1035,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
1035 return -ENOMEM; 1129 return -ENOMEM;
1036 } else { 1130 } else {
1037 struct cgroup_subsys *ss; 1131 struct cgroup_subsys *ss;
1038 int i;
1039 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1040 ss = subsys[i]; 1133 ss = subsys[i];
1134 if (ss == NULL)
1135 continue;
1041 if (!strcmp(token, ss->name)) { 1136 if (!strcmp(token, ss->name)) {
1042 if (!ss->disabled) 1137 if (!ss->disabled)
1043 set_bit(i, &opts->subsys_bits); 1138 set_bit(i, &opts->subsys_bits);
@@ -1072,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
1072 if (!opts->subsys_bits && !opts->name) 1167 if (!opts->subsys_bits && !opts->name)
1073 return -EINVAL; 1168 return -EINVAL;
1074 1169
1170 /*
1171 * Grab references on all the modules we'll need, so the subsystems
1172 * don't dance around before rebind_subsystems attaches them. This may
1173 * take duplicate reference counts on a subsystem that's already used,
1174 * but rebind_subsystems handles this case.
1175 */
1176 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1177 unsigned long bit = 1UL << i;
1178
1179 if (!(bit & opts->subsys_bits))
1180 continue;
1181 if (!try_module_get(subsys[i]->module)) {
1182 module_pin_failed = true;
1183 break;
1184 }
1185 }
1186 if (module_pin_failed) {
1187 /*
 1188 * Oops, one of the modules was going away. This means that we
 1189 * raced with a module_delete call, and to the user this is
 1190 * essentially a "subsystem doesn't exist" case.
1191 */
1192 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1193 /* drop refcounts only on the ones we took */
1194 unsigned long bit = 1UL << i;
1195
1196 if (!(bit & opts->subsys_bits))
1197 continue;
1198 module_put(subsys[i]->module);
1199 }
1200 return -ENOENT;
1201 }
1202
1075 return 0; 1203 return 0;
1076} 1204}
1077 1205
1206static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1207{
1208 int i;
1209 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1210 unsigned long bit = 1UL << i;
1211
1212 if (!(bit & subsys_bits))
1213 continue;
1214 module_put(subsys[i]->module);
1215 }
1216}
1217
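parse_cgroupfs_options() pins modules with an all-or-nothing idiom: take a reference on every module in the requested set, and on the first failure unwind exactly the references already taken; drop_parsed_module_refcounts() above is the matching bulk release. A standalone sketch of the idiom, with an illustrative helper name that is not part of this patch:

	#include <linux/errno.h>
	#include <linux/module.h>

	static int pin_modules(struct module **mods, int n)
	{
		int i;

		for (i = 0; i < n; i++) {
			if (!try_module_get(mods[i])) {
				/* raced with module removal - unwind what we took */
				while (--i >= 0)
					module_put(mods[i]);
				return -ENOENT;
			}
		}
		return 0;
	}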
1078static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1218static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1079{ 1219{
1080 int ret = 0; 1220 int ret = 0;
@@ -1091,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1091 if (ret) 1231 if (ret)
1092 goto out_unlock; 1232 goto out_unlock;
1093 1233
1094 /* Don't allow flags to change at remount */ 1234 /* Don't allow flags or name to change at remount */
1095 if (opts.flags != root->flags) { 1235 if (opts.flags != root->flags ||
1096 ret = -EINVAL; 1236 (opts.name && strcmp(opts.name, root->name))) {
1097 goto out_unlock;
1098 }
1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL; 1237 ret = -EINVAL;
1238 drop_parsed_module_refcounts(opts.subsys_bits);
1103 goto out_unlock; 1239 goto out_unlock;
1104 } 1240 }
1105 1241
1106 ret = rebind_subsystems(root, opts.subsys_bits); 1242 ret = rebind_subsystems(root, opts.subsys_bits);
1107 if (ret) 1243 if (ret) {
1244 drop_parsed_module_refcounts(opts.subsys_bits);
1108 goto out_unlock; 1245 goto out_unlock;
1246 }
1109 1247
1110 /* (re)populate subsystem files */ 1248 /* (re)populate subsystem files */
1111 cgroup_populate_dir(cgrp); 1249 cgroup_populate_dir(cgrp);
@@ -1136,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1136 INIT_LIST_HEAD(&cgrp->release_list); 1274 INIT_LIST_HEAD(&cgrp->release_list);
1137 INIT_LIST_HEAD(&cgrp->pidlists); 1275 INIT_LIST_HEAD(&cgrp->pidlists);
1138 mutex_init(&cgrp->pidlist_mutex); 1276 mutex_init(&cgrp->pidlist_mutex);
1277 INIT_LIST_HEAD(&cgrp->event_list);
1278 spin_lock_init(&cgrp->event_list_lock);
1139} 1279}
1140 1280
1141static void init_cgroup_root(struct cgroupfs_root *root) 1281static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1291,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1291 struct cgroupfs_root *new_root; 1431 struct cgroupfs_root *new_root;
1292 1432
1293 /* First find the desired set of subsystems */ 1433 /* First find the desired set of subsystems */
1434 mutex_lock(&cgroup_mutex);
1294 ret = parse_cgroupfs_options(data, &opts); 1435 ret = parse_cgroupfs_options(data, &opts);
1436 mutex_unlock(&cgroup_mutex);
1295 if (ret) 1437 if (ret)
1296 goto out_err; 1438 goto out_err;
1297 1439
@@ -1302,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1302 new_root = cgroup_root_from_opts(&opts); 1444 new_root = cgroup_root_from_opts(&opts);
1303 if (IS_ERR(new_root)) { 1445 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root); 1446 ret = PTR_ERR(new_root);
1305 goto out_err; 1447 goto drop_modules;
1306 } 1448 }
1307 opts.new_root = new_root; 1449 opts.new_root = new_root;
1308 1450
@@ -1311,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1311 if (IS_ERR(sb)) { 1453 if (IS_ERR(sb)) {
1312 ret = PTR_ERR(sb); 1454 ret = PTR_ERR(sb);
1313 cgroup_drop_root(opts.new_root); 1455 cgroup_drop_root(opts.new_root);
1314 goto out_err; 1456 goto drop_modules;
1315 } 1457 }
1316 1458
1317 root = sb->s_fs_info; 1459 root = sb->s_fs_info;
@@ -1367,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1367 free_cg_links(&tmp_cg_links); 1509 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super; 1510 goto drop_new_super;
1369 } 1511 }
1512 /*
1513 * There must be no failure case after here, since rebinding
1514 * takes care of subsystems' refcounts, which are explicitly
1515 * dropped in the failure exit path.
1516 */
1370 1517
1371 /* EBUSY should be the only error here */ 1518 /* EBUSY should be the only error here */
1372 BUG_ON(ret); 1519 BUG_ON(ret);
@@ -1405,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1405 * any) is not needed 1552 * any) is not needed
1406 */ 1553 */
1407 cgroup_drop_root(opts.new_root); 1554 cgroup_drop_root(opts.new_root);
1555 /* no subsys rebinding, so refcounts don't change */
1556 drop_parsed_module_refcounts(opts.subsys_bits);
1408 } 1557 }
1409 1558
1410 simple_set_mnt(mnt, sb); 1559 simple_set_mnt(mnt, sb);
@@ -1414,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1414 1563
1415 drop_new_super: 1564 drop_new_super:
1416 deactivate_locked_super(sb); 1565 deactivate_locked_super(sb);
1566 drop_modules:
1567 drop_parsed_module_refcounts(opts.subsys_bits);
1417 out_err: 1568 out_err:
1418 kfree(opts.release_agent); 1569 kfree(opts.release_agent);
1419 kfree(opts.name); 1570 kfree(opts.name);
@@ -1495,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1495int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1496{ 1647{
1497 char *start; 1648 char *start;
1498 struct dentry *dentry = rcu_dereference(cgrp->dentry); 1649 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1650 rcu_read_lock_held() ||
1651 cgroup_lock_is_held());
1499 1652
1500 if (!dentry || cgrp == dummytop) { 1653 if (!dentry || cgrp == dummytop) {
1501 /* 1654 /*
@@ -1511,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1511 *--start = '\0'; 1664 *--start = '\0';
1512 for (;;) { 1665 for (;;) {
1513 int len = dentry->d_name.len; 1666 int len = dentry->d_name.len;
1667
1514 if ((start -= len) < buf) 1668 if ((start -= len) < buf)
1515 return -ENAMETOOLONG; 1669 return -ENAMETOOLONG;
1516 memcpy(start, cgrp->dentry->d_name.name, len); 1670 memcpy(start, dentry->d_name.name, len);
1517 cgrp = cgrp->parent; 1671 cgrp = cgrp->parent;
1518 if (!cgrp) 1672 if (!cgrp)
1519 break; 1673 break;
1520 dentry = rcu_dereference(cgrp->dentry); 1674
1675 dentry = rcu_dereference_check(cgrp->dentry,
1676 rcu_read_lock_held() ||
1677 cgroup_lock_is_held());
1521 if (!cgrp->parent) 1678 if (!cgrp->parent)
1522 continue; 1679 continue;
1523 if (--start < buf) 1680 if (--start < buf)
@@ -1527,6 +1684,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1527 memmove(buf, start, buf + buflen - start); 1684 memmove(buf, start, buf + buflen - start);
1528 return 0; 1685 return 0;
1529} 1686}
1687EXPORT_SYMBOL_GPL(cgroup_path);
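With cgroup_path() exported and its dereferences annotated via rcu_dereference_check(), a modular subsystem can generate paths itself. A hedged usage sketch (the caller name and message are illustrative): the caller must hold rcu_read_lock() or cgroup_mutex across the call, and cgroup_path() returns 0 on success:

	#include <linux/cgroup.h>
	#include <linux/limits.h>
	#include <linux/rcupdate.h>

	static void foo_report_cgroup(struct cgroup *cgrp)
	{
		/* PATH_MAX on the stack is fine for a sketch; real code would kmalloc */
		char buf[PATH_MAX];

		rcu_read_lock();
		if (cgroup_path(cgrp, buf, PATH_MAX) == 0)
			pr_debug("cgroup path: %s\n", buf);
		rcu_read_unlock();
	}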
1530 1688
1531/** 1689/**
1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1690 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1539,7 +1697,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1539int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1697int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1540{ 1698{
1541 int retval = 0; 1699 int retval = 0;
1542 struct cgroup_subsys *ss; 1700 struct cgroup_subsys *ss, *failed_ss = NULL;
1543 struct cgroup *oldcgrp; 1701 struct cgroup *oldcgrp;
1544 struct css_set *cg; 1702 struct css_set *cg;
1545 struct css_set *newcg; 1703 struct css_set *newcg;
@@ -1553,8 +1711,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1553 for_each_subsys(root, ss) { 1711 for_each_subsys(root, ss) {
1554 if (ss->can_attach) { 1712 if (ss->can_attach) {
1555 retval = ss->can_attach(ss, cgrp, tsk, false); 1713 retval = ss->can_attach(ss, cgrp, tsk, false);
1556 if (retval) 1714 if (retval) {
1557 return retval; 1715 /*
1716 * Remember on which subsystem the can_attach()
1717 * failed, so that we only call cancel_attach()
1718 * against the subsystems whose can_attach()
1719 * succeeded. (See below)
1720 */
1721 failed_ss = ss;
1722 goto out;
1723 }
1558 } 1724 }
1559 } 1725 }
1560 1726
@@ -1568,14 +1734,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1568 */ 1734 */
1569 newcg = find_css_set(cg, cgrp); 1735 newcg = find_css_set(cg, cgrp);
1570 put_css_set(cg); 1736 put_css_set(cg);
1571 if (!newcg) 1737 if (!newcg) {
1572 return -ENOMEM; 1738 retval = -ENOMEM;
1739 goto out;
1740 }
1573 1741
1574 task_lock(tsk); 1742 task_lock(tsk);
1575 if (tsk->flags & PF_EXITING) { 1743 if (tsk->flags & PF_EXITING) {
1576 task_unlock(tsk); 1744 task_unlock(tsk);
1577 put_css_set(newcg); 1745 put_css_set(newcg);
1578 return -ESRCH; 1746 retval = -ESRCH;
1747 goto out;
1579 } 1748 }
1580 rcu_assign_pointer(tsk->cgroups, newcg); 1749 rcu_assign_pointer(tsk->cgroups, newcg);
1581 task_unlock(tsk); 1750 task_unlock(tsk);
@@ -1601,7 +1770,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1601 * is no longer empty. 1770 * is no longer empty.
1602 */ 1771 */
1603 cgroup_wakeup_rmdir_waiter(cgrp); 1772 cgroup_wakeup_rmdir_waiter(cgrp);
1604 return 0; 1773out:
1774 if (retval) {
1775 for_each_subsys(root, ss) {
1776 if (ss == failed_ss)
1777 /*
1778 * This subsystem was the one that failed the
1779 * can_attach() check earlier, so we don't need
1780 * to call cancel_attach() against it or any
1781 * remaining subsystems.
1782 */
1783 break;
1784 if (ss->cancel_attach)
1785 ss->cancel_attach(ss, cgrp, tsk, false);
1786 }
1787 }
1788 return retval;
1605} 1789}
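The rollback loop relies on a contract between the two callbacks: whatever can_attach() reserves, cancel_attach() must release, and cancel_attach() is only invoked for subsystems whose can_attach() already succeeded. A self-contained sketch of a hypothetical subsystem honoring that contract (the 'foo' slot counter is invented for illustration):

	#include <linux/cgroup.h>
	#include <linux/errno.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(foo_lock);
	static int foo_free_slots = 8;	/* arbitrary capacity for the example */

	static int foo_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
				  struct task_struct *tsk, bool threadgroup)
	{
		int ret = 0;

		spin_lock(&foo_lock);
		if (foo_free_slots > 0)
			foo_free_slots--;	/* reserve a slot for tsk */
		else
			ret = -ENOSPC;
		spin_unlock(&foo_lock);
		return ret;
	}

	static void foo_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
				      struct task_struct *tsk, bool threadgroup)
	{
		spin_lock(&foo_lock);
		foo_free_slots++;	/* undo exactly what can_attach() reserved */
		spin_unlock(&foo_lock);
	}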
1606 1790
1607/* 1791/*
@@ -1667,6 +1851,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
1667 } 1851 }
1668 return true; 1852 return true;
1669} 1853}
1854EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
1670 1855
1671static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 1856static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1672 const char *buffer) 1857 const char *buffer)
@@ -1935,6 +2120,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
1935 .rename = cgroup_rename, 2120 .rename = cgroup_rename,
1936}; 2121};
1937 2122
2123/*
2124 * Check if a file is a control file
2125 */
2126static inline struct cftype *__file_cft(struct file *file)
2127{
2128 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2129 return ERR_PTR(-EINVAL);
2130 return __d_cft(file->f_dentry);
2131}
2132
1938static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2133static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1939 struct super_block *sb) 2134 struct super_block *sb)
1940{ 2135{
@@ -2054,6 +2249,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2054 error = PTR_ERR(dentry); 2249 error = PTR_ERR(dentry);
2055 return error; 2250 return error;
2056} 2251}
2252EXPORT_SYMBOL_GPL(cgroup_add_file);
2057 2253
2058int cgroup_add_files(struct cgroup *cgrp, 2254int cgroup_add_files(struct cgroup *cgrp,
2059 struct cgroup_subsys *subsys, 2255 struct cgroup_subsys *subsys,
@@ -2068,6 +2264,7 @@ int cgroup_add_files(struct cgroup *cgrp,
2068 } 2264 }
2069 return 0; 2265 return 0;
2070} 2266}
2267EXPORT_SYMBOL_GPL(cgroup_add_files);
2071 2268
2072/** 2269/**
2073 * cgroup_task_count - count the number of tasks in a cgroup. 2270 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2453,7 +2650,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2453{ 2650{
2454 struct cgroup_pidlist *l; 2651 struct cgroup_pidlist *l;
2455 /* don't need task_nsproxy() if we're looking at ourself */ 2652 /* don't need task_nsproxy() if we're looking at ourself */
2456 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); 2653 struct pid_namespace *ns = current->nsproxy->pid_ns;
2654
2457 /* 2655 /*
2458 * We can't drop the pidlist_mutex before taking the l->mutex in case 2656 * We can't drop the pidlist_mutex before taking the l->mutex in case
2459 * the last ref-holder is trying to remove l from the list at the same 2657 * the last ref-holder is trying to remove l from the list at the same
@@ -2463,8 +2661,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2463 mutex_lock(&cgrp->pidlist_mutex); 2661 mutex_lock(&cgrp->pidlist_mutex);
2464 list_for_each_entry(l, &cgrp->pidlists, links) { 2662 list_for_each_entry(l, &cgrp->pidlists, links) {
2465 if (l->key.type == type && l->key.ns == ns) { 2663 if (l->key.type == type && l->key.ns == ns) {
2466 /* found a matching list - drop the extra refcount */
2467 put_pid_ns(ns);
2468 /* make sure l doesn't vanish out from under us */ 2664 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2665 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2666 mutex_unlock(&cgrp->pidlist_mutex);
@@ -2475,13 +2671,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2475 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 2671 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2476 if (!l) { 2672 if (!l) {
2477 mutex_unlock(&cgrp->pidlist_mutex); 2673 mutex_unlock(&cgrp->pidlist_mutex);
2478 put_pid_ns(ns);
2479 return l; 2674 return l;
2480 } 2675 }
2481 init_rwsem(&l->mutex); 2676 init_rwsem(&l->mutex);
2482 down_write(&l->mutex); 2677 down_write(&l->mutex);
2483 l->key.type = type; 2678 l->key.type = type;
2484 l->key.ns = ns; 2679 l->key.ns = get_pid_ns(ns);
2485 l->use_count = 0; /* don't increment here */ 2680 l->use_count = 0; /* don't increment here */
2486 l->list = NULL; 2681 l->list = NULL;
2487 l->owner = cgrp; 2682 l->owner = cgrp;
@@ -2789,6 +2984,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2789} 2984}
2790 2985
2791/* 2986/*
2987 * Unregister event and free resources.
2988 *
 2990 * Gets called from a workqueue.
2990 */
2991static void cgroup_event_remove(struct work_struct *work)
2992{
2993 struct cgroup_event *event = container_of(work, struct cgroup_event,
2994 remove);
2995 struct cgroup *cgrp = event->cgrp;
2996
2997 /* TODO: check return code */
2998 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2999
3000 eventfd_ctx_put(event->eventfd);
3001 kfree(event);
3002 dput(cgrp->dentry);
3003}
3004
3005/*
 3006 * Gets called on POLLHUP on the eventfd when the user closes it.
3007 *
3008 * Called with wqh->lock held and interrupts disabled.
3009 */
3010static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3011 int sync, void *key)
3012{
3013 struct cgroup_event *event = container_of(wait,
3014 struct cgroup_event, wait);
3015 struct cgroup *cgrp = event->cgrp;
3016 unsigned long flags = (unsigned long)key;
3017
3018 if (flags & POLLHUP) {
3019 remove_wait_queue_locked(event->wqh, &event->wait);
3020 spin_lock(&cgrp->event_list_lock);
3021 list_del(&event->list);
3022 spin_unlock(&cgrp->event_list_lock);
3023 /*
3024 * We are in atomic context, but cgroup_event_remove() may
 3025 * sleep, so we have to defer it to a workqueue.
3026 */
3027 schedule_work(&event->remove);
3028 }
3029
3030 return 0;
3031}
3032
3033static void cgroup_event_ptable_queue_proc(struct file *file,
3034 wait_queue_head_t *wqh, poll_table *pt)
3035{
3036 struct cgroup_event *event = container_of(pt,
3037 struct cgroup_event, pt);
3038
3039 event->wqh = wqh;
3040 add_wait_queue(wqh, &event->wait);
3041}
3042
3043/*
3044 * Parse input and register new cgroup event handler.
3045 *
 3046 * Input must be in the format '<event_fd> <control_fd> <args>'.
3047 * Interpretation of args is defined by control file implementation.
3048 */
3049static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3050 const char *buffer)
3051{
3052 struct cgroup_event *event = NULL;
3053 unsigned int efd, cfd;
3054 struct file *efile = NULL;
3055 struct file *cfile = NULL;
3056 char *endp;
3057 int ret;
3058
3059 efd = simple_strtoul(buffer, &endp, 10);
3060 if (*endp != ' ')
3061 return -EINVAL;
3062 buffer = endp + 1;
3063
3064 cfd = simple_strtoul(buffer, &endp, 10);
3065 if ((*endp != ' ') && (*endp != '\0'))
3066 return -EINVAL;
3067 buffer = endp + 1;
3068
3069 event = kzalloc(sizeof(*event), GFP_KERNEL);
3070 if (!event)
3071 return -ENOMEM;
3072 event->cgrp = cgrp;
3073 INIT_LIST_HEAD(&event->list);
3074 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3075 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3076 INIT_WORK(&event->remove, cgroup_event_remove);
3077
3078 efile = eventfd_fget(efd);
3079 if (IS_ERR(efile)) {
3080 ret = PTR_ERR(efile);
3081 goto fail;
3082 }
3083
3084 event->eventfd = eventfd_ctx_fileget(efile);
3085 if (IS_ERR(event->eventfd)) {
3086 ret = PTR_ERR(event->eventfd);
3087 goto fail;
3088 }
3089
3090 cfile = fget(cfd);
3091 if (!cfile) {
3092 ret = -EBADF;
3093 goto fail;
3094 }
3095
 3096 /* the process needs read permission on the control file */
3097 ret = file_permission(cfile, MAY_READ);
3098 if (ret < 0)
3099 goto fail;
3100
3101 event->cft = __file_cft(cfile);
3102 if (IS_ERR(event->cft)) {
3103 ret = PTR_ERR(event->cft);
3104 goto fail;
3105 }
3106
3107 if (!event->cft->register_event || !event->cft->unregister_event) {
3108 ret = -EINVAL;
3109 goto fail;
3110 }
3111
3112 ret = event->cft->register_event(cgrp, event->cft,
3113 event->eventfd, buffer);
3114 if (ret)
3115 goto fail;
3116
3117 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3118 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3119 ret = 0;
3120 goto fail;
3121 }
3122
3123 /*
3124 * Events should be removed after rmdir of cgroup directory, but before
 3125 * destroying subsystem state objects. Let's take a reference to the
 3126 * cgroup directory's dentry to do that.
3127 */
3128 dget(cgrp->dentry);
3129
3130 spin_lock(&cgrp->event_list_lock);
3131 list_add(&event->list, &cgrp->event_list);
3132 spin_unlock(&cgrp->event_list_lock);
3133
3134 fput(cfile);
3135 fput(efile);
3136
3137 return 0;
3138
3139fail:
3140 if (cfile)
3141 fput(cfile);
3142
3143 if (event && event->eventfd && !IS_ERR(event->eventfd))
3144 eventfd_ctx_put(event->eventfd);
3145
3146 if (!IS_ERR_OR_NULL(efile))
3147 fput(efile);
3148
3149 kfree(event);
3150
3151 return ret;
3152}
3153
3154/*
2792 * for the common functions, 'private' gives the type of file 3155 * for the common functions, 'private' gives the type of file
2793 */ 3156 */
2794/* for hysterical raisins, we can't put this on the older files */ 3157/* for hysterical raisins, we can't put this on the older files */
@@ -2813,6 +3176,11 @@ static struct cftype files[] = {
2813 .read_u64 = cgroup_read_notify_on_release, 3176 .read_u64 = cgroup_read_notify_on_release,
2814 .write_u64 = cgroup_write_notify_on_release, 3177 .write_u64 = cgroup_write_notify_on_release,
2815 }, 3178 },
3179 {
3180 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3181 .write_string = cgroup_write_event_control,
3182 .mode = S_IWUGO,
3183 },
2816}; 3184};
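From userspace, the new cgroup.event_control file is driven by writing "<event_fd> <control_fd> <args>" and then reading the eventfd. A hedged sketch (the mount point and control file name are illustrative and error handling is omitted; which control files support events, and how <args> is interpreted, is up to each file's implementation):

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/eventfd.h>
	#include <unistd.h>

	int main(void)
	{
		char cmd[64];
		uint64_t count;
		int efd = eventfd(0, 0);
		int cfd = open("/cgroups/foo/some.control_file", O_RDONLY);
		int ecfd = open("/cgroups/foo/cgroup.event_control", O_WRONLY);

		/* register: "<event_fd> <control_fd>" plus any file-specific args */
		snprintf(cmd, sizeof(cmd), "%d %d", efd, cfd);
		write(ecfd, cmd, strlen(cmd));

		/* blocks until the kernel signals the eventfd */
		read(efd, &count, sizeof(count));
		printf("received %llu event(s)\n", (unsigned long long)count);
		return 0;
	}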
2817 3185
2818static struct cftype cft_release_agent = { 3186static struct cftype cft_release_agent = {
@@ -2877,8 +3245,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2877 /* We need to take each hierarchy_mutex in a consistent order */ 3245 /* We need to take each hierarchy_mutex in a consistent order */
2878 int i; 3246 int i;
2879 3247
3248 /*
3249 * No worry about a race with rebind_subsystems that might mess up the
3250 * locking order, since both parties are under cgroup_mutex.
3251 */
2880 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3252 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2881 struct cgroup_subsys *ss = subsys[i]; 3253 struct cgroup_subsys *ss = subsys[i];
3254 if (ss == NULL)
3255 continue;
2882 if (ss->root == root) 3256 if (ss->root == root)
2883 mutex_lock(&ss->hierarchy_mutex); 3257 mutex_lock(&ss->hierarchy_mutex);
2884 } 3258 }
@@ -2890,6 +3264,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2890 3264
2891 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3265 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2892 struct cgroup_subsys *ss = subsys[i]; 3266 struct cgroup_subsys *ss = subsys[i];
3267 if (ss == NULL)
3268 continue;
2893 if (ss->root == root) 3269 if (ss->root == root)
2894 mutex_unlock(&ss->hierarchy_mutex); 3270 mutex_unlock(&ss->hierarchy_mutex);
2895 } 3271 }
@@ -2936,14 +3312,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2936 3312
2937 for_each_subsys(root, ss) { 3313 for_each_subsys(root, ss) {
2938 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3314 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3315
2939 if (IS_ERR(css)) { 3316 if (IS_ERR(css)) {
2940 err = PTR_ERR(css); 3317 err = PTR_ERR(css);
2941 goto err_destroy; 3318 goto err_destroy;
2942 } 3319 }
2943 init_cgroup_css(css, ss, cgrp); 3320 init_cgroup_css(css, ss, cgrp);
2944 if (ss->use_id) 3321 if (ss->use_id) {
2945 if (alloc_css_id(ss, parent, cgrp)) 3322 err = alloc_css_id(ss, parent, cgrp);
3323 if (err)
2946 goto err_destroy; 3324 goto err_destroy;
3325 }
2947 /* At error, ->destroy() callback has to free assigned ID. */ 3326 /* At error, ->destroy() callback has to free assigned ID. */
2948 } 3327 }
2949 3328
@@ -3010,11 +3389,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3010 * synchronization other than RCU, and the subsystem linked 3389 * synchronization other than RCU, and the subsystem linked
3011 * list isn't RCU-safe */ 3390 * list isn't RCU-safe */
3012 int i; 3391 int i;
3392 /*
3393 * We won't need to lock the subsys array, because the subsystems
3394 * we're concerned about aren't going anywhere since our cgroup root
3395 * has a reference on them.
3396 */
3013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3397 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3014 struct cgroup_subsys *ss = subsys[i]; 3398 struct cgroup_subsys *ss = subsys[i];
3015 struct cgroup_subsys_state *css; 3399 struct cgroup_subsys_state *css;
3016 /* Skip subsystems not in this hierarchy */ 3400 /* Skip subsystems not present or not in this hierarchy */
3017 if (ss->root != cgrp->root) 3401 if (ss == NULL || ss->root != cgrp->root)
3018 continue; 3402 continue;
3019 css = cgrp->subsys[ss->subsys_id]; 3403 css = cgrp->subsys[ss->subsys_id];
3020 /* When called from check_for_release() it's possible 3404 /* When called from check_for_release() it's possible
@@ -3088,6 +3472,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3088 struct dentry *d; 3472 struct dentry *d;
3089 struct cgroup *parent; 3473 struct cgroup *parent;
3090 DEFINE_WAIT(wait); 3474 DEFINE_WAIT(wait);
3475 struct cgroup_event *event, *tmp;
3091 int ret; 3476 int ret;
3092 3477
3093 /* the vfs holds both inode->i_mutex already */ 3478 /* the vfs holds both inode->i_mutex already */
@@ -3171,6 +3556,20 @@ again:
3171 set_bit(CGRP_RELEASABLE, &parent->flags); 3556 set_bit(CGRP_RELEASABLE, &parent->flags);
3172 check_for_release(parent); 3557 check_for_release(parent);
3173 3558
3559 /*
3560 * Unregister events and notify userspace.
 3561 * Notify userspace about cgroup removal only after rmdir of the cgroup
 3562 * directory, to avoid a race between userspace and kernel space.
3563 */
3564 spin_lock(&cgrp->event_list_lock);
3565 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3566 list_del(&event->list);
3567 remove_wait_queue(event->wqh, &event->wait);
3568 eventfd_signal(event->eventfd, 1);
3569 schedule_work(&event->remove);
3570 }
3571 spin_unlock(&cgrp->event_list_lock);
3572
3174 mutex_unlock(&cgroup_mutex); 3573 mutex_unlock(&cgroup_mutex);
3175 return 0; 3574 return 0;
3176} 3575}
@@ -3205,9 +3604,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3205 mutex_init(&ss->hierarchy_mutex); 3604 mutex_init(&ss->hierarchy_mutex);
3206 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3605 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3207 ss->active = 1; 3606 ss->active = 1;
3607
3608 /* this function shouldn't be used with modular subsystems, since they
3609 * need to register a subsys_id, among other things */
3610 BUG_ON(ss->module);
3208} 3611}
3209 3612
3210/** 3613/**
3614 * cgroup_load_subsys: load and register a modular subsystem at runtime
3615 * @ss: the subsystem to load
3616 *
3617 * This function should be called in a modular subsystem's initcall. If the
 3618 * subsystem is built as a module, it will be assigned a new subsys_id and set
3619 * up for use. If the subsystem is built-in anyway, work is delegated to the
3620 * simpler cgroup_init_subsys.
3621 */
3622int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3623{
3624 int i;
3625 struct cgroup_subsys_state *css;
3626
3627 /* check name and function validity */
3628 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3629 ss->create == NULL || ss->destroy == NULL)
3630 return -EINVAL;
3631
3632 /*
3633 * we don't support callbacks in modular subsystems. this check is
3634 * before the ss->module check for consistency; a subsystem that could
3635 * be a module should still have no callbacks even if the user isn't
3636 * compiling it as one.
3637 */
3638 if (ss->fork || ss->exit)
3639 return -EINVAL;
3640
3641 /*
3642 * an optionally modular subsystem is built-in: we want to do nothing,
3643 * since cgroup_init_subsys will have already taken care of it.
3644 */
3645 if (ss->module == NULL) {
3646 /* a few sanity checks */
3647 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3648 BUG_ON(subsys[ss->subsys_id] != ss);
3649 return 0;
3650 }
3651
3652 /*
3653 * need to register a subsys id before anything else - for example,
3654 * init_cgroup_css needs it.
3655 */
3656 mutex_lock(&cgroup_mutex);
3657 /* find the first empty slot in the array */
3658 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3659 if (subsys[i] == NULL)
3660 break;
3661 }
3662 if (i == CGROUP_SUBSYS_COUNT) {
3663 /* maximum number of subsystems already registered! */
3664 mutex_unlock(&cgroup_mutex);
3665 return -EBUSY;
3666 }
3667 /* assign ourselves the subsys_id */
3668 ss->subsys_id = i;
3669 subsys[i] = ss;
3670
3671 /*
3672 * no ss->create seems to need anything important in the ss struct, so
3673 * this can happen first (i.e. before the rootnode attachment).
3674 */
3675 css = ss->create(ss, dummytop);
3676 if (IS_ERR(css)) {
3677 /* failure case - need to deassign the subsys[] slot. */
3678 subsys[i] = NULL;
3679 mutex_unlock(&cgroup_mutex);
3680 return PTR_ERR(css);
3681 }
3682
3683 list_add(&ss->sibling, &rootnode.subsys_list);
3684 ss->root = &rootnode;
3685
3686 /* our new subsystem will be attached to the dummy hierarchy. */
3687 init_cgroup_css(css, ss, dummytop);
3688 /* init_idr must be after init_cgroup_css because it sets css->id. */
3689 if (ss->use_id) {
3690 int ret = cgroup_init_idr(ss, css);
3691 if (ret) {
3692 dummytop->subsys[ss->subsys_id] = NULL;
3693 ss->destroy(ss, dummytop);
3694 subsys[i] = NULL;
3695 mutex_unlock(&cgroup_mutex);
3696 return ret;
3697 }
3698 }
3699
3700 /*
 3701 * Now we need to entangle the css into the existing css_sets. Unlike
 3702 * in cgroup_init_subsys, there are now multiple css_sets, so each one
 3703 * will need a new pointer to it; this is done by iterating the css_set_table.
 3704 * Furthermore, modifying the existing css_sets will corrupt the hash
 3705 * table state, so each changed css_set will need its hash recomputed.
 3706 * This is all done under the css_set_lock.
3707 */
3708 write_lock(&css_set_lock);
3709 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3710 struct css_set *cg;
3711 struct hlist_node *node, *tmp;
3712 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3713
3714 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3715 /* skip entries that we already rehashed */
3716 if (cg->subsys[ss->subsys_id])
3717 continue;
3718 /* remove existing entry */
3719 hlist_del(&cg->hlist);
3720 /* set new value */
3721 cg->subsys[ss->subsys_id] = css;
3722 /* recompute hash and restore entry */
3723 new_bucket = css_set_hash(cg->subsys);
3724 hlist_add_head(&cg->hlist, new_bucket);
3725 }
3726 }
3727 write_unlock(&css_set_lock);
3728
3729 mutex_init(&ss->hierarchy_mutex);
3730 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3731 ss->active = 1;
3732
3733 /* success! */
3734 mutex_unlock(&cgroup_mutex);
3735 return 0;
3736}
3737EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3738
3739/**
3740 * cgroup_unload_subsys: unload a modular subsystem
3741 * @ss: the subsystem to unload
3742 *
3743 * This function should be called in a modular subsystem's exitcall. When this
3744 * function is invoked, the refcount on the subsystem's module will be 0, so
3745 * the subsystem will not be attached to any hierarchy.
3746 */
3747void cgroup_unload_subsys(struct cgroup_subsys *ss)
3748{
3749 struct cg_cgroup_link *link;
3750 struct hlist_head *hhead;
3751
3752 BUG_ON(ss->module == NULL);
3753
3754 /*
3755 * we shouldn't be called if the subsystem is in use, and the use of
3756 * try_module_get in parse_cgroupfs_options should ensure that it
3757 * doesn't start being used while we're killing it off.
3758 */
3759 BUG_ON(ss->root != &rootnode);
3760
3761 mutex_lock(&cgroup_mutex);
3762 /* deassign the subsys_id */
3763 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3764 subsys[ss->subsys_id] = NULL;
3765
3766 /* remove subsystem from rootnode's list of subsystems */
3767 list_del(&ss->sibling);
3768
3769 /*
 3770 * Disentangle the css from all css_sets attached to the dummytop. As
 3771 * in loading, we need to pay our respects to the hashtable gods.
3772 */
3773 write_lock(&css_set_lock);
3774 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3775 struct css_set *cg = link->cg;
3776
3777 hlist_del(&cg->hlist);
3778 BUG_ON(!cg->subsys[ss->subsys_id]);
3779 cg->subsys[ss->subsys_id] = NULL;
3780 hhead = css_set_hash(cg->subsys);
3781 hlist_add_head(&cg->hlist, hhead);
3782 }
3783 write_unlock(&css_set_lock);
3784
3785 /*
 3786 * Remove the subsystem's css from the dummytop and free it - we need to
 3787 * free it before marking it NULL, because ss->destroy needs the cgrp->subsys
 3788 * pointer to find its state. Note that this also takes care of
3789 * freeing the css_id.
3790 */
3791 ss->destroy(ss, dummytop);
3792 dummytop->subsys[ss->subsys_id] = NULL;
3793
3794 mutex_unlock(&cgroup_mutex);
3795}
3796EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
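Together, cgroup_load_subsys() and cgroup_unload_subsys() give a module the lifecycle a built-in subsystem gets at boot. A minimal sketch of a modular subsystem using them (the 'foo' subsystem and its trivial state are hypothetical; .module = THIS_MODULE is what the refcounting in parse_cgroupfs_options/rebind_subsystems pins):

	#include <linux/cgroup.h>
	#include <linux/err.h>
	#include <linux/module.h>
	#include <linux/slab.h>

	struct cgroup_subsys foo_subsys;

	static struct cgroup_subsys_state *foo_create(struct cgroup_subsys *ss,
						      struct cgroup *cgrp)
	{
		struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

		return css ? css : ERR_PTR(-ENOMEM);
	}

	static void foo_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		kfree(cgrp->subsys[foo_subsys.subsys_id]);
	}

	struct cgroup_subsys foo_subsys = {
		.name = "foo",
		.create = foo_create,
		.destroy = foo_destroy,
		.module = THIS_MODULE,	/* lets cgroups pin the module while in use */
	};

	static int __init foo_init(void)
	{
		/* assigns foo_subsys.subsys_id and attaches it to the dummy hierarchy */
		return cgroup_load_subsys(&foo_subsys);
	}
	module_init(foo_init);

	static void __exit foo_exit(void)
	{
		cgroup_unload_subsys(&foo_subsys);
	}
	module_exit(foo_exit);

	MODULE_LICENSE("GPL");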
3797
3798/**
3211 * cgroup_init_early - cgroup initialization at system boot 3799 * cgroup_init_early - cgroup initialization at system boot
3212 * 3800 *
3213 * Initialize cgroups at system boot, and initialize any 3801 * Initialize cgroups at system boot, and initialize any
@@ -3235,7 +3823,8 @@ int __init cgroup_init_early(void)
3235 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3823 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3236 INIT_HLIST_HEAD(&css_set_table[i]); 3824 INIT_HLIST_HEAD(&css_set_table[i]);
3237 3825
3238 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3826 /* at bootup time, we don't worry about modular subsystems */
3827 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3239 struct cgroup_subsys *ss = subsys[i]; 3828 struct cgroup_subsys *ss = subsys[i];
3240 3829
3241 BUG_ON(!ss->name); 3830 BUG_ON(!ss->name);
@@ -3270,12 +3859,13 @@ int __init cgroup_init(void)
3270 if (err) 3859 if (err)
3271 return err; 3860 return err;
3272 3861
3273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3862 /* at bootup time, we don't worry about modular subsystems */
3863 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3274 struct cgroup_subsys *ss = subsys[i]; 3864 struct cgroup_subsys *ss = subsys[i];
3275 if (!ss->early_init) 3865 if (!ss->early_init)
3276 cgroup_init_subsys(ss); 3866 cgroup_init_subsys(ss);
3277 if (ss->use_id) 3867 if (ss->use_id)
3278 cgroup_subsys_init_idr(ss); 3868 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3279 } 3869 }
3280 3870
3281 /* Add init_css_set to the hash table */ 3871 /* Add init_css_set to the hash table */
@@ -3379,9 +3969,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3379 int i; 3969 int i;
3380 3970
3381 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3971 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3972 /*
3973 * ideally we don't want subsystems moving around while we do this.
3974 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3975 * subsys/hierarchy state.
3976 */
3382 mutex_lock(&cgroup_mutex); 3977 mutex_lock(&cgroup_mutex);
3383 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3978 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3384 struct cgroup_subsys *ss = subsys[i]; 3979 struct cgroup_subsys *ss = subsys[i];
3980 if (ss == NULL)
3981 continue;
3385 seq_printf(m, "%s\t%d\t%d\t%d\n", 3982 seq_printf(m, "%s\t%d\t%d\t%d\n",
3386 ss->name, ss->root->hierarchy_id, 3983 ss->name, ss->root->hierarchy_id,
3387 ss->root->number_of_cgroups, !ss->disabled); 3984 ss->root->number_of_cgroups, !ss->disabled);
@@ -3439,7 +4036,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3439{ 4036{
3440 if (need_forkexit_callback) { 4037 if (need_forkexit_callback) {
3441 int i; 4038 int i;
3442 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4039 /*
4040 * forkexit callbacks are only supported for builtin
4041 * subsystems, and the builtin section of the subsys array is
4042 * immutable, so we don't need to lock the subsys array here.
4043 */
4044 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3443 struct cgroup_subsys *ss = subsys[i]; 4045 struct cgroup_subsys *ss = subsys[i];
3444 if (ss->fork) 4046 if (ss->fork)
3445 ss->fork(ss, child); 4047 ss->fork(ss, child);
@@ -3508,7 +4110,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3508 struct css_set *cg; 4110 struct css_set *cg;
3509 4111
3510 if (run_callbacks && need_forkexit_callback) { 4112 if (run_callbacks && need_forkexit_callback) {
3511 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4113 /*
4114 * modular subsystems can't use callbacks, so no need to lock
4115 * the subsys array
4116 */
4117 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3512 struct cgroup_subsys *ss = subsys[i]; 4118 struct cgroup_subsys *ss = subsys[i];
3513 if (ss->exit) 4119 if (ss->exit)
3514 ss->exit(ss, tsk); 4120 ss->exit(ss, tsk);
@@ -3702,12 +4308,13 @@ static void check_for_release(struct cgroup *cgrp)
3702 } 4308 }
3703} 4309}
3704 4310
3705void __css_put(struct cgroup_subsys_state *css) 4311/* Caller must verify that the css is not for root cgroup */
4312void __css_put(struct cgroup_subsys_state *css, int count)
3706{ 4313{
3707 struct cgroup *cgrp = css->cgroup; 4314 struct cgroup *cgrp = css->cgroup;
3708 int val; 4315 int val;
3709 rcu_read_lock(); 4316 rcu_read_lock();
3710 val = atomic_dec_return(&css->refcnt); 4317 val = atomic_sub_return(count, &css->refcnt);
3711 if (val == 1) { 4318 if (val == 1) {
3712 if (notify_on_release(cgrp)) { 4319 if (notify_on_release(cgrp)) {
3713 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4320 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3718,6 +4325,7 @@ void __css_put(struct cgroup_subsys_state *css)
3718 rcu_read_unlock(); 4325 rcu_read_unlock();
3719 WARN_ON_ONCE(val < 1); 4326 WARN_ON_ONCE(val < 1);
3720} 4327}
4328EXPORT_SYMBOL_GPL(__css_put);
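The new count parameter lets a caller holding several references drop them with one atomic_sub_return() instead of a loop of css_put() calls. The single-reference case is assumed to remain a thin wrapper in the header (not shown in this hunk), roughly:

	/* assumed shape of the wrapper in include/linux/cgroup.h */
	static inline void css_put(struct cgroup_subsys_state *css)
	{
		__css_put(css, 1);
	}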
3721 4329
3722/* 4330/*
3723 * Notify userspace when a cgroup is released, by running the 4331 * Notify userspace when a cgroup is released, by running the
@@ -3799,8 +4407,11 @@ static int __init cgroup_disable(char *str)
3799 while ((token = strsep(&str, ",")) != NULL) { 4407 while ((token = strsep(&str, ",")) != NULL) {
3800 if (!*token) 4408 if (!*token)
3801 continue; 4409 continue;
3802 4410 /*
3803 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4411 * cgroup_disable, being at boot time, can't know about module
4412 * subsystems, so we don't worry about them.
4413 */
4414 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3804 struct cgroup_subsys *ss = subsys[i]; 4415 struct cgroup_subsys *ss = subsys[i];
3805 4416
3806 if (!strcmp(token, ss->name)) { 4417 if (!strcmp(token, ss->name)) {
@@ -3830,6 +4441,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
3830 return cssid->id; 4441 return cssid->id;
3831 return 0; 4442 return 0;
3832} 4443}
4444EXPORT_SYMBOL_GPL(css_id);
3833 4445
3834unsigned short css_depth(struct cgroup_subsys_state *css) 4446unsigned short css_depth(struct cgroup_subsys_state *css)
3835{ 4447{
@@ -3839,6 +4451,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
3839 return cssid->depth; 4451 return cssid->depth;
3840 return 0; 4452 return 0;
3841} 4453}
4454EXPORT_SYMBOL_GPL(css_depth);
3842 4455
3843bool css_is_ancestor(struct cgroup_subsys_state *child, 4456bool css_is_ancestor(struct cgroup_subsys_state *child,
3844 const struct cgroup_subsys_state *root) 4457 const struct cgroup_subsys_state *root)
@@ -3875,6 +4488,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3875 spin_unlock(&ss->id_lock); 4488 spin_unlock(&ss->id_lock);
3876 call_rcu(&id->rcu_head, __free_css_id_cb); 4489 call_rcu(&id->rcu_head, __free_css_id_cb);
3877} 4490}
4491EXPORT_SYMBOL_GPL(free_css_id);
3878 4492
3879/* 4493/*
3880 * This is called by init or create(). Then, calls to this function are 4494 * This is called by init or create(). Then, calls to this function are
@@ -3924,15 +4538,14 @@ err_out:
3924 4538
3925} 4539}
3926 4540
3927static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4541static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4542 struct cgroup_subsys_state *rootcss)
3928{ 4543{
3929 struct css_id *newid; 4544 struct css_id *newid;
3930 struct cgroup_subsys_state *rootcss;
3931 4545
3932 spin_lock_init(&ss->id_lock); 4546 spin_lock_init(&ss->id_lock);
3933 idr_init(&ss->idr); 4547 idr_init(&ss->idr);
3934 4548
3935 rootcss = init_css_set.subsys[ss->subsys_id];
3936 newid = get_new_cssid(ss, 0); 4549 newid = get_new_cssid(ss, 0);
3937 if (IS_ERR(newid)) 4550 if (IS_ERR(newid))
3938 return PTR_ERR(newid); 4551 return PTR_ERR(newid);
@@ -3948,13 +4561,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3948{ 4561{
3949 int subsys_id, i, depth = 0; 4562 int subsys_id, i, depth = 0;
3950 struct cgroup_subsys_state *parent_css, *child_css; 4563 struct cgroup_subsys_state *parent_css, *child_css;
3951 struct css_id *child_id, *parent_id = NULL; 4564 struct css_id *child_id, *parent_id;
3952 4565
3953 subsys_id = ss->subsys_id; 4566 subsys_id = ss->subsys_id;
3954 parent_css = parent->subsys[subsys_id]; 4567 parent_css = parent->subsys[subsys_id];
3955 child_css = child->subsys[subsys_id]; 4568 child_css = child->subsys[subsys_id];
3956 depth = css_depth(parent_css) + 1;
3957 parent_id = parent_css->id; 4569 parent_id = parent_css->id;
4570 depth = parent_id->depth;
3958 4571
3959 child_id = get_new_cssid(ss, depth); 4572 child_id = get_new_cssid(ss, depth);
3960 if (IS_ERR(child_id)) 4573 if (IS_ERR(child_id))
@@ -3992,6 +4605,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3992 4605
3993 return rcu_dereference(cssid->css); 4606 return rcu_dereference(cssid->css);
3994} 4607}
4608EXPORT_SYMBOL_GPL(css_lookup);
3995 4609
3996/** 4610/**
3997 * css_get_next - lookup next cgroup under specified hierarchy. 4611 * css_get_next - lookup next cgroup under specified hierarchy.