Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--	kernel/cgroup.c	775
1 file changed, 710 insertions, 65 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1fbcc748044a..3ac6f5b0a64b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
  * Based originally on the cpuset system, extracted by Paul Menage
  * Copyright (C) 2006 Google, Inc
  *
+ * Notifications support
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
  * Copyright notices from the original cpuset code:
  * --------------------------------------------------
  * Copyright (C) 2003 BULL SA.
@@ -43,6 +47,7 @@
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
+#include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
@@ -51,15 +56,21 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/eventfd.h>
+#include <linux/poll.h>
 
 #include <asm/atomic.h>
 
 static DEFINE_MUTEX(cgroup_mutex);
 
-/* Generate an array of cgroup subsystem pointers */
+/*
+ * Generate an array of cgroup subsystem pointers. At boot time, this is
+ * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * registered after that. The mutable section of this array is protected by
+ * cgroup_mutex.
+ */
 #define SUBSYS(_x) &_x ## _subsys,
-
-static struct cgroup_subsys *subsys[] = {
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
@@ -146,6 +157,35 @@ struct css_id {
 	unsigned short stack[0]; /* Array of Length (depth+1) */
 };
 
+/*
+ * cgroup_event represents events which userspace wants to receive.
+ */
+struct cgroup_event {
+	/*
+	 * Cgroup which the event belongs to.
+	 */
+	struct cgroup *cgrp;
+	/*
+	 * Control file which the event is associated with.
+	 */
+	struct cftype *cft;
+	/*
+	 * eventfd to signal userspace about the event.
+	 */
+	struct eventfd_ctx *eventfd;
+	/*
+	 * Each of these is stored in a list by the cgroup.
+	 */
+	struct list_head list;
+	/*
+	 * All of the fields below are needed to unregister the event when
+	 * userspace closes the eventfd.
+	 */
+	poll_table pt;
+	wait_queue_head_t *wqh;
+	wait_queue_t wait;
+	struct work_struct remove;
+};
 
 /* The list of hierarchy roots */
 
@@ -166,6 +206,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
  */
 static int need_forkexit_callback __read_mostly;
 
+#ifdef CONFIG_PROVE_LOCKING
+int cgroup_lock_is_held(void)
+{
+	return lockdep_is_held(&cgroup_mutex);
+}
+#else /* #ifdef CONFIG_PROVE_LOCKING */
+int cgroup_lock_is_held(void)
+{
+	return mutex_is_locked(&cgroup_mutex);
+}
+#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
+
+EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
+
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
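
The helper above lets code that holds cgroup_mutex dereference RCU-protected cgroup fields without lockdep false positives. The intended pattern, which this same patch applies in cgroup_path() further down, is a sketch like:

	/*
	 * Legal either inside rcu_read_lock() or while holding
	 * cgroup_mutex; with CONFIG_PROVE_LOCKING, lockdep verifies
	 * the stated condition at runtime.
	 */
	dentry = rcu_dereference_check(cgrp->dentry,
				       rcu_read_lock_held() ||
				       cgroup_lock_is_held());
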
@@ -235,7 +289,8 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
 
-static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+static int cgroup_init_idr(struct cgroup_subsys *ss,
+			   struct cgroup_subsys_state *css);
 
 /* css_set_lock protects the list of css_set objects, and the
  * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -433,8 +488,11 @@ static struct css_set *find_existing_css_set(
 	struct hlist_node *node;
 	struct css_set *cg;
 
-	/* Built the set of subsystem state objects that we want to
-	 * see in the new css_set */
+	/*
+	 * Build the set of subsystem state objects that we want to see in the
+	 * new css_set. while subsystems can change globally, the entries here
+	 * won't change, so no need for locking.
+	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		if (root->subsys_bits & (1UL << i)) {
 			/* Subsystem is in this hierarchy. So we want
@@ -681,6 +739,7 @@ void cgroup_lock(void)
 {
 	mutex_lock(&cgroup_mutex);
 }
+EXPORT_SYMBOL_GPL(cgroup_lock);
 
 /**
  * cgroup_unlock - release lock on cgroup changes
@@ -691,6 +750,7 @@ void cgroup_unlock(void)
 {
 	mutex_unlock(&cgroup_mutex);
 }
+EXPORT_SYMBOL_GPL(cgroup_unlock);
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -742,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 		if (ret)
 			break;
 	}
+
 	return ret;
 }
 
@@ -869,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
 	css_put(css);
 }
 
-
+/*
+ * Call with cgroup_mutex held. Drops reference counts on modules, including
+ * any duplicate ones that parse_cgroupfs_options took. If this function
+ * returns an error, no reference counts are touched.
+ */
 static int rebind_subsystems(struct cgroupfs_root *root,
 			     unsigned long final_bits)
 {
@@ -877,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	struct cgroup *cgrp = &root->top_cgroup;
 	int i;
 
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+
 	removed_bits = root->actual_subsys_bits & ~final_bits;
 	added_bits = final_bits & ~root->actual_subsys_bits;
 	/* Check that any added subsystems are currently free */
@@ -885,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		struct cgroup_subsys *ss = subsys[i];
 		if (!(bit & added_bits))
 			continue;
+		/*
+		 * Nobody should tell us to do a subsys that doesn't exist:
+		 * parse_cgroupfs_options should catch that case and refcounts
+		 * ensure that subsystems won't disappear once selected.
+		 */
+		BUG_ON(ss == NULL);
 		if (ss->root != &rootnode) {
 			/* Subsystem isn't free */
 			return -EBUSY;
@@ -904,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		unsigned long bit = 1UL << i;
 		if (bit & added_bits) {
 			/* We're binding this subsystem to this hierarchy */
+			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i]);
 			BUG_ON(!dummytop->subsys[i]);
 			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -915,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			if (ss->bind)
 				ss->bind(ss, cgrp);
 			mutex_unlock(&ss->hierarchy_mutex);
+			/* refcount was already taken, and we're keeping it */
 		} else if (bit & removed_bits) {
 			/* We're removing this subsystem */
+			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
 			mutex_lock(&ss->hierarchy_mutex);
@@ -927,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			subsys[i]->root = &rootnode;
 			list_move(&ss->sibling, &rootnode.subsys_list);
 			mutex_unlock(&ss->hierarchy_mutex);
+			/* subsystem is now free - drop reference on module */
+			module_put(ss->module);
 		} else if (bit & final_bits) {
 			/* Subsystem state should already exist */
+			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
+			/*
+			 * a refcount was taken, but we already had one, so
+			 * drop the extra reference.
+			 */
+			module_put(ss->module);
+#ifdef CONFIG_MODULE_UNLOAD
+			BUG_ON(ss->module && !module_refcount(ss->module));
+#endif
 		} else {
 			/* Subsystem state shouldn't exist */
 			BUG_ON(cgrp->subsys[i]);
@@ -971,13 +1058,20 @@ struct cgroup_sb_opts {
 
 };
 
-/* Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. */
-static int parse_cgroupfs_options(char *data,
-				  struct cgroup_sb_opts *opts)
+/*
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
+ * with cgroup_mutex held to protect the subsys[] array. This function takes
+ * refcounts on subsystems to be used, unless it returns error, in which case
+ * no refcounts are taken.
+ */
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data ?: "all";
 	unsigned long mask = (unsigned long)-1;
+	int i;
+	bool module_pin_failed = false;
+
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 
 #ifdef CONFIG_CPUSETS
 	mask = ~(1UL << cpuset_subsys_id);
@@ -990,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
 			return -EINVAL;
 		if (!strcmp(token, "all")) {
 			/* Add all non-disabled subsystems */
-			int i;
 			opts->subsys_bits = 0;
 			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 				struct cgroup_subsys *ss = subsys[i];
+				if (ss == NULL)
+					continue;
 				if (!ss->disabled)
 					opts->subsys_bits |= 1ul << i;
 			}
@@ -1011,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
 			if (!opts->release_agent)
 				return -ENOMEM;
 		} else if (!strncmp(token, "name=", 5)) {
-			int i;
 			const char *name = token + 5;
 			/* Can't specify an empty name */
 			if (!strlen(name))
@@ -1035,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
 				return -ENOMEM;
 		} else {
 			struct cgroup_subsys *ss;
-			int i;
 			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 				ss = subsys[i];
+				if (ss == NULL)
+					continue;
 				if (!strcmp(token, ss->name)) {
 					if (!ss->disabled)
 						set_bit(i, &opts->subsys_bits);
@@ -1072,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
 	if (!opts->subsys_bits && !opts->name)
 		return -EINVAL;
 
+	/*
+	 * Grab references on all the modules we'll need, so the subsystems
+	 * don't dance around before rebind_subsystems attaches them. This may
+	 * take duplicate reference counts on a subsystem that's already used,
+	 * but rebind_subsystems handles this case.
+	 */
+	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+		unsigned long bit = 1UL << i;
+
+		if (!(bit & opts->subsys_bits))
+			continue;
+		if (!try_module_get(subsys[i]->module)) {
+			module_pin_failed = true;
+			break;
+		}
+	}
+	if (module_pin_failed) {
+		/*
+		 * oops, one of the modules was going away. this means that we
+		 * raced with a module_delete call, and to the user this is
+		 * essentially a "subsystem doesn't exist" case.
+		 */
+		for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
+			/* drop refcounts only on the ones we took */
+			unsigned long bit = 1UL << i;
+
+			if (!(bit & opts->subsys_bits))
+				continue;
+			module_put(subsys[i]->module);
+		}
+		return -ENOENT;
+	}
+
 	return 0;
 }
 
+static void drop_parsed_module_refcounts(unsigned long subsys_bits)
+{
+	int i;
+	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+		unsigned long bit = 1UL << i;
+
+		if (!(bit & subsys_bits))
+			continue;
+		module_put(subsys[i]->module);
+	}
+}
+
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
 	int ret = 0;
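
The loop above pins either every requested module or none: on try_module_get() failure it walks back down, dropping only the refcounts it actually took. A condensed, self-contained sketch of the same all-or-nothing idiom (the function and array names are hypothetical, not part of the patch):

	#include <linux/module.h>

	/* Pin mods[first..count) selected by 'wanted', or pin nothing. */
	static int pin_requested(struct module **mods, int first, int count,
				 unsigned long wanted)
	{
		int i;

		for (i = first; i < count; i++) {
			if (!(wanted & (1UL << i)))
				continue;
			if (!try_module_get(mods[i]))
				goto rollback;	/* module is being deleted */
		}
		return 0;
	rollback:
		/* drop refcounts only on the ones we took */
		for (i--; i >= first; i--)
			if (wanted & (1UL << i))
				module_put(mods[i]);
		return -ENOENT;
	}
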
@@ -1091,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
-	/* Don't allow flags to change at remount */
-	if (opts.flags != root->flags) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* Don't allow name to change at remount */
-	if (opts.name && strcmp(opts.name, root->name)) {
+	/* Don't allow flags or name to change at remount */
+	if (opts.flags != root->flags ||
+	    (opts.name && strcmp(opts.name, root->name))) {
 		ret = -EINVAL;
+		drop_parsed_module_refcounts(opts.subsys_bits);
 		goto out_unlock;
 	}
 
 	ret = rebind_subsystems(root, opts.subsys_bits);
-	if (ret)
+	if (ret) {
+		drop_parsed_module_refcounts(opts.subsys_bits);
 		goto out_unlock;
+	}
 
 	/* (re)populate subsystem files */
 	cgroup_populate_dir(cgrp);
@@ -1136,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
+	INIT_LIST_HEAD(&cgrp->event_list);
+	spin_lock_init(&cgrp->event_list_lock);
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1291,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	struct cgroupfs_root *new_root;
 
 	/* First find the desired set of subsystems */
+	mutex_lock(&cgroup_mutex);
 	ret = parse_cgroupfs_options(data, &opts);
+	mutex_unlock(&cgroup_mutex);
 	if (ret)
 		goto out_err;
 
@@ -1302,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	new_root = cgroup_root_from_opts(&opts);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
-		goto out_err;
+		goto drop_modules;
 	}
 	opts.new_root = new_root;
 
@@ -1311,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		cgroup_drop_root(opts.new_root);
-		goto out_err;
+		goto drop_modules;
 	}
 
 	root = sb->s_fs_info;
@@ -1367,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 			free_cg_links(&tmp_cg_links);
 			goto drop_new_super;
 		}
+		/*
+		 * There must be no failure case after here, since rebinding
+		 * takes care of subsystems' refcounts, which are explicitly
+		 * dropped in the failure exit path.
+		 */
 
 		/* EBUSY should be the only error here */
 		BUG_ON(ret);
@@ -1405,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		 * any) is not needed
 		 */
 		cgroup_drop_root(opts.new_root);
+		/* no subsys rebinding, so refcounts don't change */
+		drop_parsed_module_refcounts(opts.subsys_bits);
 	}
 
 	simple_set_mnt(mnt, sb);
@@ -1414,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 
  drop_new_super:
 	deactivate_locked_super(sb);
+ drop_modules:
+	drop_parsed_module_refcounts(opts.subsys_bits);
  out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1495,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
 	char *start;
-	struct dentry *dentry = rcu_dereference(cgrp->dentry);
+	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
+						      rcu_read_lock_held() ||
+						      cgroup_lock_is_held());
 
 	if (!dentry || cgrp == dummytop) {
 		/*
@@ -1511,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	*--start = '\0';
 	for (;;) {
 		int len = dentry->d_name.len;
+
 		if ((start -= len) < buf)
 			return -ENAMETOOLONG;
-		memcpy(start, cgrp->dentry->d_name.name, len);
+		memcpy(start, dentry->d_name.name, len);
 		cgrp = cgrp->parent;
 		if (!cgrp)
 			break;
-		dentry = rcu_dereference(cgrp->dentry);
+
+		dentry = rcu_dereference_check(cgrp->dentry,
+					       rcu_read_lock_held() ||
+					       cgroup_lock_is_held());
 		if (!cgrp->parent)
 			continue;
 		if (--start < buf)
@@ -1527,6 +1684,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	memmove(buf, start, buf + buflen - start);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_path);
 
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1539,7 +1697,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 	int retval = 0;
-	struct cgroup_subsys *ss;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
 	struct css_set *cg;
 	struct css_set *newcg;
@@ -1553,8 +1711,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
 			retval = ss->can_attach(ss, cgrp, tsk, false);
-			if (retval)
-				return retval;
+			if (retval) {
+				/*
+				 * Remember on which subsystem the can_attach()
+				 * failed, so that we only call cancel_attach()
+				 * against the subsystems whose can_attach()
+				 * succeeded. (See below)
+				 */
+				failed_ss = ss;
+				goto out;
+			}
 		}
 	}
 
@@ -1568,14 +1734,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 */
 	newcg = find_css_set(cg, cgrp);
 	put_css_set(cg);
-	if (!newcg)
-		return -ENOMEM;
+	if (!newcg) {
+		retval = -ENOMEM;
+		goto out;
+	}
 
 	task_lock(tsk);
 	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		put_css_set(newcg);
-		return -ESRCH;
+		retval = -ESRCH;
+		goto out;
 	}
 	rcu_assign_pointer(tsk->cgroups, newcg);
 	task_unlock(tsk);
@@ -1601,7 +1770,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 * is no longer empty.
 	 */
 	cgroup_wakeup_rmdir_waiter(cgrp);
-	return 0;
+out:
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss)
+				/*
+				 * This subsystem was the one that failed the
+				 * can_attach() check earlier, so we don't need
+				 * to call cancel_attach() against it or any
+				 * remaining subsystems.
+				 */
+				break;
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, tsk, false);
+		}
+	}
+	return retval;
 }
 
@@ -1667,6 +1851,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
 	}
 	return true;
 }
+EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
 
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
@@ -1935,6 +2120,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
 	.rename = cgroup_rename,
 };
 
+/*
+ * Check if a file is a control file
+ */
+static inline struct cftype *__file_cft(struct file *file)
+{
+	if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+		return ERR_PTR(-EINVAL);
+	return __d_cft(file->f_dentry);
+}
+
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 			      struct super_block *sb)
 {
@@ -2054,6 +2249,7 @@ int cgroup_add_file(struct cgroup *cgrp,
 		error = PTR_ERR(dentry);
 	return error;
 }
+EXPORT_SYMBOL_GPL(cgroup_add_file);
 
 int cgroup_add_files(struct cgroup *cgrp,
 			struct cgroup_subsys *subsys,
@@ -2068,6 +2264,7 @@ int cgroup_add_files(struct cgroup *cgrp,
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_add_files);
 
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2453,7 +2650,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 {
 	struct cgroup_pidlist *l;
 	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+	struct pid_namespace *ns = current->nsproxy->pid_ns;
+
 	/*
 	 * We can't drop the pidlist_mutex before taking the l->mutex in case
 	 * the last ref-holder is trying to remove l from the list at the same
@@ -2463,8 +2661,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 	mutex_lock(&cgrp->pidlist_mutex);
 	list_for_each_entry(l, &cgrp->pidlists, links) {
 		if (l->key.type == type && l->key.ns == ns) {
-			/* found a matching list - drop the extra refcount */
-			put_pid_ns(ns);
 			/* make sure l doesn't vanish out from under us */
 			down_write(&l->mutex);
 			mutex_unlock(&cgrp->pidlist_mutex);
@@ -2475,13 +2671,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
 	if (!l) {
 		mutex_unlock(&cgrp->pidlist_mutex);
-		put_pid_ns(ns);
 		return l;
 	}
 	init_rwsem(&l->mutex);
 	down_write(&l->mutex);
 	l->key.type = type;
-	l->key.ns = ns;
+	l->key.ns = get_pid_ns(ns);
 	l->use_count = 0; /* don't increment here */
 	l->list = NULL;
 	l->owner = cgrp;
@@ -2789,6 +2984,173 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
 }
 
 /*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void cgroup_event_remove(struct work_struct *work)
+{
+	struct cgroup_event *event = container_of(work, struct cgroup_event,
+			remove);
+	struct cgroup *cgrp = event->cgrp;
+
+	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+
+	eventfd_ctx_put(event->eventfd);
+	kfree(event);
+	dput(cgrp->dentry);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
+		int sync, void *key)
+{
+	struct cgroup_event *event = container_of(wait,
+			struct cgroup_event, wait);
+	struct cgroup *cgrp = event->cgrp;
+	unsigned long flags = (unsigned long)key;
+
+	if (flags & POLLHUP) {
+		__remove_wait_queue(event->wqh, &event->wait);
+		spin_lock(&cgrp->event_list_lock);
+		list_del(&event->list);
+		spin_unlock(&cgrp->event_list_lock);
+		/*
+		 * We are in atomic context, but cgroup_event_remove() may
+		 * sleep, so we have to call it in workqueue.
+		 */
+		schedule_work(&event->remove);
+	}
+
+	return 0;
+}
+
+static void cgroup_event_ptable_queue_proc(struct file *file,
+		wait_queue_head_t *wqh, poll_table *pt)
+{
+	struct cgroup_event *event = container_of(pt,
+			struct cgroup_event, pt);
+
+	event->wqh = wqh;
+	add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
+				      const char *buffer)
+{
+	struct cgroup_event *event = NULL;
+	unsigned int efd, cfd;
+	struct file *efile = NULL;
+	struct file *cfile = NULL;
+	char *endp;
+	int ret;
+
+	efd = simple_strtoul(buffer, &endp, 10);
+	if (*endp != ' ')
+		return -EINVAL;
+	buffer = endp + 1;
+
+	cfd = simple_strtoul(buffer, &endp, 10);
+	if ((*endp != ' ') && (*endp != '\0'))
+		return -EINVAL;
+	buffer = endp + 1;
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (!event)
+		return -ENOMEM;
+	event->cgrp = cgrp;
+	INIT_LIST_HEAD(&event->list);
+	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
+	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
+	INIT_WORK(&event->remove, cgroup_event_remove);
+
+	efile = eventfd_fget(efd);
+	if (IS_ERR(efile)) {
+		ret = PTR_ERR(efile);
+		goto fail;
+	}
+
+	event->eventfd = eventfd_ctx_fileget(efile);
+	if (IS_ERR(event->eventfd)) {
+		ret = PTR_ERR(event->eventfd);
+		goto fail;
+	}
+
+	cfile = fget(cfd);
+	if (!cfile) {
+		ret = -EBADF;
+		goto fail;
+	}
+
+	/* the process needs read permission on the control file */
+	ret = file_permission(cfile, MAY_READ);
+	if (ret < 0)
+		goto fail;
+
+	event->cft = __file_cft(cfile);
+	if (IS_ERR(event->cft)) {
+		ret = PTR_ERR(event->cft);
+		goto fail;
+	}
+
+	if (!event->cft->register_event || !event->cft->unregister_event) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	ret = event->cft->register_event(cgrp, event->cft,
+			event->eventfd, buffer);
+	if (ret)
+		goto fail;
+
+	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
+		event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+		ret = 0;
+		goto fail;
+	}
+
+	/*
+	 * Events should be removed after rmdir of cgroup directory, but before
+	 * destroying subsystem state objects. Let's take a reference to the
+	 * cgroup directory dentry to do that.
+	 */
+	dget(cgrp->dentry);
+
+	spin_lock(&cgrp->event_list_lock);
+	list_add(&event->list, &cgrp->event_list);
+	spin_unlock(&cgrp->event_list_lock);
+
+	fput(cfile);
+	fput(efile);
+
+	return 0;
+
+fail:
+	if (cfile)
+		fput(cfile);
+
+	if (event && event->eventfd && !IS_ERR(event->eventfd))
+		eventfd_ctx_put(event->eventfd);
+
+	if (!IS_ERR_OR_NULL(efile))
+		fput(efile);
+
+	kfree(event);
+
+	return ret;
+}
+
+/*
  * for the common functions, 'private' gives the type of file
  */
 /* for hysterical raisins, we can't put this on the older files */
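
To illustrate the interface added above: userspace creates an eventfd, opens a control file whose cftype implements register_event/unregister_event, and writes '<event_fd> <control_fd> <args>' to cgroup.event_control. A minimal sketch follows; the mount point, the control file name, and the numeric argument are assumptions, since this patch only adds the transport and a controller must supply the event hooks:

	/* Hypothetical userspace consumer of cgroup.event_control. */
	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		char buf[64];
		uint64_t ticks;
		int efd = eventfd(0, 0);
		/* a control file that implements the event callbacks */
		int cfd = open("/cgroup/foo/foo.usage", O_RDONLY);
		int ctl = open("/cgroup/foo/cgroup.event_control", O_WRONLY);

		if (efd < 0 || cfd < 0 || ctl < 0)
			return 1;
		/* '<event_fd> <control_fd> <args>'; args are per-cftype */
		snprintf(buf, sizeof(buf), "%d %d 1048576", efd, cfd);
		if (write(ctl, buf, strlen(buf)) < 0)
			return 1;
		/* blocks until the controller signals or the cgroup is removed */
		if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
			printf("got %llu event(s)\n", (unsigned long long)ticks);
		return 0;
	}
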
@@ -2813,6 +3175,11 @@ static struct cftype files[] = {
 		.read_u64 = cgroup_read_notify_on_release,
 		.write_u64 = cgroup_write_notify_on_release,
 	},
+	{
+		.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
+		.write_string = cgroup_write_event_control,
+		.mode = S_IWUGO,
+	},
 };
 
 static struct cftype cft_release_agent = {
@@ -2877,8 +3244,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
 	/* We need to take each hierarchy_mutex in a consistent order */
 	int i;
 
+	/*
+	 * No worry about a race with rebind_subsystems that might mess up the
+	 * locking order, since both parties are under cgroup_mutex.
+	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+		if (ss == NULL)
+			continue;
 		if (ss->root == root)
 			mutex_lock(&ss->hierarchy_mutex);
 	}
@@ -2890,6 +3263,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
 
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+		if (ss == NULL)
+			continue;
 		if (ss->root == root)
 			mutex_unlock(&ss->hierarchy_mutex);
 	}
@@ -2936,14 +3311,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	for_each_subsys(root, ss) {
 		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
+
 		if (IS_ERR(css)) {
 			err = PTR_ERR(css);
 			goto err_destroy;
 		}
 		init_cgroup_css(css, ss, cgrp);
-		if (ss->use_id)
-			if (alloc_css_id(ss, parent, cgrp))
+		if (ss->use_id) {
+			err = alloc_css_id(ss, parent, cgrp);
+			if (err)
 				goto err_destroy;
+		}
 		/* At error, ->destroy() callback has to free assigned ID. */
 	}
 
@@ -3010,11 +3388,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	 * synchronization other than RCU, and the subsystem linked
 	 * list isn't RCU-safe */
 	int i;
+	/*
+	 * We won't need to lock the subsys array, because the subsystems
+	 * we're concerned about aren't going anywhere since our cgroup root
+	 * has a reference on them.
+	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		struct cgroup_subsys_state *css;
-		/* Skip subsystems not in this hierarchy */
-		if (ss->root != cgrp->root)
+		/* Skip subsystems not present or not in this hierarchy */
+		if (ss == NULL || ss->root != cgrp->root)
 			continue;
 		css = cgrp->subsys[ss->subsys_id];
 		/* When called from check_for_release() it's possible
@@ -3088,6 +3471,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct dentry *d;
 	struct cgroup *parent;
 	DEFINE_WAIT(wait);
+	struct cgroup_event *event, *tmp;
 	int ret;
 
 	/* the vfs holds both inode->i_mutex already */
@@ -3171,6 +3555,20 @@ again:
 	set_bit(CGRP_RELEASABLE, &parent->flags);
 	check_for_release(parent);
 
+	/*
+	 * Unregister events and notify userspace.
+	 * Notify userspace about cgroup removal only after rmdir of the
+	 * cgroup directory, to avoid a race between userspace and kernelspace.
+	 */
+	spin_lock(&cgrp->event_list_lock);
+	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
+		list_del(&event->list);
+		remove_wait_queue(event->wqh, &event->wait);
+		eventfd_signal(event->eventfd, 1);
+		schedule_work(&event->remove);
+	}
+	spin_unlock(&cgrp->event_list_lock);
+
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
@@ -3205,9 +3603,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	mutex_init(&ss->hierarchy_mutex);
 	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
 	ss->active = 1;
+
+	/* this function shouldn't be used with modular subsystems, since they
+	 * need to register a subsys_id, among other things */
+	BUG_ON(ss->module);
 }
 
 /**
+ * cgroup_load_subsys: load and register a modular subsystem at runtime
+ * @ss: the subsystem to load
+ *
+ * This function should be called in a modular subsystem's initcall. If the
+ * subsystem is built as a module, it will be assigned a new subsys_id and set
+ * up for use. If the subsystem is built-in anyway, work is delegated to the
+ * simpler cgroup_init_subsys.
+ */
+int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
+{
+	int i;
+	struct cgroup_subsys_state *css;
+
+	/* check name and function validity */
+	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
+	    ss->create == NULL || ss->destroy == NULL)
+		return -EINVAL;
+
+	/*
+	 * we don't support callbacks in modular subsystems. this check is
+	 * before the ss->module check for consistency; a subsystem that could
+	 * be a module should still have no callbacks even if the user isn't
+	 * compiling it as one.
+	 */
+	if (ss->fork || ss->exit)
+		return -EINVAL;
+
+	/*
+	 * an optionally modular subsystem is built-in: we want to do nothing,
+	 * since cgroup_init_subsys will have already taken care of it.
+	 */
+	if (ss->module == NULL) {
+		/* a few sanity checks */
+		BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
+		BUG_ON(subsys[ss->subsys_id] != ss);
+		return 0;
+	}
+
+	/*
+	 * need to register a subsys id before anything else - for example,
+	 * init_cgroup_css needs it.
+	 */
+	mutex_lock(&cgroup_mutex);
+	/* find the first empty slot in the array */
+	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+		if (subsys[i] == NULL)
+			break;
+	}
+	if (i == CGROUP_SUBSYS_COUNT) {
+		/* maximum number of subsystems already registered! */
+		mutex_unlock(&cgroup_mutex);
+		return -EBUSY;
+	}
+	/* assign ourselves the subsys_id */
+	ss->subsys_id = i;
+	subsys[i] = ss;
+
+	/*
+	 * no ss->create seems to need anything important in the ss struct, so
+	 * this can happen first (i.e. before the rootnode attachment).
+	 */
+	css = ss->create(ss, dummytop);
+	if (IS_ERR(css)) {
+		/* failure case - need to deassign the subsys[] slot. */
+		subsys[i] = NULL;
+		mutex_unlock(&cgroup_mutex);
+		return PTR_ERR(css);
+	}
+
+	list_add(&ss->sibling, &rootnode.subsys_list);
+	ss->root = &rootnode;
+
+	/* our new subsystem will be attached to the dummy hierarchy. */
+	init_cgroup_css(css, ss, dummytop);
+	/* init_idr must be after init_cgroup_css because it sets css->id. */
+	if (ss->use_id) {
+		int ret = cgroup_init_idr(ss, css);
+		if (ret) {
+			dummytop->subsys[ss->subsys_id] = NULL;
+			ss->destroy(ss, dummytop);
+			subsys[i] = NULL;
+			mutex_unlock(&cgroup_mutex);
+			return ret;
+		}
+	}
+
+	/*
+	 * Now we need to entangle the css into the existing css_sets. unlike
+	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
+	 * will need a new pointer to it; done by iterating the css_set_table.
+	 * furthermore, modifying the existing css_sets will corrupt the hash
+	 * table state, so each changed css_set will need its hash recomputed.
+	 * this is all done under the css_set_lock.
+	 */
+	write_lock(&css_set_lock);
+	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+		struct css_set *cg;
+		struct hlist_node *node, *tmp;
+		struct hlist_head *bucket = &css_set_table[i], *new_bucket;
+
+		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
+			/* skip entries that we already rehashed */
+			if (cg->subsys[ss->subsys_id])
+				continue;
+			/* remove existing entry */
+			hlist_del(&cg->hlist);
+			/* set new value */
+			cg->subsys[ss->subsys_id] = css;
+			/* recompute hash and restore entry */
+			new_bucket = css_set_hash(cg->subsys);
+			hlist_add_head(&cg->hlist, new_bucket);
+		}
+	}
+	write_unlock(&css_set_lock);
+
+	mutex_init(&ss->hierarchy_mutex);
+	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
+	ss->active = 1;
+
+	/* success! */
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cgroup_load_subsys);
+
+/**
+ * cgroup_unload_subsys: unload a modular subsystem
+ * @ss: the subsystem to unload
+ *
+ * This function should be called in a modular subsystem's exitcall. When this
+ * function is invoked, the refcount on the subsystem's module will be 0, so
+ * the subsystem will not be attached to any hierarchy.
+ */
+void cgroup_unload_subsys(struct cgroup_subsys *ss)
+{
+	struct cg_cgroup_link *link;
+	struct hlist_head *hhead;
+
+	BUG_ON(ss->module == NULL);
+
+	/*
+	 * we shouldn't be called if the subsystem is in use, and the use of
+	 * try_module_get in parse_cgroupfs_options should ensure that it
+	 * doesn't start being used while we're killing it off.
+	 */
+	BUG_ON(ss->root != &rootnode);
+
+	mutex_lock(&cgroup_mutex);
+	/* deassign the subsys_id */
+	BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
+	subsys[ss->subsys_id] = NULL;
+
+	/* remove subsystem from rootnode's list of subsystems */
+	list_del(&ss->sibling);
+
+	/*
+	 * disentangle the css from all css_sets attached to the dummytop. as
+	 * in loading, we need to pay our respects to the hashtable gods.
+	 */
+	write_lock(&css_set_lock);
+	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
+		struct css_set *cg = link->cg;
+
+		hlist_del(&cg->hlist);
+		BUG_ON(!cg->subsys[ss->subsys_id]);
+		cg->subsys[ss->subsys_id] = NULL;
+		hhead = css_set_hash(cg->subsys);
+		hlist_add_head(&cg->hlist, hhead);
+	}
+	write_unlock(&css_set_lock);
+
+	/*
+	 * remove subsystem's css from the dummytop and free it - need to free
+	 * before marking as null because ss->destroy needs the cgrp->subsys
+	 * pointer to find their state. note that this also takes care of
+	 * freeing the css_id.
+	 */
+	ss->destroy(ss, dummytop);
+	dummytop->subsys[ss->subsys_id] = NULL;
+
+	mutex_unlock(&cgroup_mutex);
+}
+EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
+
+/**
  * cgroup_init_early - cgroup initialization at system boot
  *
  * Initialize cgroups at system boot, and initialize any
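
With the pair above, a modular subsystem's init and exit paths become one call each. A minimal skeleton under stated assumptions (the 'foo' subsystem and its create/destroy behaviour are hypothetical; note that .fork/.exit callbacks are rejected for modules):

	#include <linux/cgroup.h>
	#include <linux/module.h>
	#include <linux/slab.h>

	static struct cgroup_subsys_state *foo_create(struct cgroup_subsys *ss,
						      struct cgroup *cgrp)
	{
		struct cgroup_subsys_state *css;

		css = kzalloc(sizeof(*css), GFP_KERNEL);
		return css ? css : ERR_PTR(-ENOMEM);
	}

	static void foo_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		kfree(cgrp->subsys[ss->subsys_id]);
	}

	static struct cgroup_subsys foo_subsys = {
		.name = "foo",
		.create = foo_create,
		.destroy = foo_destroy,
		.module = THIS_MODULE,
		/* subsys_id is assigned by cgroup_load_subsys() at load time */
	};

	static int __init foo_init(void)
	{
		return cgroup_load_subsys(&foo_subsys);
	}

	static void __exit foo_exit(void)
	{
		cgroup_unload_subsys(&foo_subsys);
	}

	module_init(foo_init);
	module_exit(foo_exit);
	MODULE_LICENSE("GPL");
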
@@ -3235,7 +3822,8 @@ int __init cgroup_init_early(void)
 	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
 		INIT_HLIST_HEAD(&css_set_table[i]);
 
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	/* at bootup time, we don't worry about modular subsystems */
+	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 
 		BUG_ON(!ss->name);
3270 if (err) 3858 if (err)
3271 return err; 3859 return err;
3272 3860
3273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3861 /* at bootup time, we don't worry about modular subsystems */
3862 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3274 struct cgroup_subsys *ss = subsys[i]; 3863 struct cgroup_subsys *ss = subsys[i];
3275 if (!ss->early_init) 3864 if (!ss->early_init)
3276 cgroup_init_subsys(ss); 3865 cgroup_init_subsys(ss);
3277 if (ss->use_id) 3866 if (ss->use_id)
3278 cgroup_subsys_init_idr(ss); 3867 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3279 } 3868 }
3280 3869
3281 /* Add init_css_set to the hash table */ 3870 /* Add init_css_set to the hash table */
@@ -3379,9 +3968,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	int i;
 
 	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+	/*
+	 * ideally we don't want subsystems moving around while we do this.
+	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+	 * subsys/hierarchy state.
+	 */
 	mutex_lock(&cgroup_mutex);
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+		if (ss == NULL)
+			continue;
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->name, ss->root->hierarchy_id,
 			   ss->root->number_of_cgroups, !ss->disabled);
@@ -3439,7 +4035,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
 {
 	if (need_forkexit_callback) {
 		int i;
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		/*
+		 * forkexit callbacks are only supported for builtin
+		 * subsystems, and the builtin section of the subsys array is
+		 * immutable, so we don't need to lock the subsys array here.
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			if (ss->fork)
 				ss->fork(ss, child);
@@ -3508,7 +4109,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	struct css_set *cg;
 
 	if (run_callbacks && need_forkexit_callback) {
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		/*
+		 * modular subsystems can't use callbacks, so no need to lock
+		 * the subsys array
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			if (ss->exit)
 				ss->exit(ss, tsk);
@@ -3702,12 +4307,13 @@ static void check_for_release(struct cgroup *cgrp)
 	}
 }
 
-void __css_put(struct cgroup_subsys_state *css)
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css, int count)
 {
 	struct cgroup *cgrp = css->cgroup;
 	int val;
 	rcu_read_lock();
-	val = atomic_dec_return(&css->refcnt);
+	val = atomic_sub_return(count, &css->refcnt);
 	if (val == 1) {
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3718,6 +4324,7 @@ void __css_put(struct cgroup_subsys_state *css)
 	rcu_read_unlock();
 	WARN_ON_ONCE(val < 1);
 }
+EXPORT_SYMBOL_GPL(__css_put);
 
 /*
  * Notify userspace when a cgroup is released, by running the
@@ -3799,8 +4406,11 @@ static int __init cgroup_disable(char *str)
 	while ((token = strsep(&str, ",")) != NULL) {
 		if (!*token)
 			continue;
-
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		/*
+		 * cgroup_disable, being at boot time, can't know about module
+		 * subsystems, so we don't worry about them.
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 
 			if (!strcmp(token, ss->name)) {
@@ -3824,31 +4434,65 @@ __setup("cgroup_disable=", cgroup_disable);
  */
 unsigned short css_id(struct cgroup_subsys_state *css)
 {
-	struct css_id *cssid = rcu_dereference(css->id);
+	struct css_id *cssid;
+
+	/*
+	 * This css_id() can return a correct value when someone has a
+	 * refcnt on this css or this is under rcu_read_lock(). Once
+	 * css->id is allocated, it's unchanged until freed.
+	 */
+	cssid = rcu_dereference_check(css->id,
+			rcu_read_lock_held() || atomic_read(&css->refcnt));
 
 	if (cssid)
 		return cssid->id;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(css_id);
 
 unsigned short css_depth(struct cgroup_subsys_state *css)
 {
-	struct css_id *cssid = rcu_dereference(css->id);
+	struct css_id *cssid;
+
+	cssid = rcu_dereference_check(css->id,
+			rcu_read_lock_held() || atomic_read(&css->refcnt));
 
 	if (cssid)
 		return cssid->depth;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(css_depth);
+
+/**
+ * css_is_ancestor - test whether "root" css is an ancestor of "child"
+ * @child: the css to be tested.
+ * @root: the css supposed to be an ancestor of the child.
+ *
+ * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
+ * this function reads css->id, it uses rcu_dereference() under rcu_read_lock().
+ * Considering usual usage, though, the csses should be valid objects after the
+ * test. Assuming that the caller will take some action on the child if this
+ * returns true, the caller must hold a reference count on "child". If "child"
+ * is a valid object and this returns true, "root" is valid, too.
+ */
 
 bool css_is_ancestor(struct cgroup_subsys_state *child,
 		    const struct cgroup_subsys_state *root)
 {
-	struct css_id *child_id = rcu_dereference(child->id);
-	struct css_id *root_id = rcu_dereference(root->id);
+	struct css_id *child_id;
+	struct css_id *root_id;
+	bool ret = true;
 
-	if (!child_id || !root_id || (child_id->depth < root_id->depth))
-		return false;
-	return child_id->stack[root_id->depth] == root_id->id;
+	rcu_read_lock();
+	child_id = rcu_dereference(child->id);
+	root_id = rcu_dereference(root->id);
+	if (!child_id
+	    || !root_id
+	    || (child_id->depth < root_id->depth)
+	    || (child_id->stack[root_id->depth] != root_id->id))
+		ret = false;
+	rcu_read_unlock();
+	return ret;
 }
 
 static void __free_css_id_cb(struct rcu_head *head)
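
The rewritten test still relies on the css_id stack layout: stack[] records, for each depth, the id of the css's ancestor at that depth (see the css_id definition near the top of this file), so ancestry is a single array probe. A hedged illustration, not code from the patch:

	/*
	 * root:  depth = 1, id = 7,  stack = { 1, 7 }
	 * child: depth = 3, id = 42, stack = { 1, 7, 19, 42 }
	 *
	 * child->stack[root->depth] == root->id  (child's ancestor at
	 * depth 1 has id 7), so css_is_ancestor(child, root) is true;
	 * with a different id at that slot, or root deeper than child,
	 * it is false.
	 */
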
@@ -3875,6 +4519,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 	spin_unlock(&ss->id_lock);
 	call_rcu(&id->rcu_head, __free_css_id_cb);
 }
+EXPORT_SYMBOL_GPL(free_css_id);
 
 /*
  * This is called by init or create(). Then, calls to this function are
@@ -3924,15 +4569,14 @@ err_out:
 
 }
 
-static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
+					    struct cgroup_subsys_state *rootcss)
 {
 	struct css_id *newid;
-	struct cgroup_subsys_state *rootcss;
 
 	spin_lock_init(&ss->id_lock);
 	idr_init(&ss->idr);
 
-	rootcss = init_css_set.subsys[ss->subsys_id];
 	newid = get_new_cssid(ss, 0);
 	if (IS_ERR(newid))
 		return PTR_ERR(newid);
@@ -3948,13 +4592,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
 {
 	int subsys_id, i, depth = 0;
 	struct cgroup_subsys_state *parent_css, *child_css;
-	struct css_id *child_id, *parent_id = NULL;
+	struct css_id *child_id, *parent_id;
 
 	subsys_id = ss->subsys_id;
 	parent_css = parent->subsys[subsys_id];
 	child_css = child->subsys[subsys_id];
-	depth = css_depth(parent_css) + 1;
 	parent_id = parent_css->id;
+	depth = parent_id->depth + 1;
 
 	child_id = get_new_cssid(ss, depth);
 	if (IS_ERR(child_id))
@@ -3992,6 +4636,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
 
 	return rcu_dereference(cssid->css);
 }
+EXPORT_SYMBOL_GPL(css_lookup);
 
 /**
  * css_get_next - lookup next cgroup under specified hierarchy.