aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c693
1 files changed, 642 insertions, 51 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4fd90e129772..ef909a329750 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -44,6 +48,7 @@
44#include <linux/string.h> 48#include <linux/string.h>
45#include <linux/sort.h> 49#include <linux/sort.h>
46#include <linux/kmod.h> 50#include <linux/kmod.h>
51#include <linux/module.h>
47#include <linux/delayacct.h> 52#include <linux/delayacct.h>
48#include <linux/cgroupstats.h> 53#include <linux/cgroupstats.h>
49#include <linux/hash.h> 54#include <linux/hash.h>
@@ -52,15 +57,21 @@
52#include <linux/pid_namespace.h> 57#include <linux/pid_namespace.h>
53#include <linux/idr.h> 58#include <linux/idr.h>
54#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 59#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
60#include <linux/eventfd.h>
61#include <linux/poll.h>
55 62
56#include <asm/atomic.h> 63#include <asm/atomic.h>
57 64
58static DEFINE_MUTEX(cgroup_mutex); 65static DEFINE_MUTEX(cgroup_mutex);
59 66
60/* Generate an array of cgroup subsystem pointers */ 67/*
68 * Generate an array of cgroup subsystem pointers. At boot time, this is
69 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
70 * registered after that. The mutable section of this array is protected by
71 * cgroup_mutex.
72 */
61#define SUBSYS(_x) &_x ## _subsys, 73#define SUBSYS(_x) &_x ## _subsys,
62 74static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
63static struct cgroup_subsys *subsys[] = {
64#include <linux/cgroup_subsys.h> 75#include <linux/cgroup_subsys.h>
65}; 76};
66 77
@@ -147,6 +158,35 @@ struct css_id {
147 unsigned short stack[0]; /* Array of Length (depth+1) */ 158 unsigned short stack[0]; /* Array of Length (depth+1) */
148}; 159};
149 160
161/*
162 * cgroup_event represents events which userspace want to recieve.
163 */
164struct cgroup_event {
165 /*
166 * Cgroup which the event belongs to.
167 */
168 struct cgroup *cgrp;
169 /*
170 * Control file which the event associated.
171 */
172 struct cftype *cft;
173 /*
174 * eventfd to signal userspace about the event.
175 */
176 struct eventfd_ctx *eventfd;
177 /*
178 * Each of these stored in a list by the cgroup.
179 */
180 struct list_head list;
181 /*
182 * All fields below needed to unregister event when
183 * userspace closes eventfd.
184 */
185 poll_table pt;
186 wait_queue_head_t *wqh;
187 wait_queue_t wait;
188 struct work_struct remove;
189};
150 190
151/* The list of hierarchy roots */ 191/* The list of hierarchy roots */
152 192
@@ -250,7 +290,8 @@ struct cg_cgroup_link {
250static struct css_set init_css_set; 290static struct css_set init_css_set;
251static struct cg_cgroup_link init_css_set_link; 291static struct cg_cgroup_link init_css_set_link;
252 292
253static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); 293static int cgroup_init_idr(struct cgroup_subsys *ss,
294 struct cgroup_subsys_state *css);
254 295
255/* css_set_lock protects the list of css_set objects, and the 296/* css_set_lock protects the list of css_set objects, and the
256 * chain of tasks off each css_set. Nests outside task->alloc_lock 297 * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -448,8 +489,11 @@ static struct css_set *find_existing_css_set(
448 struct hlist_node *node; 489 struct hlist_node *node;
449 struct css_set *cg; 490 struct css_set *cg;
450 491
451 /* Built the set of subsystem state objects that we want to 492 /*
452 * see in the new css_set */ 493 * Build the set of subsystem state objects that we want to see in the
494 * new css_set. while subsystems can change globally, the entries here
495 * won't change, so no need for locking.
496 */
453 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 497 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
454 if (root->subsys_bits & (1UL << i)) { 498 if (root->subsys_bits & (1UL << i)) {
455 /* Subsystem is in this hierarchy. So we want 499 /* Subsystem is in this hierarchy. So we want
@@ -696,6 +740,7 @@ void cgroup_lock(void)
696{ 740{
697 mutex_lock(&cgroup_mutex); 741 mutex_lock(&cgroup_mutex);
698} 742}
743EXPORT_SYMBOL_GPL(cgroup_lock);
699 744
700/** 745/**
701 * cgroup_unlock - release lock on cgroup changes 746 * cgroup_unlock - release lock on cgroup changes
@@ -706,6 +751,7 @@ void cgroup_unlock(void)
706{ 751{
707 mutex_unlock(&cgroup_mutex); 752 mutex_unlock(&cgroup_mutex);
708} 753}
754EXPORT_SYMBOL_GPL(cgroup_unlock);
709 755
710/* 756/*
711 * A couple of forward declarations required, due to cyclic reference loop: 757 * A couple of forward declarations required, due to cyclic reference loop:
@@ -757,6 +803,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
757 if (ret) 803 if (ret)
758 break; 804 break;
759 } 805 }
806
760 return ret; 807 return ret;
761} 808}
762 809
@@ -884,7 +931,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
884 css_put(css); 931 css_put(css);
885} 932}
886 933
887 934/*
935 * Call with cgroup_mutex held. Drops reference counts on modules, including
936 * any duplicate ones that parse_cgroupfs_options took. If this function
937 * returns an error, no reference counts are touched.
938 */
888static int rebind_subsystems(struct cgroupfs_root *root, 939static int rebind_subsystems(struct cgroupfs_root *root,
889 unsigned long final_bits) 940 unsigned long final_bits)
890{ 941{
@@ -892,6 +943,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
892 struct cgroup *cgrp = &root->top_cgroup; 943 struct cgroup *cgrp = &root->top_cgroup;
893 int i; 944 int i;
894 945
946 BUG_ON(!mutex_is_locked(&cgroup_mutex));
947
895 removed_bits = root->actual_subsys_bits & ~final_bits; 948 removed_bits = root->actual_subsys_bits & ~final_bits;
896 added_bits = final_bits & ~root->actual_subsys_bits; 949 added_bits = final_bits & ~root->actual_subsys_bits;
897 /* Check that any added subsystems are currently free */ 950 /* Check that any added subsystems are currently free */
@@ -900,6 +953,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
900 struct cgroup_subsys *ss = subsys[i]; 953 struct cgroup_subsys *ss = subsys[i];
901 if (!(bit & added_bits)) 954 if (!(bit & added_bits))
902 continue; 955 continue;
956 /*
957 * Nobody should tell us to do a subsys that doesn't exist:
958 * parse_cgroupfs_options should catch that case and refcounts
959 * ensure that subsystems won't disappear once selected.
960 */
961 BUG_ON(ss == NULL);
903 if (ss->root != &rootnode) { 962 if (ss->root != &rootnode) {
904 /* Subsystem isn't free */ 963 /* Subsystem isn't free */
905 return -EBUSY; 964 return -EBUSY;
@@ -919,6 +978,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
919 unsigned long bit = 1UL << i; 978 unsigned long bit = 1UL << i;
920 if (bit & added_bits) { 979 if (bit & added_bits) {
921 /* We're binding this subsystem to this hierarchy */ 980 /* We're binding this subsystem to this hierarchy */
981 BUG_ON(ss == NULL);
922 BUG_ON(cgrp->subsys[i]); 982 BUG_ON(cgrp->subsys[i]);
923 BUG_ON(!dummytop->subsys[i]); 983 BUG_ON(!dummytop->subsys[i]);
924 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 984 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -930,8 +990,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
930 if (ss->bind) 990 if (ss->bind)
931 ss->bind(ss, cgrp); 991 ss->bind(ss, cgrp);
932 mutex_unlock(&ss->hierarchy_mutex); 992 mutex_unlock(&ss->hierarchy_mutex);
993 /* refcount was already taken, and we're keeping it */
933 } else if (bit & removed_bits) { 994 } else if (bit & removed_bits) {
934 /* We're removing this subsystem */ 995 /* We're removing this subsystem */
996 BUG_ON(ss == NULL);
935 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 997 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
936 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 998 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
937 mutex_lock(&ss->hierarchy_mutex); 999 mutex_lock(&ss->hierarchy_mutex);
@@ -942,9 +1004,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
942 subsys[i]->root = &rootnode; 1004 subsys[i]->root = &rootnode;
943 list_move(&ss->sibling, &rootnode.subsys_list); 1005 list_move(&ss->sibling, &rootnode.subsys_list);
944 mutex_unlock(&ss->hierarchy_mutex); 1006 mutex_unlock(&ss->hierarchy_mutex);
1007 /* subsystem is now free - drop reference on module */
1008 module_put(ss->module);
945 } else if (bit & final_bits) { 1009 } else if (bit & final_bits) {
946 /* Subsystem state should already exist */ 1010 /* Subsystem state should already exist */
1011 BUG_ON(ss == NULL);
947 BUG_ON(!cgrp->subsys[i]); 1012 BUG_ON(!cgrp->subsys[i]);
1013 /*
1014 * a refcount was taken, but we already had one, so
1015 * drop the extra reference.
1016 */
1017 module_put(ss->module);
1018#ifdef CONFIG_MODULE_UNLOAD
1019 BUG_ON(ss->module && !module_refcount(ss->module));
1020#endif
948 } else { 1021 } else {
949 /* Subsystem state shouldn't exist */ 1022 /* Subsystem state shouldn't exist */
950 BUG_ON(cgrp->subsys[i]); 1023 BUG_ON(cgrp->subsys[i]);
@@ -986,13 +1059,20 @@ struct cgroup_sb_opts {
986 1059
987}; 1060};
988 1061
989/* Convert a hierarchy specifier into a bitmask of subsystems and 1062/*
990 * flags. */ 1063 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
991static int parse_cgroupfs_options(char *data, 1064 * with cgroup_mutex held to protect the subsys[] array. This function takes
992 struct cgroup_sb_opts *opts) 1065 * refcounts on subsystems to be used, unless it returns error, in which case
1066 * no refcounts are taken.
1067 */
1068static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
993{ 1069{
994 char *token, *o = data ?: "all"; 1070 char *token, *o = data ?: "all";
995 unsigned long mask = (unsigned long)-1; 1071 unsigned long mask = (unsigned long)-1;
1072 int i;
1073 bool module_pin_failed = false;
1074
1075 BUG_ON(!mutex_is_locked(&cgroup_mutex));
996 1076
997#ifdef CONFIG_CPUSETS 1077#ifdef CONFIG_CPUSETS
998 mask = ~(1UL << cpuset_subsys_id); 1078 mask = ~(1UL << cpuset_subsys_id);
@@ -1005,10 +1085,11 @@ static int parse_cgroupfs_options(char *data,
1005 return -EINVAL; 1085 return -EINVAL;
1006 if (!strcmp(token, "all")) { 1086 if (!strcmp(token, "all")) {
1007 /* Add all non-disabled subsystems */ 1087 /* Add all non-disabled subsystems */
1008 int i;
1009 opts->subsys_bits = 0; 1088 opts->subsys_bits = 0;
1010 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1089 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1011 struct cgroup_subsys *ss = subsys[i]; 1090 struct cgroup_subsys *ss = subsys[i];
1091 if (ss == NULL)
1092 continue;
1012 if (!ss->disabled) 1093 if (!ss->disabled)
1013 opts->subsys_bits |= 1ul << i; 1094 opts->subsys_bits |= 1ul << i;
1014 } 1095 }
@@ -1026,7 +1107,6 @@ static int parse_cgroupfs_options(char *data,
1026 if (!opts->release_agent) 1107 if (!opts->release_agent)
1027 return -ENOMEM; 1108 return -ENOMEM;
1028 } else if (!strncmp(token, "name=", 5)) { 1109 } else if (!strncmp(token, "name=", 5)) {
1029 int i;
1030 const char *name = token + 5; 1110 const char *name = token + 5;
1031 /* Can't specify an empty name */ 1111 /* Can't specify an empty name */
1032 if (!strlen(name)) 1112 if (!strlen(name))
@@ -1050,9 +1130,10 @@ static int parse_cgroupfs_options(char *data,
1050 return -ENOMEM; 1130 return -ENOMEM;
1051 } else { 1131 } else {
1052 struct cgroup_subsys *ss; 1132 struct cgroup_subsys *ss;
1053 int i;
1054 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1133 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1055 ss = subsys[i]; 1134 ss = subsys[i];
1135 if (ss == NULL)
1136 continue;
1056 if (!strcmp(token, ss->name)) { 1137 if (!strcmp(token, ss->name)) {
1057 if (!ss->disabled) 1138 if (!ss->disabled)
1058 set_bit(i, &opts->subsys_bits); 1139 set_bit(i, &opts->subsys_bits);
@@ -1087,9 +1168,54 @@ static int parse_cgroupfs_options(char *data,
1087 if (!opts->subsys_bits && !opts->name) 1168 if (!opts->subsys_bits && !opts->name)
1088 return -EINVAL; 1169 return -EINVAL;
1089 1170
1171 /*
1172 * Grab references on all the modules we'll need, so the subsystems
1173 * don't dance around before rebind_subsystems attaches them. This may
1174 * take duplicate reference counts on a subsystem that's already used,
1175 * but rebind_subsystems handles this case.
1176 */
1177 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1178 unsigned long bit = 1UL << i;
1179
1180 if (!(bit & opts->subsys_bits))
1181 continue;
1182 if (!try_module_get(subsys[i]->module)) {
1183 module_pin_failed = true;
1184 break;
1185 }
1186 }
1187 if (module_pin_failed) {
1188 /*
1189 * oops, one of the modules was going away. this means that we
1190 * raced with a module_delete call, and to the user this is
1191 * essentially a "subsystem doesn't exist" case.
1192 */
1193 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1194 /* drop refcounts only on the ones we took */
1195 unsigned long bit = 1UL << i;
1196
1197 if (!(bit & opts->subsys_bits))
1198 continue;
1199 module_put(subsys[i]->module);
1200 }
1201 return -ENOENT;
1202 }
1203
1090 return 0; 1204 return 0;
1091} 1205}
1092 1206
1207static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1208{
1209 int i;
1210 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1211 unsigned long bit = 1UL << i;
1212
1213 if (!(bit & subsys_bits))
1214 continue;
1215 module_put(subsys[i]->module);
1216 }
1217}
1218
1093static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1219static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1094{ 1220{
1095 int ret = 0; 1221 int ret = 0;
@@ -1106,21 +1232,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1106 if (ret) 1232 if (ret)
1107 goto out_unlock; 1233 goto out_unlock;
1108 1234
1109 /* Don't allow flags to change at remount */ 1235 /* Don't allow flags or name to change at remount */
1110 if (opts.flags != root->flags) { 1236 if (opts.flags != root->flags ||
1111 ret = -EINVAL; 1237 (opts.name && strcmp(opts.name, root->name))) {
1112 goto out_unlock;
1113 }
1114
1115 /* Don't allow name to change at remount */
1116 if (opts.name && strcmp(opts.name, root->name)) {
1117 ret = -EINVAL; 1238 ret = -EINVAL;
1239 drop_parsed_module_refcounts(opts.subsys_bits);
1118 goto out_unlock; 1240 goto out_unlock;
1119 } 1241 }
1120 1242
1121 ret = rebind_subsystems(root, opts.subsys_bits); 1243 ret = rebind_subsystems(root, opts.subsys_bits);
1122 if (ret) 1244 if (ret) {
1245 drop_parsed_module_refcounts(opts.subsys_bits);
1123 goto out_unlock; 1246 goto out_unlock;
1247 }
1124 1248
1125 /* (re)populate subsystem files */ 1249 /* (re)populate subsystem files */
1126 cgroup_populate_dir(cgrp); 1250 cgroup_populate_dir(cgrp);
@@ -1151,6 +1275,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1151 INIT_LIST_HEAD(&cgrp->release_list); 1275 INIT_LIST_HEAD(&cgrp->release_list);
1152 INIT_LIST_HEAD(&cgrp->pidlists); 1276 INIT_LIST_HEAD(&cgrp->pidlists);
1153 mutex_init(&cgrp->pidlist_mutex); 1277 mutex_init(&cgrp->pidlist_mutex);
1278 INIT_LIST_HEAD(&cgrp->event_list);
1279 spin_lock_init(&cgrp->event_list_lock);
1154} 1280}
1155 1281
1156static void init_cgroup_root(struct cgroupfs_root *root) 1282static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1306,7 +1432,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1306 struct cgroupfs_root *new_root; 1432 struct cgroupfs_root *new_root;
1307 1433
1308 /* First find the desired set of subsystems */ 1434 /* First find the desired set of subsystems */
1435 mutex_lock(&cgroup_mutex);
1309 ret = parse_cgroupfs_options(data, &opts); 1436 ret = parse_cgroupfs_options(data, &opts);
1437 mutex_unlock(&cgroup_mutex);
1310 if (ret) 1438 if (ret)
1311 goto out_err; 1439 goto out_err;
1312 1440
@@ -1317,7 +1445,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1317 new_root = cgroup_root_from_opts(&opts); 1445 new_root = cgroup_root_from_opts(&opts);
1318 if (IS_ERR(new_root)) { 1446 if (IS_ERR(new_root)) {
1319 ret = PTR_ERR(new_root); 1447 ret = PTR_ERR(new_root);
1320 goto out_err; 1448 goto drop_modules;
1321 } 1449 }
1322 opts.new_root = new_root; 1450 opts.new_root = new_root;
1323 1451
@@ -1326,7 +1454,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1326 if (IS_ERR(sb)) { 1454 if (IS_ERR(sb)) {
1327 ret = PTR_ERR(sb); 1455 ret = PTR_ERR(sb);
1328 cgroup_drop_root(opts.new_root); 1456 cgroup_drop_root(opts.new_root);
1329 goto out_err; 1457 goto drop_modules;
1330 } 1458 }
1331 1459
1332 root = sb->s_fs_info; 1460 root = sb->s_fs_info;
@@ -1382,6 +1510,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1382 free_cg_links(&tmp_cg_links); 1510 free_cg_links(&tmp_cg_links);
1383 goto drop_new_super; 1511 goto drop_new_super;
1384 } 1512 }
1513 /*
1514 * There must be no failure case after here, since rebinding
1515 * takes care of subsystems' refcounts, which are explicitly
1516 * dropped in the failure exit path.
1517 */
1385 1518
1386 /* EBUSY should be the only error here */ 1519 /* EBUSY should be the only error here */
1387 BUG_ON(ret); 1520 BUG_ON(ret);
@@ -1420,6 +1553,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1420 * any) is not needed 1553 * any) is not needed
1421 */ 1554 */
1422 cgroup_drop_root(opts.new_root); 1555 cgroup_drop_root(opts.new_root);
1556 /* no subsys rebinding, so refcounts don't change */
1557 drop_parsed_module_refcounts(opts.subsys_bits);
1423 } 1558 }
1424 1559
1425 simple_set_mnt(mnt, sb); 1560 simple_set_mnt(mnt, sb);
@@ -1429,6 +1564,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1429 1564
1430 drop_new_super: 1565 drop_new_super:
1431 deactivate_locked_super(sb); 1566 deactivate_locked_super(sb);
1567 drop_modules:
1568 drop_parsed_module_refcounts(opts.subsys_bits);
1432 out_err: 1569 out_err:
1433 kfree(opts.release_agent); 1570 kfree(opts.release_agent);
1434 kfree(opts.name); 1571 kfree(opts.name);
@@ -1542,6 +1679,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1542 memmove(buf, start, buf + buflen - start); 1679 memmove(buf, start, buf + buflen - start);
1543 return 0; 1680 return 0;
1544} 1681}
1682EXPORT_SYMBOL_GPL(cgroup_path);
1545 1683
1546/** 1684/**
1547 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1685 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1554,7 +1692,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1554int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1692int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1555{ 1693{
1556 int retval = 0; 1694 int retval = 0;
1557 struct cgroup_subsys *ss; 1695 struct cgroup_subsys *ss, *failed_ss = NULL;
1558 struct cgroup *oldcgrp; 1696 struct cgroup *oldcgrp;
1559 struct css_set *cg; 1697 struct css_set *cg;
1560 struct css_set *newcg; 1698 struct css_set *newcg;
@@ -1568,8 +1706,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1568 for_each_subsys(root, ss) { 1706 for_each_subsys(root, ss) {
1569 if (ss->can_attach) { 1707 if (ss->can_attach) {
1570 retval = ss->can_attach(ss, cgrp, tsk, false); 1708 retval = ss->can_attach(ss, cgrp, tsk, false);
1571 if (retval) 1709 if (retval) {
1572 return retval; 1710 /*
1711 * Remember on which subsystem the can_attach()
1712 * failed, so that we only call cancel_attach()
1713 * against the subsystems whose can_attach()
1714 * succeeded. (See below)
1715 */
1716 failed_ss = ss;
1717 goto out;
1718 }
1573 } 1719 }
1574 } 1720 }
1575 1721
@@ -1583,14 +1729,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1583 */ 1729 */
1584 newcg = find_css_set(cg, cgrp); 1730 newcg = find_css_set(cg, cgrp);
1585 put_css_set(cg); 1731 put_css_set(cg);
1586 if (!newcg) 1732 if (!newcg) {
1587 return -ENOMEM; 1733 retval = -ENOMEM;
1734 goto out;
1735 }
1588 1736
1589 task_lock(tsk); 1737 task_lock(tsk);
1590 if (tsk->flags & PF_EXITING) { 1738 if (tsk->flags & PF_EXITING) {
1591 task_unlock(tsk); 1739 task_unlock(tsk);
1592 put_css_set(newcg); 1740 put_css_set(newcg);
1593 return -ESRCH; 1741 retval = -ESRCH;
1742 goto out;
1594 } 1743 }
1595 rcu_assign_pointer(tsk->cgroups, newcg); 1744 rcu_assign_pointer(tsk->cgroups, newcg);
1596 task_unlock(tsk); 1745 task_unlock(tsk);
@@ -1616,7 +1765,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1616 * is no longer empty. 1765 * is no longer empty.
1617 */ 1766 */
1618 cgroup_wakeup_rmdir_waiter(cgrp); 1767 cgroup_wakeup_rmdir_waiter(cgrp);
1619 return 0; 1768out:
1769 if (retval) {
1770 for_each_subsys(root, ss) {
1771 if (ss == failed_ss)
1772 /*
1773 * This subsystem was the one that failed the
1774 * can_attach() check earlier, so we don't need
1775 * to call cancel_attach() against it or any
1776 * remaining subsystems.
1777 */
1778 break;
1779 if (ss->cancel_attach)
1780 ss->cancel_attach(ss, cgrp, tsk, false);
1781 }
1782 }
1783 return retval;
1620} 1784}
1621 1785
1622/* 1786/*
@@ -1682,6 +1846,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
1682 } 1846 }
1683 return true; 1847 return true;
1684} 1848}
1849EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
1685 1850
1686static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 1851static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1687 const char *buffer) 1852 const char *buffer)
@@ -1950,6 +2115,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
1950 .rename = cgroup_rename, 2115 .rename = cgroup_rename,
1951}; 2116};
1952 2117
2118/*
2119 * Check if a file is a control file
2120 */
2121static inline struct cftype *__file_cft(struct file *file)
2122{
2123 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2124 return ERR_PTR(-EINVAL);
2125 return __d_cft(file->f_dentry);
2126}
2127
1953static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2128static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1954 struct super_block *sb) 2129 struct super_block *sb)
1955{ 2130{
@@ -2069,6 +2244,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2069 error = PTR_ERR(dentry); 2244 error = PTR_ERR(dentry);
2070 return error; 2245 return error;
2071} 2246}
2247EXPORT_SYMBOL_GPL(cgroup_add_file);
2072 2248
2073int cgroup_add_files(struct cgroup *cgrp, 2249int cgroup_add_files(struct cgroup *cgrp,
2074 struct cgroup_subsys *subsys, 2250 struct cgroup_subsys *subsys,
@@ -2083,6 +2259,7 @@ int cgroup_add_files(struct cgroup *cgrp,
2083 } 2259 }
2084 return 0; 2260 return 0;
2085} 2261}
2262EXPORT_SYMBOL_GPL(cgroup_add_files);
2086 2263
2087/** 2264/**
2088 * cgroup_task_count - count the number of tasks in a cgroup. 2265 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2468,7 +2645,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2468{ 2645{
2469 struct cgroup_pidlist *l; 2646 struct cgroup_pidlist *l;
2470 /* don't need task_nsproxy() if we're looking at ourself */ 2647 /* don't need task_nsproxy() if we're looking at ourself */
2471 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); 2648 struct pid_namespace *ns = current->nsproxy->pid_ns;
2649
2472 /* 2650 /*
2473 * We can't drop the pidlist_mutex before taking the l->mutex in case 2651 * We can't drop the pidlist_mutex before taking the l->mutex in case
2474 * the last ref-holder is trying to remove l from the list at the same 2652 * the last ref-holder is trying to remove l from the list at the same
@@ -2478,8 +2656,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2478 mutex_lock(&cgrp->pidlist_mutex); 2656 mutex_lock(&cgrp->pidlist_mutex);
2479 list_for_each_entry(l, &cgrp->pidlists, links) { 2657 list_for_each_entry(l, &cgrp->pidlists, links) {
2480 if (l->key.type == type && l->key.ns == ns) { 2658 if (l->key.type == type && l->key.ns == ns) {
2481 /* found a matching list - drop the extra refcount */
2482 put_pid_ns(ns);
2483 /* make sure l doesn't vanish out from under us */ 2659 /* make sure l doesn't vanish out from under us */
2484 down_write(&l->mutex); 2660 down_write(&l->mutex);
2485 mutex_unlock(&cgrp->pidlist_mutex); 2661 mutex_unlock(&cgrp->pidlist_mutex);
@@ -2490,13 +2666,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2490 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 2666 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2491 if (!l) { 2667 if (!l) {
2492 mutex_unlock(&cgrp->pidlist_mutex); 2668 mutex_unlock(&cgrp->pidlist_mutex);
2493 put_pid_ns(ns);
2494 return l; 2669 return l;
2495 } 2670 }
2496 init_rwsem(&l->mutex); 2671 init_rwsem(&l->mutex);
2497 down_write(&l->mutex); 2672 down_write(&l->mutex);
2498 l->key.type = type; 2673 l->key.type = type;
2499 l->key.ns = ns; 2674 l->key.ns = get_pid_ns(ns);
2500 l->use_count = 0; /* don't increment here */ 2675 l->use_count = 0; /* don't increment here */
2501 l->list = NULL; 2676 l->list = NULL;
2502 l->owner = cgrp; 2677 l->owner = cgrp;
@@ -2804,6 +2979,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2804} 2979}
2805 2980
2806/* 2981/*
2982 * Unregister event and free resources.
2983 *
2984 * Gets called from workqueue.
2985 */
2986static void cgroup_event_remove(struct work_struct *work)
2987{
2988 struct cgroup_event *event = container_of(work, struct cgroup_event,
2989 remove);
2990 struct cgroup *cgrp = event->cgrp;
2991
2992 /* TODO: check return code */
2993 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2994
2995 eventfd_ctx_put(event->eventfd);
2996 kfree(event);
2997 dput(cgrp->dentry);
2998}
2999
3000/*
3001 * Gets called on POLLHUP on eventfd when user closes it.
3002 *
3003 * Called with wqh->lock held and interrupts disabled.
3004 */
3005static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3006 int sync, void *key)
3007{
3008 struct cgroup_event *event = container_of(wait,
3009 struct cgroup_event, wait);
3010 struct cgroup *cgrp = event->cgrp;
3011 unsigned long flags = (unsigned long)key;
3012
3013 if (flags & POLLHUP) {
3014 remove_wait_queue_locked(event->wqh, &event->wait);
3015 spin_lock(&cgrp->event_list_lock);
3016 list_del(&event->list);
3017 spin_unlock(&cgrp->event_list_lock);
3018 /*
3019 * We are in atomic context, but cgroup_event_remove() may
3020 * sleep, so we have to call it in workqueue.
3021 */
3022 schedule_work(&event->remove);
3023 }
3024
3025 return 0;
3026}
3027
3028static void cgroup_event_ptable_queue_proc(struct file *file,
3029 wait_queue_head_t *wqh, poll_table *pt)
3030{
3031 struct cgroup_event *event = container_of(pt,
3032 struct cgroup_event, pt);
3033
3034 event->wqh = wqh;
3035 add_wait_queue(wqh, &event->wait);
3036}
3037
3038/*
3039 * Parse input and register new cgroup event handler.
3040 *
3041 * Input must be in format '<event_fd> <control_fd> <args>'.
3042 * Interpretation of args is defined by control file implementation.
3043 */
3044static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3045 const char *buffer)
3046{
3047 struct cgroup_event *event = NULL;
3048 unsigned int efd, cfd;
3049 struct file *efile = NULL;
3050 struct file *cfile = NULL;
3051 char *endp;
3052 int ret;
3053
3054 efd = simple_strtoul(buffer, &endp, 10);
3055 if (*endp != ' ')
3056 return -EINVAL;
3057 buffer = endp + 1;
3058
3059 cfd = simple_strtoul(buffer, &endp, 10);
3060 if ((*endp != ' ') && (*endp != '\0'))
3061 return -EINVAL;
3062 buffer = endp + 1;
3063
3064 event = kzalloc(sizeof(*event), GFP_KERNEL);
3065 if (!event)
3066 return -ENOMEM;
3067 event->cgrp = cgrp;
3068 INIT_LIST_HEAD(&event->list);
3069 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3070 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3071 INIT_WORK(&event->remove, cgroup_event_remove);
3072
3073 efile = eventfd_fget(efd);
3074 if (IS_ERR(efile)) {
3075 ret = PTR_ERR(efile);
3076 goto fail;
3077 }
3078
3079 event->eventfd = eventfd_ctx_fileget(efile);
3080 if (IS_ERR(event->eventfd)) {
3081 ret = PTR_ERR(event->eventfd);
3082 goto fail;
3083 }
3084
3085 cfile = fget(cfd);
3086 if (!cfile) {
3087 ret = -EBADF;
3088 goto fail;
3089 }
3090
3091 /* the process need read permission on control file */
3092 ret = file_permission(cfile, MAY_READ);
3093 if (ret < 0)
3094 goto fail;
3095
3096 event->cft = __file_cft(cfile);
3097 if (IS_ERR(event->cft)) {
3098 ret = PTR_ERR(event->cft);
3099 goto fail;
3100 }
3101
3102 if (!event->cft->register_event || !event->cft->unregister_event) {
3103 ret = -EINVAL;
3104 goto fail;
3105 }
3106
3107 ret = event->cft->register_event(cgrp, event->cft,
3108 event->eventfd, buffer);
3109 if (ret)
3110 goto fail;
3111
3112 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3113 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3114 ret = 0;
3115 goto fail;
3116 }
3117
3118 /*
3119 * Events should be removed after rmdir of cgroup directory, but before
3120 * destroying subsystem state objects. Let's take reference to cgroup
3121 * directory dentry to do that.
3122 */
3123 dget(cgrp->dentry);
3124
3125 spin_lock(&cgrp->event_list_lock);
3126 list_add(&event->list, &cgrp->event_list);
3127 spin_unlock(&cgrp->event_list_lock);
3128
3129 fput(cfile);
3130 fput(efile);
3131
3132 return 0;
3133
3134fail:
3135 if (cfile)
3136 fput(cfile);
3137
3138 if (event && event->eventfd && !IS_ERR(event->eventfd))
3139 eventfd_ctx_put(event->eventfd);
3140
3141 if (!IS_ERR_OR_NULL(efile))
3142 fput(efile);
3143
3144 kfree(event);
3145
3146 return ret;
3147}
3148
3149/*
2807 * for the common functions, 'private' gives the type of file 3150 * for the common functions, 'private' gives the type of file
2808 */ 3151 */
2809/* for hysterical raisins, we can't put this on the older files */ 3152/* for hysterical raisins, we can't put this on the older files */
@@ -2828,6 +3171,11 @@ static struct cftype files[] = {
2828 .read_u64 = cgroup_read_notify_on_release, 3171 .read_u64 = cgroup_read_notify_on_release,
2829 .write_u64 = cgroup_write_notify_on_release, 3172 .write_u64 = cgroup_write_notify_on_release,
2830 }, 3173 },
3174 {
3175 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3176 .write_string = cgroup_write_event_control,
3177 .mode = S_IWUGO,
3178 },
2831}; 3179};
2832 3180
2833static struct cftype cft_release_agent = { 3181static struct cftype cft_release_agent = {
@@ -2892,8 +3240,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2892 /* We need to take each hierarchy_mutex in a consistent order */ 3240 /* We need to take each hierarchy_mutex in a consistent order */
2893 int i; 3241 int i;
2894 3242
3243 /*
3244 * No worry about a race with rebind_subsystems that might mess up the
3245 * locking order, since both parties are under cgroup_mutex.
3246 */
2895 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3247 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2896 struct cgroup_subsys *ss = subsys[i]; 3248 struct cgroup_subsys *ss = subsys[i];
3249 if (ss == NULL)
3250 continue;
2897 if (ss->root == root) 3251 if (ss->root == root)
2898 mutex_lock(&ss->hierarchy_mutex); 3252 mutex_lock(&ss->hierarchy_mutex);
2899 } 3253 }
@@ -2905,6 +3259,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2905 3259
2906 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3260 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2907 struct cgroup_subsys *ss = subsys[i]; 3261 struct cgroup_subsys *ss = subsys[i];
3262 if (ss == NULL)
3263 continue;
2908 if (ss->root == root) 3264 if (ss->root == root)
2909 mutex_unlock(&ss->hierarchy_mutex); 3265 mutex_unlock(&ss->hierarchy_mutex);
2910 } 3266 }
@@ -3028,11 +3384,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3028 * synchronization other than RCU, and the subsystem linked 3384 * synchronization other than RCU, and the subsystem linked
3029 * list isn't RCU-safe */ 3385 * list isn't RCU-safe */
3030 int i; 3386 int i;
3387 /*
3388 * We won't need to lock the subsys array, because the subsystems
3389 * we're concerned about aren't going anywhere since our cgroup root
3390 * has a reference on them.
3391 */
3031 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3392 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3032 struct cgroup_subsys *ss = subsys[i]; 3393 struct cgroup_subsys *ss = subsys[i];
3033 struct cgroup_subsys_state *css; 3394 struct cgroup_subsys_state *css;
3034 /* Skip subsystems not in this hierarchy */ 3395 /* Skip subsystems not present or not in this hierarchy */
3035 if (ss->root != cgrp->root) 3396 if (ss == NULL || ss->root != cgrp->root)
3036 continue; 3397 continue;
3037 css = cgrp->subsys[ss->subsys_id]; 3398 css = cgrp->subsys[ss->subsys_id];
3038 /* When called from check_for_release() it's possible 3399 /* When called from check_for_release() it's possible
@@ -3106,6 +3467,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3106 struct dentry *d; 3467 struct dentry *d;
3107 struct cgroup *parent; 3468 struct cgroup *parent;
3108 DEFINE_WAIT(wait); 3469 DEFINE_WAIT(wait);
3470 struct cgroup_event *event, *tmp;
3109 int ret; 3471 int ret;
3110 3472
3111 /* the vfs holds both inode->i_mutex already */ 3473 /* the vfs holds both inode->i_mutex already */
@@ -3189,6 +3551,20 @@ again:
3189 set_bit(CGRP_RELEASABLE, &parent->flags); 3551 set_bit(CGRP_RELEASABLE, &parent->flags);
3190 check_for_release(parent); 3552 check_for_release(parent);
3191 3553
3554 /*
3555 * Unregister events and notify userspace.
3556 * Notify userspace about cgroup removing only after rmdir of cgroup
3557 * directory to avoid race between userspace and kernelspace
3558 */
3559 spin_lock(&cgrp->event_list_lock);
3560 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3561 list_del(&event->list);
3562 remove_wait_queue(event->wqh, &event->wait);
3563 eventfd_signal(event->eventfd, 1);
3564 schedule_work(&event->remove);
3565 }
3566 spin_unlock(&cgrp->event_list_lock);
3567
3192 mutex_unlock(&cgroup_mutex); 3568 mutex_unlock(&cgroup_mutex);
3193 return 0; 3569 return 0;
3194} 3570}
@@ -3223,7 +3599,196 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3223 mutex_init(&ss->hierarchy_mutex); 3599 mutex_init(&ss->hierarchy_mutex);
3224 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3600 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3225 ss->active = 1; 3601 ss->active = 1;
3602
3603 /* this function shouldn't be used with modular subsystems, since they
3604 * need to register a subsys_id, among other things */
3605 BUG_ON(ss->module);
3606}
3607
3608/**
3609 * cgroup_load_subsys: load and register a modular subsystem at runtime
3610 * @ss: the subsystem to load
3611 *
3612 * This function should be called in a modular subsystem's initcall. If the
3613 * subsytem is built as a module, it will be assigned a new subsys_id and set
3614 * up for use. If the subsystem is built-in anyway, work is delegated to the
3615 * simpler cgroup_init_subsys.
3616 */
3617int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3618{
3619 int i;
3620 struct cgroup_subsys_state *css;
3621
3622 /* check name and function validity */
3623 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3624 ss->create == NULL || ss->destroy == NULL)
3625 return -EINVAL;
3626
3627 /*
3628 * we don't support callbacks in modular subsystems. this check is
3629 * before the ss->module check for consistency; a subsystem that could
3630 * be a module should still have no callbacks even if the user isn't
3631 * compiling it as one.
3632 */
3633 if (ss->fork || ss->exit)
3634 return -EINVAL;
3635
3636 /*
3637 * an optionally modular subsystem is built-in: we want to do nothing,
3638 * since cgroup_init_subsys will have already taken care of it.
3639 */
3640 if (ss->module == NULL) {
3641 /* a few sanity checks */
3642 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3643 BUG_ON(subsys[ss->subsys_id] != ss);
3644 return 0;
3645 }
3646
3647 /*
3648 * need to register a subsys id before anything else - for example,
3649 * init_cgroup_css needs it.
3650 */
3651 mutex_lock(&cgroup_mutex);
3652 /* find the first empty slot in the array */
3653 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3654 if (subsys[i] == NULL)
3655 break;
3656 }
3657 if (i == CGROUP_SUBSYS_COUNT) {
3658 /* maximum number of subsystems already registered! */
3659 mutex_unlock(&cgroup_mutex);
3660 return -EBUSY;
3661 }
3662 /* assign ourselves the subsys_id */
3663 ss->subsys_id = i;
3664 subsys[i] = ss;
3665
3666 /*
3667 * no ss->create seems to need anything important in the ss struct, so
3668 * this can happen first (i.e. before the rootnode attachment).
3669 */
3670 css = ss->create(ss, dummytop);
3671 if (IS_ERR(css)) {
3672 /* failure case - need to deassign the subsys[] slot. */
3673 subsys[i] = NULL;
3674 mutex_unlock(&cgroup_mutex);
3675 return PTR_ERR(css);
3676 }
3677
3678 list_add(&ss->sibling, &rootnode.subsys_list);
3679 ss->root = &rootnode;
3680
3681 /* our new subsystem will be attached to the dummy hierarchy. */
3682 init_cgroup_css(css, ss, dummytop);
3683 /* init_idr must be after init_cgroup_css because it sets css->id. */
3684 if (ss->use_id) {
3685 int ret = cgroup_init_idr(ss, css);
3686 if (ret) {
3687 dummytop->subsys[ss->subsys_id] = NULL;
3688 ss->destroy(ss, dummytop);
3689 subsys[i] = NULL;
3690 mutex_unlock(&cgroup_mutex);
3691 return ret;
3692 }
3693 }
3694
3695 /*
3696 * Now we need to entangle the css into the existing css_sets. unlike
3697 * in cgroup_init_subsys, there are now multiple css_sets, so each one
3698 * will need a new pointer to it; done by iterating the css_set_table.
3699 * furthermore, modifying the existing css_sets will corrupt the hash
3700 * table state, so each changed css_set will need its hash recomputed.
3701 * this is all done under the css_set_lock.
3702 */
3703 write_lock(&css_set_lock);
3704 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3705 struct css_set *cg;
3706 struct hlist_node *node, *tmp;
3707 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3708
3709 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3710 /* skip entries that we already rehashed */
3711 if (cg->subsys[ss->subsys_id])
3712 continue;
3713 /* remove existing entry */
3714 hlist_del(&cg->hlist);
3715 /* set new value */
3716 cg->subsys[ss->subsys_id] = css;
3717 /* recompute hash and restore entry */
3718 new_bucket = css_set_hash(cg->subsys);
3719 hlist_add_head(&cg->hlist, new_bucket);
3720 }
3721 }
3722 write_unlock(&css_set_lock);
3723
3724 mutex_init(&ss->hierarchy_mutex);
3725 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3726 ss->active = 1;
3727
3728 /* success! */
3729 mutex_unlock(&cgroup_mutex);
3730 return 0;
3226} 3731}
3732EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3733
3734/**
3735 * cgroup_unload_subsys: unload a modular subsystem
3736 * @ss: the subsystem to unload
3737 *
3738 * This function should be called in a modular subsystem's exitcall. When this
3739 * function is invoked, the refcount on the subsystem's module will be 0, so
3740 * the subsystem will not be attached to any hierarchy.
3741 */
3742void cgroup_unload_subsys(struct cgroup_subsys *ss)
3743{
3744 struct cg_cgroup_link *link;
3745 struct hlist_head *hhead;
3746
3747 BUG_ON(ss->module == NULL);
3748
3749 /*
3750 * we shouldn't be called if the subsystem is in use, and the use of
3751 * try_module_get in parse_cgroupfs_options should ensure that it
3752 * doesn't start being used while we're killing it off.
3753 */
3754 BUG_ON(ss->root != &rootnode);
3755
3756 mutex_lock(&cgroup_mutex);
3757 /* deassign the subsys_id */
3758 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3759 subsys[ss->subsys_id] = NULL;
3760
3761 /* remove subsystem from rootnode's list of subsystems */
3762 list_del(&ss->sibling);
3763
3764 /*
3765 * disentangle the css from all css_sets attached to the dummytop. as
3766 * in loading, we need to pay our respects to the hashtable gods.
3767 */
3768 write_lock(&css_set_lock);
3769 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3770 struct css_set *cg = link->cg;
3771
3772 hlist_del(&cg->hlist);
3773 BUG_ON(!cg->subsys[ss->subsys_id]);
3774 cg->subsys[ss->subsys_id] = NULL;
3775 hhead = css_set_hash(cg->subsys);
3776 hlist_add_head(&cg->hlist, hhead);
3777 }
3778 write_unlock(&css_set_lock);
3779
3780 /*
3781 * remove subsystem's css from the dummytop and free it - need to free
3782 * before marking as null because ss->destroy needs the cgrp->subsys
3783 * pointer to find their state. note that this also takes care of
3784 * freeing the css_id.
3785 */
3786 ss->destroy(ss, dummytop);
3787 dummytop->subsys[ss->subsys_id] = NULL;
3788
3789 mutex_unlock(&cgroup_mutex);
3790}
3791EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
3227 3792
3228/** 3793/**
3229 * cgroup_init_early - cgroup initialization at system boot 3794 * cgroup_init_early - cgroup initialization at system boot
@@ -3253,7 +3818,8 @@ int __init cgroup_init_early(void)
3253 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3818 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3254 INIT_HLIST_HEAD(&css_set_table[i]); 3819 INIT_HLIST_HEAD(&css_set_table[i]);
3255 3820
3256 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3821 /* at bootup time, we don't worry about modular subsystems */
3822 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3257 struct cgroup_subsys *ss = subsys[i]; 3823 struct cgroup_subsys *ss = subsys[i];
3258 3824
3259 BUG_ON(!ss->name); 3825 BUG_ON(!ss->name);
@@ -3288,12 +3854,13 @@ int __init cgroup_init(void)
3288 if (err) 3854 if (err)
3289 return err; 3855 return err;
3290 3856
3291 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3857 /* at bootup time, we don't worry about modular subsystems */
3858 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3292 struct cgroup_subsys *ss = subsys[i]; 3859 struct cgroup_subsys *ss = subsys[i];
3293 if (!ss->early_init) 3860 if (!ss->early_init)
3294 cgroup_init_subsys(ss); 3861 cgroup_init_subsys(ss);
3295 if (ss->use_id) 3862 if (ss->use_id)
3296 cgroup_subsys_init_idr(ss); 3863 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3297 } 3864 }
3298 3865
3299 /* Add init_css_set to the hash table */ 3866 /* Add init_css_set to the hash table */
@@ -3397,9 +3964,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3397 int i; 3964 int i;
3398 3965
3399 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3966 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3967 /*
3968 * ideally we don't want subsystems moving around while we do this.
3969 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3970 * subsys/hierarchy state.
3971 */
3400 mutex_lock(&cgroup_mutex); 3972 mutex_lock(&cgroup_mutex);
3401 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3973 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3402 struct cgroup_subsys *ss = subsys[i]; 3974 struct cgroup_subsys *ss = subsys[i];
3975 if (ss == NULL)
3976 continue;
3403 seq_printf(m, "%s\t%d\t%d\t%d\n", 3977 seq_printf(m, "%s\t%d\t%d\t%d\n",
3404 ss->name, ss->root->hierarchy_id, 3978 ss->name, ss->root->hierarchy_id,
3405 ss->root->number_of_cgroups, !ss->disabled); 3979 ss->root->number_of_cgroups, !ss->disabled);
@@ -3457,7 +4031,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3457{ 4031{
3458 if (need_forkexit_callback) { 4032 if (need_forkexit_callback) {
3459 int i; 4033 int i;
3460 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4034 /*
4035 * forkexit callbacks are only supported for builtin
4036 * subsystems, and the builtin section of the subsys array is
4037 * immutable, so we don't need to lock the subsys array here.
4038 */
4039 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3461 struct cgroup_subsys *ss = subsys[i]; 4040 struct cgroup_subsys *ss = subsys[i];
3462 if (ss->fork) 4041 if (ss->fork)
3463 ss->fork(ss, child); 4042 ss->fork(ss, child);
@@ -3526,7 +4105,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3526 struct css_set *cg; 4105 struct css_set *cg;
3527 4106
3528 if (run_callbacks && need_forkexit_callback) { 4107 if (run_callbacks && need_forkexit_callback) {
3529 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4108 /*
4109 * modular subsystems can't use callbacks, so no need to lock
4110 * the subsys array
4111 */
4112 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3530 struct cgroup_subsys *ss = subsys[i]; 4113 struct cgroup_subsys *ss = subsys[i];
3531 if (ss->exit) 4114 if (ss->exit)
3532 ss->exit(ss, tsk); 4115 ss->exit(ss, tsk);
@@ -3720,12 +4303,13 @@ static void check_for_release(struct cgroup *cgrp)
3720 } 4303 }
3721} 4304}
3722 4305
3723void __css_put(struct cgroup_subsys_state *css) 4306/* Caller must verify that the css is not for root cgroup */
4307void __css_put(struct cgroup_subsys_state *css, int count)
3724{ 4308{
3725 struct cgroup *cgrp = css->cgroup; 4309 struct cgroup *cgrp = css->cgroup;
3726 int val; 4310 int val;
3727 rcu_read_lock(); 4311 rcu_read_lock();
3728 val = atomic_dec_return(&css->refcnt); 4312 val = atomic_sub_return(count, &css->refcnt);
3729 if (val == 1) { 4313 if (val == 1) {
3730 if (notify_on_release(cgrp)) { 4314 if (notify_on_release(cgrp)) {
3731 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4315 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3736,6 +4320,7 @@ void __css_put(struct cgroup_subsys_state *css)
3736 rcu_read_unlock(); 4320 rcu_read_unlock();
3737 WARN_ON_ONCE(val < 1); 4321 WARN_ON_ONCE(val < 1);
3738} 4322}
4323EXPORT_SYMBOL_GPL(__css_put);
3739 4324
3740/* 4325/*
3741 * Notify userspace when a cgroup is released, by running the 4326 * Notify userspace when a cgroup is released, by running the
@@ -3817,8 +4402,11 @@ static int __init cgroup_disable(char *str)
3817 while ((token = strsep(&str, ",")) != NULL) { 4402 while ((token = strsep(&str, ",")) != NULL) {
3818 if (!*token) 4403 if (!*token)
3819 continue; 4404 continue;
3820 4405 /*
3821 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4406 * cgroup_disable, being at boot time, can't know about module
4407 * subsystems, so we don't worry about them.
4408 */
4409 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3822 struct cgroup_subsys *ss = subsys[i]; 4410 struct cgroup_subsys *ss = subsys[i];
3823 4411
3824 if (!strcmp(token, ss->name)) { 4412 if (!strcmp(token, ss->name)) {
@@ -3848,6 +4436,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
3848 return cssid->id; 4436 return cssid->id;
3849 return 0; 4437 return 0;
3850} 4438}
4439EXPORT_SYMBOL_GPL(css_id);
3851 4440
3852unsigned short css_depth(struct cgroup_subsys_state *css) 4441unsigned short css_depth(struct cgroup_subsys_state *css)
3853{ 4442{
@@ -3857,6 +4446,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
3857 return cssid->depth; 4446 return cssid->depth;
3858 return 0; 4447 return 0;
3859} 4448}
4449EXPORT_SYMBOL_GPL(css_depth);
3860 4450
3861bool css_is_ancestor(struct cgroup_subsys_state *child, 4451bool css_is_ancestor(struct cgroup_subsys_state *child,
3862 const struct cgroup_subsys_state *root) 4452 const struct cgroup_subsys_state *root)
@@ -3893,6 +4483,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3893 spin_unlock(&ss->id_lock); 4483 spin_unlock(&ss->id_lock);
3894 call_rcu(&id->rcu_head, __free_css_id_cb); 4484 call_rcu(&id->rcu_head, __free_css_id_cb);
3895} 4485}
4486EXPORT_SYMBOL_GPL(free_css_id);
3896 4487
3897/* 4488/*
3898 * This is called by init or create(). Then, calls to this function are 4489 * This is called by init or create(). Then, calls to this function are
@@ -3942,15 +4533,14 @@ err_out:
3942 4533
3943} 4534}
3944 4535
3945static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4536static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4537 struct cgroup_subsys_state *rootcss)
3946{ 4538{
3947 struct css_id *newid; 4539 struct css_id *newid;
3948 struct cgroup_subsys_state *rootcss;
3949 4540
3950 spin_lock_init(&ss->id_lock); 4541 spin_lock_init(&ss->id_lock);
3951 idr_init(&ss->idr); 4542 idr_init(&ss->idr);
3952 4543
3953 rootcss = init_css_set.subsys[ss->subsys_id];
3954 newid = get_new_cssid(ss, 0); 4544 newid = get_new_cssid(ss, 0);
3955 if (IS_ERR(newid)) 4545 if (IS_ERR(newid))
3956 return PTR_ERR(newid); 4546 return PTR_ERR(newid);
@@ -4010,6 +4600,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
4010 4600
4011 return rcu_dereference(cssid->css); 4601 return rcu_dereference(cssid->css);
4012} 4602}
4603EXPORT_SYMBOL_GPL(css_lookup);
4013 4604
4014/** 4605/**
4015 * css_get_next - lookup next cgroup under specified hierarchy. 4606 * css_get_next - lookup next cgroup under specified hierarchy.