Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--	kernel/cgroup.c	1259
1 file changed, 512 insertions(+), 747 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bc1dcabe9217..0c753ddd223b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -41,7 +41,6 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/backing-dev.h>
-#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/magic.h>
 #include <linux/spinlock.h>
@@ -56,15 +55,20 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/eventfd.h>
-#include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
-#include <linux/file.h>
 
 #include <linux/atomic.h>
 
 /*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
+
+/*
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
 *
@@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex);
 
 static DEFINE_MUTEX(cgroup_root_mutex);
 
+#define cgroup_assert_mutex_or_rcu_locked()				\
+	rcu_lockdep_assert(rcu_read_lock_held() ||			\
+			   lockdep_is_held(&cgroup_mutex),		\
+			   "cgroup_mutex or RCU read lock required");
+
+#ifdef CONFIG_LOCKDEP
+#define cgroup_assert_mutex_or_root_locked()				\
+	WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) &&	\
+				     !lockdep_is_held(&cgroup_root_mutex)))
+#else
+#define cgroup_assert_mutex_or_root_locked()	do { } while (0)
+#endif
+
 /*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions. Use a separate workqueue so that cgroup
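
The two assertion macros added here encode the file's locking rules so lockdep can flag mislocked callers at runtime (rcu_lockdep_assert() is only active under CONFIG_PROVE_RCU, so the check costs nothing in production builds). A minimal sketch of the intended call pattern, using a hypothetical lookup helper that is not part of this patch:

/* hypothetical illustration only -- not from this patch */
static struct cgroup *example_find_cgrp(struct cgroupfs_root *root, int id)
{
	/* legal under either cgroup_mutex or rcu_read_lock() */
	cgroup_assert_mutex_or_rcu_locked();
	return idr_find(&root->cgroup_idr, id);
}
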
@@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
 static struct workqueue_struct *cgroup_destroy_wq;
 
 /*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/*
 * Generate an array of cgroup subsystem pointers. At boot time, this is
 * populated with the built in subsystems, and modular subsystems are
 * registered after that. The mutable section of this array is protected by
@@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root;
 /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
 static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
 
-/*
- * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
- */
-struct cfent {
-	struct list_head		node;
-	struct dentry			*dentry;
-	struct cftype			*type;
-	struct cgroup_subsys_state	*css;
-
-	/* file xattrs */
-	struct simple_xattrs		xattrs;
-};
-
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct cgroup_event {
-	/*
-	 * css which the event belongs to.
-	 */
-	struct cgroup_subsys_state *css;
-	/*
-	 * Control file which the event associated.
-	 */
-	struct cftype *cft;
-	/*
-	 * eventfd to signal userspace about the event.
-	 */
-	struct eventfd_ctx *eventfd;
-	/*
-	 * Each of these stored in a list by the cgroup.
-	 */
-	struct list_head list;
-	/*
-	 * All fields below needed to unregister event when
-	 * userspace closes eventfd.
-	 */
-	poll_table pt;
-	wait_queue_head_t *wqh;
-	wait_queue_t wait;
-	struct work_struct remove;
-};
-
 /* The list of hierarchy roots */
 
 static LIST_HEAD(cgroup_roots);
@@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
 static int cgroup_file_release(struct inode *inode, struct file *file);
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
 /**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp)
 }
 
 /**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_mutex.
+ */
+#define for_each_css(css, ssid, cgrp)					\
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
+		if (!((css) = rcu_dereference_check(			\
+				(cgrp)->subsys[(ssid)],			\
+				lockdep_is_held(&cgroup_mutex)))) { }	\
+		else
+
+/**
 * for_each_subsys - iterate all loaded cgroup subsystems
 * @ss: the iteration cursor
- * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
- * Should be called under cgroup_mutex.
+ * Iterates through all loaded subsystems. Should be called under
+ * cgroup_mutex or cgroup_root_mutex.
 */
-#define for_each_subsys(ss, i)						\
-	for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++)			\
-		if (({ lockdep_assert_held(&cgroup_mutex);		\
-		       !((ss) = cgroup_subsys[i]); })) { }		\
+#define for_each_subsys(ss, ssid)					\
+	for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; });	\
+	     (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)			\
+		if (!((ss) = cgroup_subsys[(ssid)])) { }		\
 		else
 
 /**
@@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
 	     (((ss) = cgroup_subsys[i]) || true); (i)++)
 
-/* iterate each subsystem attached to a hierarchy */
-#define for_each_root_subsys(root, ss)				\
-	list_for_each_entry((ss), &(root)->subsys_list, sibling)
-
 /* iterate across the active hierarchies */
 #define for_each_active_root(root)				\
 	list_for_each_entry((root), &cgroup_roots, root_list)
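
Both for_each_css() above and the reworked for_each_subsys() lean on the same trick: an `if (!(...)) { } else` wrapper that skips NULL slots yet still binds the caller's statement like an ordinary loop body. A standalone sketch of the idiom in plain userspace C (hypothetical names, for illustration only):

#include <stdio.h>

#define NR_SLOTS 4

static const char *slots[NR_SLOTS] = { "cpu", NULL, "memory", NULL };

/* The empty if-arm consumes the NULL case; the user's statement becomes
 * the else-arm, so the macro behaves like a single statement and even
 * nests safely inside an unbraced if/else. */
#define for_each_slot(s, i)						\
	for ((i) = 0; (i) < NR_SLOTS; (i)++)				\
		if (!((s) = slots[(i)])) { }				\
		else

int main(void)
{
	const char *s;
	int i;

	for_each_slot(s, i)
		printf("%d: %s\n", i, s);	/* prints slots 0 and 2 only */
	return 0;
}
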
@@ -863,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work)
 	 */
 	deactivate_super(cgrp->root->sb);
 
-	/*
-	 * if we're getting rid of the cgroup, refcount should ensure
-	 * that there are no pidlists left.
-	 */
-	BUG_ON(!list_empty(&cgrp->pidlists));
+	cgroup_pidlist_destroy_all(cgrp);
 
 	simple_xattrs_free(&cgrp->xattrs);
 
@@ -897,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	 * per-subsystem and moved to css->id so that lookups are
 	 * successful until the target css is released.
 	 */
+	mutex_lock(&cgroup_mutex);
 	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+	mutex_unlock(&cgroup_mutex);
 	cgrp->id = -1;
 
 	call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
@@ -1050,7 +1041,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 						  cgroup_css(cgroup_dummy_top, ss));
 			cgroup_css(cgrp, ss)->cgroup = cgrp;
 
-			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(cgroup_css(cgrp, ss));
@@ -1069,7 +1059,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			RCU_INIT_POINTER(cgrp->subsys[i], NULL);
 
 			cgroup_subsys[i]->root = &cgroup_dummy_root;
-			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
 
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
@@ -1096,10 +1085,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 {
 	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
 	struct cgroup_subsys *ss;
+	int ssid;
 
 	mutex_lock(&cgroup_root_mutex);
-	for_each_root_subsys(root, ss)
-		seq_printf(seq, ",%s", ss->name);
+	for_each_subsys(ss, ssid)
+		if (root->subsys_mask & (1 << ssid))
+			seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
 		seq_puts(seq, ",sane_behavior");
 	if (root->flags & CGRP_ROOT_NOPREFIX)
@@ -1362,8 +1353,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
 	cgrp->dummy_css.cgroup = cgrp;
-	INIT_LIST_HEAD(&cgrp->event_list);
-	spin_lock_init(&cgrp->event_list_lock);
 	simple_xattrs_init(&cgrp->xattrs);
 }
 
@@ -1371,7 +1360,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
 
-	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
@@ -1580,10 +1568,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
 
-	root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
-				  0, 1, GFP_KERNEL);
-	if (root_cgrp->id < 0)
+	ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+	if (ret < 0)
 		goto unlock_drop;
+	root_cgrp->id = ret;
 
 	/* Check for name clashes with existing mounts */
 	ret = -EBUSY;
@@ -1693,7 +1681,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	return ERR_PTR(ret);
 }
 
-static void cgroup_kill_sb(struct super_block *sb) {
+static void cgroup_kill_sb(struct super_block *sb)
+{
 	struct cgroupfs_root *root = sb->s_fs_info;
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgrp_cset_link *link, *tmp_link;
@@ -1976,8 +1965,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 			      bool threadgroup)
 {
 	int retval, i, group_size;
-	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroupfs_root *root = cgrp->root;
+	struct cgroup_subsys_state *css, *failed_css = NULL;
 	/* threadgroup list cursor and array */
 	struct task_struct *leader = tsk;
 	struct task_and_cgroup *tc;
@@ -2050,13 +2039,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	/*
 	 * step 1: check that we can legitimately attach to the cgroup.
 	 */
-	for_each_root_subsys(root, ss) {
-		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
-		if (ss->can_attach) {
-			retval = ss->can_attach(css, &tset);
+	for_each_css(css, i, cgrp) {
+		if (css->ss->can_attach) {
+			retval = css->ss->can_attach(css, &tset);
 			if (retval) {
-				failed_ss = ss;
+				failed_css = css;
 				goto out_cancel_attach;
 			}
 		}
@@ -2092,12 +2079,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	/*
 	 * step 4: do subsystem attach callbacks.
 	 */
-	for_each_root_subsys(root, ss) {
-		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
-		if (ss->attach)
-			ss->attach(css, &tset);
-	}
+	for_each_css(css, i, cgrp)
+		if (css->ss->attach)
+			css->ss->attach(css, &tset);
 
 	/*
 	 * step 5: success! and cleanup
@@ -2114,13 +2098,11 @@ out_put_css_set_refs:
 	}
 out_cancel_attach:
 	if (retval) {
-		for_each_root_subsys(root, ss) {
-			struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
-			if (ss == failed_ss)
+		for_each_css(css, i, cgrp) {
+			if (css == failed_css)
 				break;
-			if (ss->cancel_attach)
-				ss->cancel_attach(css, &tset);
+			if (css->ss->cancel_attach)
+				css->ss->cancel_attach(css, &tset);
 		}
 	}
 out_free_group_list:
@@ -2148,7 +2130,7 @@ retry_find_task:
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
 			rcu_read_unlock();
-			ret= -ESRCH;
+			ret = -ESRCH;
 			goto out_unlock_cgroup;
 		}
 		/*
@@ -2260,10 +2242,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
 	return 0;
 }
 
-static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
-				     struct cftype *cft, struct seq_file *seq)
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
 {
-	struct cgroup *cgrp = css->cgroup;
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
 
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
@@ -2273,174 +2254,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
 	return 0;
 }
 
-static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
-				     struct cftype *cft, struct seq_file *seq)
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
 {
-	seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
 	return 0;
 }
 
 /* A buffer size big enough for numbers or short strings */
 #define CGROUP_LOCAL_BUFFER_SIZE 64
 
-static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
-				struct cftype *cft, struct file *file,
-				const char __user *userbuf, size_t nbytes,
-				loff_t *unused_ppos)
+static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
+				 size_t nbytes, loff_t *ppos)
 {
-	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
-	int retval = 0;
-	char *end;
+	struct cfent *cfe = __d_cfe(file->f_dentry);
+	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cgroup_subsys_state *css = cfe->css;
+	size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
+	char *buf;
+	int ret;
 
-	if (!nbytes)
-		return -EINVAL;
-	if (nbytes >= sizeof(buffer))
+	if (nbytes >= max_bytes)
 		return -E2BIG;
-	if (copy_from_user(buffer, userbuf, nbytes))
-		return -EFAULT;
 
-	buffer[nbytes] = 0;	/* nul-terminate */
-	if (cft->write_u64) {
-		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
-		if (*end)
-			return -EINVAL;
-		retval = cft->write_u64(css, cft, val);
+	buf = kmalloc(nbytes + 1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, userbuf, nbytes)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	buf[nbytes] = '\0';
+
+	if (cft->write_string) {
+		ret = cft->write_string(css, cft, strstrip(buf));
+	} else if (cft->write_u64) {
+		unsigned long long v;
+		ret = kstrtoull(buf, 0, &v);
+		if (!ret)
+			ret = cft->write_u64(css, cft, v);
+	} else if (cft->write_s64) {
+		long long v;
+		ret = kstrtoll(buf, 0, &v);
+		if (!ret)
+			ret = cft->write_s64(css, cft, v);
+	} else if (cft->trigger) {
+		ret = cft->trigger(css, (unsigned int)cft->private);
 	} else {
-		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
-		if (*end)
-			return -EINVAL;
-		retval = cft->write_s64(css, cft, val);
+		ret = -EINVAL;
 	}
-	if (!retval)
-		retval = nbytes;
-	return retval;
+out_free:
+	kfree(buf);
+	return ret ?: nbytes;
 }
 
-static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
-				   struct cftype *cft, struct file *file,
-				   const char __user *userbuf, size_t nbytes,
-				   loff_t *unused_ppos)
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+
+static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
 {
-	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
-	int retval = 0;
-	size_t max_bytes = cft->max_write_len;
-	char *buffer = local_buffer;
+	struct cftype *cft = seq_cft(seq);
 
-	if (!max_bytes)
-		max_bytes = sizeof(local_buffer) - 1;
-	if (nbytes >= max_bytes)
-		return -E2BIG;
-	/* Allocate a dynamic buffer if we need one */
-	if (nbytes >= sizeof(local_buffer)) {
-		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-		if (buffer == NULL)
-			return -ENOMEM;
-	}
-	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
-		retval = -EFAULT;
-		goto out;
+	if (cft->seq_start) {
+		return cft->seq_start(seq, ppos);
+	} else {
+		/*
+		 * The same behavior and code as single_open(). Returns
+		 * !NULL if pos is at the beginning; otherwise, NULL.
+		 */
+		return NULL + !*ppos;
 	}
-
-	buffer[nbytes] = 0;	/* nul-terminate */
-	retval = cft->write_string(css, cft, strstrip(buffer));
-	if (!retval)
-		retval = nbytes;
-out:
-	if (buffer != local_buffer)
-		kfree(buffer);
-	return retval;
 }
 
-static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
-				 size_t nbytes, loff_t *ppos)
+static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
 {
-	struct cfent *cfe = __d_cfe(file->f_dentry);
-	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup_subsys_state *css = cfe->css;
+	struct cftype *cft = seq_cft(seq);
 
-	if (cft->write)
-		return cft->write(css, cft, file, buf, nbytes, ppos);
-	if (cft->write_u64 || cft->write_s64)
-		return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
-	if (cft->write_string)
-		return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
-	if (cft->trigger) {
-		int ret = cft->trigger(css, (unsigned int)cft->private);
-		return ret ? ret : nbytes;
+	if (cft->seq_next) {
+		return cft->seq_next(seq, v, ppos);
+	} else {
+		/*
+		 * The same behavior and code as single_open(), always
+		 * terminate after the initial read.
		 */
+		++*ppos;
+		return NULL;
 	}
-	return -EINVAL;
 }
 
-static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
-			       struct cftype *cft, struct file *file,
-			       char __user *buf, size_t nbytes, loff_t *ppos)
+static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
 {
-	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-	u64 val = cft->read_u64(css, cft);
-	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
+	struct cftype *cft = seq_cft(seq);
 
-	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+	if (cft->seq_stop)
+		cft->seq_stop(seq, v);
 }
 
-static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
-			       struct cftype *cft, struct file *file,
-			       char __user *buf, size_t nbytes, loff_t *ppos)
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 {
-	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-	s64 val = cft->read_s64(css, cft);
-	int len = sprintf(tmp, "%lld\n", (long long) val);
-
-	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
-}
+	struct cftype *cft = seq_cft(m);
+	struct cgroup_subsys_state *css = seq_css(m);
 
-static ssize_t cgroup_file_read(struct file *file, char __user *buf,
-				size_t nbytes, loff_t *ppos)
-{
-	struct cfent *cfe = __d_cfe(file->f_dentry);
-	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup_subsys_state *css = cfe->css;
+	if (cft->seq_show)
+		return cft->seq_show(m, arg);
 
-	if (cft->read)
-		return cft->read(css, cft, file, buf, nbytes, ppos);
 	if (cft->read_u64)
-		return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
-	if (cft->read_s64)
-		return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
-	return -EINVAL;
-}
-
-/*
- * seqfile ops/methods for returning structured data. Currently just
- * supports string->u64 maps, but can be extended in future.
- */
-
-static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
-{
-	struct seq_file *sf = cb->state;
-	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
-}
-
-static int cgroup_seqfile_show(struct seq_file *m, void *arg)
-{
-	struct cfent *cfe = m->private;
-	struct cftype *cft = cfe->type;
-	struct cgroup_subsys_state *css = cfe->css;
-
-	if (cft->read_map) {
-		struct cgroup_map_cb cb = {
-			.fill = cgroup_map_add,
-			.state = m,
-		};
-		return cft->read_map(css, cft, &cb);
-	}
-	return cft->read_seq_string(css, cft, m);
+		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
+	else if (cft->read_s64)
+		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
+	else
+		return -EINVAL;
+	return 0;
 }
 
-static const struct file_operations cgroup_seqfile_operations = {
-	.read = seq_read,
-	.write = cgroup_file_write,
-	.llseek = seq_lseek,
-	.release = cgroup_file_release,
+static struct seq_operations cgroup_seq_operations = {
+	.start = cgroup_seqfile_start,
+	.next = cgroup_seqfile_next,
+	.stop = cgroup_seqfile_stop,
+	.show = cgroup_seqfile_show,
 };
 
 static int cgroup_file_open(struct inode *inode, struct file *file)
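
cgroup_seqfile_start() and cgroup_seqfile_next() above emulate single_open(): start() hands out a token only at position zero and next() immediately terminates, so show() runs exactly once per read pass. A compilable userspace sketch of that control flow (hypothetical names; the kernel spells the position test as `NULL + !*ppos`):

#include <stdio.h>

/* Hypothetical stand-ins for the seq_operations callbacks. start()
 * returns a non-NULL token only when pos is 0 -- the same effect the
 * kernel gets from the terser `return NULL + !*ppos;`. */
static void *ex_start(long long *ppos) { return *ppos == 0 ? (void *)1 : NULL; }
static void *ex_next(void *v, long long *ppos) { ++*ppos; return NULL; }
static int ex_show(void *v) { puts("single record"); return 0; }

/* condensed version of the read loop in fs/seq_file.c */
int main(void)
{
	long long pos = 0;
	void *v;

	for (v = ex_start(&pos); v; v = ex_next(v, &pos))
		ex_show(v);		/* runs exactly once */

	return ex_start(&pos) != NULL;	/* second pass: EOF, returns 0 */
}
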
@@ -2449,6 +2385,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
 	struct cgroup_subsys_state *css;
+	struct cgroup_open_file *of;
 	int err;
 
 	err = generic_file_open(inode, file);
@@ -2478,32 +2415,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
 	WARN_ON_ONCE(cfe->css && cfe->css != css);
 	cfe->css = css;
 
-	if (cft->read_map || cft->read_seq_string) {
-		file->f_op = &cgroup_seqfile_operations;
-		err = single_open(file, cgroup_seqfile_show, cfe);
-	} else if (cft->open) {
-		err = cft->open(inode, file);
+	of = __seq_open_private(file, &cgroup_seq_operations,
+				sizeof(struct cgroup_open_file));
+	if (of) {
+		of->cfe = cfe;
+		return 0;
 	}
 
-	if (css->ss && err)
+	if (css->ss)
 		css_put(css);
-	return err;
+	return -ENOMEM;
 }
 
 static int cgroup_file_release(struct inode *inode, struct file *file)
 {
 	struct cfent *cfe = __d_cfe(file->f_dentry);
-	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup_subsys_state *css = cfe->css;
-	int ret = 0;
 
-	if (cft->release)
-		ret = cft->release(inode, file);
 	if (css->ss)
 		css_put(css);
-	if (file->f_op == &cgroup_seqfile_operations)
-		single_release(inode, file);
-	return ret;
+	return seq_release_private(inode, file);
 }
 
 /*
@@ -2614,7 +2545,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
 }
 
 static const struct file_operations cgroup_file_operations = {
-	.read = cgroup_file_read,
+	.read = seq_read,
 	.write = cgroup_file_write,
 	.llseek = generic_file_llseek,
 	.open = cgroup_file_open,
@@ -2639,16 +2570,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
 	.removexattr = cgroup_removexattr,
 };
 
-/*
- * Check if a file is a control file
- */
-static inline struct cftype *__file_cft(struct file *file)
-{
-	if (file_inode(file)->i_fop != &cgroup_file_operations)
-		return ERR_PTR(-EINVAL);
-	return __d_cft(file->f_dentry);
-}
-
 static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 			      struct super_block *sb)
 {
@@ -2706,12 +2627,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	if (cft->mode)
 		return cft->mode;
 
-	if (cft->read || cft->read_u64 || cft->read_s64 ||
-	    cft->read_map || cft->read_seq_string)
+	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
 		mode |= S_IRUGO;
 
-	if (cft->write || cft->write_u64 || cft->write_s64 ||
-	    cft->write_string || cft->trigger)
+	if (cft->write_u64 || cft->write_s64 || cft->write_string ||
+	    cft->trigger)
 		mode |= S_IWUSR;
 
 	return mode;
@@ -2845,10 +2765,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
 	 */
 	update_before = cgroup_serial_nr_next;
 
-	mutex_unlock(&cgroup_mutex);
-
 	/* add/rm files for all cgroups created before */
-	rcu_read_lock();
 	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
 		struct cgroup *cgrp = css->cgroup;
 
@@ -2857,23 +2774,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
 
 		inode = cgrp->dentry->d_inode;
 		dget(cgrp->dentry);
-		rcu_read_unlock();
-
 		dput(prev);
 		prev = cgrp->dentry;
 
+		mutex_unlock(&cgroup_mutex);
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
 			ret = cgroup_addrm_files(cgrp, cfts, is_add);
-		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
-
-		rcu_read_lock();
 		if (ret)
 			break;
 	}
-	rcu_read_unlock();
+	mutex_unlock(&cgroup_mutex);
 	dput(prev);
 	deactivate_super(sb);
 	return ret;
@@ -2992,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void)
 		 * We should check if the process is exiting, otherwise
 		 * it will race with cgroup_exit() in that the list
 		 * entry won't be deleted though the process has exited.
+		 * Do it while holding siglock so that we don't end up
+		 * racing against cgroup_exit().
 		 */
+		spin_lock_irq(&p->sighand->siglock);
 		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
 			list_add(&p->cg_list, &task_css_set(p)->tasks);
+		spin_unlock_irq(&p->sighand->siglock);
+
 		task_unlock(p);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
@@ -3007,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void)
 * @parent_css: css whose children to walk
 *
 * This function returns the next child of @parent_css and should be called
- * under RCU read lock.  The only requirement is that @parent_css and
- * @pos_css are accessible.  The next sibling is guaranteed to be returned
- * regardless of their states.
+ * under either cgroup_mutex or RCU read lock.  The only requirement is
+ * that @parent_css and @pos_css are accessible.  The next sibling is
+ * guaranteed to be returned regardless of their states.
 */
 struct cgroup_subsys_state *
 css_next_child(struct cgroup_subsys_state *pos_css,
@@ -3019,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
 	struct cgroup *cgrp = parent_css->cgroup;
 	struct cgroup *next;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	cgroup_assert_mutex_or_rcu_locked();
 
 	/*
 	 * @pos could already have been removed.  Once a cgroup is removed,
@@ -3066,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child);
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
 *
- * While this function requires RCU read locking, it doesn't require the
- * whole traversal to be contained in a single RCU critical section.  This
- * function will return the correct next descendant as long as both @pos
- * and @root are accessible and @pos is a descendant of @root.
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section.  This function will return the correct next descendant as long
+ * as both @pos and @root are accessible and @pos is a descendant of @root.
 */
 struct cgroup_subsys_state *
 css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -3077,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
 {
 	struct cgroup_subsys_state *next;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	cgroup_assert_mutex_or_rcu_locked();
 
 	/* if first iteration, visit @root */
 	if (!pos)
@@ -3108,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
 * is returned.  This can be used during pre-order traversal to skip
 * subtree of @pos.
 *
- * While this function requires RCU read locking, it doesn't require the
- * whole traversal to be contained in a single RCU critical section.  This
- * function will return the correct rightmost descendant as long as @pos is
- * accessible.
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section.  This function will return the correct rightmost descendant as
+ * long as @pos is accessible.
 */
 struct cgroup_subsys_state *
 css_rightmost_descendant(struct cgroup_subsys_state *pos)
 {
 	struct cgroup_subsys_state *last, *tmp;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	cgroup_assert_mutex_or_rcu_locked();
 
 	do {
 		last = pos;
@@ -3154,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 *
- * While this function requires RCU read locking, it doesn't require the
- * whole traversal to be contained in a single RCU critical section.  This
- * function will return the correct next descendant as long as both @pos
- * and @cgroup are accessible and @pos is a descendant of @cgroup.
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section.  This function will return the correct next descendant as long
+ * as both @pos and @cgroup are accessible and @pos is a descendant of
+ * @cgroup.
 */
 struct cgroup_subsys_state *
 css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -3165,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
 {
 	struct cgroup_subsys_state *next;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	cgroup_assert_mutex_or_rcu_locked();
 
 	/* if first iteration, visit leftmost descendant which may be @root */
 	if (!pos)
@@ -3504,14 +3423,12 @@ struct cgroup_pidlist {
 	pid_t *list;
 	/* how many elements the above list has */
 	int length;
-	/* how many files are using the current array */
-	int use_count;
 	/* each of these stored in a list by its cgroup */
 	struct list_head links;
 	/* pointer to the cgroup we belong to, for list removal purposes */
 	struct cgroup *owner;
-	/* protects the other fields */
-	struct rw_semaphore rwsem;
+	/* for delayed destruction */
+	struct delayed_work destroy_dwork;
 };
 
 /*
@@ -3527,6 +3444,7 @@ static void *pidlist_allocate(int count)
 	else
 		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
 }
+
 static void pidlist_free(void *p)
 {
 	if (is_vmalloc_addr(p))
@@ -3536,6 +3454,47 @@ static void pidlist_free(void *p)
 }
 
 /*
+ * Used to destroy all pidlists lingering waiting for destroy timer.  None
+ * should be left afterwards.
+ */
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
+{
+	struct cgroup_pidlist *l, *tmp_l;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+	mutex_unlock(&cgrp->pidlist_mutex);
+
+	flush_workqueue(cgroup_pidlist_destroy_wq);
+	BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+						destroy_dwork);
+	struct cgroup_pidlist *tofree = NULL;
+
+	mutex_lock(&l->owner->pidlist_mutex);
+
+	/*
+	 * Destroy iff we didn't get queued again.  The state won't change
+	 * as destroy_dwork can only be queued while locked.
+	 */
+	if (!delayed_work_pending(dwork)) {
+		list_del(&l->links);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		tofree = l;
+	}
+
+	mutex_unlock(&l->owner->pidlist_mutex);
+	kfree(tofree);
+}
+
+/*
 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
 * Returns the number of unique elements.
 */
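
cgroup_pidlist_destroy_work_fn() above is an instance of a general delayed-destruction protocol: destruction is armed with mod_delayed_work(), and the work function re-checks delayed_work_pending() under the same lock used for queueing, so a user who re-armed the timer in the meantime wins over destruction. A condensed, hypothetical restatement of just that protocol (kernel-context sketch assuming <linux/workqueue.h> and <linux/mutex.h>, not code from this patch):

/* hypothetical object with lingering destruction, mirroring the
 * pidlist protocol above; obj->lock also guards all queueing */
struct lingering {
	struct mutex		lock;
	struct delayed_work	destroy_dwork;
};

static struct workqueue_struct *lingering_destroy_wq;

/* a user is done with obj: arm (or re-arm) delayed destruction */
static void lingering_put(struct lingering *obj, unsigned long delay)
{
	mutex_lock(&obj->lock);
	mod_delayed_work(lingering_destroy_wq, &obj->destroy_dwork, delay);
	mutex_unlock(&obj->lock);
}

static void lingering_destroy_work_fn(struct work_struct *work)
{
	struct lingering *obj = container_of(to_delayed_work(work),
					     struct lingering, destroy_dwork);

	mutex_lock(&obj->lock);
	/* destroy iff nobody re-armed us; the pending state can only
	 * change while obj->lock is held, so this check is race-free */
	if (!delayed_work_pending(&obj->destroy_dwork)) {
		/* tear down obj's payload here */
	}
	mutex_unlock(&obj->lock);
}
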
@@ -3565,52 +3524,92 @@ after:
 	return dest;
 }
 
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco.  As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer.  As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ *
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property.  In the long term, we
+ * want to do away with it.  Explicitly scramble sort order if
+ * sane_behavior so that no such expectation exists in the new interface.
+ *
+ * Scrambling is done by swapping every two consecutive bits, which is
+ * non-identity one-to-one mapping which disturbs sort order sufficiently.
+ */
+static pid_t pid_fry(pid_t pid)
+{
+	unsigned a = pid & 0x55555555;
+	unsigned b = pid & 0xAAAAAAAA;
+
+	return (a << 1) | (b >> 1);
+}
+
+static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
+{
+	if (cgroup_sane_behavior(cgrp))
+		return pid_fry(pid);
+	else
+		return pid;
+}
+
 static int cmppid(const void *a, const void *b)
 {
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
+static int fried_cmppid(const void *a, const void *b)
+{
+	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = task_active_pid_ns(current);
+
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	list_for_each_entry(l, &cgrp->pidlists, links)
+		if (l->key.type == type && l->key.ns == ns)
+			return l;
+	return NULL;
+}
+
 /*
 * find the appropriate pidlist for our purpose (given procs vs tasks)
 * returns with the lock on that pidlist already held, and takes care
 * of the use count, or returns NULL with no locks held if we're out of
 * memory.
 */
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
-						  enum cgroup_filetype type)
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+							 enum cgroup_filetype type)
 {
 	struct cgroup_pidlist *l;
-	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = task_active_pid_ns(current);
 
-	/*
-	 * We can't drop the pidlist_mutex before taking the l->rwsem in case
-	 * the last ref-holder is trying to remove l from the list at the same
-	 * time. Holding the pidlist_mutex precludes somebody taking whichever
-	 * list we find out from under us - compare release_pid_array().
-	 */
-	mutex_lock(&cgrp->pidlist_mutex);
-	list_for_each_entry(l, &cgrp->pidlists, links) {
-		if (l->key.type == type && l->key.ns == ns) {
-			/* make sure l doesn't vanish out from under us */
-			down_write(&l->rwsem);
-			mutex_unlock(&cgrp->pidlist_mutex);
-			return l;
-		}
-	}
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	l = cgroup_pidlist_find(cgrp, type);
+	if (l)
+		return l;
+
 	/* entry not found; create a new one */
 	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
-	if (!l) {
-		mutex_unlock(&cgrp->pidlist_mutex);
+	if (!l)
 		return l;
-	}
-	init_rwsem(&l->rwsem);
-	down_write(&l->rwsem);
+
+	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
 	l->key.type = type;
-	l->key.ns = get_pid_ns(ns);
+	/* don't need task_nsproxy() if we're looking at ourself */
+	l->key.ns = get_pid_ns(task_active_pid_ns(current));
 	l->owner = cgrp;
 	list_add(&l->links, &cgrp->pidlists);
-	mutex_unlock(&cgrp->pidlist_mutex);
 	return l;
 }
 
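
pid_fry() above swaps each adjacent pair of bits, so it is its own inverse and sorting by the fried value diverges from numeric pid order almost immediately. A quick standalone check of both properties (plain C, not part of the patch):

#include <assert.h>
#include <stdio.h>

static unsigned pid_fry(unsigned pid)
{
	unsigned a = pid & 0x55555555;	/* even bits */
	unsigned b = pid & 0xAAAAAAAA;	/* odd bits */

	return (a << 1) | (b >> 1);	/* swap each adjacent bit pair */
}

int main(void)
{
	unsigned p;

	for (p = 1; p < 100000; p++)
		assert(pid_fry(pid_fry(p)) == p);	/* involution */

	/* numeric vs fried order diverge: 1 < 2, but fry(1) > fry(2) */
	printf("%u %u\n", pid_fry(1), pid_fry(2));	/* prints: 2 1 */
	return 0;
}
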
@@ -3627,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	struct task_struct *tsk;
 	struct cgroup_pidlist *l;
 
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
 	/*
 	 * If cgroup gets more users after we read count, we won't have
 	 * enough space - tough.  This race is indistinguishable to the
@@ -3653,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	css_task_iter_end(&it);
 	length = n;
 	/* now sort & (if procs) strip out duplicates */
-	sort(array, length, sizeof(pid_t), cmppid, NULL);
+	if (cgroup_sane_behavior(cgrp))
+		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+	else
+		sort(array, length, sizeof(pid_t), cmppid, NULL);
 	if (type == CGROUP_FILE_PROCS)
 		length = pidlist_uniq(array, length);
-	l = cgroup_pidlist_find(cgrp, type);
+
+	l = cgroup_pidlist_find_create(cgrp, type);
 	if (!l) {
+		mutex_unlock(&cgrp->pidlist_mutex);
 		pidlist_free(array);
 		return -ENOMEM;
 	}
-	/* store array, freeing old if necessary - lock already held */
+
+	/* store array, freeing old if necessary */
 	pidlist_free(l->list);
 	l->list = array;
 	l->length = length;
-	l->use_count++;
-	up_write(&l->rwsem);
 	*lp = l;
 	return 0;
 }
@@ -3740,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 	 * after a seek to the start). Use a binary-search to find the
 	 * next pid to display, if any
 	 */
-	struct cgroup_pidlist *l = s->private;
+	struct cgroup_open_file *of = s->private;
+	struct cgroup *cgrp = seq_css(s)->cgroup;
+	struct cgroup_pidlist *l;
+	enum cgroup_filetype type = seq_cft(s)->private;
 	int index = 0, pid = *pos;
-	int *iter;
+	int *iter, ret;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+
+	/*
+	 * !NULL @of->priv indicates that this isn't the first start()
+	 * after open.  If the matching pidlist is around, we can use that.
+	 * Look for it.  Note that @of->priv can't be used directly.  It
+	 * could already have been destroyed.
+	 */
+	if (of->priv)
+		of->priv = cgroup_pidlist_find(cgrp, type);
+
+	/*
+	 * Either this is the first start() after open or the matching
+	 * pidlist has been destroyed inbetween.  Create a new one.
+	 */
+	if (!of->priv) {
+		ret = pidlist_array_load(cgrp, type,
+					 (struct cgroup_pidlist **)&of->priv);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+	l = of->priv;
 
-	down_read(&l->rwsem);
 	if (pid) {
 		int end = l->length;
 
 		while (index < end) {
 			int mid = (index + end) / 2;
-			if (l->list[mid] == pid) {
+			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
 				index = mid;
 				break;
-			} else if (l->list[mid] <= pid)
+			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
 				index = mid + 1;
 			else
 				end = mid;
@@ -3764,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 		return NULL;
 	/* Update the abstract position to be the actual pid that we found */
 	iter = l->list + index;
-	*pos = *iter;
+	*pos = cgroup_pid_fry(cgrp, *iter);
 	return iter;
 }
 
 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 {
-	struct cgroup_pidlist *l = s->private;
-	up_read(&l->rwsem);
+	struct cgroup_open_file *of = s->private;
+	struct cgroup_pidlist *l = of->priv;
+
+	if (l)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+				 CGROUP_PIDLIST_DESTROY_DELAY);
+	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
 }
 
 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct cgroup_pidlist *l = s->private;
+	struct cgroup_open_file *of = s->private;
+	struct cgroup_pidlist *l = of->priv;
 	pid_t *p = v;
 	pid_t *end = l->list + l->length;
 	/*
@@ -3787,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 	if (p >= end) {
 		return NULL;
 	} else {
-		*pos = *p;
+		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
 		return p;
 	}
 }
@@ -3808,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
 	.show = cgroup_pidlist_show,
 };
 
-static void cgroup_release_pid_array(struct cgroup_pidlist *l)
-{
-	/*
-	 * the case where we're the last user of this particular pidlist will
-	 * have us remove it from the cgroup's list, which entails taking the
-	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
-	 * pidlist_mutex, we have to take pidlist_mutex first.
-	 */
-	mutex_lock(&l->owner->pidlist_mutex);
-	down_write(&l->rwsem);
-	BUG_ON(!l->use_count);
-	if (!--l->use_count) {
-		/* we're the last user if refcount is 0; remove and free */
-		list_del(&l->links);
-		mutex_unlock(&l->owner->pidlist_mutex);
-		pidlist_free(l->list);
-		put_pid_ns(l->key.ns);
-		up_write(&l->rwsem);
-		kfree(l);
-		return;
-	}
-	mutex_unlock(&l->owner->pidlist_mutex);
-	up_write(&l->rwsem);
-}
-
-static int cgroup_pidlist_release(struct inode *inode, struct file *file)
-{
-	struct cgroup_pidlist *l;
-	if (!(file->f_mode & FMODE_READ))
-		return 0;
-	/*
-	 * the seq_file will only be initialized if the file was opened for
-	 * reading; hence we check if it's not null only in that case.
-	 */
-	l = ((struct seq_file *)file->private_data)->private;
-	cgroup_release_pid_array(l);
-	return seq_release(inode, file);
-}
-
-static const struct file_operations cgroup_pidlist_operations = {
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.write = cgroup_file_write,
-	.release = cgroup_pidlist_release,
-};
-
-/*
- * The following functions handle opens on a file that displays a pidlist
- * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
- * in the cgroup.
- */
-/* helper function for the two below it */
-static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
-{
-	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct cgroup_pidlist *l;
-	int retval;
-
-	/* Nothing to do for write-only files */
-	if (!(file->f_mode & FMODE_READ))
-		return 0;
-
-	/* have the array populated */
-	retval = pidlist_array_load(cgrp, type, &l);
-	if (retval)
-		return retval;
-	/* configure file information */
-	file->f_op = &cgroup_pidlist_operations;
-
-	retval = seq_open(file, &cgroup_pidlist_seq_operations);
-	if (retval) {
-		cgroup_release_pid_array(l);
-		return retval;
-	}
-	((struct seq_file *)file->private_data)->private = l;
-	return 0;
-}
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
-{
-	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
-}
-static int cgroup_procs_open(struct inode *unused, struct file *file)
-{
-	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
-}
-
 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
 					 struct cftype *cft)
 {
@@ -3928,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp)
 	deactivate_super(sb);
 }
 
-/*
- * Unregister event and free resources.
- *
- * Gets called from workqueue.
- */
-static void cgroup_event_remove(struct work_struct *work)
-{
-	struct cgroup_event *event = container_of(work, struct cgroup_event,
-						  remove);
-	struct cgroup_subsys_state *css = event->css;
-
-	remove_wait_queue(event->wqh, &event->wait);
-
-	event->cft->unregister_event(css, event->cft, event->eventfd);
-
-	/* Notify userspace the event is going away. */
-	eventfd_signal(event->eventfd, 1);
-
-	eventfd_ctx_put(event->eventfd);
-	kfree(event);
-	css_put(css);
-}
-
-/*
- * Gets called on POLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
-			     int sync, void *key)
-{
-	struct cgroup_event *event = container_of(wait,
-			struct cgroup_event, wait);
-	struct cgroup *cgrp = event->css->cgroup;
-	unsigned long flags = (unsigned long)key;
-
-	if (flags & POLLHUP) {
-		/*
-		 * If the event has been detached at cgroup removal, we
-		 * can simply return knowing the other side will cleanup
-		 * for us.
-		 *
-		 * We can't race against event freeing since the other
-		 * side will require wqh->lock via remove_wait_queue(),
-		 * which we hold.
-		 */
-		spin_lock(&cgrp->event_list_lock);
-		if (!list_empty(&event->list)) {
-			list_del_init(&event->list);
-			/*
-			 * We are in atomic context, but cgroup_event_remove()
-			 * may sleep, so we have to call it in workqueue.
-			 */
-			schedule_work(&event->remove);
-		}
-		spin_unlock(&cgrp->event_list_lock);
-	}
-
-	return 0;
-}
-
-static void cgroup_event_ptable_queue_proc(struct file *file,
-		wait_queue_head_t *wqh, poll_table *pt)
-{
-	struct cgroup_event *event = container_of(pt,
-			struct cgroup_event, pt);
-
-	event->wqh = wqh;
-	add_wait_queue(wqh, &event->wait);
-}
-
-/*
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
-static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
-				      struct cftype *cft, const char *buffer)
-{
-	struct cgroup *cgrp = dummy_css->cgroup;
-	struct cgroup_event *event;
-	struct cgroup_subsys_state *cfile_css;
-	unsigned int efd, cfd;
-	struct fd efile;
-	struct fd cfile;
-	char *endp;
-	int ret;
-
-	efd = simple_strtoul(buffer, &endp, 10);
-	if (*endp != ' ')
-		return -EINVAL;
-	buffer = endp + 1;
-
-	cfd = simple_strtoul(buffer, &endp, 10);
-	if ((*endp != ' ') && (*endp != '\0'))
-		return -EINVAL;
-	buffer = endp + 1;
-
-	event = kzalloc(sizeof(*event), GFP_KERNEL);
-	if (!event)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&event->list);
-	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
-	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
-	INIT_WORK(&event->remove, cgroup_event_remove);
-
-	efile = fdget(efd);
-	if (!efile.file) {
-		ret = -EBADF;
-		goto out_kfree;
-	}
-
-	event->eventfd = eventfd_ctx_fileget(efile.file);
-	if (IS_ERR(event->eventfd)) {
-		ret = PTR_ERR(event->eventfd);
-		goto out_put_efile;
-	}
-
-	cfile = fdget(cfd);
-	if (!cfile.file) {
-		ret = -EBADF;
-		goto out_put_eventfd;
-	}
-
-	/* the process need read permission on control file */
-	/* AV: shouldn't we check that it's been opened for read instead? */
-	ret = inode_permission(file_inode(cfile.file), MAY_READ);
-	if (ret < 0)
-		goto out_put_cfile;
-
-	event->cft = __file_cft(cfile.file);
-	if (IS_ERR(event->cft)) {
-		ret = PTR_ERR(event->cft);
-		goto out_put_cfile;
-	}
-
-	if (!event->cft->ss) {
-		ret = -EBADF;
-		goto out_put_cfile;
-	}
-
-	/*
-	 * Determine the css of @cfile, verify it belongs to the same
-	 * cgroup as cgroup.event_control, and associate @event with it.
-	 * Remaining events are automatically removed on cgroup destruction
-	 * but the removal is asynchronous, so take an extra ref.
-	 */
-	rcu_read_lock();
-
-	ret = -EINVAL;
-	event->css = cgroup_css(cgrp, event->cft->ss);
-	cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
-	if (event->css && event->css == cfile_css && css_tryget(event->css))
-		ret = 0;
-
-	rcu_read_unlock();
-	if (ret)
-		goto out_put_cfile;
-
-	if (!event->cft->register_event || !event->cft->unregister_event) {
-		ret = -EINVAL;
-		goto out_put_css;
-	}
-
-	ret = event->cft->register_event(event->css, event->cft,
-			event->eventfd, buffer);
-	if (ret)
-		goto out_put_css;
-
-	efile.file->f_op->poll(efile.file, &event->pt);
-
-	spin_lock(&cgrp->event_list_lock);
-	list_add(&event->list, &cgrp->event_list);
-	spin_unlock(&cgrp->event_list_lock);
-
-	fdput(cfile);
-	fdput(efile);
-
-	return 0;
-
-out_put_css:
-	css_put(event->css);
-out_put_cfile:
-	fdput(cfile);
-out_put_eventfd:
-	eventfd_ctx_put(event->eventfd);
-out_put_efile:
-	fdput(efile);
-out_kfree:
-	kfree(event);
-
-	return ret;
-}
-
 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
 				      struct cftype *cft)
 {
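This and the following base-file hunk remove the generic eventfd-based "cgroup.event_control" machinery from the core; memcg, its only real user, carries its own implementation from here on. For reference, the interface being deleted was driven from userspace roughly as below — the cgroup path and threshold argument are illustrative, following the '<event_fd> <control_fd> <args>' format documented in the removed comment:

/* Hedged userspace sketch of the removed cgroup.event_control API. */
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

int main(void)
{
	int efd = eventfd(0, 0);	/* kernel signals us through this */
	int cfd = open("/sys/fs/cgroup/memory/mygrp/memory.usage_in_bytes",
		       O_RDONLY);
	int ecf = open("/sys/fs/cgroup/memory/mygrp/cgroup.event_control",
		       O_WRONLY);
	char buf[64];
	uint64_t cnt;
	int len;

	if (efd < 0 || cfd < 0 || ecf < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>"; args meaning is per control file */
	len = snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd,
		       4096ULL * 1024);
	if (write(ecf, buf, len) != len)
		return 1;

	if (read(efd, &cnt, sizeof(cnt)) != sizeof(cnt))	/* blocks */
		return 1;
	printf("event fired %llu time(s)\n", (unsigned long long)cnt);
	return 0;
}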
@@ -4143,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
 static struct cftype cgroup_base_files[] = {
 	{
 		.name = "cgroup.procs",
-		.open = cgroup_procs_open,
+		.seq_start = cgroup_pidlist_start,
+		.seq_next = cgroup_pidlist_next,
+		.seq_stop = cgroup_pidlist_stop,
+		.seq_show = cgroup_pidlist_show,
+		.private = CGROUP_FILE_PROCS,
 		.write_u64 = cgroup_procs_write,
-		.release = cgroup_pidlist_release,
 		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
-		.name = "cgroup.event_control",
-		.write_string = cgroup_write_event_control,
-		.mode = S_IWUGO,
-	},
-	{
 		.name = "cgroup.clone_children",
 		.flags = CFTYPE_INSANE,
 		.read_u64 = cgroup_clone_children_read,
@@ -4162,7 +3914,7 @@ static struct cftype cgroup_base_files[] = {
 	{
 		.name = "cgroup.sane_behavior",
 		.flags = CFTYPE_ONLY_ON_ROOT,
-		.read_seq_string = cgroup_sane_behavior_show,
+		.seq_show = cgroup_sane_behavior_show,
 	},
 
 	/*
@@ -4173,9 +3925,12 @@ static struct cftype cgroup_base_files[] = {
 	{
 		.name = "tasks",
 		.flags = CFTYPE_INSANE,		/* use "procs" instead */
-		.open = cgroup_tasks_open,
+		.seq_start = cgroup_pidlist_start,
+		.seq_next = cgroup_pidlist_next,
+		.seq_stop = cgroup_pidlist_stop,
+		.seq_show = cgroup_pidlist_show,
+		.private = CGROUP_FILE_TASKS,
 		.write_u64 = cgroup_tasks_write,
-		.release = cgroup_pidlist_release,
 		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
@@ -4187,7 +3942,7 @@ static struct cftype cgroup_base_files[] = {
 	{
 		.name = "release_agent",
 		.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
-		.read_seq_string = cgroup_release_agent_show,
+		.seq_show = cgroup_release_agent_show,
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX,
 	},
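Throughout this table, one-shot read handlers move from .read_seq_string, which received the css and cftype explicitly, to .seq_show, which gets only the seq_file and recovers its css via seq_css() — exactly what the debug-file hunks at the end of this diff do. A hedged sketch of a handler in the new style; the file name and printed field are invented:

static int example_id_show(struct seq_file *seq, void *v)
{
	struct cgroup_subsys_state *css = seq_css(seq);

	seq_printf(seq, "%d\n", css->cgroup->id);
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "example.id",
		.seq_show = example_id_show,
	},
	{ }	/* terminator */
};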
@@ -4333,6 +4088,65 @@ static void offline_css(struct cgroup_subsys_state *css)
 	RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
 }
 
+/**
+ * create_css - create a cgroup_subsys_state
+ * @cgrp: the cgroup new css will be associated with
+ * @ss: the subsys of new css
+ *
+ * Create a new css associated with @cgrp - @ss pair.  On success, the new
+ * css is online and installed in @cgrp with all interface files created.
+ * Returns 0 on success, -errno on failure.
+ */
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct cgroup *parent = cgrp->parent;
+	struct cgroup_subsys_state *css;
+	int err;
+
+	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);
+
+	css = ss->css_alloc(cgroup_css(parent, ss));
+	if (IS_ERR(css))
+		return PTR_ERR(css);
+
+	err = percpu_ref_init(&css->refcnt, css_release);
+	if (err)
+		goto err_free_css;
+
+	init_css(css, ss, cgrp);
+
+	err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
+	if (err)
+		goto err_free_percpu_ref;
+
+	err = online_css(css);
+	if (err)
+		goto err_clear_dir;
+
+	dget(cgrp->dentry);
+	css_get(css->parent);
+
+	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+	    parent->parent) {
+		pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+			   current->comm, current->pid, ss->name);
+		if (!strcmp(ss->name, "memory"))
+			pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
+		ss->warned_broken_hierarchy = true;
+	}
+
+	return 0;
+
+err_clear_dir:
+	cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
+err_free_percpu_ref:
+	percpu_ref_cancel_init(&css->refcnt);
+err_free_css:
+	ss->css_free(css);
+	return err;
+}
+
 /*
  * cgroup_create - create a cgroup
  * @parent: cgroup that will be parent of the new cgroup
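create_css() above factors the per-subsystem alloc/init/populate/online sequence out of cgroup_create(), so a css either comes up completely or is torn down completely. Its error labels are the usual kernel goto ladder: each failure jumps to the label that releases everything acquired so far, in reverse order. The idiom reduced to a compilable skeleton, with invented acquire/release helpers:

static int acquire_a(void) { return 0; }	/* stand-ins */
static int acquire_b(void) { return 0; }
static int acquire_c(void) { return 0; }
static void release_a(void) { }
static void release_b(void) { }

static int example_setup(void)
{
	int err;

	err = acquire_a();
	if (err)
		return err;

	err = acquire_b();
	if (err)
		goto err_release_a;

	err = acquire_c();
	if (err)
		goto err_release_b;

	return 0;

err_release_b:
	release_b();	/* undo in reverse order of acquisition */
err_release_a:
	release_a();
	return err;
}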
@@ -4344,11 +4158,10 @@ static void offline_css(struct cgroup_subsys_state *css)
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			  umode_t mode)
 {
-	struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
 	struct cgroup *cgrp;
 	struct cgroup_name *name;
 	struct cgroupfs_root *root = parent->root;
-	int err = 0;
+	int ssid, err;
 	struct cgroup_subsys *ss;
 	struct super_block *sb = root->sb;
 
@@ -4358,19 +4171,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		return -ENOMEM;
 
 	name = cgroup_alloc_name(dentry);
-	if (!name)
+	if (!name) {
+		err = -ENOMEM;
 		goto err_free_cgrp;
+	}
 	rcu_assign_pointer(cgrp->name, name);
 
 	/*
-	 * Temporarily set the pointer to NULL, so idr_find() won't return
-	 * a half-baked cgroup.
-	 */
-	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
-	if (cgrp->id < 0)
-		goto err_free_name;
-
-	/*
 	 * Only live parents can have children.  Note that the liveliness
 	 * check isn't strictly necessary because cgroup_mkdir() and
 	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
@@ -4379,7 +4186,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	 */
 	if (!cgroup_lock_live_group(parent)) {
 		err = -ENODEV;
-		goto err_free_id;
+		goto err_free_name;
+	}
+
+	/*
+	 * Temporarily set the pointer to NULL, so idr_find() won't return
+	 * a half-baked cgroup.
+	 */
+	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
+	if (cgrp->id < 0) {
+		err = -ENOMEM;
+		goto err_unlock;
 	}
 
 	/* Grab a reference on the superblock so the hierarchy doesn't
@@ -4404,23 +4221,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
-	for_each_root_subsys(root, ss) {
-		struct cgroup_subsys_state *css;
-
-		css = ss->css_alloc(cgroup_css(parent, ss));
-		if (IS_ERR(css)) {
-			err = PTR_ERR(css);
-			goto err_free_all;
-		}
-		css_ar[ss->subsys_id] = css;
-
-		err = percpu_ref_init(&css->refcnt, css_release);
-		if (err)
-			goto err_free_all;
-
-		init_css(css, ss, cgrp);
-	}
-
 	/*
 	 * Create directory.  cgroup_create_file() returns with the new
 	 * directory locked on success so that it can be populated without
@@ -4428,7 +4228,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	 */
 	err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
 	if (err < 0)
-		goto err_free_all;
+		goto err_free_id;
 	lockdep_assert_held(&dentry->d_inode->i_mutex);
 
 	cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4440,60 +4240,36 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	/* hold a ref to the parent's dentry */
 	dget(parent->dentry);
 
-	/* creation succeeded, notify subsystems */
-	for_each_root_subsys(root, ss) {
-		struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
-		err = online_css(css);
-		if (err)
-			goto err_destroy;
-
-		/* each css holds a ref to the cgroup's dentry and parent css */
-		dget(dentry);
-		css_get(css->parent);
-
-		/* mark it consumed for error path */
-		css_ar[ss->subsys_id] = NULL;
-
-		if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
-		    parent->parent) {
-			pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
-				   current->comm, current->pid, ss->name);
-			if (!strcmp(ss->name, "memory"))
-				pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
-			ss->warned_broken_hierarchy = true;
-		}
-	}
-
+	/*
+	 * @cgrp is now fully operational.  If something fails after this
+	 * point, it'll be released via the normal destruction path.
+	 */
 	idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
 
 	err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
 	if (err)
 		goto err_destroy;
 
-	err = cgroup_populate_dir(cgrp, root->subsys_mask);
-	if (err)
-		goto err_destroy;
+	/* let's create and online css's */
+	for_each_subsys(ss, ssid) {
+		if (root->subsys_mask & (1 << ssid)) {
+			err = create_css(cgrp, ss);
+			if (err)
+				goto err_destroy;
+		}
+	}
 
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 
 	return 0;
 
-err_free_all:
-	for_each_root_subsys(root, ss) {
-		struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
-		if (css) {
-			percpu_ref_cancel_init(&css->refcnt);
-			ss->css_free(css);
-		}
-	}
-	mutex_unlock(&cgroup_mutex);
-	/* Release the reference count that we took on the superblock */
-	deactivate_super(sb);
 err_free_id:
 	idr_remove(&root->cgroup_idr, cgrp->id);
+	/* Release the reference count that we took on the superblock */
+	deactivate_super(sb);
+err_unlock:
+	mutex_unlock(&cgroup_mutex);
 err_free_name:
 	kfree(rcu_dereference_raw(cgrp->name));
 err_free_cgrp:
@@ -4501,14 +4277,6 @@ err_free_cgrp:
 	return err;
 
 err_destroy:
-	for_each_root_subsys(root, ss) {
-		struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
-		if (css) {
-			percpu_ref_cancel_init(&css->refcnt);
-			ss->css_free(css);
-		}
-	}
 	cgroup_destroy_locked(cgrp);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&dentry->d_inode->i_mutex);
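Note the reshuffled ordering in cgroup_create(): the id is now allocated only after the parent is confirmed live under cgroup_mutex (hence the new err_unlock label), while the reserve-then-publish idr pattern is kept — the slot is allocated pointing at NULL so a concurrent idr_find() can't see a half-built cgroup, and idr_replace() later publishes the pointer once @cgrp is fully operational. That pattern in isolation, with invented names:

#include <linux/idr.h>

struct my_obj { int id; };

static DEFINE_IDR(my_idr);

static int finish_init(struct my_obj *obj) { return 0; }	/* stand-in */

static int publish_obj(struct my_obj *obj)
{
	int id;

	/* reserve a slot, but keep it NULL so lookups see nothing yet */
	id = idr_alloc(&my_idr, NULL, 1, 0, GFP_KERNEL);
	if (id < 0)
		return id;
	obj->id = id;

	if (finish_init(obj) < 0) {
		idr_remove(&my_idr, id);	/* unwind the reservation */
		return -EINVAL;
	}

	idr_replace(&my_idr, obj, id);		/* publish atomically */
	return 0;
}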
@@ -4631,10 +4399,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct dentry *d = cgrp->dentry;
-	struct cgroup_event *event, *tmp;
-	struct cgroup_subsys *ss;
+	struct cgroup_subsys_state *css;
 	struct cgroup *child;
 	bool empty;
+	int ssid;
 
 	lockdep_assert_held(&d->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_mutex);
@@ -4670,12 +4438,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 * will be invoked to perform the rest of destruction once the
 	 * percpu refs of all css's are confirmed to be killed.
 	 */
-	for_each_root_subsys(cgrp->root, ss) {
-		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
-		if (css)
-			kill_css(css);
-	}
+	for_each_css(css, ssid, cgrp)
+		kill_css(css);
 
 	/*
 	 * Mark @cgrp dead.  This prevents further task migration and child
@@ -4710,18 +4474,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	dget(d);
 	cgroup_d_remove_dir(d);
 
-	/*
-	 * Unregister events and notify userspace.
-	 * Notify userspace about cgroup removing only after rmdir of cgroup
-	 * directory to avoid race between userspace and kernelspace.
-	 */
-	spin_lock(&cgrp->event_list_lock);
-	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-		list_del_init(&event->list);
-		schedule_work(&event->remove);
-	}
-	spin_unlock(&cgrp->event_list_lock);
-
 	return 0;
 };
 
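The destruction path now uses the new for_each_css() iterator, which yields only the css's actually attached to @cgrp, so the explicit NULL check disappears; and with cgroup.event_control gone, there is no event list left to drain after rmdir. A hedged sketch of what an iterator of this shape looks like — the idea only, not necessarily the exact macro in the tree, which also does lockdep-aware RCU dereferencing:

/* Walk every subsystem slot of @cgrp, skipping empty ones.  The
 * dangling "else" trick makes the loop body attach to the macro. */
#define for_each_example_css(css, ssid, cgrp)				\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = (cgrp)->subsys[(ssid)])) { }		\
		else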
@@ -4792,7 +4544,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	cgroup_init_cftsets(ss);
 
 	/* Create the top cgroup state for this subsystem */
-	list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
 	ss->root = &cgroup_dummy_root;
 	css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
 	/* We don't handle early failures gracefully */
@@ -4866,6 +4617,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	cgroup_init_cftsets(ss);
 
 	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 	cgroup_subsys[ss->subsys_id] = ss;
 
 	/*
@@ -4877,11 +4629,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	if (IS_ERR(css)) {
 		/* failure case - need to deassign the cgroup_subsys[] slot. */
 		cgroup_subsys[ss->subsys_id] = NULL;
+		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
 		return PTR_ERR(css);
 	}
 
-	list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
 	ss->root = &cgroup_dummy_root;
 
 	/* our new subsystem will be attached to the dummy hierarchy. */
@@ -4911,14 +4663,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	write_unlock(&css_set_lock);
 
 	ret = online_css(css);
-	if (ret)
+	if (ret) {
+		ss->css_free(css);
 		goto err_unload;
+	}
 
 	/* success! */
+	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 
 err_unload:
+	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	/* @ss can't be mounted here as try_module_get() would fail */
 	cgroup_unload_subsys(ss);
@@ -4937,6 +4693,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
 void cgroup_unload_subsys(struct cgroup_subsys *ss)
 {
 	struct cgrp_cset_link *link;
+	struct cgroup_subsys_state *css;
 
 	BUG_ON(ss->module == NULL);
 
@@ -4948,15 +4705,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	BUG_ON(ss->root != &cgroup_dummy_root);
 
 	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 
-	offline_css(cgroup_css(cgroup_dummy_top, ss));
+	css = cgroup_css(cgroup_dummy_top, ss);
+	if (css)
+		offline_css(css);
 
 	/* deassign the subsys_id */
 	cgroup_subsys[ss->subsys_id] = NULL;
 
-	/* remove subsystem from the dummy root's list of subsystems */
-	list_del_init(&ss->sibling);
-
 	/*
 	 * disentangle the css from all css_sets attached to the dummy
 	 * top. as in loading, we need to pay our respects to the hashtable
@@ -4979,9 +4736,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	 * need to free before marking as null because ss->css_free needs
 	 * the cgrp->subsys pointer to find their state.
 	 */
-	ss->css_free(cgroup_css(cgroup_dummy_top, ss));
+	if (css)
+		ss->css_free(css);
 	RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
 
+	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 }
 EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
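cgroup_load_subsys() and cgroup_unload_subsys() now take cgroup_root_mutex nested inside cgroup_mutex, which is presumably what lets the new cgroup_assert_mutex_or_root_locked() check hold for modular subsystems too; note that every exit path, including the error legs, unlocks in strict reverse order. The discipline as a compilable skeleton, with an invented body:

#include <linux/mutex.h>

static DEFINE_MUTEX(outer_mutex);	/* stands in for cgroup_mutex */
static DEFINE_MUTEX(inner_mutex);	/* stands in for cgroup_root_mutex */

static int do_work(void) { return 0; }	/* stand-in */

static int locked_op(void)
{
	int ret;

	mutex_lock(&outer_mutex);
	mutex_lock(&inner_mutex);	/* always nested inside outer */

	ret = do_work();

	mutex_unlock(&inner_mutex);	/* inner lock released first */
	mutex_unlock(&outer_mutex);
	return ret;
}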
@@ -5100,6 +4859,15 @@ static int __init cgroup_wq_init(void)
 	 */
 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
 	BUG_ON(!cgroup_destroy_wq);
+
+	/*
+	 * Used to destroy pidlists and separate to serve as flush domain.
+	 * Cap @max_active to 1 too.
+	 */
+	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+						    0, 1);
+	BUG_ON(!cgroup_pidlist_destroy_wq);
+
 	return 0;
 }
 core_initcall(cgroup_wq_init);
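A dedicated workqueue is what makes pidlist lingering work: each use re-arms a delayed destroy work, and cgroup destruction can flush this one queue to wait for every pending pidlist destruction without also blocking on unrelated work items. A hedged sketch of the pattern — the struct and helpers below are condensed stand-ins for the real ones in this patch:

#include <linux/workqueue.h>

struct linger_pidlist {
	struct delayed_work destroy_dwork;
	/* ... pids, owner, locking ... */
};

static void pidlist_touch(struct linger_pidlist *l)
{
	/* each use pushes destruction out by the linger period */
	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
			 CGROUP_PIDLIST_DESTROY_DELAY);
}

static void pidlists_flush(void)
{
	/* waits for pending pidlist destructions, and only those */
	flush_workqueue(cgroup_pidlist_destroy_wq);
}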
@@ -5143,11 +4911,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 	for_each_active_root(root) {
 		struct cgroup_subsys *ss;
 		struct cgroup *cgrp;
-		int count = 0;
+		int ssid, count = 0;
 
 		seq_printf(m, "%d:", root->hierarchy_id);
-		for_each_root_subsys(root, ss)
-			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+		for_each_subsys(ss, ssid)
+			if (root->subsys_mask & (1 << ssid))
+				seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
 		if (strlen(root->name))
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
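With the per-root subsys_list gone, /proc/<pid>/cgroup entries are built by walking every subsystem and testing the root's subsys_mask; the output format is unchanged — hierarchy id, comma-separated controllers and/or a name= tag, then the path. For example (values illustrative):

3:cpu,cpuacct:/mygroup
1:name=systemd:/user.slice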
@@ -5488,16 +5257,16 @@ __setup("cgroup_disable=", cgroup_disable);
  * @dentry: directory dentry of interest
  * @ss: subsystem of interest
  *
- * Must be called under RCU read lock.  The caller is responsible for
- * pinning the returned css if it needs to be accessed outside the RCU
- * critical section.
+ * Must be called under cgroup_mutex or RCU read lock.  The caller is
+ * responsible for pinning the returned css if it needs to be accessed
+ * outside the critical section.
  */
 struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
 					 struct cgroup_subsys *ss)
 {
 	struct cgroup *cgrp;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	cgroup_assert_mutex_or_rcu_locked();
 
 	/* is @dentry a cgroup dir? */
 	if (!dentry->d_inode ||
@@ -5520,9 +5289,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
 	struct cgroup *cgrp;
 
-	rcu_lockdep_assert(rcu_read_lock_held() ||
-			   lockdep_is_held(&cgroup_mutex),
-			   "css_from_id() needs proper protection");
+	cgroup_assert_mutex_or_rcu_locked();
 
 	cgrp = idr_find(&ss->root->cgroup_idr, id);
 	if (cgrp)
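Both lookup helpers now share the cgroup_assert_mutex_or_rcu_locked() check defined near the top of the file, and both still return an unpinned css. On the RCU side the expected calling convention is the css_tryget() idiom this file already uses elsewhere; a hedged sketch:

/* Look up under RCU, then pin before leaving the read-side section. */
static struct cgroup_subsys_state *get_css_by_id(int id,
						 struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);
	if (css && !css_tryget(css))	/* may race with offlining */
		css = NULL;
	rcu_read_unlock();

	return css;	/* caller does css_put() when done */
}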
@@ -5570,9 +5337,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
 	return count;
 }
 
-static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
-					 struct cftype *cft,
-					 struct seq_file *seq)
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 {
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
@@ -5597,9 +5362,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
 }
 
 #define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct cgroup_subsys_state *css,
-				 struct cftype *cft, struct seq_file *seq)
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
 {
+	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
 
 	read_lock(&css_set_lock);
@@ -5645,12 +5410,12 @@ static struct cftype debug_files[] = {
 
 	{
 		.name = "current_css_set_cg_links",
-		.read_seq_string = current_css_set_cg_links_read,
+		.seq_show = current_css_set_cg_links_read,
 	},
 
 	{
 		.name = "cgroup_css_links",
-		.read_seq_string = cgroup_css_links_read,
+		.seq_show = cgroup_css_links_read,
 	},
 
 	{