aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-01-21 20:51:34 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2014-01-21 20:51:34 -0500
commitf075e0f6993f41c72dbb1d3e7a2d7740f14e89e2 (patch)
treea25b464a67fffc6f43940e0e85e2735a48bb1ad7 /kernel/cgroup.c
parent5cb7398caf69e3943df78435a19a8a77fe8b9463 (diff)
parentdd4b0a4676907481256d16d5de0851b315a6f22c (diff)
Merge branch 'for-3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: "The bulk of changes are cleanups and preparations for the upcoming kernfs conversion. - cgroup_event mechanism which is and will be used only by memcg is moved to memcg. - pidlist handling is updated so that it can be served by seq_file. Also, the list is not sorted if sane_behavior. cgroup documentation explicitly states that the file is not sorted but it has been for quite some time. - All cgroup file handling now happens on top of seq_file. This is to prepare for kernfs conversion. In addition, all operations are restructured so that they map 1-1 to kernfs operations. - Other cleanups and low-pri fixes" * 'for-3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (40 commits) cgroup: trivial style updates cgroup: remove stray references to css_id doc: cgroups: Fix typo in doc/cgroups cgroup: fix fail path in cgroup_load_subsys() cgroup: fix missing unlock on error in cgroup_load_subsys() cgroup: remove for_each_root_subsys() cgroup: implement for_each_css() cgroup: factor out cgroup_subsys_state creation into create_css() cgroup: combine css handling loops in cgroup_create() cgroup: reorder operations in cgroup_create() cgroup: make for_each_subsys() useable under cgroup_root_mutex cgroup: css iterations and css_from_dir() are safe under cgroup_mutex cgroup: unify pidlist and other file handling cgroup: replace cftype->read_seq_string() with cftype->seq_show() cgroup: attach cgroup_open_file to all cgroup files cgroup: generalize cgroup_pidlist_open_file cgroup: unify read path so that seq_file is always used cgroup: unify cgroup_write_X64() and cgroup_write_string() cgroup: remove cftype->read(), ->read_map() and ->write() hugetlb_cgroup: convert away from cftype->read() ...
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c1202
1 files changed, 480 insertions, 722 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bc1dcabe9217..e2f46ba37f72 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -41,7 +41,6 @@
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/seq_file.h>
45#include <linux/slab.h> 44#include <linux/slab.h>
46#include <linux/magic.h> 45#include <linux/magic.h>
47#include <linux/spinlock.h> 46#include <linux/spinlock.h>
@@ -56,15 +55,20 @@
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 58#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 59#include <linux/kthread.h>
63#include <linux/file.h>
64 60
65#include <linux/atomic.h> 61#include <linux/atomic.h>
66 62
67/* 63/*
64 * pidlists linger the following amount before being destroyed. The goal
65 * is avoiding frequent destruction in the middle of consecutive read calls
66 * Expiring in the middle is a performance problem not a correctness one.
67 * 1 sec should be enough.
68 */
69#define CGROUP_PIDLIST_DESTROY_DELAY HZ
70
71/*
68 * cgroup_mutex is the master lock. Any modification to cgroup or its 72 * cgroup_mutex is the master lock. Any modification to cgroup or its
69 * hierarchy must be performed while holding it. 73 * hierarchy must be performed while holding it.
70 * 74 *
@@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex);
89 93
90static DEFINE_MUTEX(cgroup_root_mutex); 94static DEFINE_MUTEX(cgroup_root_mutex);
91 95
96#define cgroup_assert_mutex_or_rcu_locked() \
97 rcu_lockdep_assert(rcu_read_lock_held() || \
98 lockdep_is_held(&cgroup_mutex), \
99 "cgroup_mutex or RCU read lock required");
100
101#ifdef CONFIG_LOCKDEP
102#define cgroup_assert_mutex_or_root_locked() \
103 WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 !lockdep_is_held(&cgroup_root_mutex)))
105#else
106#define cgroup_assert_mutex_or_root_locked() do { } while (0)
107#endif
108
92/* 109/*
93 * cgroup destruction makes heavy use of work items and there can be a lot 110 * cgroup destruction makes heavy use of work items and there can be a lot
94 * of concurrent destructions. Use a separate workqueue so that cgroup 111 * of concurrent destructions. Use a separate workqueue so that cgroup
@@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
98static struct workqueue_struct *cgroup_destroy_wq; 115static struct workqueue_struct *cgroup_destroy_wq;
99 116
100/* 117/*
118 * pidlist destructions need to be flushed on cgroup destruction. Use a
119 * separate workqueue as flush domain.
120 */
121static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122
123/*
101 * Generate an array of cgroup subsystem pointers. At boot time, this is 124 * Generate an array of cgroup subsystem pointers. At boot time, this is
102 * populated with the built in subsystems, and modular subsystems are 125 * populated with the built in subsystems, and modular subsystems are
103 * registered after that. The mutable section of this array is protected by 126 * registered after that. The mutable section of this array is protected by
@@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root;
119/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 142/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
120static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 143static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
121 144
122/*
123 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
124 */
125struct cfent {
126 struct list_head node;
127 struct dentry *dentry;
128 struct cftype *type;
129 struct cgroup_subsys_state *css;
130
131 /* file xattrs */
132 struct simple_xattrs xattrs;
133};
134
135/*
136 * cgroup_event represents events which userspace want to receive.
137 */
138struct cgroup_event {
139 /*
140 * css which the event belongs to.
141 */
142 struct cgroup_subsys_state *css;
143 /*
144 * Control file which the event associated.
145 */
146 struct cftype *cft;
147 /*
148 * eventfd to signal userspace about the event.
149 */
150 struct eventfd_ctx *eventfd;
151 /*
152 * Each of these stored in a list by the cgroup.
153 */
154 struct list_head list;
155 /*
156 * All fields below needed to unregister event when
157 * userspace closes eventfd.
158 */
159 poll_table pt;
160 wait_queue_head_t *wqh;
161 wait_queue_t wait;
162 struct work_struct remove;
163};
164
165/* The list of hierarchy roots */ 145/* The list of hierarchy roots */
166 146
167static LIST_HEAD(cgroup_roots); 147static LIST_HEAD(cgroup_roots);
@@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
200static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 180static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
201 bool is_add); 181 bool is_add);
202static int cgroup_file_release(struct inode *inode, struct file *file); 182static int cgroup_file_release(struct inode *inode, struct file *file);
183static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
203 184
204/** 185/**
205 * cgroup_css - obtain a cgroup's css for the specified subsystem 186 * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp)
262} 243}
263 244
264/** 245/**
246 * for_each_css - iterate all css's of a cgroup
247 * @css: the iteration cursor
248 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
249 * @cgrp: the target cgroup to iterate css's of
250 *
251 * Should be called under cgroup_mutex.
252 */
253#define for_each_css(css, ssid, cgrp) \
254 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 if (!((css) = rcu_dereference_check( \
256 (cgrp)->subsys[(ssid)], \
257 lockdep_is_held(&cgroup_mutex)))) { } \
258 else
259
260/**
265 * for_each_subsys - iterate all loaded cgroup subsystems 261 * for_each_subsys - iterate all loaded cgroup subsystems
266 * @ss: the iteration cursor 262 * @ss: the iteration cursor
267 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 263 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
268 * 264 *
269 * Should be called under cgroup_mutex. 265 * Iterates through all loaded subsystems. Should be called under
266 * cgroup_mutex or cgroup_root_mutex.
270 */ 267 */
271#define for_each_subsys(ss, i) \ 268#define for_each_subsys(ss, ssid) \
272 for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ 269 for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
273 if (({ lockdep_assert_held(&cgroup_mutex); \ 270 (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
274 !((ss) = cgroup_subsys[i]); })) { } \ 271 if (!((ss) = cgroup_subsys[(ssid)])) { } \
275 else 272 else
276 273
277/** 274/**
@@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp)
286 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ 283 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
287 (((ss) = cgroup_subsys[i]) || true); (i)++) 284 (((ss) = cgroup_subsys[i]) || true); (i)++)
288 285
289/* iterate each subsystem attached to a hierarchy */
290#define for_each_root_subsys(root, ss) \
291 list_for_each_entry((ss), &(root)->subsys_list, sibling)
292
293/* iterate across the active hierarchies */ 286/* iterate across the active hierarchies */
294#define for_each_active_root(root) \ 287#define for_each_active_root(root) \
295 list_for_each_entry((root), &cgroup_roots, root_list) 288 list_for_each_entry((root), &cgroup_roots, root_list)
@@ -863,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work)
863 */ 856 */
864 deactivate_super(cgrp->root->sb); 857 deactivate_super(cgrp->root->sb);
865 858
866 /* 859 cgroup_pidlist_destroy_all(cgrp);
867 * if we're getting rid of the cgroup, refcount should ensure
868 * that there are no pidlists left.
869 */
870 BUG_ON(!list_empty(&cgrp->pidlists));
871 860
872 simple_xattrs_free(&cgrp->xattrs); 861 simple_xattrs_free(&cgrp->xattrs);
873 862
@@ -1050,7 +1039,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1050 cgroup_css(cgroup_dummy_top, ss)); 1039 cgroup_css(cgroup_dummy_top, ss));
1051 cgroup_css(cgrp, ss)->cgroup = cgrp; 1040 cgroup_css(cgrp, ss)->cgroup = cgrp;
1052 1041
1053 list_move(&ss->sibling, &root->subsys_list);
1054 ss->root = root; 1042 ss->root = root;
1055 if (ss->bind) 1043 if (ss->bind)
1056 ss->bind(cgroup_css(cgrp, ss)); 1044 ss->bind(cgroup_css(cgrp, ss));
@@ -1069,7 +1057,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1069 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1057 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1070 1058
1071 cgroup_subsys[i]->root = &cgroup_dummy_root; 1059 cgroup_subsys[i]->root = &cgroup_dummy_root;
1072 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1073 1060
1074 /* subsystem is now free - drop reference on module */ 1061 /* subsystem is now free - drop reference on module */
1075 module_put(ss->module); 1062 module_put(ss->module);
@@ -1096,10 +1083,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1096{ 1083{
1097 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1084 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1098 struct cgroup_subsys *ss; 1085 struct cgroup_subsys *ss;
1086 int ssid;
1099 1087
1100 mutex_lock(&cgroup_root_mutex); 1088 mutex_lock(&cgroup_root_mutex);
1101 for_each_root_subsys(root, ss) 1089 for_each_subsys(ss, ssid)
1102 seq_printf(seq, ",%s", ss->name); 1090 if (root->subsys_mask & (1 << ssid))
1091 seq_printf(seq, ",%s", ss->name);
1103 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1092 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1104 seq_puts(seq, ",sane_behavior"); 1093 seq_puts(seq, ",sane_behavior");
1105 if (root->flags & CGRP_ROOT_NOPREFIX) 1094 if (root->flags & CGRP_ROOT_NOPREFIX)
@@ -1362,8 +1351,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1362 INIT_LIST_HEAD(&cgrp->pidlists); 1351 INIT_LIST_HEAD(&cgrp->pidlists);
1363 mutex_init(&cgrp->pidlist_mutex); 1352 mutex_init(&cgrp->pidlist_mutex);
1364 cgrp->dummy_css.cgroup = cgrp; 1353 cgrp->dummy_css.cgroup = cgrp;
1365 INIT_LIST_HEAD(&cgrp->event_list);
1366 spin_lock_init(&cgrp->event_list_lock);
1367 simple_xattrs_init(&cgrp->xattrs); 1354 simple_xattrs_init(&cgrp->xattrs);
1368} 1355}
1369 1356
@@ -1371,7 +1358,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1371{ 1358{
1372 struct cgroup *cgrp = &root->top_cgroup; 1359 struct cgroup *cgrp = &root->top_cgroup;
1373 1360
1374 INIT_LIST_HEAD(&root->subsys_list);
1375 INIT_LIST_HEAD(&root->root_list); 1361 INIT_LIST_HEAD(&root->root_list);
1376 root->number_of_cgroups = 1; 1362 root->number_of_cgroups = 1;
1377 cgrp->root = root; 1363 cgrp->root = root;
@@ -1693,7 +1679,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1693 return ERR_PTR(ret); 1679 return ERR_PTR(ret);
1694} 1680}
1695 1681
1696static void cgroup_kill_sb(struct super_block *sb) { 1682static void cgroup_kill_sb(struct super_block *sb)
1683{
1697 struct cgroupfs_root *root = sb->s_fs_info; 1684 struct cgroupfs_root *root = sb->s_fs_info;
1698 struct cgroup *cgrp = &root->top_cgroup; 1685 struct cgroup *cgrp = &root->top_cgroup;
1699 struct cgrp_cset_link *link, *tmp_link; 1686 struct cgrp_cset_link *link, *tmp_link;
@@ -1976,8 +1963,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1976 bool threadgroup) 1963 bool threadgroup)
1977{ 1964{
1978 int retval, i, group_size; 1965 int retval, i, group_size;
1979 struct cgroup_subsys *ss, *failed_ss = NULL;
1980 struct cgroupfs_root *root = cgrp->root; 1966 struct cgroupfs_root *root = cgrp->root;
1967 struct cgroup_subsys_state *css, *failed_css = NULL;
1981 /* threadgroup list cursor and array */ 1968 /* threadgroup list cursor and array */
1982 struct task_struct *leader = tsk; 1969 struct task_struct *leader = tsk;
1983 struct task_and_cgroup *tc; 1970 struct task_and_cgroup *tc;
@@ -2050,13 +2037,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2050 /* 2037 /*
2051 * step 1: check that we can legitimately attach to the cgroup. 2038 * step 1: check that we can legitimately attach to the cgroup.
2052 */ 2039 */
2053 for_each_root_subsys(root, ss) { 2040 for_each_css(css, i, cgrp) {
2054 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2041 if (css->ss->can_attach) {
2055 2042 retval = css->ss->can_attach(css, &tset);
2056 if (ss->can_attach) {
2057 retval = ss->can_attach(css, &tset);
2058 if (retval) { 2043 if (retval) {
2059 failed_ss = ss; 2044 failed_css = css;
2060 goto out_cancel_attach; 2045 goto out_cancel_attach;
2061 } 2046 }
2062 } 2047 }
@@ -2092,12 +2077,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2092 /* 2077 /*
2093 * step 4: do subsystem attach callbacks. 2078 * step 4: do subsystem attach callbacks.
2094 */ 2079 */
2095 for_each_root_subsys(root, ss) { 2080 for_each_css(css, i, cgrp)
2096 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2081 if (css->ss->attach)
2097 2082 css->ss->attach(css, &tset);
2098 if (ss->attach)
2099 ss->attach(css, &tset);
2100 }
2101 2083
2102 /* 2084 /*
2103 * step 5: success! and cleanup 2085 * step 5: success! and cleanup
@@ -2114,13 +2096,11 @@ out_put_css_set_refs:
2114 } 2096 }
2115out_cancel_attach: 2097out_cancel_attach:
2116 if (retval) { 2098 if (retval) {
2117 for_each_root_subsys(root, ss) { 2099 for_each_css(css, i, cgrp) {
2118 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2100 if (css == failed_css)
2119
2120 if (ss == failed_ss)
2121 break; 2101 break;
2122 if (ss->cancel_attach) 2102 if (css->ss->cancel_attach)
2123 ss->cancel_attach(css, &tset); 2103 css->ss->cancel_attach(css, &tset);
2124 } 2104 }
2125 } 2105 }
2126out_free_group_list: 2106out_free_group_list:
@@ -2148,7 +2128,7 @@ retry_find_task:
2148 tsk = find_task_by_vpid(pid); 2128 tsk = find_task_by_vpid(pid);
2149 if (!tsk) { 2129 if (!tsk) {
2150 rcu_read_unlock(); 2130 rcu_read_unlock();
2151 ret= -ESRCH; 2131 ret = -ESRCH;
2152 goto out_unlock_cgroup; 2132 goto out_unlock_cgroup;
2153 } 2133 }
2154 /* 2134 /*
@@ -2260,10 +2240,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2260 return 0; 2240 return 0;
2261} 2241}
2262 2242
2263static int cgroup_release_agent_show(struct cgroup_subsys_state *css, 2243static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2264 struct cftype *cft, struct seq_file *seq)
2265{ 2244{
2266 struct cgroup *cgrp = css->cgroup; 2245 struct cgroup *cgrp = seq_css(seq)->cgroup;
2267 2246
2268 if (!cgroup_lock_live_group(cgrp)) 2247 if (!cgroup_lock_live_group(cgrp))
2269 return -ENODEV; 2248 return -ENODEV;
@@ -2273,174 +2252,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2273 return 0; 2252 return 0;
2274} 2253}
2275 2254
2276static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, 2255static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2277 struct cftype *cft, struct seq_file *seq)
2278{ 2256{
2279 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); 2257 struct cgroup *cgrp = seq_css(seq)->cgroup;
2258
2259 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2280 return 0; 2260 return 0;
2281} 2261}
2282 2262
2283/* A buffer size big enough for numbers or short strings */ 2263/* A buffer size big enough for numbers or short strings */
2284#define CGROUP_LOCAL_BUFFER_SIZE 64 2264#define CGROUP_LOCAL_BUFFER_SIZE 64
2285 2265
2286static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, 2266static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2287 struct cftype *cft, struct file *file, 2267 size_t nbytes, loff_t *ppos)
2288 const char __user *userbuf, size_t nbytes,
2289 loff_t *unused_ppos)
2290{ 2268{
2291 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2269 struct cfent *cfe = __d_cfe(file->f_dentry);
2292 int retval = 0; 2270 struct cftype *cft = __d_cft(file->f_dentry);
2293 char *end; 2271 struct cgroup_subsys_state *css = cfe->css;
2272 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2273 char *buf;
2274 int ret;
2294 2275
2295 if (!nbytes) 2276 if (nbytes >= max_bytes)
2296 return -EINVAL;
2297 if (nbytes >= sizeof(buffer))
2298 return -E2BIG; 2277 return -E2BIG;
2299 if (copy_from_user(buffer, userbuf, nbytes))
2300 return -EFAULT;
2301 2278
2302 buffer[nbytes] = 0; /* nul-terminate */ 2279 buf = kmalloc(nbytes + 1, GFP_KERNEL);
2303 if (cft->write_u64) { 2280 if (!buf)
2304 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2281 return -ENOMEM;
2305 if (*end) 2282
2306 return -EINVAL; 2283 if (copy_from_user(buf, userbuf, nbytes)) {
2307 retval = cft->write_u64(css, cft, val); 2284 ret = -EFAULT;
2285 goto out_free;
2286 }
2287
2288 buf[nbytes] = '\0';
2289
2290 if (cft->write_string) {
2291 ret = cft->write_string(css, cft, strstrip(buf));
2292 } else if (cft->write_u64) {
2293 unsigned long long v;
2294 ret = kstrtoull(buf, 0, &v);
2295 if (!ret)
2296 ret = cft->write_u64(css, cft, v);
2297 } else if (cft->write_s64) {
2298 long long v;
2299 ret = kstrtoll(buf, 0, &v);
2300 if (!ret)
2301 ret = cft->write_s64(css, cft, v);
2302 } else if (cft->trigger) {
2303 ret = cft->trigger(css, (unsigned int)cft->private);
2308 } else { 2304 } else {
2309 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2305 ret = -EINVAL;
2310 if (*end)
2311 return -EINVAL;
2312 retval = cft->write_s64(css, cft, val);
2313 } 2306 }
2314 if (!retval) 2307out_free:
2315 retval = nbytes; 2308 kfree(buf);
2316 return retval; 2309 return ret ?: nbytes;
2317} 2310}
2318 2311
2319static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, 2312/*
2320 struct cftype *cft, struct file *file, 2313 * seqfile ops/methods for returning structured data. Currently just
2321 const char __user *userbuf, size_t nbytes, 2314 * supports string->u64 maps, but can be extended in future.
2322 loff_t *unused_ppos) 2315 */
2316
2317static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2323{ 2318{
2324 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2319 struct cftype *cft = seq_cft(seq);
2325 int retval = 0;
2326 size_t max_bytes = cft->max_write_len;
2327 char *buffer = local_buffer;
2328 2320
2329 if (!max_bytes) 2321 if (cft->seq_start) {
2330 max_bytes = sizeof(local_buffer) - 1; 2322 return cft->seq_start(seq, ppos);
2331 if (nbytes >= max_bytes) 2323 } else {
2332 return -E2BIG; 2324 /*
2333 /* Allocate a dynamic buffer if we need one */ 2325 * The same behavior and code as single_open(). Returns
2334 if (nbytes >= sizeof(local_buffer)) { 2326 * !NULL if pos is at the beginning; otherwise, NULL.
2335 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 2327 */
2336 if (buffer == NULL) 2328 return NULL + !*ppos;
2337 return -ENOMEM;
2338 }
2339 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2340 retval = -EFAULT;
2341 goto out;
2342 } 2329 }
2343
2344 buffer[nbytes] = 0; /* nul-terminate */
2345 retval = cft->write_string(css, cft, strstrip(buffer));
2346 if (!retval)
2347 retval = nbytes;
2348out:
2349 if (buffer != local_buffer)
2350 kfree(buffer);
2351 return retval;
2352} 2330}
2353 2331
2354static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2332static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2355 size_t nbytes, loff_t *ppos)
2356{ 2333{
2357 struct cfent *cfe = __d_cfe(file->f_dentry); 2334 struct cftype *cft = seq_cft(seq);
2358 struct cftype *cft = __d_cft(file->f_dentry);
2359 struct cgroup_subsys_state *css = cfe->css;
2360 2335
2361 if (cft->write) 2336 if (cft->seq_next) {
2362 return cft->write(css, cft, file, buf, nbytes, ppos); 2337 return cft->seq_next(seq, v, ppos);
2363 if (cft->write_u64 || cft->write_s64) 2338 } else {
2364 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); 2339 /*
2365 if (cft->write_string) 2340 * The same behavior and code as single_open(), always
2366 return cgroup_write_string(css, cft, file, buf, nbytes, ppos); 2341 * terminate after the initial read.
2367 if (cft->trigger) { 2342 */
2368 int ret = cft->trigger(css, (unsigned int)cft->private); 2343 ++*ppos;
2369 return ret ? ret : nbytes; 2344 return NULL;
2370 } 2345 }
2371 return -EINVAL;
2372} 2346}
2373 2347
2374static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, 2348static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2375 struct cftype *cft, struct file *file,
2376 char __user *buf, size_t nbytes, loff_t *ppos)
2377{ 2349{
2378 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2350 struct cftype *cft = seq_cft(seq);
2379 u64 val = cft->read_u64(css, cft);
2380 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2381 2351
2382 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2352 if (cft->seq_stop)
2353 cft->seq_stop(seq, v);
2383} 2354}
2384 2355
2385static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, 2356static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2386 struct cftype *cft, struct file *file,
2387 char __user *buf, size_t nbytes, loff_t *ppos)
2388{ 2357{
2389 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2358 struct cftype *cft = seq_cft(m);
2390 s64 val = cft->read_s64(css, cft); 2359 struct cgroup_subsys_state *css = seq_css(m);
2391 int len = sprintf(tmp, "%lld\n", (long long) val);
2392 2360
2393 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2361 if (cft->seq_show)
2394} 2362 return cft->seq_show(m, arg);
2395 2363
2396static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2397 size_t nbytes, loff_t *ppos)
2398{
2399 struct cfent *cfe = __d_cfe(file->f_dentry);
2400 struct cftype *cft = __d_cft(file->f_dentry);
2401 struct cgroup_subsys_state *css = cfe->css;
2402
2403 if (cft->read)
2404 return cft->read(css, cft, file, buf, nbytes, ppos);
2405 if (cft->read_u64) 2364 if (cft->read_u64)
2406 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); 2365 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2407 if (cft->read_s64) 2366 else if (cft->read_s64)
2408 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); 2367 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2409 return -EINVAL; 2368 else
2410} 2369 return -EINVAL;
2411 2370 return 0;
2412/*
2413 * seqfile ops/methods for returning structured data. Currently just
2414 * supports string->u64 maps, but can be extended in future.
2415 */
2416
2417static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2418{
2419 struct seq_file *sf = cb->state;
2420 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2421}
2422
2423static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2424{
2425 struct cfent *cfe = m->private;
2426 struct cftype *cft = cfe->type;
2427 struct cgroup_subsys_state *css = cfe->css;
2428
2429 if (cft->read_map) {
2430 struct cgroup_map_cb cb = {
2431 .fill = cgroup_map_add,
2432 .state = m,
2433 };
2434 return cft->read_map(css, cft, &cb);
2435 }
2436 return cft->read_seq_string(css, cft, m);
2437} 2371}
2438 2372
2439static const struct file_operations cgroup_seqfile_operations = { 2373static struct seq_operations cgroup_seq_operations = {
2440 .read = seq_read, 2374 .start = cgroup_seqfile_start,
2441 .write = cgroup_file_write, 2375 .next = cgroup_seqfile_next,
2442 .llseek = seq_lseek, 2376 .stop = cgroup_seqfile_stop,
2443 .release = cgroup_file_release, 2377 .show = cgroup_seqfile_show,
2444}; 2378};
2445 2379
2446static int cgroup_file_open(struct inode *inode, struct file *file) 2380static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2449,6 +2383,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2449 struct cftype *cft = __d_cft(file->f_dentry); 2383 struct cftype *cft = __d_cft(file->f_dentry);
2450 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2384 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2451 struct cgroup_subsys_state *css; 2385 struct cgroup_subsys_state *css;
2386 struct cgroup_open_file *of;
2452 int err; 2387 int err;
2453 2388
2454 err = generic_file_open(inode, file); 2389 err = generic_file_open(inode, file);
@@ -2478,32 +2413,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2478 WARN_ON_ONCE(cfe->css && cfe->css != css); 2413 WARN_ON_ONCE(cfe->css && cfe->css != css);
2479 cfe->css = css; 2414 cfe->css = css;
2480 2415
2481 if (cft->read_map || cft->read_seq_string) { 2416 of = __seq_open_private(file, &cgroup_seq_operations,
2482 file->f_op = &cgroup_seqfile_operations; 2417 sizeof(struct cgroup_open_file));
2483 err = single_open(file, cgroup_seqfile_show, cfe); 2418 if (of) {
2484 } else if (cft->open) { 2419 of->cfe = cfe;
2485 err = cft->open(inode, file); 2420 return 0;
2486 } 2421 }
2487 2422
2488 if (css->ss && err) 2423 if (css->ss)
2489 css_put(css); 2424 css_put(css);
2490 return err; 2425 return -ENOMEM;
2491} 2426}
2492 2427
2493static int cgroup_file_release(struct inode *inode, struct file *file) 2428static int cgroup_file_release(struct inode *inode, struct file *file)
2494{ 2429{
2495 struct cfent *cfe = __d_cfe(file->f_dentry); 2430 struct cfent *cfe = __d_cfe(file->f_dentry);
2496 struct cftype *cft = __d_cft(file->f_dentry);
2497 struct cgroup_subsys_state *css = cfe->css; 2431 struct cgroup_subsys_state *css = cfe->css;
2498 int ret = 0;
2499 2432
2500 if (cft->release)
2501 ret = cft->release(inode, file);
2502 if (css->ss) 2433 if (css->ss)
2503 css_put(css); 2434 css_put(css);
2504 if (file->f_op == &cgroup_seqfile_operations) 2435 return seq_release_private(inode, file);
2505 single_release(inode, file);
2506 return ret;
2507} 2436}
2508 2437
2509/* 2438/*
@@ -2614,7 +2543,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2614} 2543}
2615 2544
2616static const struct file_operations cgroup_file_operations = { 2545static const struct file_operations cgroup_file_operations = {
2617 .read = cgroup_file_read, 2546 .read = seq_read,
2618 .write = cgroup_file_write, 2547 .write = cgroup_file_write,
2619 .llseek = generic_file_llseek, 2548 .llseek = generic_file_llseek,
2620 .open = cgroup_file_open, 2549 .open = cgroup_file_open,
@@ -2639,16 +2568,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2639 .removexattr = cgroup_removexattr, 2568 .removexattr = cgroup_removexattr,
2640}; 2569};
2641 2570
2642/*
2643 * Check if a file is a control file
2644 */
2645static inline struct cftype *__file_cft(struct file *file)
2646{
2647 if (file_inode(file)->i_fop != &cgroup_file_operations)
2648 return ERR_PTR(-EINVAL);
2649 return __d_cft(file->f_dentry);
2650}
2651
2652static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2571static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2653 struct super_block *sb) 2572 struct super_block *sb)
2654{ 2573{
@@ -2706,12 +2625,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2706 if (cft->mode) 2625 if (cft->mode)
2707 return cft->mode; 2626 return cft->mode;
2708 2627
2709 if (cft->read || cft->read_u64 || cft->read_s64 || 2628 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
2710 cft->read_map || cft->read_seq_string)
2711 mode |= S_IRUGO; 2629 mode |= S_IRUGO;
2712 2630
2713 if (cft->write || cft->write_u64 || cft->write_s64 || 2631 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
2714 cft->write_string || cft->trigger) 2632 cft->trigger)
2715 mode |= S_IWUSR; 2633 mode |= S_IWUSR;
2716 2634
2717 return mode; 2635 return mode;
@@ -3007,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void)
3007 * @parent_css: css whose children to walk 2925 * @parent_css: css whose children to walk
3008 * 2926 *
3009 * This function returns the next child of @parent_css and should be called 2927 * This function returns the next child of @parent_css and should be called
3010 * under RCU read lock. The only requirement is that @parent_css and 2928 * under either cgroup_mutex or RCU read lock. The only requirement is
3011 * @pos_css are accessible. The next sibling is guaranteed to be returned 2929 * that @parent_css and @pos_css are accessible. The next sibling is
3012 * regardless of their states. 2930 * guaranteed to be returned regardless of their states.
3013 */ 2931 */
3014struct cgroup_subsys_state * 2932struct cgroup_subsys_state *
3015css_next_child(struct cgroup_subsys_state *pos_css, 2933css_next_child(struct cgroup_subsys_state *pos_css,
@@ -3019,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
3019 struct cgroup *cgrp = parent_css->cgroup; 2937 struct cgroup *cgrp = parent_css->cgroup;
3020 struct cgroup *next; 2938 struct cgroup *next;
3021 2939
3022 WARN_ON_ONCE(!rcu_read_lock_held()); 2940 cgroup_assert_mutex_or_rcu_locked();
3023 2941
3024 /* 2942 /*
3025 * @pos could already have been removed. Once a cgroup is removed, 2943 * @pos could already have been removed. Once a cgroup is removed,
@@ -3066,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child);
3066 * to visit for pre-order traversal of @root's descendants. @root is 2984 * to visit for pre-order traversal of @root's descendants. @root is
3067 * included in the iteration and the first node to be visited. 2985 * included in the iteration and the first node to be visited.
3068 * 2986 *
3069 * While this function requires RCU read locking, it doesn't require the 2987 * While this function requires cgroup_mutex or RCU read locking, it
3070 * whole traversal to be contained in a single RCU critical section. This 2988 * doesn't require the whole traversal to be contained in a single critical
3071 * function will return the correct next descendant as long as both @pos 2989 * section. This function will return the correct next descendant as long
3072 * and @root are accessible and @pos is a descendant of @root. 2990 * as both @pos and @root are accessible and @pos is a descendant of @root.
3073 */ 2991 */
3074struct cgroup_subsys_state * 2992struct cgroup_subsys_state *
3075css_next_descendant_pre(struct cgroup_subsys_state *pos, 2993css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -3077,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3077{ 2995{
3078 struct cgroup_subsys_state *next; 2996 struct cgroup_subsys_state *next;
3079 2997
3080 WARN_ON_ONCE(!rcu_read_lock_held()); 2998 cgroup_assert_mutex_or_rcu_locked();
3081 2999
3082 /* if first iteration, visit @root */ 3000 /* if first iteration, visit @root */
3083 if (!pos) 3001 if (!pos)
@@ -3108,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3108 * is returned. This can be used during pre-order traversal to skip 3026 * is returned. This can be used during pre-order traversal to skip
3109 * subtree of @pos. 3027 * subtree of @pos.
3110 * 3028 *
3111 * While this function requires RCU read locking, it doesn't require the 3029 * While this function requires cgroup_mutex or RCU read locking, it
3112 * whole traversal to be contained in a single RCU critical section. This 3030 * doesn't require the whole traversal to be contained in a single critical
3113 * function will return the correct rightmost descendant as long as @pos is 3031 * section. This function will return the correct rightmost descendant as
3114 * accessible. 3032 * long as @pos is accessible.
3115 */ 3033 */
3116struct cgroup_subsys_state * 3034struct cgroup_subsys_state *
3117css_rightmost_descendant(struct cgroup_subsys_state *pos) 3035css_rightmost_descendant(struct cgroup_subsys_state *pos)
3118{ 3036{
3119 struct cgroup_subsys_state *last, *tmp; 3037 struct cgroup_subsys_state *last, *tmp;
3120 3038
3121 WARN_ON_ONCE(!rcu_read_lock_held()); 3039 cgroup_assert_mutex_or_rcu_locked();
3122 3040
3123 do { 3041 do {
3124 last = pos; 3042 last = pos;
@@ -3154,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
3154 * to visit for post-order traversal of @root's descendants. @root is 3072 * to visit for post-order traversal of @root's descendants. @root is
3155 * included in the iteration and the last node to be visited. 3073 * included in the iteration and the last node to be visited.
3156 * 3074 *
3157 * While this function requires RCU read locking, it doesn't require the 3075 * While this function requires cgroup_mutex or RCU read locking, it
3158 * whole traversal to be contained in a single RCU critical section. This 3076 * doesn't require the whole traversal to be contained in a single critical
3159 * function will return the correct next descendant as long as both @pos 3077 * section. This function will return the correct next descendant as long
3160 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3078 * as both @pos and @cgroup are accessible and @pos is a descendant of
3079 * @cgroup.
3161 */ 3080 */
3162struct cgroup_subsys_state * 3081struct cgroup_subsys_state *
3163css_next_descendant_post(struct cgroup_subsys_state *pos, 3082css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -3165,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3165{ 3084{
3166 struct cgroup_subsys_state *next; 3085 struct cgroup_subsys_state *next;
3167 3086
3168 WARN_ON_ONCE(!rcu_read_lock_held()); 3087 cgroup_assert_mutex_or_rcu_locked();
3169 3088
3170 /* if first iteration, visit leftmost descendant which may be @root */ 3089 /* if first iteration, visit leftmost descendant which may be @root */
3171 if (!pos) 3090 if (!pos)
@@ -3504,14 +3423,12 @@ struct cgroup_pidlist {
3504 pid_t *list; 3423 pid_t *list;
3505 /* how many elements the above list has */ 3424 /* how many elements the above list has */
3506 int length; 3425 int length;
3507 /* how many files are using the current array */
3508 int use_count;
3509 /* each of these stored in a list by its cgroup */ 3426 /* each of these stored in a list by its cgroup */
3510 struct list_head links; 3427 struct list_head links;
3511 /* pointer to the cgroup we belong to, for list removal purposes */ 3428 /* pointer to the cgroup we belong to, for list removal purposes */
3512 struct cgroup *owner; 3429 struct cgroup *owner;
3513 /* protects the other fields */ 3430 /* for delayed destruction */
3514 struct rw_semaphore rwsem; 3431 struct delayed_work destroy_dwork;
3515}; 3432};
3516 3433
3517/* 3434/*
@@ -3527,6 +3444,7 @@ static void *pidlist_allocate(int count)
3527 else 3444 else
3528 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3445 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3529} 3446}
3447
3530static void pidlist_free(void *p) 3448static void pidlist_free(void *p)
3531{ 3449{
3532 if (is_vmalloc_addr(p)) 3450 if (is_vmalloc_addr(p))
@@ -3536,6 +3454,47 @@ static void pidlist_free(void *p)
3536} 3454}
3537 3455
3538/* 3456/*
3457 * Used to destroy all pidlists lingering waiting for destroy timer. None
3458 * should be left afterwards.
3459 */
3460static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3461{
3462 struct cgroup_pidlist *l, *tmp_l;
3463
3464 mutex_lock(&cgrp->pidlist_mutex);
3465 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3466 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3467 mutex_unlock(&cgrp->pidlist_mutex);
3468
3469 flush_workqueue(cgroup_pidlist_destroy_wq);
3470 BUG_ON(!list_empty(&cgrp->pidlists));
3471}
3472
3473static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3474{
3475 struct delayed_work *dwork = to_delayed_work(work);
3476 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3477 destroy_dwork);
3478 struct cgroup_pidlist *tofree = NULL;
3479
3480 mutex_lock(&l->owner->pidlist_mutex);
3481
3482 /*
3483 * Destroy iff we didn't get queued again. The state won't change
3484 * as destroy_dwork can only be queued while locked.
3485 */
3486 if (!delayed_work_pending(dwork)) {
3487 list_del(&l->links);
3488 pidlist_free(l->list);
3489 put_pid_ns(l->key.ns);
3490 tofree = l;
3491 }
3492
3493 mutex_unlock(&l->owner->pidlist_mutex);
3494 kfree(tofree);
3495}
3496
3497/*
3539 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3498 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3540 * Returns the number of unique elements. 3499 * Returns the number of unique elements.
3541 */ 3500 */
@@ -3565,52 +3524,92 @@ after:
3565 return dest; 3524 return dest;
3566} 3525}
3567 3526
3527/*
3528 * The two pid files - task and cgroup.procs - guaranteed that the result
3529 * is sorted, which forced this whole pidlist fiasco. As pid order is
3530 * different per namespace, each namespace needs differently sorted list,
3531 * making it impossible to use, for example, single rbtree of member tasks
3532 * sorted by task pointer. As pidlists can be fairly large, allocating one
3533 * per open file is dangerous, so cgroup had to implement shared pool of
3534 * pidlists keyed by cgroup and namespace.
3535 *
3536 * All this extra complexity was caused by the original implementation
3537 * committing to an entirely unnecessary property. In the long term, we
3538 * want to do away with it. Explicitly scramble sort order if
3539 * sane_behavior so that no such expectation exists in the new interface.
3540 *
3541 * Scrambling is done by swapping every two consecutive bits, which is
3542 * non-identity one-to-one mapping which disturbs sort order sufficiently.
3543 */
3544static pid_t pid_fry(pid_t pid)
3545{
3546 unsigned a = pid & 0x55555555;
3547 unsigned b = pid & 0xAAAAAAAA;
3548
3549 return (a << 1) | (b >> 1);
3550}
3551
3552static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3553{
3554 if (cgroup_sane_behavior(cgrp))
3555 return pid_fry(pid);
3556 else
3557 return pid;
3558}
3559
3568static int cmppid(const void *a, const void *b) 3560static int cmppid(const void *a, const void *b)
3569{ 3561{
3570 return *(pid_t *)a - *(pid_t *)b; 3562 return *(pid_t *)a - *(pid_t *)b;
3571} 3563}
3572 3564
3565static int fried_cmppid(const void *a, const void *b)
3566{
3567 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3568}
3569
3570static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3571 enum cgroup_filetype type)
3572{
3573 struct cgroup_pidlist *l;
3574 /* don't need task_nsproxy() if we're looking at ourself */
3575 struct pid_namespace *ns = task_active_pid_ns(current);
3576
3577 lockdep_assert_held(&cgrp->pidlist_mutex);
3578
3579 list_for_each_entry(l, &cgrp->pidlists, links)
3580 if (l->key.type == type && l->key.ns == ns)
3581 return l;
3582 return NULL;
3583}
3584
3573/* 3585/*
3574 * find the appropriate pidlist for our purpose (given procs vs tasks) 3586 * find the appropriate pidlist for our purpose (given procs vs tasks)
3575 * returns with the lock on that pidlist already held, and takes care 3587 * returns with the lock on that pidlist already held, and takes care
3576 * of the use count, or returns NULL with no locks held if we're out of 3588 * of the use count, or returns NULL with no locks held if we're out of
3577 * memory. 3589 * memory.
3578 */ 3590 */
3579static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3591static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3580 enum cgroup_filetype type) 3592 enum cgroup_filetype type)
3581{ 3593{
3582 struct cgroup_pidlist *l; 3594 struct cgroup_pidlist *l;
3583 /* don't need task_nsproxy() if we're looking at ourself */
3584 struct pid_namespace *ns = task_active_pid_ns(current);
3585 3595
3586 /* 3596 lockdep_assert_held(&cgrp->pidlist_mutex);
3587 * We can't drop the pidlist_mutex before taking the l->rwsem in case 3597
3588 * the last ref-holder is trying to remove l from the list at the same 3598 l = cgroup_pidlist_find(cgrp, type);
3589 * time. Holding the pidlist_mutex precludes somebody taking whichever 3599 if (l)
3590 * list we find out from under us - compare release_pid_array(). 3600 return l;
3591 */ 3601
3592 mutex_lock(&cgrp->pidlist_mutex);
3593 list_for_each_entry(l, &cgrp->pidlists, links) {
3594 if (l->key.type == type && l->key.ns == ns) {
3595 /* make sure l doesn't vanish out from under us */
3596 down_write(&l->rwsem);
3597 mutex_unlock(&cgrp->pidlist_mutex);
3598 return l;
3599 }
3600 }
3601 /* entry not found; create a new one */ 3602 /* entry not found; create a new one */
3602 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3603 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3603 if (!l) { 3604 if (!l)
3604 mutex_unlock(&cgrp->pidlist_mutex);
3605 return l; 3605 return l;
3606 } 3606
3607 init_rwsem(&l->rwsem); 3607 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3608 down_write(&l->rwsem);
3609 l->key.type = type; 3608 l->key.type = type;
3610 l->key.ns = get_pid_ns(ns); 3609 /* don't need task_nsproxy() if we're looking at ourself */
3610 l->key.ns = get_pid_ns(task_active_pid_ns(current));
3611 l->owner = cgrp; 3611 l->owner = cgrp;
3612 list_add(&l->links, &cgrp->pidlists); 3612 list_add(&l->links, &cgrp->pidlists);
3613 mutex_unlock(&cgrp->pidlist_mutex);
3614 return l; 3613 return l;
3615} 3614}
3616 3615
@@ -3627,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3627 struct task_struct *tsk; 3626 struct task_struct *tsk;
3628 struct cgroup_pidlist *l; 3627 struct cgroup_pidlist *l;
3629 3628
3629 lockdep_assert_held(&cgrp->pidlist_mutex);
3630
3630 /* 3631 /*
3631 * If cgroup gets more users after we read count, we won't have 3632 * If cgroup gets more users after we read count, we won't have
3632 * enough space - tough. This race is indistinguishable to the 3633 * enough space - tough. This race is indistinguishable to the
@@ -3653,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3653 css_task_iter_end(&it); 3654 css_task_iter_end(&it);
3654 length = n; 3655 length = n;
3655 /* now sort & (if procs) strip out duplicates */ 3656 /* now sort & (if procs) strip out duplicates */
3656 sort(array, length, sizeof(pid_t), cmppid, NULL); 3657 if (cgroup_sane_behavior(cgrp))
3658 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3659 else
3660 sort(array, length, sizeof(pid_t), cmppid, NULL);
3657 if (type == CGROUP_FILE_PROCS) 3661 if (type == CGROUP_FILE_PROCS)
3658 length = pidlist_uniq(array, length); 3662 length = pidlist_uniq(array, length);
3659 l = cgroup_pidlist_find(cgrp, type); 3663
3664 l = cgroup_pidlist_find_create(cgrp, type);
3660 if (!l) { 3665 if (!l) {
3666 mutex_unlock(&cgrp->pidlist_mutex);
3661 pidlist_free(array); 3667 pidlist_free(array);
3662 return -ENOMEM; 3668 return -ENOMEM;
3663 } 3669 }
3664 /* store array, freeing old if necessary - lock already held */ 3670
3671 /* store array, freeing old if necessary */
3665 pidlist_free(l->list); 3672 pidlist_free(l->list);
3666 l->list = array; 3673 l->list = array;
3667 l->length = length; 3674 l->length = length;
3668 l->use_count++;
3669 up_write(&l->rwsem);
3670 *lp = l; 3675 *lp = l;
3671 return 0; 3676 return 0;
3672} 3677}
@@ -3740,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3740 * after a seek to the start). Use a binary-search to find the 3745 * after a seek to the start). Use a binary-search to find the
3741 * next pid to display, if any 3746 * next pid to display, if any
3742 */ 3747 */
3743 struct cgroup_pidlist *l = s->private; 3748 struct cgroup_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private;
3744 int index = 0, pid = *pos; 3752 int index = 0, pid = *pos;
3745 int *iter; 3753 int *iter, ret;
3754
3755 mutex_lock(&cgrp->pidlist_mutex);
3756
3757 /*
3758 * !NULL @of->priv indicates that this isn't the first start()
3759 * after open. If the matching pidlist is around, we can use that.
3760 * Look for it. Note that @of->priv can't be used directly. It
3761 * could already have been destroyed.
3762 */
3763 if (of->priv)
3764 of->priv = cgroup_pidlist_find(cgrp, type);
3765
3766 /*
3767 * Either this is the first start() after open or the matching
3768 * pidlist has been destroyed inbetween. Create a new one.
3769 */
3770 if (!of->priv) {
3771 ret = pidlist_array_load(cgrp, type,
3772 (struct cgroup_pidlist **)&of->priv);
3773 if (ret)
3774 return ERR_PTR(ret);
3775 }
3776 l = of->priv;
3746 3777
3747 down_read(&l->rwsem);
3748 if (pid) { 3778 if (pid) {
3749 int end = l->length; 3779 int end = l->length;
3750 3780
3751 while (index < end) { 3781 while (index < end) {
3752 int mid = (index + end) / 2; 3782 int mid = (index + end) / 2;
3753 if (l->list[mid] == pid) { 3783 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3754 index = mid; 3784 index = mid;
3755 break; 3785 break;
3756 } else if (l->list[mid] <= pid) 3786 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3757 index = mid + 1; 3787 index = mid + 1;
3758 else 3788 else
3759 end = mid; 3789 end = mid;
@@ -3764,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3764 return NULL; 3794 return NULL;
3765 /* Update the abstract position to be the actual pid that we found */ 3795 /* Update the abstract position to be the actual pid that we found */
3766 iter = l->list + index; 3796 iter = l->list + index;
3767 *pos = *iter; 3797 *pos = cgroup_pid_fry(cgrp, *iter);
3768 return iter; 3798 return iter;
3769} 3799}
3770 3800
3771static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3801static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3772{ 3802{
3773 struct cgroup_pidlist *l = s->private; 3803 struct cgroup_open_file *of = s->private;
3774 up_read(&l->rwsem); 3804 struct cgroup_pidlist *l = of->priv;
3805
3806 if (l)
3807 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3808 CGROUP_PIDLIST_DESTROY_DELAY);
3809 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3775} 3810}
3776 3811
3777static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3778{ 3813{
3779 struct cgroup_pidlist *l = s->private; 3814 struct cgroup_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv;
3780 pid_t *p = v; 3816 pid_t *p = v;
3781 pid_t *end = l->list + l->length; 3817 pid_t *end = l->list + l->length;
3782 /* 3818 /*
@@ -3787,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3787 if (p >= end) { 3823 if (p >= end) {
3788 return NULL; 3824 return NULL;
3789 } else { 3825 } else {
3790 *pos = *p; 3826 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3791 return p; 3827 return p;
3792 } 3828 }
3793} 3829}
@@ -3808,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
3808 .show = cgroup_pidlist_show, 3844 .show = cgroup_pidlist_show,
3809}; 3845};
3810 3846
3811static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3812{
3813 /*
3814 * the case where we're the last user of this particular pidlist will
3815 * have us remove it from the cgroup's list, which entails taking the
3816 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3817 * pidlist_mutex, we have to take pidlist_mutex first.
3818 */
3819 mutex_lock(&l->owner->pidlist_mutex);
3820 down_write(&l->rwsem);
3821 BUG_ON(!l->use_count);
3822 if (!--l->use_count) {
3823 /* we're the last user if refcount is 0; remove and free */
3824 list_del(&l->links);
3825 mutex_unlock(&l->owner->pidlist_mutex);
3826 pidlist_free(l->list);
3827 put_pid_ns(l->key.ns);
3828 up_write(&l->rwsem);
3829 kfree(l);
3830 return;
3831 }
3832 mutex_unlock(&l->owner->pidlist_mutex);
3833 up_write(&l->rwsem);
3834}
3835
3836static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3837{
3838 struct cgroup_pidlist *l;
3839 if (!(file->f_mode & FMODE_READ))
3840 return 0;
3841 /*
3842 * the seq_file will only be initialized if the file was opened for
3843 * reading; hence we check if it's not null only in that case.
3844 */
3845 l = ((struct seq_file *)file->private_data)->private;
3846 cgroup_release_pid_array(l);
3847 return seq_release(inode, file);
3848}
3849
3850static const struct file_operations cgroup_pidlist_operations = {
3851 .read = seq_read,
3852 .llseek = seq_lseek,
3853 .write = cgroup_file_write,
3854 .release = cgroup_pidlist_release,
3855};
3856
3857/*
3858 * The following functions handle opens on a file that displays a pidlist
3859 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3860 * in the cgroup.
3861 */
3862/* helper function for the two below it */
3863static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3864{
3865 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3866 struct cgroup_pidlist *l;
3867 int retval;
3868
3869 /* Nothing to do for write-only files */
3870 if (!(file->f_mode & FMODE_READ))
3871 return 0;
3872
3873 /* have the array populated */
3874 retval = pidlist_array_load(cgrp, type, &l);
3875 if (retval)
3876 return retval;
3877 /* configure file information */
3878 file->f_op = &cgroup_pidlist_operations;
3879
3880 retval = seq_open(file, &cgroup_pidlist_seq_operations);
3881 if (retval) {
3882 cgroup_release_pid_array(l);
3883 return retval;
3884 }
3885 ((struct seq_file *)file->private_data)->private = l;
3886 return 0;
3887}
3888static int cgroup_tasks_open(struct inode *unused, struct file *file)
3889{
3890 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3891}
3892static int cgroup_procs_open(struct inode *unused, struct file *file)
3893{
3894 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3895}
3896
3897static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3847static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3898 struct cftype *cft) 3848 struct cftype *cft)
3899{ 3849{
@@ -3928,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp)
3928 deactivate_super(sb); 3878 deactivate_super(sb);
3929} 3879}
3930 3880
3931/*
3932 * Unregister event and free resources.
3933 *
3934 * Gets called from workqueue.
3935 */
3936static void cgroup_event_remove(struct work_struct *work)
3937{
3938 struct cgroup_event *event = container_of(work, struct cgroup_event,
3939 remove);
3940 struct cgroup_subsys_state *css = event->css;
3941
3942 remove_wait_queue(event->wqh, &event->wait);
3943
3944 event->cft->unregister_event(css, event->cft, event->eventfd);
3945
3946 /* Notify userspace the event is going away. */
3947 eventfd_signal(event->eventfd, 1);
3948
3949 eventfd_ctx_put(event->eventfd);
3950 kfree(event);
-        css_put(css);
-}
-
-/*
- * Gets called on POLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
-                int sync, void *key)
-{
-        struct cgroup_event *event = container_of(wait,
-                        struct cgroup_event, wait);
-        struct cgroup *cgrp = event->css->cgroup;
-        unsigned long flags = (unsigned long)key;
-
-        if (flags & POLLHUP) {
-                /*
-                 * If the event has been detached at cgroup removal, we
-                 * can simply return knowing the other side will cleanup
-                 * for us.
-                 *
-                 * We can't race against event freeing since the other
-                 * side will require wqh->lock via remove_wait_queue(),
-                 * which we hold.
-                 */
-                spin_lock(&cgrp->event_list_lock);
-                if (!list_empty(&event->list)) {
-                        list_del_init(&event->list);
-                        /*
-                         * We are in atomic context, but cgroup_event_remove()
-                         * may sleep, so we have to call it in workqueue.
-                         */
-                        schedule_work(&event->remove);
-                }
-                spin_unlock(&cgrp->event_list_lock);
-        }
-
-        return 0;
-}
-
-static void cgroup_event_ptable_queue_proc(struct file *file,
-                wait_queue_head_t *wqh, poll_table *pt)
-{
-        struct cgroup_event *event = container_of(pt,
-                        struct cgroup_event, pt);
-
-        event->wqh = wqh;
-        add_wait_queue(wqh, &event->wait);
-}
-
-/*
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
-static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
-                                      struct cftype *cft, const char *buffer)
-{
-        struct cgroup *cgrp = dummy_css->cgroup;
-        struct cgroup_event *event;
-        struct cgroup_subsys_state *cfile_css;
-        unsigned int efd, cfd;
-        struct fd efile;
-        struct fd cfile;
-        char *endp;
-        int ret;
-
-        efd = simple_strtoul(buffer, &endp, 10);
-        if (*endp != ' ')
-                return -EINVAL;
-        buffer = endp + 1;
-
-        cfd = simple_strtoul(buffer, &endp, 10);
-        if ((*endp != ' ') && (*endp != '\0'))
-                return -EINVAL;
-        buffer = endp + 1;
-
-        event = kzalloc(sizeof(*event), GFP_KERNEL);
-        if (!event)
-                return -ENOMEM;
-
-        INIT_LIST_HEAD(&event->list);
-        init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
-        init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
-        INIT_WORK(&event->remove, cgroup_event_remove);
-
-        efile = fdget(efd);
-        if (!efile.file) {
-                ret = -EBADF;
-                goto out_kfree;
-        }
-
-        event->eventfd = eventfd_ctx_fileget(efile.file);
-        if (IS_ERR(event->eventfd)) {
-                ret = PTR_ERR(event->eventfd);
-                goto out_put_efile;
-        }
-
-        cfile = fdget(cfd);
-        if (!cfile.file) {
-                ret = -EBADF;
-                goto out_put_eventfd;
-        }
-
-        /* the process need read permission on control file */
-        /* AV: shouldn't we check that it's been opened for read instead? */
-        ret = inode_permission(file_inode(cfile.file), MAY_READ);
-        if (ret < 0)
-                goto out_put_cfile;
-
-        event->cft = __file_cft(cfile.file);
-        if (IS_ERR(event->cft)) {
-                ret = PTR_ERR(event->cft);
-                goto out_put_cfile;
-        }
-
-        if (!event->cft->ss) {
-                ret = -EBADF;
-                goto out_put_cfile;
-        }
-
-        /*
-         * Determine the css of @cfile, verify it belongs to the same
-         * cgroup as cgroup.event_control, and associate @event with it.
-         * Remaining events are automatically removed on cgroup destruction
-         * but the removal is asynchronous, so take an extra ref.
-         */
-        rcu_read_lock();
-
-        ret = -EINVAL;
-        event->css = cgroup_css(cgrp, event->cft->ss);
-        cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
-        if (event->css && event->css == cfile_css && css_tryget(event->css))
-                ret = 0;
-
-        rcu_read_unlock();
-        if (ret)
-                goto out_put_cfile;
-
-        if (!event->cft->register_event || !event->cft->unregister_event) {
-                ret = -EINVAL;
-                goto out_put_css;
-        }
-
-        ret = event->cft->register_event(event->css, event->cft,
-                        event->eventfd, buffer);
-        if (ret)
-                goto out_put_css;
-
-        efile.file->f_op->poll(efile.file, &event->pt);
-
-        spin_lock(&cgrp->event_list_lock);
-        list_add(&event->list, &cgrp->event_list);
-        spin_unlock(&cgrp->event_list_lock);
-
-        fdput(cfile);
-        fdput(efile);
-
-        return 0;
-
-out_put_css:
-        css_put(event->css);
-out_put_cfile:
-        fdput(cfile);
-out_put_eventfd:
-        eventfd_ctx_put(event->eventfd);
-out_put_efile:
-        fdput(efile);
-out_kfree:
-        kfree(event);
-
-        return ret;
-}
-
 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
                                       struct cftype *cft)
 {
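Note on the interface being removed: registration was driven entirely from userspace. A caller created an eventfd, opened a control file, wrote "<event_fd> <control_fd> <args>" to cgroup.event_control (the format named in the comment above), then blocked on the eventfd. A minimal userspace sketch of that flow; the memory-controller mount path, group name, and 4M threshold below are illustrative assumptions, not taken from this patch:

    /* Illustrative only: paths and the threshold argument are made up. */
    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <sys/eventfd.h>

    int main(void)
    {
            int efd = eventfd(0, 0);
            int cfd = open("/sys/fs/cgroup/memory/mygroup/memory.usage_in_bytes",
                           O_RDONLY);
            int ctl = open("/sys/fs/cgroup/memory/mygroup/cgroup.event_control",
                           O_WRONLY);
            char buf[64];
            uint64_t count;

            if (efd < 0 || cfd < 0 || ctl < 0)
                    return 1;

            /* "<event_fd> <control_fd> <args>", as parsed by the code above */
            snprintf(buf, sizeof(buf), "%d %d %d", efd, cfd, 4 * 1024 * 1024);
            if (write(ctl, buf, strlen(buf)) < 0)
                    return 1;

            /* blocks until the event fires or the cgroup is removed */
            read(efd, &count, sizeof(count));
            return 0;
    }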
@@ -4143,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
 static struct cftype cgroup_base_files[] = {
         {
                 .name = "cgroup.procs",
-                .open = cgroup_procs_open,
+                .seq_start = cgroup_pidlist_start,
+                .seq_next = cgroup_pidlist_next,
+                .seq_stop = cgroup_pidlist_stop,
+                .seq_show = cgroup_pidlist_show,
+                .private = CGROUP_FILE_PROCS,
                 .write_u64 = cgroup_procs_write,
-                .release = cgroup_pidlist_release,
                 .mode = S_IRUGO | S_IWUSR,
         },
         {
-                .name = "cgroup.event_control",
-                .write_string = cgroup_write_event_control,
-                .mode = S_IWUGO,
-        },
-        {
                 .name = "cgroup.clone_children",
                 .flags = CFTYPE_INSANE,
                 .read_u64 = cgroup_clone_children_read,
@@ -4162,7 +3914,7 @@ static struct cftype cgroup_base_files[] = {
         {
                 .name = "cgroup.sane_behavior",
                 .flags = CFTYPE_ONLY_ON_ROOT,
-                .read_seq_string = cgroup_sane_behavior_show,
+                .seq_show = cgroup_sane_behavior_show,
         },
 
         /*
@@ -4173,9 +3925,12 @@ static struct cftype cgroup_base_files[] = {
         {
                 .name = "tasks",
                 .flags = CFTYPE_INSANE,         /* use "procs" instead */
-                .open = cgroup_tasks_open,
+                .seq_start = cgroup_pidlist_start,
+                .seq_next = cgroup_pidlist_next,
+                .seq_stop = cgroup_pidlist_stop,
+                .seq_show = cgroup_pidlist_show,
+                .private = CGROUP_FILE_TASKS,
                 .write_u64 = cgroup_tasks_write,
-                .release = cgroup_pidlist_release,
                 .mode = S_IRUGO | S_IWUSR,
         },
         {
@@ -4187,7 +3942,7 @@ static struct cftype cgroup_base_files[] = {
         {
                 .name = "release_agent",
                 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
-                .read_seq_string = cgroup_release_agent_show,
+                .seq_show = cgroup_release_agent_show,
                 .write_string = cgroup_release_agent_write,
                 .max_write_len = PATH_MAX,
         },
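The same conversion recurs for every file in the series: .read_seq_string, which took a css and a cftype, becomes .seq_show, which takes only the seq_file and recovers the css via seq_css(). A minimal sketch of a converted controller file; the "foo" controller and its css_to_foo() accessor are hypothetical:

    /* Hypothetical controller file using the new seq_show interface. */
    static int foo_level_show(struct seq_file *seq, void *v)
    {
            struct cgroup_subsys_state *css = seq_css(seq);

            /* css_to_foo() is an assumed container_of-style accessor */
            seq_printf(seq, "%d\n", css_to_foo(css)->level);
            return 0;
    }

    static struct cftype foo_files[] = {
            {
                    .name = "foo.level",
                    .seq_show = foo_level_show,
            },
            { }     /* terminate */
    };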
@@ -4333,6 +4088,62 @@ static void offline_css(struct cgroup_subsys_state *css)
         RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
 }
 
+/**
+ * create_css - create a cgroup_subsys_state
+ * @cgrp: the cgroup new css will be associated with
+ * @ss: the subsys of new css
+ *
+ * Create a new css associated with @cgrp - @ss pair.  On success, the new
+ * css is online and installed in @cgrp with all interface files created.
+ * Returns 0 on success, -errno on failure.
+ */
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+        struct cgroup *parent = cgrp->parent;
+        struct cgroup_subsys_state *css;
+        int err;
+
+        lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+        lockdep_assert_held(&cgroup_mutex);
+
+        css = ss->css_alloc(cgroup_css(parent, ss));
+        if (IS_ERR(css))
+                return PTR_ERR(css);
+
+        err = percpu_ref_init(&css->refcnt, css_release);
+        if (err)
+                goto err_free;
+
+        init_css(css, ss, cgrp);
+
+        err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
+        if (err)
+                goto err_free;
+
+        err = online_css(css);
+        if (err)
+                goto err_free;
+
+        dget(cgrp->dentry);
+        css_get(css->parent);
+
+        if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+            parent->parent) {
+                pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+                           current->comm, current->pid, ss->name);
+                if (!strcmp(ss->name, "memory"))
+                        pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
+                ss->warned_broken_hierarchy = true;
+        }
+
+        return 0;
+
+err_free:
+        percpu_ref_cancel_init(&css->refcnt);
+        ss->css_free(css);
+        return err;
+}
+
 /*
  * cgroup_create - create a cgroup
  * @parent: cgroup that will be parent of the new cgroup
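create_css() leans on the percpu_ref API for the css lifetime: percpu_ref_init() arms the counter with a release callback, and percpu_ref_cancel_init() undoes a ref that was initialized but never published. A generic sketch of that pairing against the 3.14-era API; struct foo and foo_publish() are hypothetical stand-ins:

    struct foo {
            struct percpu_ref ref;
    };

    static void foo_release(struct percpu_ref *ref)
    {
            kfree(container_of(ref, struct foo, ref));
    }

    static struct foo *foo_create(void)
    {
            struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

            if (!foo)
                    return NULL;

            if (percpu_ref_init(&foo->ref, foo_release))
                    goto err_free;

            if (foo_publish(foo))           /* hypothetical: makes foo visible */
                    goto err_cancel;

            return foo;

    err_cancel:
            /* initialized but never published: cancel rather than kill */
            percpu_ref_cancel_init(&foo->ref);
    err_free:
            kfree(foo);
            return NULL;
    }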
@@ -4344,11 +4155,10 @@ static void offline_css(struct cgroup_subsys_state *css)
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                           umode_t mode)
 {
-        struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
         struct cgroup *cgrp;
         struct cgroup_name *name;
         struct cgroupfs_root *root = parent->root;
-        int err = 0;
+        int ssid, err = 0;
         struct cgroup_subsys *ss;
         struct super_block *sb = root->sb;
 
@@ -4404,23 +4214,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
-        for_each_root_subsys(root, ss) {
-                struct cgroup_subsys_state *css;
-
-                css = ss->css_alloc(cgroup_css(parent, ss));
-                if (IS_ERR(css)) {
-                        err = PTR_ERR(css);
-                        goto err_free_all;
-                }
-                css_ar[ss->subsys_id] = css;
-
-                err = percpu_ref_init(&css->refcnt, css_release);
-                if (err)
-                        goto err_free_all;
-
-                init_css(css, ss, cgrp);
-        }
-
         /*
          * Create directory.  cgroup_create_file() returns with the new
          * directory locked on success so that it can be populated without
@@ -4428,7 +4221,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
          */
         err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
         if (err < 0)
-                goto err_free_all;
+                goto err_unlock;
         lockdep_assert_held(&dentry->d_inode->i_mutex);
 
         cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4440,55 +4233,31 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         /* hold a ref to the parent's dentry */
         dget(parent->dentry);
 
-        /* creation succeeded, notify subsystems */
-        for_each_root_subsys(root, ss) {
-                struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
-                err = online_css(css);
-                if (err)
-                        goto err_destroy;
-
-                /* each css holds a ref to the cgroup's dentry and parent css */
-                dget(dentry);
-                css_get(css->parent);
-
-                /* mark it consumed for error path */
-                css_ar[ss->subsys_id] = NULL;
-
-                if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
-                    parent->parent) {
-                        pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
-                                   current->comm, current->pid, ss->name);
-                        if (!strcmp(ss->name, "memory"))
-                                pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
-                        ss->warned_broken_hierarchy = true;
-                }
-        }
-
+        /*
+         * @cgrp is now fully operational.  If something fails after this
+         * point, it'll be released via the normal destruction path.
+         */
         idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
 
         err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
         if (err)
                 goto err_destroy;
 
-        err = cgroup_populate_dir(cgrp, root->subsys_mask);
-        if (err)
-                goto err_destroy;
+        /* let's create and online css's */
+        for_each_subsys(ss, ssid) {
+                if (root->subsys_mask & (1 << ssid)) {
+                        err = create_css(cgrp, ss);
+                        if (err)
+                                goto err_destroy;
+                }
+        }
 
         mutex_unlock(&cgroup_mutex);
         mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 
         return 0;
 
-err_free_all:
-        for_each_root_subsys(root, ss) {
-                struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
-                if (css) {
-                        percpu_ref_cancel_init(&css->refcnt);
-                        ss->css_free(css);
-                }
-        }
+err_unlock:
         mutex_unlock(&cgroup_mutex);
         /* Release the reference count that we took on the superblock */
         deactivate_super(sb);
@@ -4501,14 +4270,6 @@ err_free_cgrp:
         return err;
 
 err_destroy:
-        for_each_root_subsys(root, ss) {
-                struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
-                if (css) {
-                        percpu_ref_cancel_init(&css->refcnt);
-                        ss->css_free(css);
-                }
-        }
         cgroup_destroy_locked(cgrp);
         mutex_unlock(&cgroup_mutex);
         mutex_unlock(&dentry->d_inode->i_mutex);
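Both surviving error paths follow the usual kernel unwinding idiom: each failure jumps to a label that releases exactly what has been acquired so far, in reverse order, and once the cgroup is "fully operational" cleanup is delegated wholesale to cgroup_destroy_locked(). A generic sketch of the idiom, with hypothetical helpers:

    static int foo_setup(void)
    {
            int err;

            err = foo_alloc();              /* hypothetical steps */
            if (err)
                    return err;

            err = foo_register();
            if (err)
                    goto err_free;

            err = foo_online();
            if (err)
                    goto err_unregister;

            return 0;

    err_unregister:
            foo_unregister();               /* undo in reverse order */
    err_free:
            foo_free();
            return err;
    }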
@@ -4631,10 +4392,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
         struct dentry *d = cgrp->dentry;
-        struct cgroup_event *event, *tmp;
-        struct cgroup_subsys *ss;
+        struct cgroup_subsys_state *css;
         struct cgroup *child;
         bool empty;
+        int ssid;
 
         lockdep_assert_held(&d->d_inode->i_mutex);
         lockdep_assert_held(&cgroup_mutex);
@@ -4670,12 +4431,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
          * will be invoked to perform the rest of destruction once the
          * percpu refs of all css's are confirmed to be killed.
          */
-        for_each_root_subsys(cgrp->root, ss) {
-                struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
-                if (css)
-                        kill_css(css);
-        }
+        for_each_css(css, ssid, cgrp)
+                kill_css(css);
 
         /*
          * Mark @cgrp dead.  This prevents further task migration and child
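As the replacement shows, for_each_css() (introduced by this series) folds the NULL check the old loop did by hand into the iterator: it walks the subsystem slots of @cgrp and skips those with no css. An illustrative (hypothetical) use mirroring the kill loop above:

    static void foo_walk(struct cgroup *cgrp)
    {
            struct cgroup_subsys_state *css;
            int ssid;

            /* visits only the subsystems that actually have a css on @cgrp */
            for_each_css(css, ssid, cgrp)
                    pr_debug("subsys %d: css %p\n", ssid, css);
    }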
@@ -4710,18 +4467,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         dget(d);
         cgroup_d_remove_dir(d);
 
-        /*
-         * Unregister events and notify userspace.
-         * Notify userspace about cgroup removing only after rmdir of cgroup
-         * directory to avoid race between userspace and kernelspace.
-         */
-        spin_lock(&cgrp->event_list_lock);
-        list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-                list_del_init(&event->list);
-                schedule_work(&event->remove);
-        }
-        spin_unlock(&cgrp->event_list_lock);
-
         return 0;
 };
 
@@ -4792,7 +4537,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         cgroup_init_cftsets(ss);
 
         /* Create the top cgroup state for this subsystem */
-        list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
         ss->root = &cgroup_dummy_root;
         css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
         /* We don't handle early failures gracefully */
@@ -4866,6 +4610,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         cgroup_init_cftsets(ss);
 
         mutex_lock(&cgroup_mutex);
+        mutex_lock(&cgroup_root_mutex);
         cgroup_subsys[ss->subsys_id] = ss;
 
         /*
@@ -4877,11 +4622,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         if (IS_ERR(css)) {
                 /* failure case - need to deassign the cgroup_subsys[] slot. */
                 cgroup_subsys[ss->subsys_id] = NULL;
+                mutex_unlock(&cgroup_root_mutex);
                 mutex_unlock(&cgroup_mutex);
                 return PTR_ERR(css);
         }
 
-        list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
         ss->root = &cgroup_dummy_root;
 
         /* our new subsystem will be attached to the dummy hierarchy. */
@@ -4911,14 +4656,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         write_unlock(&css_set_lock);
 
         ret = online_css(css);
-        if (ret)
+        if (ret) {
+                ss->css_free(css);
                 goto err_unload;
+        }
 
         /* success! */
+        mutex_unlock(&cgroup_root_mutex);
         mutex_unlock(&cgroup_mutex);
         return 0;
 
 err_unload:
+        mutex_unlock(&cgroup_root_mutex);
         mutex_unlock(&cgroup_mutex);
         /* @ss can't be mounted here as try_module_get() would fail */
         cgroup_unload_subsys(ss);
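Note the locking discipline the new cgroup_root_mutex call sites establish: it nests strictly inside cgroup_mutex, and every exit path, including the two error paths above, drops the locks in the reverse of acquisition order. Schematically:

    mutex_lock(&cgroup_mutex);
    mutex_lock(&cgroup_root_mutex);         /* nests inside cgroup_mutex */

    /* ... work that may fail ... */

    mutex_unlock(&cgroup_root_mutex);       /* inner lock released first */
    mutex_unlock(&cgroup_mutex);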
@@ -4937,6 +4686,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
 void cgroup_unload_subsys(struct cgroup_subsys *ss)
 {
         struct cgrp_cset_link *link;
+        struct cgroup_subsys_state *css;
 
         BUG_ON(ss->module == NULL);
 
@@ -4948,15 +4698,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
         BUG_ON(ss->root != &cgroup_dummy_root);
 
         mutex_lock(&cgroup_mutex);
+        mutex_lock(&cgroup_root_mutex);
 
-        offline_css(cgroup_css(cgroup_dummy_top, ss));
+        css = cgroup_css(cgroup_dummy_top, ss);
+        if (css)
+                offline_css(css);
 
         /* deassign the subsys_id */
         cgroup_subsys[ss->subsys_id] = NULL;
 
-        /* remove subsystem from the dummy root's list of subsystems */
-        list_del_init(&ss->sibling);
-
         /*
          * disentangle the css from all css_sets attached to the dummy
          * top. as in loading, we need to pay our respects to the hashtable
@@ -4979,9 +4729,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
          * need to free before marking as null because ss->css_free needs
          * the cgrp->subsys pointer to find their state.
          */
-        ss->css_free(cgroup_css(cgroup_dummy_top, ss));
+        if (css)
+                ss->css_free(css);
         RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
 
+        mutex_unlock(&cgroup_root_mutex);
         mutex_unlock(&cgroup_mutex);
 }
 EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
@@ -5100,6 +4852,15 @@ static int __init cgroup_wq_init(void)
          */
         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
         BUG_ON(!cgroup_destroy_wq);
+
+        /*
+         * Used to destroy pidlists and separate to serve as flush domain.
+         * Cap @max_active to 1 too.
+         */
+        cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+                                                    0, 1);
+        BUG_ON(!cgroup_pidlist_destroy_wq);
+
         return 0;
 }
 core_initcall(cgroup_wq_init);
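Capping @max_active at 1 means at most one pidlist-destroy item executes at a time, which is what lets the workqueue double as a flush domain: flush_workqueue() then waits for every queued destruction without interference from unrelated work. A sketch of the same pattern with a hypothetical workqueue:

    static struct workqueue_struct *foo_destroy_wq;     /* hypothetical */

    static int __init foo_wq_init(void)
    {
            /* max_active = 1: items run one at a time; usable as a flush domain */
            foo_destroy_wq = alloc_workqueue("foo_destroy", 0, 1);
            return foo_destroy_wq ? 0 : -ENOMEM;
    }

    static void foo_flush_destroys(void)
    {
            /* returns only after all queued destroy work has finished */
            flush_workqueue(foo_destroy_wq);
    }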
@@ -5143,11 +4904,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
         for_each_active_root(root) {
                 struct cgroup_subsys *ss;
                 struct cgroup *cgrp;
-                int count = 0;
+                int ssid, count = 0;
 
                 seq_printf(m, "%d:", root->hierarchy_id);
-                for_each_root_subsys(root, ss)
-                        seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+                for_each_subsys(ss, ssid)
+                        if (root->subsys_mask & (1 << ssid))
+                                seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
                 if (strlen(root->name))
                         seq_printf(m, "%sname=%s", count ? "," : "",
                                    root->name);
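The loop emits one /proc/<pid>/cgroup record per active hierarchy, of the form "<hierarchy-id>:<comma-separated controllers>[,name=<name>]", with the cgroup path appended later in the function. Illustrative output only; hierarchy numbers and paths below are made up:

    7:devices:/user.slice
    4:memory:/user.slice/app
    1:name=systemd:/user.slice/user-1000.slice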
@@ -5488,16 +5250,16 @@ __setup("cgroup_disable=", cgroup_disable);
  * @dentry: directory dentry of interest
  * @ss: subsystem of interest
  *
- * Must be called under RCU read lock.  The caller is responsible for
- * pinning the returned css if it needs to be accessed outside the RCU
- * critical section.
+ * Must be called under cgroup_mutex or RCU read lock.  The caller is
+ * responsible for pinning the returned css if it needs to be accessed
+ * outside the critical section.
  */
 struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
                                          struct cgroup_subsys *ss)
 {
         struct cgroup *cgrp;
 
-        WARN_ON_ONCE(!rcu_read_lock_held());
+        cgroup_assert_mutex_or_rcu_locked();
 
         /* is @dentry a cgroup dir? */
         if (!dentry->d_inode ||
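The relaxed rule matches how callers actually use the function: either they already hold cgroup_mutex, or they look the css up under rcu_read_lock() and pin it with css_tryget() before leaving the critical section, the same pattern the now-removed event-registration code used. A hypothetical caller sketch for the RCU variant:

    static struct cgroup_subsys_state *foo_pin_css(struct dentry *dentry,
                                                   struct cgroup_subsys *ss)
    {
            struct cgroup_subsys_state *css;

            rcu_read_lock();
            css = css_from_dir(dentry, ss);
            if (!IS_ERR(css) && css_tryget(css)) {
                    rcu_read_unlock();
                    return css;             /* caller must css_put() */
            }
            rcu_read_unlock();
            return NULL;
    }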
@@ -5520,9 +5282,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
         struct cgroup *cgrp;
 
-        rcu_lockdep_assert(rcu_read_lock_held() ||
-                           lockdep_is_held(&cgroup_mutex),
-                           "css_from_id() needs proper protection");
+        cgroup_assert_mutex_or_rcu_locked();
 
         cgrp = idr_find(&ss->root->cgroup_idr, id);
         if (cgrp)
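css_from_id() gets the same treatment: the open-coded rcu_lockdep_assert() collapses into the shared cgroup_assert_mutex_or_rcu_locked() helper. A hypothetical lookup satisfying the assert via RCU:

    static bool foo_css_exists(int id, struct cgroup_subsys *ss)
    {
            struct cgroup_subsys_state *css;

            rcu_read_lock();                /* satisfies the new assert */
            css = css_from_id(id, ss);
            rcu_read_unlock();

            return css != NULL;
    }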
@@ -5570,9 +5330,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
         return count;
 }
 
-static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
-                                         struct cftype *cft,
-                                         struct seq_file *seq)
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 {
         struct cgrp_cset_link *link;
         struct css_set *cset;
@@ -5597,9 +5355,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
 }
 
 #define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct cgroup_subsys_state *css,
-                                 struct cftype *cft, struct seq_file *seq)
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
 {
+        struct cgroup_subsys_state *css = seq_css(seq);
         struct cgrp_cset_link *link;
 
         read_lock(&css_set_lock);
@@ -5645,12 +5403,12 @@ static struct cftype debug_files[] = {
 
         {
                 .name = "current_css_set_cg_links",
-                .read_seq_string = current_css_set_cg_links_read,
+                .seq_show = current_css_set_cg_links_read,
         },
 
         {
                 .name = "cgroup_css_links",
-                .read_seq_string = cgroup_css_links_read,
+                .seq_show = cgroup_css_links_read,
         },
 
         {