Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--	kernel/cgroup.c	1259
1 file changed, 512 insertions(+), 747 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bc1dcabe9217..0c753ddd223b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -41,7 +41,6 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/backing-dev.h>
-#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/magic.h>
 #include <linux/spinlock.h>
@@ -56,15 +55,20 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/eventfd.h>
-#include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
-#include <linux/file.h>
 
 #include <linux/atomic.h>
 
 /*
+ * pidlists linger the following amount before being destroyed.  The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
+
+/*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
  *
@@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex);
 
 static DEFINE_MUTEX(cgroup_root_mutex);
 
+#define cgroup_assert_mutex_or_rcu_locked()				\
+	rcu_lockdep_assert(rcu_read_lock_held() ||			\
+			   lockdep_is_held(&cgroup_mutex),		\
+			   "cgroup_mutex or RCU read lock required");
+
+#ifdef CONFIG_LOCKDEP
+#define cgroup_assert_mutex_or_root_locked()				\
+	WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) &&	\
+				     !lockdep_is_held(&cgroup_root_mutex)))
+#else
+#define cgroup_assert_mutex_or_root_locked()	do { } while (0)
+#endif
+
 /*
  * cgroup destruction makes heavy use of work items and there can be a lot
  * of concurrent destructions.  Use a separate workqueue so that cgroup
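The two assertion macros above follow the usual lockdep pattern: they compile away when lockdep is disabled and splat when the caller holds neither lock. A minimal sketch of a caller, using a hypothetical helper invented purely for illustration (not part of this patch):

	/* Hypothetical: may be called under either cgroup_mutex or an
	 * RCU read lock; the assertion documents and enforces that. */
	static struct cgroup *demo_find_child(struct cgroup *parent, int id)
	{
		struct cgroup *child;

		cgroup_assert_mutex_or_rcu_locked();

		list_for_each_entry_rcu(child, &parent->children, sibling)
			if (child->id == id)
				return child;
		return NULL;
	}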
@@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
 static struct workqueue_struct *cgroup_destroy_wq;
 
 /*
+ * pidlist destructions need to be flushed on cgroup destruction.  Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
  * populated with the built in subsystems, and modular subsystems are
  * registered after that. The mutable section of this array is protected by
@@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root;
 /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
 static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
 
-/*
- * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
- */
-struct cfent {
-	struct list_head		node;
-	struct dentry			*dentry;
-	struct cftype			*type;
-	struct cgroup_subsys_state	*css;
-
-	/* file xattrs */
-	struct simple_xattrs		xattrs;
-};
-
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct cgroup_event {
-	/*
-	 * css which the event belongs to.
-	 */
-	struct cgroup_subsys_state *css;
-	/*
-	 * Control file which the event associated.
-	 */
-	struct cftype *cft;
-	/*
-	 * eventfd to signal userspace about the event.
-	 */
-	struct eventfd_ctx *eventfd;
-	/*
-	 * Each of these stored in a list by the cgroup.
-	 */
-	struct list_head list;
-	/*
-	 * All fields below needed to unregister event when
-	 * userspace closes eventfd.
-	 */
-	poll_table pt;
-	wait_queue_head_t *wqh;
-	wait_queue_t wait;
-	struct work_struct remove;
-};
-
 /* The list of hierarchy roots */
 
 static LIST_HEAD(cgroup_roots);
@@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
 static int cgroup_file_release(struct inode *inode, struct file *file);
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp)
 }
 
 /**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_mutex.
+ */
+#define for_each_css(css, ssid, cgrp)					\
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
+		if (!((css) = rcu_dereference_check(			\
+				(cgrp)->subsys[(ssid)],			\
+				lockdep_is_held(&cgroup_mutex)))) { }	\
+		else
+
+/**
  * for_each_subsys - iterate all loaded cgroup subsystems
  * @ss: the iteration cursor
- * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
  *
- * Should be called under cgroup_mutex.
+ * Iterates through all loaded subsystems.  Should be called under
+ * cgroup_mutex or cgroup_root_mutex.
  */
-#define for_each_subsys(ss, i)						\
-	for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++)			\
-		if (({ lockdep_assert_held(&cgroup_mutex);		\
-		       !((ss) = cgroup_subsys[i]); })) { }		\
+#define for_each_subsys(ss, ssid)					\
+	for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; });	\
+	     (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)			\
+		if (!((ss) = cgroup_subsys[(ssid)])) { }		\
 		else
 
 /**
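For context, both iterators skip empty slots via the if/else trick in the macro, so the loop body only ever sees non-NULL pointers. A typical use of the new for_each_css(), sketched with a debug printout as the body (illustrative only):

	struct cgroup_subsys_state *css;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/* visits only the css's actually attached to @cgrp */
	for_each_css(css, ssid, cgrp)
		pr_debug("ss %d: css %p\n", ssid, css);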
@@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
 	     (((ss) = cgroup_subsys[i]) || true); (i)++)
 
-/* iterate each subsystem attached to a hierarchy */
-#define for_each_root_subsys(root, ss)					\
-	list_for_each_entry((ss), &(root)->subsys_list, sibling)
-
 /* iterate across the active hierarchies */
 #define for_each_active_root(root)					\
 	list_for_each_entry((root), &cgroup_roots, root_list)
@@ -863,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work)
 	 */
 	deactivate_super(cgrp->root->sb);
 
-	/*
-	 * if we're getting rid of the cgroup, refcount should ensure
-	 * that there are no pidlists left.
-	 */
-	BUG_ON(!list_empty(&cgrp->pidlists));
+	cgroup_pidlist_destroy_all(cgrp);
 
 	simple_xattrs_free(&cgrp->xattrs);
 
@@ -897,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	 * per-subsystem and moved to css->id so that lookups are
 	 * successful until the target css is released.
 	 */
+	mutex_lock(&cgroup_mutex);
 	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+	mutex_unlock(&cgroup_mutex);
 	cgrp->id = -1;
 
 	call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
@@ -1050,7 +1041,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 							cgroup_css(cgroup_dummy_top, ss));
 			cgroup_css(cgrp, ss)->cgroup = cgrp;
 
-			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(cgroup_css(cgrp, ss));
@@ -1069,7 +1059,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			RCU_INIT_POINTER(cgrp->subsys[i], NULL);
 
 			cgroup_subsys[i]->root = &cgroup_dummy_root;
-			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
 
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
@@ -1096,10 +1085,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 {
 	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
 	struct cgroup_subsys *ss;
+	int ssid;
 
 	mutex_lock(&cgroup_root_mutex);
-	for_each_root_subsys(root, ss)
-		seq_printf(seq, ",%s", ss->name);
+	for_each_subsys(ss, ssid)
+		if (root->subsys_mask & (1 << ssid))
+			seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
 		seq_puts(seq, ",sane_behavior");
 	if (root->flags & CGRP_ROOT_NOPREFIX)
@@ -1362,8 +1353,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
 	cgrp->dummy_css.cgroup = cgrp;
-	INIT_LIST_HEAD(&cgrp->event_list);
-	spin_lock_init(&cgrp->event_list_lock);
 	simple_xattrs_init(&cgrp->xattrs);
 }
 
@@ -1371,7 +1360,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
 
-	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
@@ -1580,10 +1568,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
 
-	root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
-				  0, 1, GFP_KERNEL);
-	if (root_cgrp->id < 0)
+	ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+	if (ret < 0)
 		goto unlock_drop;
+	root_cgrp->id = ret;
 
 	/* Check for name clashes with existing mounts */
 	ret = -EBUSY;
@@ -1693,7 +1681,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	return ERR_PTR(ret);
 }
 
-static void cgroup_kill_sb(struct super_block *sb) {
+static void cgroup_kill_sb(struct super_block *sb)
+{
 	struct cgroupfs_root *root = sb->s_fs_info;
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgrp_cset_link *link, *tmp_link;
@@ -1976,8 +1965,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 			      bool threadgroup)
 {
 	int retval, i, group_size;
-	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroupfs_root *root = cgrp->root;
+	struct cgroup_subsys_state *css, *failed_css = NULL;
 	/* threadgroup list cursor and array */
 	struct task_struct *leader = tsk;
 	struct task_and_cgroup *tc;
@@ -2050,13 +2039,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	/*
 	 * step 1: check that we can legitimately attach to the cgroup.
 	 */
-	for_each_root_subsys(root, ss) {
-		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
-		if (ss->can_attach) {
-			retval = ss->can_attach(css, &tset);
+	for_each_css(css, i, cgrp) {
+		if (css->ss->can_attach) {
+			retval = css->ss->can_attach(css, &tset);
 			if (retval) {
-				failed_ss = ss;
+				failed_css = css;
 				goto out_cancel_attach;
 			}
 		}
@@ -2092,12 +2079,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	/*
 	 * step 4: do subsystem attach callbacks.
 	 */
-	for_each_root_subsys(root, ss) {
-		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
-		if (ss->attach)
-			ss->attach(css, &tset);
-	}
+	for_each_css(css, i, cgrp)
+		if (css->ss->attach)
+			css->ss->attach(css, &tset);
 
 	/*
 	 * step 5: success! and cleanup
@@ -2114,13 +2098,11 @@ out_put_css_set_refs:
 	}
 out_cancel_attach:
 	if (retval) {
-		for_each_root_subsys(root, ss) {
-			struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
-			if (ss == failed_ss)
+		for_each_css(css, i, cgrp) {
+			if (css == failed_css)
 				break;
-			if (ss->cancel_attach)
-				ss->cancel_attach(css, &tset);
+			if (css->ss->cancel_attach)
+				css->ss->cancel_attach(css, &tset);
 		}
 	}
 out_free_group_list:
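Taken together, the three hunks above preserve the usual two-phase protocol: every css may veto in ->can_attach(), and on failure only the css's that already agreed are rolled back. In outline (a simplification of the code above, not new behavior):

	for_each_css(css, i, cgrp) {		/* phase 1: ask everyone */
		if (css->ss->can_attach) {
			retval = css->ss->can_attach(css, &tset);
			if (retval) {
				failed_css = css;	/* remember how far we got */
				goto out_cancel_attach;
			}
		}
	}
	/* ... migrate the tasks, then call ->attach() on every css ... */
	out_cancel_attach:
	for_each_css(css, i, cgrp) {
		if (css == failed_css)
			break;			/* the rest were never asked */
		if (css->ss->cancel_attach)
			css->ss->cancel_attach(css, &tset);
	}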
@@ -2148,7 +2130,7 @@ retry_find_task:
 	tsk = find_task_by_vpid(pid);
 	if (!tsk) {
 		rcu_read_unlock();
-		ret= -ESRCH;
+		ret = -ESRCH;
 		goto out_unlock_cgroup;
 	}
 	/*
@@ -2260,10 +2242,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
 	return 0;
 }
 
-static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
-				     struct cftype *cft, struct seq_file *seq)
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
 {
-	struct cgroup *cgrp = css->cgroup;
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
 
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
@@ -2273,174 +2254,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
 	return 0;
 }
 
-static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
-				     struct cftype *cft, struct seq_file *seq)
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
 {
-	seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
 	return 0;
 }
 
 /* A buffer size big enough for numbers or short strings */
 #define CGROUP_LOCAL_BUFFER_SIZE 64
 
-static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
-				struct cftype *cft, struct file *file,
-				const char __user *userbuf, size_t nbytes,
-				loff_t *unused_ppos)
+static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
+				 size_t nbytes, loff_t *ppos)
 {
-	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
-	int retval = 0;
-	char *end;
+	struct cfent *cfe = __d_cfe(file->f_dentry);
+	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cgroup_subsys_state *css = cfe->css;
+	size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
+	char *buf;
+	int ret;
 
-	if (!nbytes)
-		return -EINVAL;
-	if (nbytes >= sizeof(buffer))
+	if (nbytes >= max_bytes)
 		return -E2BIG;
-	if (copy_from_user(buffer, userbuf, nbytes))
-		return -EFAULT;
 
-	buffer[nbytes] = 0;	/* nul-terminate */
-	if (cft->write_u64) {
-		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
-		if (*end)
-			return -EINVAL;
-		retval = cft->write_u64(css, cft, val);
+	buf = kmalloc(nbytes + 1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, userbuf, nbytes)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	buf[nbytes] = '\0';
+
+	if (cft->write_string) {
+		ret = cft->write_string(css, cft, strstrip(buf));
+	} else if (cft->write_u64) {
+		unsigned long long v;
+		ret = kstrtoull(buf, 0, &v);
+		if (!ret)
+			ret = cft->write_u64(css, cft, v);
+	} else if (cft->write_s64) {
+		long long v;
+		ret = kstrtoll(buf, 0, &v);
+		if (!ret)
+			ret = cft->write_s64(css, cft, v);
+	} else if (cft->trigger) {
+		ret = cft->trigger(css, (unsigned int)cft->private);
 	} else {
-		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
-		if (*end)
-			return -EINVAL;
-		retval = cft->write_s64(css, cft, val);
+		ret = -EINVAL;
 	}
-	if (!retval)
-		retval = nbytes;
-	return retval;
+out_free:
+	kfree(buf);
+	return ret ?: nbytes;
 }
 
-static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
-				   struct cftype *cft, struct file *file,
-				   const char __user *userbuf, size_t nbytes,
-				   loff_t *unused_ppos)
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+
+static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
 {
-	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
-	int retval = 0;
-	size_t max_bytes = cft->max_write_len;
-	char *buffer = local_buffer;
+	struct cftype *cft = seq_cft(seq);
 
-	if (!max_bytes)
-		max_bytes = sizeof(local_buffer) - 1;
-	if (nbytes >= max_bytes)
-		return -E2BIG;
-	/* Allocate a dynamic buffer if we need one */
-	if (nbytes >= sizeof(local_buffer)) {
-		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-		if (buffer == NULL)
-			return -ENOMEM;
-	}
-	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
-		retval = -EFAULT;
-		goto out;
+	if (cft->seq_start) {
+		return cft->seq_start(seq, ppos);
+	} else {
+		/*
+		 * The same behavior and code as single_open().  Returns
+		 * !NULL if pos is at the beginning; otherwise, NULL.
+		 */
+		return NULL + !*ppos;
 	}
-
-	buffer[nbytes] = 0;	/* nul-terminate */
-	retval = cft->write_string(css, cft, strstrip(buffer));
-	if (!retval)
-		retval = nbytes;
-out:
-	if (buffer != local_buffer)
-		kfree(buffer);
-	return retval;
 }
 
-static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
-				 size_t nbytes, loff_t *ppos)
+static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
 {
-	struct cfent *cfe = __d_cfe(file->f_dentry);
-	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup_subsys_state *css = cfe->css;
+	struct cftype *cft = seq_cft(seq);
 
-	if (cft->write)
-		return cft->write(css, cft, file, buf, nbytes, ppos);
-	if (cft->write_u64 || cft->write_s64)
-		return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
-	if (cft->write_string)
-		return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
-	if (cft->trigger) {
-		int ret = cft->trigger(css, (unsigned int)cft->private);
-		return ret ? ret : nbytes;
+	if (cft->seq_next) {
+		return cft->seq_next(seq, v, ppos);
+	} else {
+		/*
+		 * The same behavior and code as single_open(), always
+		 * terminate after the initial read.
		 */
+		++*ppos;
+		return NULL;
 	}
-	return -EINVAL;
 }
 
-static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
-			       struct cftype *cft, struct file *file,
-			       char __user *buf, size_t nbytes, loff_t *ppos)
+static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
 {
-	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-	u64 val = cft->read_u64(css, cft);
-	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
+	struct cftype *cft = seq_cft(seq);
 
-	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+	if (cft->seq_stop)
+		cft->seq_stop(seq, v);
 }
 
-static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
-			       struct cftype *cft, struct file *file,
-			       char __user *buf, size_t nbytes, loff_t *ppos)
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 {
-	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-	s64 val = cft->read_s64(css, cft);
-	int len = sprintf(tmp, "%lld\n", (long long) val);
-
-	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
-}
+	struct cftype *cft = seq_cft(m);
+	struct cgroup_subsys_state *css = seq_css(m);
 
-static ssize_t cgroup_file_read(struct file *file, char __user *buf,
-				size_t nbytes, loff_t *ppos)
-{
-	struct cfent *cfe = __d_cfe(file->f_dentry);
-	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup_subsys_state *css = cfe->css;
+	if (cft->seq_show)
+		return cft->seq_show(m, arg);
 
-	if (cft->read)
-		return cft->read(css, cft, file, buf, nbytes, ppos);
 	if (cft->read_u64)
-		return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
-	if (cft->read_s64)
-		return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
-	return -EINVAL;
-}
-
-/*
- * seqfile ops/methods for returning structured data. Currently just
- * supports string->u64 maps, but can be extended in future.
- */
-
-static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
-{
-	struct seq_file *sf = cb->state;
-	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
-}
-
-static int cgroup_seqfile_show(struct seq_file *m, void *arg)
-{
-	struct cfent *cfe = m->private;
-	struct cftype *cft = cfe->type;
-	struct cgroup_subsys_state *css = cfe->css;
-
-	if (cft->read_map) {
-		struct cgroup_map_cb cb = {
-			.fill = cgroup_map_add,
-			.state = m,
-		};
-		return cft->read_map(css, cft, &cb);
-	}
-	return cft->read_seq_string(css, cft, m);
+		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
+	else if (cft->read_s64)
+		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
+	else
+		return -EINVAL;
+	return 0;
 }
 
-static const struct file_operations cgroup_seqfile_operations = {
-	.read = seq_read,
-	.write = cgroup_file_write,
-	.llseek = seq_lseek,
-	.release = cgroup_file_release,
+static struct seq_operations cgroup_seq_operations = {
+	.start = cgroup_seqfile_start,
+	.next = cgroup_seqfile_next,
+	.stop = cgroup_seqfile_stop,
+	.show = cgroup_seqfile_show,
 };
 
 static int cgroup_file_open(struct inode *inode, struct file *file)
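With this conversion every cgroup file is a seq_file and controllers only declare typed accessors; the core parses writes with kstrtoull()/kstrtoll() in cgroup_file_write() and renders reads in cgroup_seqfile_show(). A hedged sketch of what a control file looks like against this interface (the names and values below are invented for illustration):

	static u64 demo_weight_read(struct cgroup_subsys_state *css,
				    struct cftype *cft)
	{
		return 42;			/* would return controller state */
	}

	static int demo_weight_write(struct cgroup_subsys_state *css,
				     struct cftype *cft, u64 val)
	{
		return val ? 0 : -EINVAL;	/* would update controller state */
	}

	static struct cftype demo_files[] = {
		{
			.name		= "weight",
			.read_u64	= demo_weight_read,
			.write_u64	= demo_weight_write,
		},
		{ }	/* terminator */
	};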
@@ -2449,6 +2385,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
 	struct cgroup_subsys_state *css;
+	struct cgroup_open_file *of;
 	int err;
 
 	err = generic_file_open(inode, file);
@@ -2478,32 +2415,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
 	WARN_ON_ONCE(cfe->css && cfe->css != css);
 	cfe->css = css;
 
-	if (cft->read_map || cft->read_seq_string) {
-		file->f_op = &cgroup_seqfile_operations;
-		err = single_open(file, cgroup_seqfile_show, cfe);
-	} else if (cft->open) {
-		err = cft->open(inode, file);
+	of = __seq_open_private(file, &cgroup_seq_operations,
+				sizeof(struct cgroup_open_file));
+	if (of) {
+		of->cfe = cfe;
+		return 0;
 	}
 
-	if (css->ss && err)
+	if (css->ss)
 		css_put(css);
-	return err;
+	return -ENOMEM;
 }
 
 static int cgroup_file_release(struct inode *inode, struct file *file)
 {
 	struct cfent *cfe = __d_cfe(file->f_dentry);
-	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup_subsys_state *css = cfe->css;
-	int ret = 0;
 
-	if (cft->release)
-		ret = cft->release(inode, file);
 	if (css->ss)
 		css_put(css);
-	if (file->f_op == &cgroup_seqfile_operations)
-		single_release(inode, file);
-	return ret;
+	return seq_release_private(inode, file);
 }
 
 /*
@@ -2614,7 +2545,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
 }
 
 static const struct file_operations cgroup_file_operations = {
-	.read = cgroup_file_read,
+	.read = seq_read,
 	.write = cgroup_file_write,
 	.llseek = generic_file_llseek,
 	.open = cgroup_file_open,
@@ -2639,16 +2570,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
 	.removexattr = cgroup_removexattr,
 };
 
-/*
- * Check if a file is a control file
- */
-static inline struct cftype *__file_cft(struct file *file)
-{
-	if (file_inode(file)->i_fop != &cgroup_file_operations)
-		return ERR_PTR(-EINVAL);
-	return __d_cft(file->f_dentry);
-}
-
 static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 			      struct super_block *sb)
 {
@@ -2706,12 +2627,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	if (cft->mode)
 		return cft->mode;
 
-	if (cft->read || cft->read_u64 || cft->read_s64 ||
-	    cft->read_map || cft->read_seq_string)
+	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
 		mode |= S_IRUGO;
 
-	if (cft->write || cft->write_u64 || cft->write_s64 ||
-	    cft->write_string || cft->trigger)
+	if (cft->write_u64 || cft->write_s64 || cft->write_string ||
+	    cft->trigger)
 		mode |= S_IWUSR;
 
 	return mode;
@@ -2845,10 +2765,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
 	 */
 	update_before = cgroup_serial_nr_next;
 
-	mutex_unlock(&cgroup_mutex);
-
 	/* add/rm files for all cgroups created before */
-	rcu_read_lock();
 	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
 		struct cgroup *cgrp = css->cgroup;
 
@@ -2857,23 +2774,19 @@
 
 		inode = cgrp->dentry->d_inode;
 		dget(cgrp->dentry);
-		rcu_read_unlock();
-
 		dput(prev);
 		prev = cgrp->dentry;
 
+		mutex_unlock(&cgroup_mutex);
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
 			ret = cgroup_addrm_files(cgrp, cfts, is_add);
-		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
-
-		rcu_read_lock();
 		if (ret)
 			break;
 	}
-	rcu_read_unlock();
+	mutex_unlock(&cgroup_mutex);
 	dput(prev);
 	deactivate_super(sb);
 	return ret;
@@ -2992,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void)
 		 * We should check if the process is exiting, otherwise
 		 * it will race with cgroup_exit() in that the list
 		 * entry won't be deleted though the process has exited.
+		 * Do it while holding siglock so that we don't end up
+		 * racing against cgroup_exit().
 		 */
+		spin_lock_irq(&p->sighand->siglock);
 		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
 			list_add(&p->cg_list, &task_css_set(p)->tasks);
+		spin_unlock_irq(&p->sighand->siglock);
+
 		task_unlock(p);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
@@ -3007,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void)
  * @parent_css: css whose children to walk
  *
  * This function returns the next child of @parent_css and should be called
- * under RCU read lock.  The only requirement is that @parent_css and
- * @pos_css are accessible.  The next sibling is guaranteed to be returned
- * regardless of their states.
+ * under either cgroup_mutex or RCU read lock.  The only requirement is
+ * that @parent_css and @pos_css are accessible.  The next sibling is
+ * guaranteed to be returned regardless of their states.
  */
 struct cgroup_subsys_state *
 css_next_child(struct cgroup_subsys_state *pos_css,
@@ -3019,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
 	struct cgroup *cgrp = parent_css->cgroup;
 	struct cgroup *next;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	cgroup_assert_mutex_or_rcu_locked();
 
 	/*
 	 * @pos could already have been removed.  Once a cgroup is removed,
@@ -3066,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child);
  * to visit for pre-order traversal of @root's descendants.  @root is
  * included in the iteration and the first node to be visited.
  *
- * While this function requires RCU read locking, it doesn't require the
- * whole traversal to be contained in a single RCU critical section.  This
- * function will return the correct next descendant as long as both @pos
- * and @root are accessible and @pos is a descendant of @root.
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section.  This function will return the correct next descendant as long
+ * as both @pos and @root are accessible and @pos is a descendant of @root.
  */
 struct cgroup_subsys_state *
 css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -3077,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
 {
 	struct cgroup_subsys_state *next;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	cgroup_assert_mutex_or_rcu_locked();
 
 	/* if first iteration, visit @root */
 	if (!pos)
@@ -3108,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
  * is returned.  This can be used during pre-order traversal to skip
 * subtree of @pos.
 *
- * While this function requires RCU read locking, it doesn't require the
- * whole traversal to be contained in a single RCU critical section.  This
- * function will return the correct rightmost descendant as long as @pos is
- * accessible.
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section.  This function will return the correct rightmost descendant as
+ * long as @pos is accessible.
 */
 struct cgroup_subsys_state *
 css_rightmost_descendant(struct cgroup_subsys_state *pos)
 {
 	struct cgroup_subsys_state *last, *tmp;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	cgroup_assert_mutex_or_rcu_locked();
 
 	do {
 		last = pos;
@@ -3154,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 *
- * While this function requires RCU read locking, it doesn't require the
- * whole traversal to be contained in a single RCU critical section.  This
- * function will return the correct next descendant as long as both @pos
- * and @cgroup are accessible and @pos is a descendant of @cgroup.
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section.  This function will return the correct next descendant as long
+ * as both @pos and @cgroup are accessible and @pos is a descendant of
+ * @cgroup.
 */
 struct cgroup_subsys_state *
 css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -3165,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
 {
 	struct cgroup_subsys_state *next;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	cgroup_assert_mutex_or_rcu_locked();
 
 	/* if first iteration, visit leftmost descendant which may be @root */
 	if (!pos)
@@ -3504,14 +3423,12 @@ struct cgroup_pidlist {
 	pid_t *list;
 	/* how many elements the above list has */
 	int length;
-	/* how many files are using the current array */
-	int use_count;
 	/* each of these stored in a list by its cgroup */
 	struct list_head links;
 	/* pointer to the cgroup we belong to, for list removal purposes */
 	struct cgroup *owner;
-	/* protects the other fields */
-	struct rw_semaphore rwsem;
+	/* for delayed destruction */
+	struct delayed_work destroy_dwork;
 };
 
 /*
@@ -3527,6 +3444,7 @@ static void *pidlist_allocate(int count)
 	else
 		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
 }
+
 static void pidlist_free(void *p)
 {
 	if (is_vmalloc_addr(p))
@@ -3536,6 +3454,47 @@ static void pidlist_free(void *p)
 }
 
 /*
+ * Used to destroy all pidlists lingering waiting for destroy timer.  None
+ * should be left afterwards.
+ */
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
+{
+	struct cgroup_pidlist *l, *tmp_l;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+	mutex_unlock(&cgrp->pidlist_mutex);
+
+	flush_workqueue(cgroup_pidlist_destroy_wq);
+	BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+						destroy_dwork);
+	struct cgroup_pidlist *tofree = NULL;
+
+	mutex_lock(&l->owner->pidlist_mutex);
+
+	/*
+	 * Destroy iff we didn't get queued again.  The state won't change
+	 * as destroy_dwork can only be queued while locked.
+	 */
+	if (!delayed_work_pending(dwork)) {
+		list_del(&l->links);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		tofree = l;
+	}
+
+	mutex_unlock(&l->owner->pidlist_mutex);
+	kfree(tofree);
+}
+
+/*
  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
 * Returns the number of unique elements.
 */
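The linger logic is a small state machine on a single delayed_work: each reader re-arms the work when it is done, and the work function frees the pidlist only if nobody re-armed it while the work was waiting on the mutex. The same pattern in isolation (a generic sketch; obj, pool_mutex, destroy_wq and LINGER are placeholder names, not cgroup code):

	struct obj {
		struct list_head node;
		struct delayed_work destroy_dwork;
	};

	static void obj_destroy_fn(struct work_struct *work)
	{
		struct delayed_work *dwork = to_delayed_work(work);
		struct obj *o = container_of(dwork, struct obj, destroy_dwork);

		mutex_lock(&pool_mutex);
		/* destroy iff no user re-armed us after this work was dispatched */
		if (!delayed_work_pending(dwork)) {
			list_del(&o->node);
			kfree(o);
		}
		mutex_unlock(&pool_mutex);
	}

	/* each user, when done: push destruction out by LINGER jiffies */
	static void obj_put(struct obj *o)
	{
		mod_delayed_work(destroy_wq, &o->destroy_dwork, LINGER);
	}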
@@ -3565,52 +3524,92 @@ after:
 	return dest;
 }
 
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco.  As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer.  As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ *
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property.  In the long term, we
+ * want to do away with it.  Explicitly scramble sort order if
+ * sane_behavior so that no such expectation exists in the new interface.
+ *
+ * Scrambling is done by swapping every two consecutive bits, which is
+ * non-identity one-to-one mapping which disturbs sort order sufficiently.
+ */
+static pid_t pid_fry(pid_t pid)
+{
+	unsigned a = pid & 0x55555555;
+	unsigned b = pid & 0xAAAAAAAA;
+
+	return (a << 1) | (b >> 1);
+}
+
+static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
+{
+	if (cgroup_sane_behavior(cgrp))
+		return pid_fry(pid);
+	else
+		return pid;
+}
+
 static int cmppid(const void *a, const void *b)
 {
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
+static int fried_cmppid(const void *a, const void *b)
+{
+	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = task_active_pid_ns(current);
+
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	list_for_each_entry(l, &cgrp->pidlists, links)
+		if (l->key.type == type && l->key.ns == ns)
+			return l;
+	return NULL;
+}
+
 /*
  * find the appropriate pidlist for our purpose (given procs vs tasks)
  * returns with the lock on that pidlist already held, and takes care
  * of the use count, or returns NULL with no locks held if we're out of
  * memory.
  */
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
-						  enum cgroup_filetype type)
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+							 enum cgroup_filetype type)
 {
 	struct cgroup_pidlist *l;
-	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = task_active_pid_ns(current);
 
-	/*
-	 * We can't drop the pidlist_mutex before taking the l->rwsem in case
-	 * the last ref-holder is trying to remove l from the list at the same
-	 * time. Holding the pidlist_mutex precludes somebody taking whichever
-	 * list we find out from under us - compare release_pid_array().
-	 */
-	mutex_lock(&cgrp->pidlist_mutex);
-	list_for_each_entry(l, &cgrp->pidlists, links) {
-		if (l->key.type == type && l->key.ns == ns) {
-			/* make sure l doesn't vanish out from under us */
-			down_write(&l->rwsem);
-			mutex_unlock(&cgrp->pidlist_mutex);
-			return l;
-		}
-	}
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	l = cgroup_pidlist_find(cgrp, type);
+	if (l)
+		return l;
+
 	/* entry not found; create a new one */
 	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
-	if (!l) {
-		mutex_unlock(&cgrp->pidlist_mutex);
+	if (!l)
 		return l;
-	}
-	init_rwsem(&l->rwsem);
-	down_write(&l->rwsem);
+
+	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
 	l->key.type = type;
-	l->key.ns = get_pid_ns(ns);
+	/* don't need task_nsproxy() if we're looking at ourself */
+	l->key.ns = get_pid_ns(task_active_pid_ns(current));
 	l->owner = cgrp;
 	list_add(&l->links, &cgrp->pidlists);
-	mutex_unlock(&cgrp->pidlist_mutex);
 	return l;
 }
 
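pid_fry() swaps each even/odd bit pair, so it is a self-inverse bijection on pid space; sorting by the fried value yields an order with no relation to numeric pid order, while binary search still works on the fried keys. A few worked values for reference:

	/*
	 *   pid_fry(1) == 2	(0b01   -> 0b10)
	 *   pid_fry(2) == 1	(0b10   -> 0b01)
	 *   pid_fry(3) == 3	(0b11   -> 0b11)
	 *   pid_fry(4) == 8	(0b0100 -> 0b1000)
	 *   pid_fry(pid_fry(x)) == x, so the mapping is one-to-one.
	 */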
@@ -3627,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	struct task_struct *tsk;
 	struct cgroup_pidlist *l;
 
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
 	/*
 	 * If cgroup gets more users after we read count, we won't have
 	 * enough space - tough.  This race is indistinguishable to the
@@ -3653,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	css_task_iter_end(&it);
 	length = n;
 	/* now sort & (if procs) strip out duplicates */
-	sort(array, length, sizeof(pid_t), cmppid, NULL);
+	if (cgroup_sane_behavior(cgrp))
+		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+	else
+		sort(array, length, sizeof(pid_t), cmppid, NULL);
 	if (type == CGROUP_FILE_PROCS)
 		length = pidlist_uniq(array, length);
-	l = cgroup_pidlist_find(cgrp, type);
+
+	l = cgroup_pidlist_find_create(cgrp, type);
 	if (!l) {
+		mutex_unlock(&cgrp->pidlist_mutex);
 		pidlist_free(array);
 		return -ENOMEM;
 	}
-	/* store array, freeing old if necessary - lock already held */
+
+	/* store array, freeing old if necessary */
 	pidlist_free(l->list);
 	l->list = array;
 	l->length = length;
-	l->use_count++;
-	up_write(&l->rwsem);
 	*lp = l;
 	return 0;
 }
@@ -3740,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
3740 | * after a seek to the start). Use a binary-search to find the | 3745 | * after a seek to the start). Use a binary-search to find the |
3741 | * next pid to display, if any | 3746 | * next pid to display, if any |
3742 | */ | 3747 | */ |
3743 | struct cgroup_pidlist *l = s->private; | 3748 | struct cgroup_open_file *of = s->private; |
3749 | struct cgroup *cgrp = seq_css(s)->cgroup; | ||
3750 | struct cgroup_pidlist *l; | ||
3751 | enum cgroup_filetype type = seq_cft(s)->private; | ||
3744 | int index = 0, pid = *pos; | 3752 | int index = 0, pid = *pos; |
3745 | int *iter; | 3753 | int *iter, ret; |
3754 | |||
3755 | mutex_lock(&cgrp->pidlist_mutex); | ||
3756 | |||
3757 | /* | ||
3758 | * !NULL @of->priv indicates that this isn't the first start() | ||
3759 | * after open. If the matching pidlist is around, we can use that. | ||
3760 | * Look for it. Note that @of->priv can't be used directly. It | ||
3761 | * could already have been destroyed. | ||
3762 | */ | ||
3763 | if (of->priv) | ||
3764 | of->priv = cgroup_pidlist_find(cgrp, type); | ||
3765 | |||
3766 | /* | ||
3767 | * Either this is the first start() after open or the matching | ||
3768 | * pidlist has been destroyed inbetween. Create a new one. | ||
3769 | */ | ||
3770 | if (!of->priv) { | ||
3771 | ret = pidlist_array_load(cgrp, type, | ||
3772 | (struct cgroup_pidlist **)&of->priv); | ||
3773 | if (ret) | ||
3774 | return ERR_PTR(ret); | ||
3775 | } | ||
3776 | l = of->priv; | ||
3746 | 3777 | ||
3747 | down_read(&l->rwsem); | ||
3748 | if (pid) { | 3778 | if (pid) { |
3749 | int end = l->length; | 3779 | int end = l->length; |
3750 | 3780 | ||
3751 | while (index < end) { | 3781 | while (index < end) { |
3752 | int mid = (index + end) / 2; | 3782 | int mid = (index + end) / 2; |
3753 | if (l->list[mid] == pid) { | 3783 | if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { |
3754 | index = mid; | 3784 | index = mid; |
3755 | break; | 3785 | break; |
3756 | } else if (l->list[mid] <= pid) | 3786 | } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) |
3757 | index = mid + 1; | 3787 | index = mid + 1; |
3758 | else | 3788 | else |
3759 | end = mid; | 3789 | end = mid; |
@@ -3764,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
3764 | return NULL; | 3794 | return NULL; |
3765 | /* Update the abstract position to be the actual pid that we found */ | 3795 | /* Update the abstract position to be the actual pid that we found */ |
3766 | iter = l->list + index; | 3796 | iter = l->list + index; |
3767 | *pos = *iter; | 3797 | *pos = cgroup_pid_fry(cgrp, *iter); |
3768 | return iter; | 3798 | return iter; |
3769 | } | 3799 | } |
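[Editor's note] struct cgroup_open_file and the seq_cft()/seq_css() accessors used above come from a companion patch; the field names below are assumptions for illustration only. The point is that seq_file->private now carries per-open state rather than the pidlist itself, and ->priv merely caches the pidlist between consecutive reads:

	/* hung off seq_file->private; hypothetical simplified shape */
	struct cgroup_open_file {
		struct cftype	*cft;	/* what seq_cft() resolves to */
		void		*priv;	/* cached pidlist; may have been destroyed */
	};

Because a lingering pidlist can be destroyed between reads, start() re-looks it up under pidlist_mutex instead of trusting the cached pointer.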
3770 | 3800 | ||
3771 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) | 3801 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) |
3772 | { | 3802 | { |
3773 | struct cgroup_pidlist *l = s->private; | 3803 | struct cgroup_open_file *of = s->private; |
3774 | up_read(&l->rwsem); | 3804 | struct cgroup_pidlist *l = of->priv; |
3805 | |||
3806 | if (l) | ||
3807 | mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, | ||
3808 | CGROUP_PIDLIST_DESTROY_DELAY); | ||
3809 | mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); | ||
3775 | } | 3810 | } |
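[Editor's note] stop() no longer drops a reference; it only (re)arms the destroy timer, so a reader issuing consecutive read()s keeps pushing destruction back. A sketch of the matching worker, assuming the shape it takes elsewhere in this patch - it frees the pidlist only if nobody re-armed the work while it waited for the lock:

	static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
	{
		struct cgroup_pidlist *l = container_of(work, struct cgroup_pidlist,
							destroy_dwork.work);
		struct cgroup_pidlist *tofree = NULL;

		mutex_lock(&l->owner->pidlist_mutex);

		/* destroy only if no reader re-armed us in the meantime */
		if (!delayed_work_pending(&l->destroy_dwork)) {
			list_del(&l->links);
			pidlist_free(l->list);
			put_pid_ns(l->key.ns);
			tofree = l;
		}

		mutex_unlock(&l->owner->pidlist_mutex);
		kfree(tofree);	/* kfree(NULL) is a no-op */
	}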
3776 | 3811 | ||
3777 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | 3812 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) |
3778 | { | 3813 | { |
3779 | struct cgroup_pidlist *l = s->private; | 3814 | struct cgroup_open_file *of = s->private; |
3815 | struct cgroup_pidlist *l = of->priv; | ||
3780 | pid_t *p = v; | 3816 | pid_t *p = v; |
3781 | pid_t *end = l->list + l->length; | 3817 | pid_t *end = l->list + l->length; |
3782 | /* | 3818 | /* |
@@ -3787,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | |||
3787 | if (p >= end) { | 3823 | if (p >= end) { |
3788 | return NULL; | 3824 | return NULL; |
3789 | } else { | 3825 | } else { |
3790 | *pos = *p; | 3826 | *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); |
3791 | return p; | 3827 | return p; |
3792 | } | 3828 | } |
3793 | } | 3829 | } |
@@ -3808,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = { | |||
3808 | .show = cgroup_pidlist_show, | 3844 | .show = cgroup_pidlist_show, |
3809 | }; | 3845 | }; |
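[Editor's note] For completeness, the ->show member these ops reference is trivial and unchanged by this patch; it is roughly:

	static int cgroup_pidlist_show(struct seq_file *s, void *v)
	{
		return seq_printf(s, "%d\n", *(int *)v);
	}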
3810 | 3846 | ||
3811 | static void cgroup_release_pid_array(struct cgroup_pidlist *l) | ||
3812 | { | ||
3813 | /* | ||
3814 | * the case where we're the last user of this particular pidlist will | ||
3815 | * have us remove it from the cgroup's list, which entails taking the | ||
3816 | * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> | ||
3817 | * pidlist_mutex, we have to take pidlist_mutex first. | ||
3818 | */ | ||
3819 | mutex_lock(&l->owner->pidlist_mutex); | ||
3820 | down_write(&l->rwsem); | ||
3821 | BUG_ON(!l->use_count); | ||
3822 | if (!--l->use_count) { | ||
3823 | /* we're the last user if refcount is 0; remove and free */ | ||
3824 | list_del(&l->links); | ||
3825 | mutex_unlock(&l->owner->pidlist_mutex); | ||
3826 | pidlist_free(l->list); | ||
3827 | put_pid_ns(l->key.ns); | ||
3828 | up_write(&l->rwsem); | ||
3829 | kfree(l); | ||
3830 | return; | ||
3831 | } | ||
3832 | mutex_unlock(&l->owner->pidlist_mutex); | ||
3833 | up_write(&l->rwsem); | ||
3834 | } | ||
3835 | |||
3836 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) | ||
3837 | { | ||
3838 | struct cgroup_pidlist *l; | ||
3839 | if (!(file->f_mode & FMODE_READ)) | ||
3840 | return 0; | ||
3841 | /* | ||
3842 | * the seq_file will only be initialized if the file was opened for | ||
3843 | * reading; hence we check if it's not null only in that case. | ||
3844 | */ | ||
3845 | l = ((struct seq_file *)file->private_data)->private; | ||
3846 | cgroup_release_pid_array(l); | ||
3847 | return seq_release(inode, file); | ||
3848 | } | ||
3849 | |||
3850 | static const struct file_operations cgroup_pidlist_operations = { | ||
3851 | .read = seq_read, | ||
3852 | .llseek = seq_lseek, | ||
3853 | .write = cgroup_file_write, | ||
3854 | .release = cgroup_pidlist_release, | ||
3855 | }; | ||
3856 | |||
3857 | /* | ||
3858 | * The following functions handle opens on a file that displays a pidlist | ||
3859 | * (tasks or procs). Prepare an array of the process/thread IDs of whoever's | ||
3860 | * in the cgroup. | ||
3861 | */ | ||
3862 | /* helper function for the two below it */ | ||
3863 | static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) | ||
3864 | { | ||
3865 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | ||
3866 | struct cgroup_pidlist *l; | ||
3867 | int retval; | ||
3868 | |||
3869 | /* Nothing to do for write-only files */ | ||
3870 | if (!(file->f_mode & FMODE_READ)) | ||
3871 | return 0; | ||
3872 | |||
3873 | /* have the array populated */ | ||
3874 | retval = pidlist_array_load(cgrp, type, &l); | ||
3875 | if (retval) | ||
3876 | return retval; | ||
3877 | /* configure file information */ | ||
3878 | file->f_op = &cgroup_pidlist_operations; | ||
3879 | |||
3880 | retval = seq_open(file, &cgroup_pidlist_seq_operations); | ||
3881 | if (retval) { | ||
3882 | cgroup_release_pid_array(l); | ||
3883 | return retval; | ||
3884 | } | ||
3885 | ((struct seq_file *)file->private_data)->private = l; | ||
3886 | return 0; | ||
3887 | } | ||
3888 | static int cgroup_tasks_open(struct inode *unused, struct file *file) | ||
3889 | { | ||
3890 | return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); | ||
3891 | } | ||
3892 | static int cgroup_procs_open(struct inode *unused, struct file *file) | ||
3893 | { | ||
3894 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); | ||
3895 | } | ||
3896 | |||
3897 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | 3847 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
3898 | struct cftype *cft) | 3848 | struct cftype *cft) |
3899 | { | 3849 | { |
@@ -3928,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp) | |||
3928 | deactivate_super(sb); | 3878 | deactivate_super(sb); |
3929 | } | 3879 | } |
3930 | 3880 | ||
3931 | /* | ||
3932 | * Unregister event and free resources. | ||
3933 | * | ||
3934 | * Gets called from workqueue. | ||
3935 | */ | ||
3936 | static void cgroup_event_remove(struct work_struct *work) | ||
3937 | { | ||
3938 | struct cgroup_event *event = container_of(work, struct cgroup_event, | ||
3939 | remove); | ||
3940 | struct cgroup_subsys_state *css = event->css; | ||
3941 | |||
3942 | remove_wait_queue(event->wqh, &event->wait); | ||
3943 | |||
3944 | event->cft->unregister_event(css, event->cft, event->eventfd); | ||
3945 | |||
3946 | /* Notify userspace the event is going away. */ | ||
3947 | eventfd_signal(event->eventfd, 1); | ||
3948 | |||
3949 | eventfd_ctx_put(event->eventfd); | ||
3950 | kfree(event); | ||
3951 | css_put(css); | ||
3952 | } | ||
3953 | |||
3954 | /* | ||
3955 | * Gets called on POLLHUP on eventfd when user closes it. | ||
3956 | * | ||
3957 | * Called with wqh->lock held and interrupts disabled. | ||
3958 | */ | ||
3959 | static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | ||
3960 | int sync, void *key) | ||
3961 | { | ||
3962 | struct cgroup_event *event = container_of(wait, | ||
3963 | struct cgroup_event, wait); | ||
3964 | struct cgroup *cgrp = event->css->cgroup; | ||
3965 | unsigned long flags = (unsigned long)key; | ||
3966 | |||
3967 | if (flags & POLLHUP) { | ||
3968 | /* | ||
3969 | * If the event has been detached at cgroup removal, we | ||
3970 | * can simply return knowing the other side will cleanup | ||
3971 | * for us. | ||
3972 | * | ||
3973 | * We can't race against event freeing since the other | ||
3974 | * side will require wqh->lock via remove_wait_queue(), | ||
3975 | * which we hold. | ||
3976 | */ | ||
3977 | spin_lock(&cgrp->event_list_lock); | ||
3978 | if (!list_empty(&event->list)) { | ||
3979 | list_del_init(&event->list); | ||
3980 | /* | ||
3981 | * We are in atomic context, but cgroup_event_remove() | ||
3982 | * may sleep, so we have to call it in workqueue. | ||
3983 | */ | ||
3984 | schedule_work(&event->remove); | ||
3985 | } | ||
3986 | spin_unlock(&cgrp->event_list_lock); | ||
3987 | } | ||
3988 | |||
3989 | return 0; | ||
3990 | } | ||
3991 | |||
3992 | static void cgroup_event_ptable_queue_proc(struct file *file, | ||
3993 | wait_queue_head_t *wqh, poll_table *pt) | ||
3994 | { | ||
3995 | struct cgroup_event *event = container_of(pt, | ||
3996 | struct cgroup_event, pt); | ||
3997 | |||
3998 | event->wqh = wqh; | ||
3999 | add_wait_queue(wqh, &event->wait); | ||
4000 | } | ||
4001 | |||
4002 | /* | ||
4003 | * Parse input and register new cgroup event handler. | ||
4004 | * | ||
4005 | * Input must be in format '<event_fd> <control_fd> <args>'. | ||
4006 | * Interpretation of args is defined by control file implementation. | ||
4007 | */ | ||
4008 | static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, | ||
4009 | struct cftype *cft, const char *buffer) | ||
4010 | { | ||
4011 | struct cgroup *cgrp = dummy_css->cgroup; | ||
4012 | struct cgroup_event *event; | ||
4013 | struct cgroup_subsys_state *cfile_css; | ||
4014 | unsigned int efd, cfd; | ||
4015 | struct fd efile; | ||
4016 | struct fd cfile; | ||
4017 | char *endp; | ||
4018 | int ret; | ||
4019 | |||
4020 | efd = simple_strtoul(buffer, &endp, 10); | ||
4021 | if (*endp != ' ') | ||
4022 | return -EINVAL; | ||
4023 | buffer = endp + 1; | ||
4024 | |||
4025 | cfd = simple_strtoul(buffer, &endp, 10); | ||
4026 | if ((*endp != ' ') && (*endp != '\0')) | ||
4027 | return -EINVAL; | ||
4028 | buffer = endp + 1; | ||
4029 | |||
4030 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
4031 | if (!event) | ||
4032 | return -ENOMEM; | ||
4033 | |||
4034 | INIT_LIST_HEAD(&event->list); | ||
4035 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | ||
4036 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | ||
4037 | INIT_WORK(&event->remove, cgroup_event_remove); | ||
4038 | |||
4039 | efile = fdget(efd); | ||
4040 | if (!efile.file) { | ||
4041 | ret = -EBADF; | ||
4042 | goto out_kfree; | ||
4043 | } | ||
4044 | |||
4045 | event->eventfd = eventfd_ctx_fileget(efile.file); | ||
4046 | if (IS_ERR(event->eventfd)) { | ||
4047 | ret = PTR_ERR(event->eventfd); | ||
4048 | goto out_put_efile; | ||
4049 | } | ||
4050 | |||
4051 | cfile = fdget(cfd); | ||
4052 | if (!cfile.file) { | ||
4053 | ret = -EBADF; | ||
4054 | goto out_put_eventfd; | ||
4055 | } | ||
4056 | |||
4057 | /* the process need read permission on control file */ | ||
4058 | /* AV: shouldn't we check that it's been opened for read instead? */ | ||
4059 | ret = inode_permission(file_inode(cfile.file), MAY_READ); | ||
4060 | if (ret < 0) | ||
4061 | goto out_put_cfile; | ||
4062 | |||
4063 | event->cft = __file_cft(cfile.file); | ||
4064 | if (IS_ERR(event->cft)) { | ||
4065 | ret = PTR_ERR(event->cft); | ||
4066 | goto out_put_cfile; | ||
4067 | } | ||
4068 | |||
4069 | if (!event->cft->ss) { | ||
4070 | ret = -EBADF; | ||
4071 | goto out_put_cfile; | ||
4072 | } | ||
4073 | |||
4074 | /* | ||
4075 | * Determine the css of @cfile, verify it belongs to the same | ||
4076 | * cgroup as cgroup.event_control, and associate @event with it. | ||
4077 | * Remaining events are automatically removed on cgroup destruction | ||
4078 | * but the removal is asynchronous, so take an extra ref. | ||
4079 | */ | ||
4080 | rcu_read_lock(); | ||
4081 | |||
4082 | ret = -EINVAL; | ||
4083 | event->css = cgroup_css(cgrp, event->cft->ss); | ||
4084 | cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); | ||
4085 | if (event->css && event->css == cfile_css && css_tryget(event->css)) | ||
4086 | ret = 0; | ||
4087 | |||
4088 | rcu_read_unlock(); | ||
4089 | if (ret) | ||
4090 | goto out_put_cfile; | ||
4091 | |||
4092 | if (!event->cft->register_event || !event->cft->unregister_event) { | ||
4093 | ret = -EINVAL; | ||
4094 | goto out_put_css; | ||
4095 | } | ||
4096 | |||
4097 | ret = event->cft->register_event(event->css, event->cft, | ||
4098 | event->eventfd, buffer); | ||
4099 | if (ret) | ||
4100 | goto out_put_css; | ||
4101 | |||
4102 | efile.file->f_op->poll(efile.file, &event->pt); | ||
4103 | |||
4104 | spin_lock(&cgrp->event_list_lock); | ||
4105 | list_add(&event->list, &cgrp->event_list); | ||
4106 | spin_unlock(&cgrp->event_list_lock); | ||
4107 | |||
4108 | fdput(cfile); | ||
4109 | fdput(efile); | ||
4110 | |||
4111 | return 0; | ||
4112 | |||
4113 | out_put_css: | ||
4114 | css_put(event->css); | ||
4115 | out_put_cfile: | ||
4116 | fdput(cfile); | ||
4117 | out_put_eventfd: | ||
4118 | eventfd_ctx_put(event->eventfd); | ||
4119 | out_put_efile: | ||
4120 | fdput(efile); | ||
4121 | out_kfree: | ||
4122 | kfree(event); | ||
4123 | |||
4124 | return ret; | ||
4125 | } | ||
4126 | |||
4127 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, | 3881 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, |
4128 | struct cftype *cft) | 3882 | struct cftype *cft) |
4129 | { | 3883 | { |
@@ -4143,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, | |||
4143 | static struct cftype cgroup_base_files[] = { | 3897 | static struct cftype cgroup_base_files[] = { |
4144 | { | 3898 | { |
4145 | .name = "cgroup.procs", | 3899 | .name = "cgroup.procs", |
4146 | .open = cgroup_procs_open, | 3900 | .seq_start = cgroup_pidlist_start, |
3901 | .seq_next = cgroup_pidlist_next, | ||
3902 | .seq_stop = cgroup_pidlist_stop, | ||
3903 | .seq_show = cgroup_pidlist_show, | ||
3904 | .private = CGROUP_FILE_PROCS, | ||
4147 | .write_u64 = cgroup_procs_write, | 3905 | .write_u64 = cgroup_procs_write, |
4148 | .release = cgroup_pidlist_release, | ||
4149 | .mode = S_IRUGO | S_IWUSR, | 3906 | .mode = S_IRUGO | S_IWUSR, |
4150 | }, | 3907 | }, |
4151 | { | 3908 | { |
4152 | .name = "cgroup.event_control", | ||
4153 | .write_string = cgroup_write_event_control, | ||
4154 | .mode = S_IWUGO, | ||
4155 | }, | ||
4156 | { | ||
4157 | .name = "cgroup.clone_children", | 3909 | .name = "cgroup.clone_children", |
4158 | .flags = CFTYPE_INSANE, | 3910 | .flags = CFTYPE_INSANE, |
4159 | .read_u64 = cgroup_clone_children_read, | 3911 | .read_u64 = cgroup_clone_children_read, |
@@ -4162,7 +3914,7 @@ static struct cftype cgroup_base_files[] = { | |||
4162 | { | 3914 | { |
4163 | .name = "cgroup.sane_behavior", | 3915 | .name = "cgroup.sane_behavior", |
4164 | .flags = CFTYPE_ONLY_ON_ROOT, | 3916 | .flags = CFTYPE_ONLY_ON_ROOT, |
4165 | .read_seq_string = cgroup_sane_behavior_show, | 3917 | .seq_show = cgroup_sane_behavior_show, |
4166 | }, | 3918 | }, |
4167 | 3919 | ||
4168 | /* | 3920 | /* |
@@ -4173,9 +3925,12 @@ static struct cftype cgroup_base_files[] = { | |||
4173 | { | 3925 | { |
4174 | .name = "tasks", | 3926 | .name = "tasks", |
4175 | .flags = CFTYPE_INSANE, /* use "procs" instead */ | 3927 | .flags = CFTYPE_INSANE, /* use "procs" instead */ |
4176 | .open = cgroup_tasks_open, | 3928 | .seq_start = cgroup_pidlist_start, |
3929 | .seq_next = cgroup_pidlist_next, | ||
3930 | .seq_stop = cgroup_pidlist_stop, | ||
3931 | .seq_show = cgroup_pidlist_show, | ||
3932 | .private = CGROUP_FILE_TASKS, | ||
4177 | .write_u64 = cgroup_tasks_write, | 3933 | .write_u64 = cgroup_tasks_write, |
4178 | .release = cgroup_pidlist_release, | ||
4179 | .mode = S_IRUGO | S_IWUSR, | 3934 | .mode = S_IRUGO | S_IWUSR, |
4180 | }, | 3935 | }, |
4181 | { | 3936 | { |
@@ -4187,7 +3942,7 @@ static struct cftype cgroup_base_files[] = { | |||
4187 | { | 3942 | { |
4188 | .name = "release_agent", | 3943 | .name = "release_agent", |
4189 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, | 3944 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, |
4190 | .read_seq_string = cgroup_release_agent_show, | 3945 | .seq_show = cgroup_release_agent_show, |
4191 | .write_string = cgroup_release_agent_write, | 3946 | .write_string = cgroup_release_agent_write, |
4192 | .max_write_len = PATH_MAX, | 3947 | .max_write_len = PATH_MAX, |
4193 | }, | 3948 | }, |
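[Editor's note] With the per-file open/release methods gone, the seq_start/seq_next/seq_stop/seq_show members above are driven by generic cftype seq_file glue added by a companion patch. A hypothetical sketch of the start-side dispatch (names assumed): forward to the cftype iterator when one is given, otherwise behave like single_open():

	static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
	{
		if (seq_cft(seq)->seq_start)
			return seq_cft(seq)->seq_start(seq, ppos);
		/* single_open() semantics: one record, non-NULL only at pos 0 */
		return NULL + !*ppos;
	}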
@@ -4333,6 +4088,65 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
4333 | RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); | 4088 | RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); |
4334 | } | 4089 | } |
4335 | 4090 | ||
4091 | /** | ||
4092 | * create_css - create a cgroup_subsys_state | ||
4093 | * @cgrp: the cgroup new css will be associated with | ||
4094 | * @ss: the subsys of new css | ||
4095 | * | ||
4096 | * Create a new css associated with @cgrp - @ss pair. On success, the new | ||
4097 | * css is online and installed in @cgrp with all interface files created. | ||
4098 | * Returns 0 on success, -errno on failure. | ||
4099 | */ | ||
4100 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) | ||
4101 | { | ||
4102 | struct cgroup *parent = cgrp->parent; | ||
4103 | struct cgroup_subsys_state *css; | ||
4104 | int err; | ||
4105 | |||
4106 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); | ||
4107 | lockdep_assert_held(&cgroup_mutex); | ||
4108 | |||
4109 | css = ss->css_alloc(cgroup_css(parent, ss)); | ||
4110 | if (IS_ERR(css)) | ||
4111 | return PTR_ERR(css); | ||
4112 | |||
4113 | err = percpu_ref_init(&css->refcnt, css_release); | ||
4114 | if (err) | ||
4115 | goto err_free_css; | ||
4116 | |||
4117 | init_css(css, ss, cgrp); | ||
4118 | |||
4119 | err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); | ||
4120 | if (err) | ||
4121 | goto err_free_percpu_ref; | ||
4122 | |||
4123 | err = online_css(css); | ||
4124 | if (err) | ||
4125 | goto err_clear_dir; | ||
4126 | |||
4127 | dget(cgrp->dentry); | ||
4128 | css_get(css->parent); | ||
4129 | |||
4130 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | ||
4131 | parent->parent) { | ||
4132 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | ||
4133 | current->comm, current->pid, ss->name); | ||
4134 | if (!strcmp(ss->name, "memory")) | ||
4135 | pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); | ||
4136 | ss->warned_broken_hierarchy = true; | ||
4137 | } | ||
4138 | |||
4139 | return 0; | ||
4140 | |||
4141 | err_clear_dir: | ||
4142 | cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); | ||
4143 | err_free_percpu_ref: | ||
4144 | percpu_ref_cancel_init(&css->refcnt); | ||
4145 | err_free_css: | ||
4146 | ss->css_free(css); | ||
4147 | return err; | ||
4148 | } | ||
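[Editor's note] If anything fails after a css has been created and onlined, the cgroup is torn down through the normal destruction path, whose per-css half is kill_css(). For reference, its pre-existing shape is roughly:

	static void kill_css(struct cgroup_subsys_state *css)
	{
		/* undo cgroup_populate_dir() */
		cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);

		/* keep @css alive until ->css_offline() has run */
		css_get(css);

		/* kill the percpu ref; offlining continues from the callback */
		percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
	}

Note how the error unwind in create_css() mirrors this ordering: clear the directory, cancel the ref, free the css.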
4149 | |||
4336 | /* | 4150 | /* |
4337 | * cgroup_create - create a cgroup | 4151 | * cgroup_create - create a cgroup |
4338 | * @parent: cgroup that will be parent of the new cgroup | 4152 | * @parent: cgroup that will be parent of the new cgroup |
@@ -4344,11 +4158,10 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
4344 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | 4158 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, |
4345 | umode_t mode) | 4159 | umode_t mode) |
4346 | { | 4160 | { |
4347 | struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; | ||
4348 | struct cgroup *cgrp; | 4161 | struct cgroup *cgrp; |
4349 | struct cgroup_name *name; | 4162 | struct cgroup_name *name; |
4350 | struct cgroupfs_root *root = parent->root; | 4163 | struct cgroupfs_root *root = parent->root; |
4351 | int err = 0; | 4164 | int ssid, err; |
4352 | struct cgroup_subsys *ss; | 4165 | struct cgroup_subsys *ss; |
4353 | struct super_block *sb = root->sb; | 4166 | struct super_block *sb = root->sb; |
4354 | 4167 | ||
@@ -4358,19 +4171,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4358 | return -ENOMEM; | 4171 | return -ENOMEM; |
4359 | 4172 | ||
4360 | name = cgroup_alloc_name(dentry); | 4173 | name = cgroup_alloc_name(dentry); |
4361 | if (!name) | 4174 | if (!name) { |
4175 | err = -ENOMEM; | ||
4362 | goto err_free_cgrp; | 4176 | goto err_free_cgrp; |
4177 | } | ||
4363 | rcu_assign_pointer(cgrp->name, name); | 4178 | rcu_assign_pointer(cgrp->name, name); |
4364 | 4179 | ||
4365 | /* | 4180 | /* |
4366 | * Temporarily set the pointer to NULL, so idr_find() won't return | ||
4367 | * a half-baked cgroup. | ||
4368 | */ | ||
4369 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | ||
4370 | if (cgrp->id < 0) | ||
4371 | goto err_free_name; | ||
4372 | |||
4373 | /* | ||
4374 | * Only live parents can have children. Note that the liveliness | 4181 | * Only live parents can have children. Note that the liveliness |
4375 | * check isn't strictly necessary because cgroup_mkdir() and | 4182 | * check isn't strictly necessary because cgroup_mkdir() and |
4376 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | 4183 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it |
@@ -4379,7 +4186,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4379 | */ | 4186 | */ |
4380 | if (!cgroup_lock_live_group(parent)) { | 4187 | if (!cgroup_lock_live_group(parent)) { |
4381 | err = -ENODEV; | 4188 | err = -ENODEV; |
4382 | goto err_free_id; | 4189 | goto err_free_name; |
4190 | } | ||
4191 | |||
4192 | /* | ||
4193 | * Temporarily set the pointer to NULL, so idr_find() won't return | ||
4194 | * a half-baked cgroup. | ||
4195 | */ | ||
4196 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | ||
4197 | if (cgrp->id < 0) { | ||
4198 | err = -ENOMEM; | ||
4199 | goto err_unlock; | ||
4383 | } | 4200 | } |
4384 | 4201 | ||
4385 | /* Grab a reference on the superblock so the hierarchy doesn't | 4202 | /* Grab a reference on the superblock so the hierarchy doesn't |
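[Editor's note] The reserve-then-publish idr pattern used here is worth spelling out; an illustrative helper (not part of the patch, names hypothetical):

	/*
	 * Reserve an id with a NULL pointer so concurrent idr_find() callers
	 * never see a half-initialized object, then publish with idr_replace()
	 * once it is fully live - as cgroup_create() does with cgrp->id above
	 * and the idr_replace() further below.
	 */
	static int example_reserve_publish(struct idr *idr, void *obj)
	{
		int id = idr_alloc(idr, NULL, 1, 0, GFP_KERNEL);	/* reserve */

		if (id < 0)
			return id;
		/* ... finish initializing @obj ... */
		idr_replace(idr, obj, id);				/* publish */
		return id;
	}

Moving the idr_alloc() after cgroup_lock_live_group() also lets the error path release the id while still holding cgroup_mutex.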
@@ -4404,23 +4221,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4404 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) | 4221 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) |
4405 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4222 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
4406 | 4223 | ||
4407 | for_each_root_subsys(root, ss) { | ||
4408 | struct cgroup_subsys_state *css; | ||
4409 | |||
4410 | css = ss->css_alloc(cgroup_css(parent, ss)); | ||
4411 | if (IS_ERR(css)) { | ||
4412 | err = PTR_ERR(css); | ||
4413 | goto err_free_all; | ||
4414 | } | ||
4415 | css_ar[ss->subsys_id] = css; | ||
4416 | |||
4417 | err = percpu_ref_init(&css->refcnt, css_release); | ||
4418 | if (err) | ||
4419 | goto err_free_all; | ||
4420 | |||
4421 | init_css(css, ss, cgrp); | ||
4422 | } | ||
4423 | |||
4424 | /* | 4224 | /* |
4425 | * Create directory. cgroup_create_file() returns with the new | 4225 | * Create directory. cgroup_create_file() returns with the new |
4426 | * directory locked on success so that it can be populated without | 4226 | * directory locked on success so that it can be populated without |
@@ -4428,7 +4228,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4428 | */ | 4228 | */ |
4429 | err = cgroup_create_file(dentry, S_IFDIR | mode, sb); | 4229 | err = cgroup_create_file(dentry, S_IFDIR | mode, sb); |
4430 | if (err < 0) | 4230 | if (err < 0) |
4431 | goto err_free_all; | 4231 | goto err_free_id; |
4432 | lockdep_assert_held(&dentry->d_inode->i_mutex); | 4232 | lockdep_assert_held(&dentry->d_inode->i_mutex); |
4433 | 4233 | ||
4434 | cgrp->serial_nr = cgroup_serial_nr_next++; | 4234 | cgrp->serial_nr = cgroup_serial_nr_next++; |
@@ -4440,60 +4240,36 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4440 | /* hold a ref to the parent's dentry */ | 4240 | /* hold a ref to the parent's dentry */ |
4441 | dget(parent->dentry); | 4241 | dget(parent->dentry); |
4442 | 4242 | ||
4443 | /* creation succeeded, notify subsystems */ | 4243 | /* |
4444 | for_each_root_subsys(root, ss) { | 4244 | * @cgrp is now fully operational. If something fails after this |
4445 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | 4245 | * point, it'll be released via the normal destruction path. |
4446 | 4246 | */ | |
4447 | err = online_css(css); | ||
4448 | if (err) | ||
4449 | goto err_destroy; | ||
4450 | |||
4451 | /* each css holds a ref to the cgroup's dentry and parent css */ | ||
4452 | dget(dentry); | ||
4453 | css_get(css->parent); | ||
4454 | |||
4455 | /* mark it consumed for error path */ | ||
4456 | css_ar[ss->subsys_id] = NULL; | ||
4457 | |||
4458 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | ||
4459 | parent->parent) { | ||
4460 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | ||
4461 | current->comm, current->pid, ss->name); | ||
4462 | if (!strcmp(ss->name, "memory")) | ||
4463 | pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); | ||
4464 | ss->warned_broken_hierarchy = true; | ||
4465 | } | ||
4466 | } | ||
4467 | |||
4468 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); | 4247 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
4469 | 4248 | ||
4470 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); | 4249 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); |
4471 | if (err) | 4250 | if (err) |
4472 | goto err_destroy; | 4251 | goto err_destroy; |
4473 | 4252 | ||
4474 | err = cgroup_populate_dir(cgrp, root->subsys_mask); | 4253 | /* let's create and online css's */ |
4475 | if (err) | 4254 | for_each_subsys(ss, ssid) { |
4476 | goto err_destroy; | 4255 | if (root->subsys_mask & (1 << ssid)) { |
4256 | err = create_css(cgrp, ss); | ||
4257 | if (err) | ||
4258 | goto err_destroy; | ||
4259 | } | ||
4260 | } | ||
4477 | 4261 | ||
4478 | mutex_unlock(&cgroup_mutex); | 4262 | mutex_unlock(&cgroup_mutex); |
4479 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 4263 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
4480 | 4264 | ||
4481 | return 0; | 4265 | return 0; |
4482 | 4266 | ||
4483 | err_free_all: | ||
4484 | for_each_root_subsys(root, ss) { | ||
4485 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4486 | |||
4487 | if (css) { | ||
4488 | percpu_ref_cancel_init(&css->refcnt); | ||
4489 | ss->css_free(css); | ||
4490 | } | ||
4491 | } | ||
4492 | mutex_unlock(&cgroup_mutex); | ||
4493 | /* Release the reference count that we took on the superblock */ | ||
4494 | deactivate_super(sb); | ||
4495 | err_free_id: | 4267 | err_free_id: |
4496 | idr_remove(&root->cgroup_idr, cgrp->id); | 4268 | idr_remove(&root->cgroup_idr, cgrp->id); |
4269 | /* Release the reference count that we took on the superblock */ | ||
4270 | deactivate_super(sb); | ||
4271 | err_unlock: | ||
4272 | mutex_unlock(&cgroup_mutex); | ||
4497 | err_free_name: | 4273 | err_free_name: |
4498 | kfree(rcu_dereference_raw(cgrp->name)); | 4274 | kfree(rcu_dereference_raw(cgrp->name)); |
4499 | err_free_cgrp: | 4275 | err_free_cgrp: |
@@ -4501,14 +4277,6 @@ err_free_cgrp: | |||
4501 | return err; | 4277 | return err; |
4502 | 4278 | ||
4503 | err_destroy: | 4279 | err_destroy: |
4504 | for_each_root_subsys(root, ss) { | ||
4505 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4506 | |||
4507 | if (css) { | ||
4508 | percpu_ref_cancel_init(&css->refcnt); | ||
4509 | ss->css_free(css); | ||
4510 | } | ||
4511 | } | ||
4512 | cgroup_destroy_locked(cgrp); | 4280 | cgroup_destroy_locked(cgrp); |
4513 | mutex_unlock(&cgroup_mutex); | 4281 | mutex_unlock(&cgroup_mutex); |
4514 | mutex_unlock(&dentry->d_inode->i_mutex); | 4282 | mutex_unlock(&dentry->d_inode->i_mutex); |
@@ -4631,10 +4399,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4631 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4399 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4632 | { | 4400 | { |
4633 | struct dentry *d = cgrp->dentry; | 4401 | struct dentry *d = cgrp->dentry; |
4634 | struct cgroup_event *event, *tmp; | 4402 | struct cgroup_subsys_state *css; |
4635 | struct cgroup_subsys *ss; | ||
4636 | struct cgroup *child; | 4403 | struct cgroup *child; |
4637 | bool empty; | 4404 | bool empty; |
4405 | int ssid; | ||
4638 | 4406 | ||
4639 | lockdep_assert_held(&d->d_inode->i_mutex); | 4407 | lockdep_assert_held(&d->d_inode->i_mutex); |
4640 | lockdep_assert_held(&cgroup_mutex); | 4408 | lockdep_assert_held(&cgroup_mutex); |
@@ -4670,12 +4438,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4670 | * will be invoked to perform the rest of destruction once the | 4438 | * will be invoked to perform the rest of destruction once the |
4671 | * percpu refs of all css's are confirmed to be killed. | 4439 | * percpu refs of all css's are confirmed to be killed. |
4672 | */ | 4440 | */ |
4673 | for_each_root_subsys(cgrp->root, ss) { | 4441 | for_each_css(css, ssid, cgrp) |
4674 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | 4442 | kill_css(css); |
4675 | |||
4676 | if (css) | ||
4677 | kill_css(css); | ||
4678 | } | ||
4679 | 4443 | ||
4680 | /* | 4444 | /* |
4681 | * Mark @cgrp dead. This prevents further task migration and child | 4445 | * Mark @cgrp dead. This prevents further task migration and child |
@@ -4710,18 +4474,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4710 | dget(d); | 4474 | dget(d); |
4711 | cgroup_d_remove_dir(d); | 4475 | cgroup_d_remove_dir(d); |
4712 | 4476 | ||
4713 | /* | ||
4714 | * Unregister events and notify userspace. | ||
4715 | * Notify userspace about cgroup removing only after rmdir of cgroup | ||
4716 | * directory to avoid race between userspace and kernelspace. | ||
4717 | */ | ||
4718 | spin_lock(&cgrp->event_list_lock); | ||
4719 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | ||
4720 | list_del_init(&event->list); | ||
4721 | schedule_work(&event->remove); | ||
4722 | } | ||
4723 | spin_unlock(&cgrp->event_list_lock); | ||
4724 | |||
4725 | return 0; | 4477 | return 0; |
4726 | }; | 4478 | }; |
4727 | 4479 | ||
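[Editor's note] for_each_css() comes from a companion patch in this series; its assumed definition iterates every populated css on a cgroup, with the RCU dereference checked against cgroup_mutex:

	#define for_each_css(css, ssid, cgrp)					\
		for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
			if (!((css) = rcu_dereference_check(			\
					(cgrp)->subsys[(ssid)],			\
					lockdep_is_held(&cgroup_mutex)))) { }	\
			else

Unlike the removed for_each_root_subsys() loop, this skips NULL slots automatically, so the explicit "if (css)" test is no longer needed.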
@@ -4792,7 +4544,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4792 | cgroup_init_cftsets(ss); | 4544 | cgroup_init_cftsets(ss); |
4793 | 4545 | ||
4794 | /* Create the top cgroup state for this subsystem */ | 4546 | /* Create the top cgroup state for this subsystem */ |
4795 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); | ||
4796 | ss->root = &cgroup_dummy_root; | 4547 | ss->root = &cgroup_dummy_root; |
4797 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); | 4548 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); |
4798 | /* We don't handle early failures gracefully */ | 4549 | /* We don't handle early failures gracefully */ |
@@ -4866,6 +4617,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4866 | cgroup_init_cftsets(ss); | 4617 | cgroup_init_cftsets(ss); |
4867 | 4618 | ||
4868 | mutex_lock(&cgroup_mutex); | 4619 | mutex_lock(&cgroup_mutex); |
4620 | mutex_lock(&cgroup_root_mutex); | ||
4869 | cgroup_subsys[ss->subsys_id] = ss; | 4621 | cgroup_subsys[ss->subsys_id] = ss; |
4870 | 4622 | ||
4871 | /* | 4623 | /* |
@@ -4877,11 +4629,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4877 | if (IS_ERR(css)) { | 4629 | if (IS_ERR(css)) { |
4878 | /* failure case - need to deassign the cgroup_subsys[] slot. */ | 4630 | /* failure case - need to deassign the cgroup_subsys[] slot. */ |
4879 | cgroup_subsys[ss->subsys_id] = NULL; | 4631 | cgroup_subsys[ss->subsys_id] = NULL; |
4632 | mutex_unlock(&cgroup_root_mutex); | ||
4880 | mutex_unlock(&cgroup_mutex); | 4633 | mutex_unlock(&cgroup_mutex); |
4881 | return PTR_ERR(css); | 4634 | return PTR_ERR(css); |
4882 | } | 4635 | } |
4883 | 4636 | ||
4884 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); | ||
4885 | ss->root = &cgroup_dummy_root; | 4637 | ss->root = &cgroup_dummy_root; |
4886 | 4638 | ||
4887 | /* our new subsystem will be attached to the dummy hierarchy. */ | 4639 | /* our new subsystem will be attached to the dummy hierarchy. */ |
@@ -4911,14 +4663,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4911 | write_unlock(&css_set_lock); | 4663 | write_unlock(&css_set_lock); |
4912 | 4664 | ||
4913 | ret = online_css(css); | 4665 | ret = online_css(css); |
4914 | if (ret) | 4666 | if (ret) { |
4667 | ss->css_free(css); | ||
4915 | goto err_unload; | 4668 | goto err_unload; |
4669 | } | ||
4916 | 4670 | ||
4917 | /* success! */ | 4671 | /* success! */ |
4672 | mutex_unlock(&cgroup_root_mutex); | ||
4918 | mutex_unlock(&cgroup_mutex); | 4673 | mutex_unlock(&cgroup_mutex); |
4919 | return 0; | 4674 | return 0; |
4920 | 4675 | ||
4921 | err_unload: | 4676 | err_unload: |
4677 | mutex_unlock(&cgroup_root_mutex); | ||
4922 | mutex_unlock(&cgroup_mutex); | 4678 | mutex_unlock(&cgroup_mutex); |
4923 | /* @ss can't be mounted here as try_module_get() would fail */ | 4679 | /* @ss can't be mounted here as try_module_get() would fail */ |
4924 | cgroup_unload_subsys(ss); | 4680 | cgroup_unload_subsys(ss); |
@@ -4937,6 +4693,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); | |||
4937 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | 4693 | void cgroup_unload_subsys(struct cgroup_subsys *ss) |
4938 | { | 4694 | { |
4939 | struct cgrp_cset_link *link; | 4695 | struct cgrp_cset_link *link; |
4696 | struct cgroup_subsys_state *css; | ||
4940 | 4697 | ||
4941 | BUG_ON(ss->module == NULL); | 4698 | BUG_ON(ss->module == NULL); |
4942 | 4699 | ||
@@ -4948,15 +4705,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4948 | BUG_ON(ss->root != &cgroup_dummy_root); | 4705 | BUG_ON(ss->root != &cgroup_dummy_root); |
4949 | 4706 | ||
4950 | mutex_lock(&cgroup_mutex); | 4707 | mutex_lock(&cgroup_mutex); |
4708 | mutex_lock(&cgroup_root_mutex); | ||
4951 | 4709 | ||
4952 | offline_css(cgroup_css(cgroup_dummy_top, ss)); | 4710 | css = cgroup_css(cgroup_dummy_top, ss); |
4711 | if (css) | ||
4712 | offline_css(css); | ||
4953 | 4713 | ||
4954 | /* deassign the subsys_id */ | 4714 | /* deassign the subsys_id */ |
4955 | cgroup_subsys[ss->subsys_id] = NULL; | 4715 | cgroup_subsys[ss->subsys_id] = NULL; |
4956 | 4716 | ||
4957 | /* remove subsystem from the dummy root's list of subsystems */ | ||
4958 | list_del_init(&ss->sibling); | ||
4959 | |||
4960 | /* | 4717 | /* |
4961 | * disentangle the css from all css_sets attached to the dummy | 4718 | * disentangle the css from all css_sets attached to the dummy |
4962 | * top. as in loading, we need to pay our respects to the hashtable | 4719 | * top. as in loading, we need to pay our respects to the hashtable |
@@ -4979,9 +4736,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4979 | * need to free before marking as null because ss->css_free needs | 4736 | * need to free before marking as null because ss->css_free needs |
4980 | * the cgrp->subsys pointer to find their state. | 4737 | * the cgrp->subsys pointer to find their state. |
4981 | */ | 4738 | */ |
4982 | ss->css_free(cgroup_css(cgroup_dummy_top, ss)); | 4739 | if (css) |
4740 | ss->css_free(css); | ||
4983 | RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); | 4741 | RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); |
4984 | 4742 | ||
4743 | mutex_unlock(&cgroup_root_mutex); | ||
4985 | mutex_unlock(&cgroup_mutex); | 4744 | mutex_unlock(&cgroup_mutex); |
4986 | } | 4745 | } |
4987 | EXPORT_SYMBOL_GPL(cgroup_unload_subsys); | 4746 | EXPORT_SYMBOL_GPL(cgroup_unload_subsys); |
@@ -5100,6 +4859,15 @@ static int __init cgroup_wq_init(void) | |||
5100 | */ | 4859 | */ |
5101 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); | 4860 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); |
5102 | BUG_ON(!cgroup_destroy_wq); | 4861 | BUG_ON(!cgroup_destroy_wq); |
4862 | |||
4863 | /* | ||
4864 | * Used to destroy pidlists; kept separate so it can serve as a flush domain. ||
4865 | * Cap @max_active to 1 too. | ||
4866 | */ | ||
4867 | cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", | ||
4868 | 0, 1); | ||
4869 | BUG_ON(!cgroup_pidlist_destroy_wq); | ||
4870 | |||
5103 | return 0; | 4871 | return 0; |
5104 | } | 4872 | } |
5105 | core_initcall(cgroup_wq_init); | 4873 | core_initcall(cgroup_wq_init); |
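[Editor's note] Why a dedicated queue: cgroup destruction must wait for all pending pidlist destruction work, and flushing a shared workqueue would also wait on unrelated work items. A sketch of the flush-side user, assuming the form it takes elsewhere in this patch:

	static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
	{
		struct cgroup_pidlist *l, *tmp_l;

		mutex_lock(&cgrp->pidlist_mutex);
		/* expedite every lingering pidlist's destroy work */
		list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
			mod_delayed_work(cgroup_pidlist_destroy_wq,
					 &l->destroy_dwork, 0);
		mutex_unlock(&cgrp->pidlist_mutex);

		/* the wq is the flush domain: only pidlist work is waited for */
		flush_workqueue(cgroup_pidlist_destroy_wq);
		BUG_ON(!list_empty(&cgrp->pidlists));
	}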
@@ -5143,11 +4911,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
5143 | for_each_active_root(root) { | 4911 | for_each_active_root(root) { |
5144 | struct cgroup_subsys *ss; | 4912 | struct cgroup_subsys *ss; |
5145 | struct cgroup *cgrp; | 4913 | struct cgroup *cgrp; |
5146 | int count = 0; | 4914 | int ssid, count = 0; |
5147 | 4915 | ||
5148 | seq_printf(m, "%d:", root->hierarchy_id); | 4916 | seq_printf(m, "%d:", root->hierarchy_id); |
5149 | for_each_root_subsys(root, ss) | 4917 | for_each_subsys(ss, ssid) |
5150 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 4918 | if (root->subsys_mask & (1 << ssid)) |
4919 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | ||
5151 | if (strlen(root->name)) | 4920 | if (strlen(root->name)) |
5152 | seq_printf(m, "%sname=%s", count ? "," : "", | 4921 | seq_printf(m, "%sname=%s", count ? "," : "", |
5153 | root->name); | 4922 | root->name); |
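[Editor's note] for_each_root_subsys() walked root->subsys_list, which this series removes; iteration now pairs for_each_subsys() with a root->subsys_mask test as above. Modulo its locking assertion, the macro's assumed shape simply skips unregistered (modular) subsystems:

	#define for_each_subsys(ss, ssid)					\
		for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
			if (!((ss) = cgroup_subsys[(ssid)])) { }		\
			else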
@@ -5488,16 +5257,16 @@ __setup("cgroup_disable=", cgroup_disable); | |||
5488 | * @dentry: directory dentry of interest | 5257 | * @dentry: directory dentry of interest |
5489 | * @ss: subsystem of interest | 5258 | * @ss: subsystem of interest |
5490 | * | 5259 | * |
5491 | * Must be called under RCU read lock. The caller is responsible for | 5260 | * Must be called under cgroup_mutex or RCU read lock. The caller is |
5492 | * pinning the returned css if it needs to be accessed outside the RCU | 5261 | * responsible for pinning the returned css if it needs to be accessed |
5493 | * critical section. | 5262 | * outside the critical section. |
5494 | */ | 5263 | */ |
5495 | struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, | 5264 | struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, |
5496 | struct cgroup_subsys *ss) | 5265 | struct cgroup_subsys *ss) |
5497 | { | 5266 | { |
5498 | struct cgroup *cgrp; | 5267 | struct cgroup *cgrp; |
5499 | 5268 | ||
5500 | WARN_ON_ONCE(!rcu_read_lock_held()); | 5269 | cgroup_assert_mutex_or_rcu_locked(); |
5501 | 5270 | ||
5502 | /* is @dentry a cgroup dir? */ | 5271 | /* is @dentry a cgroup dir? */ |
5503 | if (!dentry->d_inode || | 5272 | if (!dentry->d_inode || |
@@ -5520,9 +5289,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | |||
5520 | { | 5289 | { |
5521 | struct cgroup *cgrp; | 5290 | struct cgroup *cgrp; |
5522 | 5291 | ||
5523 | rcu_lockdep_assert(rcu_read_lock_held() || | 5292 | cgroup_assert_mutex_or_rcu_locked(); |
5524 | lockdep_is_held(&cgroup_mutex), | ||
5525 | "css_from_id() needs proper protection"); | ||
5526 | 5293 | ||
5527 | cgrp = idr_find(&ss->root->cgroup_idr, id); | 5294 | cgrp = idr_find(&ss->root->cgroup_idr, id); |
5528 | if (cgrp) | 5295 | if (cgrp) |
@@ -5570,9 +5337,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, | |||
5570 | return count; | 5337 | return count; |
5571 | } | 5338 | } |
5572 | 5339 | ||
5573 | static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, | 5340 | static int current_css_set_cg_links_read(struct seq_file *seq, void *v) |
5574 | struct cftype *cft, | ||
5575 | struct seq_file *seq) | ||
5576 | { | 5341 | { |
5577 | struct cgrp_cset_link *link; | 5342 | struct cgrp_cset_link *link; |
5578 | struct css_set *cset; | 5343 | struct css_set *cset; |
@@ -5597,9 +5362,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, | |||
5597 | } | 5362 | } |
5598 | 5363 | ||
5599 | #define MAX_TASKS_SHOWN_PER_CSS 25 | 5364 | #define MAX_TASKS_SHOWN_PER_CSS 25 |
5600 | static int cgroup_css_links_read(struct cgroup_subsys_state *css, | 5365 | static int cgroup_css_links_read(struct seq_file *seq, void *v) |
5601 | struct cftype *cft, struct seq_file *seq) | ||
5602 | { | 5366 | { |
5367 | struct cgroup_subsys_state *css = seq_css(seq); | ||
5603 | struct cgrp_cset_link *link; | 5368 | struct cgrp_cset_link *link; |
5604 | 5369 | ||
5605 | read_lock(&css_set_lock); | 5370 | read_lock(&css_set_lock); |
@@ -5645,12 +5410,12 @@ static struct cftype debug_files[] = { | |||
5645 | 5410 | ||
5646 | { | 5411 | { |
5647 | .name = "current_css_set_cg_links", | 5412 | .name = "current_css_set_cg_links", |
5648 | .read_seq_string = current_css_set_cg_links_read, | 5413 | .seq_show = current_css_set_cg_links_read, |
5649 | }, | 5414 | }, |
5650 | 5415 | ||
5651 | { | 5416 | { |
5652 | .name = "cgroup_css_links", | 5417 | .name = "cgroup_css_links", |
5653 | .read_seq_string = cgroup_css_links_read, | 5418 | .seq_show = cgroup_css_links_read, |
5654 | }, | 5419 | }, |
5655 | 5420 | ||
5656 | { | 5421 | { |