diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-26 22:50:04 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-26 22:50:04 -0400 |
commit | bbe179f88d39274630823a0dc07d2714fd19a103 (patch) | |
tree | f70181a660e0f859f230233643faded7d44360e5 | |
parent | 4b703b1d4c46ca4a00109ca1a391943ec21991b3 (diff) | |
parent | 8a0792ef8e01f03cb43806c6a87738bde34df713 (diff) |
Merge branch 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- threadgroup_lock got reorganized so that its users can pick the
actual locking mechanism to use. Its only user - cgroups - is
updated to use a percpu_rwsem instead of per-process rwsem.
This makes things a bit lighter on hot paths and allows cgroups to
perform and fail multi-task (a process) migrations atomically.
Multi-task migrations are used in several places including the
unified hierarchy.
- Delegation rule and documentation added to unified hierarchy. This
will likely be the last interface update from the cgroup core side
for unified hierarchy before lifting the devel mask.
- Some groundwork for the pids controller which is scheduled to be
merged in the coming devel cycle.
* 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cgroup: add delegation section to unified hierarchy documentation
cgroup: require write perm on common ancestor when moving processes on the default hierarchy
cgroup: separate out cgroup_procs_write_permission() from __cgroup_procs_write()
kernfs: make kernfs_get_inode() public
MAINTAINERS: add a cgroup core co-maintainer
cgroup: fix uninitialised iterator in for_each_subsys_which
cgroup: replace explicit ss_mask checking with for_each_subsys_which
cgroup: use bitmask to filter for_each_subsys
cgroup: add seq_file forward declaration for struct cftype
cgroup: simplify threadgroup locking
sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem
sched, cgroup: reorganize threadgroup locking
cgroup: switch to unsigned long for bitmasks
cgroup: reorganize include/linux/cgroup.h
cgroup: separate out include/linux/cgroup-defs.h
cgroup: fix some comment typos
-rw-r--r-- | Documentation/cgroups/unified-hierarchy.txt | 102 | ||||
-rw-r--r-- | MAINTAINERS | 1 | ||||
-rw-r--r-- | fs/kernfs/kernfs-internal.h | 1 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 501 | ||||
-rw-r--r-- | include/linux/cgroup.h | 1007 | ||||
-rw-r--r-- | include/linux/init_task.h | 8 | ||||
-rw-r--r-- | include/linux/kernfs.h | 5 | ||||
-rw-r--r-- | include/linux/sched.h | 65 | ||||
-rw-r--r-- | init/Kconfig | 1 | ||||
-rw-r--r-- | kernel/cgroup.c | 273 | ||||
-rw-r--r-- | kernel/fork.c | 4 |
11 files changed, 1028 insertions, 940 deletions
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index eb102fb72213..86847a7647ab 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt | |||
@@ -17,15 +17,18 @@ CONTENTS | |||
17 | 3. Structural Constraints | 17 | 3. Structural Constraints |
18 | 3-1. Top-down | 18 | 3-1. Top-down |
19 | 3-2. No internal tasks | 19 | 3-2. No internal tasks |
20 | 4. Other Changes | 20 | 4. Delegation |
21 | 4-1. [Un]populated Notification | 21 | 4-1. Model of delegation |
22 | 4-2. Other Core Changes | 22 | 4-2. Common ancestor rule |
23 | 4-3. Per-Controller Changes | 23 | 5. Other Changes |
24 | 4-3-1. blkio | 24 | 5-1. [Un]populated Notification |
25 | 4-3-2. cpuset | 25 | 5-2. Other Core Changes |
26 | 4-3-3. memory | 26 | 5-3. Per-Controller Changes |
27 | 5. Planned Changes | 27 | 5-3-1. blkio |
28 | 5-1. CAP for resource control | 28 | 5-3-2. cpuset |
29 | 5-3-3. memory | ||
30 | 6. Planned Changes | ||
31 | 6-1. CAP for resource control | ||
29 | 32 | ||
30 | 33 | ||
31 | 1. Background | 34 | 1. Background |
@@ -245,9 +248,72 @@ cgroup must create children and transfer all its tasks to the children | |||
245 | before enabling controllers in its "cgroup.subtree_control" file. | 248 | before enabling controllers in its "cgroup.subtree_control" file. |
246 | 249 | ||
247 | 250 | ||
248 | 4. Other Changes | 251 | 4. Delegation |
249 | 252 | ||
250 | 4-1. [Un]populated Notification | 253 | 4-1. Model of delegation |
254 | |||
255 | A cgroup can be delegated to a less privileged user by granting write | ||
256 | access of the directory and its "cgroup.procs" file to the user. Note | ||
257 | that the resource control knobs in a given directory concern the | ||
258 | resources of the parent and thus must not be delegated along with the | ||
259 | directory. | ||
260 | |||
261 | Once delegated, the user can build a sub-hierarchy under the directory, | ||
262 | organize processes as it sees fit and further distribute the resources | ||
263 | it got from the parent. The limits and other settings of all resource | ||
264 | controllers are hierarchical and regardless of what happens in the | ||
265 | delegated sub-hierarchy, nothing can escape the resource restrictions | ||
266 | imposed by the parent. | ||
267 | |||
268 | Currently, cgroup doesn't impose any restrictions on the number of | ||
269 | cgroups in or nesting depth of a delegated sub-hierarchy; however, | ||
270 | this may in the future be limited explicitly. | ||
271 | |||
272 | |||
273 | 4-2. Common ancestor rule | ||
274 | |||
275 | On the unified hierarchy, to write to a "cgroup.procs" file, in | ||
276 | addition to the usual write permission to the file and uid match, the | ||
277 | writer must also have write access to the "cgroup.procs" file of the | ||
278 | common ancestor of the source and destination cgroups. This prevents | ||
279 | delegatees from smuggling processes across disjoint sub-hierarchies. | ||
280 | |||
281 | Let's say cgroups C0 and C1 have been delegated to user U0 who created | ||
282 | C00, C01 under C0 and C10 under C1 as follows. | ||
283 | |||
284 | ~~~~~~~~~~~~~ - C0 - C00 | ||
285 | ~ cgroup ~ \ C01 | ||
286 | ~ hierarchy ~ | ||
287 | ~~~~~~~~~~~~~ - C1 - C10 | ||
288 | |||
289 | C0 and C1 are separate entities in terms of resource distribution | ||
290 | regardless of their relative positions in the hierarchy. The | ||
291 | resources the processes under C0 are entitled to are controlled by | ||
292 | C0's ancestors and may be completely different from C1. It's clear | ||
293 | that the intention of delegating C0 to U0 is allowing U0 to organize | ||
294 | the processes under C0 and further control the distribution of C0's | ||
295 | resources. | ||
296 | |||
297 | On traditional hierarchies, if a task has write access to "tasks" or | ||
298 | "cgroup.procs" file of a cgroup and its uid agrees with the target, it | ||
299 | can move the target to the cgroup. In the above example, U0 will not | ||
300 | only be able to move processes in each sub-hierarchy but also across | ||
301 | the two sub-hierarchies, effectively allowing it to violate the | ||
302 | organizational and resource restrictions implied by the hierarchical | ||
303 | structure above C0 and C1. | ||
304 | |||
305 | On the unified hierarchy, let's say U0 wants to write the pid of a | ||
306 | process which has a matching uid and is currently in C10 into | ||
307 | "C00/cgroup.procs". U0 obviously has write access to the file and | ||
308 | migration permission on the process; however, the common ancestor of | ||
309 | the source cgroup C10 and the destination cgroup C00 is above the | ||
310 | points of delegation and U0 would not have write access to its | ||
311 | "cgroup.procs" and thus the write would be denied with -EACCES. | ||
312 | |||
313 | |||
314 | 5. Other Changes | ||
315 | |||
316 | 5-1. [Un]populated Notification | ||
251 | 317 | ||
252 | cgroup users often need a way to determine when a cgroup's | 318 | cgroup users often need a way to determine when a cgroup's |
253 | subhierarchy becomes empty so that it can be cleaned up. cgroup | 319 | subhierarchy becomes empty so that it can be cleaned up. cgroup |
@@ -289,7 +355,7 @@ supported and the interface files "release_agent" and | |||
289 | "notify_on_release" do not exist. | 355 | "notify_on_release" do not exist. |
290 | 356 | ||
291 | 357 | ||
292 | 4-2. Other Core Changes | 358 | 5-2. Other Core Changes |
293 | 359 | ||
294 | - None of the mount options is allowed. | 360 | - None of the mount options is allowed. |
295 | 361 | ||
@@ -306,14 +372,14 @@ supported and the interface files "release_agent" and | |||
306 | - The "cgroup.clone_children" file is removed. | 372 | - The "cgroup.clone_children" file is removed. |
307 | 373 | ||
308 | 374 | ||
309 | 4-3. Per-Controller Changes | 375 | 5-3. Per-Controller Changes |
310 | 376 | ||
311 | 4-3-1. blkio | 377 | 5-3-1. blkio |
312 | 378 | ||
313 | - blk-throttle becomes properly hierarchical. | 379 | - blk-throttle becomes properly hierarchical. |
314 | 380 | ||
315 | 381 | ||
316 | 4-3-2. cpuset | 382 | 5-3-2. cpuset |
317 | 383 | ||
318 | - Tasks are kept in empty cpusets after hotplug and take on the masks | 384 | - Tasks are kept in empty cpusets after hotplug and take on the masks |
319 | of the nearest non-empty ancestor, instead of being moved to it. | 385 | of the nearest non-empty ancestor, instead of being moved to it. |
@@ -322,7 +388,7 @@ supported and the interface files "release_agent" and | |||
322 | masks of the nearest non-empty ancestor. | 388 | masks of the nearest non-empty ancestor. |
323 | 389 | ||
324 | 390 | ||
325 | 4-3-3. memory | 391 | 5-3-3. memory |
326 | 392 | ||
327 | - use_hierarchy is on by default and the cgroup file for the flag is | 393 | - use_hierarchy is on by default and the cgroup file for the flag is |
328 | not created. | 394 | not created. |
@@ -407,9 +473,9 @@ supported and the interface files "release_agent" and | |||
407 | memory.low, memory.high, and memory.max will use the string "max" to | 473 | memory.low, memory.high, and memory.max will use the string "max" to |
408 | indicate and set the highest possible value. | 474 | indicate and set the highest possible value. |
409 | 475 | ||
410 | 5. Planned Changes | 476 | 6. Planned Changes |
411 | 477 | ||
412 | 5-1. CAP for resource control | 478 | 6-1. CAP for resource control |
413 | 479 | ||
414 | Unified hierarchy will require one of the capabilities(7), which is | 480 | Unified hierarchy will require one of the capabilities(7), which is |
415 | yet to be decided, for all resource control related knobs. Process | 481 | yet to be decided, for all resource control related knobs. Process |
diff --git a/MAINTAINERS b/MAINTAINERS index 68457d869b61..c54a67434048 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -2816,6 +2816,7 @@ F: drivers/connector/ | |||
2816 | CONTROL GROUP (CGROUP) | 2816 | CONTROL GROUP (CGROUP) |
2817 | M: Tejun Heo <tj@kernel.org> | 2817 | M: Tejun Heo <tj@kernel.org> |
2818 | M: Li Zefan <lizefan@huawei.com> | 2818 | M: Li Zefan <lizefan@huawei.com> |
2819 | M: Johannes Weiner <hannes@cmpxchg.org> | ||
2819 | L: cgroups@vger.kernel.org | 2820 | L: cgroups@vger.kernel.org |
2820 | T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git | 2821 | T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git |
2821 | S: Maintained | 2822 | S: Maintained |
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index af9fa7499919..6762bfbd8207 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h | |||
@@ -76,7 +76,6 @@ extern struct kmem_cache *kernfs_node_cache; | |||
76 | /* | 76 | /* |
77 | * inode.c | 77 | * inode.c |
78 | */ | 78 | */ |
79 | struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); | ||
80 | void kernfs_evict_inode(struct inode *inode); | 79 | void kernfs_evict_inode(struct inode *inode); |
81 | int kernfs_iop_permission(struct inode *inode, int mask); | 80 | int kernfs_iop_permission(struct inode *inode, int mask); |
82 | int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); | 81 | int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); |
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h new file mode 100644 index 000000000000..93755a629299 --- /dev/null +++ b/include/linux/cgroup-defs.h | |||
@@ -0,0 +1,501 @@ | |||
1 | /* | ||
2 | * linux/cgroup-defs.h - basic definitions for cgroup | ||
3 | * | ||
4 | * This file provides basic type and interface. Include this file directly | ||
5 | * only if necessary to avoid cyclic dependencies. | ||
6 | */ | ||
7 | #ifndef _LINUX_CGROUP_DEFS_H | ||
8 | #define _LINUX_CGROUP_DEFS_H | ||
9 | |||
10 | #include <linux/limits.h> | ||
11 | #include <linux/list.h> | ||
12 | #include <linux/idr.h> | ||
13 | #include <linux/wait.h> | ||
14 | #include <linux/mutex.h> | ||
15 | #include <linux/rcupdate.h> | ||
16 | #include <linux/percpu-refcount.h> | ||
17 | #include <linux/percpu-rwsem.h> | ||
18 | #include <linux/workqueue.h> | ||
19 | |||
20 | #ifdef CONFIG_CGROUPS | ||
21 | |||
22 | struct cgroup; | ||
23 | struct cgroup_root; | ||
24 | struct cgroup_subsys; | ||
25 | struct cgroup_taskset; | ||
26 | struct kernfs_node; | ||
27 | struct kernfs_ops; | ||
28 | struct kernfs_open_file; | ||
29 | struct seq_file; | ||
30 | |||
31 | #define MAX_CGROUP_TYPE_NAMELEN 32 | ||
32 | #define MAX_CGROUP_ROOT_NAMELEN 64 | ||
33 | #define MAX_CFTYPE_NAME 64 | ||
34 | |||
35 | /* define the enumeration of all cgroup subsystems */ | ||
36 | #define SUBSYS(_x) _x ## _cgrp_id, | ||
37 | enum cgroup_subsys_id { | ||
38 | #include <linux/cgroup_subsys.h> | ||
39 | CGROUP_SUBSYS_COUNT, | ||
40 | }; | ||
41 | #undef SUBSYS | ||
42 | |||
43 | /* bits in struct cgroup_subsys_state flags field */ | ||
44 | enum { | ||
45 | CSS_NO_REF = (1 << 0), /* no reference counting for this css */ | ||
46 | CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ | ||
47 | CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ | ||
48 | }; | ||
49 | |||
50 | /* bits in struct cgroup flags field */ | ||
51 | enum { | ||
52 | /* Control Group requires release notifications to userspace */ | ||
53 | CGRP_NOTIFY_ON_RELEASE, | ||
54 | /* | ||
55 | * Clone the parent's configuration when creating a new child | ||
56 | * cpuset cgroup. For historical reasons, this option can be | ||
57 | * specified at mount time and thus is implemented here. | ||
58 | */ | ||
59 | CGRP_CPUSET_CLONE_CHILDREN, | ||
60 | }; | ||
61 | |||
62 | /* cgroup_root->flags */ | ||
63 | enum { | ||
64 | CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ | ||
65 | CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ | ||
66 | CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ | ||
67 | }; | ||
68 | |||
69 | /* cftype->flags */ | ||
70 | enum { | ||
71 | CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ | ||
72 | CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ | ||
73 | CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ | ||
74 | |||
75 | /* internal flags, do not use outside cgroup core proper */ | ||
76 | __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ | ||
77 | __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ | ||
78 | }; | ||
79 | |||
80 | /* | ||
81 | * Per-subsystem/per-cgroup state maintained by the system. This is the | ||
82 | * fundamental structural building block that controllers deal with. | ||
83 | * | ||
84 | * Fields marked with "PI:" are public and immutable and may be accessed | ||
85 | * directly without synchronization. | ||
86 | */ | ||
87 | struct cgroup_subsys_state { | ||
88 | /* PI: the cgroup that this css is attached to */ | ||
89 | struct cgroup *cgroup; | ||
90 | |||
91 | /* PI: the cgroup subsystem that this css is attached to */ | ||
92 | struct cgroup_subsys *ss; | ||
93 | |||
94 | /* reference count - access via css_[try]get() and css_put() */ | ||
95 | struct percpu_ref refcnt; | ||
96 | |||
97 | /* PI: the parent css */ | ||
98 | struct cgroup_subsys_state *parent; | ||
99 | |||
100 | /* siblings list anchored at the parent's ->children */ | ||
101 | struct list_head sibling; | ||
102 | struct list_head children; | ||
103 | |||
104 | /* | ||
105 | * PI: Subsys-unique ID. 0 is unused and root is always 1. The | ||
106 | * matching css can be looked up using css_from_id(). | ||
107 | */ | ||
108 | int id; | ||
109 | |||
110 | unsigned int flags; | ||
111 | |||
112 | /* | ||
113 | * Monotonically increasing unique serial number which defines a | ||
114 | * uniform order among all csses. It's guaranteed that all | ||
115 | * ->children lists are in the ascending order of ->serial_nr and | ||
116 | * used to allow interrupting and resuming iterations. | ||
117 | */ | ||
118 | u64 serial_nr; | ||
119 | |||
120 | /* percpu_ref killing and RCU release */ | ||
121 | struct rcu_head rcu_head; | ||
122 | struct work_struct destroy_work; | ||
123 | }; | ||
124 | |||
125 | /* | ||
126 | * A css_set is a structure holding pointers to a set of | ||
127 | * cgroup_subsys_state objects. This saves space in the task struct | ||
128 | * object and speeds up fork()/exit(), since a single inc/dec and a | ||
129 | * list_add()/del() can bump the reference count on the entire cgroup | ||
130 | * set for a task. | ||
131 | */ | ||
132 | struct css_set { | ||
133 | /* Reference count */ | ||
134 | atomic_t refcount; | ||
135 | |||
136 | /* | ||
137 | * List running through all cgroup groups in the same hash | ||
138 | * slot. Protected by css_set_lock | ||
139 | */ | ||
140 | struct hlist_node hlist; | ||
141 | |||
142 | /* | ||
143 | * Lists running through all tasks using this cgroup group. | ||
144 | * mg_tasks lists tasks which belong to this cset but are in the | ||
145 | * process of being migrated out or in. Protected by | ||
146 | * css_set_rwsem, but, during migration, once tasks are moved to | ||
147 | * mg_tasks, it can be read safely while holding cgroup_mutex. | ||
148 | */ | ||
149 | struct list_head tasks; | ||
150 | struct list_head mg_tasks; | ||
151 | |||
152 | /* | ||
153 | * List of cgrp_cset_links pointing at cgroups referenced from this | ||
154 | * css_set. Protected by css_set_lock. | ||
155 | */ | ||
156 | struct list_head cgrp_links; | ||
157 | |||
158 | /* the default cgroup associated with this css_set */ | ||
159 | struct cgroup *dfl_cgrp; | ||
160 | |||
161 | /* | ||
162 | * Set of subsystem states, one for each subsystem. This array is | ||
163 | * immutable after creation apart from the init_css_set during | ||
164 | * subsystem registration (at boot time). | ||
165 | */ | ||
166 | struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; | ||
167 | |||
168 | /* | ||
169 | * List of csets participating in the on-going migration either as | ||
170 | * source or destination. Protected by cgroup_mutex. | ||
171 | */ | ||
172 | struct list_head mg_preload_node; | ||
173 | struct list_head mg_node; | ||
174 | |||
175 | /* | ||
176 | * If this cset is acting as the source of migration the following | ||
177 | * two fields are set. mg_src_cgrp is the source cgroup of the | ||
178 | * on-going migration and mg_dst_cset is the destination cset the | ||
179 | * target tasks on this cset should be migrated to. Protected by | ||
180 | * cgroup_mutex. | ||
181 | */ | ||
182 | struct cgroup *mg_src_cgrp; | ||
183 | struct css_set *mg_dst_cset; | ||
184 | |||
185 | /* | ||
186 | * On the default hierarchy, ->subsys[ssid] may point to a css | ||
187 | * attached to an ancestor instead of the cgroup this css_set is | ||
188 | * associated with. The following node is anchored at | ||
189 | * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to | ||
190 | * iterate through all css's attached to a given cgroup. | ||
191 | */ | ||
192 | struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; | ||
193 | |||
194 | /* For RCU-protected deletion */ | ||
195 | struct rcu_head rcu_head; | ||
196 | }; | ||
197 | |||
198 | struct cgroup { | ||
199 | /* self css with NULL ->ss, points back to this cgroup */ | ||
200 | struct cgroup_subsys_state self; | ||
201 | |||
202 | unsigned long flags; /* "unsigned long" so bitops work */ | ||
203 | |||
204 | /* | ||
205 | * idr allocated in-hierarchy ID. | ||
206 | * | ||
207 | * ID 0 is not used, the ID of the root cgroup is always 1, and a | ||
208 | * new cgroup will be assigned with the smallest available ID. | ||
209 | * | ||
210 | * Allocating/Removing ID must be protected by cgroup_mutex. | ||
211 | */ | ||
212 | int id; | ||
213 | |||
214 | /* | ||
215 | * If this cgroup contains any tasks, it contributes one to | ||
216 | * populated_cnt. All children with non-zero populated_cnt of | ||
217 | * their own contribute one. The count is zero iff there's no task | ||
218 | * in this cgroup or its subtree. | ||
219 | */ | ||
220 | int populated_cnt; | ||
221 | |||
222 | struct kernfs_node *kn; /* cgroup kernfs entry */ | ||
223 | struct kernfs_node *procs_kn; /* kn for "cgroup.procs" */ | ||
224 | struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ | ||
225 | |||
226 | /* | ||
227 | * The bitmask of subsystems enabled on the child cgroups. | ||
228 | * ->subtree_control is the one configured through | ||
229 | * "cgroup.subtree_control" while ->child_subsys_mask is the | ||
230 | * effective one which may have more subsystems enabled. | ||
231 | * Controller knobs are made available iff it's enabled in | ||
232 | * ->subtree_control. | ||
233 | */ | ||
234 | unsigned int subtree_control; | ||
235 | unsigned int child_subsys_mask; | ||
236 | |||
237 | /* Private pointers for each registered subsystem */ | ||
238 | struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; | ||
239 | |||
240 | struct cgroup_root *root; | ||
241 | |||
242 | /* | ||
243 | * List of cgrp_cset_links pointing at css_sets with tasks in this | ||
244 | * cgroup. Protected by css_set_lock. | ||
245 | */ | ||
246 | struct list_head cset_links; | ||
247 | |||
248 | /* | ||
249 | * On the default hierarchy, a css_set for a cgroup with some | ||
250 | * subsys disabled will point to css's which are associated with | ||
251 | * the closest ancestor which has the subsys enabled. The | ||
252 | * following lists all css_sets which point to this cgroup's css | ||
253 | * for the given subsystem. | ||
254 | */ | ||
255 | struct list_head e_csets[CGROUP_SUBSYS_COUNT]; | ||
256 | |||
257 | /* | ||
258 | * list of pidlists, up to two for each namespace (one for procs, one | ||
259 | * for tasks); created on demand. | ||
260 | */ | ||
261 | struct list_head pidlists; | ||
262 | struct mutex pidlist_mutex; | ||
263 | |||
264 | /* used to wait for offlining of csses */ | ||
265 | wait_queue_head_t offline_waitq; | ||
266 | |||
267 | /* used to schedule release agent */ | ||
268 | struct work_struct release_agent_work; | ||
269 | }; | ||
270 | |||
271 | /* | ||
272 | * A cgroup_root represents the root of a cgroup hierarchy, and may be | ||
273 | * associated with a kernfs_root to form an active hierarchy. This is | ||
274 | * internal to cgroup core. Don't access directly from controllers. | ||
275 | */ | ||
276 | struct cgroup_root { | ||
277 | struct kernfs_root *kf_root; | ||
278 | |||
279 | /* The bitmask of subsystems attached to this hierarchy */ | ||
280 | unsigned int subsys_mask; | ||
281 | |||
282 | /* Unique id for this hierarchy. */ | ||
283 | int hierarchy_id; | ||
284 | |||
285 | /* The root cgroup. Root is destroyed on its release. */ | ||
286 | struct cgroup cgrp; | ||
287 | |||
288 | /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ | ||
289 | atomic_t nr_cgrps; | ||
290 | |||
291 | /* A list running through the active hierarchies */ | ||
292 | struct list_head root_list; | ||
293 | |||
294 | /* Hierarchy-specific flags */ | ||
295 | unsigned int flags; | ||
296 | |||
297 | /* IDs for cgroups in this hierarchy */ | ||
298 | struct idr cgroup_idr; | ||
299 | |||
300 | /* The path to use for release notifications. */ | ||
301 | char release_agent_path[PATH_MAX]; | ||
302 | |||
303 | /* The name for this hierarchy - may be empty */ | ||
304 | char name[MAX_CGROUP_ROOT_NAMELEN]; | ||
305 | }; | ||
306 | |||
307 | /* | ||
308 | * struct cftype: handler definitions for cgroup control files | ||
309 | * | ||
310 | * When reading/writing to a file: | ||
311 | * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata | ||
312 | * - the 'cftype' of the file is file->f_path.dentry->d_fsdata | ||
313 | */ | ||
314 | struct cftype { | ||
315 | /* | ||
316 | * By convention, the name should begin with the name of the | ||
317 | * subsystem, followed by a period. Zero length string indicates | ||
318 | * end of cftype array. | ||
319 | */ | ||
320 | char name[MAX_CFTYPE_NAME]; | ||
321 | int private; | ||
322 | /* | ||
323 | * If not 0, file mode is set to this value, otherwise it will | ||
324 | * be figured out automatically | ||
325 | */ | ||
326 | umode_t mode; | ||
327 | |||
328 | /* | ||
329 | * The maximum length of string, excluding trailing nul, that can | ||
330 | * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. | ||
331 | */ | ||
332 | size_t max_write_len; | ||
333 | |||
334 | /* CFTYPE_* flags */ | ||
335 | unsigned int flags; | ||
336 | |||
337 | /* | ||
338 | * Fields used for internal bookkeeping. Initialized automatically | ||
339 | * during registration. | ||
340 | */ | ||
341 | struct cgroup_subsys *ss; /* NULL for cgroup core files */ | ||
342 | struct list_head node; /* anchored at ss->cfts */ | ||
343 | struct kernfs_ops *kf_ops; | ||
344 | |||
345 | /* | ||
346 | * read_u64() is a shortcut for the common case of returning a | ||
347 | * single integer. Use it in place of read() | ||
348 | */ | ||
349 | u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); | ||
350 | /* | ||
351 | * read_s64() is a signed version of read_u64() | ||
352 | */ | ||
353 | s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); | ||
354 | |||
355 | /* generic seq_file read interface */ | ||
356 | int (*seq_show)(struct seq_file *sf, void *v); | ||
357 | |||
358 | /* optional ops, implement all or none */ | ||
359 | void *(*seq_start)(struct seq_file *sf, loff_t *ppos); | ||
360 | void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); | ||
361 | void (*seq_stop)(struct seq_file *sf, void *v); | ||
362 | |||
363 | /* | ||
364 | * write_u64() is a shortcut for the common case of accepting | ||
365 | * a single integer (as parsed by simple_strtoull) from | ||
366 | * userspace. Use in place of write(); return 0 or error. | ||
367 | */ | ||
368 | int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, | ||
369 | u64 val); | ||
370 | /* | ||
371 | * write_s64() is a signed version of write_u64() | ||
372 | */ | ||
373 | int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, | ||
374 | s64 val); | ||
375 | |||
376 | /* | ||
377 | * write() is the generic write callback which maps directly to | ||
378 | * kernfs write operation and overrides all other operations. | ||
379 | * Maximum write size is determined by ->max_write_len. Use | ||
380 | * of_css/cft() to access the associated css and cft. | ||
381 | */ | ||
382 | ssize_t (*write)(struct kernfs_open_file *of, | ||
383 | char *buf, size_t nbytes, loff_t off); | ||
384 | |||
385 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
386 | struct lock_class_key lockdep_key; | ||
387 | #endif | ||
388 | }; | ||
389 | |||
390 | /* | ||
391 | * Control Group subsystem type. | ||
392 | * See Documentation/cgroups/cgroups.txt for details | ||
393 | */ | ||
394 | struct cgroup_subsys { | ||
395 | struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); | ||
396 | int (*css_online)(struct cgroup_subsys_state *css); | ||
397 | void (*css_offline)(struct cgroup_subsys_state *css); | ||
398 | void (*css_released)(struct cgroup_subsys_state *css); | ||
399 | void (*css_free)(struct cgroup_subsys_state *css); | ||
400 | void (*css_reset)(struct cgroup_subsys_state *css); | ||
401 | void (*css_e_css_changed)(struct cgroup_subsys_state *css); | ||
402 | |||
403 | int (*can_attach)(struct cgroup_subsys_state *css, | ||
404 | struct cgroup_taskset *tset); | ||
405 | void (*cancel_attach)(struct cgroup_subsys_state *css, | ||
406 | struct cgroup_taskset *tset); | ||
407 | void (*attach)(struct cgroup_subsys_state *css, | ||
408 | struct cgroup_taskset *tset); | ||
409 | void (*fork)(struct task_struct *task); | ||
410 | void (*exit)(struct cgroup_subsys_state *css, | ||
411 | struct cgroup_subsys_state *old_css, | ||
412 | struct task_struct *task); | ||
413 | void (*bind)(struct cgroup_subsys_state *root_css); | ||
414 | |||
415 | int disabled; | ||
416 | int early_init; | ||
417 | |||
418 | /* | ||
419 | * If %false, this subsystem is properly hierarchical - | ||
420 | * configuration, resource accounting and restriction on a parent | ||
421 | * cgroup cover those of its children. If %true, hierarchy support | ||
422 | * is broken in some ways - some subsystems ignore hierarchy | ||
423 | * completely while others are only implemented half-way. | ||
424 | * | ||
425 | * It's now disallowed to create nested cgroups if the subsystem is | ||
426 | * broken and cgroup core will emit a warning message on such | ||
427 | * cases. Eventually, all subsystems will be made properly | ||
428 | * hierarchical and this will go away. | ||
429 | */ | ||
430 | bool broken_hierarchy; | ||
431 | bool warned_broken_hierarchy; | ||
432 | |||
433 | /* the following two fields are initialized automatically during boot */ | ||
434 | int id; | ||
435 | const char *name; | ||
436 | |||
437 | /* link to parent, protected by cgroup_lock() */ | ||
438 | struct cgroup_root *root; | ||
439 | |||
440 | /* idr for css->id */ | ||
441 | struct idr css_idr; | ||
442 | |||
443 | /* | ||
444 | * List of cftypes. Each entry is the first entry of an array | ||
445 | * terminated by zero length name. | ||
446 | */ | ||
447 | struct list_head cfts; | ||
448 | |||
449 | /* | ||
450 | * Base cftypes which are automatically registered. The two can | ||
451 | * point to the same array. | ||
452 | */ | ||
453 | struct cftype *dfl_cftypes; /* for the default hierarchy */ | ||
454 | struct cftype *legacy_cftypes; /* for the legacy hierarchies */ | ||
455 | |||
456 | /* | ||
457 | * A subsystem may depend on other subsystems. When such subsystem | ||
458 | * is enabled on a cgroup, the depended-upon subsystems are enabled | ||
459 | * together if available. Subsystems enabled due to dependency are | ||
460 | * not visible to userland until explicitly enabled. The following | ||
461 | * specifies the mask of subsystems that this one depends on. | ||
462 | */ | ||
463 | unsigned int depends_on; | ||
464 | }; | ||
465 | |||
466 | extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; | ||
467 | |||
468 | /** | ||
469 | * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups | ||
470 | * @tsk: target task | ||
471 | * | ||
472 | * Called from threadgroup_change_begin() and allows cgroup operations to | ||
473 | * synchronize against threadgroup changes using a percpu_rw_semaphore. | ||
474 | */ | ||
475 | static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) | ||
476 | { | ||
477 | percpu_down_read(&cgroup_threadgroup_rwsem); | ||
478 | } | ||
479 | |||
480 | /** | ||
481 | * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups | ||
482 | * @tsk: target task | ||
483 | * | ||
484 | * Called from threadgroup_change_end(). Counterpart of | ||
485 | * cgroup_threadgroup_change_begin(). | ||
486 | */ | ||
487 | static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) | ||
488 | { | ||
489 | percpu_up_read(&cgroup_threadgroup_rwsem); | ||
490 | } | ||
491 | |||
492 | #else /* CONFIG_CGROUPS */ | ||
493 | |||
494 | #define CGROUP_SUBSYS_COUNT 0 | ||
495 | |||
496 | static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} | ||
497 | static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} | ||
498 | |||
499 | #endif /* CONFIG_CGROUPS */ | ||
500 | |||
501 | #endif /* _LINUX_CGROUP_DEFS_H */ | ||
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e7da0aa65b2d..a593e299162e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -11,94 +11,200 @@ | |||
11 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
12 | #include <linux/cpumask.h> | 12 | #include <linux/cpumask.h> |
13 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
14 | #include <linux/rcupdate.h> | ||
15 | #include <linux/rculist.h> | 14 | #include <linux/rculist.h> |
16 | #include <linux/cgroupstats.h> | 15 | #include <linux/cgroupstats.h> |
17 | #include <linux/rwsem.h> | 16 | #include <linux/rwsem.h> |
18 | #include <linux/idr.h> | ||
19 | #include <linux/workqueue.h> | ||
20 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
21 | #include <linux/percpu-refcount.h> | ||
22 | #include <linux/seq_file.h> | 18 | #include <linux/seq_file.h> |
23 | #include <linux/kernfs.h> | 19 | #include <linux/kernfs.h> |
24 | #include <linux/wait.h> | 20 | |
21 | #include <linux/cgroup-defs.h> | ||
25 | 22 | ||
26 | #ifdef CONFIG_CGROUPS | 23 | #ifdef CONFIG_CGROUPS |
27 | 24 | ||
28 | struct cgroup_root; | 25 | /* a css_task_iter should be treated as an opaque object */ |
29 | struct cgroup_subsys; | 26 | struct css_task_iter { |
30 | struct cgroup; | 27 | struct cgroup_subsys *ss; |
31 | 28 | ||
32 | extern int cgroup_init_early(void); | 29 | struct list_head *cset_pos; |
33 | extern int cgroup_init(void); | 30 | struct list_head *cset_head; |
34 | extern void cgroup_fork(struct task_struct *p); | 31 | |
35 | extern void cgroup_post_fork(struct task_struct *p); | 32 | struct list_head *task_pos; |
36 | extern void cgroup_exit(struct task_struct *p); | 33 | struct list_head *tasks_head; |
37 | extern int cgroupstats_build(struct cgroupstats *stats, | 34 | struct list_head *mg_tasks_head; |
38 | struct dentry *dentry); | 35 | }; |
39 | 36 | ||
40 | extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | 37 | extern struct cgroup_root cgrp_dfl_root; |
41 | struct pid *pid, struct task_struct *tsk); | 38 | extern struct css_set init_css_set; |
42 | 39 | ||
43 | /* define the enumeration of all cgroup subsystems */ | 40 | #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; |
44 | #define SUBSYS(_x) _x ## _cgrp_id, | ||
45 | enum cgroup_subsys_id { | ||
46 | #include <linux/cgroup_subsys.h> | 41 | #include <linux/cgroup_subsys.h> |
47 | CGROUP_SUBSYS_COUNT, | ||
48 | }; | ||
49 | #undef SUBSYS | 42 | #undef SUBSYS |
50 | 43 | ||
44 | bool css_has_online_children(struct cgroup_subsys_state *css); | ||
45 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); | ||
46 | struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, | ||
47 | struct cgroup_subsys *ss); | ||
48 | struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, | ||
49 | struct cgroup_subsys *ss); | ||
50 | |||
51 | bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); | ||
52 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); | ||
53 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); | ||
54 | |||
55 | int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | ||
56 | int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | ||
57 | int cgroup_rm_cftypes(struct cftype *cfts); | ||
58 | |||
59 | char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); | ||
60 | int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); | ||
61 | int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | ||
62 | struct pid *pid, struct task_struct *tsk); | ||
63 | |||
64 | void cgroup_fork(struct task_struct *p); | ||
65 | void cgroup_post_fork(struct task_struct *p); | ||
66 | void cgroup_exit(struct task_struct *p); | ||
67 | |||
68 | int cgroup_init_early(void); | ||
69 | int cgroup_init(void); | ||
70 | |||
51 | /* | 71 | /* |
52 | * Per-subsystem/per-cgroup state maintained by the system. This is the | 72 | * Iteration helpers and macros. |
53 | * fundamental structural building block that controllers deal with. | 73 | */ |
74 | |||
75 | struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, | ||
76 | struct cgroup_subsys_state *parent); | ||
77 | struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, | ||
78 | struct cgroup_subsys_state *css); | ||
79 | struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); | ||
80 | struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, | ||
81 | struct cgroup_subsys_state *css); | ||
82 | |||
83 | struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); | ||
84 | struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); | ||
85 | |||
86 | void css_task_iter_start(struct cgroup_subsys_state *css, | ||
87 | struct css_task_iter *it); | ||
88 | struct task_struct *css_task_iter_next(struct css_task_iter *it); | ||
89 | void css_task_iter_end(struct css_task_iter *it); | ||
90 | |||
91 | /** | ||
92 | * css_for_each_child - iterate through children of a css | ||
93 | * @pos: the css * to use as the loop cursor | ||
94 | * @parent: css whose children to walk | ||
54 | * | 95 | * |
55 | * Fields marked with "PI:" are public and immutable and may be accessed | 96 | * Walk @parent's children. Must be called under rcu_read_lock(). |
56 | * directly without synchronization. | 97 | * |
98 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
99 | * css which finished ->css_online() is guaranteed to be visible in the | ||
100 | * future iterations and will stay visible until the last reference is put. | ||
101 | * A css which hasn't finished ->css_online() or already finished | ||
102 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
103 | * responsibility to synchronize against on/offlining. | ||
104 | * | ||
105 | * It is allowed to temporarily drop RCU read lock during iteration. The | ||
106 | * caller is responsible for ensuring that @pos remains accessible until | ||
107 | * the start of the next iteration by, for example, bumping the css refcnt. | ||
57 | */ | 108 | */ |
58 | struct cgroup_subsys_state { | 109 | #define css_for_each_child(pos, parent) \ |
59 | /* PI: the cgroup that this css is attached to */ | 110 | for ((pos) = css_next_child(NULL, (parent)); (pos); \ |
60 | struct cgroup *cgroup; | 111 | (pos) = css_next_child((pos), (parent))) |
61 | |||
62 | /* PI: the cgroup subsystem that this css is attached to */ | ||
63 | struct cgroup_subsys *ss; | ||
64 | |||
65 | /* reference count - access via css_[try]get() and css_put() */ | ||
66 | struct percpu_ref refcnt; | ||
67 | |||
68 | /* PI: the parent css */ | ||
69 | struct cgroup_subsys_state *parent; | ||
70 | |||
71 | /* siblings list anchored at the parent's ->children */ | ||
72 | struct list_head sibling; | ||
73 | struct list_head children; | ||
74 | |||
75 | /* | ||
76 | * PI: Subsys-unique ID. 0 is unused and root is always 1. The | ||
77 | * matching css can be looked up using css_from_id(). | ||
78 | */ | ||
79 | int id; | ||
80 | |||
81 | unsigned int flags; | ||
82 | |||
83 | /* | ||
84 | * Monotonically increasing unique serial number which defines a | ||
85 | * uniform order among all csses. It's guaranteed that all | ||
86 | * ->children lists are in the ascending order of ->serial_nr and | ||
87 | * used to allow interrupting and resuming iterations. | ||
88 | */ | ||
89 | u64 serial_nr; | ||
90 | |||
91 | /* percpu_ref killing and RCU release */ | ||
92 | struct rcu_head rcu_head; | ||
93 | struct work_struct destroy_work; | ||
94 | }; | ||
95 | 112 | ||
96 | /* bits in struct cgroup_subsys_state flags field */ | 113 | /** |
97 | enum { | 114 | * css_for_each_descendant_pre - pre-order walk of a css's descendants |
98 | CSS_NO_REF = (1 << 0), /* no reference counting for this css */ | 115 | * @pos: the css * to use as the loop cursor |
99 | CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ | 116 | * @root: css whose descendants to walk |
100 | CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ | 117 | * |
101 | }; | 118 | * Walk @root's descendants. @root is included in the iteration and the |
119 | * first node to be visited. Must be called under rcu_read_lock(). | ||
120 | * | ||
121 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
122 | * css which finished ->css_online() is guaranteed to be visible in the | ||
123 | * future iterations and will stay visible until the last reference is put. | ||
124 | * A css which hasn't finished ->css_online() or already finished | ||
125 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
126 | * responsibility to synchronize against on/offlining. | ||
127 | * | ||
128 | * For example, the following guarantees that a descendant can't escape | ||
129 | * state updates of its ancestors. | ||
130 | * | ||
131 | * my_online(@css) | ||
132 | * { | ||
133 | * Lock @css's parent and @css; | ||
134 | * Inherit state from the parent; | ||
135 | * Unlock both. | ||
136 | * } | ||
137 | * | ||
138 | * my_update_state(@css) | ||
139 | * { | ||
140 | * css_for_each_descendant_pre(@pos, @css) { | ||
141 | * Lock @pos; | ||
142 | * if (@pos == @css) | ||
143 | * Update @css's state; | ||
144 | * else | ||
145 | * Verify @pos is alive and inherit state from its parent; | ||
146 | * Unlock @pos; | ||
147 | * } | ||
148 | * } | ||
149 | * | ||
150 | * As long as the inheriting step, including checking the parent state, is | ||
151 | * enclosed inside @pos locking, double-locking the parent isn't necessary | ||
152 | * while inheriting. The state update to the parent is guaranteed to be | ||
153 | * visible by walking order and, as long as inheriting operations to the | ||
154 | * same @pos are atomic to each other, multiple updates racing each other | ||
155 | * still result in the correct state. It's guaranteed that at least one | ||
156 | * inheritance happens for any css after the latest update to its parent. | ||
157 | * | ||
158 | * If checking parent's state requires locking the parent, each inheriting | ||
159 | * iteration should lock and unlock both @pos->parent and @pos. | ||
160 | * | ||
161 | * Alternatively, a subsystem may choose to use a single global lock to | ||
162 | * synchronize ->css_online() and ->css_offline() against tree-walking | ||
163 | * operations. | ||
164 | * | ||
165 | * It is allowed to temporarily drop RCU read lock during iteration. The | ||
166 | * caller is responsible for ensuring that @pos remains accessible until | ||
167 | * the start of the next iteration by, for example, bumping the css refcnt. | ||
168 | */ | ||
169 | #define css_for_each_descendant_pre(pos, css) \ | ||
170 | for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ | ||
171 | (pos) = css_next_descendant_pre((pos), (css))) | ||
172 | |||
173 | /** | ||
174 | * css_for_each_descendant_post - post-order walk of a css's descendants | ||
175 | * @pos: the css * to use as the loop cursor | ||
176 | * @css: css whose descendants to walk | ||
177 | * | ||
178 | * Similar to css_for_each_descendant_pre() but performs post-order | ||
179 | * traversal instead. @css is included in the iteration and the last | ||
180 | * node to be visited. | ||
181 | * | ||
182 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
183 | * css which finished ->css_online() is guaranteed to be visible in the | ||
184 | * future iterations and will stay visible until the last reference is put. | ||
185 | * A css which hasn't finished ->css_online() or already finished | ||
186 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
187 | * responsibility to synchronize against on/offlining. | ||
188 | * | ||
189 | * Note that the walk visibility guarantee example described in pre-order | ||
190 | * walk doesn't apply the same to post-order walks. | ||
191 | */ | ||
192 | #define css_for_each_descendant_post(pos, css) \ | ||
193 | for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ | ||
194 | (pos) = css_next_descendant_post((pos), (css))) | ||
195 | |||
196 | /** | ||
197 | * cgroup_taskset_for_each - iterate cgroup_taskset | ||
198 | * @task: the loop cursor | ||
199 | * @tset: taskset to iterate | ||
200 | */ | ||
201 | #define cgroup_taskset_for_each(task, tset) \ | ||
202 | for ((task) = cgroup_taskset_first((tset)); (task); \ | ||
203 | (task) = cgroup_taskset_next((tset))) | ||
204 | |||
205 | /* | ||
206 | * Inline functions. | ||
207 | */ | ||
102 | 208 | ||
103 | /** | 209 | /** |
104 | * css_get - obtain a reference on the specified css | 210 | * css_get - obtain a reference on the specified css |
@@ -185,532 +291,6 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) | |||
185 | percpu_ref_put_many(&css->refcnt, n); | 291 | percpu_ref_put_many(&css->refcnt, n); |
186 | } | 292 | } |
187 | 293 | ||
188 | /* bits in struct cgroup flags field */ | ||
189 | enum { | ||
190 | /* Control Group requires release notifications to userspace */ | ||
191 | CGRP_NOTIFY_ON_RELEASE, | ||
192 | /* | ||
193 | * Clone the parent's configuration when creating a new child | ||
194 | * cpuset cgroup. For historical reasons, this option can be | ||
195 | * specified at mount time and thus is implemented here. | ||
196 | */ | ||
197 | CGRP_CPUSET_CLONE_CHILDREN, | ||
198 | }; | ||
199 | |||
200 | struct cgroup { | ||
201 | /* self css with NULL ->ss, points back to this cgroup */ | ||
202 | struct cgroup_subsys_state self; | ||
203 | |||
204 | unsigned long flags; /* "unsigned long" so bitops work */ | ||
205 | |||
206 | /* | ||
207 | * idr allocated in-hierarchy ID. | ||
208 | * | ||
209 | * ID 0 is not used, the ID of the root cgroup is always 1, and a | ||
210 | * new cgroup will be assigned with a smallest available ID. | ||
211 | * | ||
212 | * Allocating/Removing ID must be protected by cgroup_mutex. | ||
213 | */ | ||
214 | int id; | ||
215 | |||
216 | /* | ||
217 | * If this cgroup contains any tasks, it contributes one to | ||
218 | * populated_cnt. All children with non-zero populated_cnt of | ||
219 | * their own contribute one. The count is zero iff there's no task | ||
220 | * in this cgroup or its subtree. | ||
221 | */ | ||
222 | int populated_cnt; | ||
223 | |||
224 | struct kernfs_node *kn; /* cgroup kernfs entry */ | ||
225 | struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ | ||
226 | |||
227 | /* | ||
228 | * The bitmask of subsystems enabled on the child cgroups. | ||
229 | * ->subtree_control is the one configured through | ||
230 | * "cgroup.subtree_control" while ->child_subsys_mask is the | ||
231 | * effective one which may have more subsystems enabled. | ||
232 | * Controller knobs are made available iff it's enabled in | ||
233 | * ->subtree_control. | ||
234 | */ | ||
235 | unsigned int subtree_control; | ||
236 | unsigned int child_subsys_mask; | ||
237 | |||
238 | /* Private pointers for each registered subsystem */ | ||
239 | struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; | ||
240 | |||
241 | struct cgroup_root *root; | ||
242 | |||
243 | /* | ||
244 | * List of cgrp_cset_links pointing at css_sets with tasks in this | ||
245 | * cgroup. Protected by css_set_lock. | ||
246 | */ | ||
247 | struct list_head cset_links; | ||
248 | |||
249 | /* | ||
250 | * On the default hierarchy, a css_set for a cgroup with some | ||
251 | * subsys disabled will point to css's which are associated with | ||
252 | * the closest ancestor which has the subsys enabled. The | ||
253 | * following lists all css_sets which point to this cgroup's css | ||
254 | * for the given subsystem. | ||
255 | */ | ||
256 | struct list_head e_csets[CGROUP_SUBSYS_COUNT]; | ||
257 | |||
258 | /* | ||
259 | * list of pidlists, up to two for each namespace (one for procs, one | ||
260 | * for tasks); created on demand. | ||
261 | */ | ||
262 | struct list_head pidlists; | ||
263 | struct mutex pidlist_mutex; | ||
264 | |||
265 | /* used to wait for offlining of csses */ | ||
266 | wait_queue_head_t offline_waitq; | ||
267 | |||
268 | /* used to schedule release agent */ | ||
269 | struct work_struct release_agent_work; | ||
270 | }; | ||
271 | |||
272 | #define MAX_CGROUP_ROOT_NAMELEN 64 | ||
273 | |||
274 | /* cgroup_root->flags */ | ||
275 | enum { | ||
276 | CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ | ||
277 | CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ | ||
278 | CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ | ||
279 | }; | ||
280 | |||
281 | /* | ||
282 | * A cgroup_root represents the root of a cgroup hierarchy, and may be | ||
283 | * associated with a kernfs_root to form an active hierarchy. This is | ||
284 | * internal to cgroup core. Don't access directly from controllers. | ||
285 | */ | ||
286 | struct cgroup_root { | ||
287 | struct kernfs_root *kf_root; | ||
288 | |||
289 | /* The bitmask of subsystems attached to this hierarchy */ | ||
290 | unsigned int subsys_mask; | ||
291 | |||
292 | /* Unique id for this hierarchy. */ | ||
293 | int hierarchy_id; | ||
294 | |||
295 | /* The root cgroup. Root is destroyed on its release. */ | ||
296 | struct cgroup cgrp; | ||
297 | |||
298 | /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ | ||
299 | atomic_t nr_cgrps; | ||
300 | |||
301 | /* A list running through the active hierarchies */ | ||
302 | struct list_head root_list; | ||
303 | |||
304 | /* Hierarchy-specific flags */ | ||
305 | unsigned int flags; | ||
306 | |||
307 | /* IDs for cgroups in this hierarchy */ | ||
308 | struct idr cgroup_idr; | ||
309 | |||
310 | /* The path to use for release notifications. */ | ||
311 | char release_agent_path[PATH_MAX]; | ||
312 | |||
313 | /* The name for this hierarchy - may be empty */ | ||
314 | char name[MAX_CGROUP_ROOT_NAMELEN]; | ||
315 | }; | ||
316 | |||
317 | /* | ||
318 | * A css_set is a structure holding pointers to a set of | ||
319 | * cgroup_subsys_state objects. This saves space in the task struct | ||
320 | * object and speeds up fork()/exit(), since a single inc/dec and a | ||
321 | * list_add()/del() can bump the reference count on the entire cgroup | ||
322 | * set for a task. | ||
323 | */ | ||
324 | |||
325 | struct css_set { | ||
326 | |||
327 | /* Reference count */ | ||
328 | atomic_t refcount; | ||
329 | |||
330 | /* | ||
331 | * List running through all cgroup groups in the same hash | ||
332 | * slot. Protected by css_set_lock | ||
333 | */ | ||
334 | struct hlist_node hlist; | ||
335 | |||
336 | /* | ||
337 | * Lists running through all tasks using this cgroup group. | ||
338 | * mg_tasks lists tasks which belong to this cset but are in the | ||
339 | * process of being migrated out or in. Protected by | ||
340 | * css_set_rwsem, but, during migration, once tasks are moved to | ||
341 | * mg_tasks, it can be read safely while holding cgroup_mutex. | ||
342 | */ | ||
343 | struct list_head tasks; | ||
344 | struct list_head mg_tasks; | ||
345 | |||
346 | /* | ||
347 | * List of cgrp_cset_links pointing at cgroups referenced from this | ||
348 | * css_set. Protected by css_set_lock. | ||
349 | */ | ||
350 | struct list_head cgrp_links; | ||
351 | |||
352 | /* the default cgroup associated with this css_set */ | ||
353 | struct cgroup *dfl_cgrp; | ||
354 | |||
355 | /* | ||
356 | * Set of subsystem states, one for each subsystem. This array is | ||
357 | * immutable after creation apart from the init_css_set during | ||
358 | * subsystem registration (at boot time). | ||
359 | */ | ||
360 | struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; | ||
361 | |||
362 | /* | ||
363 | * List of csets participating in the on-going migration either as | ||
364 | * source or destination. Protected by cgroup_mutex. | ||
365 | */ | ||
366 | struct list_head mg_preload_node; | ||
367 | struct list_head mg_node; | ||
368 | |||
369 | /* | ||
370 | * If this cset is acting as the source of migration the following | ||
371 | * two fields are set. mg_src_cgrp is the source cgroup of the | ||
372 | * on-going migration and mg_dst_cset is the destination cset the | ||
373 | * target tasks on this cset should be migrated to. Protected by | ||
374 | * cgroup_mutex. | ||
375 | */ | ||
376 | struct cgroup *mg_src_cgrp; | ||
377 | struct css_set *mg_dst_cset; | ||
378 | |||
379 | /* | ||
380 | * On the default hierarchy, ->subsys[ssid] may point to a css | ||
381 | * attached to an ancestor instead of the cgroup this css_set is | ||
382 | * associated with. The following node is anchored at | ||
383 | * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to | ||
384 | * iterate through all css's attached to a given cgroup. | ||
385 | */ | ||
386 | struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; | ||
387 | |||
388 | /* For RCU-protected deletion */ | ||
389 | struct rcu_head rcu_head; | ||
390 | }; | ||
391 | |||
392 | /* | ||
393 | * struct cftype: handler definitions for cgroup control files | ||
394 | * | ||
395 | * When reading/writing to a file: | ||
396 | * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata | ||
397 | * - the 'cftype' of the file is file->f_path.dentry->d_fsdata | ||
398 | */ | ||
399 | |||
400 | /* cftype->flags */ | ||
401 | enum { | ||
402 | CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ | ||
403 | CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ | ||
404 | CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ | ||
405 | |||
406 | /* internal flags, do not use outside cgroup core proper */ | ||
407 | __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ | ||
408 | __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ | ||
409 | }; | ||
410 | |||
411 | #define MAX_CFTYPE_NAME 64 | ||
412 | |||
413 | struct cftype { | ||
414 | /* | ||
415 | * By convention, the name should begin with the name of the | ||
416 | * subsystem, followed by a period. Zero length string indicates | ||
417 | * end of cftype array. | ||
418 | */ | ||
419 | char name[MAX_CFTYPE_NAME]; | ||
420 | int private; | ||
421 | /* | ||
422 | * If not 0, file mode is set to this value, otherwise it will | ||
423 | * be figured out automatically | ||
424 | */ | ||
425 | umode_t mode; | ||
426 | |||
427 | /* | ||
428 | * The maximum length of string, excluding trailing nul, that can | ||
429 | * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. | ||
430 | */ | ||
431 | size_t max_write_len; | ||
432 | |||
433 | /* CFTYPE_* flags */ | ||
434 | unsigned int flags; | ||
435 | |||
436 | /* | ||
437 | * Fields used for internal bookkeeping. Initialized automatically | ||
438 | * during registration. | ||
439 | */ | ||
440 | struct cgroup_subsys *ss; /* NULL for cgroup core files */ | ||
441 | struct list_head node; /* anchored at ss->cfts */ | ||
442 | struct kernfs_ops *kf_ops; | ||
443 | |||
444 | /* | ||
445 | * read_u64() is a shortcut for the common case of returning a | ||
446 | * single integer. Use it in place of read() | ||
447 | */ | ||
448 | u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); | ||
449 | /* | ||
450 | * read_s64() is a signed version of read_u64() | ||
451 | */ | ||
452 | s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); | ||
453 | |||
454 | /* generic seq_file read interface */ | ||
455 | int (*seq_show)(struct seq_file *sf, void *v); | ||
456 | |||
457 | /* optional ops, implement all or none */ | ||
458 | void *(*seq_start)(struct seq_file *sf, loff_t *ppos); | ||
459 | void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); | ||
460 | void (*seq_stop)(struct seq_file *sf, void *v); | ||
461 | |||
462 | /* | ||
463 | * write_u64() is a shortcut for the common case of accepting | ||
464 | * a single integer (as parsed by simple_strtoull) from | ||
465 | * userspace. Use in place of write(); return 0 or error. | ||
466 | */ | ||
467 | int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, | ||
468 | u64 val); | ||
469 | /* | ||
470 | * write_s64() is a signed version of write_u64() | ||
471 | */ | ||
472 | int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, | ||
473 | s64 val); | ||
474 | |||
475 | /* | ||
476 | * write() is the generic write callback which maps directly to | ||
477 | * kernfs write operation and overrides all other operations. | ||
478 | * Maximum write size is determined by ->max_write_len. Use | ||
479 | * of_css/cft() to access the associated css and cft. | ||
480 | */ | ||
481 | ssize_t (*write)(struct kernfs_open_file *of, | ||
482 | char *buf, size_t nbytes, loff_t off); | ||
483 | |||
484 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
485 | struct lock_class_key lockdep_key; | ||
486 | #endif | ||
487 | }; | ||
488 | |||
489 | extern struct cgroup_root cgrp_dfl_root; | ||
490 | extern struct css_set init_css_set; | ||
491 | |||
492 | /** | ||
493 | * cgroup_on_dfl - test whether a cgroup is on the default hierarchy | ||
494 | * @cgrp: the cgroup of interest | ||
495 | * | ||
496 | * The default hierarchy is the v2 interface of cgroup and this function | ||
497 | * can be used to test whether a cgroup is on the default hierarchy for | ||
498 | * cases where a subsystem should behave differently depending on the | ||
499 | * interface version. | ||
500 | * | ||
501 | * The set of behaviors which change on the default hierarchy are still | ||
502 | * being determined and the mount option is prefixed with __DEVEL__. | ||
503 | * | ||
504 | * List of changed behaviors: | ||
505 | * | ||
506 | * - Mount options "noprefix", "xattr", "clone_children", "release_agent" | ||
507 | * and "name" are disallowed. | ||
508 | * | ||
509 | * - When mounting an existing superblock, mount options should match. | ||
510 | * | ||
511 | * - Remount is disallowed. | ||
512 | * | ||
513 | * - rename(2) is disallowed. | ||
514 | * | ||
515 | * - "tasks" is removed. Everything should be at process granularity. Use | ||
516 | * "cgroup.procs" instead. | ||
517 | * | ||
518 | * - "cgroup.procs" is not sorted. pids will be unique unless they got | ||
519 | * recycled inbetween reads. | ||
520 | * | ||
521 | * - "release_agent" and "notify_on_release" are removed. Replacement | ||
522 | * notification mechanism will be implemented. | ||
523 | * | ||
524 | * - "cgroup.clone_children" is removed. | ||
525 | * | ||
526 | * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup | ||
527 | * and its descendants contain no task; otherwise, 1. The file also | ||
528 | * generates kernfs notification which can be monitored through poll and | ||
529 | * [di]notify when the value of the file changes. | ||
530 | * | ||
531 | * - cpuset: tasks will be kept in empty cpusets when hotplug happens and | ||
532 | * take masks of ancestors with non-empty cpus/mems, instead of being | ||
533 | * moved to an ancestor. | ||
534 | * | ||
535 | * - cpuset: a task can be moved into an empty cpuset, and again it takes | ||
536 | * masks of ancestors. | ||
537 | * | ||
538 | * - memcg: use_hierarchy is on by default and the cgroup file for the flag | ||
539 | * is not created. | ||
540 | * | ||
541 | * - blkcg: blk-throttle becomes properly hierarchical. | ||
542 | * | ||
543 | * - debug: disallowed on the default hierarchy. | ||
544 | */ | ||
545 | static inline bool cgroup_on_dfl(const struct cgroup *cgrp) | ||
546 | { | ||
547 | return cgrp->root == &cgrp_dfl_root; | ||
548 | } | ||
549 | |||
550 | /* no synchronization, the result can only be used as a hint */ | ||
551 | static inline bool cgroup_has_tasks(struct cgroup *cgrp) | ||
552 | { | ||
553 | return !list_empty(&cgrp->cset_links); | ||
554 | } | ||
555 | |||
556 | /* returns ino associated with a cgroup */ | ||
557 | static inline ino_t cgroup_ino(struct cgroup *cgrp) | ||
558 | { | ||
559 | return cgrp->kn->ino; | ||
560 | } | ||
561 | |||
562 | /* cft/css accessors for cftype->write() operation */ | ||
563 | static inline struct cftype *of_cft(struct kernfs_open_file *of) | ||
564 | { | ||
565 | return of->kn->priv; | ||
566 | } | ||
567 | |||
568 | struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); | ||
569 | |||
570 | /* cft/css accessors for cftype->seq_*() operations */ | ||
571 | static inline struct cftype *seq_cft(struct seq_file *seq) | ||
572 | { | ||
573 | return of_cft(seq->private); | ||
574 | } | ||
575 | |||
576 | static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) | ||
577 | { | ||
578 | return of_css(seq->private); | ||
579 | } | ||
580 | |||
581 | /* | ||
582 | * Name / path handling functions. All are thin wrappers around the kernfs | ||
583 | * counterparts and can be called under any context. | ||
584 | */ | ||
585 | |||
586 | static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) | ||
587 | { | ||
588 | return kernfs_name(cgrp->kn, buf, buflen); | ||
589 | } | ||
590 | |||
591 | static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, | ||
592 | size_t buflen) | ||
593 | { | ||
594 | return kernfs_path(cgrp->kn, buf, buflen); | ||
595 | } | ||
596 | |||
597 | static inline void pr_cont_cgroup_name(struct cgroup *cgrp) | ||
598 | { | ||
599 | pr_cont_kernfs_name(cgrp->kn); | ||
600 | } | ||
601 | |||
602 | static inline void pr_cont_cgroup_path(struct cgroup *cgrp) | ||
603 | { | ||
604 | pr_cont_kernfs_path(cgrp->kn); | ||
605 | } | ||
606 | |||
607 | char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); | ||
608 | |||
609 | int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | ||
610 | int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | ||
611 | int cgroup_rm_cftypes(struct cftype *cfts); | ||
612 | |||
613 | bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); | ||
614 | |||
615 | /* | ||
616 | * Control Group taskset, used to pass around set of tasks to cgroup_subsys | ||
617 | * methods. | ||
618 | */ | ||
619 | struct cgroup_taskset; | ||
620 | struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); | ||
621 | struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); | ||
622 | |||
623 | /** | ||
624 | * cgroup_taskset_for_each - iterate cgroup_taskset | ||
625 | * @task: the loop cursor | ||
626 | * @tset: taskset to iterate | ||
627 | */ | ||
628 | #define cgroup_taskset_for_each(task, tset) \ | ||
629 | for ((task) = cgroup_taskset_first((tset)); (task); \ | ||
630 | (task) = cgroup_taskset_next((tset))) | ||
631 | |||
632 | /* | ||
633 | * Control Group subsystem type. | ||
634 | * See Documentation/cgroups/cgroups.txt for details | ||
635 | */ | ||
636 | |||
637 | struct cgroup_subsys { | ||
638 | struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); | ||
639 | int (*css_online)(struct cgroup_subsys_state *css); | ||
640 | void (*css_offline)(struct cgroup_subsys_state *css); | ||
641 | void (*css_released)(struct cgroup_subsys_state *css); | ||
642 | void (*css_free)(struct cgroup_subsys_state *css); | ||
643 | void (*css_reset)(struct cgroup_subsys_state *css); | ||
644 | void (*css_e_css_changed)(struct cgroup_subsys_state *css); | ||
645 | |||
646 | int (*can_attach)(struct cgroup_subsys_state *css, | ||
647 | struct cgroup_taskset *tset); | ||
648 | void (*cancel_attach)(struct cgroup_subsys_state *css, | ||
649 | struct cgroup_taskset *tset); | ||
650 | void (*attach)(struct cgroup_subsys_state *css, | ||
651 | struct cgroup_taskset *tset); | ||
652 | void (*fork)(struct task_struct *task); | ||
653 | void (*exit)(struct cgroup_subsys_state *css, | ||
654 | struct cgroup_subsys_state *old_css, | ||
655 | struct task_struct *task); | ||
656 | void (*bind)(struct cgroup_subsys_state *root_css); | ||
657 | |||
658 | int disabled; | ||
659 | int early_init; | ||
660 | |||
661 | /* | ||
662 | * If %false, this subsystem is properly hierarchical - | ||
663 | * configuration, resource accounting and restriction on a parent | ||
664 | * cgroup cover those of its children. If %true, hierarchy support | ||
665 | * is broken in some ways - some subsystems ignore hierarchy | ||
666 | * completely while others are only implemented half-way. | ||
667 | * | ||
668 | * It's now disallowed to create nested cgroups if the subsystem is | ||
669 | * broken and cgroup core will emit a warning message on such | ||
670 | * cases. Eventually, all subsystems will be made properly | ||
671 | * hierarchical and this will go away. | ||
672 | */ | ||
673 | bool broken_hierarchy; | ||
674 | bool warned_broken_hierarchy; | ||
675 | |||
676 | /* the following two fields are initialized automatically during boot */ | ||
677 | int id; | ||
678 | #define MAX_CGROUP_TYPE_NAMELEN 32 | ||
679 | const char *name; | ||
680 | |||
681 | /* link to parent, protected by cgroup_lock() */ | ||
682 | struct cgroup_root *root; | ||
683 | |||
684 | /* idr for css->id */ | ||
685 | struct idr css_idr; | ||
686 | |||
687 | /* | ||
688 | * List of cftypes. Each entry is the first entry of an array | ||
689 | * terminated by zero length name. | ||
690 | */ | ||
691 | struct list_head cfts; | ||
692 | |||
693 | /* | ||
694 | * Base cftypes which are automatically registered. The two can | ||
695 | * point to the same array. | ||
696 | */ | ||
697 | struct cftype *dfl_cftypes; /* for the default hierarchy */ | ||
698 | struct cftype *legacy_cftypes; /* for the legacy hierarchies */ | ||
699 | |||
700 | /* | ||
701 | * A subsystem may depend on other subsystems. When such subsystem | ||
702 | * is enabled on a cgroup, the depended-upon subsystems are enabled | ||
703 | * together if available. Subsystems enabled due to dependency are | ||
704 | * not visible to userland until explicitly enabled. The following | ||
705 | * specifies the mask of subsystems that this one depends on. | ||
706 | */ | ||
707 | unsigned int depends_on; | ||
708 | }; | ||
709 | |||
710 | #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; | ||
711 | #include <linux/cgroup_subsys.h> | ||
712 | #undef SUBSYS | ||
713 | |||
714 | /** | 294 | /** |
715 | * task_css_set_check - obtain a task's css_set with extra access conditions | 295 | * task_css_set_check - obtain a task's css_set with extra access conditions |
716 | * @task: the task to obtain css_set for | 296 | * @task: the task to obtain css_set for |
@@ -818,178 +398,137 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, | |||
818 | return task_css(task, subsys_id)->cgroup; | 398 | return task_css(task, subsys_id)->cgroup; |
819 | } | 399 | } |
820 | 400 | ||
821 | struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, | ||
822 | struct cgroup_subsys_state *parent); | ||
823 | |||
824 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); | ||
825 | |||
826 | /** | 401 | /** |
827 | * css_for_each_child - iterate through children of a css | 402 | * cgroup_on_dfl - test whether a cgroup is on the default hierarchy |
828 | * @pos: the css * to use as the loop cursor | 403 | * @cgrp: the cgroup of interest |
829 | * @parent: css whose children to walk | ||
830 | * | 404 | * |
831 | * Walk @parent's children. Must be called under rcu_read_lock(). | 405 | * The default hierarchy is the v2 interface of cgroup and this function |
406 | * can be used to test whether a cgroup is on the default hierarchy for | ||
407 | * cases where a subsystem should behave differently depending on the | ||
408 | * interface version. | ||
832 | * | 409 | * |
833 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | 410 | * The set of behaviors which change on the default hierarchy are still |
834 | * css which finished ->css_online() is guaranteed to be visible in the | 411 | * being determined and the mount option is prefixed with __DEVEL__. |
835 | * future iterations and will stay visible until the last reference is put. | ||
836 | * A css which hasn't finished ->css_online() or already finished | ||
837 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
838 | * responsibility to synchronize against on/offlining. | ||
839 | * | 412 | * |
840 | * It is allowed to temporarily drop RCU read lock during iteration. The | 413 | * List of changed behaviors: |
841 | * caller is responsible for ensuring that @pos remains accessible until | ||
842 | * the start of the next iteration by, for example, bumping the css refcnt. | ||
843 | */ | ||
844 | #define css_for_each_child(pos, parent) \ | ||
845 | for ((pos) = css_next_child(NULL, (parent)); (pos); \ | ||
846 | (pos) = css_next_child((pos), (parent))) | ||
847 | |||
848 | struct cgroup_subsys_state * | ||
849 | css_next_descendant_pre(struct cgroup_subsys_state *pos, | ||
850 | struct cgroup_subsys_state *css); | ||
851 | |||
852 | struct cgroup_subsys_state * | ||
853 | css_rightmost_descendant(struct cgroup_subsys_state *pos); | ||
854 | |||
855 | /** | ||
856 | * css_for_each_descendant_pre - pre-order walk of a css's descendants | ||
857 | * @pos: the css * to use as the loop cursor | ||
858 | * @root: css whose descendants to walk | ||
859 | * | 414 | * |
860 | * Walk @root's descendants. @root is included in the iteration and the | 415 | * - Mount options "noprefix", "xattr", "clone_children", "release_agent" |
861 | * first node to be visited. Must be called under rcu_read_lock(). | 416 | * and "name" are disallowed. |
862 | * | 417 | * |
863 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | 418 | * - When mounting an existing superblock, mount options should match. |
864 | * css which finished ->css_online() is guaranteed to be visible in the | ||
865 | * future iterations and will stay visible until the last reference is put. | ||
866 | * A css which hasn't finished ->css_online() or already finished | ||
867 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
868 | * responsibility to synchronize against on/offlining. | ||
869 | * | 419 | * |
870 | * For example, the following guarantees that a descendant can't escape | 420 | * - Remount is disallowed. |
871 | * state updates of its ancestors. | ||
872 | * | 421 | * |
873 | * my_online(@css) | 422 | * - rename(2) is disallowed. |
874 | * { | ||
875 | * Lock @css's parent and @css; | ||
876 | * Inherit state from the parent; | ||
877 | * Unlock both. | ||
878 | * } | ||
879 | * | 423 | * |
880 | * my_update_state(@css) | 424 | * - "tasks" is removed. Everything should be at process granularity. Use |
881 | * { | 425 | * "cgroup.procs" instead. |
882 | * css_for_each_descendant_pre(@pos, @css) { | ||
883 | * Lock @pos; | ||
884 | * if (@pos == @css) | ||
885 | * Update @css's state; | ||
886 | * else | ||
887 | * Verify @pos is alive and inherit state from its parent; | ||
888 | * Unlock @pos; | ||
889 | * } | ||
890 | * } | ||
891 | * | 426 | * |
892 | * As long as the inheriting step, including checking the parent state, is | 427 | * - "cgroup.procs" is not sorted. pids will be unique unless they got |
893 | * enclosed inside @pos locking, double-locking the parent isn't necessary | 428 | * recycled in between reads. |
894 | * while inheriting. The state update to the parent is guaranteed to be | ||
895 | * visible by walking order and, as long as inheriting operations to the | ||
896 | * same @pos are atomic to each other, multiple updates racing each other | ||
897 | * still result in the correct state. It's guaranteed that at least one | ||
898 | * inheritance happens for any css after the latest update to its parent. | ||
899 | * | 429 | * |
900 | * If checking parent's state requires locking the parent, each inheriting | 430 | * - "release_agent" and "notify_on_release" are removed. Replacement |
901 | * iteration should lock and unlock both @pos->parent and @pos. | 431 | * notification mechanism will be implemented. |
902 | * | 432 | * |
903 | * Alternatively, a subsystem may choose to use a single global lock to | 433 | * - "cgroup.clone_children" is removed. |
904 | * synchronize ->css_online() and ->css_offline() against tree-walking | ||
905 | * operations. | ||
906 | * | 434 | * |
907 | * It is allowed to temporarily drop RCU read lock during iteration. The | 435 | * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup |
908 | * caller is responsible for ensuring that @pos remains accessible until | 436 | * and its descendants contain no task; otherwise, 1. The file also |
909 | * the start of the next iteration by, for example, bumping the css refcnt. | 437 | * generates kernfs notification which can be monitored through poll and |
910 | */ | 438 | * [di]notify when the value of the file changes. |
911 | #define css_for_each_descendant_pre(pos, css) \ | ||
912 | for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ | ||
913 | (pos) = css_next_descendant_pre((pos), (css))) | ||
914 | |||
915 | struct cgroup_subsys_state * | ||
916 | css_next_descendant_post(struct cgroup_subsys_state *pos, | ||
917 | struct cgroup_subsys_state *css); | ||
918 | |||
919 | /** | ||
920 | * css_for_each_descendant_post - post-order walk of a css's descendants | ||
921 | * @pos: the css * to use as the loop cursor | ||
922 | * @css: css whose descendants to walk | ||
923 | * | 439 | * |
924 | * Similar to css_for_each_descendant_pre() but performs post-order | 440 | * - cpuset: tasks will be kept in empty cpusets when hotplug happens and |
925 | * traversal instead. @css is included in the iteration and the last | 441 | * take masks of ancestors with non-empty cpus/mems, instead of being |
926 | * node to be visited. | 442 | * moved to an ancestor. |
927 | * | 443 | * |
928 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | 444 | * - cpuset: a task can be moved into an empty cpuset, and again it takes |
929 | * css which finished ->css_online() is guaranteed to be visible in the | 445 | * masks of ancestors. |
930 | * future iterations and will stay visible until the last reference is put. | ||
931 | * A css which hasn't finished ->css_online() or already finished | ||
932 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
933 | * responsibility to synchronize against on/offlining. | ||
934 | * | 446 | * |
935 | * Note that the walk visibility guarantee example described in pre-order | 447 | * - memcg: use_hierarchy is on by default and the cgroup file for the flag |
936 | * walk doesn't apply the same to post-order walks. | 448 | * is not created. |
449 | * | ||
450 | * - blkcg: blk-throttle becomes properly hierarchical. | ||
451 | * | ||
452 | * - debug: disallowed on the default hierarchy. | ||
937 | */ | 453 | */ |
938 | #define css_for_each_descendant_post(pos, css) \ | 454 | static inline bool cgroup_on_dfl(const struct cgroup *cgrp) |
939 | for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ | 455 | { |
940 | (pos) = css_next_descendant_post((pos), (css))) | 456 | return cgrp->root == &cgrp_dfl_root; |
457 | } | ||
941 | 458 | ||
942 | bool css_has_online_children(struct cgroup_subsys_state *css); | 459 | /* no synchronization, the result can only be used as a hint */ |
460 | static inline bool cgroup_has_tasks(struct cgroup *cgrp) | ||
461 | { | ||
462 | return !list_empty(&cgrp->cset_links); | ||
463 | } | ||
943 | 464 | ||
944 | /* A css_task_iter should be treated as an opaque object */ | 465 | /* returns ino associated with a cgroup */ |
945 | struct css_task_iter { | 466 | static inline ino_t cgroup_ino(struct cgroup *cgrp) |
946 | struct cgroup_subsys *ss; | 467 | { |
468 | return cgrp->kn->ino; | ||
469 | } | ||
947 | 470 | ||
948 | struct list_head *cset_pos; | 471 | /* cft/css accessors for cftype->write() operation */ |
949 | struct list_head *cset_head; | 472 | static inline struct cftype *of_cft(struct kernfs_open_file *of) |
473 | { | ||
474 | return of->kn->priv; | ||
475 | } | ||
950 | 476 | ||
951 | struct list_head *task_pos; | 477 | struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); |
952 | struct list_head *tasks_head; | ||
953 | struct list_head *mg_tasks_head; | ||
954 | }; | ||
955 | 478 | ||
956 | void css_task_iter_start(struct cgroup_subsys_state *css, | 479 | /* cft/css accessors for cftype->seq_*() operations */ |
957 | struct css_task_iter *it); | 480 | static inline struct cftype *seq_cft(struct seq_file *seq) |
958 | struct task_struct *css_task_iter_next(struct css_task_iter *it); | 481 | { |
959 | void css_task_iter_end(struct css_task_iter *it); | 482 | return of_cft(seq->private); |
483 | } | ||
960 | 484 | ||
961 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); | 485 | static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) |
962 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); | 486 | { |
487 | return of_css(seq->private); | ||
488 | } | ||
963 | 489 | ||
964 | struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, | 490 | /* |
965 | struct cgroup_subsys *ss); | 491 | * Name / path handling functions. All are thin wrappers around the kernfs |
966 | struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, | 492 | * counterparts and can be called under any context. |
967 | struct cgroup_subsys *ss); | 493 | */ |
968 | 494 | ||
969 | #else /* !CONFIG_CGROUPS */ | 495 | static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) |
496 | { | ||
497 | return kernfs_name(cgrp->kn, buf, buflen); | ||
498 | } | ||
970 | 499 | ||
971 | struct cgroup_subsys_state; | 500 | static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, |
501 | size_t buflen) | ||
502 | { | ||
503 | return kernfs_path(cgrp->kn, buf, buflen); | ||
504 | } | ||
972 | 505 | ||
973 | static inline int cgroup_init_early(void) { return 0; } | 506 | static inline void pr_cont_cgroup_name(struct cgroup *cgrp) |
974 | static inline int cgroup_init(void) { return 0; } | 507 | { |
975 | static inline void cgroup_fork(struct task_struct *p) {} | 508 | pr_cont_kernfs_name(cgrp->kn); |
976 | static inline void cgroup_post_fork(struct task_struct *p) {} | 509 | } |
977 | static inline void cgroup_exit(struct task_struct *p) {} | ||
978 | 510 | ||
979 | static inline int cgroupstats_build(struct cgroupstats *stats, | 511 | static inline void pr_cont_cgroup_path(struct cgroup *cgrp) |
980 | struct dentry *dentry) | ||
981 | { | 512 | { |
982 | return -EINVAL; | 513 | pr_cont_kernfs_path(cgrp->kn); |
983 | } | 514 | } |
984 | 515 | ||
985 | static inline void css_put(struct cgroup_subsys_state *css) {} | 516 | #else /* !CONFIG_CGROUPS */ |
986 | 517 | ||
987 | /* No cgroups - nothing to do */ | 518 | struct cgroup_subsys_state; |
519 | |||
520 | static inline void css_put(struct cgroup_subsys_state *css) {} | ||
988 | static inline int cgroup_attach_task_all(struct task_struct *from, | 521 | static inline int cgroup_attach_task_all(struct task_struct *from, |
989 | struct task_struct *t) | 522 | struct task_struct *t) { return 0; } |
990 | { | 523 | static inline int cgroupstats_build(struct cgroupstats *stats, |
991 | return 0; | 524 | struct dentry *dentry) { return -EINVAL; } |
992 | } | 525 | |
526 | static inline void cgroup_fork(struct task_struct *p) {} | ||
527 | static inline void cgroup_post_fork(struct task_struct *p) {} | ||
528 | static inline void cgroup_exit(struct task_struct *p) {} | ||
529 | |||
530 | static inline int cgroup_init_early(void) { return 0; } | ||
531 | static inline int cgroup_init(void) { return 0; } | ||
993 | 532 | ||
994 | #endif /* !CONFIG_CGROUPS */ | 533 | #endif /* !CONFIG_CGROUPS */ |
995 | 534 | ||
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index bb9b075f0eb0..e8493fee8160 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h | |||
@@ -25,13 +25,6 @@ | |||
25 | extern struct files_struct init_files; | 25 | extern struct files_struct init_files; |
26 | extern struct fs_struct init_fs; | 26 | extern struct fs_struct init_fs; |
27 | 27 | ||
28 | #ifdef CONFIG_CGROUPS | ||
29 | #define INIT_GROUP_RWSEM(sig) \ | ||
30 | .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem), | ||
31 | #else | ||
32 | #define INIT_GROUP_RWSEM(sig) | ||
33 | #endif | ||
34 | |||
35 | #ifdef CONFIG_CPUSETS | 28 | #ifdef CONFIG_CPUSETS |
36 | #define INIT_CPUSET_SEQ(tsk) \ | 29 | #define INIT_CPUSET_SEQ(tsk) \ |
37 | .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), | 30 | .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), |
@@ -55,7 +48,6 @@ extern struct fs_struct init_fs; | |||
55 | }, \ | 48 | }, \ |
56 | .cred_guard_mutex = \ | 49 | .cred_guard_mutex = \ |
57 | __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ | 50 | __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ |
58 | INIT_GROUP_RWSEM(sig) \ | ||
59 | } | 51 | } |
60 | 52 | ||
61 | extern struct nsproxy init_nsproxy; | 53 | extern struct nsproxy init_nsproxy; |
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 71ecdab1671b..e6b2f7db9c0c 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h | |||
@@ -277,6 +277,7 @@ void kernfs_put(struct kernfs_node *kn); | |||
277 | 277 | ||
278 | struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); | 278 | struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); |
279 | struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); | 279 | struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); |
280 | struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); | ||
280 | 281 | ||
281 | struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, | 282 | struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, |
282 | unsigned int flags, void *priv); | 283 | unsigned int flags, void *priv); |
@@ -352,6 +353,10 @@ static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) | |||
352 | static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) | 353 | static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) |
353 | { return NULL; } | 354 | { return NULL; } |
354 | 355 | ||
356 | static inline struct inode * | ||
357 | kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) | ||
358 | { return NULL; } | ||
359 | |||
355 | static inline struct kernfs_root * | 360 | static inline struct kernfs_root * |
356 | kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, | 361 | kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, |
357 | void *priv) | 362 | void *priv) |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 93ed0b682adb..a09ece354c64 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -58,6 +58,7 @@ struct sched_param { | |||
58 | #include <linux/uidgid.h> | 58 | #include <linux/uidgid.h> |
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/magic.h> | 60 | #include <linux/magic.h> |
61 | #include <linux/cgroup-defs.h> | ||
61 | 62 | ||
62 | #include <asm/processor.h> | 63 | #include <asm/processor.h> |
63 | 64 | ||
@@ -755,18 +756,6 @@ struct signal_struct { | |||
755 | unsigned audit_tty_log_passwd; | 756 | unsigned audit_tty_log_passwd; |
756 | struct tty_audit_buf *tty_audit_buf; | 757 | struct tty_audit_buf *tty_audit_buf; |
757 | #endif | 758 | #endif |
758 | #ifdef CONFIG_CGROUPS | ||
759 | /* | ||
760 | * group_rwsem prevents new tasks from entering the threadgroup and | ||
761 | * member tasks from exiting, more specifically, setting of | ||
762 | * PF_EXITING. fork and exit paths are protected with this rwsem | ||
763 | * using threadgroup_change_begin/end(). Users which require | ||
764 | * threadgroup to remain stable should use threadgroup_[un]lock() | ||
765 | * which also takes care of exec path. Currently, cgroup is the | ||
766 | * only user. | ||
767 | */ | ||
768 | struct rw_semaphore group_rwsem; | ||
769 | #endif | ||
770 | 759 | ||
771 | oom_flags_t oom_flags; | 760 | oom_flags_t oom_flags; |
772 | short oom_score_adj; /* OOM kill score adjustment */ | 761 | short oom_score_adj; /* OOM kill score adjustment */ |
@@ -2725,53 +2714,33 @@ static inline void unlock_task_sighand(struct task_struct *tsk, | |||
2725 | spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); | 2714 | spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); |
2726 | } | 2715 | } |
2727 | 2716 | ||
2728 | #ifdef CONFIG_CGROUPS | ||
2729 | static inline void threadgroup_change_begin(struct task_struct *tsk) | ||
2730 | { | ||
2731 | down_read(&tsk->signal->group_rwsem); | ||
2732 | } | ||
2733 | static inline void threadgroup_change_end(struct task_struct *tsk) | ||
2734 | { | ||
2735 | up_read(&tsk->signal->group_rwsem); | ||
2736 | } | ||
2737 | |||
2738 | /** | 2717 | /** |
2739 | * threadgroup_lock - lock threadgroup | 2718 | * threadgroup_change_begin - mark the beginning of changes to a threadgroup |
2740 | * @tsk: member task of the threadgroup to lock | 2719 | * @tsk: task causing the changes |
2741 | * | ||
2742 | * Lock the threadgroup @tsk belongs to. No new task is allowed to enter | ||
2743 | * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or | ||
2744 | * change ->group_leader/pid. This is useful for cases where the threadgroup | ||
2745 | * needs to stay stable across blockable operations. | ||
2746 | * | ||
2747 | * fork and exit paths explicitly call threadgroup_change_{begin|end}() for | ||
2748 | * synchronization. While held, no new task will be added to threadgroup | ||
2749 | * and no existing live task will have its PF_EXITING set. | ||
2750 | * | 2720 | * |
2751 | * de_thread() does threadgroup_change_{begin|end}() when a non-leader | 2721 | * All operations which modify a threadgroup - a new thread joining the |
2752 | * sub-thread becomes a new leader. | 2722 | * group, death of a member thread (the assertion of PF_EXITING) and |
2723 | * exec(2) dethreading the process and replacing the leader - are wrapped | ||
2724 | * by threadgroup_change_{begin|end}(). This is to provide a place which | ||
2725 | * subsystems needing threadgroup stability can hook into for | ||
2726 | * synchronization. | ||
2753 | */ | 2727 | */ |
2754 | static inline void threadgroup_lock(struct task_struct *tsk) | 2728 | static inline void threadgroup_change_begin(struct task_struct *tsk) |
2755 | { | 2729 | { |
2756 | down_write(&tsk->signal->group_rwsem); | 2730 | might_sleep(); |
2731 | cgroup_threadgroup_change_begin(tsk); | ||
2757 | } | 2732 | } |
2758 | 2733 | ||
2759 | /** | 2734 | /** |
2760 | * threadgroup_unlock - unlock threadgroup | 2735 | * threadgroup_change_end - mark the end of changes to a threadgroup |
2761 | * @tsk: member task of the threadgroup to unlock | 2736 | * @tsk: task causing the changes |
2762 | * | 2737 | * |
2763 | * Reverse threadgroup_lock(). | 2738 | * See threadgroup_change_begin(). |
2764 | */ | 2739 | */ |
2765 | static inline void threadgroup_unlock(struct task_struct *tsk) | 2740 | static inline void threadgroup_change_end(struct task_struct *tsk) |
2766 | { | 2741 | { |
2767 | up_write(&tsk->signal->group_rwsem); | 2742 | cgroup_threadgroup_change_end(tsk); |
2768 | } | 2743 | } |
2769 | #else | ||
2770 | static inline void threadgroup_change_begin(struct task_struct *tsk) {} | ||
2771 | static inline void threadgroup_change_end(struct task_struct *tsk) {} | ||
2772 | static inline void threadgroup_lock(struct task_struct *tsk) {} | ||
2773 | static inline void threadgroup_unlock(struct task_struct *tsk) {} | ||
2774 | #endif | ||
2775 | 2744 | ||
2776 | #ifndef __HAVE_THREAD_FUNCTIONS | 2745 | #ifndef __HAVE_THREAD_FUNCTIONS |
2777 | 2746 | ||
diff --git a/init/Kconfig b/init/Kconfig index f0c2e681b506..7d1ffd2ae536 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -924,6 +924,7 @@ config NUMA_BALANCING_DEFAULT_ENABLED | |||
924 | menuconfig CGROUPS | 924 | menuconfig CGROUPS |
925 | bool "Control Group support" | 925 | bool "Control Group support" |
926 | select KERNFS | 926 | select KERNFS |
927 | select PERCPU_RWSEM | ||
927 | help | 928 | help |
928 | This option adds support for grouping sets of processes together, for | 929 | This option adds support for grouping sets of processes together, for |
929 | use with process control subsystems such as Cpusets, CFS, memory | 930 | use with process control subsystems such as Cpusets, CFS, memory |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 469dd547770c..9ef9fc8a774b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include <linux/slab.h> | 46 | #include <linux/slab.h> |
47 | #include <linux/spinlock.h> | 47 | #include <linux/spinlock.h> |
48 | #include <linux/rwsem.h> | 48 | #include <linux/rwsem.h> |
49 | #include <linux/percpu-rwsem.h> | ||
49 | #include <linux/string.h> | 50 | #include <linux/string.h> |
50 | #include <linux/sort.h> | 51 | #include <linux/sort.h> |
51 | #include <linux/kmod.h> | 52 | #include <linux/kmod.h> |
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); | |||
103 | */ | 104 | */ |
104 | static DEFINE_SPINLOCK(release_agent_path_lock); | 105 | static DEFINE_SPINLOCK(release_agent_path_lock); |
105 | 106 | ||
107 | struct percpu_rw_semaphore cgroup_threadgroup_rwsem; | ||
108 | |||
106 | #define cgroup_assert_mutex_or_rcu_locked() \ | 109 | #define cgroup_assert_mutex_or_rcu_locked() \ |
107 | rcu_lockdep_assert(rcu_read_lock_held() || \ | 110 | rcu_lockdep_assert(rcu_read_lock_held() || \ |
108 | lockdep_is_held(&cgroup_mutex), \ | 111 | lockdep_is_held(&cgroup_mutex), \ |
@@ -156,7 +159,7 @@ static bool cgrp_dfl_root_visible; | |||
156 | static bool cgroup_legacy_files_on_dfl; | 159 | static bool cgroup_legacy_files_on_dfl; |
157 | 160 | ||
158 | /* some controllers are not supported in the default hierarchy */ | 161 | /* some controllers are not supported in the default hierarchy */ |
159 | static unsigned int cgrp_dfl_root_inhibit_ss_mask; | 162 | static unsigned long cgrp_dfl_root_inhibit_ss_mask; |
160 | 163 | ||
161 | /* The list of hierarchy roots */ | 164 | /* The list of hierarchy roots */ |
162 | 165 | ||
@@ -175,18 +178,19 @@ static DEFINE_IDR(cgroup_hierarchy_idr); | |||
175 | */ | 178 | */ |
176 | static u64 css_serial_nr_next = 1; | 179 | static u64 css_serial_nr_next = 1; |
177 | 180 | ||
178 | /* This flag indicates whether tasks in the fork and exit paths should | 181 | /* |
179 | * check for fork/exit handlers to call. This avoids us having to do | 182 | * These bitmask flags indicate whether tasks in the fork and exit paths have |
180 | * extra work in the fork/exit path if none of the subsystems need to | 183 | * fork/exit handlers to call. This avoids us having to do extra work in the |
181 | * be called. | 184 | * fork/exit path to check which subsystems have fork/exit callbacks. |
182 | */ | 185 | */ |
183 | static int need_forkexit_callback __read_mostly; | 186 | static unsigned long have_fork_callback __read_mostly; |
187 | static unsigned long have_exit_callback __read_mostly; | ||
184 | 188 | ||
185 | static struct cftype cgroup_dfl_base_files[]; | 189 | static struct cftype cgroup_dfl_base_files[]; |
186 | static struct cftype cgroup_legacy_base_files[]; | 190 | static struct cftype cgroup_legacy_base_files[]; |
187 | 191 | ||
188 | static int rebind_subsystems(struct cgroup_root *dst_root, | 192 | static int rebind_subsystems(struct cgroup_root *dst_root, |
189 | unsigned int ss_mask); | 193 | unsigned long ss_mask); |
190 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 194 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
191 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | 195 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, |
192 | bool visible); | 196 | bool visible); |
@@ -261,7 +265,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, | |||
261 | * @cgrp: the cgroup of interest | 265 | * @cgrp: the cgroup of interest |
262 | * @ss: the subsystem of interest (%NULL returns @cgrp->self) | 266 | * @ss: the subsystem of interest (%NULL returns @cgrp->self) |
263 | * | 267 | * |
264 | * Similar to cgroup_css() but returns the effctive css, which is defined | 268 | * Similar to cgroup_css() but returns the effective css, which is defined |
265 | * as the matching css of the nearest ancestor including self which has @ss | 269 | * as the matching css of the nearest ancestor including self which has @ss |
266 | * enabled. If @ss is associated with the hierarchy @cgrp is on, this | 270 | * enabled. If @ss is associated with the hierarchy @cgrp is on, this |
267 | * function is guaranteed to return non-NULL css. | 271 | * function is guaranteed to return non-NULL css. |
@@ -409,6 +413,24 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
409 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ | 413 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ |
410 | (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) | 414 | (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) |
411 | 415 | ||
416 | /** | ||
417 | * for_each_subsys_which - filter for_each_subsys with a bitmask | ||
418 | * @ss: the iteration cursor | ||
419 | * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end | ||
420 | * @ss_maskp: a pointer to the bitmask | ||
421 | * | ||
422 | * The block will only run for cases where the ssid-th bit (1 << ssid) of | ||
423 | * the mask pointed to by @ss_maskp is set to 1. | ||
424 | */ | ||
425 | #define for_each_subsys_which(ss, ssid, ss_maskp) \ | ||
426 | if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \ | ||
427 | (ssid) = 0; \ | ||
428 | else \ | ||
429 | for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \ | ||
430 | if (((ss) = cgroup_subsys[ssid]) && false) \ | ||
431 | break; \ | ||
432 | else | ||
433 | |||
412 | /* iterate across the hierarchies */ | 434 | /* iterate across the hierarchies */ |
413 | #define for_each_root(root) \ | 435 | #define for_each_root(root) \ |
414 | list_for_each_entry((root), &cgroup_roots, root_list) | 436 | list_for_each_entry((root), &cgroup_roots, root_list) |
@@ -882,7 +904,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) | |||
882 | static void cgroup_free_root(struct cgroup_root *root) | 904 | static void cgroup_free_root(struct cgroup_root *root) |
883 | { | 905 | { |
884 | if (root) { | 906 | if (root) { |
885 | /* hierarhcy ID shoulid already have been released */ | 907 | /* hierarchy ID should already have been released */ |
886 | WARN_ON_ONCE(root->hierarchy_id); | 908 | WARN_ON_ONCE(root->hierarchy_id); |
887 | 909 | ||
888 | idr_destroy(&root->cgroup_idr); | 910 | idr_destroy(&root->cgroup_idr); |
@@ -998,7 +1020,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
998 | * update of a tasks cgroup pointer by cgroup_attach_task() | 1020 | * update of a tasks cgroup pointer by cgroup_attach_task() |
999 | */ | 1021 | */ |
1000 | 1022 | ||
1001 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); | 1023 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); |
1002 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; | 1024 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; |
1003 | static const struct file_operations proc_cgroupstats_operations; | 1025 | static const struct file_operations proc_cgroupstats_operations; |
1004 | 1026 | ||
@@ -1068,11 +1090,11 @@ static void cgroup_put(struct cgroup *cgrp) | |||
1068 | * @subtree_control is to be applied to @cgrp. The returned mask is always | 1090 | * @subtree_control is to be applied to @cgrp. The returned mask is always |
1069 | * a superset of @subtree_control and follows the usual hierarchy rules. | 1091 | * a superset of @subtree_control and follows the usual hierarchy rules. |
1070 | */ | 1092 | */ |
1071 | static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, | 1093 | static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, |
1072 | unsigned int subtree_control) | 1094 | unsigned long subtree_control) |
1073 | { | 1095 | { |
1074 | struct cgroup *parent = cgroup_parent(cgrp); | 1096 | struct cgroup *parent = cgroup_parent(cgrp); |
1075 | unsigned int cur_ss_mask = subtree_control; | 1097 | unsigned long cur_ss_mask = subtree_control; |
1076 | struct cgroup_subsys *ss; | 1098 | struct cgroup_subsys *ss; |
1077 | int ssid; | 1099 | int ssid; |
1078 | 1100 | ||
@@ -1082,11 +1104,10 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, | |||
1082 | return cur_ss_mask; | 1104 | return cur_ss_mask; |
1083 | 1105 | ||
1084 | while (true) { | 1106 | while (true) { |
1085 | unsigned int new_ss_mask = cur_ss_mask; | 1107 | unsigned long new_ss_mask = cur_ss_mask; |
1086 | 1108 | ||
1087 | for_each_subsys(ss, ssid) | 1109 | for_each_subsys_which(ss, ssid, &cur_ss_mask) |
1088 | if (cur_ss_mask & (1 << ssid)) | 1110 | new_ss_mask |= ss->depends_on; |
1089 | new_ss_mask |= ss->depends_on; | ||
1090 | 1111 | ||
1091 | /* | 1112 | /* |
1092 | * Mask out subsystems which aren't available. This can | 1113 | * Mask out subsystems which aren't available. This can |
@@ -1200,7 +1221,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
1200 | * @cgrp: target cgroup | 1221 | * @cgrp: target cgroup |
1201 | * @subsys_mask: mask of the subsystem ids whose files should be removed | 1222 | * @subsys_mask: mask of the subsystem ids whose files should be removed |
1202 | */ | 1223 | */ |
1203 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) | 1224 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) |
1204 | { | 1225 | { |
1205 | struct cgroup_subsys *ss; | 1226 | struct cgroup_subsys *ss; |
1206 | int i; | 1227 | int i; |
@@ -1215,18 +1236,16 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) | |||
1215 | } | 1236 | } |
1216 | } | 1237 | } |
1217 | 1238 | ||
1218 | static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) | 1239 | static int rebind_subsystems(struct cgroup_root *dst_root, |
1240 | unsigned long ss_mask) | ||
1219 | { | 1241 | { |
1220 | struct cgroup_subsys *ss; | 1242 | struct cgroup_subsys *ss; |
1221 | unsigned int tmp_ss_mask; | 1243 | unsigned long tmp_ss_mask; |
1222 | int ssid, i, ret; | 1244 | int ssid, i, ret; |
1223 | 1245 | ||
1224 | lockdep_assert_held(&cgroup_mutex); | 1246 | lockdep_assert_held(&cgroup_mutex); |
1225 | 1247 | ||
1226 | for_each_subsys(ss, ssid) { | 1248 | for_each_subsys_which(ss, ssid, &ss_mask) { |
1227 | if (!(ss_mask & (1 << ssid))) | ||
1228 | continue; | ||
1229 | |||
1230 | /* if @ss has non-root csses attached to it, can't move */ | 1249 | /* if @ss has non-root csses attached to it, can't move */ |
1231 | if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) | 1250 | if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) |
1232 | return -EBUSY; | 1251 | return -EBUSY; |
@@ -1253,7 +1272,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) | |||
1253 | * Just warn about it and continue. | 1272 | * Just warn about it and continue. |
1254 | */ | 1273 | */ |
1255 | if (cgrp_dfl_root_visible) { | 1274 | if (cgrp_dfl_root_visible) { |
1256 | pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", | 1275 | pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", |
1257 | ret, ss_mask); | 1276 | ret, ss_mask); |
1258 | pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); | 1277 | pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); |
1259 | } | 1278 | } |
@@ -1263,18 +1282,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) | |||
1263 | * Nothing can fail from this point on. Remove files for the | 1282 | * Nothing can fail from this point on. Remove files for the |
1264 | * removed subsystems and rebind each subsystem. | 1283 | * removed subsystems and rebind each subsystem. |
1265 | */ | 1284 | */ |
1266 | for_each_subsys(ss, ssid) | 1285 | for_each_subsys_which(ss, ssid, &ss_mask) |
1267 | if (ss_mask & (1 << ssid)) | 1286 | cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); |
1268 | cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); | ||
1269 | 1287 | ||
1270 | for_each_subsys(ss, ssid) { | 1288 | for_each_subsys_which(ss, ssid, &ss_mask) { |
1271 | struct cgroup_root *src_root; | 1289 | struct cgroup_root *src_root; |
1272 | struct cgroup_subsys_state *css; | 1290 | struct cgroup_subsys_state *css; |
1273 | struct css_set *cset; | 1291 | struct css_set *cset; |
1274 | 1292 | ||
1275 | if (!(ss_mask & (1 << ssid))) | ||
1276 | continue; | ||
1277 | |||
1278 | src_root = ss->root; | 1293 | src_root = ss->root; |
1279 | css = cgroup_css(&src_root->cgrp, ss); | 1294 | css = cgroup_css(&src_root->cgrp, ss); |
1280 | 1295 | ||
@@ -1338,7 +1353,7 @@ static int cgroup_show_options(struct seq_file *seq, | |||
1338 | } | 1353 | } |
1339 | 1354 | ||
1340 | struct cgroup_sb_opts { | 1355 | struct cgroup_sb_opts { |
1341 | unsigned int subsys_mask; | 1356 | unsigned long subsys_mask; |
1342 | unsigned int flags; | 1357 | unsigned int flags; |
1343 | char *release_agent; | 1358 | char *release_agent; |
1344 | bool cpuset_clone_children; | 1359 | bool cpuset_clone_children; |
@@ -1351,7 +1366,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1351 | { | 1366 | { |
1352 | char *token, *o = data; | 1367 | char *token, *o = data; |
1353 | bool all_ss = false, one_ss = false; | 1368 | bool all_ss = false, one_ss = false; |
1354 | unsigned int mask = -1U; | 1369 | unsigned long mask = -1UL; |
1355 | struct cgroup_subsys *ss; | 1370 | struct cgroup_subsys *ss; |
1356 | int nr_opts = 0; | 1371 | int nr_opts = 0; |
1357 | int i; | 1372 | int i; |
@@ -1495,7 +1510,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
1495 | int ret = 0; | 1510 | int ret = 0; |
1496 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); | 1511 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); |
1497 | struct cgroup_sb_opts opts; | 1512 | struct cgroup_sb_opts opts; |
1498 | unsigned int added_mask, removed_mask; | 1513 | unsigned long added_mask, removed_mask; |
1499 | 1514 | ||
1500 | if (root == &cgrp_dfl_root) { | 1515 | if (root == &cgrp_dfl_root) { |
1501 | pr_err("remount is not allowed\n"); | 1516 | pr_err("remount is not allowed\n"); |
@@ -1641,7 +1656,7 @@ static void init_cgroup_root(struct cgroup_root *root, | |||
1641 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); | 1656 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); |
1642 | } | 1657 | } |
1643 | 1658 | ||
1644 | static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) | 1659 | static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) |
1645 | { | 1660 | { |
1646 | LIST_HEAD(tmp_links); | 1661 | LIST_HEAD(tmp_links); |
1647 | struct cgroup *root_cgrp = &root->cgrp; | 1662 | struct cgroup *root_cgrp = &root->cgrp; |
@@ -2052,9 +2067,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, | |||
2052 | lockdep_assert_held(&css_set_rwsem); | 2067 | lockdep_assert_held(&css_set_rwsem); |
2053 | 2068 | ||
2054 | /* | 2069 | /* |
2055 | * We are synchronized through threadgroup_lock() against PF_EXITING | 2070 | * We are synchronized through cgroup_threadgroup_rwsem against |
2056 | * setting such that we can't race against cgroup_exit() changing the | 2071 | * PF_EXITING setting such that we can't race against cgroup_exit() |
2057 | * css_set to init_css_set and dropping the old one. | 2072 | * changing the css_set to init_css_set and dropping the old one. |
2058 | */ | 2073 | */ |
2059 | WARN_ON_ONCE(tsk->flags & PF_EXITING); | 2074 | WARN_ON_ONCE(tsk->flags & PF_EXITING); |
2060 | old_cset = task_css_set(tsk); | 2075 | old_cset = task_css_set(tsk); |
@@ -2111,10 +2126,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) | |||
2111 | * @src_cset and add it to @preloaded_csets, which should later be cleaned | 2126 | * @src_cset and add it to @preloaded_csets, which should later be cleaned |
2112 | * up by cgroup_migrate_finish(). | 2127 | * up by cgroup_migrate_finish(). |
2113 | * | 2128 | * |
2114 | * This function may be called without holding threadgroup_lock even if the | 2129 | * This function may be called without holding cgroup_threadgroup_rwsem |
2115 | * target is a process. Threads may be created and destroyed but as long | 2130 | * even if the target is a process. Threads may be created and destroyed |
2116 | * as cgroup_mutex is not dropped, no new css_set can be put into play and | 2131 | * but as long as cgroup_mutex is not dropped, no new css_set can be put |
2117 | * the preloaded css_sets are guaranteed to cover all migrations. | 2132 | * into play and the preloaded css_sets are guaranteed to cover all |
2133 | * migrations. | ||
2118 | */ | 2134 | */ |
2119 | static void cgroup_migrate_add_src(struct css_set *src_cset, | 2135 | static void cgroup_migrate_add_src(struct css_set *src_cset, |
2120 | struct cgroup *dst_cgrp, | 2136 | struct cgroup *dst_cgrp, |
@@ -2217,7 +2233,7 @@ err: | |||
2217 | * @threadgroup: whether @leader points to the whole process or a single task | 2233 | * @threadgroup: whether @leader points to the whole process or a single task |
2218 | * | 2234 | * |
2219 | * Migrate a process or task denoted by @leader to @cgrp. If migrating a | 2235 | * Migrate a process or task denoted by @leader to @cgrp. If migrating a |
2220 | * process, the caller must be holding threadgroup_lock of @leader. The | 2236 | * process, the caller must be holding cgroup_threadgroup_rwsem. The |
2221 | * caller is also responsible for invoking cgroup_migrate_add_src() and | 2237 | * caller is also responsible for invoking cgroup_migrate_add_src() and |
2222 | * cgroup_migrate_prepare_dst() on the targets before invoking this | 2238 | * cgroup_migrate_prepare_dst() on the targets before invoking this |
2223 | * function and following up with cgroup_migrate_finish(). | 2239 | * function and following up with cgroup_migrate_finish(). |
@@ -2345,7 +2361,7 @@ out_release_tset: | |||
2345 | * @leader: the task or the leader of the threadgroup to be attached | 2361 | * @leader: the task or the leader of the threadgroup to be attached |
2346 | * @threadgroup: attach the whole threadgroup? | 2362 | * @threadgroup: attach the whole threadgroup? |
2347 | * | 2363 | * |
2348 | * Call holding cgroup_mutex and threadgroup_lock of @leader. | 2364 | * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. |
2349 | */ | 2365 | */ |
2350 | static int cgroup_attach_task(struct cgroup *dst_cgrp, | 2366 | static int cgroup_attach_task(struct cgroup *dst_cgrp, |
2351 | struct task_struct *leader, bool threadgroup) | 2367 | struct task_struct *leader, bool threadgroup) |
@@ -2376,6 +2392,47 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
2376 | return ret; | 2392 | return ret; |
2377 | } | 2393 | } |
2378 | 2394 | ||
2395 | static int cgroup_procs_write_permission(struct task_struct *task, | ||
2396 | struct cgroup *dst_cgrp, | ||
2397 | struct kernfs_open_file *of) | ||
2398 | { | ||
2399 | const struct cred *cred = current_cred(); | ||
2400 | const struct cred *tcred = get_task_cred(task); | ||
2401 | int ret = 0; | ||
2402 | |||
2403 | /* | ||
2404 | * even if we're attaching all tasks in the thread group, we only | ||
2405 | * need to check permissions on one of them. | ||
2406 | */ | ||
2407 | if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && | ||
2408 | !uid_eq(cred->euid, tcred->uid) && | ||
2409 | !uid_eq(cred->euid, tcred->suid)) | ||
2410 | ret = -EACCES; | ||
2411 | |||
2412 | if (!ret && cgroup_on_dfl(dst_cgrp)) { | ||
2413 | struct super_block *sb = of->file->f_path.dentry->d_sb; | ||
2414 | struct cgroup *cgrp; | ||
2415 | struct inode *inode; | ||
2416 | |||
2417 | down_read(&css_set_rwsem); | ||
2418 | cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); | ||
2419 | up_read(&css_set_rwsem); | ||
2420 | |||
2421 | while (!cgroup_is_descendant(dst_cgrp, cgrp)) | ||
2422 | cgrp = cgroup_parent(cgrp); | ||
2423 | |||
2424 | ret = -ENOMEM; | ||
2425 | inode = kernfs_get_inode(sb, cgrp->procs_kn); | ||
2426 | if (inode) { | ||
2427 | ret = inode_permission(inode, MAY_WRITE); | ||
2428 | iput(inode); | ||
2429 | } | ||
2430 | } | ||
2431 | |||
2432 | put_cred(tcred); | ||
2433 | return ret; | ||
2434 | } | ||
2435 | |||
2379 | /* | 2436 | /* |
2380 | * Find the task_struct of the task to attach by vpid and pass it along to the | 2437 | * Find the task_struct of the task to attach by vpid and pass it along to the |
2381 | * function to attach either it or all tasks in its threadgroup. Will lock | 2438 | * function to attach either it or all tasks in its threadgroup. Will lock |
@@ -2385,7 +2442,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, | |||
2385 | size_t nbytes, loff_t off, bool threadgroup) | 2442 | size_t nbytes, loff_t off, bool threadgroup) |
2386 | { | 2443 | { |
2387 | struct task_struct *tsk; | 2444 | struct task_struct *tsk; |
2388 | const struct cred *cred = current_cred(), *tcred; | ||
2389 | struct cgroup *cgrp; | 2445 | struct cgroup *cgrp; |
2390 | pid_t pid; | 2446 | pid_t pid; |
2391 | int ret; | 2447 | int ret; |
@@ -2397,29 +2453,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, | |||
2397 | if (!cgrp) | 2453 | if (!cgrp) |
2398 | return -ENODEV; | 2454 | return -ENODEV; |
2399 | 2455 | ||
2400 | retry_find_task: | 2456 | percpu_down_write(&cgroup_threadgroup_rwsem); |
2401 | rcu_read_lock(); | 2457 | rcu_read_lock(); |
2402 | if (pid) { | 2458 | if (pid) { |
2403 | tsk = find_task_by_vpid(pid); | 2459 | tsk = find_task_by_vpid(pid); |
2404 | if (!tsk) { | 2460 | if (!tsk) { |
2405 | rcu_read_unlock(); | ||
2406 | ret = -ESRCH; | 2461 | ret = -ESRCH; |
2407 | goto out_unlock_cgroup; | 2462 | goto out_unlock_rcu; |
2408 | } | 2463 | } |
2409 | /* | 2464 | } else { |
2410 | * even if we're attaching all tasks in the thread group, we | ||
2411 | * only need to check permissions on one of them. | ||
2412 | */ | ||
2413 | tcred = __task_cred(tsk); | ||
2414 | if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && | ||
2415 | !uid_eq(cred->euid, tcred->uid) && | ||
2416 | !uid_eq(cred->euid, tcred->suid)) { | ||
2417 | rcu_read_unlock(); | ||
2418 | ret = -EACCES; | ||
2419 | goto out_unlock_cgroup; | ||
2420 | } | ||
2421 | } else | ||
2422 | tsk = current; | 2465 | tsk = current; |
2466 | } | ||
2423 | 2467 | ||
2424 | if (threadgroup) | 2468 | if (threadgroup) |
2425 | tsk = tsk->group_leader; | 2469 | tsk = tsk->group_leader; |
@@ -2431,35 +2475,23 @@ retry_find_task: | |||
2431 | */ | 2475 | */ |
2432 | if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { | 2476 | if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { |
2433 | ret = -EINVAL; | 2477 | ret = -EINVAL; |
2434 | rcu_read_unlock(); | 2478 | goto out_unlock_rcu; |
2435 | goto out_unlock_cgroup; | ||
2436 | } | 2479 | } |
2437 | 2480 | ||
2438 | get_task_struct(tsk); | 2481 | get_task_struct(tsk); |
2439 | rcu_read_unlock(); | 2482 | rcu_read_unlock(); |
2440 | 2483 | ||
2441 | threadgroup_lock(tsk); | 2484 | ret = cgroup_procs_write_permission(tsk, cgrp, of); |
2442 | if (threadgroup) { | 2485 | if (!ret) |
2443 | if (!thread_group_leader(tsk)) { | 2486 | ret = cgroup_attach_task(cgrp, tsk, threadgroup); |
2444 | /* | ||
2445 | * a race with de_thread from another thread's exec() | ||
2446 | * may strip us of our leadership, if this happens, | ||
2447 | * there is no choice but to throw this task away and | ||
2448 | * try again; this is | ||
2449 | * "double-double-toil-and-trouble-check locking". | ||
2450 | */ | ||
2451 | threadgroup_unlock(tsk); | ||
2452 | put_task_struct(tsk); | ||
2453 | goto retry_find_task; | ||
2454 | } | ||
2455 | } | ||
2456 | |||
2457 | ret = cgroup_attach_task(cgrp, tsk, threadgroup); | ||
2458 | |||
2459 | threadgroup_unlock(tsk); | ||
2460 | 2487 | ||
2461 | put_task_struct(tsk); | 2488 | put_task_struct(tsk); |
2462 | out_unlock_cgroup: | 2489 | goto out_unlock_threadgroup; |
2490 | |||
2491 | out_unlock_rcu: | ||
2492 | rcu_read_unlock(); | ||
2493 | out_unlock_threadgroup: | ||
2494 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
2463 | cgroup_kn_unlock(of->kn); | 2495 | cgroup_kn_unlock(of->kn); |
2464 | return ret ?: nbytes; | 2496 | return ret ?: nbytes; |
2465 | } | 2497 | } |
@@ -2542,19 +2574,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) | |||
2542 | return 0; | 2574 | return 0; |
2543 | } | 2575 | } |
2544 | 2576 | ||
2545 | static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) | 2577 | static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) |
2546 | { | 2578 | { |
2547 | struct cgroup_subsys *ss; | 2579 | struct cgroup_subsys *ss; |
2548 | bool printed = false; | 2580 | bool printed = false; |
2549 | int ssid; | 2581 | int ssid; |
2550 | 2582 | ||
2551 | for_each_subsys(ss, ssid) { | 2583 | for_each_subsys_which(ss, ssid, &ss_mask) { |
2552 | if (ss_mask & (1 << ssid)) { | 2584 | if (printed) |
2553 | if (printed) | 2585 | seq_putc(seq, ' '); |
2554 | seq_putc(seq, ' '); | 2586 | seq_printf(seq, "%s", ss->name); |
2555 | seq_printf(seq, "%s", ss->name); | 2587 | printed = true; |
2556 | printed = true; | ||
2557 | } | ||
2558 | } | 2588 | } |
2559 | if (printed) | 2589 | if (printed) |
2560 | seq_putc(seq, '\n'); | 2590 | seq_putc(seq, '\n'); |
@@ -2606,6 +2636,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) | |||
2606 | 2636 | ||
2607 | lockdep_assert_held(&cgroup_mutex); | 2637 | lockdep_assert_held(&cgroup_mutex); |
2608 | 2638 | ||
2639 | percpu_down_write(&cgroup_threadgroup_rwsem); | ||
2640 | |||
2609 | /* look up all csses currently attached to @cgrp's subtree */ | 2641 | /* look up all csses currently attached to @cgrp's subtree */ |
2610 | down_read(&css_set_rwsem); | 2642 | down_read(&css_set_rwsem); |
2611 | css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { | 2643 | css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { |
@@ -2661,17 +2693,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) | |||
2661 | goto out_finish; | 2693 | goto out_finish; |
2662 | last_task = task; | 2694 | last_task = task; |
2663 | 2695 | ||
2664 | threadgroup_lock(task); | ||
2665 | /* raced against de_thread() from another thread? */ | ||
2666 | if (!thread_group_leader(task)) { | ||
2667 | threadgroup_unlock(task); | ||
2668 | put_task_struct(task); | ||
2669 | continue; | ||
2670 | } | ||
2671 | |||
2672 | ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); | 2696 | ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); |
2673 | 2697 | ||
2674 | threadgroup_unlock(task); | ||
2675 | put_task_struct(task); | 2698 | put_task_struct(task); |
2676 | 2699 | ||
2677 | if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) | 2700 | if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) |
@@ -2681,6 +2704,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) | |||
2681 | 2704 | ||
2682 | out_finish: | 2705 | out_finish: |
2683 | cgroup_migrate_finish(&preloaded_csets); | 2706 | cgroup_migrate_finish(&preloaded_csets); |
2707 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
2684 | return ret; | 2708 | return ret; |
2685 | } | 2709 | } |
2686 | 2710 | ||
@@ -2689,8 +2713,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2689 | char *buf, size_t nbytes, | 2713 | char *buf, size_t nbytes, |
2690 | loff_t off) | 2714 | loff_t off) |
2691 | { | 2715 | { |
2692 | unsigned int enable = 0, disable = 0; | 2716 | unsigned long enable = 0, disable = 0; |
2693 | unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; | 2717 | unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; |
2694 | struct cgroup *cgrp, *child; | 2718 | struct cgroup *cgrp, *child; |
2695 | struct cgroup_subsys *ss; | 2719 | struct cgroup_subsys *ss; |
2696 | char *tok; | 2720 | char *tok; |
@@ -2702,11 +2726,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2702 | */ | 2726 | */ |
2703 | buf = strstrip(buf); | 2727 | buf = strstrip(buf); |
2704 | while ((tok = strsep(&buf, " "))) { | 2728 | while ((tok = strsep(&buf, " "))) { |
2729 | unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask; | ||
2730 | |||
2705 | if (tok[0] == '\0') | 2731 | if (tok[0] == '\0') |
2706 | continue; | 2732 | continue; |
2707 | for_each_subsys(ss, ssid) { | 2733 | for_each_subsys_which(ss, ssid, &tmp_ss_mask) { |
2708 | if (ss->disabled || strcmp(tok + 1, ss->name) || | 2734 | if (ss->disabled || strcmp(tok + 1, ss->name)) |
2709 | ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) | ||
2710 | continue; | 2735 | continue; |
2711 | 2736 | ||
2712 | if (*tok == '+') { | 2737 | if (*tok == '+') { |
@@ -2793,10 +2818,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2793 | * still around. In such cases, wait till it's gone using | 2818 | * still around. In such cases, wait till it's gone using |
2794 | * offline_waitq. | 2819 | * offline_waitq. |
2795 | */ | 2820 | */ |
2796 | for_each_subsys(ss, ssid) { | 2821 | for_each_subsys_which(ss, ssid, &css_enable) { |
2797 | if (!(css_enable & (1 << ssid))) | ||
2798 | continue; | ||
2799 | |||
2800 | cgroup_for_each_live_child(child, cgrp) { | 2822 | cgroup_for_each_live_child(child, cgrp) { |
2801 | DEFINE_WAIT(wait); | 2823 | DEFINE_WAIT(wait); |
2802 | 2824 | ||
@@ -3087,7 +3109,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) | |||
3087 | return ret; | 3109 | return ret; |
3088 | } | 3110 | } |
3089 | 3111 | ||
3090 | if (cft->seq_show == cgroup_populated_show) | 3112 | if (cft->write == cgroup_procs_write) |
3113 | cgrp->procs_kn = kn; | ||
3114 | else if (cft->seq_show == cgroup_populated_show) | ||
3091 | cgrp->populated_kn = kn; | 3115 | cgrp->populated_kn = kn; |
3092 | return 0; | 3116 | return 0; |
3093 | } | 3117 | } |
@@ -4322,7 +4346,7 @@ static struct cftype cgroup_legacy_base_files[] = { | |||
4322 | * | 4346 | * |
4323 | * On failure, no file is added. | 4347 | * On failure, no file is added. |
4324 | */ | 4348 | */ |
4325 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) | 4349 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) |
4326 | { | 4350 | { |
4327 | struct cgroup_subsys *ss; | 4351 | struct cgroup_subsys *ss; |
4328 | int i, ret = 0; | 4352 | int i, ret = 0; |
@@ -4931,7 +4955,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) | |||
4931 | * init_css_set is in the subsystem's root cgroup. */ | 4955 | * init_css_set is in the subsystem's root cgroup. */ |
4932 | init_css_set.subsys[ss->id] = css; | 4956 | init_css_set.subsys[ss->id] = css; |
4933 | 4957 | ||
4934 | need_forkexit_callback |= ss->fork || ss->exit; | 4958 | have_fork_callback |= (bool)ss->fork << ss->id; |
4959 | have_exit_callback |= (bool)ss->exit << ss->id; | ||
4935 | 4960 | ||
4936 | /* At system boot, before all subsystems have been | 4961 | /* At system boot, before all subsystems have been |
4937 | * registered, no tasks have been forked, so we don't | 4962 | * registered, no tasks have been forked, so we don't |
@@ -4989,6 +5014,7 @@ int __init cgroup_init(void) | |||
4989 | unsigned long key; | 5014 | unsigned long key; |
4990 | int ssid, err; | 5015 | int ssid, err; |
4991 | 5016 | ||
5017 | BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); | ||
4992 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); | 5018 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); |
4993 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); | 5019 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); |
4994 | 5020 | ||
@@ -5241,11 +5267,8 @@ void cgroup_post_fork(struct task_struct *child) | |||
5241 | * css_set; otherwise, @child might change state between ->fork() | 5267 | * css_set; otherwise, @child might change state between ->fork() |
5242 | * and addition to css_set. | 5268 | * and addition to css_set. |
5243 | */ | 5269 | */ |
5244 | if (need_forkexit_callback) { | 5270 | for_each_subsys_which(ss, i, &have_fork_callback) |
5245 | for_each_subsys(ss, i) | 5271 | ss->fork(child); |
5246 | if (ss->fork) | ||
5247 | ss->fork(child); | ||
5248 | } | ||
5249 | } | 5272 | } |
5250 | 5273 | ||
5251 | /** | 5274 | /** |
@@ -5289,16 +5312,12 @@ void cgroup_exit(struct task_struct *tsk) | |||
5289 | cset = task_css_set(tsk); | 5312 | cset = task_css_set(tsk); |
5290 | RCU_INIT_POINTER(tsk->cgroups, &init_css_set); | 5313 | RCU_INIT_POINTER(tsk->cgroups, &init_css_set); |
5291 | 5314 | ||
5292 | if (need_forkexit_callback) { | 5315 | /* see cgroup_post_fork() for details */ |
5293 | /* see cgroup_post_fork() for details */ | 5316 | for_each_subsys_which(ss, i, &have_exit_callback) { |
5294 | for_each_subsys(ss, i) { | 5317 | struct cgroup_subsys_state *old_css = cset->subsys[i]; |
5295 | if (ss->exit) { | 5318 | struct cgroup_subsys_state *css = task_css(tsk, i); |
5296 | struct cgroup_subsys_state *old_css = cset->subsys[i]; | ||
5297 | struct cgroup_subsys_state *css = task_css(tsk, i); | ||
5298 | 5319 | ||
5299 | ss->exit(css, old_css, tsk); | 5320 | ss->exit(css, old_css, tsk); |
5300 | } | ||
5301 | } | ||
5302 | } | 5321 | } |
5303 | 5322 | ||
5304 | if (put_cset) | 5323 | if (put_cset) |
diff --git a/kernel/fork.c b/kernel/fork.c index 4c95cb34243c..1bfefc6f96a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1141,10 +1141,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1141 | tty_audit_fork(sig); | 1141 | tty_audit_fork(sig); |
1142 | sched_autogroup_fork(sig); | 1142 | sched_autogroup_fork(sig); |
1143 | 1143 | ||
1144 | #ifdef CONFIG_CGROUPS | ||
1145 | init_rwsem(&sig->group_rwsem); | ||
1146 | #endif | ||
1147 | |||
1148 | sig->oom_score_adj = current->signal->oom_score_adj; | 1144 | sig->oom_score_adj = current->signal->oom_score_adj; |
1149 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | 1145 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
1150 | 1146 | ||