diff options
-rw-r--r-- | Documentation/cgroups.txt | 526 | ||||
-rw-r--r-- | include/linux/cgroup.h | 214 | ||||
-rw-r--r-- | include/linux/cgroup_subsys.h | 10 | ||||
-rw-r--r-- | include/linux/magic.h | 1 | ||||
-rw-r--r-- | include/linux/sched.h | 34 | ||||
-rw-r--r-- | init/Kconfig | 8 | ||||
-rw-r--r-- | init/main.c | 3 | ||||
-rw-r--r-- | kernel/Makefile | 1 | ||||
-rw-r--r-- | kernel/cgroup.c | 1198 |
9 files changed, 1994 insertions, 1 deletions
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt new file mode 100644 index 000000000000..4717887fd75d --- /dev/null +++ b/Documentation/cgroups.txt | |||
@@ -0,0 +1,526 @@ | |||
1 | CGROUPS | ||
2 | ------- | ||
3 | |||
4 | Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt | ||
5 | |||
6 | Original copyright statements from cpusets.txt: | ||
7 | Portions Copyright (C) 2004 BULL SA. | ||
8 | Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. | ||
9 | Modified by Paul Jackson <pj@sgi.com> | ||
10 | Modified by Christoph Lameter <clameter@sgi.com> | ||
11 | |||
12 | CONTENTS: | ||
13 | ========= | ||
14 | |||
15 | 1. Control Groups | ||
16 | 1.1 What are cgroups ? | ||
17 | 1.2 Why are cgroups needed ? | ||
18 | 1.3 How are cgroups implemented ? | ||
19 | 1.4 What does notify_on_release do ? | ||
20 | 1.5 How do I use cgroups ? | ||
21 | 2. Usage Examples and Syntax | ||
22 | 2.1 Basic Usage | ||
23 | 2.2 Attaching processes | ||
24 | 3. Kernel API | ||
25 | 3.1 Overview | ||
26 | 3.2 Synchronization | ||
27 | 3.3 Subsystem API | ||
28 | 4. Questions | ||
29 | |||
30 | 1. Control Groups | ||
31 | ========== | ||
32 | |||
33 | 1.1 What are cgroups ? | ||
34 | ---------------------- | ||
35 | |||
36 | Control Groups provide a mechanism for aggregating/partitioning sets of | ||
37 | tasks, and all their future children, into hierarchical groups with | ||
38 | specialized behaviour. | ||
39 | |||
40 | Definitions: | ||
41 | |||
42 | A *cgroup* associates a set of tasks with a set of parameters for one | ||
43 | or more subsystems. | ||
44 | |||
45 | A *subsystem* is a module that makes use of the task grouping | ||
46 | facilities provided by cgroups to treat groups of tasks in | ||
47 | particular ways. A subsystem is typically a "resource controller" that | ||
48 | schedules a resource or applies per-cgroup limits, but it may be | ||
49 | anything that wants to act on a group of processes, e.g. a | ||
50 | virtualization subsystem. | ||
51 | |||
52 | A *hierarchy* is a set of cgroups arranged in a tree, such that | ||
53 | every task in the system is in exactly one of the cgroups in the | ||
54 | hierarchy, and a set of subsystems; each subsystem has system-specific | ||
55 | state attached to each cgroup in the hierarchy. Each hierarchy has | ||
56 | an instance of the cgroup virtual filesystem associated with it. | ||
57 | |||
58 | At any one time there may be multiple active hierarchies of task | ||
59 | cgroups. Each hierarchy is a partition of all tasks in the system. | ||
60 | |||
61 | User level code may create and destroy cgroups by name in an | ||
62 | instance of the cgroup virtual file system, specify and query to | ||
63 | which cgroup a task is assigned, and list the task pids assigned to | ||
64 | a cgroup. Those creations and assignments only affect the hierarchy | ||
65 | associated with that instance of the cgroup file system. | ||
66 | |||
67 | On their own, the only use for cgroups is for simple job | ||
68 | tracking. The intention is that other subsystems hook into the generic | ||
69 | cgroup support to provide new attributes for cgroups, such as | ||
70 | accounting/limiting the resources which processes in a cgroup can | ||
71 | access. For example, cpusets (see Documentation/cpusets.txt) allows | ||
72 | you to associate a set of CPUs and a set of memory nodes with the | ||
73 | tasks in each cgroup. | ||
74 | |||
75 | 1.2 Why are cgroups needed ? | ||
76 | ---------------------------- | ||
77 | |||
78 | There are multiple efforts to provide process aggregations in the | ||
79 | Linux kernel, mainly for resource tracking purposes. Such efforts | ||
80 | include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server | ||
81 | namespaces. These all require the basic notion of a | ||
82 | grouping/partitioning of processes, with newly forked processes ending | ||
83 | in the same group (cgroup) as their parent process. | ||
84 | |||
85 | The kernel cgroup patch provides the minimum essential kernel | ||
86 | mechanisms required to efficiently implement such groups. It has | ||
87 | minimal impact on the system fast paths, and provides hooks for | ||
88 | specific subsystems such as cpusets to provide additional behaviour as | ||
89 | desired. | ||
90 | |||
91 | Multiple hierarchy support is provided to allow for situations where | ||
92 | the division of tasks into cgroups is distinctly different for | ||
93 | different subsystems - having parallel hierarchies allows each | ||
94 | hierarchy to be a natural division of tasks, without having to handle | ||
95 | complex combinations of tasks that would be present if several | ||
96 | unrelated subsystems needed to be forced into the same tree of | ||
97 | cgroups. | ||
98 | |||
99 | At one extreme, each resource controller or subsystem could be in a | ||
100 | separate hierarchy; at the other extreme, all subsystems | ||
101 | would be attached to the same hierarchy. | ||
102 | |||
103 | As an example of a scenario (originally proposed by vatsa@in.ibm.com) | ||
104 | that can benefit from multiple hierarchies, consider a large | ||
105 | university server with various users - students, professors, system | ||
106 | tasks etc. The resource planning for this server could be along the | ||
107 | following lines: | ||
108 | |||
109 | CPU : Top cpuset | ||
110 | / \ | ||
111 | CPUSet1 CPUSet2 | ||
112 | | | | ||
113 | (Profs) (Students) | ||
114 | |||
115 | In addition (system tasks) are attached to topcpuset (so | ||
116 | that they can run anywhere) with a limit of 20% | ||
117 | |||
118 | Memory : Professors (50%), students (30%), system (20%) | ||
119 | |||
120 | Disk : Prof (50%), students (30%), system (20%) | ||
121 | |||
122 | Network : WWW browsing (20%), Network File System (60%), others (20%) | ||
123 | / \ | ||
124 | Prof (15%) students (5%) | ||
125 | |||
126 | Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go | ||
127 | into NFS network class. | ||
128 | |||
129 | At the same time firefox/lynx will share an appropriate CPU/Memory class | ||
130 | depending on who launched it (prof/student). | ||
131 | |||
132 | With the ability to classify tasks differently for different resources | ||
133 | (by putting those resource subsystems in different hierarchies) then | ||
134 | the admin can easily set up a script which receives exec notifications | ||
135 | and depending on who is launching the browser he can | ||
136 | |||
137 | # echo browser_pid > /mnt/<restype>/<userclass>/tasks | ||
138 | |||
139 | With only a single hierarchy, he now would potentially have to create | ||
140 | a separate cgroup for every browser launched and associate it with | ||
141 | the appropriate network and other resource classes. This may lead to | ||
142 | proliferation of such cgroups. | ||
143 | |||
144 | Also let's say that the administrator would like to give enhanced network | ||
145 | access temporarily to a student's browser (since it is night and the user | ||
146 | wants to do online gaming :)) OR give one of the students' simulation | ||
147 | apps enhanced CPU power. | ||
148 | |||
149 | With the ability to write pids directly to resource classes, it's just a | ||
150 | matter of : | ||
151 | |||
152 | # echo pid > /mnt/network/<new_class>/tasks | ||
153 | (after some time) | ||
154 | # echo pid > /mnt/network/<orig_class>/tasks | ||
155 | |||
156 | Without this ability, he would have to split the cgroup into | ||
157 | multiple separate ones and then associate the new cgroups with the | ||
158 | new resource classes. | ||
159 | |||
160 | |||
161 | |||
162 | 1.3 How are cgroups implemented ? | ||
163 | --------------------------------- | ||
164 | |||
165 | Control Groups extends the kernel as follows: | ||
166 | |||
167 | - Each task in the system has a reference-counted pointer to a | ||
168 | css_set. | ||
169 | |||
170 | - A css_set contains a set of reference-counted pointers to | ||
171 | cgroup_subsys_state objects, one for each cgroup subsystem | ||
172 | registered in the system. There is no direct link from a task to | ||
173 | the cgroup of which it's a member in each hierarchy, but this | ||
174 | can be determined by following pointers through the | ||
175 | cgroup_subsys_state objects. This is because accessing the | ||
176 | subsystem state is something that's expected to happen frequently | ||
177 | and in performance-critical code, whereas operations that require a | ||
178 | task's actual cgroup assignments (in particular, moving between | ||
179 | cgroups) are less common. | ||
180 | |||
181 | - A cgroup hierarchy filesystem can be mounted for browsing and | ||
182 | manipulation from user space. | ||
183 | |||
184 | - You can list all the tasks (by pid) attached to any cgroup. | ||
185 | |||
186 | The implementation of cgroups requires a few, simple hooks | ||
187 | into the rest of the kernel, none in performance critical paths: | ||
188 | |||
189 | - in init/main.c, to initialize the root cgroups and initial | ||
190 | css_set at system boot. | ||
191 | |||
192 | - in fork and exit, to attach and detach a task from its css_set. | ||
193 | |||
194 | In addition a new file system, of type "cgroup" may be mounted, to | ||
195 | enable browsing and modifying the cgroups presently known to the | ||
196 | kernel. When mounting a cgroup hierarchy, you may specify a | ||
197 | comma-separated list of subsystems to mount as the filesystem mount | ||
198 | options. By default, mounting the cgroup filesystem attempts to | ||
199 | mount a hierarchy containing all registered subsystems. | ||
200 | |||
201 | If an active hierarchy with exactly the same set of subsystems already | ||
202 | exists, it will be reused for the new mount. If no existing hierarchy | ||
203 | matches, and any of the requested subsystems are in use in an existing | ||
204 | hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy | ||
205 | is activated, associated with the requested subsystems. | ||
206 | |||
207 | It's not currently possible to bind a new subsystem to an active | ||
208 | cgroup hierarchy, or to unbind a subsystem from an active cgroup | ||
209 | hierarchy. This may be possible in future, but is fraught with nasty | ||
210 | error-recovery issues. | ||
211 | |||
212 | When a cgroup filesystem is unmounted, if there are any | ||
213 | child cgroups created below the top-level cgroup, that hierarchy | ||
214 | will remain active even though unmounted; if there are no | ||
215 | child cgroups then the hierarchy will be deactivated. | ||
216 | |||
217 | No new system calls are added for cgroups - all support for | ||
218 | querying and modifying cgroups is via this cgroup file system. | ||
219 | |||
220 | Each task under /proc has an added file named 'cgroup' displaying, | ||
221 | for each active hierarchy, the subsystem names and the cgroup name | ||
222 | as the path relative to the root of the cgroup file system. | ||
223 | |||
224 | Each cgroup is represented by a directory in the cgroup file system | ||
225 | containing the following files describing that cgroup: | ||
226 | |||
227 | - tasks: list of tasks (by pid) attached to that cgroup | ||
228 | - notify_on_release flag: run /sbin/cgroup_release_agent on exit? | ||
229 | |||
230 | Other subsystems such as cpusets may add additional files in each | ||
231 | cgroup dir | ||
232 | |||
233 | New cgroups are created using the mkdir system call or shell | ||
234 | command. The properties of a cgroup, such as its flags, are | ||
235 | modified by writing to the appropriate file in that cgroup's | ||
236 | directory, as listed above. | ||
237 | |||
238 | The named hierarchical structure of nested cgroups allows partitioning | ||
239 | a large system into nested, dynamically changeable, "soft-partitions". | ||
240 | |||
241 | The attachment of each task, automatically inherited at fork by any | ||
242 | children of that task, to a cgroup allows organizing the work load | ||
243 | on a system into related sets of tasks. A task may be re-attached to | ||
244 | any other cgroup, if allowed by the permissions on the necessary | ||
245 | cgroup file system directories. | ||
246 | |||
247 | When a task is moved from one cgroup to another, it gets a new | ||
248 | css_set pointer - if there's an already existing css_set with the | ||
249 | desired collection of cgroups then that group is reused, else a new | ||
250 | css_set is allocated. Note that the current implementation uses a | ||
251 | linear search to locate an appropriate existing css_set, so isn't | ||
252 | very efficient. A future version will use a hash table for better | ||
253 | performance. | ||
254 | |||
255 | The use of a Linux virtual file system (vfs) to represent the | ||
256 | cgroup hierarchy provides for a familiar permission and name space | ||
257 | for cgroups, with a minimum of additional kernel code. | ||
258 | |||
259 | 1.4 What does notify_on_release do ? | ||
260 | ------------------------------------ | ||
261 | |||
262 | *** notify_on_release is disabled in the current patch set. It will be | ||
263 | *** reactivated in a future patch in a less-intrusive manner | ||
264 | |||
265 | If the notify_on_release flag is enabled (1) in a cgroup, then | ||
266 | whenever the last task in the cgroup leaves (exits or attaches to | ||
267 | some other cgroup) and the last child cgroup of that cgroup | ||
268 | is removed, then the kernel runs the command specified by the contents | ||
269 | of the "release_agent" file in that hierarchy's root directory, | ||
270 | supplying the pathname (relative to the mount point of the cgroup | ||
271 | file system) of the abandoned cgroup. This enables automatic | ||
272 | removal of abandoned cgroups. The default value of | ||
273 | notify_on_release in the root cgroup at system boot is disabled | ||
274 | (0). The default value of other cgroups at creation is the current | ||
275 | value of their parent's notify_on_release setting. The default value of | ||
276 | a cgroup hierarchy's release_agent path is empty. | ||
277 | |||
278 | 1.5 How do I use cgroups ? | ||
279 | -------------------------- | ||
280 | |||
281 | To start a new job that is to be contained within a cgroup, using | ||
282 | the "cpuset" cgroup subsystem, the steps are something like: | ||
283 | |||
284 | 1) mkdir /dev/cgroup | ||
285 | 2) mount -t cgroup -ocpuset cpuset /dev/cgroup | ||
286 | 3) Create the new cgroup by doing mkdir's and write's (or echo's) in | ||
287 | the /dev/cgroup virtual file system. | ||
288 | 4) Start a task that will be the "founding father" of the new job. | ||
289 | 5) Attach that task to the new cgroup by writing its pid to the | ||
290 | /dev/cgroup tasks file for that cgroup. | ||
291 | 6) fork, exec or clone the job tasks from this founding father task. | ||
292 | |||
293 | For example, the following sequence of commands will set up a cgroup | ||
294 | named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, | ||
295 | and then start a subshell 'sh' in that cgroup: | ||
296 | |||
297 | mount -t cgroup cpuset -ocpuset /dev/cgroup | ||
298 | cd /dev/cgroup | ||
299 | mkdir Charlie | ||
300 | cd Charlie | ||
301 | /bin/echo 2-3 > cpus | ||
302 | /bin/echo 1 > mems | ||
303 | /bin/echo $$ > tasks | ||
304 | sh | ||
305 | # The subshell 'sh' is now running in cgroup Charlie | ||
306 | # The next line should display '/Charlie' | ||
307 | cat /proc/self/cgroup | ||
308 | |||
309 | 2. Usage Examples and Syntax | ||
310 | ============================ | ||
311 | |||
312 | 2.1 Basic Usage | ||
313 | --------------- | ||
314 | |||
315 | Creating, modifying, using the cgroups can be done through the cgroup | ||
316 | virtual filesystem. | ||
317 | |||
318 | To mount a cgroup hierarchy with all available subsystems, type: | ||
319 | # mount -t cgroup xxx /dev/cgroup | ||
320 | |||
321 | The "xxx" is not interpreted by the cgroup code, but will appear in | ||
322 | /proc/mounts so may be any useful identifying string that you like. | ||
323 | |||
324 | To mount a cgroup hierarchy with just the cpuset and numtasks | ||
325 | subsystems, type: | ||
326 | # mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup | ||
327 | |||
328 | To change the set of subsystems bound to a mounted hierarchy, just | ||
329 | remount with different options: | ||
330 | |||
331 | # mount -o remount,cpuset,ns /dev/cgroup | ||
332 | |||
333 | Note that changing the set of subsystems is currently only supported | ||
334 | when the hierarchy consists of a single (root) cgroup. Supporting | ||
335 | the ability to arbitrarily bind/unbind subsystems from an existing | ||
336 | cgroup hierarchy is intended to be implemented in the future. | ||
337 | |||
338 | Then under /dev/cgroup you can find a tree that corresponds to the | ||
339 | tree of the cgroups in the system. For instance, /dev/cgroup | ||
340 | is the cgroup that holds the whole system. | ||
341 | |||
342 | If you want to create a new cgroup under /dev/cgroup: | ||
343 | # cd /dev/cgroup | ||
344 | # mkdir my_cgroup | ||
345 | |||
346 | Now you want to do something with this cgroup. | ||
347 | # cd my_cgroup | ||
348 | |||
349 | In this directory you can find several files: | ||
350 | # ls | ||
351 | notify_on_release release_agent tasks | ||
352 | (plus whatever files are added by the attached subsystems) | ||
353 | |||
354 | Now attach your shell to this cgroup: | ||
355 | # /bin/echo $$ > tasks | ||
356 | |||
357 | You can also create cgroups inside your cgroup by using mkdir in this | ||
358 | directory. | ||
359 | # mkdir my_sub_cs | ||
360 | |||
361 | To remove a cgroup, just use rmdir: | ||
362 | # rmdir my_sub_cs | ||
363 | |||
364 | This will fail if the cgroup is in use (has cgroups inside, or | ||
365 | has processes attached, or is held alive by other subsystem-specific | ||
366 | reference). | ||
367 | |||
368 | 2.2 Attaching processes | ||
369 | ----------------------- | ||
370 | |||
371 | # /bin/echo PID > tasks | ||
372 | |||
373 | Note that it is PID, not PIDs. You can only attach ONE task at a time. | ||
374 | If you have several tasks to attach, you have to do it one after another: | ||
375 | |||
376 | # /bin/echo PID1 > tasks | ||
377 | # /bin/echo PID2 > tasks | ||
378 | ... | ||
379 | # /bin/echo PIDn > tasks | ||
380 | |||
381 | 3. Kernel API | ||
382 | ============= | ||
383 | |||
384 | 3.1 Overview | ||
385 | ------------ | ||
386 | |||
387 | Each kernel subsystem that wants to hook into the generic cgroup | ||
388 | system needs to create a cgroup_subsys object. This contains | ||
389 | various methods, which are callbacks from the cgroup system, along | ||
390 | with a subsystem id which will be assigned by the cgroup system. | ||
391 | |||
392 | Other fields in the cgroup_subsys object include: | ||
393 | |||
394 | - subsys_id: a unique array index for the subsystem, indicating which | ||
395 | entry in cgroup->subsys[] this subsystem should be | ||
396 | managing. Initialized by cgroup_register_subsys(); prior to this | ||
397 | it should be initialized to -1 | ||
398 | |||
399 | - hierarchy: an index indicating which hierarchy, if any, this | ||
400 | subsystem is currently attached to. If this is -1, then the | ||
401 | subsystem is not attached to any hierarchy, and all tasks should be | ||
402 | considered to be members of the subsystem's top_cgroup. It should | ||
403 | be initialized to -1. | ||
404 | |||
405 | - name: should be initialized to a unique subsystem name prior to | ||
406 | calling cgroup_register_subsystem. Should be no longer than | ||
407 | MAX_CGROUP_TYPE_NAMELEN | ||
408 | |||
409 | Each cgroup object created by the system has an array of pointers, | ||
410 | indexed by subsystem id; this pointer is entirely managed by the | ||
411 | subsystem; the generic cgroup code will never touch this pointer. | ||
412 | |||
413 | 3.2 Synchronization | ||
414 | ------------------- | ||
415 | |||
416 | There is a global mutex, cgroup_mutex, used by the cgroup | ||
417 | system. This should be taken by anything that wants to modify a | ||
418 | cgroup. It may also be taken to prevent cgroups from being | ||
419 | modified, but more specific locks may be more appropriate in that | ||
420 | situation. | ||
421 | |||
422 | See kernel/cgroup.c for more details. | ||
423 | |||
424 | Subsystems can take/release the cgroup_mutex via the functions | ||
425 | cgroup_lock()/cgroup_unlock(), and can | ||
426 | take/release the callback_mutex via the functions | ||
427 | cgroup_lock()/cgroup_unlock(). | ||
428 | |||
429 | Accessing a task's cgroup pointer may be done in the following ways: | ||
430 | - while holding cgroup_mutex | ||
431 | - while holding the task's alloc_lock (via task_lock()) | ||
432 | - inside an rcu_read_lock() section via rcu_dereference() | ||
433 | |||
434 | 3.3 Subsystem API | ||
435 | -------------------------- | ||
436 | |||
437 | Each subsystem should: | ||
438 | |||
439 | - add an entry in linux/cgroup_subsys.h | ||
440 | - define a cgroup_subsys object called <name>_subsys | ||
441 | |||
442 | Each subsystem may export the following methods. The only mandatory | ||
443 | methods are create/destroy. Any others that are null are presumed to | ||
444 | be successful no-ops. | ||
445 | |||
446 | struct cgroup_subsys_state *create(struct cgroup_subsys *ss, struct cgroup *cont) | ||
447 | LL=cgroup_mutex | ||
448 | |||
449 | Called to create a subsystem state object for a cgroup. The | ||
450 | subsystem should allocate its subsystem state object for the passed | ||
451 | cgroup, returning a pointer to the new object on success or a | ||
452 | negative error code. On success, the subsystem pointer should point to | ||
453 | a structure of type cgroup_subsys_state (typically embedded in a | ||
454 | larger subsystem-specific object), which will be initialized by the | ||
455 | cgroup system. Note that this will be called at initialization to | ||
456 | create the root subsystem state for this subsystem; this case can be | ||
457 | identified by the passed cgroup object having a NULL parent (since | ||
458 | it's the root of the hierarchy) and may be an appropriate place for | ||
459 | initialization code. | ||
460 | |||
461 | void destroy(struct cgroup_subsys *ss, struct cgroup *cont) | ||
462 | LL=cgroup_mutex | ||
463 | |||
464 | The cgroup system is about to destroy the passed cgroup; the | ||
465 | subsystem should do any necessary cleanup | ||
466 | |||
467 | int can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | ||
468 | struct task_struct *task) | ||
469 | LL=cgroup_mutex | ||
470 | |||
471 | Called prior to moving a task into a cgroup; if the subsystem | ||
472 | returns an error, this will abort the attach operation. If a NULL | ||
473 | task is passed, then a successful result indicates that *any* | ||
474 | unspecified task can be moved into the cgroup. Note that this isn't | ||
475 | called on a fork. If this method returns 0 (success) then this should | ||
476 | remain valid while the caller holds cgroup_mutex. | ||
477 | |||
478 | void attach(struct cgroup_subsys *ss, struct cgroup *cont, | ||
479 | struct cgroup *old_cont, struct task_struct *task) | ||
480 | LL=cgroup_mutex | ||
481 | |||
482 | |||
483 | Called after the task has been attached to the cgroup, to allow any | ||
484 | post-attachment activity that requires memory allocations or blocking. | ||
485 | |||
486 | void fork(struct cgroup_subsys *ss, struct task_struct *task) | ||
487 | LL=callback_mutex, maybe read_lock(tasklist_lock) | ||
488 | |||
489 | Called when a task is forked into a cgroup. Also called during | ||
490 | registration for all existing tasks. | ||
491 | |||
492 | void exit(struct cgroup_subsys *ss, struct task_struct *task) | ||
493 | LL=callback_mutex | ||
494 | |||
495 | Called during task exit | ||
496 | |||
497 | int populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
498 | LL=none | ||
499 | |||
500 | Called after creation of a cgroup to allow a subsystem to populate | ||
501 | the cgroup directory with file entries. The subsystem should make | ||
502 | calls to cgroup_add_file() with objects of type cftype (see | ||
503 | include/linux/cgroup.h for details). Note that although this | ||
504 | method can return an error code, the error code is currently not | ||
505 | always handled well. | ||
506 | |||
507 | void bind(struct cgroup_subsys *ss, struct cgroup *root) | ||
508 | LL=callback_mutex | ||
509 | |||
510 | Called when a cgroup subsystem is rebound to a different hierarchy | ||
511 | and root cgroup. Currently this will only involve movement between | ||
512 | the default hierarchy (which never has sub-cgroups) and a hierarchy | ||
513 | that is being created/destroyed (and hence has no sub-cgroups). | ||
514 | |||
515 | 4. Questions | ||
516 | ============ | ||
517 | |||
518 | Q: what's up with this '/bin/echo' ? | ||
519 | A: bash's builtin 'echo' command does not check calls to write() against | ||
520 | errors. If you use it in the cgroup file system, you won't be | ||
521 | able to tell whether a command succeeded or failed. | ||
522 | |||
523 | Q: When I attach processes, only the first pid on the line really gets attached! | ||
524 | A: We can only return one error code per call to write(). So you should also | ||
525 | put only ONE pid. | ||
526 | |||
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h new file mode 100644 index 000000000000..60735dcf427a --- /dev/null +++ b/include/linux/cgroup.h | |||
@@ -0,0 +1,214 @@ | |||
1 | #ifndef _LINUX_CGROUP_H | ||
2 | #define _LINUX_CGROUP_H | ||
3 | /* | ||
4 | * cgroup interface | ||
5 | * | ||
6 | * Copyright (C) 2003 BULL SA | ||
7 | * Copyright (C) 2004-2006 Silicon Graphics, Inc. | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | #include <linux/sched.h> | ||
12 | #include <linux/kref.h> | ||
13 | #include <linux/cpumask.h> | ||
14 | #include <linux/nodemask.h> | ||
15 | #include <linux/rcupdate.h> | ||
16 | |||
17 | #ifdef CONFIG_CGROUPS | ||
18 | |||
19 | struct cgroupfs_root; | ||
20 | struct cgroup_subsys; | ||
21 | struct inode; | ||
22 | |||
23 | extern int cgroup_init_early(void); | ||
24 | extern int cgroup_init(void); | ||
25 | extern void cgroup_init_smp(void); | ||
26 | extern void cgroup_lock(void); | ||
27 | extern void cgroup_unlock(void); | ||
28 | |||
29 | /* Per-subsystem/per-cgroup state maintained by the system. */ | ||
30 | struct cgroup_subsys_state { | ||
31 | /* The cgroup that this subsystem is attached to. Useful | ||
32 | * for subsystems that want to know about the cgroup | ||
33 | * hierarchy structure */ | ||
34 | struct cgroup *cgroup; | ||
35 | |||
36 | /* State maintained by the cgroup system to allow | ||
37 | * subsystems to be "busy". Should be accessed via css_get() | ||
38 | * and css_put() */ | ||
39 | |||
40 | atomic_t refcnt; | ||
41 | |||
42 | unsigned long flags; | ||
43 | }; | ||
44 | |||
45 | /* bits in struct cgroup_subsys_state flags field */ | ||
46 | enum { | ||
47 | CSS_ROOT, /* This CSS is the root of the subsystem */ | ||
48 | }; | ||
49 | |||
50 | /* | ||
51 | * Call css_get() to hold a reference on the cgroup; | ||
52 | * | ||
53 | */ | ||
54 | |||
55 | static inline void css_get(struct cgroup_subsys_state *css) | ||
56 | { | ||
57 | /* We don't need to reference count the root state */ | ||
58 | if (!test_bit(CSS_ROOT, &css->flags)) | ||
59 | atomic_inc(&css->refcnt); | ||
60 | } | ||
61 | /* | ||
62 | * css_put() should be called to release a reference taken by | ||
63 | * css_get() | ||
64 | */ | ||
65 | |||
66 | static inline void css_put(struct cgroup_subsys_state *css) | ||
67 | { | ||
68 | if (!test_bit(CSS_ROOT, &css->flags)) | ||
69 | atomic_dec(&css->refcnt); | ||
70 | } | ||
71 | |||
72 | struct cgroup { | ||
73 | unsigned long flags; /* "unsigned long" so bitops work */ | ||
74 | |||
75 | /* count users of this cgroup. >0 means busy, but doesn't | ||
76 | * necessarily indicate the number of tasks in the | ||
77 | * cgroup */ | ||
78 | atomic_t count; | ||
79 | |||
80 | /* | ||
81 | * We link our 'sibling' struct into our parent's 'children'. | ||
82 | * Our children link their 'sibling' into our 'children'. | ||
83 | */ | ||
84 | struct list_head sibling; /* my parent's children */ | ||
85 | struct list_head children; /* my children */ | ||
86 | |||
87 | struct cgroup *parent; /* my parent */ | ||
88 | struct dentry *dentry; /* cgroup fs entry */ | ||
89 | |||
90 | /* Private pointers for each registered subsystem */ | ||
91 | struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; | ||
92 | |||
93 | struct cgroupfs_root *root; | ||
94 | struct cgroup *top_cgroup; | ||
95 | }; | ||
96 | |||
97 | /* struct cftype: | ||
98 | * | ||
99 | * The files in the cgroup filesystem mostly have a very simple read/write | ||
100 | * handling, some common function will take care of it. Nevertheless some cases | ||
101 | * (read tasks) are special and therefore I define this structure for every | ||
102 | * kind of file. | ||
103 | * | ||
104 | * | ||
105 | * When reading/writing to a file: | ||
106 | * - the cgroup to use in file->f_dentry->d_parent->d_fsdata | ||
107 | * - the 'cftype' of the file is file->f_dentry->d_fsdata | ||
108 | */ | ||
109 | |||
110 | #define MAX_CFTYPE_NAME 64 | ||
111 | struct cftype { | ||
112 | /* By convention, the name should begin with the name of the | ||
113 | * subsystem, followed by a period */ | ||
114 | char name[MAX_CFTYPE_NAME]; | ||
115 | int private; | ||
116 | int (*open) (struct inode *inode, struct file *file); | ||
117 | ssize_t (*read) (struct cgroup *cont, struct cftype *cft, | ||
118 | struct file *file, | ||
119 | char __user *buf, size_t nbytes, loff_t *ppos); | ||
120 | /* | ||
121 | * read_uint() is a shortcut for the common case of returning a | ||
122 | * single integer. Use it in place of read() | ||
123 | */ | ||
124 | u64 (*read_uint) (struct cgroup *cont, struct cftype *cft); | ||
125 | ssize_t (*write) (struct cgroup *cont, struct cftype *cft, | ||
126 | struct file *file, | ||
127 | const char __user *buf, size_t nbytes, loff_t *ppos); | ||
128 | int (*release) (struct inode *inode, struct file *file); | ||
129 | }; | ||
130 | |||
131 | /* Add a new file to the given cgroup directory. Should only be | ||
132 | * called by subsystems from within a populate() method */ | ||
133 | int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys, | ||
134 | const struct cftype *cft); | ||
135 | |||
136 | /* Add a set of new files to the given cgroup directory. Should | ||
137 | * only be called by subsystems from within a populate() method */ | ||
138 | int cgroup_add_files(struct cgroup *cont, | ||
139 | struct cgroup_subsys *subsys, | ||
140 | const struct cftype cft[], | ||
141 | int count); | ||
142 | |||
143 | int cgroup_is_removed(const struct cgroup *cont); | ||
144 | |||
145 | int cgroup_path(const struct cgroup *cont, char *buf, int buflen); | ||
146 | |||
147 | /* Return true if the cgroup is a descendant of the current cgroup */ | ||
148 | int cgroup_is_descendant(const struct cgroup *cont); | ||
149 | |||
150 | /* Control Group subsystem type. See Documentation/cgroups.txt for details */ | ||
151 | |||
152 | struct cgroup_subsys { | ||
153 | struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, | ||
154 | struct cgroup *cont); | ||
155 | void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cont); | ||
156 | int (*can_attach)(struct cgroup_subsys *ss, | ||
157 | struct cgroup *cont, struct task_struct *tsk); | ||
158 | void (*attach)(struct cgroup_subsys *ss, struct cgroup *cont, | ||
159 | struct cgroup *old_cont, struct task_struct *tsk); | ||
160 | void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); | ||
161 | void (*exit)(struct cgroup_subsys *ss, struct task_struct *task); | ||
162 | int (*populate)(struct cgroup_subsys *ss, | ||
163 | struct cgroup *cont); | ||
164 | void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); | ||
165 | int subsys_id; | ||
166 | int active; | ||
167 | int early_init; | ||
168 | #define MAX_CGROUP_TYPE_NAMELEN 32 | ||
169 | const char *name; | ||
170 | |||
171 | /* Protected by RCU */ | ||
172 | struct cgroupfs_root *root; | ||
173 | |||
174 | struct list_head sibling; | ||
175 | |||
176 | void *private; | ||
177 | }; | ||
178 | |||
179 | #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; | ||
180 | #include <linux/cgroup_subsys.h> | ||
181 | #undef SUBSYS | ||
182 | |||
183 | static inline struct cgroup_subsys_state *cgroup_subsys_state( | ||
184 | struct cgroup *cont, int subsys_id) | ||
185 | { | ||
186 | return cont->subsys[subsys_id]; | ||
187 | } | ||
188 | |||
189 | static inline struct cgroup_subsys_state *task_subsys_state( | ||
190 | struct task_struct *task, int subsys_id) | ||
191 | { | ||
192 | return rcu_dereference(task->cgroups.subsys[subsys_id]); | ||
193 | } | ||
194 | |||
195 | static inline struct cgroup* task_cgroup(struct task_struct *task, | ||
196 | int subsys_id) | ||
197 | { | ||
198 | return task_subsys_state(task, subsys_id)->cgroup; | ||
199 | } | ||
200 | |||
201 | int cgroup_path(const struct cgroup *cont, char *buf, int buflen); | ||
202 | |||
203 | #else /* !CONFIG_CGROUPS */ | ||
204 | |||
205 | static inline int cgroup_init_early(void) { return 0; } | ||
206 | static inline int cgroup_init(void) { return 0; } | ||
207 | static inline void cgroup_init_smp(void) {} | ||
208 | |||
209 | static inline void cgroup_lock(void) {} | ||
210 | static inline void cgroup_unlock(void) {} | ||
211 | |||
212 | #endif /* !CONFIG_CGROUPS */ | ||
213 | |||
214 | #endif /* _LINUX_CGROUP_H */ | ||
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h new file mode 100644 index 000000000000..f8eddbbcad9a --- /dev/null +++ b/include/linux/cgroup_subsys.h | |||
@@ -0,0 +1,10 @@ | |||
1 | /* Add subsystem definitions of the form SUBSYS(<name>) in this | ||
2 | * file. Surround each one by a line of comment markers so that | ||
3 | * patches don't collide | ||
4 | */ | ||
5 | |||
6 | /* */ | ||
7 | |||
8 | /* */ | ||
9 | |||
10 | /* */ | ||
diff --git a/include/linux/magic.h b/include/linux/magic.h index 722d4755060f..1fa0c2ce4dec 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h | |||
@@ -37,6 +37,7 @@ | |||
37 | 37 | ||
38 | #define SMB_SUPER_MAGIC 0x517B | 38 | #define SMB_SUPER_MAGIC 0x517B |
39 | #define USBDEVICE_SUPER_MAGIC 0x9fa2 | 39 | #define USBDEVICE_SUPER_MAGIC 0x9fa2 |
40 | #define CGROUP_SUPER_MAGIC 0x27e0eb | ||
40 | 41 | ||
41 | #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA | 42 | #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA |
42 | #define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA | 43 | #define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 10a83d8d5775..af2ed4bae678 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -894,6 +894,34 @@ struct sched_entity { | |||
894 | #endif | 894 | #endif |
895 | }; | 895 | }; |
896 | 896 | ||
897 | #ifdef CONFIG_CGROUPS | ||
898 | |||
899 | #define SUBSYS(_x) _x ## _subsys_id, | ||
900 | enum cgroup_subsys_id { | ||
901 | #include <linux/cgroup_subsys.h> | ||
902 | CGROUP_SUBSYS_COUNT | ||
903 | }; | ||
904 | #undef SUBSYS | ||
905 | |||
906 | /* A css_set is a structure holding pointers to a set of | ||
907 | * cgroup_subsys_state objects. | ||
908 | */ | ||
909 | |||
910 | struct css_set { | ||
911 | |||
912 | /* Set of subsystem states, one for each subsystem. NULL for | ||
913 | * subsystems that aren't part of this hierarchy. These | ||
914 | * pointers reduce the number of dereferences required to get | ||
915 | * from a task to its state for a given cgroup, but result | ||
916 | * in increased space usage if tasks are in wildly different | ||
917 | * groupings across different hierarchies. This array is | ||
918 | * immutable after creation */ | ||
919 | struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; | ||
920 | |||
921 | }; | ||
922 | |||
923 | #endif /* CONFIG_CGROUPS */ | ||
924 | |||
897 | struct task_struct { | 925 | struct task_struct { |
898 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ | 926 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ |
899 | void *stack; | 927 | void *stack; |
@@ -1130,6 +1158,9 @@ struct task_struct { | |||
1130 | int cpuset_mems_generation; | 1158 | int cpuset_mems_generation; |
1131 | int cpuset_mem_spread_rotor; | 1159 | int cpuset_mem_spread_rotor; |
1132 | #endif | 1160 | #endif |
1161 | #ifdef CONFIG_CGROUPS | ||
1162 | struct css_set cgroups; | ||
1163 | #endif | ||
1133 | #ifdef CONFIG_FUTEX | 1164 | #ifdef CONFIG_FUTEX |
1134 | struct robust_list_head __user *robust_list; | 1165 | struct robust_list_head __user *robust_list; |
1135 | #ifdef CONFIG_COMPAT | 1166 | #ifdef CONFIG_COMPAT |
@@ -1625,7 +1656,8 @@ static inline int thread_group_empty(struct task_struct *p) | |||
1625 | /* | 1656 | /* |
1626 | * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring | 1657 | * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring |
1627 | * subscriptions and synchronises with wait4(). Also used in procfs. Also | 1658 | * subscriptions and synchronises with wait4(). Also used in procfs. Also |
1628 | * pins the final release of task.io_context. Also protects ->cpuset. | 1659 | * pins the final release of task.io_context. Also protects ->cpuset and |
1660 | * ->cgroups.subsys[]. | ||
1629 | * | 1661 | * |
1630 | * Nests both inside and outside of read_lock(&tasklist_lock). | 1662 | * Nests both inside and outside of read_lock(&tasklist_lock). |
1631 | * It must not be nested with write_lock_irq(&tasklist_lock), | 1663 | * It must not be nested with write_lock_irq(&tasklist_lock), |
diff --git a/init/Kconfig b/init/Kconfig index a29a688c47d3..51b3d14f44f1 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -270,6 +270,14 @@ config LOG_BUF_SHIFT | |||
270 | 13 => 8 KB | 270 | 13 => 8 KB |
271 | 12 => 4 KB | 271 | 12 => 4 KB |
272 | 272 | ||
273 | config CGROUPS | ||
274 | bool "Control Group support" | ||
275 | help | ||
276 | This option will let you use process cgroup subsystems | ||
277 | such as Cpusets | ||
278 | |||
279 | Say N if unsure. | ||
280 | |||
273 | config CPUSETS | 281 | config CPUSETS |
274 | bool "Cpuset support" | 282 | bool "Cpuset support" |
275 | depends on SMP | 283 | depends on SMP |
diff --git a/init/main.c b/init/main.c index 9def935ab13a..0dd0e7a1f632 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/writeback.h> | 39 | #include <linux/writeback.h> |
40 | #include <linux/cpu.h> | 40 | #include <linux/cpu.h> |
41 | #include <linux/cpuset.h> | 41 | #include <linux/cpuset.h> |
42 | #include <linux/cgroup.h> | ||
42 | #include <linux/efi.h> | 43 | #include <linux/efi.h> |
43 | #include <linux/tick.h> | 44 | #include <linux/tick.h> |
44 | #include <linux/interrupt.h> | 45 | #include <linux/interrupt.h> |
@@ -523,6 +524,7 @@ asmlinkage void __init start_kernel(void) | |||
523 | */ | 524 | */ |
524 | unwind_init(); | 525 | unwind_init(); |
525 | lockdep_init(); | 526 | lockdep_init(); |
527 | cgroup_init_early(); | ||
526 | 528 | ||
527 | local_irq_disable(); | 529 | local_irq_disable(); |
528 | early_boot_irqs_off(); | 530 | early_boot_irqs_off(); |
@@ -640,6 +642,7 @@ asmlinkage void __init start_kernel(void) | |||
640 | #ifdef CONFIG_PROC_FS | 642 | #ifdef CONFIG_PROC_FS |
641 | proc_root_init(); | 643 | proc_root_init(); |
642 | #endif | 644 | #endif |
645 | cgroup_init(); | ||
643 | cpuset_init(); | 646 | cpuset_init(); |
644 | taskstats_init_early(); | 647 | taskstats_init_early(); |
645 | delayacct_init(); | 648 | delayacct_init(); |
diff --git a/kernel/Makefile b/kernel/Makefile index 001bd3b65dd1..ea8c8a12e19a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -36,6 +36,7 @@ obj-$(CONFIG_PM) += power/ | |||
36 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 36 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
37 | obj-$(CONFIG_KEXEC) += kexec.o | 37 | obj-$(CONFIG_KEXEC) += kexec.o |
38 | obj-$(CONFIG_COMPAT) += compat.o | 38 | obj-$(CONFIG_COMPAT) += compat.o |
39 | obj-$(CONFIG_CGROUPS) += cgroup.o | ||
39 | obj-$(CONFIG_CPUSETS) += cpuset.o | 40 | obj-$(CONFIG_CPUSETS) += cpuset.o |
40 | obj-$(CONFIG_IKCONFIG) += configs.o | 41 | obj-$(CONFIG_IKCONFIG) += configs.o |
41 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o | 42 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c new file mode 100644 index 000000000000..6ba857bec71b --- /dev/null +++ b/kernel/cgroup.c | |||
@@ -0,0 +1,1198 @@ | |||
1 | /* | ||
2 | * kernel/cgroup.c | ||
3 | * | ||
4 | * Generic process-grouping system. | ||
5 | * | ||
6 | * Based originally on the cpuset system, extracted by Paul Menage | ||
7 | * Copyright (C) 2006 Google, Inc | ||
8 | * | ||
9 | * Copyright notices from the original cpuset code: | ||
10 | * -------------------------------------------------- | ||
11 | * Copyright (C) 2003 BULL SA. | ||
12 | * Copyright (C) 2004-2006 Silicon Graphics, Inc. | ||
13 | * | ||
14 | * Portions derived from Patrick Mochel's sysfs code. | ||
15 | * sysfs is Copyright (c) 2001-3 Patrick Mochel | ||
16 | * | ||
17 | * 2003-10-10 Written by Simon Derr. | ||
18 | * 2003-10-22 Updates by Stephen Hemminger. | ||
19 | * 2004 May-July Rework by Paul Jackson. | ||
20 | * --------------------------------------------------- | ||
21 | * | ||
22 | * This file is subject to the terms and conditions of the GNU General Public | ||
23 | * License. See the file COPYING in the main directory of the Linux | ||
24 | * distribution for more details. | ||
25 | */ | ||
26 | |||
27 | #include <linux/cgroup.h> | ||
28 | #include <linux/errno.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/kernel.h> | ||
31 | #include <linux/list.h> | ||
32 | #include <linux/mm.h> | ||
33 | #include <linux/mutex.h> | ||
34 | #include <linux/mount.h> | ||
35 | #include <linux/pagemap.h> | ||
36 | #include <linux/rcupdate.h> | ||
37 | #include <linux/sched.h> | ||
38 | #include <linux/seq_file.h> | ||
39 | #include <linux/slab.h> | ||
40 | #include <linux/magic.h> | ||
41 | #include <linux/spinlock.h> | ||
42 | #include <linux/string.h> | ||
43 | |||
44 | #include <asm/atomic.h> | ||
45 | |||
46 | /* Generate an array of cgroup subsystem pointers */ | ||
47 | #define SUBSYS(_x) &_x ## _subsys, | ||
48 | |||
49 | static struct cgroup_subsys *subsys[] = { | ||
50 | #include <linux/cgroup_subsys.h> | ||
51 | }; | ||
52 | |||
53 | /* | ||
54 | * A cgroupfs_root represents the root of a cgroup hierarchy, | ||
55 | * and may be associated with a superblock to form an active | ||
56 | * hierarchy | ||
57 | */ | ||
58 | struct cgroupfs_root { | ||
59 | struct super_block *sb; | ||
60 | |||
61 | /* | ||
62 | * The bitmask of subsystems intended to be attached to this | ||
63 | * hierarchy | ||
64 | */ | ||
65 | unsigned long subsys_bits; | ||
66 | |||
67 | /* The bitmask of subsystems currently attached to this hierarchy */ | ||
68 | unsigned long actual_subsys_bits; | ||
69 | |||
70 | /* A list running through the attached subsystems */ | ||
71 | struct list_head subsys_list; | ||
72 | |||
73 | /* The root cgroup for this hierarchy */ | ||
74 | struct cgroup top_cgroup; | ||
75 | |||
76 | /* Tracks how many cgroups are currently defined in hierarchy.*/ | ||
77 | int number_of_cgroups; | ||
78 | |||
79 | /* A list running through the mounted hierarchies */ | ||
80 | struct list_head root_list; | ||
81 | |||
82 | /* Hierarchy-specific flags */ | ||
83 | unsigned long flags; | ||
84 | }; | ||
85 | |||
86 | |||
87 | /* | ||
88 | * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the | ||
89 | * subsystems that are otherwise unattached - it never has more than a | ||
90 | * single cgroup, and all tasks are part of that cgroup. | ||
91 | */ | ||
92 | static struct cgroupfs_root rootnode; | ||
93 | |||
94 | /* The list of hierarchy roots */ | ||
95 | |||
96 | static LIST_HEAD(roots); | ||
97 | |||
98 | /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ | ||
99 | #define dummytop (&rootnode.top_cgroup) | ||
100 | |||
101 | /* This flag indicates whether tasks in the fork and exit paths should | ||
102 | * take callback_mutex and check for fork/exit handlers to call. This | ||
103 | * avoids us having to do extra work in the fork/exit path if none of the | ||
104 | * subsystems need to be called. | ||
105 | */ | ||
106 | static int need_forkexit_callback; | ||
107 | |||
108 | /* bits in struct cgroup flags field */ | ||
109 | enum { | ||
110 | CONT_REMOVED, | ||
111 | }; | ||
112 | |||
113 | /* convenient tests for these bits */ | ||
114 | inline int cgroup_is_removed(const struct cgroup *cont) | ||
115 | { | ||
116 | return test_bit(CONT_REMOVED, &cont->flags); | ||
117 | } | ||
118 | |||
119 | /* bits in struct cgroupfs_root flags field */ | ||
120 | enum { | ||
121 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ | ||
122 | }; | ||
123 | |||
124 | /* | ||
125 | * for_each_subsys() allows you to iterate on each subsystem attached to | ||
126 | * an active hierarchy | ||
127 | */ | ||
128 | #define for_each_subsys(_root, _ss) \ | ||
129 | list_for_each_entry(_ss, &_root->subsys_list, sibling) | ||
130 | |||
131 | /* for_each_root() allows you to iterate across the active hierarchies */ | ||
132 | #define for_each_root(_root) \ | ||
133 | list_for_each_entry(_root, &roots, root_list) | ||
134 | |||
135 | /* | ||
136 | * There is one global cgroup mutex. We also require taking | ||
137 | * task_lock() when dereferencing a task's cgroup subsys pointers. | ||
138 | * See "The task_lock() exception", at the end of this comment. | ||
139 | * | ||
140 | * A task must hold cgroup_mutex to modify cgroups. | ||
141 | * | ||
142 | * Any task can increment and decrement the count field without lock. | ||
143 | * So in general, code holding cgroup_mutex can't rely on the count | ||
144 | * field not changing. However, if the count goes to zero, then only | ||
145 | * attach_task() can increment it again. Because a count of zero | ||
146 | * means that no tasks are currently attached, therefore there is no | ||
147 | * way a task attached to that cgroup can fork (the other way to | ||
148 | * increment the count). So code holding cgroup_mutex can safely | ||
149 | * assume that if the count is zero, it will stay zero. Similarly, if | ||
150 | * a task holds cgroup_mutex on a cgroup with zero count, it | ||
151 | * knows that the cgroup won't be removed, as cgroup_rmdir() | ||
152 | * needs that mutex. | ||
153 | * | ||
154 | * The cgroup_common_file_write handler for operations that modify | ||
155 | * the cgroup hierarchy holds cgroup_mutex across the entire operation, | ||
156 | * single threading all such cgroup modifications across the system. | ||
157 | * | ||
158 | * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't | ||
159 | * (usually) take cgroup_mutex. These are the two most performance | ||
160 | * critical pieces of code here. The exception occurs on cgroup_exit(), | ||
161 | * when a task in a notify_on_release cgroup exits. Then cgroup_mutex | ||
162 | * is taken, and if the cgroup count is zero, a usermode call made | ||
163 | * to /sbin/cgroup_release_agent with the name of the cgroup (path | ||
164 | * relative to the root of cgroup file system) as the argument. | ||
165 | * | ||
166 | * A cgroup can only be deleted if both its 'count' of using tasks | ||
167 | * is zero, and its list of 'children' cgroups is empty. Since all | ||
168 | * tasks in the system use _some_ cgroup, and since there is always at | ||
169 | * least one task in the system (init, pid == 1), therefore, top_cgroup | ||
170 | * always has either children cgroups and/or using tasks. So we don't | ||
171 | * need a special hack to ensure that top_cgroup cannot be deleted. | ||
172 | * | ||
173 | * The task_lock() exception | ||
174 | * | ||
175 | * The need for this exception arises from the action of | ||
176 | * attach_task(), which overwrites one task's cgroup pointer with | ||
177 | * another. It does so using cgroup_mutex, however there are | ||
178 | * several performance critical places that need to reference | ||
179 | * task->cgroup without the expense of grabbing a system global | ||
180 | * mutex. Therefore except as noted below, when dereferencing or, as | ||
181 | * in attach_task(), modifying a task's cgroup pointer we use | ||
182 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in | ||
183 | * the task_struct routinely used for such matters. | ||
184 | * | ||
185 | * P.S. One more locking exception. RCU is used to guard the | ||
186 | * update of a task's cgroup pointer by attach_task() | ||
187 | */ | ||
188 | |||
189 | static DEFINE_MUTEX(cgroup_mutex); | ||
190 | |||
191 | /** | ||
192 | * cgroup_lock - lock out any changes to cgroup structures | ||
193 | * | ||
194 | */ | ||
195 | |||
196 | void cgroup_lock(void) | ||
197 | { | ||
198 | mutex_lock(&cgroup_mutex); | ||
199 | } | ||
200 | |||
201 | /** | ||
202 | * cgroup_unlock - release lock on cgroup changes | ||
203 | * | ||
204 | * Undo the lock taken in a previous cgroup_lock() call. | ||
205 | */ | ||
206 | |||
207 | void cgroup_unlock(void) | ||
208 | { | ||
209 | mutex_unlock(&cgroup_mutex); | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * A couple of forward declarations required, due to cyclic reference loop: | ||
214 | * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> | ||
215 | * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations | ||
216 | * -> cgroup_mkdir. | ||
217 | */ | ||
218 | |||
219 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); | ||
220 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | ||
221 | static int cgroup_populate_dir(struct cgroup *cont); | ||
222 | static struct inode_operations cgroup_dir_inode_operations; | ||
223 | |||
224 | static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | ||
225 | { | ||
226 | struct inode *inode = new_inode(sb); | ||
227 | static struct backing_dev_info cgroup_backing_dev_info = { | ||
228 | .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, | ||
229 | }; | ||
230 | |||
231 | if (inode) { | ||
232 | inode->i_mode = mode; | ||
233 | inode->i_uid = current->fsuid; | ||
234 | inode->i_gid = current->fsgid; | ||
235 | inode->i_blocks = 0; | ||
236 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
237 | inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; | ||
238 | } | ||
239 | return inode; | ||
240 | } | ||
241 | |||
242 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | ||
243 | { | ||
244 | /* is dentry a directory ? if so, kfree() associated cgroup */ | ||
245 | if (S_ISDIR(inode->i_mode)) { | ||
246 | struct cgroup *cont = dentry->d_fsdata; | ||
247 | BUG_ON(!(cgroup_is_removed(cont))); | ||
248 | kfree(cont); | ||
249 | } | ||
250 | iput(inode); | ||
251 | } | ||
252 | |||
253 | static void remove_dir(struct dentry *d) | ||
254 | { | ||
255 | struct dentry *parent = dget(d->d_parent); | ||
256 | |||
257 | d_delete(d); | ||
258 | simple_rmdir(parent->d_inode, d); | ||
259 | dput(parent); | ||
260 | } | ||
261 | |||
262 | static void cgroup_clear_directory(struct dentry *dentry) | ||
263 | { | ||
264 | struct list_head *node; | ||
265 | |||
266 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | ||
267 | spin_lock(&dcache_lock); | ||
268 | node = dentry->d_subdirs.next; | ||
269 | while (node != &dentry->d_subdirs) { | ||
270 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); | ||
271 | list_del_init(node); | ||
272 | if (d->d_inode) { | ||
273 | /* This should never be called on a cgroup | ||
274 | * directory with child cgroups */ | ||
275 | BUG_ON(d->d_inode->i_mode & S_IFDIR); | ||
276 | d = dget_locked(d); | ||
277 | spin_unlock(&dcache_lock); | ||
278 | d_delete(d); | ||
279 | simple_unlink(dentry->d_inode, d); | ||
280 | dput(d); | ||
281 | spin_lock(&dcache_lock); | ||
282 | } | ||
283 | node = dentry->d_subdirs.next; | ||
284 | } | ||
285 | spin_unlock(&dcache_lock); | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * NOTE : the dentry must have been dget()'ed | ||
290 | */ | ||
291 | static void cgroup_d_remove_dir(struct dentry *dentry) | ||
292 | { | ||
293 | cgroup_clear_directory(dentry); | ||
294 | |||
295 | spin_lock(&dcache_lock); | ||
296 | list_del_init(&dentry->d_u.d_child); | ||
297 | spin_unlock(&dcache_lock); | ||
298 | remove_dir(dentry); | ||
299 | } | ||
300 | |||
301 | static int rebind_subsystems(struct cgroupfs_root *root, | ||
302 | unsigned long final_bits) | ||
303 | { | ||
304 | unsigned long added_bits, removed_bits; | ||
305 | struct cgroup *cont = &root->top_cgroup; | ||
306 | int i; | ||
307 | |||
308 | removed_bits = root->actual_subsys_bits & ~final_bits; | ||
309 | added_bits = final_bits & ~root->actual_subsys_bits; | ||
310 | /* Check that any added subsystems are currently free */ | ||
311 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
312 | unsigned long long bit = 1ull << i; | ||
313 | struct cgroup_subsys *ss = subsys[i]; | ||
314 | if (!(bit & added_bits)) | ||
315 | continue; | ||
316 | if (ss->root != &rootnode) { | ||
317 | /* Subsystem isn't free */ | ||
318 | return -EBUSY; | ||
319 | } | ||
320 | } | ||
321 | |||
322 | /* Currently we don't handle adding/removing subsystems when | ||
323 | * any child cgroups exist. This is theoretically supportable | ||
324 | * but involves complex error handling, so it's being left until | ||
325 | * later */ | ||
326 | if (!list_empty(&cont->children)) | ||
327 | return -EBUSY; | ||
328 | |||
329 | /* Process each subsystem */ | ||
330 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
331 | struct cgroup_subsys *ss = subsys[i]; | ||
332 | unsigned long bit = 1UL << i; | ||
333 | if (bit & added_bits) { | ||
334 | /* We're binding this subsystem to this hierarchy */ | ||
335 | BUG_ON(cont->subsys[i]); | ||
336 | BUG_ON(!dummytop->subsys[i]); | ||
337 | BUG_ON(dummytop->subsys[i]->cgroup != dummytop); | ||
338 | cont->subsys[i] = dummytop->subsys[i]; | ||
339 | cont->subsys[i]->cgroup = cont; | ||
340 | list_add(&ss->sibling, &root->subsys_list); | ||
341 | rcu_assign_pointer(ss->root, root); | ||
342 | if (ss->bind) | ||
343 | ss->bind(ss, cont); | ||
344 | |||
345 | } else if (bit & removed_bits) { | ||
346 | /* We're removing this subsystem */ | ||
347 | BUG_ON(cont->subsys[i] != dummytop->subsys[i]); | ||
348 | BUG_ON(cont->subsys[i]->cgroup != cont); | ||
349 | if (ss->bind) | ||
350 | ss->bind(ss, dummytop); | ||
351 | dummytop->subsys[i]->cgroup = dummytop; | ||
352 | cont->subsys[i] = NULL; | ||
353 | rcu_assign_pointer(subsys[i]->root, &rootnode); | ||
354 | list_del(&ss->sibling); | ||
355 | } else if (bit & final_bits) { | ||
356 | /* Subsystem state should already exist */ | ||
357 | BUG_ON(!cont->subsys[i]); | ||
358 | } else { | ||
359 | /* Subsystem state shouldn't exist */ | ||
360 | BUG_ON(cont->subsys[i]); | ||
361 | } | ||
362 | } | ||
363 | root->subsys_bits = root->actual_subsys_bits = final_bits; | ||
364 | synchronize_rcu(); | ||
365 | |||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) | ||
370 | { | ||
371 | struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; | ||
372 | struct cgroup_subsys *ss; | ||
373 | |||
374 | mutex_lock(&cgroup_mutex); | ||
375 | for_each_subsys(root, ss) | ||
376 | seq_printf(seq, ",%s", ss->name); | ||
377 | if (test_bit(ROOT_NOPREFIX, &root->flags)) | ||
378 | seq_puts(seq, ",noprefix"); | ||
379 | mutex_unlock(&cgroup_mutex); | ||
380 | return 0; | ||
381 | } | ||
382 | |||
/* Parsed mount options, as filled in by parse_cgroupfs_options(). */
struct cgroup_sb_opts {
	/* Bitmask of requested subsystems, indexed by subsystem number */
	unsigned long subsys_bits;
	/* Hierarchy flag bits such as ROOT_NOPREFIX */
	unsigned long flags;
};
387 | |||
388 | /* Convert a hierarchy specifier into a bitmask of subsystems and | ||
389 | * flags. */ | ||
390 | static int parse_cgroupfs_options(char *data, | ||
391 | struct cgroup_sb_opts *opts) | ||
392 | { | ||
393 | char *token, *o = data ?: "all"; | ||
394 | |||
395 | opts->subsys_bits = 0; | ||
396 | opts->flags = 0; | ||
397 | |||
398 | while ((token = strsep(&o, ",")) != NULL) { | ||
399 | if (!*token) | ||
400 | return -EINVAL; | ||
401 | if (!strcmp(token, "all")) { | ||
402 | opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1; | ||
403 | } else if (!strcmp(token, "noprefix")) { | ||
404 | set_bit(ROOT_NOPREFIX, &opts->flags); | ||
405 | } else { | ||
406 | struct cgroup_subsys *ss; | ||
407 | int i; | ||
408 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
409 | ss = subsys[i]; | ||
410 | if (!strcmp(token, ss->name)) { | ||
411 | set_bit(i, &opts->subsys_bits); | ||
412 | break; | ||
413 | } | ||
414 | } | ||
415 | if (i == CGROUP_SUBSYS_COUNT) | ||
416 | return -ENOENT; | ||
417 | } | ||
418 | } | ||
419 | |||
420 | /* We can't have an empty hierarchy */ | ||
421 | if (!opts->subsys_bits) | ||
422 | return -EINVAL; | ||
423 | |||
424 | return 0; | ||
425 | } | ||
426 | |||
427 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) | ||
428 | { | ||
429 | int ret = 0; | ||
430 | struct cgroupfs_root *root = sb->s_fs_info; | ||
431 | struct cgroup *cont = &root->top_cgroup; | ||
432 | struct cgroup_sb_opts opts; | ||
433 | |||
434 | mutex_lock(&cont->dentry->d_inode->i_mutex); | ||
435 | mutex_lock(&cgroup_mutex); | ||
436 | |||
437 | /* See what subsystems are wanted */ | ||
438 | ret = parse_cgroupfs_options(data, &opts); | ||
439 | if (ret) | ||
440 | goto out_unlock; | ||
441 | |||
442 | /* Don't allow flags to change at remount */ | ||
443 | if (opts.flags != root->flags) { | ||
444 | ret = -EINVAL; | ||
445 | goto out_unlock; | ||
446 | } | ||
447 | |||
448 | ret = rebind_subsystems(root, opts.subsys_bits); | ||
449 | |||
450 | /* (re)populate subsystem files */ | ||
451 | if (!ret) | ||
452 | cgroup_populate_dir(cont); | ||
453 | |||
454 | out_unlock: | ||
455 | mutex_unlock(&cgroup_mutex); | ||
456 | mutex_unlock(&cont->dentry->d_inode->i_mutex); | ||
457 | return ret; | ||
458 | } | ||
459 | |||
460 | static struct super_operations cgroup_ops = { | ||
461 | .statfs = simple_statfs, | ||
462 | .drop_inode = generic_delete_inode, | ||
463 | .show_options = cgroup_show_options, | ||
464 | .remount_fs = cgroup_remount, | ||
465 | }; | ||
466 | |||
467 | static void init_cgroup_root(struct cgroupfs_root *root) | ||
468 | { | ||
469 | struct cgroup *cont = &root->top_cgroup; | ||
470 | INIT_LIST_HEAD(&root->subsys_list); | ||
471 | INIT_LIST_HEAD(&root->root_list); | ||
472 | root->number_of_cgroups = 1; | ||
473 | cont->root = root; | ||
474 | cont->top_cgroup = cont; | ||
475 | INIT_LIST_HEAD(&cont->sibling); | ||
476 | INIT_LIST_HEAD(&cont->children); | ||
477 | } | ||
478 | |||
479 | static int cgroup_test_super(struct super_block *sb, void *data) | ||
480 | { | ||
481 | struct cgroupfs_root *new = data; | ||
482 | struct cgroupfs_root *root = sb->s_fs_info; | ||
483 | |||
484 | /* First check subsystems */ | ||
485 | if (new->subsys_bits != root->subsys_bits) | ||
486 | return 0; | ||
487 | |||
488 | /* Next check flags */ | ||
489 | if (new->flags != root->flags) | ||
490 | return 0; | ||
491 | |||
492 | return 1; | ||
493 | } | ||
494 | |||
495 | static int cgroup_set_super(struct super_block *sb, void *data) | ||
496 | { | ||
497 | int ret; | ||
498 | struct cgroupfs_root *root = data; | ||
499 | |||
500 | ret = set_anon_super(sb, NULL); | ||
501 | if (ret) | ||
502 | return ret; | ||
503 | |||
504 | sb->s_fs_info = root; | ||
505 | root->sb = sb; | ||
506 | |||
507 | sb->s_blocksize = PAGE_CACHE_SIZE; | ||
508 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | ||
509 | sb->s_magic = CGROUP_SUPER_MAGIC; | ||
510 | sb->s_op = &cgroup_ops; | ||
511 | |||
512 | return 0; | ||
513 | } | ||
514 | |||
515 | static int cgroup_get_rootdir(struct super_block *sb) | ||
516 | { | ||
517 | struct inode *inode = | ||
518 | cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); | ||
519 | struct dentry *dentry; | ||
520 | |||
521 | if (!inode) | ||
522 | return -ENOMEM; | ||
523 | |||
524 | inode->i_op = &simple_dir_inode_operations; | ||
525 | inode->i_fop = &simple_dir_operations; | ||
526 | inode->i_op = &cgroup_dir_inode_operations; | ||
527 | /* directories start off with i_nlink == 2 (for "." entry) */ | ||
528 | inc_nlink(inode); | ||
529 | dentry = d_alloc_root(inode); | ||
530 | if (!dentry) { | ||
531 | iput(inode); | ||
532 | return -ENOMEM; | ||
533 | } | ||
534 | sb->s_root = dentry; | ||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | static int cgroup_get_sb(struct file_system_type *fs_type, | ||
539 | int flags, const char *unused_dev_name, | ||
540 | void *data, struct vfsmount *mnt) | ||
541 | { | ||
542 | struct cgroup_sb_opts opts; | ||
543 | int ret = 0; | ||
544 | struct super_block *sb; | ||
545 | struct cgroupfs_root *root; | ||
546 | |||
547 | /* First find the desired set of subsystems */ | ||
548 | ret = parse_cgroupfs_options(data, &opts); | ||
549 | if (ret) | ||
550 | return ret; | ||
551 | |||
552 | root = kzalloc(sizeof(*root), GFP_KERNEL); | ||
553 | if (!root) | ||
554 | return -ENOMEM; | ||
555 | |||
556 | init_cgroup_root(root); | ||
557 | root->subsys_bits = opts.subsys_bits; | ||
558 | root->flags = opts.flags; | ||
559 | |||
560 | sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); | ||
561 | |||
562 | if (IS_ERR(sb)) { | ||
563 | kfree(root); | ||
564 | return PTR_ERR(sb); | ||
565 | } | ||
566 | |||
567 | if (sb->s_fs_info != root) { | ||
568 | /* Reusing an existing superblock */ | ||
569 | BUG_ON(sb->s_root == NULL); | ||
570 | kfree(root); | ||
571 | root = NULL; | ||
572 | } else { | ||
573 | /* New superblock */ | ||
574 | struct cgroup *cont = &root->top_cgroup; | ||
575 | |||
576 | BUG_ON(sb->s_root != NULL); | ||
577 | |||
578 | ret = cgroup_get_rootdir(sb); | ||
579 | if (ret) | ||
580 | goto drop_new_super; | ||
581 | |||
582 | mutex_lock(&cgroup_mutex); | ||
583 | |||
584 | ret = rebind_subsystems(root, root->subsys_bits); | ||
585 | if (ret == -EBUSY) { | ||
586 | mutex_unlock(&cgroup_mutex); | ||
587 | goto drop_new_super; | ||
588 | } | ||
589 | |||
590 | /* EBUSY should be the only error here */ | ||
591 | BUG_ON(ret); | ||
592 | |||
593 | list_add(&root->root_list, &roots); | ||
594 | |||
595 | sb->s_root->d_fsdata = &root->top_cgroup; | ||
596 | root->top_cgroup.dentry = sb->s_root; | ||
597 | |||
598 | BUG_ON(!list_empty(&cont->sibling)); | ||
599 | BUG_ON(!list_empty(&cont->children)); | ||
600 | BUG_ON(root->number_of_cgroups != 1); | ||
601 | |||
602 | /* | ||
603 | * I believe that it's safe to nest i_mutex inside | ||
604 | * cgroup_mutex in this case, since no-one else can | ||
605 | * be accessing this directory yet. But we still need | ||
606 | * to teach lockdep that this is the case - currently | ||
607 | * a cgroupfs remount triggers a lockdep warning | ||
608 | */ | ||
609 | mutex_lock(&cont->dentry->d_inode->i_mutex); | ||
610 | cgroup_populate_dir(cont); | ||
611 | mutex_unlock(&cont->dentry->d_inode->i_mutex); | ||
612 | mutex_unlock(&cgroup_mutex); | ||
613 | } | ||
614 | |||
615 | return simple_set_mnt(mnt, sb); | ||
616 | |||
617 | drop_new_super: | ||
618 | up_write(&sb->s_umount); | ||
619 | deactivate_super(sb); | ||
620 | return ret; | ||
621 | } | ||
622 | |||
623 | static void cgroup_kill_sb(struct super_block *sb) { | ||
624 | struct cgroupfs_root *root = sb->s_fs_info; | ||
625 | struct cgroup *cont = &root->top_cgroup; | ||
626 | int ret; | ||
627 | |||
628 | BUG_ON(!root); | ||
629 | |||
630 | BUG_ON(root->number_of_cgroups != 1); | ||
631 | BUG_ON(!list_empty(&cont->children)); | ||
632 | BUG_ON(!list_empty(&cont->sibling)); | ||
633 | |||
634 | mutex_lock(&cgroup_mutex); | ||
635 | |||
636 | /* Rebind all subsystems back to the default hierarchy */ | ||
637 | ret = rebind_subsystems(root, 0); | ||
638 | /* Shouldn't be able to fail ... */ | ||
639 | BUG_ON(ret); | ||
640 | |||
641 | if (!list_empty(&root->root_list)) | ||
642 | list_del(&root->root_list); | ||
643 | mutex_unlock(&cgroup_mutex); | ||
644 | |||
645 | kfree(root); | ||
646 | kill_litter_super(sb); | ||
647 | } | ||
648 | |||
649 | static struct file_system_type cgroup_fs_type = { | ||
650 | .name = "cgroup", | ||
651 | .get_sb = cgroup_get_sb, | ||
652 | .kill_sb = cgroup_kill_sb, | ||
653 | }; | ||
654 | |||
655 | static inline struct cgroup *__d_cont(struct dentry *dentry) | ||
656 | { | ||
657 | return dentry->d_fsdata; | ||
658 | } | ||
659 | |||
660 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
661 | { | ||
662 | return dentry->d_fsdata; | ||
663 | } | ||
664 | |||
665 | /* | ||
666 | * Called with cgroup_mutex held. Writes path of cgroup into buf. | ||
667 | * Returns 0 on success, -errno on error. | ||
668 | */ | ||
669 | int cgroup_path(const struct cgroup *cont, char *buf, int buflen) | ||
670 | { | ||
671 | char *start; | ||
672 | |||
673 | if (cont == dummytop) { | ||
674 | /* | ||
675 | * Inactive subsystems have no dentry for their root | ||
676 | * cgroup | ||
677 | */ | ||
678 | strcpy(buf, "/"); | ||
679 | return 0; | ||
680 | } | ||
681 | |||
682 | start = buf + buflen; | ||
683 | |||
684 | *--start = '\0'; | ||
685 | for (;;) { | ||
686 | int len = cont->dentry->d_name.len; | ||
687 | if ((start -= len) < buf) | ||
688 | return -ENAMETOOLONG; | ||
689 | memcpy(start, cont->dentry->d_name.name, len); | ||
690 | cont = cont->parent; | ||
691 | if (!cont) | ||
692 | break; | ||
693 | if (!cont->parent) | ||
694 | continue; | ||
695 | if (--start < buf) | ||
696 | return -ENAMETOOLONG; | ||
697 | *start = '/'; | ||
698 | } | ||
699 | memmove(buf, start, buf + buflen - start); | ||
700 | return 0; | ||
701 | } | ||
702 | |||
/* The various types of files and directories in a cgroup file system */
enum cgroup_filetype {
	FILE_ROOT	= 0,
	FILE_DIR	= 1,
	FILE_TASKLIST	= 2,
};
710 | |||
711 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | ||
712 | size_t nbytes, loff_t *ppos) | ||
713 | { | ||
714 | struct cftype *cft = __d_cft(file->f_dentry); | ||
715 | struct cgroup *cont = __d_cont(file->f_dentry->d_parent); | ||
716 | |||
717 | if (!cft) | ||
718 | return -ENODEV; | ||
719 | if (!cft->write) | ||
720 | return -EINVAL; | ||
721 | |||
722 | return cft->write(cont, cft, file, buf, nbytes, ppos); | ||
723 | } | ||
724 | |||
725 | static ssize_t cgroup_read_uint(struct cgroup *cont, struct cftype *cft, | ||
726 | struct file *file, | ||
727 | char __user *buf, size_t nbytes, | ||
728 | loff_t *ppos) | ||
729 | { | ||
730 | char tmp[64]; | ||
731 | u64 val = cft->read_uint(cont, cft); | ||
732 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); | ||
733 | |||
734 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | ||
735 | } | ||
736 | |||
737 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, | ||
738 | size_t nbytes, loff_t *ppos) | ||
739 | { | ||
740 | struct cftype *cft = __d_cft(file->f_dentry); | ||
741 | struct cgroup *cont = __d_cont(file->f_dentry->d_parent); | ||
742 | |||
743 | if (!cft) | ||
744 | return -ENODEV; | ||
745 | |||
746 | if (cft->read) | ||
747 | return cft->read(cont, cft, file, buf, nbytes, ppos); | ||
748 | if (cft->read_uint) | ||
749 | return cgroup_read_uint(cont, cft, file, buf, nbytes, ppos); | ||
750 | return -EINVAL; | ||
751 | } | ||
752 | |||
753 | static int cgroup_file_open(struct inode *inode, struct file *file) | ||
754 | { | ||
755 | int err; | ||
756 | struct cftype *cft; | ||
757 | |||
758 | err = generic_file_open(inode, file); | ||
759 | if (err) | ||
760 | return err; | ||
761 | |||
762 | cft = __d_cft(file->f_dentry); | ||
763 | if (!cft) | ||
764 | return -ENODEV; | ||
765 | if (cft->open) | ||
766 | err = cft->open(inode, file); | ||
767 | else | ||
768 | err = 0; | ||
769 | |||
770 | return err; | ||
771 | } | ||
772 | |||
773 | static int cgroup_file_release(struct inode *inode, struct file *file) | ||
774 | { | ||
775 | struct cftype *cft = __d_cft(file->f_dentry); | ||
776 | if (cft->release) | ||
777 | return cft->release(inode, file); | ||
778 | return 0; | ||
779 | } | ||
780 | |||
781 | /* | ||
782 | * cgroup_rename - Only allow simple rename of directories in place. | ||
783 | */ | ||
784 | static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
785 | struct inode *new_dir, struct dentry *new_dentry) | ||
786 | { | ||
787 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) | ||
788 | return -ENOTDIR; | ||
789 | if (new_dentry->d_inode) | ||
790 | return -EEXIST; | ||
791 | if (old_dir != new_dir) | ||
792 | return -EIO; | ||
793 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | ||
794 | } | ||
795 | |||
796 | static struct file_operations cgroup_file_operations = { | ||
797 | .read = cgroup_file_read, | ||
798 | .write = cgroup_file_write, | ||
799 | .llseek = generic_file_llseek, | ||
800 | .open = cgroup_file_open, | ||
801 | .release = cgroup_file_release, | ||
802 | }; | ||
803 | |||
804 | static struct inode_operations cgroup_dir_inode_operations = { | ||
805 | .lookup = simple_lookup, | ||
806 | .mkdir = cgroup_mkdir, | ||
807 | .rmdir = cgroup_rmdir, | ||
808 | .rename = cgroup_rename, | ||
809 | }; | ||
810 | |||
811 | static int cgroup_create_file(struct dentry *dentry, int mode, | ||
812 | struct super_block *sb) | ||
813 | { | ||
814 | static struct dentry_operations cgroup_dops = { | ||
815 | .d_iput = cgroup_diput, | ||
816 | }; | ||
817 | |||
818 | struct inode *inode; | ||
819 | |||
820 | if (!dentry) | ||
821 | return -ENOENT; | ||
822 | if (dentry->d_inode) | ||
823 | return -EEXIST; | ||
824 | |||
825 | inode = cgroup_new_inode(mode, sb); | ||
826 | if (!inode) | ||
827 | return -ENOMEM; | ||
828 | |||
829 | if (S_ISDIR(mode)) { | ||
830 | inode->i_op = &cgroup_dir_inode_operations; | ||
831 | inode->i_fop = &simple_dir_operations; | ||
832 | |||
833 | /* start off with i_nlink == 2 (for "." entry) */ | ||
834 | inc_nlink(inode); | ||
835 | |||
836 | /* start with the directory inode held, so that we can | ||
837 | * populate it without racing with another mkdir */ | ||
838 | mutex_lock(&inode->i_mutex); | ||
839 | } else if (S_ISREG(mode)) { | ||
840 | inode->i_size = 0; | ||
841 | inode->i_fop = &cgroup_file_operations; | ||
842 | } | ||
843 | dentry->d_op = &cgroup_dops; | ||
844 | d_instantiate(dentry, inode); | ||
845 | dget(dentry); /* Extra count - pin the dentry in core */ | ||
846 | return 0; | ||
847 | } | ||
848 | |||
849 | /* | ||
850 | * cgroup_create_dir - create a directory for an object. | ||
851 | * cont: the cgroup we create the directory for. | ||
852 | * It must have a valid ->parent field | ||
853 | * And we are going to fill its ->dentry field. | ||
854 | * dentry: dentry of the new container | ||
855 | * mode: mode to set on new directory. | ||
856 | */ | ||
857 | static int cgroup_create_dir(struct cgroup *cont, struct dentry *dentry, | ||
858 | int mode) | ||
859 | { | ||
860 | struct dentry *parent; | ||
861 | int error = 0; | ||
862 | |||
863 | parent = cont->parent->dentry; | ||
864 | error = cgroup_create_file(dentry, S_IFDIR | mode, cont->root->sb); | ||
865 | if (!error) { | ||
866 | dentry->d_fsdata = cont; | ||
867 | inc_nlink(parent->d_inode); | ||
868 | cont->dentry = dentry; | ||
869 | dget(dentry); | ||
870 | } | ||
871 | dput(dentry); | ||
872 | |||
873 | return error; | ||
874 | } | ||
875 | |||
876 | int cgroup_add_file(struct cgroup *cont, | ||
877 | struct cgroup_subsys *subsys, | ||
878 | const struct cftype *cft) | ||
879 | { | ||
880 | struct dentry *dir = cont->dentry; | ||
881 | struct dentry *dentry; | ||
882 | int error; | ||
883 | |||
884 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | ||
885 | if (subsys && !test_bit(ROOT_NOPREFIX, &cont->root->flags)) { | ||
886 | strcpy(name, subsys->name); | ||
887 | strcat(name, "."); | ||
888 | } | ||
889 | strcat(name, cft->name); | ||
890 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); | ||
891 | dentry = lookup_one_len(name, dir, strlen(name)); | ||
892 | if (!IS_ERR(dentry)) { | ||
893 | error = cgroup_create_file(dentry, 0644 | S_IFREG, | ||
894 | cont->root->sb); | ||
895 | if (!error) | ||
896 | dentry->d_fsdata = (void *)cft; | ||
897 | dput(dentry); | ||
898 | } else | ||
899 | error = PTR_ERR(dentry); | ||
900 | return error; | ||
901 | } | ||
902 | |||
903 | int cgroup_add_files(struct cgroup *cont, | ||
904 | struct cgroup_subsys *subsys, | ||
905 | const struct cftype cft[], | ||
906 | int count) | ||
907 | { | ||
908 | int i, err; | ||
909 | for (i = 0; i < count; i++) { | ||
910 | err = cgroup_add_file(cont, subsys, &cft[i]); | ||
911 | if (err) | ||
912 | return err; | ||
913 | } | ||
914 | return 0; | ||
915 | } | ||
916 | |||
917 | static int cgroup_populate_dir(struct cgroup *cont) | ||
918 | { | ||
919 | int err; | ||
920 | struct cgroup_subsys *ss; | ||
921 | |||
922 | /* First clear out any existing files */ | ||
923 | cgroup_clear_directory(cont->dentry); | ||
924 | |||
925 | for_each_subsys(cont->root, ss) { | ||
926 | if (ss->populate && (err = ss->populate(ss, cont)) < 0) | ||
927 | return err; | ||
928 | } | ||
929 | |||
930 | return 0; | ||
931 | } | ||
932 | |||
933 | static void init_cgroup_css(struct cgroup_subsys_state *css, | ||
934 | struct cgroup_subsys *ss, | ||
935 | struct cgroup *cont) | ||
936 | { | ||
937 | css->cgroup = cont; | ||
938 | atomic_set(&css->refcnt, 0); | ||
939 | css->flags = 0; | ||
940 | if (cont == dummytop) | ||
941 | set_bit(CSS_ROOT, &css->flags); | ||
942 | BUG_ON(cont->subsys[ss->subsys_id]); | ||
943 | cont->subsys[ss->subsys_id] = css; | ||
944 | } | ||
945 | |||
946 | /* | ||
947 | * cgroup_create - create a cgroup | ||
948 | * parent: cgroup that will be parent of the new cgroup. | ||
949 | * name: name of the new cgroup. Will be strcpy'ed. | ||
950 | * mode: mode to set on new inode | ||
951 | * | ||
952 | * Must be called with the mutex on the parent inode held | ||
953 | */ | ||
954 | |||
955 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | ||
956 | int mode) | ||
957 | { | ||
958 | struct cgroup *cont; | ||
959 | struct cgroupfs_root *root = parent->root; | ||
960 | int err = 0; | ||
961 | struct cgroup_subsys *ss; | ||
962 | struct super_block *sb = root->sb; | ||
963 | |||
964 | cont = kzalloc(sizeof(*cont), GFP_KERNEL); | ||
965 | if (!cont) | ||
966 | return -ENOMEM; | ||
967 | |||
968 | /* Grab a reference on the superblock so the hierarchy doesn't | ||
969 | * get deleted on unmount if there are child cgroups. This | ||
970 | * can be done outside cgroup_mutex, since the sb can't | ||
971 | * disappear while someone has an open control file on the | ||
972 | * fs */ | ||
973 | atomic_inc(&sb->s_active); | ||
974 | |||
975 | mutex_lock(&cgroup_mutex); | ||
976 | |||
977 | cont->flags = 0; | ||
978 | INIT_LIST_HEAD(&cont->sibling); | ||
979 | INIT_LIST_HEAD(&cont->children); | ||
980 | |||
981 | cont->parent = parent; | ||
982 | cont->root = parent->root; | ||
983 | cont->top_cgroup = parent->top_cgroup; | ||
984 | |||
985 | for_each_subsys(root, ss) { | ||
986 | struct cgroup_subsys_state *css = ss->create(ss, cont); | ||
987 | if (IS_ERR(css)) { | ||
988 | err = PTR_ERR(css); | ||
989 | goto err_destroy; | ||
990 | } | ||
991 | init_cgroup_css(css, ss, cont); | ||
992 | } | ||
993 | |||
994 | list_add(&cont->sibling, &cont->parent->children); | ||
995 | root->number_of_cgroups++; | ||
996 | |||
997 | err = cgroup_create_dir(cont, dentry, mode); | ||
998 | if (err < 0) | ||
999 | goto err_remove; | ||
1000 | |||
1001 | /* The cgroup directory was pre-locked for us */ | ||
1002 | BUG_ON(!mutex_is_locked(&cont->dentry->d_inode->i_mutex)); | ||
1003 | |||
1004 | err = cgroup_populate_dir(cont); | ||
1005 | /* If err < 0, we have a half-filled directory - oh well ;) */ | ||
1006 | |||
1007 | mutex_unlock(&cgroup_mutex); | ||
1008 | mutex_unlock(&cont->dentry->d_inode->i_mutex); | ||
1009 | |||
1010 | return 0; | ||
1011 | |||
1012 | err_remove: | ||
1013 | |||
1014 | list_del(&cont->sibling); | ||
1015 | root->number_of_cgroups--; | ||
1016 | |||
1017 | err_destroy: | ||
1018 | |||
1019 | for_each_subsys(root, ss) { | ||
1020 | if (cont->subsys[ss->subsys_id]) | ||
1021 | ss->destroy(ss, cont); | ||
1022 | } | ||
1023 | |||
1024 | mutex_unlock(&cgroup_mutex); | ||
1025 | |||
1026 | /* Release the reference count that we took on the superblock */ | ||
1027 | deactivate_super(sb); | ||
1028 | |||
1029 | kfree(cont); | ||
1030 | return err; | ||
1031 | } | ||
1032 | |||
1033 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) | ||
1034 | { | ||
1035 | struct cgroup *c_parent = dentry->d_parent->d_fsdata; | ||
1036 | |||
1037 | /* the vfs holds inode->i_mutex already */ | ||
1038 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | ||
1039 | } | ||
1040 | |||
1041 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | ||
1042 | { | ||
1043 | struct cgroup *cont = dentry->d_fsdata; | ||
1044 | struct dentry *d; | ||
1045 | struct cgroup *parent; | ||
1046 | struct cgroup_subsys *ss; | ||
1047 | struct super_block *sb; | ||
1048 | struct cgroupfs_root *root; | ||
1049 | int css_busy = 0; | ||
1050 | |||
1051 | /* the vfs holds both inode->i_mutex already */ | ||
1052 | |||
1053 | mutex_lock(&cgroup_mutex); | ||
1054 | if (atomic_read(&cont->count) != 0) { | ||
1055 | mutex_unlock(&cgroup_mutex); | ||
1056 | return -EBUSY; | ||
1057 | } | ||
1058 | if (!list_empty(&cont->children)) { | ||
1059 | mutex_unlock(&cgroup_mutex); | ||
1060 | return -EBUSY; | ||
1061 | } | ||
1062 | |||
1063 | parent = cont->parent; | ||
1064 | root = cont->root; | ||
1065 | sb = root->sb; | ||
1066 | |||
1067 | /* Check the reference count on each subsystem. Since we | ||
1068 | * already established that there are no tasks in the | ||
1069 | * cgroup, if the css refcount is also 0, then there should | ||
1070 | * be no outstanding references, so the subsystem is safe to | ||
1071 | * destroy */ | ||
1072 | for_each_subsys(root, ss) { | ||
1073 | struct cgroup_subsys_state *css; | ||
1074 | css = cont->subsys[ss->subsys_id]; | ||
1075 | if (atomic_read(&css->refcnt)) { | ||
1076 | css_busy = 1; | ||
1077 | break; | ||
1078 | } | ||
1079 | } | ||
1080 | if (css_busy) { | ||
1081 | mutex_unlock(&cgroup_mutex); | ||
1082 | return -EBUSY; | ||
1083 | } | ||
1084 | |||
1085 | for_each_subsys(root, ss) { | ||
1086 | if (cont->subsys[ss->subsys_id]) | ||
1087 | ss->destroy(ss, cont); | ||
1088 | } | ||
1089 | |||
1090 | set_bit(CONT_REMOVED, &cont->flags); | ||
1091 | /* delete my sibling from parent->children */ | ||
1092 | list_del(&cont->sibling); | ||
1093 | spin_lock(&cont->dentry->d_lock); | ||
1094 | d = dget(cont->dentry); | ||
1095 | cont->dentry = NULL; | ||
1096 | spin_unlock(&d->d_lock); | ||
1097 | |||
1098 | cgroup_d_remove_dir(d); | ||
1099 | dput(d); | ||
1100 | root->number_of_cgroups--; | ||
1101 | |||
1102 | mutex_unlock(&cgroup_mutex); | ||
1103 | /* Drop the active superblock reference that we took when we | ||
1104 | * created the cgroup */ | ||
1105 | deactivate_super(sb); | ||
1106 | return 0; | ||
1107 | } | ||
1108 | |||
1109 | static void cgroup_init_subsys(struct cgroup_subsys *ss) | ||
1110 | { | ||
1111 | struct task_struct *g, *p; | ||
1112 | struct cgroup_subsys_state *css; | ||
1113 | printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name); | ||
1114 | |||
1115 | /* Create the top cgroup state for this subsystem */ | ||
1116 | ss->root = &rootnode; | ||
1117 | css = ss->create(ss, dummytop); | ||
1118 | /* We don't handle early failures gracefully */ | ||
1119 | BUG_ON(IS_ERR(css)); | ||
1120 | init_cgroup_css(css, ss, dummytop); | ||
1121 | |||
1122 | /* Update all tasks to contain a subsys pointer to this state | ||
1123 | * - since the subsystem is newly registered, all tasks are in | ||
1124 | * the subsystem's top cgroup. */ | ||
1125 | |||
1126 | /* If this subsystem requested that it be notified with fork | ||
1127 | * events, we should send it one now for every process in the | ||
1128 | * system */ | ||
1129 | |||
1130 | read_lock(&tasklist_lock); | ||
1131 | init_task.cgroups.subsys[ss->subsys_id] = css; | ||
1132 | if (ss->fork) | ||
1133 | ss->fork(ss, &init_task); | ||
1134 | |||
1135 | do_each_thread(g, p) { | ||
1136 | printk(KERN_INFO "Setting task %p css to %p (%d)\n", css, p, p->pid); | ||
1137 | p->cgroups.subsys[ss->subsys_id] = css; | ||
1138 | if (ss->fork) | ||
1139 | ss->fork(ss, p); | ||
1140 | } while_each_thread(g, p); | ||
1141 | read_unlock(&tasklist_lock); | ||
1142 | |||
1143 | need_forkexit_callback |= ss->fork || ss->exit; | ||
1144 | |||
1145 | ss->active = 1; | ||
1146 | } | ||
1147 | |||
1148 | /** | ||
1149 | * cgroup_init_early - initialize cgroups at system boot, and | ||
1150 | * initialize any subsystems that request early init. | ||
1151 | */ | ||
1152 | int __init cgroup_init_early(void) | ||
1153 | { | ||
1154 | int i; | ||
1155 | init_cgroup_root(&rootnode); | ||
1156 | list_add(&rootnode.root_list, &roots); | ||
1157 | |||
1158 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
1159 | struct cgroup_subsys *ss = subsys[i]; | ||
1160 | |||
1161 | BUG_ON(!ss->name); | ||
1162 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); | ||
1163 | BUG_ON(!ss->create); | ||
1164 | BUG_ON(!ss->destroy); | ||
1165 | if (ss->subsys_id != i) { | ||
1166 | printk(KERN_ERR "Subsys %s id == %d\n", | ||
1167 | ss->name, ss->subsys_id); | ||
1168 | BUG(); | ||
1169 | } | ||
1170 | |||
1171 | if (ss->early_init) | ||
1172 | cgroup_init_subsys(ss); | ||
1173 | } | ||
1174 | return 0; | ||
1175 | } | ||
1176 | |||
1177 | /** | ||
1178 | * cgroup_init - register cgroup filesystem and /proc file, and | ||
1179 | * initialize any subsystems that didn't request early init. | ||
1180 | */ | ||
1181 | int __init cgroup_init(void) | ||
1182 | { | ||
1183 | int err; | ||
1184 | int i; | ||
1185 | |||
1186 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
1187 | struct cgroup_subsys *ss = subsys[i]; | ||
1188 | if (!ss->early_init) | ||
1189 | cgroup_init_subsys(ss); | ||
1190 | } | ||
1191 | |||
1192 | err = register_filesystem(&cgroup_fs_type); | ||
1193 | if (err < 0) | ||
1194 | goto out; | ||
1195 | |||
1196 | out: | ||
1197 | return err; | ||
1198 | } | ||