diff options
Diffstat (limited to 'Documentation/cgroups')
-rw-r--r-- | Documentation/cgroups/cgroup_event_listener.c | 110 | ||||
-rw-r--r-- | Documentation/cgroups/cgroups.txt | 39 | ||||
-rw-r--r-- | Documentation/cgroups/memcg_test.txt | 47 | ||||
-rw-r--r-- | Documentation/cgroups/memory.txt | 80 |
4 files changed, 268 insertions, 8 deletions
diff --git a/Documentation/cgroups/cgroup_event_listener.c b/Documentation/cgroups/cgroup_event_listener.c new file mode 100644 index 00000000000..8c2bfc4a635 --- /dev/null +++ b/Documentation/cgroups/cgroup_event_listener.c | |||
@@ -0,0 +1,110 @@ | |||
1 | /* | ||
2 | * cgroup_event_listener.c - Simple listener of cgroup events | ||
3 | * | ||
4 | * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name> | ||
5 | */ | ||
6 | |||
7 | #include <assert.h> | ||
8 | #include <errno.h> | ||
9 | #include <fcntl.h> | ||
10 | #include <libgen.h> | ||
11 | #include <limits.h> | ||
12 | #include <stdio.h> | ||
13 | #include <string.h> | ||
14 | #include <unistd.h> | ||
15 | |||
16 | #include <sys/eventfd.h> | ||
17 | |||
18 | #define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n" | ||
19 | |||
20 | int main(int argc, char **argv) | ||
21 | { | ||
22 | int efd = -1; | ||
23 | int cfd = -1; | ||
24 | int event_control = -1; | ||
25 | char event_control_path[PATH_MAX]; | ||
26 | char line[LINE_MAX]; | ||
27 | int ret; | ||
28 | |||
29 | if (argc != 3) { | ||
30 | fputs(USAGE_STR, stderr); | ||
31 | return 1; | ||
32 | } | ||
33 | |||
34 | cfd = open(argv[1], O_RDONLY); | ||
35 | if (cfd == -1) { | ||
36 | fprintf(stderr, "Cannot open %s: %s\n", argv[1], | ||
37 | strerror(errno)); | ||
38 | goto out; | ||
39 | } | ||
40 | |||
41 | ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control", | ||
42 | dirname(argv[1])); | ||
43 | if (ret >= PATH_MAX) { | ||
44 | fputs("Path to cgroup.event_control is too long\n", stderr); | ||
45 | goto out; | ||
46 | } | ||
47 | |||
48 | event_control = open(event_control_path, O_WRONLY); | ||
49 | if (event_control == -1) { | ||
50 | fprintf(stderr, "Cannot open %s: %s\n", event_control_path, | ||
51 | strerror(errno)); | ||
52 | goto out; | ||
53 | } | ||
54 | |||
55 | efd = eventfd(0, 0); | ||
56 | if (efd == -1) { | ||
57 | perror("eventfd() failed"); | ||
58 | goto out; | ||
59 | } | ||
60 | |||
61 | ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]); | ||
62 | if (ret >= LINE_MAX) { | ||
63 | fputs("Arguments string is too long\n", stderr); | ||
64 | goto out; | ||
65 | } | ||
66 | |||
67 | ret = write(event_control, line, strlen(line) + 1); | ||
68 | if (ret == -1) { | ||
69 | perror("Cannot write to cgroup.event_control"); | ||
70 | goto out; | ||
71 | } | ||
72 | |||
73 | while (1) { | ||
74 | uint64_t result; | ||
75 | |||
76 | ret = read(efd, &result, sizeof(result)); | ||
77 | if (ret == -1) { | ||
78 | if (errno == EINTR) | ||
79 | continue; | ||
80 | perror("Cannot read from eventfd"); | ||
81 | break; | ||
82 | } | ||
83 | assert(ret == sizeof(result)); | ||
84 | |||
85 | ret = access(event_control_path, W_OK); | ||
86 | if ((ret == -1) && (errno == ENOENT)) { | ||
87 | puts("The cgroup seems to have removed."); | ||
88 | ret = 0; | ||
89 | break; | ||
90 | } | ||
91 | |||
92 | if (ret == -1) { | ||
93 | perror("cgroup.event_control " | ||
94 | "is not accessable any more"); | ||
95 | break; | ||
96 | } | ||
97 | |||
98 | printf("%s %s: crossed\n", argv[1], argv[2]); | ||
99 | } | ||
100 | |||
101 | out: | ||
102 | if (efd >= 0) | ||
103 | close(efd); | ||
104 | if (event_control >= 0) | ||
105 | close(event_control); | ||
106 | if (cfd >= 0) | ||
107 | close(cfd); | ||
108 | |||
109 | return (ret != 0); | ||
110 | } | ||
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 0b33bfe7dde..fd588ff0e29 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt | |||
@@ -22,6 +22,8 @@ CONTENTS: | |||
22 | 2. Usage Examples and Syntax | 22 | 2. Usage Examples and Syntax |
23 | 2.1 Basic Usage | 23 | 2.1 Basic Usage |
24 | 2.2 Attaching processes | 24 | 2.2 Attaching processes |
25 | 2.3 Mounting hierarchies by name | ||
26 | 2.4 Notification API | ||
25 | 3. Kernel API | 27 | 3. Kernel API |
26 | 3.1 Overview | 28 | 3.1 Overview |
27 | 3.2 Synchronization | 29 | 3.2 Synchronization |
@@ -434,6 +436,25 @@ you give a subsystem a name. | |||
434 | The name of the subsystem appears as part of the hierarchy description | 436 | The name of the subsystem appears as part of the hierarchy description |
435 | in /proc/mounts and /proc/<pid>/cgroups. | 437 | in /proc/mounts and /proc/<pid>/cgroups. |
436 | 438 | ||
439 | 2.4 Notification API | ||
440 | -------------------- | ||
441 | |||
442 | There is mechanism which allows to get notifications about changing | ||
443 | status of a cgroup. | ||
444 | |||
445 | To register new notification handler you need: | ||
446 | - create a file descriptor for event notification using eventfd(2); | ||
447 | - open a control file to be monitored (e.g. memory.usage_in_bytes); | ||
448 | - write "<event_fd> <control_fd> <args>" to cgroup.event_control. | ||
449 | Interpretation of args is defined by control file implementation; | ||
450 | |||
451 | eventfd will be woken up by control file implementation or when the | ||
452 | cgroup is removed. | ||
453 | |||
454 | To unregister notification handler just close eventfd. | ||
455 | |||
456 | NOTE: Support of notifications should be implemented for the control | ||
457 | file. See documentation for the subsystem. | ||
437 | 458 | ||
438 | 3. Kernel API | 459 | 3. Kernel API |
439 | ============= | 460 | ============= |
@@ -488,6 +509,11 @@ Each subsystem should: | |||
488 | - add an entry in linux/cgroup_subsys.h | 509 | - add an entry in linux/cgroup_subsys.h |
489 | - define a cgroup_subsys object called <name>_subsys | 510 | - define a cgroup_subsys object called <name>_subsys |
490 | 511 | ||
512 | If a subsystem can be compiled as a module, it should also have in its | ||
513 | module initcall a call to cgroup_load_subsys(), and in its exitcall a | ||
514 | call to cgroup_unload_subsys(). It should also set its_subsys.module = | ||
515 | THIS_MODULE in its .c file. | ||
516 | |||
491 | Each subsystem may export the following methods. The only mandatory | 517 | Each subsystem may export the following methods. The only mandatory |
492 | methods are create/destroy. Any others that are null are presumed to | 518 | methods are create/destroy. Any others that are null are presumed to |
493 | be successful no-ops. | 519 | be successful no-ops. |
@@ -536,10 +562,21 @@ returns an error, this will abort the attach operation. If a NULL | |||
536 | task is passed, then a successful result indicates that *any* | 562 | task is passed, then a successful result indicates that *any* |
537 | unspecified task can be moved into the cgroup. Note that this isn't | 563 | unspecified task can be moved into the cgroup. Note that this isn't |
538 | called on a fork. If this method returns 0 (success) then this should | 564 | called on a fork. If this method returns 0 (success) then this should |
539 | remain valid while the caller holds cgroup_mutex. If threadgroup is | 565 | remain valid while the caller holds cgroup_mutex and it is ensured that either |
566 | attach() or cancel_attach() will be called in future. If threadgroup is | ||
540 | true, then a successful result indicates that all threads in the given | 567 | true, then a successful result indicates that all threads in the given |
541 | thread's threadgroup can be moved together. | 568 | thread's threadgroup can be moved together. |
542 | 569 | ||
570 | void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
571 | struct task_struct *task, bool threadgroup) | ||
572 | (cgroup_mutex held by caller) | ||
573 | |||
574 | Called when a task attach operation has failed after can_attach() has succeeded. | ||
575 | A subsystem whose can_attach() has some side-effects should provide this | ||
576 | function, so that the subsytem can implement a rollback. If not, not necessary. | ||
577 | This will be called only about subsystems whose can_attach() operation have | ||
578 | succeeded. | ||
579 | |||
543 | void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 580 | void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
544 | struct cgroup *old_cgrp, struct task_struct *task, | 581 | struct cgroup *old_cgrp, struct task_struct *task, |
545 | bool threadgroup) | 582 | bool threadgroup) |
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index 72db89ed060..f7f68b2ac19 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | Memory Resource Controller(Memcg) Implementation Memo. | 1 | Memory Resource Controller(Memcg) Implementation Memo. |
2 | Last Updated: 2009/1/20 | 2 | Last Updated: 2010/2 |
3 | Base Kernel Version: based on 2.6.29-rc2. | 3 | Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). |
4 | 4 | ||
5 | Because VM is getting complex (one of reasons is memcg...), memcg's behavior | 5 | Because VM is getting complex (one of reasons is memcg...), memcg's behavior |
6 | is complex. This is a document for memcg's internal behavior. | 6 | is complex. This is a document for memcg's internal behavior. |
@@ -337,7 +337,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | |||
337 | race and lock dependency with other cgroup subsystems. | 337 | race and lock dependency with other cgroup subsystems. |
338 | 338 | ||
339 | example) | 339 | example) |
340 | # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices | 340 | # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices |
341 | 341 | ||
342 | and do task move, mkdir, rmdir etc...under this. | 342 | and do task move, mkdir, rmdir etc...under this. |
343 | 343 | ||
@@ -348,7 +348,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | |||
348 | 348 | ||
349 | For example, test like following is good. | 349 | For example, test like following is good. |
350 | (Shell-A) | 350 | (Shell-A) |
351 | # mount -t cgroup none /cgroup -t memory | 351 | # mount -t cgroup none /cgroup -o memory |
352 | # mkdir /cgroup/test | 352 | # mkdir /cgroup/test |
353 | # echo 40M > /cgroup/test/memory.limit_in_bytes | 353 | # echo 40M > /cgroup/test/memory.limit_in_bytes |
354 | # echo 0 > /cgroup/test/tasks | 354 | # echo 0 > /cgroup/test/tasks |
@@ -378,3 +378,42 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | |||
378 | #echo 50M > memory.limit_in_bytes | 378 | #echo 50M > memory.limit_in_bytes |
379 | #echo 50M > memory.memsw.limit_in_bytes | 379 | #echo 50M > memory.memsw.limit_in_bytes |
380 | run 51M of malloc | 380 | run 51M of malloc |
381 | |||
382 | 9.9 Move charges at task migration | ||
383 | Charges associated with a task can be moved along with task migration. | ||
384 | |||
385 | (Shell-A) | ||
386 | #mkdir /cgroup/A | ||
387 | #echo $$ >/cgroup/A/tasks | ||
388 | run some programs which uses some amount of memory in /cgroup/A. | ||
389 | |||
390 | (Shell-B) | ||
391 | #mkdir /cgroup/B | ||
392 | #echo 1 >/cgroup/B/memory.move_charge_at_immigrate | ||
393 | #echo "pid of the program running in group A" >/cgroup/B/tasks | ||
394 | |||
395 | You can see charges have been moved by reading *.usage_in_bytes or | ||
396 | memory.stat of both A and B. | ||
397 | See 8.2 of Documentation/cgroups/memory.txt to see what value should be | ||
398 | written to move_charge_at_immigrate. | ||
399 | |||
400 | 9.10 Memory thresholds | ||
401 | Memory controler implements memory thresholds using cgroups notification | ||
402 | API. You can use Documentation/cgroups/cgroup_event_listener.c to test | ||
403 | it. | ||
404 | |||
405 | (Shell-A) Create cgroup and run event listener | ||
406 | # mkdir /cgroup/A | ||
407 | # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M | ||
408 | |||
409 | (Shell-B) Add task to cgroup and try to allocate and free memory | ||
410 | # echo $$ >/cgroup/A/tasks | ||
411 | # a="$(dd if=/dev/zero bs=1M count=10)" | ||
412 | # a= | ||
413 | |||
414 | You will see message from cgroup_event_listener every time you cross | ||
415 | the thresholds. | ||
416 | |||
417 | Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. | ||
418 | |||
419 | It's good idea to test root cgroup as well. | ||
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index b871f2552b4..f8bc802d70b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -182,6 +182,8 @@ list. | |||
182 | NOTE: Reclaim does not work for the root cgroup, since we cannot set any | 182 | NOTE: Reclaim does not work for the root cgroup, since we cannot set any |
183 | limits on the root cgroup. | 183 | limits on the root cgroup. |
184 | 184 | ||
185 | Note2: When panic_on_oom is set to "2", the whole system will panic. | ||
186 | |||
185 | 2. Locking | 187 | 2. Locking |
186 | 188 | ||
187 | The memory controller uses the following hierarchy | 189 | The memory controller uses the following hierarchy |
@@ -262,10 +264,12 @@ some of the pages cached in the cgroup (page cache pages). | |||
262 | 4.2 Task migration | 264 | 4.2 Task migration |
263 | 265 | ||
264 | When a task migrates from one cgroup to another, it's charge is not | 266 | When a task migrates from one cgroup to another, it's charge is not |
265 | carried forward. The pages allocated from the original cgroup still | 267 | carried forward by default. The pages allocated from the original cgroup still |
266 | remain charged to it, the charge is dropped when the page is freed or | 268 | remain charged to it, the charge is dropped when the page is freed or |
267 | reclaimed. | 269 | reclaimed. |
268 | 270 | ||
271 | Note: You can move charges of a task along with task migration. See 8. | ||
272 | |||
269 | 4.3 Removing a cgroup | 273 | 4.3 Removing a cgroup |
270 | 274 | ||
271 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a | 275 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a |
@@ -377,7 +381,8 @@ The feature can be disabled by | |||
377 | NOTE1: Enabling/disabling will fail if the cgroup already has other | 381 | NOTE1: Enabling/disabling will fail if the cgroup already has other |
378 | cgroups created below it. | 382 | cgroups created below it. |
379 | 383 | ||
380 | NOTE2: This feature can be enabled/disabled per subtree. | 384 | NOTE2: When panic_on_oom is set to "2", the whole system will panic in |
385 | case of an oom event in any cgroup. | ||
381 | 386 | ||
382 | 7. Soft limits | 387 | 7. Soft limits |
383 | 388 | ||
@@ -414,7 +419,76 @@ NOTE1: Soft limits take effect over a long period of time, since they involve | |||
414 | NOTE2: It is recommended to set the soft limit always below the hard limit, | 419 | NOTE2: It is recommended to set the soft limit always below the hard limit, |
415 | otherwise the hard limit will take precedence. | 420 | otherwise the hard limit will take precedence. |
416 | 421 | ||
417 | 8. TODO | 422 | 8. Move charges at task migration |
423 | |||
424 | Users can move charges associated with a task along with task migration, that | ||
425 | is, uncharge task's pages from the old cgroup and charge them to the new cgroup. | ||
426 | This feature is not supported in !CONFIG_MMU environments because of lack of | ||
427 | page tables. | ||
428 | |||
429 | 8.1 Interface | ||
430 | |||
431 | This feature is disabled by default. It can be enabled(and disabled again) by | ||
432 | writing to memory.move_charge_at_immigrate of the destination cgroup. | ||
433 | |||
434 | If you want to enable it: | ||
435 | |||
436 | # echo (some positive value) > memory.move_charge_at_immigrate | ||
437 | |||
438 | Note: Each bits of move_charge_at_immigrate has its own meaning about what type | ||
439 | of charges should be moved. See 8.2 for details. | ||
440 | Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread | ||
441 | group. | ||
442 | Note: If we cannot find enough space for the task in the destination cgroup, we | ||
443 | try to make space by reclaiming memory. Task migration may fail if we | ||
444 | cannot make enough space. | ||
445 | Note: It can take several seconds if you move charges in giga bytes order. | ||
446 | |||
447 | And if you want disable it again: | ||
448 | |||
449 | # echo 0 > memory.move_charge_at_immigrate | ||
450 | |||
451 | 8.2 Type of charges which can be move | ||
452 | |||
453 | Each bits of move_charge_at_immigrate has its own meaning about what type of | ||
454 | charges should be moved. | ||
455 | |||
456 | bit | what type of charges would be moved ? | ||
457 | -----+------------------------------------------------------------------------ | ||
458 | 0 | A charge of an anonymous page(or swap of it) used by the target task. | ||
459 | | Those pages and swaps must be used only by the target task. You must | ||
460 | | enable Swap Extension(see 2.4) to enable move of swap charges. | ||
461 | |||
462 | Note: Those pages and swaps must be charged to the old cgroup. | ||
463 | Note: More type of pages(e.g. file cache, shmem,) will be supported by other | ||
464 | bits in future. | ||
465 | |||
466 | 8.3 TODO | ||
467 | |||
468 | - Add support for other types of pages(e.g. file cache, shmem, etc.). | ||
469 | - Implement madvise(2) to let users decide the vma to be moved or not to be | ||
470 | moved. | ||
471 | - All of moving charge operations are done under cgroup_mutex. It's not good | ||
472 | behavior to hold the mutex too long, so we may need some trick. | ||
473 | |||
474 | 9. Memory thresholds | ||
475 | |||
476 | Memory controler implements memory thresholds using cgroups notification | ||
477 | API (see cgroups.txt). It allows to register multiple memory and memsw | ||
478 | thresholds and gets notifications when it crosses. | ||
479 | |||
480 | To register a threshold application need: | ||
481 | - create an eventfd using eventfd(2); | ||
482 | - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; | ||
483 | - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to | ||
484 | cgroup.event_control. | ||
485 | |||
486 | Application will be notified through eventfd when memory usage crosses | ||
487 | threshold in any direction. | ||
488 | |||
489 | It's applicable for root and non-root cgroup. | ||
490 | |||
491 | 10. TODO | ||
418 | 492 | ||
419 | 1. Add support for accounting huge pages (as a separate controller) | 493 | 1. Add support for accounting huge pages (as a separate controller) |
420 | 2. Make per-cgroup scanner reclaim not-shared pages first | 494 | 2. Make per-cgroup scanner reclaim not-shared pages first |