aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/cgroups
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/cgroups')
-rw-r--r--Documentation/cgroups/cgroup_event_listener.c110
-rw-r--r--Documentation/cgroups/cgroups.txt39
-rw-r--r--Documentation/cgroups/memcg_test.txt47
-rw-r--r--Documentation/cgroups/memory.txt80
4 files changed, 268 insertions, 8 deletions
diff --git a/Documentation/cgroups/cgroup_event_listener.c b/Documentation/cgroups/cgroup_event_listener.c
new file mode 100644
index 00000000000..8c2bfc4a635
--- /dev/null
+++ b/Documentation/cgroups/cgroup_event_listener.c
@@ -0,0 +1,110 @@
1/*
2 * cgroup_event_listener.c - Simple listener of cgroup events
3 *
4 * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
5 */
6
7#include <assert.h>
8#include <errno.h>
9#include <fcntl.h>
10#include <libgen.h>
11#include <limits.h>
12#include <stdio.h>
13#include <string.h>
14#include <unistd.h>
15
16#include <sys/eventfd.h>
17
18#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n"
19
20int main(int argc, char **argv)
21{
22 int efd = -1;
23 int cfd = -1;
24 int event_control = -1;
25 char event_control_path[PATH_MAX];
26 char line[LINE_MAX];
27 int ret;
28
29 if (argc != 3) {
30 fputs(USAGE_STR, stderr);
31 return 1;
32 }
33
34 cfd = open(argv[1], O_RDONLY);
35 if (cfd == -1) {
36 fprintf(stderr, "Cannot open %s: %s\n", argv[1],
37 strerror(errno));
38 goto out;
39 }
40
41 ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
42 dirname(argv[1]));
43 if (ret >= PATH_MAX) {
44 fputs("Path to cgroup.event_control is too long\n", stderr);
45 goto out;
46 }
47
48 event_control = open(event_control_path, O_WRONLY);
49 if (event_control == -1) {
50 fprintf(stderr, "Cannot open %s: %s\n", event_control_path,
51 strerror(errno));
52 goto out;
53 }
54
55 efd = eventfd(0, 0);
56 if (efd == -1) {
57 perror("eventfd() failed");
58 goto out;
59 }
60
61 ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
62 if (ret >= LINE_MAX) {
63 fputs("Arguments string is too long\n", stderr);
64 goto out;
65 }
66
67 ret = write(event_control, line, strlen(line) + 1);
68 if (ret == -1) {
69 perror("Cannot write to cgroup.event_control");
70 goto out;
71 }
72
73 while (1) {
74 uint64_t result;
75
76 ret = read(efd, &result, sizeof(result));
77 if (ret == -1) {
78 if (errno == EINTR)
79 continue;
80 perror("Cannot read from eventfd");
81 break;
82 }
83 assert(ret == sizeof(result));
84
85 ret = access(event_control_path, W_OK);
86 if ((ret == -1) && (errno == ENOENT)) {
87 puts("The cgroup seems to have removed.");
88 ret = 0;
89 break;
90 }
91
92 if (ret == -1) {
93 perror("cgroup.event_control "
94 "is not accessable any more");
95 break;
96 }
97
98 printf("%s %s: crossed\n", argv[1], argv[2]);
99 }
100
101out:
102 if (efd >= 0)
103 close(efd);
104 if (event_control >= 0)
105 close(event_control);
106 if (cfd >= 0)
107 close(cfd);
108
109 return (ret != 0);
110}
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 0b33bfe7dde..fd588ff0e29 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -22,6 +22,8 @@ CONTENTS:
222. Usage Examples and Syntax 222. Usage Examples and Syntax
23 2.1 Basic Usage 23 2.1 Basic Usage
24 2.2 Attaching processes 24 2.2 Attaching processes
25 2.3 Mounting hierarchies by name
26 2.4 Notification API
253. Kernel API 273. Kernel API
26 3.1 Overview 28 3.1 Overview
27 3.2 Synchronization 29 3.2 Synchronization
@@ -434,6 +436,25 @@ you give a subsystem a name.
434The name of the subsystem appears as part of the hierarchy description 436The name of the subsystem appears as part of the hierarchy description
435in /proc/mounts and /proc/<pid>/cgroups. 437in /proc/mounts and /proc/<pid>/cgroups.
436 438
4392.4 Notification API
440--------------------
441
442There is mechanism which allows to get notifications about changing
443status of a cgroup.
444
445To register new notification handler you need:
446 - create a file descriptor for event notification using eventfd(2);
447 - open a control file to be monitored (e.g. memory.usage_in_bytes);
448 - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
449 Interpretation of args is defined by control file implementation;
450
451eventfd will be woken up by control file implementation or when the
452cgroup is removed.
453
454To unregister notification handler just close eventfd.
455
456NOTE: Support of notifications should be implemented for the control
457file. See documentation for the subsystem.
437 458
4383. Kernel API 4593. Kernel API
439============= 460=============
@@ -488,6 +509,11 @@ Each subsystem should:
488- add an entry in linux/cgroup_subsys.h 509- add an entry in linux/cgroup_subsys.h
489- define a cgroup_subsys object called <name>_subsys 510- define a cgroup_subsys object called <name>_subsys
490 511
512If a subsystem can be compiled as a module, it should also have in its
513module initcall a call to cgroup_load_subsys(), and in its exitcall a
514call to cgroup_unload_subsys(). It should also set its_subsys.module =
515THIS_MODULE in its .c file.
516
491Each subsystem may export the following methods. The only mandatory 517Each subsystem may export the following methods. The only mandatory
492methods are create/destroy. Any others that are null are presumed to 518methods are create/destroy. Any others that are null are presumed to
493be successful no-ops. 519be successful no-ops.
@@ -536,10 +562,21 @@ returns an error, this will abort the attach operation. If a NULL
536task is passed, then a successful result indicates that *any* 562task is passed, then a successful result indicates that *any*
537unspecified task can be moved into the cgroup. Note that this isn't 563unspecified task can be moved into the cgroup. Note that this isn't
538called on a fork. If this method returns 0 (success) then this should 564called on a fork. If this method returns 0 (success) then this should
539remain valid while the caller holds cgroup_mutex. If threadgroup is 565remain valid while the caller holds cgroup_mutex and it is ensured that either
566attach() or cancel_attach() will be called in future. If threadgroup is
540true, then a successful result indicates that all threads in the given 567true, then a successful result indicates that all threads in the given
541thread's threadgroup can be moved together. 568thread's threadgroup can be moved together.
542 569
570void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
571 struct task_struct *task, bool threadgroup)
572(cgroup_mutex held by caller)
573
574Called when a task attach operation has failed after can_attach() has succeeded.
575A subsystem whose can_attach() has some side-effects should provide this
576function, so that the subsytem can implement a rollback. If not, not necessary.
577This will be called only about subsystems whose can_attach() operation have
578succeeded.
579
543void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 580void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
544 struct cgroup *old_cgrp, struct task_struct *task, 581 struct cgroup *old_cgrp, struct task_struct *task,
545 bool threadgroup) 582 bool threadgroup)
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index 72db89ed060..f7f68b2ac19 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -1,6 +1,6 @@
1Memory Resource Controller(Memcg) Implementation Memo. 1Memory Resource Controller(Memcg) Implementation Memo.
2Last Updated: 2009/1/20 2Last Updated: 2010/2
3Base Kernel Version: based on 2.6.29-rc2. 3Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
4 4
5Because VM is getting complex (one of reasons is memcg...), memcg's behavior 5Because VM is getting complex (one of reasons is memcg...), memcg's behavior
6is complex. This is a document for memcg's internal behavior. 6is complex. This is a document for memcg's internal behavior.
@@ -337,7 +337,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
337 race and lock dependency with other cgroup subsystems. 337 race and lock dependency with other cgroup subsystems.
338 338
339 example) 339 example)
340 # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices 340 # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
341 341
342 and do task move, mkdir, rmdir etc...under this. 342 and do task move, mkdir, rmdir etc...under this.
343 343
@@ -348,7 +348,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
348 348
349 For example, test like following is good. 349 For example, test like following is good.
350 (Shell-A) 350 (Shell-A)
351 # mount -t cgroup none /cgroup -t memory 351 # mount -t cgroup none /cgroup -o memory
352 # mkdir /cgroup/test 352 # mkdir /cgroup/test
353 # echo 40M > /cgroup/test/memory.limit_in_bytes 353 # echo 40M > /cgroup/test/memory.limit_in_bytes
354 # echo 0 > /cgroup/test/tasks 354 # echo 0 > /cgroup/test/tasks
@@ -378,3 +378,42 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
378 #echo 50M > memory.limit_in_bytes 378 #echo 50M > memory.limit_in_bytes
379 #echo 50M > memory.memsw.limit_in_bytes 379 #echo 50M > memory.memsw.limit_in_bytes
380 run 51M of malloc 380 run 51M of malloc
381
382 9.9 Move charges at task migration
383 Charges associated with a task can be moved along with task migration.
384
385 (Shell-A)
386 #mkdir /cgroup/A
387 #echo $$ >/cgroup/A/tasks
388 run some programs which uses some amount of memory in /cgroup/A.
389
390 (Shell-B)
391 #mkdir /cgroup/B
392 #echo 1 >/cgroup/B/memory.move_charge_at_immigrate
393 #echo "pid of the program running in group A" >/cgroup/B/tasks
394
395 You can see charges have been moved by reading *.usage_in_bytes or
396 memory.stat of both A and B.
397 See 8.2 of Documentation/cgroups/memory.txt to see what value should be
398 written to move_charge_at_immigrate.
399
400 9.10 Memory thresholds
401 Memory controler implements memory thresholds using cgroups notification
402 API. You can use Documentation/cgroups/cgroup_event_listener.c to test
403 it.
404
405 (Shell-A) Create cgroup and run event listener
406 # mkdir /cgroup/A
407 # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
408
409 (Shell-B) Add task to cgroup and try to allocate and free memory
410 # echo $$ >/cgroup/A/tasks
411 # a="$(dd if=/dev/zero bs=1M count=10)"
412 # a=
413
414 You will see message from cgroup_event_listener every time you cross
415 the thresholds.
416
417 Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
418
419 It's good idea to test root cgroup as well.
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index b871f2552b4..f8bc802d70b 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -182,6 +182,8 @@ list.
182NOTE: Reclaim does not work for the root cgroup, since we cannot set any 182NOTE: Reclaim does not work for the root cgroup, since we cannot set any
183limits on the root cgroup. 183limits on the root cgroup.
184 184
185Note2: When panic_on_oom is set to "2", the whole system will panic.
186
1852. Locking 1872. Locking
186 188
187The memory controller uses the following hierarchy 189The memory controller uses the following hierarchy
@@ -262,10 +264,12 @@ some of the pages cached in the cgroup (page cache pages).
2624.2 Task migration 2644.2 Task migration
263 265
264When a task migrates from one cgroup to another, it's charge is not 266When a task migrates from one cgroup to another, it's charge is not
265carried forward. The pages allocated from the original cgroup still 267carried forward by default. The pages allocated from the original cgroup still
266remain charged to it, the charge is dropped when the page is freed or 268remain charged to it, the charge is dropped when the page is freed or
267reclaimed. 269reclaimed.
268 270
271Note: You can move charges of a task along with task migration. See 8.
272
2694.3 Removing a cgroup 2734.3 Removing a cgroup
270 274
271A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a 275A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
@@ -377,7 +381,8 @@ The feature can be disabled by
377NOTE1: Enabling/disabling will fail if the cgroup already has other 381NOTE1: Enabling/disabling will fail if the cgroup already has other
378cgroups created below it. 382cgroups created below it.
379 383
380NOTE2: This feature can be enabled/disabled per subtree. 384NOTE2: When panic_on_oom is set to "2", the whole system will panic in
385case of an oom event in any cgroup.
381 386
3827. Soft limits 3877. Soft limits
383 388
@@ -414,7 +419,76 @@ NOTE1: Soft limits take effect over a long period of time, since they involve
414NOTE2: It is recommended to set the soft limit always below the hard limit, 419NOTE2: It is recommended to set the soft limit always below the hard limit,
415 otherwise the hard limit will take precedence. 420 otherwise the hard limit will take precedence.
416 421
4178. TODO 4228. Move charges at task migration
423
424Users can move charges associated with a task along with task migration, that
425is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
426This feature is not supported in !CONFIG_MMU environments because of lack of
427page tables.
428
4298.1 Interface
430
431This feature is disabled by default. It can be enabled(and disabled again) by
432writing to memory.move_charge_at_immigrate of the destination cgroup.
433
434If you want to enable it:
435
436# echo (some positive value) > memory.move_charge_at_immigrate
437
438Note: Each bits of move_charge_at_immigrate has its own meaning about what type
439 of charges should be moved. See 8.2 for details.
440Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread
441 group.
442Note: If we cannot find enough space for the task in the destination cgroup, we
443 try to make space by reclaiming memory. Task migration may fail if we
444 cannot make enough space.
445Note: It can take several seconds if you move charges in giga bytes order.
446
447And if you want disable it again:
448
449# echo 0 > memory.move_charge_at_immigrate
450
4518.2 Type of charges which can be move
452
453Each bits of move_charge_at_immigrate has its own meaning about what type of
454charges should be moved.
455
456 bit | what type of charges would be moved ?
457 -----+------------------------------------------------------------------------
458 0 | A charge of an anonymous page(or swap of it) used by the target task.
459 | Those pages and swaps must be used only by the target task. You must
460 | enable Swap Extension(see 2.4) to enable move of swap charges.
461
462Note: Those pages and swaps must be charged to the old cgroup.
463Note: More type of pages(e.g. file cache, shmem,) will be supported by other
464 bits in future.
465
4668.3 TODO
467
468- Add support for other types of pages(e.g. file cache, shmem, etc.).
469- Implement madvise(2) to let users decide the vma to be moved or not to be
470 moved.
471- All of moving charge operations are done under cgroup_mutex. It's not good
472 behavior to hold the mutex too long, so we may need some trick.
473
4749. Memory thresholds
475
476Memory controler implements memory thresholds using cgroups notification
477API (see cgroups.txt). It allows to register multiple memory and memsw
478thresholds and gets notifications when it crosses.
479
480To register a threshold application need:
481 - create an eventfd using eventfd(2);
482 - open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
483 - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to
484 cgroup.event_control.
485
486Application will be notified through eventfd when memory usage crosses
487threshold in any direction.
488
489It's applicable for root and non-root cgroup.
490
49110. TODO
418 492
4191. Add support for accounting huge pages (as a separate controller) 4931. Add support for accounting huge pages (as a separate controller)
4202. Make per-cgroup scanner reclaim not-shared pages first 4942. Make per-cgroup scanner reclaim not-shared pages first