aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/cgroups
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2010-03-15 19:23:54 -0400
committerDavid S. Miller <davem@davemloft.net>2010-03-15 19:23:54 -0400
commit4961e02f1999e1c3468c09b2669c94d7c3ae82a8 (patch)
tree44c15abb09d7ba5e17a9aba95ee246648b1c1a8a /Documentation/cgroups
parentd14a0ebda7d3daede1a99c01527affb9ceaa4c22 (diff)
parenta3d3203e4bb40f253b1541e310dc0f9305be7c84 (diff)
Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
Diffstat (limited to 'Documentation/cgroups')
-rw-r--r--Documentation/cgroups/cgroup_event_listener.c110
-rw-r--r--Documentation/cgroups/cgroups.txt39
-rw-r--r--Documentation/cgroups/cpusets.txt127
-rw-r--r--Documentation/cgroups/memcg_test.txt47
-rw-r--r--Documentation/cgroups/memory.txt80
5 files changed, 333 insertions, 70 deletions
diff --git a/Documentation/cgroups/cgroup_event_listener.c b/Documentation/cgroups/cgroup_event_listener.c
new file mode 100644
index 000000000000..8c2bfc4a6358
--- /dev/null
+++ b/Documentation/cgroups/cgroup_event_listener.c
@@ -0,0 +1,110 @@
1/*
2 * cgroup_event_listener.c - Simple listener of cgroup events
3 *
4 * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
5 */
6
7#include <assert.h>
8#include <errno.h>
9#include <fcntl.h>
10#include <libgen.h>
11#include <limits.h>
12#include <stdio.h>
13#include <string.h>
14#include <unistd.h>
15
16#include <sys/eventfd.h>
17
18#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n"
19
20int main(int argc, char **argv)
21{
22 int efd = -1;
23 int cfd = -1;
24 int event_control = -1;
25 char event_control_path[PATH_MAX];
26 char line[LINE_MAX];
27 int ret;
28
29 if (argc != 3) {
30 fputs(USAGE_STR, stderr);
31 return 1;
32 }
33
34 cfd = open(argv[1], O_RDONLY);
35 if (cfd == -1) {
36 fprintf(stderr, "Cannot open %s: %s\n", argv[1],
37 strerror(errno));
38 goto out;
39 }
40
41 ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
42 dirname(argv[1]));
43 if (ret >= PATH_MAX) {
44 fputs("Path to cgroup.event_control is too long\n", stderr);
45 goto out;
46 }
47
48 event_control = open(event_control_path, O_WRONLY);
49 if (event_control == -1) {
50 fprintf(stderr, "Cannot open %s: %s\n", event_control_path,
51 strerror(errno));
52 goto out;
53 }
54
55 efd = eventfd(0, 0);
56 if (efd == -1) {
57 perror("eventfd() failed");
58 goto out;
59 }
60
61 ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
62 if (ret >= LINE_MAX) {
63 fputs("Arguments string is too long\n", stderr);
64 goto out;
65 }
66
67 ret = write(event_control, line, strlen(line) + 1);
68 if (ret == -1) {
69 perror("Cannot write to cgroup.event_control");
70 goto out;
71 }
72
73 while (1) {
74 uint64_t result;
75
76 ret = read(efd, &result, sizeof(result));
77 if (ret == -1) {
78 if (errno == EINTR)
79 continue;
80 perror("Cannot read from eventfd");
81 break;
82 }
83 assert(ret == sizeof(result));
84
85 ret = access(event_control_path, W_OK);
86 if ((ret == -1) && (errno == ENOENT)) {
87 puts("The cgroup seems to have removed.");
88 ret = 0;
89 break;
90 }
91
92 if (ret == -1) {
93 perror("cgroup.event_control "
94 "is not accessable any more");
95 break;
96 }
97
98 printf("%s %s: crossed\n", argv[1], argv[2]);
99 }
100
101out:
102 if (efd >= 0)
103 close(efd);
104 if (event_control >= 0)
105 close(event_control);
106 if (cfd >= 0)
107 close(cfd);
108
109 return (ret != 0);
110}
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 0b33bfe7dde9..fd588ff0e296 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -22,6 +22,8 @@ CONTENTS:
222. Usage Examples and Syntax 222. Usage Examples and Syntax
23 2.1 Basic Usage 23 2.1 Basic Usage
24 2.2 Attaching processes 24 2.2 Attaching processes
25 2.3 Mounting hierarchies by name
26 2.4 Notification API
253. Kernel API 273. Kernel API
26 3.1 Overview 28 3.1 Overview
27 3.2 Synchronization 29 3.2 Synchronization
@@ -434,6 +436,25 @@ you give a subsystem a name.
434The name of the subsystem appears as part of the hierarchy description 436The name of the subsystem appears as part of the hierarchy description
435in /proc/mounts and /proc/<pid>/cgroups. 437in /proc/mounts and /proc/<pid>/cgroups.
436 438
4392.4 Notification API
440--------------------
441
442There is mechanism which allows to get notifications about changing
443status of a cgroup.
444
445To register new notification handler you need:
446 - create a file descriptor for event notification using eventfd(2);
447 - open a control file to be monitored (e.g. memory.usage_in_bytes);
448 - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
449 Interpretation of args is defined by control file implementation;
450
451eventfd will be woken up by control file implementation or when the
452cgroup is removed.
453
454To unregister notification handler just close eventfd.
455
456NOTE: Support of notifications should be implemented for the control
457file. See documentation for the subsystem.
437 458
4383. Kernel API 4593. Kernel API
439============= 460=============
@@ -488,6 +509,11 @@ Each subsystem should:
488- add an entry in linux/cgroup_subsys.h 509- add an entry in linux/cgroup_subsys.h
489- define a cgroup_subsys object called <name>_subsys 510- define a cgroup_subsys object called <name>_subsys
490 511
512If a subsystem can be compiled as a module, it should also have in its
513module initcall a call to cgroup_load_subsys(), and in its exitcall a
514call to cgroup_unload_subsys(). It should also set its_subsys.module =
515THIS_MODULE in its .c file.
516
491Each subsystem may export the following methods. The only mandatory 517Each subsystem may export the following methods. The only mandatory
492methods are create/destroy. Any others that are null are presumed to 518methods are create/destroy. Any others that are null are presumed to
493be successful no-ops. 519be successful no-ops.
@@ -536,10 +562,21 @@ returns an error, this will abort the attach operation. If a NULL
536task is passed, then a successful result indicates that *any* 562task is passed, then a successful result indicates that *any*
537unspecified task can be moved into the cgroup. Note that this isn't 563unspecified task can be moved into the cgroup. Note that this isn't
538called on a fork. If this method returns 0 (success) then this should 564called on a fork. If this method returns 0 (success) then this should
539remain valid while the caller holds cgroup_mutex. If threadgroup is 565remain valid while the caller holds cgroup_mutex and it is ensured that either
566attach() or cancel_attach() will be called in future. If threadgroup is
540true, then a successful result indicates that all threads in the given 567true, then a successful result indicates that all threads in the given
541thread's threadgroup can be moved together. 568thread's threadgroup can be moved together.
542 569
570void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
571 struct task_struct *task, bool threadgroup)
572(cgroup_mutex held by caller)
573
574Called when a task attach operation has failed after can_attach() has succeeded.
575A subsystem whose can_attach() has some side-effects should provide this
576function, so that the subsytem can implement a rollback. If not, not necessary.
577This will be called only about subsystems whose can_attach() operation have
578succeeded.
579
543void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 580void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
544 struct cgroup *old_cgrp, struct task_struct *task, 581 struct cgroup *old_cgrp, struct task_struct *task,
545 bool threadgroup) 582 bool threadgroup)
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index 1d7e9784439a..4160df82b3f5 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -168,20 +168,20 @@ Each cpuset is represented by a directory in the cgroup file system
168containing (on top of the standard cgroup files) the following 168containing (on top of the standard cgroup files) the following
169files describing that cpuset: 169files describing that cpuset:
170 170
171 - cpus: list of CPUs in that cpuset 171 - cpuset.cpus: list of CPUs in that cpuset
172 - mems: list of Memory Nodes in that cpuset 172 - cpuset.mems: list of Memory Nodes in that cpuset
173 - memory_migrate flag: if set, move pages to cpusets nodes 173 - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
174 - cpu_exclusive flag: is cpu placement exclusive? 174 - cpuset.cpu_exclusive flag: is cpu placement exclusive?
175 - mem_exclusive flag: is memory placement exclusive? 175 - cpuset.mem_exclusive flag: is memory placement exclusive?
176 - mem_hardwall flag: is memory allocation hardwalled 176 - cpuset.mem_hardwall flag: is memory allocation hardwalled
177 - memory_pressure: measure of how much paging pressure in cpuset 177 - cpuset.memory_pressure: measure of how much paging pressure in cpuset
178 - memory_spread_page flag: if set, spread page cache evenly on allowed nodes 178 - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
179 - memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes 179 - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
180 - sched_load_balance flag: if set, load balance within CPUs on that cpuset 180 - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
181 - sched_relax_domain_level: the searching range when migrating tasks 181 - cpuset.sched_relax_domain_level: the searching range when migrating tasks
182 182
183In addition, the root cpuset only has the following file: 183In addition, the root cpuset only has the following file:
184 - memory_pressure_enabled flag: compute memory_pressure? 184 - cpuset.memory_pressure_enabled flag: compute memory_pressure?
185 185
186New cpusets are created using the mkdir system call or shell 186New cpusets are created using the mkdir system call or shell
187command. The properties of a cpuset, such as its flags, allowed 187command. The properties of a cpuset, such as its flags, allowed
@@ -229,7 +229,7 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
229a direct ancestor or descendant, may share any of the same CPUs or 229a direct ancestor or descendant, may share any of the same CPUs or
230Memory Nodes. 230Memory Nodes.
231 231
232A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", 232A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
233i.e. it restricts kernel allocations for page, buffer and other data 233i.e. it restricts kernel allocations for page, buffer and other data
234commonly shared by the kernel across multiple users. All cpusets, 234commonly shared by the kernel across multiple users. All cpusets,
235whether hardwalled or not, restrict allocations of memory for user 235whether hardwalled or not, restrict allocations of memory for user
@@ -304,15 +304,15 @@ times 1000.
304--------------------------- 304---------------------------
305There are two boolean flag files per cpuset that control where the 305There are two boolean flag files per cpuset that control where the
306kernel allocates pages for the file system buffers and related in 306kernel allocates pages for the file system buffers and related in
307kernel data structures. They are called 'memory_spread_page' and 307kernel data structures. They are called 'cpuset.memory_spread_page' and
308'memory_spread_slab'. 308'cpuset.memory_spread_slab'.
309 309
310If the per-cpuset boolean flag file 'memory_spread_page' is set, then 310If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
311the kernel will spread the file system buffers (page cache) evenly 311the kernel will spread the file system buffers (page cache) evenly
312over all the nodes that the faulting task is allowed to use, instead 312over all the nodes that the faulting task is allowed to use, instead
313of preferring to put those pages on the node where the task is running. 313of preferring to put those pages on the node where the task is running.
314 314
315If the per-cpuset boolean flag file 'memory_spread_slab' is set, 315If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
316then the kernel will spread some file system related slab caches, 316then the kernel will spread some file system related slab caches,
317such as for inodes and dentries evenly over all the nodes that the 317such as for inodes and dentries evenly over all the nodes that the
318faulting task is allowed to use, instead of preferring to put those 318faulting task is allowed to use, instead of preferring to put those
@@ -337,21 +337,21 @@ their containing tasks memory spread settings. If memory spreading
337is turned off, then the currently specified NUMA mempolicy once again 337is turned off, then the currently specified NUMA mempolicy once again
338applies to memory page allocations. 338applies to memory page allocations.
339 339
340Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag 340Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
341files. By default they contain "0", meaning that the feature is off 341files. By default they contain "0", meaning that the feature is off
342for that cpuset. If a "1" is written to that file, then that turns 342for that cpuset. If a "1" is written to that file, then that turns
343the named feature on. 343the named feature on.
344 344
345The implementation is simple. 345The implementation is simple.
346 346
347Setting the flag 'memory_spread_page' turns on a per-process flag 347Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
348PF_SPREAD_PAGE for each task that is in that cpuset or subsequently 348PF_SPREAD_PAGE for each task that is in that cpuset or subsequently
349joins that cpuset. The page allocation calls for the page cache 349joins that cpuset. The page allocation calls for the page cache
350is modified to perform an inline check for this PF_SPREAD_PAGE task 350is modified to perform an inline check for this PF_SPREAD_PAGE task
351flag, and if set, a call to a new routine cpuset_mem_spread_node() 351flag, and if set, a call to a new routine cpuset_mem_spread_node()
352returns the node to prefer for the allocation. 352returns the node to prefer for the allocation.
353 353
354Similarly, setting 'memory_spread_slab' turns on the flag 354Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
355PF_SPREAD_SLAB, and appropriately marked slab caches will allocate 355PF_SPREAD_SLAB, and appropriately marked slab caches will allocate
356pages from the node returned by cpuset_mem_spread_node(). 356pages from the node returned by cpuset_mem_spread_node().
357 357
@@ -404,24 +404,24 @@ the following two situations:
404 system overhead on those CPUs, including avoiding task load 404 system overhead on those CPUs, including avoiding task load
405 balancing if that is not needed. 405 balancing if that is not needed.
406 406
407When the per-cpuset flag "sched_load_balance" is enabled (the default 407When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
408setting), it requests that all the CPUs in that cpusets allowed 'cpus' 408setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
409be contained in a single sched domain, ensuring that load balancing 409be contained in a single sched domain, ensuring that load balancing
410can move a task (not otherwised pinned, as by sched_setaffinity) 410can move a task (not otherwised pinned, as by sched_setaffinity)
411from any CPU in that cpuset to any other. 411from any CPU in that cpuset to any other.
412 412
413When the per-cpuset flag "sched_load_balance" is disabled, then the 413When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
414scheduler will avoid load balancing across the CPUs in that cpuset, 414scheduler will avoid load balancing across the CPUs in that cpuset,
415--except-- in so far as is necessary because some overlapping cpuset 415--except-- in so far as is necessary because some overlapping cpuset
416has "sched_load_balance" enabled. 416has "sched_load_balance" enabled.
417 417
418So, for example, if the top cpuset has the flag "sched_load_balance" 418So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
419enabled, then the scheduler will have one sched domain covering all 419enabled, then the scheduler will have one sched domain covering all
420CPUs, and the setting of the "sched_load_balance" flag in any other 420CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
421cpusets won't matter, as we're already fully load balancing. 421cpusets won't matter, as we're already fully load balancing.
422 422
423Therefore in the above two situations, the top cpuset flag 423Therefore in the above two situations, the top cpuset flag
424"sched_load_balance" should be disabled, and only some of the smaller, 424"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
425child cpusets have this flag enabled. 425child cpusets have this flag enabled.
426 426
427When doing this, you don't usually want to leave any unpinned tasks in 427When doing this, you don't usually want to leave any unpinned tasks in
@@ -433,7 +433,7 @@ scheduler might not consider the possibility of load balancing that
433task to that underused CPU. 433task to that underused CPU.
434 434
435Of course, tasks pinned to a particular CPU can be left in a cpuset 435Of course, tasks pinned to a particular CPU can be left in a cpuset
436that disables "sched_load_balance" as those tasks aren't going anywhere 436that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
437else anyway. 437else anyway.
438 438
439There is an impedance mismatch here, between cpusets and sched domains. 439There is an impedance mismatch here, between cpusets and sched domains.
@@ -443,19 +443,19 @@ overlap and each CPU is in at most one sched domain.
443It is necessary for sched domains to be flat because load balancing 443It is necessary for sched domains to be flat because load balancing
444across partially overlapping sets of CPUs would risk unstable dynamics 444across partially overlapping sets of CPUs would risk unstable dynamics
445that would be beyond our understanding. So if each of two partially 445that would be beyond our understanding. So if each of two partially
446overlapping cpusets enables the flag 'sched_load_balance', then we 446overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
447form a single sched domain that is a superset of both. We won't move 447form a single sched domain that is a superset of both. We won't move
448a task to a CPU outside it cpuset, but the scheduler load balancing 448a task to a CPU outside it cpuset, but the scheduler load balancing
449code might waste some compute cycles considering that possibility. 449code might waste some compute cycles considering that possibility.
450 450
451This mismatch is why there is not a simple one-to-one relation 451This mismatch is why there is not a simple one-to-one relation
452between which cpusets have the flag "sched_load_balance" enabled, 452between which cpusets have the flag "cpuset.sched_load_balance" enabled,
453and the sched domain configuration. If a cpuset enables the flag, it 453and the sched domain configuration. If a cpuset enables the flag, it
454will get balancing across all its CPUs, but if it disables the flag, 454will get balancing across all its CPUs, but if it disables the flag,
455it will only be assured of no load balancing if no other overlapping 455it will only be assured of no load balancing if no other overlapping
456cpuset enables the flag. 456cpuset enables the flag.
457 457
458If two cpusets have partially overlapping 'cpus' allowed, and only 458If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
459one of them has this flag enabled, then the other may find its 459one of them has this flag enabled, then the other may find its
460tasks only partially load balanced, just on the overlapping CPUs. 460tasks only partially load balanced, just on the overlapping CPUs.
461This is just the general case of the top_cpuset example given a few 461This is just the general case of the top_cpuset example given a few
@@ -468,23 +468,23 @@ load balancing to the other CPUs.
4681.7.1 sched_load_balance implementation details. 4681.7.1 sched_load_balance implementation details.
469------------------------------------------------ 469------------------------------------------------
470 470
471The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary 471The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
472to most cpuset flags.) When enabled for a cpuset, the kernel will 472to most cpuset flags.) When enabled for a cpuset, the kernel will
473ensure that it can load balance across all the CPUs in that cpuset 473ensure that it can load balance across all the CPUs in that cpuset
474(makes sure that all the CPUs in the cpus_allowed of that cpuset are 474(makes sure that all the CPUs in the cpus_allowed of that cpuset are
475in the same sched domain.) 475in the same sched domain.)
476 476
477If two overlapping cpusets both have 'sched_load_balance' enabled, 477If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
478then they will be (must be) both in the same sched domain. 478then they will be (must be) both in the same sched domain.
479 479
480If, as is the default, the top cpuset has 'sched_load_balance' enabled, 480If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
481then by the above that means there is a single sched domain covering 481then by the above that means there is a single sched domain covering
482the whole system, regardless of any other cpuset settings. 482the whole system, regardless of any other cpuset settings.
483 483
484The kernel commits to user space that it will avoid load balancing 484The kernel commits to user space that it will avoid load balancing
485where it can. It will pick as fine a granularity partition of sched 485where it can. It will pick as fine a granularity partition of sched
486domains as it can while still providing load balancing for any set 486domains as it can while still providing load balancing for any set
487of CPUs allowed to a cpuset having 'sched_load_balance' enabled. 487of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
488 488
489The internal kernel cpuset to scheduler interface passes from the 489The internal kernel cpuset to scheduler interface passes from the
490cpuset code to the scheduler code a partition of the load balanced 490cpuset code to the scheduler code a partition of the load balanced
@@ -495,9 +495,9 @@ all the CPUs that must be load balanced.
495The cpuset code builds a new such partition and passes it to the 495The cpuset code builds a new such partition and passes it to the
496scheduler sched domain setup code, to have the sched domains rebuilt 496scheduler sched domain setup code, to have the sched domains rebuilt
497as necessary, whenever: 497as necessary, whenever:
498 - the 'sched_load_balance' flag of a cpuset with non-empty CPUs changes, 498 - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
499 - or CPUs come or go from a cpuset with this flag enabled, 499 - or CPUs come or go from a cpuset with this flag enabled,
500 - or 'sched_relax_domain_level' value of a cpuset with non-empty CPUs 500 - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
501 and with this flag enabled changes, 501 and with this flag enabled changes,
502 - or a cpuset with non-empty CPUs and with this flag enabled is removed, 502 - or a cpuset with non-empty CPUs and with this flag enabled is removed,
503 - or a cpu is offlined/onlined. 503 - or a cpu is offlined/onlined.
@@ -542,7 +542,7 @@ As the result, task B on CPU X need to wait task A or wait load balance
542on the next tick. For some applications in special situation, waiting 542on the next tick. For some applications in special situation, waiting
5431 tick may be too long. 5431 tick may be too long.
544 544
545The 'sched_relax_domain_level' file allows you to request changing 545The 'cpuset.sched_relax_domain_level' file allows you to request changing
546this searching range as you like. This file takes int value which 546this searching range as you like. This file takes int value which
547indicates size of searching range in levels ideally as follows, 547indicates size of searching range in levels ideally as follows,
548otherwise initial value -1 that indicates the cpuset has no request. 548otherwise initial value -1 that indicates the cpuset has no request.
@@ -559,8 +559,8 @@ The system default is architecture dependent. The system default
559can be changed using the relax_domain_level= boot parameter. 559can be changed using the relax_domain_level= boot parameter.
560 560
561This file is per-cpuset and affect the sched domain where the cpuset 561This file is per-cpuset and affect the sched domain where the cpuset
562belongs to. Therefore if the flag 'sched_load_balance' of a cpuset 562belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
563is disabled, then 'sched_relax_domain_level' have no effect since 563is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
564there is no sched domain belonging the cpuset. 564there is no sched domain belonging the cpuset.
565 565
566If multiple cpusets are overlapping and hence they form a single sched 566If multiple cpusets are overlapping and hence they form a single sched
@@ -607,9 +607,9 @@ from one cpuset to another, then the kernel will adjust the tasks
607memory placement, as above, the next time that the kernel attempts 607memory placement, as above, the next time that the kernel attempts
608to allocate a page of memory for that task. 608to allocate a page of memory for that task.
609 609
610If a cpuset has its 'cpus' modified, then each task in that cpuset 610If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
611will have its allowed CPU placement changed immediately. Similarly, 611will have its allowed CPU placement changed immediately. Similarly,
612if a tasks pid is written to another cpusets 'tasks' file, then its 612if a tasks pid is written to another cpusets 'cpuset.tasks' file, then its
613allowed CPU placement is changed immediately. If such a task had been 613allowed CPU placement is changed immediately. If such a task had been
614bound to some subset of its cpuset using the sched_setaffinity() call, 614bound to some subset of its cpuset using the sched_setaffinity() call,
615the task will be allowed to run on any CPU allowed in its new cpuset, 615the task will be allowed to run on any CPU allowed in its new cpuset,
@@ -622,8 +622,8 @@ and the processor placement is updated immediately.
622Normally, once a page is allocated (given a physical page 622Normally, once a page is allocated (given a physical page
623of main memory) then that page stays on whatever node it 623of main memory) then that page stays on whatever node it
624was allocated, so long as it remains allocated, even if the 624was allocated, so long as it remains allocated, even if the
625cpusets memory placement policy 'mems' subsequently changes. 625cpusets memory placement policy 'cpuset.mems' subsequently changes.
626If the cpuset flag file 'memory_migrate' is set true, then when 626If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
627tasks are attached to that cpuset, any pages that task had 627tasks are attached to that cpuset, any pages that task had
628allocated to it on nodes in its previous cpuset are migrated 628allocated to it on nodes in its previous cpuset are migrated
629to the tasks new cpuset. The relative placement of the page within 629to the tasks new cpuset. The relative placement of the page within
@@ -631,12 +631,12 @@ the cpuset is preserved during these migration operations if possible.
631For example if the page was on the second valid node of the prior cpuset 631For example if the page was on the second valid node of the prior cpuset
632then the page will be placed on the second valid node of the new cpuset. 632then the page will be placed on the second valid node of the new cpuset.
633 633
634Also if 'memory_migrate' is set true, then if that cpusets 634Also if 'cpuset.memory_migrate' is set true, then if that cpusets
635'mems' file is modified, pages allocated to tasks in that 635'cpuset.mems' file is modified, pages allocated to tasks in that
636cpuset, that were on nodes in the previous setting of 'mems', 636cpuset, that were on nodes in the previous setting of 'cpuset.mems',
637will be moved to nodes in the new setting of 'mems.' 637will be moved to nodes in the new setting of 'mems.'
638Pages that were not in the tasks prior cpuset, or in the cpusets 638Pages that were not in the tasks prior cpuset, or in the cpusets
639prior 'mems' setting, will not be moved. 639prior 'cpuset.mems' setting, will not be moved.
640 640
641There is an exception to the above. If hotplug functionality is used 641There is an exception to the above. If hotplug functionality is used
642to remove all the CPUs that are currently assigned to a cpuset, 642to remove all the CPUs that are currently assigned to a cpuset,
@@ -678,8 +678,8 @@ and then start a subshell 'sh' in that cpuset:
678 cd /dev/cpuset 678 cd /dev/cpuset
679 mkdir Charlie 679 mkdir Charlie
680 cd Charlie 680 cd Charlie
681 /bin/echo 2-3 > cpus 681 /bin/echo 2-3 > cpuset.cpus
682 /bin/echo 1 > mems 682 /bin/echo 1 > cpuset.mems
683 /bin/echo $$ > tasks 683 /bin/echo $$ > tasks
684 sh 684 sh
685 # The subshell 'sh' is now running in cpuset Charlie 685 # The subshell 'sh' is now running in cpuset Charlie
@@ -725,10 +725,13 @@ Now you want to do something with this cpuset.
725 725
726In this directory you can find several files: 726In this directory you can find several files:
727# ls 727# ls
728cpu_exclusive memory_migrate mems tasks 728cpuset.cpu_exclusive cpuset.memory_spread_slab
729cpus memory_pressure notify_on_release 729cpuset.cpus cpuset.mems
730mem_exclusive memory_spread_page sched_load_balance 730cpuset.mem_exclusive cpuset.sched_load_balance
731mem_hardwall memory_spread_slab sched_relax_domain_level 731cpuset.mem_hardwall cpuset.sched_relax_domain_level
732cpuset.memory_migrate notify_on_release
733cpuset.memory_pressure tasks
734cpuset.memory_spread_page
732 735
733Reading them will give you information about the state of this cpuset: 736Reading them will give you information about the state of this cpuset:
734the CPUs and Memory Nodes it can use, the processes that are using 737the CPUs and Memory Nodes it can use, the processes that are using
@@ -736,13 +739,13 @@ it, its properties. By writing to these files you can manipulate
736the cpuset. 739the cpuset.
737 740
738Set some flags: 741Set some flags:
739# /bin/echo 1 > cpu_exclusive 742# /bin/echo 1 > cpuset.cpu_exclusive
740 743
741Add some cpus: 744Add some cpus:
742# /bin/echo 0-7 > cpus 745# /bin/echo 0-7 > cpuset.cpus
743 746
744Add some mems: 747Add some mems:
745# /bin/echo 0-7 > mems 748# /bin/echo 0-7 > cpuset.mems
746 749
747Now attach your shell to this cpuset: 750Now attach your shell to this cpuset:
748# /bin/echo $$ > tasks 751# /bin/echo $$ > tasks
@@ -774,28 +777,28 @@ echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
774This is the syntax to use when writing in the cpus or mems files 777This is the syntax to use when writing in the cpus or mems files
775in cpuset directories: 778in cpuset directories:
776 779
777# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 780# /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4
778# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 781# /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4
779 782
780To add a CPU to a cpuset, write the new list of CPUs including the 783To add a CPU to a cpuset, write the new list of CPUs including the
781CPU to be added. To add 6 to the above cpuset: 784CPU to be added. To add 6 to the above cpuset:
782 785
783# /bin/echo 1-4,6 > cpus -> set cpus list to cpus 1,2,3,4,6 786# /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6
784 787
785Similarly to remove a CPU from a cpuset, write the new list of CPUs 788Similarly to remove a CPU from a cpuset, write the new list of CPUs
786without the CPU to be removed. 789without the CPU to be removed.
787 790
788To remove all the CPUs: 791To remove all the CPUs:
789 792
790# /bin/echo "" > cpus -> clear cpus list 793# /bin/echo "" > cpuset.cpus -> clear cpus list
791 794
7922.3 Setting flags 7952.3 Setting flags
793----------------- 796-----------------
794 797
795The syntax is very simple: 798The syntax is very simple:
796 799
797# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' 800# /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive'
798# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' 801# /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive'
799 802
8002.4 Attaching processes 8032.4 Attaching processes
801----------------------- 804-----------------------
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index 72db89ed0609..f7f68b2ac199 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -1,6 +1,6 @@
1Memory Resource Controller(Memcg) Implementation Memo. 1Memory Resource Controller(Memcg) Implementation Memo.
2Last Updated: 2009/1/20 2Last Updated: 2010/2
3Base Kernel Version: based on 2.6.29-rc2. 3Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
4 4
5Because VM is getting complex (one of reasons is memcg...), memcg's behavior 5Because VM is getting complex (one of reasons is memcg...), memcg's behavior
6is complex. This is a document for memcg's internal behavior. 6is complex. This is a document for memcg's internal behavior.
@@ -337,7 +337,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
337 race and lock dependency with other cgroup subsystems. 337 race and lock dependency with other cgroup subsystems.
338 338
339 example) 339 example)
340 # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices 340 # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
341 341
342 and do task move, mkdir, rmdir etc...under this. 342 and do task move, mkdir, rmdir etc...under this.
343 343
@@ -348,7 +348,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
348 348
349 For example, test like following is good. 349 For example, test like following is good.
350 (Shell-A) 350 (Shell-A)
351 # mount -t cgroup none /cgroup -t memory 351 # mount -t cgroup none /cgroup -o memory
352 # mkdir /cgroup/test 352 # mkdir /cgroup/test
353 # echo 40M > /cgroup/test/memory.limit_in_bytes 353 # echo 40M > /cgroup/test/memory.limit_in_bytes
354 # echo 0 > /cgroup/test/tasks 354 # echo 0 > /cgroup/test/tasks
@@ -378,3 +378,42 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
378 #echo 50M > memory.limit_in_bytes 378 #echo 50M > memory.limit_in_bytes
379 #echo 50M > memory.memsw.limit_in_bytes 379 #echo 50M > memory.memsw.limit_in_bytes
380 run 51M of malloc 380 run 51M of malloc
381
382 9.9 Move charges at task migration
383 Charges associated with a task can be moved along with task migration.
384
385 (Shell-A)
386 #mkdir /cgroup/A
387 #echo $$ >/cgroup/A/tasks
388 run some programs which uses some amount of memory in /cgroup/A.
389
390 (Shell-B)
391 #mkdir /cgroup/B
392 #echo 1 >/cgroup/B/memory.move_charge_at_immigrate
393 #echo "pid of the program running in group A" >/cgroup/B/tasks
394
395 You can see charges have been moved by reading *.usage_in_bytes or
396 memory.stat of both A and B.
397 See 8.2 of Documentation/cgroups/memory.txt to see what value should be
398 written to move_charge_at_immigrate.
399
400 9.10 Memory thresholds
401 Memory controler implements memory thresholds using cgroups notification
402 API. You can use Documentation/cgroups/cgroup_event_listener.c to test
403 it.
404
405 (Shell-A) Create cgroup and run event listener
406 # mkdir /cgroup/A
407 # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
408
409 (Shell-B) Add task to cgroup and try to allocate and free memory
410 # echo $$ >/cgroup/A/tasks
411 # a="$(dd if=/dev/zero bs=1M count=10)"
412 # a=
413
414 You will see message from cgroup_event_listener every time you cross
415 the thresholds.
416
417 Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
418
419 It's good idea to test root cgroup as well.
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index b871f2552b45..f8bc802d70b9 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -182,6 +182,8 @@ list.
182NOTE: Reclaim does not work for the root cgroup, since we cannot set any 182NOTE: Reclaim does not work for the root cgroup, since we cannot set any
183limits on the root cgroup. 183limits on the root cgroup.
184 184
185Note2: When panic_on_oom is set to "2", the whole system will panic.
186
1852. Locking 1872. Locking
186 188
187The memory controller uses the following hierarchy 189The memory controller uses the following hierarchy
@@ -262,10 +264,12 @@ some of the pages cached in the cgroup (page cache pages).
2624.2 Task migration 2644.2 Task migration
263 265
264When a task migrates from one cgroup to another, it's charge is not 266When a task migrates from one cgroup to another, it's charge is not
265carried forward. The pages allocated from the original cgroup still 267carried forward by default. The pages allocated from the original cgroup still
266remain charged to it, the charge is dropped when the page is freed or 268remain charged to it, the charge is dropped when the page is freed or
267reclaimed. 269reclaimed.
268 270
271Note: You can move charges of a task along with task migration. See 8.
272
2694.3 Removing a cgroup 2734.3 Removing a cgroup
270 274
271A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a 275A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
@@ -377,7 +381,8 @@ The feature can be disabled by
377NOTE1: Enabling/disabling will fail if the cgroup already has other 381NOTE1: Enabling/disabling will fail if the cgroup already has other
378cgroups created below it. 382cgroups created below it.
379 383
380NOTE2: This feature can be enabled/disabled per subtree. 384NOTE2: When panic_on_oom is set to "2", the whole system will panic in
385case of an oom event in any cgroup.
381 386
3827. Soft limits 3877. Soft limits
383 388
@@ -414,7 +419,76 @@ NOTE1: Soft limits take effect over a long period of time, since they involve
414NOTE2: It is recommended to set the soft limit always below the hard limit, 419NOTE2: It is recommended to set the soft limit always below the hard limit,
415 otherwise the hard limit will take precedence. 420 otherwise the hard limit will take precedence.
416 421
4178. TODO 4228. Move charges at task migration
423
424Users can move charges associated with a task along with task migration, that
425is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
426This feature is not supported in !CONFIG_MMU environments because of lack of
427page tables.
428
4298.1 Interface
430
431This feature is disabled by default. It can be enabled(and disabled again) by
432writing to memory.move_charge_at_immigrate of the destination cgroup.
433
434If you want to enable it:
435
436# echo (some positive value) > memory.move_charge_at_immigrate
437
438Note: Each bits of move_charge_at_immigrate has its own meaning about what type
439 of charges should be moved. See 8.2 for details.
440Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread
441 group.
442Note: If we cannot find enough space for the task in the destination cgroup, we
443 try to make space by reclaiming memory. Task migration may fail if we
444 cannot make enough space.
445Note: It can take several seconds if you move charges in giga bytes order.
446
447And if you want disable it again:
448
449# echo 0 > memory.move_charge_at_immigrate
450
4518.2 Type of charges which can be move
452
453Each bits of move_charge_at_immigrate has its own meaning about what type of
454charges should be moved.
455
456 bit | what type of charges would be moved ?
457 -----+------------------------------------------------------------------------
458 0 | A charge of an anonymous page(or swap of it) used by the target task.
459 | Those pages and swaps must be used only by the target task. You must
460 | enable Swap Extension(see 2.4) to enable move of swap charges.
461
462Note: Those pages and swaps must be charged to the old cgroup.
463Note: More type of pages(e.g. file cache, shmem,) will be supported by other
464 bits in future.
465
4668.3 TODO
467
468- Add support for other types of pages(e.g. file cache, shmem, etc.).
469- Implement madvise(2) to let users decide the vma to be moved or not to be
470 moved.
471- All of moving charge operations are done under cgroup_mutex. It's not good
472 behavior to hold the mutex too long, so we may need some trick.
473
4749. Memory thresholds
475
476Memory controler implements memory thresholds using cgroups notification
477API (see cgroups.txt). It allows to register multiple memory and memsw
478thresholds and gets notifications when it crosses.
479
480To register a threshold application need:
481 - create an eventfd using eventfd(2);
482 - open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
483 - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to
484 cgroup.event_control.
485
486Application will be notified through eventfd when memory usage crosses
487threshold in any direction.
488
489It's applicable for root and non-root cgroup.
490
49110. TODO
418 492
4191. Add support for accounting huge pages (as a separate controller) 4931. Add support for accounting huge pages (as a separate controller)
4202. Make per-cgroup scanner reclaim not-shared pages first 4942. Make per-cgroup scanner reclaim not-shared pages first