author     Linus Torvalds <torvalds@linux-foundation.org>   2019-07-09 00:35:12 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2019-07-09 00:35:12 -0400
commit     92c1d6522135050cb377a18cc6e30d08dfb87efb
tree       5d2fa3051c975f1c459b6949f9e71cac2edf74de
parent     df2a40f549e6b73aad98b0c03f400c00d284816b
parent     99c8b231ae6c6ca4ca2fd1c0b3701071f589661f
Merge branch 'for-5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
"Documentation updates and the addition of cgroup_parse_float() which
will be used by new controllers including blk-iocost"
* 'for-5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
docs: cgroup-v1: convert docs to ReST and rename to *.rst
cgroup: Move cgroup_parse_float() implementation out of CONFIG_SYSFS
cgroup: add cgroup_parse_float()
37 files changed, 1005 insertions, 643 deletions
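The helper highlighted in the pull message, cgroup_parse_float(), lets controllers accept decimal-fraction input such as the "13.40" percentage convention the first hunk below adds to cgroup-v2.rst. As a reading aid, here is a minimal user-space C sketch of the same semantics (an assumption-laden illustration, not the kernel implementation: it models a helper that scales a "WHOLE.FRAC" string to dec_shift fractional digits; the in-kernel signature and error handling may differ)::

  #include <errno.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Hypothetical user-space model of cgroup_parse_float() semantics:
   * parse "WHOLE.FRAC" into *v scaled by 10^dec_shift. */
  static int parse_float(const char *input, unsigned int dec_shift, int64_t *v)
  {
      int64_t whole = 0, frac = 0;
      unsigned int flen = 0, i;
      const char *p = input;

      if (*p < '0' || *p > '9')
          return -EINVAL;
      while (*p >= '0' && *p <= '9')          /* integer part */
          whole = whole * 10 + (*p++ - '0');
      if (*p == '.') {                        /* optional fraction */
          p++;
          while (*p >= '0' && *p <= '9' && flen < dec_shift) {
              frac = frac * 10 + (*p++ - '0');
              flen++;
          }
      }
      if (*p)                                 /* junk or too many digits */
          return -EINVAL;
      for (i = flen; i < dec_shift; i++)      /* pad to dec_shift digits */
          frac *= 10;
      for (i = 0; i < dec_shift; i++)
          whole *= 10;
      *v = whole + frac;
      return 0;
  }

  int main(void)
  {
      int64_t v;

      /* the "13.40" percentage convention, two fractional digits */
      if (!parse_float("13.40", 2, &v))
          printf("%lld\n", (long long)v);     /* prints 1340 */
      return 0;
  }

With dec_shift=2 a controller can store the percentage as an exact integer (1340) and avoid floating point in the kernel entirely.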
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index cf88c1f98270..a5c845338d6d 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -705,6 +705,12 @@ Conventions
 informational files on the root cgroup which end up showing global
 information available elsewhere shouldn't exist.
 
+- The default time unit is microseconds. If a different unit is ever
+  used, an explicit unit suffix must be present.
+
+- A parts-per quantity should use a percentage decimal with at least
+  two digit fractional part - e.g. 13.40.
+
 - If a controller implements weight based resource distribution, its
   interface file should be named "weight" and have the range [1,
   10000] with 100 as the default. The values are chosen to allow
diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst
index 31653a9f0e1b..656aee262e23 100644
--- a/Documentation/admin-guide/hw-vuln/l1tf.rst
+++ b/Documentation/admin-guide/hw-vuln/l1tf.rst
@@ -241,7 +241,7 @@ Guest mitigation mechanisms
 For further information about confining guests to a single or to a group
 of cores consult the cpusets documentation:
 
-   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
+   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.rst
 
 .. _interrupt_isolation:
 
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index e6e806285703..74d28efa1c40 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4084,7 +4084,7 @@
 
 	relax_domain_level=
 			[KNL, SMP] Set scheduler's default relax_domain_level.
-			See Documentation/cgroup-v1/cpusets.txt.
+			See Documentation/cgroup-v1/cpusets.rst.
 
 	reserve=	[KNL,BUGS] Force kernel to ignore I/O ports or memory
 			Format: <base1>,<size1>[,<base2>,<size2>,...]
@@ -4594,7 +4594,7 @@
 	swapaccount=[0|1]
 			[KNL] Enable accounting of swap in memory resource
 			controller if no parameter or 1 is given or disable
-			it if 0 is given (See Documentation/cgroup-v1/memory.txt)
+			it if 0 is given (See Documentation/cgroup-v1/memory.rst)
 
 	swiotlb=	[ARM,IA-64,PPC,MIPS,X86]
 			Format: { <int> | force | noforce }
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index d78c5b315f72..546f174e5d6a 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy
 support.
 
 Memory policies should not be confused with cpusets
-(``Documentation/cgroup-v1/cpusets.txt``)
+(``Documentation/cgroup-v1/cpusets.rst``)
 which is an administrative mechanism for restricting the nodes from which
 memory may be allocated by a set of processes. Memory policies are a
 programming interface that a NUMA-aware application can take advantage of. When
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index 1a0f2ac02eb6..b2265cf6c9c3 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -539,7 +539,7 @@ As for cgroups-v1 (blkio controller), the exact set of stat files
 created, and kept up-to-date by bfq, depends on whether
 CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all
 the stat files documented in
-Documentation/cgroup-v1/blkio-controller.txt. If, instead,
+Documentation/cgroup-v1/blkio-controller.rst. If, instead,
 CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files
 blkio.bfq.io_service_bytes
 blkio.bfq.io_service_bytes_recursive
diff --git a/Documentation/cgroup-v1/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.rst
index d1a1b7bdd03a..fd3184537d23 100644
--- a/Documentation/cgroup-v1/blkio-controller.txt
+++ b/Documentation/cgroup-v1/blkio-controller.rst
@@ -1,5 +1,7 @@
-Block IO Controller
-===================
+===================
+Block IO Controller
+===================
+
 Overview
 ========
 cgroup subsys "blkio" implements the block io controller. There seems to be
@@ -17,24 +19,27 @@ HOWTO
 =====
 Throttling/Upper Limit policy
 -----------------------------
-- Enable Block IO controller
+- Enable Block IO controller::
+
 	CONFIG_BLK_CGROUP=y
 
-- Enable throttling in block layer
+- Enable throttling in block layer::
+
 	CONFIG_BLK_DEV_THROTTLING=y
 
-- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)
+- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
+
 	mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
 
 - Specify a bandwidth rate on particular device for root group. The format
-  for policy is "<major>:<minor> <bytes_per_second>".
+  for policy is "<major>:<minor> <bytes_per_second>"::
 
 	echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
 
   Above will put a limit of 1MB/second on reads happening for root group
   on device having major/minor number 8:16.
 
-- Run dd to read a file and see if rate is throttled to 1MB/s or not.
+- Run dd to read a file and see if rate is throttled to 1MB/s or not::
 
 	# dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
 	1024+0 records in
@@ -51,7 +56,7 @@ throttling's hierarchy support is enabled iff "sane_behavior" is
 enabled from cgroup side, which currently is a development option and
 not publicly available.
 
-If somebody created a hierarchy like as follows.
+If somebody created a hierarchy like as follows::
 
 			root
 			/  \
@@ -66,7 +71,7 @@ directly generated by tasks in that cgroup.
 
 Throttling without "sane_behavior" enabled from cgroup side will
 practically treat all groups at same level as if it looks like the
-following.
+following::
 
 		pivot
 	     /  /   \  \
@@ -99,27 +104,31 @@ Proportional weight policy files
 These rules override the default value of group weight as specified
 by blkio.weight.
 
-Following is the format.
+Following is the format::
+
+  # echo dev_maj:dev_minor weight > blkio.weight_device
+
+Configure weight=300 on /dev/sdb (8:16) in this cgroup::
+
+  # echo 8:16 300 > blkio.weight_device
+  # cat blkio.weight_device
+  dev     weight
+  8:16    300
+
+Configure weight=500 on /dev/sda (8:0) in this cgroup::
 
-# echo dev_maj:dev_minor weight > blkio.weight_device
-Configure weight=300 on /dev/sdb (8:16) in this cgroup
-# echo 8:16 300 > blkio.weight_device
-# cat blkio.weight_device
-dev     weight
-8:16    300
+  # echo 8:0 500 > blkio.weight_device
+  # cat blkio.weight_device
+  dev     weight
+  8:0     500
+  8:16    300
 
-Configure weight=500 on /dev/sda (8:0) in this cgroup
-# echo 8:0 500 > blkio.weight_device
-# cat blkio.weight_device
-dev     weight
-8:0     500
-8:16    300
+Remove specific weight for /dev/sda in this cgroup::
 
-Remove specific weight for /dev/sda in this cgroup
-# echo 8:0 0 > blkio.weight_device
-# cat blkio.weight_device
-dev     weight
-8:16    300
+  # echo 8:0 0 > blkio.weight_device
+  # cat blkio.weight_device
+  dev     weight
+  8:16    300
 
 - blkio.leaf_weight[_device]
 	- Equivalents of blkio.weight[_device] for the purpose of
@@ -244,30 +253,30 @@ Throttling/Upper limit policy files
 - blkio.throttle.read_bps_device
 	- Specifies upper limit on READ rate from the device. IO rate is
 	  specified in bytes per second. Rules are per device. Following is
-	  the format.
+	  the format::
 
 	  echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
 
 - blkio.throttle.write_bps_device
 	- Specifies upper limit on WRITE rate to the device. IO rate is
 	  specified in bytes per second. Rules are per device. Following is
-	  the format.
+	  the format::
 
 	  echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
 
 - blkio.throttle.read_iops_device
 	- Specifies upper limit on READ rate from the device. IO rate is
 	  specified in IO per second. Rules are per device. Following is
-	  the format.
+	  the format::
 
 	  echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
 
 - blkio.throttle.write_iops_device
 	- Specifies upper limit on WRITE rate to the device. IO rate is
 	  specified in io per second. Rules are per device. Following is
-	  the format.
+	  the format::
 
 	  echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
 
 Note: If both BW and IOPS rules are specified for a device, then IO is
 subjected to both the constraints.
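All four throttle files above share the "<major>:<minor> <value>" record format, one rule per write. A hedged C sketch of installing the HOWTO's 1MB/s read limit for device 8:16 from a program (the mount point and device numbers are just the example values from the text, not anything mandated by the interface)::

  #include <stdio.h>
  #include <fcntl.h>
  #include <unistd.h>

  /* Sketch: mirror "echo 8:16 1048576 > .../blkio.throttle.read_bps_device". */
  int main(void)
  {
      const char *path =
          "/sys/fs/cgroup/blkio/blkio.throttle.read_bps_device";
      char rule[64];
      int fd, len;

      /* "<major>:<minor> <rate_bytes_per_second>" */
      len = snprintf(rule, sizeof(rule), "%d:%d %llu\n", 8, 16, 1048576ULL);

      fd = open(path, O_WRONLY);
      if (fd < 0) {
          perror("open");
          return 1;
      }
      /* One rule per write(); check the result, since the kernel can only
       * report an error per write() call. */
      if (write(fd, rule, len) != len)
          perror("write");
      close(fd);
      return 0;
  }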
diff --git a/Documentation/cgroup-v1/cgroups.txt b/Documentation/cgroup-v1/cgroups.rst
index 059f7063eea6..46bbe7e022d4 100644
--- a/Documentation/cgroup-v1/cgroups.txt
+++ b/Documentation/cgroup-v1/cgroups.rst
@@ -1,35 +1,39 @@
-CGROUPS
--------
+==============
+Control Groups
+==============
 
 Written by Paul Menage <menage@google.com> based on
-Documentation/cgroup-v1/cpusets.txt
+Documentation/cgroup-v1/cpusets.rst
 
 Original copyright statements from cpusets.txt:
+
 Portions Copyright (C) 2004 BULL SA.
+
 Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+
 Modified by Paul Jackson <pj@sgi.com>
+
 Modified by Christoph Lameter <cl@linux.com>
 
-CONTENTS:
-=========
-
-1. Control Groups
-  1.1 What are cgroups ?
-  1.2 Why are cgroups needed ?
-  1.3 How are cgroups implemented ?
-  1.4 What does notify_on_release do ?
-  1.5 What does clone_children do ?
-  1.6 How do I use cgroups ?
-2. Usage Examples and Syntax
-  2.1 Basic Usage
-  2.2 Attaching processes
-  2.3 Mounting hierarchies by name
-3. Kernel API
-  3.1 Overview
-  3.2 Synchronization
-  3.3 Subsystem API
-4. Extended attributes usage
-5. Questions
+.. CONTENTS:
+
+   1. Control Groups
+     1.1 What are cgroups ?
+     1.2 Why are cgroups needed ?
+     1.3 How are cgroups implemented ?
+     1.4 What does notify_on_release do ?
+     1.5 What does clone_children do ?
+     1.6 How do I use cgroups ?
+   2. Usage Examples and Syntax
+     2.1 Basic Usage
+     2.2 Attaching processes
+     2.3 Mounting hierarchies by name
+   3. Kernel API
+     3.1 Overview
+     3.2 Synchronization
+     3.3 Subsystem API
+   4. Extended attributes usage
+   5. Questions
 
 1. Control Groups
 =================
@@ -72,7 +76,7 @@ On their own, the only use for cgroups is for simple job
 tracking. The intention is that other subsystems hook into the generic
 cgroup support to provide new attributes for cgroups, such as
 accounting/limiting the resources which processes in a cgroup can
-access. For example, cpusets (see Documentation/cgroup-v1/cpusets.txt) allow
+access. For example, cpusets (see Documentation/cgroup-v1/cpusets.rst) allow
 you to associate a set of CPUs and a set of memory nodes with the
 tasks in each cgroup.
 
@@ -108,7 +112,7 @@ As an example of a scenario (originally proposed by vatsa@in.ibm.com)
 that can benefit from multiple hierarchies, consider a large
 university server with various users - students, professors, system
 tasks etc. The resource planning for this server could be along the
-following lines:
+following lines::
 
 	CPU :		"Top cpuset"
 			/	\
@@ -136,7 +140,7 @@ depending on who launched it (prof/student).
 With the ability to classify tasks differently for different resources
 (by putting those resource subsystems in different hierarchies),
 the admin can easily set up a script which receives exec notifications
-and depending on who is launching the browser he can
+and depending on who is launching the browser he can::
 
 	# echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
 
@@ -151,7 +155,7 @@ wants to do online gaming :)) OR give one of the student's simulation
 apps enhanced CPU power.
 
 With ability to write PIDs directly to resource classes, it's just a
-matter of:
+matter of::
 
 	# echo pid > /sys/fs/cgroup/network/<new_class>/tasks
 	(after some time)
@@ -306,7 +310,7 @@ configuration from the parent during initialization.
 --------------------------
 
 To start a new job that is to be contained within a cgroup, using
-the "cpuset" cgroup subsystem, the steps are something like:
+the "cpuset" cgroup subsystem, the steps are something like::
 
  1) mount -t tmpfs cgroup_root /sys/fs/cgroup
  2) mkdir /sys/fs/cgroup/cpuset
@@ -320,7 +324,7 @@ the "cpuset" cgroup subsystem, the steps are something like:
 
 For example, the following sequence of commands will setup a cgroup
 named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cgroup:
+and then start a subshell 'sh' in that cgroup::
 
 	mount -t tmpfs cgroup_root /sys/fs/cgroup
 	mkdir /sys/fs/cgroup/cpuset
@@ -345,8 +349,9 @@ and then start a subshell 'sh' in that cgroup:
 Creating, modifying, using cgroups can be done through the cgroup
 virtual filesystem.
 
-To mount a cgroup hierarchy with all available subsystems, type:
-# mount -t cgroup xxx /sys/fs/cgroup
+To mount a cgroup hierarchy with all available subsystems, type::
+
+  # mount -t cgroup xxx /sys/fs/cgroup
 
 The "xxx" is not interpreted by the cgroup code, but will appear in
 /proc/mounts so may be any useful identifying string that you like.
@@ -355,18 +360,19 @@ Note: Some subsystems do not work without some user input first. For instance,
 if cpusets are enabled the user will have to populate the cpus and mems files
 for each new cgroup created before that group can be used.
 
-As explained in section `1.2 Why are cgroups needed?' you should create
+As explained in section `1.2 Why are cgroups needed?` you should create
 different hierarchies of cgroups for each single resource or group of
 resources you want to control. Therefore, you should mount a tmpfs on
 /sys/fs/cgroup and create directories for each cgroup resource or resource
-group.
+group::
 
 # mount -t tmpfs cgroup_root /sys/fs/cgroup
 # mkdir /sys/fs/cgroup/rg1
 
 To mount a cgroup hierarchy with just the cpuset and memory
-subsystems, type:
-# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
+subsystems, type::
+
+  # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
 
 While remounting cgroups is currently supported, it is not recommend
 to use it. Remounting allows changing bound subsystems and
@@ -375,9 +381,10 @@ hierarchy is empty and release_agent itself should be replaced with
 conventional fsnotify. The support for remounting will be removed in
 the future.
 
-To Specify a hierarchy's release_agent:
-# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
-xxx /sys/fs/cgroup/rg1
+To Specify a hierarchy's release_agent::
+
+  # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
+    xxx /sys/fs/cgroup/rg1
 
 Note that specifying 'release_agent' more than once will return failure.
 
@@ -390,32 +397,39 @@ Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
 tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
 is the cgroup that holds the whole system.
 
-If you want to change the value of release_agent:
-# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
+If you want to change the value of release_agent::
+
+  # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
 
 It can also be changed via remount.
 
-If you want to create a new cgroup under /sys/fs/cgroup/rg1:
-# cd /sys/fs/cgroup/rg1
-# mkdir my_cgroup
+If you want to create a new cgroup under /sys/fs/cgroup/rg1::
+
+  # cd /sys/fs/cgroup/rg1
+  # mkdir my_cgroup
+
+Now you want to do something with this cgroup:
+
+  # cd my_cgroup
 
-Now you want to do something with this cgroup.
-# cd my_cgroup
-
-In this directory you can find several files:
-# ls
-cgroup.procs notify_on_release tasks
-(plus whatever files added by the attached subsystems)
+In this directory you can find several files::
+
+  # ls
+  cgroup.procs notify_on_release tasks
+  (plus whatever files added by the attached subsystems)
 
-Now attach your shell to this cgroup:
-# /bin/echo $$ > tasks
+Now attach your shell to this cgroup::
+
+  # /bin/echo $$ > tasks
 
 You can also create cgroups inside your cgroup by using mkdir in this
-directory.
-# mkdir my_sub_cs
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir::
 
-To remove a cgroup, just use rmdir:
-# rmdir my_sub_cs
+  # rmdir my_sub_cs
 
 This will fail if the cgroup is in use (has cgroups inside, or
 has processes attached, or is held alive by other subsystem-specific
424 | 2.2 Attaching processes | 438 | 2.2 Attaching processes |
425 | ----------------------- | 439 | ----------------------- |
426 | 440 | ||
427 | # /bin/echo PID > tasks | 441 | :: |
442 | |||
443 | # /bin/echo PID > tasks | ||
428 | 444 | ||
429 | Note that it is PID, not PIDs. You can only attach ONE task at a time. | 445 | Note that it is PID, not PIDs. You can only attach ONE task at a time. |
430 | If you have several tasks to attach, you have to do it one after another: | 446 | If you have several tasks to attach, you have to do it one after another:: |
431 | 447 | ||
432 | # /bin/echo PID1 > tasks | 448 | # /bin/echo PID1 > tasks |
433 | # /bin/echo PID2 > tasks | 449 | # /bin/echo PID2 > tasks |
434 | ... | 450 | ... |
435 | # /bin/echo PIDn > tasks | 451 | # /bin/echo PIDn > tasks |
436 | 452 | ||
437 | You can attach the current shell task by echoing 0: | 453 | You can attach the current shell task by echoing 0:: |
438 | 454 | ||
439 | # echo 0 > tasks | 455 | # echo 0 > tasks |
440 | 456 | ||
441 | You can use the cgroup.procs file instead of the tasks file to move all | 457 | You can use the cgroup.procs file instead of the tasks file to move all |
442 | threads in a threadgroup at once. Echoing the PID of any task in a | 458 | threads in a threadgroup at once. Echoing the PID of any task in a |
@@ -529,7 +545,7 @@ Each subsystem may export the following methods. The only mandatory
 methods are css_alloc/free. Any others that are null are presumed to
 be successful no-ops.
 
-struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
+``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)``
 (cgroup_mutex held by caller)
 
 Called to allocate a subsystem state object for a cgroup. The
@@ -544,7 +560,7 @@ identified by the passed cgroup object having a NULL parent (since
 it's the root of the hierarchy) and may be an appropriate place for
 initialization code.
 
-int css_online(struct cgroup *cgrp)
+``int css_online(struct cgroup *cgrp)``
 (cgroup_mutex held by caller)
 
 Called after @cgrp successfully completed all allocations and made
@@ -554,7 +570,7 @@ callback can be used to implement reliable state sharing and
 propagation along the hierarchy. See the comment on
 cgroup_for_each_descendant_pre() for details.
 
-void css_offline(struct cgroup *cgrp);
+``void css_offline(struct cgroup *cgrp);``
 (cgroup_mutex held by caller)
 
 This is the counterpart of css_online() and called iff css_online()
@@ -564,7 +580,7 @@ all references it's holding on @cgrp. When all references are dropped,
 cgroup removal will proceed to the next step - css_free(). After this
 callback, @cgrp should be considered dead to the subsystem.
 
-void css_free(struct cgroup *cgrp)
+``void css_free(struct cgroup *cgrp)``
 (cgroup_mutex held by caller)
 
 The cgroup system is about to free @cgrp; the subsystem should free
@@ -573,7 +589,7 @@ is completely unused; @cgrp->parent is still valid. (Note - can also
 be called for a newly-created cgroup if an error occurs after this
 subsystem's create() method has been called for the new cgroup).
 
-int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
 (cgroup_mutex held by caller)
 
 Called prior to moving one or more tasks into a cgroup; if the
@@ -594,7 +610,7 @@ fork. If this method returns 0 (success) then this should remain valid
 while the caller holds cgroup_mutex and it is ensured that either
 attach() or cancel_attach() will be called in future.
 
-void css_reset(struct cgroup_subsys_state *css)
+``void css_reset(struct cgroup_subsys_state *css)``
 (cgroup_mutex held by caller)
 
 An optional operation which should restore @css's configuration to the
@@ -608,7 +624,7 @@ This prevents unexpected resource control from a hidden css and
 ensures that the configuration is in the initial state when it is made
 visible again later.
 
-void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
 (cgroup_mutex held by caller)
 
 Called when a task attach operation has failed after can_attach() has succeeded.
@@ -617,26 +633,26 @@ function, so that the subsystem can implement a rollback. If not, not necessary.
 This will be called only about subsystems whose can_attach() operation have
 succeeded. The parameters are identical to can_attach().
 
-void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
 (cgroup_mutex held by caller)
 
 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
 The parameters are identical to can_attach().
 
-void fork(struct task_struct *task)
+``void fork(struct task_struct *task)``
 
 Called when a task is forked into a cgroup.
 
-void exit(struct task_struct *task)
+``void exit(struct task_struct *task)``
 
 Called during task exit.
 
-void free(struct task_struct *task)
+``void free(struct task_struct *task)``
 
 Called when the task_struct is freed.
 
-void bind(struct cgroup *root)
+``void bind(struct cgroup *root)``
 (cgroup_mutex held by caller)
 
 Called when a cgroup subsystem is rebound to a different hierarchy
@@ -649,6 +665,7 @@ that is being created/destroyed (and hence has no sub-cgroups).
 
 cgroup filesystem supports certain types of extended attributes in its
 directories and files. The current supported types are:
+
 - Trusted (XATTR_TRUSTED)
 - Security (XATTR_SECURITY)
 
@@ -666,12 +683,13 @@ in containers and systemd for assorted meta data like main PID in a cgroup
 5. Questions
 ============
 
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
-   errors. If you use it in the cgroup file system, you won't be
-   able to tell whether a command succeeded or failed.
+::
 
-Q: When I attach processes, only the first of the line gets really attached !
-A: We can only return one error code per call to write(). So you should also
-   put only ONE PID.
+  Q: what's up with this '/bin/echo' ?
+  A: bash's builtin 'echo' command does not check calls to write() against
+     errors. If you use it in the cgroup file system, you won't be
+     able to tell whether a command succeeded or failed.
 
+  Q: When I attach processes, only the first of the line gets really attached !
+  A: We can only return one error code per call to write(). So you should also
+     put only ONE PID.
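The Q&A just converted is also the reason attach loops must be written one PID per write(): the kernel can only return one error code per call. A small illustrative C sketch (the cgroup path and PIDs are hypothetical example values)::

  #include <stdio.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <sys/types.h>

  /* Sketch: attach PIDs to a cgroup one write() at a time, checking each
   * result, exactly because only one error code comes back per write(). */
  static int attach_pid(const char *tasks_path, pid_t pid)
  {
      char buf[32];
      int fd, len, ret = 0;

      fd = open(tasks_path, O_WRONLY);
      if (fd < 0)
          return -1;
      len = snprintf(buf, sizeof(buf), "%d\n", pid);
      if (write(fd, buf, len) != len)
          ret = -1;       /* this PID failed; the others may still be fine */
      close(fd);
      return ret;
  }

  int main(void)
  {
      const pid_t pids[] = { 1234, 1235, 1236 };      /* example PIDs */
      unsigned int i;

      for (i = 0; i < sizeof(pids) / sizeof(pids[0]); i++)
          if (attach_pid("/sys/fs/cgroup/rg1/my_cgroup/tasks", pids[i]))
              fprintf(stderr, "failed to attach %d\n", pids[i]);
      return 0;
  }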
diff --git a/Documentation/cgroup-v1/cpuacct.txt b/Documentation/cgroup-v1/cpuacct.rst
index 9d73cc0cadb9..d30ed81d2ad7 100644
--- a/Documentation/cgroup-v1/cpuacct.txt
+++ b/Documentation/cgroup-v1/cpuacct.rst
@@ -1,5 +1,6 @@
+=========================
 CPU Accounting Controller
--------------------------
+=========================
 
 The CPU accounting controller is used to group tasks using cgroups and
 account the CPU usage of these groups of tasks.
@@ -8,9 +9,9 @@ The CPU accounting controller supports multi-hierarchy groups. An accounting
 group accumulates the CPU usage of all of its child groups and the tasks
 directly present in its group.
 
-Accounting groups can be created by first mounting the cgroup filesystem.
+Accounting groups can be created by first mounting the cgroup filesystem::
 
-# mount -t cgroup -ocpuacct none /sys/fs/cgroup
+  # mount -t cgroup -ocpuacct none /sys/fs/cgroup
 
 With the above step, the initial or the parent accounting group becomes
 visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
@@ -19,11 +20,11 @@ the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
 by this group which is essentially the CPU time obtained by all the tasks
 in the system.
 
-New accounting groups can be created under the parent group /sys/fs/cgroup.
+New accounting groups can be created under the parent group /sys/fs/cgroup::
 
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
+  # cd /sys/fs/cgroup
+  # mkdir g1
+  # echo $$ > g1/tasks
 
 The above steps create a new group g1 and move the current shell
 process (bash) into it. CPU time consumed by this bash and its children
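Once a group such as g1 exists, its accumulated usage can be read back; a sketch reading cpuacct.usage, which holds a single value, the group's total CPU time in nanoseconds (the path follows the mount example above and is an assumption, not fixed API)::

  #include <stdio.h>
  #include <inttypes.h>

  /* Sketch: read the accumulated CPU time of accounting group g1. */
  int main(void)
  {
      FILE *f = fopen("/sys/fs/cgroup/g1/cpuacct.usage", "r");
      uint64_t ns;

      if (!f) {
          perror("fopen");
          return 1;
      }
      if (fscanf(f, "%" SCNu64, &ns) == 1)
          printf("g1 used %.3f seconds of CPU time\n", ns / 1e9);
      fclose(f);
      return 0;
  }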
diff --git a/Documentation/cgroup-v1/cpusets.txt b/Documentation/cgroup-v1/cpusets.rst
index 8402dd6de8df..b6a42cdea72b 100644
--- a/Documentation/cgroup-v1/cpusets.txt
+++ b/Documentation/cgroup-v1/cpusets.rst
@@ -1,35 +1,36 @@
-CPUSETS
--------
+=======
+CPUSETS
+=======
 
 Copyright (C) 2004 BULL SA.
-Written by Simon.Derr@bull.net
-
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <cl@linux.com>
-Modified by Paul Menage <menage@google.com>
-Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
 
-CONTENTS:
-=========
+Written by Simon.Derr@bull.net
 
-1. Cpusets
-  1.1 What are cpusets ?
-  1.2 Why are cpusets needed ?
-  1.3 How are cpusets implemented ?
-  1.4 What are exclusive cpusets ?
-  1.5 What is memory_pressure ?
-  1.6 What is memory spread ?
-  1.7 What is sched_load_balance ?
-  1.8 What is sched_relax_domain_level ?
-  1.9 How do I use cpusets ?
-2. Usage Examples and Syntax
-  2.1 Basic Usage
-  2.2 Adding/removing cpus
-  2.3 Setting flags
-  2.4 Attaching processes
-3. Questions
-4. Contact
+- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+- Modified by Paul Jackson <pj@sgi.com>
+- Modified by Christoph Lameter <cl@linux.com>
+- Modified by Paul Menage <menage@google.com>
+- Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+.. CONTENTS:
+
+   1. Cpusets
+     1.1 What are cpusets ?
+     1.2 Why are cpusets needed ?
+     1.3 How are cpusets implemented ?
+     1.4 What are exclusive cpusets ?
+     1.5 What is memory_pressure ?
+     1.6 What is memory spread ?
+     1.7 What is sched_load_balance ?
+     1.8 What is sched_relax_domain_level ?
+     1.9 How do I use cpusets ?
+   2. Usage Examples and Syntax
+     2.1 Basic Usage
+     2.2 Adding/removing cpus
+     2.3 Setting flags
+     2.4 Attaching processes
+   3. Questions
+   4. Contact
 
 1. Cpusets
 ==========
@@ -48,7 +49,7 @@ hooks, beyond what is already present, required to manage dynamic
 job placement on large systems.
 
 Cpusets use the generic cgroup subsystem described in
-Documentation/cgroup-v1/cgroups.txt.
+Documentation/cgroup-v1/cgroups.rst.
 
 Requests by a task, using the sched_setaffinity(2) system call to
 include CPUs in its CPU affinity mask, and using the mbind(2) and
@@ -157,7 +158,7 @@ modifying cpusets is via this cpuset file system.
 The /proc/<pid>/status file for each task has four added lines,
 displaying the task's cpus_allowed (on which CPUs it may be scheduled)
 and mems_allowed (on which Memory Nodes it may obtain memory),
-in the two formats seen in the following example:
+in the two formats seen in the following example::
 
   Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
   Cpus_allowed_list:      0-127
@@ -181,6 +182,7 @@ files describing that cpuset:
 - cpuset.sched_relax_domain_level: the searching range when migrating tasks
 
 In addition, only the root cpuset has the following file:
+
 - cpuset.memory_pressure_enabled flag: compute memory_pressure?
 
 New cpusets are created using the mkdir system call or shell
@@ -266,7 +268,8 @@ to monitor a cpuset for signs of memory pressure. It's up to the
 batch manager or other user code to decide what to do about it and
 take action.
 
-==> Unless this feature is enabled by writing "1" to the special file
+==>
+    Unless this feature is enabled by writing "1" to the special file
     /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
     code of __alloc_pages() for this metric reduces to simply noticing
     that the cpuset_memory_pressure_enabled flag is zero. So only
@@ -399,6 +402,7 @@ have tasks running on them unless explicitly assigned.
 
 This default load balancing across all CPUs is not well suited for
 the following two situations:
+
 1) On large systems, load balancing across many CPUs is expensive.
    If the system is managed using cpusets to place independent jobs
    on separate sets of CPUs, full load balancing is unnecessary.
@@ -501,6 +505,7 @@ all the CPUs that must be load balanced.
 The cpuset code builds a new such partition and passes it to the
 scheduler sched domain setup code, to have the sched domains rebuilt
 as necessary, whenever:
+
 - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
 - or CPUs come or go from a cpuset with this flag enabled,
 - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
@@ -553,13 +558,15 @@ this searching range as you like. This file takes int value which
 indicates size of searching range in levels ideally as follows,
 otherwise initial value -1 that indicates the cpuset has no request.
 
-  -1 : no request. use system default or follow request of others.
-   0 : no search.
-   1 : search siblings (hyperthreads in a core).
-   2 : search cores in a package.
-   3 : search cpus in a node [= system wide on non-NUMA system]
-   4 : search nodes in a chunk of node [on NUMA system]
-   5 : search system wide [on NUMA system]
+====== ===========================================================
+  -1   no request. use system default or follow request of others.
+   0   no search.
+   1   search siblings (hyperthreads in a core).
+   2   search cores in a package.
+   3   search cpus in a node [= system wide on non-NUMA system]
+   4   search nodes in a chunk of node [on NUMA system]
+   5   search system wide [on NUMA system]
+====== ===========================================================
 
 The system default is architecture dependent. The system default
 can be changed using the relax_domain_level= boot parameter.
@@ -578,13 +585,14 @@ and whether it is acceptable or not depends on your situation.
 Don't modify this file if you are not sure.
 
 If your situation is:
+
 - The migration costs between each cpu can be assumed considerably
   small(for you) due to your special application's behavior or
   special hardware support for CPU cache etc.
 - The searching cost doesn't have impact(for you) or you can make
   the searching cost enough small by managing cpuset to compact etc.
 - The latency is required even it sacrifices cache hit rate etc.
-then increasing 'sched_relax_domain_level' would benefit you.
+  then increasing 'sched_relax_domain_level' would benefit you.
 
 
 1.9 How do I use cpusets ?
@@ -678,7 +686,7 @@ To start a new job that is to be contained within a cpuset, the steps are:
 
 For example, the following sequence of commands will setup a cpuset
 named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cpuset:
+and then start a subshell 'sh' in that cpuset::
 
   mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
   cd /sys/fs/cgroup/cpuset
@@ -693,6 +701,7 @@ and then start a subshell 'sh' in that cpuset:
   cat /proc/self/cpuset
 
 There are ways to query or modify cpusets:
+
 - via the cpuset file system directly, using the various cd, mkdir, echo,
   cat, rmdir commands from the shell, or their equivalent from C.
 - via the C library libcpuset.
@@ -722,115 +731,133 @@ Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
 tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
 is the cpuset that holds the whole system.
 
-If you want to create a new cpuset under /sys/fs/cgroup/cpuset:
-# cd /sys/fs/cgroup/cpuset
-# mkdir my_cpuset
+If you want to create a new cpuset under /sys/fs/cgroup/cpuset::
+
+  # cd /sys/fs/cgroup/cpuset
+  # mkdir my_cpuset
 
-Now you want to do something with this cpuset.
-# cd my_cpuset
+Now you want to do something with this cpuset::
 
-In this directory you can find several files:
-# ls
-cgroup.clone_children  cpuset.memory_pressure
-cgroup.event_control   cpuset.memory_spread_page
-cgroup.procs           cpuset.memory_spread_slab
-cpuset.cpu_exclusive   cpuset.mems
-cpuset.cpus            cpuset.sched_load_balance
-cpuset.mem_exclusive   cpuset.sched_relax_domain_level
-cpuset.mem_hardwall    notify_on_release
-cpuset.memory_migrate  tasks
+  # cd my_cpuset
+
+In this directory you can find several files::
+
+  # ls
+  cgroup.clone_children  cpuset.memory_pressure
+  cgroup.event_control   cpuset.memory_spread_page
+  cgroup.procs           cpuset.memory_spread_slab
+  cpuset.cpu_exclusive   cpuset.mems
+  cpuset.cpus            cpuset.sched_load_balance
+  cpuset.mem_exclusive   cpuset.sched_relax_domain_level
+  cpuset.mem_hardwall    notify_on_release
+  cpuset.memory_migrate  tasks
 
 Reading them will give you information about the state of this cpuset:
 the CPUs and Memory Nodes it can use, the processes that are using
 it, its properties. By writing to these files you can manipulate
 the cpuset.
 
-Set some flags:
-# /bin/echo 1 > cpuset.cpu_exclusive
+Set some flags::
+
+  # /bin/echo 1 > cpuset.cpu_exclusive
+
+Add some cpus::
+
+  # /bin/echo 0-7 > cpuset.cpus
+
+Add some mems::
 
-Add some cpus:
-# /bin/echo 0-7 > cpuset.cpus
+  # /bin/echo 0-7 > cpuset.mems
 
-Add some mems:
-# /bin/echo 0-7 > cpuset.mems
+Now attach your shell to this cpuset::
 
-Now attach your shell to this cpuset:
-# /bin/echo $$ > tasks
+  # /bin/echo $$ > tasks
 
 You can also create cpusets inside your cpuset by using mkdir in this
-directory.
-# mkdir my_sub_cs
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cpuset, just use rmdir::
+
+  # rmdir my_sub_cs
 
-To remove a cpuset, just use rmdir:
-# rmdir my_sub_cs
 This will fail if the cpuset is in use (has cpusets inside, or has
 processes attached).
 
 Note that for legacy reasons, the "cpuset" filesystem exists as a
 wrapper around the cgroup filesystem.
 
-The command
+The command::
 
 	mount -t cpuset X /sys/fs/cgroup/cpuset
 
-is equivalent to
+is equivalent to::
 
 	mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
 	echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
 
 2.2 Adding/removing cpus
 ------------------------
 
 This is the syntax to use when writing in the cpus or mems files
-in cpuset directories:
+in cpuset directories::
 
 	# /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
 	# /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4
 
 To add a CPU to a cpuset, write the new list of CPUs including the
-CPU to be added. To add 6 to the above cpuset:
+CPU to be added. To add 6 to the above cpuset::
 
 	# /bin/echo 1-4,6 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4,6
 
 Similarly to remove a CPU from a cpuset, write the new list of CPUs
796 | without the CPU to be removed. | 815 | without the CPU to be removed. |
797 | 816 | ||
798 | To remove all the CPUs: | 817 | To remove all the CPUs:: |
799 | 818 | ||
800 | # /bin/echo "" > cpuset.cpus -> clear cpus list | 819 | # /bin/echo "" > cpuset.cpus -> clear cpus list |
801 | 820 | ||
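Whichever form is written, reading the file back shows the canonical range form (an illustrative sketch)::

    # /bin/echo 1,2,3,4,6 > cpuset.cpus
    # cat cpuset.cpus
    1-4,6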
802 | 2.3 Setting flags | 821 | 2.3 Setting flags |
803 | ----------------- | 822 | ----------------- |
804 | 823 | ||
805 | The syntax is very simple: | 824 | The syntax is very simple:: |
806 | 825 | ||
807 | # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' | 826 | # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' |
808 | # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' | 827 | # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' |
809 | 828 | ||
810 | 2.4 Attaching processes | 829 | 2.4 Attaching processes |
811 | ----------------------- | 830 | ----------------------- |
812 | 831 | ||
813 | # /bin/echo PID > tasks | 832 | :: |
833 | |||
834 | # /bin/echo PID > tasks | ||
814 | 835 | ||
815 | Note that it is PID, not PIDs. You can only attach ONE task at a time. | 836 | Note that it is PID, not PIDs. You can only attach ONE task at a time. |
816 | If you have several tasks to attach, you have to do it one after another: | 837 | If you have several tasks to attach, you have to do it one after another:: |
817 | 838 | ||
818 | # /bin/echo PID1 > tasks | 839 | # /bin/echo PID1 > tasks |
819 | # /bin/echo PID2 > tasks | 840 | # /bin/echo PID2 > tasks |
820 | ... | 841 | ... |
821 | # /bin/echo PIDn > tasks | 842 | # /bin/echo PIDn > tasks |
822 | 843 | ||
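If many tasks need to be moved, a small loop keeps the one-PID-per-write rule while still reporting failures (a sketch; the source cpuset path is hypothetical)::

    # for pid in $(cat /sys/fs/cgroup/cpuset/other_cs/tasks); do
    >     /bin/echo $pid > tasks || echo "attach failed for $pid"
    > done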
823 | 844 | ||
824 | 3. Questions | 845 | 3. Questions |
825 | ============ | 846 | ============ |
826 | 847 | ||
827 | Q: what's up with this '/bin/echo' ? | 848 | Q: |
828 | A: bash's builtin 'echo' command does not check calls to write() against | 849 | what's up with this '/bin/echo' ? |
850 | |||
851 | A: | ||
852 | bash's builtin 'echo' command does not check calls to write() against | ||
829 | errors. If you use it in the cpuset file system, you won't be | 853 | errors. If you use it in the cpuset file system, you won't be |
830 | able to tell whether a command succeeded or failed. | 854 | able to tell whether a command succeeded or failed. |
831 | 855 | ||
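The difference is easy to demonstrate (an illustrative sketch; the exact error text depends on the kernel)::

    # /bin/echo 0-99999 > cpuset.cpus
    /bin/echo: write error: Invalid argument
    # echo $?
    1

With the bash builtin, the write error can go unreported and the command may appear to succeed.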
832 | Q: When I attach processes, only the first of the line gets really attached ! | 856 | Q: |
833 | A: We can only return one error code per call to write(). So you should also | 857 | When I attach processes, only the first of the line gets really attached ! |
858 | |||
859 | A: | ||
860 | We can only return one error code per call to write(). So you should also | ||
834 | put only ONE pid. | 861 | put only ONE pid. |
835 | 862 | ||
836 | 4. Contact | 863 | 4. Contact |
diff --git a/Documentation/cgroup-v1/devices.txt b/Documentation/cgroup-v1/devices.rst index 3c1095ca02ea..e1886783961e 100644 --- a/Documentation/cgroup-v1/devices.txt +++ b/Documentation/cgroup-v1/devices.rst | |||
@@ -1,6 +1,9 @@ | |||
1 | =========================== | ||
1 | Device Whitelist Controller | 2 | Device Whitelist Controller |
3 | =========================== | ||
2 | 4 | ||
3 | 1. Description: | 5 | 1. Description |
6 | ============== | ||
4 | 7 | ||
5 | Implement a cgroup to track and enforce open and mknod restrictions | 8 | Implement a cgroup to track and enforce open and mknod restrictions |
6 | on device files. A device cgroup associates a device access | 9 | on device files. A device cgroup associates a device access |
@@ -16,24 +19,26 @@ devices from the whitelist or add new entries. A child cgroup can | |||
16 | never receive a device access which is denied by its parent. | 19 | never receive a device access which is denied by its parent. |
17 | 20 | ||
18 | 2. User Interface | 21 | 2. User Interface |
22 | ================= | ||
19 | 23 | ||
20 | An entry is added using devices.allow, and removed using | 24 | An entry is added using devices.allow, and removed using |
21 | devices.deny. For instance | 25 | devices.deny. For instance:: |
22 | 26 | ||
23 | echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow | 27 | echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow |
24 | 28 | ||
25 | allows cgroup 1 to read and mknod the device usually known as | 29 | allows cgroup 1 to read and mknod the device usually known as |
26 | /dev/null. Doing | 30 | /dev/null. Doing:: |
27 | 31 | ||
28 | echo a > /sys/fs/cgroup/1/devices.deny | 32 | echo a > /sys/fs/cgroup/1/devices.deny |
29 | 33 | ||
30 | will remove the default 'a *:* rwm' entry. Doing | 34 | will remove the default 'a *:* rwm' entry. Doing:: |
31 | 35 | ||
32 | echo a > /sys/fs/cgroup/1/devices.allow | 36 | echo a > /sys/fs/cgroup/1/devices.allow |
33 | 37 | ||
34 | will add the 'a *:* rwm' entry to the whitelist. | 38 | will add the 'a *:* rwm' entry to the whitelist. |
35 | 39 | ||
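The current whitelist can always be read back from devices.list (a sketch following the example above; while the default entry is still present it dominates the output)::

    # cat /sys/fs/cgroup/1/devices.list
    a *:* rwm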
36 | 3. Security | 40 | 3. Security |
41 | =========== | ||
37 | 42 | ||
38 | Any task can move itself between cgroups. This clearly won't | 43 | Any task can move itself between cgroups. This clearly won't |
39 | suffice, but we can decide the best way to adequately restrict | 44 | suffice, but we can decide the best way to adequately restrict |
@@ -50,6 +55,7 @@ A cgroup may not be granted more permissions than the cgroup's | |||
50 | parent has. | 55 | parent has. |
51 | 56 | ||
52 | 4. Hierarchy | 57 | 4. Hierarchy |
58 | ============ | ||
53 | 59 | ||
54 | device cgroups maintain hierarchy by making sure a cgroup never has more | 60 | device cgroups maintain hierarchy by making sure a cgroup never has more |
55 | access permissions than its parent. Every time an entry is written to | 61 | access permissions than its parent. Every time an entry is written to |
@@ -58,7 +64,8 @@ from their whitelist and all the locally set whitelist entries will be | |||
58 | re-evaluated. In case one of the locally set whitelist entries would provide | 64 | re-evaluated. In case one of the locally set whitelist entries would provide |
59 | more access than the cgroup's parent, it'll be removed from the whitelist. | 65 | more access than the cgroup's parent, it'll be removed from the whitelist. |
60 | 66 | ||
61 | Example: | 67 | Example:: |
68 | |||
62 | A | 69 | A |
63 | / \ | 70 | / \ |
64 | B | 71 | B |
@@ -67,10 +74,12 @@ Example: | |||
67 | A allow "b 8:* rwm", "c 116:1 rw" | 74 | A allow "b 8:* rwm", "c 116:1 rw" |
68 | B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" | 75 | B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" |
69 | 76 | ||
70 | If a device is denied in group A: | 77 | If a device is denied in group A:: |
78 | |||
71 | # echo "c 116:* r" > A/devices.deny | 79 | # echo "c 116:* r" > A/devices.deny |
80 | |||
72 | it'll propagate down and after revalidating B's entries, the whitelist entry | 81 | it'll propagate down and after revalidating B's entries, the whitelist entry |
73 | "c 116:2 rwm" will be removed: | 82 | "c 116:2 rwm" will be removed:: |
74 | 83 | ||
75 | group whitelist entries denied devices | 84 | group whitelist entries denied devices |
76 | A all "b 8:* rwm", "c 116:* rw" | 85 | A all "b 8:* rwm", "c 116:* rw" |
@@ -79,7 +88,8 @@ it'll propagate down and after revalidating B's entries, the whitelist entry | |||
79 | In case parent's exceptions change and local exceptions are not allowed | 88 | In case parent's exceptions change and local exceptions are not allowed |
80 | anymore, they'll be deleted. | 89 | anymore, they'll be deleted. |
81 | 90 | ||
82 | Notice that new whitelist entries will not be propagated: | 91 | Notice that new whitelist entries will not be propagated:: |
92 | |||
83 | A | 93 | A |
84 | / \ | 94 | / \ |
85 | B | 95 | B |
@@ -88,24 +98,30 @@ Notice that new whitelist entries will not be propagated: | |||
88 | A "c 1:3 rwm", "c 1:5 r" all the rest | 98 | A "c 1:3 rwm", "c 1:5 r" all the rest |
89 | B "c 1:3 rwm", "c 1:5 r" all the rest | 99 | B "c 1:3 rwm", "c 1:5 r" all the rest |
90 | 100 | ||
91 | when adding "c *:3 rwm": | 101 | when adding ``c *:3 rwm``:: |
102 | |||
92 | # echo "c *:3 rwm" >A/devices.allow | 103 | # echo "c *:3 rwm" >A/devices.allow |
93 | 104 | ||
94 | the result: | 105 | the result:: |
106 | |||
95 | group whitelist entries denied devices | 107 | group whitelist entries denied devices |
96 | A "c *:3 rwm", "c 1:5 r" all the rest | 108 | A "c *:3 rwm", "c 1:5 r" all the rest |
97 | B "c 1:3 rwm", "c 1:5 r" all the rest | 109 | B "c 1:3 rwm", "c 1:5 r" all the rest |
98 | 110 | ||
99 | but now it'll be possible to add new entries to B: | 111 | but now it'll be possible to add new entries to B:: |
112 | |||
100 | # echo "c 2:3 rwm" >B/devices.allow | 113 | # echo "c 2:3 rwm" >B/devices.allow |
101 | # echo "c 50:3 r" >B/devices.allow | 114 | # echo "c 50:3 r" >B/devices.allow |
102 | or even | 115 | |
116 | or even:: | ||
117 | |||
103 | # echo "c *:3 rwm" >B/devices.allow | 118 | # echo "c *:3 rwm" >B/devices.allow |
104 | 119 | ||
105 | Allowing or denying all by writing 'a' to devices.allow or devices.deny will | 120 | Allowing or denying all by writing 'a' to devices.allow or devices.deny will |
106 | not be possible once the device cgroup has children. | 121 | not be possible once the device cgroup has children. |
107 | 122 | ||
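A sketch of that restriction (the child name is hypothetical and the exact error may vary)::

    # mkdir /sys/fs/cgroup/1/child
    # echo a > /sys/fs/cgroup/1/devices.deny
    echo: write error: Invalid argument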
108 | 4.1 Hierarchy (internal implementation) | 123 | 4.1 Hierarchy (internal implementation) |
124 | --------------------------------------- | ||
109 | 125 | ||
110 | device cgroups is implemented internally using a behavior (ALLOW, DENY) and a | 126 | device cgroups is implemented internally using a behavior (ALLOW, DENY) and a |
111 | list of exceptions. The internal state is controlled using the same user | 127 | list of exceptions. The internal state is controlled using the same user |
diff --git a/Documentation/cgroup-v1/freezer-subsystem.txt b/Documentation/cgroup-v1/freezer-subsystem.rst index e831cb2b8394..582d3427de3f 100644 --- a/Documentation/cgroup-v1/freezer-subsystem.txt +++ b/Documentation/cgroup-v1/freezer-subsystem.rst | |||
@@ -1,3 +1,7 @@ | |||
1 | ============== | ||
2 | Cgroup Freezer | ||
3 | ============== | ||
4 | |||
1 | The cgroup freezer is useful to batch job management systems which start | 5 | The cgroup freezer is useful to batch job management systems which start |
2 | and stop sets of tasks in order to schedule the resources of a machine | 6 | and stop sets of tasks in order to schedule the resources of a machine |
3 | according to the desires of a system administrator. This sort of program | 7 | according to the desires of a system administrator. This sort of program |
@@ -23,7 +27,7 @@ blocked, or ignored it can be seen by waiting or ptracing parent tasks. | |||
23 | SIGCONT is especially unsuitable since it can be caught by the task. Any | 27 | SIGCONT is especially unsuitable since it can be caught by the task. Any |
24 | programs designed to watch for SIGSTOP and SIGCONT could be broken by | 28 | programs designed to watch for SIGSTOP and SIGCONT could be broken by |
25 | attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can | 29 | attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can |
26 | demonstrate this problem using nested bash shells: | 30 | demonstrate this problem using nested bash shells:: |
27 | 31 | ||
28 | $ echo $$ | 32 | $ echo $$ |
29 | 16644 | 33 | 16644 |
@@ -93,19 +97,19 @@ The following cgroupfs files are created by cgroup freezer. | |||
93 | The root cgroup is non-freezable and the above interface files don't | 97 | The root cgroup is non-freezable and the above interface files don't |
94 | exist. | 98 | exist. |
95 | 99 | ||
96 | * Examples of usage : | 100 | * Examples of usage:: |
97 | 101 | ||
98 | # mkdir /sys/fs/cgroup/freezer | 102 | # mkdir /sys/fs/cgroup/freezer |
99 | # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer | 103 | # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer |
100 | # mkdir /sys/fs/cgroup/freezer/0 | 104 | # mkdir /sys/fs/cgroup/freezer/0 |
101 | # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks | 105 | # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks |
102 | 106 | ||
103 | to get status of the freezer subsystem : | 107 | to get status of the freezer subsystem:: |
104 | 108 | ||
105 | # cat /sys/fs/cgroup/freezer/0/freezer.state | 109 | # cat /sys/fs/cgroup/freezer/0/freezer.state |
106 | THAWED | 110 | THAWED |
107 | 111 | ||
108 | to freeze all tasks in the container : | 112 | to freeze all tasks in the container:: |
109 | 113 | ||
110 | # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state | 114 | # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state |
111 | # cat /sys/fs/cgroup/freezer/0/freezer.state | 115 | # cat /sys/fs/cgroup/freezer/0/freezer.state |
@@ -113,7 +117,7 @@ to freeze all tasks in the container : | |||
113 | # cat /sys/fs/cgroup/freezer/0/freezer.state | 117 | # cat /sys/fs/cgroup/freezer/0/freezer.state |
114 | FROZEN | 118 | FROZEN |
115 | 119 | ||
116 | to unfreeze all tasks in the container : | 120 | to unfreeze all tasks in the container:: |
117 | 121 | ||
118 | # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state | 122 | # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state |
119 | # cat /sys/fs/cgroup/freezer/0/freezer.state | 123 | # cat /sys/fs/cgroup/freezer/0/freezer.state |
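Because freezing is asynchronous, freezer.state can transiently read FREEZING; a polling loop (an illustrative sketch) waits until the transition completes::

    # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
    # while [ "$(cat /sys/fs/cgroup/freezer/0/freezer.state)" = "FREEZING" ]; do
    >     sleep 0.1
    > done
    # cat /sys/fs/cgroup/freezer/0/freezer.state
    FROZEN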
diff --git a/Documentation/cgroup-v1/hugetlb.txt b/Documentation/cgroup-v1/hugetlb.rst index 1260e5369b9b..a3902aa253a9 100644 --- a/Documentation/cgroup-v1/hugetlb.txt +++ b/Documentation/cgroup-v1/hugetlb.rst | |||
@@ -1,5 +1,6 @@ | |||
1 | ================== | ||
1 | HugeTLB Controller | 2 | HugeTLB Controller |
2 | ------------------- | 3 | ================== |
3 | 4 | ||
4 | The HugeTLB controller allows limiting the HugeTLB usage per control group and | 5 | The HugeTLB controller allows limiting the HugeTLB usage per control group and |
5 | enforces the controller limit during page fault. Since HugeTLB doesn't | 6 | enforces the controller limit during page fault. Since HugeTLB doesn't |
@@ -16,16 +17,16 @@ With the above step, the initial or the parent HugeTLB group becomes | |||
16 | visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in | 17 | visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in |
17 | the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. | 18 | the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. |
18 | 19 | ||
19 | New groups can be created under the parent group /sys/fs/cgroup. | 20 | New groups can be created under the parent group /sys/fs/cgroup:: |
20 | 21 | ||
21 | # cd /sys/fs/cgroup | 22 | # cd /sys/fs/cgroup |
22 | # mkdir g1 | 23 | # mkdir g1 |
23 | # echo $$ > g1/tasks | 24 | # echo $$ > g1/tasks |
24 | 25 | ||
25 | The above steps create a new group g1 and move the current shell | 26 | The above steps create a new group g1 and move the current shell |
26 | process (bash) into it. | 27 | process (bash) into it. |
27 | 28 | ||
28 | Brief summary of control files | 29 | Brief summary of control files:: |
29 | 30 | ||
30 | hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage | 31 | hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage |
31 | hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded | 32 | hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded |
@@ -33,17 +34,17 @@ Brief summary of control files | |||
33 | hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit | 34 | hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit |
34 | 35 | ||
35 | For a system supporting three hugepage sizes (64k, 32M and 1G), the control | 36 | For a system supporting three hugepage sizes (64k, 32M and 1G), the control |
36 | files include: | 37 | files include:: |
37 | 38 | ||
38 | hugetlb.1GB.limit_in_bytes | 39 | hugetlb.1GB.limit_in_bytes |
39 | hugetlb.1GB.max_usage_in_bytes | 40 | hugetlb.1GB.max_usage_in_bytes |
40 | hugetlb.1GB.usage_in_bytes | 41 | hugetlb.1GB.usage_in_bytes |
41 | hugetlb.1GB.failcnt | 42 | hugetlb.1GB.failcnt |
42 | hugetlb.64KB.limit_in_bytes | 43 | hugetlb.64KB.limit_in_bytes |
43 | hugetlb.64KB.max_usage_in_bytes | 44 | hugetlb.64KB.max_usage_in_bytes |
44 | hugetlb.64KB.usage_in_bytes | 45 | hugetlb.64KB.usage_in_bytes |
45 | hugetlb.64KB.failcnt | 46 | hugetlb.64KB.failcnt |
46 | hugetlb.32MB.limit_in_bytes | 47 | hugetlb.32MB.limit_in_bytes |
47 | hugetlb.32MB.max_usage_in_bytes | 48 | hugetlb.32MB.max_usage_in_bytes |
48 | hugetlb.32MB.usage_in_bytes | 49 | hugetlb.32MB.usage_in_bytes |
49 | hugetlb.32MB.failcnt | 50 | hugetlb.32MB.failcnt |
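Putting the control files together, a minimal sketch of setting and inspecting a limit (the 32MB page size and 100M value are illustrative; use a size your system actually supports)::

    # echo 100M > /sys/fs/cgroup/g1/hugetlb.32MB.limit_in_bytes
    # cat /sys/fs/cgroup/g1/hugetlb.32MB.limit_in_bytes
    104857600
    # cat /sys/fs/cgroup/g1/hugetlb.32MB.failcnt
    0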
diff --git a/Documentation/cgroup-v1/index.rst b/Documentation/cgroup-v1/index.rst new file mode 100644 index 000000000000..fe76d42edc11 --- /dev/null +++ b/Documentation/cgroup-v1/index.rst | |||
@@ -0,0 +1,30 @@ | |||
1 | :orphan: | ||
2 | |||
3 | ======================== | ||
4 | Control Groups version 1 | ||
5 | ======================== | ||
6 | |||
7 | .. toctree:: | ||
8 | :maxdepth: 1 | ||
9 | |||
10 | cgroups | ||
11 | |||
12 | blkio-controller | ||
13 | cpuacct | ||
14 | cpusets | ||
15 | devices | ||
16 | freezer-subsystem | ||
17 | hugetlb | ||
18 | memcg_test | ||
19 | memory | ||
20 | net_cls | ||
21 | net_prio | ||
22 | pids | ||
23 | rdma | ||
24 | |||
25 | .. only:: subproject and html | ||
26 | |||
27 | Indices | ||
28 | ======= | ||
29 | |||
30 | * :ref:`genindex` | ||
diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.rst index 621e29ffb358..91bd18c6a514 100644 --- a/Documentation/cgroup-v1/memcg_test.txt +++ b/Documentation/cgroup-v1/memcg_test.rst | |||
@@ -1,32 +1,43 @@ | |||
1 | Memory Resource Controller(Memcg) Implementation Memo. | 1 | ===================================================== |
2 | Memory Resource Controller(Memcg) Implementation Memo | ||
3 | ===================================================== | ||
4 | |||
2 | Last Updated: 2010/2 | 5 | Last Updated: 2010/2 |
6 | |||
3 | Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). | 7 | Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). |
4 | 8 | ||
5 | Because VM is getting complex (one of the reasons is memcg...), memcg's behavior | 9 | Because VM is getting complex (one of the reasons is memcg...), memcg's behavior |
6 | is complex. This is a document for memcg's internal behavior. | 10 | is complex. This is a document for memcg's internal behavior. |
7 | Please note that implementation details can be changed. | 11 | Please note that implementation details can be changed. |
8 | 12 | ||
9 | (*) Topics on API should be in Documentation/cgroup-v1/memory.txt | 13 | (*) Topics on API should be in Documentation/cgroup-v1/memory.rst |
10 | 14 | ||
11 | 0. How to record usage ? | 15 | 0. How to record usage ? |
16 | ======================== | ||
17 | |||
12 | 2 objects are used. | 18 | 2 objects are used. |
13 | 19 | ||
14 | page_cgroup ....an object per page. | 20 | page_cgroup ....an object per page. |
21 | |||
15 | Allocated at boot or memory hotplug. Freed at memory hot removal. | 22 | Allocated at boot or memory hotplug. Freed at memory hot removal. |
16 | 23 | ||
17 | swap_cgroup ... an entry per swp_entry. | 24 | swap_cgroup ... an entry per swp_entry. |
25 | |||
18 | Allocated at swapon(). Freed at swapoff(). | 26 | Allocated at swapon(). Freed at swapoff(). |
19 | 27 | ||
20 | The page_cgroup has USED bit and double count against a page_cgroup never | 28 | The page_cgroup has USED bit and double count against a page_cgroup never |
21 | occurs. swap_cgroup is used only when a charged page is swapped-out. | 29 | occurs. swap_cgroup is used only when a charged page is swapped-out. |
22 | 30 | ||
23 | 1. Charge | 31 | 1. Charge |
32 | ========= | ||
24 | 33 | ||
25 | a page/swp_entry may be charged (usage += PAGE_SIZE) at | 34 | a page/swp_entry may be charged (usage += PAGE_SIZE) at |
26 | 35 | ||
27 | mem_cgroup_try_charge() | 36 | mem_cgroup_try_charge() |
28 | 37 | ||
29 | 2. Uncharge | 38 | 2. Uncharge |
39 | =========== | ||
40 | |||
30 | a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by | 41 | a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by |
31 | 42 | ||
32 | mem_cgroup_uncharge() | 43 | mem_cgroup_uncharge() |
@@ -37,9 +48,12 @@ Please note that implementation details can be changed. | |||
37 | disappears. | 48 | disappears. |
38 | 49 | ||
39 | 3. charge-commit-cancel | 50 | 3. charge-commit-cancel |
51 | ======================= | ||
52 | |||
40 | Memcg pages are charged in two steps: | 53 | Memcg pages are charged in two steps: |
41 | mem_cgroup_try_charge() | 54 | |
42 | mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() | 55 | - mem_cgroup_try_charge() |
56 | - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() | ||
43 | 57 | ||
44 | At try_charge(), there are no flags to say "this page is charged". | 58 | At try_charge(), there are no flags to say "this page is charged". |
45 | at this point, usage += PAGE_SIZE. | 59 | at this point, usage += PAGE_SIZE. |
@@ -51,6 +65,8 @@ Please note that implementation details can be changed. | |||
51 | Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | 65 | Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. |
52 | 66 | ||
53 | 4. Anonymous | 67 | 4. Anonymous |
68 | ============ | ||
69 | |||
54 | Anonymous page is newly allocated at | 70 | Anonymous page is newly allocated at |
55 | - page fault into MAP_ANONYMOUS mapping. | 71 | - page fault into MAP_ANONYMOUS mapping. |
56 | - Copy-On-Write. | 72 | - Copy-On-Write. |
@@ -78,34 +94,45 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | |||
78 | (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. | 94 | (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. |
79 | 95 | ||
80 | 5. Page Cache | 96 | 5. Page Cache |
81 | Page Cache is charged at | 97 | ============= |
98 | |||
99 | Page Cache is charged at | ||
82 | - add_to_page_cache_locked(). | 100 | - add_to_page_cache_locked(). |
83 | 101 | ||
84 | The logic is very clear. (About migration, see below) | 102 | The logic is very clear. (About migration, see below) |
85 | Note: __remove_from_page_cache() is called by remove_from_page_cache() | 103 | |
86 | and __remove_mapping(). | 104 | Note: |
105 | __remove_from_page_cache() is called by remove_from_page_cache() | ||
106 | and __remove_mapping(). | ||
87 | 107 | ||
88 | 6. Shmem(tmpfs) Page Cache | 108 | 6. Shmem(tmpfs) Page Cache |
109 | =========================== | ||
110 | |||
89 | The best way to understand shmem's page state transition is to read | 111 | The best way to understand shmem's page state transition is to read |
90 | mm/shmem.c. | 112 | mm/shmem.c. |
113 | |||
91 | But a brief explanation of the behavior of memcg around shmem will be | 114 | But a brief explanation of the behavior of memcg around shmem will be |
92 | helpful to understand the logic. | 115 | helpful to understand the logic. |
93 | 116 | ||
94 | Shmem's page (just leaf page, not direct/indirect block) can be on | 117 | Shmem's page (just leaf page, not direct/indirect block) can be on |
118 | |||
95 | - radix-tree of shmem's inode. | 119 | - radix-tree of shmem's inode. |
96 | - SwapCache. | 120 | - SwapCache. |
97 | - Both on radix-tree and SwapCache. This happens at swap-in | 121 | - Both on radix-tree and SwapCache. This happens at swap-in |
98 | and swap-out, | 122 | and swap-out, |
99 | 123 | ||
100 | It's charged when... | 124 | It's charged when... |
125 | |||
101 | - A new page is added to shmem's radix-tree. | 126 | - A new page is added to shmem's radix-tree. |
102 | - A swp page is read. (move a charge from swap_cgroup to page_cgroup) | 127 | - A swp page is read. (move a charge from swap_cgroup to page_cgroup) |
103 | 128 | ||
104 | 7. Page Migration | 129 | 7. Page Migration |
130 | ================= | ||
105 | 131 | ||
106 | mem_cgroup_migrate() | 132 | mem_cgroup_migrate() |
107 | 133 | ||
108 | 8. LRU | 134 | 8. LRU |
135 | ====== | ||
109 | Each memcg has its own private LRU. Now, its handling is under global | 136 | Each memcg has its own private LRU. Now, its handling is under global |
110 | VM's control (means that it's handled under global pgdat->lru_lock). | 137 | VM's control (means that it's handled under global pgdat->lru_lock). |
111 | Almost all routines around memcg's LRU are called by global LRU's | 138 | Almost all routines around memcg's LRU are called by global LRU's |
@@ -114,163 +141,211 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | |||
114 | A special function is mem_cgroup_isolate_pages(). This scans | 141 | A special function is mem_cgroup_isolate_pages(). This scans |
115 | memcg's private LRU and call __isolate_lru_page() to extract a page | 142 | memcg's private LRU and call __isolate_lru_page() to extract a page |
116 | from LRU. | 143 | from LRU. |
144 | |||
117 | (By __isolate_lru_page(), the page is removed from both of global and | 145 | (By __isolate_lru_page(), the page is removed from both of global and |
118 | private LRU.) | 146 | private LRU.) |
119 | 147 | ||
120 | 148 | ||
121 | 9. Typical Tests. | 149 | 9. Typical Tests. |
150 | ================= | ||
122 | 151 | ||
123 | Tests for racy cases. | 152 | Tests for racy cases. |
124 | 153 | ||
125 | 9.1 Small limit to memcg. | 154 | 9.1 Small limit to memcg. |
155 | ------------------------- | ||
156 | |||
126 | When you test racy cases, it is a good idea to set memcg's limit | 157 | When you test racy cases, it is a good idea to set memcg's limit |
127 | very small rather than in GB. Many races were found in tests under | 158 | very small rather than in GB. Many races were found in tests under |
128 | xKB or xxMB limits. | 159 | xKB or xxMB limits. |
160 | |||
129 | (Memory behavior under GB and Memory behavior under MB shows very | 161 | (Memory behavior under GB and Memory behavior under MB shows very |
130 | different situation.) | 162 | different situation.) |
163 | |||
164 | 9.2 Shmem | ||
165 | --------- | ||
131 | 166 | ||
132 | 9.2 Shmem | ||
133 | Historically, memcg's shmem handling was poor and we saw some amount | 167 | Historically, memcg's shmem handling was poor and we saw some amount |
134 | of trouble here. This is because shmem is page-cache but can be | 168 | of trouble here. This is because shmem is page-cache but can be |
135 | SwapCache. Testing with shmem/tmpfs is always a good test. | 169 | SwapCache. Testing with shmem/tmpfs is always a good test. |
136 | 170 | ||
137 | 9.3 Migration | 171 | 9.3 Migration |
172 | ------------- | ||
173 | |||
138 | For NUMA, migration is another special case. For easy testing, cpuset | 174 | For NUMA, migration is another special case. For easy testing, cpuset |
139 | is useful. The following is a sample script to do migration. | 175 | is useful. The following is a sample script to do migration:: |
140 | 176 | ||
141 | mount -t cgroup -o cpuset none /opt/cpuset | 177 | mount -t cgroup -o cpuset none /opt/cpuset |
142 | 178 | ||
143 | mkdir /opt/cpuset/01 | 179 | mkdir /opt/cpuset/01 |
144 | echo 1 > /opt/cpuset/01/cpuset.cpus | 180 | echo 1 > /opt/cpuset/01/cpuset.cpus |
145 | echo 0 > /opt/cpuset/01/cpuset.mems | 181 | echo 0 > /opt/cpuset/01/cpuset.mems |
146 | echo 1 > /opt/cpuset/01/cpuset.memory_migrate | 182 | echo 1 > /opt/cpuset/01/cpuset.memory_migrate |
147 | mkdir /opt/cpuset/02 | 183 | mkdir /opt/cpuset/02 |
148 | echo 1 > /opt/cpuset/02/cpuset.cpus | 184 | echo 1 > /opt/cpuset/02/cpuset.cpus |
149 | echo 1 > /opt/cpuset/02/cpuset.mems | 185 | echo 1 > /opt/cpuset/02/cpuset.mems |
150 | echo 1 > /opt/cpuset/02/cpuset.memory_migrate | 186 | echo 1 > /opt/cpuset/02/cpuset.memory_migrate |
151 | 187 | ||
152 | In the above setup, when you move a task from 01 to 02, page migration | 188 | In the above setup, when you move a task from 01 to 02, page migration |
153 | from node 0 to node 1 will occur. The following is a script to migrate all | 189 | from node 0 to node 1 will occur. The following is a script to migrate all |
154 | tasks under a cpuset. | 190 | tasks under a cpuset:: |
155 | -- | 191 | |
156 | move_task() | 192 | -- |
157 | { | 193 | move_task() |
158 | for pid in $1 | 194 | { |
159 | do | 195 | for pid in $1 |
160 | /bin/echo $pid >$2/tasks 2>/dev/null | 196 | do |
161 | echo -n $pid | 197 | /bin/echo $pid >$2/tasks 2>/dev/null |
162 | echo -n " " | 198 | echo -n $pid |
163 | done | 199 | echo -n " " |
164 | echo END | 200 | done |
165 | } | 201 | echo END |
166 | 202 | } | |
167 | G1_TASK=`cat ${G1}/tasks` | 203 | |
168 | G2_TASK=`cat ${G2}/tasks` | 204 | G1_TASK=`cat ${G1}/tasks` |
169 | move_task "${G1_TASK}" ${G2} & | 205 | G2_TASK=`cat ${G2}/tasks` |
170 | -- | 206 | move_task "${G1_TASK}" ${G2} & |
171 | 9.4 Memory hotplug. | 207 | -- |
208 | |||
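The script assumes ${G1} and ${G2} already name the two cpuset directories; with the setup above that would be (a sketch)::

    G1=/opt/cpuset/01
    G2=/opt/cpuset/02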
209 | 9.4 Memory hotplug | ||
210 | ------------------ | ||
211 | |||
172 | memory hotplug test is one of the good tests. | 212 | memory hotplug test is one of the good tests. |
173 | to offline memory, do following. | 213 | |
174 | # echo offline > /sys/devices/system/memory/memoryXXX/state | 214 | to offline memory, do the following:: |
215 | |||
216 | # echo offline > /sys/devices/system/memory/memoryXXX/state | ||
217 | |||
175 | (XXX is the place of memory) | 218 | (XXX is the place of memory) |
219 | |||
176 | This is an easy way to test page migration, too. | 220 | This is an easy way to test page migration, too. |
177 | 221 | ||
178 | 9.5 mkdir/rmdir | 222 | 9.5 mkdir/rmdir |
223 | --------------- | ||
224 | |||
179 | When using hierarchy, mkdir/rmdir test should be done. | 225 | When using hierarchy, mkdir/rmdir test should be done. |
180 | Use tests like the following. | 226 | Use tests like the following:: |
227 | |||
228 | echo 1 >/opt/cgroup/01/memory/use_hierarchy | ||
229 | mkdir /opt/cgroup/01/child_a | ||
230 | mkdir /opt/cgroup/01/child_b | ||
181 | 231 | ||
182 | echo 1 >/opt/cgroup/01/memory/use_hierarchy | 232 | set limit to 01. |
183 | mkdir /opt/cgroup/01/child_a | 233 | add limit to 01/child_b |
184 | mkdir /opt/cgroup/01/child_b | 234 | run jobs under child_a and child_b |
185 | 235 | ||
186 | set limit to 01. | 236 | create/delete following groups at random while jobs are running:: |
187 | add limit to 01/child_b | ||
188 | run jobs under child_a and child_b | ||
189 | 237 | ||
190 | create/delete following groups at random while jobs are running. | 238 | /opt/cgroup/01/child_a/child_aa |
191 | /opt/cgroup/01/child_a/child_aa | 239 | /opt/cgroup/01/child_b/child_bb |
192 | /opt/cgroup/01/child_b/child_bb | 240 | /opt/cgroup/01/child_c |
193 | /opt/cgroup/01/child_c | ||
194 | 241 | ||
195 | running new jobs in a new group is also good. | 242 | running new jobs in a new group is also good. |
196 | 243 | ||
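One way to script the random create/delete step (a rough sketch using the names above)::

    while true; do
        mkdir /opt/cgroup/01/child_a/child_aa 2>/dev/null
        rmdir /opt/cgroup/01/child_a/child_aa 2>/dev/null
        mkdir /opt/cgroup/01/child_c 2>/dev/null
        rmdir /opt/cgroup/01/child_c 2>/dev/null
    done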
197 | 9.6 Mount with other subsystems. | 244 | 9.6 Mount with other subsystems |
245 | ------------------------------- | ||
246 | |||
198 | Mounting with other subsystems is a good test because there is a | 247 | Mounting with other subsystems is a good test because there is a |
199 | race and lock dependency with other cgroup subsystems. | 248 | race and lock dependency with other cgroup subsystems. |
200 | 249 | ||
201 | example) | 250 | example:: |
202 | # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices | 251 | |
252 | # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices | ||
203 | 253 | ||
204 | and do task move, mkdir, rmdir etc...under this. | 254 | and do task move, mkdir, rmdir etc...under this. |
205 | 255 | ||
206 | 9.7 swapoff. | 256 | 9.7 swapoff |
257 | ----------- | ||
258 | |||
207 | Besides swap management being one of the complicated parts of memcg, | 259 | Besides swap management being one of the complicated parts of memcg, |
208 | the call path of swap-in at swapoff is not the same as the usual swap-in path. | 260 | the call path of swap-in at swapoff is not the same as the usual swap-in path. |
209 | It's worth testing explicitly. | 261 | It's worth testing explicitly. |
210 | 262 | ||
211 | For example, a test like the following is good. | 263 | For example, a test like the following is good: |
212 | (Shell-A) | 264 | |
213 | # mount -t cgroup none /cgroup -o memory | 265 | (Shell-A):: |
214 | # mkdir /cgroup/test | 266 | |
215 | # echo 40M > /cgroup/test/memory.limit_in_bytes | 267 | # mount -t cgroup none /cgroup -o memory |
216 | # echo 0 > /cgroup/test/tasks | 268 | # mkdir /cgroup/test |
269 | # echo 40M > /cgroup/test/memory.limit_in_bytes | ||
270 | # echo 0 > /cgroup/test/tasks | ||
271 | |||
217 | Run malloc(100M) program under this. You'll see 60M of swaps. | 272 | Run malloc(100M) program under this. You'll see 60M of swaps. |
218 | (Shell-B) | 273 | |
219 | # move all tasks in /cgroup/test to /cgroup | 274 | (Shell-B):: |
220 | # /sbin/swapoff -a | 275 | |
221 | # rmdir /cgroup/test | 276 | # move all tasks in /cgroup/test to /cgroup |
222 | # kill malloc task. | 277 | # /sbin/swapoff -a |
278 | # rmdir /cgroup/test | ||
279 | # kill malloc task. | ||
223 | 280 | ||
224 | Of course, tmpfs v.s. swapoff test should be tested, too. | 281 | Of course, tmpfs v.s. swapoff test should be tested, too. |
225 | 282 | ||
226 | 9.8 OOM-Killer | 283 | 9.8 OOM-Killer |
284 | -------------- | ||
285 | |||
227 | Out-of-memory caused by memcg's limit will kill tasks under | 286 | Out-of-memory caused by memcg's limit will kill tasks under |
228 | the memcg. When hierarchy is used, a task under hierarchy | 287 | the memcg. When hierarchy is used, a task under hierarchy |
229 | will be killed by the kernel. | 288 | will be killed by the kernel. |
289 | |||
230 | In this case, panic_on_oom shouldn't be invoked and tasks | 290 | In this case, panic_on_oom shouldn't be invoked and tasks |
231 | in other groups shouldn't be killed. | 291 | in other groups shouldn't be killed. |
232 | 292 | ||
233 | It's not difficult to cause OOM under memcg, as follows. | 293 | It's not difficult to cause OOM under memcg, as follows. |
234 | Case A) when you can swapoff | 294 | |
235 | #swapoff -a | 295 | Case A) when you can swapoff:: |
236 | #echo 50M > /memory.limit_in_bytes | 296 | |
297 | #swapoff -a | ||
298 | #echo 50M > /memory.limit_in_bytes | ||
299 | |||
237 | run 51M of malloc | 300 | run 51M of malloc |
238 | 301 | ||
239 | Case B) when you use mem+swap limitation. | 302 | Case B) when you use mem+swap limitation:: |
240 | #echo 50M > memory.limit_in_bytes | 303 | |
241 | #echo 50M > memory.memsw.limit_in_bytes | 304 | #echo 50M > memory.limit_in_bytes |
305 | #echo 50M > memory.memsw.limit_in_bytes | ||
306 | |||
242 | run 51M of malloc | 307 | run 51M of malloc |
243 | 308 | ||
244 | 9.9 Move charges at task migration | 309 | 9.9 Move charges at task migration |
310 | ---------------------------------- | ||
311 | |||
245 | Charges associated with a task can be moved along with task migration. | 312 | Charges associated with a task can be moved along with task migration. |
246 | 313 | ||
247 | (Shell-A) | 314 | (Shell-A):: |
248 | #mkdir /cgroup/A | 315 | |
249 | #echo $$ >/cgroup/A/tasks | 316 | #mkdir /cgroup/A |
317 | #echo $$ >/cgroup/A/tasks | ||
318 | |||
250 | run some programs which use some amount of memory in /cgroup/A. | 319 | run some programs which use some amount of memory in /cgroup/A. |
251 | 320 | ||
252 | (Shell-B) | 321 | (Shell-B):: |
253 | #mkdir /cgroup/B | 322 | |
254 | #echo 1 >/cgroup/B/memory.move_charge_at_immigrate | 323 | #mkdir /cgroup/B |
255 | #echo "pid of the program running in group A" >/cgroup/B/tasks | 324 | #echo 1 >/cgroup/B/memory.move_charge_at_immigrate |
325 | #echo "pid of the program running in group A" >/cgroup/B/tasks | ||
256 | 326 | ||
257 | You can see charges have been moved by reading *.usage_in_bytes or | 327 | You can see charges have been moved by reading ``*.usage_in_bytes`` or |
258 | memory.stat of both A and B. | 328 | memory.stat of both A and B. |
259 | See 8.2 of Documentation/cgroup-v1/memory.txt to see what value should be | ||
260 | written to move_charge_at_immigrate. | ||
261 | 329 | ||
262 | 9.10 Memory thresholds | 330 | See 8.2 of Documentation/cgroup-v1/memory.rst to see what value should |
331 | be written to move_charge_at_immigrate. | ||
332 | |||
333 | 9.10 Memory thresholds | ||
334 | ---------------------- | ||
335 | |||
263 | Memory controller implements memory thresholds using cgroups notification | 336 | Memory controller implements memory thresholds using cgroups notification |
264 | API. You can use tools/cgroup/cgroup_event_listener.c to test it. | 337 | API. You can use tools/cgroup/cgroup_event_listener.c to test it. |
265 | 338 | ||
266 | (Shell-A) Create cgroup and run event listener | 339 | (Shell-A) Create cgroup and run event listener:: |
267 | # mkdir /cgroup/A | 340 | |
268 | # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M | 341 | # mkdir /cgroup/A |
342 | # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M | ||
343 | |||
344 | (Shell-B) Add task to cgroup and try to allocate and free memory:: | ||
269 | 345 | ||
270 | (Shell-B) Add task to cgroup and try to allocate and free memory | 346 | # echo $$ >/cgroup/A/tasks |
271 | # echo $$ >/cgroup/A/tasks | 347 | # a="$(dd if=/dev/zero bs=1M count=10)" |
272 | # a="$(dd if=/dev/zero bs=1M count=10)" | 348 | # a= |
273 | # a= | ||
274 | 349 | ||
275 | You will see a message from cgroup_event_listener every time you cross | 350 | You will see a message from cgroup_event_listener every time you cross |
276 | the thresholds. | 351 | the thresholds. |
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.rst index a33cedf85427..41bdc038dad9 100644 --- a/Documentation/cgroup-v1/memory.txt +++ b/Documentation/cgroup-v1/memory.rst | |||
@@ -1,22 +1,26 @@ | |||
1 | ========================== | ||
1 | Memory Resource Controller | 2 | Memory Resource Controller |
3 | ========================== | ||
2 | 4 | ||
3 | NOTE: This document is hopelessly outdated and it asks for a complete | 5 | NOTE: |
6 | This document is hopelessly outdated and it asks for a complete | ||
4 | rewrite. It still contains useful information so we are keeping it | 7 | rewrite. It still contains useful information so we are keeping it |
5 | here but make sure to check the current code if you need a deeper | 8 | here but make sure to check the current code if you need a deeper |
6 | understanding. | 9 | understanding. |
7 | 10 | ||
8 | NOTE: The Memory Resource Controller has generically been referred to as the | 11 | NOTE: |
12 | The Memory Resource Controller has generically been referred to as the | ||
9 | memory controller in this document. Do not confuse memory controller | 13 | memory controller in this document. Do not confuse memory controller |
10 | used here with the memory controller that is used in hardware. | 14 | used here with the memory controller that is used in hardware. |
11 | 15 | ||
12 | (For editors) | 16 | (For editors) In this document: |
13 | In this document: | ||
14 | When we mention a cgroup (cgroupfs's directory) with memory controller, | 17 | When we mention a cgroup (cgroupfs's directory) with memory controller, |
15 | we call it "memory cgroup". When you see git-log and source code, you'll | 18 | we call it "memory cgroup". When you see git-log and source code, you'll |
16 | see patch's title and function names tend to use "memcg". | 19 | see patch's title and function names tend to use "memcg". |
17 | In this document, we avoid using it. | 20 | In this document, we avoid using it. |
18 | 21 | ||
19 | Benefits and Purpose of the memory controller | 22 | Benefits and Purpose of the memory controller |
23 | ============================================= | ||
20 | 24 | ||
21 | The memory controller isolates the memory behaviour of a group of tasks | 25 | The memory controller isolates the memory behaviour of a group of tasks |
22 | from the rest of the system. The article on LWN [12] mentions some probable | 26 | from the rest of the system. The article on LWN [12] mentions some probable |
@@ -38,6 +42,7 @@ e. There are several other use cases; find one or use the controller just | |||
38 | Current Status: linux-2.6.34-mmotm(development version of 2010/April) | 42 | Current Status: linux-2.6.34-mmotm(development version of 2010/April) |
39 | 43 | ||
40 | Features: | 44 | Features: |
45 | |||
41 | - accounting anonymous pages, file caches, swap caches usage and limiting them. | 46 | - accounting anonymous pages, file caches, swap caches usage and limiting them. |
42 | - pages are linked to per-memcg LRU exclusively, and there is no global LRU. | 47 | - pages are linked to per-memcg LRU exclusively, and there is no global LRU. |
43 | - optionally, memory+swap usage can be accounted and limited. | 48 | - optionally, memory+swap usage can be accounted and limited. |
@@ -54,41 +59,48 @@ Features: | |||
54 | 59 | ||
55 | Brief summary of control files. | 60 | Brief summary of control files. |
56 | 61 | ||
57 | tasks # attach a task(thread) and show list of threads | 62 | ==================================== ========================================== |
58 | cgroup.procs # show list of processes | 63 | tasks attach a task(thread) and show list of |
59 | cgroup.event_control # an interface for event_fd() | 64 | threads |
60 | memory.usage_in_bytes # show current usage for memory | 65 | cgroup.procs show list of processes |
61 | (See 5.5 for details) | 66 | cgroup.event_control an interface for event_fd() |
62 | memory.memsw.usage_in_bytes # show current usage for memory+Swap | 67 | memory.usage_in_bytes show current usage for memory |
63 | (See 5.5 for details) | 68 | (See 5.5 for details) |
64 | memory.limit_in_bytes # set/show limit of memory usage | 69 | memory.memsw.usage_in_bytes show current usage for memory+Swap |
65 | memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage | 70 | (See 5.5 for details) |
66 | memory.failcnt # show the number of memory usage hits limits | 71 | memory.limit_in_bytes set/show limit of memory usage |
67 | memory.memsw.failcnt # show the number of memory+Swap hits limits | 72 | memory.memsw.limit_in_bytes set/show limit of memory+Swap usage |
68 | memory.max_usage_in_bytes # show max memory usage recorded | 73 | memory.failcnt show the number of memory usage hits limits |
69 | memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded | 74 | memory.memsw.failcnt show the number of memory+Swap hits limits |
70 | memory.soft_limit_in_bytes # set/show soft limit of memory usage | 75 | memory.max_usage_in_bytes show max memory usage recorded |
71 | memory.stat # show various statistics | 76 | memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded |
72 | memory.use_hierarchy # set/show hierarchical account enabled | 77 | memory.soft_limit_in_bytes set/show soft limit of memory usage |
73 | memory.force_empty # trigger forced page reclaim | 78 | memory.stat show various statistics |
74 | memory.pressure_level # set memory pressure notifications | 79 | memory.use_hierarchy set/show hierarchical account enabled |
75 | memory.swappiness # set/show swappiness parameter of vmscan | 80 | memory.force_empty trigger forced page reclaim |
76 | (See sysctl's vm.swappiness) | 81 | memory.pressure_level set memory pressure notifications |
77 | memory.move_charge_at_immigrate # set/show controls of moving charges | 82 | memory.swappiness set/show swappiness parameter of vmscan |
78 | memory.oom_control # set/show oom controls. | 83 | (See sysctl's vm.swappiness) |
79 | memory.numa_stat # show the number of memory usage per numa node | 84 | memory.move_charge_at_immigrate set/show controls of moving charges |
80 | 85 | memory.oom_control set/show oom controls. | |
81 | memory.kmem.limit_in_bytes # set/show hard limit for kernel memory | 86 | memory.numa_stat show the number of memory usage per numa |
82 | memory.kmem.usage_in_bytes # show current kernel memory allocation | 87 | node |
83 | memory.kmem.failcnt # show the number of kernel memory usage hits limits | 88 | |
84 | memory.kmem.max_usage_in_bytes # show max kernel memory usage recorded | 89 | memory.kmem.limit_in_bytes set/show hard limit for kernel memory |
85 | 90 | memory.kmem.usage_in_bytes show current kernel memory allocation | |
86 | memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory | 91 | memory.kmem.failcnt show the number of kernel memory usage |
87 | memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation | 92 | hits limits |
88 | memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits | 93 | memory.kmem.max_usage_in_bytes show max kernel memory usage recorded |
89 | memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded | 94 | |
95 | memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory | ||
96 | memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation | ||
97 | memory.kmem.tcp.failcnt show the number of tcp buf memory usage | ||
98 | hits limits | ||
99 | memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded | ||
100 | ==================================== ========================================== | ||
90 | 101 | ||
91 | 1. History | 102 | 1. History |
103 | ========== | ||
92 | 104 | ||
93 | The memory controller has a long history. A request for comments for the memory | 105 | The memory controller has a long history. A request for comments for the memory |
94 | controller was posted by Balbir Singh [1]. At the time the RFC was posted | 106 | controller was posted by Balbir Singh [1]. At the time the RFC was posted |
@@ -103,6 +115,7 @@ at version 6; it combines both mapped (RSS) and unmapped Page | |||
103 | Cache Control [11]. | 115 | Cache Control [11]. |
104 | 116 | ||
105 | 2. Memory Control | 117 | 2. Memory Control |
118 | ================= | ||
106 | 119 | ||
107 | Memory is a unique resource in the sense that it is present in a limited | 120 | Memory is a unique resource in the sense that it is present in a limited |
108 | amount. If a task requires a lot of CPU processing, the task can spread | 121 | amount. If a task requires a lot of CPU processing, the task can spread |
@@ -120,6 +133,7 @@ are: | |||
120 | The memory controller is the first controller developed. | 133 | The memory controller is the first controller developed. |
121 | 134 | ||
122 | 2.1. Design | 135 | 2.1. Design |
136 | ----------- | ||
123 | 137 | ||
124 | The core of the design is a counter called the page_counter. The | 138 | The core of the design is a counter called the page_counter. The |
125 | page_counter tracks the current memory usage and limit of the group of | 139 | page_counter tracks the current memory usage and limit of the group of |
@@ -127,6 +141,9 @@ processes associated with the controller. Each cgroup has a memory controller | |||
127 | specific data structure (mem_cgroup) associated with it. | 141 | specific data structure (mem_cgroup) associated with it. |
128 | 142 | ||
129 | 2.2. Accounting | 143 | 2.2. Accounting |
144 | --------------- | ||
145 | |||
146 | :: | ||
130 | 147 | ||
131 | +--------------------+ | 148 | +--------------------+ |
132 | | mem_cgroup | | 149 | | mem_cgroup | |
@@ -165,6 +182,7 @@ updated. page_cgroup has its own LRU on cgroup. | |||
165 | (*) page_cgroup structure is allocated at boot/memory-hotplug time. | 182 | (*) page_cgroup structure is allocated at boot/memory-hotplug time. |
166 | 183 | ||
167 | 2.2.1 Accounting details | 184 | 2.2.1 Accounting details |
185 | ------------------------ | ||
168 | 186 | ||
169 | All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. | 187 | All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. |
170 | Some pages which are never reclaimable and will not be on the LRU | 188 | Some pages which are never reclaimable and will not be on the LRU |
@@ -191,6 +209,7 @@ Note: we just account pages-on-LRU because our purpose is to control amount | |||
191 | of used pages; not-on-LRU pages tend to be out-of-control from VM view. | 209 | of used pages; not-on-LRU pages tend to be out-of-control from VM view. |
192 | 210 | ||
193 | 2.3 Shared Page Accounting | 211 | 2.3 Shared Page Accounting |
212 | -------------------------- | ||
194 | 213 | ||
195 | Shared pages are accounted on the basis of the first touch approach. The | 214 | Shared pages are accounted on the basis of the first touch approach. The |
196 | cgroup that first touches a page is accounted for the page. The principle | 215 | cgroup that first touches a page is accounted for the page. The principle |
@@ -207,11 +226,13 @@ be backed into memory in force, charges for pages are accounted against the | |||
207 | caller of swapoff rather than the users of shmem. | 226 | caller of swapoff rather than the users of shmem. |
208 | 227 | ||
209 | 2.4 Swap Extension (CONFIG_MEMCG_SWAP) | 228 | 2.4 Swap Extension (CONFIG_MEMCG_SWAP) |
229 | -------------------------------------- | ||
210 | 230 | ||
211 | Swap Extension allows you to record charge for swap. A swapped-in page is | 231 | Swap Extension allows you to record charge for swap. A swapped-in page is |
212 | charged back to original page allocator if possible. | 232 | charged back to original page allocator if possible. |
213 | 233 | ||
214 | When swap is accounted, following files are added. | 234 | When swap is accounted, following files are added. |
235 | |||
215 | - memory.memsw.usage_in_bytes. | 236 | - memory.memsw.usage_in_bytes. |
216 | - memory.memsw.limit_in_bytes. | 237 | - memory.memsw.limit_in_bytes. |
217 | 238 | ||
@@ -224,14 +245,16 @@ In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. | |||
224 | By using the memsw limit, you can avoid system OOM which can be caused by swap | 245 | By using the memsw limit, you can avoid system OOM which can be caused by swap |
225 | shortage. | 246 | shortage. |
226 | 247 | ||
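For the memsw.limit_in_bytes=3G scenario mentioned above, the setup is a pair of writes (a sketch; the cgroup path and the 2G memory limit are illustrative)::

    # echo 2G > /cgroup/test/memory.limit_in_bytes
    # echo 3G > /cgroup/test/memory.memsw.limit_in_bytes

memsw.limit_in_bytes accounts memory plus swap, so it cannot be set below memory.limit_in_bytes.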
227 | * why 'memory+swap' rather than swap. | 248 | **why 'memory+swap' rather than swap** |
249 | |||
228 | The global LRU(kswapd) can swap out arbitrary pages. Swap-out means | 250 | The global LRU(kswapd) can swap out arbitrary pages. Swap-out means |
229 | to move account from memory to swap...there is no change in usage of | 251 | to move account from memory to swap...there is no change in usage of |
230 | memory+swap. In other words, when we want to limit the usage of swap without | 252 | memory+swap. In other words, when we want to limit the usage of swap without |
231 | affecting global LRU, memory+swap limit is better than just limiting swap from | 253 | affecting global LRU, memory+swap limit is better than just limiting swap from |
232 | an OS point of view. | 254 | an OS point of view. |
233 | 255 | ||
234 | * What happens when a cgroup hits memory.memsw.limit_in_bytes | 256 | **What happens when a cgroup hits memory.memsw.limit_in_bytes** |
257 | |||
235 | When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out | 258 | When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out |
236 | in this cgroup. Then, swap-out will not be done by cgroup routine and file | 259 | in this cgroup. Then, swap-out will not be done by cgroup routine and file |
237 | caches are dropped. But as mentioned above, global LRU can do swapout memory | 260 | caches are dropped. But as mentioned above, global LRU can do swapout memory |
@@ -239,6 +262,7 @@ from it for sanity of the system's memory management state. You can't forbid | |||
239 | it by cgroup. | 262 | it by cgroup. |
240 | 263 | ||
241 | 2.5 Reclaim | 264 | 2.5 Reclaim |
265 | ----------- | ||
242 | 266 | ||
243 | Each cgroup maintains a per cgroup LRU which has the same structure as | 267 | Each cgroup maintains a per cgroup LRU which has the same structure as |
244 | global VM. When a cgroup goes over its limit, we first try | 268 | global VM. When a cgroup goes over its limit, we first try |
@@ -251,29 +275,36 @@ The reclaim algorithm has not been modified for cgroups, except that | |||
251 | pages that are selected for reclaiming come from the per-cgroup LRU | 275 | pages that are selected for reclaiming come from the per-cgroup LRU |
252 | list. | 276 | list. |
253 | 277 | ||
254 | NOTE: Reclaim does not work for the root cgroup, since we cannot set any | 278 | NOTE: |
255 | limits on the root cgroup. | 279 | Reclaim does not work for the root cgroup, since we cannot set any |
280 | limits on the root cgroup. | ||
256 | 281 | ||
257 | Note2: When panic_on_oom is set to "2", the whole system will panic. | 282 | Note2: |
283 | When panic_on_oom is set to "2", the whole system will panic. | ||
258 | 284 | ||
259 | When oom event notifier is registered, event will be delivered. | 285 | When oom event notifier is registered, event will be delivered. |
260 | (See oom_control section) | 286 | (See oom_control section) |
261 | 287 | ||
262 | 2.6 Locking | 288 | 2.6 Locking |
289 | ----------- | ||
263 | 290 | ||
264 | lock_page_cgroup()/unlock_page_cgroup() should not be called under | 291 | lock_page_cgroup()/unlock_page_cgroup() should not be called under |
265 | the i_pages lock. | 292 | the i_pages lock. |
266 | 293 | ||
267 | Other lock order is following: | 294 | Other lock order is following: |
295 | |||
268 | PG_locked. | 296 | PG_locked. |
269 | mm->page_table_lock | 297 | mm->page_table_lock |
270 | pgdat->lru_lock | 298 | pgdat->lru_lock |
271 | lock_page_cgroup. | 299 | lock_page_cgroup. |
300 | |||
272 | In many cases, just lock_page_cgroup() is called. | 301 | In many cases, just lock_page_cgroup() is called. |
302 | |||
273 | per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by | 303 | per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by |
274 | pgdat->lru_lock, it has no lock of its own. | 304 | pgdat->lru_lock, it has no lock of its own. |
275 | 305 | ||
276 | 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) | 306 | 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) |
307 | ----------------------------------------------- | ||
277 | 308 | ||
278 | With the Kernel memory extension, the Memory Controller is able to limit | 309 | With the Kernel memory extension, the Memory Controller is able to limit |
279 | the amount of kernel memory used by the system. Kernel memory is fundamentally | 310 | the amount of kernel memory used by the system. Kernel memory is fundamentally |
@@ -288,6 +319,7 @@ Kernel memory limits are not imposed for the root cgroup. Usage for the root | |||
288 | cgroup may or may not be accounted. The memory used is accumulated into | 319 | cgroup may or may not be accounted. The memory used is accumulated into |
289 | memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. | 320 | memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. |
290 | (currently only for tcp). | 321 | (currently only for tcp). |
322 | |||
291 | The main "kmem" counter is fed into the main counter, so kmem charges will | 323 | The main "kmem" counter is fed into the main counter, so kmem charges will |
292 | also be visible from the user counter. | 324 | also be visible from the user counter. |
293 | 325 | ||
@@ -295,36 +327,42 @@ Currently no soft limit is implemented for kernel memory. It is future work | |||
295 | to trigger slab reclaim when those limits are reached. | 327 | to trigger slab reclaim when those limits are reached. |
296 | 328 | ||
297 | 2.7.1 Current Kernel Memory resources accounted | 329 | 2.7.1 Current Kernel Memory resources accounted |
330 | ----------------------------------------------- | ||
298 | 331 | ||
299 | * stack pages: every process consumes some stack pages. By accounting into | 332 | stack pages: |
300 | kernel memory, we prevent new processes from being created when the kernel | 333 | every process consumes some stack pages. By accounting into |
301 | memory usage is too high. | 334 | kernel memory, we prevent new processes from being created when the kernel |
335 | memory usage is too high. | ||
302 | 336 | ||
303 | * slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy | 337 | slab pages: |
304 | of each kmem_cache is created every time the cache is touched for the first time | 338 | pages allocated by the SLAB or SLUB allocator are tracked. A copy |
305 | from inside the memcg. The creation is done lazily, so some objects can still be | 339 | of each kmem_cache is created every time the cache is touched for the first time |
306 | skipped while the cache is being created. All objects in a slab page should | 340 | from inside the memcg. The creation is done lazily, so some objects can still be |
307 | belong to the same memcg. This only fails to hold when a task is migrated to a | 341 | skipped while the cache is being created. All objects in a slab page should |
308 | different memcg during the page allocation by the cache. | 342 | belong to the same memcg. This only fails to hold when a task is migrated to a |
343 | different memcg during the page allocation by the cache. | ||
309 | 344 | ||
310 | * sockets memory pressure: some socket protocols have memory pressure | 345 | sockets memory pressure: |
311 | thresholds. The Memory Controller allows them to be controlled individually | 346 | some socket protocols have memory pressure |
312 | per cgroup, instead of globally. | 347 | thresholds. The Memory Controller allows them to be controlled individually |
348 | per cgroup, instead of globally. | ||
313 | 349 | ||
314 | * tcp memory pressure: sockets memory pressure for the tcp protocol. | 350 | tcp memory pressure: |
351 | sockets memory pressure for the tcp protocol. | ||
315 | 352 | ||
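As a quick sketch of the accounting described above, the per-group kmem and tcp counters can be read directly (the /sys/fs/cgroup/memory/0 group path is an assumed example, not part of the original text)::

    # cat /sys/fs/cgroup/memory/0/memory.kmem.usage_in_bytes
    # cat /sys/fs/cgroup/memory/0/memory.kmem.tcp.usage_in_bytes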
316 | 2.7.2 Common use cases | 353 | 2.7.2 Common use cases |
354 | ---------------------- | ||
317 | 355 | ||
318 | Because the "kmem" counter is fed to the main user counter, kernel memory can | 356 | Because the "kmem" counter is fed to the main user counter, kernel memory can |
319 | never be limited completely independently of user memory. Say "U" is the user | 357 | never be limited completely independently of user memory. Say "U" is the user |
320 | limit, and "K" the kernel limit. There are three possible ways limits can be | 358 | limit, and "K" the kernel limit. There are three possible ways limits can be |
321 | set: | 359 | set: |
322 | 360 | ||
323 | U != 0, K = unlimited: | 361 | U != 0, K = unlimited: |
324 | This is the standard memcg limitation mechanism already present before kmem | 362 | This is the standard memcg limitation mechanism already present before kmem |
325 | accounting. Kernel memory is completely ignored. | 363 | accounting. Kernel memory is completely ignored. |
326 | 364 | ||
327 | U != 0, K < U: | 365 | U != 0, K < U: |
328 | Kernel memory is a subset of the user memory. This setup is useful in | 366 | Kernel memory is a subset of the user memory. This setup is useful in |
329 | deployments where the total amount of memory per-cgroup is overcommitted. | 367 | deployments where the total amount of memory per-cgroup is overcommitted. |
330 | Overcommitting kernel memory limits is definitely not recommended, since the | 368 | Overcommitting kernel memory limits is definitely not recommended, since the |
@@ -332,19 +370,23 @@ set: | |||
332 | In this case, the admin could set up K so that the sum of all groups is | 370 | In this case, the admin could set up K so that the sum of all groups is |
333 | never greater than the total memory, and freely set U at the cost of his | 371 | never greater than the total memory, and freely set U at the cost of his |
334 | QoS. | 372 | QoS. |
335 | WARNING: In the current implementation, memory reclaim will NOT be | 373 | |
374 | WARNING: | ||
375 | In the current implementation, memory reclaim will NOT be | ||
336 | triggered for a cgroup when it hits K while staying below U, which makes | 376 | triggered for a cgroup when it hits K while staying below U, which makes |
337 | this setup impractical. | 377 | this setup impractical. |
338 | 378 | ||
339 | U != 0, K >= U: | 379 | U != 0, K >= U: |
340 | Kmem charges will also be fed to the user counter and reclaim will be | 380 | Kmem charges will also be fed to the user counter and reclaim will be |
341 | triggered for the cgroup for both kinds of memory. This setup gives the | 381 | triggered for the cgroup for both kinds of memory. This setup gives the |
342 | admin a unified view of memory, and it is also useful for people who just | 382 | admin a unified view of memory, and it is also useful for people who just |
343 | want to track kernel memory usage. | 383 | want to track kernel memory usage. |
344 | 384 | ||
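For instance, a minimal sketch of the "U != 0, K < U" case, assuming the example group used in Section 3 and purely illustrative values::

    # echo 512M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes       # U
    # echo 256M > /sys/fs/cgroup/memory/0/memory.kmem.limit_in_bytes  # K < U

Remember the WARNING above: hitting K while staying below U does not trigger reclaim in the current implementation.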
345 | 3. User Interface | 385 | 3. User Interface |
386 | ================= | ||
346 | 387 | ||
347 | 3.0. Configuration | 388 | 3.0. Configuration |
389 | ------------------ | ||
348 | 390 | ||
349 | a. Enable CONFIG_CGROUPS | 391 | a. Enable CONFIG_CGROUPS |
350 | b. Enable CONFIG_MEMCG | 392 | b. Enable CONFIG_MEMCG |
@@ -352,39 +394,53 @@ c. Enable CONFIG_MEMCG_SWAP (to use swap extension) | |||
352 | d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) | 394 | d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) |
353 | 395 | ||
354 | 3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) | 396 | 3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) |
355 | # mount -t tmpfs none /sys/fs/cgroup | 397 | ------------------------------------------------------------------- |
356 | # mkdir /sys/fs/cgroup/memory | 398 | |
357 | # mount -t cgroup none /sys/fs/cgroup/memory -o memory | 399 | :: |
400 | |||
401 | # mount -t tmpfs none /sys/fs/cgroup | ||
402 | # mkdir /sys/fs/cgroup/memory | ||
403 | # mount -t cgroup none /sys/fs/cgroup/memory -o memory | ||
404 | |||
405 | 3.2. Make the new group and move bash into it:: | ||
406 | |||
407 | # mkdir /sys/fs/cgroup/memory/0 | ||
408 | # echo $$ > /sys/fs/cgroup/memory/0/tasks | ||
358 | 409 | ||
359 | 3.2. Make the new group and move bash into it | 410 | Since we're now in the 0 cgroup, we can alter the memory limit:: |
360 | # mkdir /sys/fs/cgroup/memory/0 | ||
361 | # echo $$ > /sys/fs/cgroup/memory/0/tasks | ||
362 | 411 | ||
363 | Since we're now in the 0 cgroup, we can alter the memory limit: | 412 | # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes |
364 | # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes | ||
365 | 413 | ||
366 | NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, | 414 | NOTE: |
367 | mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.) | 415 | We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, |
416 | mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, | ||
417 | Gibibytes.) | ||
368 | 418 | ||
369 | NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). | 419 | NOTE: |
370 | NOTE: We cannot set limits on the root cgroup any more. | 420 | We can write "-1" to reset the ``*.limit_in_bytes`` (unlimited). |
371 | 421 | ||
372 | # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes | 422 | NOTE: |
373 | 4194304 | 423 | We cannot set limits on the root cgroup any more. |
374 | 424 | ||
375 | We can check the usage: | 425 | :: |
376 | # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes | 426 | |
377 | 1216512 | 427 | # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes |
428 | 4194304 | ||
429 | |||
430 | We can check the usage:: | ||
431 | |||
432 | # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes | ||
433 | 1216512 | ||
378 | 434 | ||
379 | A successful write to this file does not guarantee a successful setting of | 435 | A successful write to this file does not guarantee a successful setting of |
380 | this limit to the value written into the file. This can be due to a | 436 | this limit to the value written into the file. This can be due to a |
381 | number of factors, such as rounding up to page boundaries or the total | 437 | number of factors, such as rounding up to page boundaries or the total |
382 | availability of memory on the system. The user is required to re-read | 438 | availability of memory on the system. The user is required to re-read |
383 | this file after a write to verify the value committed by the kernel. | 439 | this file after a write to verify the value committed by the kernel:: |
384 | 440 | ||
385 | # echo 1 > memory.limit_in_bytes | 441 | # echo 1 > memory.limit_in_bytes |
386 | # cat memory.limit_in_bytes | 442 | # cat memory.limit_in_bytes |
387 | 4096 | 443 | 4096 |
388 | 444 | ||
389 | The memory.failcnt field gives the number of times that the cgroup limit was | 445 | The memory.failcnt field gives the number of times that the cgroup limit was |
390 | exceeded. | 446 | exceeded. |
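For example, to check how many times a group has hit its limit (group path assumed from the examples above; the value shown is illustrative)::

    # cat /sys/fs/cgroup/memory/0/memory.failcnt
    13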
@@ -393,6 +449,7 @@ The memory.stat file gives accounting information. Now, the number of | |||
393 | caches, RSS and Active pages/Inactive pages are shown. | 449 | caches, RSS and Active pages/Inactive pages are shown. |
394 | 450 | ||
395 | 4. Testing | 451 | 4. Testing |
452 | ========== | ||
396 | 453 | ||
397 | For testing features and implementation, see memcg_test.txt. | 454 | For testing features and implementation, see memcg_test.txt. |
398 | 455 | ||
@@ -408,6 +465,7 @@ But the above two are testing extreme situations. | |||
408 | Trying the usual tests under the memory controller is always helpful. | 465 | Trying the usual tests under the memory controller is always helpful. |
409 | 466 | ||
410 | 4.1 Troubleshooting | 467 | 4.1 Troubleshooting |
468 | ------------------- | ||
411 | 469 | ||
412 | Sometimes a user might find that the application under a cgroup is | 470 | Sometimes a user might find that the application under a cgroup is |
413 | terminated by the OOM killer. There are several causes for this: | 471 | terminated by the OOM killer. There are several causes for this: |
@@ -422,6 +480,7 @@ To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and | |||
422 | seeing what happens will be helpful. | 480 | seeing what happens will be helpful. |
423 | 481 | ||
424 | 4.2 Task migration | 482 | 4.2 Task migration |
483 | ------------------ | ||
425 | 484 | ||
426 | When a task migrates from one cgroup to another, its charge is not | 485 | When a task migrates from one cgroup to another, its charge is not |
427 | carried forward by default. The pages allocated from the original cgroup still | 486 | carried forward by default. The pages allocated from the original cgroup still |
@@ -432,6 +491,7 @@ You can move charges of a task along with task migration. | |||
432 | See 8. "Move charges at task migration" | 491 | See 8. "Move charges at task migration" |
433 | 492 | ||
434 | 4.3 Removing a cgroup | 493 | 4.3 Removing a cgroup |
494 | --------------------- | ||
435 | 495 | ||
436 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a | 496 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a |
437 | cgroup might have some charge associated with it, even though all | 497 | cgroup might have some charge associated with it, even though all |
@@ -448,13 +508,15 @@ will be charged as a new owner of it. | |||
448 | 508 | ||
449 | About use_hierarchy, see Section 6. | 509 | About use_hierarchy, see Section 6. |
450 | 510 | ||
451 | 5. Misc. interfaces. | 511 | 5. Misc. interfaces |
512 | =================== | ||
452 | 513 | ||
453 | 5.1 force_empty | 514 | 5.1 force_empty |
515 | --------------- | ||
454 | The memory.force_empty interface is provided to make a cgroup's memory usage empty. | 516 | The memory.force_empty interface is provided to make a cgroup's memory usage empty. |
455 | When writing anything to this | 517 | When writing anything to this:: |
456 | 518 | ||
457 | # echo 0 > memory.force_empty | 519 | # echo 0 > memory.force_empty |
458 | 520 | ||
459 | the cgroup's memory will be reclaimed, freeing as many pages as possible. | 521 | the cgroup's memory will be reclaimed, freeing as many pages as possible. |
460 | 522 | ||
@@ -471,50 +533,61 @@ About use_hierarchy, see Section 6. | |||
471 | About use_hierarchy, see Section 6. | 533 | About use_hierarchy, see Section 6. |
472 | 534 | ||
473 | 5.2 stat file | 535 | 5.2 stat file |
536 | ------------- | ||
474 | 537 | ||
475 | The memory.stat file includes the following statistics: | 538 | The memory.stat file includes the following statistics: |
476 | 539 | ||
477 | # per-memory cgroup local status | 540 | per-memory cgroup local status |
478 | cache - # of bytes of page cache memory. | 541 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
479 | rss - # of bytes of anonymous and swap cache memory (includes | 542 | |
543 | =============== =============================================================== | ||
544 | cache # of bytes of page cache memory. | ||
545 | rss # of bytes of anonymous and swap cache memory (includes | ||
480 | transparent hugepages). | 546 | transparent hugepages). |
481 | rss_huge - # of bytes of anonymous transparent hugepages. | 547 | rss_huge # of bytes of anonymous transparent hugepages. |
482 | mapped_file - # of bytes of mapped file (includes tmpfs/shmem) | 548 | mapped_file # of bytes of mapped file (includes tmpfs/shmem) |
483 | pgpgin - # of charging events to the memory cgroup. The charging | 549 | pgpgin # of charging events to the memory cgroup. The charging |
484 | event happens each time a page is accounted as either mapped | 550 | event happens each time a page is accounted as either mapped |
485 | anon page(RSS) or cache page(Page Cache) to the cgroup. | 551 | anon page(RSS) or cache page(Page Cache) to the cgroup. |
486 | pgpgout - # of uncharging events to the memory cgroup. The uncharging | 552 | pgpgout # of uncharging events to the memory cgroup. The uncharging |
487 | event happens each time a page is unaccounted from the cgroup. | 553 | event happens each time a page is unaccounted from the cgroup. |
488 | swap - # of bytes of swap usage | 554 | swap # of bytes of swap usage |
489 | dirty - # of bytes that are waiting to get written back to the disk. | 555 | dirty # of bytes that are waiting to get written back to the disk. |
490 | writeback - # of bytes of file/anon cache that are queued for syncing to | 556 | writeback # of bytes of file/anon cache that are queued for syncing to |
491 | disk. | 557 | disk. |
492 | inactive_anon - # of bytes of anonymous and swap cache memory on inactive | 558 | inactive_anon # of bytes of anonymous and swap cache memory on inactive |
493 | LRU list. | 559 | LRU list. |
494 | active_anon - # of bytes of anonymous and swap cache memory on active | 560 | active_anon # of bytes of anonymous and swap cache memory on active |
495 | LRU list. | 561 | LRU list. |
496 | inactive_file - # of bytes of file-backed memory on inactive LRU list. | 562 | inactive_file # of bytes of file-backed memory on inactive LRU list. |
497 | active_file - # of bytes of file-backed memory on active LRU list. | 563 | active_file # of bytes of file-backed memory on active LRU list. |
498 | unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). | 564 | unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). |
499 | 565 | =============== =============================================================== | |
500 | # status considering hierarchy (see memory.use_hierarchy settings) | 566 | |
501 | 567 | status considering hierarchy (see memory.use_hierarchy settings) | |
502 | hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy | 568 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
503 | under which the memory cgroup is | 569 | |
504 | hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to | 570 | ========================= =================================================== |
505 | hierarchy under which memory cgroup is. | 571 | hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy |
506 | 572 | under which the memory cgroup is | |
507 | total_<counter> - # hierarchical version of <counter>, which in | 573 | hierarchical_memsw_limit # of bytes of memory+swap limit with regard to |
508 | addition to the cgroup's own value includes the | 574 | hierarchy under which memory cgroup is. |
509 | sum of all hierarchical children's values of | 575 | |
510 | <counter>, i.e. total_cache | 576 | total_<counter> # hierarchical version of <counter>, which in |
511 | 577 | addition to the cgroup's own value includes the | |
512 | # The following additional stats are dependent on CONFIG_DEBUG_VM. | 578 | sum of all hierarchical children's values of |
513 | 579 | <counter>, i.e. total_cache | |
514 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) | 580 | ========================= =================================================== |
515 | recent_rotated_file - VM internal parameter. (see mm/vmscan.c) | 581 | |
516 | recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) | 582 | The following additional stats are dependent on CONFIG_DEBUG_VM |
517 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) | 583 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
584 | |||
585 | ========================= ======================================== | ||
586 | recent_rotated_anon VM internal parameter. (see mm/vmscan.c) | ||
587 | recent_rotated_file VM internal parameter. (see mm/vmscan.c) | ||
588 | recent_scanned_anon VM internal parameter. (see mm/vmscan.c) | ||
589 | recent_scanned_file VM internal parameter. (see mm/vmscan.c) | ||
590 | ========================= ======================================== | ||
518 | 591 | ||
519 | Memo: | 592 | Memo: |
520 | recent_rotated means the recent frequency of LRU rotation. | 593 | recent_rotated means the recent frequency of LRU rotation. |
@@ -525,12 +598,15 @@ Note: | |||
525 | Only anonymous and swap cache memory is listed as part of 'rss' stat. | 598 | Only anonymous and swap cache memory is listed as part of 'rss' stat. |
526 | This should not be confused with the true 'resident set size' or the | 599 | This should not be confused with the true 'resident set size' or the |
527 | amount of physical memory used by the cgroup. | 600 | amount of physical memory used by the cgroup. |
601 | |||
528 | 'rss + mapped_file' will give you the resident set size of the cgroup. | 602 | 'rss + mapped_file' will give you the resident set size of the cgroup. |
603 | |||
529 | (Note: file and shmem may be shared among other cgroups. In that case, | 604 | (Note: file and shmem may be shared among other cgroups. In that case, |
530 | mapped_file is accounted only when the memory cgroup is the owner of the page | 605 | mapped_file is accounted only when the memory cgroup is the owner of the page |
531 | cache.) | 606 | cache.) |
532 | 607 | ||
533 | 5.3 swappiness | 608 | 5.3 swappiness |
609 | -------------- | ||
534 | 610 | ||
535 | Overrides /proc/sys/vm/swappiness for the particular group. The tunable | 611 | Overrides /proc/sys/vm/swappiness for the particular group. The tunable |
536 | in the root cgroup corresponds to the global swappiness setting. | 612 | in the root cgroup corresponds to the global swappiness setting. |
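For example, to bias a particular group away from swapping (group path and value illustrative; the default mirrors the global /proc/sys/vm/swappiness setting)::

    # cat /sys/fs/cgroup/memory/0/memory.swappiness
    60
    # echo 10 > /sys/fs/cgroup/memory/0/memory.swappiness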
@@ -541,16 +617,19 @@ there is a swap storage available. This might lead to memcg OOM killer | |||
541 | if there are no file pages to reclaim. | 617 | if there are no file pages to reclaim. |
542 | 618 | ||
543 | 5.4 failcnt | 619 | 5.4 failcnt |
620 | ----------- | ||
544 | 621 | ||
545 | A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. | 622 | A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. |
546 | This failcnt (== failure count) shows the number of times that a usage counter | 623 | This failcnt (== failure count) shows the number of times that a usage counter |
547 | hit its limit. When a memory cgroup hits a limit, failcnt increases and | 624 | hit its limit. When a memory cgroup hits a limit, failcnt increases and |
548 | memory under it will be reclaimed. | 625 | memory under it will be reclaimed. |
549 | 626 | ||
550 | You can reset failcnt by writing 0 to the failcnt file. | 627 | You can reset failcnt by writing 0 to the failcnt file:: |
551 | # echo 0 > .../memory.failcnt | 628 | |
629 | # echo 0 > .../memory.failcnt | ||
552 | 630 | ||
553 | 5.5 usage_in_bytes | 631 | 5.5 usage_in_bytes |
632 | ------------------ | ||
554 | 633 | ||
555 | For efficiency, like other kernel components, the memory cgroup uses some optimization | 634 | For efficiency, like other kernel components, the memory cgroup uses some optimization |
556 | to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the | 635 | to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the |
@@ -560,6 +639,7 @@ If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) | |||
560 | value in memory.stat (see 5.2). | 639 | value in memory.stat (see 5.2). |
561 | 640 | ||
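A sketch of such a comparison (group path illustrative; the stat names are those listed in Section 5.2)::

    # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
    # grep -E '^(rss|cache|swap) ' /sys/fs/cgroup/memory/0/memory.stat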
562 | 5.6 numa_stat | 641 | 5.6 numa_stat |
642 | ------------- | ||
563 | 643 | ||
564 | This is similar to numa_maps but operates on a per-memcg basis. This is | 644 | This is similar to numa_maps but operates on a per-memcg basis. This is |
565 | useful for providing visibility into the numa locality information within | 645 | useful for providing visibility into the numa locality information within |
@@ -571,22 +651,23 @@ Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" | |||
571 | per-node page counts including "hierarchical_<counter>" which sums up all | 651 | per-node page counts including "hierarchical_<counter>" which sums up all |
572 | hierarchical children's values in addition to the memcg's own value. | 652 | hierarchical children's values in addition to the memcg's own value. |
573 | 653 | ||
574 | The output format of memory.numa_stat is: | 654 | The output format of memory.numa_stat is:: |
575 | 655 | ||
576 | total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... | 656 | total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... |
577 | file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... | 657 | file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... |
578 | anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... | 658 | anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... |
579 | unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ... | 659 | unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ... |
580 | hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ... | 660 | hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ... |
581 | 661 | ||
582 | The "total" count is sum of file + anon + unevictable. | 662 | The "total" count is sum of file + anon + unevictable. |
583 | 663 | ||
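An illustrative read of the file on a two-node machine (group path and all numbers are made up for the example)::

    # cat /sys/fs/cgroup/memory/0/memory.numa_stat
    total=1567 N0=1254 N1=313
    file=1422 N0=1188 N1=234
    anon=145 N0=66 N1=79
    unevictable=0 N0=0 N1=0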
584 | 6. Hierarchy support | 664 | 6. Hierarchy support |
665 | ==================== | ||
585 | 666 | ||
586 | The memory controller supports a deep hierarchy and hierarchical accounting. | 667 | The memory controller supports a deep hierarchy and hierarchical accounting. |
587 | The hierarchy is created by creating the appropriate cgroups in the | 668 | The hierarchy is created by creating the appropriate cgroups in the |
588 | cgroup filesystem. Consider, for example, the following cgroup filesystem | 669 | cgroup filesystem. Consider, for example, the following cgroup filesystem |
589 | hierarchy | 670 | hierarchy:: |
590 | 671 | ||
591 | root | 672 | root |
592 | / | \ | 673 | / | \ |
@@ -603,24 +684,28 @@ limit, the reclaim algorithm reclaims from the tasks in the ancestor and the | |||
603 | children of the ancestor. | 684 | children of the ancestor. |
604 | 685 | ||
605 | 6.1 Enabling hierarchical accounting and reclaim | 686 | 6.1 Enabling hierarchical accounting and reclaim |
687 | ------------------------------------------------ | ||
606 | 688 | ||
607 | A memory cgroup by default disables the hierarchy feature. Support | 689 | A memory cgroup by default disables the hierarchy feature. Support |
608 | can be enabled by writing 1 to the memory.use_hierarchy file of the root cgroup | 690 | can be enabled by writing 1 to the memory.use_hierarchy file of the root cgroup:: |
609 | 691 | ||
610 | # echo 1 > memory.use_hierarchy | 692 | # echo 1 > memory.use_hierarchy |
611 | 693 | ||
612 | The feature can be disabled by | 694 | The feature can be disabled by:: |
613 | 695 | ||
614 | # echo 0 > memory.use_hierarchy | 696 | # echo 0 > memory.use_hierarchy |
615 | 697 | ||
616 | NOTE1: Enabling/disabling will fail if either the cgroup already has other | 698 | NOTE1: |
699 | Enabling/disabling will fail if either the cgroup already has other | ||
617 | cgroups created below it, or if the parent cgroup has use_hierarchy | 700 | cgroups created below it, or if the parent cgroup has use_hierarchy |
618 | enabled. | 701 | enabled. |
619 | 702 | ||
620 | NOTE2: When panic_on_oom is set to "2", the whole system will panic in | 703 | NOTE2: |
704 | When panic_on_oom is set to "2", the whole system will panic in | ||
621 | case of an OOM event in any cgroup. | 705 | case of an OOM event in any cgroup. |
622 | 706 | ||
623 | 7. Soft limits | 707 | 7. Soft limits |
708 | ============== | ||
624 | 709 | ||
625 | Soft limits allow for greater sharing of memory. The idea behind soft limits | 710 | Soft limits allow for greater sharing of memory. The idea behind soft limits |
626 | is to allow control groups to use as much of the memory as needed, provided | 711 | is to allow control groups to use as much of the memory as needed, provided |
@@ -640,22 +725,26 @@ hints/setup. Currently soft limit based reclaim is set up such that | |||
640 | it gets invoked from balance_pgdat (kswapd). | 725 | it gets invoked from balance_pgdat (kswapd). |
641 | 726 | ||
642 | 7.1 Interface | 727 | 7.1 Interface |
728 | ------------- | ||
643 | 729 | ||
644 | Soft limits can be set up by using the following commands (in this example we | 730 | Soft limits can be set up by using the following commands (in this example we |
645 | assume a soft limit of 256 MiB) | 731 | assume a soft limit of 256 MiB):: |
646 | 732 | ||
647 | # echo 256M > memory.soft_limit_in_bytes | 733 | # echo 256M > memory.soft_limit_in_bytes |
648 | 734 | ||
649 | If we want to change this to 1G, we can at any time use | 735 | If we want to change this to 1G, we can at any time use:: |
650 | 736 | ||
651 | # echo 1G > memory.soft_limit_in_bytes | 737 | # echo 1G > memory.soft_limit_in_bytes |
652 | 738 | ||
653 | NOTE1: Soft limits take effect over a long period of time, since they involve | 739 | NOTE1: |
740 | Soft limits take effect over a long period of time, since they involve | ||
654 | reclaiming memory for balancing between memory cgroups. | 741 | reclaiming memory for balancing between memory cgroups. |
655 | NOTE2: It is recommended to always set the soft limit below the hard limit, | 742 | NOTE2: |
743 | It is recommended to always set the soft limit below the hard limit, | ||
656 | otherwise the hard limit will take precedence. | 744 | otherwise the hard limit will take precedence. |
657 | 745 | ||
658 | 8. Move charges at task migration | 746 | 8. Move charges at task migration |
747 | ================================= | ||
659 | 748 | ||
660 | Users can move charges associated with a task along with task migration, that | 749 | Users can move charges associated with a task along with task migration, that |
661 | is, uncharge the task's pages from the old cgroup and charge them to the new cgroup. | 750 | is, uncharge the task's pages from the old cgroup and charge them to the new cgroup. |
@@ -663,60 +752,71 @@ This feature is not supported in !CONFIG_MMU environments because of lack of | |||
663 | page tables. | 752 | page tables. |
664 | 753 | ||
665 | 8.1 Interface | 754 | 8.1 Interface |
755 | ------------- | ||
666 | 756 | ||
667 | This feature is disabled by default. It can be enabled (and disabled again) by | 757 | This feature is disabled by default. It can be enabled (and disabled again) by |
668 | writing to memory.move_charge_at_immigrate of the destination cgroup. | 758 | writing to memory.move_charge_at_immigrate of the destination cgroup. |
669 | 759 | ||
670 | If you want to enable it: | 760 | If you want to enable it:: |
671 | 761 | ||
672 | # echo (some positive value) > memory.move_charge_at_immigrate | 762 | # echo (some positive value) > memory.move_charge_at_immigrate |
673 | 763 | ||
674 | Note: Each bit of move_charge_at_immigrate has its own meaning about what type | 764 | Note: |
765 | Each bit of move_charge_at_immigrate has its own meaning about what type | ||
675 | of charges should be moved. See 8.2 for details. | 766 | of charges should be moved. See 8.2 for details. |
676 | Note: Charges are moved only when you move mm->owner, in other words, | 767 | Note: |
768 | Charges are moved only when you move mm->owner, in other words, | ||
677 | the leader of a thread group. | 769 | the leader of a thread group. |
678 | Note: If we cannot find enough space for the task in the destination cgroup, we | 770 | Note: |
771 | If we cannot find enough space for the task in the destination cgroup, we | ||
679 | try to make space by reclaiming memory. Task migration may fail if we | 772 | try to make space by reclaiming memory. Task migration may fail if we |
680 | cannot make enough space. | 773 | cannot make enough space. |
681 | Note: It can take several seconds if you move many charges. | 774 | Note: |
775 | It can take several seconds if you move many charges. | ||
682 | 776 | ||
683 | And if you want to disable it again: | 777 | And if you want to disable it again:: |
684 | 778 | ||
685 | # echo 0 > memory.move_charge_at_immigrate | 779 | # echo 0 > memory.move_charge_at_immigrate |
686 | 780 | ||
687 | 8.2 Type of charges which can be moved | 781 | 8.2 Type of charges which can be moved |
782 | -------------------------------------- | ||
688 | 783 | ||
689 | Each bit in move_charge_at_immigrate has its own meaning about what type of | 784 | Each bit in move_charge_at_immigrate has its own meaning about what type of |
690 | charges should be moved. But in any case, it must be noted that an account of | 785 | charges should be moved. But in any case, it must be noted that an account of |
691 | a page or a swap can be moved only when it is charged to the task's current | 786 | a page or a swap can be moved only when it is charged to the task's current |
692 | (old) memory cgroup. | 787 | (old) memory cgroup. |
693 | 788 | ||
694 | bit | what type of charges would be moved ? | 789 | +---+--------------------------------------------------------------------------+ |
695 | -----+------------------------------------------------------------------------ | 790 | |bit| what type of charges would be moved ? | |
696 | 0 | A charge of an anonymous page (or swap of it) used by the target task. | 791 | +===+==========================================================================+ |
697 | | You must enable Swap Extension (see 2.4) to enable move of swap charges. | 792 | | 0 | A charge of an anonymous page (or swap of it) used by the target task. | |
698 | -----+------------------------------------------------------------------------ | 793 | | | You must enable Swap Extension (see 2.4) to enable move of swap charges. | |
699 | 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | 794 | +---+--------------------------------------------------------------------------+ |
700 | | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | 795 | | 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | |
701 | | anonymous pages, file pages (and swaps) in the range mmapped by the task | 796 | | | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | |
702 | | will be moved even if the task hasn't done page fault, i.e. they might | 797 | | | anonymous pages, file pages (and swaps) in the range mmapped by the task | |
703 | | not be the task's "RSS", but other task's "RSS" that maps the same file. | 798 | | | will be moved even if the task hasn't done page fault, i.e. they might | |
704 | | And mapcount of the page is ignored (the page can be moved even if | 799 | | | not be the task's "RSS", but other task's "RSS" that maps the same file. | |
705 | | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to | 800 | | | And mapcount of the page is ignored (the page can be moved even if | |
706 | | enable move of swap charges. | 801 | | | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to | |
802 | | | enable move of swap charges. | | ||
803 | +---+--------------------------------------------------------------------------+ | ||
707 | 804 | ||
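For instance, to move both anonymous charges (bit 0) and mmapped file charges (bit 1), write the bitwise OR of the two bits, i.e. 3 (group path illustrative)::

    # echo 3 > /sys/fs/cgroup/memory/0/memory.move_charge_at_immigrate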
708 | 8.3 TODO | 805 | 8.3 TODO |
806 | -------- | ||
709 | 807 | ||
710 | - All moving charge operations are done under cgroup_mutex. It's not good | 808 | - All moving charge operations are done under cgroup_mutex. It's not good |
711 | behavior to hold the mutex for too long, so we may need some trick. | 809 | behavior to hold the mutex for too long, so we may need some trick. |
712 | 810 | ||
713 | 9. Memory thresholds | 811 | 9. Memory thresholds |
812 | ==================== | ||
714 | 813 | ||
715 | The memory cgroup implements memory thresholds using the cgroups notification | 814 | The memory cgroup implements memory thresholds using the cgroups notification |
716 | API (see cgroups.txt). It allows registering multiple memory and memsw | 815 | API (see cgroups.txt). It allows registering multiple memory and memsw |
717 | thresholds and delivers notifications when a threshold is crossed. | 816 | thresholds and delivers notifications when a threshold is crossed. |
718 | 817 | ||
719 | To register a threshold, an application must: | 818 | To register a threshold, an application must: |
819 | |||
720 | - create an eventfd using eventfd(2); | 820 | - create an eventfd using eventfd(2); |
721 | - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; | 821 | - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; |
722 | - write a string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to | 822 | - write a string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to |
@@ -728,6 +828,7 @@ threshold in any direction. | |||
728 | It is applicable to both root and non-root cgroups. | 828 | It is applicable to both root and non-root cgroups. |
729 | 829 | ||
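As a sketch of the registration sequence, the cgroup_event_listener helper from tools/cgroup (used again in Section 11 below) performs the eventfd/open/write steps for you; here it is assumed to pass its last argument through as the threshold in bytes::

    # cd /sys/fs/cgroup/memory/0
    # cgroup_event_listener memory.usage_in_bytes 5242880 &

The listener then blocks on the eventfd and reports each time usage crosses the 5 MiB threshold in either direction.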
730 | 10. OOM Control | 830 | 10. OOM Control |
831 | =============== | ||
731 | 832 | ||
732 | The memory.oom_control file is used for OOM notification and other controls. | 833 | The memory.oom_control file is used for OOM notification and other controls. |
733 | 834 | ||
@@ -736,6 +837,7 @@ API (See cgroups.txt). It allows to register multiple OOM notification | |||
736 | deliveries and to get a notification when an OOM occurs. | 837 | deliveries and to get a notification when an OOM occurs. |
737 | 838 | ||
738 | To register a notifier, an application must: | 839 | To register a notifier, an application must: |
840 | |||
739 | - create an eventfd using eventfd(2) | 841 | - create an eventfd using eventfd(2) |
740 | - open memory.oom_control file | 842 | - open memory.oom_control file |
741 | - write a string like "<event_fd> <fd of memory.oom_control>" to | 843 | - write a string like "<event_fd> <fd of memory.oom_control>" to |
@@ -752,8 +854,11 @@ If OOM-killer is disabled, tasks under cgroup will hang/sleep | |||
752 | in the memory cgroup's OOM-waitqueue when they request accountable memory. | 854 | in the memory cgroup's OOM-waitqueue when they request accountable memory. |
753 | 855 | ||
754 | To get them running again, you have to relax the memory cgroup's OOM status by | 856 | To get them running again, you have to relax the memory cgroup's OOM status by |
857 | |||
755 | * enlarging the limit or reducing usage. | 858 | * enlarging the limit or reducing usage. |
859 | |||
756 | To reduce usage, | 860 | To reduce usage, |
861 | |||
757 | * kill some tasks. | 862 | * kill some tasks. |
758 | * move some tasks to another group with account migration. | 863 | * move some tasks to another group with account migration. |
759 | * remove some files (on tmpfs?) | 864 | * remove some files (on tmpfs?) |
@@ -761,11 +866,14 @@ To reduce usage, | |||
761 | Then, the stopped tasks will work again. | 866 | Then, the stopped tasks will work again. |
762 | 867 | ||
763 | On reading, the current OOM status is shown. | 868 | On reading, the current OOM status is shown. |
764 | oom_kill_disable 0 or 1 (if 1, oom-killer is disabled) | 869 | |
765 | under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may | 870 | - oom_kill_disable 0 or 1 |
766 | be stopped.) | 871 | (if 1, oom-killer is disabled) |
872 | - under_oom 0 or 1 | ||
873 | (if 1, the memory cgroup is under OOM, tasks may be stopped.) | ||
767 | 874 | ||
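For example, to disable the OOM killer for a group and confirm the status (group path illustrative)::

    # echo 1 > /sys/fs/cgroup/memory/0/memory.oom_control
    # cat /sys/fs/cgroup/memory/0/memory.oom_control
    oom_kill_disable 1
    under_oom 0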
768 | 11. Memory Pressure | 875 | 11. Memory Pressure |
876 | =================== | ||
769 | 877 | ||
770 | The pressure level notifications can be used to monitor the memory | 878 | The pressure level notifications can be used to monitor the memory |
771 | allocation cost; based on the pressure, applications can implement | 879 | allocation cost; based on the pressure, applications can implement |
@@ -840,21 +948,22 @@ Test: | |||
840 | 948 | ||
841 | Here is a small script example that makes a new cgroup, sets up a | 949 | Here is a small script example that makes a new cgroup, sets up a |
842 | memory limit, sets up a notification in the cgroup and then makes the child | 950 | memory limit, sets up a notification in the cgroup and then makes the child |
843 | cgroup experience critical pressure: | 951 | cgroup experience critical pressure:: |
844 | 952 | ||
845 | # cd /sys/fs/cgroup/memory/ | 953 | # cd /sys/fs/cgroup/memory/ |
846 | # mkdir foo | 954 | # mkdir foo |
847 | # cd foo | 955 | # cd foo |
848 | # cgroup_event_listener memory.pressure_level low,hierarchy & | 956 | # cgroup_event_listener memory.pressure_level low,hierarchy & |
849 | # echo 8000000 > memory.limit_in_bytes | 957 | # echo 8000000 > memory.limit_in_bytes |
850 | # echo 8000000 > memory.memsw.limit_in_bytes | 958 | # echo 8000000 > memory.memsw.limit_in_bytes |
851 | # echo $$ > tasks | 959 | # echo $$ > tasks |
852 | # dd if=/dev/zero | read x | 960 | # dd if=/dev/zero | read x |
853 | 961 | ||
854 | (Expect a bunch of notifications, and eventually, the oom-killer will | 962 | (Expect a bunch of notifications, and eventually, the oom-killer will |
855 | trigger.) | 963 | trigger.) |
856 | 964 | ||
857 | 12. TODO | 965 | 12. TODO |
966 | ======== | ||
858 | 967 | ||
859 | 1. Make per-cgroup scanner reclaim not-shared pages first | 968 | 1. Make per-cgroup scanner reclaim not-shared pages first |
860 | 2. Teach controller to account for shared-pages | 969 | 2. Teach controller to account for shared-pages |
@@ -862,11 +971,13 @@ Test: | |||
862 | not yet hit but the usage is getting closer | 971 | not yet hit but the usage is getting closer |
863 | 972 | ||
864 | Summary | 973 | Summary |
974 | ======= | ||
865 | 975 | ||
866 | Overall, the memory controller has been a stable controller and has been | 976 | Overall, the memory controller has been a stable controller and has been |
867 | commented and discussed quite extensively in the community. | 977 | commented and discussed quite extensively in the community. |
868 | 978 | ||
869 | References | 979 | References |
980 | ========== | ||
870 | 981 | ||
871 | 1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ | 982 | 1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ |
872 | 2. Singh, Balbir. Memory Controller (RSS Control), | 983 | 2. Singh, Balbir. Memory Controller (RSS Control), |
diff --git a/Documentation/cgroup-v1/net_cls.txt b/Documentation/cgroup-v1/net_cls.rst index ec182346dea2..a2cf272af7a0 100644 --- a/Documentation/cgroup-v1/net_cls.txt +++ b/Documentation/cgroup-v1/net_cls.rst | |||
@@ -1,5 +1,6 @@ | |||
1 | ========================= | ||
1 | Network classifier cgroup | 2 | Network classifier cgroup |
2 | ------------------------- | 3 | ========================= |
3 | 4 | ||
4 | The Network classifier cgroup provides an interface to | 5 | The Network classifier cgroup provides an interface to |
5 | tag network packets with a class identifier (classid). | 6 | tag network packets with a class identifier (classid). |
@@ -17,23 +18,27 @@ values is 0xAAAABBBB; AAAA is the major handle number and BBBB | |||
17 | is the minor handle number. | 18 | is the minor handle number. |
18 | Reading net_cls.classid yields a decimal result. | 19 | Reading net_cls.classid yields a decimal result. |
19 | 20 | ||
20 | Example: | 21 | Example:: |
21 | mkdir /sys/fs/cgroup/net_cls | ||
22 | mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls | ||
23 | mkdir /sys/fs/cgroup/net_cls/0 | ||
24 | echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid | ||
25 | - setting a 10:1 handle. | ||
26 | 22 | ||
27 | cat /sys/fs/cgroup/net_cls/0/net_cls.classid | 23 | mkdir /sys/fs/cgroup/net_cls |
28 | 1048577 | 24 | mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls |
25 | mkdir /sys/fs/cgroup/net_cls/0 | ||
26 | echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid | ||
29 | 27 | ||
30 | configuring tc: | 28 | - setting a 10:1 handle:: |
31 | tc qdisc add dev eth0 root handle 10: htb | ||
32 | 29 | ||
33 | tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit | 30 | cat /sys/fs/cgroup/net_cls/0/net_cls.classid |
34 | - creating traffic class 10:1 | 31 | 1048577 |
35 | 32 | ||
36 | tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup | 33 | - configuring tc:: |
37 | 34 | ||
38 | configuring iptables, basic example: | 35 | tc qdisc add dev eth0 root handle 10: htb |
39 | iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP | 36 | tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit |
37 | |||
38 | - creating traffic class 10:1:: | ||
39 | |||
40 | tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup | ||
41 | |||
42 | configuring iptables, basic example:: | ||
43 | |||
44 | iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP | ||
diff --git a/Documentation/cgroup-v1/net_prio.txt b/Documentation/cgroup-v1/net_prio.rst index a82cbd28ea8a..b40905871c64 100644 --- a/Documentation/cgroup-v1/net_prio.txt +++ b/Documentation/cgroup-v1/net_prio.rst | |||
@@ -1,5 +1,6 @@ | |||
1 | ======================= | ||
1 | Network priority cgroup | 2 | Network priority cgroup |
2 | ------------------------- | 3 | ======================= |
3 | 4 | ||
4 | The Network priority cgroup provides an interface to allow an administrator to | 5 | The Network priority cgroup provides an interface to allow an administrator to |
5 | dynamically set the priority of network traffic generated by various | 6 | dynamically set the priority of network traffic generated by various |
@@ -14,9 +15,9 @@ SO_PRIORITY socket option. This however, is not always possible because: | |||
14 | 15 | ||
15 | This cgroup allows an administrator to assign a process to a group which defines | 16 | This cgroup allows an administrator to assign a process to a group which defines |
16 | the priority of egress traffic on a given interface. Network priority groups can | 17 | the priority of egress traffic on a given interface. Network priority groups can |
17 | be created by first mounting the cgroup filesystem. | 18 | be created by first mounting the cgroup filesystem:: |
18 | 19 | ||
19 | # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio | 20 | # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio |
20 | 21 | ||
21 | With the above step, the initial group acting as the parent accounting group | 22 | With the above step, the initial group acting as the parent accounting group |
22 | becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in | 23 | becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in |
@@ -25,17 +26,18 @@ the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. | |||
25 | Each net_prio cgroup contains two files that are subsystem-specific: | 26 | Each net_prio cgroup contains two files that are subsystem-specific: |
26 | 27 | ||
27 | net_prio.prioidx | 28 | net_prio.prioidx |
28 | This file is read-only, and is simply informative. It contains a unique integer | 29 | This file is read-only, and is simply informative. It contains a unique |
29 | value that the kernel uses as an internal representation of this cgroup. | 30 | integer value that the kernel uses as an internal representation of this |
31 | cgroup. | ||
30 | 32 | ||
31 | net_prio.ifpriomap | 33 | net_prio.ifpriomap |
32 | This file contains a map of the priorities assigned to traffic originating from | 34 | This file contains a map of the priorities assigned to traffic originating |
33 | processes in this group and egressing the system on various interfaces. It | 35 | from processes in this group and egressing the system on various interfaces. |
34 | contains a list of tuples in the form <ifname priority>. Contents of this file | 36 | It contains a list of tuples in the form <ifname priority>. Contents of this |
35 | can be modified by echoing a string into the file using the same tuple format. | 37 | file can be modified by echoing a string into the file using the same tuple |
36 | for example: | 38 | format. For example:: |
37 | 39 | ||
38 | echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap | 40 | echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap |
39 | 41 | ||
40 | This command would force any traffic originating from processes belonging to the | 42 | This command would force any traffic originating from processes belonging to the |
41 | iscsi net_prio cgroup and egressing on interface eth0 to have the priority of | 43 | iscsi net_prio cgroup and egressing on interface eth0 to have the priority of |
diff --git a/Documentation/cgroup-v1/pids.txt b/Documentation/cgroup-v1/pids.rst index e105d708ccde..6acebd9e72c8 100644 --- a/Documentation/cgroup-v1/pids.txt +++ b/Documentation/cgroup-v1/pids.rst | |||
@@ -1,5 +1,6 @@ | |||
1 | Process Number Controller | 1 | ========================= |
2 | ========================= | 2 | Process Number Controller |
3 | ========================= | ||
3 | 4 | ||
4 | Abstract | 5 | Abstract |
5 | -------- | 6 | -------- |
@@ -34,55 +35,58 @@ pids.current tracks all child cgroup hierarchies, so parent/pids.current is a | |||
34 | superset of parent/child/pids.current. | 35 | superset of parent/child/pids.current. |
35 | 36 | ||
36 | The pids.events file contains event counters: | 37 | The pids.events file contains event counters: |
38 | |||
37 | - max: Number of times fork failed because limit was hit. | 39 | - max: Number of times fork failed because limit was hit. |
38 | 40 | ||
39 | Example | 41 | Example |
40 | ------- | 42 | ------- |
41 | 43 | ||
42 | First, we mount the pids controller: | 44 | First, we mount the pids controller:: |
43 | # mkdir -p /sys/fs/cgroup/pids | 45 | |
44 | # mount -t cgroup -o pids none /sys/fs/cgroup/pids | 46 | # mkdir -p /sys/fs/cgroup/pids |
47 | # mount -t cgroup -o pids none /sys/fs/cgroup/pids | ||
48 | |||
49 | Then we create a hierarchy, set limits and attach processes to it:: | ||
45 | 50 | ||
46 | Then we create a hierarchy, set limits and attach processes to it: | 51 | # mkdir -p /sys/fs/cgroup/pids/parent/child |
47 | # mkdir -p /sys/fs/cgroup/pids/parent/child | 52 | # echo 2 > /sys/fs/cgroup/pids/parent/pids.max |
48 | # echo 2 > /sys/fs/cgroup/pids/parent/pids.max | 53 | # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs |
49 | # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs | 54 | # cat /sys/fs/cgroup/pids/parent/pids.current |
50 | # cat /sys/fs/cgroup/pids/parent/pids.current | 55 | 2 |
51 | 2 | 56 | # |
52 | # | ||
53 | 57 | ||
54 | It should be noted that attempts to overcome the set limit (2 in this case) will | 58 | It should be noted that attempts to overcome the set limit (2 in this case) will |
55 | fail: | 59 | fail:: |
56 | 60 | ||
57 | # cat /sys/fs/cgroup/pids/parent/pids.current | 61 | # cat /sys/fs/cgroup/pids/parent/pids.current |
58 | 2 | 62 | 2 |
59 | # ( /bin/echo "Here's some processes for you." | cat ) | 63 | # ( /bin/echo "Here's some processes for you." | cat ) |
60 | sh: fork: Resource temporarily unavailable | 64 | sh: fork: Resource temporarily unavailable |
61 | # | 65 | # |
62 | 66 | ||
63 | Even if we migrate to a child cgroup (which doesn't have a set limit), we will | 67 | Even if we migrate to a child cgroup (which doesn't have a set limit), we will |
64 | not be able to overcome the most stringent limit in the hierarchy (in this case, | 68 | not be able to overcome the most stringent limit in the hierarchy (in this case, |
65 | parent's): | 69 | parent's):: |
66 | 70 | ||
67 | # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs | 71 | # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs |
68 | # cat /sys/fs/cgroup/pids/parent/pids.current | 72 | # cat /sys/fs/cgroup/pids/parent/pids.current |
69 | 2 | 73 | 2 |
70 | # cat /sys/fs/cgroup/pids/parent/child/pids.current | 74 | # cat /sys/fs/cgroup/pids/parent/child/pids.current |
71 | 2 | 75 | 2 |
72 | # cat /sys/fs/cgroup/pids/parent/child/pids.max | 76 | # cat /sys/fs/cgroup/pids/parent/child/pids.max |
73 | max | 77 | max |
74 | # ( /bin/echo "Here's some processes for you." | cat ) | 78 | # ( /bin/echo "Here's some processes for you." | cat ) |
75 | sh: fork: Resource temporarily unavailable | 79 | sh: fork: Resource temporarily unavailable |
76 | # | 80 | # |
77 | 81 | ||
78 | We can set a limit that is smaller than pids.current, which will stop any new | 82 | We can set a limit that is smaller than pids.current, which will stop any new |
79 | processes from being forked at all (note that the shell itself counts towards | 83 | processes from being forked at all (note that the shell itself counts towards |
80 | pids.current): | 84 | pids.current):: |
81 | 85 | ||
82 | # echo 1 > /sys/fs/cgroup/pids/parent/pids.max | 86 | # echo 1 > /sys/fs/cgroup/pids/parent/pids.max |
83 | # /bin/echo "We can't even spawn a single process now." | 87 | # /bin/echo "We can't even spawn a single process now." |
84 | sh: fork: Resource temporarily unavailable | 88 | sh: fork: Resource temporarily unavailable |
85 | # echo 0 > /sys/fs/cgroup/pids/parent/pids.max | 89 | # echo 0 > /sys/fs/cgroup/pids/parent/pids.max |
86 | # /bin/echo "We can't even spawn a single process now." | 90 | # /bin/echo "We can't even spawn a single process now." |
87 | sh: fork: Resource temporarily unavailable | 91 | sh: fork: Resource temporarily unavailable |
88 | # | 92 | # |
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.rst index 9bdb7fd03f83..2fcb0a9bf790 100644 --- a/Documentation/cgroup-v1/rdma.txt +++ b/Documentation/cgroup-v1/rdma.rst | |||
@@ -1,16 +1,17 @@ | |||
1 | RDMA Controller | 1 | =============== |
2 | ---------------- | 2 | RDMA Controller |
3 | =============== | ||
3 | 4 | ||
4 | Contents | 5 | .. Contents |
5 | -------- | ||
6 | 6 | ||
7 | 1. Overview | 7 | 1. Overview |
8 | 1-1. What is RDMA controller? | 8 | 1-1. What is RDMA controller? |
9 | 1-2. Why RDMA controller needed? | 9 | 1-2. Why RDMA controller needed? |
10 | 1-3. How is RDMA controller implemented? | 10 | 1-3. How is RDMA controller implemented? |
11 | 2. Usage Examples | 11 | 2. Usage Examples |
12 | 12 | ||
13 | 1. Overview | 13 | 1. Overview |
14 | =========== | ||
14 | 15 | ||
15 | 1-1. What is RDMA controller? | 16 | 1-1. What is RDMA controller? |
16 | ----------------------------- | 17 | ----------------------------- |
@@ -83,27 +84,34 @@ what is configured by user for a given cgroup and what is supported by | |||
83 | IB device. | 84 | IB device. |
84 | 85 | ||
85 | The following resources can be accounted by the rdma controller. | 86 | The following resources can be accounted by the rdma controller. |
87 | |||
88 | ========== ============================= | ||
86 | hca_handle Maximum number of HCA Handles | 89 | hca_handle Maximum number of HCA Handles |
87 | hca_object Maximum number of HCA Objects | 90 | hca_object Maximum number of HCA Objects |
91 | ========== ============================= | ||
88 | 92 | ||
89 | 2. Usage Examples | 93 | 2. Usage Examples |
90 | ----------------- | 94 | ================= |
91 | 95 | ||
92 | (a) Configure resource limit: | 96 | (a) Configure resource limit:: |
93 | echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max | 97 | |
94 | echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max | 98 | echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max |
95 | 99 | echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max | |
96 | (b) Query resource limit: | 100 | |
97 | cat /sys/fs/cgroup/rdma/2/rdma.max | 101 | (b) Query resource limit:: |
98 | #Output: | 102 | |
99 | mlx4_0 hca_handle=2 hca_object=2000 | 103 | cat /sys/fs/cgroup/rdma/2/rdma.max |
100 | ocrdma1 hca_handle=3 hca_object=max | 104 | #Output: |
101 | 105 | mlx4_0 hca_handle=2 hca_object=2000 | |
102 | (c) Query current usage: | 106 | ocrdma1 hca_handle=3 hca_object=max |
103 | cat /sys/fs/cgroup/rdma/2/rdma.current | 107 | |
104 | #Output: | 108 | (c) Query current usage:: |
105 | mlx4_0 hca_handle=1 hca_object=20 | 109 | |
106 | ocrdma1 hca_handle=1 hca_object=23 | 110 | cat /sys/fs/cgroup/rdma/2/rdma.current |
107 | 111 | #Output: | |
108 | (d) Delete resource limit: | 112 | mlx4_0 hca_handle=1 hca_object=20 |
109 | echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max | 113 | ocrdma1 hca_handle=1 hca_object=23 |
114 | |||
115 | (d) Delete resource limit:: | ||
116 | |||
117 | echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max | ||
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index d06e9a59a9f4..cad797a8a39e 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt | |||
@@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for | |||
98 | use at file creation time. When a task allocates a file in the file | 98 | use at file creation time. When a task allocates a file in the file |
99 | system, the mount option memory policy will be applied with a NodeList, | 99 | system, the mount option memory policy will be applied with a NodeList, |
100 | if any, modified by the calling task's cpuset constraints | 100 | if any, modified by the calling task's cpuset constraints |
101 | [See Documentation/cgroup-v1/cpusets.txt] and any optional flags, listed | 101 | [See Documentation/cgroup-v1/cpusets.rst] and any optional flags, listed |
102 | below. If the resulting NodeLists is the empty set, the effective memory | 102 | below. If the resulting NodeLists is the empty set, the effective memory |
103 | policy for the file will revert to "default" policy. | 103 | policy for the file will revert to "default" policy. |
104 | 104 | ||
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index b14e03ff3528..a7514343b660 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt | |||
@@ -652,7 +652,7 @@ CONTENTS | |||
652 | 652 | ||
653 | -deadline tasks cannot have an affinity mask smaller than the entire | 653 | -deadline tasks cannot have an affinity mask smaller than the entire |
654 | root_domain they are created on. However, affinities can be specified | 654 | root_domain they are created on. However, affinities can be specified |
655 | through the cpuset facility (Documentation/cgroup-v1/cpusets.txt). | 655 | through the cpuset facility (Documentation/cgroup-v1/cpusets.rst). |
656 | 656 | ||
657 | 5.1 SCHED_DEADLINE and cpusets HOWTO | 657 | 5.1 SCHED_DEADLINE and cpusets HOWTO |
658 | ------------------------------------ | 658 | ------------------------------------ |
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index edd861c94c1b..d1328890ef28 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt | |||
@@ -215,7 +215,7 @@ SCHED_BATCH) tasks. | |||
215 | 215 | ||
216 | These options need CONFIG_CGROUPS to be defined, and let the administrator | 216 | These options need CONFIG_CGROUPS to be defined, and let the administrator |
217 | create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See | 217 | create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See |
218 | Documentation/cgroup-v1/cgroups.txt for more information about this filesystem. | 218 | Documentation/cgroup-v1/cgroups.rst for more information about this filesystem. |
219 | 219 | ||
220 | When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each | 220 | When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each |
221 | group created using the pseudo filesystem. See example steps below to create | 221 | group created using the pseudo filesystem. See example steps below to create |
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index d8fce3e78457..c09f7a3fee66 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt | |||
@@ -133,7 +133,7 @@ This uses the cgroup virtual file system and "<cgroup>/cpu.rt_runtime_us" | |||
133 | to control the CPU time reserved for each control group. | 133 | to control the CPU time reserved for each control group. |
134 | 134 | ||
135 | For more information on working with control groups, you should read | 135 | For more information on working with control groups, you should read |
136 | Documentation/cgroup-v1/cgroups.txt as well. | 136 | Documentation/cgroup-v1/cgroups.rst as well. |
137 | 137 | ||
138 | Group settings are checked against the following limits in order to keep the | 138 | Group settings are checked against the following limits in order to keep the |
139 | configuration schedulable: | 139 | configuration schedulable: |
diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst index 5cae13e9a08b..0d830edae8fe 100644 --- a/Documentation/vm/numa.rst +++ b/Documentation/vm/numa.rst | |||
@@ -67,7 +67,7 @@ nodes. Each emulated node will manage a fraction of the underlying cells' | |||
67 | physical memory. NUMA emulation is useful for testing NUMA kernel and | 67 | physical memory. NUMA emulation is useful for testing NUMA kernel and |
68 | application features on non-NUMA platforms, and as a sort of memory resource | 68 | application features on non-NUMA platforms, and as a sort of memory resource |
69 | management mechanism when used together with cpusets. | 69 | management mechanism when used together with cpusets. |
70 | [see Documentation/cgroup-v1/cpusets.txt] | 70 | [see Documentation/cgroup-v1/cpusets.rst] |
71 | 71 | ||
72 | For each node with memory, Linux constructs an independent memory management | 72 | For each node with memory, Linux constructs an independent memory management |
73 | subsystem, complete with its own free page lists, in-use page lists, usage | 73 | subsystem, complete with its own free page lists, in-use page lists, usage |
@@ -114,7 +114,7 @@ allocation behavior using Linux NUMA memory policy. [see | |||
114 | 114 | ||
115 | System administrators can restrict the CPUs and nodes' memories that a non- | 115 | System administrators can restrict the CPUs and nodes' memories that a non- |
116 | privileged user can specify in the scheduling or NUMA commands and functions | 116 | privileged user can specify in the scheduling or NUMA commands and functions |
117 | using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.txt] | 117 | using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.rst] |
118 | 118 | ||
119 | On architectures that do not hide memoryless nodes, Linux will include only | 119 | On architectures that do not hide memoryless nodes, Linux will include only |
120 | zones [nodes] with memory in the zonelists. This means that for a memoryless | 120 | zones [nodes] with memory in the zonelists. This means that for a memoryless |
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index f68d61335abb..35bba27d5fff 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst | |||
@@ -41,7 +41,7 @@ locations. | |||
41 | Larger installations usually partition the system using cpusets into | 41 | Larger installations usually partition the system using cpusets into |
42 | sections of nodes. Paul Jackson has equipped cpusets with the ability to | 42 | sections of nodes. Paul Jackson has equipped cpusets with the ability to |
43 | move pages when a task is moved to another cpuset (See | 43 | move pages when a task is moved to another cpuset (See |
44 | Documentation/cgroup-v1/cpusets.txt). | 44 | Documentation/cgroup-v1/cpusets.rst). |
45 | Cpusets allow the automation of process locality. If a task is moved to | 45 | Cpusets allow the automation of process locality. If a task is moved to |
46 | a new cpuset, all of its pages are moved with it so that the | 46 | a new cpuset, all of its pages are moved with it so that the |
47 | performance of the process does not degrade dramatically. Also the pages | 47 | performance of the process does not degrade dramatically. Also the pages |
diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst index b8e29f977f2d..c6d94118fbcc 100644 --- a/Documentation/vm/unevictable-lru.rst +++ b/Documentation/vm/unevictable-lru.rst | |||
@@ -98,7 +98,7 @@ Memory Control Group Interaction | |||
98 | -------------------------------- | 98 | -------------------------------- |
99 | 99 | ||
100 | The unevictable LRU facility interacts with the memory control group [aka | 100 | The unevictable LRU facility interacts with the memory control group [aka |
101 | memory controller; see Documentation/cgroup-v1/memory.txt] by extending the | 101 | memory controller; see Documentation/cgroup-v1/memory.rst] by extending the |
102 | lru_list enum. | 102 | lru_list enum. |
103 | 103 | ||
104 | The memory controller data structure automatically gets a per-zone unevictable | 104 | The memory controller data structure automatically gets a per-zone unevictable |
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst index 74fbb78b3c67..a6926cd40f70 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst | |||
@@ -15,7 +15,7 @@ assign them to cpusets and their attached tasks. This is a way of limiting the | |||
15 | amount of system memory that is available to a certain class of tasks. | 15 | amount of system memory that is available to a certain class of tasks. |
16 | 16 | ||
17 | For more information on the features of cpusets, see | 17 | For more information on the features of cpusets, see |
18 | Documentation/cgroup-v1/cpusets.txt. | 18 | Documentation/cgroup-v1/cpusets.rst. |
19 | There are a number of different configurations you can use for your needs. For | 19 | There are a number of different configurations you can use for your needs. For |
20 | more information on the numa=fake command line option and its various ways of | 20 | more information on the numa=fake command line option and its various ways of |
21 | configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. | 21 | configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. |
@@ -40,7 +40,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg:: | |||
40 | On node 3 totalpages: 131072 | 40 | On node 3 totalpages: 131072 |
41 | 41 | ||
42 | Now following the instructions for mounting the cpusets filesystem from | 42 | Now following the instructions for mounting the cpusets filesystem from |
43 | Documentation/cgroup-v1/cpusets.txt, you can assign fake nodes (i.e. contiguous memory | 43 | Documentation/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory |
44 | address spaces) to individual cpusets:: | 44 | address spaces) to individual cpusets:: |
45 | 45 | ||
46 | [root@xroads /]# mkdir exampleset | 46 | [root@xroads /]# mkdir exampleset |
diff --git a/MAINTAINERS b/MAINTAINERS index 4a9e8e5b2432..558acf24ea1e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -4122,7 +4122,7 @@ W: http://www.bullopensource.org/cpuset/ | |||
4122 | W: http://oss.sgi.com/projects/cpusets/ | 4122 | W: http://oss.sgi.com/projects/cpusets/ |
4123 | T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git | 4123 | T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git |
4124 | S: Maintained | 4124 | S: Maintained |
4125 | F: Documentation/cgroup-v1/cpusets.txt | 4125 | F: Documentation/cgroup-v1/cpusets.rst |
4126 | F: include/linux/cpuset.h | 4126 | F: include/linux/cpuset.h |
4127 | F: kernel/cgroup/cpuset.c | 4127 | F: kernel/cgroup/cpuset.c |
4128 | 4128 | ||
diff --git a/block/Kconfig b/block/Kconfig index 2466dcc3ef1d..56cb1695cd87 100644 --- a/block/Kconfig +++ b/block/Kconfig | |||
@@ -89,7 +89,7 @@ config BLK_DEV_THROTTLING | |||
89 | one needs to mount and use blkio cgroup controller for creating | 89 | one needs to mount and use blkio cgroup controller for creating |
90 | cgroups and specifying per device IO rate policies. | 90 | cgroups and specifying per device IO rate policies. |
91 | 91 | ||
92 | See Documentation/cgroup-v1/blkio-controller.txt for more information. | 92 | See Documentation/cgroup-v1/blkio-controller.rst for more information. |
93 | 93 | ||
94 | config BLK_DEV_THROTTLING_LOW | 94 | config BLK_DEV_THROTTLING_LOW |
95 | bool "Block throttling .low limit interface support (EXPERIMENTAL)" | 95 | bool "Block throttling .low limit interface support (EXPERIMENTAL)" |
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index b4e766e93f6e..c5311935239d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h | |||
@@ -624,7 +624,7 @@ struct cftype { | |||
624 | 624 | ||
625 | /* | 625 | /* |
626 | * Control Group subsystem type. | 626 | * Control Group subsystem type. |
627 | * See Documentation/cgroup-v1/cgroups.txt for details | 627 | * See Documentation/cgroup-v1/cgroups.rst for details |
628 | */ | 628 | */ |
629 | struct cgroup_subsys { | 629 | struct cgroup_subsys { |
630 | struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); | 630 | struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0297f930a56e..3745ecdad925 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -131,6 +131,8 @@ void cgroup_free(struct task_struct *p); | |||
131 | int cgroup_init_early(void); | 131 | int cgroup_init_early(void); |
132 | int cgroup_init(void); | 132 | int cgroup_init(void); |
133 | 133 | ||
134 | int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); | ||
135 | |||
134 | /* | 136 | /* |
135 | * Iteration helpers and macros. | 137 | * Iteration helpers and macros. |
136 | */ | 138 | */ |
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a8b823c30b43..489e118b69d2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h | |||
@@ -785,7 +785,7 @@ union bpf_attr { | |||
785 | * based on a user-provided identifier for all traffic coming from | 785 | * based on a user-provided identifier for all traffic coming from |
786 | * the tasks belonging to the related cgroup. See also the related | 786 | * the tasks belonging to the related cgroup. See also the related |
787 | * kernel documentation, available from the Linux sources in file | 787 | * kernel documentation, available from the Linux sources in file |
788 | * *Documentation/cgroup-v1/net_cls.txt*. | 788 | * *Documentation/cgroup-v1/net_cls.rst*. |
789 | * | 789 | * |
790 | * The Linux kernel has two versions for cgroups: there are | 790 | * The Linux kernel has two versions for cgroups: there are |
791 | * cgroups v1 and cgroups v2. Both are available to users, who can | 791 | * cgroups v1 and cgroups v2. Both are available to users, who can |
diff --git a/init/Kconfig b/init/Kconfig index c88289c18d59..bf96faf3fe43 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -850,7 +850,7 @@ config BLK_CGROUP | |||
850 | CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set | 850 | CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set |
851 | CONFIG_BLK_DEV_THROTTLING=y. | 851 | CONFIG_BLK_DEV_THROTTLING=y. |
852 | 852 | ||
853 | See Documentation/cgroup-v1/blkio-controller.txt for more information. | 853 | See Documentation/cgroup-v1/blkio-controller.rst for more information. |
854 | 854 | ||
855 | config DEBUG_BLK_CGROUP | 855 | config DEBUG_BLK_CGROUP |
856 | bool "IO controller debugging" | 856 | bool "IO controller debugging" |
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cdbeff87fa99..aaba2a41562a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
@@ -6240,6 +6240,48 @@ struct cgroup *cgroup_get_from_fd(int fd) | |||
6240 | } | 6240 | } |
6241 | EXPORT_SYMBOL_GPL(cgroup_get_from_fd); | 6241 | EXPORT_SYMBOL_GPL(cgroup_get_from_fd); |
6242 | 6242 | ||
6243 | static u64 power_of_ten(int power) | ||
6244 | { | ||
6245 | u64 v = 1; | ||
6246 | while (power--) | ||
6247 | v *= 10; | ||
6248 | return v; | ||
6249 | } | ||
6250 | |||
6251 | /** | ||
6252 | * cgroup_parse_float - parse a floating number | ||
6253 | * @input: input string | ||
6254 | * @dec_shift: number of decimal digits to shift | ||
6255 | * @v: output | ||
6256 | * | ||
6257 | * Parse a decimal floating point number in @input and store the result in | ||
6258 | * @v with decimal point right shifted @dec_shift times. For example, if | ||
6259 | * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. | ||
6260 | * Returns 0 on success, -errno otherwise. | ||
6261 | * | ||
6262 | * There's nothing cgroup specific about this function except that it's | ||
6263 | * currently the only user. | ||
6264 | */ | ||
6265 | int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) | ||
6266 | { | ||
6267 | s64 whole, frac = 0; | ||
6268 | int fstart = 0, fend = 0, flen; | ||
6269 | |||
6270 | if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) | ||
6271 | return -EINVAL; | ||
6272 | if (frac < 0) | ||
6273 | return -EINVAL; | ||
6274 | |||
6275 | flen = fend > fstart ? fend - fstart : 0; | ||
6276 | if (flen < dec_shift) | ||
6277 | frac *= power_of_ten(dec_shift - flen); | ||
6278 | else | ||
6279 | frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); | ||
6280 | |||
6281 | *v = whole * power_of_ten(dec_shift) + frac; | ||
6282 | return 0; | ||
6283 | } | ||
6284 | |||
6243 | /* | 6285 | /* |
6244 | * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data | 6286 | * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data |
6245 | * definition in cgroup-defs.h. | 6287 | * definition in cgroup-defs.h. |
@@ -6402,4 +6444,5 @@ static int __init cgroup_sysfs_init(void) | |||
6402 | return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); | 6444 | return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); |
6403 | } | 6445 | } |
6404 | subsys_initcall(cgroup_sysfs_init); | 6446 | subsys_initcall(cgroup_sysfs_init); |
6447 | |||
6405 | #endif /* CONFIG_SYSFS */ | 6448 | #endif /* CONFIG_SYSFS */ |
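As a minimal sketch (not part of the patch) of how a controller might consume the new cgroup_parse_float() helper, the hypothetical cftype write handler below parses a percentage given with two fractional digits. The "demo.percent" file name, the handler, and demo_percent_hundredths are illustrative assumptions; only cgroup_parse_float() itself comes from this series::

	static s64 demo_percent_hundredths;	/* e.g. 1340 represents 13.40% */

	/* Hypothetical write handler for an imaginary "demo.percent" file. */
	static ssize_t demo_percent_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
	{
		s64 v;
		int ret;

		/* With dec_shift == 2: "13.40" -> 1340, "12.3456" -> 1235. */
		ret = cgroup_parse_float(strim(buf), 2, &v);
		if (ret)
			return ret;

		/* Reject values outside [0.00, 100.00] percent. */
		if (v < 0 || v > 100 * 100)
			return -ERANGE;

		WRITE_ONCE(demo_percent_hundredths, v);
		return nbytes;
	}

Note that a fractional part longer than dec_shift is rounded to the nearest value via DIV_ROUND_CLOSEST_ULL rather than truncated, and a negative fractional part is rejected with -EINVAL.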
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a1590e244f5f..b3b02b9c4405 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c | |||
@@ -729,7 +729,7 @@ static inline int nr_cpusets(void) | |||
729 | * load balancing domains (sched domains) as specified by that partial | 729 | * load balancing domains (sched domains) as specified by that partial |
730 | * partition. | 730 | * partition. |
731 | * | 731 | * |
732 | * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt | 732 | * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst |
733 | * for a background explanation of this. | 733 | * for a background explanation of this. |
734 | * | 734 | * |
735 | * Does not return errors, on the theory that the callers of this | 735 | * Does not return errors, on the theory that the callers of this |
diff --git a/security/device_cgroup.c b/security/device_cgroup.c index dc28914fa72e..c07196502577 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c | |||
@@ -509,7 +509,7 @@ static inline int may_allow_all(struct dev_cgroup *parent) | |||
509 | * This is one of the three key functions for hierarchy implementation. | 509 | * This is one of the three key functions for hierarchy implementation. |
510 | * This function is responsible for re-evaluating all the cgroup's active | 510 | * This function is responsible for re-evaluating all the cgroup's active |
511 | * exceptions due to a parent's exception change. | 511 | * exceptions due to a parent's exception change. |
512 | * Refer to Documentation/cgroup-v1/devices.txt for more details. | 512 | * Refer to Documentation/cgroup-v1/devices.rst for more details. |
513 | */ | 513 | */ |
514 | static void revalidate_active_exceptions(struct dev_cgroup *devcg) | 514 | static void revalidate_active_exceptions(struct dev_cgroup *devcg) |
515 | { | 515 | { |
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a8b823c30b43..489e118b69d2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h | |||
@@ -785,7 +785,7 @@ union bpf_attr { | |||
785 | * based on a user-provided identifier for all traffic coming from | 785 | * based on a user-provided identifier for all traffic coming from |
786 | * the tasks belonging to the related cgroup. See also the related | 786 | * the tasks belonging to the related cgroup. See also the related |
787 | * kernel documentation, available from the Linux sources in file | 787 | * kernel documentation, available from the Linux sources in file |
788 | * *Documentation/cgroup-v1/net_cls.txt*. | 788 | * *Documentation/cgroup-v1/net_cls.rst*. |
789 | * | 789 | * |
790 | * The Linux kernel has two versions for cgroups: there are | 790 | * The Linux kernel has two versions for cgroups: there are |
791 | * cgroups v1 and cgroups v2. Both are available to users, who can | 791 | * cgroups v1 and cgroups v2. Both are available to users, who can |