author    Linus Torvalds <torvalds@linux-foundation.org>    2019-07-09 00:35:12 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2019-07-09 00:35:12 -0400
commit    92c1d6522135050cb377a18cc6e30d08dfb87efb (patch)
tree      5d2fa3051c975f1c459b6949f9e71cac2edf74de
parent    df2a40f549e6b73aad98b0c03f400c00d284816b (diff)
parent    99c8b231ae6c6ca4ca2fd1c0b3701071f589661f (diff)
Merge branch 'for-5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:
 "Documentation updates and the addition of cgroup_parse_float() which
  will be used by new controllers including blk-iocost"

* 'for-5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  docs: cgroup-v1: convert docs to ReST and rename to *.rst
  cgroup: Move cgroup_parse_float() implementation out of CONFIG_SYSFS
  cgroup: add cgroup_parse_float()
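The only functional change pulled in here is the new parsing helper. As a rough sketch of its intended use, assuming the signature this series adds in kernel/cgroup/cgroup.c, int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v), a controller's write handler could accept a "13.40"-style percentage as follows. The handler example_pct_write() and its bounds are hypothetical stand-ins for a future user such as blk-iocost:

#include <linux/cgroup.h>   /* cgroup_parse_float(), assumed from this merge */
#include <linux/string.h>   /* strstrip() */

/* Hypothetical cgroup2 interface-file handler: accepts a percentage
 * decimal such as "13.40" and keeps it scaled to hundredths (1340). */
static ssize_t example_pct_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
{
        s64 pct;
        int ret;

        /* dec_shift == 2 shifts the decimal point right twice: 13.40 -> 1340 */
        ret = cgroup_parse_float(strstrip(buf), 2, &pct);
        if (ret)
                return ret;
        if (pct < 0 || pct > 10000)     /* permit 0.00 .. 100.00 */
                return -ERANGE;
        /* ... store pct in the controller's css state ... */
        return nbytes;
}

The dec_shift argument lines up with the cgroup-v2 convention added below in cgroup-v2.rst: parts-per quantities are expressed as percentage decimals with at least a two-digit fractional part.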
-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst | 6
-rw-r--r--  Documentation/admin-guide/hw-vuln/l1tf.rst | 2
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 4
-rw-r--r--  Documentation/admin-guide/mm/numa_memory_policy.rst | 2
-rw-r--r--  Documentation/block/bfq-iosched.txt | 2
-rw-r--r--  Documentation/cgroup-v1/blkio-controller.rst (renamed from Documentation/cgroup-v1/blkio-controller.txt) | 79
-rw-r--r--  Documentation/cgroup-v1/cgroups.rst (renamed from Documentation/cgroup-v1/cgroups.txt) | 186
-rw-r--r--  Documentation/cgroup-v1/cpuacct.rst (renamed from Documentation/cgroup-v1/cpuacct.txt) | 15
-rw-r--r--  Documentation/cgroup-v1/cpusets.rst (renamed from Documentation/cgroup-v1/cpusets.txt) | 209
-rw-r--r--  Documentation/cgroup-v1/devices.rst (renamed from Documentation/cgroup-v1/devices.txt) | 40
-rw-r--r--  Documentation/cgroup-v1/freezer-subsystem.rst (renamed from Documentation/cgroup-v1/freezer-subsystem.txt) | 14
-rw-r--r--  Documentation/cgroup-v1/hugetlb.rst (renamed from Documentation/cgroup-v1/hugetlb.txt) | 41
-rw-r--r--  Documentation/cgroup-v1/index.rst | 30
-rw-r--r--  Documentation/cgroup-v1/memcg_test.rst (renamed from Documentation/cgroup-v1/memcg_test.txt) | 265
-rw-r--r--  Documentation/cgroup-v1/memory.rst (renamed from Documentation/cgroup-v1/memory.txt) | 463
-rw-r--r--  Documentation/cgroup-v1/net_cls.rst (renamed from Documentation/cgroup-v1/net_cls.txt) | 37
-rw-r--r--  Documentation/cgroup-v1/net_prio.rst (renamed from Documentation/cgroup-v1/net_prio.txt) | 24
-rw-r--r--  Documentation/cgroup-v1/pids.rst (renamed from Documentation/cgroup-v1/pids.txt) | 82
-rw-r--r--  Documentation/cgroup-v1/rdma.rst (renamed from Documentation/cgroup-v1/rdma.txt) | 66
-rw-r--r--  Documentation/filesystems/tmpfs.txt | 2
-rw-r--r--  Documentation/scheduler/sched-deadline.txt | 2
-rw-r--r--  Documentation/scheduler/sched-design-CFS.txt | 2
-rw-r--r--  Documentation/scheduler/sched-rt-group.txt | 2
-rw-r--r--  Documentation/vm/numa.rst | 4
-rw-r--r--  Documentation/vm/page_migration.rst | 2
-rw-r--r--  Documentation/vm/unevictable-lru.rst | 2
-rw-r--r--  Documentation/x86/x86_64/fake-numa-for-cpusets.rst | 4
-rw-r--r--  MAINTAINERS | 2
-rw-r--r--  block/Kconfig | 2
-rw-r--r--  include/linux/cgroup-defs.h | 2
-rw-r--r--  include/linux/cgroup.h | 2
-rw-r--r--  include/uapi/linux/bpf.h | 2
-rw-r--r--  init/Kconfig | 2
-rw-r--r--  kernel/cgroup/cgroup.c | 43
-rw-r--r--  kernel/cgroup/cpuset.c | 2
-rw-r--r--  security/device_cgroup.c | 2
-rw-r--r--  tools/include/uapi/linux/bpf.h | 2
37 files changed, 1005 insertions(+), 643 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index cf88c1f98270..a5c845338d6d 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -705,6 +705,12 @@ Conventions
   informational files on the root cgroup which end up showing global
   information available elsewhere shouldn't exist.
 
+- The default time unit is microseconds. If a different unit is ever
+  used, an explicit unit suffix must be present.
+
+- A parts-per quantity should use a percentage decimal with at least
+  two digit fractional part - e.g. 13.40.
+
 - If a controller implements weight based resource distribution, its
   interface file should be named "weight" and have the range [1,
   10000] with 100 as the default. The values are chosen to allow
diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst
index 31653a9f0e1b..656aee262e23 100644
--- a/Documentation/admin-guide/hw-vuln/l1tf.rst
+++ b/Documentation/admin-guide/hw-vuln/l1tf.rst
@@ -241,7 +241,7 @@ Guest mitigation mechanisms
 For further information about confining guests to a single or to a group
 of cores consult the cpusets documentation:
 
-https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
+https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.rst
 
 .. _interrupt_isolation:
 
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index e6e806285703..74d28efa1c40 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4084,7 +4084,7 @@
 
     relax_domain_level=
             [KNL, SMP] Set scheduler's default relax_domain_level.
-            See Documentation/cgroup-v1/cpusets.txt.
+            See Documentation/cgroup-v1/cpusets.rst.
 
     reserve=    [KNL,BUGS] Force kernel to ignore I/O ports or memory
             Format: <base1>,<size1>[,<base2>,<size2>,...]
@@ -4594,7 +4594,7 @@
     swapaccount=[0|1]
             [KNL] Enable accounting of swap in memory resource
             controller if no parameter or 1 is given or disable
-            it if 0 is given (See Documentation/cgroup-v1/memory.txt)
+            it if 0 is given (See Documentation/cgroup-v1/memory.rst)
 
     swiotlb=    [ARM,IA-64,PPC,MIPS,X86]
             Format: { <int> | force | noforce }
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index d78c5b315f72..546f174e5d6a 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy
 support.
 
 Memory policies should not be confused with cpusets
-(``Documentation/cgroup-v1/cpusets.txt``)
+(``Documentation/cgroup-v1/cpusets.rst``)
 which is an administrative mechanism for restricting the nodes from which
 memory may be allocated by a set of processes. Memory policies are a
 programming interface that a NUMA-aware application can take advantage of. When
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index 1a0f2ac02eb6..b2265cf6c9c3 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -539,7 +539,7 @@ As for cgroups-v1 (blkio controller), the exact set of stat files
 created, and kept up-to-date by bfq, depends on whether
 CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all
 the stat files documented in
-Documentation/cgroup-v1/blkio-controller.txt. If, instead,
+Documentation/cgroup-v1/blkio-controller.rst. If, instead,
 CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files
 blkio.bfq.io_service_bytes
 blkio.bfq.io_service_bytes_recursive
diff --git a/Documentation/cgroup-v1/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.rst
index d1a1b7bdd03a..fd3184537d23 100644
--- a/Documentation/cgroup-v1/blkio-controller.txt
+++ b/Documentation/cgroup-v1/blkio-controller.rst
@@ -1,5 +1,7 @@
-Block IO Controller
-===================
+===================
+Block IO Controller
+===================
+
 Overview
 ========
 cgroup subsys "blkio" implements the block io controller. There seems to be
@@ -17,24 +19,27 @@ HOWTO
 =====
 Throttling/Upper Limit policy
 -----------------------------
-- Enable Block IO controller
+- Enable Block IO controller::
+
     CONFIG_BLK_CGROUP=y
 
-- Enable throttling in block layer
+- Enable throttling in block layer::
+
     CONFIG_BLK_DEV_THROTTLING=y
 
-- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)
+- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
+
     mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
 
 - Specify a bandwidth rate on particular device for root group. The format
-  for policy is "<major>:<minor>  <bytes_per_second>".
+  for policy is "<major>:<minor>  <bytes_per_second>"::
 
     echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
 
   Above will put a limit of 1MB/second on reads happening for root group
   on device having major/minor number 8:16.
 
-- Run dd to read a file and see if rate is throttled to 1MB/s or not.
+- Run dd to read a file and see if rate is throttled to 1MB/s or not::
 
     # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
     1024+0 records in
@@ -51,7 +56,7 @@ throttling's hierarchy support is enabled iff "sane_behavior" is
 enabled from cgroup side, which currently is a development option and
 not publicly available.
 
-If somebody created a hierarchy like as follows.
+If somebody created a hierarchy like as follows::
 
             root
             /  \
@@ -66,7 +71,7 @@ directly generated by tasks in that cgroup.
 
 Throttling without "sane_behavior" enabled from cgroup side will
 practically treat all groups at same level as if it looks like the
-following.
+following::
 
             pivot
           /  /  \  \
@@ -99,27 +104,31 @@ Proportional weight policy files
   These rules override the default value of group weight as specified
   by blkio.weight.
 
-  Following is the format.
+  Following is the format::
+
+    # echo dev_maj:dev_minor weight > blkio.weight_device
+
+  Configure weight=300 on /dev/sdb (8:16) in this cgroup::
+
+    # echo 8:16 300 > blkio.weight_device
+    # cat blkio.weight_device
+    dev     weight
+    8:16    300
+
+  Configure weight=500 on /dev/sda (8:0) in this cgroup::
 
-  # echo dev_maj:dev_minor weight > blkio.weight_device
-  Configure weight=300 on /dev/sdb (8:16) in this cgroup
-  # echo 8:16 300 > blkio.weight_device
-  # cat blkio.weight_device
-  dev     weight
-  8:16    300
+    # echo 8:0 500 > blkio.weight_device
+    # cat blkio.weight_device
+    dev     weight
+    8:0     500
+    8:16    300
 
-  Configure weight=500 on /dev/sda (8:0) in this cgroup
-  # echo 8:0 500 > blkio.weight_device
-  # cat blkio.weight_device
-  dev     weight
-  8:0     500
-  8:16    300
+  Remove specific weight for /dev/sda in this cgroup::
 
-  Remove specific weight for /dev/sda in this cgroup
-  # echo 8:0 0 > blkio.weight_device
-  # cat blkio.weight_device
-  dev     weight
-  8:16    300
+    # echo 8:0 0 > blkio.weight_device
+    # cat blkio.weight_device
+    dev     weight
+    8:16    300
 
 - blkio.leaf_weight[_device]
   - Equivalents of blkio.weight[_device] for the purpose of
@@ -244,30 +253,30 @@ Throttling/Upper limit policy files
 - blkio.throttle.read_bps_device
   - Specifies upper limit on READ rate from the device. IO rate is
     specified in bytes per second. Rules are per device. Following is
-    the format.
+    the format::
 
     echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
 
 - blkio.throttle.write_bps_device
   - Specifies upper limit on WRITE rate to the device. IO rate is
     specified in bytes per second. Rules are per device. Following is
-    the format.
+    the format::
 
     echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
 
 - blkio.throttle.read_iops_device
   - Specifies upper limit on READ rate from the device. IO rate is
     specified in IO per second. Rules are per device. Following is
-    the format.
+    the format::
 
     echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
 
 - blkio.throttle.write_iops_device
   - Specifies upper limit on WRITE rate to the device. IO rate is
     specified in io per second. Rules are per device. Following is
-    the format.
+    the format::
 
     echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
 
 Note: If both BW and IOPS rules are specified for a device, then IO is
       subjected to both the constraints.
diff --git a/Documentation/cgroup-v1/cgroups.txt b/Documentation/cgroup-v1/cgroups.rst
index 059f7063eea6..46bbe7e022d4 100644
--- a/Documentation/cgroup-v1/cgroups.txt
+++ b/Documentation/cgroup-v1/cgroups.rst
@@ -1,35 +1,39 @@
-CGROUPS
--------
+==============
+Control Groups
+==============
 
 Written by Paul Menage <menage@google.com> based on
-Documentation/cgroup-v1/cpusets.txt
+Documentation/cgroup-v1/cpusets.rst
 
 Original copyright statements from cpusets.txt:
+
 Portions Copyright (C) 2004 BULL SA.
+
 Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+
 Modified by Paul Jackson <pj@sgi.com>
+
 Modified by Christoph Lameter <cl@linux.com>
 
-CONTENTS:
-=========
-
-1. Control Groups
-  1.1 What are cgroups ?
-  1.2 Why are cgroups needed ?
-  1.3 How are cgroups implemented ?
-  1.4 What does notify_on_release do ?
-  1.5 What does clone_children do ?
-  1.6 How do I use cgroups ?
-2. Usage Examples and Syntax
-  2.1 Basic Usage
-  2.2 Attaching processes
-  2.3 Mounting hierarchies by name
-3. Kernel API
-  3.1 Overview
-  3.2 Synchronization
-  3.3 Subsystem API
-4. Extended attributes usage
-5. Questions
+.. CONTENTS:
+
+   1. Control Groups
+     1.1 What are cgroups ?
+     1.2 Why are cgroups needed ?
+     1.3 How are cgroups implemented ?
+     1.4 What does notify_on_release do ?
+     1.5 What does clone_children do ?
+     1.6 How do I use cgroups ?
+   2. Usage Examples and Syntax
+     2.1 Basic Usage
+     2.2 Attaching processes
+     2.3 Mounting hierarchies by name
+   3. Kernel API
+     3.1 Overview
+     3.2 Synchronization
+     3.3 Subsystem API
+   4. Extended attributes usage
+   5. Questions
 
 1. Control Groups
 =================
@@ -72,7 +76,7 @@ On their own, the only use for cgroups is for simple job
 tracking. The intention is that other subsystems hook into the generic
 cgroup support to provide new attributes for cgroups, such as
 accounting/limiting the resources which processes in a cgroup can
-access. For example, cpusets (see Documentation/cgroup-v1/cpusets.txt) allow
+access. For example, cpusets (see Documentation/cgroup-v1/cpusets.rst) allow
 you to associate a set of CPUs and a set of memory nodes with the
 tasks in each cgroup.
 
@@ -108,7 +112,7 @@ As an example of a scenario (originally proposed by vatsa@in.ibm.com)
 that can benefit from multiple hierarchies, consider a large
 university server with various users - students, professors, system
 tasks etc. The resource planning for this server could be along the
-following lines:
+following lines::
 
        CPU :          "Top cpuset"
                        /       \
@@ -136,7 +140,7 @@ depending on who launched it (prof/student).
 With the ability to classify tasks differently for different resources
 (by putting those resource subsystems in different hierarchies),
 the admin can easily set up a script which receives exec notifications
-and depending on who is launching the browser he can
+and depending on who is launching the browser he can::
 
     # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
 
@@ -151,7 +155,7 @@ wants to do online gaming :)) OR give one of the student's simulation
 apps enhanced CPU power.
 
 With ability to write PIDs directly to resource classes, it's just a
-matter of:
+matter of::
 
     # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
     (after some time)
@@ -306,7 +310,7 @@ configuration from the parent during initialization.
 --------------------------
 
 To start a new job that is to be contained within a cgroup, using
-the "cpuset" cgroup subsystem, the steps are something like:
+the "cpuset" cgroup subsystem, the steps are something like::
 
  1) mount -t tmpfs cgroup_root /sys/fs/cgroup
  2) mkdir /sys/fs/cgroup/cpuset
@@ -320,7 +324,7 @@ the "cpuset" cgroup subsystem, the steps are something like:
 
 For example, the following sequence of commands will setup a cgroup
 named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cgroup:
+and then start a subshell 'sh' in that cgroup::
 
   mount -t tmpfs cgroup_root /sys/fs/cgroup
   mkdir /sys/fs/cgroup/cpuset
@@ -345,8 +349,9 @@ and then start a subshell 'sh' in that cgroup:
 Creating, modifying, using cgroups can be done through the cgroup
 virtual filesystem.
 
-To mount a cgroup hierarchy with all available subsystems, type:
-# mount -t cgroup xxx /sys/fs/cgroup
+To mount a cgroup hierarchy with all available subsystems, type::
+
+  # mount -t cgroup xxx /sys/fs/cgroup
 
 The "xxx" is not interpreted by the cgroup code, but will appear in
 /proc/mounts so may be any useful identifying string that you like.
@@ -355,18 +360,19 @@ Note: Some subsystems do not work without some user input first. For instance,
 if cpusets are enabled the user will have to populate the cpus and mems files
 for each new cgroup created before that group can be used.
 
-As explained in section `1.2 Why are cgroups needed?' you should create
+As explained in section `1.2 Why are cgroups needed?` you should create
 different hierarchies of cgroups for each single resource or group of
 resources you want to control. Therefore, you should mount a tmpfs on
 /sys/fs/cgroup and create directories for each cgroup resource or resource
-group.
+group::
 
-# mount -t tmpfs cgroup_root /sys/fs/cgroup
-# mkdir /sys/fs/cgroup/rg1
+  # mount -t tmpfs cgroup_root /sys/fs/cgroup
+  # mkdir /sys/fs/cgroup/rg1
 
 To mount a cgroup hierarchy with just the cpuset and memory
-subsystems, type:
-# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
+subsystems, type::
+
+  # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
 
 While remounting cgroups is currently supported, it is not recommend
 to use it. Remounting allows changing bound subsystems and
@@ -375,9 +381,10 @@ hierarchy is empty and release_agent itself should be replaced with
 conventional fsnotify. The support for remounting will be removed in
 the future.
 
-To Specify a hierarchy's release_agent:
-# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
-  xxx /sys/fs/cgroup/rg1
+To Specify a hierarchy's release_agent::
+
+  # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
+    xxx /sys/fs/cgroup/rg1
 
 Note that specifying 'release_agent' more than once will return failure.
 
@@ -390,32 +397,39 @@ Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
 tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
 is the cgroup that holds the whole system.
 
-If you want to change the value of release_agent:
-# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
+If you want to change the value of release_agent::
+
+  # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
 
 It can also be changed via remount.
 
-If you want to create a new cgroup under /sys/fs/cgroup/rg1:
-# cd /sys/fs/cgroup/rg1
-# mkdir my_cgroup
+If you want to create a new cgroup under /sys/fs/cgroup/rg1::
+
+  # cd /sys/fs/cgroup/rg1
+  # mkdir my_cgroup
+
+Now you want to do something with this cgroup:
+
+  # cd my_cgroup
 
-Now you want to do something with this cgroup.
-# cd my_cgroup
+In this directory you can find several files::
 
-In this directory you can find several files:
-# ls
-cgroup.procs notify_on_release tasks
-(plus whatever files added by the attached subsystems)
+  # ls
+  cgroup.procs notify_on_release tasks
+  (plus whatever files added by the attached subsystems)
 
-Now attach your shell to this cgroup:
-# /bin/echo $$ > tasks
+Now attach your shell to this cgroup::
+
+  # /bin/echo $$ > tasks
 
 You can also create cgroups inside your cgroup by using mkdir in this
-directory.
-# mkdir my_sub_cs
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir::
 
-To remove a cgroup, just use rmdir:
-# rmdir my_sub_cs
+  # rmdir my_sub_cs
 
 This will fail if the cgroup is in use (has cgroups inside, or
 has processes attached, or is held alive by other subsystem-specific
@@ -424,19 +438,21 @@ reference).
 2.2 Attaching processes
 -----------------------
 
-# /bin/echo PID > tasks
+::
+
+  # /bin/echo PID > tasks
 
 Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another:
+If you have several tasks to attach, you have to do it one after another::
 
-# /bin/echo PID1 > tasks
-# /bin/echo PID2 > tasks
+  # /bin/echo PID1 > tasks
+  # /bin/echo PID2 > tasks
     ...
-# /bin/echo PIDn > tasks
+  # /bin/echo PIDn > tasks
 
-You can attach the current shell task by echoing 0:
+You can attach the current shell task by echoing 0::
 
-# echo 0 > tasks
+  # echo 0 > tasks
 
 You can use the cgroup.procs file instead of the tasks file to move all
 threads in a threadgroup at once. Echoing the PID of any task in a
@@ -529,7 +545,7 @@ Each subsystem may export the following methods. The only mandatory
 methods are css_alloc/free. Any others that are null are presumed to
 be successful no-ops.
 
-struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
+``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)``
 (cgroup_mutex held by caller)
 
 Called to allocate a subsystem state object for a cgroup. The
@@ -544,7 +560,7 @@ identified by the passed cgroup object having a NULL parent (since
 it's the root of the hierarchy) and may be an appropriate place for
 initialization code.
 
-int css_online(struct cgroup *cgrp)
+``int css_online(struct cgroup *cgrp)``
 (cgroup_mutex held by caller)
 
 Called after @cgrp successfully completed all allocations and made
@@ -554,7 +570,7 @@ callback can be used to implement reliable state sharing and
 propagation along the hierarchy. See the comment on
 cgroup_for_each_descendant_pre() for details.
 
-void css_offline(struct cgroup *cgrp);
+``void css_offline(struct cgroup *cgrp);``
 (cgroup_mutex held by caller)
 
 This is the counterpart of css_online() and called iff css_online()
@@ -564,7 +580,7 @@ all references it's holding on @cgrp. When all references are dropped,
 cgroup removal will proceed to the next step - css_free(). After this
 callback, @cgrp should be considered dead to the subsystem.
 
-void css_free(struct cgroup *cgrp)
+``void css_free(struct cgroup *cgrp)``
 (cgroup_mutex held by caller)
 
 The cgroup system is about to free @cgrp; the subsystem should free
@@ -573,7 +589,7 @@ is completely unused; @cgrp->parent is still valid. (Note - can also
 be called for a newly-created cgroup if an error occurs after this
 subsystem's create() method has been called for the new cgroup).
 
-int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
 (cgroup_mutex held by caller)
 
 Called prior to moving one or more tasks into a cgroup; if the
@@ -594,7 +610,7 @@ fork. If this method returns 0 (success) then this should remain valid
 while the caller holds cgroup_mutex and it is ensured that either
 attach() or cancel_attach() will be called in future.
 
-void css_reset(struct cgroup_subsys_state *css)
+``void css_reset(struct cgroup_subsys_state *css)``
 (cgroup_mutex held by caller)
 
 An optional operation which should restore @css's configuration to the
@@ -608,7 +624,7 @@ This prevents unexpected resource control from a hidden css and
 ensures that the configuration is in the initial state when it is made
 visible again later.
 
-void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
 (cgroup_mutex held by caller)
 
 Called when a task attach operation has failed after can_attach() has succeeded.
@@ -617,26 +633,26 @@ function, so that the subsystem can implement a rollback. If not, not necessary.
 This will be called only about subsystems whose can_attach() operation have
 succeeded. The parameters are identical to can_attach().
 
-void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
 (cgroup_mutex held by caller)
 
 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
 The parameters are identical to can_attach().
 
-void fork(struct task_struct *task)
+``void fork(struct task_struct *task)``
 
 Called when a task is forked into a cgroup.
 
-void exit(struct task_struct *task)
+``void exit(struct task_struct *task)``
 
 Called during task exit.
 
-void free(struct task_struct *task)
+``void free(struct task_struct *task)``
 
 Called when the task_struct is freed.
 
-void bind(struct cgroup *root)
+``void bind(struct cgroup *root)``
 (cgroup_mutex held by caller)
 
 Called when a cgroup subsystem is rebound to a different hierarchy
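The callbacks above form a small vtable in which only css_alloc/css_free are mandatory. Below is a minimal sketch, under two caveats: this legacy text still quotes the old cgroup-based prototypes, whereas the kernel this merge applies to passes cgroup_subsys_state pointers (see include/linux/cgroup-defs.h), and all demo_* names are invented for illustration.

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Hypothetical controller state embedding the generic css. */
struct demo_css {
        struct cgroup_subsys_state css;
};

/* Mandatory pair: allocate and free per-cgroup state. */
static struct cgroup_subsys_state *
demo_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct demo_css *dcs = kzalloc(sizeof(*dcs), GFP_KERNEL);

        return dcs ? &dcs->css : ERR_PTR(-ENOMEM);
}

static void demo_css_free(struct cgroup_subsys_state *css)
{
        kfree(container_of(css, struct demo_css, css));
}

/* Every omitted callback (css_online, attach, fork, ...) defaults to a no-op. */
struct cgroup_subsys demo_cgrp_subsys = {
        .css_alloc      = demo_css_alloc,
        .css_free       = demo_css_free,
};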
@@ -649,6 +665,7 @@ that is being created/destroyed (and hence has no sub-cgroups).
 
 cgroup filesystem supports certain types of extended attributes in its
 directories and files. The current supported types are:
+
  - Trusted (XATTR_TRUSTED)
  - Security (XATTR_SECURITY)
 
@@ -666,12 +683,13 @@ in containers and systemd for assorted meta data like main PID in a cgroup
 5. Questions
 ============
 
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
-   errors. If you use it in the cgroup file system, you won't be
-   able to tell whether a command succeeded or failed.
+::
 
-Q: When I attach processes, only the first of the line gets really attached !
-A: We can only return one error code per call to write(). So you should also
-   put only ONE PID.
+  Q: what's up with this '/bin/echo' ?
+  A: bash's builtin 'echo' command does not check calls to write() against
+     errors. If you use it in the cgroup file system, you won't be
+     able to tell whether a command succeeded or failed.
 
+  Q: When I attach processes, only the first of the line gets really attached !
+  A: We can only return one error code per call to write(). So you should also
+     put only ONE PID.
diff --git a/Documentation/cgroup-v1/cpuacct.txt b/Documentation/cgroup-v1/cpuacct.rst
index 9d73cc0cadb9..d30ed81d2ad7 100644
--- a/Documentation/cgroup-v1/cpuacct.txt
+++ b/Documentation/cgroup-v1/cpuacct.rst
@@ -1,5 +1,6 @@
+=========================
 CPU Accounting Controller
--------------------------
+=========================
 
 The CPU accounting controller is used to group tasks using cgroups and
 account the CPU usage of these groups of tasks.
@@ -8,9 +9,9 @@ The CPU accounting controller supports multi-hierarchy groups. An accounting
 group accumulates the CPU usage of all of its child groups and the tasks
 directly present in its group.
 
-Accounting groups can be created by first mounting the cgroup filesystem.
+Accounting groups can be created by first mounting the cgroup filesystem::
 
-# mount -t cgroup -ocpuacct none /sys/fs/cgroup
+  # mount -t cgroup -ocpuacct none /sys/fs/cgroup
 
 With the above step, the initial or the parent accounting group becomes
 visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
@@ -19,11 +20,11 @@ the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
 by this group which is essentially the CPU time obtained by all the tasks
 in the system.
 
-New accounting groups can be created under the parent group /sys/fs/cgroup.
+New accounting groups can be created under the parent group /sys/fs/cgroup::
 
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
+  # cd /sys/fs/cgroup
+  # mkdir g1
+  # echo $$ > g1/tasks
 
 The above steps create a new group g1 and move the current shell
 process (bash) into it. CPU time consumed by this bash and its children
diff --git a/Documentation/cgroup-v1/cpusets.txt b/Documentation/cgroup-v1/cpusets.rst
index 8402dd6de8df..b6a42cdea72b 100644
--- a/Documentation/cgroup-v1/cpusets.txt
+++ b/Documentation/cgroup-v1/cpusets.rst
@@ -1,35 +1,36 @@
-CPUSETS
--------
+=======
+CPUSETS
+=======
 
 Copyright (C) 2004 BULL SA.
-Written by Simon.Derr@bull.net
-
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <cl@linux.com>
-Modified by Paul Menage <menage@google.com>
-Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
 
-CONTENTS:
-=========
-
-1. Cpusets
-  1.1 What are cpusets ?
-  1.2 Why are cpusets needed ?
-  1.3 How are cpusets implemented ?
-  1.4 What are exclusive cpusets ?
-  1.5 What is memory_pressure ?
-  1.6 What is memory spread ?
-  1.7 What is sched_load_balance ?
-  1.8 What is sched_relax_domain_level ?
-  1.9 How do I use cpusets ?
-2. Usage Examples and Syntax
-  2.1 Basic Usage
-  2.2 Adding/removing cpus
-  2.3 Setting flags
-  2.4 Attaching processes
-3. Questions
-4. Contact
+Written by Simon.Derr@bull.net
+
+- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+- Modified by Paul Jackson <pj@sgi.com>
+- Modified by Christoph Lameter <cl@linux.com>
+- Modified by Paul Menage <menage@google.com>
+- Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+.. CONTENTS:
+
+   1. Cpusets
+     1.1 What are cpusets ?
+     1.2 Why are cpusets needed ?
+     1.3 How are cpusets implemented ?
+     1.4 What are exclusive cpusets ?
+     1.5 What is memory_pressure ?
+     1.6 What is memory spread ?
+     1.7 What is sched_load_balance ?
+     1.8 What is sched_relax_domain_level ?
+     1.9 How do I use cpusets ?
+   2. Usage Examples and Syntax
+     2.1 Basic Usage
+     2.2 Adding/removing cpus
+     2.3 Setting flags
+     2.4 Attaching processes
+   3. Questions
+   4. Contact
 
 1. Cpusets
 ==========
@@ -48,7 +49,7 @@ hooks, beyond what is already present, required to manage dynamic
 job placement on large systems.
 
 Cpusets use the generic cgroup subsystem described in
-Documentation/cgroup-v1/cgroups.txt.
+Documentation/cgroup-v1/cgroups.rst.
 
 Requests by a task, using the sched_setaffinity(2) system call to
 include CPUs in its CPU affinity mask, and using the mbind(2) and
@@ -157,7 +158,7 @@ modifying cpusets is via this cpuset file system.
 The /proc/<pid>/status file for each task has four added lines,
 displaying the task's cpus_allowed (on which CPUs it may be scheduled)
 and mems_allowed (on which Memory Nodes it may obtain memory),
-in the two formats seen in the following example:
+in the two formats seen in the following example::
 
   Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
   Cpus_allowed_list:      0-127
@@ -181,6 +182,7 @@ files describing that cpuset:
  - cpuset.sched_relax_domain_level: the searching range when migrating tasks
 
 In addition, only the root cpuset has the following file:
+
  - cpuset.memory_pressure_enabled flag: compute memory_pressure?
 
 New cpusets are created using the mkdir system call or shell
@@ -266,7 +268,8 @@ to monitor a cpuset for signs of memory pressure. It's up to the
 batch manager or other user code to decide what to do about it and
 take action.
 
-==> Unless this feature is enabled by writing "1" to the special file
+==>
+    Unless this feature is enabled by writing "1" to the special file
     /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
     code of __alloc_pages() for this metric reduces to simply noticing
     that the cpuset_memory_pressure_enabled flag is zero. So only
@@ -399,6 +402,7 @@ have tasks running on them unless explicitly assigned.
 
 This default load balancing across all CPUs is not well suited for
 the following two situations:
+
  1) On large systems, load balancing across many CPUs is expensive.
     If the system is managed using cpusets to place independent jobs
     on separate sets of CPUs, full load balancing is unnecessary.
@@ -501,6 +505,7 @@ all the CPUs that must be load balanced.
 The cpuset code builds a new such partition and passes it to the
 scheduler sched domain setup code, to have the sched domains rebuilt
 as necessary, whenever:
+
  - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
  - or CPUs come or go from a cpuset with this flag enabled,
  - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
@@ -553,13 +558,15 @@ this searching range as you like. This file takes int value which
 indicates size of searching range in levels ideally as follows,
 otherwise initial value -1 that indicates the cpuset has no request.
 
-  -1  : no request. use system default or follow request of others.
-   0  : no search.
-   1  : search siblings (hyperthreads in a core).
-   2  : search cores in a package.
-   3  : search cpus in a node [= system wide on non-NUMA system]
-   4  : search nodes in a chunk of node [on NUMA system]
-   5  : search system wide [on NUMA system]
+====== ===========================================================
+  -1   no request. use system default or follow request of others.
+   0   no search.
+   1   search siblings (hyperthreads in a core).
+   2   search cores in a package.
+   3   search cpus in a node [= system wide on non-NUMA system]
+   4   search nodes in a chunk of node [on NUMA system]
+   5   search system wide [on NUMA system]
+====== ===========================================================
 
 The system default is architecture dependent. The system default
 can be changed using the relax_domain_level= boot parameter.
@@ -578,13 +585,14 @@ and whether it is acceptable or not depends on your situation.
 Don't modify this file if you are not sure.
 
 If your situation is:
+
  - The migration costs between each cpu can be assumed considerably
    small(for you) due to your special application's behavior or
    special hardware support for CPU cache etc.
  - The searching cost doesn't have impact(for you) or you can make
    the searching cost enough small by managing cpuset to compact etc.
  - The latency is required even it sacrifices cache hit rate etc.
-then increasing 'sched_relax_domain_level' would benefit you.
+   then increasing 'sched_relax_domain_level' would benefit you.
 
 
 1.9 How do I use cpusets ?
@@ -678,7 +686,7 @@ To start a new job that is to be contained within a cpuset, the steps are:
 
 For example, the following sequence of commands will setup a cpuset
 named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cpuset:
+and then start a subshell 'sh' in that cpuset::
 
   mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
   cd /sys/fs/cgroup/cpuset
@@ -693,6 +701,7 @@ and then start a subshell 'sh' in that cpuset:
   cat /proc/self/cpuset
 
 There are ways to query or modify cpusets:
+
  - via the cpuset file system directly, using the various cd, mkdir, echo,
    cat, rmdir commands from the shell, or their equivalent from C.
  - via the C library libcpuset.
@@ -722,115 +731,133 @@ Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
 tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
 is the cpuset that holds the whole system.
 
-If you want to create a new cpuset under /sys/fs/cgroup/cpuset:
-# cd /sys/fs/cgroup/cpuset
-# mkdir my_cpuset
+If you want to create a new cpuset under /sys/fs/cgroup/cpuset::
+
+  # cd /sys/fs/cgroup/cpuset
+  # mkdir my_cpuset
 
-Now you want to do something with this cpuset.
-# cd my_cpuset
+Now you want to do something with this cpuset::
 
-In this directory you can find several files:
-# ls
-cgroup.clone_children  cpuset.memory_pressure
-cgroup.event_control   cpuset.memory_spread_page
-cgroup.procs           cpuset.memory_spread_slab
-cpuset.cpu_exclusive   cpuset.mems
-cpuset.cpus            cpuset.sched_load_balance
-cpuset.mem_exclusive   cpuset.sched_relax_domain_level
-cpuset.mem_hardwall    notify_on_release
-cpuset.memory_migrate  tasks
+  # cd my_cpuset
+
+In this directory you can find several files::
+
+  # ls
+  cgroup.clone_children  cpuset.memory_pressure
+  cgroup.event_control   cpuset.memory_spread_page
+  cgroup.procs           cpuset.memory_spread_slab
+  cpuset.cpu_exclusive   cpuset.mems
+  cpuset.cpus            cpuset.sched_load_balance
+  cpuset.mem_exclusive   cpuset.sched_relax_domain_level
+  cpuset.mem_hardwall    notify_on_release
+  cpuset.memory_migrate  tasks
 
 Reading them will give you information about the state of this cpuset:
 the CPUs and Memory Nodes it can use, the processes that are using
 it, its properties. By writing to these files you can manipulate
 the cpuset.
 
-Set some flags:
-# /bin/echo 1 > cpuset.cpu_exclusive
+Set some flags::
+
+  # /bin/echo 1 > cpuset.cpu_exclusive
+
+Add some cpus::
+
+  # /bin/echo 0-7 > cpuset.cpus
+
+Add some mems::
 
-Add some cpus:
-# /bin/echo 0-7 > cpuset.cpus
+  # /bin/echo 0-7 > cpuset.mems
 
-Add some mems:
-# /bin/echo 0-7 > cpuset.mems
+Now attach your shell to this cpuset::
 
-Now attach your shell to this cpuset:
-# /bin/echo $$ > tasks
+  # /bin/echo $$ > tasks
 
 You can also create cpusets inside your cpuset by using mkdir in this
-directory.
-# mkdir my_sub_cs
+directory::
+
+  # mkdir my_sub_cs
+
+To remove a cpuset, just use rmdir::
+
+  # rmdir my_sub_cs
 
-To remove a cpuset, just use rmdir:
-# rmdir my_sub_cs
 This will fail if the cpuset is in use (has cpusets inside, or has
 processes attached).
 
 Note that for legacy reasons, the "cpuset" filesystem exists as a
 wrapper around the cgroup filesystem.
 
-The command
+The command::
 
-mount -t cpuset X /sys/fs/cgroup/cpuset
+  mount -t cpuset X /sys/fs/cgroup/cpuset
 
-is equivalent to
+is equivalent to::
 
-mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
-echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
+  mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
+  echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
 
 2.2 Adding/removing cpus
 ------------------------
 
 This is the syntax to use when writing in the cpus or mems files
-in cpuset directories:
+in cpuset directories::
 
-# /bin/echo 1-4 > cpuset.cpus      -> set cpus list to cpus 1,2,3,4
-# /bin/echo 1,2,3,4 > cpuset.cpus  -> set cpus list to cpus 1,2,3,4
+  # /bin/echo 1-4 > cpuset.cpus      -> set cpus list to cpus 1,2,3,4
+  # /bin/echo 1,2,3,4 > cpuset.cpus  -> set cpus list to cpus 1,2,3,4
 
 To add a CPU to a cpuset, write the new list of CPUs including the
-CPU to be added. To add 6 to the above cpuset:
+CPU to be added. To add 6 to the above cpuset::
 
-# /bin/echo 1-4,6 > cpuset.cpus    -> set cpus list to cpus 1,2,3,4,6
+  # /bin/echo 1-4,6 > cpuset.cpus    -> set cpus list to cpus 1,2,3,4,6
 
 Similarly to remove a CPU from a cpuset, write the new list of CPUs
 without the CPU to be removed.
 
-To remove all the CPUs:
+To remove all the CPUs::
 
-# /bin/echo "" > cpuset.cpus       -> clear cpus list
+  # /bin/echo "" > cpuset.cpus       -> clear cpus list
 
 2.3 Setting flags
 -----------------
 
-The syntax is very simple:
+The syntax is very simple::
 
-# /bin/echo 1 > cpuset.cpu_exclusive  -> set flag 'cpuset.cpu_exclusive'
-# /bin/echo 0 > cpuset.cpu_exclusive  -> unset flag 'cpuset.cpu_exclusive'
+  # /bin/echo 1 > cpuset.cpu_exclusive  -> set flag 'cpuset.cpu_exclusive'
+  # /bin/echo 0 > cpuset.cpu_exclusive  -> unset flag 'cpuset.cpu_exclusive'
 
 2.4 Attaching processes
 -----------------------
 
-# /bin/echo PID > tasks
+::
+
+  # /bin/echo PID > tasks
 
 Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another:
+If you have several tasks to attach, you have to do it one after another::
 
-# /bin/echo PID1 > tasks
-# /bin/echo PID2 > tasks
+  # /bin/echo PID1 > tasks
+  # /bin/echo PID2 > tasks
     ...
-# /bin/echo PIDn > tasks
+  # /bin/echo PIDn > tasks
 
 
 3. Questions
 ============
 
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
+Q:
+   what's up with this '/bin/echo' ?
+
+A:
+   bash's builtin 'echo' command does not check calls to write() against
    errors. If you use it in the cpuset file system, you won't be
    able to tell whether a command succeeded or failed.
 
-Q: When I attach processes, only the first of the line gets really attached !
-A: We can only return one error code per call to write(). So you should also
+Q:
+   When I attach processes, only the first of the line gets really attached !
+
+A:
+   We can only return one error code per call to write(). So you should also
    put only ONE pid.
 
 4. Contact
diff --git a/Documentation/cgroup-v1/devices.txt b/Documentation/cgroup-v1/devices.rst
index 3c1095ca02ea..e1886783961e 100644
--- a/Documentation/cgroup-v1/devices.txt
+++ b/Documentation/cgroup-v1/devices.rst
@@ -1,6 +1,9 @@
+===========================
 Device Whitelist Controller
+===========================
 
-1. Description:
+1. Description
+==============
 
 Implement a cgroup to track and enforce open and mknod restrictions
 on device files. A device cgroup associates a device access
@@ -16,24 +19,26 @@ devices from the whitelist or add new entries. A child cgroup can
16never receive a device access which is denied by its parent. 19never receive a device access which is denied by its parent.
17 20
182. User Interface 212. User Interface
22=================
19 23
20An entry is added using devices.allow, and removed using 24An entry is added using devices.allow, and removed using
21devices.deny. For instance 25devices.deny. For instance::
22 26
23 echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow 27 echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
24 28
25allows cgroup 1 to read and mknod the device usually known as 29allows cgroup 1 to read and mknod the device usually known as
26/dev/null. Doing 30/dev/null. Doing::
27 31
28 echo a > /sys/fs/cgroup/1/devices.deny 32 echo a > /sys/fs/cgroup/1/devices.deny
29 33
30will remove the default 'a *:* rwm' entry. Doing 34will remove the default 'a *:* rwm' entry. Doing::
31 35
32 echo a > /sys/fs/cgroup/1/devices.allow 36 echo a > /sys/fs/cgroup/1/devices.allow
33 37
34will add the 'a *:* rwm' entry to the whitelist. 38will add the 'a *:* rwm' entry to the whitelist.
35 39
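Each entry follows the form '<type> <major>:<minor> <access>', where type is a (all), c (character) or b (block) and access is any combination of r (read), w (write) and m (mknod). A short session might look like this (a sketch; the child cgroup '1' follows the example above)::

    # echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow   # /dev/null: read+mknod
    # echo 'c 1:3 r' > /sys/fs/cgroup/1/devices.deny     # revoke read again
    # cat /sys/fs/cgroup/1/devices.list                  # inspect current state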
363. Security 403. Security
41===========
37 42
38Any task can move itself between cgroups. This clearly won't 43Any task can move itself between cgroups. This clearly won't
39suffice, but we can decide the best way to adequately restrict 44suffice, but we can decide the best way to adequately restrict
@@ -50,6 +55,7 @@ A cgroup may not be granted more permissions than the cgroup's
50parent has. 55parent has.
51 56
524. Hierarchy 574. Hierarchy
58============
53 59
54device cgroups maintain hierarchy by making sure a cgroup never has more 60device cgroups maintain hierarchy by making sure a cgroup never has more
55access permissions than its parent. Every time an entry is written to 61access permissions than its parent. Every time an entry is written to
@@ -58,7 +64,8 @@ from their whitelist and all the locally set whitelist entries will be
58re-evaluated. In case one of the locally set whitelist entries would provide 64re-evaluated. In case one of the locally set whitelist entries would provide
59more access than the cgroup's parent, it'll be removed from the whitelist. 65more access than the cgroup's parent, it'll be removed from the whitelist.
60 66
61Example: 67Example::
68
62 A 69 A
63 / \ 70 / \
64 B 71 B
@@ -67,10 +74,12 @@ Example:
67 A allow "b 8:* rwm", "c 116:1 rw" 74 A allow "b 8:* rwm", "c 116:1 rw"
68 B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" 75 B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
69 76
70If a device is denied in group A: 77If a device is denied in group A::
78
71 # echo "c 116:* r" > A/devices.deny 79 # echo "c 116:* r" > A/devices.deny
80
72it'll propagate down and after revalidating B's entries, the whitelist entry 81it'll propagate down and after revalidating B's entries, the whitelist entry
73"c 116:2 rwm" will be removed: 82"c 116:2 rwm" will be removed::
74 83
75 group whitelist entries denied devices 84 group whitelist entries denied devices
76 A all "b 8:* rwm", "c 116:* rw" 85 A all "b 8:* rwm", "c 116:* rw"
@@ -79,7 +88,8 @@ it'll propagate down and after revalidating B's entries, the whitelist entry
79In case parent's exceptions change and local exceptions are not allowed 88In case parent's exceptions change and local exceptions are not allowed
80anymore, they'll be deleted. 89anymore, they'll be deleted.
81 90
82Notice that new whitelist entries will not be propagated: 91Notice that new whitelist entries will not be propagated::
92
83 A 93 A
84 / \ 94 / \
85 B 95 B
@@ -88,24 +98,30 @@ Notice that new whitelist entries will not be propagated:
88 A "c 1:3 rwm", "c 1:5 r" all the rest 98 A "c 1:3 rwm", "c 1:5 r" all the rest
89 B "c 1:3 rwm", "c 1:5 r" all the rest 99 B "c 1:3 rwm", "c 1:5 r" all the rest
90 100
91when adding "c *:3 rwm": 101when adding ``c *:3 rwm``::
102
92 # echo "c *:3 rwm" >A/devices.allow 103 # echo "c *:3 rwm" >A/devices.allow
93 104
94the result: 105the result::
106
95 group whitelist entries denied devices 107 group whitelist entries denied devices
96 A "c *:3 rwm", "c 1:5 r" all the rest 108 A "c *:3 rwm", "c 1:5 r" all the rest
97 B "c 1:3 rwm", "c 1:5 r" all the rest 109 B "c 1:3 rwm", "c 1:5 r" all the rest
98 110
99but now it'll be possible to add new entries to B: 111but now it'll be possible to add new entries to B::
112
100 # echo "c 2:3 rwm" >B/devices.allow 113 # echo "c 2:3 rwm" >B/devices.allow
101 # echo "c 50:3 r" >B/devices.allow 114 # echo "c 50:3 r" >B/devices.allow
102or even 115
116or even::
117
103 # echo "c *:3 rwm" >B/devices.allow 118 # echo "c *:3 rwm" >B/devices.allow
104 119
105Allowing or denying all by writing 'a' to devices.allow or devices.deny will 120Allowing or denying all by writing 'a' to devices.allow or devices.deny will
106not be possible once the device cgroup has children. 121not be possible once the device cgroup has children.
107 122
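The propagation rules can be reproduced from a shell (a sketch, assuming the devices hierarchy is mounted at /sys/fs/cgroup/devices; A and B follow the example above)::

    # cd /sys/fs/cgroup/devices
    # mkdir A A/B                           # B is a child of A
    # echo a > A/devices.deny               # drop the default 'a *:* rwm' entry
    # echo 'c 116:1 rw' > A/devices.allow   # re-allow one char device
    # echo 'c 116:1 rw' > B/devices.allow   # within A's grant: accepted
    # echo 'c 116:2 rw' > B/devices.allow   # exceeds A's grant: rejected (EPERM)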
1084.1 Hierarchy (internal implementation) 1234.1 Hierarchy (internal implementation)
124---------------------------------------
109 125
110device cgroups is implemented internally using a behavior (ALLOW, DENY) and a 126device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
111list of exceptions. The internal state is controlled using the same user 127list of exceptions. The internal state is controlled using the same user
diff --git a/Documentation/cgroup-v1/freezer-subsystem.txt b/Documentation/cgroup-v1/freezer-subsystem.rst
index e831cb2b8394..582d3427de3f 100644
--- a/Documentation/cgroup-v1/freezer-subsystem.txt
+++ b/Documentation/cgroup-v1/freezer-subsystem.rst
@@ -1,3 +1,7 @@
1==============
2Cgroup Freezer
3==============
4
1The cgroup freezer is useful to batch job management systems which start 5The cgroup freezer is useful to batch job management systems which start
2and stop sets of tasks in order to schedule the resources of a machine 6and stop sets of tasks in order to schedule the resources of a machine
3according to the desires of a system administrator. This sort of program 7according to the desires of a system administrator. This sort of program
@@ -23,7 +27,7 @@ blocked, or ignored it can be seen by waiting or ptracing parent tasks.
23SIGCONT is especially unsuitable since it can be caught by the task. Any 27SIGCONT is especially unsuitable since it can be caught by the task. Any
24programs designed to watch for SIGSTOP and SIGCONT could be broken by 28programs designed to watch for SIGSTOP and SIGCONT could be broken by
25attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can 29attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
26demonstrate this problem using nested bash shells: 30demonstrate this problem using nested bash shells::
27 31
28 $ echo $$ 32 $ echo $$
29 16644 33 16644
@@ -93,19 +97,19 @@ The following cgroupfs files are created by cgroup freezer.
93The root cgroup is non-freezable and the above interface files don't 97The root cgroup is non-freezable and the above interface files don't
94exist. 98exist.
95 99
96* Examples of usage : 100* Examples of usage::
97 101
98 # mkdir /sys/fs/cgroup/freezer 102 # mkdir /sys/fs/cgroup/freezer
99 # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer 103 # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
100 # mkdir /sys/fs/cgroup/freezer/0 104 # mkdir /sys/fs/cgroup/freezer/0
101 # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks 105 # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
102 106
103to get status of the freezer subsystem : 107to get status of the freezer subsystem::
104 108
105 # cat /sys/fs/cgroup/freezer/0/freezer.state 109 # cat /sys/fs/cgroup/freezer/0/freezer.state
106 THAWED 110 THAWED
107 111
108to freeze all tasks in the container : 112to freeze all tasks in the container::
109 113
110 # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state 114 # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
111 # cat /sys/fs/cgroup/freezer/0/freezer.state 115 # cat /sys/fs/cgroup/freezer/0/freezer.state
@@ -113,7 +117,7 @@ to freeze all tasks in the container :
113 # cat /sys/fs/cgroup/freezer/0/freezer.state 117 # cat /sys/fs/cgroup/freezer/0/freezer.state
114 FROZEN 118 FROZEN
115 119
116to unfreeze all tasks in the container : 120to unfreeze all tasks in the container::
117 121
118 # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state 122 # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
119 # cat /sys/fs/cgroup/freezer/0/freezer.state 123 # cat /sys/fs/cgroup/freezer/0/freezer.state
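The freeze/thaw cycle is easy to wrap in small helpers (a sketch, assuming the freezer hierarchy is mounted as above; note the intermediate FREEZING state reported while tasks are still being stopped)::

    freeze() {
        echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
        while [ "$(cat /sys/fs/cgroup/freezer/0/freezer.state)" != "FROZEN" ]; do
            sleep 0.1    # still FREEZING; poll until all tasks are frozen
        done
    }

    thaw() {
        echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
    }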
diff --git a/Documentation/cgroup-v1/hugetlb.txt b/Documentation/cgroup-v1/hugetlb.rst
index 1260e5369b9b..a3902aa253a9 100644
--- a/Documentation/cgroup-v1/hugetlb.txt
+++ b/Documentation/cgroup-v1/hugetlb.rst
@@ -1,5 +1,6 @@
1==================
1HugeTLB Controller 2HugeTLB Controller
2------------------- 3==================
3 4
4The HugeTLB controller allows limiting the HugeTLB usage per control group and 5The HugeTLB controller allows limiting the HugeTLB usage per control group and
5enforces the controller limit during page fault. Since HugeTLB doesn't 6enforces the controller limit during page fault. Since HugeTLB doesn't
@@ -16,16 +17,16 @@ With the above step, the initial or the parent HugeTLB group becomes
16visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in 17visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
17the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. 18the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
18 19
19New groups can be created under the parent group /sys/fs/cgroup. 20New groups can be created under the parent group /sys/fs/cgroup::
20 21
21# cd /sys/fs/cgroup 22 # cd /sys/fs/cgroup
22# mkdir g1 23 # mkdir g1
23# echo $$ > g1/tasks 24 # echo $$ > g1/tasks
24 25
25The above steps create a new group g1 and move the current shell 26The above steps create a new group g1 and move the current shell
26process (bash) into it. 27process (bash) into it.
27 28
28Brief summary of control files 29Brief summary of control files::
29 30
30 hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage 31 hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage
31 hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded 32 hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
@@ -33,17 +34,17 @@ Brief summary of control files
33 hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit 34 hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit
34 35
35For a system supporting three hugepage sizes (64k, 32M and 1G), the control 36For a system supporting three hugepage sizes (64k, 32M and 1G), the control
36files include: 37files include::
37 38
38hugetlb.1GB.limit_in_bytes 39 hugetlb.1GB.limit_in_bytes
39hugetlb.1GB.max_usage_in_bytes 40 hugetlb.1GB.max_usage_in_bytes
40hugetlb.1GB.usage_in_bytes 41 hugetlb.1GB.usage_in_bytes
41hugetlb.1GB.failcnt 42 hugetlb.1GB.failcnt
42hugetlb.64KB.limit_in_bytes 43 hugetlb.64KB.limit_in_bytes
43hugetlb.64KB.max_usage_in_bytes 44 hugetlb.64KB.max_usage_in_bytes
44hugetlb.64KB.usage_in_bytes 45 hugetlb.64KB.usage_in_bytes
45hugetlb.64KB.failcnt 46 hugetlb.64KB.failcnt
46hugetlb.32MB.limit_in_bytes 47 hugetlb.32MB.limit_in_bytes
47hugetlb.32MB.max_usage_in_bytes 48 hugetlb.32MB.max_usage_in_bytes
48hugetlb.32MB.usage_in_bytes 49 hugetlb.32MB.usage_in_bytes
49hugetlb.32MB.failcnt 50 hugetlb.32MB.failcnt
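Setting a limit then follows the same pattern as other controllers (a sketch, reusing the g1 group created above, on a system whose supported hugepage size is 2MB)::

    # echo 1G > /sys/fs/cgroup/g1/hugetlb.2MB.limit_in_bytes  # cap usage at 1GiB
    # cat /sys/fs/cgroup/g1/hugetlb.2MB.usage_in_bytes        # current usage
    # cat /sys/fs/cgroup/g1/hugetlb.2MB.failcnt               # allocations denied by the limit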
diff --git a/Documentation/cgroup-v1/index.rst b/Documentation/cgroup-v1/index.rst
new file mode 100644
index 000000000000..fe76d42edc11
--- /dev/null
+++ b/Documentation/cgroup-v1/index.rst
@@ -0,0 +1,30 @@
1:orphan:
2
3========================
4Control Groups version 1
5========================
6
7.. toctree::
8 :maxdepth: 1
9
10 cgroups
11
12 blkio-controller
13 cpuacct
14 cpusets
15 devices
16 freezer-subsystem
17 hugetlb
18 memcg_test
19 memory
20 net_cls
21 net_prio
22 pids
23 rdma
24
25.. only:: subproject and html
26
27 Indices
28 =======
29
30 * :ref:`genindex`
diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.rst
index 621e29ffb358..91bd18c6a514 100644
--- a/Documentation/cgroup-v1/memcg_test.txt
+++ b/Documentation/cgroup-v1/memcg_test.rst
@@ -1,32 +1,43 @@
1Memory Resource Controller(Memcg) Implementation Memo. 1=====================================================
2Memory Resource Controller(Memcg) Implementation Memo
3=====================================================
4
2Last Updated: 2010/2 5Last Updated: 2010/2
6
3Base Kernel Version: based on 2.6.33-rc7-mm (candidate for 34). 7Base Kernel Version: based on 2.6.33-rc7-mm (candidate for 34).
4 8
5Because VM is getting complex (one of the reasons is memcg...), memcg's behavior 9Because VM is getting complex (one of the reasons is memcg...), memcg's behavior
6is complex. This is a document for memcg's internal behavior. 10is complex. This is a document for memcg's internal behavior.
7Please note that implementation details can be changed. 11Please note that implementation details can be changed.
8 12
9(*) Topics on API should be in Documentation/cgroup-v1/memory.txt 13(*) Topics on API should be in Documentation/cgroup-v1/memory.rst
10 14
110. How to record usage ? 150. How to record usage ?
16========================
17
12 2 objects are used. 18 2 objects are used.
13 19
14 page_cgroup ....an object per page. 20 page_cgroup ....an object per page.
21
15 Allocated at boot or memory hotplug. Freed at memory hot removal. 22 Allocated at boot or memory hotplug. Freed at memory hot removal.
16 23
17 swap_cgroup ... an entry per swp_entry. 24 swap_cgroup ... an entry per swp_entry.
25
18 Allocated at swapon(). Freed at swapoff(). 26 Allocated at swapon(). Freed at swapoff().
19 27
20 The page_cgroup has USED bit and double count against a page_cgroup never 28 The page_cgroup has USED bit and double count against a page_cgroup never
21 occurs. swap_cgroup is used only when a charged page is swapped-out. 29 occurs. swap_cgroup is used only when a charged page is swapped-out.
22 30
231. Charge 311. Charge
32=========
24 33
25 a page/swp_entry may be charged (usage += PAGE_SIZE) at 34 a page/swp_entry may be charged (usage += PAGE_SIZE) at
26 35
27 mem_cgroup_try_charge() 36 mem_cgroup_try_charge()
28 37
292. Uncharge 382. Uncharge
39===========
40
30 a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by 41 a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
31 42
32 mem_cgroup_uncharge() 43 mem_cgroup_uncharge()
@@ -37,9 +48,12 @@ Please note that implementation details can be changed.
37 disappears. 48 disappears.
38 49
393. charge-commit-cancel 503. charge-commit-cancel
51=======================
52
40 Memcg pages are charged in two steps: 53 Memcg pages are charged in two steps:
41 mem_cgroup_try_charge() 54
42 mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() 55 - mem_cgroup_try_charge()
56 - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
43 57
44 At try_charge(), there are no flags to say "this page is charged". 58 At try_charge(), there are no flags to say "this page is charged".
45 at this point, usage += PAGE_SIZE. 59 at this point, usage += PAGE_SIZE.
@@ -51,6 +65,8 @@ Please note that implementation details can be changed.
51Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. 65Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
52 66
534. Anonymous 674. Anonymous
68============
69
54 Anonymous page is newly allocated at 70 Anonymous page is newly allocated at
55 - page fault into MAP_ANONYMOUS mapping. 71 - page fault into MAP_ANONYMOUS mapping.
56 - Copy-On-Write. 72 - Copy-On-Write.
@@ -78,34 +94,45 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
78 (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. 94 (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
79 95
805. Page Cache 965. Page Cache
81 Page Cache is charged at 97=============
98
99 Page Cache is charged at
82 - add_to_page_cache_locked(). 100 - add_to_page_cache_locked().
83 101
84 The logic is very clear. (About migration, see below) 102 The logic is very clear. (About migration, see below)
85 Note: __remove_from_page_cache() is called by remove_from_page_cache() 103
86 and __remove_mapping(). 104 Note:
105 __remove_from_page_cache() is called by remove_from_page_cache()
106 and __remove_mapping().
87 107
886. Shmem(tmpfs) Page Cache 1086. Shmem(tmpfs) Page Cache
109===========================
110
89 The best way to understand shmem's page state transition is to read 111 The best way to understand shmem's page state transition is to read
90 mm/shmem.c. 112 mm/shmem.c.
113
91 But brief explanation of the behavior of memcg around shmem will be 114 But brief explanation of the behavior of memcg around shmem will be
92 helpful to understand the logic. 115 helpful to understand the logic.
93 116
94 Shmem's page (just leaf page, not direct/indirect block) can be on 117 Shmem's page (just leaf page, not direct/indirect block) can be on
118
95 - radix-tree of shmem's inode. 119 - radix-tree of shmem's inode.
96 - SwapCache. 120 - SwapCache.
97 - Both on radix-tree and SwapCache. This happens at swap-in 121 - Both on radix-tree and SwapCache. This happens at swap-in
98 and swap-out, 122 and swap-out,
99 123
100 It's charged when... 124 It's charged when...
125
101 - A new page is added to shmem's radix-tree. 126 - A new page is added to shmem's radix-tree.
102 - A swp page is read. (move a charge from swap_cgroup to page_cgroup) 127 - A swp page is read. (move a charge from swap_cgroup to page_cgroup)
103 128
1047. Page Migration 1297. Page Migration
130=================
105 131
106 mem_cgroup_migrate() 132 mem_cgroup_migrate()
107 133
1088. LRU 1348. LRU
135======
109 Each memcg has its own private LRU. Now, its handling is under global 136 Each memcg has its own private LRU. Now, its handling is under global
110 VM's control (means that it's handled under global pgdat->lru_lock). 137 VM's control (means that it's handled under global pgdat->lru_lock).
111 Almost all routines around memcg's LRU are called by global LRU's 138 Almost all routines around memcg's LRU are called by global LRU's
@@ -114,163 +141,211 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
114 A special function is mem_cgroup_isolate_pages(). This scans 141 A special function is mem_cgroup_isolate_pages(). This scans
115 memcg's private LRU and call __isolate_lru_page() to extract a page 142 memcg's private LRU and call __isolate_lru_page() to extract a page
116 from LRU. 143 from LRU.
144
117 (By __isolate_lru_page(), the page is removed from both of global and 145 (By __isolate_lru_page(), the page is removed from both of global and
118 private LRU.) 146 private LRU.)
119 147
120 148
1219. Typical Tests. 1499. Typical Tests.
150=================
122 151
123 Tests for racy cases. 152 Tests for racy cases.
124 153
125 9.1 Small limit to memcg. 1549.1 Small limit to memcg.
155-------------------------
156
126 When testing racy cases, it's a good idea to set memcg's limit 157 When testing racy cases, it's a good idea to set memcg's limit
127 very small rather than in GB. Many races were found in tests under 158 very small rather than in GB. Many races were found in tests under
128 xKB or xxMB limits. 159 xKB or xxMB limits.
160
129 (Memory behavior under GB and memory behavior under MB show very 161 (Memory behavior under GB and memory behavior under MB show very
130 different situations.) 162 different situations.)
163
1649.2 Shmem
165---------
131 166
132 9.2 Shmem
133 Historically, memcg's shmem handling was poor and we saw a number 167 Historically, memcg's shmem handling was poor and we saw a number
134 of troubles here. This is because shmem is page-cache but can be 168 of troubles here. This is because shmem is page-cache but can be
135 SwapCache. Testing with shmem/tmpfs is always a good test. 169 SwapCache. Testing with shmem/tmpfs is always a good test.
136 170
137 9.3 Migration 1719.3 Migration
172-------------
173
138 For NUMA, migration is another special case. For an easy test, cpuset 174 For NUMA, migration is another special case. For an easy test, cpuset
139 is useful. The following is a sample script to do migration. 175 is useful. The following is a sample script to do migration::
140 176
141 mount -t cgroup -o cpuset none /opt/cpuset 177 mount -t cgroup -o cpuset none /opt/cpuset
142 178
143 mkdir /opt/cpuset/01 179 mkdir /opt/cpuset/01
144 echo 1 > /opt/cpuset/01/cpuset.cpus 180 echo 1 > /opt/cpuset/01/cpuset.cpus
145 echo 0 > /opt/cpuset/01/cpuset.mems 181 echo 0 > /opt/cpuset/01/cpuset.mems
146 echo 1 > /opt/cpuset/01/cpuset.memory_migrate 182 echo 1 > /opt/cpuset/01/cpuset.memory_migrate
147 mkdir /opt/cpuset/02 183 mkdir /opt/cpuset/02
148 echo 1 > /opt/cpuset/02/cpuset.cpus 184 echo 1 > /opt/cpuset/02/cpuset.cpus
149 echo 1 > /opt/cpuset/02/cpuset.mems 185 echo 1 > /opt/cpuset/02/cpuset.mems
150 echo 1 > /opt/cpuset/02/cpuset.memory_migrate 186 echo 1 > /opt/cpuset/02/cpuset.memory_migrate
151 187
152 In the above setup, when you move a task from 01 to 02, page migration 188 In the above setup, when you move a task from 01 to 02, page migration
153 from node 0 to node 1 will occur. The following is a script to migrate all 189 from node 0 to node 1 will occur. The following is a script to migrate all
154 under cpuset. 190 under cpuset.::
155 -- 191
156 move_task() 192 --
157 { 193 move_task()
158 for pid in $1 194 {
159 do 195 for pid in $1
160 /bin/echo $pid >$2/tasks 2>/dev/null 196 do
161 echo -n $pid 197 /bin/echo $pid >$2/tasks 2>/dev/null
162 echo -n " " 198 echo -n $pid
163 done 199 echo -n " "
164 echo END 200 done
165 } 201 echo END
166 202 }
167 G1_TASK=`cat ${G1}/tasks` 203
168 G2_TASK=`cat ${G2}/tasks` 204 G1_TASK=`cat ${G1}/tasks`
169 move_task "${G1_TASK}" ${G2} & 205 G2_TASK=`cat ${G2}/tasks`
170 -- 206 move_task "${G1_TASK}" ${G2} &
171 9.4 Memory hotplug. 207 --
208
2099.4 Memory hotplug
210------------------
211
172 memory hotplug is another good test. 212 memory hotplug is another good test.
173 to offline memory, do the following. 213
174 # echo offline > /sys/devices/system/memory/memoryXXX/state 214 to offline memory, do the following::
215
216 # echo offline > /sys/devices/system/memory/memoryXXX/state
217
175 (XXX is the place of memory) 218 (XXX is the place of memory)
219
176 This is an easy way to test page migration, too. 220 This is an easy way to test page migration, too.
177 221
178 9.5 mkdir/rmdir 2229.5 mkdir/rmdir
223---------------
224
179 When using hierarchy, mkdir/rmdir test should be done. 225 When using hierarchy, mkdir/rmdir test should be done.
180 Use tests like the following. 226 Use tests like the following::
227
228 echo 1 >/opt/cgroup/01/memory/use_hierarchy
229 mkdir /opt/cgroup/01/child_a
230 mkdir /opt/cgroup/01/child_b
181 231
182 echo 1 >/opt/cgroup/01/memory/use_hierarchy 232 set limit to 01.
183 mkdir /opt/cgroup/01/child_a 233 add limit to 01/child_b
184 mkdir /opt/cgroup/01/child_b 234 run jobs under child_a and child_b
185 235
186 set limit to 01. 236 create/delete following groups at random while jobs are running::
187 add limit to 01/child_b
188 run jobs under child_a and child_b
189 237
190 create/delete following groups at random while jobs are running. 238 /opt/cgroup/01/child_a/child_aa
191 /opt/cgroup/01/child_a/child_aa 239 /opt/cgroup/01/child_b/child_bb
192 /opt/cgroup/01/child_b/child_bb 240 /opt/cgroup/01/child_c
193 /opt/cgroup/01/child_c
194 241
195 running new jobs in new group is also good. 242 running new jobs in new group is also good.
196 243
197 9.6 Mount with other subsystems. 2449.6 Mount with other subsystems
245-------------------------------
246
198 Mounting with other subsystems is a good test because there is a 247 Mounting with other subsystems is a good test because there is a
199 race and lock dependency with other cgroup subsystems. 248 race and lock dependency with other cgroup subsystems.
200 249
201 example) 250 example::
202 # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices 251
252 # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
203 253
204 and do task move, mkdir, rmdir etc...under this. 254 and do task move, mkdir, rmdir etc...under this.
205 255
206 9.7 swapoff. 2569.7 swapoff
257-----------
258
207 Besides swap management being one of the complicated parts of memcg, 259 Besides swap management being one of the complicated parts of memcg,
208 the call path of swap-in at swapoff is not the same as the usual swap-in path. 260 the call path of swap-in at swapoff is not the same as the usual swap-in path.
209 It's worth testing explicitly. 261 It's worth testing explicitly.
210 262
211 For example, a test like the following is good. 263 For example, a test like the following is good:
212 (Shell-A) 264
213 # mount -t cgroup none /cgroup -o memory 265 (Shell-A)::
214 # mkdir /cgroup/test 266
215 # echo 40M > /cgroup/test/memory.limit_in_bytes 267 # mount -t cgroup none /cgroup -o memory
216 # echo 0 > /cgroup/test/tasks 268 # mkdir /cgroup/test
269 # echo 40M > /cgroup/test/memory.limit_in_bytes
270 # echo 0 > /cgroup/test/tasks
271
217 Run malloc(100M) program under this. You'll see 60M of swaps. 272 Run malloc(100M) program under this. You'll see 60M of swaps.
218 (Shell-B) 273
219 # move all tasks in /cgroup/test to /cgroup 274 (Shell-B)::
220 # /sbin/swapoff -a 275
221 # rmdir /cgroup/test 276 # move all tasks in /cgroup/test to /cgroup
222 # kill malloc task. 277 # /sbin/swapoff -a
278 # rmdir /cgroup/test
279 # kill malloc task.
223 280
224 Of course, tmpfs v.s. swapoff test should be tested, too. 281 Of course, tmpfs v.s. swapoff test should be tested, too.
225 282
226 9.8 OOM-Killer 2839.8 OOM-Killer
284--------------
285
227 Out-of-memory caused by memcg's limit will kill tasks under 286 Out-of-memory caused by memcg's limit will kill tasks under
228 the memcg. When hierarchy is used, a task under hierarchy 287 the memcg. When hierarchy is used, a task under hierarchy
229 will be killed by the kernel. 288 will be killed by the kernel.
289
230 In this case, panic_on_oom shouldn't be invoked and tasks 290 In this case, panic_on_oom shouldn't be invoked and tasks
231 in other groups shouldn't be killed. 291 in other groups shouldn't be killed.
232 292
233 It's not difficult to cause OOM under memcg as following. 293 It's not difficult to cause OOM under memcg as following.
234 Case A) when you can swapoff 294
235 #swapoff -a 295 Case A) when you can swapoff::
236 #echo 50M > /memory.limit_in_bytes 296
297 #swapoff -a
298 #echo 50M > /memory.limit_in_bytes
299
237 run 51M of malloc 300 run 51M of malloc
238 301
239 Case B) when you use mem+swap limitation. 302 Case B) when you use mem+swap limitation::
240 #echo 50M > memory.limit_in_bytes 303
241 #echo 50M > memory.memsw.limit_in_bytes 304 #echo 50M > memory.limit_in_bytes
305 #echo 50M > memory.memsw.limit_in_bytes
306
242 run 51M of malloc 307 run 51M of malloc
243 308
244 9.9 Move charges at task migration 3099.9 Move charges at task migration
310----------------------------------
311
245 Charges associated with a task can be moved along with task migration. 312 Charges associated with a task can be moved along with task migration.
246 313
247 (Shell-A) 314 (Shell-A)::
248 #mkdir /cgroup/A 315
249 #echo $$ >/cgroup/A/tasks 316 #mkdir /cgroup/A
317 #echo $$ >/cgroup/A/tasks
318
250 run some programs which use some amount of memory in /cgroup/A. 319 run some programs which use some amount of memory in /cgroup/A.
251 320
252 (Shell-B) 321 (Shell-B)::
253 #mkdir /cgroup/B 322
254 #echo 1 >/cgroup/B/memory.move_charge_at_immigrate 323 #mkdir /cgroup/B
255 #echo "pid of the program running in group A" >/cgroup/B/tasks 324 #echo 1 >/cgroup/B/memory.move_charge_at_immigrate
325 #echo "pid of the program running in group A" >/cgroup/B/tasks
256 326
257 You can see charges have been moved by reading *.usage_in_bytes or 327 You can see charges have been moved by reading ``*.usage_in_bytes`` or
258 memory.stat of both A and B. 328 memory.stat of both A and B.
259 See 8.2 of Documentation/cgroup-v1/memory.txt to see what value should be
260 written to move_charge_at_immigrate.
261 329
262 9.10 Memory thresholds 330 See 8.2 of Documentation/cgroup-v1/memory.rst to see what value should
331 be written to move_charge_at_immigrate.
332
3339.10 Memory thresholds
334----------------------
335
263 Memory controller implements memory thresholds using cgroups notification 336 Memory controller implements memory thresholds using cgroups notification
264 API. You can use tools/cgroup/cgroup_event_listener.c to test it. 337 API. You can use tools/cgroup/cgroup_event_listener.c to test it.
265 338
266 (Shell-A) Create cgroup and run event listener 339 (Shell-A) Create cgroup and run event listener::
267 # mkdir /cgroup/A 340
268 # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M 341 # mkdir /cgroup/A
342 # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
343
344 (Shell-B) Add task to cgroup and try to allocate and free memory::
269 345
270 (Shell-B) Add task to cgroup and try to allocate and free memory 346 # echo $$ >/cgroup/A/tasks
271 # echo $$ >/cgroup/A/tasks 347 # a="$(dd if=/dev/zero bs=1M count=10)"
272 # a="$(dd if=/dev/zero bs=1M count=10)" 348 # a=
273 # a=
274 349
275 You will see message from cgroup_event_listener every time you cross 350 You will see message from cgroup_event_listener every time you cross
276 the thresholds. 351 the thresholds.
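The whole threshold test can be driven from one shell (a sketch, assuming cgroup_event_listener was built from tools/cgroup and memcg is mounted at /cgroup as in the earlier examples)::

    # mkdir /cgroup/A
    # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M &
    # echo $$ >/cgroup/A/tasks
    # a="$(dd if=/dev/zero bs=1M count=10)"   # usage crosses 5M: event fires
    # a=                                      # free it: crossing back also fires
    # echo $$ >/cgroup/tasks                  # move the shell out again
    # rmdir /cgroup/A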
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.rst
index a33cedf85427..41bdc038dad9 100644
--- a/Documentation/cgroup-v1/memory.txt
+++ b/Documentation/cgroup-v1/memory.rst
@@ -1,22 +1,26 @@
1==========================
1Memory Resource Controller 2Memory Resource Controller
3==========================
2 4
3NOTE: This document is hopelessly outdated and it asks for a complete 5NOTE:
6 This document is hopelessly outdated and it asks for a complete
4 rewrite. It still contains useful information so we are keeping it 7 rewrite. It still contains useful information so we are keeping it
5 here but make sure to check the current code if you need a deeper 8 here but make sure to check the current code if you need a deeper
6 understanding. 9 understanding.
7 10
8NOTE: The Memory Resource Controller has generically been referred to as the 11NOTE:
12 The Memory Resource Controller has generically been referred to as the
9 memory controller in this document. Do not confuse memory controller 13 memory controller in this document. Do not confuse memory controller
10 used here with the memory controller that is used in hardware. 14 used here with the memory controller that is used in hardware.
11 15
12(For editors) 16(For editors) In this document:
13In this document:
14 When we mention a cgroup (cgroupfs's directory) with memory controller, 17 When we mention a cgroup (cgroupfs's directory) with memory controller,
15 we call it "memory cgroup". When you see git-log and source code, you'll 18 we call it "memory cgroup". When you see git-log and source code, you'll
16 see patch's title and function names tend to use "memcg". 19 see patch's title and function names tend to use "memcg".
17 In this document, we avoid using it. 20 In this document, we avoid using it.
18 21
19Benefits and Purpose of the memory controller 22Benefits and Purpose of the memory controller
23=============================================
20 24
21The memory controller isolates the memory behaviour of a group of tasks 25The memory controller isolates the memory behaviour of a group of tasks
22from the rest of the system. The article on LWN [12] mentions some probable 26from the rest of the system. The article on LWN [12] mentions some probable
@@ -38,6 +42,7 @@ e. There are several other use cases; find one or use the controller just
38Current Status: linux-2.6.34-mmotm(development version of 2010/April) 42Current Status: linux-2.6.34-mmotm(development version of 2010/April)
39 43
40Features: 44Features:
45
41 - accounting anonymous pages, file caches, swap caches usage and limiting them. 46 - accounting anonymous pages, file caches, swap caches usage and limiting them.
42 - pages are linked to per-memcg LRU exclusively, and there is no global LRU. 47 - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
43 - optionally, memory+swap usage can be accounted and limited. 48 - optionally, memory+swap usage can be accounted and limited.
@@ -54,41 +59,48 @@ Features:
54 59
55Brief summary of control files. 60Brief summary of control files.
56 61
57 tasks # attach a task(thread) and show list of threads 62==================================== ==========================================
58 cgroup.procs # show list of processes 63 tasks attach a task(thread) and show list of
59 cgroup.event_control # an interface for event_fd() 64 threads
60 memory.usage_in_bytes # show current usage for memory 65 cgroup.procs show list of processes
61 (See 5.5 for details) 66 cgroup.event_control an interface for event_fd()
62 memory.memsw.usage_in_bytes # show current usage for memory+Swap 67 memory.usage_in_bytes show current usage for memory
63 (See 5.5 for details) 68 (See 5.5 for details)
64 memory.limit_in_bytes # set/show limit of memory usage 69 memory.memsw.usage_in_bytes show current usage for memory+Swap
65 memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage 70 (See 5.5 for details)
66 memory.failcnt # show the number of memory usage hits limits 71 memory.limit_in_bytes set/show limit of memory usage
67 memory.memsw.failcnt # show the number of memory+Swap hits limits 72 memory.memsw.limit_in_bytes set/show limit of memory+Swap usage
68 memory.max_usage_in_bytes # show max memory usage recorded 73 memory.failcnt show the number of memory usage hits limits
69 memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded 74 memory.memsw.failcnt show the number of memory+Swap hits limits
70 memory.soft_limit_in_bytes # set/show soft limit of memory usage 75 memory.max_usage_in_bytes show max memory usage recorded
71 memory.stat # show various statistics 76 memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded
72 memory.use_hierarchy # set/show hierarchical account enabled 77 memory.soft_limit_in_bytes set/show soft limit of memory usage
73 memory.force_empty # trigger forced page reclaim 78 memory.stat show various statistics
74 memory.pressure_level # set memory pressure notifications 79 memory.use_hierarchy set/show hierarchical account enabled
75 memory.swappiness # set/show swappiness parameter of vmscan 80 memory.force_empty trigger forced page reclaim
76 (See sysctl's vm.swappiness) 81 memory.pressure_level set memory pressure notifications
77 memory.move_charge_at_immigrate # set/show controls of moving charges 82 memory.swappiness set/show swappiness parameter of vmscan
78 memory.oom_control # set/show oom controls. 83 (See sysctl's vm.swappiness)
79 memory.numa_stat # show the number of memory usage per numa node 84 memory.move_charge_at_immigrate set/show controls of moving charges
80 85 memory.oom_control set/show oom controls.
81 memory.kmem.limit_in_bytes # set/show hard limit for kernel memory 86 memory.numa_stat show the number of memory usage per numa
82 memory.kmem.usage_in_bytes # show current kernel memory allocation 87 node
83 memory.kmem.failcnt # show the number of kernel memory usage hits limits 88
84 memory.kmem.max_usage_in_bytes # show max kernel memory usage recorded 89 memory.kmem.limit_in_bytes set/show hard limit for kernel memory
85 90 memory.kmem.usage_in_bytes show current kernel memory allocation
86 memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory 91 memory.kmem.failcnt show the number of kernel memory usage
87 memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation 92 hits limits
88 memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits 93 memory.kmem.max_usage_in_bytes show max kernel memory usage recorded
89 memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded 94
95 memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory
96 memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation
97 memory.kmem.tcp.failcnt show the number of tcp buf memory usage
98 hits limits
99 memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded
100==================================== ==========================================
90 101
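All of these are ordinary cgroupfs files, so day-to-day inspection is plain reads and writes (a sketch; the mount point and the group '0' follow section 3 below)::

    # cd /sys/fs/cgroup/memory/0
    # cat memory.usage_in_bytes memory.limit_in_bytes
    # echo 512M > memory.soft_limit_in_bytes
    # grep -E '^(rss|cache)' memory.stat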
911. History 1021. History
103==========
92 104
93The memory controller has a long history. A request for comments for the memory 105The memory controller has a long history. A request for comments for the memory
94controller was posted by Balbir Singh [1]. At the time the RFC was posted 106controller was posted by Balbir Singh [1]. At the time the RFC was posted
@@ -103,6 +115,7 @@ at version 6; it combines both mapped (RSS) and unmapped Page
103Cache Control [11]. 115Cache Control [11].
104 116
1052. Memory Control 1172. Memory Control
118=================
106 119
107Memory is a unique resource in the sense that it is present in a limited 120Memory is a unique resource in the sense that it is present in a limited
108amount. If a task requires a lot of CPU processing, the task can spread 121amount. If a task requires a lot of CPU processing, the task can spread
@@ -120,6 +133,7 @@ are:
120The memory controller is the first controller developed. 133The memory controller is the first controller developed.
121 134
1222.1. Design 1352.1. Design
136-----------
123 137
124The core of the design is a counter called the page_counter. The 138The core of the design is a counter called the page_counter. The
125page_counter tracks the current memory usage and limit of the group of 139page_counter tracks the current memory usage and limit of the group of
@@ -127,6 +141,9 @@ processes associated with the controller. Each cgroup has a memory controller
127specific data structure (mem_cgroup) associated with it. 141specific data structure (mem_cgroup) associated with it.
128 142
1292.2. Accounting 1432.2. Accounting
144---------------
145
146::
130 147
131 +--------------------+ 148 +--------------------+
132 | mem_cgroup | 149 | mem_cgroup |
@@ -165,6 +182,7 @@ updated. page_cgroup has its own LRU on cgroup.
165(*) page_cgroup structure is allocated at boot/memory-hotplug time. 182(*) page_cgroup structure is allocated at boot/memory-hotplug time.
166 183
1672.2.1 Accounting details 1842.2.1 Accounting details
185------------------------
168 186
169All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. 187All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
170Some pages which are never reclaimable and will not be on the LRU 188Some pages which are never reclaimable and will not be on the LRU
@@ -191,6 +209,7 @@ Note: we just account pages-on-LRU because our purpose is to control amount
191of used pages; not-on-LRU pages tend to be out-of-control from VM view. 209of used pages; not-on-LRU pages tend to be out-of-control from VM view.
192 210
1932.3 Shared Page Accounting 2112.3 Shared Page Accounting
212--------------------------
194 213
195Shared pages are accounted on the basis of the first touch approach. The 214Shared pages are accounted on the basis of the first touch approach. The
196cgroup that first touches a page is accounted for the page. The principle 215cgroup that first touches a page is accounted for the page. The principle
@@ -207,11 +226,13 @@ be backed into memory in force, charges for pages are accounted against the
207caller of swapoff rather than the users of shmem. 226caller of swapoff rather than the users of shmem.
208 227
2092.4 Swap Extension (CONFIG_MEMCG_SWAP) 2282.4 Swap Extension (CONFIG_MEMCG_SWAP)
229--------------------------------------
210 230
211Swap Extension allows you to record charge for swap. A swapped-in page is 231Swap Extension allows you to record charge for swap. A swapped-in page is
212charged back to original page allocator if possible. 232charged back to original page allocator if possible.
213 233
214When swap is accounted, following files are added. 234When swap is accounted, following files are added.
235
215 - memory.memsw.usage_in_bytes. 236 - memory.memsw.usage_in_bytes.
216 - memory.memsw.limit_in_bytes. 237 - memory.memsw.limit_in_bytes.
217 238
@@ -224,14 +245,16 @@ In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
224By using the memsw limit, you can avoid system OOM which can be caused by swap 245By using the memsw limit, you can avoid system OOM which can be caused by swap
225shortage. 246shortage.
226 247
227* why 'memory+swap' rather than swap. 248**why 'memory+swap' rather than swap**
249
228The global LRU(kswapd) can swap out arbitrary pages. Swap-out means 250The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
229moving the account from memory to swap...there is no change in usage of 251moving the account from memory to swap...there is no change in usage of
230memory+swap. In other words, when we want to limit the usage of swap without 252memory+swap. In other words, when we want to limit the usage of swap without
231affecting global LRU, memory+swap limit is better than just limiting swap from 253affecting global LRU, memory+swap limit is better than just limiting swap from
232an OS point of view. 254an OS point of view.
233 255
234* What happens when a cgroup hits memory.memsw.limit_in_bytes 256**What happens when a cgroup hits memory.memsw.limit_in_bytes**
257
235When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out 258When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
236in this cgroup. Then, swap-out will not be done by cgroup routine and file 259in this cgroup. Then, swap-out will not be done by cgroup routine and file
237caches are dropped. But as mentioned above, global LRU can do swapout memory 260caches are dropped. But as mentioned above, global LRU can do swapout memory
@@ -239,6 +262,7 @@ from it for sanity of the system's memory management state. You can't forbid
239it by cgroup. 262it by cgroup.
240 263
2412.5 Reclaim 2642.5 Reclaim
265-----------
242 266
243Each cgroup maintains a per cgroup LRU which has the same structure as 267Each cgroup maintains a per cgroup LRU which has the same structure as
244global VM. When a cgroup goes over its limit, we first try 268global VM. When a cgroup goes over its limit, we first try
@@ -251,29 +275,36 @@ The reclaim algorithm has not been modified for cgroups, except that
251pages that are selected for reclaiming come from the per-cgroup LRU 275pages that are selected for reclaiming come from the per-cgroup LRU
252list. 276list.
253 277
254NOTE: Reclaim does not work for the root cgroup, since we cannot set any 278NOTE:
255limits on the root cgroup. 279 Reclaim does not work for the root cgroup, since we cannot set any
280 limits on the root cgroup.
256 281
257Note2: When panic_on_oom is set to "2", the whole system will panic. 282Note2:
283 When panic_on_oom is set to "2", the whole system will panic.
258 284
259When oom event notifier is registered, event will be delivered. 285When oom event notifier is registered, event will be delivered.
260(See oom_control section) 286(See oom_control section)
261 287
2622.6 Locking 2882.6 Locking
289-----------
263 290
264 lock_page_cgroup()/unlock_page_cgroup() should not be called under 291 lock_page_cgroup()/unlock_page_cgroup() should not be called under
265 the i_pages lock. 292 the i_pages lock.
266 293
267 Other lock order is following: 294 Other lock order is following:
295
268 PG_locked. 296 PG_locked.
269 mm->page_table_lock 297 mm->page_table_lock
270 pgdat->lru_lock 298 pgdat->lru_lock
271 lock_page_cgroup. 299 lock_page_cgroup.
300
272 In many cases, just lock_page_cgroup() is called. 301 In many cases, just lock_page_cgroup() is called.
302
273 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by 303 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
274 pgdat->lru_lock, it has no lock of its own. 304 pgdat->lru_lock, it has no lock of its own.
275 305
2762.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) 3062.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
307-----------------------------------------------
277 308
278With the Kernel memory extension, the Memory Controller is able to limit 309With the Kernel memory extension, the Memory Controller is able to limit
279the amount of kernel memory used by the system. Kernel memory is fundamentally 310the amount of kernel memory used by the system. Kernel memory is fundamentally
@@ -288,6 +319,7 @@ Kernel memory limits are not imposed for the root cgroup. Usage for the root
288cgroup may or may not be accounted. The memory used is accumulated into 319cgroup may or may not be accounted. The memory used is accumulated into
289memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. 320memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
290(currently only for tcp). 321(currently only for tcp).
322
291The main "kmem" counter is fed into the main counter, so kmem charges will 323The main "kmem" counter is fed into the main counter, so kmem charges will
292also be visible from the user counter. 324also be visible from the user counter.
293 325
@@ -295,36 +327,42 @@ Currently no soft limit is implemented for kernel memory. It is future work
295to trigger slab reclaim when those limits are reached. 327to trigger slab reclaim when those limits are reached.
296 328
2972.7.1 Current Kernel Memory resources accounted 3292.7.1 Current Kernel Memory resources accounted
330-----------------------------------------------
298 331
299* stack pages: every process consumes some stack pages. By accounting into 332stack pages:
300kernel memory, we prevent new processes from being created when the kernel 333 every process consumes some stack pages. By accounting into
301memory usage is too high. 334 kernel memory, we prevent new processes from being created when the kernel
335 memory usage is too high.
302 336
303* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy 337slab pages:
304of each kmem_cache is created every time the cache is touched by the first time 338 pages allocated by the SLAB or SLUB allocator are tracked. A copy
305from inside the memcg. The creation is done lazily, so some objects can still be 339 of each kmem_cache is created every time the cache is touched by the first time
306skipped while the cache is being created. All objects in a slab page should 340 from inside the memcg. The creation is done lazily, so some objects can still be
307belong to the same memcg. This only fails to hold when a task is migrated to a 341 skipped while the cache is being created. All objects in a slab page should
308different memcg during the page allocation by the cache. 342 belong to the same memcg. This only fails to hold when a task is migrated to a
343 different memcg during the page allocation by the cache.
309 344
310* sockets memory pressure: some sockets protocols have memory pressure 345sockets memory pressure:
311thresholds. The Memory Controller allows them to be controlled individually 346 some sockets protocols have memory pressure
312per cgroup, instead of globally. 347 thresholds. The Memory Controller allows them to be controlled individually
348 per cgroup, instead of globally.
313 349
314* tcp memory pressure: sockets memory pressure for the tcp protocol. 350tcp memory pressure:
351 sockets memory pressure for the tcp protocol.
315 352
3162.7.2 Common use cases 3532.7.2 Common use cases
354----------------------
317 355
318Because the "kmem" counter is fed to the main user counter, kernel memory can 356Because the "kmem" counter is fed to the main user counter, kernel memory can
319never be limited completely independently of user memory. Say "U" is the user 357never be limited completely independently of user memory. Say "U" is the user
320limit, and "K" the kernel limit. There are three possible ways limits can be 358limit, and "K" the kernel limit. There are three possible ways limits can be
321set: 359set:
322 360
323 U != 0, K = unlimited: 361U != 0, K = unlimited:
324 This is the standard memcg limitation mechanism already present before kmem 362 This is the standard memcg limitation mechanism already present before kmem
325 accounting. Kernel memory is completely ignored. 363 accounting. Kernel memory is completely ignored.
326 364
327 U != 0, K < U: 365U != 0, K < U:
328 Kernel memory is a subset of the user memory. This setup is useful in 366 Kernel memory is a subset of the user memory. This setup is useful in
329 deployments where the total amount of memory per-cgroup is overcommited. 367 deployments where the total amount of memory per-cgroup is overcommited.
330 Overcommiting kernel memory limits is definitely not recommended, since the 368 Overcommiting kernel memory limits is definitely not recommended, since the
@@ -332,19 +370,23 @@ set:
332 In this case, the admin could set up K so that the sum of all groups is 370 In this case, the admin could set up K so that the sum of all groups is
333 never greater than the total memory, and freely set U at the cost of his 371 never greater than the total memory, and freely set U at the cost of his
334 QoS. 372 QoS.
335 WARNING: In the current implementation, memory reclaim will NOT be 373
374WARNING:
375 In the current implementation, memory reclaim will NOT be
336 triggered for a cgroup when it hits K while staying below U, which makes 376 triggered for a cgroup when it hits K while staying below U, which makes
337 this setup impractical. 377 this setup impractical.
338 378
339 U != 0, K >= U: 379U != 0, K >= U:
340 Kmem charges will also be fed to the user counter and reclaim will be 380 Kmem charges will also be fed to the user counter and reclaim will be
341 triggered for the cgroup for both kinds of memory. This setup gives the 381 triggered for the cgroup for both kinds of memory. This setup gives the
342 admin a unified view of memory, and it is also useful for people who just 382 admin a unified view of memory, and it is also useful for people who just
343 want to track kernel memory usage. 383 want to track kernel memory usage.
344 384
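For example, the "K < U" deployment could be set up as follows (a sketch; the group name 'box' and the 1G/512M values are illustrative)::

    # cd /sys/fs/cgroup/memory/box
    # echo 1G > memory.limit_in_bytes          # U: total (user) limit
    # echo 512M > memory.kmem.limit_in_bytes   # K: kernel-memory subset of U
    # cat memory.kmem.usage_in_bytes           # also fed into the main counter

Keep the warning above in mind: hitting K while staying below U does not trigger reclaim in the current implementation.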
3453. User Interface 3853. User Interface
386=================
346 387
3473.0. Configuration 3883.0. Configuration
389------------------
348 390
349a. Enable CONFIG_CGROUPS 391a. Enable CONFIG_CGROUPS
350b. Enable CONFIG_MEMCG 392b. Enable CONFIG_MEMCG
@@ -352,39 +394,53 @@ c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
352d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) 394d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
353 395
3543.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) 3963.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
355# mount -t tmpfs none /sys/fs/cgroup 397-------------------------------------------------------------------
356# mkdir /sys/fs/cgroup/memory 398
357# mount -t cgroup none /sys/fs/cgroup/memory -o memory 399::
400
401 # mount -t tmpfs none /sys/fs/cgroup
402 # mkdir /sys/fs/cgroup/memory
403 # mount -t cgroup none /sys/fs/cgroup/memory -o memory
404
4053.2. Make the new group and move bash into it::
406
407 # mkdir /sys/fs/cgroup/memory/0
408 # echo $$ > /sys/fs/cgroup/memory/0/tasks
358 409
3593.2. Make the new group and move bash into it 410Since now we're in the 0 cgroup, we can alter the memory limit::
360# mkdir /sys/fs/cgroup/memory/0
361# echo $$ > /sys/fs/cgroup/memory/0/tasks
362 411
363Since now we're in the 0 cgroup, we can alter the memory limit: 412 # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
364# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
365 413
366NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, 414NOTE:
367mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.) 415 We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
416 mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
417 Gibibytes.)
368 418
369NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). 419NOTE:
370NOTE: We cannot set limits on the root cgroup any more. 420 We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
371 421
372# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes 422NOTE:
3734194304 423 We cannot set limits on the root cgroup any more.
374 424
375We can check the usage: 425::
376# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes 426
3771216512 427 # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
428 4194304
429
430We can check the usage::
431
432 # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
433 1216512
378 434
379A successful write to this file does not guarantee a successful setting of 435A successful write to this file does not guarantee a successful setting of
380this limit to the value written into the file. This can be due to a 436this limit to the value written into the file. This can be due to a
381number of factors, such as rounding up to page boundaries or the total 437number of factors, such as rounding up to page boundaries or the total
382availability of memory on the system. The user is required to re-read 438availability of memory on the system. The user is required to re-read
383this file after a write to guarantee the value committed by the kernel. 439this file after a write to guarantee the value committed by the kernel::
384 440
385# echo 1 > memory.limit_in_bytes 441 # echo 1 > memory.limit_in_bytes
386# cat memory.limit_in_bytes 442 # cat memory.limit_in_bytes
3874096 443 4096
388 444
389The memory.failcnt field gives the number of times that the cgroup limit was 445The memory.failcnt field gives the number of times that the cgroup limit was
390exceeded. 446exceeded.
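Since failcnt is cumulative, comparing it before and after a run shows whether the workload pressed against the limit (a sketch; './memory-hungry-workload' stands in for any program that exceeds the limit, and writing 0 resets the counter)::

    # cat memory.failcnt                # before the run
    0
    # ./memory-hungry-workload
    # cat memory.failcnt                # one increment per time the limit was hit
    42
    # echo 0 > memory.failcnt           # reset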
@@ -393,6 +449,7 @@ The memory.stat file gives accounting information. Now, the number of
393caches, RSS and Active pages/Inactive pages are shown. 449caches, RSS and Active pages/Inactive pages are shown.
394 450
3954. Testing 4514. Testing
452==========
396 453
397For testing features and implementation, see memcg_test.txt. 454For testing features and implementation, see memcg_test.txt.
398 455
@@ -408,6 +465,7 @@ But the above two are testing extreme situations.
408Trying usual test under memory controller is always helpful. 465Trying usual test under memory controller is always helpful.
409 466
4104.1 Troubleshooting 4674.1 Troubleshooting
468-------------------
411 469
412Sometimes a user might find that the application under a cgroup is 470Sometimes a user might find that the application under a cgroup is
413terminated by the OOM killer. There are several causes for this: 471terminated by the OOM killer. There are several causes for this:
@@ -422,6 +480,7 @@ To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
422seeing what happens will be helpful. 480seeing what happens will be helpful.
423 481
4244.2 Task migration 4824.2 Task migration
483------------------
425 484
426When a task migrates from one cgroup to another, its charge is not 485When a task migrates from one cgroup to another, its charge is not
427carried forward by default. The pages allocated from the original cgroup still 486carried forward by default. The pages allocated from the original cgroup still
@@ -432,6 +491,7 @@ You can move charges of a task along with task migration.
432See 8. "Move charges at task migration" 491See 8. "Move charges at task migration"
433 492
4344.3 Removing a cgroup 4934.3 Removing a cgroup
494---------------------
435 495
436A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a 496A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
437cgroup might have some charge associated with it, even though all 497cgroup might have some charge associated with it, even though all
@@ -448,13 +508,15 @@ will be charged as a new owner of it.
448 508
449About use_hierarchy, see Section 6. 509About use_hierarchy, see Section 6.
450 510
4515. Misc. interfaces. 5115. Misc. interfaces
512===================
452 513
4535.1 force_empty 5145.1 force_empty
515---------------
454 memory.force_empty interface is provided to make cgroup's memory usage empty. 516 memory.force_empty interface is provided to make cgroup's memory usage empty.
455 When writing anything to this 517 When writing anything to this::
456 518
457 # echo 0 > memory.force_empty 519 # echo 0 > memory.force_empty
458 520
459	the cgroup's memory will be reclaimed, freeing as many pages as possible. 521	the cgroup's memory will be reclaimed, freeing as many pages as possible.
460 522
@@ -471,50 +533,61 @@ About use_hierarchy, see Section 6.
471 About use_hierarchy, see Section 6. 533 About use_hierarchy, see Section 6.
472 534
4735.2 stat file 5355.2 stat file
536-------------
474 537
475memory.stat file includes following statistics 538memory.stat file includes following statistics
476 539
477# per-memory cgroup local status 540per-memory cgroup local status
478cache - # of bytes of page cache memory. 541^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
479rss - # of bytes of anonymous and swap cache memory (includes 542
543=============== ===============================================================
544cache # of bytes of page cache memory.
545rss # of bytes of anonymous and swap cache memory (includes
480 transparent hugepages). 546 transparent hugepages).
481rss_huge - # of bytes of anonymous transparent hugepages. 547rss_huge # of bytes of anonymous transparent hugepages.
482mapped_file - # of bytes of mapped file (includes tmpfs/shmem) 548mapped_file # of bytes of mapped file (includes tmpfs/shmem)
483pgpgin - # of charging events to the memory cgroup. The charging 549pgpgin # of charging events to the memory cgroup. The charging
484 event happens each time a page is accounted as either mapped 550 event happens each time a page is accounted as either mapped
485 anon page(RSS) or cache page(Page Cache) to the cgroup. 551 anon page(RSS) or cache page(Page Cache) to the cgroup.
486pgpgout - # of uncharging events to the memory cgroup. The uncharging 552pgpgout # of uncharging events to the memory cgroup. The uncharging
487 event happens each time a page is unaccounted from the cgroup. 553 event happens each time a page is unaccounted from the cgroup.
488swap - # of bytes of swap usage 554swap # of bytes of swap usage
489dirty - # of bytes that are waiting to get written back to the disk. 555dirty # of bytes that are waiting to get written back to the disk.
490writeback - # of bytes of file/anon cache that are queued for syncing to 556writeback # of bytes of file/anon cache that are queued for syncing to
491 disk. 557 disk.
492inactive_anon - # of bytes of anonymous and swap cache memory on inactive 558inactive_anon # of bytes of anonymous and swap cache memory on inactive
493 LRU list. 559 LRU list.
494active_anon - # of bytes of anonymous and swap cache memory on active 560active_anon # of bytes of anonymous and swap cache memory on active
495 LRU list. 561 LRU list.
496inactive_file - # of bytes of file-backed memory on inactive LRU list. 562inactive_file # of bytes of file-backed memory on inactive LRU list.
497active_file - # of bytes of file-backed memory on active LRU list. 563active_file # of bytes of file-backed memory on active LRU list.
498unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). 564unevictable # of bytes of memory that cannot be reclaimed (mlocked etc).
499 565=============== ===============================================================
500# status considering hierarchy (see memory.use_hierarchy settings) 566
501 567status considering hierarchy (see memory.use_hierarchy settings)
502hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy 568^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
503 under which the memory cgroup is 569
504hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to 570========================= ===================================================
505 hierarchy under which memory cgroup is. 571hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy
506 572 under which the memory cgroup is
507total_<counter> - # hierarchical version of <counter>, which in 573hierarchical_memsw_limit # of bytes of memory+swap limit with regard to
508 addition to the cgroup's own value includes the 574 hierarchy under which memory cgroup is.
509 sum of all hierarchical children's values of 575
510 <counter>, i.e. total_cache 576total_<counter> # hierarchical version of <counter>, which in
511 577 addition to the cgroup's own value includes the
512# The following additional stats are dependent on CONFIG_DEBUG_VM. 578 sum of all hierarchical children's values of
513 579 <counter>, i.e. total_cache
514recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) 580========================= ===================================================
515recent_rotated_file - VM internal parameter. (see mm/vmscan.c) 581
516recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) 582The following additional stats are dependent on CONFIG_DEBUG_VM
517recent_scanned_file - VM internal parameter. (see mm/vmscan.c) 583^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
584
585========================= ========================================
586recent_rotated_anon VM internal parameter. (see mm/vmscan.c)
587recent_rotated_file VM internal parameter. (see mm/vmscan.c)
588recent_scanned_anon VM internal parameter. (see mm/vmscan.c)
589recent_scanned_file VM internal parameter. (see mm/vmscan.c)
590========================= ========================================
518 591
519Memo: 592Memo:
520 recent_rotated means recent frequency of LRU rotation. 593 recent_rotated means recent frequency of LRU rotation.
@@ -525,12 +598,15 @@ Note:
525 Only anonymous and swap cache memory is listed as part of 'rss' stat. 598 Only anonymous and swap cache memory is listed as part of 'rss' stat.
526 This should not be confused with the true 'resident set size' or the 599 This should not be confused with the true 'resident set size' or the
527 amount of physical memory used by the cgroup. 600 amount of physical memory used by the cgroup.
601
528	'rss + mapped_file' will give you resident set size of cgroup. 602	'rss + mapped_file' will give you resident set size of cgroup.
603
529 (Note: file and shmem may be shared among other cgroups. In that case, 604 (Note: file and shmem may be shared among other cgroups. In that case,
530 mapped_file is accounted only when the memory cgroup is owner of page 605 mapped_file is accounted only when the memory cgroup is owner of page
531 cache.) 606 cache.)
532 607
5335.3 swappiness 6085.3 swappiness
609--------------
534 610
535Overrides /proc/sys/vm/swappiness for the particular group. The tunable 611Overrides /proc/sys/vm/swappiness for the particular group. The tunable
536in the root cgroup corresponds to the global swappiness setting. 612in the root cgroup corresponds to the global swappiness setting.
@@ -541,16 +617,19 @@ there is a swap storage available. This might lead to memcg OOM killer
541if there are no file pages to reclaim. 617if there are no file pages to reclaim.
542 618
5435.4 failcnt 6195.4 failcnt
620-----------
544 621
545A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. 622A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
546This failcnt(== failure count) shows the number of times that a usage counter 623This failcnt(== failure count) shows the number of times that a usage counter
547hit its limit. When a memory cgroup hits a limit, failcnt increases and 624hit its limit. When a memory cgroup hits a limit, failcnt increases and
548memory under it will be reclaimed. 625memory under it will be reclaimed.
549 626
550You can reset failcnt by writing 0 to failcnt file. 627You can reset failcnt by writing 0 to failcnt file::
551# echo 0 > .../memory.failcnt 628
629 # echo 0 > .../memory.failcnt
552 630
5535.5 usage_in_bytes 6315.5 usage_in_bytes
632------------------
554 633
555For efficiency, as other kernel components, memory cgroup uses some optimization 634For efficiency, as other kernel components, memory cgroup uses some optimization
556to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the 635to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
@@ -560,6 +639,7 @@ If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
560value in memory.stat(see 5.2). 639value in memory.stat(see 5.2).
561 640
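For illustration, that more exact figure can be derived in user space by summing rss + cache (+ swap) from memory.stat; in this sketch the cgroup path /sys/fs/cgroup/memory/foo is only an assumed example::

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* path is an assumed example; adjust to the actual mount point */
		FILE *f = fopen("/sys/fs/cgroup/memory/foo/memory.stat", "r");
		char key[64];
		unsigned long long val, exact = 0;

		if (!f)
			return 1;

		/* memory.stat is a list of "<name> <value>" lines (see 5.2) */
		while (fscanf(f, "%63s %llu", key, &val) == 2)
			if (!strcmp(key, "rss") || !strcmp(key, "cache") ||
			    !strcmp(key, "swap"))
				exact += val;

		fclose(f);
		printf("exact usage: %llu bytes\n", exact);
		return 0;
	}
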
5625.6 numa_stat 6415.6 numa_stat
642-------------
563 643
564This is similar to numa_maps but operates on a per-memcg basis. This is 644This is similar to numa_maps but operates on a per-memcg basis. This is
565useful for providing visibility into the numa locality information within 645useful for providing visibility into the numa locality information within
@@ -571,22 +651,23 @@ Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
571per-node page counts including "hierarchical_<counter>" which sums up all 651per-node page counts including "hierarchical_<counter>" which sums up all
572hierarchical children's values in addition to the memcg's own value. 652hierarchical children's values in addition to the memcg's own value.
573 653
574The output format of memory.numa_stat is: 654The output format of memory.numa_stat is::
575 655
576total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... 656 total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
577file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... 657 file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
578anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... 658 anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
579unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ... 659 unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ...
580hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ... 660 hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
581 661
582The "total" count is sum of file + anon + unevictable. 662The "total" count is sum of file + anon + unevictable.
583 663
5846. Hierarchy support 6646. Hierarchy support
665====================
585 666
586The memory controller supports a deep hierarchy and hierarchical accounting. 667The memory controller supports a deep hierarchy and hierarchical accounting.
587The hierarchy is created by creating the appropriate cgroups in the 668The hierarchy is created by creating the appropriate cgroups in the
588cgroup filesystem. Consider for example, the following cgroup filesystem 669cgroup filesystem. Consider for example, the following cgroup filesystem
589hierarchy 670hierarchy::
590 671
591 root 672 root
592 / | \ 673 / | \
@@ -603,24 +684,28 @@ limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
603children of the ancestor. 684children of the ancestor.
604 685
6056.1 Enabling hierarchical accounting and reclaim 6866.1 Enabling hierarchical accounting and reclaim
687------------------------------------------------
606 688
607A memory cgroup by default disables the hierarchy feature. Support 689A memory cgroup by default disables the hierarchy feature. Support
608can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup 690can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup::
609 691
610# echo 1 > memory.use_hierarchy 692 # echo 1 > memory.use_hierarchy
611 693
612The feature can be disabled by 694The feature can be disabled by::
613 695
614# echo 0 > memory.use_hierarchy 696 # echo 0 > memory.use_hierarchy
615 697
616NOTE1: Enabling/disabling will fail if either the cgroup already has other 698NOTE1:
699 Enabling/disabling will fail if either the cgroup already has other
617 cgroups created below it, or if the parent cgroup has use_hierarchy 700 cgroups created below it, or if the parent cgroup has use_hierarchy
618 enabled. 701 enabled.
619 702
620NOTE2: When panic_on_oom is set to "2", the whole system will panic in 703NOTE2:
704 When panic_on_oom is set to "2", the whole system will panic in
621 case of an OOM event in any cgroup. 705 case of an OOM event in any cgroup.
622 706
6237. Soft limits 7077. Soft limits
708==============
624 709
625Soft limits allow for greater sharing of memory. The idea behind soft limits 710Soft limits allow for greater sharing of memory. The idea behind soft limits
626is to allow control groups to use as much of the memory as needed, provided 711is to allow control groups to use as much of the memory as needed, provided
@@ -640,22 +725,26 @@ hints/setup. Currently soft limit based reclaim is set up such that
640it gets invoked from balance_pgdat (kswapd). 725it gets invoked from balance_pgdat (kswapd).
641 726
6427.1 Interface 7277.1 Interface
728-------------
643 729
644Soft limits can be set up by using the following commands (in this example we 730
645assume a soft limit of 256 MiB) 731assume a soft limit of 256 MiB)::
646 732
647# echo 256M > memory.soft_limit_in_bytes 733 # echo 256M > memory.soft_limit_in_bytes
648 734
649If we want to change this to 1G, we can at any time use 735If we want to change this to 1G, we can at any time use::
650 736
651# echo 1G > memory.soft_limit_in_bytes 737 # echo 1G > memory.soft_limit_in_bytes
652 738
653NOTE1: Soft limits take effect over a long period of time, since they involve 739NOTE1:
740 Soft limits take effect over a long period of time, since they involve
654 reclaiming memory for balancing between memory cgroups 741 reclaiming memory for balancing between memory cgroups
655NOTE2: It is recommended to set the soft limit always below the hard limit, 742NOTE2:
743 It is recommended to set the soft limit always below the hard limit,
656 otherwise the hard limit will take precedence. 744 otherwise the hard limit will take precedence.
657 745
6588. Move charges at task migration 7468. Move charges at task migration
747=================================
659 748
660Users can move charges associated with a task along with task migration, that 749Users can move charges associated with a task along with task migration, that
661is, uncharge task's pages from the old cgroup and charge them to the new cgroup. 750is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
@@ -663,60 +752,71 @@ This feature is not supported in !CONFIG_MMU environments because of lack of
663page tables. 752page tables.
664 753
6658.1 Interface 7548.1 Interface
755-------------
666 756
667This feature is disabled by default. It can be enabled (and disabled again) by 757This feature is disabled by default. It can be enabled (and disabled again) by
668writing to memory.move_charge_at_immigrate of the destination cgroup. 758writing to memory.move_charge_at_immigrate of the destination cgroup.
669 759
670If you want to enable it: 760If you want to enable it::
671 761
672# echo (some positive value) > memory.move_charge_at_immigrate 762 # echo (some positive value) > memory.move_charge_at_immigrate
673 763
674Note: Each bit of move_charge_at_immigrate has its own meaning about what type 764Note:
765	Each bit of move_charge_at_immigrate has its own meaning about what type
675 of charges should be moved. See 8.2 for details. 766 of charges should be moved. See 8.2 for details.
676Note: Charges are moved only when you move mm->owner, in other words, 767Note:
768 Charges are moved only when you move mm->owner, in other words,
677 a leader of a thread group. 769 a leader of a thread group.
678Note: If we cannot find enough space for the task in the destination cgroup, we 770Note:
771 If we cannot find enough space for the task in the destination cgroup, we
679 try to make space by reclaiming memory. Task migration may fail if we 772 try to make space by reclaiming memory. Task migration may fail if we
680 cannot make enough space. 773 cannot make enough space.
681Note: It can take several seconds if you are moving a lot of charges. 774Note:
775	It can take several seconds if you are moving a lot of charges.
682 776
683And if you want to disable it again: 777And if you want to disable it again::
684 778
685# echo 0 > memory.move_charge_at_immigrate 779 # echo 0 > memory.move_charge_at_immigrate
686 780
6878.2 Type of charges which can be moved 7818.2 Type of charges which can be moved
782--------------------------------------
688 783
689Each bit in move_charge_at_immigrate has its own meaning about what type of 784Each bit in move_charge_at_immigrate has its own meaning about what type of
690charges should be moved. But in any case, it must be noted that an account of 785charges should be moved. But in any case, it must be noted that an account of
691a page or a swap can be moved only when it is charged to the task's current 786a page or a swap can be moved only when it is charged to the task's current
692(old) memory cgroup. 787(old) memory cgroup.
693 788
694 bit | what type of charges would be moved ? 789+---+--------------------------------------------------------------------------+
695 -----+------------------------------------------------------------------------ 790|bit| what type of charges would be moved ? |
696 0 | A charge of an anonymous page (or swap of it) used by the target task. 791+===+==========================================================================+
697 | You must enable Swap Extension (see 2.4) to enable move of swap charges. 792| 0 | A charge of an anonymous page (or swap of it) used by the target task. |
698 -----+------------------------------------------------------------------------ 793| | You must enable Swap Extension (see 2.4) to enable move of swap charges. |
699 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) 794+---+--------------------------------------------------------------------------+
700 | and swaps of tmpfs file) mmapped by the target task. Unlike the case of 795| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) |
701 | anonymous pages, file pages (and swaps) in the range mmapped by the task 796| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of |
702 | will be moved even if the task hasn't done page fault, i.e. they might 797| | anonymous pages, file pages (and swaps) in the range mmapped by the task |
703 | not be the task's "RSS", but other task's "RSS" that maps the same file. 798| | will be moved even if the task hasn't done page fault, i.e. they might |
704 | And mapcount of the page is ignored (the page can be moved even if 799| | not be the task's "RSS", but other task's "RSS" that maps the same file. |
705 | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to 800| | And mapcount of the page is ignored (the page can be moved even if |
706 | enable move of swap charges. 801| | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to |
802| | enable move of swap charges. |
803+---+--------------------------------------------------------------------------+
707 804
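As a concrete illustration of the table above, moving both charge types means writing the OR of bit 0 and bit 1, i.e. 3; a small sketch, where the destination cgroup path is an assumed example::

	#include <stdio.h>

	int main(void)
	{
		/* bit 0: anonymous charges, bit 1: mapped file charges */
		unsigned long move = (1UL << 0) | (1UL << 1);	/* == 3 */
		/* written to the destination cgroup (see 8.1); path assumed */
		FILE *f = fopen("/sys/fs/cgroup/memory/dst/memory.move_charge_at_immigrate", "w");

		if (!f)
			return 1;
		fprintf(f, "%lu", move);
		return fclose(f) ? 1 : 0;
	}
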
7088.3 TODO 8058.3 TODO
806--------
709 807
710- All of moving charge operations are done under cgroup_mutex. It's not good 808- All of moving charge operations are done under cgroup_mutex. It's not good
711 behavior to hold the mutex too long, so we may need some trick. 809 behavior to hold the mutex too long, so we may need some trick.
712 810
7139. Memory thresholds 8119. Memory thresholds
812====================
714 813
715Memory cgroup implements memory thresholds using the cgroups notification 814Memory cgroup implements memory thresholds using the cgroups notification
716API (see cgroups.txt). It allows one to register multiple memory and memsw 815
717thresholds and receive notifications when a threshold is crossed. 816
718 817
719To register a threshold, an application must: 818To register a threshold, an application must:
819
720- create an eventfd using eventfd(2); 820- create an eventfd using eventfd(2);
721- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; 821- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
722- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to 822- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
@@ -728,6 +828,7 @@ threshold in any direction.
728It's applicable for root and non-root cgroup. 828It's applicable for root and non-root cgroup.
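Put together, a minimal C sketch of the three registration steps could look as follows; the cgroup path and the 50M threshold are assumed examples, and the registration string is written to the cgroup's cgroup.event_control file::

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		int efd = eventfd(0, 0);
		int ufd = open("/sys/fs/cgroup/memory/foo/memory.usage_in_bytes",
			       O_RDONLY);
		int cfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
			       O_WRONLY);
		char buf[64];
		uint64_t hits;

		if (efd < 0 || ufd < 0 || cfd < 0)
			return 1;

		/* "<event_fd> <fd of memory.usage_in_bytes> <threshold>" */
		snprintf(buf, sizeof(buf), "%d %d 52428800", efd, ufd); /* 50M */
		if (write(cfd, buf, strlen(buf)) < 0)
			return 1;

		/* blocks until usage crosses the threshold in either direction */
		if (read(efd, &hits, sizeof(hits)) == sizeof(hits))
			printf("threshold crossed %llu time(s)\n",
			       (unsigned long long)hits);
		return 0;
	}
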
729 829
73010. OOM Control 83010. OOM Control
831===============
731 832
732memory.oom_control file is for OOM notification and other controls. 833memory.oom_control file is for OOM notification and other controls.
733 834
@@ -736,6 +837,7 @@ API (See cgroups.txt). It allows one to register for multiple OOM notification
736deliveries and receive a notification when an OOM event happens. 837
737 838
738To register a notifier, an application must: 839To register a notifier, an application must:
840
739 - create an eventfd using eventfd(2) 841 - create an eventfd using eventfd(2)
740 - open memory.oom_control file 842 - open memory.oom_control file
741 - write string like "<event_fd> <fd of memory.oom_control>" to 843 - write string like "<event_fd> <fd of memory.oom_control>" to
@@ -752,8 +854,11 @@ If OOM-killer is disabled, tasks under cgroup will hang/sleep
752in memory cgroup's OOM-waitqueue when they request accountable memory. 854in memory cgroup's OOM-waitqueue when they request accountable memory.
753 855
754To make them run again, you have to relax the memory cgroup's OOM status by 856
857
755 * enlarge limit or reduce usage. 858 * enlarge limit or reduce usage.
859
756To reduce usage, 860To reduce usage,
861
757 * kill some tasks. 862 * kill some tasks.
758 * move some tasks to other group with account migration. 863 * move some tasks to other group with account migration.
759 * remove some files (on tmpfs?) 864 * remove some files (on tmpfs?)
@@ -761,11 +866,14 @@ To reduce usage,
761Then, stopped tasks will work again. 866Then, stopped tasks will work again.
762 867
763Reading the file shows the current OOM status. 868
764 oom_kill_disable 0 or 1 (if 1, oom-killer is disabled) 869
765 under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may 870 - oom_kill_disable 0 or 1
766 be stopped.) 871 (if 1, oom-killer is disabled)
872 - under_oom 0 or 1
873 (if 1, the memory cgroup is under OOM, tasks may be stopped.)
767 874
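For completeness, registering for these notifications follows the same eventfd pattern as memory thresholds (see 9.); a condensed sketch under the same assumed cgroup path::

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		int efd = eventfd(0, 0);
		int ofd = open("/sys/fs/cgroup/memory/foo/memory.oom_control",
			       O_RDONLY);
		int cfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
			       O_WRONLY);
		char buf[32];
		uint64_t events;

		if (efd < 0 || ofd < 0 || cfd < 0)
			return 1;

		/* "<event_fd> <fd of memory.oom_control>" */
		snprintf(buf, sizeof(buf), "%d %d", efd, ofd);
		if (write(cfd, buf, strlen(buf)) < 0)
			return 1;

		while (read(efd, &events, sizeof(events)) == sizeof(events))
			printf("OOM events: %llu\n", (unsigned long long)events);
		return 0;
	}
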
76811. Memory Pressure 87511. Memory Pressure
876===================
769 877
770The pressure level notifications can be used to monitor the memory 878The pressure level notifications can be used to monitor the memory
771allocation cost; based on the pressure, applications can implement 879allocation cost; based on the pressure, applications can implement
@@ -840,21 +948,22 @@ Test:
840 948
841 Here is a small script example that makes a new cgroup, sets up a 949 Here is a small script example that makes a new cgroup, sets up a
842 memory limit, sets up a notification in the cgroup and then makes child 950 memory limit, sets up a notification in the cgroup and then makes child
843 cgroup experience a critical pressure: 951 cgroup experience a critical pressure::
844 952
845 # cd /sys/fs/cgroup/memory/ 953 # cd /sys/fs/cgroup/memory/
846 # mkdir foo 954 # mkdir foo
847 # cd foo 955 # cd foo
848 # cgroup_event_listener memory.pressure_level low,hierarchy & 956 # cgroup_event_listener memory.pressure_level low,hierarchy &
849 # echo 8000000 > memory.limit_in_bytes 957 # echo 8000000 > memory.limit_in_bytes
850 # echo 8000000 > memory.memsw.limit_in_bytes 958 # echo 8000000 > memory.memsw.limit_in_bytes
851 # echo $$ > tasks 959 # echo $$ > tasks
852 # dd if=/dev/zero | read x 960 # dd if=/dev/zero | read x
853 961
854 (Expect a bunch of notifications, and eventually, the oom-killer will 962 (Expect a bunch of notifications, and eventually, the oom-killer will
855 trigger.) 963 trigger.)
856 964
85712. TODO 96512. TODO
966========
858 967
8591. Make per-cgroup scanner reclaim not-shared pages first 9681. Make per-cgroup scanner reclaim not-shared pages first
8602. Teach controller to account for shared-pages 9692. Teach controller to account for shared-pages
@@ -862,11 +971,13 @@ Test:
862 not yet hit but the usage is getting closer 971 not yet hit but the usage is getting closer
863 972
864Summary 973Summary
974=======
865 975
866Overall, the memory controller has been a stable controller and has been 976Overall, the memory controller has been a stable controller and has been
867commented and discussed quite extensively in the community. 977commented and discussed quite extensively in the community.
868 978
869References 979References
980==========
870 981
8711. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ 9821. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
8722. Singh, Balbir. Memory Controller (RSS Control), 9832. Singh, Balbir. Memory Controller (RSS Control),
diff --git a/Documentation/cgroup-v1/net_cls.txt b/Documentation/cgroup-v1/net_cls.rst
index ec182346dea2..a2cf272af7a0 100644
--- a/Documentation/cgroup-v1/net_cls.txt
+++ b/Documentation/cgroup-v1/net_cls.rst
@@ -1,5 +1,6 @@
1=========================
1Network classifier cgroup 2Network classifier cgroup
2------------------------- 3=========================
3 4
4The Network classifier cgroup provides an interface to 5The Network classifier cgroup provides an interface to
5tag network packets with a class identifier (classid). 6tag network packets with a class identifier (classid).
@@ -17,23 +18,27 @@ values is 0xAAAABBBB; AAAA is the major handle number and BBBB
17is the minor handle number. 18is the minor handle number.
18Reading net_cls.classid yields a decimal result. 19Reading net_cls.classid yields a decimal result.
19 20
20Example: 21Example::
21mkdir /sys/fs/cgroup/net_cls
22mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
23mkdir /sys/fs/cgroup/net_cls/0
24echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid
25 - setting a 10:1 handle.
26 22
27cat /sys/fs/cgroup/net_cls/0/net_cls.classid 23 mkdir /sys/fs/cgroup/net_cls
281048577 24 mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
25 mkdir /sys/fs/cgroup/net_cls/0
26 echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid
29 27
30configuring tc: 28- setting a 10:1 handle::
31tc qdisc add dev eth0 root handle 10: htb
32 29
33tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit 30 cat /sys/fs/cgroup/net_cls/0/net_cls.classid
34 - creating traffic class 10:1 31 1048577
35 32
36tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup 33- configuring tc::
37 34
38configuring iptables, basic example: 35 tc qdisc add dev eth0 root handle 10: htb
39iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP 36 tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
37
38- creating traffic class 10:1::
39
40 tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
41
42configuring iptables, basic example::
43
44 iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
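The major/minor packing and the hex-to-decimal round trip shown above can be checked with a small C sketch (using the 10:1 handle from the example; tc handle numbers are hexadecimal)::

	#include <stdio.h>

	int main(void)
	{
		/* "10:1" in tc is major 0x10, minor 0x1 */
		unsigned int major = 0x10, minor = 0x1;
		unsigned int classid = (major << 16) | minor;

		/* prints "0x100001 = 1048577", matching the cat output above */
		printf("0x%x = %u\n", classid, classid);
		return 0;
	}
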
diff --git a/Documentation/cgroup-v1/net_prio.txt b/Documentation/cgroup-v1/net_prio.rst
index a82cbd28ea8a..b40905871c64 100644
--- a/Documentation/cgroup-v1/net_prio.txt
+++ b/Documentation/cgroup-v1/net_prio.rst
@@ -1,5 +1,6 @@
1=======================
1Network priority cgroup 2Network priority cgroup
2------------------------- 3=======================
3 4
4The Network priority cgroup provides an interface to allow an administrator to 5The Network priority cgroup provides an interface to allow an administrator to
5dynamically set the priority of network traffic generated by various 6dynamically set the priority of network traffic generated by various
@@ -14,9 +15,9 @@ SO_PRIORITY socket option. This however, is not always possible because:
14 15
15This cgroup allows an administrator to assign a process to a group which defines 16This cgroup allows an administrator to assign a process to a group which defines
16the priority of egress traffic on a given interface. Network priority groups can 17the priority of egress traffic on a given interface. Network priority groups can
17be created by first mounting the cgroup filesystem. 18be created by first mounting the cgroup filesystem::
18 19
19# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio 20 # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
20 21
21With the above step, the initial group acting as the parent accounting group 22With the above step, the initial group acting as the parent accounting group
22becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in 23becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in
@@ -25,17 +26,18 @@ the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
25Each net_prio cgroup contains two files that are subsystem specific 26Each net_prio cgroup contains two files that are subsystem specific
26 27
27net_prio.prioidx 28net_prio.prioidx
28This file is read-only, and is simply informative. It contains a unique integer 29 This file is read-only, and is simply informative. It contains a unique
29value that the kernel uses as an internal representation of this cgroup. 30 integer value that the kernel uses as an internal representation of this
31 cgroup.
30 32
31net_prio.ifpriomap 33net_prio.ifpriomap
32This file contains a map of the priorities assigned to traffic originating from 34 This file contains a map of the priorities assigned to traffic originating
33processes in this group and egressing the system on various interfaces. It 35 from processes in this group and egressing the system on various interfaces.
34contains a list of tuples in the form <ifname priority>. Contents of this file 36 It contains a list of tuples in the form <ifname priority>. Contents of this
35can be modified by echoing a string into the file using the same tuple format. 37 file can be modified by echoing a string into the file using the same tuple
36for example: 38 format. For example::
37 39
38echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap 40 echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap
39 41
40This command would force any traffic originating from processes belonging to the 42This command would force any traffic originating from processes belonging to the
41iscsi net_prio cgroup and egressing on interface eth0 to have the priority of 43iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
diff --git a/Documentation/cgroup-v1/pids.txt b/Documentation/cgroup-v1/pids.rst
index e105d708ccde..6acebd9e72c8 100644
--- a/Documentation/cgroup-v1/pids.txt
+++ b/Documentation/cgroup-v1/pids.rst
@@ -1,5 +1,6 @@
1 Process Number Controller 1=========================
2 ========================= 2Process Number Controller
3=========================
3 4
4Abstract 5Abstract
5-------- 6--------
@@ -34,55 +35,58 @@ pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
34superset of parent/child/pids.current. 35superset of parent/child/pids.current.
35 36
36The pids.events file contains event counters: 37The pids.events file contains event counters:
38
37 - max: Number of times fork failed because limit was hit. 39 - max: Number of times fork failed because limit was hit.
38 40
39Example 41Example
40------- 42-------
41 43
42First, we mount the pids controller: 44First, we mount the pids controller::
43# mkdir -p /sys/fs/cgroup/pids 45
44# mount -t cgroup -o pids none /sys/fs/cgroup/pids 46 # mkdir -p /sys/fs/cgroup/pids
47 # mount -t cgroup -o pids none /sys/fs/cgroup/pids
48
49Then we create a hierarchy, set limits and attach processes to it::
45 50
46Then we create a hierarchy, set limits and attach processes to it: 51 # mkdir -p /sys/fs/cgroup/pids/parent/child
47# mkdir -p /sys/fs/cgroup/pids/parent/child 52 # echo 2 > /sys/fs/cgroup/pids/parent/pids.max
48# echo 2 > /sys/fs/cgroup/pids/parent/pids.max 53 # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
49# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs 54 # cat /sys/fs/cgroup/pids/parent/pids.current
50# cat /sys/fs/cgroup/pids/parent/pids.current 55 2
512 56 #
52#
53 57
54It should be noted that attempts to overcome the set limit (2 in this case) will 58It should be noted that attempts to overcome the set limit (2 in this case) will
55fail: 59fail::
56 60
57# cat /sys/fs/cgroup/pids/parent/pids.current 61 # cat /sys/fs/cgroup/pids/parent/pids.current
582 62 2
59# ( /bin/echo "Here's some processes for you." | cat ) 63 # ( /bin/echo "Here's some processes for you." | cat )
60sh: fork: Resource temporarily unavailable 64	sh: fork: Resource temporarily unavailable
61# 65 #
62 66
63Even if we migrate to a child cgroup (which doesn't have a set limit), we will 67Even if we migrate to a child cgroup (which doesn't have a set limit), we will
64not be able to overcome the most stringent limit in the hierarchy (in this case, 68not be able to overcome the most stringent limit in the hierarchy (in this case,
65parent's): 69parent's)::
66 70
67# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs 71 # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
68# cat /sys/fs/cgroup/pids/parent/pids.current 72 # cat /sys/fs/cgroup/pids/parent/pids.current
692 73 2
70# cat /sys/fs/cgroup/pids/parent/child/pids.current 74 # cat /sys/fs/cgroup/pids/parent/child/pids.current
712 75 2
72# cat /sys/fs/cgroup/pids/parent/child/pids.max 76 # cat /sys/fs/cgroup/pids/parent/child/pids.max
73max 77 max
74# ( /bin/echo "Here's some processes for you." | cat ) 78 # ( /bin/echo "Here's some processes for you." | cat )
75sh: fork: Resource temporarily unavailable 79	sh: fork: Resource temporarily unavailable
76# 80 #
77 81
78We can set a limit that is smaller than pids.current, which will stop any new 82We can set a limit that is smaller than pids.current, which will stop any new
79processes from being forked at all (note that the shell itself counts towards 83processes from being forked at all (note that the shell itself counts towards
80pids.current): 84pids.current)::
81 85
82# echo 1 > /sys/fs/cgroup/pids/parent/pids.max 86 # echo 1 > /sys/fs/cgroup/pids/parent/pids.max
83# /bin/echo "We can't even spawn a single process now." 87 # /bin/echo "We can't even spawn a single process now."
84sh: fork: Resource temporarily unavailable 88	sh: fork: Resource temporarily unavailable
85# echo 0 > /sys/fs/cgroup/pids/parent/pids.max 89 # echo 0 > /sys/fs/cgroup/pids/parent/pids.max
86# /bin/echo "We can't even spawn a single process now." 90 # /bin/echo "We can't even spawn a single process now."
87sh: fork: Resource temporarily unavailable 91	sh: fork: Resource temporarily unavailable
88# 92 #
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.rst
index 9bdb7fd03f83..2fcb0a9bf790 100644
--- a/Documentation/cgroup-v1/rdma.txt
+++ b/Documentation/cgroup-v1/rdma.rst
@@ -1,16 +1,17 @@
1 RDMA Controller 1===============
2 ---------------- 2RDMA Controller
3===============
3 4
4Contents 5.. Contents
5--------
6 6
71. Overview 7 1. Overview
8 1-1. What is RDMA controller? 8 1-1. What is RDMA controller?
9 1-2. Why RDMA controller needed? 9 1-2. Why RDMA controller needed?
10 1-3. How is RDMA controller implemented? 10 1-3. How is RDMA controller implemented?
112. Usage Examples 11 2. Usage Examples
12 12
131. Overview 131. Overview
14===========
14 15
151-1. What is RDMA controller? 161-1. What is RDMA controller?
16----------------------------- 17-----------------------------
@@ -83,27 +84,34 @@ what is configured by user for a given cgroup and what is supported by
83IB device. 84IB device.
84 85
85Following resources can be accounted by rdma controller. 86Following resources can be accounted by rdma controller.
87
88 ========== =============================
86 hca_handle Maximum number of HCA Handles 89 hca_handle Maximum number of HCA Handles
87 hca_object Maximum number of HCA Objects 90 hca_object Maximum number of HCA Objects
91 ========== =============================
88 92
892. Usage Examples 932. Usage Examples
90----------------- 94=================
91 95
92(a) Configure resource limit: 96(a) Configure resource limit::
93echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max 97
94echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max 98 echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
95 99 echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
96(b) Query resource limit: 100
97cat /sys/fs/cgroup/rdma/2/rdma.max 101(b) Query resource limit::
98#Output: 102
99mlx4_0 hca_handle=2 hca_object=2000 103 cat /sys/fs/cgroup/rdma/2/rdma.max
100ocrdma1 hca_handle=3 hca_object=max 104 #Output:
101 105 mlx4_0 hca_handle=2 hca_object=2000
102(c) Query current usage: 106 ocrdma1 hca_handle=3 hca_object=max
103cat /sys/fs/cgroup/rdma/2/rdma.current 107
104#Output: 108(c) Query current usage::
105mlx4_0 hca_handle=1 hca_object=20 109
106ocrdma1 hca_handle=1 hca_object=23 110 cat /sys/fs/cgroup/rdma/2/rdma.current
107 111 #Output:
108(d) Delete resource limit: 112 mlx4_0 hca_handle=1 hca_object=20
109echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max 113 ocrdma1 hca_handle=1 hca_object=23
114
115(d) Delete resource limit::
116
117 echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index d06e9a59a9f4..cad797a8a39e 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for
98use at file creation time. When a task allocates a file in the file 98use at file creation time. When a task allocates a file in the file
99system, the mount option memory policy will be applied with a NodeList, 99system, the mount option memory policy will be applied with a NodeList,
100if any, modified by the calling task's cpuset constraints 100if any, modified by the calling task's cpuset constraints
101[See Documentation/cgroup-v1/cpusets.txt] and any optional flags, listed 101[See Documentation/cgroup-v1/cpusets.rst] and any optional flags, listed
102below. If the resulting NodeLists is the empty set, the effective memory 102below. If the resulting NodeLists is the empty set, the effective memory
103policy for the file will revert to "default" policy. 103policy for the file will revert to "default" policy.
104 104
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index b14e03ff3528..a7514343b660 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -652,7 +652,7 @@ CONTENTS
652 652
653 -deadline tasks cannot have an affinity mask smaller than the entire 653 -deadline tasks cannot have an affinity mask smaller than the entire
654 root_domain they are created on. However, affinities can be specified 654 root_domain they are created on. However, affinities can be specified
655 through the cpuset facility (Documentation/cgroup-v1/cpusets.txt). 655 through the cpuset facility (Documentation/cgroup-v1/cpusets.rst).
656 656
6575.1 SCHED_DEADLINE and cpusets HOWTO 6575.1 SCHED_DEADLINE and cpusets HOWTO
658------------------------------------ 658------------------------------------
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index edd861c94c1b..d1328890ef28 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -215,7 +215,7 @@ SCHED_BATCH) tasks.
215 215
216 These options need CONFIG_CGROUPS to be defined, and let the administrator 216 These options need CONFIG_CGROUPS to be defined, and let the administrator
217 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See 217 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
218 Documentation/cgroup-v1/cgroups.txt for more information about this filesystem. 218 Documentation/cgroup-v1/cgroups.rst for more information about this filesystem.
219 219
220When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each 220When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
221group created using the pseudo filesystem. See example steps below to create 221group created using the pseudo filesystem. See example steps below to create
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index d8fce3e78457..c09f7a3fee66 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -133,7 +133,7 @@ This uses the cgroup virtual file system and "<cgroup>/cpu.rt_runtime_us"
133to control the CPU time reserved for each control group. 133to control the CPU time reserved for each control group.
134 134
135For more information on working with control groups, you should read 135For more information on working with control groups, you should read
136Documentation/cgroup-v1/cgroups.txt as well. 136Documentation/cgroup-v1/cgroups.rst as well.
137 137
138Group settings are checked against the following limits in order to keep the 138Group settings are checked against the following limits in order to keep the
139configuration schedulable: 139configuration schedulable:
diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst
index 5cae13e9a08b..0d830edae8fe 100644
--- a/Documentation/vm/numa.rst
+++ b/Documentation/vm/numa.rst
@@ -67,7 +67,7 @@ nodes. Each emulated node will manage a fraction of the underlying cells'
67physical memory. NUMA emulation is useful for testing NUMA kernel and 67physical memory. NUMA emulation is useful for testing NUMA kernel and
68application features on non-NUMA platforms, and as a sort of memory resource 68application features on non-NUMA platforms, and as a sort of memory resource
69management mechanism when used together with cpusets. 69management mechanism when used together with cpusets.
70[see Documentation/cgroup-v1/cpusets.txt] 70[see Documentation/cgroup-v1/cpusets.rst]
71 71
72For each node with memory, Linux constructs an independent memory management 72For each node with memory, Linux constructs an independent memory management
73subsystem, complete with its own free page lists, in-use page lists, usage 73subsystem, complete with its own free page lists, in-use page lists, usage
@@ -114,7 +114,7 @@ allocation behavior using Linux NUMA memory policy. [see
114 114
115System administrators can restrict the CPUs and nodes' memories that a non- 115System administrators can restrict the CPUs and nodes' memories that a non-
116privileged user can specify in the scheduling or NUMA commands and functions 116privileged user can specify in the scheduling or NUMA commands and functions
117using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.txt] 117using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.rst]
118 118
119On architectures that do not hide memoryless nodes, Linux will include only 119On architectures that do not hide memoryless nodes, Linux will include only
120zones [nodes] with memory in the zonelists. This means that for a memoryless 120zones [nodes] with memory in the zonelists. This means that for a memoryless
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst
index f68d61335abb..35bba27d5fff 100644
--- a/Documentation/vm/page_migration.rst
+++ b/Documentation/vm/page_migration.rst
@@ -41,7 +41,7 @@ locations.
41Larger installations usually partition the system using cpusets into 41Larger installations usually partition the system using cpusets into
42sections of nodes. Paul Jackson has equipped cpusets with the ability to 42sections of nodes. Paul Jackson has equipped cpusets with the ability to
43move pages when a task is moved to another cpuset (See 43move pages when a task is moved to another cpuset (See
44Documentation/cgroup-v1/cpusets.txt). 44Documentation/cgroup-v1/cpusets.rst).
45Cpusets allows the automation of process locality. If a task is moved to 45Cpusets allows the automation of process locality. If a task is moved to
46a new cpuset then also all its pages are moved with it so that the 46a new cpuset then also all its pages are moved with it so that the
47performance of the process does not sink dramatically. Also the pages 47performance of the process does not sink dramatically. Also the pages
diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst
index b8e29f977f2d..c6d94118fbcc 100644
--- a/Documentation/vm/unevictable-lru.rst
+++ b/Documentation/vm/unevictable-lru.rst
@@ -98,7 +98,7 @@ Memory Control Group Interaction
98-------------------------------- 98--------------------------------
99 99
100The unevictable LRU facility interacts with the memory control group [aka 100The unevictable LRU facility interacts with the memory control group [aka
101memory controller; see Documentation/cgroup-v1/memory.txt] by extending the 101memory controller; see Documentation/cgroup-v1/memory.rst] by extending the
102lru_list enum. 102lru_list enum.
103 103
104The memory controller data structure automatically gets a per-zone unevictable 104The memory controller data structure automatically gets a per-zone unevictable
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
index 74fbb78b3c67..a6926cd40f70 100644
--- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
+++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
@@ -15,7 +15,7 @@ assign them to cpusets and their attached tasks. This is a way of limiting the
15amount of system memory that is available to a certain class of tasks. 15amount of system memory that is available to a certain class of tasks.
16 16
17For more information on the features of cpusets, see 17For more information on the features of cpusets, see
18Documentation/cgroup-v1/cpusets.txt. 18Documentation/cgroup-v1/cpusets.rst.
19There are a number of different configurations you can use for your needs. For 19There are a number of different configurations you can use for your needs. For
20more information on the numa=fake command line option and its various ways of 20more information on the numa=fake command line option and its various ways of
21configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. 21configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt.
@@ -40,7 +40,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg::
40 On node 3 totalpages: 131072 40 On node 3 totalpages: 131072
41 41
42Now following the instructions for mounting the cpusets filesystem from 42Now following the instructions for mounting the cpusets filesystem from
43Documentation/cgroup-v1/cpusets.txt, you can assign fake nodes (i.e. contiguous memory 43Documentation/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory
44address spaces) to individual cpusets:: 44address spaces) to individual cpusets::
45 45
46 [root@xroads /]# mkdir exampleset 46 [root@xroads /]# mkdir exampleset
diff --git a/MAINTAINERS b/MAINTAINERS
index 4a9e8e5b2432..558acf24ea1e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4122,7 +4122,7 @@ W: http://www.bullopensource.org/cpuset/
4122W: http://oss.sgi.com/projects/cpusets/ 4122W: http://oss.sgi.com/projects/cpusets/
4123T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 4123T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
4124S: Maintained 4124S: Maintained
4125F: Documentation/cgroup-v1/cpusets.txt 4125F: Documentation/cgroup-v1/cpusets.rst
4126F: include/linux/cpuset.h 4126F: include/linux/cpuset.h
4127F: kernel/cgroup/cpuset.c 4127F: kernel/cgroup/cpuset.c
4128 4128
diff --git a/block/Kconfig b/block/Kconfig
index 2466dcc3ef1d..56cb1695cd87 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -89,7 +89,7 @@ config BLK_DEV_THROTTLING
89 one needs to mount and use blkio cgroup controller for creating 89 one needs to mount and use blkio cgroup controller for creating
90 cgroups and specifying per device IO rate policies. 90 cgroups and specifying per device IO rate policies.
91 91
92 See Documentation/cgroup-v1/blkio-controller.txt for more information. 92 See Documentation/cgroup-v1/blkio-controller.rst for more information.
93 93
94config BLK_DEV_THROTTLING_LOW 94config BLK_DEV_THROTTLING_LOW
95 bool "Block throttling .low limit interface support (EXPERIMENTAL)" 95 bool "Block throttling .low limit interface support (EXPERIMENTAL)"
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index b4e766e93f6e..c5311935239d 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -624,7 +624,7 @@ struct cftype {
624 624
625/* 625/*
626 * Control Group subsystem type. 626 * Control Group subsystem type.
627 * See Documentation/cgroup-v1/cgroups.txt for details 627 * See Documentation/cgroup-v1/cgroups.rst for details
628 */ 628 */
629struct cgroup_subsys { 629struct cgroup_subsys {
630 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); 630 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0297f930a56e..3745ecdad925 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -131,6 +131,8 @@ void cgroup_free(struct task_struct *p);
131int cgroup_init_early(void); 131int cgroup_init_early(void);
132int cgroup_init(void); 132int cgroup_init(void);
133 133
134int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v);
135
134/* 136/*
135 * Iteration helpers and macros. 137 * Iteration helpers and macros.
136 */ 138 */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a8b823c30b43..489e118b69d2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -785,7 +785,7 @@ union bpf_attr {
785 * based on a user-provided identifier for all traffic coming from 785 * based on a user-provided identifier for all traffic coming from
786 * the tasks belonging to the related cgroup. See also the related 786 * the tasks belonging to the related cgroup. See also the related
787 * kernel documentation, available from the Linux sources in file 787 * kernel documentation, available from the Linux sources in file
788 * *Documentation/cgroup-v1/net_cls.txt*. 788 * *Documentation/cgroup-v1/net_cls.rst*.
789 * 789 *
790 * The Linux kernel has two versions for cgroups: there are 790 * The Linux kernel has two versions for cgroups: there are
791 * cgroups v1 and cgroups v2. Both are available to users, who can 791 * cgroups v1 and cgroups v2. Both are available to users, who can
diff --git a/init/Kconfig b/init/Kconfig
index c88289c18d59..bf96faf3fe43 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -850,7 +850,7 @@ config BLK_CGROUP
850 CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set 850 CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
851 CONFIG_BLK_DEV_THROTTLING=y. 851 CONFIG_BLK_DEV_THROTTLING=y.
852 852
853 See Documentation/cgroup-v1/blkio-controller.txt for more information. 853 See Documentation/cgroup-v1/blkio-controller.rst for more information.
854 854
855config DEBUG_BLK_CGROUP 855config DEBUG_BLK_CGROUP
856 bool "IO controller debugging" 856 bool "IO controller debugging"
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index cdbeff87fa99..aaba2a41562a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6240,6 +6240,48 @@ struct cgroup *cgroup_get_from_fd(int fd)
6240} 6240}
6241EXPORT_SYMBOL_GPL(cgroup_get_from_fd); 6241EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6242 6242
6243static u64 power_of_ten(int power)
6244{
6245 u64 v = 1;
6246 while (power--)
6247 v *= 10;
6248 return v;
6249}
6250
6251/**
6252 * cgroup_parse_float - parse a floating number
6253 * @input: input string
6254 * @dec_shift: number of decimal digits to shift
6255 * @v: output
6256 *
6257 * Parse a decimal floating point number in @input and store the result in
6258 * @v with decimal point right shifted @dec_shift times. For example, if
6259 * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
6260 * Returns 0 on success, -errno otherwise.
6261 *
6262 * There's nothing cgroup specific about this function except that cgroup is
6263 * currently its only user.
6264 */
6265int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
6266{
6267 s64 whole, frac = 0;
6268 int fstart = 0, fend = 0, flen;
6269
6270 if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
6271 return -EINVAL;
6272 if (frac < 0)
6273 return -EINVAL;
6274
6275 flen = fend > fstart ? fend - fstart : 0;
6276 if (flen < dec_shift)
6277 frac *= power_of_ten(dec_shift - flen);
6278 else
6279 frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
6280
6281 *v = whole * power_of_ten(dec_shift) + frac;
6282 return 0;
6283}
6284
6243/* 6285/*
6244 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data 6286 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
6245 * definition in cgroup-defs.h. 6287 * definition in cgroup-defs.h.
@@ -6402,4 +6444,5 @@ static int __init cgroup_sysfs_init(void)
6402 return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); 6444 return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
6403} 6445}
6404subsys_initcall(cgroup_sysfs_init); 6446subsys_initcall(cgroup_sysfs_init);
6447
6405#endif /* CONFIG_SYSFS */ 6448#endif /* CONFIG_SYSFS */
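To experiment with the rounding behaviour of cgroup_parse_float() outside the kernel, the helper can be mirrored in plain user-space C (DIV_ROUND_CLOSEST_ULL open-coded as (x + d/2) / d); this is a sketch for exploration, not kernel code::

	#include <stdio.h>

	typedef long long s64;
	typedef unsigned long long u64;

	static u64 power_of_ten(int power)
	{
		u64 v = 1;

		while (power--)
			v *= 10;
		return v;
	}

	static int parse_float(const char *input, unsigned int dec_shift, s64 *v)
	{
		s64 whole, frac = 0;
		int fstart = 0, fend = 0, flen;

		if (sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend) < 1)
			return -1;
		if (frac < 0)
			return -1;

		flen = fend > fstart ? fend - fstart : 0;
		if (flen < (int)dec_shift)
			frac *= power_of_ten(dec_shift - flen);
		else
			/* round the dropped fractional digits to closest */
			frac = (frac + (s64)power_of_ten(flen - dec_shift) / 2) /
			       (s64)power_of_ten(flen - dec_shift);

		*v = whole * power_of_ten(dec_shift) + frac;
		return 0;
	}

	int main(void)
	{
		s64 v;

		/* "12.3456" with dec_shift 3: 3456 rounds to 346, so v == 12346 */
		if (!parse_float("12.3456", 3, &v))
			printf("%lld\n", v);
		return 0;
	}
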
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a1590e244f5f..b3b02b9c4405 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -729,7 +729,7 @@ static inline int nr_cpusets(void)
729 * load balancing domains (sched domains) as specified by that partial 729 * load balancing domains (sched domains) as specified by that partial
730 * partition. 730 * partition.
731 * 731 *
732 * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt 732 * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst
733 * for a background explanation of this. 733 * for a background explanation of this.
734 * 734 *
735 * Does not return errors, on the theory that the callers of this 735 * Does not return errors, on the theory that the callers of this
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index dc28914fa72e..c07196502577 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -509,7 +509,7 @@ static inline int may_allow_all(struct dev_cgroup *parent)
509 * This is one of the three key functions for hierarchy implementation. 509 * This is one of the three key functions for hierarchy implementation.
510 * This function is responsible for re-evaluating all the cgroup's active 510 * This function is responsible for re-evaluating all the cgroup's active
511 * exceptions due to a parent's exception change. 511 * exceptions due to a parent's exception change.
512 * Refer to Documentation/cgroup-v1/devices.txt for more details. 512 * Refer to Documentation/cgroup-v1/devices.rst for more details.
513 */ 513 */
514static void revalidate_active_exceptions(struct dev_cgroup *devcg) 514static void revalidate_active_exceptions(struct dev_cgroup *devcg)
515{ 515{
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a8b823c30b43..489e118b69d2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -785,7 +785,7 @@ union bpf_attr {
785 * based on a user-provided identifier for all traffic coming from 785 * based on a user-provided identifier for all traffic coming from
786 * the tasks belonging to the related cgroup. See also the related 786 * the tasks belonging to the related cgroup. See also the related
787 * kernel documentation, available from the Linux sources in file 787 * kernel documentation, available from the Linux sources in file
788 * *Documentation/cgroup-v1/net_cls.txt*. 788 * *Documentation/cgroup-v1/net_cls.rst*.
789 * 789 *
790 * The Linux kernel has two versions for cgroups: there are 790 * The Linux kernel has two versions for cgroups: there are
791 * cgroups v1 and cgroups v2. Both are available to users, who can 791 * cgroups v1 and cgroups v2. Both are available to users, who can