diff options
-rw-r--r-- | Documentation/cgroup-v1/00-INDEX (renamed from Documentation/cgroups/00-INDEX) | 2 | ||||
-rw-r--r-- | Documentation/cgroup-v1/blkio-controller.txt (renamed from Documentation/cgroups/blkio-controller.txt) | 82 | ||||
-rw-r--r-- | Documentation/cgroup-v1/cgroups.txt (renamed from Documentation/cgroups/cgroups.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v1/cpuacct.txt (renamed from Documentation/cgroups/cpuacct.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v1/cpusets.txt (renamed from Documentation/cgroups/cpusets.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v1/devices.txt (renamed from Documentation/cgroups/devices.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v1/freezer-subsystem.txt (renamed from Documentation/cgroups/freezer-subsystem.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v1/hugetlb.txt (renamed from Documentation/cgroups/hugetlb.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v1/memcg_test.txt (renamed from Documentation/cgroups/memcg_test.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v1/memory.txt (renamed from Documentation/cgroups/memory.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v1/net_cls.txt (renamed from Documentation/cgroups/net_cls.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v1/net_prio.txt (renamed from Documentation/cgroups/net_prio.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v1/pids.txt (renamed from Documentation/cgroups/pids.txt) | 0 | ||||
-rw-r--r-- | Documentation/cgroup-v2.txt | 1293 | ||||
-rw-r--r-- | Documentation/cgroups/unified-hierarchy.txt | 647 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 13 | ||||
-rw-r--r-- | include/linux/cgroup.h | 19 | ||||
-rw-r--r-- | include/linux/cgroup_subsys.h | 18 | ||||
-rw-r--r-- | include/uapi/linux/magic.h | 1 | ||||
-rw-r--r-- | init/Kconfig | 241 | ||||
-rw-r--r-- | kernel/cgroup.c | 81 | ||||
-rw-r--r-- | kernel/cgroup_freezer.c | 2 | ||||
-rw-r--r-- | kernel/cgroup_pids.c | 6 | ||||
-rw-r--r-- | kernel/cpuset.c | 12 | ||||
-rw-r--r-- | kernel/fork.c | 7 | ||||
-rw-r--r-- | kernel/sched/core.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 2 |
27 files changed, 1467 insertions, 961 deletions
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroup-v1/00-INDEX index 3f5a40f57d4a..6ad425f7cf56 100644 --- a/Documentation/cgroups/00-INDEX +++ b/Documentation/cgroup-v1/00-INDEX | |||
@@ -24,7 +24,5 @@ net_prio.txt | |||
24 | - Network priority cgroups details and usages. | 24 | - Network priority cgroups details and usages. |
25 | pids.txt | 25 | pids.txt |
26 | - Process number cgroups details and usages. | 26 | - Process number cgroups details and usages. |
27 | resource_counter.txt | ||
28 | - Resource Counter API. | ||
29 | unified-hierarchy.txt | 27 | unified-hierarchy.txt |
30 | - Description the new/next cgroup interface. | 28 | - Description the new/next cgroup interface. |
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.txt index 52fa9f353342..673dc34d3f78 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroup-v1/blkio-controller.txt | |||
@@ -84,8 +84,7 @@ Throttling/Upper Limit policy | |||
84 | 84 | ||
85 | - Run dd to read a file and see if rate is throttled to 1MB/s or not. | 85 | - Run dd to read a file and see if rate is throttled to 1MB/s or not. |
86 | 86 | ||
87 | # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 | 87 | # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 |
88 | # iflag=direct | ||
89 | 1024+0 records in | 88 | 1024+0 records in |
90 | 1024+0 records out | 89 | 1024+0 records out |
91 | 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s | 90 | 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s |
@@ -374,82 +373,3 @@ One can experience an overall throughput drop if you have created multiple | |||
374 | groups and put applications in that group which are not driving enough | 373 | groups and put applications in that group which are not driving enough |
375 | IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle | 374 | IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle |
376 | on individual groups and throughput should improve. | 375 | on individual groups and throughput should improve. |
377 | |||
378 | Writeback | ||
379 | ========= | ||
380 | |||
381 | Page cache is dirtied through buffered writes and shared mmaps and | ||
382 | written asynchronously to the backing filesystem by the writeback | ||
383 | mechanism. Writeback sits between the memory and IO domains and | ||
384 | regulates the proportion of dirty memory by balancing dirtying and | ||
385 | write IOs. | ||
386 | |||
387 | On traditional cgroup hierarchies, relationships between different | ||
388 | controllers cannot be established making it impossible for writeback | ||
389 | to operate accounting for cgroup resource restrictions and all | ||
390 | writeback IOs are attributed to the root cgroup. | ||
391 | |||
392 | If both the blkio and memory controllers are used on the v2 hierarchy | ||
393 | and the filesystem supports cgroup writeback, writeback operations | ||
394 | correctly follow the resource restrictions imposed by both memory and | ||
395 | blkio controllers. | ||
396 | |||
397 | Writeback examines both system-wide and per-cgroup dirty memory status | ||
398 | and enforces the more restrictive of the two. Also, writeback control | ||
399 | parameters which are absolute values - vm.dirty_bytes and | ||
400 | vm.dirty_background_bytes - are distributed across cgroups according | ||
401 | to their current writeback bandwidth. | ||
402 | |||
403 | There's a peculiarity stemming from the discrepancy in ownership | ||
404 | granularity between memory controller and writeback. While memory | ||
405 | controller tracks ownership per page, writeback operates on inode | ||
406 | basis. cgroup writeback bridges the gap by tracking ownership by | ||
407 | inode but migrating ownership if too many foreign pages, pages which | ||
408 | don't match the current inode ownership, have been encountered while | ||
409 | writing back the inode. | ||
410 | |||
411 | This is a conscious design choice as writeback operations are | ||
412 | inherently tied to inodes making strictly following page ownership | ||
413 | complicated and inefficient. The only use case which suffers from | ||
414 | this compromise is multiple cgroups concurrently dirtying disjoint | ||
415 | regions of the same inode, which is an unlikely use case and decided | ||
416 | to be unsupported. Note that as memory controller assigns page | ||
417 | ownership on the first use and doesn't update it until the page is | ||
418 | released, even if cgroup writeback strictly follows page ownership, | ||
419 | multiple cgroups dirtying overlapping areas wouldn't work as expected. | ||
420 | In general, write-sharing an inode across multiple cgroups is not well | ||
421 | supported. | ||
422 | |||
423 | Filesystem support for cgroup writeback | ||
424 | --------------------------------------- | ||
425 | |||
426 | A filesystem can make writeback IOs cgroup-aware by updating | ||
427 | address_space_operations->writepage[s]() to annotate bio's using the | ||
428 | following two functions. | ||
429 | |||
430 | * wbc_init_bio(@wbc, @bio) | ||
431 | |||
432 | Should be called for each bio carrying writeback data and associates | ||
433 | the bio with the inode's owner cgroup. Can be called anytime | ||
434 | between bio allocation and submission. | ||
435 | |||
436 | * wbc_account_io(@wbc, @page, @bytes) | ||
437 | |||
438 | Should be called for each data segment being written out. While | ||
439 | this function doesn't care exactly when it's called during the | ||
440 | writeback session, it's the easiest and most natural to call it as | ||
441 | data segments are added to a bio. | ||
442 | |||
443 | With writeback bio's annotated, cgroup support can be enabled per | ||
444 | super_block by setting MS_CGROUPWB in ->s_flags. This allows for | ||
445 | selective disabling of cgroup writeback support which is helpful when | ||
446 | certain filesystem features, e.g. journaled data mode, are | ||
447 | incompatible. | ||
448 | |||
449 | wbc_init_bio() binds the specified bio to its cgroup. Depending on | ||
450 | the configuration, the bio may be executed at a lower priority and if | ||
451 | the writeback session is holding shared resources, e.g. a journal | ||
452 | entry, may lead to priority inversion. There is no one easy solution | ||
453 | for the problem. Filesystems can try to work around specific problem | ||
454 | cases by skipping wbc_init_bio() or using bio_associate_blkcg() | ||
455 | directly. | ||
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroup-v1/cgroups.txt index c6256ae9885b..c6256ae9885b 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroup-v1/cgroups.txt | |||
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroup-v1/cpuacct.txt index 9d73cc0cadb9..9d73cc0cadb9 100644 --- a/Documentation/cgroups/cpuacct.txt +++ b/Documentation/cgroup-v1/cpuacct.txt | |||
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroup-v1/cpusets.txt index fdf7dff3f607..fdf7dff3f607 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroup-v1/cpusets.txt | |||
diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroup-v1/devices.txt index 3c1095ca02ea..3c1095ca02ea 100644 --- a/Documentation/cgroups/devices.txt +++ b/Documentation/cgroup-v1/devices.txt | |||
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroup-v1/freezer-subsystem.txt index e831cb2b8394..e831cb2b8394 100644 --- a/Documentation/cgroups/freezer-subsystem.txt +++ b/Documentation/cgroup-v1/freezer-subsystem.txt | |||
diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroup-v1/hugetlb.txt index 106245c3aecc..106245c3aecc 100644 --- a/Documentation/cgroups/hugetlb.txt +++ b/Documentation/cgroup-v1/hugetlb.txt | |||
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt index 8870b0212150..8870b0212150 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroup-v1/memcg_test.txt | |||
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroup-v1/memory.txt index ff71e16cc752..ff71e16cc752 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroup-v1/memory.txt | |||
diff --git a/Documentation/cgroups/net_cls.txt b/Documentation/cgroup-v1/net_cls.txt index ec182346dea2..ec182346dea2 100644 --- a/Documentation/cgroups/net_cls.txt +++ b/Documentation/cgroup-v1/net_cls.txt | |||
diff --git a/Documentation/cgroups/net_prio.txt b/Documentation/cgroup-v1/net_prio.txt index a82cbd28ea8a..a82cbd28ea8a 100644 --- a/Documentation/cgroups/net_prio.txt +++ b/Documentation/cgroup-v1/net_prio.txt | |||
diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroup-v1/pids.txt index 1a078b5d281a..1a078b5d281a 100644 --- a/Documentation/cgroups/pids.txt +++ b/Documentation/cgroup-v1/pids.txt | |||
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt new file mode 100644 index 000000000000..31d1f7bf12a1 --- /dev/null +++ b/Documentation/cgroup-v2.txt | |||
@@ -0,0 +1,1293 @@ | |||
1 | |||
2 | Control Group v2 | ||
3 | |||
4 | October, 2015 Tejun Heo <tj@kernel.org> | ||
5 | |||
6 | This is the authoritative documentation on the design, interface and | ||
7 | conventions of cgroup v2. It describes all userland-visible aspects | ||
8 | of cgroup including core and specific controller behaviors. All | ||
9 | future changes must be reflected in this document. Documentation for | ||
10 | v1 is available under Documentation/cgroup-v1/. | ||
11 | |||
12 | CONTENTS | ||
13 | |||
14 | 1. Introduction | ||
15 | 1-1. Terminology | ||
16 | 1-2. What is cgroup? | ||
17 | 2. Basic Operations | ||
18 | 2-1. Mounting | ||
19 | 2-2. Organizing Processes | ||
20 | 2-3. [Un]populated Notification | ||
21 | 2-4. Controlling Controllers | ||
22 | 2-4-1. Enabling and Disabling | ||
23 | 2-4-2. Top-down Constraint | ||
24 | 2-4-3. No Internal Process Constraint | ||
25 | 2-5. Delegation | ||
26 | 2-5-1. Model of Delegation | ||
27 | 2-5-2. Delegation Containment | ||
28 | 2-6. Guidelines | ||
29 | 2-6-1. Organize Once and Control | ||
30 | 2-6-2. Avoid Name Collisions | ||
31 | 3. Resource Distribution Models | ||
32 | 3-1. Weights | ||
33 | 3-2. Limits | ||
34 | 3-3. Protections | ||
35 | 3-4. Allocations | ||
36 | 4. Interface Files | ||
37 | 4-1. Format | ||
38 | 4-2. Conventions | ||
39 | 4-3. Core Interface Files | ||
40 | 5. Controllers | ||
41 | 5-1. CPU | ||
42 | 5-1-1. CPU Interface Files | ||
43 | 5-2. Memory | ||
44 | 5-2-1. Memory Interface Files | ||
45 | 5-2-2. Usage Guidelines | ||
46 | 5-2-3. Memory Ownership | ||
47 | 5-3. IO | ||
48 | 5-3-1. IO Interface Files | ||
49 | 5-3-2. Writeback | ||
50 | P. Information on Kernel Programming | ||
51 | P-1. Filesystem Support for Writeback | ||
52 | D. Deprecated v1 Core Features | ||
53 | R. Issues with v1 and Rationales for v2 | ||
54 | R-1. Multiple Hierarchies | ||
55 | R-2. Thread Granularity | ||
56 | R-3. Competition Between Inner Nodes and Threads | ||
57 | R-4. Other Interface Issues | ||
58 | R-5. Controller Issues and Remedies | ||
59 | R-5-1. Memory | ||
60 | |||
61 | |||
62 | 1. Introduction | ||
63 | |||
64 | 1-1. Terminology | ||
65 | |||
66 | "cgroup" stands for "control group" and is never capitalized. The | ||
67 | singular form is used to designate the whole feature and also as a | ||
68 | qualifier as in "cgroup controllers". When explicitly referring to | ||
69 | multiple individual control groups, the plural form "cgroups" is used. | ||
70 | |||
71 | |||
72 | 1-2. What is cgroup? | ||
73 | |||
74 | cgroup is a mechanism to organize processes hierarchically and | ||
75 | distribute system resources along the hierarchy in a controlled and | ||
76 | configurable manner. | ||
77 | |||
78 | cgroup is largely composed of two parts - the core and controllers. | ||
79 | cgroup core is primarily responsible for hierarchically organizing | ||
80 | processes. A cgroup controller is usually responsible for | ||
81 | distributing a specific type of system resource along the hierarchy | ||
82 | although there are utility controllers which serve purposes other than | ||
83 | resource distribution. | ||
84 | |||
85 | cgroups form a tree structure and every process in the system belongs | ||
86 | to one and only one cgroup. All threads of a process belong to the | ||
87 | same cgroup. On creation, all processes are put in the cgroup that | ||
88 | the parent process belongs to at the time. A process can be migrated | ||
89 | to another cgroup. Migration of a process doesn't affect already | ||
90 | existing descendant processes. | ||
91 | |||
92 | Following certain structural constraints, controllers may be enabled or | ||
93 | disabled selectively on a cgroup. All controller behaviors are | ||
94 | hierarchical - if a controller is enabled on a cgroup, it affects all | ||
95 | processes which belong to the cgroups comprising the inclusive | ||
96 | sub-hierarchy of the cgroup. When a controller is enabled on a nested | ||
97 | cgroup, it always restricts the resource distribution further. The | ||
98 | restrictions set closer to the root in the hierarchy can not be | ||
99 | overridden from further away. | ||
100 | |||
101 | |||
102 | 2. Basic Operations | ||
103 | |||
104 | 2-1. Mounting | ||
105 | |||
106 | Unlike v1, cgroup v2 has only a single hierarchy. The cgroup v2 | ||
107 | hierarchy can be mounted with the following mount command. | ||
108 | |||
109 | # mount -t cgroup2 none $MOUNT_POINT | ||
110 | |||
111 | cgroup2 filesystem has the magic number 0x63677270 ("cgrp"). All | ||
112 | controllers which support v2 and are not bound to a v1 hierarchy are | ||
113 | automatically bound to the v2 hierarchy and show up at the root. | ||
114 | Controllers which are not in active use in the v2 hierarchy can be | ||
115 | bound to other hierarchies. This allows mixing v2 hierarchy with the | ||
116 | legacy v1 multiple hierarchies in a fully backward compatible way. | ||
117 | |||
118 | A controller can be moved across hierarchies only after the controller | ||
119 | is no longer referenced in its current hierarchy. Because per-cgroup | ||
120 | controller states are destroyed asynchronously and controllers may | ||
121 | have lingering references, a controller may not show up immediately on | ||
122 | the v2 hierarchy after the final umount of the previous hierarchy. | ||
123 | Similarly, a controller should be fully disabled to be moved out of | ||
124 | the unified hierarchy and it may take some time for the disabled | ||
125 | controller to become available for other hierarchies; furthermore, due | ||
126 | to inter-controller dependencies, other controllers may need to be | ||
127 | disabled too. | ||
128 | |||
129 | While useful for development and manual configurations, moving | ||
130 | controllers dynamically between the v2 and other hierarchies is | ||
131 | strongly discouraged for production use. It is recommended to decide | ||
132 | the hierarchies and controller associations before starting to use the | ||
133 | controllers after system boot. | ||
134 | |||
135 | |||
136 | 2-2. Organizing Processes | ||
137 | |||
138 | Initially, only the root cgroup exists to which all processes belong. | ||
139 | A child cgroup can be created by creating a sub-directory. | ||
140 | |||
141 | # mkdir $CGROUP_NAME | ||
142 | |||
143 | A given cgroup may have multiple child cgroups forming a tree | ||
144 | structure. Each cgroup has a read-writable interface file | ||
145 | "cgroup.procs". When read, it lists the PIDs of all processes which | ||
146 | belong to the cgroup one-per-line. The PIDs are not ordered and the | ||
147 | same PID may show up more than once if the process got moved to | ||
148 | another cgroup and then back or the PID got recycled while reading. | ||
149 | |||
150 | A process can be migrated into a cgroup by writing its PID to the | ||
151 | target cgroup's "cgroup.procs" file. Only one process can be migrated | ||
152 | on a single write(2) call. If a process is composed of multiple | ||
153 | threads, writing the PID of any thread migrates all threads of the | ||
154 | process. | ||
155 | |||
156 | When a process forks a child process, the new process is born into the | ||
157 | cgroup that the forking process belongs to at the time of the | ||
158 | operation. After exit, a process stays associated with the cgroup | ||
159 | that it belonged to at the time of exit until it's reaped; however, a | ||
160 | zombie process does not appear in "cgroup.procs" and thus can't be | ||
161 | moved to another cgroup. | ||
162 | |||
163 | A cgroup which doesn't have any children or live processes can be | ||
164 | destroyed by removing the directory. Note that a cgroup which doesn't | ||
165 | have any children and is associated only with zombie processes is | ||
166 | considered empty and can be removed. | ||
167 | |||
168 | # rmdir $CGROUP_NAME | ||
169 | |||
170 | "/proc/$PID/cgroup" lists a process's cgroup membership. If legacy | ||
171 | cgroup is in use in the system, this file may contain multiple lines, | ||
172 | one for each hierarchy. The entry for cgroup v2 is always in the | ||
173 | format "0::$PATH". | ||
174 | |||
175 | # cat /proc/842/cgroup | ||
176 | ... | ||
177 | 0::/test-cgroup/test-cgroup-nested | ||
178 | |||
179 | If the process becomes a zombie and the cgroup it was associated with | ||
180 | is removed subsequently, " (deleted)" is appended to the path. | ||
181 | |||
182 | # cat /proc/842/cgroup | ||
183 | ... | ||
184 | 0::/test-cgroup/test-cgroup-nested (deleted) | ||
185 | |||
186 | |||
187 | 2-3. [Un]populated Notification | ||
188 | |||
189 | Each non-root cgroup has a "cgroup.events" file which contains | ||
190 | "populated" field indicating whether the cgroup's sub-hierarchy has | ||
191 | live processes in it. Its value is 0 if there is no live process in | ||
192 | the cgroup and its descendants; otherwise, 1. poll and [id]notify | ||
193 | events are triggered when the value changes. This can be used, for | ||
194 | example, to start a clean-up operation after all processes of a given | ||
195 | sub-hierarchy have exited. The populated state updates and | ||
196 | notifications are recursive. Consider the following sub-hierarchy | ||
197 | where the numbers in the parentheses represent the numbers of processes | ||
198 | in each cgroup. | ||
199 | |||
200 | A(4) - B(0) - C(1) | ||
201 | \ D(0) | ||
202 | |||
203 | A, B and C's "populated" fields would be 1 while D's 0. After the one | ||
204 | process in C exits, B and C's "populated" fields would flip to "0" and | ||
205 | file modified events will be generated on the "cgroup.events" files of | ||
206 | both cgroups. | ||
207 | |||
208 | |||
209 | 2-4. Controlling Controllers | ||
210 | |||
211 | 2-4-1. Enabling and Disabling | ||
212 | |||
213 | Each cgroup has a "cgroup.controllers" file which lists all | ||
214 | controllers available for the cgroup to enable. | ||
215 | |||
216 | # cat cgroup.controllers | ||
217 | cpu io memory | ||
218 | |||
219 | No controller is enabled by default. Controllers can be enabled and | ||
220 | disabled by writing to the "cgroup.subtree_control" file. | ||
221 | |||
222 | # echo "+cpu +memory -io" > cgroup.subtree_control | ||
223 | |||
224 | Only controllers which are listed in "cgroup.controllers" can be | ||
225 | enabled. When multiple operations are specified as above, either they | ||
226 | all succeed or all fail. If multiple operations on the same controller | ||
227 | are specified, the last one is effective. | ||
228 | |||
229 | Enabling a controller in a cgroup indicates that the distribution of | ||
230 | the target resource across its immediate children will be controlled. | ||
231 | Consider the following sub-hierarchy. The enabled controllers are | ||
232 | listed in parentheses. | ||
233 | |||
234 | A(cpu,memory) - B(memory) - C() | ||
235 | \ D() | ||
236 | |||
237 | As A has "cpu" and "memory" enabled, A will control the distribution | ||
238 | of CPU cycles and memory to its children, in this case, B. As B has | ||
239 | "memory" enabled but not "cpu", C and D will compete freely on CPU | ||
240 | cycles but their division of memory available to B will be controlled. | ||
241 | |||
242 | As a controller regulates the distribution of the target resource to | ||
243 | the cgroup's children, enabling it creates the controller's interface | ||
244 | files in the child cgroups. In the above example, enabling "cpu" on B | ||
245 | would create the "cpu." prefixed controller interface files in C and | ||
246 | D. Likewise, disabling "memory" from B would remove the "memory." | ||
247 | prefixed controller interface files from C and D. This means that the | ||
248 | controller interface files - anything which doesn't start with | ||
249 | "cgroup." - are owned by the parent rather than the cgroup itself. | ||
250 | |||
251 | |||
252 | 2-4-2. Top-down Constraint | ||
253 | |||
254 | Resources are distributed top-down and a cgroup can further distribute | ||
255 | a resource only if the resource has been distributed to it from the | ||
256 | parent. This means that all non-root "cgroup.subtree_control" files | ||
257 | can only contain controllers which are enabled in the parent's | ||
258 | "cgroup.subtree_control" file. A controller can be enabled only if | ||
259 | the parent has the controller enabled and a controller can't be | ||
260 | disabled if one or more children have it enabled. | ||
261 | |||
262 | |||
263 | 2-4-3. No Internal Process Constraint | ||
264 | |||
265 | Non-root cgroups can only distribute resources to their children when | ||
266 | they don't have any processes of their own. In other words, only | ||
267 | cgroups which don't contain any processes can have controllers enabled | ||
268 | in their "cgroup.subtree_control" files. | ||
269 | |||
270 | This guarantees that, when a controller is looking at the part of the | ||
271 | hierarchy which has it enabled, processes are always only on the | ||
272 | leaves. This rules out situations where child cgroups compete against | ||
273 | internal processes of the parent. | ||
274 | |||
275 | The root cgroup is exempt from this restriction. Root contains | ||
276 | processes and anonymous resource consumption which can't be associated | ||
277 | with any other cgroups and requires special treatment from most | ||
278 | controllers. How resource consumption in the root cgroup is governed | ||
279 | is up to each controller. | ||
280 | |||
281 | Note that the restriction doesn't get in the way if there is no | ||
282 | enabled controller in the cgroup's "cgroup.subtree_control". This is | ||
283 | important as otherwise it wouldn't be possible to create children of a | ||
284 | populated cgroup. To control resource distribution of a cgroup, the | ||
285 | cgroup must create children and transfer all its processes to the | ||
286 | children before enabling controllers in its "cgroup.subtree_control" | ||
287 | file. | ||
288 | |||
289 | |||
290 | 2-5. Delegation | ||
291 | |||
292 | 2-5-1. Model of Delegation | ||
293 | |||
294 | A cgroup can be delegated to a less privileged user by granting write | ||
295 | access of the directory and its "cgroup.procs" file to the user. Note | ||
296 | that resource control interface files in a given directory control the | ||
297 | distribution of the parent's resources and thus must not be delegated | ||
298 | along with the directory. | ||
299 | |||
300 | Once delegated, the user can build a sub-hierarchy under the directory, | ||
301 | organize processes as it sees fit and further distribute the resources | ||
302 | it received from the parent. The limits and other settings of all | ||
303 | resource controllers are hierarchical and regardless of what happens | ||
304 | in the delegated sub-hierarchy, nothing can escape the resource | ||
305 | restrictions imposed by the parent. | ||
306 | |||
307 | Currently, cgroup doesn't impose any restrictions on the number of | ||
308 | cgroups in or nesting depth of a delegated sub-hierarchy; however, | ||
309 | this may be limited explicitly in the future. | ||
310 | |||
311 | |||
312 | 2-5-2. Delegation Containment | ||
313 | |||
314 | A delegated sub-hierarchy is contained in the sense that processes | ||
315 | can't be moved into or out of the sub-hierarchy by the delegatee. For | ||
316 | a process with a non-root euid to migrate a target process into a | ||
317 | cgroup by writing its PID to the "cgroup.procs" file, the following | ||
318 | conditions must be met. | ||
319 | |||
320 | - The writer's euid must match either uid or suid of the target process. | ||
321 | |||
322 | - The writer must have write access to the "cgroup.procs" file. | ||
323 | |||
324 | - The writer must have write access to the "cgroup.procs" file of the | ||
325 | common ancestor of the source and destination cgroups. | ||
326 | |||
327 | The above three constraints ensure that while a delegatee may migrate | ||
328 | processes around freely in the delegated sub-hierarchy it can't pull | ||
329 | in from or push out to outside the sub-hierarchy. | ||
330 | |||
331 | For an example, let's assume cgroups C0 and C1 have been delegated to | ||
332 | user U0 who created C00, C01 under C0 and C10 under C1 as follows and | ||
333 | all processes under C0 and C1 belong to U0. | ||
334 | |||
335 | ~~~~~~~~~~~~~ - C0 - C00 | ||
336 | ~ cgroup ~ \ C01 | ||
337 | ~ hierarchy ~ | ||
338 | ~~~~~~~~~~~~~ - C1 - C10 | ||
339 | |||
340 | Let's also say U0 wants to write the PID of a process which is | ||
341 | currently in C10 into "C00/cgroup.procs". U0 has write access to the | ||
342 | file and uid match on the process; however, the common ancestor of the | ||
343 | source cgroup C10 and the destination cgroup C00 is above the points | ||
344 | of delegation and U0 would not have write access to its "cgroup.procs" | ||
345 | file and thus the write will be denied with -EACCES. | ||
346 | |||
347 | |||
348 | 2-6. Guidelines | ||
349 | |||
350 | 2-6-1. Organize Once and Control | ||
351 | |||
352 | Migrating a process across cgroups is a relatively expensive operation | ||
353 | and stateful resources such as memory are not moved together with the | ||
354 | process. This is an explicit design decision as there often exist | ||
355 | inherent trade-offs between migration and various hot paths in terms | ||
356 | of synchronization cost. | ||
357 | |||
358 | As such, migrating processes across cgroups frequently as a means to | ||
359 | apply different resource restrictions is discouraged. A workload | ||
360 | should be assigned to a cgroup according to the system's logical and | ||
361 | resource structure once on start-up. Dynamic adjustments to resource | ||
362 | distribution can be made by changing controller configuration through | ||
363 | the interface files. | ||
364 | |||
365 | |||
366 | 2-6-2. Avoid Name Collisions | ||
367 | |||
368 | Interface files for a cgroup and its children cgroups occupy the same | ||
369 | directory and it is possible to create children cgroups which collide | ||
370 | with interface files. | ||
371 | |||
372 | All cgroup core interface files are prefixed with "cgroup." and each | ||
373 | controller's interface files are prefixed with the controller name and | ||
374 | a dot. A controller's name is composed of lower case letters and | ||
375 | '_'s but never begins with an '_' so it can be used as the prefix | ||
376 | character for collision avoidance. Also, interface file names won't | ||
377 | start or end with terms which are often used in categorizing workloads | ||
378 | such as job, service, slice, unit or workload. | ||
379 | |||
380 | cgroup doesn't do anything to prevent name collisions and it's the | ||
381 | user's responsibility to avoid them. | ||
382 | |||
383 | |||
384 | 3. Resource Distribution Models | ||
385 | |||
386 | cgroup controllers implement several resource distribution schemes | ||
387 | depending on the resource type and expected use cases. This section | ||
388 | describes major schemes in use along with their expected behaviors. | ||
389 | |||
390 | |||
391 | 3-1. Weights | ||
392 | |||
393 | A parent's resource is distributed by adding up the weights of all | ||
394 | active children and giving each the fraction matching the ratio of its | ||
395 | weight against the sum. As only children which can make use of the | ||
396 | resource at the moment participate in the distribution, this is | ||
397 | work-conserving. Due to the dynamic nature, this model is usually | ||
398 | used for stateless resources. | ||
399 | |||
400 | All weights are in the range [1, 10000] with the default at 100. This | ||
401 | allows symmetric multiplicative biases in both directions at fine | ||
402 | enough granularity while staying in the intuitive range. | ||
403 | |||
404 | As long as the weight is in range, all configuration combinations are | ||
405 | valid and there is no reason to reject configuration changes or | ||
406 | process migrations. | ||
407 | |||
408 | "cpu.weight" proportionally distributes CPU cycles to active children | ||
409 | and is an example of this type. | ||
410 | |||
411 | |||
412 | 3-2. Limits | ||
413 | |||
414 | A child can only consume up to the configured amount of the resource. | ||
415 | Limits can be over-committed - the sum of the limits of children can | ||
416 | exceed the amount of resource available to the parent. | ||
417 | |||
418 | Limits are in the range [0, max] and default to "max", which is noop. | ||
419 | |||
420 | As limits can be over-committed, all configuration combinations are | ||
421 | valid and there is no reason to reject configuration changes or | ||
422 | process migrations. | ||
423 | |||
424 | "io.max" limits the maximum BPS and/or IOPS that a cgroup can consume | ||
425 | on an IO device and is an example of this type. | ||
426 | |||
427 | |||
428 | 3-3. Protections | ||
429 | |||
430 | A cgroup is protected to be allocated up to the configured amount of | ||
431 | the resource if the usages of all its ancestors are under their | ||
432 | protected levels. Protections can be hard guarantees or best effort | ||
433 | soft boundaries. Protections can also be over-committed in which case | ||
434 | only up to the amount available to the parent is protected among | ||
435 | children. | ||
436 | |||
437 | Protections are in the range [0, max] and default to 0, which is | ||
438 | noop. | ||
439 | |||
440 | As protections can be over-committed, all configuration combinations | ||
441 | are valid and there is no reason to reject configuration changes or | ||
442 | process migrations. | ||
443 | |||
444 | "memory.low" implements best-effort memory protection and is an | ||
445 | example of this type. | ||
446 | |||
447 | |||
448 | 3-4. Allocations | ||
449 | |||
450 | A cgroup is exclusively allocated a certain amount of a finite | ||
451 | resource. Allocations can't be over-committed - the sum of the | ||
452 | allocations of children can not exceed the amount of resource | ||
453 | available to the parent. | ||
454 | |||
455 | Allocations are in the range [0, max] and default to 0, which is no | ||
456 | resource. | ||
457 | |||
458 | As allocations can't be over-committed, some configuration | ||
459 | combinations are invalid and should be rejected. Also, if the | ||
460 | resource is mandatory for execution of processes, process migrations | ||
461 | may be rejected. | ||
462 | |||
463 | "cpu.rt.max" hard-allocates realtime slices and is an example of this | ||
464 | type. | ||
465 | |||
466 | |||
467 | 4. Interface Files | ||
468 | |||
469 | 4-1. Format | ||
470 | |||
471 | All interface files should be in one of the following formats whenever | ||
472 | possible. | ||
473 | |||
474 | New-line separated values | ||
475 | (when only one value can be written at once) | ||
476 | |||
477 | VAL0\n | ||
478 | VAL1\n | ||
479 | ... | ||
480 | |||
481 | Space separated values | ||
482 | (when read-only or multiple values can be written at once) | ||
483 | |||
484 | VAL0 VAL1 ...\n | ||
485 | |||
486 | Flat keyed | ||
487 | |||
488 | KEY0 VAL0\n | ||
489 | KEY1 VAL1\n | ||
490 | ... | ||
491 | |||
492 | Nested keyed | ||
493 | |||
494 | KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01... | ||
495 | KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11... | ||
496 | ... | ||
497 | |||
498 | For a writable file, the format for writing should generally match | ||
499 | reading; however, controllers may allow omitting later fields or | ||
500 | implement restricted shortcuts for most common use cases. | ||
501 | |||
502 | For both flat and nested keyed files, only the values for a single key | ||
503 | can be written at a time. For nested keyed files, the sub key pairs | ||
504 | may be specified in any order and not all pairs have to be specified. | ||
505 | |||
506 | |||
507 | 4-2. Conventions | ||
508 | |||
509 | - Settings for a single feature should be contained in a single file. | ||
510 | |||
511 | - The root cgroup should be exempt from resource control and thus | ||
512 | shouldn't have resource control interface files. Also, | ||
513 | informational files on the root cgroup which end up showing global | ||
514 | information available elsewhere shouldn't exist. | ||
515 | |||
516 | - If a controller implements weight based resource distribution, its | ||
517 | interface file should be named "weight" and have the range [1, | ||
518 | 10000] with 100 as the default. The values are chosen to allow | ||
519 | enough and symmetric bias in both directions while keeping it | ||
520 | intuitive (the default is 100%). | ||
521 | |||
522 | - If a controller implements an absolute resource guarantee and/or | ||
523 | limit, the interface files should be named "min" and "max" | ||
524 | respectively. If a controller implements best effort resource | ||
525 | guarantee and/or limit, the interface files should be named "low" | ||
526 | and "high" respectively. | ||
527 | |||
528 | In the above four control files, the special token "max" should be | ||
529 | used to represent upward infinity for both reading and writing. | ||
530 | |||
531 | - If a setting has a configurable default value and keyed specific | ||
532 | overrides, the default entry should be keyed with "default" and | ||
533 | appear as the first entry in the file. | ||
534 | |||
535 | The default value can be updated by writing either "default $VAL" or | ||
536 | "$VAL". | ||
537 | |||
538 | When writing to update a specific override, "default" can be used as | ||
539 | the value to indicate removal of the override. Override entries | ||
540 | with "default" as the value must not appear when read. | ||
541 | |||
542 | For example, a setting which is keyed by major:minor device numbers | ||
543 | with integer values may look like the following. | ||
544 | |||
545 | # cat cgroup-example-interface-file | ||
546 | default 150 | ||
547 | 8:0 300 | ||
548 | |||
549 | The default value can be updated by | ||
550 | |||
551 | # echo 125 > cgroup-example-interface-file | ||
552 | |||
553 | or | ||
554 | |||
555 | # echo "default 125" > cgroup-example-interface-file | ||
556 | |||
557 | An override can be set by | ||
558 | |||
559 | # echo "8:16 170" > cgroup-example-interface-file | ||
560 | |||
561 | and cleared by | ||
562 | |||
563 | # echo "8:0 default" > cgroup-example-interface-file | ||
564 | # cat cgroup-example-interface-file | ||
565 | default 125 | ||
566 | 8:16 170 | ||
567 | |||
568 | - For events which are not very high frequency, an interface file | ||
569 | "events" should be created which lists event key value pairs. | ||
570 | Whenever a notifiable event happens, file modified event should be | ||
571 | generated on the file. | ||
572 | |||
573 | |||
574 | 4-3. Core Interface Files | ||
575 | |||
576 | All cgroup core files are prefixed with "cgroup." | ||
577 | |||
578 | cgroup.procs | ||
579 | |||
580 | A read-write new-line separated values file which exists on | ||
581 | all cgroups. | ||
582 | |||
583 | When read, it lists the PIDs of all processes which belong to | ||
584 | the cgroup one-per-line. The PIDs are not ordered and the | ||
585 | same PID may show up more than once if the process got moved | ||
586 | to another cgroup and then back or the PID got recycled while | ||
587 | reading. | ||
588 | |||
589 | A PID can be written to migrate the process associated with | ||
590 | the PID to the cgroup. The writer should match all of the | ||
591 | following conditions. | ||
592 | |||
593 | - Its euid is either root or must match either uid or suid of | ||
594 | the target process. | ||
595 | |||
596 | - It must have write access to the "cgroup.procs" file. | ||
597 | |||
598 | - It must have write access to the "cgroup.procs" file of the | ||
599 | common ancestor of the source and destination cgroups. | ||
600 | |||
601 | When delegating a sub-hierarchy, write access to this file | ||
602 | should be granted along with the containing directory. | ||
603 | |||
604 | cgroup.controllers | ||
605 | |||
606 | A read-only space separated values file which exists on all | ||
607 | cgroups. | ||
608 | |||
609 | It shows space separated list of all controllers available to | ||
610 | the cgroup. The controllers are not ordered. | ||
611 | |||
612 | cgroup.subtree_control | ||
613 | |||
614 | A read-write space separated values file which exists on all | ||
615 | cgroups. Starts out empty. | ||
616 | |||
617 | When read, it shows space separated list of the controllers | ||
618 | which are enabled to control resource distribution from the | ||
619 | cgroup to its children. | ||
620 | |||
621 | Space separated list of controllers prefixed with '+' or '-' | ||
622 | can be written to enable or disable controllers. A controller | ||
623 | name prefixed with '+' enables the controller and '-' | ||
624 | disables. If a controller appears more than once on the list, | ||
625 | the last one is effective. When multiple enable and disable | ||
626 | operations are specified, either all succeed or all fail. | ||
627 | |||
628 | cgroup.events | ||
629 | |||
630 | A read-only flat-keyed file which exists on non-root cgroups. | ||
631 | The following entries are defined. Unless specified | ||
632 | otherwise, a value change in this file generates a file | ||
633 | modified event. | ||
634 | |||
635 | populated | ||
636 | |||
637 | 1 if the cgroup or its descendants contain any live | ||
638 | processes; otherwise, 0. | ||
639 | |||
640 | |||
641 | 5. Controllers | ||
642 | |||
643 | 5-1. CPU | ||
644 | |||
645 | [NOTE: The interface for the cpu controller hasn't been merged yet] | ||
646 | |||
647 | The "cpu" controller regulates distribution of CPU cycles. This | ||
648 | controller implements weight and absolute bandwidth limit models for | ||
649 | normal scheduling policy and absolute bandwidth allocation model for | ||
650 | realtime scheduling policy. | ||
651 | |||
652 | |||
653 | 5-1-1. CPU Interface Files | ||
654 | |||
655 | All time durations are in microseconds. | ||
656 | |||
657 | cpu.stat | ||
658 | |||
659 | A read-only flat-keyed file which exists on non-root cgroups. | ||
660 | |||
661 | It reports the following six stats. | ||
662 | |||
663 | usage_usec | ||
664 | user_usec | ||
665 | system_usec | ||
666 | nr_periods | ||
667 | nr_throttled | ||
668 | throttled_usec | ||
669 | |||
670 | cpu.weight | ||
671 | |||
672 | A read-write single value file which exists on non-root | ||
673 | cgroups. The default is "100". | ||
674 | |||
675 | The weight in the range [1, 10000]. | ||
676 | |||
677 | cpu.max | ||
678 | |||
679 | A read-write two value file which exists on non-root cgroups. | ||
680 | The default is "max 100000". | ||
681 | |||
682 | The maximum bandwidth limit. It's in the following format. | ||
683 | |||
684 | $MAX $PERIOD | ||
685 | |||
686 | which indicates that the group may consume up to $MAX in each | ||
687 | $PERIOD duration. "max" for $MAX indicates no limit. If only | ||
688 | one number is written, $MAX is updated. | ||
689 | |||
690 | cpu.rt.max | ||
691 | |||
692 | [NOTE: The semantics of this file are still under discussion and the | ||
693 | interface hasn't been merged yet] | ||
694 | |||
695 | A read-write two value file which exists on all cgroups. | ||
696 | The default is "0 100000". | ||
697 | |||
698 | The maximum realtime runtime allocation. Over-committing | ||
699 | configurations are disallowed and process migrations are | ||
700 | rejected if not enough bandwidth is available. It's in the | ||
701 | following format. | ||
702 | |||
703 | $MAX $PERIOD | ||
704 | |||
705 | which indicates that the group may consume up to $MAX in each | ||
706 | $PERIOD duration. If only one number is written, $MAX is | ||
707 | updated. | ||
708 | |||
709 | |||
710 | 5-2. Memory | ||
711 | |||
712 | The "memory" controller regulates distribution of memory. Memory is | ||
713 | stateful and implements both limit and protection models. Due to the | ||
714 | intertwining between memory usage and reclaim pressure and the | ||
715 | stateful nature of memory, the distribution model is relatively | ||
716 | complex. | ||
717 | |||
718 | While not completely water-tight, all major memory usages by a given | ||
719 | cgroup are tracked so that the total memory consumption can be | ||
720 | accounted and controlled to a reasonable extent. Currently, the | ||
721 | following types of memory usages are tracked. | ||
722 | |||
723 | - Userland memory - page cache and anonymous memory. | ||
724 | |||
725 | - Kernel data structures such as dentries and inodes. | ||
726 | |||
727 | - TCP socket buffers. | ||
728 | |||
729 | The above list may expand in the future for better coverage. | ||
730 | |||
731 | |||
732 | 5-2-1. Memory Interface Files | ||
733 | |||
734 | All memory amounts are in bytes. If a value which is not aligned to | ||
735 | PAGE_SIZE is written, the value may be rounded up to the closest | ||
736 | PAGE_SIZE multiple when read back. | ||
737 | |||
738 | memory.current | ||
739 | |||
740 | A read-only single value file which exists on non-root | ||
741 | cgroups. | ||
742 | |||
743 | The total amount of memory currently being used by the cgroup | ||
744 | and its descendants. | ||
745 | |||
746 | memory.low | ||
747 | |||
748 | A read-write single value file which exists on non-root | ||
749 | cgroups. The default is "0". | ||
750 | |||
751 | Best-effort memory protection. If the memory usages of a | ||
752 | cgroup and all its ancestors are below their low boundaries, | ||
753 | the cgroup's memory won't be reclaimed unless memory can be | ||
754 | reclaimed from unprotected cgroups. | ||
755 | |||
756 | Putting more memory than generally available under this | ||
757 | protection is discouraged. | ||
758 | |||
759 | memory.high | ||
760 | |||
761 | A read-write single value file which exists on non-root | ||
762 | cgroups. The default is "max". | ||
763 | |||
764 | Memory usage throttle limit. This is the main mechanism to | ||
765 | control memory usage of a cgroup. If a cgroup's usage goes | ||
766 | over the high boundary, the processes of the cgroup are | ||
767 | throttled and put under heavy reclaim pressure. | ||
768 | |||
769 | Going over the high limit never invokes the OOM killer and | ||
770 | under extreme conditions the limit may be breached. | ||
771 | |||
772 | memory.max | ||
773 | |||
774 | A read-write single value file which exists on non-root | ||
775 | cgroups. The default is "max". | ||
776 | |||
777 | Memory usage hard limit. This is the final protection | ||
778 | mechanism. If a cgroup's memory usage reaches this limit and | ||
779 | can't be reduced, the OOM killer is invoked in the cgroup. | ||
780 | Under certain circumstances, the usage may go over the limit | ||
781 | temporarily. | ||
782 | |||
783 | This is the ultimate protection mechanism. As long as the | ||
784 | high limit is used and monitored properly, this limit's | ||
785 | utility is limited to providing the final safety net. | ||
786 | |||
787 | memory.events | ||
788 | |||
789 | A read-only flat-keyed file which exists on non-root cgroups. | ||
790 | The following entries are defined. Unless specified | ||
791 | otherwise, a value change in this file generates a file | ||
792 | modified event. | ||
793 | |||
794 | low | ||
795 | |||
796 | The number of times the cgroup is reclaimed due to | ||
797 | high memory pressure even though its usage is under | ||
798 | the low boundary. This usually indicates that the low | ||
799 | boundary is over-committed. | ||
800 | |||
801 | high | ||
802 | |||
803 | The number of times processes of the cgroup are | ||
804 | throttled and routed to perform direct memory reclaim | ||
805 | because the high memory boundary was exceeded. For a | ||
806 | cgroup whose memory usage is capped by the high limit | ||
807 | rather than global memory pressure, this event's | ||
808 | occurrences are expected. | ||
809 | |||
810 | max | ||
811 | |||
812 | The number of times the cgroup's memory usage was | ||
813 | about to go over the max boundary. If direct reclaim | ||
814 | fails to bring it down, the OOM killer is invoked. | ||
815 | |||
816 | oom | ||
817 | |||
818 | The number of times the OOM killer has been invoked in | ||
819 | the cgroup. This may not exactly match the number of | ||
820 | processes killed but should generally be close. | ||
821 | |||
822 | |||
823 | 5-2-2. General Usage | ||
824 | |||
825 | "memory.high" is the main mechanism to control memory usage. | ||
826 | Over-committing on high limit (sum of high limits > available memory) | ||
827 | and letting global memory pressure distribute memory according to | ||
828 | usage is a viable strategy. | ||
829 | |||
830 | Because breach of the high limit doesn't trigger the OOM killer but | ||
831 | throttles the offending cgroup, a management agent has ample | ||
832 | opportunities to monitor and take appropriate actions such as granting | ||
833 | more memory or terminating the workload. | ||
834 | |||
835 | Determining whether a cgroup has enough memory is not trivial as | ||
836 | memory usage doesn't indicate whether the workload can benefit from | ||
837 | more memory. For example, a workload which writes data received from | ||
838 | network to a file can use all available memory but can also perform | ||
839 | just as well with a small amount of memory. A measure of memory | ||
840 | pressure - how much the workload is being impacted due to lack of | ||
841 | memory - is necessary to determine whether a workload needs more | ||
842 | memory; unfortunately, memory pressure monitoring mechanism isn't | ||
843 | implemented yet. | ||
844 | |||
845 | |||
846 | 5-2-3. Memory Ownership | ||
847 | |||
848 | A memory area is charged to the cgroup which instantiated it and stays | ||
849 | charged to the cgroup until the area is released. Migrating a process | ||
850 | to a different cgroup doesn't move the memory usages that it | ||
851 | instantiated while in the previous cgroup to the new cgroup. | ||
852 | |||
853 | A memory area may be used by processes belonging to different cgroups. | ||
854 | To which cgroup the area will be charged is non-deterministic; however, | ||
855 | over time, the memory area is likely to end up in a cgroup which has | ||
856 | enough memory allowance to avoid high reclaim pressure. | ||
857 | |||
858 | If a cgroup sweeps a considerable amount of memory which is expected | ||
859 | to be accessed repeatedly by other cgroups, it may make sense to use | ||
860 | POSIX_FADV_DONTNEED to relinquish the ownership of memory areas | ||
861 | belonging to the affected files to ensure correct memory ownership. | ||
862 | |||
863 | |||
864 | 5-3. IO | ||
865 | |||
866 | The "io" controller regulates the distribution of IO resources. This | ||
867 | controller implements both weight based and absolute bandwidth or IOPS | ||
868 | limit distribution; however, weight based distribution is available | ||
869 | only if cfq-iosched is in use and neither scheme is available for | ||
870 | blk-mq devices. | ||
871 | |||
872 | |||
873 | 5-3-1. IO Interface Files | ||
874 | |||
875 | io.stat | ||
876 | |||
877 | A read-only nested-keyed file which exists on non-root | ||
878 | cgroups. | ||
879 | |||
880 | Lines are keyed by $MAJ:$MIN device numbers and not ordered. | ||
881 | The following nested keys are defined. | ||
882 | |||
883 | rbytes Bytes read | ||
884 | wbytes Bytes written | ||
885 | rios Number of read IOs | ||
886 | wios Number of write IOs | ||
887 | |||
888 | An example read output follows. | ||
889 | |||
890 | 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 | ||
891 | 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 | ||
892 | |||
893 | io.weight | ||
894 | |||
895 | A read-write flat-keyed file which exists on non-root cgroups. | ||
896 | The default is "default 100". | ||
897 | |||
898 | The first line is the default weight applied to devices | ||
899 | without specific override. The rest are overrides keyed by | ||
900 | $MAJ:$MIN device numbers and not ordered. The weights are in | ||
901 | the range [1, 10000] and specify the relative amount of IO time | ||
902 | the cgroup can use in relation to its siblings. | ||
903 | |||
904 | The default weight can be updated by writing either "default | ||
905 | $WEIGHT" or simply "$WEIGHT". Overrides can be set by writing | ||
906 | "$MAJ:$MIN $WEIGHT" and unset by writing "$MAJ:$MIN default". | ||
907 | |||
908 | An example read output follows. | ||
909 | |||
910 | default 100 | ||
911 | 8:16 200 | ||
912 | 8:0 50 | ||
913 | |||
914 | io.max | ||
915 | |||
916 | A read-write nested-keyed file which exists on non-root | ||
917 | cgroups. | ||
918 | |||
919 | BPS and IOPS based IO limit. Lines are keyed by $MAJ:$MIN | ||
920 | device numbers and not ordered. The following nested keys are | ||
921 | defined. | ||
922 | |||
923 | rbps Max read bytes per second | ||
924 | wbps Max write bytes per second | ||
925 | riops Max read IO operations per second | ||
926 | wiops Max write IO operations per second | ||
927 | |||
928 | When writing, any number of nested key-value pairs can be | ||
929 | specified in any order. "max" can be specified as the value | ||
930 | to remove a specific limit. If the same key is specified | ||
931 | multiple times, the outcome is undefined. | ||
932 | |||
933 | BPS and IOPS are measured in each IO direction and IOs are | ||
934 | delayed if limit is reached. Temporary bursts are allowed. | ||
935 | |||
936 | Setting read limit at 2M BPS and write at 120 IOPS for 8:16. | ||
937 | |||
938 | echo "8:16 rbps=2097152 wiops=120" > io.max | ||
939 | |||
940 | Reading returns the following. | ||
941 | |||
942 | 8:16 rbps=2097152 wbps=max riops=max wiops=120 | ||
943 | |||
944 | Write IOPS limit can be removed by writing the following. | ||
945 | |||
946 | echo "8:16 wiops=max" > io.max | ||
947 | |||
948 | Reading now returns the following. | ||
949 | |||
950 | 8:16 rbps=2097152 wbps=max riops=max wiops=max | ||
951 | |||
952 | |||
953 | 5-3-2. Writeback | ||
954 | |||
955 | Page cache is dirtied through buffered writes and shared mmaps and | ||
956 | written asynchronously to the backing filesystem by the writeback | ||
957 | mechanism. Writeback sits between the memory and IO domains and | ||
958 | regulates the proportion of dirty memory by balancing dirtying and | ||
959 | write IOs. | ||
960 | |||
961 | The io controller, in conjunction with the memory controller, | ||
962 | implements control of page cache writeback IOs. The memory controller | ||
963 | defines the memory domain that dirty memory ratio is calculated and | ||
964 | maintained for and the io controller defines the io domain which | ||
965 | writes out dirty pages for the memory domain. Both system-wide and | ||
966 | per-cgroup dirty memory states are examined and the more restrictive | ||
967 | of the two is enforced. | ||
968 | |||
969 | cgroup writeback requires explicit support from the underlying | ||
970 | filesystem. Currently, cgroup writeback is implemented on ext2, ext4 | ||
971 | and btrfs. On other filesystems, all writeback IOs are attributed to | ||
972 | the root cgroup. | ||
973 | |||
974 | There are inherent differences in memory and writeback management | ||
975 | which affect how cgroup ownership is tracked. Memory is tracked per | ||
976 | page while writeback per inode. For the purpose of writeback, an | ||
977 | inode is assigned to a cgroup and all IO requests to write dirty pages | ||
978 | from the inode are attributed to that cgroup. | ||
979 | |||
980 | As cgroup ownership for memory is tracked per page, there can be pages | ||
981 | which are associated with different cgroups than the one the inode is | ||
982 | associated with. These are called foreign pages. The writeback | ||
983 | constantly keeps track of foreign pages and, if a particular foreign | ||
984 | cgroup becomes the majority over a certain period of time, switches | ||
985 | the ownership of the inode to that cgroup. | ||
986 | |||
987 | While this model is enough for most use cases where a given inode is | ||
988 | mostly dirtied by a single cgroup even when the main writing cgroup | ||
989 | changes over time, use cases where multiple cgroups write to a single | ||
990 | inode simultaneously are not supported well. In such circumstances, a | ||
991 | significant portion of IOs are likely to be attributed incorrectly. | ||
992 | As memory controller assigns page ownership on the first use and | ||
993 | doesn't update it until the page is released, even if writeback | ||
994 | strictly follows page ownership, multiple cgroups dirtying overlapping | ||
995 | areas wouldn't work as expected. It's recommended to avoid such usage | ||
996 | patterns. | ||
997 | |||
998 | The sysctl knobs which affect writeback behavior are applied to cgroup | ||
999 | writeback as follows. | ||
1000 | |||
1001 | vm.dirty_background_ratio | ||
1002 | vm.dirty_ratio | ||
1003 | |||
1004 | These ratios apply the same to cgroup writeback with the | ||
1005 | amount of available memory capped by limits imposed by the | ||
1006 | memory controller and system-wide clean memory. | ||
1007 | |||
1008 | vm.dirty_background_bytes | ||
1009 | vm.dirty_bytes | ||
1010 | |||
1011 | For cgroup writeback, this is calculated into ratio against | ||
1012 | total available memory and applied the same way as | ||
1013 | vm.dirty[_background]_ratio. | ||
1014 | |||
1015 | |||
1016 | P. Information on Kernel Programming | ||
1017 | |||
1018 | This section contains kernel programming information in the areas | ||
1019 | where interacting with cgroup is necessary. cgroup core and | ||
1020 | controllers are not covered. | ||
1021 | |||
1022 | |||
1023 | P-1. Filesystem Support for Writeback | ||
1024 | |||
1025 | A filesystem can support cgroup writeback by updating | ||
1026 | address_space_operations->writepage[s]() to annotate bio's using the | ||
1027 | following two functions. | ||
1028 | |||
1029 | wbc_init_bio(@wbc, @bio) | ||
1030 | |||
1031 | Should be called for each bio carrying writeback data and | ||
1032 | associates the bio with the inode's owner cgroup. Can be | ||
1033 | called anytime between bio allocation and submission. | ||
1034 | |||
1035 | wbc_account_io(@wbc, @page, @bytes) | ||
1036 | |||
1037 | Should be called for each data segment being written out. | ||
1038 | While this function doesn't care exactly when it's called | ||
1039 | during the writeback session, it's the easiest and most | ||
1040 | natural to call it as data segments are added to a bio. | ||
1041 | |||
1042 | With writeback bio's annotated, cgroup support can be enabled per | ||
1043 | super_block by setting SB_I_CGROUPWB in ->s_iflags. This allows for | ||
1044 | selective disabling of cgroup writeback support which is helpful when | ||
1045 | certain filesystem features, e.g. journaled data mode, are | ||
1046 | incompatible. | ||
1047 | |||
1048 | wbc_init_bio() binds the specified bio to its cgroup. Depending on | ||
1049 | the configuration, the bio may be executed at a lower priority and if | ||
1050 | the writeback session is holding shared resources, e.g. a journal | ||
1051 | entry, may lead to priority inversion. There is no one easy solution | ||
1052 | for the problem. Filesystems can try to work around specific problem | ||
1053 | cases by skipping wbc_init_bio() or using bio_associate_blkcg() | ||
1054 | directly. | ||
1055 | |||
1056 | |||
1057 | D. Deprecated v1 Core Features | ||
1058 | |||
1059 | - Multiple hierarchies including named ones are not supported. | ||
1060 | |||
1061 | - All mount options and remounting are not supported. | ||
1062 | |||
1063 | - The "tasks" file is removed and "cgroup.procs" is not sorted. | ||
1064 | |||
1065 | - "cgroup.clone_children" is removed. | ||
1066 | |||
1067 | - /proc/cgroups is meaningless for v2. Use "cgroup.controllers" file | ||
1068 | at the root instead. | ||
1069 | |||
1070 | |||
1071 | R. Issues with v1 and Rationales for v2 | ||
1072 | |||
1073 | R-1. Multiple Hierarchies | ||
1074 | |||
1075 | cgroup v1 allowed an arbitrary number of hierarchies and each | ||
1076 | hierarchy could host any number of controllers. While this seemed to | ||
1077 | provide a high level of flexibility, it wasn't useful in practice. | ||
1078 | |||
1079 | For example, as there is only one instance of each controller, utility | ||
1080 | type controllers such as freezer which can be useful in all | ||
1081 | hierarchies could only be used in one. The issue is exacerbated by | ||
1082 | the fact that controllers couldn't be moved to another hierarchy once | ||
1083 | hierarchies were populated. Another issue was that all controllers | ||
1084 | bound to a hierarchy were forced to have exactly the same view of the | ||
1085 | hierarchy. It wasn't possible to vary the granularity depending on | ||
1086 | the specific controller. | ||
1087 | |||
1088 | In practice, these issues heavily limited which controllers could be | ||
1089 | put on the same hierarchy and most configurations resorted to putting | ||
1090 | each controller on its own hierarchy. Only closely related ones, such | ||
1091 | as the cpu and cpuacct controllers, made sense to be put on the same | ||
1092 | hierarchy. This often meant that userland ended up managing multiple | ||
1093 | similar hierarchies repeating the same steps on each hierarchy | ||
1094 | whenever a hierarchy management operation was necessary. | ||
1095 | |||
1096 | Furthermore, support for multiple hierarchies came at a steep cost. | ||
1097 | It greatly complicated cgroup core implementation but more importantly | ||
1098 | the support for multiple hierarchies restricted how cgroup could be | ||
1099 | used in general and what controllers were able to do. | ||
1100 | |||
1101 | There was no limit on how many hierarchies there might be, which meant | ||
1102 | that a thread's cgroup membership couldn't be described in finite | ||
1103 | length. The key might contain any number of entries and was unlimited | ||
1104 | in length, which made it highly awkward to manipulate and led to | ||
1105 | addition of controllers which existed only to identify membership, | ||
1106 | which in turn exacerbated the original problem of proliferating number | ||
1107 | of hierarchies. | ||
1108 | |||
1109 | Also, as a controller couldn't have any expectation regarding the | ||
1110 | topologies of hierarchies other controllers might be on, each | ||
1111 | controller had to assume that all other controllers were attached to | ||
1112 | completely orthogonal hierarchies. This made it impossible, or at | ||
1113 | least very cumbersome, for controllers to cooperate with each other. | ||
1114 | |||
1115 | In most use cases, putting controllers on hierarchies which are | ||
1116 | completely orthogonal to each other isn't necessary. What usually is | ||
1117 | called for is the ability to have differing levels of granularity | ||
1118 | depending on the specific controller. In other words, hierarchy may | ||
1119 | be collapsed from leaf towards root when viewed from specific | ||
1120 | controllers. For example, a given configuration might not care about | ||
1121 | how memory is distributed beyond a certain level while still wanting | ||
1122 | to control how CPU cycles are distributed. | ||
1123 | |||
1124 | |||
1125 | R-2. Thread Granularity | ||
1126 | |||
1127 | cgroup v1 allowed threads of a process to belong to different cgroups. | ||
1128 | This didn't make sense for some controllers and those controllers | ||
1129 | ended up implementing different ways to ignore such situations but | ||
1130 | much more importantly it blurred the line between API exposed to | ||
1131 | individual applications and system management interface. | ||
1132 | |||
1133 | Generally, in-process knowledge is available only to the process | ||
1134 | itself; thus, unlike service-level organization of processes, | ||
1135 | categorizing threads of a process requires active participation from | ||
1136 | the application which owns the target process. | ||
1137 | |||
1138 | cgroup v1 had an ambiguously defined delegation model which got abused | ||
1139 | in combination with thread granularity. cgroups were delegated to | ||
1140 | individual applications so that they can create and manage their own | ||
1141 | sub-hierarchies and control resource distributions along them. This | ||
1142 | effectively raised cgroup to the status of a syscall-like API exposed | ||
1143 | to lay programs. | ||
1144 | |||
1145 | First of all, cgroup has a fundamentally inadequate interface to be | ||
1146 | exposed this way. For a process to access its own knobs, it has to | ||
1147 | extract the path on the target hierarchy from /proc/self/cgroup, | ||
1148 | construct the path by appending the name of the knob to the path, open | ||
1149 | and then read and/or write to it. This is not only extremely clunky | ||
1150 | and unusual but also inherently racy. There is no conventional way to | ||
1151 | define transaction across the required steps and nothing can guarantee | ||
1152 | that the process would actually be operating on its own sub-hierarchy. | ||
1153 | |||
1154 | cgroup controllers implemented a number of knobs which would never be | ||
1155 | accepted as public APIs because they were just adding control knobs to | ||
1156 | system-management pseudo filesystem. cgroup ended up with interface | ||
1157 | knobs which were not properly abstracted or refined and directly | ||
1158 | revealed kernel internal details. These knobs got exposed to | ||
1159 | individual applications through the ill-defined delegation mechanism | ||
1160 | effectively abusing cgroup as a shortcut to implementing public APIs | ||
1161 | without going through the required scrutiny. | ||
1162 | |||
1163 | This was painful for both userland and kernel. Userland ended up with | ||
1164 | misbehaving and poorly abstracted interfaces and the kernel ended up | ||
1165 | exposing and being locked into constructs inadvertently. | ||
1166 | |||
1167 | |||
1168 | R-3. Competition Between Inner Nodes and Threads | ||
1169 | |||
1170 | cgroup v1 allowed threads to be in any cgroups which created an | ||
1171 | interesting problem where threads belonging to a parent cgroup and its | ||
1172 | children cgroups competed for resources. This was nasty as two | ||
1173 | different types of entities competed and there was no obvious way to | ||
1174 | settle it. Different controllers did different things. | ||
1175 | |||
1176 | The cpu controller considered threads and cgroups as equivalents and | ||
1177 | mapped nice levels to cgroup weights. This worked for some cases but | ||
1178 | fell flat when children wanted to be allocated specific ratios of CPU | ||
1179 | cycles and the number of internal threads fluctuated - the ratios | ||
1180 | constantly changed as the number of competing entities fluctuated. | ||
1181 | There also were other issues. The mapping from nice level to weight | ||
1182 | wasn't obvious or universal, and there were various other knobs which | ||
1183 | simply weren't available for threads. | ||
1184 | |||
1185 | The io controller implicitly created a hidden leaf node for each | ||
1186 | cgroup to host the threads. The hidden leaf had its own copies of all | ||
1187 | the knobs with "leaf_" prefixed. While this allowed equivalent | ||
1188 | control over internal threads, it was with serious drawbacks. It | ||
1189 | always added an extra layer of nesting which wouldn't be necessary | ||
1190 | otherwise, made the interface messy and significantly complicated the | ||
1191 | implementation. | ||
1192 | |||
1193 | The memory controller didn't have a way to control what happened | ||
1194 | between internal tasks and child cgroups and the behavior was not | ||
1195 | clearly defined. There were attempts to add ad-hoc behaviors and | ||
1196 | knobs to tailor the behavior to specific workloads which would have | ||
1197 | led to problems extremely difficult to resolve in the long term. | ||
1198 | |||
1199 | Multiple controllers struggled with internal tasks and came up with | ||
1200 | different ways to deal with it; unfortunately, all the approaches were | ||
1201 | severely flawed and, furthermore, the widely different behaviors | ||
1202 | made cgroup as a whole highly inconsistent. | ||
1203 | |||
1204 | This clearly is a problem which needs to be addressed from cgroup core | ||
1205 | in a uniform way. | ||
1206 | |||
1207 | |||
1208 | R-4. Other Interface Issues | ||
1209 | |||
1210 | cgroup v1 grew without oversight and developed a large number of | ||
1211 | idiosyncrasies and inconsistencies. One issue on the cgroup core side | ||
1212 | was how an empty cgroup was notified - a userland helper binary was | ||
1213 | forked and executed for each event. The event delivery wasn't | ||
1214 | recursive or delegatable. The limitations of the mechanism also led | ||
1215 | to an in-kernel event delivery filtering mechanism, further complicating | ||
1216 | the interface. | ||
1217 | |||
1218 | Controller interfaces were problematic too. An extreme example is | ||
1219 | controllers completely ignoring hierarchical organization and treating | ||
1220 | all cgroups as if they were all located directly under the root | ||
1221 | cgroup. Some controllers exposed a large amount of inconsistent | ||
1222 | implementation details to userland. | ||
1223 | |||
1224 | There also was no consistency across controllers. When a new cgroup | ||
1225 | was created, some controllers defaulted to not imposing extra | ||
1226 | restrictions while others disallowed any resource usage until | ||
1227 | explicitly configured. Configuration knobs for the same type of | ||
1228 | control used widely differing naming schemes and formats. Statistics | ||
1229 | and information knobs were named arbitrarily and used different | ||
1230 | formats and units even in the same controller. | ||
1231 | |||
1232 | cgroup v2 establishes common conventions where appropriate and updates | ||
1233 | controllers so that they expose minimal and consistent interfaces. | ||
1234 | |||
1235 | |||
1236 | R-5. Controller Issues and Remedies | ||
1237 | |||
1238 | R-5-1. Memory | ||
1239 | |||
1240 | The original lower boundary, the soft limit, is defined as a limit | ||
1241 | that is per default unset. As a result, the set of cgroups that | ||
1242 | global reclaim prefers is opt-in, rather than opt-out. The costs for | ||
1243 | optimizing these mostly negative lookups are so high that the | ||
1244 | implementation, despite its enormous size, does not even provide the | ||
1245 | basic desirable behavior. First off, the soft limit has no | ||
1246 | hierarchical meaning. All configured groups are organized in a global | ||
1247 | rbtree and treated like equal peers, regardless of where they are located | ||
1248 | in the hierarchy. This makes subtree delegation impossible. Second, | ||
1249 | the soft limit reclaim pass is so aggressive that it not just | ||
1250 | introduces high allocation latencies into the system, but also impacts | ||
1251 | system performance due to overreclaim, to the point where the feature | ||
1252 | becomes self-defeating. | ||
1253 | |||
1254 | The memory.low boundary on the other hand is a top-down allocated | ||
1255 | reserve. A cgroup enjoys reclaim protection when it and all its | ||
1256 | ancestors are below their low boundaries, which makes delegation of | ||
1257 | subtrees possible. Secondly, new cgroups have no reserve per default | ||
1258 | and in the common case most cgroups are eligible for the preferred | ||
1259 | reclaim pass. This allows the new low boundary to be efficiently | ||
1260 | implemented with just a minor addition to the generic reclaim code, | ||
1261 | without the need for out-of-band data structures and reclaim passes. | ||
1262 | Because the generic reclaim code considers all cgroups except for the | ||
1263 | ones running low in the preferred first reclaim pass, overreclaim of | ||
1264 | individual groups is eliminated as well, resulting in much better | ||
1265 | overall workload performance. | ||
1266 | |||
1267 | The original high boundary, the hard limit, is defined as a strict | ||
1268 | limit that can not budge, even if the OOM killer has to be called. | ||
1269 | But this generally goes against the goal of making the most out of the | ||
1270 | available memory. The memory consumption of workloads varies during | ||
1271 | runtime, and that requires users to overcommit. But doing that with a | ||
1272 | strict upper limit requires either a fairly accurate prediction of the | ||
1273 | working set size or adding slack to the limit. Since working set size | ||
1274 | estimation is hard and error prone, and getting it wrong results in | ||
1275 | OOM kills, most users tend to err on the side of a looser limit and | ||
1276 | end up wasting precious resources. | ||
1277 | |||
1278 | The memory.high boundary on the other hand can be set much more | ||
1279 | conservatively. When hit, it throttles allocations by forcing them | ||
1280 | into direct reclaim to work off the excess, but it never invokes the | ||
1281 | OOM killer. As a result, a high boundary that is chosen too | ||
1282 | aggressively will not terminate the processes, but instead it will | ||
1283 | lead to gradual performance degradation. The user can monitor this | ||
1284 | and make corrections until the minimal memory footprint that still | ||
1285 | gives acceptable performance is found. | ||
1286 | |||
1287 | In extreme cases, with many concurrent allocations and a complete | ||
1288 | breakdown of reclaim progress within the group, the high boundary can | ||
1289 | be exceeded. But even then it's mostly better to satisfy the | ||
1290 | allocation from the slack available in other groups or the rest of the | ||
1291 | system than killing the group. Otherwise, memory.max is there to | ||
1292 | limit this type of spillover and ultimately contain buggy or even | ||
1293 | malicious applications. | ||
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt deleted file mode 100644 index 781b1d475bcf..000000000000 --- a/Documentation/cgroups/unified-hierarchy.txt +++ /dev/null | |||
@@ -1,647 +0,0 @@ | |||
1 | |||
2 | Cgroup unified hierarchy | ||
3 | |||
4 | April, 2014 Tejun Heo <tj@kernel.org> | ||
5 | |||
6 | This document describes the changes made by unified hierarchy and | ||
7 | their rationales. It will eventually be merged into the main cgroup | ||
8 | documentation. | ||
9 | |||
10 | CONTENTS | ||
11 | |||
12 | 1. Background | ||
13 | 2. Basic Operation | ||
14 | 2-1. Mounting | ||
15 | 2-2. cgroup.subtree_control | ||
16 | 2-3. cgroup.controllers | ||
17 | 3. Structural Constraints | ||
18 | 3-1. Top-down | ||
19 | 3-2. No internal tasks | ||
20 | 4. Delegation | ||
21 | 4-1. Model of delegation | ||
22 | 4-2. Common ancestor rule | ||
23 | 5. Other Changes | ||
24 | 5-1. [Un]populated Notification | ||
25 | 5-2. Other Core Changes | ||
26 | 5-3. Controller File Conventions | ||
27 | 5-3-1. Format | ||
28 | 5-3-2. Control Knobs | ||
29 | 5-4. Per-Controller Changes | ||
30 | 5-4-1. io | ||
31 | 5-4-2. cpuset | ||
32 | 5-4-3. memory | ||
33 | 6. Planned Changes | ||
34 | 6-1. CAP for resource control | ||
35 | |||
36 | |||
37 | 1. Background | ||
38 | |||
39 | cgroup allows an arbitrary number of hierarchies and each hierarchy | ||
40 | can host any number of controllers. While this seems to provide a | ||
41 | high level of flexibility, it isn't quite useful in practice. | ||
42 | |||
43 | For example, as there is only one instance of each controller, utility | ||
44 | type controllers such as freezer which can be useful in all | ||
45 | hierarchies can only be used in one. The issue is exacerbated by the | ||
46 | fact that controllers can't be moved around once hierarchies are | ||
47 | populated. Another issue is that all controllers bound to a hierarchy | ||
48 | are forced to have exactly the same view of the hierarchy. It isn't | ||
49 | possible to vary the granularity depending on the specific controller. | ||
50 | |||
51 | In practice, these issues heavily limit which controllers can be put | ||
52 | on the same hierarchy and most configurations resort to putting each | ||
53 | controller on its own hierarchy. Only closely related ones, such as | ||
54 | the cpu and cpuacct controllers, make sense to put on the same | ||
55 | hierarchy. This often means that userland ends up managing multiple | ||
56 | similar hierarchies repeating the same steps on each hierarchy | ||
57 | whenever a hierarchy management operation is necessary. | ||
58 | |||
59 | Unfortunately, support for multiple hierarchies comes at a steep cost. | ||
60 | Internal implementation in cgroup core proper is dazzlingly | ||
61 | complicated but more importantly the support for multiple hierarchies | ||
62 | restricts how cgroup is used in general and what controllers can do. | ||
63 | |||
64 | There's no limit on how many hierarchies there may be, which means | ||
65 | that a task's cgroup membership can't be described in finite length. | ||
66 | The key may contain any varying number of entries and is unlimited in | ||
67 | length, which makes it highly awkward to handle and leads to addition | ||
68 | of controllers which exist only to identify membership, which in turn | ||
69 | exacerbates the original problem. | ||
70 | |||
71 | Also, as a controller can't have any expectation regarding what shape | ||
72 | of hierarchies other controllers would be on, each controller has to | ||
73 | assume that all other controllers are operating on completely | ||
74 | orthogonal hierarchies. This makes it impossible, or at least very | ||
75 | cumbersome, for controllers to cooperate with each other. | ||
76 | |||
77 | In most use cases, putting controllers on hierarchies which are | ||
78 | completely orthogonal to each other isn't necessary. What usually is | ||
79 | called for is the ability to have differing levels of granularity | ||
80 | depending on the specific controller. In other words, hierarchy may | ||
81 | be collapsed from leaf towards root when viewed from specific | ||
82 | controllers. For example, a given configuration might not care about | ||
83 | how memory is distributed beyond a certain level while still wanting | ||
84 | to control how CPU cycles are distributed. | ||
85 | |||
86 | Unified hierarchy is the next version of cgroup interface. It aims to | ||
87 | address the aforementioned issues by having more structure while | ||
88 | retaining enough flexibility for most use cases. Various other | ||
89 | general and controller-specific interface issues are also addressed in | ||
90 | the process. | ||
91 | |||
92 | |||
93 | 2. Basic Operation | ||
94 | |||
95 | 2-1. Mounting | ||
96 | |||
97 | Currently, unified hierarchy can be mounted with the following mount | ||
98 | command. Note that this is still under development and scheduled to | ||
99 | change soon. | ||
100 | |||
101 | mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT | ||
102 | |||
103 | All controllers which support the unified hierarchy and are not bound | ||
104 | to other hierarchies are automatically bound to unified hierarchy and | ||
105 | show up at the root of it. Controllers which are enabled only in the | ||
106 | root of unified hierarchy can be bound to other hierarchies. This | ||
107 | allows mixing unified hierarchy with the traditional multiple | ||
108 | hierarchies in a fully backward compatible way. | ||
109 | |||
110 | A controller can be moved across hierarchies only after the controller | ||
111 | is no longer referenced in its current hierarchy. Because per-cgroup | ||
112 | controller states are destroyed asynchronously and controllers may | ||
113 | have lingering references, a controller may not show up immediately on | ||
114 | the unified hierarchy after the final umount of the previous | ||
115 | hierarchy. Similarly, a controller should be fully disabled to be | ||
116 | moved out of the unified hierarchy and it may take some time for the | ||
117 | disabled controller to become available for other hierarchies; | ||
118 | furthermore, due to dependencies among controllers, other controllers | ||
119 | may need to be disabled too. | ||
120 | |||
121 | While useful for development and manual configurations, dynamically | ||
122 | moving controllers between the unified and other hierarchies is | ||
123 | strongly discouraged for production use. It is recommended to decide | ||
124 | the hierarchies and controller associations before starting using the | ||
125 | controllers. | ||
126 | |||
127 | |||
128 | 2-2. cgroup.subtree_control | ||
129 | |||
130 | All cgroups on unified hierarchy have a "cgroup.subtree_control" file | ||
131 | which governs which controllers are enabled on the children of the | ||
132 | cgroup. Let's assume a hierarchy like the following. | ||
133 | |||
134 | root - A - B - C | ||
135 | \ D | ||
136 | |||
137 | root's "cgroup.subtree_control" file determines which controllers are | ||
138 | enabled on A. A's on B. B's on C and D. This coincides with the | ||
139 | fact that controllers on the immediate sub-level are used to | ||
140 | distribute the resources of the parent. In fact, it's natural to | ||
141 | assume that resource control knobs of a child belong to its parent. | ||
142 | Enabling a controller in a "cgroup.subtree_control" file declares that | ||
143 | distribution of the respective resources of the cgroup will be | ||
144 | controlled. Note that this means that controller enable states are | ||
145 | shared among siblings. | ||
146 | |||
147 | When read, the file contains a space-separated list of currently | ||
148 | enabled controllers. A write to the file should contain a | ||
149 | space-separated list of controllers with '+' or '-' prefixed (without | ||
150 | the quotes). Controllers prefixed with '+' are enabled and '-' | ||
151 | disabled. If a controller is listed multiple times, the last entry | ||
152 | wins. The specific operations are executed atomically - either all | ||
153 | succeed or all fail. | ||
154 | |||
155 | |||
156 | 2-3. cgroup.controllers | ||
157 | |||
158 | Read-only "cgroup.controllers" file contains a space-separated list of | ||
159 | controllers which can be enabled in the cgroup's | ||
160 | "cgroup.subtree_control" file. | ||
161 | |||
162 | In the root cgroup, this lists controllers which are not bound to | ||
163 | other hierarchies and the content changes as controllers are bound to | ||
164 | and unbound from other hierarchies. | ||
165 | |||
166 | In non-root cgroups, the content of this file equals that of the | ||
167 | parent's "cgroup.subtree_control" file as only controllers enabled | ||
168 | from the parent can be used in its children. | ||
169 | |||
170 | |||
171 | 3. Structural Constraints | ||
172 | |||
173 | 3-1. Top-down | ||
174 | |||
175 | As it doesn't make sense to nest control of an uncontrolled resource, | ||
176 | all non-root "cgroup.subtree_control" files can only contain | ||
177 | controllers which are enabled in the parent's "cgroup.subtree_control" | ||
178 | file. A controller can be enabled only if the parent has the | ||
179 | controller enabled and a controller can't be disabled if one or more | ||
180 | children have it enabled. | ||
181 | |||
182 | |||
183 | 3-2. No internal tasks | ||
184 | |||
185 | One long-standing issue that cgroup faces is the competition between | ||
186 | tasks belonging to the parent cgroup and its children cgroups. This | ||
187 | is inherently nasty as two different types of entities compete and | ||
188 | there is no agreed-upon obvious way to handle it. Different | ||
189 | controllers are doing different things. | ||
190 | |||
191 | The cpu controller considers tasks and cgroups as equivalents and maps | ||
192 | nice levels to cgroup weights. This works for some cases but falls | ||
193 | flat when children should be allocated specific ratios of CPU cycles | ||
194 | and the number of internal tasks fluctuates - the ratios constantly | ||
195 | change as the number of competing entities fluctuates. There also are | ||
196 | other issues. The mapping from nice level to weight isn't obvious or | ||
197 | universal, and there are various other knobs which simply aren't | ||
198 | available for tasks. | ||
199 | |||
200 | The io controller implicitly creates a hidden leaf node for each | ||
201 | cgroup to host the tasks. The hidden leaf has its own copies of all | ||
202 | the knobs with "leaf_" prefixed. While this allows equivalent control | ||
203 | over internal tasks, it's with serious drawbacks. It always adds an | ||
204 | extra layer of nesting which may not be necessary, makes the interface | ||
205 | messy and significantly complicates the implementation. | ||
206 | |||
207 | The memory controller currently doesn't have a way to control what | ||
208 | happens between internal tasks and child cgroups and the behavior is | ||
209 | not clearly defined. There have been attempts to add ad-hoc behaviors | ||
210 | and knobs to tailor the behavior to specific workloads. Continuing | ||
211 | this direction will lead to problems which will be extremely difficult | ||
212 | to resolve in the long term. | ||
213 | |||
214 | Multiple controllers struggle with internal tasks and came up with | ||
215 | different ways to deal with it; unfortunately, all the approaches in | ||
216 | use now are severely flawed and, furthermore, the widely different | ||
217 | behaviors make cgroup as a whole highly inconsistent. | ||
218 | |||
219 | It is clear that this is something which needs to be addressed from | ||
220 | cgroup core proper in a uniform way so that controllers don't need to | ||
221 | worry about it and cgroup as a whole shows a consistent and logical | ||
222 | behavior. To achieve that, unified hierarchy enforces the following | ||
223 | structural constraint: | ||
224 | |||
225 | Except for the root, only cgroups which don't contain any task may | ||
226 | have controllers enabled in their "cgroup.subtree_control" files. | ||
227 | |||
228 | Combined with other properties, this guarantees that, when a | ||
229 | controller is looking at the part of the hierarchy which has it | ||
230 | enabled, tasks are always only on the leaves. This rules out | ||
231 | situations where child cgroups compete against internal tasks of the | ||
232 | parent. | ||
233 | |||
234 | There are two things to note. Firstly, the root cgroup is exempt from | ||
235 | the restriction. Root contains tasks and anonymous resource | ||
236 | consumption which can't be associated with any other cgroup and | ||
237 | requires special treatment from most controllers. How resource | ||
238 | consumption in the root cgroup is governed is up to each controller. | ||
239 | |||
240 | Secondly, the restriction doesn't take effect if there is no enabled | ||
241 | controller in the cgroup's "cgroup.subtree_control" file. This is | ||
242 | important as otherwise it wouldn't be possible to create children of a | ||
243 | populated cgroup. To control resource distribution of a cgroup, the | ||
244 | cgroup must create children and transfer all its tasks to the children | ||
245 | before enabling controllers in its "cgroup.subtree_control" file. | ||
246 | |||
247 | |||
248 | 4. Delegation | ||
249 | |||
250 | 4-1. Model of delegation | ||
251 | |||
252 | A cgroup can be delegated to a less privileged user by granting write | ||
253 | access of the directory and its "cgroup.procs" file to the user. Note | ||
254 | that the resource control knobs in a given directory concern the | ||
255 | resources of the parent and thus must not be delegated along with the | ||
256 | directory. | ||
257 | |||
258 | Once delegated, the user can build sub-hierarchy under the directory, | ||
259 | organize processes as it sees fit and further distribute the resources | ||
260 | it got from the parent. The limits and other settings of all resource | ||
261 | controllers are hierarchical and regardless of what happens in the | ||
262 | delegated sub-hierarchy, nothing can escape the resource restrictions | ||
263 | imposed by the parent. | ||
264 | |||
265 | Currently, cgroup doesn't impose any restrictions on the number of | ||
266 | cgroups in or nesting depth of a delegated sub-hierarchy; however, | ||
267 | this may in the future be limited explicitly. | ||
268 | |||
269 | |||
270 | 4-2. Common ancestor rule | ||
271 | |||
272 | On the unified hierarchy, to write to a "cgroup.procs" file, in | ||
273 | addition to the usual write permission to the file and uid match, the | ||
274 | writer must also have write access to the "cgroup.procs" file of the | ||
275 | common ancestor of the source and destination cgroups. This prevents | ||
276 | delegatees from smuggling processes across disjoint sub-hierarchies. | ||
277 | |||
278 | Let's say cgroups C0 and C1 have been delegated to user U0 who created | ||
279 | C00, C01 under C0 and C10 under C1 as follows. | ||
280 | |||
281 | ~~~~~~~~~~~~~ - C0 - C00 | ||
282 | ~ cgroup ~ \ C01 | ||
283 | ~ hierarchy ~ | ||
284 | ~~~~~~~~~~~~~ - C1 - C10 | ||
285 | |||
286 | C0 and C1 are separate entities in terms of resource distribution | ||
287 | regardless of their relative positions in the hierarchy. The | ||
288 | resources the processes under C0 are entitled to are controlled by | ||
289 | C0's ancestors and may be completely different from C1. It's clear | ||
290 | that the intention of delegating C0 to U0 is allowing U0 to organize | ||
291 | the processes under C0 and further control the distribution of C0's | ||
292 | resources. | ||
293 | |||
294 | On traditional hierarchies, if a task has write access to "tasks" or | ||
295 | "cgroup.procs" file of a cgroup and its uid agrees with the target, it | ||
296 | can move the target to the cgroup. In the above example, U0 will not | ||
297 | only be able to move processes in each sub-hierarchy but also across | ||
298 | the two sub-hierarchies, effectively allowing it to violate the | ||
299 | organizational and resource restrictions implied by the hierarchical | ||
300 | structure above C0 and C1. | ||
301 | |||
302 | On the unified hierarchy, let's say U0 wants to write the pid of a | ||
303 | process which has a matching uid and is currently in C10 into | ||
304 | "C00/cgroup.procs". U0 obviously has write access to the file and | ||
305 | migration permission on the process; however, the common ancestor of | ||
306 | the source cgroup C10 and the destination cgroup C00 is above the | ||
307 | points of delegation and U0 would not have write access to its | ||
308 | "cgroup.procs" and thus be denied with -EACCES. | ||
309 | |||
310 | |||
311 | 5. Other Changes | ||
312 | |||
313 | 5-1. [Un]populated Notification | ||
314 | |||
315 | cgroup users often need a way to determine when a cgroup's | ||
316 | subhierarchy becomes empty so that it can be cleaned up. cgroup | ||
317 | currently provides release_agent for it; unfortunately, this mechanism | ||
318 | is riddled with issues. | ||
319 | |||
320 | - It delivers events by forking and execing a userland binary | ||
321 | specified as the release_agent. This is a long deprecated method of | ||
322 | notification delivery. It's extremely heavy, slow and cumbersome to | ||
323 | integrate with larger infrastructure. | ||
324 | |||
325 | - There is a single monitoring point at the root. There's no way to | ||
326 | delegate management of a subtree. | ||
327 | |||
328 | - The event isn't recursive. It triggers when a cgroup doesn't have | ||
329 | any tasks or child cgroups. Events for internal nodes trigger only | ||
330 | after all children are removed. This again makes it impossible to | ||
331 | delegate management of a subtree. | ||
332 | |||
333 | - Events are filtered from the kernel side. A "notify_on_release" | ||
334 | file is used to subscribe to or suppress release events. This is | ||
335 | unnecessarily complicated and probably done this way because event | ||
336 | delivery itself was expensive. | ||
337 | |||
338 | Unified hierarchy implements "populated" field in "cgroup.events" | ||
339 | interface file which can be used to monitor whether the cgroup's | ||
340 | subhierarchy has tasks in it or not. Its value is 0 if there is no | ||
341 | task in the cgroup and its descendants; otherwise, 1. poll and | ||
342 | [id]notify events are triggered when the value changes. | ||
343 | |||
344 | This is significantly lighter and simpler and trivially allows | ||
345 | delegating management of subhierarchy - subhierarchy monitoring can | ||
346 | block further propagation simply by putting itself or another process | ||
347 | in the subhierarchy and monitor events that it's interested in from | ||
348 | there without interfering with monitoring higher in the tree. | ||
349 | |||
350 | In unified hierarchy, the release_agent mechanism is no longer | ||
351 | supported and the interface files "release_agent" and | ||
352 | "notify_on_release" do not exist. | ||
353 | |||
354 | |||
355 | 5-2. Other Core Changes | ||
356 | |||
357 | - None of the mount options is allowed. | ||
358 | |||
359 | - remount is disallowed. | ||
360 | |||
361 | - rename(2) is disallowed. | ||
362 | |||
363 | - The "tasks" file is removed. Everything should be at process | ||
364 | granularity. Use the "cgroup.procs" file instead. | ||
365 | |||
366 | - The "cgroup.procs" file is not sorted. pids will be unique unless | ||
367 | they got recycled in-between reads. | ||
368 | |||
369 | - The "cgroup.clone_children" file is removed. | ||
370 | |||
371 | - /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged | ||
372 | to before exiting. If the cgroup is removed before the zombie is | ||
373 | reaped, " (deleted)" is appended to the path. | ||
374 | |||
375 | |||
376 | 5-3. Controller File Conventions | ||
377 | |||
378 | 5-3-1. Format | ||
379 | |||
380 | In general, all controller files should be in one of the following | ||
381 | formats whenever possible. | ||
382 | |||
383 | - Values only files | ||
384 | |||
385 | VAL0 VAL1...\n | ||
386 | |||
387 | - Flat keyed files | ||
388 | |||
389 | KEY0 VAL0\n | ||
390 | KEY1 VAL1\n | ||
391 | ... | ||
392 | |||
393 | - Nested keyed files | ||
394 | |||
395 | KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01... | ||
396 | KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11... | ||
397 | ... | ||
398 | |||
399 | For a writeable file, the format for writing should generally match | ||
400 | reading; however, controllers may allow omitting later fields or | ||
401 | implement restricted shortcuts for most common use cases. | ||
402 | |||
403 | For both flat and nested keyed files, only the values for a single key | ||
404 | can be written at a time. For nested keyed files, the sub key pairs | ||
405 | may be specified in any order and not all pairs have to be specified. | ||
406 | |||
407 | |||
408 | 5-3-2. Control Knobs | ||
409 | |||
410 | - Settings for a single feature should generally be implemented in a | ||
411 | single file. | ||
412 | |||
413 | - In general, the root cgroup should be exempt from resource control | ||
414 | and thus shouldn't have resource control knobs. | ||
415 | |||
416 | - If a controller implements ratio based resource distribution, the | ||
417 | control knob should be named "weight" and have the range [1, 10000] | ||
418 | and 100 should be the default value. The values are chosen to allow | ||
419 | enough and symmetric bias in both directions while keeping it | ||
420 | intuitive (the default is 100%). | ||
421 | |||
422 | - If a controller implements an absolute resource guarantee and/or | ||
423 | limit, the control knobs should be named "min" and "max" | ||
424 | respectively. If a controller implements best effort resource | ||
425 | guarantee and/or limit, the control knobs should be named "low" and | ||
426 | "high" respectively. | ||
427 | |||
428 | In the above four control files, the special token "max" should be | ||
429 | used to represent upward infinity for both reading and writing. | ||
430 | |||
431 | - If a setting has configurable default value and specific overrides, | ||
432 | the default settings should be keyed with "default" and appear as | ||
433 | the first entry in the file. Specific entries can use "default" as | ||
434 | its value to indicate inheritance of the default value. | ||
435 | |||
436 | - For events which are not very high frequency, an interface file | ||
437 | "events" should be created which lists event key value pairs. | ||
438 | Whenever a notifiable event happens, file modified event should be | ||
439 | generated on the file. | ||
440 | |||
441 | |||
442 | 5-4. Per-Controller Changes | ||
443 | |||
444 | 5-4-1. io | ||
445 | |||
446 | - blkio is renamed to io. The interface is overhauled anyway. The | ||
447 | new name is more in line with the other two major controllers, cpu | ||
448 | and memory, and better suited given that it may be used for cgroup | ||
449 | writeback without involving block layer. | ||
450 | |||
451 | - Everything including stat is always hierarchical making separate | ||
452 | recursive stat files pointless and, as no internal node can have | ||
453 | tasks, leaf weights are meaningless. The operation model is | ||
454 | simplified and the interface is overhauled accordingly. | ||
455 | |||
456 | io.stat | ||
457 | |||
458 | The stat file. The reported stats are from the point where | ||
459 | bio's are issued to request_queue. The stats are counted | ||
460 | independent of which policies are enabled. Each line in the | ||
461 | file follows the following format. More fields may later be | ||
462 | added at the end. | ||
463 | |||
464 | $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS | ||
465 | |||
466 | io.weight | ||
467 | |||
468 | The weight setting, currently only available and effective if | ||
469 | cfq-iosched is in use for the target device. The weight is | ||
470 | between 1 and 10000 and defaults to 100. The first line | ||
471 | always contains the default weight in the following format to | ||
472 | use when per-device setting is missing. | ||
473 | |||
474 | default $WEIGHT | ||
475 | |||
476 | Subsequent lines list per-device weights of the following | ||
477 | format. | ||
478 | |||
479 | $MAJ:$MIN $WEIGHT | ||
480 | |||
481 | Writing "$WEIGHT" or "default $WEIGHT" changes the default | ||
482 | setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight | ||
483 | while "$MAJ:$MIN default" clears it. | ||
484 | |||
485 | This file is available only on non-root cgroups. | ||
486 | |||
487 | io.max | ||
488 | |||
489 | The maximum bandwidth and/or iops setting, only available if | ||
490 | blk-throttle is enabled. The file is of the following format. | ||
491 | |||
492 | $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS | ||
493 | |||
494 | ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are | ||
495 | read/write IOs per second. "max" indicates no limit. Writing | ||
496 | to the file follows the same format but the individual | ||
497 | settings may be omitted or specified in any order. | ||
498 | |||
499 | This file is available only on non-root cgroups. | ||
500 | |||
501 | |||
502 | 5-4-2. cpuset | ||
503 | |||
504 | - Tasks are kept in empty cpusets after hotplug and take on the masks | ||
505 | of the nearest non-empty ancestor, instead of being moved to it. | ||
506 | |||
507 | - A task can be moved into an empty cpuset, and again it takes on the | ||
508 | masks of the nearest non-empty ancestor. | ||
509 | |||
510 | |||
511 | 5-4-3. memory | ||
512 | |||
513 | - use_hierarchy is on by default and the cgroup file for the flag is | ||
514 | not created. | ||
515 | |||
516 | - The original lower boundary, the soft limit, is defined as a limit | ||
517 | that is per default unset. As a result, the set of cgroups that | ||
518 | global reclaim prefers is opt-in, rather than opt-out. The costs | ||
519 | for optimizing these mostly negative lookups are so high that the | ||
520 | implementation, despite its enormous size, does not even provide the | ||
521 | basic desirable behavior. First off, the soft limit has no | ||
522 | hierarchical meaning. All configured groups are organized in a | ||
523 | global rbtree and treated like equal peers, regardless where they | ||
524 | are located in the hierarchy. This makes subtree delegation | ||
525 | impossible. Second, the soft limit reclaim pass is so aggressive | ||
526 | that it not just introduces high allocation latencies into the | ||
527 | system, but also impacts system performance due to overreclaim, to | ||
528 | the point where the feature becomes self-defeating. | ||
529 | |||
530 | The memory.low boundary on the other hand is a top-down allocated | ||
531 | reserve. A cgroup enjoys reclaim protection when it and all its | ||
532 | ancestors are below their low boundaries, which makes delegation of | ||
533 | subtrees possible. Secondly, new cgroups have no reserve per | ||
534 | default and in the common case most cgroups are eligible for the | ||
535 | preferred reclaim pass. This allows the new low boundary to be | ||
536 | efficiently implemented with just a minor addition to the generic | ||
537 | reclaim code, without the need for out-of-band data structures and | ||
538 | reclaim passes. Because the generic reclaim code considers all | ||
539 | cgroups except for the ones running low in the preferred first | ||
540 | reclaim pass, overreclaim of individual groups is eliminated as | ||
541 | well, resulting in much better overall workload performance. | ||
542 | |||
543 | - The original high boundary, the hard limit, is defined as a strict | ||
544 | limit that can not budge, even if the OOM killer has to be called. | ||
545 | But this generally goes against the goal of making the most out of | ||
546 | the available memory. The memory consumption of workloads varies | ||
547 | during runtime, and that requires users to overcommit. But doing | ||
548 | that with a strict upper limit requires either a fairly accurate | ||
549 | prediction of the working set size or adding slack to the limit. | ||
550 | Since working set size estimation is hard and error prone, and | ||
551 | getting it wrong results in OOM kills, most users tend to err on the | ||
552 | side of a looser limit and end up wasting precious resources. | ||
553 | |||
554 | The memory.high boundary on the other hand can be set much more | ||
555 | conservatively. When hit, it throttles allocations by forcing them | ||
556 | into direct reclaim to work off the excess, but it never invokes the | ||
557 | OOM killer. As a result, a high boundary that is chosen too | ||
558 | aggressively will not terminate the processes, but instead it will | ||
559 | lead to gradual performance degradation. The user can monitor this | ||
560 | and make corrections until the minimal memory footprint that still | ||
561 | gives acceptable performance is found. | ||
562 | |||
563 | In extreme cases, with many concurrent allocations and a complete | ||
564 | breakdown of reclaim progress within the group, the high boundary | ||
565 | can be exceeded. But even then it's mostly better to satisfy the | ||
566 | allocation from the slack available in other groups or the rest of | ||
567 | the system than killing the group. Otherwise, memory.max is there | ||
568 | to limit this type of spillover and ultimately contain buggy or even | ||
569 | malicious applications. | ||
570 | |||
571 | - The original control file names are unwieldy and inconsistent in | ||
572 | many different ways. For example, the upper boundary hit count is | ||
573 | exported in the memory.failcnt file, but an OOM event count has to | ||
574 | be manually counted by listening to memory.oom_control events, and | ||
575 | lower boundary / soft limit events have to be counted by first | ||
576 | setting a threshold for that value and then counting those events. | ||
577 | Also, usage and limit files encode their units in the filename. | ||
578 | That makes the filenames very long, even though this is not | ||
579 | information that a user needs to be reminded of every time they type | ||
580 | out those names. | ||
581 | |||
582 | To address these naming issues, as well as to signal clearly that | ||
583 | the new interface carries a new configuration model, the naming | ||
584 | conventions in it necessarily differ from the old interface. | ||
585 | |||
586 | - The original limit files indicate the state of an unset limit with a | ||
587 | Very High Number, and a configured limit can be unset by echoing -1 | ||
588 | into those files. But that very high number is implementation and | ||
589 | architecture dependent and not very descriptive. And while -1 can | ||
590 | be understood as an underflow into the highest possible value, -2 or | ||
591 | -10M etc. do not work, so it's not consistent. | ||
592 | |||
593 | memory.low, memory.high, and memory.max will use the string "max" to | ||
594 | indicate and set the highest possible value. | ||
595 | |||
596 | 6. Planned Changes | ||
597 | |||
598 | 6-1. CAP for resource control | ||
599 | |||
600 | Unified hierarchy will require one of the capabilities(7), which is | ||
601 | yet to be decided, for all resource control related knobs. Process | ||
602 | organization operations - creation of sub-cgroups and migration of | ||
603 | processes in sub-hierarchies - may be delegated by changing the | ||
604 | ownership and/or permissions on the cgroup directory and | ||
605 | "cgroup.procs" interface file; however, all operations which affect | ||
606 | resource control - writes to a "cgroup.subtree_control" file or any | ||
607 | controller-specific knobs - will require an explicit CAP privilege. | ||
608 | |||
609 | This, in part, is to prevent the cgroup interface from being | ||
610 | inadvertently promoted to programmable API used by non-privileged | ||
611 | binaries. cgroup exposes various aspects of the system in ways which | ||
612 | aren't properly abstracted for direct consumption by regular programs. | ||
613 | This is an administration interface much closer to sysctl knobs than | ||
614 | system calls. Even the basic access model, being filesystem path | ||
615 | based, isn't suitable for direct consumption. There's no way to | ||
616 | access "my cgroup" in a race-free way or make multiple operations | ||
617 | atomic against migration to another cgroup. | ||
618 | |||
619 | Another aspect is that, for better or for worse, the cgroup interface | ||
620 | goes through far less scrutiny than regular interfaces for | ||
621 | unprivileged userland. The upside is that cgroup is able to expose | ||
622 | useful features which may not be suitable for general consumption in a | ||
623 | reasonable time frame. It provides a relatively short path between | ||
624 | internal details and userland-visible interface. Of course, this | ||
625 | shortcut comes with high risk. We go through what we go through for | ||
626 | general kernel APIs for good reasons. It may end up leaking internal | ||
627 | details in a way which can exert significant pain by locking the | ||
628 | kernel into a contract that can't be maintained in a reasonable | ||
629 | manner. | ||
630 | |||
631 | Also, due to the specific nature, cgroup and its controllers don't | ||
632 | tend to attract attention from a wide scope of developers. cgroup's | ||
633 | short history is already fraught with severely mis-designed | ||
634 | interfaces, unnecessary commitments to and exposing of internal | ||
635 | details, and broken and dangerous implementations of various features. | ||
636 | |||
637 | Keeping cgroup as an administration interface is both advantageous for | ||
638 | its role and imperative given its nature. Some of the cgroup features | ||
639 | may make sense for unprivileged access. If deemed justified, those | ||
640 | must be further abstracted and implemented as a different interface, | ||
641 | be it a system call or process-private filesystem, and survive through | ||
642 | the scrutiny that any interface for general consumption is required to | ||
643 | go through. | ||
644 | |||
645 | Requiring CAP is not a complete solution but should serve as a | ||
646 | significant deterrent against spraying cgroup usages in non-privileged | ||
647 | programs. | ||
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e5f4164cbd99..7f540f7f588d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h | |||
@@ -34,17 +34,12 @@ struct seq_file; | |||
34 | 34 | ||
35 | /* define the enumeration of all cgroup subsystems */ | 35 | /* define the enumeration of all cgroup subsystems */ |
36 | #define SUBSYS(_x) _x ## _cgrp_id, | 36 | #define SUBSYS(_x) _x ## _cgrp_id, |
37 | #define SUBSYS_TAG(_t) CGROUP_ ## _t, \ | ||
38 | __unused_tag_ ## _t = CGROUP_ ## _t - 1, | ||
39 | enum cgroup_subsys_id { | 37 | enum cgroup_subsys_id { |
40 | #include <linux/cgroup_subsys.h> | 38 | #include <linux/cgroup_subsys.h> |
41 | CGROUP_SUBSYS_COUNT, | 39 | CGROUP_SUBSYS_COUNT, |
42 | }; | 40 | }; |
43 | #undef SUBSYS_TAG | ||
44 | #undef SUBSYS | 41 | #undef SUBSYS |
45 | 42 | ||
46 | #define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START) | ||
47 | |||
48 | /* bits in struct cgroup_subsys_state flags field */ | 43 | /* bits in struct cgroup_subsys_state flags field */ |
49 | enum { | 44 | enum { |
50 | CSS_NO_REF = (1 << 0), /* no reference counting for this css */ | 45 | CSS_NO_REF = (1 << 0), /* no reference counting for this css */ |
@@ -66,7 +61,6 @@ enum { | |||
66 | 61 | ||
67 | /* cgroup_root->flags */ | 62 | /* cgroup_root->flags */ |
68 | enum { | 63 | enum { |
69 | CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ | ||
70 | CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ | 64 | CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ |
71 | CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ | 65 | CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ |
72 | }; | 66 | }; |
@@ -439,9 +433,9 @@ struct cgroup_subsys { | |||
439 | int (*can_attach)(struct cgroup_taskset *tset); | 433 | int (*can_attach)(struct cgroup_taskset *tset); |
440 | void (*cancel_attach)(struct cgroup_taskset *tset); | 434 | void (*cancel_attach)(struct cgroup_taskset *tset); |
441 | void (*attach)(struct cgroup_taskset *tset); | 435 | void (*attach)(struct cgroup_taskset *tset); |
442 | int (*can_fork)(struct task_struct *task, void **priv_p); | 436 | int (*can_fork)(struct task_struct *task); |
443 | void (*cancel_fork)(struct task_struct *task, void *priv); | 437 | void (*cancel_fork)(struct task_struct *task); |
444 | void (*fork)(struct task_struct *task, void *priv); | 438 | void (*fork)(struct task_struct *task); |
445 | void (*exit)(struct task_struct *task); | 439 | void (*exit)(struct task_struct *task); |
446 | void (*free)(struct task_struct *task); | 440 | void (*free)(struct task_struct *task); |
447 | void (*bind)(struct cgroup_subsys_state *root_css); | 441 | void (*bind)(struct cgroup_subsys_state *root_css); |
@@ -527,7 +521,6 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) | |||
527 | 521 | ||
528 | #else /* CONFIG_CGROUPS */ | 522 | #else /* CONFIG_CGROUPS */ |
529 | 523 | ||
530 | #define CGROUP_CANFORK_COUNT 0 | ||
531 | #define CGROUP_SUBSYS_COUNT 0 | 524 | #define CGROUP_SUBSYS_COUNT 0 |
532 | 525 | ||
533 | static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} | 526 | static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 322a28482745..2162dca88dc0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -97,12 +97,9 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
97 | struct pid *pid, struct task_struct *tsk); | 97 | struct pid *pid, struct task_struct *tsk); |
98 | 98 | ||
99 | void cgroup_fork(struct task_struct *p); | 99 | void cgroup_fork(struct task_struct *p); |
100 | extern int cgroup_can_fork(struct task_struct *p, | 100 | extern int cgroup_can_fork(struct task_struct *p); |
101 | void *ss_priv[CGROUP_CANFORK_COUNT]); | 101 | extern void cgroup_cancel_fork(struct task_struct *p); |
102 | extern void cgroup_cancel_fork(struct task_struct *p, | 102 | extern void cgroup_post_fork(struct task_struct *p); |
103 | void *ss_priv[CGROUP_CANFORK_COUNT]); | ||
104 | extern void cgroup_post_fork(struct task_struct *p, | ||
105 | void *old_ss_priv[CGROUP_CANFORK_COUNT]); | ||
106 | void cgroup_exit(struct task_struct *p); | 103 | void cgroup_exit(struct task_struct *p); |
107 | void cgroup_free(struct task_struct *p); | 104 | void cgroup_free(struct task_struct *p); |
108 | 105 | ||
@@ -562,13 +559,9 @@ static inline int cgroupstats_build(struct cgroupstats *stats, | |||
562 | struct dentry *dentry) { return -EINVAL; } | 559 | struct dentry *dentry) { return -EINVAL; } |
563 | 560 | ||
564 | static inline void cgroup_fork(struct task_struct *p) {} | 561 | static inline void cgroup_fork(struct task_struct *p) {} |
565 | static inline int cgroup_can_fork(struct task_struct *p, | 562 | static inline int cgroup_can_fork(struct task_struct *p) { return 0; } |
566 | void *ss_priv[CGROUP_CANFORK_COUNT]) | 563 | static inline void cgroup_cancel_fork(struct task_struct *p) {} |
567 | { return 0; } | 564 | static inline void cgroup_post_fork(struct task_struct *p) {} |
568 | static inline void cgroup_cancel_fork(struct task_struct *p, | ||
569 | void *ss_priv[CGROUP_CANFORK_COUNT]) {} | ||
570 | static inline void cgroup_post_fork(struct task_struct *p, | ||
571 | void *ss_priv[CGROUP_CANFORK_COUNT]) {} | ||
572 | static inline void cgroup_exit(struct task_struct *p) {} | 565 | static inline void cgroup_exit(struct task_struct *p) {} |
573 | static inline void cgroup_free(struct task_struct *p) {} | 566 | static inline void cgroup_free(struct task_struct *p) {} |
574 | 567 | ||
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 1a96fdaa33d5..0df0336acee9 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h | |||
@@ -6,14 +6,8 @@ | |||
6 | 6 | ||
7 | /* | 7 | /* |
8 | * This file *must* be included with SUBSYS() defined. | 8 | * This file *must* be included with SUBSYS() defined. |
9 | * SUBSYS_TAG() is a noop if undefined. | ||
10 | */ | 9 | */ |
11 | 10 | ||
12 | #ifndef SUBSYS_TAG | ||
13 | #define __TMP_SUBSYS_TAG | ||
14 | #define SUBSYS_TAG(_x) | ||
15 | #endif | ||
16 | |||
17 | #if IS_ENABLED(CONFIG_CPUSETS) | 11 | #if IS_ENABLED(CONFIG_CPUSETS) |
18 | SUBSYS(cpuset) | 12 | SUBSYS(cpuset) |
19 | #endif | 13 | #endif |
@@ -58,17 +52,10 @@ SUBSYS(net_prio) | |||
58 | SUBSYS(hugetlb) | 52 | SUBSYS(hugetlb) |
59 | #endif | 53 | #endif |
60 | 54 | ||
61 | /* | ||
62 | * Subsystems that implement the can_fork() family of callbacks. | ||
63 | */ | ||
64 | SUBSYS_TAG(CANFORK_START) | ||
65 | |||
66 | #if IS_ENABLED(CONFIG_CGROUP_PIDS) | 55 | #if IS_ENABLED(CONFIG_CGROUP_PIDS) |
67 | SUBSYS(pids) | 56 | SUBSYS(pids) |
68 | #endif | 57 | #endif |
69 | 58 | ||
70 | SUBSYS_TAG(CANFORK_END) | ||
71 | |||
72 | /* | 59 | /* |
73 | * The following subsystems are not supported on the default hierarchy. | 60 | * The following subsystems are not supported on the default hierarchy. |
74 | */ | 61 | */ |
@@ -76,11 +63,6 @@ SUBSYS_TAG(CANFORK_END) | |||
76 | SUBSYS(debug) | 63 | SUBSYS(debug) |
77 | #endif | 64 | #endif |
78 | 65 | ||
79 | #ifdef __TMP_SUBSYS_TAG | ||
80 | #undef __TMP_SUBSYS_TAG | ||
81 | #undef SUBSYS_TAG | ||
82 | #endif | ||
83 | |||
84 | /* | 66 | /* |
85 | * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. | 67 | * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. |
86 | */ | 68 | */ |
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index accb036bbc9c..b283d56c1db9 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h | |||
@@ -54,6 +54,7 @@ | |||
54 | 54 | ||
55 | #define SMB_SUPER_MAGIC 0x517B | 55 | #define SMB_SUPER_MAGIC 0x517B |
56 | #define CGROUP_SUPER_MAGIC 0x27e0eb | 56 | #define CGROUP_SUPER_MAGIC 0x27e0eb |
57 | #define CGROUP2_SUPER_MAGIC 0x63677270 | ||
57 | 58 | ||
58 | 59 | ||
59 | #define STACK_END_MAGIC 0x57AC6E9D | 60 | #define STACK_END_MAGIC 0x57AC6E9D |
diff --git a/init/Kconfig b/init/Kconfig index 235c7a2c0d20..5481b49e8c3f 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -940,95 +940,24 @@ menuconfig CGROUPS | |||
940 | 940 | ||
941 | if CGROUPS | 941 | if CGROUPS |
942 | 942 | ||
943 | config CGROUP_DEBUG | ||
944 | bool "Example debug cgroup subsystem" | ||
945 | default n | ||
946 | help | ||
947 | This option enables a simple cgroup subsystem that | ||
948 | exports useful debugging information about the cgroups | ||
949 | framework. | ||
950 | |||
951 | Say N if unsure. | ||
952 | |||
953 | config CGROUP_FREEZER | ||
954 | bool "Freezer cgroup subsystem" | ||
955 | help | ||
956 | Provides a way to freeze and unfreeze all tasks in a | ||
957 | cgroup. | ||
958 | |||
959 | config CGROUP_PIDS | ||
960 | bool "PIDs cgroup subsystem" | ||
961 | help | ||
962 | Provides enforcement of process number limits in the scope of a | ||
963 | cgroup. Any attempt to fork more processes than is allowed in the | ||
964 | cgroup will fail. PIDs are fundamentally a global resource because it | ||
965 | is fairly trivial to reach PID exhaustion before you reach even a | ||
966 | conservative kmemcg limit. As a result, it is possible to grind a | ||
967 | system to halt without being limited by other cgroup policies. The | ||
968 | PIDs cgroup subsystem is designed to stop this from happening. | ||
969 | |||
970 | It should be noted that organisational operations (such as attaching | ||
971 | to a cgroup hierarchy will *not* be blocked by the PIDs subsystem), | ||
972 | since the PIDs limit only affects a process's ability to fork, not to | ||
973 | attach to a cgroup. | ||
974 | |||
975 | config CGROUP_DEVICE | ||
976 | bool "Device controller for cgroups" | ||
977 | help | ||
978 | Provides a cgroup implementing whitelists for devices which | ||
979 | a process in the cgroup can mknod or open. | ||
980 | |||
981 | config CPUSETS | ||
982 | bool "Cpuset support" | ||
983 | help | ||
984 | This option will let you create and manage CPUSETs which | ||
985 | allow dynamically partitioning a system into sets of CPUs and | ||
986 | Memory Nodes and assigning tasks to run only within those sets. | ||
987 | This is primarily useful on large SMP or NUMA systems. | ||
988 | |||
989 | Say N if unsure. | ||
990 | |||
991 | config PROC_PID_CPUSET | ||
992 | bool "Include legacy /proc/<pid>/cpuset file" | ||
993 | depends on CPUSETS | ||
994 | default y | ||
995 | |||
996 | config CGROUP_CPUACCT | ||
997 | bool "Simple CPU accounting cgroup subsystem" | ||
998 | help | ||
999 | Provides a simple Resource Controller for monitoring the | ||
1000 | total CPU consumed by the tasks in a cgroup. | ||
1001 | |||
1002 | config PAGE_COUNTER | 943 | config PAGE_COUNTER |
1003 | bool | 944 | bool |
1004 | 945 | ||
1005 | config MEMCG | 946 | config MEMCG |
1006 | bool "Memory Resource Controller for Control Groups" | 947 | bool "Memory controller" |
1007 | select PAGE_COUNTER | 948 | select PAGE_COUNTER |
1008 | select EVENTFD | 949 | select EVENTFD |
1009 | help | 950 | help |
1010 | Provides a memory resource controller that manages both anonymous | 951 | Provides control over the memory footprint of tasks in a cgroup. |
1011 | memory and page cache. (See Documentation/cgroups/memory.txt) | ||
1012 | 952 | ||
1013 | config MEMCG_SWAP | 953 | config MEMCG_SWAP |
1014 | bool "Memory Resource Controller Swap Extension" | 954 | bool "Swap controller" |
1015 | depends on MEMCG && SWAP | 955 | depends on MEMCG && SWAP |
1016 | help | 956 | help |
1017 | Add swap management feature to memory resource controller. When you | 957 | Provides control over the swap space consumed by tasks in a cgroup. |
1018 | enable this, you can limit mem+swap usage per cgroup. In other words, | 958 | |
1019 | when you disable this, memory resource controller has no cares to | ||
1020 | usage of swap...a process can exhaust all of the swap. This extension | ||
1021 | is useful when you want to avoid exhaustion swap but this itself | ||
1022 | adds more overheads and consumes memory for remembering information. | ||
1023 | Especially if you use 32bit system or small memory system, please | ||
1024 | be careful about enabling this. When memory resource controller | ||
1025 | is disabled by boot option, this will be automatically disabled and | ||
1026 | there will be no overhead from this. Even when you set this config=y, | ||
1027 | if boot option "swapaccount=0" is set, swap will not be accounted. | ||
1028 | Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page | ||
1029 | size is 4096bytes, 512k per 1Gbytes of swap. | ||
1030 | config MEMCG_SWAP_ENABLED | 959 | config MEMCG_SWAP_ENABLED |
1031 | bool "Memory Resource Controller Swap Extension enabled by default" | 960 | bool "Swap controller enabled by default" |
1032 | depends on MEMCG_SWAP | 961 | depends on MEMCG_SWAP |
1033 | default y | 962 | default y |
1034 | help | 963 | help |
@@ -1052,34 +981,43 @@ config MEMCG_KMEM | |||
1052 | the kmem extension can use it to guarantee that no group of processes | 981 | the kmem extension can use it to guarantee that no group of processes |
1053 | will ever exhaust kernel resources alone. | 982 | will ever exhaust kernel resources alone. |
1054 | 983 | ||
1055 | config CGROUP_HUGETLB | 984 | config BLK_CGROUP |
1056 | bool "HugeTLB Resource Controller for Control Groups" | 985 | bool "IO controller" |
1057 | depends on HUGETLB_PAGE | 986 | depends on BLOCK |
1058 | select PAGE_COUNTER | ||
1059 | default n | 987 | default n |
1060 | help | 988 | ---help--- |
1061 | Provides a cgroup Resource Controller for HugeTLB pages. | 989 | Generic block IO controller cgroup interface. This is the common |
1062 | When you enable this, you can put a per cgroup limit on HugeTLB usage. | 990 | cgroup interface which should be used by various IO controlling |
1063 | The limit is enforced during page fault. Since HugeTLB doesn't | 991 | policies. |
1064 | support page reclaim, enforcing the limit at page fault time implies | ||
1065 | that, the application will get SIGBUS signal if it tries to access | ||
1066 | HugeTLB pages beyond its limit. This requires the application to know | ||
1067 | beforehand how much HugeTLB pages it would require for its use. The | ||
1068 | control group is tracked in the third page lru pointer. This means | ||
1069 | that we cannot use the controller with huge page less than 3 pages. | ||
1070 | 992 | ||
1071 | config CGROUP_PERF | 993 | Currently, CFQ IO scheduler uses it to recognize task groups and |
1072 | bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" | 994 | control disk bandwidth allocation (proportional time slice allocation) |
1073 | depends on PERF_EVENTS && CGROUPS | 995 | to such task groups. It is also used by bio throttling logic in |
1074 | help | 996 | block layer to implement upper limit in IO rates on a device. |
1075 | This option extends the per-cpu mode to restrict monitoring to | ||
1076 | threads which belong to the cgroup specified and run on the | ||
1077 | designated cpu. | ||
1078 | 997 | ||
1079 | Say N if unsure. | 998 | This option only enables generic Block IO controller infrastructure. |
999 | One needs to also enable actual IO controlling logic/policy. For | ||
1000 | enabling proportional weight division of disk bandwidth in CFQ, set | ||
1001 | CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set | ||
1002 | CONFIG_BLK_DEV_THROTTLING=y. | ||
1003 | |||
1004 | See Documentation/cgroup-v1/blkio-controller.txt for more information. | ||
1005 | |||
1006 | config DEBUG_BLK_CGROUP | ||
1007 | bool "IO controller debugging" | ||
1008 | depends on BLK_CGROUP | ||
1009 | default n | ||
1010 | ---help--- | ||
1011 | Enable some debugging help. Currently it exports additional stat | ||
1012 | files in a cgroup which can be useful for debugging. | ||
1013 | |||
1014 | config CGROUP_WRITEBACK | ||
1015 | bool | ||
1016 | depends on MEMCG && BLK_CGROUP | ||
1017 | default y | ||
1080 | 1018 | ||
1081 | menuconfig CGROUP_SCHED | 1019 | menuconfig CGROUP_SCHED |
1082 | bool "Group CPU scheduler" | 1020 | bool "CPU controller" |
1083 | default n | 1021 | default n |
1084 | help | 1022 | help |
1085 | This feature lets CPU scheduler recognize task groups and control CPU | 1023 | This feature lets CPU scheduler recognize task groups and control CPU |
@@ -1116,40 +1054,89 @@ config RT_GROUP_SCHED | |||
1116 | 1054 | ||
1117 | endif #CGROUP_SCHED | 1055 | endif #CGROUP_SCHED |
1118 | 1056 | ||
1119 | config BLK_CGROUP | 1057 | config CGROUP_PIDS |
1120 | bool "Block IO controller" | 1058 | bool "PIDs controller" |
1121 | depends on BLOCK | 1059 | help |
1060 | Provides enforcement of process number limits in the scope of a | ||
1061 | cgroup. Any attempt to fork more processes than is allowed in the | ||
1062 | cgroup will fail. PIDs are fundamentally a global resource because it | ||
1063 | is fairly trivial to reach PID exhaustion before you reach even a | ||
1064 | conservative kmemcg limit. As a result, it is possible to grind a | ||
1065 | system to halt without being limited by other cgroup policies. The | ||
1066 | PIDs cgroup subsystem is designed to stop this from happening. | ||
1067 | |||
1068 | It should be noted that organisational operations (such as attaching | ||
1069 | to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem, | ||
1070 | since the PIDs limit only affects a process's ability to fork, not to | ||
1071 | attach to a cgroup. | ||
1072 | |||
1073 | config CGROUP_FREEZER | ||
1074 | bool "Freezer controller" | ||
1075 | help | ||
1076 | Provides a way to freeze and unfreeze all tasks in a | ||
1077 | cgroup. | ||
1078 | |||
1079 | config CGROUP_HUGETLB | ||
1080 | bool "HugeTLB controller" | ||
1081 | depends on HUGETLB_PAGE | ||
1082 | select PAGE_COUNTER | ||
1122 | default n | 1083 | default n |
1123 | ---help--- | 1084 | help |
1124 | Generic block IO controller cgroup interface. This is the common | 1085 | Provides a cgroup controller for HugeTLB pages. |
1125 | cgroup interface which should be used by various IO controlling | 1086 | When you enable this, you can put a per cgroup limit on HugeTLB usage. |
1126 | policies. | 1087 | The limit is enforced during page fault. Since HugeTLB doesn't |
1088 | support page reclaim, enforcing the limit at page fault time implies | ||
1089 | that, the application will get SIGBUS signal if it tries to access | ||
1090 | HugeTLB pages beyond its limit. This requires the application to know | ||
1091 | beforehand how much HugeTLB pages it would require for its use. The | ||
1092 | control group is tracked in the third page lru pointer. This means | ||
1093 | that we cannot use the controller with huge page less than 3 pages. | ||
1127 | 1094 | ||
1128 | Currently, CFQ IO scheduler uses it to recognize task groups and | 1095 | config CPUSETS |
1129 | control disk bandwidth allocation (proportional time slice allocation) | 1096 | bool "Cpuset controller" |
1130 | to such task groups. It is also used by bio throttling logic in | 1097 | help |
1131 | block layer to implement upper limit in IO rates on a device. | 1098 | This option will let you create and manage CPUSETs which |
1099 | allow dynamically partitioning a system into sets of CPUs and | ||
1100 | Memory Nodes and assigning tasks to run only within those sets. | ||
1101 | This is primarily useful on large SMP or NUMA systems. | ||
1132 | 1102 | ||
1133 | This option only enables generic Block IO controller infrastructure. | 1103 | Say N if unsure. |
1134 | One needs to also enable actual IO controlling logic/policy. For | ||
1135 | enabling proportional weight division of disk bandwidth in CFQ, set | ||
1136 | CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set | ||
1137 | CONFIG_BLK_DEV_THROTTLING=y. | ||
1138 | 1104 | ||
1139 | See Documentation/cgroups/blkio-controller.txt for more information. | 1105 | config PROC_PID_CPUSET |
1106 | bool "Include legacy /proc/<pid>/cpuset file" | ||
1107 | depends on CPUSETS | ||
1108 | default y | ||
1140 | 1109 | ||
1141 | config DEBUG_BLK_CGROUP | 1110 | config CGROUP_DEVICE |
1142 | bool "Enable Block IO controller debugging" | 1111 | bool "Device controller" |
1143 | depends on BLK_CGROUP | 1112 | help |
1113 | Provides a cgroup controller implementing whitelists for | ||
1114 | devices which a process in the cgroup can mknod or open. | ||
1115 | |||
1116 | config CGROUP_CPUACCT | ||
1117 | bool "Simple CPU accounting controller" | ||
1118 | help | ||
1119 | Provides a simple controller for monitoring the | ||
1120 | total CPU consumed by the tasks in a cgroup. | ||
1121 | |||
1122 | config CGROUP_PERF | ||
1123 | bool "Perf controller" | ||
1124 | depends on PERF_EVENTS | ||
1125 | help | ||
1126 | This option extends the perf per-cpu mode to restrict monitoring | ||
1127 | to threads which belong to the cgroup specified and run on the | ||
1128 | designated cpu. | ||
1129 | |||
1130 | Say N if unsure. | ||
1131 | |||
1132 | config CGROUP_DEBUG | ||
1133 | bool "Example controller" | ||
1144 | default n | 1134 | default n |
1145 | ---help--- | 1135 | help |
1146 | Enable some debugging help. Currently it exports additional stat | 1136 | This option enables a simple controller that exports |
1147 | files in a cgroup which can be useful for debugging. | 1137 | debugging information about the cgroups framework. |
1148 | 1138 | ||
1149 | config CGROUP_WRITEBACK | 1139 | Say N. |
1150 | bool | ||
1151 | depends on MEMCG && BLK_CGROUP | ||
1152 | default y | ||
1153 | 1140 | ||
1154 | endif # CGROUPS | 1141 | endif # CGROUPS |
1155 | 1142 | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fe95970b1f79..c03a640ef6da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly; | |||
211 | /* Ditto for the can_fork callback. */ | 211 | /* Ditto for the can_fork callback. */ |
212 | static unsigned long have_canfork_callback __read_mostly; | 212 | static unsigned long have_canfork_callback __read_mostly; |
213 | 213 | ||
214 | static struct file_system_type cgroup2_fs_type; | ||
214 | static struct cftype cgroup_dfl_base_files[]; | 215 | static struct cftype cgroup_dfl_base_files[]; |
215 | static struct cftype cgroup_legacy_base_files[]; | 216 | static struct cftype cgroup_legacy_base_files[]; |
216 | 217 | ||
@@ -1623,10 +1624,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1623 | all_ss = true; | 1624 | all_ss = true; |
1624 | continue; | 1625 | continue; |
1625 | } | 1626 | } |
1626 | if (!strcmp(token, "__DEVEL__sane_behavior")) { | ||
1627 | opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; | ||
1628 | continue; | ||
1629 | } | ||
1630 | if (!strcmp(token, "noprefix")) { | 1627 | if (!strcmp(token, "noprefix")) { |
1631 | opts->flags |= CGRP_ROOT_NOPREFIX; | 1628 | opts->flags |= CGRP_ROOT_NOPREFIX; |
1632 | continue; | 1629 | continue; |
@@ -1693,15 +1690,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1693 | return -ENOENT; | 1690 | return -ENOENT; |
1694 | } | 1691 | } |
1695 | 1692 | ||
1696 | if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { | ||
1697 | pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); | ||
1698 | if (nr_opts != 1) { | ||
1699 | pr_err("sane_behavior: no other mount options allowed\n"); | ||
1700 | return -EINVAL; | ||
1701 | } | ||
1702 | return 0; | ||
1703 | } | ||
1704 | |||
1705 | /* | 1693 | /* |
1706 | * If the 'all' option was specified select all the subsystems, | 1694 | * If the 'all' option was specified select all the subsystems, |
1707 | * otherwise if 'none', 'name=' and a subsystem name options were | 1695 | * otherwise if 'none', 'name=' and a subsystem name options were |
@@ -1981,6 +1969,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1981 | int flags, const char *unused_dev_name, | 1969 | int flags, const char *unused_dev_name, |
1982 | void *data) | 1970 | void *data) |
1983 | { | 1971 | { |
1972 | bool is_v2 = fs_type == &cgroup2_fs_type; | ||
1984 | struct super_block *pinned_sb = NULL; | 1973 | struct super_block *pinned_sb = NULL; |
1985 | struct cgroup_subsys *ss; | 1974 | struct cgroup_subsys *ss; |
1986 | struct cgroup_root *root; | 1975 | struct cgroup_root *root; |
@@ -1997,6 +1986,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1997 | if (!use_task_css_set_links) | 1986 | if (!use_task_css_set_links) |
1998 | cgroup_enable_task_cg_lists(); | 1987 | cgroup_enable_task_cg_lists(); |
1999 | 1988 | ||
1989 | if (is_v2) { | ||
1990 | if (data) { | ||
1991 | pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); | ||
1992 | return ERR_PTR(-EINVAL); | ||
1993 | } | ||
1994 | cgrp_dfl_root_visible = true; | ||
1995 | root = &cgrp_dfl_root; | ||
1996 | cgroup_get(&root->cgrp); | ||
1997 | goto out_mount; | ||
1998 | } | ||
1999 | |||
2000 | mutex_lock(&cgroup_mutex); | 2000 | mutex_lock(&cgroup_mutex); |
2001 | 2001 | ||
2002 | /* First find the desired set of subsystems */ | 2002 | /* First find the desired set of subsystems */ |
@@ -2004,15 +2004,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
2004 | if (ret) | 2004 | if (ret) |
2005 | goto out_unlock; | 2005 | goto out_unlock; |
2006 | 2006 | ||
2007 | /* look for a matching existing root */ | ||
2008 | if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { | ||
2009 | cgrp_dfl_root_visible = true; | ||
2010 | root = &cgrp_dfl_root; | ||
2011 | cgroup_get(&root->cgrp); | ||
2012 | ret = 0; | ||
2013 | goto out_unlock; | ||
2014 | } | ||
2015 | |||
2016 | /* | 2007 | /* |
2017 | * Destruction of cgroup root is asynchronous, so subsystems may | 2008 | * Destruction of cgroup root is asynchronous, so subsystems may |
2018 | * still be dying after the previous unmount. Let's drain the | 2009 | * still be dying after the previous unmount. Let's drain the |
@@ -2123,9 +2114,10 @@ out_free: | |||
2123 | 2114 | ||
2124 | if (ret) | 2115 | if (ret) |
2125 | return ERR_PTR(ret); | 2116 | return ERR_PTR(ret); |
2126 | 2117 | out_mount: | |
2127 | dentry = kernfs_mount(fs_type, flags, root->kf_root, | 2118 | dentry = kernfs_mount(fs_type, flags, root->kf_root, |
2128 | CGROUP_SUPER_MAGIC, &new_sb); | 2119 | is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, |
2120 | &new_sb); | ||
2129 | if (IS_ERR(dentry) || !new_sb) | 2121 | if (IS_ERR(dentry) || !new_sb) |
2130 | cgroup_put(&root->cgrp); | 2122 | cgroup_put(&root->cgrp); |
2131 | 2123 | ||
@@ -2168,6 +2160,12 @@ static struct file_system_type cgroup_fs_type = { | |||
2168 | .kill_sb = cgroup_kill_sb, | 2160 | .kill_sb = cgroup_kill_sb, |
2169 | }; | 2161 | }; |
2170 | 2162 | ||
2163 | static struct file_system_type cgroup2_fs_type = { | ||
2164 | .name = "cgroup2", | ||
2165 | .mount = cgroup_mount, | ||
2166 | .kill_sb = cgroup_kill_sb, | ||
2167 | }; | ||
2168 | |||
2171 | /** | 2169 | /** |
2172 | * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy | 2170 | * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy |
2173 | * @task: target task | 2171 | * @task: target task |
@@ -4039,7 +4037,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
4039 | goto out_err; | 4037 | goto out_err; |
4040 | 4038 | ||
4041 | /* | 4039 | /* |
4042 | * Migrate tasks one-by-one until @form is empty. This fails iff | 4040 | * Migrate tasks one-by-one until @from is empty. This fails iff |
4043 | * ->can_attach() fails. | 4041 | * ->can_attach() fails. |
4044 | */ | 4042 | */ |
4045 | do { | 4043 | do { |
@@ -5171,7 +5169,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) | |||
5171 | { | 5169 | { |
5172 | struct cgroup_subsys_state *css; | 5170 | struct cgroup_subsys_state *css; |
5173 | 5171 | ||
5174 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 5172 | pr_debug("Initializing cgroup subsys %s\n", ss->name); |
5175 | 5173 | ||
5176 | mutex_lock(&cgroup_mutex); | 5174 | mutex_lock(&cgroup_mutex); |
5177 | 5175 | ||
@@ -5329,6 +5327,7 @@ int __init cgroup_init(void) | |||
5329 | 5327 | ||
5330 | WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); | 5328 | WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); |
5331 | WARN_ON(register_filesystem(&cgroup_fs_type)); | 5329 | WARN_ON(register_filesystem(&cgroup_fs_type)); |
5330 | WARN_ON(register_filesystem(&cgroup2_fs_type)); | ||
5332 | WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); | 5331 | WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); |
5333 | 5332 | ||
5334 | return 0; | 5333 | return 0; |
@@ -5472,19 +5471,6 @@ static const struct file_operations proc_cgroupstats_operations = { | |||
5472 | .release = single_release, | 5471 | .release = single_release, |
5473 | }; | 5472 | }; |
5474 | 5473 | ||
5475 | static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) | ||
5476 | { | ||
5477 | if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) | ||
5478 | return &ss_priv[i - CGROUP_CANFORK_START]; | ||
5479 | return NULL; | ||
5480 | } | ||
5481 | |||
5482 | static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) | ||
5483 | { | ||
5484 | void **private = subsys_canfork_priv_p(ss_priv, i); | ||
5485 | return private ? *private : NULL; | ||
5486 | } | ||
5487 | |||
5488 | /** | 5474 | /** |
5489 | * cgroup_fork - initialize cgroup related fields during copy_process() | 5475 | * cgroup_fork - initialize cgroup related fields during copy_process() |
5490 | * @child: pointer to task_struct of forking parent process. | 5476 | * @child: pointer to task_struct of forking parent process. |
@@ -5507,14 +5493,13 @@ void cgroup_fork(struct task_struct *child) | |||
5507 | * returns an error, the fork aborts with that error code. This allows for | 5493 | * returns an error, the fork aborts with that error code. This allows for |
5508 | * a cgroup subsystem to conditionally allow or deny new forks. | 5494 | * a cgroup subsystem to conditionally allow or deny new forks. |
5509 | */ | 5495 | */ |
5510 | int cgroup_can_fork(struct task_struct *child, | 5496 | int cgroup_can_fork(struct task_struct *child) |
5511 | void *ss_priv[CGROUP_CANFORK_COUNT]) | ||
5512 | { | 5497 | { |
5513 | struct cgroup_subsys *ss; | 5498 | struct cgroup_subsys *ss; |
5514 | int i, j, ret; | 5499 | int i, j, ret; |
5515 | 5500 | ||
5516 | for_each_subsys_which(ss, i, &have_canfork_callback) { | 5501 | for_each_subsys_which(ss, i, &have_canfork_callback) { |
5517 | ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); | 5502 | ret = ss->can_fork(child); |
5518 | if (ret) | 5503 | if (ret) |
5519 | goto out_revert; | 5504 | goto out_revert; |
5520 | } | 5505 | } |
@@ -5526,7 +5511,7 @@ out_revert: | |||
5526 | if (j >= i) | 5511 | if (j >= i) |
5527 | break; | 5512 | break; |
5528 | if (ss->cancel_fork) | 5513 | if (ss->cancel_fork) |
5529 | ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); | 5514 | ss->cancel_fork(child); |
5530 | } | 5515 | } |
5531 | 5516 | ||
5532 | return ret; | 5517 | return ret; |
@@ -5539,15 +5524,14 @@ out_revert: | |||
5539 | * This calls the cancel_fork() callbacks if a fork failed *after* | 5524 | * This calls the cancel_fork() callbacks if a fork failed *after* |
5540 | * cgroup_can_fork() succeeded. | 5525 | * cgroup_can_fork() succeeded. |
5541 | */ | 5526 | */ |
5542 | void cgroup_cancel_fork(struct task_struct *child, | 5527 | void cgroup_cancel_fork(struct task_struct *child) |
5543 | void *ss_priv[CGROUP_CANFORK_COUNT]) | ||
5544 | { | 5528 | { |
5545 | struct cgroup_subsys *ss; | 5529 | struct cgroup_subsys *ss; |
5546 | int i; | 5530 | int i; |
5547 | 5531 | ||
5548 | for_each_subsys(ss, i) | 5532 | for_each_subsys(ss, i) |
5549 | if (ss->cancel_fork) | 5533 | if (ss->cancel_fork) |
5550 | ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); | 5534 | ss->cancel_fork(child); |
5551 | } | 5535 | } |
5552 | 5536 | ||
5553 | /** | 5537 | /** |
@@ -5560,8 +5544,7 @@ void cgroup_cancel_fork(struct task_struct *child, | |||
5560 | * cgroup_task_iter_start() - to guarantee that the new task ends up on its | 5544 | * cgroup_task_iter_start() - to guarantee that the new task ends up on its |
5561 | * list. | 5545 | * list. |
5562 | */ | 5546 | */ |
5563 | void cgroup_post_fork(struct task_struct *child, | 5547 | void cgroup_post_fork(struct task_struct *child) |
5564 | void *old_ss_priv[CGROUP_CANFORK_COUNT]) | ||
5565 | { | 5548 | { |
5566 | struct cgroup_subsys *ss; | 5549 | struct cgroup_subsys *ss; |
5567 | int i; | 5550 | int i; |
@@ -5605,7 +5588,7 @@ void cgroup_post_fork(struct task_struct *child, | |||
5605 | * and addition to css_set. | 5588 | * and addition to css_set. |
5606 | */ | 5589 | */ |
5607 | for_each_subsys_which(ss, i, &have_fork_callback) | 5590 | for_each_subsys_which(ss, i, &have_fork_callback) |
5608 | ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); | 5591 | ss->fork(child); |
5609 | } | 5592 | } |
5610 | 5593 | ||
5611 | /** | 5594 | /** |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2d3df82c54f2..1b72d56edce5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset) | |||
200 | * to do anything as freezer_attach() will put @task into the appropriate | 200 | * to do anything as freezer_attach() will put @task into the appropriate |
201 | * state. | 201 | * state. |
202 | */ | 202 | */ |
203 | static void freezer_fork(struct task_struct *task, void *private) | 203 | static void freezer_fork(struct task_struct *task) |
204 | { | 204 | { |
205 | struct freezer *freezer; | 205 | struct freezer *freezer; |
206 | 206 | ||
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index b50d5a167fda..303097b37429 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c | |||
@@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num) | |||
134 | * | 134 | * |
135 | * This function follows the set limit. It will fail if the charge would cause | 135 | * This function follows the set limit. It will fail if the charge would cause |
136 | * the new value to exceed the hierarchical limit. Returns 0 if the charge | 136 | * the new value to exceed the hierarchical limit. Returns 0 if the charge |
137 | * succeded, otherwise -EAGAIN. | 137 | * succeeded, otherwise -EAGAIN. |
138 | */ | 138 | */ |
139 | static int pids_try_charge(struct pids_cgroup *pids, int num) | 139 | static int pids_try_charge(struct pids_cgroup *pids, int num) |
140 | { | 140 | { |
@@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) | |||
209 | * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies | 209 | * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies |
210 | * on threadgroup_change_begin() held by the copy_process(). | 210 | * on threadgroup_change_begin() held by the copy_process(). |
211 | */ | 211 | */ |
212 | static int pids_can_fork(struct task_struct *task, void **priv_p) | 212 | static int pids_can_fork(struct task_struct *task) |
213 | { | 213 | { |
214 | struct cgroup_subsys_state *css; | 214 | struct cgroup_subsys_state *css; |
215 | struct pids_cgroup *pids; | 215 | struct pids_cgroup *pids; |
@@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p) | |||
219 | return pids_try_charge(pids, 1); | 219 | return pids_try_charge(pids, 1); |
220 | } | 220 | } |
221 | 221 | ||
222 | static void pids_cancel_fork(struct task_struct *task, void *priv) | 222 | static void pids_cancel_fork(struct task_struct *task) |
223 | { | 223 | { |
224 | struct cgroup_subsys_state *css; | 224 | struct cgroup_subsys_state *css; |
225 | struct pids_cgroup *pids; | 225 | struct pids_cgroup *pids; |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02a8ea5c9963..3e945fcd8179 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/stat.h> | 51 | #include <linux/stat.h> |
52 | #include <linux/string.h> | 52 | #include <linux/string.h> |
53 | #include <linux/time.h> | 53 | #include <linux/time.h> |
54 | #include <linux/time64.h> | ||
54 | #include <linux/backing-dev.h> | 55 | #include <linux/backing-dev.h> |
55 | #include <linux/sort.h> | 56 | #include <linux/sort.h> |
56 | 57 | ||
@@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; | |||
68 | struct fmeter { | 69 | struct fmeter { |
69 | int cnt; /* unprocessed events count */ | 70 | int cnt; /* unprocessed events count */ |
70 | int val; /* most recent output value */ | 71 | int val; /* most recent output value */ |
71 | time_t time; /* clock (secs) when val computed */ | 72 | time64_t time; /* clock (secs) when val computed */ |
72 | spinlock_t lock; /* guards read or write of above */ | 73 | spinlock_t lock; /* guards read or write of above */ |
73 | }; | 74 | }; |
74 | 75 | ||
@@ -1374,7 +1375,7 @@ out: | |||
1374 | */ | 1375 | */ |
1375 | 1376 | ||
1376 | #define FM_COEF 933 /* coefficient for half-life of 10 secs */ | 1377 | #define FM_COEF 933 /* coefficient for half-life of 10 secs */ |
1377 | #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ | 1378 | #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ |
1378 | #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ | 1379 | #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ |
1379 | #define FM_SCALE 1000 /* faux fixed point scale */ | 1380 | #define FM_SCALE 1000 /* faux fixed point scale */ |
1380 | 1381 | ||
@@ -1390,8 +1391,11 @@ static void fmeter_init(struct fmeter *fmp) | |||
1390 | /* Internal meter update - process cnt events and update value */ | 1391 | /* Internal meter update - process cnt events and update value */ |
1391 | static void fmeter_update(struct fmeter *fmp) | 1392 | static void fmeter_update(struct fmeter *fmp) |
1392 | { | 1393 | { |
1393 | time_t now = get_seconds(); | 1394 | time64_t now; |
1394 | time_t ticks = now - fmp->time; | 1395 | u32 ticks; |
1396 | |||
1397 | now = ktime_get_seconds(); | ||
1398 | ticks = now - fmp->time; | ||
1395 | 1399 | ||
1396 | if (ticks == 0) | 1400 | if (ticks == 0) |
1397 | return; | 1401 | return; |
diff --git a/kernel/fork.c b/kernel/fork.c index 291b08cc817b..6774e6b2e96d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1250,7 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1250 | { | 1250 | { |
1251 | int retval; | 1251 | int retval; |
1252 | struct task_struct *p; | 1252 | struct task_struct *p; |
1253 | void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; | ||
1254 | 1253 | ||
1255 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | 1254 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) |
1256 | return ERR_PTR(-EINVAL); | 1255 | return ERR_PTR(-EINVAL); |
@@ -1527,7 +1526,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1527 | * between here and cgroup_post_fork() if an organisation operation is in | 1526 | * between here and cgroup_post_fork() if an organisation operation is in |
1528 | * progress. | 1527 | * progress. |
1529 | */ | 1528 | */ |
1530 | retval = cgroup_can_fork(p, cgrp_ss_priv); | 1529 | retval = cgroup_can_fork(p); |
1531 | if (retval) | 1530 | if (retval) |
1532 | goto bad_fork_free_pid; | 1531 | goto bad_fork_free_pid; |
1533 | 1532 | ||
@@ -1609,7 +1608,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1609 | write_unlock_irq(&tasklist_lock); | 1608 | write_unlock_irq(&tasklist_lock); |
1610 | 1609 | ||
1611 | proc_fork_connector(p); | 1610 | proc_fork_connector(p); |
1612 | cgroup_post_fork(p, cgrp_ss_priv); | 1611 | cgroup_post_fork(p); |
1613 | threadgroup_change_end(current); | 1612 | threadgroup_change_end(current); |
1614 | perf_event_fork(p); | 1613 | perf_event_fork(p); |
1615 | 1614 | ||
@@ -1619,7 +1618,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1619 | return p; | 1618 | return p; |
1620 | 1619 | ||
1621 | bad_fork_cancel_cgroup: | 1620 | bad_fork_cancel_cgroup: |
1622 | cgroup_cancel_fork(p, cgrp_ss_priv); | 1621 | cgroup_cancel_fork(p); |
1623 | bad_fork_free_pid: | 1622 | bad_fork_free_pid: |
1624 | if (pid != &init_struct_pid) | 1623 | if (pid != &init_struct_pid) |
1625 | free_pid(pid); | 1624 | free_pid(pid); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 77d97a6fc715..44253adb3c36 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -8342,7 +8342,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
8342 | sched_offline_group(tg); | 8342 | sched_offline_group(tg); |
8343 | } | 8343 | } |
8344 | 8344 | ||
8345 | static void cpu_cgroup_fork(struct task_struct *task, void *private) | 8345 | static void cpu_cgroup_fork(struct task_struct *task) |
8346 | { | 8346 | { |
8347 | sched_move_task(task); | 8347 | sched_move_task(task); |
8348 | } | 8348 | } |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc10620967c7..14cb1db4c52b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -4813,7 +4813,7 @@ static void mem_cgroup_clear_mc(void) | |||
4813 | static int mem_cgroup_can_attach(struct cgroup_taskset *tset) | 4813 | static int mem_cgroup_can_attach(struct cgroup_taskset *tset) |
4814 | { | 4814 | { |
4815 | struct cgroup_subsys_state *css; | 4815 | struct cgroup_subsys_state *css; |
4816 | struct mem_cgroup *memcg; | 4816 | struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ |
4817 | struct mem_cgroup *from; | 4817 | struct mem_cgroup *from; |
4818 | struct task_struct *leader, *p; | 4818 | struct task_struct *leader, *p; |
4819 | struct mm_struct *mm; | 4819 | struct mm_struct *mm; |