aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroup-v1/00-INDEX (renamed from Documentation/cgroups/00-INDEX)2
-rw-r--r--Documentation/cgroup-v1/blkio-controller.txt (renamed from Documentation/cgroups/blkio-controller.txt)82
-rw-r--r--Documentation/cgroup-v1/cgroups.txt (renamed from Documentation/cgroups/cgroups.txt)0
-rw-r--r--Documentation/cgroup-v1/cpuacct.txt (renamed from Documentation/cgroups/cpuacct.txt)0
-rw-r--r--Documentation/cgroup-v1/cpusets.txt (renamed from Documentation/cgroups/cpusets.txt)0
-rw-r--r--Documentation/cgroup-v1/devices.txt (renamed from Documentation/cgroups/devices.txt)0
-rw-r--r--Documentation/cgroup-v1/freezer-subsystem.txt (renamed from Documentation/cgroups/freezer-subsystem.txt)0
-rw-r--r--Documentation/cgroup-v1/hugetlb.txt (renamed from Documentation/cgroups/hugetlb.txt)0
-rw-r--r--Documentation/cgroup-v1/memcg_test.txt (renamed from Documentation/cgroups/memcg_test.txt)0
-rw-r--r--Documentation/cgroup-v1/memory.txt (renamed from Documentation/cgroups/memory.txt)0
-rw-r--r--Documentation/cgroup-v1/net_cls.txt (renamed from Documentation/cgroups/net_cls.txt)0
-rw-r--r--Documentation/cgroup-v1/net_prio.txt (renamed from Documentation/cgroups/net_prio.txt)0
-rw-r--r--Documentation/cgroup-v1/pids.txt (renamed from Documentation/cgroups/pids.txt)0
-rw-r--r--Documentation/cgroup-v2.txt1293
-rw-r--r--Documentation/cgroups/unified-hierarchy.txt647
-rw-r--r--include/linux/cgroup-defs.h13
-rw-r--r--include/linux/cgroup.h19
-rw-r--r--include/linux/cgroup_subsys.h18
-rw-r--r--include/uapi/linux/magic.h1
-rw-r--r--init/Kconfig241
-rw-r--r--kernel/cgroup.c81
-rw-r--r--kernel/cgroup_freezer.c2
-rw-r--r--kernel/cgroup_pids.c6
-rw-r--r--kernel/cpuset.c12
-rw-r--r--kernel/fork.c7
-rw-r--r--kernel/sched/core.c2
-rw-r--r--mm/memcontrol.c2
27 files changed, 1467 insertions, 961 deletions
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroup-v1/00-INDEX
index 3f5a40f57d4a..6ad425f7cf56 100644
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroup-v1/00-INDEX
@@ -24,7 +24,5 @@ net_prio.txt
24 - Network priority cgroups details and usages. 24 - Network priority cgroups details and usages.
25pids.txt 25pids.txt
26 - Process number cgroups details and usages. 26 - Process number cgroups details and usages.
27resource_counter.txt
28 - Resource Counter API.
29unified-hierarchy.txt 27unified-hierarchy.txt
30 - Description of the new/next cgroup interface. 28 - Description of the new/next cgroup interface.
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.txt
index 52fa9f353342..673dc34d3f78 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroup-v1/blkio-controller.txt
@@ -84,8 +84,7 @@ Throttling/Upper Limit policy
84 84
85- Run dd to read a file and see if rate is throttled to 1MB/s or not. 85- Run dd to read a file and see if rate is throttled to 1MB/s or not.
86 86
87 # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 87 # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
88 # iflag=direct
89 1024+0 records in 88 1024+0 records in
90 1024+0 records out 89 1024+0 records out
91 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s 90 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
@@ -374,82 +373,3 @@ One can experience an overall throughput drop if you have created multiple
374groups and put applications in that group which are not driving enough 373groups and put applications in that group which are not driving enough
375IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle 374IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
376on individual groups and throughput should improve. 375on individual groups and throughput should improve.
377
378Writeback
379=========
380
381Page cache is dirtied through buffered writes and shared mmaps and
382written asynchronously to the backing filesystem by the writeback
383mechanism. Writeback sits between the memory and IO domains and
384regulates the proportion of dirty memory by balancing dirtying and
385write IOs.
386
387On traditional cgroup hierarchies, relationships between different
388controllers cannot be established making it impossible for writeback
389to operate accounting for cgroup resource restrictions and all
390writeback IOs are attributed to the root cgroup.
391
392If both the blkio and memory controllers are used on the v2 hierarchy
393and the filesystem supports cgroup writeback, writeback operations
394correctly follow the resource restrictions imposed by both memory and
395blkio controllers.
396
397Writeback examines both system-wide and per-cgroup dirty memory status
398and enforces the more restrictive of the two. Also, writeback control
399parameters which are absolute values - vm.dirty_bytes and
400vm.dirty_background_bytes - are distributed across cgroups according
401to their current writeback bandwidth.
402
403There's a peculiarity stemming from the discrepancy in ownership
404granularity between memory controller and writeback. While memory
405controller tracks ownership per page, writeback operates on inode
406basis. cgroup writeback bridges the gap by tracking ownership by
407inode but migrating ownership if too many foreign pages, pages which
408don't match the current inode ownership, have been encountered while
409writing back the inode.
410
411This is a conscious design choice as writeback operations are
412inherently tied to inodes making strictly following page ownership
413complicated and inefficient. The only use case which suffers from
414this compromise is multiple cgroups concurrently dirtying disjoint
415regions of the same inode, which is an unlikely use case and decided
416to be unsupported. Note that as memory controller assigns page
417ownership on the first use and doesn't update it until the page is
418released, even if cgroup writeback strictly follows page ownership,
419multiple cgroups dirtying overlapping areas wouldn't work as expected.
420In general, write-sharing an inode across multiple cgroups is not well
421supported.
422
423Filesystem support for cgroup writeback
424---------------------------------------
425
426A filesystem can make writeback IOs cgroup-aware by updating
427address_space_operations->writepage[s]() to annotate bio's using the
428following two functions.
429
430* wbc_init_bio(@wbc, @bio)
431
432 Should be called for each bio carrying writeback data and associates
433 the bio with the inode's owner cgroup. Can be called anytime
434 between bio allocation and submission.
435
436* wbc_account_io(@wbc, @page, @bytes)
437
438 Should be called for each data segment being written out. While
439 this function doesn't care exactly when it's called during the
440 writeback session, it's the easiest and most natural to call it as
441 data segments are added to a bio.
442
443With writeback bio's annotated, cgroup support can be enabled per
444super_block by setting MS_CGROUPWB in ->s_flags. This allows for
445selective disabling of cgroup writeback support which is helpful when
446certain filesystem features, e.g. journaled data mode, are
447incompatible.
448
449wbc_init_bio() binds the specified bio to its cgroup. Depending on
450the configuration, the bio may be executed at a lower priority and if
451the writeback session is holding shared resources, e.g. a journal
452entry, may lead to priority inversion. There is no one easy solution
453for the problem. Filesystems can try to work around specific problem
454cases by skipping wbc_init_bio() or using bio_associate_blkcg()
455directly.
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroup-v1/cgroups.txt
index c6256ae9885b..c6256ae9885b 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroup-v1/cgroups.txt
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroup-v1/cpuacct.txt
index 9d73cc0cadb9..9d73cc0cadb9 100644
--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroup-v1/cpuacct.txt
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroup-v1/cpusets.txt
index fdf7dff3f607..fdf7dff3f607 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroup-v1/cpusets.txt
diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroup-v1/devices.txt
index 3c1095ca02ea..3c1095ca02ea 100644
--- a/Documentation/cgroups/devices.txt
+++ b/Documentation/cgroup-v1/devices.txt
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroup-v1/freezer-subsystem.txt
index e831cb2b8394..e831cb2b8394 100644
--- a/Documentation/cgroups/freezer-subsystem.txt
+++ b/Documentation/cgroup-v1/freezer-subsystem.txt
diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroup-v1/hugetlb.txt
index 106245c3aecc..106245c3aecc 100644
--- a/Documentation/cgroups/hugetlb.txt
+++ b/Documentation/cgroup-v1/hugetlb.txt
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt
index 8870b0212150..8870b0212150 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroup-v1/memcg_test.txt
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroup-v1/memory.txt
index ff71e16cc752..ff71e16cc752 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroup-v1/memory.txt
diff --git a/Documentation/cgroups/net_cls.txt b/Documentation/cgroup-v1/net_cls.txt
index ec182346dea2..ec182346dea2 100644
--- a/Documentation/cgroups/net_cls.txt
+++ b/Documentation/cgroup-v1/net_cls.txt
diff --git a/Documentation/cgroups/net_prio.txt b/Documentation/cgroup-v1/net_prio.txt
index a82cbd28ea8a..a82cbd28ea8a 100644
--- a/Documentation/cgroups/net_prio.txt
+++ b/Documentation/cgroup-v1/net_prio.txt
diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroup-v1/pids.txt
index 1a078b5d281a..1a078b5d281a 100644
--- a/Documentation/cgroups/pids.txt
+++ b/Documentation/cgroup-v1/pids.txt
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
new file mode 100644
index 000000000000..31d1f7bf12a1
--- /dev/null
+++ b/Documentation/cgroup-v2.txt
@@ -0,0 +1,1293 @@
1
2Control Group v2
3
4October, 2015 Tejun Heo <tj@kernel.org>
5
6This is the authoritative documentation on the design, interface and
7conventions of cgroup v2. It describes all userland-visible aspects
8of cgroup including core and specific controller behaviors. All
9future changes must be reflected in this document. Documentation for
10v1 is available under Documentation/cgroup-v1/.
11
12CONTENTS
13
141. Introduction
15 1-1. Terminology
16 1-2. What is cgroup?
172. Basic Operations
18 2-1. Mounting
19 2-2. Organizing Processes
20 2-3. [Un]populated Notification
21 2-4. Controlling Controllers
22 2-4-1. Enabling and Disabling
23 2-4-2. Top-down Constraint
24 2-4-3. No Internal Process Constraint
25 2-5. Delegation
26 2-5-1. Model of Delegation
27 2-5-2. Delegation Containment
28 2-6. Guidelines
29 2-6-1. Organize Once and Control
30 2-6-2. Avoid Name Collisions
313. Resource Distribution Models
32 3-1. Weights
33 3-2. Limits
34 3-3. Protections
35 3-4. Allocations
364. Interface Files
37 4-1. Format
38 4-2. Conventions
39 4-3. Core Interface Files
405. Controllers
41 5-1. CPU
42 5-1-1. CPU Interface Files
43 5-2. Memory
44 5-2-1. Memory Interface Files
45 5-2-2. Usage Guidelines
46 5-2-3. Memory Ownership
47 5-3. IO
48 5-3-1. IO Interface Files
49 5-3-2. Writeback
50P. Information on Kernel Programming
51 P-1. Filesystem Support for Writeback
52D. Deprecated v1 Core Features
53R. Issues with v1 and Rationales for v2
54 R-1. Multiple Hierarchies
55 R-2. Thread Granularity
56 R-3. Competition Between Inner Nodes and Threads
57 R-4. Other Interface Issues
58 R-5. Controller Issues and Remedies
59 R-5-1. Memory
60
61
621. Introduction
63
641-1. Terminology
65
66"cgroup" stands for "control group" and is never capitalized. The
67singular form is used to designate the whole feature and also as a
68qualifier as in "cgroup controllers". When explicitly referring to
69multiple individual control groups, the plural form "cgroups" is used.
70
71
721-2. What is cgroup?
73
74cgroup is a mechanism to organize processes hierarchically and
75distribute system resources along the hierarchy in a controlled and
76configurable manner.
77
78cgroup is largely composed of two parts - the core and controllers.
79cgroup core is primarily responsible for hierarchically organizing
80processes. A cgroup controller is usually responsible for
81distributing a specific type of system resource along the hierarchy
82although there are utility controllers which serve purposes other than
83resource distribution.
84
85cgroups form a tree structure and every process in the system belongs
86to one and only one cgroup. All threads of a process belong to the
87same cgroup. On creation, all processes are put in the cgroup that
88the parent process belongs to at the time. A process can be migrated
89to another cgroup. Migration of a process doesn't affect already
90existing descendant processes.
91
92Following certain structural constraints, controllers may be enabled or
93disabled selectively on a cgroup. All controller behaviors are
94hierarchical - if a controller is enabled on a cgroup, it affects all
95processes which belong to the cgroups comprising the inclusive
96sub-hierarchy of the cgroup. When a controller is enabled on a nested
97cgroup, it always restricts the resource distribution further. The
98restrictions set closer to the root in the hierarchy can not be
99overridden from further away.
100
101
1022. Basic Operations
103
1042-1. Mounting
105
106Unlike v1, cgroup v2 has only single hierarchy. The cgroup v2
107hierarchy can be mounted with the following mount command.
108
109 # mount -t cgroup2 none $MOUNT_POINT
110
111cgroup2 filesystem has the magic number 0x63677270 ("cgrp"). All
112controllers which support v2 and are not bound to a v1 hierarchy are
113automatically bound to the v2 hierarchy and show up at the root.
114Controllers which are not in active use in the v2 hierarchy can be
115bound to other hierarchies. This allows mixing v2 hierarchy with the
116legacy v1 multiple hierarchies in a fully backward compatible way.
117
118A controller can be moved across hierarchies only after the controller
119is no longer referenced in its current hierarchy. Because per-cgroup
120controller states are destroyed asynchronously and controllers may
121have lingering references, a controller may not show up immediately on
122the v2 hierarchy after the final umount of the previous hierarchy.
123Similarly, a controller should be fully disabled to be moved out of
124the unified hierarchy and it may take some time for the disabled
125controller to become available for other hierarchies; furthermore, due
126to inter-controller dependencies, other controllers may need to be
127disabled too.
128
129While useful for development and manual configurations, moving
130controllers dynamically between the v2 and other hierarchies is
131strongly discouraged for production use. It is recommended to decide
132the hierarchies and controller associations before starting using the
133controllers after system boot.
134
135
1362-2. Organizing Processes
137
138Initially, only the root cgroup exists to which all processes belong.
139A child cgroup can be created by creating a sub-directory.
140
141 # mkdir $CGROUP_NAME
142
143A given cgroup may have multiple child cgroups forming a tree
144structure. Each cgroup has a read-writable interface file
145"cgroup.procs". When read, it lists the PIDs of all processes which
146belong to the cgroup one-per-line. The PIDs are not ordered and the
147same PID may show up more than once if the process got moved to
148another cgroup and then back or the PID got recycled while reading.
149
150A process can be migrated into a cgroup by writing its PID to the
151target cgroup's "cgroup.procs" file. Only one process can be migrated
152on a single write(2) call. If a process is composed of multiple
153threads, writing the PID of any thread migrates all threads of the
154process.
155
156When a process forks a child process, the new process is born into the
157cgroup that the forking process belongs to at the time of the
158operation. After exit, a process stays associated with the cgroup
159that it belonged to at the time of exit until it's reaped; however, a
160zombie process does not appear in "cgroup.procs" and thus can't be
161moved to another cgroup.
162
163A cgroup which doesn't have any children or live processes can be
164destroyed by removing the directory. Note that a cgroup which doesn't
165have any children and is associated only with zombie processes is
166considered empty and can be removed.
167
168 # rmdir $CGROUP_NAME
169
170"/proc/$PID/cgroup" lists a process's cgroup membership. If legacy
171cgroup is in use in the system, this file may contain multiple lines,
172one for each hierarchy. The entry for cgroup v2 is always in the
173format "0::$PATH".
174
175 # cat /proc/842/cgroup
176 ...
177 0::/test-cgroup/test-cgroup-nested
178
179If the process becomes a zombie and the cgroup it was associated with
180is removed subsequently, " (deleted)" is appended to the path.
181
182 # cat /proc/842/cgroup
183 ...
184 0::/test-cgroup/test-cgroup-nested (deleted)
185
186
1872-3. [Un]populated Notification
188
189Each non-root cgroup has a "cgroup.events" file which contains
190"populated" field indicating whether the cgroup's sub-hierarchy has
191live processes in it. Its value is 0 if there is no live process in
192the cgroup and its descendants; otherwise, 1. poll and [id]notify
193events are triggered when the value changes. This can be used, for
194example, to start a clean-up operation after all processes of a given
195sub-hierarchy have exited. The populated state updates and
196notifications are recursive. Consider the following sub-hierarchy
197where the numbers in the parentheses represent the numbers of processes
198in each cgroup.
199
200 A(4) - B(0) - C(1)
201 \ D(0)
202
203A, B and C's "populated" fields would be 1 while D's 0. After the one
204process in C exits, B and C's "populated" fields would flip to "0" and
205file modified events will be generated on the "cgroup.events" files of
206both cgroups.
207
208
2092-4. Controlling Controllers
210
2112-4-1. Enabling and Disabling
212
213Each cgroup has a "cgroup.controllers" file which lists all
214controllers available for the cgroup to enable.
215
216 # cat cgroup.controllers
217 cpu io memory
218
219No controller is enabled by default. Controllers can be enabled and
220disabled by writing to the "cgroup.subtree_control" file.
221
222 # echo "+cpu +memory -io" > cgroup.subtree_control
223
224Only controllers which are listed in "cgroup.controllers" can be
225enabled. When multiple operations are specified as above, either they
226all succeed or fail. If multiple operations on the same controller
227are specified, the last one is effective.
228
229Enabling a controller in a cgroup indicates that the distribution of
230the target resource across its immediate children will be controlled.
231Consider the following sub-hierarchy. The enabled controllers are
232listed in parentheses.
233
234 A(cpu,memory) - B(memory) - C()
235 \ D()
236
237As A has "cpu" and "memory" enabled, A will control the distribution
238of CPU cycles and memory to its children, in this case, B. As B has
239"memory" enabled but not "cpu", C and D will compete freely on CPU
240cycles but their division of memory available to B will be controlled.
241
242As a controller regulates the distribution of the target resource to
243the cgroup's children, enabling it creates the controller's interface
244files in the child cgroups. In the above example, enabling "cpu" on B
245would create the "cpu." prefixed controller interface files in C and
246D. Likewise, disabling "memory" from B would remove the "memory."
247prefixed controller interface files from C and D. This means that the
248controller interface files - anything which doesn't start with
249"cgroup." are owned by the parent rather than the cgroup itself.
250
251
2522-4-2. Top-down Constraint
253
254Resources are distributed top-down and a cgroup can further distribute
255a resource only if the resource has been distributed to it from the
256parent. This means that all non-root "cgroup.subtree_control" files
257can only contain controllers which are enabled in the parent's
258"cgroup.subtree_control" file. A controller can be enabled only if
259the parent has the controller enabled and a controller can't be
260disabled if one or more children have it enabled.
261
262
2632-4-3. No Internal Process Constraint
264
265Non-root cgroups can only distribute resources to their children when
266they don't have any processes of their own. In other words, only
267cgroups which don't contain any processes can have controllers enabled
268in their "cgroup.subtree_control" files.
269
270This guarantees that, when a controller is looking at the part of the
271hierarchy which has it enabled, processes are always only on the
272leaves. This rules out situations where child cgroups compete against
273internal processes of the parent.
274
275The root cgroup is exempt from this restriction. Root contains
276processes and anonymous resource consumption which can't be associated
277with any other cgroups and requires special treatment from most
278controllers. How resource consumption in the root cgroup is governed
279is up to each controller.
280
281Note that the restriction doesn't get in the way if there is no
282enabled controller in the cgroup's "cgroup.subtree_control". This is
283important as otherwise it wouldn't be possible to create children of a
284populated cgroup. To control resource distribution of a cgroup, the
285cgroup must create children and transfer all its processes to the
286children before enabling controllers in its "cgroup.subtree_control"
287file.
288
289
2902-5. Delegation
291
2922-5-1. Model of Delegation
293
294A cgroup can be delegated to a less privileged user by granting write
295access of the directory and its "cgroup.procs" file to the user. Note
296that resource control interface files in a given directory control the
297distribution of the parent's resources and thus must not be delegated
298along with the directory.
299
300Once delegated, the user can build sub-hierarchy under the directory,
301organize processes as it sees fit and further distribute the resources
302it received from the parent. The limits and other settings of all
303resource controllers are hierarchical and regardless of what happens
304in the delegated sub-hierarchy, nothing can escape the resource
305restrictions imposed by the parent.
306
307Currently, cgroup doesn't impose any restrictions on the number of
308cgroups in or nesting depth of a delegated sub-hierarchy; however,
309this may be limited explicitly in the future.
310
311
3122-5-2. Delegation Containment
313
314A delegated sub-hierarchy is contained in the sense that processes
315can't be moved into or out of the sub-hierarchy by the delegatee. For
316a process with a non-root euid to migrate a target process into a
317cgroup by writing its PID to the "cgroup.procs" file, the following
318conditions must be met.
319
320- The writer's euid must match either uid or suid of the target process.
321
322- The writer must have write access to the "cgroup.procs" file.
323
324- The writer must have write access to the "cgroup.procs" file of the
325 common ancestor of the source and destination cgroups.
326
327The above three constraints ensure that while a delegatee may migrate
328processes around freely in the delegated sub-hierarchy it can't pull
329in from or push out to outside the sub-hierarchy.
330
331For an example, let's assume cgroups C0 and C1 have been delegated to
332user U0 who created C00, C01 under C0 and C10 under C1 as follows and
333all processes under C0 and C1 belong to U0.
334
335 ~~~~~~~~~~~~~ - C0 - C00
336 ~ cgroup ~ \ C01
337 ~ hierarchy ~
338 ~~~~~~~~~~~~~ - C1 - C10
339
340Let's also say U0 wants to write the PID of a process which is
341currently in C10 into "C00/cgroup.procs". U0 has write access to the
342file and uid match on the process; however, the common ancestor of the
343source cgroup C10 and the destination cgroup C00 is above the points
344of delegation and U0 would not have write access to its "cgroup.procs"
345files and thus the write will be denied with -EACCES.
346
347
3482-6. Guidelines
349
3502-6-1. Organize Once and Control
351
352Migrating a process across cgroups is a relatively expensive operation
353and stateful resources such as memory are not moved together with the
354process. This is an explicit design decision as there often exist
355inherent trade-offs between migration and various hot paths in terms
356of synchronization cost.
357
358As such, migrating processes across cgroups frequently as a means to
359apply different resource restrictions is discouraged. A workload
360should be assigned to a cgroup according to the system's logical and
361resource structure once on start-up. Dynamic adjustments to resource
362distribution can be made by changing controller configuration through
363the interface files.
364
365
3662-6-2. Avoid Name Collisions
367
368Interface files for a cgroup and its children cgroups occupy the same
369directory and it is possible to create children cgroups which collide
370with interface files.
371
372All cgroup core interface files are prefixed with "cgroup." and each
373controller's interface files are prefixed with the controller name and
374a dot. A controller's name is composed of lower case alphabets and
375'_'s but never begins with an '_' so it can be used as the prefix
376character for collision avoidance. Also, interface file names won't
377start or end with terms which are often used in categorizing workloads
378such as job, service, slice, unit or workload.
379
380cgroup doesn't do anything to prevent name collisions and it's the
381user's responsibility to avoid them.
382
383
3843. Resource Distribution Models
385
386cgroup controllers implement several resource distribution schemes
387depending on the resource type and expected use cases. This section
388describes major schemes in use along with their expected behaviors.
389
390
3913-1. Weights
392
393A parent's resource is distributed by adding up the weights of all
394active children and giving each the fraction matching the ratio of its
395weight against the sum. As only children which can make use of the
396resource at the moment participate in the distribution, this is
397work-conserving. Due to the dynamic nature, this model is usually
398used for stateless resources.
399
400All weights are in the range [1, 10000] with the default at 100. This
401allows symmetric multiplicative biases in both directions at fine
402enough granularity while staying in the intuitive range.
403
404As long as the weight is in range, all configuration combinations are
405valid and there is no reason to reject configuration changes or
406process migrations.
407
408"cpu.weight" proportionally distributes CPU cycles to active children
409and is an example of this type.
410
411
4123-2. Limits
413
414A child can only consume up to the configured amount of the resource.
415Limits can be over-committed - the sum of the limits of children can
416exceed the amount of resource available to the parent.
417
418Limits are in the range [0, max] and default to "max", which is noop.
419
420As limits can be over-committed, all configuration combinations are
421valid and there is no reason to reject configuration changes or
422process migrations.
423
424"io.max" limits the maximum BPS and/or IOPS that a cgroup can consume
425on an IO device and is an example of this type.
426
427
4283-3. Protections
429
430A cgroup is protected to be allocated up to the configured amount of
431the resource if the usages of all its ancestors are under their
432protected levels. Protections can be hard guarantees or best effort
433soft boundaries. Protections can also be over-committed in which case
434only up to the amount available to the parent is protected among
435children.
436
437Protections are in the range [0, max] and default to 0, which is
438noop.
439
440As protections can be over-committed, all configuration combinations
441are valid and there is no reason to reject configuration changes or
442process migrations.
443
444"memory.low" implements best-effort memory protection and is an
445example of this type.
446
447
4483-4. Allocations
449
450A cgroup is exclusively allocated a certain amount of a finite
451resource. Allocations can't be over-committed - the sum of the
452allocations of children can not exceed the amount of resource
453available to the parent.
454
455Allocations are in the range [0, max] and default to 0, which is no
456resource.
457
458As allocations can't be over-committed, some configuration
459combinations are invalid and should be rejected. Also, if the
460resource is mandatory for execution of processes, process migrations
461may be rejected.
462
463"cpu.rt.max" hard-allocates realtime slices and is an example of this
464type.
465
466
4674. Interface Files
468
4694-1. Format
470
471All interface files should be in one of the following formats whenever
472possible.
473
474 New-line separated values
475 (when only one value can be written at once)
476
477 VAL0\n
478 VAL1\n
479 ...
480
481 Space separated values
482 (when read-only or multiple values can be written at once)
483
484 VAL0 VAL1 ...\n
485
486 Flat keyed
487
488 KEY0 VAL0\n
489 KEY1 VAL1\n
490 ...
491
492 Nested keyed
493
494 KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
495 KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
496 ...
497
498For a writable file, the format for writing should generally match
499reading; however, controllers may allow omitting later fields or
500implement restricted shortcuts for most common use cases.
501
502For both flat and nested keyed files, only the values for a single key
503can be written at a time. For nested keyed files, the sub key pairs
504may be specified in any order and not all pairs have to be specified.
505
506
5074-2. Conventions
508
509- Settings for a single feature should be contained in a single file.
510
511- The root cgroup should be exempt from resource control and thus
512 shouldn't have resource control interface files. Also,
513 informational files on the root cgroup which end up showing global
514 information available elsewhere shouldn't exist.
515
516- If a controller implements weight based resource distribution, its
517 interface file should be named "weight" and have the range [1,
518 10000] with 100 as the default. The values are chosen to allow
519 enough and symmetric bias in both directions while keeping it
520 intuitive (the default is 100%).
521
522- If a controller implements an absolute resource guarantee and/or
523 limit, the interface files should be named "min" and "max"
524 respectively. If a controller implements best effort resource
525 guarantee and/or limit, the interface files should be named "low"
526 and "high" respectively.
527
528 In the above four control files, the special token "max" should be
529 used to represent upward infinity for both reading and writing.
530
531- If a setting has a configurable default value and keyed specific
532 overrides, the default entry should be keyed with "default" and
533 appear as the first entry in the file.
534
535 The default value can be updated by writing either "default $VAL" or
536 "$VAL".
537
538 When writing to update a specific override, "default" can be used as
539 the value to indicate removal of the override. Override entries
540 with "default" as the value must not appear when read.
541
542 For example, a setting which is keyed by major:minor device numbers
543 with integer values may look like the following.
544
545 # cat cgroup-example-interface-file
546 default 150
547 8:0 300
548
549 The default value can be updated by
550
551 # echo 125 > cgroup-example-interface-file
552
553 or
554
555 # echo "default 125" > cgroup-example-interface-file
556
557 An override can be set by
558
559 # echo "8:16 170" > cgroup-example-interface-file
560
561 and cleared by
562
563 # echo "8:0 default" > cgroup-example-interface-file
564 # cat cgroup-example-interface-file
565 default 125
566 8:16 170
567
568- For events which are not very high frequency, an interface file
569 "events" should be created which lists event key value pairs.
570 Whenever a notifiable event happens, file modified event should be
571 generated on the file.
572
573
5744-3. Core Interface Files
575
576All cgroup core files are prefixed with "cgroup."
577
578 cgroup.procs
579
580 A read-write new-line separated values file which exists on
581 all cgroups.
582
583 When read, it lists the PIDs of all processes which belong to
584 the cgroup one-per-line. The PIDs are not ordered and the
585 same PID may show up more than once if the process got moved
586 to another cgroup and then back or the PID got recycled while
587 reading.
588
589 A PID can be written to migrate the process associated with
590 the PID to the cgroup. The writer should match all of the
591 following conditions.
592
593	  - Its euid must be root or match either the uid or the suid of
594	    the target process.
595
596 - It must have write access to the "cgroup.procs" file.
597
598 - It must have write access to the "cgroup.procs" file of the
599 common ancestor of the source and destination cgroups.
600
601 When delegating a sub-hierarchy, write access to this file
602 should be granted along with the containing directory.
603
604 cgroup.controllers
605
606 A read-only space separated values file which exists on all
607 cgroups.
608
609 It shows space separated list of all controllers available to
610 the cgroup. The controllers are not ordered.
611
612 cgroup.subtree_control
613
614 A read-write space separated values file which exists on all
615 cgroups. Starts out empty.
616
617 When read, it shows space separated list of the controllers
618 which are enabled to control resource distribution from the
619 cgroup to its children.
620
621 Space separated list of controllers prefixed with '+' or '-'
622 can be written to enable or disable controllers. A controller
623 name prefixed with '+' enables the controller and '-'
624 disables. If a controller appears more than once on the list,
625 the last one is effective. When multiple enable and disable
626 operations are specified, either all succeed or all fail.
627
628 cgroup.events
629
630 A read-only flat-keyed file which exists on non-root cgroups.
631 The following entries are defined. Unless specified
632 otherwise, a value change in this file generates a file
633 modified event.
634
635 populated
636
637 1 if the cgroup or its descendants contains any live
638 processes; otherwise, 0.
639
640
6415. Controllers
642
6435-1. CPU
644
645[NOTE: The interface for the cpu controller hasn't been merged yet]
646
647The "cpu" controller regulates distribution of CPU cycles.  This
648controller implements weight and absolute bandwidth limit models for
649normal scheduling policy and absolute bandwidth allocation model for
650realtime scheduling policy.
651
652
6535-1-1. CPU Interface Files
654
655All time durations are in microseconds.
656
657 cpu.stat
658
659 A read-only flat-keyed file which exists on non-root cgroups.
660
661 It reports the following six stats.
662
663 usage_usec
664 user_usec
665 system_usec
666 nr_periods
667 nr_throttled
668 throttled_usec
669
670 cpu.weight
671
672 A read-write single value file which exists on non-root
673 cgroups. The default is "100".
674
675 The weight in the range [1, 10000].
676
677 cpu.max
678
679 A read-write two value file which exists on non-root cgroups.
680 The default is "max 100000".
681
682 The maximum bandwidth limit. It's in the following format.
683
684 $MAX $PERIOD
685
686	which indicates that the group may consume up to $MAX in each
687 $PERIOD duration. "max" for $MAX indicates no limit. If only
688 one number is written, $MAX is updated.
689
690 cpu.rt.max
691
692 [NOTE: The semantics of this file is still under discussion and the
693 interface hasn't been merged yet]
694
695 A read-write two value file which exists on all cgroups.
696 The default is "0 100000".
697
698 The maximum realtime runtime allocation. Over-committing
699 configurations are disallowed and process migrations are
700 rejected if not enough bandwidth is available. It's in the
701 following format.
702
703 $MAX $PERIOD
704
705	which indicates that the group may consume up to $MAX in each
706 $PERIOD duration. If only one number is written, $MAX is
707 updated.
708
709
7105-2. Memory
711
712The "memory" controller regulates distribution of memory. Memory is
713stateful and implements both limit and protection models. Due to the
714intertwining between memory usage and reclaim pressure and the
715stateful nature of memory, the distribution model is relatively
716complex.
717
718While not completely water-tight, all major memory usages by a given
719cgroup are tracked so that the total memory consumption can be
720accounted and controlled to a reasonable extent. Currently, the
721following types of memory usages are tracked.
722
723- Userland memory - page cache and anonymous memory.
724
725- Kernel data structures such as dentries and inodes.
726
727- TCP socket buffers.
728
729The above list may expand in the future for better coverage.
730
731
7325-2-1. Memory Interface Files
733
734All memory amounts are in bytes. If a value which is not aligned to
735PAGE_SIZE is written, the value may be rounded up to the closest
736PAGE_SIZE multiple when read back.
737
738 memory.current
739
740 A read-only single value file which exists on non-root
741 cgroups.
742
743 The total amount of memory currently being used by the cgroup
744 and its descendants.
745
746 memory.low
747
748 A read-write single value file which exists on non-root
749 cgroups. The default is "0".
750
751 Best-effort memory protection. If the memory usages of a
752 cgroup and all its ancestors are below their low boundaries,
753 the cgroup's memory won't be reclaimed unless memory can be
754 reclaimed from unprotected cgroups.
755
756 Putting more memory than generally available under this
757 protection is discouraged.
758
759 memory.high
760
761 A read-write single value file which exists on non-root
762 cgroups. The default is "max".
763
764 Memory usage throttle limit. This is the main mechanism to
765 control memory usage of a cgroup. If a cgroup's usage goes
766 over the high boundary, the processes of the cgroup are
767 throttled and put under heavy reclaim pressure.
768
769 Going over the high limit never invokes the OOM killer and
770 under extreme conditions the limit may be breached.
771
772 memory.max
773
774 A read-write single value file which exists on non-root
775 cgroups. The default is "max".
776
777 Memory usage hard limit. This is the final protection
778 mechanism. If a cgroup's memory usage reaches this limit and
779 can't be reduced, the OOM killer is invoked in the cgroup.
780 Under certain circumstances, the usage may go over the limit
781 temporarily.
782
783 This is the ultimate protection mechanism. As long as the
784 high limit is used and monitored properly, this limit's
785 utility is limited to providing the final safety net.
786
787 memory.events
788
789 A read-only flat-keyed file which exists on non-root cgroups.
790 The following entries are defined. Unless specified
791 otherwise, a value change in this file generates a file
792 modified event.
793
794 low
795
796 The number of times the cgroup is reclaimed due to
797 high memory pressure even though its usage is under
798 the low boundary. This usually indicates that the low
799 boundary is over-committed.
800
801 high
802
803 The number of times processes of the cgroup are
804 throttled and routed to perform direct memory reclaim
805 because the high memory boundary was exceeded. For a
806 cgroup whose memory usage is capped by the high limit
807 rather than global memory pressure, this event's
808 occurrences are expected.
809
810 max
811
812 The number of times the cgroup's memory usage was
813 about to go over the max boundary. If direct reclaim
814 fails to bring it down, the OOM killer is invoked.
815
816 oom
817
818 The number of times the OOM killer has been invoked in
819 the cgroup. This may not exactly match the number of
820 processes killed but should generally be close.
821
822
8235-2-2. General Usage
824
825"memory.high" is the main mechanism to control memory usage.
826Over-committing on high limit (sum of high limits > available memory)
827and letting global memory pressure to distribute memory according to
828usage is a viable strategy.
829
830Because breach of the high limit doesn't trigger the OOM killer but
831throttles the offending cgroup, a management agent has ample
832opportunities to monitor and take appropriate actions such as granting
833more memory or terminating the workload.
834
835Determining whether a cgroup has enough memory is not trivial as
836memory usage doesn't indicate whether the workload can benefit from
837more memory. For example, a workload which writes data received from
838network to a file can use all available memory but can also perform
839well with a small amount of memory.  A measure of memory
840pressure - how much the workload is being impacted due to lack of
841memory - is necessary to determine whether a workload needs more
842memory; unfortunately, memory pressure monitoring mechanism isn't
843implemented yet.
844
845
8465-2-3. Memory Ownership
847
848A memory area is charged to the cgroup which instantiated it and stays
849charged to the cgroup until the area is released. Migrating a process
850to a different cgroup doesn't move the memory usages that it
851instantiated while in the previous cgroup to the new cgroup.
852
853A memory area may be used by processes belonging to different cgroups.
854To which cgroup the area will be charged is indeterminate; however,
855over time, the memory area is likely to end up in a cgroup which has
856enough memory allowance to avoid high reclaim pressure.
857
858If a cgroup sweeps a considerable amount of memory which is expected
859to be accessed repeatedly by other cgroups, it may make sense to use
860POSIX_FADV_DONTNEED to relinquish the ownership of memory areas
861belonging to the affected files to ensure correct memory ownership.
862
863
8645-3. IO
865
866The "io" controller regulates the distribution of IO resources. This
867controller implements both weight based and absolute bandwidth or IOPS
868limit distribution; however, weight based distribution is available
869only if cfq-iosched is in use and neither scheme is available for
870blk-mq devices.
871
872
8735-3-1. IO Interface Files
874
875 io.stat
876
877 A read-only nested-keyed file which exists on non-root
878 cgroups.
879
880 Lines are keyed by $MAJ:$MIN device numbers and not ordered.
881 The following nested keys are defined.
882
883 rbytes Bytes read
884 wbytes Bytes written
885 rios Number of read IOs
886 wios Number of write IOs
887
888 An example read output follows.
889
890 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
891 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
892
893 io.weight
894
895 A read-write flat-keyed file which exists on non-root cgroups.
896 The default is "default 100".
897
898 The first line is the default weight applied to devices
899 without specific override. The rest are overrides keyed by
900 $MAJ:$MIN device numbers and not ordered. The weights are in
901 the range [1, 10000] and specifies the relative amount IO time
902 the cgroup can use in relation to its siblings.
903
904 The default weight can be updated by writing either "default
905 $WEIGHT" or simply "$WEIGHT". Overrides can be set by writing
906 "$MAJ:$MIN $WEIGHT" and unset by writing "$MAJ:$MIN default".
907
908 An example read output follows.
909
910 default 100
911 8:16 200
912 8:0 50
913
914 io.max
915
916 A read-write nested-keyed file which exists on non-root
917 cgroups.
918
919 BPS and IOPS based IO limit. Lines are keyed by $MAJ:$MIN
920 device numbers and not ordered. The following nested keys are
921 defined.
922
923 rbps Max read bytes per second
924 wbps Max write bytes per second
925 riops Max read IO operations per second
926 wiops Max write IO operations per second
927
928 When writing, any number of nested key-value pairs can be
929 specified in any order. "max" can be specified as the value
930 to remove a specific limit. If the same key is specified
931 multiple times, the outcome is undefined.
932
933 BPS and IOPS are measured in each IO direction and IOs are
934 delayed if limit is reached. Temporary bursts are allowed.
935
936 Setting read limit at 2M BPS and write at 120 IOPS for 8:16.
937
938 echo "8:16 rbps=2097152 wiops=120" > io.max
939
940 Reading returns the following.
941
942 8:16 rbps=2097152 wbps=max riops=max wiops=120
943
944 Write IOPS limit can be removed by writing the following.
945
946 echo "8:16 wiops=max" > io.max
947
948 Reading now returns the following.
949
950 8:16 rbps=2097152 wbps=max riops=max wiops=max
951
952
9535-3-2. Writeback
954
955Page cache is dirtied through buffered writes and shared mmaps and
956written asynchronously to the backing filesystem by the writeback
957mechanism. Writeback sits between the memory and IO domains and
958regulates the proportion of dirty memory by balancing dirtying and
959write IOs.
960
961The io controller, in conjunction with the memory controller,
962implements control of page cache writeback IOs. The memory controller
963defines the memory domain that dirty memory ratio is calculated and
964maintained for and the io controller defines the io domain which
965writes out dirty pages for the memory domain. Both system-wide and
966per-cgroup dirty memory states are examined and the more restrictive
967of the two is enforced.
968
969cgroup writeback requires explicit support from the underlying
970filesystem. Currently, cgroup writeback is implemented on ext2, ext4
971and btrfs. On other filesystems, all writeback IOs are attributed to
972the root cgroup.
973
974There are inherent differences in memory and writeback management
975which affects how cgroup ownership is tracked. Memory is tracked per
976page while writeback per inode. For the purpose of writeback, an
977inode is assigned to a cgroup and all IO requests to write dirty pages
978from the inode are attributed to that cgroup.
979
980As cgroup ownership for memory is tracked per page, there can be pages
981which are associated with different cgroups than the one the inode is
982associated with. These are called foreign pages. The writeback
983constantly keeps track of foreign pages and, if a particular foreign
984cgroup becomes the majority over a certain period of time, switches
985the ownership of the inode to that cgroup.
986
987While this model is enough for most use cases where a given inode is
988mostly dirtied by a single cgroup even when the main writing cgroup
989changes over time, use cases where multiple cgroups write to a single
990inode simultaneously are not supported well. In such circumstances, a
991significant portion of IOs are likely to be attributed incorrectly.
992As the memory controller assigns page ownership on the first use and
993doesn't update it until the page is released, even if writeback
994strictly follows page ownership, multiple cgroups dirtying overlapping
995areas wouldn't work as expected. It's recommended to avoid such usage
996patterns.
997
998The sysctl knobs which affect writeback behavior are applied to cgroup
999writeback as follows.
1000
1001 vm.dirty_background_ratio
1002 vm.dirty_ratio
1003
1004 These ratios apply the same to cgroup writeback with the
1005 amount of available memory capped by limits imposed by the
1006 memory controller and system-wide clean memory.
1007
1008 vm.dirty_background_bytes
1009 vm.dirty_bytes
1010
1011	For cgroup writeback, this is calculated into a ratio against
1012 total available memory and applied the same way as
1013 vm.dirty[_background]_ratio.
1014
1015
1016P. Information on Kernel Programming
1017
1018This section contains kernel programming information in the areas
1019where interacting with cgroup is necessary. cgroup core and
1020controllers are not covered.
1021
1022
1023P-1. Filesystem Support for Writeback
1024
1025A filesystem can support cgroup writeback by updating
1026address_space_operations->writepage[s]() to annotate bio's using the
1027following two functions.
1028
1029 wbc_init_bio(@wbc, @bio)
1030
1031 Should be called for each bio carrying writeback data and
1032 associates the bio with the inode's owner cgroup. Can be
1033 called anytime between bio allocation and submission.
1034
1035 wbc_account_io(@wbc, @page, @bytes)
1036
1037 Should be called for each data segment being written out.
1038 While this function doesn't care exactly when it's called
1039 during the writeback session, it's the easiest and most
1040 natural to call it as data segments are added to a bio.
1041
1042With writeback bio's annotated, cgroup support can be enabled per
1043super_block by setting SB_I_CGROUPWB in ->s_iflags. This allows for
1044selective disabling of cgroup writeback support which is helpful when
1045certain filesystem features, e.g. journaled data mode, are
1046incompatible.
1047
1048wbc_init_bio() binds the specified bio to its cgroup. Depending on
1049the configuration, the bio may be executed at a lower priority and if
1050the writeback session is holding shared resources, e.g. a journal
1051entry, may lead to priority inversion. There is no one easy solution
1052for the problem. Filesystems can try to work around specific problem
1053cases by skipping wbc_init_bio() or using bio_associate_blkcg()
1054directly.
1055
1056
1057D. Deprecated v1 Core Features
1058
1059- Multiple hierarchies including named ones are not supported.
1060
1061- All mount options and remounting are not supported.
1062
1063- The "tasks" file is removed and "cgroup.procs" is not sorted.
1064
1065- "cgroup.clone_children" is removed.
1066
1067- /proc/cgroups is meaningless for v2. Use "cgroup.controllers" file
1068 at the root instead.
1069
1070
1071R. Issues with v1 and Rationales for v2
1072
1073R-1. Multiple Hierarchies
1074
1075cgroup v1 allowed an arbitrary number of hierarchies and each
1076hierarchy could host any number of controllers. While this seemed to
1077provide a high level of flexibility, it wasn't useful in practice.
1078
1079For example, as there is only one instance of each controller, utility
1080type controllers such as freezer which can be useful in all
1081hierarchies could only be used in one. The issue is exacerbated by
1082the fact that controllers couldn't be moved to another hierarchy once
1083hierarchies were populated. Another issue was that all controllers
1084bound to a hierarchy were forced to have exactly the same view of the
1085hierarchy. It wasn't possible to vary the granularity depending on
1086the specific controller.
1087
1088In practice, these issues heavily limited which controllers could be
1089put on the same hierarchy and most configurations resorted to putting
1090each controller on its own hierarchy. Only closely related ones, such
1091as the cpu and cpuacct controllers, made sense to be put on the same
1092hierarchy. This often meant that userland ended up managing multiple
1093similar hierarchies repeating the same steps on each hierarchy
1094whenever a hierarchy management operation was necessary.
1095
1096Furthermore, support for multiple hierarchies came at a steep cost.
1097It greatly complicated cgroup core implementation but more importantly
1098the support for multiple hierarchies restricted how cgroup could be
1099used in general and what controllers were able to do.
1100
1101There was no limit on how many hierarchies there might be, which meant
1102that a thread's cgroup membership couldn't be described in finite
1103length. The key might contain any number of entries and was unlimited
1104in length, which made it highly awkward to manipulate and led to
1105addition of controllers which existed only to identify membership,
1106which in turn exacerbated the original problem of proliferating number
1107of hierarchies.
1108
1109Also, as a controller couldn't have any expectation regarding the
1110topologies of hierarchies other controllers might be on, each
1111controller had to assume that all other controllers were attached to
1112completely orthogonal hierarchies. This made it impossible, or at
1113least very cumbersome, for controllers to cooperate with each other.
1114
1115In most use cases, putting controllers on hierarchies which are
1116completely orthogonal to each other isn't necessary. What usually is
1117called for is the ability to have differing levels of granularity
1118depending on the specific controller. In other words, hierarchy may
1119be collapsed from leaf towards root when viewed from specific
1120controllers. For example, a given configuration might not care about
1121how memory is distributed beyond a certain level while still wanting
1122to control how CPU cycles are distributed.
1123
1124
1125R-2. Thread Granularity
1126
1127cgroup v1 allowed threads of a process to belong to different cgroups.
1128This didn't make sense for some controllers and those controllers
1129ended up implementing different ways to ignore such situations but
1130much more importantly it blurred the line between API exposed to
1131individual applications and system management interface.
1132
1133Generally, in-process knowledge is available only to the process
1134itself; thus, unlike service-level organization of processes,
1135categorizing threads of a process requires active participation from
1136the application which owns the target process.
1137
1138cgroup v1 had an ambiguously defined delegation model which got abused
1139in combination with thread granularity. cgroups were delegated to
1140individual applications so that they can create and manage their own
1141sub-hierarchies and control resource distributions along them. This
1142effectively raised cgroup to the status of a syscall-like API exposed
1143to lay programs.
1144
1145First of all, cgroup has a fundamentally inadequate interface to be
1146exposed this way. For a process to access its own knobs, it has to
1147extract the path on the target hierarchy from /proc/self/cgroup,
1148construct the path by appending the name of the knob to the path, open
1149and then read and/or write to it. This is not only extremely clunky
1150and unusual but also inherently racy. There is no conventional way to
1151define transaction across the required steps and nothing can guarantee
1152that the process would actually be operating on its own sub-hierarchy.
1153
1154cgroup controllers implemented a number of knobs which would never be
1155accepted as public APIs because they were just adding control knobs to
1156system-management pseudo filesystem. cgroup ended up with interface
1157knobs which were not properly abstracted or refined and directly
1158revealed kernel internal details. These knobs got exposed to
1159individual applications through the ill-defined delegation mechanism
1160effectively abusing cgroup as a shortcut to implementing public APIs
1161without going through the required scrutiny.
1162
1163This was painful for both userland and kernel. Userland ended up with
1164misbehaving and poorly abstracted interfaces and the kernel
1165inadvertently exposing and getting locked into constructs.
1166
1167
1168R-3. Competition Between Inner Nodes and Threads
1169
1170cgroup v1 allowed threads to be in any cgroups which created an
1171interesting problem where threads belonging to a parent cgroup and its
1172children cgroups competed for resources. This was nasty as two
1173different types of entities competed and there was no obvious way to
1174settle it. Different controllers did different things.
1175
1176The cpu controller considered threads and cgroups as equivalents and
1177mapped nice levels to cgroup weights. This worked for some cases but
1178fell flat when children wanted to be allocated specific ratios of CPU
1179cycles and the number of internal threads fluctuated - the ratios
1180constantly changed as the number of competing entities fluctuated.
1181There also were other issues. The mapping from nice level to weight
1182wasn't obvious or universal, and there were various other knobs which
1183simply weren't available for threads.
1184
1185The io controller implicitly created a hidden leaf node for each
1186cgroup to host the threads. The hidden leaf had its own copies of all
1187the knobs with "leaf_" prefixed. While this allowed equivalent
1188control over internal threads, it was with serious drawbacks. It
1189always added an extra layer of nesting which wouldn't be necessary
1190otherwise, made the interface messy and significantly complicated the
1191implementation.
1192
1193The memory controller didn't have a way to control what happened
1194between internal tasks and child cgroups and the behavior was not
1195clearly defined. There were attempts to add ad-hoc behaviors and
1196knobs to tailor the behavior to specific workloads which would have
1197led to problems extremely difficult to resolve in the long term.
1198
1199Multiple controllers struggled with internal tasks and came up with
1200different ways to deal with it; unfortunately, all the approaches were
1201severely flawed and, furthermore, the widely different behaviors
1202made cgroup as a whole highly inconsistent.
1203
1204This clearly is a problem which needs to be addressed from cgroup core
1205in a uniform way.
1206
1207
1208R-4. Other Interface Issues
1209
1210cgroup v1 grew without oversight and developed a large number of
1211idiosyncrasies and inconsistencies. One issue on the cgroup core side
1212was how an empty cgroup was notified - a userland helper binary was
1213forked and executed for each event. The event delivery wasn't
1214recursive or delegatable. The limitations of the mechanism also led
1215to in-kernel event delivery filtering mechanism further complicating
1216the interface.
1217
1218Controller interfaces were problematic too. An extreme example is
1219controllers completely ignoring hierarchical organization and treating
1220all cgroups as if they were all located directly under the root
1221cgroup. Some controllers exposed a large amount of inconsistent
1222implementation details to userland.
1223
1224There also was no consistency across controllers. When a new cgroup
1225was created, some controllers defaulted to not imposing extra
1226restrictions while others disallowed any resource usage until
1227explicitly configured. Configuration knobs for the same type of
1228control used widely differing naming schemes and formats. Statistics
1229and information knobs were named arbitrarily and used different
1230formats and units even in the same controller.
1231
1232cgroup v2 establishes common conventions where appropriate and updates
1233controllers so that they expose minimal and consistent interfaces.
1234
1235
1236R-5. Controller Issues and Remedies
1237
1238R-5-1. Memory
1239
1240The original lower boundary, the soft limit, is defined as a limit
1241that is per default unset. As a result, the set of cgroups that
1242global reclaim prefers is opt-in, rather than opt-out. The costs for
1243optimizing these mostly negative lookups are so high that the
1244implementation, despite its enormous size, does not even provide the
1245basic desirable behavior. First off, the soft limit has no
1246hierarchical meaning. All configured groups are organized in a global
1247rbtree and treated like equal peers, regardless where they are located
1248in the hierarchy. This makes subtree delegation impossible. Second,
1249the soft limit reclaim pass is so aggressive that it not just
1250introduces high allocation latencies into the system, but also impacts
1251system performance due to overreclaim, to the point where the feature
1252becomes self-defeating.
1253
1254The memory.low boundary on the other hand is a top-down allocated
1255reserve. A cgroup enjoys reclaim protection when it and all its
1256ancestors are below their low boundaries, which makes delegation of
1257subtrees possible. Secondly, new cgroups have no reserve per default
1258and in the common case most cgroups are eligible for the preferred
1259reclaim pass. This allows the new low boundary to be efficiently
1260implemented with just a minor addition to the generic reclaim code,
1261without the need for out-of-band data structures and reclaim passes.
1262Because the generic reclaim code considers all cgroups except for the
1263ones running low in the preferred first reclaim pass, overreclaim of
1264individual groups is eliminated as well, resulting in much better
1265overall workload performance.
1266
1267The original high boundary, the hard limit, is defined as a strict
1268limit that can not budge, even if the OOM killer has to be called.
1269But this generally goes against the goal of making the most out of the
1270available memory. The memory consumption of workloads varies during
1271runtime, and that requires users to overcommit. But doing that with a
1272strict upper limit requires either a fairly accurate prediction of the
1273working set size or adding slack to the limit. Since working set size
1274estimation is hard and error prone, and getting it wrong results in
1275OOM kills, most users tend to err on the side of a looser limit and
1276end up wasting precious resources.
1277
1278The memory.high boundary on the other hand can be set much more
1279conservatively. When hit, it throttles allocations by forcing them
1280into direct reclaim to work off the excess, but it never invokes the
1281OOM killer. As a result, a high boundary that is chosen too
1282aggressively will not terminate the processes, but instead it will
1283lead to gradual performance degradation. The user can monitor this
1284and make corrections until the minimal memory footprint that still
1285gives acceptable performance is found.
1286
1287In extreme cases, with many concurrent allocations and a complete
1288breakdown of reclaim progress within the group, the high boundary can
1289be exceeded. But even then it's mostly better to satisfy the
1290allocation from the slack available in other groups or the rest of the
1291system than killing the group. Otherwise, memory.max is there to
1292limit this type of spillover and ultimately contain buggy or even
1293malicious applications.
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
deleted file mode 100644
index 781b1d475bcf..000000000000
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ /dev/null
@@ -1,647 +0,0 @@
1
2Cgroup unified hierarchy
3
4April, 2014 Tejun Heo <tj@kernel.org>
5
6This document describes the changes made by unified hierarchy and
7their rationales. It will eventually be merged into the main cgroup
8documentation.
9
10CONTENTS
11
121. Background
132. Basic Operation
14 2-1. Mounting
15 2-2. cgroup.subtree_control
16 2-3. cgroup.controllers
173. Structural Constraints
18 3-1. Top-down
19 3-2. No internal tasks
204. Delegation
21 4-1. Model of delegation
22 4-2. Common ancestor rule
235. Other Changes
24 5-1. [Un]populated Notification
25 5-2. Other Core Changes
26 5-3. Controller File Conventions
27 5-3-1. Format
28 5-3-2. Control Knobs
29 5-4. Per-Controller Changes
30 5-4-1. io
31 5-4-2. cpuset
32 5-4-3. memory
336. Planned Changes
34 6-1. CAP for resource control
35
36
371. Background
38
39cgroup allows an arbitrary number of hierarchies and each hierarchy
40can host any number of controllers. While this seems to provide a
41high level of flexibility, it isn't quite useful in practice.
42
43For example, as there is only one instance of each controller, utility
44type controllers such as freezer which can be useful in all
45hierarchies can only be used in one. The issue is exacerbated by the
46fact that controllers can't be moved around once hierarchies are
47populated. Another issue is that all controllers bound to a hierarchy
48are forced to have exactly the same view of the hierarchy. It isn't
49possible to vary the granularity depending on the specific controller.
50
51In practice, these issues heavily limit which controllers can be put
52on the same hierarchy and most configurations resort to putting each
53controller on its own hierarchy. Only closely related ones, such as
54the cpu and cpuacct controllers, make sense to put on the same
55hierarchy. This often means that userland ends up managing multiple
56similar hierarchies repeating the same steps on each hierarchy
57whenever a hierarchy management operation is necessary.
58
59Unfortunately, support for multiple hierarchies comes at a steep cost.
60Internal implementation in cgroup core proper is dazzlingly
61complicated but more importantly the support for multiple hierarchies
62restricts how cgroup is used in general and what controllers can do.
63
64There's no limit on how many hierarchies there may be, which means
65that a task's cgroup membership can't be described in finite length.
66The key may contain any varying number of entries and is unlimited in
67length, which makes it highly awkward to handle and leads to addition
68of controllers which exist only to identify membership, which in turn
69exacerbates the original problem.
70
71Also, as a controller can't have any expectation regarding what shape
72of hierarchies other controllers would be on, each controller has to
73assume that all other controllers are operating on completely
74orthogonal hierarchies. This makes it impossible, or at least very
75cumbersome, for controllers to cooperate with each other.
76
77In most use cases, putting controllers on hierarchies which are
78completely orthogonal to each other isn't necessary. What usually is
79called for is the ability to have differing levels of granularity
80depending on the specific controller. In other words, hierarchy may
81be collapsed from leaf towards root when viewed from specific
82controllers. For example, a given configuration might not care about
83how memory is distributed beyond a certain level while still wanting
84to control how CPU cycles are distributed.
85
86Unified hierarchy is the next version of cgroup interface. It aims to
87address the aforementioned issues by having more structure while
88retaining enough flexibility for most use cases. Various other
89general and controller-specific interface issues are also addressed in
90the process.
91
92
932. Basic Operation
94
952-1. Mounting
96
97Currently, unified hierarchy can be mounted with the following mount
98command. Note that this is still under development and scheduled to
99change soon.
100
101 mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT
102
103All controllers which support the unified hierarchy and are not bound
104to other hierarchies are automatically bound to unified hierarchy and
105show up at the root of it. Controllers which are enabled only in the
106root of unified hierarchy can be bound to other hierarchies. This
107allows mixing unified hierarchy with the traditional multiple
108hierarchies in a fully backward compatible way.
109
110A controller can be moved across hierarchies only after the controller
111is no longer referenced in its current hierarchy. Because per-cgroup
112controller states are destroyed asynchronously and controllers may
113have lingering references, a controller may not show up immediately on
114the unified hierarchy after the final umount of the previous
115hierarchy. Similarly, a controller should be fully disabled to be
116moved out of the unified hierarchy and it may take some time for the
117disabled controller to become available for other hierarchies;
118furthermore, due to dependencies among controllers, other controllers
119may need to be disabled too.
120
121While useful for development and manual configurations, dynamically
122moving controllers between the unified and other hierarchies is
123strongly discouraged for production use. It is recommended to decide
124the hierarchies and controller associations before starting using the
125controllers.
126
127
1282-2. cgroup.subtree_control
129
130All cgroups on unified hierarchy have a "cgroup.subtree_control" file
131which governs which controllers are enabled on the children of the
132cgroup. Let's assume a hierarchy like the following.
133
134 root - A - B - C
135 \ D
136
137root's "cgroup.subtree_control" file determines which controllers are
138enabled on A. A's on B. B's on C and D. This coincides with the
139fact that controllers on the immediate sub-level are used to
140distribute the resources of the parent. In fact, it's natural to
141assume that resource control knobs of a child belong to its parent.
142Enabling a controller in a "cgroup.subtree_control" file declares that
143distribution of the respective resources of the cgroup will be
144controlled. Note that this means that controller enable states are
145shared among siblings.
146
147When read, the file contains a space-separated list of currently
148enabled controllers. A write to the file should contain a
149space-separated list of controllers with '+' or '-' prefixed (without
150the quotes). Controllers prefixed with '+' are enabled and '-'
151disabled. If a controller is listed multiple times, the last entry
152wins. The specific operations are executed atomically - either all
153succeed or fail.
154
155
1562-3. cgroup.controllers
157
158Read-only "cgroup.controllers" file contains a space-separated list of
159controllers which can be enabled in the cgroup's
160"cgroup.subtree_control" file.
161
162In the root cgroup, this lists controllers which are not bound to
163other hierarchies and the content changes as controllers are bound to
164and unbound from other hierarchies.
165
166In non-root cgroups, the content of this file equals that of the
167parent's "cgroup.subtree_control" file as only controllers enabled
168from the parent can be used in its children.
169
170
1713. Structural Constraints
172
1733-1. Top-down
174
175As it doesn't make sense to nest control of an uncontrolled resource,
176all non-root "cgroup.subtree_control" files can only contain
177controllers which are enabled in the parent's "cgroup.subtree_control"
178file. A controller can be enabled only if the parent has the
179controller enabled and a controller can't be disabled if one or more
180children have it enabled.
181
182
1833-2. No internal tasks
184
185One long-standing issue that cgroup faces is the competition between
186tasks belonging to the parent cgroup and its children cgroups. This
187is inherently nasty as two different types of entities compete and
188there is no agreed-upon obvious way to handle it. Different
189controllers are doing different things.
190
191The cpu controller considers tasks and cgroups as equivalents and maps
192nice levels to cgroup weights. This works for some cases but falls
193flat when children should be allocated specific ratios of CPU cycles
194and the number of internal tasks fluctuates - the ratios constantly
195change as the number of competing entities fluctuates. There also are
196other issues. The mapping from nice level to weight isn't obvious or
197universal, and there are various other knobs which simply aren't
198available for tasks.
199
200The io controller implicitly creates a hidden leaf node for each
201cgroup to host the tasks. The hidden leaf has its own copies of all
202the knobs with "leaf_" prefixed. While this allows equivalent control
203over internal tasks, it's with serious drawbacks. It always adds an
204extra layer of nesting which may not be necessary, makes the interface
205messy and significantly complicates the implementation.
206
207The memory controller currently doesn't have a way to control what
208happens between internal tasks and child cgroups and the behavior is
209not clearly defined. There have been attempts to add ad-hoc behaviors
210and knobs to tailor the behavior to specific workloads. Continuing
211this direction will lead to problems which will be extremely difficult
212to resolve in the long term.
213
214Multiple controllers struggle with internal tasks and came up with
215different ways to deal with it; unfortunately, all the approaches in
216use now are severely flawed and, furthermore, the widely different
217behaviors make cgroup as a whole highly inconsistent.
218
219It is clear that this is something which needs to be addressed from
220cgroup core proper in a uniform way so that controllers don't need to
221worry about it and cgroup as a whole shows a consistent and logical
222behavior. To achieve that, unified hierarchy enforces the following
223structural constraint:
224
225 Except for the root, only cgroups which don't contain any task may
226 have controllers enabled in their "cgroup.subtree_control" files.
227
228Combined with other properties, this guarantees that, when a
229controller is looking at the part of the hierarchy which has it
230enabled, tasks are always only on the leaves. This rules out
231situations where child cgroups compete against internal tasks of the
232parent.
233
234There are two things to note. Firstly, the root cgroup is exempt from
235the restriction. Root contains tasks and anonymous resource
236consumption which can't be associated with any other cgroup and
237requires special treatment from most controllers. How resource
238consumption in the root cgroup is governed is up to each controller.
239
240Secondly, the restriction doesn't take effect if there is no enabled
241controller in the cgroup's "cgroup.subtree_control" file. This is
242important as otherwise it wouldn't be possible to create children of a
243populated cgroup. To control resource distribution of a cgroup, the
244cgroup must create children and transfer all its tasks to the children
245before enabling controllers in its "cgroup.subtree_control" file.
246
247
2484. Delegation
249
2504-1. Model of delegation
251
252A cgroup can be delegated to a less privileged user by granting write
253access of the directory and its "cgroup.procs" file to the user. Note
254that the resource control knobs in a given directory concern the
255resources of the parent and thus must not be delegated along with the
256directory.
257
258Once delegated, the user can build sub-hierarchy under the directory,
259organize processes as it sees fit and further distribute the resources
260it got from the parent. The limits and other settings of all resource
261controllers are hierarchical and regardless of what happens in the
262delegated sub-hierarchy, nothing can escape the resource restrictions
263imposed by the parent.
264
265Currently, cgroup doesn't impose any restrictions on the number of
266cgroups in or nesting depth of a delegated sub-hierarchy; however,
267this may in the future be limited explicitly.
268
269
2704-2. Common ancestor rule
271
272On the unified hierarchy, to write to a "cgroup.procs" file, in
273addition to the usual write permission to the file and uid match, the
274writer must also have write access to the "cgroup.procs" file of the
275common ancestor of the source and destination cgroups. This prevents
276delegatees from smuggling processes across disjoint sub-hierarchies.
277
278Let's say cgroups C0 and C1 have been delegated to user U0 who created
279C00, C01 under C0 and C10 under C1 as follows.
280
281 ~~~~~~~~~~~~~ - C0 - C00
282 ~ cgroup ~ \ C01
283 ~ hierarchy ~
284 ~~~~~~~~~~~~~ - C1 - C10
285
286C0 and C1 are separate entities in terms of resource distribution
287regardless of their relative positions in the hierarchy. The
288resources the processes under C0 are entitled to are controlled by
289C0's ancestors and may be completely different from C1. It's clear
290that the intention of delegating C0 to U0 is allowing U0 to organize
291the processes under C0 and further control the distribution of C0's
292resources.
293
294On traditional hierarchies, if a task has write access to "tasks" or
295"cgroup.procs" file of a cgroup and its uid agrees with the target, it
296can move the target to the cgroup. In the above example, U0 will not
297only be able to move processes in each sub-hierarchy but also across
298the two sub-hierarchies, effectively allowing it to violate the
299organizational and resource restrictions implied by the hierarchical
300structure above C0 and C1.
301
302On the unified hierarchy, let's say U0 wants to write the pid of a
303process which has a matching uid and is currently in C10 into
304"C00/cgroup.procs". U0 obviously has write access to the file and
305migration permission on the process; however, the common ancestor of
306the source cgroup C10 and the destination cgroup C00 is above the
307points of delegation and U0 would not have write access to its
308"cgroup.procs" and thus be denied with -EACCES.
309
310
3115. Other Changes
312
3135-1. [Un]populated Notification
314
315cgroup users often need a way to determine when a cgroup's
316subhierarchy becomes empty so that it can be cleaned up. cgroup
317currently provides release_agent for it; unfortunately, this mechanism
318is riddled with issues.
319
320- It delivers events by forking and execing a userland binary
321 specified as the release_agent. This is a long deprecated method of
322 notification delivery. It's extremely heavy, slow and cumbersome to
323 integrate with larger infrastructure.
324
325- There is a single monitoring point at the root. There's no way to
326 delegate management of a subtree.
327
328- The event isn't recursive. It triggers when a cgroup doesn't have
329 any tasks or child cgroups. Events for internal nodes trigger only
330 after all children are removed. This again makes it impossible to
331 delegate management of a subtree.
332
333- Events are filtered from the kernel side. A "notify_on_release"
334 file is used to subscribe to or suppress release events. This is
335 unnecessarily complicated and probably done this way because event
336 delivery itself was expensive.
337
338Unified hierarchy implements "populated" field in "cgroup.events"
339interface file which can be used to monitor whether the cgroup's
340subhierarchy has tasks in it or not. Its value is 0 if there is no
341task in the cgroup and its descendants; otherwise, 1. poll and
342[id]notify events are triggered when the value changes.
343
344This is significantly lighter and simpler and trivially allows
345delegating management of subhierarchy - subhierarchy monitoring can
346block further propagation simply by putting itself or another process
347in the subhierarchy and monitor events that it's interested in from
348there without interfering with monitoring higher in the tree.
349
350In unified hierarchy, the release_agent mechanism is no longer
351supported and the interface files "release_agent" and
352"notify_on_release" do not exist.
353
354
3555-2. Other Core Changes
356
357- None of the mount options is allowed.
358
359- remount is disallowed.
360
361- rename(2) is disallowed.
362
363- The "tasks" file is removed. Everything should be at process
364 granularity. Use the "cgroup.procs" file instead.
365
366- The "cgroup.procs" file is not sorted. pids will be unique unless
367 they got recycled in-between reads.
368
369- The "cgroup.clone_children" file is removed.
370
371- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
372 to before exiting. If the cgroup is removed before the zombie is
373 reaped, " (deleted)" is appended to the path.
374
375
3765-3. Controller File Conventions
377
3785-3-1. Format
379
380In general, all controller files should be in one of the following
381formats whenever possible.
382
383- Values only files
384
385 VAL0 VAL1...\n
386
387- Flat keyed files
388
389 KEY0 VAL0\n
390 KEY1 VAL1\n
391 ...
392
393- Nested keyed files
394
395 KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
396 KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
397 ...
398
399For a writeable file, the format for writing should generally match
400reading; however, controllers may allow omitting later fields or
401implement restricted shortcuts for most common use cases.
402
403For both flat and nested keyed files, only the values for a single key
404can be written at a time. For nested keyed files, the sub key pairs
405may be specified in any order and not all pairs have to be specified.
406
407
4085-3-2. Control Knobs
409
410- Settings for a single feature should generally be implemented in a
411 single file.
412
413- In general, the root cgroup should be exempt from resource control
414 and thus shouldn't have resource control knobs.
415
416- If a controller implements ratio based resource distribution, the
417 control knob should be named "weight" and have the range [1, 10000]
418 and 100 should be the default value. The values are chosen to allow
419 enough and symmetric bias in both directions while keeping it
420 intuitive (the default is 100%).
421
422- If a controller implements an absolute resource guarantee and/or
423 limit, the control knobs should be named "min" and "max"
424 respectively. If a controller implements best effort resource
425 guarantee and/or limit, the control knobs should be named "low" and
426 "high" respectively.
427
428 In the above four control files, the special token "max" should be
429 used to represent upward infinity for both reading and writing.
430
431- If a setting has configurable default value and specific overrides,
432 the default settings should be keyed with "default" and appear as
433 the first entry in the file. Specific entries can use "default" as
434 its value to indicate inheritance of the default value.
435
436- For events which are not very high frequency, an interface file
437 "events" should be created which lists event key value pairs.
438 Whenever a notifiable event happens, file modified event should be
439 generated on the file.
440
441
4425-4. Per-Controller Changes
443
4445-4-1. io
445
446- blkio is renamed to io. The interface is overhauled anyway. The
447 new name is more in line with the other two major controllers, cpu
448 and memory, and better suited given that it may be used for cgroup
449 writeback without involving block layer.
450
451- Everything including stat is always hierarchical making separate
452 recursive stat files pointless and, as no internal node can have
453 tasks, leaf weights are meaningless. The operation model is
454 simplified and the interface is overhauled accordingly.
455
456 io.stat
457
458 The stat file. The reported stats are from the point where
459 bio's are issued to request_queue. The stats are counted
460 independent of which policies are enabled. Each line in the
461 file follows the following format. More fields may later be
462 added at the end.
463
464 $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS
465
466 io.weight
467
468 The weight setting, currently only available and effective if
469 cfq-iosched is in use for the target device. The weight is
470 between 1 and 10000 and defaults to 100. The first line
471 always contains the default weight in the following format to
472 use when per-device setting is missing.
473
474 default $WEIGHT
475
476 Subsequent lines list per-device weights of the following
477 format.
478
479 $MAJ:$MIN $WEIGHT
480
481 Writing "$WEIGHT" or "default $WEIGHT" changes the default
482 setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
483 while "$MAJ:$MIN default" clears it.
484
485 This file is available only on non-root cgroups.
486
487 io.max
488
489 The maximum bandwidth and/or iops setting, only available if
490 blk-throttle is enabled. The file is of the following format.
491
492 $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
493
494 ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
495 read/write IOs per second. "max" indicates no limit. Writing
496 to the file follows the same format but the individual
497 settings may be omitted or specified in any order.
498
499 This file is available only on non-root cgroups.
500
501
5025-4-2. cpuset
503
504- Tasks are kept in empty cpusets after hotplug and take on the masks
505 of the nearest non-empty ancestor, instead of being moved to it.
506
507- A task can be moved into an empty cpuset, and again it takes on the
508 masks of the nearest non-empty ancestor.
509
510
5115-4-3. memory
512
513- use_hierarchy is on by default and the cgroup file for the flag is
514 not created.
515
516- The original lower boundary, the soft limit, is defined as a limit
517 that is per default unset. As a result, the set of cgroups that
518 global reclaim prefers is opt-in, rather than opt-out. The costs
519 for optimizing these mostly negative lookups are so high that the
520 implementation, despite its enormous size, does not even provide the
521 basic desirable behavior. First off, the soft limit has no
522 hierarchical meaning. All configured groups are organized in a
523 global rbtree and treated like equal peers, regardless where they
524 are located in the hierarchy. This makes subtree delegation
525 impossible. Second, the soft limit reclaim pass is so aggressive
526 that it not just introduces high allocation latencies into the
527 system, but also impacts system performance due to overreclaim, to
528 the point where the feature becomes self-defeating.
529
530 The memory.low boundary on the other hand is a top-down allocated
531 reserve. A cgroup enjoys reclaim protection when it and all its
532 ancestors are below their low boundaries, which makes delegation of
533 subtrees possible. Secondly, new cgroups have no reserve per
534 default and in the common case most cgroups are eligible for the
535 preferred reclaim pass. This allows the new low boundary to be
536 efficiently implemented with just a minor addition to the generic
537 reclaim code, without the need for out-of-band data structures and
538 reclaim passes. Because the generic reclaim code considers all
539 cgroups except for the ones running low in the preferred first
540 reclaim pass, overreclaim of individual groups is eliminated as
541 well, resulting in much better overall workload performance.
542
543- The original high boundary, the hard limit, is defined as a strict
544 limit that can not budge, even if the OOM killer has to be called.
545 But this generally goes against the goal of making the most out of
546 the available memory. The memory consumption of workloads varies
547 during runtime, and that requires users to overcommit. But doing
548 that with a strict upper limit requires either a fairly accurate
549 prediction of the working set size or adding slack to the limit.
550 Since working set size estimation is hard and error prone, and
551 getting it wrong results in OOM kills, most users tend to err on the
552 side of a looser limit and end up wasting precious resources.
553
554 The memory.high boundary on the other hand can be set much more
555 conservatively. When hit, it throttles allocations by forcing them
556 into direct reclaim to work off the excess, but it never invokes the
557 OOM killer. As a result, a high boundary that is chosen too
558 aggressively will not terminate the processes, but instead it will
559 lead to gradual performance degradation. The user can monitor this
560 and make corrections until the minimal memory footprint that still
561 gives acceptable performance is found.
562
563 In extreme cases, with many concurrent allocations and a complete
564 breakdown of reclaim progress within the group, the high boundary
565 can be exceeded. But even then it's mostly better to satisfy the
566 allocation from the slack available in other groups or the rest of
567 the system than killing the group. Otherwise, memory.max is there
568 to limit this type of spillover and ultimately contain buggy or even
569 malicious applications.
570
571- The original control file names are unwieldy and inconsistent in
572 many different ways. For example, the upper boundary hit count is
573 exported in the memory.failcnt file, but an OOM event count has to
574 be manually counted by listening to memory.oom_control events, and
575 lower boundary / soft limit events have to be counted by first
576 setting a threshold for that value and then counting those events.
577 Also, usage and limit files encode their units in the filename.
578 That makes the filenames very long, even though this is not
579 information that a user needs to be reminded of every time they type
580 out those names.
581
582 To address these naming issues, as well as to signal clearly that
583 the new interface carries a new configuration model, the naming
584 conventions in it necessarily differ from the old interface.
585
586- The original limit files indicate the state of an unset limit with a
587 Very High Number, and a configured limit can be unset by echoing -1
588 into those files. But that very high number is implementation and
589 architecture dependent and not very descriptive. And while -1 can
590 be understood as an underflow into the highest possible value, -2 or
591 -10M etc. do not work, so it's not consistent.
592
593 memory.low, memory.high, and memory.max will use the string "max" to
594 indicate and set the highest possible value.
595
5966. Planned Changes
597
5986-1. CAP for resource control
599
600Unified hierarchy will require one of the capabilities(7), which is
601yet to be decided, for all resource control related knobs. Process
602organization operations - creation of sub-cgroups and migration of
603processes in sub-hierarchies may be delegated by changing the
604ownership and/or permissions on the cgroup directory and
605"cgroup.procs" interface file; however, all operations which affect
606resource control - writes to a "cgroup.subtree_control" file or any
607controller-specific knobs - will require an explicit CAP privilege.
608
609This, in part, is to prevent the cgroup interface from being
610inadvertently promoted to programmable API used by non-privileged
611binaries. cgroup exposes various aspects of the system in ways which
612aren't properly abstracted for direct consumption by regular programs.
613This is an administration interface much closer to sysctl knobs than
614system calls. Even the basic access model, being filesystem path
615based, isn't suitable for direct consumption. There's no way to
616access "my cgroup" in a race-free way or make multiple operations
617atomic against migration to another cgroup.
618
619Another aspect is that, for better or for worse, the cgroup interface
620goes through far less scrutiny than regular interfaces for
621unprivileged userland. The upside is that cgroup is able to expose
622useful features which may not be suitable for general consumption in a
623reasonable time frame. It provides a relatively short path between
624internal details and userland-visible interface. Of course, this
625shortcut comes with high risk. We go through what we go through for
626general kernel APIs for good reasons. It may end up leaking internal
627details in a way which can exert significant pain by locking the
628kernel into a contract that can't be maintained in a reasonable
629manner.
630
631Also, due to the specific nature, cgroup and its controllers don't
632tend to attract attention from a wide scope of developers. cgroup's
633short history is already fraught with severely mis-designed
634interfaces, unnecessary commitments to and exposing of internal
635details, broken and dangerous implementations of various features.
636
637Keeping cgroup as an administration interface is both advantageous for
638its role and imperative given its nature. Some of the cgroup features
639may make sense for unprivileged access. If deemed justified, those
640must be further abstracted and implemented as a different interface,
641be it a system call or process-private filesystem, and survive through
642the scrutiny that any interface for general consumption is required to
643go through.
644
645Requiring CAP is not a complete solution but should serve as a
646significant deterrent against spraying cgroup usages in non-privileged
647programs.
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e5f4164cbd99..7f540f7f588d 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -34,17 +34,12 @@ struct seq_file;
34 34
35/* define the enumeration of all cgroup subsystems */ 35/* define the enumeration of all cgroup subsystems */
36#define SUBSYS(_x) _x ## _cgrp_id, 36#define SUBSYS(_x) _x ## _cgrp_id,
37#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
38 __unused_tag_ ## _t = CGROUP_ ## _t - 1,
39enum cgroup_subsys_id { 37enum cgroup_subsys_id {
40#include <linux/cgroup_subsys.h> 38#include <linux/cgroup_subsys.h>
41 CGROUP_SUBSYS_COUNT, 39 CGROUP_SUBSYS_COUNT,
42}; 40};
43#undef SUBSYS_TAG
44#undef SUBSYS 41#undef SUBSYS
45 42
46#define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
47
48/* bits in struct cgroup_subsys_state flags field */ 43/* bits in struct cgroup_subsys_state flags field */
49enum { 44enum {
50 CSS_NO_REF = (1 << 0), /* no reference counting for this css */ 45 CSS_NO_REF = (1 << 0), /* no reference counting for this css */
@@ -66,7 +61,6 @@ enum {
66 61
67/* cgroup_root->flags */ 62/* cgroup_root->flags */
68enum { 63enum {
69 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
70 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ 64 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
71 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ 65 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
72}; 66};
@@ -439,9 +433,9 @@ struct cgroup_subsys {
439 int (*can_attach)(struct cgroup_taskset *tset); 433 int (*can_attach)(struct cgroup_taskset *tset);
440 void (*cancel_attach)(struct cgroup_taskset *tset); 434 void (*cancel_attach)(struct cgroup_taskset *tset);
441 void (*attach)(struct cgroup_taskset *tset); 435 void (*attach)(struct cgroup_taskset *tset);
442 int (*can_fork)(struct task_struct *task, void **priv_p); 436 int (*can_fork)(struct task_struct *task);
443 void (*cancel_fork)(struct task_struct *task, void *priv); 437 void (*cancel_fork)(struct task_struct *task);
444 void (*fork)(struct task_struct *task, void *priv); 438 void (*fork)(struct task_struct *task);
445 void (*exit)(struct task_struct *task); 439 void (*exit)(struct task_struct *task);
446 void (*free)(struct task_struct *task); 440 void (*free)(struct task_struct *task);
447 void (*bind)(struct cgroup_subsys_state *root_css); 441 void (*bind)(struct cgroup_subsys_state *root_css);
@@ -527,7 +521,6 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
527 521
528#else /* CONFIG_CGROUPS */ 522#else /* CONFIG_CGROUPS */
529 523
530#define CGROUP_CANFORK_COUNT 0
531#define CGROUP_SUBSYS_COUNT 0 524#define CGROUP_SUBSYS_COUNT 0
532 525
533static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} 526static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 322a28482745..2162dca88dc0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -97,12 +97,9 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
97 struct pid *pid, struct task_struct *tsk); 97 struct pid *pid, struct task_struct *tsk);
98 98
99void cgroup_fork(struct task_struct *p); 99void cgroup_fork(struct task_struct *p);
100extern int cgroup_can_fork(struct task_struct *p, 100extern int cgroup_can_fork(struct task_struct *p);
101 void *ss_priv[CGROUP_CANFORK_COUNT]); 101extern void cgroup_cancel_fork(struct task_struct *p);
102extern void cgroup_cancel_fork(struct task_struct *p, 102extern void cgroup_post_fork(struct task_struct *p);
103 void *ss_priv[CGROUP_CANFORK_COUNT]);
104extern void cgroup_post_fork(struct task_struct *p,
105 void *old_ss_priv[CGROUP_CANFORK_COUNT]);
106void cgroup_exit(struct task_struct *p); 103void cgroup_exit(struct task_struct *p);
107void cgroup_free(struct task_struct *p); 104void cgroup_free(struct task_struct *p);
108 105
@@ -562,13 +559,9 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
562 struct dentry *dentry) { return -EINVAL; } 559 struct dentry *dentry) { return -EINVAL; }
563 560
564static inline void cgroup_fork(struct task_struct *p) {} 561static inline void cgroup_fork(struct task_struct *p) {}
565static inline int cgroup_can_fork(struct task_struct *p, 562static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
566 void *ss_priv[CGROUP_CANFORK_COUNT]) 563static inline void cgroup_cancel_fork(struct task_struct *p) {}
567{ return 0; } 564static inline void cgroup_post_fork(struct task_struct *p) {}
568static inline void cgroup_cancel_fork(struct task_struct *p,
569 void *ss_priv[CGROUP_CANFORK_COUNT]) {}
570static inline void cgroup_post_fork(struct task_struct *p,
571 void *ss_priv[CGROUP_CANFORK_COUNT]) {}
572static inline void cgroup_exit(struct task_struct *p) {} 565static inline void cgroup_exit(struct task_struct *p) {}
573static inline void cgroup_free(struct task_struct *p) {} 566static inline void cgroup_free(struct task_struct *p) {}
574 567
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 1a96fdaa33d5..0df0336acee9 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -6,14 +6,8 @@
6 6
7/* 7/*
8 * This file *must* be included with SUBSYS() defined. 8 * This file *must* be included with SUBSYS() defined.
9 * SUBSYS_TAG() is a noop if undefined.
10 */ 9 */
11 10
12#ifndef SUBSYS_TAG
13#define __TMP_SUBSYS_TAG
14#define SUBSYS_TAG(_x)
15#endif
16
17#if IS_ENABLED(CONFIG_CPUSETS) 11#if IS_ENABLED(CONFIG_CPUSETS)
18SUBSYS(cpuset) 12SUBSYS(cpuset)
19#endif 13#endif
@@ -58,17 +52,10 @@ SUBSYS(net_prio)
58SUBSYS(hugetlb) 52SUBSYS(hugetlb)
59#endif 53#endif
60 54
61/*
62 * Subsystems that implement the can_fork() family of callbacks.
63 */
64SUBSYS_TAG(CANFORK_START)
65
66#if IS_ENABLED(CONFIG_CGROUP_PIDS) 55#if IS_ENABLED(CONFIG_CGROUP_PIDS)
67SUBSYS(pids) 56SUBSYS(pids)
68#endif 57#endif
69 58
70SUBSYS_TAG(CANFORK_END)
71
72/* 59/*
73 * The following subsystems are not supported on the default hierarchy. 60 * The following subsystems are not supported on the default hierarchy.
74 */ 61 */
@@ -76,11 +63,6 @@ SUBSYS_TAG(CANFORK_END)
76SUBSYS(debug) 63SUBSYS(debug)
77#endif 64#endif
78 65
79#ifdef __TMP_SUBSYS_TAG
80#undef __TMP_SUBSYS_TAG
81#undef SUBSYS_TAG
82#endif
83
84/* 66/*
85 * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. 67 * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
86 */ 68 */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index accb036bbc9c..b283d56c1db9 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -54,6 +54,7 @@
54 54
55#define SMB_SUPER_MAGIC 0x517B 55#define SMB_SUPER_MAGIC 0x517B
56#define CGROUP_SUPER_MAGIC 0x27e0eb 56#define CGROUP_SUPER_MAGIC 0x27e0eb
57#define CGROUP2_SUPER_MAGIC 0x63677270
57 58
58 59
59#define STACK_END_MAGIC 0x57AC6E9D 60#define STACK_END_MAGIC 0x57AC6E9D
diff --git a/init/Kconfig b/init/Kconfig
index 235c7a2c0d20..5481b49e8c3f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -940,95 +940,24 @@ menuconfig CGROUPS
940 940
941if CGROUPS 941if CGROUPS
942 942
943config CGROUP_DEBUG
944 bool "Example debug cgroup subsystem"
945 default n
946 help
947 This option enables a simple cgroup subsystem that
948 exports useful debugging information about the cgroups
949 framework.
950
951 Say N if unsure.
952
953config CGROUP_FREEZER
954 bool "Freezer cgroup subsystem"
955 help
956 Provides a way to freeze and unfreeze all tasks in a
957 cgroup.
958
959config CGROUP_PIDS
960 bool "PIDs cgroup subsystem"
961 help
962 Provides enforcement of process number limits in the scope of a
963 cgroup. Any attempt to fork more processes than is allowed in the
964 cgroup will fail. PIDs are fundamentally a global resource because it
965 is fairly trivial to reach PID exhaustion before you reach even a
966 conservative kmemcg limit. As a result, it is possible to grind a
967 system to halt without being limited by other cgroup policies. The
968 PIDs cgroup subsystem is designed to stop this from happening.
969
970 It should be noted that organisational operations (such as attaching
971 to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
972 since the PIDs limit only affects a process's ability to fork, not to
973 attach to a cgroup.
974
975config CGROUP_DEVICE
976 bool "Device controller for cgroups"
977 help
978 Provides a cgroup implementing whitelists for devices which
979 a process in the cgroup can mknod or open.
980
981config CPUSETS
982 bool "Cpuset support"
983 help
984 This option will let you create and manage CPUSETs which
985 allow dynamically partitioning a system into sets of CPUs and
986 Memory Nodes and assigning tasks to run only within those sets.
987 This is primarily useful on large SMP or NUMA systems.
988
989 Say N if unsure.
990
991config PROC_PID_CPUSET
992 bool "Include legacy /proc/<pid>/cpuset file"
993 depends on CPUSETS
994 default y
995
996config CGROUP_CPUACCT
997 bool "Simple CPU accounting cgroup subsystem"
998 help
999 Provides a simple Resource Controller for monitoring the
1000 total CPU consumed by the tasks in a cgroup.
1001
1002config PAGE_COUNTER 943config PAGE_COUNTER
1003 bool 944 bool
1004 945
1005config MEMCG 946config MEMCG
1006 bool "Memory Resource Controller for Control Groups" 947 bool "Memory controller"
1007 select PAGE_COUNTER 948 select PAGE_COUNTER
1008 select EVENTFD 949 select EVENTFD
1009 help 950 help
1010 Provides a memory resource controller that manages both anonymous 951 Provides control over the memory footprint of tasks in a cgroup.
1011 memory and page cache. (See Documentation/cgroups/memory.txt)
1012 952
1013config MEMCG_SWAP 953config MEMCG_SWAP
1014 bool "Memory Resource Controller Swap Extension" 954 bool "Swap controller"
1015 depends on MEMCG && SWAP 955 depends on MEMCG && SWAP
1016 help 956 help
1017 Add swap management feature to memory resource controller. When you 957 Provides control over the swap space consumed by tasks in a cgroup.
1018 enable this, you can limit mem+swap usage per cgroup. In other words, 958
1019 when you disable this, memory resource controller has no cares to
1020 usage of swap...a process can exhaust all of the swap. This extension
1021 is useful when you want to avoid exhaustion swap but this itself
1022 adds more overheads and consumes memory for remembering information.
1023 Especially if you use 32bit system or small memory system, please
1024 be careful about enabling this. When memory resource controller
1025 is disabled by boot option, this will be automatically disabled and
1026 there will be no overhead from this. Even when you set this config=y,
1027 if boot option "swapaccount=0" is set, swap will not be accounted.
1028 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
1029 size is 4096bytes, 512k per 1Gbytes of swap.
1030config MEMCG_SWAP_ENABLED 959config MEMCG_SWAP_ENABLED
1031 bool "Memory Resource Controller Swap Extension enabled by default" 960 bool "Swap controller enabled by default"
1032 depends on MEMCG_SWAP 961 depends on MEMCG_SWAP
1033 default y 962 default y
1034 help 963 help
@@ -1052,34 +981,43 @@ config MEMCG_KMEM
1052 the kmem extension can use it to guarantee that no group of processes 981 the kmem extension can use it to guarantee that no group of processes
1053 will ever exhaust kernel resources alone. 982 will ever exhaust kernel resources alone.
1054 983
1055config CGROUP_HUGETLB 984config BLK_CGROUP
1056 bool "HugeTLB Resource Controller for Control Groups" 985 bool "IO controller"
1057 depends on HUGETLB_PAGE 986 depends on BLOCK
1058 select PAGE_COUNTER
1059 default n 987 default n
1060 help 988 ---help---
1061 Provides a cgroup Resource Controller for HugeTLB pages. 989 Generic block IO controller cgroup interface. This is the common
1062 When you enable this, you can put a per cgroup limit on HugeTLB usage. 990 cgroup interface which should be used by various IO controlling
1063 The limit is enforced during page fault. Since HugeTLB doesn't 991 policies.
1064 support page reclaim, enforcing the limit at page fault time implies
1065 that, the application will get SIGBUS signal if it tries to access
1066 HugeTLB pages beyond its limit. This requires the application to know
1067 beforehand how much HugeTLB pages it would require for its use. The
1068 control group is tracked in the third page lru pointer. This means
1069 that we cannot use the controller with huge page less than 3 pages.
1070 992
1071config CGROUP_PERF 993 Currently, CFQ IO scheduler uses it to recognize task groups and
1072 bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" 994 control disk bandwidth allocation (proportional time slice allocation)
1073 depends on PERF_EVENTS && CGROUPS 995 to such task groups. It is also used by bio throttling logic in
1074 help 996 block layer to implement upper limit in IO rates on a device.
1075 This option extends the per-cpu mode to restrict monitoring to
1076 threads which belong to the cgroup specified and run on the
1077 designated cpu.
1078 997
1079 Say N if unsure. 998 This option only enables generic Block IO controller infrastructure.
999 One needs to also enable actual IO controlling logic/policy. For
1000 enabling proportional weight division of disk bandwidth in CFQ, set
1001 CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
1002 CONFIG_BLK_DEV_THROTTLING=y.
1003
1004 See Documentation/cgroups/blkio-controller.txt for more information.
1005
1006config DEBUG_BLK_CGROUP
1007 bool "IO controller debugging"
1008 depends on BLK_CGROUP
1009 default n
1010 ---help---
1011 Enable some debugging help. Currently it exports additional stat
1012 files in a cgroup which can be useful for debugging.
1013
1014config CGROUP_WRITEBACK
1015 bool
1016 depends on MEMCG && BLK_CGROUP
1017 default y
1080 1018
1081menuconfig CGROUP_SCHED 1019menuconfig CGROUP_SCHED
1082 bool "Group CPU scheduler" 1020 bool "CPU controller"
1083 default n 1021 default n
1084 help 1022 help
1085 This feature lets CPU scheduler recognize task groups and control CPU 1023 This feature lets CPU scheduler recognize task groups and control CPU
@@ -1116,40 +1054,89 @@ config RT_GROUP_SCHED
1116 1054
1117endif #CGROUP_SCHED 1055endif #CGROUP_SCHED
1118 1056
1119config BLK_CGROUP 1057config CGROUP_PIDS
1120 bool "Block IO controller" 1058 bool "PIDs controller"
1121 depends on BLOCK 1059 help
1060 Provides enforcement of process number limits in the scope of a
1061 cgroup. Any attempt to fork more processes than is allowed in the
1062 cgroup will fail. PIDs are fundamentally a global resource because it
1063 is fairly trivial to reach PID exhaustion before you reach even a
1064 conservative kmemcg limit. As a result, it is possible to grind a
1065 system to halt without being limited by other cgroup policies. The
1066 PIDs cgroup subsystem is designed to stop this from happening.
1067
1068 It should be noted that organisational operations (such as attaching
1069 to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
1070 since the PIDs limit only affects a process's ability to fork, not to
1071 attach to a cgroup.
1072
1073config CGROUP_FREEZER
1074 bool "Freezer controller"
1075 help
1076 Provides a way to freeze and unfreeze all tasks in a
1077 cgroup.
1078
1079config CGROUP_HUGETLB
1080 bool "HugeTLB controller"
1081 depends on HUGETLB_PAGE
1082 select PAGE_COUNTER
1122 default n 1083 default n
1123 ---help--- 1084 help
1124 Generic block IO controller cgroup interface. This is the common 1085 Provides a cgroup controller for HugeTLB pages.
1125 cgroup interface which should be used by various IO controlling 1086 When you enable this, you can put a per cgroup limit on HugeTLB usage.
1126 policies. 1087 The limit is enforced during page fault. Since HugeTLB doesn't
1088 support page reclaim, enforcing the limit at page fault time implies
1089 that, the application will get SIGBUS signal if it tries to access
1090 HugeTLB pages beyond its limit. This requires the application to know
1091 beforehand how much HugeTLB pages it would require for its use. The
1092 control group is tracked in the third page lru pointer. This means
1093 that we cannot use the controller with huge page less than 3 pages.
1127 1094
1128 Currently, CFQ IO scheduler uses it to recognize task groups and 1095config CPUSETS
1129 control disk bandwidth allocation (proportional time slice allocation) 1096 bool "Cpuset controller"
1130 to such task groups. It is also used by bio throttling logic in 1097 help
1131 block layer to implement upper limit in IO rates on a device. 1098 This option will let you create and manage CPUSETs which
1099 allow dynamically partitioning a system into sets of CPUs and
1100 Memory Nodes and assigning tasks to run only within those sets.
1101 This is primarily useful on large SMP or NUMA systems.
1132 1102
1133 This option only enables generic Block IO controller infrastructure. 1103 Say N if unsure.
1134 One needs to also enable actual IO controlling logic/policy. For
1135 enabling proportional weight division of disk bandwidth in CFQ, set
1136 CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
1137 CONFIG_BLK_DEV_THROTTLING=y.
1138 1104
1139 See Documentation/cgroups/blkio-controller.txt for more information. 1105config PROC_PID_CPUSET
1106 bool "Include legacy /proc/<pid>/cpuset file"
1107 depends on CPUSETS
1108 default y
1140 1109
1141config DEBUG_BLK_CGROUP 1110config CGROUP_DEVICE
1142 bool "Enable Block IO controller debugging" 1111 bool "Device controller"
1143 depends on BLK_CGROUP 1112 help
1113 Provides a cgroup controller implementing whitelists for
1114 devices which a process in the cgroup can mknod or open.
1115
1116config CGROUP_CPUACCT
1117 bool "Simple CPU accounting controller"
1118 help
1119 Provides a simple controller for monitoring the
1120 total CPU consumed by the tasks in a cgroup.
1121
1122config CGROUP_PERF
1123 bool "Perf controller"
1124 depends on PERF_EVENTS
1125 help
1126 This option extends the perf per-cpu mode to restrict monitoring
1127 to threads which belong to the cgroup specified and run on the
1128 designated cpu.
1129
1130 Say N if unsure.
1131
1132config CGROUP_DEBUG
1133 bool "Example controller"
1144 default n 1134 default n
1145 ---help--- 1135 help
1146 Enable some debugging help. Currently it exports additional stat 1136 This option enables a simple controller that exports
1147 files in a cgroup which can be useful for debugging. 1137 debugging information about the cgroups framework.
1148 1138
1149config CGROUP_WRITEBACK 1139 Say N.
1150 bool
1151 depends on MEMCG && BLK_CGROUP
1152 default y
1153 1140
1154endif # CGROUPS 1141endif # CGROUPS
1155 1142
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fe95970b1f79..c03a640ef6da 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly;
211/* Ditto for the can_fork callback. */ 211/* Ditto for the can_fork callback. */
212static unsigned long have_canfork_callback __read_mostly; 212static unsigned long have_canfork_callback __read_mostly;
213 213
214static struct file_system_type cgroup2_fs_type;
214static struct cftype cgroup_dfl_base_files[]; 215static struct cftype cgroup_dfl_base_files[];
215static struct cftype cgroup_legacy_base_files[]; 216static struct cftype cgroup_legacy_base_files[];
216 217
@@ -1623,10 +1624,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1623 all_ss = true; 1624 all_ss = true;
1624 continue; 1625 continue;
1625 } 1626 }
1626 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1627 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1628 continue;
1629 }
1630 if (!strcmp(token, "noprefix")) { 1627 if (!strcmp(token, "noprefix")) {
1631 opts->flags |= CGRP_ROOT_NOPREFIX; 1628 opts->flags |= CGRP_ROOT_NOPREFIX;
1632 continue; 1629 continue;
@@ -1693,15 +1690,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1693 return -ENOENT; 1690 return -ENOENT;
1694 } 1691 }
1695 1692
1696 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1697 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1698 if (nr_opts != 1) {
1699 pr_err("sane_behavior: no other mount options allowed\n");
1700 return -EINVAL;
1701 }
1702 return 0;
1703 }
1704
1705 /* 1693 /*
1706 * If the 'all' option was specified select all the subsystems, 1694 * If the 'all' option was specified select all the subsystems,
1707 * otherwise if 'none', 'name=' and a subsystem name options were 1695 * otherwise if 'none', 'name=' and a subsystem name options were
@@ -1981,6 +1969,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1981 int flags, const char *unused_dev_name, 1969 int flags, const char *unused_dev_name,
1982 void *data) 1970 void *data)
1983{ 1971{
1972 bool is_v2 = fs_type == &cgroup2_fs_type;
1984 struct super_block *pinned_sb = NULL; 1973 struct super_block *pinned_sb = NULL;
1985 struct cgroup_subsys *ss; 1974 struct cgroup_subsys *ss;
1986 struct cgroup_root *root; 1975 struct cgroup_root *root;
@@ -1997,6 +1986,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1997 if (!use_task_css_set_links) 1986 if (!use_task_css_set_links)
1998 cgroup_enable_task_cg_lists(); 1987 cgroup_enable_task_cg_lists();
1999 1988
1989 if (is_v2) {
1990 if (data) {
1991 pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
1992 return ERR_PTR(-EINVAL);
1993 }
1994 cgrp_dfl_root_visible = true;
1995 root = &cgrp_dfl_root;
1996 cgroup_get(&root->cgrp);
1997 goto out_mount;
1998 }
1999
2000 mutex_lock(&cgroup_mutex); 2000 mutex_lock(&cgroup_mutex);
2001 2001
2002 /* First find the desired set of subsystems */ 2002 /* First find the desired set of subsystems */
@@ -2004,15 +2004,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
2004 if (ret) 2004 if (ret)
2005 goto out_unlock; 2005 goto out_unlock;
2006 2006
2007 /* look for a matching existing root */
2008 if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
2009 cgrp_dfl_root_visible = true;
2010 root = &cgrp_dfl_root;
2011 cgroup_get(&root->cgrp);
2012 ret = 0;
2013 goto out_unlock;
2014 }
2015
2016 /* 2007 /*
2017 * Destruction of cgroup root is asynchronous, so subsystems may 2008 * Destruction of cgroup root is asynchronous, so subsystems may
2018 * still be dying after the previous unmount. Let's drain the 2009 * still be dying after the previous unmount. Let's drain the
@@ -2123,9 +2114,10 @@ out_free:
2123 2114
2124 if (ret) 2115 if (ret)
2125 return ERR_PTR(ret); 2116 return ERR_PTR(ret);
2126 2117out_mount:
2127 dentry = kernfs_mount(fs_type, flags, root->kf_root, 2118 dentry = kernfs_mount(fs_type, flags, root->kf_root,
2128 CGROUP_SUPER_MAGIC, &new_sb); 2119 is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
2120 &new_sb);
2129 if (IS_ERR(dentry) || !new_sb) 2121 if (IS_ERR(dentry) || !new_sb)
2130 cgroup_put(&root->cgrp); 2122 cgroup_put(&root->cgrp);
2131 2123
@@ -2168,6 +2160,12 @@ static struct file_system_type cgroup_fs_type = {
2168 .kill_sb = cgroup_kill_sb, 2160 .kill_sb = cgroup_kill_sb,
2169}; 2161};
2170 2162
2163static struct file_system_type cgroup2_fs_type = {
2164 .name = "cgroup2",
2165 .mount = cgroup_mount,
2166 .kill_sb = cgroup_kill_sb,
2167};
2168
2171/** 2169/**
2172 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy 2170 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
2173 * @task: target task 2171 * @task: target task
@@ -4039,7 +4037,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
4039 goto out_err; 4037 goto out_err;
4040 4038
4041 /* 4039 /*
4042 * Migrate tasks one-by-one until @form is empty. This fails iff 4040 * Migrate tasks one-by-one until @from is empty. This fails iff
4043 * ->can_attach() fails. 4041 * ->can_attach() fails.
4044 */ 4042 */
4045 do { 4043 do {
@@ -5171,7 +5169,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5171{ 5169{
5172 struct cgroup_subsys_state *css; 5170 struct cgroup_subsys_state *css;
5173 5171
5174 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 5172 pr_debug("Initializing cgroup subsys %s\n", ss->name);
5175 5173
5176 mutex_lock(&cgroup_mutex); 5174 mutex_lock(&cgroup_mutex);
5177 5175
@@ -5329,6 +5327,7 @@ int __init cgroup_init(void)
5329 5327
5330 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); 5328 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
5331 WARN_ON(register_filesystem(&cgroup_fs_type)); 5329 WARN_ON(register_filesystem(&cgroup_fs_type));
5330 WARN_ON(register_filesystem(&cgroup2_fs_type));
5332 WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); 5331 WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
5333 5332
5334 return 0; 5333 return 0;
@@ -5472,19 +5471,6 @@ static const struct file_operations proc_cgroupstats_operations = {
5472 .release = single_release, 5471 .release = single_release,
5473}; 5472};
5474 5473
5475static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
5476{
5477 if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
5478 return &ss_priv[i - CGROUP_CANFORK_START];
5479 return NULL;
5480}
5481
5482static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
5483{
5484 void **private = subsys_canfork_priv_p(ss_priv, i);
5485 return private ? *private : NULL;
5486}
5487
5488/** 5474/**
5489 * cgroup_fork - initialize cgroup related fields during copy_process() 5475 * cgroup_fork - initialize cgroup related fields during copy_process()
5490 * @child: pointer to task_struct of forking parent process. 5476 * @child: pointer to task_struct of forking parent process.
@@ -5507,14 +5493,13 @@ void cgroup_fork(struct task_struct *child)
5507 * returns an error, the fork aborts with that error code. This allows for 5493 * returns an error, the fork aborts with that error code. This allows for
5508 * a cgroup subsystem to conditionally allow or deny new forks. 5494 * a cgroup subsystem to conditionally allow or deny new forks.
5509 */ 5495 */
5510int cgroup_can_fork(struct task_struct *child, 5496int cgroup_can_fork(struct task_struct *child)
5511 void *ss_priv[CGROUP_CANFORK_COUNT])
5512{ 5497{
5513 struct cgroup_subsys *ss; 5498 struct cgroup_subsys *ss;
5514 int i, j, ret; 5499 int i, j, ret;
5515 5500
5516 for_each_subsys_which(ss, i, &have_canfork_callback) { 5501 for_each_subsys_which(ss, i, &have_canfork_callback) {
5517 ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); 5502 ret = ss->can_fork(child);
5518 if (ret) 5503 if (ret)
5519 goto out_revert; 5504 goto out_revert;
5520 } 5505 }
@@ -5526,7 +5511,7 @@ out_revert:
5526 if (j >= i) 5511 if (j >= i)
5527 break; 5512 break;
5528 if (ss->cancel_fork) 5513 if (ss->cancel_fork)
5529 ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); 5514 ss->cancel_fork(child);
5530 } 5515 }
5531 5516
5532 return ret; 5517 return ret;
@@ -5539,15 +5524,14 @@ out_revert:
5539 * This calls the cancel_fork() callbacks if a fork failed *after* 5524 * This calls the cancel_fork() callbacks if a fork failed *after*
5540 * cgroup_can_fork() succeded. 5525 * cgroup_can_fork() succeded.
5541 */ 5526 */
5542void cgroup_cancel_fork(struct task_struct *child, 5527void cgroup_cancel_fork(struct task_struct *child)
5543 void *ss_priv[CGROUP_CANFORK_COUNT])
5544{ 5528{
5545 struct cgroup_subsys *ss; 5529 struct cgroup_subsys *ss;
5546 int i; 5530 int i;
5547 5531
5548 for_each_subsys(ss, i) 5532 for_each_subsys(ss, i)
5549 if (ss->cancel_fork) 5533 if (ss->cancel_fork)
5550 ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); 5534 ss->cancel_fork(child);
5551} 5535}
5552 5536
5553/** 5537/**
@@ -5560,8 +5544,7 @@ void cgroup_cancel_fork(struct task_struct *child,
5560 * cgroup_task_iter_start() - to guarantee that the new task ends up on its 5544 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5561 * list. 5545 * list.
5562 */ 5546 */
5563void cgroup_post_fork(struct task_struct *child, 5547void cgroup_post_fork(struct task_struct *child)
5564 void *old_ss_priv[CGROUP_CANFORK_COUNT])
5565{ 5548{
5566 struct cgroup_subsys *ss; 5549 struct cgroup_subsys *ss;
5567 int i; 5550 int i;
@@ -5605,7 +5588,7 @@ void cgroup_post_fork(struct task_struct *child,
5605 * and addition to css_set. 5588 * and addition to css_set.
5606 */ 5589 */
5607 for_each_subsys_which(ss, i, &have_fork_callback) 5590 for_each_subsys_which(ss, i, &have_fork_callback)
5608 ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); 5591 ss->fork(child);
5609} 5592}
5610 5593
5611/** 5594/**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2d3df82c54f2..1b72d56edce5 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
200 * to do anything as freezer_attach() will put @task into the appropriate 200 * to do anything as freezer_attach() will put @task into the appropriate
201 * state. 201 * state.
202 */ 202 */
203static void freezer_fork(struct task_struct *task, void *private) 203static void freezer_fork(struct task_struct *task)
204{ 204{
205 struct freezer *freezer; 205 struct freezer *freezer;
206 206
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index b50d5a167fda..303097b37429 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num)
134 * 134 *
135 * This function follows the set limit. It will fail if the charge would cause 135 * This function follows the set limit. It will fail if the charge would cause
136 * the new value to exceed the hierarchical limit. Returns 0 if the charge 136 * the new value to exceed the hierarchical limit. Returns 0 if the charge
137 * succeded, otherwise -EAGAIN. 137 * succeeded, otherwise -EAGAIN.
138 */ 138 */
139static int pids_try_charge(struct pids_cgroup *pids, int num) 139static int pids_try_charge(struct pids_cgroup *pids, int num)
140{ 140{
@@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
209 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies 209 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
210 * on threadgroup_change_begin() held by the copy_process(). 210 * on threadgroup_change_begin() held by the copy_process().
211 */ 211 */
212static int pids_can_fork(struct task_struct *task, void **priv_p) 212static int pids_can_fork(struct task_struct *task)
213{ 213{
214 struct cgroup_subsys_state *css; 214 struct cgroup_subsys_state *css;
215 struct pids_cgroup *pids; 215 struct pids_cgroup *pids;
@@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p)
219 return pids_try_charge(pids, 1); 219 return pids_try_charge(pids, 1);
220} 220}
221 221
222static void pids_cancel_fork(struct task_struct *task, void *priv) 222static void pids_cancel_fork(struct task_struct *task)
223{ 223{
224 struct cgroup_subsys_state *css; 224 struct cgroup_subsys_state *css;
225 struct pids_cgroup *pids; 225 struct pids_cgroup *pids;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02a8ea5c9963..3e945fcd8179 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -51,6 +51,7 @@
51#include <linux/stat.h> 51#include <linux/stat.h>
52#include <linux/string.h> 52#include <linux/string.h>
53#include <linux/time.h> 53#include <linux/time.h>
54#include <linux/time64.h>
54#include <linux/backing-dev.h> 55#include <linux/backing-dev.h>
55#include <linux/sort.h> 56#include <linux/sort.h>
56 57
@@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
68struct fmeter { 69struct fmeter {
69 int cnt; /* unprocessed events count */ 70 int cnt; /* unprocessed events count */
70 int val; /* most recent output value */ 71 int val; /* most recent output value */
71 time_t time; /* clock (secs) when val computed */ 72 time64_t time; /* clock (secs) when val computed */
72 spinlock_t lock; /* guards read or write of above */ 73 spinlock_t lock; /* guards read or write of above */
73}; 74};
74 75
@@ -1374,7 +1375,7 @@ out:
1374 */ 1375 */
1375 1376
1376#define FM_COEF 933 /* coefficient for half-life of 10 secs */ 1377#define FM_COEF 933 /* coefficient for half-life of 10 secs */
1377#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ 1378#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
1378#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ 1379#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
1379#define FM_SCALE 1000 /* faux fixed point scale */ 1380#define FM_SCALE 1000 /* faux fixed point scale */
1380 1381
@@ -1390,8 +1391,11 @@ static void fmeter_init(struct fmeter *fmp)
1390/* Internal meter update - process cnt events and update value */ 1391/* Internal meter update - process cnt events and update value */
1391static void fmeter_update(struct fmeter *fmp) 1392static void fmeter_update(struct fmeter *fmp)
1392{ 1393{
1393 time_t now = get_seconds(); 1394 time64_t now;
1394 time_t ticks = now - fmp->time; 1395 u32 ticks;
1396
1397 now = ktime_get_seconds();
1398 ticks = now - fmp->time;
1395 1399
1396 if (ticks == 0) 1400 if (ticks == 0)
1397 return; 1401 return;
diff --git a/kernel/fork.c b/kernel/fork.c
index 291b08cc817b..6774e6b2e96d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1250,7 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1250{ 1250{
1251 int retval; 1251 int retval;
1252 struct task_struct *p; 1252 struct task_struct *p;
1253 void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
1254 1253
1255 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1254 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1256 return ERR_PTR(-EINVAL); 1255 return ERR_PTR(-EINVAL);
@@ -1527,7 +1526,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1527 * between here and cgroup_post_fork() if an organisation operation is in 1526 * between here and cgroup_post_fork() if an organisation operation is in
1528 * progress. 1527 * progress.
1529 */ 1528 */
1530 retval = cgroup_can_fork(p, cgrp_ss_priv); 1529 retval = cgroup_can_fork(p);
1531 if (retval) 1530 if (retval)
1532 goto bad_fork_free_pid; 1531 goto bad_fork_free_pid;
1533 1532
@@ -1609,7 +1608,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1609 write_unlock_irq(&tasklist_lock); 1608 write_unlock_irq(&tasklist_lock);
1610 1609
1611 proc_fork_connector(p); 1610 proc_fork_connector(p);
1612 cgroup_post_fork(p, cgrp_ss_priv); 1611 cgroup_post_fork(p);
1613 threadgroup_change_end(current); 1612 threadgroup_change_end(current);
1614 perf_event_fork(p); 1613 perf_event_fork(p);
1615 1614
@@ -1619,7 +1618,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1619 return p; 1618 return p;
1620 1619
1621bad_fork_cancel_cgroup: 1620bad_fork_cancel_cgroup:
1622 cgroup_cancel_fork(p, cgrp_ss_priv); 1621 cgroup_cancel_fork(p);
1623bad_fork_free_pid: 1622bad_fork_free_pid:
1624 if (pid != &init_struct_pid) 1623 if (pid != &init_struct_pid)
1625 free_pid(pid); 1624 free_pid(pid);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 77d97a6fc715..44253adb3c36 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8342,7 +8342,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
8342 sched_offline_group(tg); 8342 sched_offline_group(tg);
8343} 8343}
8344 8344
8345static void cpu_cgroup_fork(struct task_struct *task, void *private) 8345static void cpu_cgroup_fork(struct task_struct *task)
8346{ 8346{
8347 sched_move_task(task); 8347 sched_move_task(task);
8348} 8348}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fc10620967c7..14cb1db4c52b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4813,7 +4813,7 @@ static void mem_cgroup_clear_mc(void)
4813static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 4813static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
4814{ 4814{
4815 struct cgroup_subsys_state *css; 4815 struct cgroup_subsys_state *css;
4816 struct mem_cgroup *memcg; 4816 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
4817 struct mem_cgroup *from; 4817 struct mem_cgroup *from;
4818 struct task_struct *leader, *p; 4818 struct task_struct *leader, *p;
4819 struct mm_struct *mm; 4819 struct mm_struct *mm;