author	Linus Torvalds <torvalds@linux-foundation.org>	2014-06-09 18:03:33 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-06-09 18:03:33 -0400
commit	14208b0ec56919f5333dd654b1a7d10765d0ad05 (patch)
tree	474b46c351efced45925d15dc2e0049c49784716
parent	6ea4fa70e4af0da8b133b246458fb789d8cb3985 (diff)
parent	c731ae1d0f02a300697a8b1564780ad28a6c2013 (diff)
Merge branch 'for-3.16' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
 "A lot of activities on cgroup side. Heavy restructuring including
  locking simplification took place to improve the code base and enable
  implementation of the unified hierarchy, which currently exists behind
  a __DEVEL__ mount option. The core support is mostly complete but
  individual controllers need further work. To explain the design and
  rationales of the unified hierarchy,
  Documentation/cgroups/unified-hierarchy.txt is added.

  Another notable change is css (cgroup_subsys_state - what each
  controller uses to identify and interact with a cgroup) iteration
  update. This is part of continuing updates on css object lifetime and
  visibility. cgroup started with reference count draining on removal
  way back and is now reaching a point where csses behave and are
  iterated like normal refcnted objects albeit with some complexities to
  allow distinguishing the state where they're being deleted. The css
  iteration update isn't taken advantage of yet but is planned to be
  used to simplify memcg significantly"

* 'for-3.16' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (77 commits)
  cgroup: disallow disabled controllers on the default hierarchy
  cgroup: don't destroy the default root
  cgroup: disallow debug controller on the default hierarchy
  cgroup: clean up MAINTAINERS entries
  cgroup: implement css_tryget()
  device_cgroup: use css_has_online_children() instead of has_children()
  cgroup: convert cgroup_has_live_children() into css_has_online_children()
  cgroup: use CSS_ONLINE instead of CGRP_DEAD
  cgroup: iterate cgroup_subsys_states directly
  cgroup: introduce CSS_RELEASED and reduce css iteration fallback window
  cgroup: move cgroup->serial_nr into cgroup_subsys_state
  cgroup: link all cgroup_subsys_states in their sibling lists
  cgroup: move cgroup->sibling and ->children into cgroup_subsys_state
  cgroup: remove cgroup->parent
  device_cgroup: remove direct access to cgroup->children
  memcg: update memcg_has_children() to use css_next_child()
  memcg: remove tasks/children test from mem_cgroup_force_empty()
  cgroup: remove css_parent()
  cgroup: skip refcnting on normal root csses and cgrp_dfl_root self css
  cgroup: use cgroup->self.refcnt for cgroup refcnting
  ...
-rw-r--r--  Documentation/cgroups/memory.txt                 6
-rw-r--r--  Documentation/cgroups/unified-hierarchy.txt    359
-rw-r--r--  MAINTAINERS                                     47
-rw-r--r--  block/bio.c                                      2
-rw-r--r--  block/blk-cgroup.c                               2
-rw-r--r--  block/blk-cgroup.h                               2
-rw-r--r--  block/blk-throttle.c                            32
-rw-r--r--  block/cfq-iosched.c                             28
-rw-r--r--  include/linux/cgroup.h                         272
-rw-r--r--  include/linux/cgroup_subsys.h                   11
-rw-r--r--  kernel/cgroup.c                               1825
-rw-r--r--  kernel/cgroup_freezer.c                         26
-rw-r--r--  kernel/cpuset.c                                 46
-rw-r--r--  kernel/events/core.c                             3
-rw-r--r--  kernel/sched/core.c                              2
-rw-r--r--  kernel/sched/cpuacct.c                           2
-rw-r--r--  mm/hugetlb_cgroup.c                             37
-rw-r--r--  mm/memcontrol.c                                188
-rw-r--r--  net/core/netclassid_cgroup.c                     2
-rw-r--r--  net/core/netprio_cgroup.c                       14
-rw-r--r--  net/ipv4/tcp_memcontrol.c                       31
-rw-r--r--  security/device_cgroup.c                        33
22 files changed, 1966 insertions, 1004 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index b3429aec444c..02ab997a1ed2 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -458,15 +458,11 @@ About use_hierarchy, see Section 6.
 
 5.1 force_empty
   memory.force_empty interface is provided to make cgroup's memory usage empty.
-  You can use this interface only when the cgroup has no tasks.
   When writing anything to this
 
   # echo 0 > memory.force_empty
 
-  Almost all pages tracked by this memory cgroup will be unmapped and freed.
-  Some pages cannot be freed because they are locked or in-use. Such pages are
-  moved to parent (if use_hierarchy==1) or root (if use_hierarchy==0) and this
-  cgroup will be empty.
+  the cgroup will be reclaimed and as many pages reclaimed as possible.
 
   The typical use case for this interface is before calling rmdir().
   Because rmdir() moves all pages to parent, some out-of-use page caches can be
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
new file mode 100644
index 000000000000..324b182e6000
--- /dev/null
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -0,0 +1,359 @@

Cgroup unified hierarchy

April, 2014		Tejun Heo <tj@kernel.org>

This document describes the changes made by unified hierarchy and
their rationales. It will eventually be merged into the main cgroup
documentation.

CONTENTS

1. Background
2. Basic Operation
  2-1. Mounting
  2-2. cgroup.subtree_control
  2-3. cgroup.controllers
3. Structural Constraints
  3-1. Top-down
  3-2. No internal tasks
4. Other Changes
  4-1. [Un]populated Notification
  4-2. Other Core Changes
  4-3. Per-Controller Changes
    4-3-1. blkio
    4-3-2. cpuset
    4-3-3. memory
5. Planned Changes
  5-1. CAP for resource control


1. Background

cgroup allows an arbitrary number of hierarchies and each hierarchy
can host any number of controllers. While this seems to provide a
high level of flexibility, it isn't quite useful in practice.

For example, as there is only one instance of each controller, utility
type controllers such as freezer which can be useful in all
hierarchies can only be used in one. The issue is exacerbated by the
fact that controllers can't be moved around once hierarchies are
populated. Another issue is that all controllers bound to a hierarchy
are forced to have exactly the same view of the hierarchy. It isn't
possible to vary the granularity depending on the specific controller.

In practice, these issues heavily limit which controllers can be put
on the same hierarchy and most configurations resort to putting each
controller on its own hierarchy. Only closely related ones, such as
the cpu and cpuacct controllers, make sense to put on the same
hierarchy. This often means that userland ends up managing multiple
similar hierarchies repeating the same steps on each hierarchy
whenever a hierarchy management operation is necessary.

Unfortunately, support for multiple hierarchies comes at a steep cost.
Internal implementation in cgroup core proper is dazzlingly
complicated but more importantly the support for multiple hierarchies
restricts how cgroup is used in general and what controllers can do.

There's no limit on how many hierarchies there may be, which means
that a task's cgroup membership can't be described in finite length.
The key may contain any varying number of entries and is unlimited in
length, which makes it highly awkward to handle and leads to addition
of controllers which exist only to identify membership, which in turn
exacerbates the original problem.

Also, as a controller can't have any expectation regarding what shape
of hierarchies other controllers would be on, each controller has to
assume that all other controllers are operating on completely
orthogonal hierarchies. This makes it impossible, or at least very
cumbersome, for controllers to cooperate with each other.

In most use cases, putting controllers on hierarchies which are
completely orthogonal to each other isn't necessary. What usually is
called for is the ability to have differing levels of granularity
depending on the specific controller. In other words, hierarchy may
be collapsed from leaf towards root when viewed from specific
controllers. For example, a given configuration might not care about
how memory is distributed beyond a certain level while still wanting
to control how CPU cycles are distributed.

Unified hierarchy is the next version of cgroup interface. It aims to
address the aforementioned issues by having more structure while
retaining enough flexibility for most use cases. Various other
general and controller-specific interface issues are also addressed in
the process.


2. Basic Operation

2-1. Mounting

Currently, unified hierarchy can be mounted with the following mount
command. Note that this is still under development and scheduled to
change soon.

 mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT

All controllers which are not bound to other hierarchies are
automatically bound to unified hierarchy and show up at the root of
it. Controllers which are enabled only in the root of unified
hierarchy can be bound to other hierarchies at any time. This allows
mixing unified hierarchy with the traditional multiple hierarchies in
a fully backward compatible way.
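
For illustration only (the mount points and the use of the cpu
controller below are assumptions, not prescribed by this document),
the two kinds of hierarchies can coexist like this:

  # mount -t cgroup -o __DEVEL__sane_behavior cgroup /sys/fs/cgroup/unified
  # mount -t cgroup -o cpu cgroup /sys/fs/cgroup/cpu

After the second mount, the cpu controller is bound to its own
hierarchy and no longer appears in the unified root's
"cgroup.controllers" until that hierarchy is unmounted again.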


2-2. cgroup.subtree_control

All cgroups on unified hierarchy have a "cgroup.subtree_control" file
which governs which controllers are enabled on the children of the
cgroup. Let's assume a hierarchy like the following.

  root - A - B - C
               \ D

root's "cgroup.subtree_control" file determines which controllers are
enabled on A. A's on B. B's on C and D. This coincides with the
fact that controllers on the immediate sub-level are used to
distribute the resources of the parent. In fact, it's natural to
assume that resource control knobs of a child belong to its parent.
Enabling a controller in a "cgroup.subtree_control" file declares that
distribution of the respective resources of the cgroup will be
controlled. Note that this means that controller enable states are
shared among siblings.

When read, the file contains a space-separated list of currently
enabled controllers. A write to the file should contain a
space-separated list of controllers with '+' or '-' prefixed (without
the quotes). Controllers prefixed with '+' are enabled and '-'
disabled. If a controller is listed multiple times, the last entry
wins. The specific operations are executed atomically - either all
succeed or fail.
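
As a hypothetical session (it assumes the cgroup A from the diagram
exists and that the memory and blkio controllers are available), a
write could look like this:

  # cat $MOUNT_POINT/A/cgroup.subtree_control
  memory
  # echo "+blkio -memory" > $MOUNT_POINT/A/cgroup.subtree_control
  # cat $MOUNT_POINT/A/cgroup.subtree_control
  blkio

Because the write is atomic, if enabling blkio failed, memory would
also stay enabled.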


2-3. cgroup.controllers

Read-only "cgroup.controllers" file contains a space-separated list of
controllers which can be enabled in the cgroup's
"cgroup.subtree_control" file.

In the root cgroup, this lists controllers which are not bound to
other hierarchies and the content changes as controllers are bound to
and unbound from other hierarchies.

In non-root cgroups, the content of this file equals that of the
parent's "cgroup.subtree_control" file as only controllers enabled
from the parent can be used in its children.
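
A short illustrative session (controller names are assumptions) shows
the relationship between the two files:

  # cat $MOUNT_POINT/cgroup.controllers
  blkio memory
  # mkdir $MOUNT_POINT/A
  # echo +memory > $MOUNT_POINT/cgroup.subtree_control
  # cat $MOUNT_POINT/A/cgroup.controllers
  memory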


3. Structural Constraints

3-1. Top-down

As it doesn't make sense to nest control of an uncontrolled resource,
all non-root "cgroup.subtree_control" files can only contain
controllers which are enabled in the parent's "cgroup.subtree_control"
file. A controller can be enabled only if the parent has the
controller enabled and a controller can't be disabled if one or more
children have it enabled.
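
Using the root - A - B chain from section 2-2, a hypothetical sequence
(the memory controller is an assumption, and A and B are assumed to
contain no tasks of their own) plays out as follows:

  # echo +memory > $MOUNT_POINT/A/B/cgroup.subtree_control   (rejected: A hasn't enabled memory)
  # echo +memory > $MOUNT_POINT/cgroup.subtree_control
  # echo +memory > $MOUNT_POINT/A/cgroup.subtree_control
  # echo +memory > $MOUNT_POINT/A/B/cgroup.subtree_control   (succeeds)
  # echo -memory > $MOUNT_POINT/A/cgroup.subtree_control     (rejected: B still has it enabled)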


3-2. No internal tasks

One long-standing issue that cgroup faces is the competition between
tasks belonging to the parent cgroup and its children cgroups. This
is inherently nasty as two different types of entities compete and
there is no agreed-upon obvious way to handle it. Different
controllers are doing different things.

The cpu controller considers tasks and cgroups as equivalents and maps
nice levels to cgroup weights. This works for some cases but falls
flat when children should be allocated specific ratios of CPU cycles
and the number of internal tasks fluctuates - the ratios constantly
change as the number of competing entities fluctuates. There also are
other issues. The mapping from nice level to weight isn't obvious or
universal, and there are various other knobs which simply aren't
available for tasks.

The blkio controller implicitly creates a hidden leaf node for each
cgroup to host the tasks. The hidden leaf has its own copies of all
the knobs with "leaf_" prefixed. While this allows equivalent control
over internal tasks, it's with serious drawbacks. It always adds an
extra layer of nesting which may not be necessary, makes the interface
messy and significantly complicates the implementation.

The memory controller currently doesn't have a way to control what
happens between internal tasks and child cgroups and the behavior is
not clearly defined. There have been attempts to add ad-hoc behaviors
and knobs to tailor the behavior to specific workloads. Continuing
this direction will lead to problems which will be extremely difficult
to resolve in the long term.

Multiple controllers struggle with internal tasks and came up with
different ways to deal with it; unfortunately, all the approaches in
use now are severely flawed and, furthermore, the widely different
behaviors make cgroup as a whole highly inconsistent.

It is clear that this is something which needs to be addressed from
cgroup core proper in a uniform way so that controllers don't need to
worry about it and cgroup as a whole shows a consistent and logical
behavior. To achieve that, unified hierarchy enforces the following
structural constraint:

  Except for the root, only cgroups which don't contain any task may
  have controllers enabled in their "cgroup.subtree_control" files.

Combined with other properties, this guarantees that, when a
controller is looking at the part of the hierarchy which has it
enabled, tasks are always only on the leaves. This rules out
situations where child cgroups compete against internal tasks of the
parent.

There are two things to note. Firstly, the root cgroup is exempt from
the restriction. Root contains tasks and anonymous resource
consumption which can't be associated with any other cgroup and
requires special treatment from most controllers. How resource
consumption in the root cgroup is governed is up to each controller.

Secondly, the restriction doesn't take effect if there is no enabled
controller in the cgroup's "cgroup.subtree_control" file. This is
important as otherwise it wouldn't be possible to create children of a
populated cgroup. To control resource distribution of a cgroup, the
cgroup must create children and transfer all its tasks to the children
before enabling controllers in its "cgroup.subtree_control" file.
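
The following is only a sketch of that workflow (cgroup names and the
memory controller are made up for the example); the important part is
the ordering - the tasks leave A before any controller is enabled
there:

  # mkdir $MOUNT_POINT/A/leaf
  # for p in $(cat $MOUNT_POINT/A/cgroup.procs); do echo $p > $MOUNT_POINT/A/leaf/cgroup.procs; done
  # echo +memory > $MOUNT_POINT/A/cgroup.subtree_control

If A still contained tasks, the final write would be rejected.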


4. Other Changes

4-1. [Un]populated Notification

cgroup users often need a way to determine when a cgroup's
subhierarchy becomes empty so that it can be cleaned up. cgroup
currently provides release_agent for it; unfortunately, this mechanism
is riddled with issues.

- It delivers events by forking and execing a userland binary
  specified as the release_agent. This is a long deprecated method of
  notification delivery. It's extremely heavy, slow and cumbersome to
  integrate with larger infrastructure.

- There is a single monitoring point at the root. There's no way to
  delegate management of a subtree.

- The event isn't recursive. It triggers when a cgroup doesn't have
  any tasks or child cgroups. Events for internal nodes trigger only
  after all children are removed. This again makes it impossible to
  delegate management of a subtree.

- Events are filtered from the kernel side. A "notify_on_release"
  file is used to subscribe to or suppress release events. This is
  unnecessarily complicated and probably done this way because event
  delivery itself was expensive.

Unified hierarchy implements an interface file "cgroup.populated"
which can be used to monitor whether the cgroup's subhierarchy has
tasks in it or not. Its value is 0 if there is no task in the cgroup
and its descendants; otherwise, 1. poll and [id]notify events are
triggered when the value changes.
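
A monitoring sketch (illustrative only; inotifywait comes from the
external inotify-tools package and the exact event type is an
assumption):

  # cat $MOUNT_POINT/A/cgroup.populated
  1
  # inotifywait -e modify $MOUNT_POINT/A/cgroup.populated
  # cat $MOUNT_POINT/A/cgroup.populated
  0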

This is significantly lighter and simpler and trivially allows
delegating management of subhierarchy - subhierarchy monitoring can
block further propagation simply by putting itself or another process
in the subhierarchy and monitor events that it's interested in from
there without interfering with monitoring higher in the tree.

In unified hierarchy, the release_agent mechanism is no longer
supported and the interface files "release_agent" and
"notify_on_release" do not exist.


4-2. Other Core Changes

- None of the mount options is allowed.

- remount is disallowed.

- rename(2) is disallowed.

- The "tasks" file is removed. Everything should be at process
  granularity. Use the "cgroup.procs" file instead.

- The "cgroup.procs" file is not sorted. pids will be unique unless
  they got recycled in-between reads.

- The "cgroup.clone_children" file is removed.


4-3. Per-Controller Changes

4-3-1. blkio

- blk-throttle becomes properly hierarchical.


4-3-2. cpuset

- Tasks are kept in empty cpusets after hotplug and take on the masks
  of the nearest non-empty ancestor, instead of being moved to it.

- A task can be moved into an empty cpuset, and again it takes on the
  masks of the nearest non-empty ancestor.


4-3-3. memory

- use_hierarchy is on by default and the cgroup file for the flag is
  not created.


5. Planned Changes

5-1. CAP for resource control

Unified hierarchy will require one of the capabilities(7), which is
yet to be decided, for all resource control related knobs. Process
organization operations - creation of sub-cgroups and migration of
processes in sub-hierarchies may be delegated by changing the
ownership and/or permissions on the cgroup directory and
"cgroup.procs" interface file; however, all operations which affect
resource control - writes to a "cgroup.subtree_control" file or any
controller-specific knobs - will require an explicit CAP privilege.

This, in part, is to prevent the cgroup interface from being
inadvertently promoted to programmable API used by non-privileged
binaries. cgroup exposes various aspects of the system in ways which
aren't properly abstracted for direct consumption by regular programs.
This is an administration interface much closer to sysctl knobs than
system calls. Even the basic access model, being filesystem path
based, isn't suitable for direct consumption. There's no way to
access "my cgroup" in a race-free way or make multiple operations
atomic against migration to another cgroup.

Another aspect is that, for better or for worse, the cgroup interface
goes through far less scrutiny than regular interfaces for
unprivileged userland. The upside is that cgroup is able to expose
useful features which may not be suitable for general consumption in a
reasonable time frame. It provides a relatively short path between
internal details and userland-visible interface. Of course, this
shortcut comes with high risk. We go through what we go through for
general kernel APIs for good reasons. It may end up leaking internal
details in a way which can exert significant pain by locking the
kernel into a contract that can't be maintained in a reasonable
manner.

Also, due to the specific nature, cgroup and its controllers don't
tend to attract attention from a wide scope of developers. cgroup's
short history is already fraught with severely mis-designed
interfaces, unnecessary commitments to and exposing of internal
details, broken and dangerous implementations of various features.

Keeping cgroup as an administration interface is both advantageous for
its role and imperative given its nature. Some of the cgroup features
may make sense for unprivileged access. If deemed justified, those
must be further abstracted and implemented as a different interface,
be it a system call or process-private filesystem, and survive through
the scrutiny that any interface for general consumption is required to
go through.

Requiring CAP is not a complete solution but should serve as a
significant deterrent against spraying cgroup usages in non-privileged
programs.
diff --git a/MAINTAINERS b/MAINTAINERS
index 1b22565c59ac..0fbd4a04407b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2384,16 +2384,35 @@ L: netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/connector/
 
-CONTROL GROUPS (CGROUPS)
+CONTROL GROUP (CGROUP)
 M:	Tejun Heo <tj@kernel.org>
 M:	Li Zefan <lizefan@huawei.com>
-L:	containers@lists.linux-foundation.org
 L:	cgroups@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:	Maintained
+F:	Documentation/cgroups/
 F:	include/linux/cgroup*
 F:	kernel/cgroup*
-F:	mm/*cgroup*
+
+CONTROL GROUP - CPUSET
+M:	Li Zefan <lizefan@huawei.com>
+L:	cgroups@vger.kernel.org
+W:	http://www.bullopensource.org/cpuset/
+W:	http://oss.sgi.com/projects/cpusets/
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
+S:	Maintained
+F:	Documentation/cgroups/cpusets.txt
+F:	include/linux/cpuset.h
+F:	kernel/cpuset.c
+
+CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
+M:	Johannes Weiner <hannes@cmpxchg.org>
+M:	Michal Hocko <mhocko@suse.cz>
+L:	cgroups@vger.kernel.org
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	mm/memcontrol.c
+F:	mm/page_cgroup.c
 
 CORETEMP HARDWARE MONITORING DRIVER
 M:	Fenghua Yu <fenghua.yu@intel.com>
@@ -2464,17 +2483,6 @@ M: Thomas Renninger <trenn@suse.de>
 S:	Maintained
 F:	tools/power/cpupower/
 
-CPUSETS
-M:	Li Zefan <lizefan@huawei.com>
-L:	cgroups@vger.kernel.org
-W:	http://www.bullopensource.org/cpuset/
-W:	http://oss.sgi.com/projects/cpusets/
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
-S:	Maintained
-F:	Documentation/cgroups/cpusets.txt
-F:	include/linux/cpuset.h
-F:	kernel/cpuset.c
-
 CRAMFS FILESYSTEM
 W:	http://sourceforge.net/projects/cramfs/
 S:	Orphan / Obsolete
@@ -5757,17 +5765,6 @@ F: include/linux/memory_hotplug.h
 F:	include/linux/vmalloc.h
 F:	mm/
 
-MEMORY RESOURCE CONTROLLER
-M:	Johannes Weiner <hannes@cmpxchg.org>
-M:	Michal Hocko <mhocko@suse.cz>
-M:	Balbir Singh <bsingharora@gmail.com>
-M:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
-L:	cgroups@vger.kernel.org
-L:	linux-mm@kvack.org
-S:	Maintained
-F:	mm/memcontrol.c
-F:	mm/page_cgroup.c
-
 MEMORY TECHNOLOGY DEVICES (MTD)
 M:	David Woodhouse <dwmw2@infradead.org>
 M:	Brian Norris <computersforpeace@gmail.com>
diff --git a/block/bio.c b/block/bio.c
index 96d28eee8a1e..1ba33657160f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1971,7 +1971,7 @@ int bio_associate_current(struct bio *bio)
 	/* associate blkcg if exists */
 	rcu_read_lock();
 	css = task_css(current, blkio_cgrp_id);
-	if (css && css_tryget(css))
+	if (css && css_tryget_online(css))
 		bio->bi_css = css;
 	rcu_read_unlock();
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1039fb9ff5f5..9f5bce33e6fe 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -185,7 +185,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	lockdep_assert_held(q->queue_lock);
 
 	/* blkg holds a reference to blkcg */
-	if (!css_tryget(&blkcg->css)) {
+	if (!css_tryget_online(&blkcg->css)) {
 		ret = -EINVAL;
 		goto err_free_blkg;
 	}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 371fe8e92ab5..d692b29c083a 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -204,7 +204,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
  */
 static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
 {
-	return css_to_blkcg(css_parent(&blkcg->css));
+	return css_to_blkcg(blkcg->css.parent);
 }
 
 /**
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9353b4683359..3fdb21a390c1 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1346,10 +1346,10 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
 	return 0;
 }
 
-static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,
-		       const char *buf, bool is_u64)
+static ssize_t tg_set_conf(struct kernfs_open_file *of,
+			   char *buf, size_t nbytes, loff_t off, bool is_u64)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
 	struct throtl_service_queue *sq;
@@ -1368,9 +1368,9 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,
 		ctx.v = -1;
 
 	if (is_u64)
-		*(u64 *)((void *)tg + cft->private) = ctx.v;
+		*(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
 	else
-		*(unsigned int *)((void *)tg + cft->private) = ctx.v;
+		*(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
 
 	throtl_log(&tg->service_queue,
 		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
@@ -1404,19 +1404,19 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 
 	blkg_conf_finish(&ctx);
-	return 0;
+	return nbytes;
 }
 
-static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft,
-			   char *buf)
+static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
+			       char *buf, size_t nbytes, loff_t off)
 {
-	return tg_set_conf(css, cft, buf, true);
+	return tg_set_conf(of, buf, nbytes, off, true);
 }
 
-static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft,
-			    char *buf)
+static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
 {
-	return tg_set_conf(css, cft, buf, false);
+	return tg_set_conf(of, buf, nbytes, off, false);
 }
 
 static struct cftype throtl_files[] = {
@@ -1424,25 +1424,25 @@ static struct cftype throtl_files[] = {
 		.name = "throttle.read_bps_device",
 		.private = offsetof(struct throtl_grp, bps[READ]),
 		.seq_show = tg_print_conf_u64,
-		.write_string = tg_set_conf_u64,
+		.write = tg_set_conf_u64,
 	},
 	{
 		.name = "throttle.write_bps_device",
 		.private = offsetof(struct throtl_grp, bps[WRITE]),
 		.seq_show = tg_print_conf_u64,
-		.write_string = tg_set_conf_u64,
+		.write = tg_set_conf_u64,
 	},
 	{
 		.name = "throttle.read_iops_device",
 		.private = offsetof(struct throtl_grp, iops[READ]),
 		.seq_show = tg_print_conf_uint,
-		.write_string = tg_set_conf_uint,
+		.write = tg_set_conf_uint,
 	},
 	{
 		.name = "throttle.write_iops_device",
 		.private = offsetof(struct throtl_grp, iops[WRITE]),
 		.seq_show = tg_print_conf_uint,
-		.write_string = tg_set_conf_uint,
+		.write = tg_set_conf_uint,
 	},
 	{
 		.name = "throttle.io_service_bytes",
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 22dffebc7c73..cadc37841744 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1670,11 +1670,11 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
 	return 0;
 }
 
-static int __cfqg_set_weight_device(struct cgroup_subsys_state *css,
-				    struct cftype *cft, const char *buf,
+static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
+					 char *buf, size_t nbytes, loff_t off,
 					 bool is_leaf_weight)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
 	struct cfq_group *cfqg;
 	int ret;
@@ -1697,19 +1697,19 @@ static int __cfqg_set_weight_device(struct cgroup_subsys_state *css,
 	}
 
 	blkg_conf_finish(&ctx);
-	return ret;
+	return ret ?: nbytes;
 }
 
-static int cfqg_set_weight_device(struct cgroup_subsys_state *css,
-				  struct cftype *cft, char *buf)
+static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
+				      char *buf, size_t nbytes, loff_t off)
 {
-	return __cfqg_set_weight_device(css, cft, buf, false);
+	return __cfqg_set_weight_device(of, buf, nbytes, off, false);
 }
 
-static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css,
-					struct cftype *cft, char *buf)
+static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
+					   char *buf, size_t nbytes, loff_t off)
 {
-	return __cfqg_set_weight_device(css, cft, buf, true);
+	return __cfqg_set_weight_device(of, buf, nbytes, off, true);
 }
 
 static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -1837,7 +1837,7 @@ static struct cftype cfq_blkcg_files[] = {
 		.name = "weight_device",
 		.flags = CFTYPE_ONLY_ON_ROOT,
 		.seq_show = cfqg_print_leaf_weight_device,
-		.write_string = cfqg_set_leaf_weight_device,
+		.write = cfqg_set_leaf_weight_device,
 	},
 	{
 		.name = "weight",
@@ -1851,7 +1851,7 @@ static struct cftype cfq_blkcg_files[] = {
 		.name = "weight_device",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cfqg_print_weight_device,
-		.write_string = cfqg_set_weight_device,
+		.write = cfqg_set_weight_device,
 	},
 	{
 		.name = "weight",
@@ -1863,7 +1863,7 @@ static struct cftype cfq_blkcg_files[] = {
 	{
 		.name = "leaf_weight_device",
 		.seq_show = cfqg_print_leaf_weight_device,
-		.write_string = cfqg_set_leaf_weight_device,
+		.write = cfqg_set_leaf_weight_device,
 	},
 	{
 		.name = "leaf_weight",
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bddebc5cf8e7..8a111dd42d7a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -21,6 +21,7 @@
21#include <linux/percpu-refcount.h> 21#include <linux/percpu-refcount.h>
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/kernfs.h> 23#include <linux/kernfs.h>
24#include <linux/wait.h>
24 25
25#ifdef CONFIG_CGROUPS 26#ifdef CONFIG_CGROUPS
26 27
@@ -47,21 +48,45 @@ enum cgroup_subsys_id {
47}; 48};
48#undef SUBSYS 49#undef SUBSYS
49 50
50/* Per-subsystem/per-cgroup state maintained by the system. */ 51/*
52 * Per-subsystem/per-cgroup state maintained by the system. This is the
53 * fundamental structural building block that controllers deal with.
54 *
55 * Fields marked with "PI:" are public and immutable and may be accessed
56 * directly without synchronization.
57 */
51struct cgroup_subsys_state { 58struct cgroup_subsys_state {
52 /* the cgroup that this css is attached to */ 59 /* PI: the cgroup that this css is attached to */
53 struct cgroup *cgroup; 60 struct cgroup *cgroup;
54 61
55 /* the cgroup subsystem that this css is attached to */ 62 /* PI: the cgroup subsystem that this css is attached to */
56 struct cgroup_subsys *ss; 63 struct cgroup_subsys *ss;
57 64
58 /* reference count - access via css_[try]get() and css_put() */ 65 /* reference count - access via css_[try]get() and css_put() */
59 struct percpu_ref refcnt; 66 struct percpu_ref refcnt;
60 67
61 /* the parent css */ 68 /* PI: the parent css */
62 struct cgroup_subsys_state *parent; 69 struct cgroup_subsys_state *parent;
63 70
64 unsigned long flags; 71 /* siblings list anchored at the parent's ->children */
72 struct list_head sibling;
73 struct list_head children;
74
75 /*
76 * PI: Subsys-unique ID. 0 is unused and root is always 1. The
77 * matching css can be looked up using css_from_id().
78 */
79 int id;
80
81 unsigned int flags;
82
83 /*
84 * Monotonically increasing unique serial number which defines a
85 * uniform order among all csses. It's guaranteed that all
86 * ->children lists are in the ascending order of ->serial_nr and
87 * used to allow interrupting and resuming iterations.
88 */
89 u64 serial_nr;
65 90
66 /* percpu_ref killing and RCU release */ 91 /* percpu_ref killing and RCU release */
67 struct rcu_head rcu_head; 92 struct rcu_head rcu_head;
@@ -70,8 +95,9 @@ struct cgroup_subsys_state {
70 95
71/* bits in struct cgroup_subsys_state flags field */ 96/* bits in struct cgroup_subsys_state flags field */
72enum { 97enum {
73 CSS_ROOT = (1 << 0), /* this CSS is the root of the subsystem */ 98 CSS_NO_REF = (1 << 0), /* no reference counting for this css */
74 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ 99 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
100 CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */
75}; 101};
76 102
77/** 103/**
@@ -82,8 +108,7 @@ enum {
82 */ 108 */
83static inline void css_get(struct cgroup_subsys_state *css) 109static inline void css_get(struct cgroup_subsys_state *css)
84{ 110{
85 /* We don't need to reference count the root state */ 111 if (!(css->flags & CSS_NO_REF))
86 if (!(css->flags & CSS_ROOT))
87 percpu_ref_get(&css->refcnt); 112 percpu_ref_get(&css->refcnt);
88} 113}
89 114
@@ -91,35 +116,51 @@ static inline void css_get(struct cgroup_subsys_state *css)
91 * css_tryget - try to obtain a reference on the specified css 116 * css_tryget - try to obtain a reference on the specified css
92 * @css: target css 117 * @css: target css
93 * 118 *
94 * Obtain a reference on @css if it's alive. The caller naturally needs to 119 * Obtain a reference on @css unless it already has reached zero and is
95 * ensure that @css is accessible but doesn't have to be holding a 120 * being released. This function doesn't care whether @css is on or
121 * offline. The caller naturally needs to ensure that @css is accessible
122 * but doesn't have to be holding a reference on it - IOW, RCU protected
123 * access is good enough for this function. Returns %true if a reference
124 * count was successfully obtained; %false otherwise.
125 */
126static inline bool css_tryget(struct cgroup_subsys_state *css)
127{
128 if (!(css->flags & CSS_NO_REF))
129 return percpu_ref_tryget(&css->refcnt);
130 return true;
131}
132
133/**
134 * css_tryget_online - try to obtain a reference on the specified css if online
135 * @css: target css
136 *
137 * Obtain a reference on @css if it's online. The caller naturally needs
138 * to ensure that @css is accessible but doesn't have to be holding a
96 * reference on it - IOW, RCU protected access is good enough for this 139 * reference on it - IOW, RCU protected access is good enough for this
97 * function. Returns %true if a reference count was successfully obtained; 140 * function. Returns %true if a reference count was successfully obtained;
98 * %false otherwise. 141 * %false otherwise.
99 */ 142 */
100static inline bool css_tryget(struct cgroup_subsys_state *css) 143static inline bool css_tryget_online(struct cgroup_subsys_state *css)
101{ 144{
102 if (css->flags & CSS_ROOT) 145 if (!(css->flags & CSS_NO_REF))
103 return true; 146 return percpu_ref_tryget_live(&css->refcnt);
104 return percpu_ref_tryget_live(&css->refcnt); 147 return true;
105} 148}
106 149
107/** 150/**
108 * css_put - put a css reference 151 * css_put - put a css reference
109 * @css: target css 152 * @css: target css
110 * 153 *
111 * Put a reference obtained via css_get() and css_tryget(). 154 * Put a reference obtained via css_get() and css_tryget_online().
112 */ 155 */
113static inline void css_put(struct cgroup_subsys_state *css) 156static inline void css_put(struct cgroup_subsys_state *css)
114{ 157{
115 if (!(css->flags & CSS_ROOT)) 158 if (!(css->flags & CSS_NO_REF))
116 percpu_ref_put(&css->refcnt); 159 percpu_ref_put(&css->refcnt);
117} 160}
118 161
119/* bits in struct cgroup flags field */ 162/* bits in struct cgroup flags field */
120enum { 163enum {
121 /* Control Group is dead */
122 CGRP_DEAD,
123 /* 164 /*
124 * Control Group has previously had a child cgroup or a task, 165 * Control Group has previously had a child cgroup or a task,
125 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) 166 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set)
@@ -133,48 +174,37 @@ enum {
133 * specified at mount time and thus is implemented here. 174 * specified at mount time and thus is implemented here.
134 */ 175 */
135 CGRP_CPUSET_CLONE_CHILDREN, 176 CGRP_CPUSET_CLONE_CHILDREN,
136 /* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */
137 CGRP_SANE_BEHAVIOR,
138}; 177};
139 178
140struct cgroup { 179struct cgroup {
180 /* self css with NULL ->ss, points back to this cgroup */
181 struct cgroup_subsys_state self;
182
141 unsigned long flags; /* "unsigned long" so bitops work */ 183 unsigned long flags; /* "unsigned long" so bitops work */
142 184
143 /* 185 /*
144 * idr allocated in-hierarchy ID. 186 * idr allocated in-hierarchy ID.
145 * 187 *
146 * The ID of the root cgroup is always 0, and a new cgroup 188 * ID 0 is not used, the ID of the root cgroup is always 1, and a
147 * will be assigned with a smallest available ID. 189 * new cgroup will be assigned with a smallest available ID.
148 * 190 *
149 * Allocating/Removing ID must be protected by cgroup_mutex. 191 * Allocating/Removing ID must be protected by cgroup_mutex.
150 */ 192 */
151 int id; 193 int id;
152 194
153 /* the number of attached css's */
154 int nr_css;
155
156 atomic_t refcnt;
157
158 /* 195 /*
159 * We link our 'sibling' struct into our parent's 'children'. 196 * If this cgroup contains any tasks, it contributes one to
160 * Our children link their 'sibling' into our 'children'. 197 * populated_cnt. All children with non-zero popuplated_cnt of
198 * their own contribute one. The count is zero iff there's no task
199 * in this cgroup or its subtree.
161 */ 200 */
162 struct list_head sibling; /* my parent's children */ 201 int populated_cnt;
163 struct list_head children; /* my children */
164 202
165 struct cgroup *parent; /* my parent */
166 struct kernfs_node *kn; /* cgroup kernfs entry */ 203 struct kernfs_node *kn; /* cgroup kernfs entry */
204 struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
167 205
168 /* 206 /* the bitmask of subsystems enabled on the child cgroups */
169 * Monotonically increasing unique serial number which defines a 207 unsigned int child_subsys_mask;
170 * uniform order among all cgroups. It's guaranteed that all
171 * ->children lists are in the ascending order of ->serial_nr.
172 * It's used to allow interrupting and resuming iterations.
173 */
174 u64 serial_nr;
175
176 /* The bitmask of subsystems attached to this cgroup */
177 unsigned long subsys_mask;
178 208
179 /* Private pointers for each registered subsystem */ 209 /* Private pointers for each registered subsystem */
180 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; 210 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
@@ -188,6 +218,15 @@ struct cgroup {
188 struct list_head cset_links; 218 struct list_head cset_links;
189 219
190 /* 220 /*
221 * On the default hierarchy, a css_set for a cgroup with some
222 * susbsys disabled will point to css's which are associated with
223 * the closest ancestor which has the subsys enabled. The
224 * following lists all css_sets which point to this cgroup's css
225 * for the given subsystem.
226 */
227 struct list_head e_csets[CGROUP_SUBSYS_COUNT];
228
229 /*
191 * Linked list running through all cgroups that can 230 * Linked list running through all cgroups that can
192 * potentially be reaped by the release agent. Protected by 231 * potentially be reaped by the release agent. Protected by
193 * release_list_lock 232 * release_list_lock
@@ -201,12 +240,8 @@ struct cgroup {
201 struct list_head pidlists; 240 struct list_head pidlists;
202 struct mutex pidlist_mutex; 241 struct mutex pidlist_mutex;
203 242
204 /* dummy css with NULL ->ss, points back to this cgroup */ 243 /* used to wait for offlining of csses */
205 struct cgroup_subsys_state dummy_css; 244 wait_queue_head_t offline_waitq;
206
207 /* For css percpu_ref killing and RCU-protected deletion */
208 struct rcu_head rcu_head;
209 struct work_struct destroy_work;
210}; 245};
211 246
212#define MAX_CGROUP_ROOT_NAMELEN 64 247#define MAX_CGROUP_ROOT_NAMELEN 64
@@ -250,6 +285,12 @@ enum {
250 * 285 *
251 * - "cgroup.clone_children" is removed. 286 * - "cgroup.clone_children" is removed.
252 * 287 *
288 * - "cgroup.subtree_populated" is available. Its value is 0 if
289 * the cgroup and its descendants contain no task; otherwise, 1.
290 * The file also generates kernfs notification which can be
291 * monitored through poll and [di]notify when the value of the
292 * file changes.
293 *
253 * - If mount is requested with sane_behavior but without any 294 * - If mount is requested with sane_behavior but without any
254 * subsystem, the default unified hierarchy is mounted. 295 * subsystem, the default unified hierarchy is mounted.
255 * 296 *
@@ -264,6 +305,8 @@ enum {
264 * the flag is not created. 305 * the flag is not created.
265 * 306 *
266 * - blkcg: blk-throttle becomes properly hierarchical. 307 * - blkcg: blk-throttle becomes properly hierarchical.
308 *
309 * - debug: disallowed on the default hierarchy.
267 */ 310 */
268 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), 311 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
269 312
@@ -282,6 +325,9 @@ enum {
282struct cgroup_root { 325struct cgroup_root {
283 struct kernfs_root *kf_root; 326 struct kernfs_root *kf_root;
284 327
328 /* The bitmask of subsystems attached to this hierarchy */
329 unsigned int subsys_mask;
330
285 /* Unique id for this hierarchy. */ 331 /* Unique id for this hierarchy. */
286 int hierarchy_id; 332 int hierarchy_id;
287 333
@@ -295,7 +341,7 @@ struct cgroup_root {
295 struct list_head root_list; 341 struct list_head root_list;
296 342
297 /* Hierarchy-specific flags */ 343 /* Hierarchy-specific flags */
298 unsigned long flags; 344 unsigned int flags;
299 345
300 /* IDs for cgroups in this hierarchy */ 346 /* IDs for cgroups in this hierarchy */
301 struct idr cgroup_idr; 347 struct idr cgroup_idr;
@@ -342,6 +388,9 @@ struct css_set {
342 */ 388 */
343 struct list_head cgrp_links; 389 struct list_head cgrp_links;
344 390
391 /* the default cgroup associated with this css_set */
392 struct cgroup *dfl_cgrp;
393
345 /* 394 /*
346 * Set of subsystem states, one for each subsystem. This array is 395 * Set of subsystem states, one for each subsystem. This array is
347 * immutable after creation apart from the init_css_set during 396 * immutable after creation apart from the init_css_set during
@@ -366,6 +415,15 @@ struct css_set {
366 struct cgroup *mg_src_cgrp; 415 struct cgroup *mg_src_cgrp;
367 struct css_set *mg_dst_cset; 416 struct css_set *mg_dst_cset;
368 417
418 /*
419 * On the default hierarhcy, ->subsys[ssid] may point to a css
420 * attached to an ancestor instead of the cgroup this css_set is
421 * associated with. The following node is anchored at
422 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
423 * iterate through all css's attached to a given cgroup.
424 */
425 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
426
369 /* For RCU-protected deletion */ 427 /* For RCU-protected deletion */
370 struct rcu_head rcu_head; 428 struct rcu_head rcu_head;
371}; 429};
@@ -405,8 +463,7 @@ struct cftype {
405 463
406 /* 464 /*
407 * The maximum length of string, excluding trailing nul, that can 465 * The maximum length of string, excluding trailing nul, that can
408 * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is 466 * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
409 * assumed.
410 */ 467 */
411 size_t max_write_len; 468 size_t max_write_len;
412 469
@@ -453,19 +510,13 @@ struct cftype {
453 s64 val); 510 s64 val);
454 511
455 /* 512 /*
456 * write_string() is passed a nul-terminated kernelspace 513 * write() is the generic write callback which maps directly to
457 * buffer of maximum length determined by max_write_len. 514 * kernfs write operation and overrides all other operations.
458 * Returns 0 or -ve error code. 515 * Maximum write size is determined by ->max_write_len. Use
459 */ 516 * of_css/cft() to access the associated css and cft.
460 int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft,
461 char *buffer);
462 /*
463 * trigger() callback can be used to get some kick from the
464 * userspace, when the actual string written is not important
465 * at all. The private field can be used to determine the
466 * kick type for multiplexing.
467 */ 517 */
468 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); 518 ssize_t (*write)(struct kernfs_open_file *of,
519 char *buf, size_t nbytes, loff_t off);
469 520
470#ifdef CONFIG_DEBUG_LOCK_ALLOC 521#ifdef CONFIG_DEBUG_LOCK_ALLOC
471 struct lock_class_key lockdep_key; 522 struct lock_class_key lockdep_key;
@@ -504,14 +555,24 @@ static inline ino_t cgroup_ino(struct cgroup *cgrp)
504 return 0; 555 return 0;
505} 556}
506 557
507static inline struct cftype *seq_cft(struct seq_file *seq) 558/* cft/css accessors for cftype->write() operation */
559static inline struct cftype *of_cft(struct kernfs_open_file *of)
508{ 560{
509 struct kernfs_open_file *of = seq->private;
510
511 return of->kn->priv; 561 return of->kn->priv;
512} 562}
513 563
514struct cgroup_subsys_state *seq_css(struct seq_file *seq); 564struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);
565
566/* cft/css accessors for cftype->seq_*() operations */
567static inline struct cftype *seq_cft(struct seq_file *seq)
568{
569 return of_cft(seq->private);
570}
571
572static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
573{
574 return of_css(seq->private);
575}
515 576
516/* 577/*
517 * Name / path handling functions. All are thin wrappers around the kernfs 578 * Name / path handling functions. All are thin wrappers around the kernfs
@@ -612,6 +673,9 @@ struct cgroup_subsys {
612 /* link to parent, protected by cgroup_lock() */ 673 /* link to parent, protected by cgroup_lock() */
613 struct cgroup_root *root; 674 struct cgroup_root *root;
614 675
676 /* idr for css->id */
677 struct idr css_idr;
678
615 /* 679 /*
616 * List of cftypes. Each entry is the first entry of an array 680 * List of cftypes. Each entry is the first entry of an array
617 * terminated by zero length name. 681 * terminated by zero length name.
@@ -627,19 +691,6 @@ struct cgroup_subsys {
627#undef SUBSYS 691#undef SUBSYS
628 692
629/** 693/**
630 * css_parent - find the parent css
631 * @css: the target cgroup_subsys_state
632 *
633 * Return the parent css of @css. This function is guaranteed to return
634 * non-NULL parent as long as @css isn't the root.
635 */
636static inline
637struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css)
638{
639 return css->parent;
640}
641
642/**
643 * task_css_set_check - obtain a task's css_set with extra access conditions 694 * task_css_set_check - obtain a task's css_set with extra access conditions
644 * @task: the task to obtain css_set for 695 * @task: the task to obtain css_set for
645 * @__c: extra condition expression to be passed to rcu_dereference_check() 696 * @__c: extra condition expression to be passed to rcu_dereference_check()
@@ -731,14 +782,14 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
731 * @pos: the css * to use as the loop cursor 782 * @pos: the css * to use as the loop cursor
732 * @parent: css whose children to walk 783 * @parent: css whose children to walk
733 * 784 *
734 * Walk @parent's children. Must be called under rcu_read_lock(). A child 785 * Walk @parent's children. Must be called under rcu_read_lock().
735 * css which hasn't finished ->css_online() or already has finished
736 * ->css_offline() may show up during traversal and it's each subsystem's
737 * responsibility to verify that each @pos is alive.
738 * 786 *
739 * If a subsystem synchronizes against the parent in its ->css_online() and 787 * If a subsystem synchronizes ->css_online() and the start of iteration, a
740 * before starting iterating, a css which finished ->css_online() is 788 * css which finished ->css_online() is guaranteed to be visible in the
741 * guaranteed to be visible in the future iterations. 789 * future iterations and will stay visible until the last reference is put.
790 * A css which hasn't finished ->css_online() or already finished
791 * ->css_offline() may show up during traversal. It's each subsystem's
792 * responsibility to synchronize against on/offlining.
742 * 793 *
743 * It is allowed to temporarily drop RCU read lock during iteration. The 794 * It is allowed to temporarily drop RCU read lock during iteration. The
744 * caller is responsible for ensuring that @pos remains accessible until 795 * caller is responsible for ensuring that @pos remains accessible until
@@ -761,17 +812,16 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos);
761 * @root: css whose descendants to walk 812 * @root: css whose descendants to walk
762 * 813 *
763 * Walk @root's descendants. @root is included in the iteration and the 814 * Walk @root's descendants. @root is included in the iteration and the
764 * first node to be visited. Must be called under rcu_read_lock(). A 815 * first node to be visited. Must be called under rcu_read_lock().
765 * descendant css which hasn't finished ->css_online() or already has
766 * finished ->css_offline() may show up during traversal and it's each
767 * subsystem's responsibility to verify that each @pos is alive.
768 * 816 *
769 * If a subsystem synchronizes against the parent in its ->css_online() and 817 * If a subsystem synchronizes ->css_online() and the start of iteration, a
770 * before starting iterating, and synchronizes against @pos on each 818 * css which finished ->css_online() is guaranteed to be visible in the
771 * iteration, any descendant css which finished ->css_online() is 819 * future iterations and will stay visible until the last reference is put.
772 * guaranteed to be visible in the future iterations. 820 * A css which hasn't finished ->css_online() or already finished
821 * ->css_offline() may show up during traversal. It's each subsystem's
822 * responsibility to synchronize against on/offlining.
773 * 823 *
774 * In other words, the following guarantees that a descendant can't escape 824 * For example, the following guarantees that a descendant can't escape
775 * state updates of its ancestors. 825 * state updates of its ancestors.
776 * 826 *
777 * my_online(@css) 827 * my_online(@css)
@@ -827,18 +877,34 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
827 * 877 *
828 * Similar to css_for_each_descendant_pre() but performs post-order 878 * Similar to css_for_each_descendant_pre() but performs post-order
829 * traversal instead. @root is included in the iteration and the last 879 * traversal instead. @root is included in the iteration and the last
830 * node to be visited. Note that the walk visibility guarantee described 880 * node to be visited.
831 * in pre-order walk doesn't apply the same to post-order walks. 881 *
882 * If a subsystem synchronizes ->css_online() and the start of iteration, a
883 * css which finished ->css_online() is guaranteed to be visible in the
884 * future iterations and will stay visible until the last reference is put.
885 * A css which hasn't finished ->css_online() or already finished
886 * ->css_offline() may show up during traversal. It's each subsystem's
887 * responsibility to synchronize against on/offlining.
888 *
889 * Note that the walk visibility guarantee example described in pre-order
890 * walk doesn't apply the same to post-order walks.
832 */ 891 */
833#define css_for_each_descendant_post(pos, css) \ 892#define css_for_each_descendant_post(pos, css) \
834 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ 893 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \
835 (pos) = css_next_descendant_post((pos), (css))) 894 (pos) = css_next_descendant_post((pos), (css)))
836 895
896bool css_has_online_children(struct cgroup_subsys_state *css);
897
837/* A css_task_iter should be treated as an opaque object */ 898/* A css_task_iter should be treated as an opaque object */
838struct css_task_iter { 899struct css_task_iter {
839 struct cgroup_subsys_state *origin_css; 900 struct cgroup_subsys *ss;
840 struct list_head *cset_link; 901
841 struct list_head *task; 902 struct list_head *cset_pos;
903 struct list_head *cset_head;
904
905 struct list_head *task_pos;
906 struct list_head *tasks_head;
907 struct list_head *mg_tasks_head;
842}; 908};
843 909
844void css_task_iter_start(struct cgroup_subsys_state *css, 910void css_task_iter_start(struct cgroup_subsys_state *css,
@@ -849,8 +915,8 @@ void css_task_iter_end(struct css_task_iter *it);
849int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 915int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
850int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); 916int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
851 917
852struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 918struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
853 struct cgroup_subsys *ss); 919 struct cgroup_subsys *ss);
854 920
855#else /* !CONFIG_CGROUPS */ 921#else /* !CONFIG_CGROUPS */
856 922
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 768fe44e19f0..98c4f9b12b03 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -7,10 +7,6 @@
7SUBSYS(cpuset) 7SUBSYS(cpuset)
8#endif 8#endif
9 9
10#if IS_ENABLED(CONFIG_CGROUP_DEBUG)
11SUBSYS(debug)
12#endif
13
14#if IS_ENABLED(CONFIG_CGROUP_SCHED) 10#if IS_ENABLED(CONFIG_CGROUP_SCHED)
15SUBSYS(cpu) 11SUBSYS(cpu)
16#endif 12#endif
@@ -50,6 +46,13 @@ SUBSYS(net_prio)
50#if IS_ENABLED(CONFIG_CGROUP_HUGETLB) 46#if IS_ENABLED(CONFIG_CGROUP_HUGETLB)
51SUBSYS(hugetlb) 47SUBSYS(hugetlb)
52#endif 48#endif
49
50/*
51 * The following subsystems are not supported on the default hierarchy.
52 */
53#if IS_ENABLED(CONFIG_CGROUP_DEBUG)
54SUBSYS(debug)
55#endif
53/* 56/*
54 * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. 57 * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
55 */ 58 */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ceee0c54c6a4..7868fc3c0bc5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
29#include <linux/cgroup.h> 31#include <linux/cgroup.h>
30#include <linux/cred.h> 32#include <linux/cred.h>
31#include <linux/ctype.h> 33#include <linux/ctype.h>
@@ -70,15 +72,6 @@
70 MAX_CFTYPE_NAME + 2) 72 MAX_CFTYPE_NAME + 2)
71 73
72/* 74/*
73 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
74 * creation/removal and hierarchy changing operations including cgroup
75 * creation, removal, css association and controller rebinding. This outer
76 * lock is needed mainly to resolve the circular dependency between kernfs
77 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
78 */
79static DEFINE_MUTEX(cgroup_tree_mutex);
80
81/*
82 * cgroup_mutex is the master lock. Any modification to cgroup or its 75 * cgroup_mutex is the master lock. Any modification to cgroup or its
83 * hierarchy must be performed while holding it. 76 * hierarchy must be performed while holding it.
84 * 77 *
@@ -99,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem);
99#endif 92#endif
100 93
101/* 94/*
95 * Protects cgroup_idr and css_idr so that IDs can be released without
96 * grabbing cgroup_mutex.
97 */
98static DEFINE_SPINLOCK(cgroup_idr_lock);
99
100/*
102 * Protects cgroup_subsys->release_agent_path. Modifying it also requires 101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
103 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
104 */ 103 */
105static DEFINE_SPINLOCK(release_agent_path_lock); 104static DEFINE_SPINLOCK(release_agent_path_lock);
106 105
107#define cgroup_assert_mutexes_or_rcu_locked() \ 106#define cgroup_assert_mutex_or_rcu_locked() \
108 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
109 lockdep_is_held(&cgroup_tree_mutex) || \
110 lockdep_is_held(&cgroup_mutex), \ 108 lockdep_is_held(&cgroup_mutex), \
111 "cgroup_[tree_]mutex or RCU read lock required"); 109 "cgroup_mutex or RCU read lock required");
112 110
113/* 111/*
114 * cgroup destruction makes heavy use of work items and there can be a lot 112 * cgroup destruction makes heavy use of work items and there can be a lot
@@ -151,6 +149,13 @@ struct cgroup_root cgrp_dfl_root;
151 */ 149 */
152static bool cgrp_dfl_root_visible; 150static bool cgrp_dfl_root_visible;
153 151
152/* some controllers are not supported in the default hierarchy */
153static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
154#ifdef CONFIG_CGROUP_DEBUG
155 | (1 << debug_cgrp_id)
156#endif
157 ;
158
154/* The list of hierarchy roots */ 159/* The list of hierarchy roots */
155 160
156static LIST_HEAD(cgroup_roots); 161static LIST_HEAD(cgroup_roots);
@@ -160,14 +165,13 @@ static int cgroup_root_count;
160static DEFINE_IDR(cgroup_hierarchy_idr); 165static DEFINE_IDR(cgroup_hierarchy_idr);
161 166
162/* 167/*
163 * Assign a monotonically increasing serial number to cgroups. It 168 * Assign a monotonically increasing serial number to csses. It guarantees
164 * guarantees cgroups with bigger numbers are newer than those with smaller 169 * cgroups with bigger numbers are newer than those with smaller numbers.
165 * numbers. Also, as cgroups are always appended to the parent's 170 * Also, as csses are always appended to the parent's ->children list, it
166 * ->children list, it guarantees that sibling cgroups are always sorted in 171 * guarantees that sibling csses are always sorted in the ascending serial
167 * the ascending serial number order on the list. Protected by 172 * number order on the list. Protected by cgroup_mutex.
168 * cgroup_mutex.
169 */ 173 */
170static u64 cgroup_serial_nr_next = 1; 174static u64 css_serial_nr_next = 1;
171 175
172/* This flag indicates whether tasks in the fork and exit paths should 176/* This flag indicates whether tasks in the fork and exit paths should
173 * check for fork/exit handlers to call. This avoids us having to do 177 * check for fork/exit handlers to call. This avoids us having to do
@@ -180,17 +184,59 @@ static struct cftype cgroup_base_files[];
180 184
181static void cgroup_put(struct cgroup *cgrp); 185static void cgroup_put(struct cgroup *cgrp);
182static int rebind_subsystems(struct cgroup_root *dst_root, 186static int rebind_subsystems(struct cgroup_root *dst_root,
183 unsigned long ss_mask); 187 unsigned int ss_mask);
184static void cgroup_destroy_css_killed(struct cgroup *cgrp);
185static int cgroup_destroy_locked(struct cgroup *cgrp); 188static int cgroup_destroy_locked(struct cgroup *cgrp);
189static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
190static void css_release(struct percpu_ref *ref);
191static void kill_css(struct cgroup_subsys_state *css);
186static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
187 bool is_add); 193 bool is_add);
188static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 194static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
189 195
196/* IDR wrappers which synchronize using cgroup_idr_lock */
197static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
198 gfp_t gfp_mask)
199{
200 int ret;
201
202 idr_preload(gfp_mask);
203 spin_lock_bh(&cgroup_idr_lock);
204 ret = idr_alloc(idr, ptr, start, end, gfp_mask);
205 spin_unlock_bh(&cgroup_idr_lock);
206 idr_preload_end();
207 return ret;
208}
209
210static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
211{
212 void *ret;
213
214 spin_lock_bh(&cgroup_idr_lock);
215 ret = idr_replace(idr, ptr, id);
216 spin_unlock_bh(&cgroup_idr_lock);
217 return ret;
218}
219
220static void cgroup_idr_remove(struct idr *idr, int id)
221{
222 spin_lock_bh(&cgroup_idr_lock);
223 idr_remove(idr, id);
224 spin_unlock_bh(&cgroup_idr_lock);
225}
226
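/*
 * Illustrative sketch, not from the patch: the wrappers above let an
 * ID be allocated while holding cgroup_mutex but released later from
 * a context which only takes cgroup_idr_lock, e.g. a release work
 * item.
 */
        int id;

        id = cgroup_idr_alloc(&root->cgroup_idr, cgrp, 1, 0, GFP_NOWAIT);
        if (id < 0)
                return id;
        /* ...later, possibly without cgroup_mutex... */
        cgroup_idr_remove(&root->cgroup_idr, id);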
227static struct cgroup *cgroup_parent(struct cgroup *cgrp)
228{
229 struct cgroup_subsys_state *parent_css = cgrp->self.parent;
230
231 if (parent_css)
232 return container_of(parent_css, struct cgroup, self);
233 return NULL;
234}
235
190/** 236/**
191 * cgroup_css - obtain a cgroup's css for the specified subsystem 237 * cgroup_css - obtain a cgroup's css for the specified subsystem
192 * @cgrp: the cgroup of interest 238 * @cgrp: the cgroup of interest
193 * @ss: the subsystem of interest (%NULL returns the dummy_css) 239 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
194 * 240 *
195 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This 241 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
196 * function must be called either under cgroup_mutex or rcu_read_lock() and 242 * function must be called either under cgroup_mutex or rcu_read_lock() and
@@ -203,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
203{ 249{
204 if (ss) 250 if (ss)
205 return rcu_dereference_check(cgrp->subsys[ss->id], 251 return rcu_dereference_check(cgrp->subsys[ss->id],
206 lockdep_is_held(&cgroup_tree_mutex) ||
207 lockdep_is_held(&cgroup_mutex)); 252 lockdep_is_held(&cgroup_mutex));
208 else 253 else
209 return &cgrp->dummy_css; 254 return &cgrp->self;
255}
256
257/**
258 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
259 * @cgrp: the cgroup of interest
260 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
261 *
 262 * Similar to cgroup_css() but returns the effective css, which is defined
263 * as the matching css of the nearest ancestor including self which has @ss
264 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
265 * function is guaranteed to return non-NULL css.
266 */
267static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
268 struct cgroup_subsys *ss)
269{
270 lockdep_assert_held(&cgroup_mutex);
271
272 if (!ss)
273 return &cgrp->self;
274
275 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
276 return NULL;
277
278 while (cgroup_parent(cgrp) &&
279 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
280 cgrp = cgroup_parent(cgrp);
281
282 return cgroup_css(cgrp, ss);
210} 283}
211 284
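/*
 * Illustrative example, not from the patch, of the "effective css"
 * rule above on the default hierarchy.  Given
 *
 *      root  (controller enabled for root's children)
 *      `- A  (controller NOT enabled for A's children)
 *         `- B
 *
 * cgroup_e_css(B, ss) falls back to A's css because B has no css of
 * its own for @ss, while cgroup_css(B, ss) would return NULL.
 */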
212/* convenient tests for these bits */ 285/* convenient tests for these bits */
213static inline bool cgroup_is_dead(const struct cgroup *cgrp) 286static inline bool cgroup_is_dead(const struct cgroup *cgrp)
214{ 287{
215 return test_bit(CGRP_DEAD, &cgrp->flags); 288 return !(cgrp->self.flags & CSS_ONLINE);
216} 289}
217 290
218struct cgroup_subsys_state *seq_css(struct seq_file *seq) 291struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
219{ 292{
220 struct kernfs_open_file *of = seq->private;
221 struct cgroup *cgrp = of->kn->parent->priv; 293 struct cgroup *cgrp = of->kn->parent->priv;
222 struct cftype *cft = seq_cft(seq); 294 struct cftype *cft = of_cft(of);
223 295
224 /* 296 /*
225 * This is open and unprotected implementation of cgroup_css(). 297 * This is open and unprotected implementation of cgroup_css().
@@ -232,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq)
232 if (cft->ss) 304 if (cft->ss)
233 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); 305 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
234 else 306 else
235 return &cgrp->dummy_css; 307 return &cgrp->self;
236} 308}
237EXPORT_SYMBOL_GPL(seq_css); 309EXPORT_SYMBOL_GPL(of_css);
238 310
239/** 311/**
240 * cgroup_is_descendant - test ancestry 312 * cgroup_is_descendant - test ancestry
@@ -250,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
250 while (cgrp) { 322 while (cgrp) {
251 if (cgrp == ancestor) 323 if (cgrp == ancestor)
252 return true; 324 return true;
253 cgrp = cgrp->parent; 325 cgrp = cgroup_parent(cgrp);
254 } 326 }
255 return false; 327 return false;
256} 328}
@@ -274,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp)
274 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 346 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
275 * @cgrp: the target cgroup to iterate css's of 347 * @cgrp: the target cgroup to iterate css's of
276 * 348 *
277 * Should be called under cgroup_mutex. 349 * Should be called under cgroup_[tree_]mutex.
278 */ 350 */
279#define for_each_css(css, ssid, cgrp) \ 351#define for_each_css(css, ssid, cgrp) \
280 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 352 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
281 if (!((css) = rcu_dereference_check( \ 353 if (!((css) = rcu_dereference_check( \
282 (cgrp)->subsys[(ssid)], \ 354 (cgrp)->subsys[(ssid)], \
283 lockdep_is_held(&cgroup_tree_mutex) || \
284 lockdep_is_held(&cgroup_mutex)))) { } \ 355 lockdep_is_held(&cgroup_mutex)))) { } \
285 else 356 else
286 357
287/** 358/**
359 * for_each_e_css - iterate all effective css's of a cgroup
360 * @css: the iteration cursor
361 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
362 * @cgrp: the target cgroup to iterate css's of
363 *
364 * Should be called under cgroup_[tree_]mutex.
365 */
366#define for_each_e_css(css, ssid, cgrp) \
367 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
368 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
369 ; \
370 else
371
372/**
288 * for_each_subsys - iterate all enabled cgroup subsystems 373 * for_each_subsys - iterate all enabled cgroup subsystems
289 * @ss: the iteration cursor 374 * @ss: the iteration cursor
290 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 375 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -297,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp)
297#define for_each_root(root) \ 382#define for_each_root(root) \
298 list_for_each_entry((root), &cgroup_roots, root_list) 383 list_for_each_entry((root), &cgroup_roots, root_list)
299 384
300/** 385/* iterate over child cgrps, lock should be held throughout iteration */
301 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 386#define cgroup_for_each_live_child(child, cgrp) \
302 * @cgrp: the cgroup to be checked for liveness 387 list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
303 * 388 if (({ lockdep_assert_held(&cgroup_mutex); \
304 * On success, returns true; the mutex should be later unlocked. On 389 cgroup_is_dead(child); })) \
305 * failure returns false with no lock held. 390 ; \
306 */ 391 else
307static bool cgroup_lock_live_group(struct cgroup *cgrp)
308{
309 mutex_lock(&cgroup_mutex);
310 if (cgroup_is_dead(cgrp)) {
311 mutex_unlock(&cgroup_mutex);
312 return false;
313 }
314 return true;
315}
316 392
317/* the list of cgroups eligible for automatic release. Protected by 393/* the list of cgroups eligible for automatic release. Protected by
318 * release_list_lock */ 394 * release_list_lock */
@@ -360,6 +436,43 @@ struct css_set init_css_set = {
360 436
361static int css_set_count = 1; /* 1 for init_css_set */ 437static int css_set_count = 1; /* 1 for init_css_set */
362 438
439/**
 440 * cgroup_update_populated - update the populated count of a cgroup
441 * @cgrp: the target cgroup
442 * @populated: inc or dec populated count
443 *
444 * @cgrp is either getting the first task (css_set) or losing the last.
445 * Update @cgrp->populated_cnt accordingly. The count is propagated
446 * towards root so that a given cgroup's populated_cnt is zero iff the
447 * cgroup and all its descendants are empty.
448 *
449 * @cgrp's interface file "cgroup.populated" is zero if
450 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
451 * changes from or to zero, userland is notified that the content of the
452 * interface file has changed. This can be used to detect when @cgrp and
453 * its descendants become populated or empty.
454 */
455static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
456{
457 lockdep_assert_held(&css_set_rwsem);
458
459 do {
460 bool trigger;
461
462 if (populated)
463 trigger = !cgrp->populated_cnt++;
464 else
465 trigger = !--cgrp->populated_cnt;
466
467 if (!trigger)
468 break;
469
470 if (cgrp->populated_kn)
471 kernfs_notify(cgrp->populated_kn);
472 cgrp = cgroup_parent(cgrp);
473 } while (cgrp);
474}
475
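/*
 * Illustrative userspace sketch, not from the patch: waiting for a
 * cgroup and its descendants to become empty by polling the
 * "cgroup.populated" file whose contents are described above.  The
 * mount path is made up, error handling is omitted, and
 * kernfs_notify() is assumed to wake pollers with POLLPRI as for
 * other kernfs-backed files.
 */
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

static void wait_until_empty(const char *populated_path)
{
        char buf[2] = "";
        int fd = open(populated_path, O_RDONLY);
        struct pollfd pfd = { .fd = fd, .events = POLLPRI };

        for (;;) {
                pread(fd, buf, 1, 0);
                if (buf[0] == '0')      /* cgroup and descendants empty */
                        break;
                poll(&pfd, 1, -1);      /* kicked by kernfs_notify() */
        }
        close(fd);
}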
363/* 476/*
364 * hash table for cgroup groups. This improves the performance to find 477 * hash table for cgroup groups. This improves the performance to find
365 * an existing css_set. This hash doesn't (currently) take into 478 * an existing css_set. This hash doesn't (currently) take into
@@ -384,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
384static void put_css_set_locked(struct css_set *cset, bool taskexit) 497static void put_css_set_locked(struct css_set *cset, bool taskexit)
385{ 498{
386 struct cgrp_cset_link *link, *tmp_link; 499 struct cgrp_cset_link *link, *tmp_link;
500 struct cgroup_subsys *ss;
501 int ssid;
387 502
388 lockdep_assert_held(&css_set_rwsem); 503 lockdep_assert_held(&css_set_rwsem);
389 504
@@ -391,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
391 return; 506 return;
392 507
393 /* This css_set is dead. unlink it and release cgroup refcounts */ 508 /* This css_set is dead. unlink it and release cgroup refcounts */
509 for_each_subsys(ss, ssid)
510 list_del(&cset->e_cset_node[ssid]);
394 hash_del(&cset->hlist); 511 hash_del(&cset->hlist);
395 css_set_count--; 512 css_set_count--;
396 513
@@ -401,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
401 list_del(&link->cgrp_link); 518 list_del(&link->cgrp_link);
402 519
403 /* @cgrp can't go away while we're holding css_set_rwsem */ 520 /* @cgrp can't go away while we're holding css_set_rwsem */
404 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 521 if (list_empty(&cgrp->cset_links)) {
405 if (taskexit) 522 cgroup_update_populated(cgrp, false);
406 set_bit(CGRP_RELEASABLE, &cgrp->flags); 523 if (notify_on_release(cgrp)) {
407 check_for_release(cgrp); 524 if (taskexit)
525 set_bit(CGRP_RELEASABLE, &cgrp->flags);
526 check_for_release(cgrp);
527 }
408 } 528 }
409 529
410 kfree(link); 530 kfree(link);
@@ -453,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset,
453{ 573{
454 struct list_head *l1, *l2; 574 struct list_head *l1, *l2;
455 575
456 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { 576 /*
457 /* Not all subsystems matched */ 577 * On the default hierarchy, there can be csets which are
578 * associated with the same set of cgroups but different csses.
579 * Let's first ensure that csses match.
580 */
581 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
458 return false; 582 return false;
459 }
460 583
461 /* 584 /*
462 * Compare cgroup pointers in order to distinguish between 585 * Compare cgroup pointers in order to distinguish between
463 * different cgroups in heirarchies with no subsystems. We 586 * different cgroups in hierarchies. As different cgroups may
464 * could get by with just this check alone (and skip the 587 * share the same effective css, this comparison is always
465 * memcmp above) but on most setups the memcmp check will 588 * necessary.
466 * avoid the need for this more expensive check on almost all
467 * candidates.
468 */ 589 */
469
470 l1 = &cset->cgrp_links; 590 l1 = &cset->cgrp_links;
471 l2 = &old_cset->cgrp_links; 591 l2 = &old_cset->cgrp_links;
472 while (1) { 592 while (1) {
@@ -530,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
530 * won't change, so no need for locking. 650 * won't change, so no need for locking.
531 */ 651 */
532 for_each_subsys(ss, i) { 652 for_each_subsys(ss, i) {
533 if (root->cgrp.subsys_mask & (1UL << i)) { 653 if (root->subsys_mask & (1UL << i)) {
534 /* Subsystem is in this hierarchy. So we want 654 /*
535 * the subsystem state from the new 655 * @ss is in this hierarchy, so we want the
536 * cgroup */ 656 * effective css from @cgrp.
537 template[i] = cgroup_css(cgrp, ss); 657 */
658 template[i] = cgroup_e_css(cgrp, ss);
538 } else { 659 } else {
539 /* Subsystem is not in this hierarchy, so we 660 /*
540 * don't want to change the subsystem state */ 661 * @ss is not in this hierarchy, so we don't want
662 * to change the css.
663 */
541 template[i] = old_cset->subsys[i]; 664 template[i] = old_cset->subsys[i];
542 } 665 }
543 } 666 }
@@ -603,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
603 struct cgrp_cset_link *link; 726 struct cgrp_cset_link *link;
604 727
605 BUG_ON(list_empty(tmp_links)); 728 BUG_ON(list_empty(tmp_links));
729
730 if (cgroup_on_dfl(cgrp))
731 cset->dfl_cgrp = cgrp;
732
606 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); 733 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
607 link->cset = cset; 734 link->cset = cset;
608 link->cgrp = cgrp; 735 link->cgrp = cgrp;
736
737 if (list_empty(&cgrp->cset_links))
738 cgroup_update_populated(cgrp, true);
609 list_move(&link->cset_link, &cgrp->cset_links); 739 list_move(&link->cset_link, &cgrp->cset_links);
740
610 /* 741 /*
611 * Always add links to the tail of the list so that the list 742 * Always add links to the tail of the list so that the list
612 * is sorted by order of hierarchy creation 743 * is sorted by order of hierarchy creation
@@ -629,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
629 struct css_set *cset; 760 struct css_set *cset;
630 struct list_head tmp_links; 761 struct list_head tmp_links;
631 struct cgrp_cset_link *link; 762 struct cgrp_cset_link *link;
763 struct cgroup_subsys *ss;
632 unsigned long key; 764 unsigned long key;
765 int ssid;
633 766
634 lockdep_assert_held(&cgroup_mutex); 767 lockdep_assert_held(&cgroup_mutex);
635 768
@@ -680,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
680 813
681 css_set_count++; 814 css_set_count++;
682 815
683 /* Add this cgroup group to the hash table */ 816 /* Add @cset to the hash table */
684 key = css_set_hash(cset->subsys); 817 key = css_set_hash(cset->subsys);
685 hash_add(css_set_table, &cset->hlist, key); 818 hash_add(css_set_table, &cset->hlist, key);
686 819
820 for_each_subsys(ss, ssid)
821 list_add_tail(&cset->e_cset_node[ssid],
822 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
823
687 up_write(&css_set_rwsem); 824 up_write(&css_set_rwsem);
688 825
689 return cset; 826 return cset;
@@ -736,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
736 struct cgroup *cgrp = &root->cgrp; 873 struct cgroup *cgrp = &root->cgrp;
737 struct cgrp_cset_link *link, *tmp_link; 874 struct cgrp_cset_link *link, *tmp_link;
738 875
739 mutex_lock(&cgroup_tree_mutex);
740 mutex_lock(&cgroup_mutex); 876 mutex_lock(&cgroup_mutex);
741 877
742 BUG_ON(atomic_read(&root->nr_cgrps)); 878 BUG_ON(atomic_read(&root->nr_cgrps));
743 BUG_ON(!list_empty(&cgrp->children)); 879 BUG_ON(!list_empty(&cgrp->self.children));
744 880
745 /* Rebind all subsystems back to the default hierarchy */ 881 /* Rebind all subsystems back to the default hierarchy */
746 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); 882 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
747 883
748 /* 884 /*
749 * Release all the links from cset_links to this hierarchy's 885 * Release all the links from cset_links to this hierarchy's
@@ -766,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
766 cgroup_exit_root_id(root); 902 cgroup_exit_root_id(root);
767 903
768 mutex_unlock(&cgroup_mutex); 904 mutex_unlock(&cgroup_mutex);
769 mutex_unlock(&cgroup_tree_mutex);
770 905
771 kernfs_destroy_root(root->kf_root); 906 kernfs_destroy_root(root->kf_root);
772 cgroup_free_root(root); 907 cgroup_free_root(root);
@@ -849,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
849 * update of a tasks cgroup pointer by cgroup_attach_task() 984 * update of a tasks cgroup pointer by cgroup_attach_task()
850 */ 985 */
851 986
852static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 987static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
853static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 988static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
854static const struct file_operations proc_cgroupstats_operations; 989static const struct file_operations proc_cgroupstats_operations;
855 990
@@ -884,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
884 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 1019 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
885 mode |= S_IRUGO; 1020 mode |= S_IRUGO;
886 1021
887 if (cft->write_u64 || cft->write_s64 || cft->write_string || 1022 if (cft->write_u64 || cft->write_s64 || cft->write)
888 cft->trigger)
889 mode |= S_IWUSR; 1023 mode |= S_IWUSR;
890 1024
891 return mode; 1025 return mode;
892} 1026}
893 1027
894static void cgroup_free_fn(struct work_struct *work) 1028static void cgroup_get(struct cgroup *cgrp)
895{ 1029{
896 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 1030 WARN_ON_ONCE(cgroup_is_dead(cgrp));
897 1031 css_get(&cgrp->self);
898 atomic_dec(&cgrp->root->nr_cgrps);
899 cgroup_pidlist_destroy_all(cgrp);
900
901 if (cgrp->parent) {
902 /*
903 * We get a ref to the parent, and put the ref when this
904 * cgroup is being freed, so it's guaranteed that the
905 * parent won't be destroyed before its children.
906 */
907 cgroup_put(cgrp->parent);
908 kernfs_put(cgrp->kn);
909 kfree(cgrp);
910 } else {
911 /*
912 * This is root cgroup's refcnt reaching zero, which
913 * indicates that the root should be released.
914 */
915 cgroup_destroy_root(cgrp->root);
916 }
917} 1032}
918 1033
919static void cgroup_free_rcu(struct rcu_head *head) 1034static void cgroup_put(struct cgroup *cgrp)
920{ 1035{
921 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 1036 css_put(&cgrp->self);
922
923 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
924 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
925} 1037}
926 1038
927static void cgroup_get(struct cgroup *cgrp) 1039/**
1040 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1041 * @kn: the kernfs_node being serviced
1042 *
1043 * This helper undoes cgroup_kn_lock_live() and should be invoked before
1044 * the method finishes if locking succeeded. Note that once this function
1045 * returns, the cgroup returned by cgroup_kn_lock_live() may become
1046 * inaccessible at any time. If the caller intends to continue to access the
1047 * cgroup, it should pin it before invoking this function.
1048 */
1049static void cgroup_kn_unlock(struct kernfs_node *kn)
928{ 1050{
929 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 1051 struct cgroup *cgrp;
930 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); 1052
931 atomic_inc(&cgrp->refcnt); 1053 if (kernfs_type(kn) == KERNFS_DIR)
1054 cgrp = kn->priv;
1055 else
1056 cgrp = kn->parent->priv;
1057
1058 mutex_unlock(&cgroup_mutex);
1059
1060 kernfs_unbreak_active_protection(kn);
1061 cgroup_put(cgrp);
932} 1062}
933 1063
934static void cgroup_put(struct cgroup *cgrp) 1064/**
1065 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1066 * @kn: the kernfs_node being serviced
1067 *
1068 * This helper is to be used by a cgroup kernfs method currently servicing
1069 * @kn. It breaks the active protection, performs cgroup locking and
1070 * verifies that the associated cgroup is alive. Returns the cgroup if
1071 * alive; otherwise, %NULL. A successful return should be undone by a
1072 * matching cgroup_kn_unlock() invocation.
1073 *
1074 * Any cgroup kernfs method implementation which requires locking the
1075 * associated cgroup should use this helper. It avoids nesting cgroup
1076 * locking under kernfs active protection and allows all kernfs operations
1077 * including self-removal.
1078 */
1079static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
935{ 1080{
936 if (!atomic_dec_and_test(&cgrp->refcnt)) 1081 struct cgroup *cgrp;
937 return; 1082
938 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) 1083 if (kernfs_type(kn) == KERNFS_DIR)
939 return; 1084 cgrp = kn->priv;
1085 else
1086 cgrp = kn->parent->priv;
940 1087
941 /* 1088 /*
942 * XXX: cgrp->id is only used to look up css's. As cgroup and 1089 * We're gonna grab cgroup_mutex which nests outside kernfs
943 * css's lifetimes will be decoupled, it should be made 1090 * active_ref. cgroup liveliness check alone provides enough
944 * per-subsystem and moved to css->id so that lookups are 1091 * protection against removal. Ensure @cgrp stays accessible and
945 * successful until the target css is released. 1092 * break the active_ref protection.
946 */ 1093 */
1094 cgroup_get(cgrp);
1095 kernfs_break_active_protection(kn);
1096
947 mutex_lock(&cgroup_mutex); 1097 mutex_lock(&cgroup_mutex);
948 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
949 mutex_unlock(&cgroup_mutex);
950 cgrp->id = -1;
951 1098
952 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 1099 if (!cgroup_is_dead(cgrp))
1100 return cgrp;
1101
1102 cgroup_kn_unlock(kn);
1103 return NULL;
953} 1104}
954 1105
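/*
 * Illustrative sketch, not from the patch: the locking protocol the
 * two helpers above are meant for, as seen from a cgroup kernfs write
 * method.  my_apply() is a made-up helper; the actual handlers later
 * in this file follow the same pattern.
 */
static ssize_t my_write(struct kernfs_open_file *of, char *buf,
                        size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        int ret;

        cgrp = cgroup_kn_lock_live(of->kn);
        if (!cgrp)
                return -ENODEV;

        ret = my_apply(cgrp, strstrip(buf));

        /* @cgrp may become inaccessible once it is unlocked */
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
}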
955static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 1106static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
956{ 1107{
957 char name[CGROUP_FILE_NAME_MAX]; 1108 char name[CGROUP_FILE_NAME_MAX];
958 1109
959 lockdep_assert_held(&cgroup_tree_mutex); 1110 lockdep_assert_held(&cgroup_mutex);
960 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1111 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
961} 1112}
962 1113
@@ -965,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
965 * @cgrp: target cgroup 1116 * @cgrp: target cgroup
966 * @subsys_mask: mask of the subsystem ids whose files should be removed 1117 * @subsys_mask: mask of the subsystem ids whose files should be removed
967 */ 1118 */
968static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1119static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
969{ 1120{
970 struct cgroup_subsys *ss; 1121 struct cgroup_subsys *ss;
971 int i; 1122 int i;
@@ -973,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
973 for_each_subsys(ss, i) { 1124 for_each_subsys(ss, i) {
974 struct cftype *cfts; 1125 struct cftype *cfts;
975 1126
976 if (!test_bit(i, &subsys_mask)) 1127 if (!(subsys_mask & (1 << i)))
977 continue; 1128 continue;
978 list_for_each_entry(cfts, &ss->cfts, node) 1129 list_for_each_entry(cfts, &ss->cfts, node)
979 cgroup_addrm_files(cgrp, cfts, false); 1130 cgroup_addrm_files(cgrp, cfts, false);
980 } 1131 }
981} 1132}
982 1133
983static int rebind_subsystems(struct cgroup_root *dst_root, 1134static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
984 unsigned long ss_mask)
985{ 1135{
986 struct cgroup_subsys *ss; 1136 struct cgroup_subsys *ss;
987 int ssid, ret; 1137 unsigned int tmp_ss_mask;
1138 int ssid, i, ret;
988 1139
989 lockdep_assert_held(&cgroup_tree_mutex);
990 lockdep_assert_held(&cgroup_mutex); 1140 lockdep_assert_held(&cgroup_mutex);
991 1141
992 for_each_subsys(ss, ssid) { 1142 for_each_subsys(ss, ssid) {
993 if (!(ss_mask & (1 << ssid))) 1143 if (!(ss_mask & (1 << ssid)))
994 continue; 1144 continue;
995 1145
996 /* if @ss is on the dummy_root, we can always move it */ 1146 /* if @ss has non-root csses attached to it, can't move */
997 if (ss->root == &cgrp_dfl_root) 1147 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
998 continue;
999
1000 /* if @ss has non-root cgroups attached to it, can't move */
1001 if (!list_empty(&ss->root->cgrp.children))
1002 return -EBUSY; 1148 return -EBUSY;
1003 1149
1004 /* can't move between two non-dummy roots either */ 1150 /* can't move between two non-dummy roots either */
1005 if (dst_root != &cgrp_dfl_root) 1151 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1006 return -EBUSY; 1152 return -EBUSY;
1007 } 1153 }
1008 1154
1009 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); 1155 /* skip creating root files on dfl_root for inhibited subsystems */
1156 tmp_ss_mask = ss_mask;
1157 if (dst_root == &cgrp_dfl_root)
1158 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1159
1160 ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
1010 if (ret) { 1161 if (ret) {
1011 if (dst_root != &cgrp_dfl_root) 1162 if (dst_root != &cgrp_dfl_root)
1012 return ret; 1163 return ret;
@@ -1018,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1018 * Just warn about it and continue. 1169 * Just warn about it and continue.
1019 */ 1170 */
1020 if (cgrp_dfl_root_visible) { 1171 if (cgrp_dfl_root_visible) {
1021 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", 1172 pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
1022 ret, ss_mask); 1173 ret, ss_mask);
1023 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); 1174 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1024 } 1175 }
1025 } 1176 }
1026 1177
@@ -1028,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1028 * Nothing can fail from this point on. Remove files for the 1179 * Nothing can fail from this point on. Remove files for the
1029 * removed subsystems and rebind each subsystem. 1180 * removed subsystems and rebind each subsystem.
1030 */ 1181 */
1031 mutex_unlock(&cgroup_mutex);
1032 for_each_subsys(ss, ssid) 1182 for_each_subsys(ss, ssid)
1033 if (ss_mask & (1 << ssid)) 1183 if (ss_mask & (1 << ssid))
1034 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); 1184 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1035 mutex_lock(&cgroup_mutex);
1036 1185
1037 for_each_subsys(ss, ssid) { 1186 for_each_subsys(ss, ssid) {
1038 struct cgroup_root *src_root; 1187 struct cgroup_root *src_root;
1039 struct cgroup_subsys_state *css; 1188 struct cgroup_subsys_state *css;
1189 struct css_set *cset;
1040 1190
1041 if (!(ss_mask & (1 << ssid))) 1191 if (!(ss_mask & (1 << ssid)))
1042 continue; 1192 continue;
@@ -1051,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1051 ss->root = dst_root; 1201 ss->root = dst_root;
1052 css->cgroup = &dst_root->cgrp; 1202 css->cgroup = &dst_root->cgrp;
1053 1203
1054 src_root->cgrp.subsys_mask &= ~(1 << ssid); 1204 down_write(&css_set_rwsem);
1055 dst_root->cgrp.subsys_mask |= 1 << ssid; 1205 hash_for_each(css_set_table, i, cset, hlist)
1206 list_move_tail(&cset->e_cset_node[ss->id],
1207 &dst_root->cgrp.e_csets[ss->id]);
1208 up_write(&css_set_rwsem);
1209
1210 src_root->subsys_mask &= ~(1 << ssid);
1211 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1212
1213 /* default hierarchy doesn't enable controllers by default */
1214 dst_root->subsys_mask |= 1 << ssid;
1215 if (dst_root != &cgrp_dfl_root)
1216 dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1056 1217
1057 if (ss->bind) 1218 if (ss->bind)
1058 ss->bind(css); 1219 ss->bind(css);
@@ -1070,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq,
1070 int ssid; 1231 int ssid;
1071 1232
1072 for_each_subsys(ss, ssid) 1233 for_each_subsys(ss, ssid)
1073 if (root->cgrp.subsys_mask & (1 << ssid)) 1234 if (root->subsys_mask & (1 << ssid))
1074 seq_printf(seq, ",%s", ss->name); 1235 seq_printf(seq, ",%s", ss->name);
1075 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1236 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1076 seq_puts(seq, ",sane_behavior"); 1237 seq_puts(seq, ",sane_behavior");
@@ -1092,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq,
1092} 1253}
1093 1254
1094struct cgroup_sb_opts { 1255struct cgroup_sb_opts {
1095 unsigned long subsys_mask; 1256 unsigned int subsys_mask;
1096 unsigned long flags; 1257 unsigned int flags;
1097 char *release_agent; 1258 char *release_agent;
1098 bool cpuset_clone_children; 1259 bool cpuset_clone_children;
1099 char *name; 1260 char *name;
@@ -1101,24 +1262,16 @@ struct cgroup_sb_opts {
1101 bool none; 1262 bool none;
1102}; 1263};
1103 1264
1104/*
1105 * Convert a hierarchy specifier into a bitmask of subsystems and
1106 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1107 * array. This function takes refcounts on subsystems to be used, unless it
1108 * returns error, in which case no refcounts are taken.
1109 */
1110static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1265static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1111{ 1266{
1112 char *token, *o = data; 1267 char *token, *o = data;
1113 bool all_ss = false, one_ss = false; 1268 bool all_ss = false, one_ss = false;
1114 unsigned long mask = (unsigned long)-1; 1269 unsigned int mask = -1U;
1115 struct cgroup_subsys *ss; 1270 struct cgroup_subsys *ss;
1116 int i; 1271 int i;
1117 1272
1118 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1119
1120#ifdef CONFIG_CPUSETS 1273#ifdef CONFIG_CPUSETS
1121 mask = ~(1UL << cpuset_cgrp_id); 1274 mask = ~(1U << cpuset_cgrp_id);
1122#endif 1275#endif
1123 1276
1124 memset(opts, 0, sizeof(*opts)); 1277 memset(opts, 0, sizeof(*opts));
@@ -1199,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1199 /* Mutually exclusive option 'all' + subsystem name */ 1352 /* Mutually exclusive option 'all' + subsystem name */
1200 if (all_ss) 1353 if (all_ss)
1201 return -EINVAL; 1354 return -EINVAL;
1202 set_bit(i, &opts->subsys_mask); 1355 opts->subsys_mask |= (1 << i);
1203 one_ss = true; 1356 one_ss = true;
1204 1357
1205 break; 1358 break;
@@ -1211,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1211 /* Consistency checks */ 1364 /* Consistency checks */
1212 1365
1213 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1366 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1214 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1367 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1215 1368
1216 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1369 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1217 opts->cpuset_clone_children || opts->release_agent || 1370 opts->cpuset_clone_children || opts->release_agent ||
1218 opts->name) { 1371 opts->name) {
1219 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); 1372 pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1220 return -EINVAL; 1373 return -EINVAL;
1221 } 1374 }
1222 } else { 1375 } else {
@@ -1228,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1228 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1381 if (all_ss || (!one_ss && !opts->none && !opts->name))
1229 for_each_subsys(ss, i) 1382 for_each_subsys(ss, i)
1230 if (!ss->disabled) 1383 if (!ss->disabled)
1231 set_bit(i, &opts->subsys_mask); 1384 opts->subsys_mask |= (1 << i);
1232 1385
1233 /* 1386 /*
1234 * We either have to specify by name or by subsystems. (So 1387 * We either have to specify by name or by subsystems. (So
@@ -1259,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1259 int ret = 0; 1412 int ret = 0;
1260 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1413 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1261 struct cgroup_sb_opts opts; 1414 struct cgroup_sb_opts opts;
1262 unsigned long added_mask, removed_mask; 1415 unsigned int added_mask, removed_mask;
1263 1416
1264 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1417 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1265 pr_err("cgroup: sane_behavior: remount is not allowed\n"); 1418 pr_err("sane_behavior: remount is not allowed\n");
1266 return -EINVAL; 1419 return -EINVAL;
1267 } 1420 }
1268 1421
1269 mutex_lock(&cgroup_tree_mutex);
1270 mutex_lock(&cgroup_mutex); 1422 mutex_lock(&cgroup_mutex);
1271 1423
1272 /* See what subsystems are wanted */ 1424 /* See what subsystems are wanted */
@@ -1274,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1274 if (ret) 1426 if (ret)
1275 goto out_unlock; 1427 goto out_unlock;
1276 1428
1277 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) 1429 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1278 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1430 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1279 task_tgid_nr(current), current->comm); 1431 task_tgid_nr(current), current->comm);
1280 1432
1281 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; 1433 added_mask = opts.subsys_mask & ~root->subsys_mask;
1282 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; 1434 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1283 1435
1284 /* Don't allow flags or name to change at remount */ 1436 /* Don't allow flags or name to change at remount */
1285 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1437 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1286 (opts.name && strcmp(opts.name, root->name))) { 1438 (opts.name && strcmp(opts.name, root->name))) {
1287 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", 1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1288 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1440 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1289 root->flags & CGRP_ROOT_OPTION_MASK, root->name); 1441 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1290 ret = -EINVAL; 1442 ret = -EINVAL;
@@ -1292,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1292 } 1444 }
1293 1445
1294 /* remounting is not allowed for populated hierarchies */ 1446 /* remounting is not allowed for populated hierarchies */
1295 if (!list_empty(&root->cgrp.children)) { 1447 if (!list_empty(&root->cgrp.self.children)) {
1296 ret = -EBUSY; 1448 ret = -EBUSY;
1297 goto out_unlock; 1449 goto out_unlock;
1298 } 1450 }
@@ -1312,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1312 kfree(opts.release_agent); 1464 kfree(opts.release_agent);
1313 kfree(opts.name); 1465 kfree(opts.name);
1314 mutex_unlock(&cgroup_mutex); 1466 mutex_unlock(&cgroup_mutex);
1315 mutex_unlock(&cgroup_tree_mutex);
1316 return ret; 1467 return ret;
1317} 1468}
1318 1469
@@ -1370,14 +1521,22 @@ out_unlock:
1370 1521
1371static void init_cgroup_housekeeping(struct cgroup *cgrp) 1522static void init_cgroup_housekeeping(struct cgroup *cgrp)
1372{ 1523{
1373 atomic_set(&cgrp->refcnt, 1); 1524 struct cgroup_subsys *ss;
1374 INIT_LIST_HEAD(&cgrp->sibling); 1525 int ssid;
1375 INIT_LIST_HEAD(&cgrp->children); 1526
1527 INIT_LIST_HEAD(&cgrp->self.sibling);
1528 INIT_LIST_HEAD(&cgrp->self.children);
1376 INIT_LIST_HEAD(&cgrp->cset_links); 1529 INIT_LIST_HEAD(&cgrp->cset_links);
1377 INIT_LIST_HEAD(&cgrp->release_list); 1530 INIT_LIST_HEAD(&cgrp->release_list);
1378 INIT_LIST_HEAD(&cgrp->pidlists); 1531 INIT_LIST_HEAD(&cgrp->pidlists);
1379 mutex_init(&cgrp->pidlist_mutex); 1532 mutex_init(&cgrp->pidlist_mutex);
1380 cgrp->dummy_css.cgroup = cgrp; 1533 cgrp->self.cgroup = cgrp;
1534 cgrp->self.flags |= CSS_ONLINE;
1535
1536 for_each_subsys(ss, ssid)
1537 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1538
1539 init_waitqueue_head(&cgrp->offline_waitq);
1381} 1540}
1382 1541
1383static void init_cgroup_root(struct cgroup_root *root, 1542static void init_cgroup_root(struct cgroup_root *root,
@@ -1400,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root,
1400 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1559 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1401} 1560}
1402 1561
1403static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) 1562static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1404{ 1563{
1405 LIST_HEAD(tmp_links); 1564 LIST_HEAD(tmp_links);
1406 struct cgroup *root_cgrp = &root->cgrp; 1565 struct cgroup *root_cgrp = &root->cgrp;
1407 struct css_set *cset; 1566 struct css_set *cset;
1408 int i, ret; 1567 int i, ret;
1409 1568
1410 lockdep_assert_held(&cgroup_tree_mutex);
1411 lockdep_assert_held(&cgroup_mutex); 1569 lockdep_assert_held(&cgroup_mutex);
1412 1570
1413 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); 1571 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
1414 if (ret < 0) 1572 if (ret < 0)
1415 goto out; 1573 goto out;
1416 root_cgrp->id = ret; 1574 root_cgrp->id = ret;
1417 1575
1576 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
1577 if (ret)
1578 goto out;
1579
1418 /* 1580 /*
1419 * We're accessing css_set_count without locking css_set_rwsem here, 1581 * We're accessing css_set_count without locking css_set_rwsem here,
1420 * but that's OK - it can only be increased by someone holding 1582 * but that's OK - it can only be increased by someone holding
@@ -1423,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1423 */ 1585 */
1424 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); 1586 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1425 if (ret) 1587 if (ret)
1426 goto out; 1588 goto cancel_ref;
1427 1589
1428 ret = cgroup_init_root_id(root); 1590 ret = cgroup_init_root_id(root);
1429 if (ret) 1591 if (ret)
1430 goto out; 1592 goto cancel_ref;
1431 1593
1432 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, 1594 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1433 KERNFS_ROOT_CREATE_DEACTIVATED, 1595 KERNFS_ROOT_CREATE_DEACTIVATED,
@@ -1463,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1463 link_css_set(&tmp_links, cset, root_cgrp); 1625 link_css_set(&tmp_links, cset, root_cgrp);
1464 up_write(&css_set_rwsem); 1626 up_write(&css_set_rwsem);
1465 1627
1466 BUG_ON(!list_empty(&root_cgrp->children)); 1628 BUG_ON(!list_empty(&root_cgrp->self.children));
1467 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1629 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1468 1630
1469 kernfs_activate(root_cgrp->kn); 1631 kernfs_activate(root_cgrp->kn);
@@ -1475,6 +1637,8 @@ destroy_root:
1475 root->kf_root = NULL; 1637 root->kf_root = NULL;
1476exit_root_id: 1638exit_root_id:
1477 cgroup_exit_root_id(root); 1639 cgroup_exit_root_id(root);
1640cancel_ref:
1641 percpu_ref_cancel_init(&root_cgrp->self.refcnt);
1478out: 1642out:
1479 free_cgrp_cset_links(&tmp_links); 1643 free_cgrp_cset_links(&tmp_links);
1480 return ret; 1644 return ret;
@@ -1497,14 +1661,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1497 if (!use_task_css_set_links) 1661 if (!use_task_css_set_links)
1498 cgroup_enable_task_cg_lists(); 1662 cgroup_enable_task_cg_lists();
1499 1663
1500 mutex_lock(&cgroup_tree_mutex);
1501 mutex_lock(&cgroup_mutex); 1664 mutex_lock(&cgroup_mutex);
1502 1665
1503 /* First find the desired set of subsystems */ 1666 /* First find the desired set of subsystems */
1504 ret = parse_cgroupfs_options(data, &opts); 1667 ret = parse_cgroupfs_options(data, &opts);
1505 if (ret) 1668 if (ret)
1506 goto out_unlock; 1669 goto out_unlock;
1507retry: 1670
1508 /* look for a matching existing root */ 1671 /* look for a matching existing root */
1509 if (!opts.subsys_mask && !opts.none && !opts.name) { 1672 if (!opts.subsys_mask && !opts.none && !opts.name) {
1510 cgrp_dfl_root_visible = true; 1673 cgrp_dfl_root_visible = true;
@@ -1536,7 +1699,7 @@ retry:
1536 * subsystems) then they must match. 1699 * subsystems) then they must match.
1537 */ 1700 */
1538 if ((opts.subsys_mask || opts.none) && 1701 if ((opts.subsys_mask || opts.none) &&
1539 (opts.subsys_mask != root->cgrp.subsys_mask)) { 1702 (opts.subsys_mask != root->subsys_mask)) {
1540 if (!name_match) 1703 if (!name_match)
1541 continue; 1704 continue;
1542 ret = -EBUSY; 1705 ret = -EBUSY;
@@ -1545,28 +1708,27 @@ retry:
1545 1708
1546 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1709 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1547 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1710 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1548 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1711 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1549 ret = -EINVAL; 1712 ret = -EINVAL;
1550 goto out_unlock; 1713 goto out_unlock;
1551 } else { 1714 } else {
1552 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1715 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1553 } 1716 }
1554 } 1717 }
1555 1718
1556 /* 1719 /*
1557 * A root's lifetime is governed by its root cgroup. Zero 1720 * A root's lifetime is governed by its root cgroup.
1558 * ref indicate that the root is being destroyed. Wait for 1721 * tryget_live failure indicates that the root is being
1559 * destruction to complete so that the subsystems are free. 1722 * destroyed. Wait for destruction to complete so that the
1560 * We can use wait_queue for the wait but this path is 1723 * subsystems are free. We can use wait_queue for the wait
1561 * super cold. Let's just sleep for a bit and retry. 1724 * but this path is super cold. Let's just sleep for a bit
1725 * and retry.
1562 */ 1726 */
1563 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { 1727 if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1564 mutex_unlock(&cgroup_mutex); 1728 mutex_unlock(&cgroup_mutex);
1565 mutex_unlock(&cgroup_tree_mutex);
1566 msleep(10); 1729 msleep(10);
1567 mutex_lock(&cgroup_tree_mutex); 1730 ret = restart_syscall();
1568 mutex_lock(&cgroup_mutex); 1731 goto out_free;
1569 goto retry;
1570 } 1732 }
1571 1733
1572 ret = 0; 1734 ret = 0;
@@ -1597,8 +1759,7 @@ retry:
1597 1759
1598out_unlock: 1760out_unlock:
1599 mutex_unlock(&cgroup_mutex); 1761 mutex_unlock(&cgroup_mutex);
1600 mutex_unlock(&cgroup_tree_mutex); 1762out_free:
1601
1602 kfree(opts.release_agent); 1763 kfree(opts.release_agent);
1603 kfree(opts.name); 1764 kfree(opts.name);
1604 1765
@@ -1617,7 +1778,19 @@ static void cgroup_kill_sb(struct super_block *sb)
1617 struct kernfs_root *kf_root = kernfs_root_from_sb(sb); 1778 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1618 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1779 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1619 1780
1620 cgroup_put(&root->cgrp); 1781 /*
1782 * If @root doesn't have any mounts or children, start killing it.
1783 * This prevents new mounts by disabling percpu_ref_tryget_live().
1784 * cgroup_mount() may wait for @root's release.
1785 *
1786 * And don't kill the default root.
1787 */
1788 if (css_has_online_children(&root->cgrp.self) ||
1789 root == &cgrp_dfl_root)
1790 cgroup_put(&root->cgrp);
1791 else
1792 percpu_ref_kill(&root->cgrp.self.refcnt);
1793
1621 kernfs_kill_sb(sb); 1794 kernfs_kill_sb(sb);
1622} 1795}
1623 1796
@@ -1739,7 +1912,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1739 1912
1740/** 1913/**
1741 * cgroup_task_migrate - move a task from one cgroup to another. 1914 * cgroup_task_migrate - move a task from one cgroup to another.
1742 * @old_cgrp; the cgroup @tsk is being migrated from 1915 * @old_cgrp: the cgroup @tsk is being migrated from
1743 * @tsk: the task being migrated 1916 * @tsk: the task being migrated
1744 * @new_cset: the new css_set @tsk is being attached to 1917 * @new_cset: the new css_set @tsk is being attached to
1745 * 1918 *
@@ -1831,10 +2004,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1831 2004
1832 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 2005 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1833 2006
1834 /* nothing to do if this cset already belongs to the cgroup */
1835 if (src_cgrp == dst_cgrp)
1836 return;
1837
1838 if (!list_empty(&src_cset->mg_preload_node)) 2007 if (!list_empty(&src_cset->mg_preload_node))
1839 return; 2008 return;
1840 2009
@@ -1849,13 +2018,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1849 2018
1850/** 2019/**
1851 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 2020 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1852 * @dst_cgrp: the destination cgroup 2021 * @dst_cgrp: the destination cgroup (may be %NULL)
1853 * @preloaded_csets: list of preloaded source css_sets 2022 * @preloaded_csets: list of preloaded source css_sets
1854 * 2023 *
1855 * Tasks are about to be moved to @dst_cgrp and all the source css_sets 2024 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1856 * have been preloaded to @preloaded_csets. This function looks up and 2025 * have been preloaded to @preloaded_csets. This function looks up and
1857 * pins all destination css_sets, links each to its source, and put them on 2026 * pins all destination css_sets, links each to its source, and appends them
1858 * @preloaded_csets. 2027 * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
2028 * source css_set is assumed to be its cgroup on the default hierarchy.
1859 * 2029 *
1860 * This function must be called after cgroup_migrate_add_src() has been 2030 * This function must be called after cgroup_migrate_add_src() has been
1861 * called on each migration source css_set. After migration is performed 2031 * called on each migration source css_set. After migration is performed
@@ -1866,19 +2036,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1866 struct list_head *preloaded_csets) 2036 struct list_head *preloaded_csets)
1867{ 2037{
1868 LIST_HEAD(csets); 2038 LIST_HEAD(csets);
1869 struct css_set *src_cset; 2039 struct css_set *src_cset, *tmp_cset;
1870 2040
1871 lockdep_assert_held(&cgroup_mutex); 2041 lockdep_assert_held(&cgroup_mutex);
1872 2042
2043 /*
2044 * Except for the root, child_subsys_mask must be zero for a cgroup
2045 * with tasks so that child cgroups don't compete against tasks.
2046 */
2047 if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
2048 dst_cgrp->child_subsys_mask)
2049 return -EBUSY;
2050
1873 /* look up the dst cset for each src cset and link it to src */ 2051 /* look up the dst cset for each src cset and link it to src */
1874 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { 2052 list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
1875 struct css_set *dst_cset; 2053 struct css_set *dst_cset;
1876 2054
1877 dst_cset = find_css_set(src_cset, dst_cgrp); 2055 dst_cset = find_css_set(src_cset,
2056 dst_cgrp ?: src_cset->dfl_cgrp);
1878 if (!dst_cset) 2057 if (!dst_cset)
1879 goto err; 2058 goto err;
1880 2059
1881 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); 2060 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2061
2062 /*
2063 * If src cset equals dst, it's noop. Drop the src.
2064 * cgroup_migrate() will skip the cset too. Note that we
2065 * can't handle src == dst as some nodes are used by both.
2066 */
2067 if (src_cset == dst_cset) {
2068 src_cset->mg_src_cgrp = NULL;
2069 list_del_init(&src_cset->mg_preload_node);
2070 put_css_set(src_cset, false);
2071 put_css_set(dst_cset, false);
2072 continue;
2073 }
2074
1882 src_cset->mg_dst_cset = dst_cset; 2075 src_cset->mg_dst_cset = dst_cset;
1883 2076
1884 if (list_empty(&dst_cset->mg_preload_node)) 2077 if (list_empty(&dst_cset->mg_preload_node))
@@ -1887,7 +2080,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1887 put_css_set(dst_cset, false); 2080 put_css_set(dst_cset, false);
1888 } 2081 }
1889 2082
1890 list_splice(&csets, preloaded_csets); 2083 list_splice_tail(&csets, preloaded_csets);
1891 return 0; 2084 return 0;
1892err: 2085err:
1893 cgroup_migrate_finish(&csets); 2086 cgroup_migrate_finish(&csets);
@@ -1968,7 +2161,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1968 return 0; 2161 return 0;
1969 2162
1970 /* check that we can legitimately attach to the cgroup */ 2163 /* check that we can legitimately attach to the cgroup */
1971 for_each_css(css, i, cgrp) { 2164 for_each_e_css(css, i, cgrp) {
1972 if (css->ss->can_attach) { 2165 if (css->ss->can_attach) {
1973 ret = css->ss->can_attach(css, &tset); 2166 ret = css->ss->can_attach(css, &tset);
1974 if (ret) { 2167 if (ret) {
@@ -1998,7 +2191,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1998 */ 2191 */
1999 tset.csets = &tset.dst_csets; 2192 tset.csets = &tset.dst_csets;
2000 2193
2001 for_each_css(css, i, cgrp) 2194 for_each_e_css(css, i, cgrp)
2002 if (css->ss->attach) 2195 if (css->ss->attach)
2003 css->ss->attach(css, &tset); 2196 css->ss->attach(css, &tset);
2004 2197
@@ -2006,7 +2199,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
2006 goto out_release_tset; 2199 goto out_release_tset;
2007 2200
2008out_cancel_attach: 2201out_cancel_attach:
2009 for_each_css(css, i, cgrp) { 2202 for_each_e_css(css, i, cgrp) {
2010 if (css == failed_css) 2203 if (css == failed_css)
2011 break; 2204 break;
2012 if (css->ss->cancel_attach) 2205 if (css->ss->cancel_attach)
@@ -2065,13 +2258,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2065 * function to attach either it or all tasks in its threadgroup. Will lock 2258 * function to attach either it or all tasks in its threadgroup. Will lock
2066 * cgroup_mutex and threadgroup. 2259 * cgroup_mutex and threadgroup.
2067 */ 2260 */
2068static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2261static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2262 size_t nbytes, loff_t off, bool threadgroup)
2069{ 2263{
2070 struct task_struct *tsk; 2264 struct task_struct *tsk;
2071 const struct cred *cred = current_cred(), *tcred; 2265 const struct cred *cred = current_cred(), *tcred;
2266 struct cgroup *cgrp;
2267 pid_t pid;
2072 int ret; 2268 int ret;
2073 2269
2074 if (!cgroup_lock_live_group(cgrp)) 2270 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2271 return -EINVAL;
2272
2273 cgrp = cgroup_kn_lock_live(of->kn);
2274 if (!cgrp)
2075 return -ENODEV; 2275 return -ENODEV;
2076 2276
2077retry_find_task: 2277retry_find_task:
@@ -2137,8 +2337,8 @@ retry_find_task:
2137 2337
2138 put_task_struct(tsk); 2338 put_task_struct(tsk);
2139out_unlock_cgroup: 2339out_unlock_cgroup:
2140 mutex_unlock(&cgroup_mutex); 2340 cgroup_kn_unlock(of->kn);
2141 return ret; 2341 return ret ?: nbytes;
2142} 2342}
2143 2343
2144/** 2344/**
@@ -2172,43 +2372,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2172} 2372}
2173EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2373EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2174 2374
2175static int cgroup_tasks_write(struct cgroup_subsys_state *css, 2375static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2176 struct cftype *cft, u64 pid) 2376 char *buf, size_t nbytes, loff_t off)
2177{ 2377{
2178 return attach_task_by_pid(css->cgroup, pid, false); 2378 return __cgroup_procs_write(of, buf, nbytes, off, false);
2179} 2379}
2180 2380
2181static int cgroup_procs_write(struct cgroup_subsys_state *css, 2381static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2182 struct cftype *cft, u64 tgid) 2382 char *buf, size_t nbytes, loff_t off)
2183{ 2383{
2184 return attach_task_by_pid(css->cgroup, tgid, true); 2384 return __cgroup_procs_write(of, buf, nbytes, off, true);
2185} 2385}
2186 2386
2187static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2387static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2188 struct cftype *cft, char *buffer) 2388 char *buf, size_t nbytes, loff_t off)
2189{ 2389{
2190 struct cgroup_root *root = css->cgroup->root; 2390 struct cgroup *cgrp;
2191 2391
2192 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); 2392 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2193 if (!cgroup_lock_live_group(css->cgroup)) 2393
2394 cgrp = cgroup_kn_lock_live(of->kn);
2395 if (!cgrp)
2194 return -ENODEV; 2396 return -ENODEV;
2195 spin_lock(&release_agent_path_lock); 2397 spin_lock(&release_agent_path_lock);
2196 strlcpy(root->release_agent_path, buffer, 2398 strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2197 sizeof(root->release_agent_path)); 2399 sizeof(cgrp->root->release_agent_path));
2198 spin_unlock(&release_agent_path_lock); 2400 spin_unlock(&release_agent_path_lock);
2199 mutex_unlock(&cgroup_mutex); 2401 cgroup_kn_unlock(of->kn);
2200 return 0; 2402 return nbytes;
2201} 2403}
2202 2404
2203static int cgroup_release_agent_show(struct seq_file *seq, void *v) 2405static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2204{ 2406{
2205 struct cgroup *cgrp = seq_css(seq)->cgroup; 2407 struct cgroup *cgrp = seq_css(seq)->cgroup;
2206 2408
2207 if (!cgroup_lock_live_group(cgrp)) 2409 spin_lock(&release_agent_path_lock);
2208 return -ENODEV;
2209 seq_puts(seq, cgrp->root->release_agent_path); 2410 seq_puts(seq, cgrp->root->release_agent_path);
2411 spin_unlock(&release_agent_path_lock);
2210 seq_putc(seq, '\n'); 2412 seq_putc(seq, '\n');
2211 mutex_unlock(&cgroup_mutex);
2212 return 0; 2413 return 0;
2213} 2414}
2214 2415
@@ -2220,6 +2421,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2220 return 0; 2421 return 0;
2221} 2422}
2222 2423
2424static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2425{
2426 struct cgroup_subsys *ss;
2427 bool printed = false;
2428 int ssid;
2429
2430 for_each_subsys(ss, ssid) {
2431 if (ss_mask & (1 << ssid)) {
2432 if (printed)
2433 seq_putc(seq, ' ');
2434 seq_printf(seq, "%s", ss->name);
2435 printed = true;
2436 }
2437 }
2438 if (printed)
2439 seq_putc(seq, '\n');
2440}
2441
2442/* show controllers which are currently attached to the default hierarchy */
2443static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2444{
2445 struct cgroup *cgrp = seq_css(seq)->cgroup;
2446
2447 cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
2448 ~cgrp_dfl_root_inhibit_ss_mask);
2449 return 0;
2450}
2451
2452/* show controllers which are enabled from the parent */
2453static int cgroup_controllers_show(struct seq_file *seq, void *v)
2454{
2455 struct cgroup *cgrp = seq_css(seq)->cgroup;
2456
2457 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
2458 return 0;
2459}
2460
2461/* show controllers which are enabled for a given cgroup's children */
2462static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2463{
2464 struct cgroup *cgrp = seq_css(seq)->cgroup;
2465
2466 cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
2467 return 0;
2468}
2469
2470/**
2471 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2472 * @cgrp: root of the subtree to update csses for
2473 *
2474 * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2475 * css associations need to be updated accordingly. This function looks up
2476 * all css_sets which are attached to the subtree, creates the matching
2477 * updated css_sets and migrates the tasks to the new ones.
2478 */
2479static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2480{
2481 LIST_HEAD(preloaded_csets);
2482 struct cgroup_subsys_state *css;
2483 struct css_set *src_cset;
2484 int ret;
2485
2486 lockdep_assert_held(&cgroup_mutex);
2487
2488 /* look up all csses currently attached to @cgrp's subtree */
2489 down_read(&css_set_rwsem);
2490 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2491 struct cgrp_cset_link *link;
2492
2493 /* self is not affected by child_subsys_mask change */
2494 if (css->cgroup == cgrp)
2495 continue;
2496
2497 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2498 cgroup_migrate_add_src(link->cset, cgrp,
2499 &preloaded_csets);
2500 }
2501 up_read(&css_set_rwsem);
2502
2503 /* NULL dst indicates self on default hierarchy */
2504 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2505 if (ret)
2506 goto out_finish;
2507
2508 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2509 struct task_struct *last_task = NULL, *task;
2510
2511 /* src_csets precede dst_csets, break on the first dst_cset */
2512 if (!src_cset->mg_src_cgrp)
2513 break;
2514
2515 /*
2516 * All tasks in src_cset need to be migrated to the
2517 * matching dst_cset. Empty it process by process. We
2518 * walk tasks but migrate processes. The leader might even
2519 * belong to a different cset but such src_cset would also
2520 * be among the target src_csets because the default
2521 * hierarchy enforces per-process membership.
2522 */
2523 while (true) {
2524 down_read(&css_set_rwsem);
2525 task = list_first_entry_or_null(&src_cset->tasks,
2526 struct task_struct, cg_list);
2527 if (task) {
2528 task = task->group_leader;
2529 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2530 get_task_struct(task);
2531 }
2532 up_read(&css_set_rwsem);
2533
2534 if (!task)
2535 break;
2536
2537 /* guard against possible infinite loop */
2538 if (WARN(last_task == task,
2539 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2540 goto out_finish;
2541 last_task = task;
2542
2543 threadgroup_lock(task);
2544 /* raced against de_thread() from another thread? */
2545 if (!thread_group_leader(task)) {
2546 threadgroup_unlock(task);
2547 put_task_struct(task);
2548 continue;
2549 }
2550
2551 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2552
2553 threadgroup_unlock(task);
2554 put_task_struct(task);
2555
2556 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2557 goto out_finish;
2558 }
2559 }
2560
2561out_finish:
2562 cgroup_migrate_finish(&preloaded_csets);
2563 return ret;
2564}
2565
2566/* change the enabled child controllers for a cgroup in the default hierarchy */
2567static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2568 char *buf, size_t nbytes,
2569 loff_t off)
2570{
2571 unsigned int enable = 0, disable = 0;
2572 struct cgroup *cgrp, *child;
2573 struct cgroup_subsys *ss;
2574 char *tok;
2575 int ssid, ret;
2576
2577 /*
2578 * Parse input - space separated list of subsystem names prefixed
2579 * with either + or -.
2580 */
2581 buf = strstrip(buf);
2582 while ((tok = strsep(&buf, " "))) {
2583 if (tok[0] == '\0')
2584 continue;
2585 for_each_subsys(ss, ssid) {
2586 if (ss->disabled || strcmp(tok + 1, ss->name) ||
2587 ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
2588 continue;
2589
2590 if (*tok == '+') {
2591 enable |= 1 << ssid;
2592 disable &= ~(1 << ssid);
2593 } else if (*tok == '-') {
2594 disable |= 1 << ssid;
2595 enable &= ~(1 << ssid);
2596 } else {
2597 return -EINVAL;
2598 }
2599 break;
2600 }
2601 if (ssid == CGROUP_SUBSYS_COUNT)
2602 return -EINVAL;
2603 }
2604
2605 cgrp = cgroup_kn_lock_live(of->kn);
2606 if (!cgrp)
2607 return -ENODEV;
2608
2609 for_each_subsys(ss, ssid) {
2610 if (enable & (1 << ssid)) {
2611 if (cgrp->child_subsys_mask & (1 << ssid)) {
2612 enable &= ~(1 << ssid);
2613 continue;
2614 }
2615
2616 /*
2617 * Because css offlining is asynchronous, userland
2618 * might try to re-enable the same controller while
2619 * the previous instance is still around. In such
2620 * cases, wait till it's gone using offline_waitq.
2621 */
2622 cgroup_for_each_live_child(child, cgrp) {
2623 DEFINE_WAIT(wait);
2624
2625 if (!cgroup_css(child, ss))
2626 continue;
2627
2628 cgroup_get(child);
2629 prepare_to_wait(&child->offline_waitq, &wait,
2630 TASK_UNINTERRUPTIBLE);
2631 cgroup_kn_unlock(of->kn);
2632 schedule();
2633 finish_wait(&child->offline_waitq, &wait);
2634 cgroup_put(child);
2635
2636 return restart_syscall();
2637 }
2638
2639 /* unavailable or not enabled on the parent? */
2640 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2641 (cgroup_parent(cgrp) &&
2642 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
2643 ret = -ENOENT;
2644 goto out_unlock;
2645 }
2646 } else if (disable & (1 << ssid)) {
2647 if (!(cgrp->child_subsys_mask & (1 << ssid))) {
2648 disable &= ~(1 << ssid);
2649 continue;
2650 }
2651
2652 /* a child has it enabled? */
2653 cgroup_for_each_live_child(child, cgrp) {
2654 if (child->child_subsys_mask & (1 << ssid)) {
2655 ret = -EBUSY;
2656 goto out_unlock;
2657 }
2658 }
2659 }
2660 }
2661
2662 if (!enable && !disable) {
2663 ret = 0;
2664 goto out_unlock;
2665 }
2666
2667 /*
2668 * Except for the root, child_subsys_mask must be zero for a cgroup
2669 * with tasks so that child cgroups don't compete against tasks.
2670 */
2671 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
2672 ret = -EBUSY;
2673 goto out_unlock;
2674 }
2675
2676 /*
2677 * Create csses for enables and update child_subsys_mask. This
2678 * changes cgroup_e_css() results which in turn makes the
2679 * subsequent cgroup_update_dfl_csses() associate all tasks in the
2680 * subtree to the updated csses.
2681 */
2682 for_each_subsys(ss, ssid) {
2683 if (!(enable & (1 << ssid)))
2684 continue;
2685
2686 cgroup_for_each_live_child(child, cgrp) {
2687 ret = create_css(child, ss);
2688 if (ret)
2689 goto err_undo_css;
2690 }
2691 }
2692
2693 cgrp->child_subsys_mask |= enable;
2694 cgrp->child_subsys_mask &= ~disable;
2695
2696 ret = cgroup_update_dfl_csses(cgrp);
2697 if (ret)
2698 goto err_undo_css;
2699
2700 /* all tasks are now migrated away from the old csses, kill them */
2701 for_each_subsys(ss, ssid) {
2702 if (!(disable & (1 << ssid)))
2703 continue;
2704
2705 cgroup_for_each_live_child(child, cgrp)
2706 kill_css(cgroup_css(child, ss));
2707 }
2708
2709 kernfs_activate(cgrp->kn);
2710 ret = 0;
2711out_unlock:
2712 cgroup_kn_unlock(of->kn);
2713 return ret ?: nbytes;
2714
2715err_undo_css:
2716 cgrp->child_subsys_mask &= ~enable;
2717 cgrp->child_subsys_mask |= disable;
2718
2719 for_each_subsys(ss, ssid) {
2720 if (!(enable & (1 << ssid)))
2721 continue;
2722
2723 cgroup_for_each_live_child(child, cgrp) {
2724 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2725 if (css)
2726 kill_css(css);
2727 }
2728 }
2729 goto out_unlock;
2730}
2731
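To make the new interface concrete, a rough userspace sketch of driving cgroup.subtree_control follows; the mount point and the cgroup name "mygrp" are assumptions, not anything established by this patch. Tokens are space separated and carry a '+' or '-' prefix, exactly as parsed by cgroup_subtree_control_write() above.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *req = "+memory -cpu";	/* enable memory, disable cpu for children */
	int fd = open("/sys/fs/cgroup/mygrp/cgroup.subtree_control", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, req, strlen(req)) < 0)
		return 1;	/* EBUSY if mygrp itself has tasks,
				   ENOENT if not enabled on the parent */
	close(fd);
	return 0;
}

If a controller is re-enabled while its previous css is still going offline, the write waits on offline_waitq and restarts (restart_syscall()), so userspace only sees the call block briefly.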
2732static int cgroup_populated_show(struct seq_file *seq, void *v)
2733{
2734 seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2735 return 0;
2736}
2737
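cgroup.populated reads back "0" or "1" (the (bool) cast above). Because the file's kernfs node is stashed in populated_kn, it can be kicked via kernfs_notify() when the count changes; that wiring is outside this hunk, so the watcher below is only a hypothetical sketch and the path is again assumed.

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4] = "";
	int fd = open("/sys/fs/cgroup/mygrp/cgroup.populated", O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI };

	if (fd < 0)
		return 1;
	for (;;) {
		if (pread(fd, buf, sizeof(buf) - 1, 0) < 0)
			return 1;
		if (buf[0] == '0')	/* no tasks left anywhere in the subtree */
			break;
		poll(&pfd, 1, -1);	/* woken when the kernel notifies the file */
	}
	printf("cgroup is empty\n");
	close(fd);
	return 0;
}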
2223static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2738static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2224 size_t nbytes, loff_t off) 2739 size_t nbytes, loff_t off)
2225{ 2740{
@@ -2228,6 +2743,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2228 struct cgroup_subsys_state *css; 2743 struct cgroup_subsys_state *css;
2229 int ret; 2744 int ret;
2230 2745
2746 if (cft->write)
2747 return cft->write(of, buf, nbytes, off);
2748
2231 /* 2749 /*
2232 * kernfs guarantees that a file isn't deleted with operations in 2750 * kernfs guarantees that a file isn't deleted with operations in
2233 * flight, which means that the matching css is and stays alive and 2751 * flight, which means that the matching css is and stays alive and
@@ -2238,9 +2756,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2238 css = cgroup_css(cgrp, cft->ss); 2756 css = cgroup_css(cgrp, cft->ss);
2239 rcu_read_unlock(); 2757 rcu_read_unlock();
2240 2758
2241 if (cft->write_string) { 2759 if (cft->write_u64) {
2242 ret = cft->write_string(css, cft, strstrip(buf));
2243 } else if (cft->write_u64) {
2244 unsigned long long v; 2760 unsigned long long v;
2245 ret = kstrtoull(buf, 0, &v); 2761 ret = kstrtoull(buf, 0, &v);
2246 if (!ret) 2762 if (!ret)
@@ -2250,8 +2766,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2250 ret = kstrtoll(buf, 0, &v); 2766 ret = kstrtoll(buf, 0, &v);
2251 if (!ret) 2767 if (!ret)
2252 ret = cft->write_s64(css, cft, v); 2768 ret = cft->write_s64(css, cft, v);
2253 } else if (cft->trigger) {
2254 ret = cft->trigger(css, (unsigned int)cft->private);
2255 } else { 2769 } else {
2256 ret = -EINVAL; 2770 ret = -EINVAL;
2257 } 2771 }
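With ->write_string() and ->trigger() dropped from the dispatch above, a controller that wants raw text supplies ->write() directly. A hypothetical cftype illustrating the new callback signature (all names made up):

#include <linux/cgroup.h>

static ssize_t my_limit_write(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, loff_t off)
{
	/* parse the NUL-terminated buffer and apply the setting */
	return nbytes;		/* or a -errno on failure */
}

static struct cftype my_files[] = {
	{
		.name	= "example.limit",
		.write	= my_limit_write,
	},
	{ }	/* terminate */
};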
@@ -2328,20 +2842,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2328 return -EPERM; 2842 return -EPERM;
2329 2843
2330 /* 2844 /*
2331 * We're gonna grab cgroup_tree_mutex which nests outside kernfs 2845 * We're gonna grab cgroup_mutex which nests outside kernfs
2332 * active_ref. kernfs_rename() doesn't require active_ref 2846 * active_ref. kernfs_rename() doesn't require active_ref
2333 * protection. Break them before grabbing cgroup_tree_mutex. 2847 * protection. Break them before grabbing cgroup_mutex.
2334 */ 2848 */
2335 kernfs_break_active_protection(new_parent); 2849 kernfs_break_active_protection(new_parent);
2336 kernfs_break_active_protection(kn); 2850 kernfs_break_active_protection(kn);
2337 2851
2338 mutex_lock(&cgroup_tree_mutex);
2339 mutex_lock(&cgroup_mutex); 2852 mutex_lock(&cgroup_mutex);
2340 2853
2341 ret = kernfs_rename(kn, new_parent, new_name_str); 2854 ret = kernfs_rename(kn, new_parent, new_name_str);
2342 2855
2343 mutex_unlock(&cgroup_mutex); 2856 mutex_unlock(&cgroup_mutex);
2344 mutex_unlock(&cgroup_tree_mutex);
2345 2857
2346 kernfs_unbreak_active_protection(kn); 2858 kernfs_unbreak_active_protection(kn);
2347 kernfs_unbreak_active_protection(new_parent); 2859 kernfs_unbreak_active_protection(new_parent);
@@ -2379,9 +2891,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2379 return PTR_ERR(kn); 2891 return PTR_ERR(kn);
2380 2892
2381 ret = cgroup_kn_set_ugid(kn); 2893 ret = cgroup_kn_set_ugid(kn);
2382 if (ret) 2894 if (ret) {
2383 kernfs_remove(kn); 2895 kernfs_remove(kn);
2384 return ret; 2896 return ret;
2897 }
2898
2899 if (cft->seq_show == cgroup_populated_show)
2900 cgrp->populated_kn = kn;
2901 return 0;
2385} 2902}
2386 2903
2387/** 2904/**
@@ -2401,7 +2918,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2401 struct cftype *cft; 2918 struct cftype *cft;
2402 int ret; 2919 int ret;
2403 2920
2404 lockdep_assert_held(&cgroup_tree_mutex); 2921 lockdep_assert_held(&cgroup_mutex);
2405 2922
2406 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2923 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2407 /* does cft->flags tell us to skip this file on @cgrp? */ 2924 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2409,16 +2926,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2409 continue; 2926 continue;
2410 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2927 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2411 continue; 2928 continue;
2412 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2929 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
2413 continue; 2930 continue;
2414 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2931 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
2415 continue; 2932 continue;
2416 2933
2417 if (is_add) { 2934 if (is_add) {
2418 ret = cgroup_add_file(cgrp, cft); 2935 ret = cgroup_add_file(cgrp, cft);
2419 if (ret) { 2936 if (ret) {
2420 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2937 pr_warn("%s: failed to add %s, err=%d\n",
2421 cft->name, ret); 2938 __func__, cft->name, ret);
2422 return ret; 2939 return ret;
2423 } 2940 }
2424 } else { 2941 } else {
@@ -2436,11 +2953,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2436 struct cgroup_subsys_state *css; 2953 struct cgroup_subsys_state *css;
2437 int ret = 0; 2954 int ret = 0;
2438 2955
2439 lockdep_assert_held(&cgroup_tree_mutex); 2956 lockdep_assert_held(&cgroup_mutex);
2440
2441 /* don't bother if @ss isn't attached */
2442 if (ss->root == &cgrp_dfl_root)
2443 return 0;
2444 2957
2445 /* add/rm files for all cgroups created before */ 2958 /* add/rm files for all cgroups created before */
2446 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2959 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2508,7 +3021,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2508 3021
2509static int cgroup_rm_cftypes_locked(struct cftype *cfts) 3022static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2510{ 3023{
2511 lockdep_assert_held(&cgroup_tree_mutex); 3024 lockdep_assert_held(&cgroup_mutex);
2512 3025
2513 if (!cfts || !cfts[0].ss) 3026 if (!cfts || !cfts[0].ss)
2514 return -ENOENT; 3027 return -ENOENT;
@@ -2534,9 +3047,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2534{ 3047{
2535 int ret; 3048 int ret;
2536 3049
2537 mutex_lock(&cgroup_tree_mutex); 3050 mutex_lock(&cgroup_mutex);
2538 ret = cgroup_rm_cftypes_locked(cfts); 3051 ret = cgroup_rm_cftypes_locked(cfts);
2539 mutex_unlock(&cgroup_tree_mutex); 3052 mutex_unlock(&cgroup_mutex);
2540 return ret; 3053 return ret;
2541} 3054}
2542 3055
@@ -2558,6 +3071,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2558{ 3071{
2559 int ret; 3072 int ret;
2560 3073
3074 if (ss->disabled)
3075 return 0;
3076
2561 if (!cfts || cfts[0].name[0] == '\0') 3077 if (!cfts || cfts[0].name[0] == '\0')
2562 return 0; 3078 return 0;
2563 3079
@@ -2565,14 +3081,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2565 if (ret) 3081 if (ret)
2566 return ret; 3082 return ret;
2567 3083
2568 mutex_lock(&cgroup_tree_mutex); 3084 mutex_lock(&cgroup_mutex);
2569 3085
2570 list_add_tail(&cfts->node, &ss->cfts); 3086 list_add_tail(&cfts->node, &ss->cfts);
2571 ret = cgroup_apply_cftypes(cfts, true); 3087 ret = cgroup_apply_cftypes(cfts, true);
2572 if (ret) 3088 if (ret)
2573 cgroup_rm_cftypes_locked(cfts); 3089 cgroup_rm_cftypes_locked(cfts);
2574 3090
2575 mutex_unlock(&cgroup_tree_mutex); 3091 mutex_unlock(&cgroup_mutex);
2576 return ret; 3092 return ret;
2577} 3093}
2578 3094
@@ -2596,57 +3112,65 @@ static int cgroup_task_count(const struct cgroup *cgrp)
2596 3112
2597/** 3113/**
2598 * css_next_child - find the next child of a given css 3114 * css_next_child - find the next child of a given css
2599 * @pos_css: the current position (%NULL to initiate traversal) 3115 * @pos: the current position (%NULL to initiate traversal)
2600 * @parent_css: css whose children to walk 3116 * @parent: css whose children to walk
2601 * 3117 *
2602 * This function returns the next child of @parent_css and should be called 3118 * This function returns the next child of @parent and should be called
2603 * under either cgroup_mutex or RCU read lock. The only requirement is 3119 * under either cgroup_mutex or RCU read lock. The only requirement is
2604 * that @parent_css and @pos_css are accessible. The next sibling is 3120 * that @parent and @pos are accessible. The next sibling is guaranteed to
2605 * guaranteed to be returned regardless of their states. 3121 * be returned regardless of their states.
3122 *
3123 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3124 * css which finished ->css_online() is guaranteed to be visible in the
3125 * future iterations and will stay visible until the last reference is put.
3126 * A css which hasn't finished ->css_online() or already finished
3127 * ->css_offline() may show up during traversal. It's each subsystem's
3128 * responsibility to synchronize against on/offlining.
2606 */ 3129 */
2607struct cgroup_subsys_state * 3130struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
2608css_next_child(struct cgroup_subsys_state *pos_css, 3131 struct cgroup_subsys_state *parent)
2609 struct cgroup_subsys_state *parent_css)
2610{ 3132{
2611 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; 3133 struct cgroup_subsys_state *next;
2612 struct cgroup *cgrp = parent_css->cgroup;
2613 struct cgroup *next;
2614 3134
2615 cgroup_assert_mutexes_or_rcu_locked(); 3135 cgroup_assert_mutex_or_rcu_locked();
2616 3136
2617 /* 3137 /*
2618 * @pos could already have been removed. Once a cgroup is removed, 3138 * @pos could already have been unlinked from the sibling list.
2619 * its ->sibling.next is no longer updated when its next sibling 3139 * Once a cgroup is removed, its ->sibling.next is no longer
2620 * changes. As CGRP_DEAD assertion is serialized and happens 3140 * updated when its next sibling changes. CSS_RELEASED is set when
2621 * before the cgroup is taken off the ->sibling list, if we see it 3141 * @pos is taken off list, at which time its next pointer is valid,
2622 * unasserted, it's guaranteed that the next sibling hasn't 3142 * and, as releases are serialized, the one pointed to by the next
2623 * finished its grace period even if it's already removed, and thus 3143 * pointer is guaranteed to not have started release yet. This
2624 * safe to dereference from this RCU critical section. If 3144 * implies that if we observe !CSS_RELEASED on @pos in this RCU
2625 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3145 * critical section, the one pointed to by its next pointer is
2626 * to be visible as %true here. 3146 * guaranteed to not have finished its RCU grace period even if we
3147 * have dropped rcu_read_lock() in between iterations.
2627 * 3148 *
2628 * If @pos is dead, its next pointer can't be dereferenced; 3149 * If @pos has CSS_RELEASED set, its next pointer can't be
2629 * however, as each cgroup is given a monotonically increasing 3150 * dereferenced; however, as each css is given a monotonically
2630 * unique serial number and always appended to the sibling list, 3151 * increasing unique serial number and always appended to the
2631 * the next one can be found by walking the parent's children until 3152 * sibling list, the next one can be found by walking the parent's
2632 * we see a cgroup with higher serial number than @pos's. While 3153 * children until the first css with higher serial number than
2633 * this path can be slower, it's taken only when either the current 3154 * @pos's. While this path can be slower, it happens iff iteration
2634 * cgroup is removed or iteration and removal race. 3155 * races against release and the race window is very small.
2635 */ 3156 */
2636 if (!pos) { 3157 if (!pos) {
2637 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); 3158 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
2638 } else if (likely(!cgroup_is_dead(pos))) { 3159 } else if (likely(!(pos->flags & CSS_RELEASED))) {
2639 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3160 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
2640 } else { 3161 } else {
2641 list_for_each_entry_rcu(next, &cgrp->children, sibling) 3162 list_for_each_entry_rcu(next, &parent->children, sibling)
2642 if (next->serial_nr > pos->serial_nr) 3163 if (next->serial_nr > pos->serial_nr)
2643 break; 3164 break;
2644 } 3165 }
2645 3166
2646 if (&next->sibling == &cgrp->children) 3167 /*
2647 return NULL; 3168 * @next, if not pointing to the head, can be dereferenced and is
2648 3169 * the next sibling.
2649 return cgroup_css(next, parent_css->ss); 3170 */
3171 if (&next->sibling != &parent->children)
3172 return next;
3173 return NULL;
2650} 3174}
2651 3175
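A minimal sketch of walking children with the reworked iterator; as the comment above says, the caller only needs the RCU read lock (or cgroup_mutex) and is responsible for any on/offline synchronization. parent_css and do_something() are placeholders:

struct cgroup_subsys_state *child;

rcu_read_lock();
css_for_each_child(child, parent_css) {
	/* online state isn't guaranteed here; check it when it matters */
	if (child->flags & CSS_ONLINE)
		do_something(child);
}
rcu_read_unlock();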
2652/** 3176/**
@@ -2662,6 +3186,13 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2662 * doesn't require the whole traversal to be contained in a single critical 3186 * doesn't require the whole traversal to be contained in a single critical
2663 * section. This function will return the correct next descendant as long 3187 * section. This function will return the correct next descendant as long
2664 * as both @pos and @root are accessible and @pos is a descendant of @root. 3188 * as both @pos and @root are accessible and @pos is a descendant of @root.
3189 *
3190 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3191 * css which finished ->css_online() is guaranteed to be visible in the
3192 * future iterations and will stay visible until the last reference is put.
3193 * A css which hasn't finished ->css_online() or already finished
3194 * ->css_offline() may show up during traversal. It's each subsystem's
3195 * responsibility to synchronize against on/offlining.
2665 */ 3196 */
2666struct cgroup_subsys_state * 3197struct cgroup_subsys_state *
2667css_next_descendant_pre(struct cgroup_subsys_state *pos, 3198css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -2669,7 +3200,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2669{ 3200{
2670 struct cgroup_subsys_state *next; 3201 struct cgroup_subsys_state *next;
2671 3202
2672 cgroup_assert_mutexes_or_rcu_locked(); 3203 cgroup_assert_mutex_or_rcu_locked();
2673 3204
2674 /* if first iteration, visit @root */ 3205 /* if first iteration, visit @root */
2675 if (!pos) 3206 if (!pos)
@@ -2682,10 +3213,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2682 3213
2683 /* no child, visit my or the closest ancestor's next sibling */ 3214 /* no child, visit my or the closest ancestor's next sibling */
2684 while (pos != root) { 3215 while (pos != root) {
2685 next = css_next_child(pos, css_parent(pos)); 3216 next = css_next_child(pos, pos->parent);
2686 if (next) 3217 if (next)
2687 return next; 3218 return next;
2688 pos = css_parent(pos); 3219 pos = pos->parent;
2689 } 3220 }
2690 3221
2691 return NULL; 3222 return NULL;
@@ -2709,7 +3240,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
2709{ 3240{
2710 struct cgroup_subsys_state *last, *tmp; 3241 struct cgroup_subsys_state *last, *tmp;
2711 3242
2712 cgroup_assert_mutexes_or_rcu_locked(); 3243 cgroup_assert_mutex_or_rcu_locked();
2713 3244
2714 do { 3245 do {
2715 last = pos; 3246 last = pos;
@@ -2749,6 +3280,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
2749 * section. This function will return the correct next descendant as long 3280 * section. This function will return the correct next descendant as long
2750 * as both @pos and @cgroup are accessible and @pos is a descendant of 3281 * as both @pos and @cgroup are accessible and @pos is a descendant of
2751 * @cgroup. 3282 * @cgroup.
3283 *
3284 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3285 * css which finished ->css_online() is guaranteed to be visible in the
3286 * future iterations and will stay visible until the last reference is put.
3287 * A css which hasn't finished ->css_online() or already finished
3288 * ->css_offline() may show up during traversal. It's each subsystem's
3289 * responsibility to synchronize against on/offlining.
2752 */ 3290 */
2753struct cgroup_subsys_state * 3291struct cgroup_subsys_state *
2754css_next_descendant_post(struct cgroup_subsys_state *pos, 3292css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -2756,7 +3294,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2756{ 3294{
2757 struct cgroup_subsys_state *next; 3295 struct cgroup_subsys_state *next;
2758 3296
2759 cgroup_assert_mutexes_or_rcu_locked(); 3297 cgroup_assert_mutex_or_rcu_locked();
2760 3298
2761 /* if first iteration, visit leftmost descendant which may be @root */ 3299 /* if first iteration, visit leftmost descendant which may be @root */
2762 if (!pos) 3300 if (!pos)
@@ -2767,12 +3305,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2767 return NULL; 3305 return NULL;
2768 3306
2769 /* if there's an unvisited sibling, visit its leftmost descendant */ 3307 /* if there's an unvisited sibling, visit its leftmost descendant */
2770 next = css_next_child(pos, css_parent(pos)); 3308 next = css_next_child(pos, pos->parent);
2771 if (next) 3309 if (next)
2772 return css_leftmost_descendant(next); 3310 return css_leftmost_descendant(next);
2773 3311
2774 /* no sibling left, visit parent */ 3312 /* no sibling left, visit parent */
2775 return css_parent(pos); 3313 return pos->parent;
3314}
3315
3316/**
3317 * css_has_online_children - does a css have online children
3318 * @css: the target css
3319 *
3320 * Returns %true if @css has any online children; otherwise, %false. This
3321 * function can be called from any context but the caller is responsible
3322 * for synchronizing against on/offlining as necessary.
3323 */
3324bool css_has_online_children(struct cgroup_subsys_state *css)
3325{
3326 struct cgroup_subsys_state *child;
3327 bool ret = false;
3328
3329 rcu_read_lock();
3330 css_for_each_child(child, css) {
3331 if (child->flags & CSS_ONLINE) {
3332 ret = true;
3333 break;
3334 }
3335 }
3336 rcu_read_unlock();
3337 return ret;
2776} 3338}
2777 3339
2778/** 3340/**
@@ -2783,27 +3345,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2783 */ 3345 */
2784static void css_advance_task_iter(struct css_task_iter *it) 3346static void css_advance_task_iter(struct css_task_iter *it)
2785{ 3347{
2786 struct list_head *l = it->cset_link; 3348 struct list_head *l = it->cset_pos;
2787 struct cgrp_cset_link *link; 3349 struct cgrp_cset_link *link;
2788 struct css_set *cset; 3350 struct css_set *cset;
2789 3351
2790 /* Advance to the next non-empty css_set */ 3352 /* Advance to the next non-empty css_set */
2791 do { 3353 do {
2792 l = l->next; 3354 l = l->next;
2793 if (l == &it->origin_css->cgroup->cset_links) { 3355 if (l == it->cset_head) {
2794 it->cset_link = NULL; 3356 it->cset_pos = NULL;
2795 return; 3357 return;
2796 } 3358 }
2797 link = list_entry(l, struct cgrp_cset_link, cset_link); 3359
2798 cset = link->cset; 3360 if (it->ss) {
3361 cset = container_of(l, struct css_set,
3362 e_cset_node[it->ss->id]);
3363 } else {
3364 link = list_entry(l, struct cgrp_cset_link, cset_link);
3365 cset = link->cset;
3366 }
2799 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 3367 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2800 3368
2801 it->cset_link = l; 3369 it->cset_pos = l;
2802 3370
2803 if (!list_empty(&cset->tasks)) 3371 if (!list_empty(&cset->tasks))
2804 it->task = cset->tasks.next; 3372 it->task_pos = cset->tasks.next;
2805 else 3373 else
2806 it->task = cset->mg_tasks.next; 3374 it->task_pos = cset->mg_tasks.next;
3375
3376 it->tasks_head = &cset->tasks;
3377 it->mg_tasks_head = &cset->mg_tasks;
2807} 3378}
2808 3379
2809/** 3380/**
@@ -2829,8 +3400,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2829 3400
2830 down_read(&css_set_rwsem); 3401 down_read(&css_set_rwsem);
2831 3402
2832 it->origin_css = css; 3403 it->ss = css->ss;
2833 it->cset_link = &css->cgroup->cset_links; 3404
3405 if (it->ss)
3406 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3407 else
3408 it->cset_pos = &css->cgroup->cset_links;
3409
3410 it->cset_head = it->cset_pos;
2834 3411
2835 css_advance_task_iter(it); 3412 css_advance_task_iter(it);
2836} 3413}
@@ -2846,12 +3423,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2846struct task_struct *css_task_iter_next(struct css_task_iter *it) 3423struct task_struct *css_task_iter_next(struct css_task_iter *it)
2847{ 3424{
2848 struct task_struct *res; 3425 struct task_struct *res;
2849 struct list_head *l = it->task; 3426 struct list_head *l = it->task_pos;
2850 struct cgrp_cset_link *link = list_entry(it->cset_link,
2851 struct cgrp_cset_link, cset_link);
2852 3427
2853 /* If the iterator cg is NULL, we have no tasks */ 3428 /* If the iterator cg is NULL, we have no tasks */
2854 if (!it->cset_link) 3429 if (!it->cset_pos)
2855 return NULL; 3430 return NULL;
2856 res = list_entry(l, struct task_struct, cg_list); 3431 res = list_entry(l, struct task_struct, cg_list);
2857 3432
@@ -2862,13 +3437,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
2862 */ 3437 */
2863 l = l->next; 3438 l = l->next;
2864 3439
2865 if (l == &link->cset->tasks) 3440 if (l == it->tasks_head)
2866 l = link->cset->mg_tasks.next; 3441 l = it->mg_tasks_head->next;
2867 3442
2868 if (l == &link->cset->mg_tasks) 3443 if (l == it->mg_tasks_head)
2869 css_advance_task_iter(it); 3444 css_advance_task_iter(it);
2870 else 3445 else
2871 it->task = l; 3446 it->task_pos = l;
2872 3447
2873 return res; 3448 return res;
2874} 3449}
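The consumption pattern is unchanged even though the position bookkeeping now lives in the iterator itself; css_task_iter_end() is not shown in this hunk but still pairs with _start(). Roughly:

struct css_task_iter it;
struct task_struct *task;

css_task_iter_start(css, &it);
while ((task = css_task_iter_next(&it)))
	pr_info("pid %d\n", task_pid_nr(task));
css_task_iter_end(&it);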
@@ -2921,7 +3496,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
2921 * ->can_attach() fails. 3496 * ->can_attach() fails.
2922 */ 3497 */
2923 do { 3498 do {
2924 css_task_iter_start(&from->dummy_css, &it); 3499 css_task_iter_start(&from->self, &it);
2925 task = css_task_iter_next(&it); 3500 task = css_task_iter_next(&it);
2926 if (task) 3501 if (task)
2927 get_task_struct(task); 3502 get_task_struct(task);
@@ -3186,7 +3761,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3186 if (!array) 3761 if (!array)
3187 return -ENOMEM; 3762 return -ENOMEM;
3188 /* now, populate the array */ 3763 /* now, populate the array */
3189 css_task_iter_start(&cgrp->dummy_css, &it); 3764 css_task_iter_start(&cgrp->self, &it);
3190 while ((tsk = css_task_iter_next(&it))) { 3765 while ((tsk = css_task_iter_next(&it))) {
3191 if (unlikely(n == length)) 3766 if (unlikely(n == length))
3192 break; 3767 break;
@@ -3248,7 +3823,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3248 3823
3249 /* 3824 /*
3250 * We aren't being called from kernfs and there's no guarantee on 3825 * We aren't being called from kernfs and there's no guarantee on
3251 * @kn->priv's validity. For this and css_tryget_from_dir(), 3826 * @kn->priv's validity. For this and css_tryget_online_from_dir(),
3252 * @kn->priv is RCU safe. Let's do the RCU dancing. 3827 * @kn->priv is RCU safe. Let's do the RCU dancing.
3253 */ 3828 */
3254 rcu_read_lock(); 3829 rcu_read_lock();
@@ -3260,7 +3835,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3260 } 3835 }
3261 rcu_read_unlock(); 3836 rcu_read_unlock();
3262 3837
3263 css_task_iter_start(&cgrp->dummy_css, &it); 3838 css_task_iter_start(&cgrp->self, &it);
3264 while ((tsk = css_task_iter_next(&it))) { 3839 while ((tsk = css_task_iter_next(&it))) {
3265 switch (tsk->state) { 3840 switch (tsk->state) {
3266 case TASK_RUNNING: 3841 case TASK_RUNNING:
@@ -3390,17 +3965,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
3390 return seq_printf(s, "%d\n", *(int *)v); 3965 return seq_printf(s, "%d\n", *(int *)v);
3391} 3966}
3392 3967
3393/*
3394 * seq_operations functions for iterating on pidlists through seq_file -
3395 * independent of whether it's tasks or procs
3396 */
3397static const struct seq_operations cgroup_pidlist_seq_operations = {
3398 .start = cgroup_pidlist_start,
3399 .stop = cgroup_pidlist_stop,
3400 .next = cgroup_pidlist_next,
3401 .show = cgroup_pidlist_show,
3402};
3403
3404static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3968static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3405 struct cftype *cft) 3969 struct cftype *cft)
3406{ 3970{
@@ -3442,7 +4006,7 @@ static struct cftype cgroup_base_files[] = {
3442 .seq_stop = cgroup_pidlist_stop, 4006 .seq_stop = cgroup_pidlist_stop,
3443 .seq_show = cgroup_pidlist_show, 4007 .seq_show = cgroup_pidlist_show,
3444 .private = CGROUP_FILE_PROCS, 4008 .private = CGROUP_FILE_PROCS,
3445 .write_u64 = cgroup_procs_write, 4009 .write = cgroup_procs_write,
3446 .mode = S_IRUGO | S_IWUSR, 4010 .mode = S_IRUGO | S_IWUSR,
3447 }, 4011 },
3448 { 4012 {
@@ -3456,6 +4020,27 @@ static struct cftype cgroup_base_files[] = {
3456 .flags = CFTYPE_ONLY_ON_ROOT, 4020 .flags = CFTYPE_ONLY_ON_ROOT,
3457 .seq_show = cgroup_sane_behavior_show, 4021 .seq_show = cgroup_sane_behavior_show,
3458 }, 4022 },
4023 {
4024 .name = "cgroup.controllers",
4025 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
4026 .seq_show = cgroup_root_controllers_show,
4027 },
4028 {
4029 .name = "cgroup.controllers",
4030 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4031 .seq_show = cgroup_controllers_show,
4032 },
4033 {
4034 .name = "cgroup.subtree_control",
4035 .flags = CFTYPE_ONLY_ON_DFL,
4036 .seq_show = cgroup_subtree_control_show,
4037 .write = cgroup_subtree_control_write,
4038 },
4039 {
4040 .name = "cgroup.populated",
4041 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4042 .seq_show = cgroup_populated_show,
4043 },
3459 4044
3460 /* 4045 /*
3461 * Historical crazy stuff. These don't have "cgroup." prefix and 4046 * Historical crazy stuff. These don't have "cgroup." prefix and
@@ -3470,7 +4055,7 @@ static struct cftype cgroup_base_files[] = {
3470 .seq_stop = cgroup_pidlist_stop, 4055 .seq_stop = cgroup_pidlist_stop,
3471 .seq_show = cgroup_pidlist_show, 4056 .seq_show = cgroup_pidlist_show,
3472 .private = CGROUP_FILE_TASKS, 4057 .private = CGROUP_FILE_TASKS,
3473 .write_u64 = cgroup_tasks_write, 4058 .write = cgroup_tasks_write,
3474 .mode = S_IRUGO | S_IWUSR, 4059 .mode = S_IRUGO | S_IWUSR,
3475 }, 4060 },
3476 { 4061 {
@@ -3483,7 +4068,7 @@ static struct cftype cgroup_base_files[] = {
3483 .name = "release_agent", 4068 .name = "release_agent",
3484 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4069 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3485 .seq_show = cgroup_release_agent_show, 4070 .seq_show = cgroup_release_agent_show,
3486 .write_string = cgroup_release_agent_write, 4071 .write = cgroup_release_agent_write,
3487 .max_write_len = PATH_MAX - 1, 4072 .max_write_len = PATH_MAX - 1,
3488 }, 4073 },
3489 { } /* terminate */ 4074 { } /* terminate */
@@ -3496,7 +4081,7 @@ static struct cftype cgroup_base_files[] = {
3496 * 4081 *
3497 * On failure, no file is added. 4082 * On failure, no file is added.
3498 */ 4083 */
3499static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) 4084static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
3500{ 4085{
3501 struct cgroup_subsys *ss; 4086 struct cgroup_subsys *ss;
3502 int i, ret = 0; 4087 int i, ret = 0;
@@ -3505,7 +4090,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3505 for_each_subsys(ss, i) { 4090 for_each_subsys(ss, i) {
3506 struct cftype *cfts; 4091 struct cftype *cfts;
3507 4092
3508 if (!test_bit(i, &subsys_mask)) 4093 if (!(subsys_mask & (1 << i)))
3509 continue; 4094 continue;
3510 4095
3511 list_for_each_entry(cfts, &ss->cfts, node) { 4096 list_for_each_entry(cfts, &ss->cfts, node) {
@@ -3527,9 +4112,9 @@ err:
3527 * Implemented in kill_css(). 4112 * Implemented in kill_css().
3528 * 4113 *
3529 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs 4114 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3530 * and thus css_tryget() is guaranteed to fail, the css can be offlined 4115 * and thus css_tryget_online() is guaranteed to fail, the css can be
3531 * by invoking offline_css(). After offlining, the base ref is put. 4116 * offlined by invoking offline_css(). After offlining, the base ref is
3532 * Implemented in css_killed_work_fn(). 4117 * put. Implemented in css_killed_work_fn().
3533 * 4118 *
3534 * 3. When the percpu_ref reaches zero, the only possible remaining 4119 * 3. When the percpu_ref reaches zero, the only possible remaining
3535 * accessors are inside RCU read sections. css_release() schedules the 4120 * accessors are inside RCU read sections. css_release() schedules the
@@ -3548,11 +4133,37 @@ static void css_free_work_fn(struct work_struct *work)
3548 container_of(work, struct cgroup_subsys_state, destroy_work); 4133 container_of(work, struct cgroup_subsys_state, destroy_work);
3549 struct cgroup *cgrp = css->cgroup; 4134 struct cgroup *cgrp = css->cgroup;
3550 4135
3551 if (css->parent) 4136 if (css->ss) {
3552 css_put(css->parent); 4137 /* css free path */
4138 if (css->parent)
4139 css_put(css->parent);
3553 4140
3554 css->ss->css_free(css); 4141 css->ss->css_free(css);
3555 cgroup_put(cgrp); 4142 cgroup_put(cgrp);
4143 } else {
4144 /* cgroup free path */
4145 atomic_dec(&cgrp->root->nr_cgrps);
4146 cgroup_pidlist_destroy_all(cgrp);
4147
4148 if (cgroup_parent(cgrp)) {
4149 /*
4150 * We get a ref to the parent, and put the ref when
4151 * this cgroup is being freed, so it's guaranteed
4152 * that the parent won't be destroyed before its
4153 * children.
4154 */
4155 cgroup_put(cgroup_parent(cgrp));
4156 kernfs_put(cgrp->kn);
4157 kfree(cgrp);
4158 } else {
4159 /*
4160 * This is root cgroup's refcnt reaching zero,
4161 * which indicates that the root should be
4162 * released.
4163 */
4164 cgroup_destroy_root(cgrp->root);
4165 }
4166 }
3556} 4167}
3557 4168
3558static void css_free_rcu_fn(struct rcu_head *rcu_head) 4169static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -3564,26 +4175,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
3564 queue_work(cgroup_destroy_wq, &css->destroy_work); 4175 queue_work(cgroup_destroy_wq, &css->destroy_work);
3565} 4176}
3566 4177
4178static void css_release_work_fn(struct work_struct *work)
4179{
4180 struct cgroup_subsys_state *css =
4181 container_of(work, struct cgroup_subsys_state, destroy_work);
4182 struct cgroup_subsys *ss = css->ss;
4183 struct cgroup *cgrp = css->cgroup;
4184
4185 mutex_lock(&cgroup_mutex);
4186
4187 css->flags |= CSS_RELEASED;
4188 list_del_rcu(&css->sibling);
4189
4190 if (ss) {
4191 /* css release path */
4192 cgroup_idr_remove(&ss->css_idr, css->id);
4193 } else {
4194 /* cgroup release path */
4195 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4196 cgrp->id = -1;
4197 }
4198
4199 mutex_unlock(&cgroup_mutex);
4200
4201 call_rcu(&css->rcu_head, css_free_rcu_fn);
4202}
4203
3567static void css_release(struct percpu_ref *ref) 4204static void css_release(struct percpu_ref *ref)
3568{ 4205{
3569 struct cgroup_subsys_state *css = 4206 struct cgroup_subsys_state *css =
3570 container_of(ref, struct cgroup_subsys_state, refcnt); 4207 container_of(ref, struct cgroup_subsys_state, refcnt);
3571 4208
3572 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); 4209 INIT_WORK(&css->destroy_work, css_release_work_fn);
3573 call_rcu(&css->rcu_head, css_free_rcu_fn); 4210 queue_work(cgroup_destroy_wq, &css->destroy_work);
3574} 4211}
3575 4212
3576static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, 4213static void init_and_link_css(struct cgroup_subsys_state *css,
3577 struct cgroup *cgrp) 4214 struct cgroup_subsys *ss, struct cgroup *cgrp)
3578{ 4215{
4216 lockdep_assert_held(&cgroup_mutex);
4217
4218 cgroup_get(cgrp);
4219
4220 memset(css, 0, sizeof(*css));
3579 css->cgroup = cgrp; 4221 css->cgroup = cgrp;
3580 css->ss = ss; 4222 css->ss = ss;
3581 css->flags = 0; 4223 INIT_LIST_HEAD(&css->sibling);
4224 INIT_LIST_HEAD(&css->children);
4225 css->serial_nr = css_serial_nr_next++;
3582 4226
3583 if (cgrp->parent) 4227 if (cgroup_parent(cgrp)) {
3584 css->parent = cgroup_css(cgrp->parent, ss); 4228 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
3585 else 4229 css_get(css->parent);
3586 css->flags |= CSS_ROOT; 4230 }
3587 4231
3588 BUG_ON(cgroup_css(cgrp, ss)); 4232 BUG_ON(cgroup_css(cgrp, ss));
3589} 4233}
@@ -3594,14 +4238,12 @@ static int online_css(struct cgroup_subsys_state *css)
3594 struct cgroup_subsys *ss = css->ss; 4238 struct cgroup_subsys *ss = css->ss;
3595 int ret = 0; 4239 int ret = 0;
3596 4240
3597 lockdep_assert_held(&cgroup_tree_mutex);
3598 lockdep_assert_held(&cgroup_mutex); 4241 lockdep_assert_held(&cgroup_mutex);
3599 4242
3600 if (ss->css_online) 4243 if (ss->css_online)
3601 ret = ss->css_online(css); 4244 ret = ss->css_online(css);
3602 if (!ret) { 4245 if (!ret) {
3603 css->flags |= CSS_ONLINE; 4246 css->flags |= CSS_ONLINE;
3604 css->cgroup->nr_css++;
3605 rcu_assign_pointer(css->cgroup->subsys[ss->id], css); 4247 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
3606 } 4248 }
3607 return ret; 4249 return ret;
@@ -3612,7 +4254,6 @@ static void offline_css(struct cgroup_subsys_state *css)
3612{ 4254{
3613 struct cgroup_subsys *ss = css->ss; 4255 struct cgroup_subsys *ss = css->ss;
3614 4256
3615 lockdep_assert_held(&cgroup_tree_mutex);
3616 lockdep_assert_held(&cgroup_mutex); 4257 lockdep_assert_held(&cgroup_mutex);
3617 4258
3618 if (!(css->flags & CSS_ONLINE)) 4259 if (!(css->flags & CSS_ONLINE))
@@ -3622,8 +4263,9 @@ static void offline_css(struct cgroup_subsys_state *css)
3622 ss->css_offline(css); 4263 ss->css_offline(css);
3623 4264
3624 css->flags &= ~CSS_ONLINE; 4265 css->flags &= ~CSS_ONLINE;
3625 css->cgroup->nr_css--; 4266 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
3626 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); 4267
4268 wake_up_all(&css->cgroup->offline_waitq);
3627} 4269}
3628 4270
3629/** 4271/**
@@ -3637,111 +4279,102 @@ static void offline_css(struct cgroup_subsys_state *css)
3637 */ 4279 */
3638static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4280static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
3639{ 4281{
3640 struct cgroup *parent = cgrp->parent; 4282 struct cgroup *parent = cgroup_parent(cgrp);
4283 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
3641 struct cgroup_subsys_state *css; 4284 struct cgroup_subsys_state *css;
3642 int err; 4285 int err;
3643 4286
3644 lockdep_assert_held(&cgroup_mutex); 4287 lockdep_assert_held(&cgroup_mutex);
3645 4288
3646 css = ss->css_alloc(cgroup_css(parent, ss)); 4289 css = ss->css_alloc(parent_css);
3647 if (IS_ERR(css)) 4290 if (IS_ERR(css))
3648 return PTR_ERR(css); 4291 return PTR_ERR(css);
3649 4292
4293 init_and_link_css(css, ss, cgrp);
4294
3650 err = percpu_ref_init(&css->refcnt, css_release); 4295 err = percpu_ref_init(&css->refcnt, css_release);
3651 if (err) 4296 if (err)
3652 goto err_free_css; 4297 goto err_free_css;
3653 4298
3654 init_css(css, ss, cgrp); 4299 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4300 if (err < 0)
4301 goto err_free_percpu_ref;
4302 css->id = err;
3655 4303
3656 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4304 err = cgroup_populate_dir(cgrp, 1 << ss->id);
3657 if (err) 4305 if (err)
3658 goto err_free_percpu_ref; 4306 goto err_free_id;
4307
4308 /* @css is ready to be brought online now, make it visible */
4309 list_add_tail_rcu(&css->sibling, &parent_css->children);
4310 cgroup_idr_replace(&ss->css_idr, css, css->id);
3659 4311
3660 err = online_css(css); 4312 err = online_css(css);
3661 if (err) 4313 if (err)
3662 goto err_clear_dir; 4314 goto err_list_del;
3663
3664 cgroup_get(cgrp);
3665 css_get(css->parent);
3666
3667 cgrp->subsys_mask |= 1 << ss->id;
3668 4315
3669 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4316 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
3670 parent->parent) { 4317 cgroup_parent(parent)) {
3671 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4318 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
3672 current->comm, current->pid, ss->name); 4319 current->comm, current->pid, ss->name);
3673 if (!strcmp(ss->name, "memory")) 4320 if (!strcmp(ss->name, "memory"))
3674 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 4321 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
3675 ss->warned_broken_hierarchy = true; 4322 ss->warned_broken_hierarchy = true;
3676 } 4323 }
3677 4324
3678 return 0; 4325 return 0;
3679 4326
3680err_clear_dir: 4327err_list_del:
4328 list_del_rcu(&css->sibling);
3681 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 4329 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4330err_free_id:
4331 cgroup_idr_remove(&ss->css_idr, css->id);
3682err_free_percpu_ref: 4332err_free_percpu_ref:
3683 percpu_ref_cancel_init(&css->refcnt); 4333 percpu_ref_cancel_init(&css->refcnt);
3684err_free_css: 4334err_free_css:
3685 ss->css_free(css); 4335 call_rcu(&css->rcu_head, css_free_rcu_fn);
3686 return err; 4336 return err;
3687} 4337}
3688 4338
3689/** 4339static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3690 * cgroup_create - create a cgroup 4340 umode_t mode)
3691 * @parent: cgroup that will be parent of the new cgroup
3692 * @name: name of the new cgroup
3693 * @mode: mode to set on new cgroup
3694 */
3695static long cgroup_create(struct cgroup *parent, const char *name,
3696 umode_t mode)
3697{ 4341{
3698 struct cgroup *cgrp; 4342 struct cgroup *parent, *cgrp;
3699 struct cgroup_root *root = parent->root; 4343 struct cgroup_root *root;
3700 int ssid, err;
3701 struct cgroup_subsys *ss; 4344 struct cgroup_subsys *ss;
3702 struct kernfs_node *kn; 4345 struct kernfs_node *kn;
4346 int ssid, ret;
3703 4347
3704 /* 4348 parent = cgroup_kn_lock_live(parent_kn);
3705 * XXX: The default hierarchy isn't fully implemented yet. Block 4349 if (!parent)
3706 * !root cgroup creation on it for now. 4350 return -ENODEV;
3707 */ 4351 root = parent->root;
3708 if (root == &cgrp_dfl_root)
3709 return -EINVAL;
3710 4352
3711 /* allocate the cgroup and its ID, 0 is reserved for the root */ 4353 /* allocate the cgroup and its ID, 0 is reserved for the root */
3712 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4354 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3713 if (!cgrp) 4355 if (!cgrp) {
3714 return -ENOMEM; 4356 ret = -ENOMEM;
3715 4357 goto out_unlock;
3716 mutex_lock(&cgroup_tree_mutex);
3717
3718 /*
3719 * Only live parents can have children. Note that the liveliness
3720 * check isn't strictly necessary because cgroup_mkdir() and
3721 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
3722 * anyway so that locking is contained inside cgroup proper and we
3723 * don't get nasty surprises if we ever grow another caller.
3724 */
3725 if (!cgroup_lock_live_group(parent)) {
3726 err = -ENODEV;
3727 goto err_unlock_tree;
3728 } 4358 }
3729 4359
4360 ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
4361 if (ret)
4362 goto out_free_cgrp;
4363
3730 /* 4364 /*
3731 * Temporarily set the pointer to NULL, so idr_find() won't return 4365 * Temporarily set the pointer to NULL, so idr_find() won't return
3732 * a half-baked cgroup. 4366 * a half-baked cgroup.
3733 */ 4367 */
3734 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); 4368 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
3735 if (cgrp->id < 0) { 4369 if (cgrp->id < 0) {
3736 err = -ENOMEM; 4370 ret = -ENOMEM;
3737 goto err_unlock; 4371 goto out_cancel_ref;
3738 } 4372 }
3739 4373
3740 init_cgroup_housekeeping(cgrp); 4374 init_cgroup_housekeeping(cgrp);
3741 4375
3742 cgrp->parent = parent; 4376 cgrp->self.parent = &parent->self;
3743 cgrp->dummy_css.parent = &parent->dummy_css; 4377 cgrp->root = root;
3744 cgrp->root = parent->root;
3745 4378
3746 if (notify_on_release(parent)) 4379 if (notify_on_release(parent))
3747 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4380 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -3752,8 +4385,8 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3752 /* create the directory */ 4385 /* create the directory */
3753 kn = kernfs_create_dir(parent->kn, name, mode, cgrp); 4386 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3754 if (IS_ERR(kn)) { 4387 if (IS_ERR(kn)) {
3755 err = PTR_ERR(kn); 4388 ret = PTR_ERR(kn);
3756 goto err_free_id; 4389 goto out_free_id;
3757 } 4390 }
3758 cgrp->kn = kn; 4391 cgrp->kn = kn;
3759 4392
@@ -3763,10 +4396,10 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3763 */ 4396 */
3764 kernfs_get(kn); 4397 kernfs_get(kn);
3765 4398
3766 cgrp->serial_nr = cgroup_serial_nr_next++; 4399 cgrp->self.serial_nr = css_serial_nr_next++;
3767 4400
3768 /* allocation complete, commit to creation */ 4401 /* allocation complete, commit to creation */
3769 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4402 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
3770 atomic_inc(&root->nr_cgrps); 4403 atomic_inc(&root->nr_cgrps);
3771 cgroup_get(parent); 4404 cgroup_get(parent);
3772 4405
@@ -3774,107 +4407,66 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3774 * @cgrp is now fully operational. If something fails after this 4407 * @cgrp is now fully operational. If something fails after this
3775 * point, it'll be released via the normal destruction path. 4408 * point, it'll be released via the normal destruction path.
3776 */ 4409 */
3777 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4410 cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
3778 4411
3779 err = cgroup_kn_set_ugid(kn); 4412 ret = cgroup_kn_set_ugid(kn);
3780 if (err) 4413 if (ret)
3781 goto err_destroy; 4414 goto out_destroy;
3782 4415
3783 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4416 ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
3784 if (err) 4417 if (ret)
3785 goto err_destroy; 4418 goto out_destroy;
3786 4419
3787 /* let's create and online css's */ 4420 /* let's create and online css's */
3788 for_each_subsys(ss, ssid) { 4421 for_each_subsys(ss, ssid) {
3789 if (root->cgrp.subsys_mask & (1 << ssid)) { 4422 if (parent->child_subsys_mask & (1 << ssid)) {
3790 err = create_css(cgrp, ss); 4423 ret = create_css(cgrp, ss);
3791 if (err) 4424 if (ret)
3792 goto err_destroy; 4425 goto out_destroy;
3793 } 4426 }
3794 } 4427 }
3795 4428
3796 kernfs_activate(kn); 4429 /*
4430 * On the default hierarchy, a child doesn't automatically inherit
4431 * child_subsys_mask from the parent. Each is configured manually.
4432 */
4433 if (!cgroup_on_dfl(cgrp))
4434 cgrp->child_subsys_mask = parent->child_subsys_mask;
3797 4435
3798 mutex_unlock(&cgroup_mutex); 4436 kernfs_activate(kn);
3799 mutex_unlock(&cgroup_tree_mutex);
3800 4437
3801 return 0; 4438 ret = 0;
4439 goto out_unlock;
3802 4440
3803err_free_id: 4441out_free_id:
3804 idr_remove(&root->cgroup_idr, cgrp->id); 4442 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
3805err_unlock: 4443out_cancel_ref:
3806 mutex_unlock(&cgroup_mutex); 4444 percpu_ref_cancel_init(&cgrp->self.refcnt);
3807err_unlock_tree: 4445out_free_cgrp:
3808 mutex_unlock(&cgroup_tree_mutex);
3809 kfree(cgrp); 4446 kfree(cgrp);
3810 return err; 4447out_unlock:
4448 cgroup_kn_unlock(parent_kn);
4449 return ret;
3811 4450
3812err_destroy: 4451out_destroy:
3813 cgroup_destroy_locked(cgrp); 4452 cgroup_destroy_locked(cgrp);
3814 mutex_unlock(&cgroup_mutex); 4453 goto out_unlock;
3815 mutex_unlock(&cgroup_tree_mutex);
3816 return err;
3817}
3818
3819static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3820 umode_t mode)
3821{
3822 struct cgroup *parent = parent_kn->priv;
3823 int ret;
3824
3825 /*
3826 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3827 * kernfs active_ref and cgroup_create() already synchronizes
3828 * properly against removal through cgroup_lock_live_group().
3829 * Break it before calling cgroup_create().
3830 */
3831 cgroup_get(parent);
3832 kernfs_break_active_protection(parent_kn);
3833
3834 ret = cgroup_create(parent, name, mode);
3835
3836 kernfs_unbreak_active_protection(parent_kn);
3837 cgroup_put(parent);
3838 return ret;
3839} 4454}
3840 4455
3841/* 4456/*
3842 * This is called when the refcnt of a css is confirmed to be killed. 4457 * This is called when the refcnt of a css is confirmed to be killed.
3843 * css_tryget() is now guaranteed to fail. 4458 * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
 4459 * initiate destruction and put the css ref from kill_css().
3844 */ 4460 */
3845static void css_killed_work_fn(struct work_struct *work) 4461static void css_killed_work_fn(struct work_struct *work)
3846{ 4462{
3847 struct cgroup_subsys_state *css = 4463 struct cgroup_subsys_state *css =
3848 container_of(work, struct cgroup_subsys_state, destroy_work); 4464 container_of(work, struct cgroup_subsys_state, destroy_work);
3849 struct cgroup *cgrp = css->cgroup;
3850 4465
3851 mutex_lock(&cgroup_tree_mutex);
3852 mutex_lock(&cgroup_mutex); 4466 mutex_lock(&cgroup_mutex);
3853
3854 /*
3855 * css_tryget() is guaranteed to fail now. Tell subsystems to
3856 * initate destruction.
3857 */
3858 offline_css(css); 4467 offline_css(css);
3859
3860 /*
3861 * If @cgrp is marked dead, it's waiting for refs of all css's to
3862 * be disabled before proceeding to the second phase of cgroup
3863 * destruction. If we are the last one, kick it off.
3864 */
3865 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
3866 cgroup_destroy_css_killed(cgrp);
3867
3868 mutex_unlock(&cgroup_mutex); 4468 mutex_unlock(&cgroup_mutex);
3869 mutex_unlock(&cgroup_tree_mutex);
3870 4469
3871 /*
3872 * Put the css refs from kill_css(). Each css holds an extra
3873 * reference to the cgroup's dentry and cgroup removal proceeds
3874 * regardless of css refs. On the last put of each css, whenever
3875 * that may be, the extra dentry ref is put so that dentry
3876 * destruction happens only after all css's are released.
3877 */
3878 css_put(css); 4470 css_put(css);
3879} 4471}
3880 4472
@@ -3888,9 +4480,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
3888 queue_work(cgroup_destroy_wq, &css->destroy_work); 4480 queue_work(cgroup_destroy_wq, &css->destroy_work);
3889} 4481}
3890 4482
3891static void __kill_css(struct cgroup_subsys_state *css) 4483/**
4484 * kill_css - destroy a css
4485 * @css: css to destroy
4486 *
4487 * This function initiates destruction of @css by removing cgroup interface
4488 * files and putting its base reference. ->css_offline() will be invoked
4489 * asynchronously once css_tryget_online() is guaranteed to fail and when
4490 * the reference count reaches zero, @css will be released.
4491 */
4492static void kill_css(struct cgroup_subsys_state *css)
3892{ 4493{
3893 lockdep_assert_held(&cgroup_tree_mutex); 4494 lockdep_assert_held(&cgroup_mutex);
3894 4495
3895 /* 4496 /*
3896 * This must happen before css is disassociated with its cgroup. 4497 * This must happen before css is disassociated with its cgroup.
@@ -3907,7 +4508,7 @@ static void __kill_css(struct cgroup_subsys_state *css)
3907 /* 4508 /*
3908 * cgroup core guarantees that, by the time ->css_offline() is 4509 * cgroup core guarantees that, by the time ->css_offline() is
3909 * invoked, no new css reference will be given out via 4510 * invoked, no new css reference will be given out via
3910 * css_tryget(). We can't simply call percpu_ref_kill() and 4511 * css_tryget_online(). We can't simply call percpu_ref_kill() and
3911 * proceed to offlining css's because percpu_ref_kill() doesn't 4512 * proceed to offlining css's because percpu_ref_kill() doesn't
3912 * guarantee that the ref is seen as killed on all CPUs on return. 4513 * guarantee that the ref is seen as killed on all CPUs on return.
3913 * 4514 *
@@ -3918,36 +4519,14 @@ static void __kill_css(struct cgroup_subsys_state *css)
3918} 4519}
3919 4520
3920/** 4521/**
3921 * kill_css - destroy a css
3922 * @css: css to destroy
3923 *
3924 * This function initiates destruction of @css by removing cgroup interface
3925 * files and putting its base reference. ->css_offline() will be invoked
3926 * asynchronously once css_tryget() is guaranteed to fail and when the
3927 * reference count reaches zero, @css will be released.
3928 */
3929static void kill_css(struct cgroup_subsys_state *css)
3930{
3931 struct cgroup *cgrp = css->cgroup;
3932
3933 lockdep_assert_held(&cgroup_tree_mutex);
3934
3935 /* if already killed, noop */
3936 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3937 cgrp->subsys_mask &= ~(1 << css->ss->id);
3938 __kill_css(css);
3939 }
3940}
3941
3942/**
3943 * cgroup_destroy_locked - the first stage of cgroup destruction 4522 * cgroup_destroy_locked - the first stage of cgroup destruction
3944 * @cgrp: cgroup to be destroyed 4523 * @cgrp: cgroup to be destroyed
3945 * 4524 *
3946 * css's make use of percpu refcnts whose killing latency shouldn't be 4525 * css's make use of percpu refcnts whose killing latency shouldn't be
3947 * exposed to userland and are RCU protected. Also, cgroup core needs to 4526 * exposed to userland and are RCU protected. Also, cgroup core needs to
3948 * guarantee that css_tryget() won't succeed by the time ->css_offline() is 4527 * guarantee that css_tryget_online() won't succeed by the time
3949 * invoked. To satisfy all the requirements, destruction is implemented in 4528 * ->css_offline() is invoked. To satisfy all the requirements,
3950 * the following two steps. 4529 * destruction is implemented in the following two steps.
3951 * 4530 *
3952 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all 4531 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
3953 * userland visible parts and start killing the percpu refcnts of 4532 * userland visible parts and start killing the percpu refcnts of
@@ -3966,12 +4545,10 @@ static void kill_css(struct cgroup_subsys_state *css)
3966static int cgroup_destroy_locked(struct cgroup *cgrp) 4545static int cgroup_destroy_locked(struct cgroup *cgrp)
3967 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4546 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
3968{ 4547{
3969 struct cgroup *child;
3970 struct cgroup_subsys_state *css; 4548 struct cgroup_subsys_state *css;
3971 bool empty; 4549 bool empty;
3972 int ssid; 4550 int ssid;
3973 4551
3974 lockdep_assert_held(&cgroup_tree_mutex);
3975 lockdep_assert_held(&cgroup_mutex); 4552 lockdep_assert_held(&cgroup_mutex);
3976 4553
3977 /* 4554 /*
@@ -3985,127 +4562,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
3985 return -EBUSY; 4562 return -EBUSY;
3986 4563
3987 /* 4564 /*
3988 * Make sure there are no live children. We can't test ->children 4565 * Make sure there are no live children. We can't test emptiness of
3989 * emptiness as dead children linger on it while being destroyed; 4566 * ->self.children as dead children linger on it while being
3990 * otherwise, "rmdir parent/child parent" may fail with -EBUSY. 4567 * drained; otherwise, "rmdir parent/child parent" may fail.
3991 */ 4568 */
3992 empty = true; 4569 if (css_has_online_children(&cgrp->self))
3993 rcu_read_lock();
3994 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
3995 empty = cgroup_is_dead(child);
3996 if (!empty)
3997 break;
3998 }
3999 rcu_read_unlock();
4000 if (!empty)
4001 return -EBUSY; 4570 return -EBUSY;
4002 4571
4003 /* 4572 /*
4004 * Mark @cgrp dead. This prevents further task migration and child 4573 * Mark @cgrp dead. This prevents further task migration and child
4005 * creation by disabling cgroup_lock_live_group(). Note that 4574 * creation by disabling cgroup_lock_live_group().
4006 * CGRP_DEAD assertion is depended upon by css_next_child() to
4007 * resume iteration after dropping RCU read lock. See
4008 * css_next_child() for details.
4009 */ 4575 */
4010 set_bit(CGRP_DEAD, &cgrp->flags); 4576 cgrp->self.flags &= ~CSS_ONLINE;
4011 4577
4012 /* 4578 /* initiate massacre of all css's */
4013 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4014 * will be invoked to perform the rest of destruction once the
4015 * percpu refs of all css's are confirmed to be killed. This
4016 * involves removing the subsystem's files, drop cgroup_mutex.
4017 */
4018 mutex_unlock(&cgroup_mutex);
4019 for_each_css(css, ssid, cgrp) 4579 for_each_css(css, ssid, cgrp)
4020 kill_css(css); 4580 kill_css(css);
4021 mutex_lock(&cgroup_mutex);
4022 4581
4023 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 4582 /* CSS_ONLINE is clear, remove from ->release_list for the last time */
4024 raw_spin_lock(&release_list_lock); 4583 raw_spin_lock(&release_list_lock);
4025 if (!list_empty(&cgrp->release_list)) 4584 if (!list_empty(&cgrp->release_list))
4026 list_del_init(&cgrp->release_list); 4585 list_del_init(&cgrp->release_list);
4027 raw_spin_unlock(&release_list_lock); 4586 raw_spin_unlock(&release_list_lock);
4028 4587
4029 /* 4588 /*
4030 * If @cgrp has css's attached, the second stage of cgroup 4589 * Remove @cgrp directory along with the base files. @cgrp has an
4031 * destruction is kicked off from css_killed_work_fn() after the 4590 * extra ref on its kn.
4032 * refs of all attached css's are killed. If @cgrp doesn't have
4033 * any css, we kick it off here.
4034 */ 4591 */
4035 if (!cgrp->nr_css) 4592 kernfs_remove(cgrp->kn);
4036 cgroup_destroy_css_killed(cgrp);
4037
4038 /* remove @cgrp directory along with the base files */
4039 mutex_unlock(&cgroup_mutex);
4040 4593
4041 /* 4594 set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
4042 * There are two control paths which try to determine cgroup from 4595 check_for_release(cgroup_parent(cgrp));
4043 * dentry without going through kernfs - cgroupstats_build() and
4044 * css_tryget_from_dir(). Those are supported by RCU protecting
4045 * clearing of cgrp->kn->priv backpointer, which should happen
4046 * after all files under it have been removed.
4047 */
4048 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4049 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4050 4596
4051 mutex_lock(&cgroup_mutex); 4597 /* put the base reference */
4598 percpu_ref_kill(&cgrp->self.refcnt);
4052 4599
4053 return 0; 4600 return 0;
4054}; 4601};
4055 4602
4056/**
4057 * cgroup_destroy_css_killed - the second step of cgroup destruction
4058 * @work: cgroup->destroy_free_work
4059 *
4060 * This function is invoked from a work item for a cgroup which is being
4061 * destroyed after all css's are offlined and performs the rest of
4062 * destruction. This is the second step of destruction described in the
4063 * comment above cgroup_destroy_locked().
4064 */
4065static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4066{
4067 struct cgroup *parent = cgrp->parent;
4068
4069 lockdep_assert_held(&cgroup_tree_mutex);
4070 lockdep_assert_held(&cgroup_mutex);
4071
4072 /* delete this cgroup from parent->children */
4073 list_del_rcu(&cgrp->sibling);
4074
4075 cgroup_put(cgrp);
4076
4077 set_bit(CGRP_RELEASABLE, &parent->flags);
4078 check_for_release(parent);
4079}
4080
4081static int cgroup_rmdir(struct kernfs_node *kn) 4603static int cgroup_rmdir(struct kernfs_node *kn)
4082{ 4604{
4083 struct cgroup *cgrp = kn->priv; 4605 struct cgroup *cgrp;
4084 int ret = 0; 4606 int ret = 0;
4085 4607
4086 /* 4608 cgrp = cgroup_kn_lock_live(kn);
4087 * This is self-destruction but @kn can't be removed while this 4609 if (!cgrp)
4088 * callback is in progress. Let's break active protection. Once 4610 return 0;
4089 * the protection is broken, @cgrp can be destroyed at any point. 4611 cgroup_get(cgrp); /* for @kn->priv clearing */
4090 * Pin it so that it stays accessible.
4091 */
4092 cgroup_get(cgrp);
4093 kernfs_break_active_protection(kn);
4094 4612
4095 mutex_lock(&cgroup_tree_mutex); 4613 ret = cgroup_destroy_locked(cgrp);
4096 mutex_lock(&cgroup_mutex); 4614
4615 cgroup_kn_unlock(kn);
4097 4616
4098 /* 4617 /*
4099 * @cgrp might already have been destroyed while we're trying to 4618 * There are two control paths which try to determine cgroup from
4100 * grab the mutexes. 4619 * dentry without going through kernfs - cgroupstats_build() and
4620 * css_tryget_online_from_dir(). Those are supported by RCU
4621 * protecting clearing of cgrp->kn->priv backpointer, which should
4622 * happen after all files under it have been removed.
4101 */ 4623 */
4102 if (!cgroup_is_dead(cgrp)) 4624 if (!ret)
4103 ret = cgroup_destroy_locked(cgrp); 4625 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4104
4105 mutex_unlock(&cgroup_mutex);
4106 mutex_unlock(&cgroup_tree_mutex);
4107 4626
4108 kernfs_unbreak_active_protection(kn);
4109 cgroup_put(cgrp); 4627 cgroup_put(cgrp);
4110 return ret; 4628 return ret;
4111} 4629}
@@ -4118,15 +4636,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4118 .rename = cgroup_rename, 4636 .rename = cgroup_rename,
4119}; 4637};
4120 4638
4121static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4639static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4122{ 4640{
4123 struct cgroup_subsys_state *css; 4641 struct cgroup_subsys_state *css;
4124 4642
4125 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4643 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4126 4644
4127 mutex_lock(&cgroup_tree_mutex);
4128 mutex_lock(&cgroup_mutex); 4645 mutex_lock(&cgroup_mutex);
4129 4646
4647 idr_init(&ss->css_idr);
4130 INIT_LIST_HEAD(&ss->cfts); 4648 INIT_LIST_HEAD(&ss->cfts);
4131 4649
4132 /* Create the root cgroup state for this subsystem */ 4650 /* Create the root cgroup state for this subsystem */
@@ -4134,7 +4652,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4134 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); 4652 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4135 /* We don't handle early failures gracefully */ 4653 /* We don't handle early failures gracefully */
4136 BUG_ON(IS_ERR(css)); 4654 BUG_ON(IS_ERR(css));
4137 init_css(css, ss, &cgrp_dfl_root.cgrp); 4655 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4656
4657 /*
4658 * Root csses are never destroyed and we can't initialize
4659 * percpu_ref during early init. Disable refcnting.
4660 */
4661 css->flags |= CSS_NO_REF;
4662
4663 if (early) {
4664 /* allocation can't be done safely during early init */
4665 css->id = 1;
4666 } else {
4667 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4668 BUG_ON(css->id < 0);
4669 }
4138 4670
4139 /* Update the init_css_set to contain a subsys 4671 /* Update the init_css_set to contain a subsys
4140 * pointer to this state - since the subsystem is 4672 * pointer to this state - since the subsystem is
@@ -4151,10 +4683,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4151 4683
4152 BUG_ON(online_css(css)); 4684 BUG_ON(online_css(css));
4153 4685
4154 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4155
4156 mutex_unlock(&cgroup_mutex); 4686 mutex_unlock(&cgroup_mutex);
4157 mutex_unlock(&cgroup_tree_mutex);
4158} 4687}
4159 4688
4160/** 4689/**
@@ -4171,6 +4700,8 @@ int __init cgroup_init_early(void)
4171 int i; 4700 int i;
4172 4701
4173 init_cgroup_root(&cgrp_dfl_root, &opts); 4702 init_cgroup_root(&cgrp_dfl_root, &opts);
4703 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
4704
4174 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4705 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4175 4706
4176 for_each_subsys(ss, i) { 4707 for_each_subsys(ss, i) {
@@ -4185,7 +4716,7 @@ int __init cgroup_init_early(void)
4185 ss->name = cgroup_subsys_name[i]; 4716 ss->name = cgroup_subsys_name[i];
4186 4717
4187 if (ss->early_init) 4718 if (ss->early_init)
4188 cgroup_init_subsys(ss); 4719 cgroup_init_subsys(ss, true);
4189 } 4720 }
4190 return 0; 4721 return 0;
4191} 4722}
@@ -4204,7 +4735,6 @@ int __init cgroup_init(void)
4204 4735
4205 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4736 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4206 4737
4207 mutex_lock(&cgroup_tree_mutex);
4208 mutex_lock(&cgroup_mutex); 4738 mutex_lock(&cgroup_mutex);
4209 4739
4210 /* Add init_css_set to the hash table */ 4740 /* Add init_css_set to the hash table */
@@ -4214,18 +4744,31 @@ int __init cgroup_init(void)
4214 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4744 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4215 4745
4216 mutex_unlock(&cgroup_mutex); 4746 mutex_unlock(&cgroup_mutex);
4217 mutex_unlock(&cgroup_tree_mutex);
4218 4747
4219 for_each_subsys(ss, ssid) { 4748 for_each_subsys(ss, ssid) {
4220 if (!ss->early_init) 4749 if (ss->early_init) {
4221 cgroup_init_subsys(ss); 4750 struct cgroup_subsys_state *css =
4751 init_css_set.subsys[ss->id];
4752
4753 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4754 GFP_KERNEL);
4755 BUG_ON(css->id < 0);
4756 } else {
4757 cgroup_init_subsys(ss, false);
4758 }
4759
4760 list_add_tail(&init_css_set.e_cset_node[ssid],
4761 &cgrp_dfl_root.cgrp.e_csets[ssid]);
4222 4762
4223 /* 4763 /*
4224 * cftype registration needs kmalloc and can't be done 4764 * Setting dfl_root subsys_mask needs to consider the
4225 * during early_init. Register base cftypes separately. 4765 * disabled flag and cftype registration needs kmalloc,
4766 * both of which aren't available during early_init.
4226 */ 4767 */
4227 if (ss->base_cftypes) 4768 if (!ss->disabled) {
4769 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4228 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4770 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4771 }
4229 } 4772 }
4230 4773
4231 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4774 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4308,7 +4851,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4308 4851
4309 seq_printf(m, "%d:", root->hierarchy_id); 4852 seq_printf(m, "%d:", root->hierarchy_id);
4310 for_each_subsys(ss, ssid) 4853 for_each_subsys(ss, ssid)
4311 if (root->cgrp.subsys_mask & (1 << ssid)) 4854 if (root->subsys_mask & (1 << ssid))
4312 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4855 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4313 if (strlen(root->name)) 4856 if (strlen(root->name))
4314 seq_printf(m, "%sname=%s", count ? "," : "", 4857 seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4503,8 +5046,8 @@ void cgroup_exit(struct task_struct *tsk)
4503 5046
4504static void check_for_release(struct cgroup *cgrp) 5047static void check_for_release(struct cgroup *cgrp)
4505{ 5048{
4506 if (cgroup_is_releasable(cgrp) && 5049 if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
4507 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { 5050 !css_has_online_children(&cgrp->self)) {
4508 /* 5051 /*
4509 * Control Group is currently removable. If it's not 5052 * Control Group is currently removable. If it's not
4510 * already queued for a userspace notification, queue 5053 * already queued for a userspace notification, queue
@@ -4621,7 +5164,7 @@ static int __init cgroup_disable(char *str)
4621__setup("cgroup_disable=", cgroup_disable); 5164__setup("cgroup_disable=", cgroup_disable);
4622 5165
4623/** 5166/**
4624 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir 5167 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
4625 * @dentry: directory dentry of interest 5168 * @dentry: directory dentry of interest
4626 * @ss: subsystem of interest 5169 * @ss: subsystem of interest
4627 * 5170 *
@@ -4629,8 +5172,8 @@ __setup("cgroup_disable=", cgroup_disable);
4629 * to get the corresponding css and return it. If such css doesn't exist 5172 * to get the corresponding css and return it. If such css doesn't exist
4630 * or can't be pinned, an ERR_PTR value is returned. 5173 * or can't be pinned, an ERR_PTR value is returned.
4631 */ 5174 */
4632struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 5175struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
4633 struct cgroup_subsys *ss) 5176 struct cgroup_subsys *ss)
4634{ 5177{
4635 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 5178 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4636 struct cgroup_subsys_state *css = NULL; 5179 struct cgroup_subsys_state *css = NULL;
@@ -4646,13 +5189,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4646 /* 5189 /*
4647 * This path doesn't originate from kernfs and @kn could already 5190 * This path doesn't originate from kernfs and @kn could already
4648 * have been or be removed at any point. @kn->priv is RCU 5191 * have been or be removed at any point. @kn->priv is RCU
4649 * protected for this access. See destroy_locked() for details. 5192 * protected for this access. See cgroup_rmdir() for details.
4650 */ 5193 */
4651 cgrp = rcu_dereference(kn->priv); 5194 cgrp = rcu_dereference(kn->priv);
4652 if (cgrp) 5195 if (cgrp)
4653 css = cgroup_css(cgrp, ss); 5196 css = cgroup_css(cgrp, ss);
4654 5197
4655 if (!css || !css_tryget(css)) 5198 if (!css || !css_tryget_online(css))
4656 css = ERR_PTR(-ENOENT); 5199 css = ERR_PTR(-ENOENT);
4657 5200
4658 rcu_read_unlock(); 5201 rcu_read_unlock();
@@ -4669,14 +5212,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4669 */ 5212 */
4670struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 5213struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
4671{ 5214{
4672 struct cgroup *cgrp; 5215 WARN_ON_ONCE(!rcu_read_lock_held());
4673 5216 return idr_find(&ss->css_idr, id);
4674 cgroup_assert_mutexes_or_rcu_locked();
4675
4676 cgrp = idr_find(&ss->root->cgroup_idr, id);
4677 if (cgrp)
4678 return cgroup_css(cgrp, ss);
4679 return NULL;
4680} 5217}
4681 5218
4682#ifdef CONFIG_CGROUP_DEBUG 5219#ifdef CONFIG_CGROUP_DEBUG
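Note (illustrative, not part of the patch): the cgroup_mkdir()/cgroup_rmdir() kernfs hooks above are still reached from userland through plain mkdir(2)/rmdir(2) on a cgroup directory, and rmdir keeps returning -EBUSY while the cgroup has online children or attached tasks, matching cgroup_destroy_locked(). A minimal userspace sketch; the mount point below is an assumption, adjust it to whatever hierarchy is actually mounted:

	/* illustrative only; point CG at a mounted cgroup hierarchy */
	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/stat.h>
	#include <unistd.h>

	#define CG "/sys/fs/cgroup/unified/demo"	/* assumed path */

	int main(void)
	{
		if (mkdir(CG, 0755) && errno != EEXIST) {	/* handled by cgroup_mkdir() */
			fprintf(stderr, "mkdir %s: %s\n", CG, strerror(errno));
			return 1;
		}
		/* ... populate cgroup.procs, read control files, etc. ... */
		if (rmdir(CG)) {				/* handled by cgroup_rmdir() */
			fprintf(stderr, "rmdir %s: %s\n", CG, strerror(errno));
			return 1;
		}
		return 0;
	}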
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 345628c78b5b..a79e40f9d700 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -59,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task)
59 59
60static struct freezer *parent_freezer(struct freezer *freezer) 60static struct freezer *parent_freezer(struct freezer *freezer)
61{ 61{
62 return css_freezer(css_parent(&freezer->css)); 62 return css_freezer(freezer->css.parent);
63} 63}
64 64
65bool cgroup_freezing(struct task_struct *task) 65bool cgroup_freezing(struct task_struct *task)
@@ -73,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task)
73 return ret; 73 return ret;
74} 74}
75 75
76/*
77 * cgroups_write_string() limits the size of freezer state strings to
78 * CGROUP_LOCAL_BUFFER_SIZE
79 */
80static const char *freezer_state_strs(unsigned int state) 76static const char *freezer_state_strs(unsigned int state)
81{ 77{
82 if (state & CGROUP_FROZEN) 78 if (state & CGROUP_FROZEN)
@@ -304,7 +300,7 @@ static int freezer_read(struct seq_file *m, void *v)
304 300
305 /* update states bottom-up */ 301 /* update states bottom-up */
306 css_for_each_descendant_post(pos, css) { 302 css_for_each_descendant_post(pos, css) {
307 if (!css_tryget(pos)) 303 if (!css_tryget_online(pos))
308 continue; 304 continue;
309 rcu_read_unlock(); 305 rcu_read_unlock();
310 306
@@ -404,7 +400,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
404 struct freezer *pos_f = css_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
405 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
406 402
407 if (!css_tryget(pos)) 403 if (!css_tryget_online(pos))
408 continue; 404 continue;
409 rcu_read_unlock(); 405 rcu_read_unlock();
410 406
@@ -423,20 +419,22 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
423 mutex_unlock(&freezer_mutex); 419 mutex_unlock(&freezer_mutex);
424} 420}
425 421
426static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, 422static ssize_t freezer_write(struct kernfs_open_file *of,
427 char *buffer) 423 char *buf, size_t nbytes, loff_t off)
428{ 424{
429 bool freeze; 425 bool freeze;
430 426
431 if (strcmp(buffer, freezer_state_strs(0)) == 0) 427 buf = strstrip(buf);
428
429 if (strcmp(buf, freezer_state_strs(0)) == 0)
432 freeze = false; 430 freeze = false;
433 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) 431 else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
434 freeze = true; 432 freeze = true;
435 else 433 else
436 return -EINVAL; 434 return -EINVAL;
437 435
438 freezer_change_state(css_freezer(css), freeze); 436 freezer_change_state(css_freezer(of_css(of)), freeze);
439 return 0; 437 return nbytes;
440} 438}
441 439
442static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, 440static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
@@ -460,7 +458,7 @@ static struct cftype files[] = {
460 .name = "state", 458 .name = "state",
461 .flags = CFTYPE_NOT_ON_ROOT, 459 .flags = CFTYPE_NOT_ON_ROOT,
462 .seq_show = freezer_read, 460 .seq_show = freezer_read,
463 .write_string = freezer_write, 461 .write = freezer_write,
464 }, 462 },
465 { 463 {
466 .name = "self_freezing", 464 .name = "self_freezing",
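Note (illustrative, not part of the patch): with the switch from .write_string to the kernfs .write path, freezer_write() strips the incoming buffer, accepts exactly the state strings "FROZEN" and "THAWED", and returns nbytes on success. A hedged userspace sketch; the freezer hierarchy path is an assumption:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* assumed path of a freezer cgroup created beforehand */
	#define STATE "/sys/fs/cgroup/freezer/demo/freezer.state"

	static int write_state(const char *state)
	{
		int fd = open(STATE, O_WRONLY);
		ssize_t n;

		if (fd < 0)
			return -1;
		n = write(fd, state, strlen(state));	/* dispatched to freezer_write() */
		close(fd);
		return n < 0 ? -1 : 0;
	}

	int main(void)
	{
		if (write_state("FROZEN\n"))		/* trailing newline is stripped */
			perror("freeze");
		if (write_state("THAWED\n"))
			perror("thaw");
		return 0;
	}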
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 130017843899..f6b33c696224 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -119,7 +119,7 @@ static inline struct cpuset *task_cs(struct task_struct *task)
119 119
120static inline struct cpuset *parent_cs(struct cpuset *cs) 120static inline struct cpuset *parent_cs(struct cpuset *cs)
121{ 121{
122 return css_cs(css_parent(&cs->css)); 122 return css_cs(cs->css.parent);
123} 123}
124 124
125#ifdef CONFIG_NUMA 125#ifdef CONFIG_NUMA
@@ -691,11 +691,8 @@ restart:
691 if (nslot == ndoms) { 691 if (nslot == ndoms) {
692 static int warnings = 10; 692 static int warnings = 10;
693 if (warnings) { 693 if (warnings) {
694 printk(KERN_WARNING 694 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
695 "rebuild_sched_domains confused:" 695 nslot, ndoms, csn, i, apn);
696 " nslot %d, ndoms %d, csn %d, i %d,"
697 " apn %d\n",
698 nslot, ndoms, csn, i, apn);
699 warnings--; 696 warnings--;
700 } 697 }
701 continue; 698 continue;
@@ -870,7 +867,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
870 continue; 867 continue;
871 } 868 }
872 } 869 }
873 if (!css_tryget(&cp->css)) 870 if (!css_tryget_online(&cp->css))
874 continue; 871 continue;
875 rcu_read_unlock(); 872 rcu_read_unlock();
876 873
@@ -885,6 +882,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
885/** 882/**
886 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 883 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
887 * @cs: the cpuset to consider 884 * @cs: the cpuset to consider
885 * @trialcs: trial cpuset
888 * @buf: buffer of cpu numbers written to this cpuset 886 * @buf: buffer of cpu numbers written to this cpuset
889 */ 887 */
890static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 888static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
@@ -1105,7 +1103,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
1105 continue; 1103 continue;
1106 } 1104 }
1107 } 1105 }
1108 if (!css_tryget(&cp->css)) 1106 if (!css_tryget_online(&cp->css))
1109 continue; 1107 continue;
1110 rcu_read_unlock(); 1108 rcu_read_unlock();
1111 1109
@@ -1600,13 +1598,15 @@ out_unlock:
1600/* 1598/*
1601 * Common handling for a write to a "cpus" or "mems" file. 1599 * Common handling for a write to a "cpus" or "mems" file.
1602 */ 1600 */
1603static int cpuset_write_resmask(struct cgroup_subsys_state *css, 1601static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1604 struct cftype *cft, char *buf) 1602 char *buf, size_t nbytes, loff_t off)
1605{ 1603{
1606 struct cpuset *cs = css_cs(css); 1604 struct cpuset *cs = css_cs(of_css(of));
1607 struct cpuset *trialcs; 1605 struct cpuset *trialcs;
1608 int retval = -ENODEV; 1606 int retval = -ENODEV;
1609 1607
1608 buf = strstrip(buf);
1609
1610 /* 1610 /*
1611 * CPU or memory hotunplug may leave @cs w/o any execution 1611 * CPU or memory hotunplug may leave @cs w/o any execution
1612 * resources, in which case the hotplug code asynchronously updates 1612 * resources, in which case the hotplug code asynchronously updates
@@ -1630,7 +1630,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1630 goto out_unlock; 1630 goto out_unlock;
1631 } 1631 }
1632 1632
1633 switch (cft->private) { 1633 switch (of_cft(of)->private) {
1634 case FILE_CPULIST: 1634 case FILE_CPULIST:
1635 retval = update_cpumask(cs, trialcs, buf); 1635 retval = update_cpumask(cs, trialcs, buf);
1636 break; 1636 break;
@@ -1645,7 +1645,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1645 free_trial_cpuset(trialcs); 1645 free_trial_cpuset(trialcs);
1646out_unlock: 1646out_unlock:
1647 mutex_unlock(&cpuset_mutex); 1647 mutex_unlock(&cpuset_mutex);
1648 return retval; 1648 return retval ?: nbytes;
1649} 1649}
1650 1650
1651/* 1651/*
@@ -1747,7 +1747,7 @@ static struct cftype files[] = {
1747 { 1747 {
1748 .name = "cpus", 1748 .name = "cpus",
1749 .seq_show = cpuset_common_seq_show, 1749 .seq_show = cpuset_common_seq_show,
1750 .write_string = cpuset_write_resmask, 1750 .write = cpuset_write_resmask,
1751 .max_write_len = (100U + 6 * NR_CPUS), 1751 .max_write_len = (100U + 6 * NR_CPUS),
1752 .private = FILE_CPULIST, 1752 .private = FILE_CPULIST,
1753 }, 1753 },
@@ -1755,7 +1755,7 @@ static struct cftype files[] = {
1755 { 1755 {
1756 .name = "mems", 1756 .name = "mems",
1757 .seq_show = cpuset_common_seq_show, 1757 .seq_show = cpuset_common_seq_show,
1758 .write_string = cpuset_write_resmask, 1758 .write = cpuset_write_resmask,
1759 .max_write_len = (100U + 6 * MAX_NUMNODES), 1759 .max_write_len = (100U + 6 * MAX_NUMNODES),
1760 .private = FILE_MEMLIST, 1760 .private = FILE_MEMLIST,
1761 }, 1761 },
@@ -2011,7 +2011,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2011 parent = parent_cs(parent); 2011 parent = parent_cs(parent);
2012 2012
2013 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2013 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2014 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); 2014 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2015 pr_cont_cgroup_name(cs->css.cgroup); 2015 pr_cont_cgroup_name(cs->css.cgroup);
2016 pr_cont("\n"); 2016 pr_cont("\n");
2017 } 2017 }
@@ -2149,7 +2149,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2149 2149
2150 rcu_read_lock(); 2150 rcu_read_lock();
2151 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 2151 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2152 if (cs == &top_cpuset || !css_tryget(&cs->css)) 2152 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2153 continue; 2153 continue;
2154 rcu_read_unlock(); 2154 rcu_read_unlock();
2155 2155
@@ -2530,7 +2530,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2530 2530
2531/** 2531/**
2532 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2532 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2533 * @task: pointer to task_struct of some task. 2533 * @tsk: pointer to task_struct of some task.
2534 * 2534 *
2535 * Description: Prints @task's name, cpuset name, and cached copy of its 2535 * Description: Prints @task's name, cpuset name, and cached copy of its
2536 * mems_allowed to the kernel log. 2536 * mems_allowed to the kernel log.
@@ -2548,7 +2548,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2548 cgrp = task_cs(tsk)->css.cgroup; 2548 cgrp = task_cs(tsk)->css.cgroup;
2549 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2549 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2550 tsk->mems_allowed); 2550 tsk->mems_allowed);
2551 printk(KERN_INFO "%s cpuset=", tsk->comm); 2551 pr_info("%s cpuset=", tsk->comm);
2552 pr_cont_cgroup_name(cgrp); 2552 pr_cont_cgroup_name(cgrp);
2553 pr_cont(" mems_allowed=%s\n", cpuset_nodelist); 2553 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2554 2554
@@ -2640,10 +2640,10 @@ out:
2640/* Display task mems_allowed in /proc/<pid>/status file. */ 2640/* Display task mems_allowed in /proc/<pid>/status file. */
2641void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2641void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2642{ 2642{
2643 seq_printf(m, "Mems_allowed:\t"); 2643 seq_puts(m, "Mems_allowed:\t");
2644 seq_nodemask(m, &task->mems_allowed); 2644 seq_nodemask(m, &task->mems_allowed);
2645 seq_printf(m, "\n"); 2645 seq_puts(m, "\n");
2646 seq_printf(m, "Mems_allowed_list:\t"); 2646 seq_puts(m, "Mems_allowed_list:\t");
2647 seq_nodemask_list(m, &task->mems_allowed); 2647 seq_nodemask_list(m, &task->mems_allowed);
2648 seq_printf(m, "\n"); 2648 seq_puts(m, "\n");
2649} 2649}
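Note (illustrative, not part of the patch): cpuset_write_resmask() now serves both cpuset.cpus and cpuset.mems through one kernfs write handler, stripping the buffer and returning nbytes on success. A sketch of the corresponding userspace writes; the paths and the particular CPU/node lists are assumptions:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static int set_mask(const char *file, const char *list)
	{
		int fd = open(file, O_WRONLY);
		ssize_t n;

		if (fd < 0)
			return -1;
		n = write(fd, list, strlen(list));	/* lands in cpuset_write_resmask() */
		close(fd);
		return n == (ssize_t)strlen(list) ? 0 : -1;
	}

	int main(void)
	{
		/* assumed cpuset hierarchy mount and group name */
		if (set_mask("/sys/fs/cgroup/cpuset/demo/cpuset.cpus", "0-2\n"))
			perror("cpuset.cpus");
		if (set_mask("/sys/fs/cgroup/cpuset/demo/cpuset.mems", "0\n"))
			perror("cpuset.mems");
		return 0;
	}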
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 689237a0c5e8..24d35cc38e42 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -608,7 +608,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
608 if (!f.file) 608 if (!f.file)
609 return -EBADF; 609 return -EBADF;
610 610
611 css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); 611 css = css_tryget_online_from_dir(f.file->f_dentry,
612 &perf_event_cgrp_subsys);
612 if (IS_ERR(css)) { 613 if (IS_ERR(css)) {
613 ret = PTR_ERR(css); 614 ret = PTR_ERR(css);
614 goto out; 615 goto out;
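Note (illustrative, not part of the patch): perf_cgroup_connect() resolves the cgroup from a directory file descriptor via css_tryget_online_from_dir(). From userspace that fd is passed as the pid argument of perf_event_open() together with PERF_FLAG_PID_CGROUP, and cgroup events must name a CPU. A sketch under assumed paths:

	#include <fcntl.h>
	#include <linux/perf_event.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr;
		int cgfd, pfd;

		/* assumed perf_event cgroup directory */
		cgfd = open("/sys/fs/cgroup/perf_event/demo", O_RDONLY);
		if (cgfd < 0) {
			perror("open cgroup dir");
			return 1;
		}

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;

		/* pid argument is the cgroup dir fd when PERF_FLAG_PID_CGROUP is set */
		pfd = syscall(__NR_perf_event_open, &attr, cgfd, 0 /* cpu */, -1,
			      PERF_FLAG_PID_CGROUP);
		if (pfd < 0)
			perror("perf_event_open");
		else
			close(pfd);
		close(cgfd);
		return 0;
	}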
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 48e78b657d23..c6b98793d647 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7669,7 +7669,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7669static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7669static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7670{ 7670{
7671 struct task_group *tg = css_tg(css); 7671 struct task_group *tg = css_tg(css);
7672 struct task_group *parent = css_tg(css_parent(css)); 7672 struct task_group *parent = css_tg(css->parent);
7673 7673
7674 if (parent) 7674 if (parent)
7675 sched_online_group(tg, parent); 7675 sched_online_group(tg, parent);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index c143ee380e3a..9cf350c94ec4 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
46 46
47static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
48{ 48{
49 return css_ca(css_parent(&ca->css)); 49 return css_ca(ca->css.parent);
50} 50}
51 51
52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 595d7fd795e1..493f758445e7 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -52,7 +52,7 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
52static inline struct hugetlb_cgroup * 52static inline struct hugetlb_cgroup *
53parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) 53parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
54{ 54{
55 return hugetlb_cgroup_from_css(css_parent(&h_cg->css)); 55 return hugetlb_cgroup_from_css(h_cg->css.parent);
56} 56}
57 57
58static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) 58static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
@@ -181,7 +181,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
181again: 181again:
182 rcu_read_lock(); 182 rcu_read_lock();
183 h_cg = hugetlb_cgroup_from_task(current); 183 h_cg = hugetlb_cgroup_from_task(current);
184 if (!css_tryget(&h_cg->css)) { 184 if (!css_tryget_online(&h_cg->css)) {
185 rcu_read_unlock(); 185 rcu_read_unlock();
186 goto again; 186 goto again;
187 } 187 }
@@ -253,15 +253,16 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
253 return res_counter_read_u64(&h_cg->hugepage[idx], name); 253 return res_counter_read_u64(&h_cg->hugepage[idx], name);
254} 254}
255 255
256static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, 256static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
257 struct cftype *cft, char *buffer) 257 char *buf, size_t nbytes, loff_t off)
258{ 258{
259 int idx, name, ret; 259 int idx, name, ret;
260 unsigned long long val; 260 unsigned long long val;
261 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 261 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
262 262
263 idx = MEMFILE_IDX(cft->private); 263 buf = strstrip(buf);
264 name = MEMFILE_ATTR(cft->private); 264 idx = MEMFILE_IDX(of_cft(of)->private);
265 name = MEMFILE_ATTR(of_cft(of)->private);
265 266
266 switch (name) { 267 switch (name) {
267 case RES_LIMIT: 268 case RES_LIMIT:
@@ -271,7 +272,7 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
271 break; 272 break;
272 } 273 }
273 /* This function does all necessary parse...reuse it */ 274 /* This function does all necessary parse...reuse it */
274 ret = res_counter_memparse_write_strategy(buffer, &val); 275 ret = res_counter_memparse_write_strategy(buf, &val);
275 if (ret) 276 if (ret)
276 break; 277 break;
277 ret = res_counter_set_limit(&h_cg->hugepage[idx], val); 278 ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
@@ -280,17 +281,17 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
280 ret = -EINVAL; 281 ret = -EINVAL;
281 break; 282 break;
282 } 283 }
283 return ret; 284 return ret ?: nbytes;
284} 285}
285 286
286static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, 287static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
287 unsigned int event) 288 char *buf, size_t nbytes, loff_t off)
288{ 289{
289 int idx, name, ret = 0; 290 int idx, name, ret = 0;
290 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 291 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
291 292
292 idx = MEMFILE_IDX(event); 293 idx = MEMFILE_IDX(of_cft(of)->private);
293 name = MEMFILE_ATTR(event); 294 name = MEMFILE_ATTR(of_cft(of)->private);
294 295
295 switch (name) { 296 switch (name) {
296 case RES_MAX_USAGE: 297 case RES_MAX_USAGE:
@@ -303,7 +304,7 @@ static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css,
303 ret = -EINVAL; 304 ret = -EINVAL;
304 break; 305 break;
305 } 306 }
306 return ret; 307 return ret ?: nbytes;
307} 308}
308 309
309static char *mem_fmt(char *buf, int size, unsigned long hsize) 310static char *mem_fmt(char *buf, int size, unsigned long hsize)
@@ -331,7 +332,7 @@ static void __init __hugetlb_cgroup_file_init(int idx)
331 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); 332 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
332 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); 333 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
333 cft->read_u64 = hugetlb_cgroup_read_u64; 334 cft->read_u64 = hugetlb_cgroup_read_u64;
334 cft->write_string = hugetlb_cgroup_write; 335 cft->write = hugetlb_cgroup_write;
335 336
336 /* Add the usage file */ 337 /* Add the usage file */
337 cft = &h->cgroup_files[1]; 338 cft = &h->cgroup_files[1];
@@ -343,14 +344,14 @@ static void __init __hugetlb_cgroup_file_init(int idx)
343 cft = &h->cgroup_files[2]; 344 cft = &h->cgroup_files[2];
344 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); 345 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
345 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); 346 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
346 cft->trigger = hugetlb_cgroup_reset; 347 cft->write = hugetlb_cgroup_reset;
347 cft->read_u64 = hugetlb_cgroup_read_u64; 348 cft->read_u64 = hugetlb_cgroup_read_u64;
348 349
349 /* Add the failcntfile */ 350 /* Add the failcntfile */
350 cft = &h->cgroup_files[3]; 351 cft = &h->cgroup_files[3];
351 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); 352 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
352 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); 353 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
353 cft->trigger = hugetlb_cgroup_reset; 354 cft->write = hugetlb_cgroup_reset;
354 cft->read_u64 = hugetlb_cgroup_read_u64; 355 cft->read_u64 = hugetlb_cgroup_read_u64;
355 356
356 /* NULL terminate the last cft */ 357 /* NULL terminate the last cft */
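Note (illustrative, not part of the patch): hugetlb_cgroup_write() now parses the stripped buffer with res_counter_memparse_write_strategy(), so the limit files keep accepting human-readable suffixes. A hedged write example; the cgroup path and the 2MB page-size file name are assumptions that depend on the configured huge page sizes:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* assumed hugetlb cgroup and page size */
		const char *f = "/sys/fs/cgroup/hugetlb/demo/hugetlb.2MB.limit_in_bytes";
		const char *val = "1G\n";		/* K/M/G suffixes are accepted */
		int fd = open(f, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, val, strlen(val)) < 0)	/* success returns nbytes */
			perror("write");
		close(fd);
		return 0;
	}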
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a9559b91603c..a2c7bcb0e6eb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -526,18 +526,14 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
526 526
527static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 527static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
528{ 528{
529 /* 529 return memcg->css.id;
530 * The ID of the root cgroup is 0, but memcg treat 0 as an
531 * invalid ID, so we return (cgroup_id + 1).
532 */
533 return memcg->css.cgroup->id + 1;
534} 530}
535 531
536static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 532static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
537{ 533{
538 struct cgroup_subsys_state *css; 534 struct cgroup_subsys_state *css;
539 535
540 css = css_from_id(id - 1, &memory_cgrp_subsys); 536 css = css_from_id(id, &memory_cgrp_subsys);
541 return mem_cgroup_from_css(css); 537 return mem_cgroup_from_css(css);
542} 538}
543 539
@@ -570,7 +566,8 @@ void sock_update_memcg(struct sock *sk)
570 memcg = mem_cgroup_from_task(current); 566 memcg = mem_cgroup_from_task(current);
571 cg_proto = sk->sk_prot->proto_cgroup(memcg); 567 cg_proto = sk->sk_prot->proto_cgroup(memcg);
572 if (!mem_cgroup_is_root(memcg) && 568 if (!mem_cgroup_is_root(memcg) &&
573 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { 569 memcg_proto_active(cg_proto) &&
570 css_tryget_online(&memcg->css)) {
574 sk->sk_cgrp = cg_proto; 571 sk->sk_cgrp = cg_proto;
575 } 572 }
576 rcu_read_unlock(); 573 rcu_read_unlock();
@@ -831,7 +828,7 @@ retry:
831 */ 828 */
832 __mem_cgroup_remove_exceeded(mz, mctz); 829 __mem_cgroup_remove_exceeded(mz, mctz);
833 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 830 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
834 !css_tryget(&mz->memcg->css)) 831 !css_tryget_online(&mz->memcg->css))
835 goto retry; 832 goto retry;
836done: 833done:
837 return mz; 834 return mz;
@@ -1073,7 +1070,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1073 if (unlikely(!memcg)) 1070 if (unlikely(!memcg))
1074 memcg = root_mem_cgroup; 1071 memcg = root_mem_cgroup;
1075 } 1072 }
1076 } while (!css_tryget(&memcg->css)); 1073 } while (!css_tryget_online(&memcg->css));
1077 rcu_read_unlock(); 1074 rcu_read_unlock();
1078 return memcg; 1075 return memcg;
1079} 1076}
@@ -1110,7 +1107,8 @@ skip_node:
1110 */ 1107 */
1111 if (next_css) { 1108 if (next_css) {
1112 if ((next_css == &root->css) || 1109 if ((next_css == &root->css) ||
1113 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) 1110 ((next_css->flags & CSS_ONLINE) &&
1111 css_tryget_online(next_css)))
1114 return mem_cgroup_from_css(next_css); 1112 return mem_cgroup_from_css(next_css);
1115 1113
1116 prev_css = next_css; 1114 prev_css = next_css;
@@ -1156,7 +1154,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1156 * would be returned all the time. 1154 * would be returned all the time.
1157 */ 1155 */
1158 if (position && position != root && 1156 if (position && position != root &&
1159 !css_tryget(&position->css)) 1157 !css_tryget_online(&position->css))
1160 position = NULL; 1158 position = NULL;
1161 } 1159 }
1162 return position; 1160 return position;
@@ -1533,7 +1531,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1533int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1531int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1534{ 1532{
1535 /* root ? */ 1533 /* root ? */
1536 if (mem_cgroup_disabled() || !css_parent(&memcg->css)) 1534 if (mem_cgroup_disabled() || !memcg->css.parent)
1537 return vm_swappiness; 1535 return vm_swappiness;
1538 1536
1539 return memcg->swappiness; 1537 return memcg->swappiness;
@@ -2769,9 +2767,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2769 2767
2770/* 2768/*
2771 * A helper function to get mem_cgroup from ID. must be called under 2769 * A helper function to get mem_cgroup from ID. must be called under
2772 * rcu_read_lock(). The caller is responsible for calling css_tryget if 2770 * rcu_read_lock(). The caller is responsible for calling
2773 * the mem_cgroup is used for charging. (dropping refcnt from swap can be 2771 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
2774 * called against removed memcg.) 2772 * refcnt from swap can be called against removed memcg.)
2775 */ 2773 */
2776static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2774static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2777{ 2775{
@@ -2794,14 +2792,14 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2794 lock_page_cgroup(pc); 2792 lock_page_cgroup(pc);
2795 if (PageCgroupUsed(pc)) { 2793 if (PageCgroupUsed(pc)) {
2796 memcg = pc->mem_cgroup; 2794 memcg = pc->mem_cgroup;
2797 if (memcg && !css_tryget(&memcg->css)) 2795 if (memcg && !css_tryget_online(&memcg->css))
2798 memcg = NULL; 2796 memcg = NULL;
2799 } else if (PageSwapCache(page)) { 2797 } else if (PageSwapCache(page)) {
2800 ent.val = page_private(page); 2798 ent.val = page_private(page);
2801 id = lookup_swap_cgroup_id(ent); 2799 id = lookup_swap_cgroup_id(ent);
2802 rcu_read_lock(); 2800 rcu_read_lock();
2803 memcg = mem_cgroup_lookup(id); 2801 memcg = mem_cgroup_lookup(id);
2804 if (memcg && !css_tryget(&memcg->css)) 2802 if (memcg && !css_tryget_online(&memcg->css))
2805 memcg = NULL; 2803 memcg = NULL;
2806 rcu_read_unlock(); 2804 rcu_read_unlock();
2807 } 2805 }
@@ -3365,7 +3363,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3365 } 3363 }
3366 3364
3367 /* The corresponding put will be done in the workqueue. */ 3365 /* The corresponding put will be done in the workqueue. */
3368 if (!css_tryget(&memcg->css)) 3366 if (!css_tryget_online(&memcg->css))
3369 goto out; 3367 goto out;
3370 rcu_read_unlock(); 3368 rcu_read_unlock();
3371 3369
@@ -4125,8 +4123,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
4125 memcg = mem_cgroup_lookup(id); 4123 memcg = mem_cgroup_lookup(id);
4126 if (memcg) { 4124 if (memcg) {
4127 /* 4125 /*
4128 * We uncharge this because swap is freed. 4126 * We uncharge this because swap is freed. This memcg can
4129 * This memcg can be obsolete one. We avoid calling css_tryget 4127 * be obsolete one. We avoid calling css_tryget_online().
4130 */ 4128 */
4131 if (!mem_cgroup_is_root(memcg)) 4129 if (!mem_cgroup_is_root(memcg))
4132 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 4130 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
@@ -4711,18 +4709,28 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4711 } while (usage > 0); 4709 } while (usage > 0);
4712} 4710}
4713 4711
4712/*
4713 * Test whether @memcg has children, dead or alive. Note that this
4714 * function doesn't care whether @memcg has use_hierarchy enabled and
4715 * returns %true if there are child csses according to the cgroup
4716 * hierarchy. Testing use_hierarchy is the caller's responsibility.
4717 */
4714static inline bool memcg_has_children(struct mem_cgroup *memcg) 4718static inline bool memcg_has_children(struct mem_cgroup *memcg)
4715{ 4719{
4716 lockdep_assert_held(&memcg_create_mutex); 4720 bool ret;
4721
4717 /* 4722 /*
4718 * The lock does not prevent addition or deletion to the list 4723 * The lock does not prevent addition or deletion of children, but
4719 * of children, but it prevents a new child from being 4724 * it prevents a new child from being initialized based on this
4720 * initialized based on this parent in css_online(), so it's 4725 * parent in css_online(), so it's enough to decide whether
4721 * enough to decide whether hierarchically inherited 4726 * hierarchically inherited attributes can still be changed or not.
4722 * attributes can still be changed or not.
4723 */ 4727 */
4724 return memcg->use_hierarchy && 4728 lockdep_assert_held(&memcg_create_mutex);
4725 !list_empty(&memcg->css.cgroup->children); 4729
4730 rcu_read_lock();
4731 ret = css_next_child(NULL, &memcg->css);
4732 rcu_read_unlock();
4733 return ret;
4726} 4734}
4727 4735
4728/* 4736/*
@@ -4734,11 +4742,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
4734static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 4742static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4735{ 4743{
4736 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 4744 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4737 struct cgroup *cgrp = memcg->css.cgroup;
4738
4739 /* returns EBUSY if there is a task or if we come here twice. */
4740 if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children))
4741 return -EBUSY;
4742 4745
4743 /* we call try-to-free pages for make this cgroup empty */ 4746 /* we call try-to-free pages for make this cgroup empty */
4744 lru_add_drain_all(); 4747 lru_add_drain_all();
@@ -4758,20 +4761,19 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4758 } 4761 }
4759 4762
4760 } 4763 }
4761 lru_add_drain();
4762 mem_cgroup_reparent_charges(memcg);
4763 4764
4764 return 0; 4765 return 0;
4765} 4766}
4766 4767
4767static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, 4768static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
4768 unsigned int event) 4769 char *buf, size_t nbytes,
4770 loff_t off)
4769{ 4771{
4770 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4772 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4771 4773
4772 if (mem_cgroup_is_root(memcg)) 4774 if (mem_cgroup_is_root(memcg))
4773 return -EINVAL; 4775 return -EINVAL;
4774 return mem_cgroup_force_empty(memcg); 4776 return mem_cgroup_force_empty(memcg) ?: nbytes;
4775} 4777}
4776 4778
4777static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 4779static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
@@ -4785,7 +4787,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
4785{ 4787{
4786 int retval = 0; 4788 int retval = 0;
4787 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4789 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4788 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); 4790 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
4789 4791
4790 mutex_lock(&memcg_create_mutex); 4792 mutex_lock(&memcg_create_mutex);
4791 4793
@@ -4802,7 +4804,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
4802 */ 4804 */
4803 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 4805 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4804 (val == 1 || val == 0)) { 4806 (val == 1 || val == 0)) {
4805 if (list_empty(&memcg->css.cgroup->children)) 4807 if (!memcg_has_children(memcg))
4806 memcg->use_hierarchy = val; 4808 memcg->use_hierarchy = val;
4807 else 4809 else
4808 retval = -EBUSY; 4810 retval = -EBUSY;
@@ -4919,7 +4921,8 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4919 * of course permitted. 4921 * of course permitted.
4920 */ 4922 */
4921 mutex_lock(&memcg_create_mutex); 4923 mutex_lock(&memcg_create_mutex);
4922 if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg)) 4924 if (cgroup_has_tasks(memcg->css.cgroup) ||
4925 (memcg->use_hierarchy && memcg_has_children(memcg)))
4923 err = -EBUSY; 4926 err = -EBUSY;
4924 mutex_unlock(&memcg_create_mutex); 4927 mutex_unlock(&memcg_create_mutex);
4925 if (err) 4928 if (err)
@@ -5021,17 +5024,18 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5021 * The user of this function is... 5024 * The user of this function is...
5022 * RES_LIMIT. 5025 * RES_LIMIT.
5023 */ 5026 */
5024static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 5027static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
5025 char *buffer) 5028 char *buf, size_t nbytes, loff_t off)
5026{ 5029{
5027 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5030 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5028 enum res_type type; 5031 enum res_type type;
5029 int name; 5032 int name;
5030 unsigned long long val; 5033 unsigned long long val;
5031 int ret; 5034 int ret;
5032 5035
5033 type = MEMFILE_TYPE(cft->private); 5036 buf = strstrip(buf);
5034 name = MEMFILE_ATTR(cft->private); 5037 type = MEMFILE_TYPE(of_cft(of)->private);
5038 name = MEMFILE_ATTR(of_cft(of)->private);
5035 5039
5036 switch (name) { 5040 switch (name) {
5037 case RES_LIMIT: 5041 case RES_LIMIT:
@@ -5040,7 +5044,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5040 break; 5044 break;
5041 } 5045 }
5042 /* This function does all necessary parse...reuse it */ 5046 /* This function does all necessary parse...reuse it */
5043 ret = res_counter_memparse_write_strategy(buffer, &val); 5047 ret = res_counter_memparse_write_strategy(buf, &val);
5044 if (ret) 5048 if (ret)
5045 break; 5049 break;
5046 if (type == _MEM) 5050 if (type == _MEM)
@@ -5053,7 +5057,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5053 return -EINVAL; 5057 return -EINVAL;
5054 break; 5058 break;
5055 case RES_SOFT_LIMIT: 5059 case RES_SOFT_LIMIT:
5056 ret = res_counter_memparse_write_strategy(buffer, &val); 5060 ret = res_counter_memparse_write_strategy(buf, &val);
5057 if (ret) 5061 if (ret)
5058 break; 5062 break;
5059 /* 5063 /*
@@ -5070,7 +5074,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5070 ret = -EINVAL; /* should be BUG() ? */ 5074 ret = -EINVAL; /* should be BUG() ? */
5071 break; 5075 break;
5072 } 5076 }
5073 return ret; 5077 return ret ?: nbytes;
5074} 5078}
5075 5079
5076static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 5080static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
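Note (illustrative, not part of the patch): mem_cgroup_write() likewise moves to the kernfs write path, stripping the buffer, parsing it with res_counter_memparse_write_strategy(), and returning nbytes on success. A hedged userspace example of the RES_LIMIT case; the path and value are assumptions:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* assumed memory cgroup created beforehand */
		const char *f = "/sys/fs/cgroup/memory/demo/memory.limit_in_bytes";
		const char *val = "512M\n";
		int fd = open(f, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, val, strlen(val)) != (ssize_t)strlen(val))
			perror("memory.limit_in_bytes");
		close(fd);
		return 0;
	}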
@@ -5083,8 +5087,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5083 if (!memcg->use_hierarchy) 5087 if (!memcg->use_hierarchy)
5084 goto out; 5088 goto out;
5085 5089
5086 while (css_parent(&memcg->css)) { 5090 while (memcg->css.parent) {
5087 memcg = mem_cgroup_from_css(css_parent(&memcg->css)); 5091 memcg = mem_cgroup_from_css(memcg->css.parent);
5088 if (!memcg->use_hierarchy) 5092 if (!memcg->use_hierarchy)
5089 break; 5093 break;
5090 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 5094 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -5097,14 +5101,15 @@ out:
5097 *memsw_limit = min_memsw_limit; 5101 *memsw_limit = min_memsw_limit;
5098} 5102}
5099 5103
5100static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) 5104static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
5105 size_t nbytes, loff_t off)
5101{ 5106{
5102 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5107 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5103 int name; 5108 int name;
5104 enum res_type type; 5109 enum res_type type;
5105 5110
5106 type = MEMFILE_TYPE(event); 5111 type = MEMFILE_TYPE(of_cft(of)->private);
5107 name = MEMFILE_ATTR(event); 5112 name = MEMFILE_ATTR(of_cft(of)->private);
5108 5113
5109 switch (name) { 5114 switch (name) {
5110 case RES_MAX_USAGE: 5115 case RES_MAX_USAGE:
@@ -5129,7 +5134,7 @@ static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
5129 break; 5134 break;
5130 } 5135 }
5131 5136
5132 return 0; 5137 return nbytes;
5133} 5138}
5134 5139
5135static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 5140static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
@@ -5322,7 +5327,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5322 if (val > 100) 5327 if (val > 100)
5323 return -EINVAL; 5328 return -EINVAL;
5324 5329
5325 if (css_parent(css)) 5330 if (css->parent)
5326 memcg->swappiness = val; 5331 memcg->swappiness = val;
5327 else 5332 else
5328 vm_swappiness = val; 5333 vm_swappiness = val;
@@ -5659,7 +5664,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5659 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5664 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5660 5665
5661 /* cannot set to root cgroup and only 0 and 1 are allowed */ 5666 /* cannot set to root cgroup and only 0 and 1 are allowed */
5662 if (!css_parent(css) || !((val == 0) || (val == 1))) 5667 if (!css->parent || !((val == 0) || (val == 1)))
5663 return -EINVAL; 5668 return -EINVAL;
5664 5669
5665 memcg->oom_kill_disable = val; 5670 memcg->oom_kill_disable = val;
@@ -5705,10 +5710,10 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5705 * which is then paired with css_put during uncharge resp. here. 5710 * which is then paired with css_put during uncharge resp. here.
5706 * 5711 *
5707 * Although this might sound strange as this path is called from 5712 * Although this might sound strange as this path is called from
5708 * css_offline() when the reference might have dropped down to 0 5713 * css_offline() when the reference might have dropped down to 0 and
5709 * and shouldn't be incremented anymore (css_tryget would fail) 5714 * shouldn't be incremented anymore (css_tryget_online() would
5710 * we do not have other options because of the kmem allocations 5715 * fail) we do not have other options because of the kmem
5711 * lifetime. 5716 * allocations lifetime.
5712 */ 5717 */
5713 css_get(&memcg->css); 5718 css_get(&memcg->css);
5714 5719
@@ -5827,9 +5832,10 @@ static void memcg_event_ptable_queue_proc(struct file *file,
5827 * Input must be in format '<event_fd> <control_fd> <args>'. 5832 * Input must be in format '<event_fd> <control_fd> <args>'.
5828 * Interpretation of args is defined by control file implementation. 5833 * Interpretation of args is defined by control file implementation.
5829 */ 5834 */
5830static int memcg_write_event_control(struct cgroup_subsys_state *css, 5835static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
5831 struct cftype *cft, char *buffer) 5836 char *buf, size_t nbytes, loff_t off)
5832{ 5837{
5838 struct cgroup_subsys_state *css = of_css(of);
5833 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5839 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5834 struct mem_cgroup_event *event; 5840 struct mem_cgroup_event *event;
5835 struct cgroup_subsys_state *cfile_css; 5841 struct cgroup_subsys_state *cfile_css;
@@ -5840,15 +5846,17 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,
5840 char *endp; 5846 char *endp;
5841 int ret; 5847 int ret;
5842 5848
5843 efd = simple_strtoul(buffer, &endp, 10); 5849 buf = strstrip(buf);
5850
5851 efd = simple_strtoul(buf, &endp, 10);
5844 if (*endp != ' ') 5852 if (*endp != ' ')
5845 return -EINVAL; 5853 return -EINVAL;
5846 buffer = endp + 1; 5854 buf = endp + 1;
5847 5855
5848 cfd = simple_strtoul(buffer, &endp, 10); 5856 cfd = simple_strtoul(buf, &endp, 10);
5849 if ((*endp != ' ') && (*endp != '\0')) 5857 if ((*endp != ' ') && (*endp != '\0'))
5850 return -EINVAL; 5858 return -EINVAL;
5851 buffer = endp + 1; 5859 buf = endp + 1;
5852 5860
5853 event = kzalloc(sizeof(*event), GFP_KERNEL); 5861 event = kzalloc(sizeof(*event), GFP_KERNEL);
5854 if (!event) 5862 if (!event)
@@ -5916,8 +5924,8 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,
5916 * automatically removed on cgroup destruction but the removal is 5924 * automatically removed on cgroup destruction but the removal is
5917 * asynchronous, so take an extra ref on @css. 5925 * asynchronous, so take an extra ref on @css.
5918 */ 5926 */
5919 cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, 5927 cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent,
5920 &memory_cgrp_subsys); 5928 &memory_cgrp_subsys);
5921 ret = -EINVAL; 5929 ret = -EINVAL;
5922 if (IS_ERR(cfile_css)) 5930 if (IS_ERR(cfile_css))
5923 goto out_put_cfile; 5931 goto out_put_cfile;
@@ -5926,7 +5934,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,
5926 goto out_put_cfile; 5934 goto out_put_cfile;
5927 } 5935 }
5928 5936
5929 ret = event->register_event(memcg, event->eventfd, buffer); 5937 ret = event->register_event(memcg, event->eventfd, buf);
5930 if (ret) 5938 if (ret)
5931 goto out_put_css; 5939 goto out_put_css;
5932 5940
@@ -5939,7 +5947,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,
5939 fdput(cfile); 5947 fdput(cfile);
5940 fdput(efile); 5948 fdput(efile);
5941 5949
5942 return 0; 5950 return nbytes;
5943 5951
5944out_put_css: 5952out_put_css:
5945 css_put(css); 5953 css_put(css);
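
The format comment above memcg_write_event_control() still describes the user-visible interface: a registration line of "<event_fd> <control_fd> <args>" written to cgroup.event_control. A user-space sketch of such a registration follows, assuming a cgroup v1 memory hierarchy mounted at /sys/fs/cgroup/memory with an existing group "mygrp" and an arbitrary 64 MiB threshold; none of these names come from the patch.

/*
 * User-space sketch only.  The mount point, group name and threshold
 * below are assumptions for illustration, not part of this patch.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int cfd = open("/sys/fs/cgroup/memory/mygrp/memory.usage_in_bytes",
		       O_RDONLY);
	int ctl = open("/sys/fs/cgroup/memory/mygrp/cgroup.event_control",
		       O_WRONLY);
	char line[64];
	uint64_t count;

	if (efd < 0 || cfd < 0 || ctl < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>": args is the byte threshold here */
	snprintf(line, sizeof(line), "%d %d %llu", efd, cfd,
		 (unsigned long long)(64 << 20));
	if (write(ctl, line, strlen(line)) < 0)
		return 1;

	/* blocks until the usage threshold is crossed */
	if (read(efd, &count, sizeof(count)) != sizeof(count))
		return 1;
	return 0;
}
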
@@ -5964,25 +5972,25 @@ static struct cftype mem_cgroup_files[] = {
5964 { 5972 {
5965 .name = "max_usage_in_bytes", 5973 .name = "max_usage_in_bytes",
5966 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5974 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5967 .trigger = mem_cgroup_reset, 5975 .write = mem_cgroup_reset,
5968 .read_u64 = mem_cgroup_read_u64, 5976 .read_u64 = mem_cgroup_read_u64,
5969 }, 5977 },
5970 { 5978 {
5971 .name = "limit_in_bytes", 5979 .name = "limit_in_bytes",
5972 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5980 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5973 .write_string = mem_cgroup_write, 5981 .write = mem_cgroup_write,
5974 .read_u64 = mem_cgroup_read_u64, 5982 .read_u64 = mem_cgroup_read_u64,
5975 }, 5983 },
5976 { 5984 {
5977 .name = "soft_limit_in_bytes", 5985 .name = "soft_limit_in_bytes",
5978 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5986 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5979 .write_string = mem_cgroup_write, 5987 .write = mem_cgroup_write,
5980 .read_u64 = mem_cgroup_read_u64, 5988 .read_u64 = mem_cgroup_read_u64,
5981 }, 5989 },
5982 { 5990 {
5983 .name = "failcnt", 5991 .name = "failcnt",
5984 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5992 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5985 .trigger = mem_cgroup_reset, 5993 .write = mem_cgroup_reset,
5986 .read_u64 = mem_cgroup_read_u64, 5994 .read_u64 = mem_cgroup_read_u64,
5987 }, 5995 },
5988 { 5996 {
@@ -5991,7 +5999,7 @@ static struct cftype mem_cgroup_files[] = {
5991 }, 5999 },
5992 { 6000 {
5993 .name = "force_empty", 6001 .name = "force_empty",
5994 .trigger = mem_cgroup_force_empty_write, 6002 .write = mem_cgroup_force_empty_write,
5995 }, 6003 },
5996 { 6004 {
5997 .name = "use_hierarchy", 6005 .name = "use_hierarchy",
@@ -6001,7 +6009,7 @@ static struct cftype mem_cgroup_files[] = {
6001 }, 6009 },
6002 { 6010 {
6003 .name = "cgroup.event_control", /* XXX: for compat */ 6011 .name = "cgroup.event_control", /* XXX: for compat */
6004 .write_string = memcg_write_event_control, 6012 .write = memcg_write_event_control,
6005 .flags = CFTYPE_NO_PREFIX, 6013 .flags = CFTYPE_NO_PREFIX,
6006 .mode = S_IWUGO, 6014 .mode = S_IWUGO,
6007 }, 6015 },
@@ -6034,7 +6042,7 @@ static struct cftype mem_cgroup_files[] = {
6034 { 6042 {
6035 .name = "kmem.limit_in_bytes", 6043 .name = "kmem.limit_in_bytes",
6036 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6044 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6037 .write_string = mem_cgroup_write, 6045 .write = mem_cgroup_write,
6038 .read_u64 = mem_cgroup_read_u64, 6046 .read_u64 = mem_cgroup_read_u64,
6039 }, 6047 },
6040 { 6048 {
@@ -6045,13 +6053,13 @@ static struct cftype mem_cgroup_files[] = {
6045 { 6053 {
6046 .name = "kmem.failcnt", 6054 .name = "kmem.failcnt",
6047 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6055 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6048 .trigger = mem_cgroup_reset, 6056 .write = mem_cgroup_reset,
6049 .read_u64 = mem_cgroup_read_u64, 6057 .read_u64 = mem_cgroup_read_u64,
6050 }, 6058 },
6051 { 6059 {
6052 .name = "kmem.max_usage_in_bytes", 6060 .name = "kmem.max_usage_in_bytes",
6053 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6061 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6054 .trigger = mem_cgroup_reset, 6062 .write = mem_cgroup_reset,
6055 .read_u64 = mem_cgroup_read_u64, 6063 .read_u64 = mem_cgroup_read_u64,
6056 }, 6064 },
6057#ifdef CONFIG_SLABINFO 6065#ifdef CONFIG_SLABINFO
@@ -6074,19 +6082,19 @@ static struct cftype memsw_cgroup_files[] = {
6074 { 6082 {
6075 .name = "memsw.max_usage_in_bytes", 6083 .name = "memsw.max_usage_in_bytes",
6076 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6084 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6077 .trigger = mem_cgroup_reset, 6085 .write = mem_cgroup_reset,
6078 .read_u64 = mem_cgroup_read_u64, 6086 .read_u64 = mem_cgroup_read_u64,
6079 }, 6087 },
6080 { 6088 {
6081 .name = "memsw.limit_in_bytes", 6089 .name = "memsw.limit_in_bytes",
6082 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6090 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6083 .write_string = mem_cgroup_write, 6091 .write = mem_cgroup_write,
6084 .read_u64 = mem_cgroup_read_u64, 6092 .read_u64 = mem_cgroup_read_u64,
6085 }, 6093 },
6086 { 6094 {
6087 .name = "memsw.failcnt", 6095 .name = "memsw.failcnt",
6088 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6096 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6089 .trigger = mem_cgroup_reset, 6097 .write = mem_cgroup_reset,
6090 .read_u64 = mem_cgroup_read_u64, 6098 .read_u64 = mem_cgroup_read_u64,
6091 }, 6099 },
6092 { }, /* terminate */ 6100 { }, /* terminate */
@@ -6264,9 +6272,9 @@ static int
6264mem_cgroup_css_online(struct cgroup_subsys_state *css) 6272mem_cgroup_css_online(struct cgroup_subsys_state *css)
6265{ 6273{
6266 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6274 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6267 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); 6275 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
6268 6276
6269 if (css->cgroup->id > MEM_CGROUP_ID_MAX) 6277 if (css->id > MEM_CGROUP_ID_MAX)
6270 return -ENOSPC; 6278 return -ENOSPC;
6271 6279
6272 if (!parent) 6280 if (!parent)
@@ -6361,7 +6369,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6361 /* 6369 /*
6362 * XXX: css_offline() would be where we should reparent all 6370 * XXX: css_offline() would be where we should reparent all
6363 * memory to prepare the cgroup for destruction. However, 6371 * memory to prepare the cgroup for destruction. However,
6364 * memcg does not do css_tryget() and res_counter charging 6372 * memcg does not do css_tryget_online() and res_counter charging
6365 * under the same RCU lock region, which means that charging 6373 * under the same RCU lock region, which means that charging
6366 * could race with offlining. Offlining only happens to 6374 * could race with offlining. Offlining only happens to
6367 * cgroups with no tasks in them but charges can show up 6375 * cgroups with no tasks in them but charges can show up
@@ -6375,9 +6383,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6375 * lookup_swap_cgroup_id() 6383 * lookup_swap_cgroup_id()
6376 * rcu_read_lock() 6384 * rcu_read_lock()
6377 * mem_cgroup_lookup() 6385 * mem_cgroup_lookup()
6378 * css_tryget() 6386 * css_tryget_online()
6379 * rcu_read_unlock() 6387 * rcu_read_unlock()
6380 * disable css_tryget() 6388 * disable css_tryget_online()
6381 * call_rcu() 6389 * call_rcu()
6382 * offline_css() 6390 * offline_css()
6383 * reparent_charges() 6391 * reparent_charges()
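
Every memcontrol.c handler converted above follows the same shape: the old .write_string and .trigger callbacks become a single .write callback that receives a struct kernfs_open_file, recovers its cftype and css through of_cft() and of_css(), strips the input buffer itself, and reports success by returning nbytes. A minimal sketch of that shape, using made-up "foo" names rather than symbols from the patch:

/*
 * Minimal sketch of a cftype write handler after this conversion.
 * foo_cgroup, foo_css(), FOO_LIMIT and foo_files are illustrative
 * names only; they are not symbols from this series.
 */
#include <linux/cgroup.h>
#include <linux/res_counter.h>
#include <linux/string.h>

struct foo_cgroup {
	struct cgroup_subsys_state css;
	struct res_counter res;
};

static inline struct foo_cgroup *foo_css(struct cgroup_subsys_state *css)
{
	return container_of(css, struct foo_cgroup, css);
}

enum { FOO_LIMIT };

static ssize_t foo_write(struct kernfs_open_file *of,
			 char *buf, size_t nbytes, loff_t off)
{
	struct foo_cgroup *foo = foo_css(of_css(of));	/* css via of_css() */
	unsigned long long val;
	int ret;

	buf = strstrip(buf);		/* handlers strip the buffer themselves */

	switch (of_cft(of)->private) {	/* cftype reached via of_cft() */
	case FOO_LIMIT:
		ret = res_counter_memparse_write_strategy(buf, &val);
		if (!ret)
			ret = res_counter_set_limit(&foo->res, val);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	/* errors propagate unchanged; success claims the whole buffer */
	return ret ?: nbytes;
}

static struct cftype foo_files[] = {
	{
		.name = "limit_in_bytes",
		.private = FOO_LIMIT,
		.write = foo_write,	/* replaces .write_string and .trigger */
	},
	{ }	/* terminate */
};
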
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 22931e1b99b4..30d903b19c62 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -42,7 +42,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
42static int cgrp_css_online(struct cgroup_subsys_state *css) 42static int cgrp_css_online(struct cgroup_subsys_state *css)
43{ 43{
44 struct cgroup_cls_state *cs = css_cls_state(css); 44 struct cgroup_cls_state *cs = css_cls_state(css);
45 struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); 45 struct cgroup_cls_state *parent = css_cls_state(css->parent);
46 46
47 if (parent) 47 if (parent)
48 cs->classid = parent->classid; 48 cs->classid = parent->classid;
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 3825f669147b..2f385b9bccc0 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -140,7 +140,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
140 140
141static int cgrp_css_online(struct cgroup_subsys_state *css) 141static int cgrp_css_online(struct cgroup_subsys_state *css)
142{ 142{
143 struct cgroup_subsys_state *parent_css = css_parent(css); 143 struct cgroup_subsys_state *parent_css = css->parent;
144 struct net_device *dev; 144 struct net_device *dev;
145 int ret = 0; 145 int ret = 0;
146 146
@@ -185,15 +185,15 @@ static int read_priomap(struct seq_file *sf, void *v)
185 return 0; 185 return 0;
186} 186}
187 187
188static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, 188static ssize_t write_priomap(struct kernfs_open_file *of,
189 char *buffer) 189 char *buf, size_t nbytes, loff_t off)
190{ 190{
191 char devname[IFNAMSIZ + 1]; 191 char devname[IFNAMSIZ + 1];
192 struct net_device *dev; 192 struct net_device *dev;
193 u32 prio; 193 u32 prio;
194 int ret; 194 int ret;
195 195
196 if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) 196 if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
197 return -EINVAL; 197 return -EINVAL;
198 198
199 dev = dev_get_by_name(&init_net, devname); 199 dev = dev_get_by_name(&init_net, devname);
@@ -202,11 +202,11 @@ static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
202 202
203 rtnl_lock(); 203 rtnl_lock();
204 204
205 ret = netprio_set_prio(css, dev, prio); 205 ret = netprio_set_prio(of_css(of), dev, prio);
206 206
207 rtnl_unlock(); 207 rtnl_unlock();
208 dev_put(dev); 208 dev_put(dev);
209 return ret; 209 return ret ?: nbytes;
210} 210}
211 211
212static int update_netprio(const void *v, struct file *file, unsigned n) 212static int update_netprio(const void *v, struct file *file, unsigned n)
@@ -239,7 +239,7 @@ static struct cftype ss_files[] = {
239 { 239 {
240 .name = "ifpriomap", 240 .name = "ifpriomap",
241 .seq_show = read_priomap, 241 .seq_show = read_priomap,
242 .write_string = write_priomap, 242 .write = write_priomap,
243 }, 243 },
244 { } /* terminate */ 244 { } /* terminate */
245}; 245};
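
The "return ret ?: nbytes;" pattern in write_priomap() and the other converted handlers is the GNU C conditional with an omitted middle operand: an error code is returned unchanged, while success reports the whole buffer as consumed. Spelled out without the extension, with an illustrative helper name:

/* Equivalent of "return ret ?: nbytes;" without the GNU ?: extension. */
static ssize_t my_write_result(int ret, size_t nbytes)
{
	if (ret)		/* negative errno from the handler */
		return ret;
	return nbytes;		/* success: the whole write was consumed */
}
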
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index d4f015ad6c84..f7a2ec3ac584 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -102,17 +102,19 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
102 return 0; 102 return 0;
103} 103}
104 104
105static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 105static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
106 char *buffer) 106 char *buf, size_t nbytes, loff_t off)
107{ 107{
108 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 108 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
109 unsigned long long val; 109 unsigned long long val;
110 int ret = 0; 110 int ret = 0;
111 111
112 switch (cft->private) { 112 buf = strstrip(buf);
113
114 switch (of_cft(of)->private) {
113 case RES_LIMIT: 115 case RES_LIMIT:
114 /* see memcontrol.c */ 116 /* see memcontrol.c */
115 ret = res_counter_memparse_write_strategy(buffer, &val); 117 ret = res_counter_memparse_write_strategy(buf, &val);
116 if (ret) 118 if (ret)
117 break; 119 break;
118 ret = tcp_update_limit(memcg, val); 120 ret = tcp_update_limit(memcg, val);
@@ -121,7 +123,7 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
121 ret = -EINVAL; 123 ret = -EINVAL;
122 break; 124 break;
123 } 125 }
124 return ret; 126 return ret ?: nbytes;
125} 127}
126 128
127static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) 129static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
@@ -168,17 +170,18 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
168 return val; 170 return val;
169} 171}
170 172
171static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) 173static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
174 char *buf, size_t nbytes, loff_t off)
172{ 175{
173 struct mem_cgroup *memcg; 176 struct mem_cgroup *memcg;
174 struct cg_proto *cg_proto; 177 struct cg_proto *cg_proto;
175 178
176 memcg = mem_cgroup_from_css(css); 179 memcg = mem_cgroup_from_css(of_css(of));
177 cg_proto = tcp_prot.proto_cgroup(memcg); 180 cg_proto = tcp_prot.proto_cgroup(memcg);
178 if (!cg_proto) 181 if (!cg_proto)
179 return 0; 182 return nbytes;
180 183
181 switch (event) { 184 switch (of_cft(of)->private) {
182 case RES_MAX_USAGE: 185 case RES_MAX_USAGE:
183 res_counter_reset_max(&cg_proto->memory_allocated); 186 res_counter_reset_max(&cg_proto->memory_allocated);
184 break; 187 break;
@@ -187,13 +190,13 @@ static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
187 break; 190 break;
188 } 191 }
189 192
190 return 0; 193 return nbytes;
191} 194}
192 195
193static struct cftype tcp_files[] = { 196static struct cftype tcp_files[] = {
194 { 197 {
195 .name = "kmem.tcp.limit_in_bytes", 198 .name = "kmem.tcp.limit_in_bytes",
196 .write_string = tcp_cgroup_write, 199 .write = tcp_cgroup_write,
197 .read_u64 = tcp_cgroup_read, 200 .read_u64 = tcp_cgroup_read,
198 .private = RES_LIMIT, 201 .private = RES_LIMIT,
199 }, 202 },
@@ -205,13 +208,13 @@ static struct cftype tcp_files[] = {
205 { 208 {
206 .name = "kmem.tcp.failcnt", 209 .name = "kmem.tcp.failcnt",
207 .private = RES_FAILCNT, 210 .private = RES_FAILCNT,
208 .trigger = tcp_cgroup_reset, 211 .write = tcp_cgroup_reset,
209 .read_u64 = tcp_cgroup_read, 212 .read_u64 = tcp_cgroup_read,
210 }, 213 },
211 { 214 {
212 .name = "kmem.tcp.max_usage_in_bytes", 215 .name = "kmem.tcp.max_usage_in_bytes",
213 .private = RES_MAX_USAGE, 216 .private = RES_MAX_USAGE,
214 .trigger = tcp_cgroup_reset, 217 .write = tcp_cgroup_reset,
215 .read_u64 = tcp_cgroup_read, 218 .read_u64 = tcp_cgroup_read,
216 }, 219 },
217 { } /* terminate */ 220 { } /* terminate */
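
tcp_cgroup_reset() and mem_cgroup_reset() show the second half of the conversion: the removed .trigger callback used to receive the cftype's private value as an unsigned int event argument and returned 0, whereas the replacement .write handler reads of_cft(of)->private itself and returns nbytes even when there is nothing to reset. A condensed sketch with made-up "bar" names, not symbols from the patch:

/*
 * Sketch of a reset-style file after the .trigger -> .write move.
 * bar_cgroup, bar_css(), BAR_MAX_USAGE and BAR_FAILCNT are
 * illustrative names only.
 */
struct bar_cgroup {
	struct cgroup_subsys_state css;
	struct res_counter res;
};

static inline struct bar_cgroup *bar_css(struct cgroup_subsys_state *css)
{
	return container_of(css, struct bar_cgroup, css);
}

enum { BAR_MAX_USAGE, BAR_FAILCNT };

static ssize_t bar_reset(struct kernfs_open_file *of,
			 char *buf, size_t nbytes, loff_t off)
{
	struct bar_cgroup *bar = bar_css(of_css(of));

	switch (of_cft(of)->private) {	/* formerly the "event" argument */
	case BAR_MAX_USAGE:
		res_counter_reset_max(&bar->res);
		break;
	case BAR_FAILCNT:
		res_counter_reset_failcnt(&bar->res);
		break;
	}
	return nbytes;			/* a reset always consumes the write */
}
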
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 9134dbf70d3e..d9d69e6930ed 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -182,7 +182,7 @@ static inline bool is_devcg_online(const struct dev_cgroup *devcg)
182static int devcgroup_online(struct cgroup_subsys_state *css) 182static int devcgroup_online(struct cgroup_subsys_state *css)
183{ 183{
184 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); 184 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);
185 struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(css)); 185 struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css->parent);
186 int ret = 0; 186 int ret = 0;
187 187
188 mutex_lock(&devcgroup_mutex); 188 mutex_lock(&devcgroup_mutex);
@@ -455,7 +455,7 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
455static int parent_has_perm(struct dev_cgroup *childcg, 455static int parent_has_perm(struct dev_cgroup *childcg,
456 struct dev_exception_item *ex) 456 struct dev_exception_item *ex)
457{ 457{
458 struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css)); 458 struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent);
459 459
460 if (!parent) 460 if (!parent)
461 return 1; 461 return 1;
@@ -476,7 +476,7 @@ static int parent_has_perm(struct dev_cgroup *childcg,
476static bool parent_allows_removal(struct dev_cgroup *childcg, 476static bool parent_allows_removal(struct dev_cgroup *childcg,
477 struct dev_exception_item *ex) 477 struct dev_exception_item *ex)
478{ 478{
479 struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css)); 479 struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent);
480 480
481 if (!parent) 481 if (!parent)
482 return true; 482 return true;
@@ -587,13 +587,6 @@ static int propagate_exception(struct dev_cgroup *devcg_root,
587 return rc; 587 return rc;
588} 588}
589 589
590static inline bool has_children(struct dev_cgroup *devcgroup)
591{
592 struct cgroup *cgrp = devcgroup->css.cgroup;
593
594 return !list_empty(&cgrp->children);
595}
596
597/* 590/*
598 * Modify the exception list using allow/deny rules. 591 * Modify the exception list using allow/deny rules.
599 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD 592 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD
@@ -614,7 +607,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
614 char temp[12]; /* 11 + 1 characters needed for a u32 */ 607 char temp[12]; /* 11 + 1 characters needed for a u32 */
615 int count, rc = 0; 608 int count, rc = 0;
616 struct dev_exception_item ex; 609 struct dev_exception_item ex;
617 struct dev_cgroup *parent = css_to_devcgroup(css_parent(&devcgroup->css)); 610 struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent);
618 611
619 if (!capable(CAP_SYS_ADMIN)) 612 if (!capable(CAP_SYS_ADMIN))
620 return -EPERM; 613 return -EPERM;
@@ -626,7 +619,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
626 case 'a': 619 case 'a':
627 switch (filetype) { 620 switch (filetype) {
628 case DEVCG_ALLOW: 621 case DEVCG_ALLOW:
629 if (has_children(devcgroup)) 622 if (css_has_online_children(&devcgroup->css))
630 return -EINVAL; 623 return -EINVAL;
631 624
632 if (!may_allow_all(parent)) 625 if (!may_allow_all(parent))
@@ -642,7 +635,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
642 return rc; 635 return rc;
643 break; 636 break;
644 case DEVCG_DENY: 637 case DEVCG_DENY:
645 if (has_children(devcgroup)) 638 if (css_has_online_children(&devcgroup->css))
646 return -EINVAL; 639 return -EINVAL;
647 640
648 dev_exception_clean(devcgroup); 641 dev_exception_clean(devcgroup);
@@ -767,27 +760,27 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
767 return rc; 760 return rc;
768} 761}
769 762
770static int devcgroup_access_write(struct cgroup_subsys_state *css, 763static ssize_t devcgroup_access_write(struct kernfs_open_file *of,
771 struct cftype *cft, char *buffer) 764 char *buf, size_t nbytes, loff_t off)
772{ 765{
773 int retval; 766 int retval;
774 767
775 mutex_lock(&devcgroup_mutex); 768 mutex_lock(&devcgroup_mutex);
776 retval = devcgroup_update_access(css_to_devcgroup(css), 769 retval = devcgroup_update_access(css_to_devcgroup(of_css(of)),
777 cft->private, buffer); 770 of_cft(of)->private, strstrip(buf));
778 mutex_unlock(&devcgroup_mutex); 771 mutex_unlock(&devcgroup_mutex);
779 return retval; 772 return retval ?: nbytes;
780} 773}
781 774
782static struct cftype dev_cgroup_files[] = { 775static struct cftype dev_cgroup_files[] = {
783 { 776 {
784 .name = "allow", 777 .name = "allow",
785 .write_string = devcgroup_access_write, 778 .write = devcgroup_access_write,
786 .private = DEVCG_ALLOW, 779 .private = DEVCG_ALLOW,
787 }, 780 },
788 { 781 {
789 .name = "deny", 782 .name = "deny",
790 .write_string = devcgroup_access_write, 783 .write = devcgroup_access_write,
791 .private = DEVCG_DENY, 784 .private = DEVCG_DENY,
792 }, 785 },
793 { 786 {
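
The device_cgroup hunks combine two themes of this series: controllers now read css->parent directly because css_parent() is removed, and "does this cgroup have children?" tests go through css_has_online_children() instead of peeking at the removed cgroup->children list. A condensed sketch using made-up "my_" helpers rather than functions from the patch:

/*
 * Illustrative helpers only; my_devcg_parent() and
 * my_devcg_may_switch_behavior() are not functions from this series.
 */
static struct dev_cgroup *my_devcg_parent(struct dev_cgroup *devcg)
{
	/* css_parent() is gone; the parent css is read directly */
	struct cgroup_subsys_state *p = devcg->css.parent;

	return p ? css_to_devcgroup(p) : NULL;	/* NULL for the root cgroup */
}

static bool my_devcg_may_switch_behavior(struct dev_cgroup *devcg)
{
	/*
	 * has_children() walked the removed cgroup->children list;
	 * css_has_online_children() walks the css's own child list and
	 * only counts children that have completed ->css_online().
	 */
	return !css_has_online_children(&devcg->css);
}
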