aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-12 11:18:24 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-12 11:18:24 -0500
commitd206e09036d6201f90b2719484c8a59526c46125 (patch)
tree84b9057919bcb8cfd1cff47baa5fc74457e77d6d
parentfef3ff2eb777e76cfa5ae67591982d902c17139c (diff)
parent15ef4ffaa797034d5ff82844daf8f595d7c6d53c (diff)
Merge branch 'for-3.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup changes from Tejun Heo: "A lot of activities on cgroup side. The big changes are focused on making cgroup hierarchy handling saner. - cgroup_rmdir() had peculiar semantics - it allowed cgroup destruction to be vetoed by individual controllers and tried to drain refcnt synchronously. The vetoing never worked properly and caused good deal of contortions in cgroup. memcg was the last reamining user. Michal Hocko removed the usage and cgroup_rmdir() path has been simplified significantly. This was done in a separate branch so that the memcg people can base further memcg changes on top. - The above allowed cleaning up cgroup lifecycle management and implementation of generic cgroup iterators which are used to improve hierarchy support. - cgroup_freezer updated to allow migration in and out of a frozen cgroup and handle hierarchy. If a cgroup is frozen, all descendant cgroups are frozen. - netcls_cgroup and netprio_cgroup updated to handle hierarchy properly. - Various fixes and cleanups. - Two merge commits. One to pull in memcg and rmdir cleanups (needed to build iterators). The other pulled in cgroup/for-3.7-fixes for device_cgroup fixes so that further device_cgroup patches can be stacked on top." Fixed up a trivial conflict in mm/memcontrol.c as per Tejun (due to commit bea8c150a7 ("memcg: fix hotplugged memory zone oops") in master touching code close to commit 2ef37d3fe4 ("memcg: Simplify mem_cgroup_force_empty_list error handling") in for-3.8) * 'for-3.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (65 commits) cgroup: update Documentation/cgroups/00-INDEX cgroup_rm_file: don't delete the uncreated files cgroup: remove subsystem files when remounting cgroup cgroup: use cgroup_addrm_files() in cgroup_clear_directory() cgroup: warn about broken hierarchies only after css_online cgroup: list_del_init() on removed events cgroup: fix lockdep warning for event_control cgroup: move list add after list head initilization netprio_cgroup: allow nesting and inherit config on cgroup creation netprio_cgroup: implement netprio[_set]_prio() helpers netprio_cgroup: use cgroup->id instead of cgroup_netprio_state->prioidx netprio_cgroup: reimplement priomap expansion netprio_cgroup: shorten variable names in extend_netdev_table() netprio_cgroup: simplify write_priomap() netcls_cgroup: move config inheritance to ->css_online() and remove .broken_hierarchy marking cgroup: remove obsolete guarantee from cgroup_task_migrate. cgroup: add cgroup->id cgroup, cpuset: remove cgroup_subsys->post_clone() cgroup: s/CGRP_CLONE_CHILDREN/CGRP_CPUSET_CLONE_CHILDREN/ cgroup: rename ->create/post_create/pre_destroy/destroy() to ->css_alloc/online/offline/free() ...
-rw-r--r--Documentation/cgroups/00-INDEX8
-rw-r--r--Documentation/cgroups/cgroups.txt61
-rw-r--r--Documentation/cgroups/freezer-subsystem.txt63
-rw-r--r--Documentation/cgroups/net_prio.txt2
-rw-r--r--block/blk-cgroup.c15
-rw-r--r--include/linux/cgroup.h167
-rw-r--r--include/linux/freezer.h57
-rw-r--r--include/net/netprio_cgroup.h11
-rw-r--r--kernel/cgroup.c754
-rw-r--r--kernel/cgroup_freezer.c514
-rw-r--r--kernel/cpuset.c90
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/freezer.c11
-rw-r--r--kernel/power/process.c13
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/signal.c20
-rw-r--r--mm/hugetlb_cgroup.c23
-rw-r--r--mm/memcontrol.c191
-rw-r--r--net/core/netprio_cgroup.c260
-rw-r--r--net/sched/cls_cgroup.c28
-rw-r--r--security/device_cgroup.c20
22 files changed, 1255 insertions, 1086 deletions
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX
index 3f58fa3d6d00..f78b90a35ad0 100644
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroups/00-INDEX
@@ -1,7 +1,11 @@
100-INDEX 100-INDEX
2 - this file 2 - this file
3blkio-controller.txt
4 - Description for Block IO Controller, implementation and usage details.
3cgroups.txt 5cgroups.txt
4 - Control Groups definition, implementation details, examples and API. 6 - Control Groups definition, implementation details, examples and API.
7cgroup_event_listener.c
8 - A user program for cgroup listener.
5cpuacct.txt 9cpuacct.txt
6 - CPU Accounting Controller; account CPU usage for groups of tasks. 10 - CPU Accounting Controller; account CPU usage for groups of tasks.
7cpusets.txt 11cpusets.txt
@@ -10,9 +14,13 @@ devices.txt
10 - Device Whitelist Controller; description, interface and security. 14 - Device Whitelist Controller; description, interface and security.
11freezer-subsystem.txt 15freezer-subsystem.txt
12 - checkpointing; rationale to not use signals, interface. 16 - checkpointing; rationale to not use signals, interface.
17hugetlb.txt
18 - HugeTLB Controller implementation and usage details.
13memcg_test.txt 19memcg_test.txt
14 - Memory Resource Controller; implementation details. 20 - Memory Resource Controller; implementation details.
15memory.txt 21memory.txt
16 - Memory Resource Controller; design, accounting, interface, testing. 22 - Memory Resource Controller; design, accounting, interface, testing.
23net_prio.txt
24 - Network priority cgroups details and usages.
17resource_counter.txt 25resource_counter.txt
18 - Resource Counter API. 26 - Resource Counter API.
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 9e04196c4d78..bcf1a00b06a1 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -299,11 +299,9 @@ a cgroup hierarchy's release_agent path is empty.
2991.5 What does clone_children do ? 2991.5 What does clone_children do ?
300--------------------------------- 300---------------------------------
301 301
302If the clone_children flag is enabled (1) in a cgroup, then all 302This flag only affects the cpuset controller. If the clone_children
303cgroups created beneath will call the post_clone callbacks for each 303flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
304subsystem of the newly created cgroup. Usually when this callback is 304configuration from the parent during initialization.
305implemented for a subsystem, it copies the values of the parent
306subsystem, this is the case for the cpuset.
307 305
3081.6 How do I use cgroups ? 3061.6 How do I use cgroups ?
309-------------------------- 307--------------------------
@@ -553,16 +551,16 @@ call to cgroup_unload_subsys(). It should also set its_subsys.module =
553THIS_MODULE in its .c file. 551THIS_MODULE in its .c file.
554 552
555Each subsystem may export the following methods. The only mandatory 553Each subsystem may export the following methods. The only mandatory
556methods are create/destroy. Any others that are null are presumed to 554methods are css_alloc/free. Any others that are null are presumed to
557be successful no-ops. 555be successful no-ops.
558 556
559struct cgroup_subsys_state *create(struct cgroup *cgrp) 557struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
560(cgroup_mutex held by caller) 558(cgroup_mutex held by caller)
561 559
562Called to create a subsystem state object for a cgroup. The 560Called to allocate a subsystem state object for a cgroup. The
563subsystem should allocate its subsystem state object for the passed 561subsystem should allocate its subsystem state object for the passed
564cgroup, returning a pointer to the new object on success or a 562cgroup, returning a pointer to the new object on success or a
565negative error code. On success, the subsystem pointer should point to 563ERR_PTR() value. On success, the subsystem pointer should point to
566a structure of type cgroup_subsys_state (typically embedded in a 564a structure of type cgroup_subsys_state (typically embedded in a
567larger subsystem-specific object), which will be initialized by the 565larger subsystem-specific object), which will be initialized by the
568cgroup system. Note that this will be called at initialization to 566cgroup system. Note that this will be called at initialization to
@@ -571,24 +569,33 @@ identified by the passed cgroup object having a NULL parent (since
571it's the root of the hierarchy) and may be an appropriate place for 569it's the root of the hierarchy) and may be an appropriate place for
572initialization code. 570initialization code.
573 571
574void destroy(struct cgroup *cgrp) 572int css_online(struct cgroup *cgrp)
575(cgroup_mutex held by caller) 573(cgroup_mutex held by caller)
576 574
577The cgroup system is about to destroy the passed cgroup; the subsystem 575Called after @cgrp successfully completed all allocations and made
578should do any necessary cleanup and free its subsystem state 576visible to cgroup_for_each_child/descendant_*() iterators. The
579object. By the time this method is called, the cgroup has already been 577subsystem may choose to fail creation by returning -errno. This
580unlinked from the file system and from the child list of its parent; 578callback can be used to implement reliable state sharing and
581cgroup->parent is still valid. (Note - can also be called for a 579propagation along the hierarchy. See the comment on
582newly-created cgroup if an error occurs after this subsystem's 580cgroup_for_each_descendant_pre() for details.
583create() method has been called for the new cgroup).
584 581
585int pre_destroy(struct cgroup *cgrp); 582void css_offline(struct cgroup *cgrp);
586 583
587Called before checking the reference count on each subsystem. This may 584This is the counterpart of css_online() and called iff css_online()
588be useful for subsystems which have some extra references even if 585has succeeded on @cgrp. This signifies the beginning of the end of
589there are not tasks in the cgroup. If pre_destroy() returns error code, 586@cgrp. @cgrp is being removed and the subsystem should start dropping
590rmdir() will fail with it. From this behavior, pre_destroy() can be 587all references it's holding on @cgrp. When all references are dropped,
591called multiple times against a cgroup. 588cgroup removal will proceed to the next step - css_free(). After this
589callback, @cgrp should be considered dead to the subsystem.
590
591void css_free(struct cgroup *cgrp)
592(cgroup_mutex held by caller)
593
594The cgroup system is about to free @cgrp; the subsystem should free
595its subsystem state object. By the time this method is called, @cgrp
596is completely unused; @cgrp->parent is still valid. (Note - can also
597be called for a newly-created cgroup if an error occurs after this
598subsystem's create() method has been called for the new cgroup).
592 599
593int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 600int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
594(cgroup_mutex held by caller) 601(cgroup_mutex held by caller)
@@ -635,14 +642,6 @@ void exit(struct task_struct *task)
635 642
636Called during task exit. 643Called during task exit.
637 644
638void post_clone(struct cgroup *cgrp)
639(cgroup_mutex held by caller)
640
641Called during cgroup_create() to do any parameter
642initialization which might be required before a task could attach. For
643example, in cpusets, no task may attach before 'cpus' and 'mems' are set
644up.
645
646void bind(struct cgroup *root) 645void bind(struct cgroup *root)
647(cgroup_mutex held by caller) 646(cgroup_mutex held by caller)
648 647
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt
index 7e62de1e59ff..c96a72cbb30a 100644
--- a/Documentation/cgroups/freezer-subsystem.txt
+++ b/Documentation/cgroups/freezer-subsystem.txt
@@ -49,13 +49,49 @@ prevent the freeze/unfreeze cycle from becoming visible to the tasks
49being frozen. This allows the bash example above and gdb to run as 49being frozen. This allows the bash example above and gdb to run as
50expected. 50expected.
51 51
52The freezer subsystem in the container filesystem defines a file named 52The cgroup freezer is hierarchical. Freezing a cgroup freezes all
53freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the 53tasks beloning to the cgroup and all its descendant cgroups. Each
54cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup. 54cgroup has its own state (self-state) and the state inherited from the
55Reading will return the current state. 55parent (parent-state). Iff both states are THAWED, the cgroup is
56THAWED.
56 57
57Note freezer.state doesn't exist in root cgroup, which means root cgroup 58The following cgroupfs files are created by cgroup freezer.
58is non-freezable. 59
60* freezer.state: Read-write.
61
62 When read, returns the effective state of the cgroup - "THAWED",
63 "FREEZING" or "FROZEN". This is the combined self and parent-states.
64 If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
65
66 FREEZING cgroup transitions into FROZEN state when all tasks
67 belonging to the cgroup and its descendants become frozen. Note that
68 a cgroup reverts to FREEZING from FROZEN after a new task is added
69 to the cgroup or one of its descendant cgroups until the new task is
70 frozen.
71
72 When written, sets the self-state of the cgroup. Two values are
73 allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
74 if not already freezing, enters FREEZING state along with all its
75 descendant cgroups.
76
77 If THAWED is written, the self-state of the cgroup is changed to
78 THAWED. Note that the effective state may not change to THAWED if
79 the parent-state is still freezing. If a cgroup's effective state
80 becomes THAWED, all its descendants which are freezing because of
81 the cgroup also leave the freezing state.
82
83* freezer.self_freezing: Read only.
84
85 Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
86 This value is 1 iff the last write to freezer.state was "FROZEN".
87
88* freezer.parent_freezing: Read only.
89
90 Shows the parent-state. 0 if none of the cgroup's ancestors is
91 frozen; otherwise, 1.
92
93The root cgroup is non-freezable and the above interface files don't
94exist.
59 95
60* Examples of usage : 96* Examples of usage :
61 97
@@ -85,18 +121,3 @@ to unfreeze all tasks in the container :
85 121
86This is the basic mechanism which should do the right thing for user space task 122This is the basic mechanism which should do the right thing for user space task
87in a simple scenario. 123in a simple scenario.
88
89It's important to note that freezing can be incomplete. In that case we return
90EBUSY. This means that some tasks in the cgroup are busy doing something that
91prevents us from completely freezing the cgroup at this time. After EBUSY,
92the cgroup will remain partially frozen -- reflected by freezer.state reporting
93"FREEZING" when read. The state will remain "FREEZING" until one of these
94things happens:
95
96 1) Userspace cancels the freezing operation by writing "THAWED" to
97 the freezer.state file
98 2) Userspace retries the freezing operation by writing "FROZEN" to
99 the freezer.state file (writing "FREEZING" is not legal
100 and returns EINVAL)
101 3) The tasks that blocked the cgroup from entering the "FROZEN"
102 state disappear from the cgroup's set of tasks.
diff --git a/Documentation/cgroups/net_prio.txt b/Documentation/cgroups/net_prio.txt
index 01b322635591..a82cbd28ea8a 100644
--- a/Documentation/cgroups/net_prio.txt
+++ b/Documentation/cgroups/net_prio.txt
@@ -51,3 +51,5 @@ One usage for the net_prio cgroup is with mqprio qdisc allowing application
51traffic to be steered to hardware/driver based traffic classes. These mappings 51traffic to be steered to hardware/driver based traffic classes. These mappings
52can then be managed by administrators or other networking protocols such as 52can then be managed by administrators or other networking protocols such as
53DCBX. 53DCBX.
54
55A new net_prio cgroup inherits the parent's configuration.
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d0b770391ad4..3f6d39d23bb6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -600,7 +600,7 @@ struct cftype blkcg_files[] = {
600}; 600};
601 601
602/** 602/**
603 * blkcg_pre_destroy - cgroup pre_destroy callback 603 * blkcg_css_offline - cgroup css_offline callback
604 * @cgroup: cgroup of interest 604 * @cgroup: cgroup of interest
605 * 605 *
606 * This function is called when @cgroup is about to go away and responsible 606 * This function is called when @cgroup is about to go away and responsible
@@ -610,7 +610,7 @@ struct cftype blkcg_files[] = {
610 * 610 *
611 * This is the blkcg counterpart of ioc_release_fn(). 611 * This is the blkcg counterpart of ioc_release_fn().
612 */ 612 */
613static int blkcg_pre_destroy(struct cgroup *cgroup) 613static void blkcg_css_offline(struct cgroup *cgroup)
614{ 614{
615 struct blkcg *blkcg = cgroup_to_blkcg(cgroup); 615 struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
616 616
@@ -632,10 +632,9 @@ static int blkcg_pre_destroy(struct cgroup *cgroup)
632 } 632 }
633 633
634 spin_unlock_irq(&blkcg->lock); 634 spin_unlock_irq(&blkcg->lock);
635 return 0;
636} 635}
637 636
638static void blkcg_destroy(struct cgroup *cgroup) 637static void blkcg_css_free(struct cgroup *cgroup)
639{ 638{
640 struct blkcg *blkcg = cgroup_to_blkcg(cgroup); 639 struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
641 640
@@ -643,7 +642,7 @@ static void blkcg_destroy(struct cgroup *cgroup)
643 kfree(blkcg); 642 kfree(blkcg);
644} 643}
645 644
646static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup) 645static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
647{ 646{
648 static atomic64_t id_seq = ATOMIC64_INIT(0); 647 static atomic64_t id_seq = ATOMIC64_INIT(0);
649 struct blkcg *blkcg; 648 struct blkcg *blkcg;
@@ -740,10 +739,10 @@ static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
740 739
741struct cgroup_subsys blkio_subsys = { 740struct cgroup_subsys blkio_subsys = {
742 .name = "blkio", 741 .name = "blkio",
743 .create = blkcg_create, 742 .css_alloc = blkcg_css_alloc,
743 .css_offline = blkcg_css_offline,
744 .css_free = blkcg_css_free,
744 .can_attach = blkcg_can_attach, 745 .can_attach = blkcg_can_attach,
745 .pre_destroy = blkcg_pre_destroy,
746 .destroy = blkcg_destroy,
747 .subsys_id = blkio_subsys_id, 746 .subsys_id = blkio_subsys_id,
748 .base_cftypes = blkcg_files, 747 .base_cftypes = blkcg_files,
749 .module = THIS_MODULE, 748 .module = THIS_MODULE,
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f8a030ced0c7..7d73905dcba2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -12,6 +12,7 @@
12#include <linux/cpumask.h> 12#include <linux/cpumask.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/rcupdate.h> 14#include <linux/rcupdate.h>
15#include <linux/rculist.h>
15#include <linux/cgroupstats.h> 16#include <linux/cgroupstats.h>
16#include <linux/prio_heap.h> 17#include <linux/prio_heap.h>
17#include <linux/rwsem.h> 18#include <linux/rwsem.h>
@@ -34,7 +35,6 @@ extern int cgroup_lock_is_held(void);
34extern bool cgroup_lock_live_group(struct cgroup *cgrp); 35extern bool cgroup_lock_live_group(struct cgroup *cgrp);
35extern void cgroup_unlock(void); 36extern void cgroup_unlock(void);
36extern void cgroup_fork(struct task_struct *p); 37extern void cgroup_fork(struct task_struct *p);
37extern void cgroup_fork_callbacks(struct task_struct *p);
38extern void cgroup_post_fork(struct task_struct *p); 38extern void cgroup_post_fork(struct task_struct *p);
39extern void cgroup_exit(struct task_struct *p, int run_callbacks); 39extern void cgroup_exit(struct task_struct *p, int run_callbacks);
40extern int cgroupstats_build(struct cgroupstats *stats, 40extern int cgroupstats_build(struct cgroupstats *stats,
@@ -66,7 +66,7 @@ struct cgroup_subsys_state {
66 /* 66 /*
67 * State maintained by the cgroup system to allow subsystems 67 * State maintained by the cgroup system to allow subsystems
68 * to be "busy". Should be accessed via css_get(), 68 * to be "busy". Should be accessed via css_get(),
69 * css_tryget() and and css_put(). 69 * css_tryget() and css_put().
70 */ 70 */
71 71
72 atomic_t refcnt; 72 atomic_t refcnt;
@@ -81,9 +81,8 @@ struct cgroup_subsys_state {
81 81
82/* bits in struct cgroup_subsys_state flags field */ 82/* bits in struct cgroup_subsys_state flags field */
83enum { 83enum {
84 CSS_ROOT, /* This CSS is the root of the subsystem */ 84 CSS_ROOT = (1 << 0), /* this CSS is the root of the subsystem */
85 CSS_REMOVED, /* This CSS is dead */ 85 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
86 CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */
87}; 86};
88 87
89/* Caller must verify that the css is not for root cgroup */ 88/* Caller must verify that the css is not for root cgroup */
@@ -102,15 +101,10 @@ static inline void __css_get(struct cgroup_subsys_state *css, int count)
102static inline void css_get(struct cgroup_subsys_state *css) 101static inline void css_get(struct cgroup_subsys_state *css)
103{ 102{
104 /* We don't need to reference count the root state */ 103 /* We don't need to reference count the root state */
105 if (!test_bit(CSS_ROOT, &css->flags)) 104 if (!(css->flags & CSS_ROOT))
106 __css_get(css, 1); 105 __css_get(css, 1);
107} 106}
108 107
109static inline bool css_is_removed(struct cgroup_subsys_state *css)
110{
111 return test_bit(CSS_REMOVED, &css->flags);
112}
113
114/* 108/*
115 * Call css_tryget() to take a reference on a css if your existing 109 * Call css_tryget() to take a reference on a css if your existing
116 * (known-valid) reference isn't already ref-counted. Returns false if 110 * (known-valid) reference isn't already ref-counted. Returns false if
@@ -120,7 +114,7 @@ static inline bool css_is_removed(struct cgroup_subsys_state *css)
120extern bool __css_tryget(struct cgroup_subsys_state *css); 114extern bool __css_tryget(struct cgroup_subsys_state *css);
121static inline bool css_tryget(struct cgroup_subsys_state *css) 115static inline bool css_tryget(struct cgroup_subsys_state *css)
122{ 116{
123 if (test_bit(CSS_ROOT, &css->flags)) 117 if (css->flags & CSS_ROOT)
124 return true; 118 return true;
125 return __css_tryget(css); 119 return __css_tryget(css);
126} 120}
@@ -133,7 +127,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
133extern void __css_put(struct cgroup_subsys_state *css); 127extern void __css_put(struct cgroup_subsys_state *css);
134static inline void css_put(struct cgroup_subsys_state *css) 128static inline void css_put(struct cgroup_subsys_state *css)
135{ 129{
136 if (!test_bit(CSS_ROOT, &css->flags)) 130 if (!(css->flags & CSS_ROOT))
137 __css_put(css); 131 __css_put(css);
138} 132}
139 133
@@ -149,13 +143,11 @@ enum {
149 /* Control Group requires release notifications to userspace */ 143 /* Control Group requires release notifications to userspace */
150 CGRP_NOTIFY_ON_RELEASE, 144 CGRP_NOTIFY_ON_RELEASE,
151 /* 145 /*
152 * A thread in rmdir() is wating for this cgroup. 146 * Clone the parent's configuration when creating a new child
153 */ 147 * cpuset cgroup. For historical reasons, this option can be
154 CGRP_WAIT_ON_RMDIR, 148 * specified at mount time and thus is implemented here.
155 /*
156 * Clone cgroup values when creating a new child cgroup
157 */ 149 */
158 CGRP_CLONE_CHILDREN, 150 CGRP_CPUSET_CLONE_CHILDREN,
159}; 151};
160 152
161struct cgroup { 153struct cgroup {
@@ -167,6 +159,8 @@ struct cgroup {
167 */ 159 */
168 atomic_t count; 160 atomic_t count;
169 161
162 int id; /* ida allocated in-hierarchy ID */
163
170 /* 164 /*
171 * We link our 'sibling' struct into our parent's 'children'. 165 * We link our 'sibling' struct into our parent's 'children'.
172 * Our children link their 'sibling' into our 'children'. 166 * Our children link their 'sibling' into our 'children'.
@@ -176,7 +170,7 @@ struct cgroup {
176 struct list_head files; /* my files */ 170 struct list_head files; /* my files */
177 171
178 struct cgroup *parent; /* my parent */ 172 struct cgroup *parent; /* my parent */
179 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ 173 struct dentry *dentry; /* cgroup fs entry, RCU protected */
180 174
181 /* Private pointers for each registered subsystem */ 175 /* Private pointers for each registered subsystem */
182 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 176 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
@@ -282,7 +276,7 @@ struct cgroup_map_cb {
282 276
283/* cftype->flags */ 277/* cftype->flags */
284#define CFTYPE_ONLY_ON_ROOT (1U << 0) /* only create on root cg */ 278#define CFTYPE_ONLY_ON_ROOT (1U << 0) /* only create on root cg */
285#define CFTYPE_NOT_ON_ROOT (1U << 1) /* don't create onp root cg */ 279#define CFTYPE_NOT_ON_ROOT (1U << 1) /* don't create on root cg */
286 280
287#define MAX_CFTYPE_NAME 64 281#define MAX_CFTYPE_NAME 64
288 282
@@ -422,23 +416,6 @@ int cgroup_task_count(const struct cgroup *cgrp);
422int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); 416int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
423 417
424/* 418/*
425 * When the subsys has to access css and may add permanent refcnt to css,
426 * it should take care of racy conditions with rmdir(). Following set of
427 * functions, is for stop/restart rmdir if necessary.
428 * Because these will call css_get/put, "css" should be alive css.
429 *
430 * cgroup_exclude_rmdir();
431 * ...do some jobs which may access arbitrary empty cgroup
432 * cgroup_release_and_wakeup_rmdir();
433 *
434 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
435 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
436 */
437
438void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
439void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
440
441/*
442 * Control Group taskset, used to pass around set of tasks to cgroup_subsys 419 * Control Group taskset, used to pass around set of tasks to cgroup_subsys
443 * methods. 420 * methods.
444 */ 421 */
@@ -466,16 +443,17 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
466 */ 443 */
467 444
468struct cgroup_subsys { 445struct cgroup_subsys {
469 struct cgroup_subsys_state *(*create)(struct cgroup *cgrp); 446 struct cgroup_subsys_state *(*css_alloc)(struct cgroup *cgrp);
470 int (*pre_destroy)(struct cgroup *cgrp); 447 int (*css_online)(struct cgroup *cgrp);
471 void (*destroy)(struct cgroup *cgrp); 448 void (*css_offline)(struct cgroup *cgrp);
449 void (*css_free)(struct cgroup *cgrp);
450
472 int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 451 int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
473 void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 452 void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
474 void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 453 void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
475 void (*fork)(struct task_struct *task); 454 void (*fork)(struct task_struct *task);
476 void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, 455 void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
477 struct task_struct *task); 456 struct task_struct *task);
478 void (*post_clone)(struct cgroup *cgrp);
479 void (*bind)(struct cgroup *root); 457 void (*bind)(struct cgroup *root);
480 458
481 int subsys_id; 459 int subsys_id;
@@ -489,17 +467,6 @@ struct cgroup_subsys {
489 bool use_id; 467 bool use_id;
490 468
491 /* 469 /*
492 * If %true, cgroup removal will try to clear css refs by retrying
493 * ss->pre_destroy() until there's no css ref left. This behavior
494 * is strictly for backward compatibility and will be removed as
495 * soon as the current user (memcg) is updated.
496 *
497 * If %false, ss->pre_destroy() can't fail and cgroup removal won't
498 * wait for css refs to drop to zero before proceeding.
499 */
500 bool __DEPRECATED_clear_css_refs;
501
502 /*
503 * If %false, this subsystem is properly hierarchical - 470 * If %false, this subsystem is properly hierarchical -
504 * configuration, resource accounting and restriction on a parent 471 * configuration, resource accounting and restriction on a parent
505 * cgroup cover those of its children. If %true, hierarchy support 472 * cgroup cover those of its children. If %true, hierarchy support
@@ -572,6 +539,100 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
572 return task_subsys_state(task, subsys_id)->cgroup; 539 return task_subsys_state(task, subsys_id)->cgroup;
573} 540}
574 541
542/**
543 * cgroup_for_each_child - iterate through children of a cgroup
544 * @pos: the cgroup * to use as the loop cursor
545 * @cgroup: cgroup whose children to walk
546 *
547 * Walk @cgroup's children. Must be called under rcu_read_lock(). A child
548 * cgroup which hasn't finished ->css_online() or already has finished
549 * ->css_offline() may show up during traversal and it's each subsystem's
550 * responsibility to verify that each @pos is alive.
551 *
552 * If a subsystem synchronizes against the parent in its ->css_online() and
553 * before starting iterating, a cgroup which finished ->css_online() is
554 * guaranteed to be visible in the future iterations.
555 */
556#define cgroup_for_each_child(pos, cgroup) \
557 list_for_each_entry_rcu(pos, &(cgroup)->children, sibling)
558
559struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
560 struct cgroup *cgroup);
561
562/**
563 * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
564 * @pos: the cgroup * to use as the loop cursor
565 * @cgroup: cgroup whose descendants to walk
566 *
567 * Walk @cgroup's descendants. Must be called under rcu_read_lock(). A
568 * descendant cgroup which hasn't finished ->css_online() or already has
569 * finished ->css_offline() may show up during traversal and it's each
570 * subsystem's responsibility to verify that each @pos is alive.
571 *
572 * If a subsystem synchronizes against the parent in its ->css_online() and
573 * before starting iterating, and synchronizes against @pos on each
574 * iteration, any descendant cgroup which finished ->css_offline() is
575 * guaranteed to be visible in the future iterations.
576 *
577 * In other words, the following guarantees that a descendant can't escape
578 * state updates of its ancestors.
579 *
580 * my_online(@cgrp)
581 * {
582 * Lock @cgrp->parent and @cgrp;
583 * Inherit state from @cgrp->parent;
584 * Unlock both.
585 * }
586 *
587 * my_update_state(@cgrp)
588 * {
589 * Lock @cgrp;
590 * Update @cgrp's state;
591 * Unlock @cgrp;
592 *
593 * cgroup_for_each_descendant_pre(@pos, @cgrp) {
594 * Lock @pos;
595 * Verify @pos is alive and inherit state from @pos->parent;
596 * Unlock @pos;
597 * }
598 * }
599 *
600 * As long as the inheriting step, including checking the parent state, is
601 * enclosed inside @pos locking, double-locking the parent isn't necessary
602 * while inheriting. The state update to the parent is guaranteed to be
603 * visible by walking order and, as long as inheriting operations to the
604 * same @pos are atomic to each other, multiple updates racing each other
605 * still result in the correct state. It's guaranateed that at least one
606 * inheritance happens for any cgroup after the latest update to its
607 * parent.
608 *
609 * If checking parent's state requires locking the parent, each inheriting
610 * iteration should lock and unlock both @pos->parent and @pos.
611 *
612 * Alternatively, a subsystem may choose to use a single global lock to
613 * synchronize ->css_online() and ->css_offline() against tree-walking
614 * operations.
615 */
616#define cgroup_for_each_descendant_pre(pos, cgroup) \
617 for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \
618 pos = cgroup_next_descendant_pre((pos), (cgroup)))
619
620struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
621 struct cgroup *cgroup);
622
623/**
624 * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants
625 * @pos: the cgroup * to use as the loop cursor
626 * @cgroup: cgroup whose descendants to walk
627 *
628 * Similar to cgroup_for_each_descendant_pre() but performs post-order
629 * traversal instead. Note that the walk visibility guarantee described in
630 * pre-order walk doesn't apply the same to post-order walks.
631 */
632#define cgroup_for_each_descendant_post(pos, cgroup) \
633 for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos); \
634 pos = cgroup_next_descendant_post((pos), (cgroup)))
635
575/* A cgroup_iter should be treated as an opaque object */ 636/* A cgroup_iter should be treated as an opaque object */
576struct cgroup_iter { 637struct cgroup_iter {
577 struct list_head *cg_link; 638 struct list_head *cg_link;
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index b90091af5798..e4238ceaa4d6 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -75,35 +75,68 @@ static inline bool cgroup_freezing(struct task_struct *task)
75 */ 75 */
76 76
77 77
78/* Tell the freezer not to count the current task as freezable. */ 78/**
79 * freezer_do_not_count - tell freezer to ignore %current
80 *
81 * Tell freezers to ignore the current task when determining whether the
82 * target frozen state is reached. IOW, the current task will be
83 * considered frozen enough by freezers.
84 *
85 * The caller shouldn't do anything which isn't allowed for a frozen task
86 * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair
87 * wrap a scheduling operation and nothing much else.
88 */
79static inline void freezer_do_not_count(void) 89static inline void freezer_do_not_count(void)
80{ 90{
81 current->flags |= PF_FREEZER_SKIP; 91 current->flags |= PF_FREEZER_SKIP;
82} 92}
83 93
84/* 94/**
85 * Tell the freezer to count the current task as freezable again and try to 95 * freezer_count - tell freezer to stop ignoring %current
86 * freeze it. 96 *
97 * Undo freezer_do_not_count(). It tells freezers that %current should be
98 * considered again and tries to freeze if freezing condition is already in
99 * effect.
87 */ 100 */
88static inline void freezer_count(void) 101static inline void freezer_count(void)
89{ 102{
90 current->flags &= ~PF_FREEZER_SKIP; 103 current->flags &= ~PF_FREEZER_SKIP;
104 /*
105 * If freezing is in progress, the following paired with smp_mb()
106 * in freezer_should_skip() ensures that either we see %true
107 * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP.
108 */
109 smp_mb();
91 try_to_freeze(); 110 try_to_freeze();
92} 111}
93 112
94/* 113/**
95 * Check if the task should be counted as freezable by the freezer 114 * freezer_should_skip - whether to skip a task when determining frozen
115 * state is reached
116 * @p: task in quesion
117 *
118 * This function is used by freezers after establishing %true freezing() to
119 * test whether a task should be skipped when determining the target frozen
120 * state is reached. IOW, if this function returns %true, @p is considered
121 * frozen enough.
96 */ 122 */
97static inline int freezer_should_skip(struct task_struct *p) 123static inline bool freezer_should_skip(struct task_struct *p)
98{ 124{
99 return !!(p->flags & PF_FREEZER_SKIP); 125 /*
126 * The following smp_mb() paired with the one in freezer_count()
127 * ensures that either freezer_count() sees %true freezing() or we
128 * see cleared %PF_FREEZER_SKIP and return %false. This makes it
129 * impossible for a task to slip frozen state testing after
130 * clearing %PF_FREEZER_SKIP.
131 */
132 smp_mb();
133 return p->flags & PF_FREEZER_SKIP;
100} 134}
101 135
102/* 136/*
103 * These macros are intended to be used whenever you want allow a task that's 137 * These macros are intended to be used whenever you want allow a sleeping
104 * sleeping in TASK_UNINTERRUPTIBLE or TASK_KILLABLE state to be frozen. Note 138 * task to be frozen. Note that neither return any clear indication of
105 * that neither return any clear indication of whether a freeze event happened 139 * whether a freeze event happened while in this function.
106 * while in this function.
107 */ 140 */
108 141
109/* Like schedule(), but should not block the freezer. */ 142/* Like schedule(), but should not block the freezer. */
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index 2760f4f4ae9b..1d04b6f0fbd4 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -27,7 +27,6 @@ struct netprio_map {
27 27
28struct cgroup_netprio_state { 28struct cgroup_netprio_state {
29 struct cgroup_subsys_state css; 29 struct cgroup_subsys_state css;
30 u32 prioidx;
31}; 30};
32 31
33extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task); 32extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task);
@@ -36,13 +35,12 @@ extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task);
36 35
37static inline u32 task_netprioidx(struct task_struct *p) 36static inline u32 task_netprioidx(struct task_struct *p)
38{ 37{
39 struct cgroup_netprio_state *state; 38 struct cgroup_subsys_state *css;
40 u32 idx; 39 u32 idx;
41 40
42 rcu_read_lock(); 41 rcu_read_lock();
43 state = container_of(task_subsys_state(p, net_prio_subsys_id), 42 css = task_subsys_state(p, net_prio_subsys_id);
44 struct cgroup_netprio_state, css); 43 idx = css->cgroup->id;
45 idx = state->prioidx;
46 rcu_read_unlock(); 44 rcu_read_unlock();
47 return idx; 45 return idx;
48} 46}
@@ -57,8 +55,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
57 rcu_read_lock(); 55 rcu_read_lock();
58 css = task_subsys_state(p, net_prio_subsys_id); 56 css = task_subsys_state(p, net_prio_subsys_id);
59 if (css) 57 if (css)
60 idx = container_of(css, 58 idx = css->cgroup->id;
61 struct cgroup_netprio_state, css)->prioidx;
62 rcu_read_unlock(); 59 rcu_read_unlock();
63 return idx; 60 return idx;
64} 61}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620dd..f34c41bfaa37 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
138 /* Hierarchy-specific flags */ 138 /* Hierarchy-specific flags */
139 unsigned long flags; 139 unsigned long flags;
140 140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
141 /* The path to use for release notifications. */ 144 /* The path to use for release notifications. */
142 char release_agent_path[PATH_MAX]; 145 char release_agent_path[PATH_MAX];
143 146
@@ -171,8 +174,8 @@ struct css_id {
171 * The css to which this ID points. This pointer is set to valid value 174 * The css to which this ID points. This pointer is set to valid value
172 * after cgroup is populated. If cgroup is removed, this will be NULL. 175 * after cgroup is populated. If cgroup is removed, this will be NULL.
173 * This pointer is expected to be RCU-safe because destroy() 176 * This pointer is expected to be RCU-safe because destroy()
174 * is called after synchronize_rcu(). But for safe use, css_is_removed() 177 * is called after synchronize_rcu(). But for safe use, css_tryget()
175 * css_tryget() should be used for avoiding race. 178 * should be used for avoiding race.
176 */ 179 */
177 struct cgroup_subsys_state __rcu *css; 180 struct cgroup_subsys_state __rcu *css;
178 /* 181 /*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
242 */ 245 */
243static int need_forkexit_callback __read_mostly; 246static int need_forkexit_callback __read_mostly;
244 247
248static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add);
251
245#ifdef CONFIG_PROVE_LOCKING 252#ifdef CONFIG_PROVE_LOCKING
246int cgroup_lock_is_held(void) 253int cgroup_lock_is_held(void)
247{ 254{
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
294 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 301 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
295} 302}
296 303
297static int clone_children(const struct cgroup *cgrp)
298{
299 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
300}
301
302/* 304/*
303 * for_each_subsys() allows you to iterate on each subsystem attached to 305 * for_each_subsys() allows you to iterate on each subsystem attached to
304 * an active hierarchy 306 * an active hierarchy
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
782 * The task_lock() exception 784 * The task_lock() exception
783 * 785 *
784 * The need for this exception arises from the action of 786 * The need for this exception arises from the action of
785 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with 787 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
786 * another. It does so using cgroup_mutex, however there are 788 * another. It does so using cgroup_mutex, however there are
787 * several performance critical places that need to reference 789 * several performance critical places that need to reference
788 * task->cgroup without the expense of grabbing a system global 790 * task->cgroup without the expense of grabbing a system global
789 * mutex. Therefore except as noted below, when dereferencing or, as 791 * mutex. Therefore except as noted below, when dereferencing or, as
790 * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use 792 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
791 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 793 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
792 * the task_struct routinely used for such matters. 794 * the task_struct routinely used for such matters.
793 * 795 *
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
854 return inode; 856 return inode;
855} 857}
856 858
857/*
858 * Call subsys's pre_destroy handler.
859 * This is called before css refcnt check.
860 */
861static int cgroup_call_pre_destroy(struct cgroup *cgrp)
862{
863 struct cgroup_subsys *ss;
864 int ret = 0;
865
866 for_each_subsys(cgrp->root, ss) {
867 if (!ss->pre_destroy)
868 continue;
869
870 ret = ss->pre_destroy(cgrp);
871 if (ret) {
872 /* ->pre_destroy() failure is being deprecated */
873 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
874 break;
875 }
876 }
877
878 return ret;
879}
880
881static void cgroup_diput(struct dentry *dentry, struct inode *inode) 859static void cgroup_diput(struct dentry *dentry, struct inode *inode)
882{ 860{
883 /* is dentry a directory ? if so, kfree() associated cgroup */ 861 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
898 * Release the subsystem state objects. 876 * Release the subsystem state objects.
899 */ 877 */
900 for_each_subsys(cgrp->root, ss) 878 for_each_subsys(cgrp->root, ss)
901 ss->destroy(cgrp); 879 ss->css_free(cgrp);
902 880
903 cgrp->root->number_of_cgroups--; 881 cgrp->root->number_of_cgroups--;
904 mutex_unlock(&cgroup_mutex); 882 mutex_unlock(&cgroup_mutex);
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
917 895
918 simple_xattrs_free(&cgrp->xattrs); 896 simple_xattrs_free(&cgrp->xattrs);
919 897
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
920 kfree_rcu(cgrp, rcu_head); 899 kfree_rcu(cgrp, rcu_head);
921 } else { 900 } else {
922 struct cfent *cfe = __d_cfe(dentry); 901 struct cfent *cfe = __d_cfe(dentry);
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
987 if (!test_bit(ss->subsys_id, &subsys_mask)) 966 if (!test_bit(ss->subsys_id, &subsys_mask))
988 continue; 967 continue;
989 list_for_each_entry(set, &ss->cftsets, node) 968 list_for_each_entry(set, &ss->cftsets, node)
990 cgroup_rm_file(cgrp, set->cfts); 969 cgroup_addrm_files(cgrp, NULL, set->cfts, false);
991 } 970 }
992 if (base_files) { 971 if (base_files) {
993 while (!list_empty(&cgrp->files)) 972 while (!list_empty(&cgrp->files))
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
1015} 994}
1016 995
1017/* 996/*
1018 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
1019 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
1020 * reference to css->refcnt. In general, this refcnt is expected to goes down
1021 * to zero, soon.
1022 *
1023 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
1024 */
1025static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
1026
1027static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
1028{
1029 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
1030 wake_up_all(&cgroup_rmdir_waitq);
1031}
1032
1033void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
1034{
1035 css_get(css);
1036}
1037
1038void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1039{
1040 cgroup_wakeup_rmdir_waiter(css->cgroup);
1041 css_put(css);
1042}
1043
1044/*
1045 * Call with cgroup_mutex held. Drops reference counts on modules, including 997 * Call with cgroup_mutex held. Drops reference counts on modules, including
1046 * any duplicate ones that parse_cgroupfs_options took. If this function 998 * any duplicate ones that parse_cgroupfs_options took. If this function
1047 * returns an error, no reference counts are touched. 999 * returns an error, no reference counts are touched.
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1150 seq_puts(seq, ",xattr"); 1102 seq_puts(seq, ",xattr");
1151 if (strlen(root->release_agent_path)) 1103 if (strlen(root->release_agent_path))
1152 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1104 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1153 if (clone_children(&root->top_cgroup)) 1105 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
1154 seq_puts(seq, ",clone_children"); 1106 seq_puts(seq, ",clone_children");
1155 if (strlen(root->name)) 1107 if (strlen(root->name))
1156 seq_printf(seq, ",name=%s", root->name); 1108 seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts {
1162 unsigned long subsys_mask; 1114 unsigned long subsys_mask;
1163 unsigned long flags; 1115 unsigned long flags;
1164 char *release_agent; 1116 char *release_agent;
1165 bool clone_children; 1117 bool cpuset_clone_children;
1166 char *name; 1118 char *name;
1167 /* User explicitly requested empty subsystem */ 1119 /* User explicitly requested empty subsystem */
1168 bool none; 1120 bool none;
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1213 continue; 1165 continue;
1214 } 1166 }
1215 if (!strcmp(token, "clone_children")) { 1167 if (!strcmp(token, "clone_children")) {
1216 opts->clone_children = true; 1168 opts->cpuset_clone_children = true;
1217 continue; 1169 continue;
1218 } 1170 }
1219 if (!strcmp(token, "xattr")) { 1171 if (!strcmp(token, "xattr")) {
@@ -1397,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1397 goto out_unlock; 1349 goto out_unlock;
1398 } 1350 }
1399 1351
1352 /*
1353 * Clear out the files of subsystems that should be removed, do
1354 * this before rebind_subsystems, since rebind_subsystems may
1355 * change this hierarchy's subsys_list.
1356 */
1357 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1358
1400 ret = rebind_subsystems(root, opts.subsys_mask); 1359 ret = rebind_subsystems(root, opts.subsys_mask);
1401 if (ret) { 1360 if (ret) {
1361 /* rebind_subsystems failed, re-populate the removed files */
1362 cgroup_populate_dir(cgrp, false, removed_mask);
1402 drop_parsed_module_refcounts(opts.subsys_mask); 1363 drop_parsed_module_refcounts(opts.subsys_mask);
1403 goto out_unlock; 1364 goto out_unlock;
1404 } 1365 }
1405 1366
1406 /* clear out any existing files and repopulate subsystem files */
1407 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1408 /* re-populate subsystem files */ 1367 /* re-populate subsystem files */
1409 cgroup_populate_dir(cgrp, false, added_mask); 1368 cgroup_populate_dir(cgrp, false, added_mask);
1410 1369
@@ -1432,6 +1391,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1432 INIT_LIST_HEAD(&cgrp->children); 1391 INIT_LIST_HEAD(&cgrp->children);
1433 INIT_LIST_HEAD(&cgrp->files); 1392 INIT_LIST_HEAD(&cgrp->files);
1434 INIT_LIST_HEAD(&cgrp->css_sets); 1393 INIT_LIST_HEAD(&cgrp->css_sets);
1394 INIT_LIST_HEAD(&cgrp->allcg_node);
1435 INIT_LIST_HEAD(&cgrp->release_list); 1395 INIT_LIST_HEAD(&cgrp->release_list);
1436 INIT_LIST_HEAD(&cgrp->pidlists); 1396 INIT_LIST_HEAD(&cgrp->pidlists);
1437 mutex_init(&cgrp->pidlist_mutex); 1397 mutex_init(&cgrp->pidlist_mutex);
@@ -1450,8 +1410,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1450 root->number_of_cgroups = 1; 1410 root->number_of_cgroups = 1;
1451 cgrp->root = root; 1411 cgrp->root = root;
1452 cgrp->top_cgroup = cgrp; 1412 cgrp->top_cgroup = cgrp;
1453 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1454 init_cgroup_housekeeping(cgrp); 1413 init_cgroup_housekeeping(cgrp);
1414 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1455} 1415}
1456 1416
1457static bool init_root_id(struct cgroupfs_root *root) 1417static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1478,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1518 1478
1519 root->subsys_mask = opts->subsys_mask; 1479 root->subsys_mask = opts->subsys_mask;
1520 root->flags = opts->flags; 1480 root->flags = opts->flags;
1481 ida_init(&root->cgroup_ida);
1521 if (opts->release_agent) 1482 if (opts->release_agent)
1522 strcpy(root->release_agent_path, opts->release_agent); 1483 strcpy(root->release_agent_path, opts->release_agent);
1523 if (opts->name) 1484 if (opts->name)
1524 strcpy(root->name, opts->name); 1485 strcpy(root->name, opts->name);
1525 if (opts->clone_children) 1486 if (opts->cpuset_clone_children)
1526 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); 1487 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
1527 return root; 1488 return root;
1528} 1489}
1529 1490
@@ -1536,6 +1497,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
1536 spin_lock(&hierarchy_id_lock); 1497 spin_lock(&hierarchy_id_lock);
1537 ida_remove(&hierarchy_ida, root->hierarchy_id); 1498 ida_remove(&hierarchy_ida, root->hierarchy_id);
1538 spin_unlock(&hierarchy_id_lock); 1499 spin_unlock(&hierarchy_id_lock);
1500 ida_destroy(&root->cgroup_ida);
1539 kfree(root); 1501 kfree(root);
1540} 1502}
1541 1503
@@ -1701,7 +1663,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1701 1663
1702 free_cg_links(&tmp_cg_links); 1664 free_cg_links(&tmp_cg_links);
1703 1665
1704 BUG_ON(!list_empty(&root_cgrp->sibling));
1705 BUG_ON(!list_empty(&root_cgrp->children)); 1666 BUG_ON(!list_empty(&root_cgrp->children));
1706 BUG_ON(root->number_of_cgroups != 1); 1667 BUG_ON(root->number_of_cgroups != 1);
1707 1668
@@ -1750,7 +1711,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
1750 1711
1751 BUG_ON(root->number_of_cgroups != 1); 1712 BUG_ON(root->number_of_cgroups != 1);
1752 BUG_ON(!list_empty(&cgrp->children)); 1713 BUG_ON(!list_empty(&cgrp->children));
1753 BUG_ON(!list_empty(&cgrp->sibling));
1754 1714
1755 mutex_lock(&cgroup_mutex); 1715 mutex_lock(&cgroup_mutex);
1756 mutex_lock(&cgroup_root_mutex); 1716 mutex_lock(&cgroup_root_mutex);
@@ -1808,9 +1768,11 @@ static struct kobject *cgroup_kobj;
1808 */ 1768 */
1809int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1769int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1810{ 1770{
1771 struct dentry *dentry = cgrp->dentry;
1811 char *start; 1772 char *start;
1812 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1773
1813 cgroup_lock_is_held()); 1774 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1775 "cgroup_path() called without proper locking");
1814 1776
1815 if (!dentry || cgrp == dummytop) { 1777 if (!dentry || cgrp == dummytop) {
1816 /* 1778 /*
@@ -1821,9 +1783,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1821 return 0; 1783 return 0;
1822 } 1784 }
1823 1785
1824 start = buf + buflen; 1786 start = buf + buflen - 1;
1825 1787
1826 *--start = '\0'; 1788 *start = '\0';
1827 for (;;) { 1789 for (;;) {
1828 int len = dentry->d_name.len; 1790 int len = dentry->d_name.len;
1829 1791
@@ -1834,8 +1796,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1834 if (!cgrp) 1796 if (!cgrp)
1835 break; 1797 break;
1836 1798
1837 dentry = rcu_dereference_check(cgrp->dentry, 1799 dentry = cgrp->dentry;
1838 cgroup_lock_is_held());
1839 if (!cgrp->parent) 1800 if (!cgrp->parent)
1840 continue; 1801 continue;
1841 if (--start < buf) 1802 if (--start < buf)
@@ -1930,9 +1891,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1930/* 1891/*
1931 * cgroup_task_migrate - move a task from one cgroup to another. 1892 * cgroup_task_migrate - move a task from one cgroup to another.
1932 * 1893 *
1933 * 'guarantee' is set if the caller promises that a new css_set for the task 1894 * Must be called with cgroup_mutex and threadgroup locked.
1934 * will already exist. If not set, this function might sleep, and can fail with
1935 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1936 */ 1895 */
1937static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1896static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1938 struct task_struct *tsk, struct css_set *newcg) 1897 struct task_struct *tsk, struct css_set *newcg)
@@ -2025,12 +1984,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
2025 } 1984 }
2026 1985
2027 synchronize_rcu(); 1986 synchronize_rcu();
2028
2029 /*
2030 * wake up rmdir() waiter. the rmdir should fail since the cgroup
2031 * is no longer empty.
2032 */
2033 cgroup_wakeup_rmdir_waiter(cgrp);
2034out: 1987out:
2035 if (retval) { 1988 if (retval) {
2036 for_each_subsys(root, ss) { 1989 for_each_subsys(root, ss) {
@@ -2200,7 +2153,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2200 * step 5: success! and cleanup 2153 * step 5: success! and cleanup
2201 */ 2154 */
2202 synchronize_rcu(); 2155 synchronize_rcu();
2203 cgroup_wakeup_rmdir_waiter(cgrp);
2204 retval = 0; 2156 retval = 0;
2205out_put_css_set_refs: 2157out_put_css_set_refs:
2206 if (retval) { 2158 if (retval) {
@@ -2711,10 +2663,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2711 2663
2712 /* start off with i_nlink == 2 (for "." entry) */ 2664 /* start off with i_nlink == 2 (for "." entry) */
2713 inc_nlink(inode); 2665 inc_nlink(inode);
2666 inc_nlink(dentry->d_parent->d_inode);
2714 2667
2715 /* start with the directory inode held, so that we can 2668 /*
2716 * populate it without racing with another mkdir */ 2669 * Control reaches here with cgroup_mutex held.
2717 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2670 * @inode->i_mutex should nest outside cgroup_mutex but we
2671 * want to populate it immediately without releasing
2672 * cgroup_mutex. As @inode isn't visible to anyone else
2673 * yet, trylock will always succeed without affecting
2674 * lockdep checks.
2675 */
2676 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2718 } else if (S_ISREG(mode)) { 2677 } else if (S_ISREG(mode)) {
2719 inode->i_size = 0; 2678 inode->i_size = 0;
2720 inode->i_fop = &cgroup_file_operations; 2679 inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2684,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2725 return 0; 2684 return 0;
2726} 2685}
2727 2686
2728/*
2729 * cgroup_create_dir - create a directory for an object.
2730 * @cgrp: the cgroup we create the directory for. It must have a valid
2731 * ->parent field. And we are going to fill its ->dentry field.
2732 * @dentry: dentry of the new cgroup
2733 * @mode: mode to set on new directory.
2734 */
2735static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2736 umode_t mode)
2737{
2738 struct dentry *parent;
2739 int error = 0;
2740
2741 parent = cgrp->parent->dentry;
2742 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2743 if (!error) {
2744 dentry->d_fsdata = cgrp;
2745 inc_nlink(parent->d_inode);
2746 rcu_assign_pointer(cgrp->dentry, dentry);
2747 dget(dentry);
2748 }
2749 dput(dentry);
2750
2751 return error;
2752}
2753
2754/** 2687/**
2755 * cgroup_file_mode - deduce file mode of a control file 2688 * cgroup_file_mode - deduce file mode of a control file
2756 * @cft: the control file in question 2689 * @cft: the control file in question
@@ -2791,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2791 2724
2792 simple_xattrs_init(&cft->xattrs); 2725 simple_xattrs_init(&cft->xattrs);
2793 2726
2794 /* does @cft->flags tell us to skip creation on @cgrp? */
2795 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2796 return 0;
2797 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2798 return 0;
2799
2800 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2727 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2801 strcpy(name, subsys->name); 2728 strcpy(name, subsys->name);
2802 strcat(name, "."); 2729 strcat(name, ".");
@@ -2837,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2837 int err, ret = 0; 2764 int err, ret = 0;
2838 2765
2839 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2766 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2767 /* does cft->flags tell us to skip this file on @cgrp? */
2768 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2769 continue;
2770 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2771 continue;
2772
2840 if (is_add) 2773 if (is_add)
2841 err = cgroup_add_file(cgrp, subsys, cft); 2774 err = cgroup_add_file(cgrp, subsys, cft);
2842 else 2775 else
@@ -3044,6 +2977,92 @@ static void cgroup_enable_task_cg_lists(void)
3044 write_unlock(&css_set_lock); 2977 write_unlock(&css_set_lock);
3045} 2978}
3046 2979
2980/**
2981 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2982 * @pos: the current position (%NULL to initiate traversal)
2983 * @cgroup: cgroup whose descendants to walk
2984 *
2985 * To be used by cgroup_for_each_descendant_pre(). Find the next
2986 * descendant to visit for pre-order traversal of @cgroup's descendants.
2987 */
2988struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2989 struct cgroup *cgroup)
2990{
2991 struct cgroup *next;
2992
2993 WARN_ON_ONCE(!rcu_read_lock_held());
2994
2995 /* if first iteration, pretend we just visited @cgroup */
2996 if (!pos) {
2997 if (list_empty(&cgroup->children))
2998 return NULL;
2999 pos = cgroup;
3000 }
3001
3002 /* visit the first child if exists */
3003 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
3004 if (next)
3005 return next;
3006
3007 /* no child, visit my or the closest ancestor's next sibling */
3008 do {
3009 next = list_entry_rcu(pos->sibling.next, struct cgroup,
3010 sibling);
3011 if (&next->sibling != &pos->parent->children)
3012 return next;
3013
3014 pos = pos->parent;
3015 } while (pos != cgroup);
3016
3017 return NULL;
3018}
3019EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3020
3021static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3022{
3023 struct cgroup *last;
3024
3025 do {
3026 last = pos;
3027 pos = list_first_or_null_rcu(&pos->children, struct cgroup,
3028 sibling);
3029 } while (pos);
3030
3031 return last;
3032}
3033
3034/**
3035 * cgroup_next_descendant_post - find the next descendant for post-order walk
3036 * @pos: the current position (%NULL to initiate traversal)
3037 * @cgroup: cgroup whose descendants to walk
3038 *
3039 * To be used by cgroup_for_each_descendant_post(). Find the next
3040 * descendant to visit for post-order traversal of @cgroup's descendants.
3041 */
3042struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3043 struct cgroup *cgroup)
3044{
3045 struct cgroup *next;
3046
3047 WARN_ON_ONCE(!rcu_read_lock_held());
3048
3049 /* if first iteration, visit the leftmost descendant */
3050 if (!pos) {
3051 next = cgroup_leftmost_descendant(cgroup);
3052 return next != cgroup ? next : NULL;
3053 }
3054
3055 /* if there's an unvisited sibling, visit its leftmost descendant */
3056 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3057 if (&next->sibling != &pos->parent->children)
3058 return cgroup_leftmost_descendant(next);
3059
3060 /* no sibling left, visit parent */
3061 next = pos->parent;
3062 return next != cgroup ? next : NULL;
3063}
3064EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3065
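To make the new walk helpers concrete, here is a small userspace model of the same pre- and post-order traversal. struct node and main() are illustrative stand-ins: the kernel variants walk RCU-protected circular sibling lists under rcu_read_lock() and are normally used through the cgroup_for_each_descendant_pre()/_post() wrappers mentioned in the comments above.

#include <stdio.h>

struct node {
	const char *name;
	struct node *parent;
	struct node *first_child;	/* head of the children list */
	struct node *next_sibling;	/* NULL-terminated, unlike the kernel's circular list */
};

/* next descendant of @root after @pos in pre-order; NULL @pos starts the walk */
static struct node *next_descendant_pre(struct node *pos, struct node *root)
{
	if (!pos)
		pos = root;

	/* visit the first child if it exists */
	if (pos->first_child)
		return pos->first_child;

	/* no child: visit our own or the closest ancestor's next sibling */
	while (pos != root) {
		if (pos->next_sibling)
			return pos->next_sibling;
		pos = pos->parent;
	}
	return NULL;
}

static struct node *leftmost_descendant(struct node *pos)
{
	while (pos->first_child)
		pos = pos->first_child;
	return pos;
}

/* next descendant of @root after @pos in post-order; NULL @pos starts the walk */
static struct node *next_descendant_post(struct node *pos, struct node *root)
{
	if (!pos)
		pos = leftmost_descendant(root);
	else if (pos->next_sibling)
		pos = leftmost_descendant(pos->next_sibling);
	else
		pos = pos->parent;
	return pos != root ? pos : NULL;
}

int main(void)
{
	struct node root = { "root" }, a = { "a", &root }, b = { "b", &root }, a1 = { "a1", &a };
	struct node *pos;

	root.first_child = &a;
	a.next_sibling = &b;
	a.first_child = &a1;

	for (pos = NULL; (pos = next_descendant_pre(pos, &root)); )
		printf("pre:  %s\n", pos->name);	/* a, a1, b */
	for (pos = NULL; (pos = next_descendant_post(pos, &root)); )
		printf("post: %s\n", pos->name);	/* a1, a, b */
	return 0;
}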
3047void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3066void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3048 __acquires(css_set_lock) 3067 __acquires(css_set_lock)
3049{ 3068{
@@ -3757,7 +3776,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3757 if (flags & POLLHUP) { 3776 if (flags & POLLHUP) {
3758 __remove_wait_queue(event->wqh, &event->wait); 3777 __remove_wait_queue(event->wqh, &event->wait);
3759 spin_lock(&cgrp->event_list_lock); 3778 spin_lock(&cgrp->event_list_lock);
3760 list_del(&event->list); 3779 list_del_init(&event->list);
3761 spin_unlock(&cgrp->event_list_lock); 3780 spin_unlock(&cgrp->event_list_lock);
3762 /* 3781 /*
3763 * We are in atomic context, but cgroup_event_remove() may 3782 * We are in atomic context, but cgroup_event_remove() may
@@ -3894,7 +3913,7 @@ fail:
3894static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3913static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3895 struct cftype *cft) 3914 struct cftype *cft)
3896{ 3915{
3897 return clone_children(cgrp); 3916 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3898} 3917}
3899 3918
3900static int cgroup_clone_children_write(struct cgroup *cgrp, 3919static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3921,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3902 u64 val) 3921 u64 val)
3903{ 3922{
3904 if (val) 3923 if (val)
3905 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3924 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3906 else 3925 else
3907 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3926 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3908 return 0; 3927 return 0;
3909} 3928}
3910 3929
@@ -4017,19 +4036,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4017 css->flags = 0; 4036 css->flags = 0;
4018 css->id = NULL; 4037 css->id = NULL;
4019 if (cgrp == dummytop) 4038 if (cgrp == dummytop)
4020 set_bit(CSS_ROOT, &css->flags); 4039 css->flags |= CSS_ROOT;
4021 BUG_ON(cgrp->subsys[ss->subsys_id]); 4040 BUG_ON(cgrp->subsys[ss->subsys_id]);
4022 cgrp->subsys[ss->subsys_id] = css; 4041 cgrp->subsys[ss->subsys_id] = css;
4023 4042
4024 /* 4043 /*
4025 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry 4044 * css holds an extra ref to @cgrp->dentry which is put on the last
4026 * which is put on the last css_put(). dput() requires process 4045 * css_put(). dput() requires process context, which css_put() may
4027 * context, which css_put() may be called without. @css->dput_work 4046 * be called without. @css->dput_work will be used to invoke
4028 * will be used to invoke dput() asynchronously from css_put(). 4047 * dput() asynchronously from css_put().
4029 */ 4048 */
4030 INIT_WORK(&css->dput_work, css_dput_fn); 4049 INIT_WORK(&css->dput_work, css_dput_fn);
4031 if (ss->__DEPRECATED_clear_css_refs) 4050}
4032 set_bit(CSS_CLEAR_CSS_REFS, &css->flags); 4051
 4052/* invoke ->css_online() on a new CSS and mark it online if successful */
4053static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4054{
4055 int ret = 0;
4056
4057 lockdep_assert_held(&cgroup_mutex);
4058
4059 if (ss->css_online)
4060 ret = ss->css_online(cgrp);
4061 if (!ret)
4062 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
4063 return ret;
4064}
4065
 4066/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4067static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4068 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4069{
4070 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4071
4072 lockdep_assert_held(&cgroup_mutex);
4073
4074 if (!(css->flags & CSS_ONLINE))
4075 return;
4076
4077 /*
4078 * css_offline() should be called with cgroup_mutex unlocked. See
4079 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4080 * details. This temporary unlocking should go away once
4081 * cgroup_mutex is unexported from controllers.
4082 */
4083 if (ss->css_offline) {
4084 mutex_unlock(&cgroup_mutex);
4085 ss->css_offline(cgrp);
4086 mutex_lock(&cgroup_mutex);
4087 }
4088
4089 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4033} 4090}
4034 4091
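The CSS_ONLINE bookkeeping above is easy to model outside the kernel. The sketch below only illustrates the gating logic (online marks the CSS only after a successful callback, offline is a no-op unless the CSS was actually onlined); it deliberately omits cgroup_mutex and the temporary unlock around ->css_offline(), and the flag value and the fake_* callbacks are made up for the example.

#include <stdio.h>

#define CSS_ONLINE (1 << 1)	/* illustrative value, not the kernel's definition */

struct css { unsigned long flags; };

/* stand-ins for a subsystem's ->css_online() / ->css_offline() callbacks */
static int fake_css_online(struct css *css)
{
	(void)css;
	puts("css_online");
	return 0;
}

static void fake_css_offline(struct css *css)
{
	(void)css;
	puts("css_offline");
}

static int online_css(struct css *css)
{
	int ret = fake_css_online(css);

	if (!ret)
		css->flags |= CSS_ONLINE;	/* mark online only after the callback succeeded */
	return ret;
}

static void offline_css(struct css *css)
{
	if (!(css->flags & CSS_ONLINE))		/* never offline a CSS that was not onlined */
		return;
	fake_css_offline(css);
	css->flags &= ~CSS_ONLINE;
}

int main(void)
{
	struct css css = { 0 };

	offline_css(&css);		/* no-op: the CSS was never onlined */
	if (!online_css(&css))
		offline_css(&css);	/* prints css_online then css_offline exactly once */
	offline_css(&css);		/* no-op again: the flag is already clear */
	return 0;
}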
4035/* 4092/*
@@ -4049,10 +4106,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4049 struct cgroup_subsys *ss; 4106 struct cgroup_subsys *ss;
4050 struct super_block *sb = root->sb; 4107 struct super_block *sb = root->sb;
4051 4108
4109 /* allocate the cgroup and its ID, 0 is reserved for the root */
4052 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4110 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4053 if (!cgrp) 4111 if (!cgrp)
4054 return -ENOMEM; 4112 return -ENOMEM;
4055 4113
4114 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4115 if (cgrp->id < 0)
4116 goto err_free_cgrp;
4117
4118 /*
4119 * Only live parents can have children. Note that the liveliness
4120 * check isn't strictly necessary because cgroup_mkdir() and
4121 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4122 * anyway so that locking is contained inside cgroup proper and we
4123 * don't get nasty surprises if we ever grow another caller.
4124 */
4125 if (!cgroup_lock_live_group(parent)) {
4126 err = -ENODEV;
4127 goto err_free_id;
4128 }
4129
4056 /* Grab a reference on the superblock so the hierarchy doesn't 4130 /* Grab a reference on the superblock so the hierarchy doesn't
4057 * get deleted on unmount if there are child cgroups. This 4131 * get deleted on unmount if there are child cgroups. This
4058 * can be done outside cgroup_mutex, since the sb can't 4132 * can be done outside cgroup_mutex, since the sb can't
@@ -4060,8 +4134,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4060 * fs */ 4134 * fs */
4061 atomic_inc(&sb->s_active); 4135 atomic_inc(&sb->s_active);
4062 4136
4063 mutex_lock(&cgroup_mutex);
4064
4065 init_cgroup_housekeeping(cgrp); 4137 init_cgroup_housekeeping(cgrp);
4066 4138
4067 cgrp->parent = parent; 4139 cgrp->parent = parent;
@@ -4071,26 +4143,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4071 if (notify_on_release(parent)) 4143 if (notify_on_release(parent))
4072 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4144 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4073 4145
4074 if (clone_children(parent)) 4146 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4075 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 4147 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4076 4148
4077 for_each_subsys(root, ss) { 4149 for_each_subsys(root, ss) {
4078 struct cgroup_subsys_state *css; 4150 struct cgroup_subsys_state *css;
4079 4151
4080 css = ss->create(cgrp); 4152 css = ss->css_alloc(cgrp);
4081 if (IS_ERR(css)) { 4153 if (IS_ERR(css)) {
4082 err = PTR_ERR(css); 4154 err = PTR_ERR(css);
4083 goto err_destroy; 4155 goto err_free_all;
4084 } 4156 }
4085 init_cgroup_css(css, ss, cgrp); 4157 init_cgroup_css(css, ss, cgrp);
4086 if (ss->use_id) { 4158 if (ss->use_id) {
4087 err = alloc_css_id(ss, parent, cgrp); 4159 err = alloc_css_id(ss, parent, cgrp);
4088 if (err) 4160 if (err)
4089 goto err_destroy; 4161 goto err_free_all;
4090 } 4162 }
4091 /* At error, ->destroy() callback has to free assigned ID. */ 4163 }
4092 if (clone_children(parent) && ss->post_clone) 4164
4093 ss->post_clone(cgrp); 4165 /*
4166 * Create directory. cgroup_create_file() returns with the new
4167 * directory locked on success so that it can be populated without
4168 * dropping cgroup_mutex.
4169 */
4170 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4171 if (err < 0)
4172 goto err_free_all;
4173 lockdep_assert_held(&dentry->d_inode->i_mutex);
4174
4175 /* allocation complete, commit to creation */
4176 dentry->d_fsdata = cgrp;
4177 cgrp->dentry = dentry;
4178 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4179 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4180 root->number_of_cgroups++;
4181
4182 /* each css holds a ref to the cgroup's dentry */
4183 for_each_subsys(root, ss)
4184 dget(dentry);
4185
4186 /* creation succeeded, notify subsystems */
4187 for_each_subsys(root, ss) {
4188 err = online_css(ss, cgrp);
4189 if (err)
4190 goto err_destroy;
4094 4191
4095 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4192 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4096 parent->parent) { 4193 parent->parent) {
@@ -4102,50 +4199,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4102 } 4199 }
4103 } 4200 }
4104 4201
4105 list_add(&cgrp->sibling, &cgrp->parent->children);
4106 root->number_of_cgroups++;
4107
4108 err = cgroup_create_dir(cgrp, dentry, mode);
4109 if (err < 0)
4110 goto err_remove;
4111
4112 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4113 for_each_subsys(root, ss)
4114 if (!ss->__DEPRECATED_clear_css_refs)
4115 dget(dentry);
4116
4117 /* The cgroup directory was pre-locked for us */
4118 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
4119
4120 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4121
4122 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4202 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
4123 /* If err < 0, we have a half-filled directory - oh well ;) */ 4203 if (err)
4204 goto err_destroy;
4124 4205
4125 mutex_unlock(&cgroup_mutex); 4206 mutex_unlock(&cgroup_mutex);
4126 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4207 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4127 4208
4128 return 0; 4209 return 0;
4129 4210
4130 err_remove: 4211err_free_all:
4131
4132 list_del(&cgrp->sibling);
4133 root->number_of_cgroups--;
4134
4135 err_destroy:
4136
4137 for_each_subsys(root, ss) { 4212 for_each_subsys(root, ss) {
4138 if (cgrp->subsys[ss->subsys_id]) 4213 if (cgrp->subsys[ss->subsys_id])
4139 ss->destroy(cgrp); 4214 ss->css_free(cgrp);
4140 } 4215 }
4141
4142 mutex_unlock(&cgroup_mutex); 4216 mutex_unlock(&cgroup_mutex);
4143
4144 /* Release the reference count that we took on the superblock */ 4217 /* Release the reference count that we took on the superblock */
4145 deactivate_super(sb); 4218 deactivate_super(sb);
4146 4219err_free_id:
4220 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4221err_free_cgrp:
4147 kfree(cgrp); 4222 kfree(cgrp);
4148 return err; 4223 return err;
4224
4225err_destroy:
4226 cgroup_destroy_locked(cgrp);
4227 mutex_unlock(&cgroup_mutex);
4228 mutex_unlock(&dentry->d_inode->i_mutex);
4229 return err;
4149} 4230}
4150 4231
4151static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 4232static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4197,153 +4278,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4197 return 0; 4278 return 0;
4198} 4279}
4199 4280
4200/* 4281static int cgroup_destroy_locked(struct cgroup *cgrp)
4201 * Atomically mark all (or else none) of the cgroup's CSS objects as 4282 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4202 * CSS_REMOVED. Return true on success, or false if the cgroup has
4203 * busy subsystems. Call with cgroup_mutex held
4204 *
4205 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4206 * not, cgroup removal behaves differently.
4207 *
4208 * If clear is set, css refcnt for the subsystem should be zero before
4209 * cgroup removal can be committed. This is implemented by
4210 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4211 * called multiple times until all css refcnts reach zero and is allowed to
4212 * veto removal on any invocation. This behavior is deprecated and will be
4213 * removed as soon as the existing user (memcg) is updated.
4214 *
4215 * If clear is not set, each css holds an extra reference to the cgroup's
4216 * dentry and cgroup removal proceeds regardless of css refs.
4217 * ->pre_destroy() will be called at least once and is not allowed to fail.
4218 * On the last put of each css, whenever that may be, the extra dentry ref
4219 * is put so that dentry destruction happens only after all css's are
4220 * released.
4221 */
4222static int cgroup_clear_css_refs(struct cgroup *cgrp)
4223{ 4283{
4284 struct dentry *d = cgrp->dentry;
4285 struct cgroup *parent = cgrp->parent;
4286 DEFINE_WAIT(wait);
4287 struct cgroup_event *event, *tmp;
4224 struct cgroup_subsys *ss; 4288 struct cgroup_subsys *ss;
4225 unsigned long flags; 4289 LIST_HEAD(tmp_list);
4226 bool failed = false; 4290
4291 lockdep_assert_held(&d->d_inode->i_mutex);
4292 lockdep_assert_held(&cgroup_mutex);
4227 4293
4228 local_irq_save(flags); 4294 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
4295 return -EBUSY;
4229 4296
4230 /* 4297 /*
4231 * Block new css_tryget() by deactivating refcnt. If all refcnts 4298 * Block new css_tryget() by deactivating refcnt and mark @cgrp
4232 * for subsystems w/ clear_css_refs set were 1 at the moment of 4299 * removed. This makes future css_tryget() and child creation
4233 * deactivation, we succeeded. 4300 * attempts fail thus maintaining the removal conditions verified
4301 * above.
4234 */ 4302 */
4235 for_each_subsys(cgrp->root, ss) { 4303 for_each_subsys(cgrp->root, ss) {
4236 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4304 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4237 4305
4238 WARN_ON(atomic_read(&css->refcnt) < 0); 4306 WARN_ON(atomic_read(&css->refcnt) < 0);
4239 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4307 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4240
4241 if (ss->__DEPRECATED_clear_css_refs)
4242 failed |= css_refcnt(css) != 1;
4243 }
4244
4245 /*
4246 * If succeeded, set REMOVED and put all the base refs; otherwise,
4247 * restore refcnts to positive values. Either way, all in-progress
4248 * css_tryget() will be released.
4249 */
4250 for_each_subsys(cgrp->root, ss) {
4251 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4252
4253 if (!failed) {
4254 set_bit(CSS_REMOVED, &css->flags);
4255 css_put(css);
4256 } else {
4257 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
4258 }
4259 } 4308 }
4309 set_bit(CGRP_REMOVED, &cgrp->flags);
4260 4310
 4261 local_irq_restore(flags); 4311 /* tell subsystems to initiate destruction */
4262 return !failed; 4312 for_each_subsys(cgrp->root, ss)
4263} 4313 offline_css(ss, cgrp);
4264
4265static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4266{
4267 struct cgroup *cgrp = dentry->d_fsdata;
4268 struct dentry *d;
4269 struct cgroup *parent;
4270 DEFINE_WAIT(wait);
4271 struct cgroup_event *event, *tmp;
4272 int ret;
4273
4274 /* the vfs holds both inode->i_mutex already */
4275again:
4276 mutex_lock(&cgroup_mutex);
4277 if (atomic_read(&cgrp->count) != 0) {
4278 mutex_unlock(&cgroup_mutex);
4279 return -EBUSY;
4280 }
4281 if (!list_empty(&cgrp->children)) {
4282 mutex_unlock(&cgroup_mutex);
4283 return -EBUSY;
4284 }
4285 mutex_unlock(&cgroup_mutex);
4286
4287 /*
4288 * In general, subsystem has no css->refcnt after pre_destroy(). But
4289 * in racy cases, subsystem may have to get css->refcnt after
4290 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
4291 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
4292 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4293 * and subsystem's reference count handling. Please see css_get/put
4294 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4295 */
4296 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4297 4314
4298 /* 4315 /*
4299 * Call pre_destroy handlers of subsys. Notify subsystems 4316 * Put all the base refs. Each css holds an extra reference to the
4300 * that rmdir() request comes. 4317 * cgroup's dentry and cgroup removal proceeds regardless of css
4318 * refs. On the last put of each css, whenever that may be, the
4319 * extra dentry ref is put so that dentry destruction happens only
4320 * after all css's are released.
4301 */ 4321 */
4302 ret = cgroup_call_pre_destroy(cgrp); 4322 for_each_subsys(cgrp->root, ss)
4303 if (ret) { 4323 css_put(cgrp->subsys[ss->subsys_id]);
4304 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4305 return ret;
4306 }
4307
4308 mutex_lock(&cgroup_mutex);
4309 parent = cgrp->parent;
4310 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4311 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4312 mutex_unlock(&cgroup_mutex);
4313 return -EBUSY;
4314 }
4315 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4316 if (!cgroup_clear_css_refs(cgrp)) {
4317 mutex_unlock(&cgroup_mutex);
4318 /*
4319 * Because someone may call cgroup_wakeup_rmdir_waiter() before
4320 * prepare_to_wait(), we need to check this flag.
4321 */
4322 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4323 schedule();
4324 finish_wait(&cgroup_rmdir_waitq, &wait);
4325 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4326 if (signal_pending(current))
4327 return -EINTR;
4328 goto again;
4329 }
 4330 /* NO css_tryget() can succeed after here. */
4331 finish_wait(&cgroup_rmdir_waitq, &wait);
4332 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4333 4324
4334 raw_spin_lock(&release_list_lock); 4325 raw_spin_lock(&release_list_lock);
4335 set_bit(CGRP_REMOVED, &cgrp->flags);
4336 if (!list_empty(&cgrp->release_list)) 4326 if (!list_empty(&cgrp->release_list))
4337 list_del_init(&cgrp->release_list); 4327 list_del_init(&cgrp->release_list);
4338 raw_spin_unlock(&release_list_lock); 4328 raw_spin_unlock(&release_list_lock);
4339 4329
4340 /* delete this cgroup from parent->children */ 4330 /* delete this cgroup from parent->children */
4341 list_del_init(&cgrp->sibling); 4331 list_del_rcu(&cgrp->sibling);
4342
4343 list_del_init(&cgrp->allcg_node); 4332 list_del_init(&cgrp->allcg_node);
4344 4333
4345 d = dget(cgrp->dentry); 4334 dget(d);
4346
4347 cgroup_d_remove_dir(d); 4335 cgroup_d_remove_dir(d);
4348 dput(d); 4336 dput(d);
4349 4337
@@ -4353,21 +4341,35 @@ again:
4353 /* 4341 /*
4354 * Unregister events and notify userspace. 4342 * Unregister events and notify userspace.
4355 * Notify userspace about cgroup removing only after rmdir of cgroup 4343 * Notify userspace about cgroup removing only after rmdir of cgroup
4356 * directory to avoid race between userspace and kernelspace 4344 * directory to avoid race between userspace and kernelspace. Use
4345 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4346 * cgroup_event_wake() is called with the wait queue head locked,
4347 * remove_wait_queue() cannot be called while holding event_list_lock.
4357 */ 4348 */
4358 spin_lock(&cgrp->event_list_lock); 4349 spin_lock(&cgrp->event_list_lock);
4359 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4350 list_splice_init(&cgrp->event_list, &tmp_list);
4360 list_del(&event->list); 4351 spin_unlock(&cgrp->event_list_lock);
4352 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4353 list_del_init(&event->list);
4361 remove_wait_queue(event->wqh, &event->wait); 4354 remove_wait_queue(event->wqh, &event->wait);
4362 eventfd_signal(event->eventfd, 1); 4355 eventfd_signal(event->eventfd, 1);
4363 schedule_work(&event->remove); 4356 schedule_work(&event->remove);
4364 } 4357 }
4365 spin_unlock(&cgrp->event_list_lock);
4366 4358
4367 mutex_unlock(&cgroup_mutex);
4368 return 0; 4359 return 0;
4369} 4360}
4370 4361
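The temporary-list trick used for the event list above ("splice everything out under the spinlock, signal outside of it") is a general pattern worth a standalone illustration. In this minimal userspace rendering the mutex stands in for event_list_lock and printf() for remove_wait_queue()/eventfd_signal(); all names are chosen for the example only.

#include <stdio.h>
#include <pthread.h>

struct event {
	struct event *next;
	int id;
};

static pthread_mutex_t event_lock = PTHREAD_MUTEX_INITIALIZER;
static struct event *event_list;	/* stands in for cgrp->event_list */

static void remove_and_signal_all(void)
{
	struct event *detached, *e;

	/* detach the whole list while holding the lock ... */
	pthread_mutex_lock(&event_lock);
	detached = event_list;
	event_list = NULL;
	pthread_mutex_unlock(&event_lock);

	/* ... then do the per-event work that may itself take other locks */
	for (e = detached; e; e = e->next)
		printf("signalling event %d\n", e->id);
}

int main(void)
{
	struct event a = { NULL, 1 }, b = { &a, 2 };

	event_list = &b;
	remove_and_signal_all();	/* signals events 2 and 1 */
	return 0;
}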
4362static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4363{
4364 int ret;
4365
4366 mutex_lock(&cgroup_mutex);
4367 ret = cgroup_destroy_locked(dentry->d_fsdata);
4368 mutex_unlock(&cgroup_mutex);
4369
4370 return ret;
4371}
4372
4371static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4373static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4372{ 4374{
4373 INIT_LIST_HEAD(&ss->cftsets); 4375 INIT_LIST_HEAD(&ss->cftsets);
@@ -4388,13 +4390,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4388 4390
4389 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4391 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4390 4392
4393 mutex_lock(&cgroup_mutex);
4394
4391 /* init base cftset */ 4395 /* init base cftset */
4392 cgroup_init_cftsets(ss); 4396 cgroup_init_cftsets(ss);
4393 4397
4394 /* Create the top cgroup state for this subsystem */ 4398 /* Create the top cgroup state for this subsystem */
4395 list_add(&ss->sibling, &rootnode.subsys_list); 4399 list_add(&ss->sibling, &rootnode.subsys_list);
4396 ss->root = &rootnode; 4400 ss->root = &rootnode;
4397 css = ss->create(dummytop); 4401 css = ss->css_alloc(dummytop);
4398 /* We don't handle early failures gracefully */ 4402 /* We don't handle early failures gracefully */
4399 BUG_ON(IS_ERR(css)); 4403 BUG_ON(IS_ERR(css));
4400 init_cgroup_css(css, ss, dummytop); 4404 init_cgroup_css(css, ss, dummytop);
@@ -4403,7 +4407,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4403 * pointer to this state - since the subsystem is 4407 * pointer to this state - since the subsystem is
4404 * newly registered, all tasks and hence the 4408 * newly registered, all tasks and hence the
4405 * init_css_set is in the subsystem's top cgroup. */ 4409 * init_css_set is in the subsystem's top cgroup. */
4406 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 4410 init_css_set.subsys[ss->subsys_id] = css;
4407 4411
4408 need_forkexit_callback |= ss->fork || ss->exit; 4412 need_forkexit_callback |= ss->fork || ss->exit;
4409 4413
@@ -4413,6 +4417,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4413 BUG_ON(!list_empty(&init_task.tasks)); 4417 BUG_ON(!list_empty(&init_task.tasks));
4414 4418
4415 ss->active = 1; 4419 ss->active = 1;
4420 BUG_ON(online_css(ss, dummytop));
4421
4422 mutex_unlock(&cgroup_mutex);
4416 4423
4417 /* this function shouldn't be used with modular subsystems, since they 4424 /* this function shouldn't be used with modular subsystems, since they
4418 * need to register a subsys_id, among other things */ 4425 * need to register a subsys_id, among other things */
@@ -4430,12 +4437,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4430 */ 4437 */
4431int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4438int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4432{ 4439{
4433 int i;
4434 struct cgroup_subsys_state *css; 4440 struct cgroup_subsys_state *css;
4441 int i, ret;
4435 4442
4436 /* check name and function validity */ 4443 /* check name and function validity */
4437 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4444 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4438 ss->create == NULL || ss->destroy == NULL) 4445 ss->css_alloc == NULL || ss->css_free == NULL)
4439 return -EINVAL; 4446 return -EINVAL;
4440 4447
4441 /* 4448 /*
@@ -4464,10 +4471,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4464 subsys[ss->subsys_id] = ss; 4471 subsys[ss->subsys_id] = ss;
4465 4472
4466 /* 4473 /*
4467 * no ss->create seems to need anything important in the ss struct, so 4474 * no ss->css_alloc seems to need anything important in the ss
4468 * this can happen first (i.e. before the rootnode attachment). 4475 * struct, so this can happen first (i.e. before the rootnode
4476 * attachment).
4469 */ 4477 */
4470 css = ss->create(dummytop); 4478 css = ss->css_alloc(dummytop);
4471 if (IS_ERR(css)) { 4479 if (IS_ERR(css)) {
4472 /* failure case - need to deassign the subsys[] slot. */ 4480 /* failure case - need to deassign the subsys[] slot. */
4473 subsys[ss->subsys_id] = NULL; 4481 subsys[ss->subsys_id] = NULL;
@@ -4482,14 +4490,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4482 init_cgroup_css(css, ss, dummytop); 4490 init_cgroup_css(css, ss, dummytop);
4483 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4491 /* init_idr must be after init_cgroup_css because it sets css->id. */
4484 if (ss->use_id) { 4492 if (ss->use_id) {
4485 int ret = cgroup_init_idr(ss, css); 4493 ret = cgroup_init_idr(ss, css);
4486 if (ret) { 4494 if (ret)
4487 dummytop->subsys[ss->subsys_id] = NULL; 4495 goto err_unload;
4488 ss->destroy(dummytop);
4489 subsys[ss->subsys_id] = NULL;
4490 mutex_unlock(&cgroup_mutex);
4491 return ret;
4492 }
4493 } 4496 }
4494 4497
4495 /* 4498 /*
@@ -4522,10 +4525,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4522 write_unlock(&css_set_lock); 4525 write_unlock(&css_set_lock);
4523 4526
4524 ss->active = 1; 4527 ss->active = 1;
4528 ret = online_css(ss, dummytop);
4529 if (ret)
4530 goto err_unload;
4525 4531
4526 /* success! */ 4532 /* success! */
4527 mutex_unlock(&cgroup_mutex); 4533 mutex_unlock(&cgroup_mutex);
4528 return 0; 4534 return 0;
4535
4536err_unload:
4537 mutex_unlock(&cgroup_mutex);
4538 /* @ss can't be mounted here as try_module_get() would fail */
4539 cgroup_unload_subsys(ss);
4540 return ret;
4529} 4541}
4530EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4542EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4531 4543
@@ -4552,6 +4564,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552 BUG_ON(ss->root != &rootnode); 4564 BUG_ON(ss->root != &rootnode);
4553 4565
4554 mutex_lock(&cgroup_mutex); 4566 mutex_lock(&cgroup_mutex);
4567
4568 offline_css(ss, dummytop);
4569 ss->active = 0;
4570
4571 if (ss->use_id) {
4572 idr_remove_all(&ss->idr);
4573 idr_destroy(&ss->idr);
4574 }
4575
4555 /* deassign the subsys_id */ 4576 /* deassign the subsys_id */
4556 subsys[ss->subsys_id] = NULL; 4577 subsys[ss->subsys_id] = NULL;
4557 4578
@@ -4567,7 +4588,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4567 struct css_set *cg = link->cg; 4588 struct css_set *cg = link->cg;
4568 4589
4569 hlist_del(&cg->hlist); 4590 hlist_del(&cg->hlist);
4570 BUG_ON(!cg->subsys[ss->subsys_id]);
4571 cg->subsys[ss->subsys_id] = NULL; 4591 cg->subsys[ss->subsys_id] = NULL;
4572 hhead = css_set_hash(cg->subsys); 4592 hhead = css_set_hash(cg->subsys);
4573 hlist_add_head(&cg->hlist, hhead); 4593 hlist_add_head(&cg->hlist, hhead);
@@ -4575,12 +4595,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4575 write_unlock(&css_set_lock); 4595 write_unlock(&css_set_lock);
4576 4596
4577 /* 4597 /*
4578 * remove subsystem's css from the dummytop and free it - need to free 4598 * remove subsystem's css from the dummytop and free it - need to
4579 * before marking as null because ss->destroy needs the cgrp->subsys 4599 * free before marking as null because ss->css_free needs the
4580 * pointer to find their state. note that this also takes care of 4600 * cgrp->subsys pointer to find their state. note that this also
4581 * freeing the css_id. 4601 * takes care of freeing the css_id.
4582 */ 4602 */
4583 ss->destroy(dummytop); 4603 ss->css_free(dummytop);
4584 dummytop->subsys[ss->subsys_id] = NULL; 4604 dummytop->subsys[ss->subsys_id] = NULL;
4585 4605
4586 mutex_unlock(&cgroup_mutex); 4606 mutex_unlock(&cgroup_mutex);
@@ -4624,8 +4644,8 @@ int __init cgroup_init_early(void)
4624 4644
4625 BUG_ON(!ss->name); 4645 BUG_ON(!ss->name);
4626 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4646 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4627 BUG_ON(!ss->create); 4647 BUG_ON(!ss->css_alloc);
4628 BUG_ON(!ss->destroy); 4648 BUG_ON(!ss->css_free);
4629 if (ss->subsys_id != i) { 4649 if (ss->subsys_id != i) {
4630 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4650 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4631 ss->name, ss->subsys_id); 4651 ss->name, ss->subsys_id);
@@ -4832,44 +4852,19 @@ void cgroup_fork(struct task_struct *child)
4832} 4852}
4833 4853
4834/** 4854/**
4835 * cgroup_fork_callbacks - run fork callbacks
4836 * @child: the new task
4837 *
4838 * Called on a new task very soon before adding it to the
4839 * tasklist. No need to take any locks since no-one can
4840 * be operating on this task.
4841 */
4842void cgroup_fork_callbacks(struct task_struct *child)
4843{
4844 if (need_forkexit_callback) {
4845 int i;
4846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4847 struct cgroup_subsys *ss = subsys[i];
4848
4849 /*
4850 * forkexit callbacks are only supported for
4851 * builtin subsystems.
4852 */
4853 if (!ss || ss->module)
4854 continue;
4855
4856 if (ss->fork)
4857 ss->fork(child);
4858 }
4859 }
4860}
4861
4862/**
4863 * cgroup_post_fork - called on a new task after adding it to the task list 4855 * cgroup_post_fork - called on a new task after adding it to the task list
4864 * @child: the task in question 4856 * @child: the task in question
4865 * 4857 *
4866 * Adds the task to the list running through its css_set if necessary. 4858 * Adds the task to the list running through its css_set if necessary and
4867 * Has to be after the task is visible on the task list in case we race 4859 * call the subsystem fork() callbacks. Has to be after the task is
4868 * with the first call to cgroup_iter_start() - to guarantee that the 4860 * visible on the task list in case we race with the first call to
4869 * new task ends up on its list. 4861 * cgroup_iter_start() - to guarantee that the new task ends up on its
4862 * list.
4870 */ 4863 */
4871void cgroup_post_fork(struct task_struct *child) 4864void cgroup_post_fork(struct task_struct *child)
4872{ 4865{
4866 int i;
4867
4873 /* 4868 /*
4874 * use_task_css_set_links is set to 1 before we walk the tasklist 4869 * use_task_css_set_links is set to 1 before we walk the tasklist
4875 * under the tasklist_lock and we read it here after we added the child 4870 * under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4884,30 @@ void cgroup_post_fork(struct task_struct *child)
4889 task_unlock(child); 4884 task_unlock(child);
4890 write_unlock(&css_set_lock); 4885 write_unlock(&css_set_lock);
4891 } 4886 }
4887
4888 /*
4889 * Call ss->fork(). This must happen after @child is linked on
4890 * css_set; otherwise, @child might change state between ->fork()
4891 * and addition to css_set.
4892 */
4893 if (need_forkexit_callback) {
4894 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4895 struct cgroup_subsys *ss = subsys[i];
4896
4897 /*
4898 * fork/exit callbacks are supported only for
4899 * builtin subsystems and we don't need further
4900 * synchronization as they never go away.
4901 */
4902 if (!ss || ss->module)
4903 continue;
4904
4905 if (ss->fork)
4906 ss->fork(child);
4907 }
4908 }
4892} 4909}
4910
4893/** 4911/**
4894 * cgroup_exit - detach cgroup from exiting task 4912 * cgroup_exit - detach cgroup from exiting task
4895 * @tsk: pointer to task_struct of exiting process 4913 * @tsk: pointer to task_struct of exiting process
@@ -5022,15 +5040,17 @@ static void check_for_release(struct cgroup *cgrp)
5022/* Caller must verify that the css is not for root cgroup */ 5040/* Caller must verify that the css is not for root cgroup */
5023bool __css_tryget(struct cgroup_subsys_state *css) 5041bool __css_tryget(struct cgroup_subsys_state *css)
5024{ 5042{
5025 do { 5043 while (true) {
5026 int v = css_refcnt(css); 5044 int t, v;
5027 5045
5028 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) 5046 v = css_refcnt(css);
5047 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5048 if (likely(t == v))
5029 return true; 5049 return true;
5050 else if (t < 0)
5051 return false;
5030 cpu_relax(); 5052 cpu_relax();
5031 } while (!test_bit(CSS_REMOVED, &css->flags)); 5053 }
5032
5033 return false;
5034} 5054}
5035EXPORT_SYMBOL_GPL(__css_tryget); 5055EXPORT_SYMBOL_GPL(__css_tryget);
5036 5056
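The rewritten __css_tryget() pairs with the CSS_DEACT_BIAS trick used during destruction: the destroyer adds a large negative bias so the raw refcount goes negative, and any racing tryget's compare-and-swap then observes a negative value and bails out instead of polling CSS_REMOVED. A self-contained userspace model of that handshake follows, with an illustrative bias value and C11 atomics in place of the kernel's.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DEACT_BIAS (-(1 << 30))		/* illustrative; the kernel uses INT_MIN */

static atomic_int refcnt = 1;		/* base reference, as set up at CSS init */

/* strip the bias so we always compare against the logical, non-negative count */
static int logical_count(int v)
{
	return v >= 0 ? v : v - DEACT_BIAS;
}

static bool tryget(void)
{
	while (true) {
		int v = logical_count(atomic_load(&refcnt));
		int expected = v;

		/* succeed only if the raw count still equals the unbiased value,
		   i.e. no deactivation slipped in between the load and the swap */
		if (atomic_compare_exchange_strong(&refcnt, &expected, v + 1))
			return true;
		if (expected < 0)	/* biased: the object is being destroyed */
			return false;
		/* otherwise we merely raced with another get/put; retry */
	}
}

static void deactivate(void)
{
	atomic_fetch_add(&refcnt, DEACT_BIAS);	/* blocks all further trygets */
}

int main(void)
{
	printf("before deactivate: %d\n", tryget());	/* 1 */
	deactivate();
	printf("after  deactivate: %d\n", tryget());	/* 0 */
	return 0;
}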
@@ -5049,11 +5069,9 @@ void __css_put(struct cgroup_subsys_state *css)
5049 set_bit(CGRP_RELEASABLE, &cgrp->flags); 5069 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5050 check_for_release(cgrp); 5070 check_for_release(cgrp);
5051 } 5071 }
5052 cgroup_wakeup_rmdir_waiter(cgrp);
5053 break; 5072 break;
5054 case 0: 5073 case 0:
5055 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) 5074 schedule_work(&css->dput_work);
5056 schedule_work(&css->dput_work);
5057 break; 5075 break;
5058 } 5076 }
5059 rcu_read_unlock(); 5077 rcu_read_unlock();
@@ -5439,7 +5457,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5439} 5457}
5440 5458
5441#ifdef CONFIG_CGROUP_DEBUG 5459#ifdef CONFIG_CGROUP_DEBUG
5442static struct cgroup_subsys_state *debug_create(struct cgroup *cont) 5460static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5443{ 5461{
5444 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5462 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5445 5463
@@ -5449,7 +5467,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5449 return css; 5467 return css;
5450} 5468}
5451 5469
5452static void debug_destroy(struct cgroup *cont) 5470static void debug_css_free(struct cgroup *cont)
5453{ 5471{
5454 kfree(cont->subsys[debug_subsys_id]); 5472 kfree(cont->subsys[debug_subsys_id]);
5455} 5473}
@@ -5578,8 +5596,8 @@ static struct cftype debug_files[] = {
5578 5596
5579struct cgroup_subsys debug_subsys = { 5597struct cgroup_subsys debug_subsys = {
5580 .name = "debug", 5598 .name = "debug",
5581 .create = debug_create, 5599 .css_alloc = debug_css_alloc,
5582 .destroy = debug_destroy, 5600 .css_free = debug_css_free,
5583 .subsys_id = debug_subsys_id, 5601 .subsys_id = debug_subsys_id,
5584 .base_cftypes = debug_files, 5602 .base_cftypes = debug_files,
5585}; 5603};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index b1724ce98981..75dda1ea5026 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -22,24 +22,33 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24 24
25enum freezer_state { 25/*
26 CGROUP_THAWED = 0, 26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
27 CGROUP_FREEZING, 27 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
28 CGROUP_FROZEN, 28 * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
29 * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
30 * its ancestors has FREEZING_SELF set.
31 */
32enum freezer_state_flags {
33 CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
34 CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
35 CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
36 CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
37
38 /* mask for all FREEZING flags */
39 CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
29}; 40};
30 41
31struct freezer { 42struct freezer {
32 struct cgroup_subsys_state css; 43 struct cgroup_subsys_state css;
33 enum freezer_state state; 44 unsigned int state;
34 spinlock_t lock; /* protects _writes_ to state */ 45 spinlock_t lock;
35}; 46};
36 47
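Since the freezer state is now a bit mask rather than a single enum value, the string shown in freezer.state is derived from whichever bits happen to be set. A standalone rendering of that mapping follows; the enum and the helper mirror the patch, while the combinations exercised in main() are illustrative.

#include <stdio.h>

enum freezer_state_flags {
	CGROUP_FREEZER_ONLINE	= (1 << 0),	/* freezer is fully online */
	CGROUP_FREEZING_SELF	= (1 << 1),	/* this freezer is freezing */
	CGROUP_FREEZING_PARENT	= (1 << 2),	/* the parent freezer is freezing */
	CGROUP_FROZEN		= (1 << 3),	/* this and its descendants frozen */

	/* mask for all FREEZING flags */
	CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};

static const char *freezer_state_strs(unsigned int state)
{
	if (state & CGROUP_FROZEN)
		return "FROZEN";
	if (state & CGROUP_FREEZING)
		return "FREEZING";
	return "THAWED";
}

int main(void)
{
	/* a cgroup frozen by an ancestor whose tasks are not all frozen yet */
	printf("%s\n", freezer_state_strs(CGROUP_FREEZER_ONLINE | CGROUP_FREEZING_PARENT));
	/* the same cgroup once update_if_frozen() has set CGROUP_FROZEN */
	printf("%s\n", freezer_state_strs(CGROUP_FREEZER_ONLINE | CGROUP_FREEZING_PARENT | CGROUP_FROZEN));
	/* a freshly thawed cgroup */
	printf("%s\n", freezer_state_strs(CGROUP_FREEZER_ONLINE));
	return 0;
}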
37static inline struct freezer *cgroup_freezer( 48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
38 struct cgroup *cgroup)
39{ 49{
40 return container_of( 50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
41 cgroup_subsys_state(cgroup, freezer_subsys_id), 51 struct freezer, css);
42 struct freezer, css);
43} 52}
44 53
45static inline struct freezer *task_freezer(struct task_struct *task) 54static inline struct freezer *task_freezer(struct task_struct *task)
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 57 struct freezer, css);
49} 58}
50 59
60static struct freezer *parent_freezer(struct freezer *freezer)
61{
62 struct cgroup *pcg = freezer->css.cgroup->parent;
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67}
68
51bool cgroup_freezing(struct task_struct *task) 69bool cgroup_freezing(struct task_struct *task)
52{ 70{
53 enum freezer_state state;
54 bool ret; 71 bool ret;
55 72
56 rcu_read_lock(); 73 rcu_read_lock();
57 state = task_freezer(task)->state; 74 ret = task_freezer(task)->state & CGROUP_FREEZING;
58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
59 rcu_read_unlock(); 75 rcu_read_unlock();
60 76
61 return ret; 77 return ret;
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task)
65 * cgroups_write_string() limits the size of freezer state strings to 81 * cgroups_write_string() limits the size of freezer state strings to
66 * CGROUP_LOCAL_BUFFER_SIZE 82 * CGROUP_LOCAL_BUFFER_SIZE
67 */ 83 */
68static const char *freezer_state_strs[] = { 84static const char *freezer_state_strs(unsigned int state)
69 "THAWED", 85{
70 "FREEZING", 86 if (state & CGROUP_FROZEN)
71 "FROZEN", 87 return "FROZEN";
88 if (state & CGROUP_FREEZING)
89 return "FREEZING";
90 return "THAWED";
72}; 91};
73 92
74/*
75 * State diagram
76 * Transitions are caused by userspace writes to the freezer.state file.
77 * The values in parenthesis are state labels. The rest are edge labels.
78 *
79 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
80 * ^ ^ | |
81 * | \_______THAWED_______/ |
82 * \__________________________THAWED____________/
83 */
84
85struct cgroup_subsys freezer_subsys; 93struct cgroup_subsys freezer_subsys;
86 94
87/* Locks taken and their ordering 95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
88 * ------------------------------
89 * cgroup_mutex (AKA cgroup_lock)
90 * freezer->lock
91 * css_set_lock
92 * task->alloc_lock (AKA task_lock)
93 * task->sighand->siglock
94 *
95 * cgroup code forces css_set_lock to be taken before task->alloc_lock
96 *
97 * freezer_create(), freezer_destroy():
98 * cgroup_mutex [ by cgroup core ]
99 *
100 * freezer_can_attach():
101 * cgroup_mutex (held by caller of can_attach)
102 *
103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
104 * freezer->lock
105 * sighand->siglock (if the cgroup is freezing)
106 *
107 * freezer_read():
108 * cgroup_mutex
109 * freezer->lock
110 * write_lock css_set_lock (cgroup iterator start)
111 * task->alloc_lock
112 * read_lock css_set_lock (cgroup iterator start)
113 *
114 * freezer_write() (freeze):
115 * cgroup_mutex
116 * freezer->lock
117 * write_lock css_set_lock (cgroup iterator start)
118 * task->alloc_lock
119 * read_lock css_set_lock (cgroup iterator start)
120 * sighand->siglock (fake signal delivery inside freeze_task())
121 *
122 * freezer_write() (unfreeze):
123 * cgroup_mutex
124 * freezer->lock
125 * write_lock css_set_lock (cgroup iterator start)
126 * task->alloc_lock
127 * read_lock css_set_lock (cgroup iterator start)
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock
130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132{ 96{
133 struct freezer *freezer; 97 struct freezer *freezer;
134 98
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
137 return ERR_PTR(-ENOMEM); 101 return ERR_PTR(-ENOMEM);
138 102
139 spin_lock_init(&freezer->lock); 103 spin_lock_init(&freezer->lock);
140 freezer->state = CGROUP_THAWED;
141 return &freezer->css; 104 return &freezer->css;
142} 105}
143 106
144static void freezer_destroy(struct cgroup *cgroup) 107/**
108 * freezer_css_online - commit creation of a freezer cgroup
109 * @cgroup: cgroup being created
110 *
111 * We're committing to creation of @cgroup. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our
113 * freezer->lock.
114 */
115static int freezer_css_online(struct cgroup *cgroup)
116{
117 struct freezer *freezer = cgroup_freezer(cgroup);
118 struct freezer *parent = parent_freezer(freezer);
119
120 /*
121 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details.
124 */
125 if (parent)
126 spin_lock_irq(&parent->lock);
127 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
128
129 freezer->state |= CGROUP_FREEZER_ONLINE;
130
131 if (parent && (parent->state & CGROUP_FREEZING)) {
132 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
133 atomic_inc(&system_freezing_cnt);
134 }
135
136 spin_unlock(&freezer->lock);
137 if (parent)
138 spin_unlock_irq(&parent->lock);
139
140 return 0;
141}
142
143/**
144 * freezer_css_offline - initiate destruction of @cgroup
145 * @cgroup: cgroup being destroyed
146 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count
148 * if it was holding one.
149 */
150static void freezer_css_offline(struct cgroup *cgroup)
145{ 151{
146 struct freezer *freezer = cgroup_freezer(cgroup); 152 struct freezer *freezer = cgroup_freezer(cgroup);
147 153
148 if (freezer->state != CGROUP_THAWED) 154 spin_lock_irq(&freezer->lock);
155
156 if (freezer->state & CGROUP_FREEZING)
149 atomic_dec(&system_freezing_cnt); 157 atomic_dec(&system_freezing_cnt);
150 kfree(freezer); 158
159 freezer->state = 0;
160
161 spin_unlock_irq(&freezer->lock);
151} 162}
152 163
153/* task is frozen or will freeze immediately when next it gets woken */ 164static void freezer_css_free(struct cgroup *cgroup)
154static bool is_task_frozen_enough(struct task_struct *task)
155{ 165{
156 return frozen(task) || 166 kfree(cgroup_freezer(cgroup));
157 (task_is_stopped_or_traced(task) && freezing(task));
158} 167}
159 168
160/* 169/*
161 * The call to cgroup_lock() in the freezer.state write method prevents 170 * Tasks can be migrated into a different freezer anytime regardless of its
162 * a write to that file racing against an attach, and hence the 171 * current state. freezer_attach() is responsible for making new tasks
163 * can_attach() result will remain valid until the attach completes. 172 * conform to the current state.
173 *
174 * Freezer state changes and task migration are synchronized via
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks.
164 */ 177 */
165static int freezer_can_attach(struct cgroup *new_cgroup, 178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
166 struct cgroup_taskset *tset)
167{ 179{
168 struct freezer *freezer; 180 struct freezer *freezer = cgroup_freezer(new_cgrp);
169 struct task_struct *task; 181 struct task_struct *task;
182 bool clear_frozen = false;
183
184 spin_lock_irq(&freezer->lock);
170 185
171 /* 186 /*
172 * Anything frozen can't move or be moved to/from. 187 * Make the new tasks conform to the current state of @new_cgrp.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later.
191 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its
193 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
173 */ 195 */
174 cgroup_taskset_for_each(task, new_cgroup, tset) 196 cgroup_taskset_for_each(task, new_cgrp, tset) {
175 if (cgroup_freezing(task)) 197 if (!(freezer->state & CGROUP_FREEZING)) {
176 return -EBUSY; 198 __thaw_task(task);
199 } else {
200 freeze_task(task);
201 freezer->state &= ~CGROUP_FROZEN;
202 clear_frozen = true;
203 }
204 }
177 205
178 freezer = cgroup_freezer(new_cgroup); 206 spin_unlock_irq(&freezer->lock);
179 if (freezer->state != CGROUP_THAWED)
180 return -EBUSY;
181 207
182 return 0; 208 /*
209 * Propagate FROZEN clearing upwards. We may race with
210 * update_if_frozen(), but as long as both work bottom-up, either
211 * update_if_frozen() sees child's FROZEN cleared or we clear the
212 * parent's FROZEN later. No parent w/ !FROZEN children can be
213 * left FROZEN.
214 */
215 while (clear_frozen && (freezer = parent_freezer(freezer))) {
216 spin_lock_irq(&freezer->lock);
217 freezer->state &= ~CGROUP_FROZEN;
218 clear_frozen = freezer->state & CGROUP_FREEZING;
219 spin_unlock_irq(&freezer->lock);
220 }
183} 221}
184 222
185static void freezer_fork(struct task_struct *task) 223static void freezer_fork(struct task_struct *task)
186{ 224{
187 struct freezer *freezer; 225 struct freezer *freezer;
188 226
189 /*
190 * No lock is needed, since the task isn't on tasklist yet,
191 * so it can't be moved to another cgroup, which means the
192 * freezer won't be removed and will be valid during this
193 * function call. Nevertheless, apply RCU read-side critical
194 * section to suppress RCU lockdep false positives.
195 */
196 rcu_read_lock(); 227 rcu_read_lock();
197 freezer = task_freezer(task); 228 freezer = task_freezer(task);
198 rcu_read_unlock();
199 229
200 /* 230 /*
201 * The root cgroup is non-freezable, so we can skip the 231 * The root cgroup is non-freezable, so we can skip the
202 * following check. 232 * following check.
203 */ 233 */
204 if (!freezer->css.cgroup->parent) 234 if (!freezer->css.cgroup->parent)
205 return; 235 goto out;
206 236
207 spin_lock_irq(&freezer->lock); 237 spin_lock_irq(&freezer->lock);
208 BUG_ON(freezer->state == CGROUP_FROZEN); 238 if (freezer->state & CGROUP_FREEZING)
209
210 /* Locking avoids race with FREEZING -> THAWED transitions. */
211 if (freezer->state == CGROUP_FREEZING)
212 freeze_task(task); 239 freeze_task(task);
213 spin_unlock_irq(&freezer->lock); 240 spin_unlock_irq(&freezer->lock);
241out:
242 rcu_read_unlock();
214} 243}
215 244
216/* 245/**
217 * caller must hold freezer->lock 246 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest
248 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN,
251 * this function checks whether all tasks of this cgroup and the descendant
252 * cgroups finished freezing and, if so, sets FROZEN.
253 *
254 * The caller is responsible for grabbing RCU read lock and calling
255 * update_if_frozen() on all descendants prior to invoking this function.
256 *
257 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details.
218 */ 260 */
219static void update_if_frozen(struct cgroup *cgroup, 261static void update_if_frozen(struct cgroup *cgroup)
220 struct freezer *freezer)
221{ 262{
263 struct freezer *freezer = cgroup_freezer(cgroup);
264 struct cgroup *pos;
222 struct cgroup_iter it; 265 struct cgroup_iter it;
223 struct task_struct *task; 266 struct task_struct *task;
224 unsigned int nfrozen = 0, ntotal = 0;
225 enum freezer_state old_state = freezer->state;
226 267
227 cgroup_iter_start(cgroup, &it); 268 WARN_ON_ONCE(!rcu_read_lock_held());
228 while ((task = cgroup_iter_next(cgroup, &it))) { 269
229 ntotal++; 270 spin_lock_irq(&freezer->lock);
230 if (freezing(task) && is_task_frozen_enough(task)) 271
231 nfrozen++; 272 if (!(freezer->state & CGROUP_FREEZING) ||
273 (freezer->state & CGROUP_FROZEN))
274 goto out_unlock;
275
276 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) {
278 struct freezer *child = cgroup_freezer(pos);
279
280 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN))
282 goto out_unlock;
232 } 283 }
233 284
234 if (old_state == CGROUP_THAWED) { 285 /* are all tasks frozen? */
235 BUG_ON(nfrozen > 0); 286 cgroup_iter_start(cgroup, &it);
236 } else if (old_state == CGROUP_FREEZING) { 287
237 if (nfrozen == ntotal) 288 while ((task = cgroup_iter_next(cgroup, &it))) {
238 freezer->state = CGROUP_FROZEN; 289 if (freezing(task)) {
239 } else { /* old_state == CGROUP_FROZEN */ 290 /*
240 BUG_ON(nfrozen != ntotal); 291 * freezer_should_skip() indicates that the task
292 * should be skipped when determining freezing
293 * completion. Consider it frozen in addition to
294 * the usual frozen condition.
295 */
296 if (!frozen(task) && !freezer_should_skip(task))
297 goto out_iter_end;
298 }
241 } 299 }
242 300
301 freezer->state |= CGROUP_FROZEN;
302out_iter_end:
243 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
304out_unlock:
305 spin_unlock_irq(&freezer->lock);
244} 306}
245 307
246static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 308static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
247 struct seq_file *m) 309 struct seq_file *m)
248{ 310{
249 struct freezer *freezer; 311 struct cgroup *pos;
250 enum freezer_state state;
251 312
252 if (!cgroup_lock_live_group(cgroup)) 313 rcu_read_lock();
253 return -ENODEV;
254 314
255 freezer = cgroup_freezer(cgroup); 315 /* update states bottom-up */
256 spin_lock_irq(&freezer->lock); 316 cgroup_for_each_descendant_post(pos, cgroup)
257 state = freezer->state; 317 update_if_frozen(pos);
258 if (state == CGROUP_FREEZING) { 318 update_if_frozen(cgroup);
259 /* We change from FREEZING to FROZEN lazily if the cgroup was 319
 260 * only partially frozen when we exited write. */ 320 rcu_read_unlock();
261 update_if_frozen(cgroup, freezer);
262 state = freezer->state;
263 }
264 spin_unlock_irq(&freezer->lock);
265 cgroup_unlock();
266 321
267 seq_puts(m, freezer_state_strs[state]); 322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state));
268 seq_putc(m, '\n'); 323 seq_putc(m, '\n');
269 return 0; 324 return 0;
270} 325}
271 326
272static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 327static void freeze_cgroup(struct freezer *freezer)
273{ 328{
329 struct cgroup *cgroup = freezer->css.cgroup;
274 struct cgroup_iter it; 330 struct cgroup_iter it;
275 struct task_struct *task; 331 struct task_struct *task;
276 unsigned int num_cant_freeze_now = 0;
277 332
278 cgroup_iter_start(cgroup, &it); 333 cgroup_iter_start(cgroup, &it);
279 while ((task = cgroup_iter_next(cgroup, &it))) { 334 while ((task = cgroup_iter_next(cgroup, &it)))
280 if (!freeze_task(task)) 335 freeze_task(task);
281 continue;
282 if (is_task_frozen_enough(task))
283 continue;
284 if (!freezing(task) && !freezer_should_skip(task))
285 num_cant_freeze_now++;
286 }
287 cgroup_iter_end(cgroup, &it); 336 cgroup_iter_end(cgroup, &it);
288
289 return num_cant_freeze_now ? -EBUSY : 0;
290} 337}
291 338
292static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 339static void unfreeze_cgroup(struct freezer *freezer)
293{ 340{
341 struct cgroup *cgroup = freezer->css.cgroup;
294 struct cgroup_iter it; 342 struct cgroup_iter it;
295 struct task_struct *task; 343 struct task_struct *task;
296 344
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 cgroup_iter_end(cgroup, &it); 348 cgroup_iter_end(cgroup, &it);
301} 349}
302 350
303static int freezer_change_state(struct cgroup *cgroup, 351/**
304 enum freezer_state goal_state) 352 * freezer_apply_state - apply state change to a single cgroup_freezer
353 * @freezer: freezer to apply state change to
354 * @freeze: whether to freeze or unfreeze
355 * @state: CGROUP_FREEZING_* flag to set or clear
356 *
357 * Set or clear @state on @cgroup according to @freeze, and perform
358 * freezing or thawing as necessary.
359 */
360static void freezer_apply_state(struct freezer *freezer, bool freeze,
361 unsigned int state)
305{ 362{
306 struct freezer *freezer; 363 /* also synchronizes against task migration, see freezer_attach() */
307 int retval = 0; 364 lockdep_assert_held(&freezer->lock);
308
309 freezer = cgroup_freezer(cgroup);
310 365
311 spin_lock_irq(&freezer->lock); 366 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
367 return;
312 368
313 update_if_frozen(cgroup, freezer); 369 if (freeze) {
314 370 if (!(freezer->state & CGROUP_FREEZING))
315 switch (goal_state) {
316 case CGROUP_THAWED:
317 if (freezer->state != CGROUP_THAWED)
318 atomic_dec(&system_freezing_cnt);
319 freezer->state = CGROUP_THAWED;
320 unfreeze_cgroup(cgroup, freezer);
321 break;
322 case CGROUP_FROZEN:
323 if (freezer->state == CGROUP_THAWED)
324 atomic_inc(&system_freezing_cnt); 371 atomic_inc(&system_freezing_cnt);
325 freezer->state = CGROUP_FREEZING; 372 freezer->state |= state;
326 retval = try_to_freeze_cgroup(cgroup, freezer); 373 freeze_cgroup(freezer);
327 break; 374 } else {
328 default: 375 bool was_freezing = freezer->state & CGROUP_FREEZING;
329 BUG(); 376
377 freezer->state &= ~state;
378
379 if (!(freezer->state & CGROUP_FREEZING)) {
380 if (was_freezing)
381 atomic_dec(&system_freezing_cnt);
382 freezer->state &= ~CGROUP_FROZEN;
383 unfreeze_cgroup(freezer);
384 }
330 } 385 }
386}
331 387
388/**
389 * freezer_change_state - change the freezing state of a cgroup_freezer
390 * @freezer: freezer of interest
391 * @freeze: whether to freeze or thaw
392 *
393 * Freeze or thaw @freezer according to @freeze. The operations are
394 * recursive - all descendants of @freezer will be affected.
395 */
396static void freezer_change_state(struct freezer *freezer, bool freeze)
397{
398 struct cgroup *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
332 spin_unlock_irq(&freezer->lock); 403 spin_unlock_irq(&freezer->lock);
333 404
334 return retval; 405 /*
406 * Update all its descendants in pre-order traversal. Each
407 * descendant will try to inherit its parent's FREEZING state as
408 * CGROUP_FREEZING_PARENT.
409 */
410 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
412 struct freezer *pos_f = cgroup_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f);
414
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
422 CGROUP_FREEZING_PARENT);
423 spin_unlock_irq(&pos_f->lock);
424 }
425 rcu_read_unlock();
335} 426}
336 427
337static int freezer_write(struct cgroup *cgroup, 428static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
338 struct cftype *cft,
339 const char *buffer) 429 const char *buffer)
340{ 430{
341 int retval; 431 bool freeze;
342 enum freezer_state goal_state;
343 432
344 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) 433 if (strcmp(buffer, freezer_state_strs(0)) == 0)
345 goal_state = CGROUP_THAWED; 434 freeze = false;
346 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) 435 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
347 goal_state = CGROUP_FROZEN; 436 freeze = true;
348 else 437 else
349 return -EINVAL; 438 return -EINVAL;
350 439
351 if (!cgroup_lock_live_group(cgroup)) 440 freezer_change_state(cgroup_freezer(cgroup), freeze);
352 return -ENODEV; 441 return 0;
353 retval = freezer_change_state(cgroup, goal_state); 442}
354 cgroup_unlock(); 443
355 return retval; 444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft)
445{
446 struct freezer *freezer = cgroup_freezer(cgroup);
447
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449}
450
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
452{
453 struct freezer *freezer = cgroup_freezer(cgroup);
454
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
356} 456}
357 457
358static struct cftype files[] = { 458static struct cftype files[] = {
@@ -362,23 +462,27 @@ static struct cftype files[] = {
362 .read_seq_string = freezer_read, 462 .read_seq_string = freezer_read,
363 .write_string = freezer_write, 463 .write_string = freezer_write,
364 }, 464 },
465 {
466 .name = "self_freezing",
467 .flags = CFTYPE_NOT_ON_ROOT,
468 .read_u64 = freezer_self_freezing_read,
469 },
470 {
471 .name = "parent_freezing",
472 .flags = CFTYPE_NOT_ON_ROOT,
473 .read_u64 = freezer_parent_freezing_read,
474 },
365 { } /* terminate */ 475 { } /* terminate */
366}; 476};
367 477
368struct cgroup_subsys freezer_subsys = { 478struct cgroup_subsys freezer_subsys = {
369 .name = "freezer", 479 .name = "freezer",
370 .create = freezer_create, 480 .css_alloc = freezer_css_alloc,
371 .destroy = freezer_destroy, 481 .css_online = freezer_css_online,
482 .css_offline = freezer_css_offline,
483 .css_free = freezer_css_free,
372 .subsys_id = freezer_subsys_id, 484 .subsys_id = freezer_subsys_id,
373 .can_attach = freezer_can_attach, 485 .attach = freezer_attach,
374 .fork = freezer_fork, 486 .fork = freezer_fork,
375 .base_cftypes = files, 487 .base_cftypes = files,
376
377 /*
378 * freezer subsys doesn't handle hierarchy at all. Frozen state
379 * should be inherited through the hierarchy - if a parent is
380 * frozen, all its children should be frozen. Fix it and remove
381 * the following.
382 */
383 .broken_hierarchy = true,
384}; 488};
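
The freezer state shown above is now a bitmask rather than a single enum value: CGROUP_FREEZING_SELF records a freeze written to the cgroup itself, CGROUP_FREEZING_PARENT one inherited from an ancestor, and CGROUP_FROZEN is only kept while at least one FREEZING bit remains set. A minimal stand-alone C sketch of that flag logic follows; the flag names are taken from the diff, but the bit values are illustrative and the locking and system_freezing_cnt accounting are omitted, so this is a toy model rather than the kernel code.

#include <stdbool.h>
#include <stdio.h>

/* flag names follow kernel/cgroup_freezer.c in this series; values illustrative */
enum {
	CGROUP_FREEZER_ONLINE	= 1 << 0,
	CGROUP_FREEZING_SELF	= 1 << 1,
	CGROUP_FREEZING_PARENT	= 1 << 2,
	CGROUP_FROZEN		= 1 << 3,
};
#define CGROUP_FREEZING	(CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT)

/* toy model of freezer_apply_state(): set or clear one FREEZING bit */
static void apply_state(unsigned int *state, bool freeze, unsigned int bit)
{
	if (!(*state & CGROUP_FREEZER_ONLINE))
		return;

	if (freeze) {
		*state |= bit;			/* freeze_cgroup() would run here */
	} else {
		*state &= ~bit;
		if (!(*state & CGROUP_FREEZING))
			*state &= ~CGROUP_FROZEN; /* unfreeze_cgroup() would run here */
	}
}

int main(void)
{
	unsigned int state = CGROUP_FREEZER_ONLINE;

	apply_state(&state, true, CGROUP_FREEZING_PARENT);	/* an ancestor froze */
	apply_state(&state, true, CGROUP_FREEZING_SELF);	/* local freeze */
	apply_state(&state, false, CGROUP_FREEZING_SELF);	/* local thaw */
	printf("still freezing: %d\n", !!(state & CGROUP_FREEZING));
	return 0;
}

Running it prints "still freezing: 1": clearing the SELF bit does not thaw the group while a parent still holds it frozen, which is exactly what the recursive propagation in freezer_change_state() relies on.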
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c7153b6d7..b017887d632f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1784,56 +1784,20 @@ static struct cftype files[] = {
1784}; 1784};
1785 1785
1786/* 1786/*
1787 * post_clone() is called during cgroup_create() when the 1787 * cpuset_css_alloc - allocate a cpuset css
1788 * clone_children mount argument was specified. The cgroup
1789 * can not yet have any tasks.
1790 *
1791 * Currently we refuse to set up the cgroup - thereby
1792 * refusing the task to be entered, and as a result refusing
1793 * the sys_unshare() or clone() which initiated it - if any
1794 * sibling cpusets have exclusive cpus or mem.
1795 *
1796 * If this becomes a problem for some users who wish to
1797 * allow that scenario, then cpuset_post_clone() could be
1798 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1799 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1800 * held.
1801 */
1802static void cpuset_post_clone(struct cgroup *cgroup)
1803{
1804 struct cgroup *parent, *child;
1805 struct cpuset *cs, *parent_cs;
1806
1807 parent = cgroup->parent;
1808 list_for_each_entry(child, &parent->children, sibling) {
1809 cs = cgroup_cs(child);
1810 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1811 return;
1812 }
1813 cs = cgroup_cs(cgroup);
1814 parent_cs = cgroup_cs(parent);
1815
1816 mutex_lock(&callback_mutex);
1817 cs->mems_allowed = parent_cs->mems_allowed;
1818 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1819 mutex_unlock(&callback_mutex);
1820 return;
1821}
1822
1823/*
1824 * cpuset_create - create a cpuset
1825 * cont: control group that the new cpuset will be part of 1788 * cont: control group that the new cpuset will be part of
1826 */ 1789 */
1827 1790
1828static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) 1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1829{ 1792{
1830 struct cpuset *cs; 1793 struct cgroup *parent_cg = cont->parent;
1831 struct cpuset *parent; 1794 struct cgroup *tmp_cg;
1795 struct cpuset *parent, *cs;
1832 1796
1833 if (!cont->parent) { 1797 if (!parent_cg)
1834 return &top_cpuset.css; 1798 return &top_cpuset.css;
1835 } 1799 parent = cgroup_cs(parent_cg);
1836 parent = cgroup_cs(cont->parent); 1800
1837 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1838 if (!cs) 1802 if (!cs)
1839 return ERR_PTR(-ENOMEM); 1803 return ERR_PTR(-ENOMEM);
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1855 1819
1856 cs->parent = parent; 1820 cs->parent = parent;
1857 number_of_cpusets++; 1821 number_of_cpusets++;
1858 return &cs->css ; 1822
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
1824 goto skip_clone;
1825
1826 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
1828 * set. This flag handling is implemented in cgroup core for
1829 * historical reasons - the flag may be specified during mount.
1830 *
1831 * Currently, if any sibling cpusets have exclusive cpus or mem, we
1832 * refuse to clone the configuration - thereby refusing the task to
1833 * be entered, and as a result refusing the sys_unshare() or
1834 * clone() which initiated it. If this becomes a problem for some
1835 * users who wish to allow that scenario, then this could be
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup.
1838 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
1841
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
1843 goto skip_clone;
1844 }
1845
1846 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex);
1850skip_clone:
1851 return &cs->css;
1859} 1852}
1860 1853
1861/* 1854/*
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1864 * will call async_rebuild_sched_domains(). 1857 * will call async_rebuild_sched_domains().
1865 */ 1858 */
1866 1859
1867static void cpuset_destroy(struct cgroup *cont) 1860static void cpuset_css_free(struct cgroup *cont)
1868{ 1861{
1869 struct cpuset *cs = cgroup_cs(cont); 1862 struct cpuset *cs = cgroup_cs(cont);
1870 1863
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont)
1878 1871
1879struct cgroup_subsys cpuset_subsys = { 1872struct cgroup_subsys cpuset_subsys = {
1880 .name = "cpuset", 1873 .name = "cpuset",
1881 .create = cpuset_create, 1874 .css_alloc = cpuset_css_alloc,
1882 .destroy = cpuset_destroy, 1875 .css_free = cpuset_css_free,
1883 .can_attach = cpuset_can_attach, 1876 .can_attach = cpuset_can_attach,
1884 .attach = cpuset_attach, 1877 .attach = cpuset_attach,
1885 .post_clone = cpuset_post_clone,
1886 .subsys_id = cpuset_subsys_id, 1878 .subsys_id = cpuset_subsys_id,
1887 .base_cftypes = files, 1879 .base_cftypes = files,
1888 .early_init = 1, 1880 .early_init = 1,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbccf83c134d..f9ff5493171d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7434,7 +7434,7 @@ unlock:
7434device_initcall(perf_event_sysfs_init); 7434device_initcall(perf_event_sysfs_init);
7435 7435
7436#ifdef CONFIG_CGROUP_PERF 7436#ifdef CONFIG_CGROUP_PERF
7437static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) 7437static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7438{ 7438{
7439 struct perf_cgroup *jc; 7439 struct perf_cgroup *jc;
7440 7440
@@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
7451 return &jc->css; 7451 return &jc->css;
7452} 7452}
7453 7453
7454static void perf_cgroup_destroy(struct cgroup *cont) 7454static void perf_cgroup_css_free(struct cgroup *cont)
7455{ 7455{
7456 struct perf_cgroup *jc; 7456 struct perf_cgroup *jc;
7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7492struct cgroup_subsys perf_subsys = { 7492struct cgroup_subsys perf_subsys = {
7493 .name = "perf_event", 7493 .name = "perf_event",
7494 .subsys_id = perf_subsys_id, 7494 .subsys_id = perf_subsys_id,
7495 .create = perf_cgroup_create, 7495 .css_alloc = perf_cgroup_css_alloc,
7496 .destroy = perf_cgroup_destroy, 7496 .css_free = perf_cgroup_css_free,
7497 .exit = perf_cgroup_exit, 7497 .exit = perf_cgroup_exit,
7498 .attach = perf_cgroup_attach, 7498 .attach = perf_cgroup_attach,
7499 7499
diff --git a/kernel/fork.c b/kernel/fork.c
index 850dde1e0c84..79de9f99a48d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1137,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1137{ 1137{
1138 int retval; 1138 int retval;
1139 struct task_struct *p; 1139 struct task_struct *p;
1140 int cgroup_callbacks_done = 0;
1141 1140
1142 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1143 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
@@ -1395,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1395 INIT_LIST_HEAD(&p->thread_group); 1394 INIT_LIST_HEAD(&p->thread_group);
1396 p->task_works = NULL; 1395 p->task_works = NULL;
1397 1396
1398 /* Now that the task is set up, run cgroup callbacks if
1399 * necessary. We need to run them before the task is visible
1400 * on the tasklist. */
1401 cgroup_fork_callbacks(p);
1402 cgroup_callbacks_done = 1;
1403
1404 /* Need tasklist lock for parent etc handling! */ 1397 /* Need tasklist lock for parent etc handling! */
1405 write_lock_irq(&tasklist_lock); 1398 write_lock_irq(&tasklist_lock);
1406 1399
@@ -1505,7 +1498,7 @@ bad_fork_cleanup_cgroup:
1505#endif 1498#endif
1506 if (clone_flags & CLONE_THREAD) 1499 if (clone_flags & CLONE_THREAD)
1507 threadgroup_change_end(current); 1500 threadgroup_change_end(current);
1508 cgroup_exit(p, cgroup_callbacks_done); 1501 cgroup_exit(p, 0);
1509 delayacct_tsk_free(p); 1502 delayacct_tsk_free(p);
1510 module_put(task_thread_info(p)->exec_domain->module); 1503 module_put(task_thread_info(p)->exec_domain->module);
1511bad_fork_cleanup_count: 1504bad_fork_cleanup_count:
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11f82a4d4eae..c38893b0efba 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
116 return false; 116 return false;
117 } 117 }
118 118
119 if (!(p->flags & PF_KTHREAD)) { 119 if (!(p->flags & PF_KTHREAD))
120 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
121 /* 121 else
122 * fake_signal_wake_up() goes through p's scheduler
123 * lock and guarantees that TASK_STOPPED/TRACED ->
124 * TASK_RUNNING transition can't race with task state
125 * testing in try_to_freeze_tasks().
126 */
127 } else {
128 wake_up_state(p, TASK_INTERRUPTIBLE); 122 wake_up_state(p, TASK_INTERRUPTIBLE);
129 }
130 123
131 spin_unlock_irqrestore(&freezer_lock, flags); 124 spin_unlock_irqrestore(&freezer_lock, flags);
132 return true; 125 return true;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 87da817f9e13..d5a258b60c6f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
48 if (p == current || !freeze_task(p)) 48 if (p == current || !freeze_task(p))
49 continue; 49 continue;
50 50
51 /* 51 if (!freezer_should_skip(p))
52 * Now that we've done set_freeze_flag, don't
53 * perturb a task in TASK_STOPPED or TASK_TRACED.
54 * It is "frozen enough". If the task does wake
55 * up, it will immediately call try_to_freeze.
56 *
57 * Because freeze_task() goes through p's scheduler lock, it's
58 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
59 * transition can't race with task state testing here.
60 */
61 if (!task_is_stopped_or_traced(p) &&
62 !freezer_should_skip(p))
63 todo++; 52 todo++;
64 } while_each_thread(g, p); 53 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 54 read_unlock(&tasklist_lock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5066a61f971..6271b89f87ac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7484,7 +7484,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7484 struct task_group, css); 7484 struct task_group, css);
7485} 7485}
7486 7486
7487static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) 7487static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7488{ 7488{
7489 struct task_group *tg, *parent; 7489 struct task_group *tg, *parent;
7490 7490
@@ -7501,7 +7501,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7501 return &tg->css; 7501 return &tg->css;
7502} 7502}
7503 7503
7504static void cpu_cgroup_destroy(struct cgroup *cgrp) 7504static void cpu_cgroup_css_free(struct cgroup *cgrp)
7505{ 7505{
7506 struct task_group *tg = cgroup_tg(cgrp); 7506 struct task_group *tg = cgroup_tg(cgrp);
7507 7507
@@ -7861,8 +7861,8 @@ static struct cftype cpu_files[] = {
7861 7861
7862struct cgroup_subsys cpu_cgroup_subsys = { 7862struct cgroup_subsys cpu_cgroup_subsys = {
7863 .name = "cpu", 7863 .name = "cpu",
7864 .create = cpu_cgroup_create, 7864 .css_alloc = cpu_cgroup_css_alloc,
7865 .destroy = cpu_cgroup_destroy, 7865 .css_free = cpu_cgroup_css_free,
7866 .can_attach = cpu_cgroup_can_attach, 7866 .can_attach = cpu_cgroup_can_attach,
7867 .attach = cpu_cgroup_attach, 7867 .attach = cpu_cgroup_attach,
7868 .exit = cpu_cgroup_exit, 7868 .exit = cpu_cgroup_exit,
@@ -7885,7 +7885,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7885struct cpuacct root_cpuacct; 7885struct cpuacct root_cpuacct;
7886 7886
7887/* create a new cpu accounting group */ 7887/* create a new cpu accounting group */
7888static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7888static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
7889{ 7889{
7890 struct cpuacct *ca; 7890 struct cpuacct *ca;
7891 7891
@@ -7915,7 +7915,7 @@ out:
7915} 7915}
7916 7916
7917/* destroy an existing cpu accounting group */ 7917/* destroy an existing cpu accounting group */
7918static void cpuacct_destroy(struct cgroup *cgrp) 7918static void cpuacct_css_free(struct cgroup *cgrp)
7919{ 7919{
7920 struct cpuacct *ca = cgroup_ca(cgrp); 7920 struct cpuacct *ca = cgroup_ca(cgrp);
7921 7921
@@ -8086,8 +8086,8 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8086 8086
8087struct cgroup_subsys cpuacct_subsys = { 8087struct cgroup_subsys cpuacct_subsys = {
8088 .name = "cpuacct", 8088 .name = "cpuacct",
8089 .create = cpuacct_create, 8089 .css_alloc = cpuacct_css_alloc,
8090 .destroy = cpuacct_destroy, 8090 .css_free = cpuacct_css_free,
8091 .subsys_id = cpuacct_subsys_id, 8091 .subsys_id = cpuacct_subsys_id,
8092 .base_cftypes = files, 8092 .base_cftypes = files,
8093}; 8093};
diff --git a/kernel/signal.c b/kernel/signal.c
index 0af8868525d6..5ffb5626e072 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1908,7 +1908,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1908 preempt_disable(); 1908 preempt_disable();
1909 read_unlock(&tasklist_lock); 1909 read_unlock(&tasklist_lock);
1910 preempt_enable_no_resched(); 1910 preempt_enable_no_resched();
1911 schedule(); 1911 freezable_schedule();
1912 } else { 1912 } else {
1913 /* 1913 /*
1914 * By the time we got the lock, our tracer went away. 1914 * By the time we got the lock, our tracer went away.
@@ -1930,13 +1930,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1930 } 1930 }
1931 1931
1932 /* 1932 /*
1933 * While in TASK_TRACED, we were considered "frozen enough".
1934 * Now that we woke up, it's crucial if we're supposed to be
1935 * frozen that we freeze now before running anything substantial.
1936 */
1937 try_to_freeze();
1938
1939 /*
1940 * We are back. Now reacquire the siglock before touching 1933 * We are back. Now reacquire the siglock before touching
1941 * last_siginfo, so that we are sure to have synchronized with 1934 * last_siginfo, so that we are sure to have synchronized with
1942 * any signal-sending on another CPU that wants to examine it. 1935 * any signal-sending on another CPU that wants to examine it.
@@ -2092,7 +2085,7 @@ static bool do_signal_stop(int signr)
2092 } 2085 }
2093 2086
2094 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2087 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2095 schedule(); 2088 freezable_schedule();
2096 return true; 2089 return true;
2097 } else { 2090 } else {
2098 /* 2091 /*
@@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2200 if (unlikely(uprobe_deny_signal())) 2193 if (unlikely(uprobe_deny_signal()))
2201 return 0; 2194 return 0;
2202 2195
2203relock:
2204 /* 2196 /*
2205 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2197 * Do this once, we can't return to user-mode if freezing() == T.
2206 * While in TASK_STOPPED, we were considered "frozen enough". 2198 * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2207 * Now that we woke up, it's crucial if we're supposed to be 2199 * thus do not need another check after return.
2208 * frozen that we freeze now before running anything substantial.
2209 */ 2200 */
2210 try_to_freeze(); 2201 try_to_freeze();
2211 2202
2203relock:
2212 spin_lock_irq(&sighand->siglock); 2204 spin_lock_irq(&sighand->siglock);
2213 /* 2205 /*
2214 * Every stopped thread goes here after wakeup. Check to see if 2206 * Every stopped thread goes here after wakeup. Check to see if
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a3f358fb8a0c..b5bde7a5c017 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
77 return false; 77 return false;
78} 78}
79 79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) 80static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
81{ 81{
82 int idx; 82 int idx;
83 struct cgroup *parent_cgroup; 83 struct cgroup *parent_cgroup;
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
101 return &h_cgroup->css; 101 return &h_cgroup->css;
102} 102}
103 103
104static void hugetlb_cgroup_destroy(struct cgroup *cgroup) 104static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
105{ 105{
106 struct hugetlb_cgroup *h_cgroup; 106 struct hugetlb_cgroup *h_cgroup;
107 107
@@ -155,18 +155,13 @@ out:
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup. 156 * the parent cgroup.
157 */ 157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) 158static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
159{ 159{
160 struct hstate *h; 160 struct hstate *h;
161 struct page *page; 161 struct page *page;
162 int ret = 0, idx = 0; 162 int idx = 0;
163 163
164 do { 164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) { 165 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock); 166 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru) 167 list_for_each_entry(page, &h->hugepage_activelist, lru)
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
177 } 172 }
178 cond_resched(); 173 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup)); 174 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182} 175}
183 176
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 177int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -411,8 +404,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
411 404
412struct cgroup_subsys hugetlb_subsys = { 405struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb", 406 .name = "hugetlb",
414 .create = hugetlb_cgroup_create, 407 .css_alloc = hugetlb_cgroup_css_alloc,
415 .pre_destroy = hugetlb_cgroup_pre_destroy, 408 .css_offline = hugetlb_cgroup_css_offline,
416 .destroy = hugetlb_cgroup_destroy, 409 .css_free = hugetlb_cgroup_css_free,
417 .subsys_id = hugetlb_subsys_id, 410 .subsys_id = hugetlb_subsys_id,
418}; 411};
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf6d0df4849c..12307b3838fb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2370,7 +2370,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2370again: 2370again:
2371 if (*ptr) { /* css should be a valid one */ 2371 if (*ptr) { /* css should be a valid one */
2372 memcg = *ptr; 2372 memcg = *ptr;
2373 VM_BUG_ON(css_is_removed(&memcg->css));
2374 if (mem_cgroup_is_root(memcg)) 2373 if (mem_cgroup_is_root(memcg))
2375 goto done; 2374 goto done;
2376 if (nr_pages == 1 && consume_stock(memcg)) 2375 if (nr_pages == 1 && consume_stock(memcg))
@@ -2510,9 +2509,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2510 2509
2511/* 2510/*
2512 * A helper function to get mem_cgroup from ID. must be called under 2511 * A helper function to get mem_cgroup from ID. must be called under
2513 * rcu_read_lock(). The caller must check css_is_removed() or some if 2512 * rcu_read_lock(). The caller is responsible for calling css_tryget if
2514 * it's concern. (dropping refcnt from swap can be called against removed 2513 * the mem_cgroup is used for charging. (dropping refcnt from swap can be
2515 * memcg.) 2514 * called against removed memcg.)
2516 */ 2515 */
2517static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2516static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2518{ 2517{
@@ -2709,13 +2708,6 @@ static int mem_cgroup_move_account(struct page *page,
2709 /* caller should have done css_get */ 2708 /* caller should have done css_get */
2710 pc->mem_cgroup = to; 2709 pc->mem_cgroup = to;
2711 mem_cgroup_charge_statistics(to, anon, nr_pages); 2710 mem_cgroup_charge_statistics(to, anon, nr_pages);
2712 /*
2713 * We charges against "to" which may not have any tasks. Then, "to"
2714 * can be under rmdir(). But in current implementation, caller of
2715 * this function is just force_empty() and move charge, so it's
2716 * guaranteed that "to" is never removed. So, we don't check rmdir
2717 * status here.
2718 */
2719 move_unlock_mem_cgroup(from, &flags); 2711 move_unlock_mem_cgroup(from, &flags);
2720 ret = 0; 2712 ret = 0;
2721unlock: 2713unlock:
@@ -2729,10 +2721,27 @@ out:
2729 return ret; 2721 return ret;
2730} 2722}
2731 2723
2732/* 2724/**
2733 * move charges to its parent. 2725 * mem_cgroup_move_parent - moves page to the parent group
2726 * @page: the page to move
2727 * @pc: page_cgroup of the page
2728 * @child: page's cgroup
2729 *
2730 * move charges to its parent or the root cgroup if the group has no
2731 * parent (aka use_hierarchy==0).
2732 * Although this might fail (get_page_unless_zero, isolate_lru_page or
2733 * mem_cgroup_move_account fails) the failure is always temporary and
2734 * it signals a race with a page removal/uncharge or migration. In the
2735 * first case the page is on the way out and it will vanish from the LRU
2736 * on the next attempt and the call should be retried later.
2737 * Isolation from the LRU fails only if page has been isolated from
2738 * the LRU since we looked at it and that usually means either global
2739 * reclaim or migration going on. The page will either get back to the
2740 * LRU or vanish.
2741 * Finally mem_cgroup_move_account fails only if the page got uncharged
2742 * (!PageCgroupUsed) or moved to a different group. The page will
2743 * disappear in the next attempt.
2734 */ 2744 */
2735
2736static int mem_cgroup_move_parent(struct page *page, 2745static int mem_cgroup_move_parent(struct page *page,
2737 struct page_cgroup *pc, 2746 struct page_cgroup *pc,
2738 struct mem_cgroup *child) 2747 struct mem_cgroup *child)
@@ -2742,9 +2751,7 @@ static int mem_cgroup_move_parent(struct page *page,
2742 unsigned long uninitialized_var(flags); 2751 unsigned long uninitialized_var(flags);
2743 int ret; 2752 int ret;
2744 2753
2745 /* Is ROOT ? */ 2754 VM_BUG_ON(mem_cgroup_is_root(child));
2746 if (mem_cgroup_is_root(child))
2747 return -EINVAL;
2748 2755
2749 ret = -EBUSY; 2756 ret = -EBUSY;
2750 if (!get_page_unless_zero(page)) 2757 if (!get_page_unless_zero(page))
@@ -2761,8 +2768,10 @@ static int mem_cgroup_move_parent(struct page *page,
2761 if (!parent) 2768 if (!parent)
2762 parent = root_mem_cgroup; 2769 parent = root_mem_cgroup;
2763 2770
2764 if (nr_pages > 1) 2771 if (nr_pages > 1) {
2772 VM_BUG_ON(!PageTransHuge(page));
2765 flags = compound_lock_irqsave(page); 2773 flags = compound_lock_irqsave(page);
2774 }
2766 2775
2767 ret = mem_cgroup_move_account(page, nr_pages, 2776 ret = mem_cgroup_move_account(page, nr_pages,
2768 pc, child, parent); 2777 pc, child, parent);
@@ -2904,7 +2913,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2904 return; 2913 return;
2905 if (!memcg) 2914 if (!memcg)
2906 return; 2915 return;
2907 cgroup_exclude_rmdir(&memcg->css);
2908 2916
2909 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 2917 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2910 /* 2918 /*
@@ -2918,12 +2926,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2918 swp_entry_t ent = {.val = page_private(page)}; 2926 swp_entry_t ent = {.val = page_private(page)};
2919 mem_cgroup_uncharge_swap(ent); 2927 mem_cgroup_uncharge_swap(ent);
2920 } 2928 }
2921 /*
2922 * At swapin, we may charge account against cgroup which has no tasks.
2923 * So, rmdir()->pre_destroy() can be called while we do this charge.
2924 * In that case, we need to call pre_destroy() again. check it here.
2925 */
2926 cgroup_release_and_wakeup_rmdir(&memcg->css);
2927} 2929}
2928 2930
2929void mem_cgroup_commit_charge_swapin(struct page *page, 2931void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3371,8 +3373,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3371 3373
3372 if (!memcg) 3374 if (!memcg)
3373 return; 3375 return;
3374 /* blocks rmdir() */ 3376
3375 cgroup_exclude_rmdir(&memcg->css);
3376 if (!migration_ok) { 3377 if (!migration_ok) {
3377 used = oldpage; 3378 used = oldpage;
3378 unused = newpage; 3379 unused = newpage;
@@ -3406,13 +3407,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3406 */ 3407 */
3407 if (anon) 3408 if (anon)
3408 mem_cgroup_uncharge_page(used); 3409 mem_cgroup_uncharge_page(used);
3409 /*
3410 * At migration, we may charge account against cgroup which has no
3411 * tasks.
3412 * So, rmdir()->pre_destroy() can be called while we do this charge.
3413 * In that case, we need to call pre_destroy() again. check it here.
3414 */
3415 cgroup_release_and_wakeup_rmdir(&memcg->css);
3416} 3410}
3417 3411
3418/* 3412/*
@@ -3712,17 +3706,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3712 return nr_reclaimed; 3706 return nr_reclaimed;
3713} 3707}
3714 3708
3715/* 3709/**
3710 * mem_cgroup_force_empty_list - clears LRU of a group
3711 * @memcg: group to clear
3712 * @node: NUMA node
3713 * @zid: zone id
3714 * @lru: lru to clear
3715 *
3716 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 3716 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3717 * reclaim the pages themselves - it just removes the page_cgroups. 3717 * reclaim the pages themselves - pages are moved to the parent (or root)
3718 * Returns true if some page_cgroups were not freed, indicating that the caller 3718 * group.
3719 * must retry this operation.
3720 */ 3719 */
3721static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3720static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3722 int node, int zid, enum lru_list lru) 3721 int node, int zid, enum lru_list lru)
3723{ 3722{
3724 struct lruvec *lruvec; 3723 struct lruvec *lruvec;
3725 unsigned long flags, loop; 3724 unsigned long flags;
3726 struct list_head *list; 3725 struct list_head *list;
3727 struct page *busy; 3726 struct page *busy;
3728 struct zone *zone; 3727 struct zone *zone;
@@ -3731,11 +3730,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3731 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 3730 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3732 list = &lruvec->lists[lru]; 3731 list = &lruvec->lists[lru];
3733 3732
3734 loop = mem_cgroup_get_lru_size(lruvec, lru);
3735 /* give some margin against EBUSY etc...*/
3736 loop += 256;
3737 busy = NULL; 3733 busy = NULL;
3738 while (loop--) { 3734 do {
3739 struct page_cgroup *pc; 3735 struct page_cgroup *pc;
3740 struct page *page; 3736 struct page *page;
3741 3737
@@ -3761,76 +3757,72 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3761 cond_resched(); 3757 cond_resched();
3762 } else 3758 } else
3763 busy = NULL; 3759 busy = NULL;
3764 } 3760 } while (!list_empty(list));
3765 return !list_empty(list);
3766} 3761}
3767 3762
3768/* 3763/*
3769 * make mem_cgroup's charge to be 0 if there is no task. 3764 * make mem_cgroup's charge to be 0 if there is no task by moving
3765 * all the charges and pages to the parent.
3770 * This enables deleting this mem_cgroup. 3766 * This enables deleting this mem_cgroup.
3767 *
3768 * Caller is responsible for holding css reference on the memcg.
3771 */ 3769 */
3772static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) 3770static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3773{ 3771{
3774 int ret; 3772 int node, zid;
3775 int node, zid, shrink;
3776 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3777 struct cgroup *cgrp = memcg->css.cgroup;
3778
3779 css_get(&memcg->css);
3780 3773
3781 shrink = 0;
3782 /* should free all ? */
3783 if (free_all)
3784 goto try_to_free;
3785move_account:
3786 do { 3774 do {
3787 ret = -EBUSY;
3788 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3789 goto out;
3790 /* This is for making all *used* pages to be on LRU. */ 3775 /* This is for making all *used* pages to be on LRU. */
3791 lru_add_drain_all(); 3776 lru_add_drain_all();
3792 drain_all_stock_sync(memcg); 3777 drain_all_stock_sync(memcg);
3793 ret = 0;
3794 mem_cgroup_start_move(memcg); 3778 mem_cgroup_start_move(memcg);
3795 for_each_node_state(node, N_HIGH_MEMORY) { 3779 for_each_node_state(node, N_HIGH_MEMORY) {
3796 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3780 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3797 enum lru_list lru; 3781 enum lru_list lru;
3798 for_each_lru(lru) { 3782 for_each_lru(lru) {
3799 ret = mem_cgroup_force_empty_list(memcg, 3783 mem_cgroup_force_empty_list(memcg,
3800 node, zid, lru); 3784 node, zid, lru);
3801 if (ret)
3802 break;
3803 } 3785 }
3804 } 3786 }
3805 if (ret)
3806 break;
3807 } 3787 }
3808 mem_cgroup_end_move(memcg); 3788 mem_cgroup_end_move(memcg);
3809 memcg_oom_recover(memcg); 3789 memcg_oom_recover(memcg);
3810 cond_resched(); 3790 cond_resched();
3811 /* "ret" should also be checked to ensure all lists are empty. */
3812 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
3813out:
3814 css_put(&memcg->css);
3815 return ret;
3816 3791
3817try_to_free: 3792 /*
3793 * This is a safety check because mem_cgroup_force_empty_list
3794 * could have raced with mem_cgroup_replace_page_cache callers
3795 * so the lru seemed empty but the page could have been added
3796 * right after the check. RES_USAGE should be safe as we always
3797 * charge before adding to the LRU.
3798 */
3799 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
3800}
3801
3802/*
3803 * Reclaims as many pages from the given memcg as possible and moves
3804 * the rest to the parent.
3805 *
3806 * Caller is responsible for holding css reference for memcg.
3807 */
3808static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3809{
3810 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3811 struct cgroup *cgrp = memcg->css.cgroup;
3812
3818 /* returns EBUSY if there is a task or if we come here twice. */ 3813 /* returns EBUSY if there is a task or if we come here twice. */
3819 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3814 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3820 ret = -EBUSY; 3815 return -EBUSY;
3821 goto out; 3816
3822 }
3823 /* we call try-to-free pages to make this cgroup empty */ 3817 /* we call try-to-free pages to make this cgroup empty */
3824 lru_add_drain_all(); 3818 lru_add_drain_all();
3825 /* try to free all pages in this cgroup */ 3819 /* try to free all pages in this cgroup */
3826 shrink = 1;
3827 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 3820 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
3828 int progress; 3821 int progress;
3829 3822
3830 if (signal_pending(current)) { 3823 if (signal_pending(current))
3831 ret = -EINTR; 3824 return -EINTR;
3832 goto out; 3825
3833 }
3834 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 3826 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3835 false); 3827 false);
3836 if (!progress) { 3828 if (!progress) {
@@ -3841,13 +3833,23 @@ try_to_free:
3841 3833
3842 } 3834 }
3843 lru_add_drain(); 3835 lru_add_drain();
3844 /* try move_account...there may be some *locked* pages. */ 3836 mem_cgroup_reparent_charges(memcg);
3845 goto move_account; 3837
3838 return 0;
3846} 3839}
3847 3840
3848static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3841static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3849{ 3842{
3850 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3843 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3844 int ret;
3845
3846 if (mem_cgroup_is_root(memcg))
3847 return -EINVAL;
3848 css_get(&memcg->css);
3849 ret = mem_cgroup_force_empty(memcg);
3850 css_put(&memcg->css);
3851
3852 return ret;
3851} 3853}
3852 3854
3853 3855
@@ -4953,7 +4955,7 @@ err_cleanup:
4953} 4955}
4954 4956
4955static struct cgroup_subsys_state * __ref 4957static struct cgroup_subsys_state * __ref
4956mem_cgroup_create(struct cgroup *cont) 4958mem_cgroup_css_alloc(struct cgroup *cont)
4957{ 4959{
4958 struct mem_cgroup *memcg, *parent; 4960 struct mem_cgroup *memcg, *parent;
4959 long error = -ENOMEM; 4961 long error = -ENOMEM;
@@ -5034,14 +5036,14 @@ free_out:
5034 return ERR_PTR(error); 5036 return ERR_PTR(error);
5035} 5037}
5036 5038
5037static int mem_cgroup_pre_destroy(struct cgroup *cont) 5039static void mem_cgroup_css_offline(struct cgroup *cont)
5038{ 5040{
5039 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5041 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5040 5042
5041 return mem_cgroup_force_empty(memcg, false); 5043 mem_cgroup_reparent_charges(memcg);
5042} 5044}
5043 5045
5044static void mem_cgroup_destroy(struct cgroup *cont) 5046static void mem_cgroup_css_free(struct cgroup *cont)
5045{ 5047{
5046 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5048 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5047 5049
@@ -5631,16 +5633,15 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5631struct cgroup_subsys mem_cgroup_subsys = { 5633struct cgroup_subsys mem_cgroup_subsys = {
5632 .name = "memory", 5634 .name = "memory",
5633 .subsys_id = mem_cgroup_subsys_id, 5635 .subsys_id = mem_cgroup_subsys_id,
5634 .create = mem_cgroup_create, 5636 .css_alloc = mem_cgroup_css_alloc,
5635 .pre_destroy = mem_cgroup_pre_destroy, 5637 .css_offline = mem_cgroup_css_offline,
5636 .destroy = mem_cgroup_destroy, 5638 .css_free = mem_cgroup_css_free,
5637 .can_attach = mem_cgroup_can_attach, 5639 .can_attach = mem_cgroup_can_attach,
5638 .cancel_attach = mem_cgroup_cancel_attach, 5640 .cancel_attach = mem_cgroup_cancel_attach,
5639 .attach = mem_cgroup_move_task, 5641 .attach = mem_cgroup_move_task,
5640 .base_cftypes = mem_cgroup_files, 5642 .base_cftypes = mem_cgroup_files,
5641 .early_init = 0, 5643 .early_init = 0,
5642 .use_id = 1, 5644 .use_id = 1,
5643 .__DEPRECATED_clear_css_refs = true,
5644}; 5645};
5645 5646
5646#ifdef CONFIG_MEMCG_SWAP 5647#ifdef CONFIG_MEMCG_SWAP
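
The reparenting loop above can afford to simply retry: as the mem_cgroup_move_parent() comment notes, every failure is temporary and the page either returns to the LRU or vanishes, so looping until RES_USAGE reads zero terminates. A toy model of that retry pattern in stand-alone C follows; names and data are made up, only the control flow mirrors mem_cgroup_reparent_charges().

#include <stdbool.h>
#include <stdio.h>

/* toy "page": moving it to the parent may fail a few times (e.g. -EBUSY) */
struct item {
	int transient_failures;	/* how many more attempts will still fail */
	bool charged;		/* still accounted to the child group */
};

static void move_to_parent(struct item *it)
{
	if (it->transient_failures > 0) {
		it->transient_failures--;	/* page busy or isolated, retry later */
		return;
	}
	it->charged = false;			/* charge now held by the parent */
}

int main(void)
{
	struct item items[] = { {0, true}, {2, true}, {1, true} };
	int usage, passes = 0;

	do {	/* mirrors do { ... } while (RES_USAGE > 0) above */
		for (int i = 0; i < 3; i++)
			if (items[i].charged)
				move_to_parent(&items[i]);
		usage = 0;
		for (int i = 0; i < 3; i++)
			usage += items[i].charged;
		passes++;
	} while (usage > 0);

	printf("drained after %d passes\n", passes);
	return 0;
}

The number of passes is bounded by the worst transient-failure count, which is why the old "loop + 256" retry margin and the -EBUSY return path could be dropped.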
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 79285a36035f..bde53da9cd86 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -27,11 +27,7 @@
27 27
28#include <linux/fdtable.h> 28#include <linux/fdtable.h>
29 29
30#define PRIOIDX_SZ 128 30#define PRIOMAP_MIN_SZ 128
31
32static unsigned long prioidx_map[PRIOIDX_SZ];
33static DEFINE_SPINLOCK(prioidx_map_lock);
34static atomic_t max_prioidx = ATOMIC_INIT(0);
35 31
36static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) 32static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
37{ 33{
@@ -39,136 +35,157 @@ static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgr
39 struct cgroup_netprio_state, css); 35 struct cgroup_netprio_state, css);
40} 36}
41 37
42static int get_prioidx(u32 *prio) 38/*
43{ 39 * Extend @dev->priomap so that it's large enough to accomodate
44 unsigned long flags; 40 * Extend @dev->priomap so that it's large enough to accommodate
45 u32 prioidx; 41 * return. Must be called under rtnl lock.
46 42 */
47 spin_lock_irqsave(&prioidx_map_lock, flags); 43static int extend_netdev_table(struct net_device *dev, u32 target_idx)
48 prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ);
49 if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) {
50 spin_unlock_irqrestore(&prioidx_map_lock, flags);
51 return -ENOSPC;
52 }
53 set_bit(prioidx, prioidx_map);
54 if (atomic_read(&max_prioidx) < prioidx)
55 atomic_set(&max_prioidx, prioidx);
56 spin_unlock_irqrestore(&prioidx_map_lock, flags);
57 *prio = prioidx;
58 return 0;
59}
60
61static void put_prioidx(u32 idx)
62{ 44{
63 unsigned long flags; 45 struct netprio_map *old, *new;
64 46 size_t new_sz, new_len;
65 spin_lock_irqsave(&prioidx_map_lock, flags);
66 clear_bit(idx, prioidx_map);
67 spin_unlock_irqrestore(&prioidx_map_lock, flags);
68}
69 47
70static int extend_netdev_table(struct net_device *dev, u32 new_len) 48 /* is the existing priomap large enough? */
71{ 49 old = rtnl_dereference(dev->priomap);
72 size_t new_size = sizeof(struct netprio_map) + 50 if (old && old->priomap_len > target_idx)
73 ((sizeof(u32) * new_len)); 51 return 0;
74 struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL);
75 struct netprio_map *old_priomap;
76 52
77 old_priomap = rtnl_dereference(dev->priomap); 53 /*
54 * Determine the new size. Let's keep it power-of-two. We start
55 * from PRIOMAP_MIN_SZ and double it until it's large enough to
56 * accommodate @target_idx.
57 */
58 new_sz = PRIOMAP_MIN_SZ;
59 while (true) {
60 new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
61 sizeof(new->priomap[0]);
62 if (new_len > target_idx)
63 break;
64 new_sz *= 2;
65 /* overflowed? */
66 if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
67 return -ENOSPC;
68 }
78 69
79 if (!new_priomap) { 70 /* allocate & copy */
71 new = kzalloc(new_sz, GFP_KERNEL);
72 if (!new) {
80 pr_warn("Unable to alloc new priomap!\n"); 73 pr_warn("Unable to alloc new priomap!\n");
81 return -ENOMEM; 74 return -ENOMEM;
82 } 75 }
83 76
84 if (old_priomap) 77 if (old)
85 memcpy(new_priomap->priomap, old_priomap->priomap, 78 memcpy(new->priomap, old->priomap,
86 old_priomap->priomap_len * 79 old->priomap_len * sizeof(old->priomap[0]));
87 sizeof(old_priomap->priomap[0]));
88 80
89 new_priomap->priomap_len = new_len; 81 new->priomap_len = new_len;
90 82
91 rcu_assign_pointer(dev->priomap, new_priomap); 83 /* install the new priomap */
92 if (old_priomap) 84 rcu_assign_pointer(dev->priomap, new);
93 kfree_rcu(old_priomap, rcu); 85 if (old)
86 kfree_rcu(old, rcu);
94 return 0; 87 return 0;
95} 88}
96 89
97static int write_update_netdev_table(struct net_device *dev) 90/**
91 * netprio_prio - return the effective netprio of a cgroup-net_device pair
92 * @cgrp: cgroup part of the target pair
93 * @dev: net_device part of the target pair
94 *
95 * Should be called under RCU read or rtnl lock.
96 */
97static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev)
98{
99 struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
100
101 if (map && cgrp->id < map->priomap_len)
102 return map->priomap[cgrp->id];
103 return 0;
104}
105
106/**
107 * netprio_set_prio - set netprio on a cgroup-net_device pair
108 * @cgrp: cgroup part of the target pair
109 * @dev: net_device part of the target pair
110 * @prio: prio to set
111 *
112 * Set netprio to @prio on @cgrp-@dev pair. Should be called under rtnl
113 * lock and may fail under memory pressure for non-zero @prio.
114 */
115static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev,
116 u32 prio)
98{ 117{
99 int ret = 0;
100 u32 max_len;
101 struct netprio_map *map; 118 struct netprio_map *map;
119 int ret;
102 120
103 max_len = atomic_read(&max_prioidx) + 1; 121 /* avoid extending priomap for zero writes */
104 map = rtnl_dereference(dev->priomap); 122 map = rtnl_dereference(dev->priomap);
105 if (!map || map->priomap_len < max_len) 123 if (!prio && (!map || map->priomap_len <= cgrp->id))
106 ret = extend_netdev_table(dev, max_len); 124 return 0;
107 125
108 return ret; 126 ret = extend_netdev_table(dev, cgrp->id);
127 if (ret)
128 return ret;
129
130 map = rtnl_dereference(dev->priomap);
131 map->priomap[cgrp->id] = prio;
132 return 0;
109} 133}
110 134
111static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) 135static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
112{ 136{
113 struct cgroup_netprio_state *cs; 137 struct cgroup_netprio_state *cs;
114 int ret = -EINVAL;
115 138
116 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 139 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
117 if (!cs) 140 if (!cs)
118 return ERR_PTR(-ENOMEM); 141 return ERR_PTR(-ENOMEM);
119 142
120 if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx)
121 goto out;
122
123 ret = get_prioidx(&cs->prioidx);
124 if (ret < 0) {
125 pr_warn("No space in priority index array\n");
126 goto out;
127 }
128
129 return &cs->css; 143 return &cs->css;
130out:
131 kfree(cs);
132 return ERR_PTR(ret);
133} 144}
134 145
135static void cgrp_destroy(struct cgroup *cgrp) 146static int cgrp_css_online(struct cgroup *cgrp)
136{ 147{
137 struct cgroup_netprio_state *cs; 148 struct cgroup *parent = cgrp->parent;
138 struct net_device *dev; 149 struct net_device *dev;
139 struct netprio_map *map; 150 int ret = 0;
151
152 if (!parent)
153 return 0;
140 154
141 cs = cgrp_netprio_state(cgrp);
142 rtnl_lock(); 155 rtnl_lock();
156 /*
157 * Inherit prios from the parent. As all prios are set during
158 * onlining, there is no need to clear them on offline.
159 */
143 for_each_netdev(&init_net, dev) { 160 for_each_netdev(&init_net, dev) {
144 map = rtnl_dereference(dev->priomap); 161 u32 prio = netprio_prio(parent, dev);
145 if (map && cs->prioidx < map->priomap_len) 162
146 map->priomap[cs->prioidx] = 0; 163 ret = netprio_set_prio(cgrp, dev, prio);
164 if (ret)
165 break;
147 } 166 }
148 rtnl_unlock(); 167 rtnl_unlock();
149 put_prioidx(cs->prioidx); 168 return ret;
150 kfree(cs); 169}
170
171static void cgrp_css_free(struct cgroup *cgrp)
172{
173 kfree(cgrp_netprio_state(cgrp));
151} 174}
152 175
153static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) 176static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
154{ 177{
155 return (u64)cgrp_netprio_state(cgrp)->prioidx; 178 return cgrp->id;
156} 179}
157 180
158static int read_priomap(struct cgroup *cont, struct cftype *cft, 181static int read_priomap(struct cgroup *cont, struct cftype *cft,
159 struct cgroup_map_cb *cb) 182 struct cgroup_map_cb *cb)
160{ 183{
161 struct net_device *dev; 184 struct net_device *dev;
162 u32 prioidx = cgrp_netprio_state(cont)->prioidx;
163 u32 priority;
164 struct netprio_map *map;
165 185
166 rcu_read_lock(); 186 rcu_read_lock();
167 for_each_netdev_rcu(&init_net, dev) { 187 for_each_netdev_rcu(&init_net, dev)
168 map = rcu_dereference(dev->priomap); 188 cb->fill(cb, dev->name, netprio_prio(cont, dev));
169 priority = (map && prioidx < map->priomap_len) ? map->priomap[prioidx] : 0;
170 cb->fill(cb, dev->name, priority);
171 }
172 rcu_read_unlock(); 189 rcu_read_unlock();
173 return 0; 190 return 0;
174} 191}
@@ -176,66 +193,24 @@ static int read_priomap(struct cgroup *cont, struct cftype *cft,
176static int write_priomap(struct cgroup *cgrp, struct cftype *cft, 193static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
177 const char *buffer) 194 const char *buffer)
178{ 195{
179 char *devname = kstrdup(buffer, GFP_KERNEL); 196 char devname[IFNAMSIZ + 1];
180 int ret = -EINVAL;
181 u32 prioidx = cgrp_netprio_state(cgrp)->prioidx;
182 unsigned long priority;
183 char *priostr;
184 struct net_device *dev; 197 struct net_device *dev;
185 struct netprio_map *map; 198 u32 prio;
186 199 int ret;
187 if (!devname)
188 return -ENOMEM;
189
190 /*
191 * Minimally sized valid priomap string
192 */
193 if (strlen(devname) < 3)
194 goto out_free_devname;
195
196 priostr = strstr(devname, " ");
197 if (!priostr)
198 goto out_free_devname;
199
200 /*
201 *Separate the devname from the associated priority
202 *and advance the priostr pointer to the priority value
203 */
204 *priostr = '\0';
205 priostr++;
206
207 /*
208 * If the priostr points to NULL, we're at the end of the passed
209 * in string, and its not a valid write
210 */
211 if (*priostr == '\0')
212 goto out_free_devname;
213
214 ret = kstrtoul(priostr, 10, &priority);
215 if (ret < 0)
216 goto out_free_devname;
217 200
218 ret = -ENODEV; 201 if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
202 return -EINVAL;
219 203
220 dev = dev_get_by_name(&init_net, devname); 204 dev = dev_get_by_name(&init_net, devname);
221 if (!dev) 205 if (!dev)
222 goto out_free_devname; 206 return -ENODEV;
223 207
224 rtnl_lock(); 208 rtnl_lock();
225 ret = write_update_netdev_table(dev);
226 if (ret < 0)
227 goto out_put_dev;
228 209
229 map = rtnl_dereference(dev->priomap); 210 ret = netprio_set_prio(cgrp, dev, prio);
230 if (map)
231 map->priomap[prioidx] = priority;
232 211
233out_put_dev:
234 rtnl_unlock(); 212 rtnl_unlock();
235 dev_put(dev); 213 dev_put(dev);
236
237out_free_devname:
238 kfree(devname);
239 return ret; 214 return ret;
240} 215}
241 216
@@ -276,22 +251,13 @@ static struct cftype ss_files[] = {
276 251
277struct cgroup_subsys net_prio_subsys = { 252struct cgroup_subsys net_prio_subsys = {
278 .name = "net_prio", 253 .name = "net_prio",
279 .create = cgrp_create, 254 .css_alloc = cgrp_css_alloc,
280 .destroy = cgrp_destroy, 255 .css_online = cgrp_css_online,
256 .css_free = cgrp_css_free,
281 .attach = net_prio_attach, 257 .attach = net_prio_attach,
282 .subsys_id = net_prio_subsys_id, 258 .subsys_id = net_prio_subsys_id,
283 .base_cftypes = ss_files, 259 .base_cftypes = ss_files,
284 .module = THIS_MODULE, 260 .module = THIS_MODULE,
285
286 /*
287 * net_prio has artificial limit on the number of cgroups and
288 * disallows nesting making it impossible to co-mount it with other
289 * hierarchical subsystems. Remove the artificially low PRIOIDX_SZ
290 * limit and properly nest configuration such that children follow
291 * their parents' configurations by default and are allowed to
292 * override and remove the following.
293 */
294 .broken_hierarchy = true,
295}; 261};
296 262
297static int netprio_device_event(struct notifier_block *unused, 263static int netprio_device_event(struct notifier_block *unused,
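
The reworked extend_netdev_table() above sizes the per-device priomap as a power of two, starting at PRIOMAP_MIN_SZ and doubling until the array can be indexed by the target cgroup id. A small user-space sketch of that size calculation follows; the struct layout is simplified (the kernel's netprio_map also carries an rcu_head) and the allocation, copy and rtnl locking are left out.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PRIOMAP_MIN_SZ	128

/* simplified stand-in for struct netprio_map */
struct priomap {
	uint32_t priomap_len;
	uint32_t priomap[];	/* indexed directly by cgroup->id */
};

/* how many bytes to allocate so that priomap[target_idx] is valid */
static size_t priomap_alloc_size(uint32_t target_idx)
{
	size_t new_sz = PRIOMAP_MIN_SZ;
	size_t new_len;

	for (;;) {
		new_len = (new_sz - offsetof(struct priomap, priomap)) /
			  sizeof(uint32_t);
		if (new_len > target_idx)
			return new_sz;		/* priomap_len becomes new_len */
		new_sz *= 2;			/* keep the allocation power-of-two sized */
	}
}

int main(void)
{
	printf("cgroup id 3   -> %zu bytes\n", priomap_alloc_size(3));
	printf("cgroup id 200 -> %zu bytes\n", priomap_alloc_size(200));
	return 0;
}

With this simplified layout, ids up to 30 fit in the initial 128-byte allocation and larger ids double the size until they fit, which replaces the old fixed PRIOIDX_SZ bitmap and its artificial cgroup limit.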
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 2ecde225ae60..31f06b633574 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -34,21 +34,25 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
34 struct cgroup_cls_state, css); 34 struct cgroup_cls_state, css);
35} 35}
36 36
37static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) 37static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
38{ 38{
39 struct cgroup_cls_state *cs; 39 struct cgroup_cls_state *cs;
40 40
41 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 41 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
42 if (!cs) 42 if (!cs)
43 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
44 return &cs->css;
45}
44 46
47static int cgrp_css_online(struct cgroup *cgrp)
48{
45 if (cgrp->parent) 49 if (cgrp->parent)
46 cs->classid = cgrp_cls_state(cgrp->parent)->classid; 50 cgrp_cls_state(cgrp)->classid =
47 51 cgrp_cls_state(cgrp->parent)->classid;
48 return &cs->css; 52 return 0;
49} 53}
50 54
51static void cgrp_destroy(struct cgroup *cgrp) 55static void cgrp_css_free(struct cgroup *cgrp)
52{ 56{
53 kfree(cgrp_cls_state(cgrp)); 57 kfree(cgrp_cls_state(cgrp));
54} 58}
@@ -75,20 +79,12 @@ static struct cftype ss_files[] = {
75 79
76struct cgroup_subsys net_cls_subsys = { 80struct cgroup_subsys net_cls_subsys = {
77 .name = "net_cls", 81 .name = "net_cls",
78 .create = cgrp_create, 82 .css_alloc = cgrp_css_alloc,
79 .destroy = cgrp_destroy, 83 .css_online = cgrp_css_online,
84 .css_free = cgrp_css_free,
80 .subsys_id = net_cls_subsys_id, 85 .subsys_id = net_cls_subsys_id,
81 .base_cftypes = ss_files, 86 .base_cftypes = ss_files,
82 .module = THIS_MODULE, 87 .module = THIS_MODULE,
83
84 /*
85 * While net_cls cgroup has the rudimentary hierarchy support of
86 * inheriting the parent's classid on cgroup creation, it doesn't
87 * properly propagates config changes in ancestors to their
88 * descendents. A child should follow the parent's configuration
89 * but be allowed to override it. Fix it and remove the following.
90 */
91 .broken_hierarchy = true,
92}; 88};
93 89
94struct cls_cgroup_head { 90struct cls_cgroup_head {
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index b08d20c66c2e..19ecc8de9e6b 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -82,6 +82,8 @@ static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
82{ 82{
83 struct dev_exception_item *ex, *tmp, *new; 83 struct dev_exception_item *ex, *tmp, *new;
84 84
85 lockdep_assert_held(&devcgroup_mutex);
86
85 list_for_each_entry(ex, orig, list) { 87 list_for_each_entry(ex, orig, list) {
86 new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 88 new = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
87 if (!new) 89 if (!new)
@@ -107,6 +109,8 @@ static int dev_exception_add(struct dev_cgroup *dev_cgroup,
107{ 109{
108 struct dev_exception_item *excopy, *walk; 110 struct dev_exception_item *excopy, *walk;
109 111
112 lockdep_assert_held(&devcgroup_mutex);
113
110 excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 114 excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
111 if (!excopy) 115 if (!excopy)
112 return -ENOMEM; 116 return -ENOMEM;
@@ -137,6 +141,8 @@ static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
137{ 141{
138 struct dev_exception_item *walk, *tmp; 142 struct dev_exception_item *walk, *tmp;
139 143
144 lockdep_assert_held(&devcgroup_mutex);
145
140 list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { 146 list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
141 if (walk->type != ex->type) 147 if (walk->type != ex->type)
142 continue; 148 continue;
@@ -163,6 +169,8 @@ static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
163{ 169{
164 struct dev_exception_item *ex, *tmp; 170 struct dev_exception_item *ex, *tmp;
165 171
172 lockdep_assert_held(&devcgroup_mutex);
173
166 list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { 174 list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) {
167 list_del_rcu(&ex->list); 175 list_del_rcu(&ex->list);
168 kfree_rcu(ex, rcu); 176 kfree_rcu(ex, rcu);
@@ -172,7 +180,7 @@ static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
172/* 180/*
173 * called from kernel/cgroup.c with cgroup_lock() held. 181 * called from kernel/cgroup.c with cgroup_lock() held.
174 */ 182 */
175static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup) 183static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
176{ 184{
177 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup; 185 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
178 struct cgroup *parent_cgroup; 186 struct cgroup *parent_cgroup;
@@ -202,7 +210,7 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup)
202 return &dev_cgroup->css; 210 return &dev_cgroup->css;
203} 211}
204 212
205static void devcgroup_destroy(struct cgroup *cgroup) 213static void devcgroup_css_free(struct cgroup *cgroup)
206{ 214{
207 struct dev_cgroup *dev_cgroup; 215 struct dev_cgroup *dev_cgroup;
208 216
@@ -298,6 +306,10 @@ static int may_access(struct dev_cgroup *dev_cgroup,
298 struct dev_exception_item *ex; 306 struct dev_exception_item *ex;
299 bool match = false; 307 bool match = false;
300 308
309 rcu_lockdep_assert(rcu_read_lock_held() ||
310 lockdep_is_held(&devcgroup_mutex),
311 "device_cgroup::may_access() called without proper synchronization");
312
301 list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) { 313 list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
302 if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK)) 314 if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
303 continue; 315 continue;
@@ -552,8 +564,8 @@ static struct cftype dev_cgroup_files[] = {
552struct cgroup_subsys devices_subsys = { 564struct cgroup_subsys devices_subsys = {
553 .name = "devices", 565 .name = "devices",
554 .can_attach = devcgroup_can_attach, 566 .can_attach = devcgroup_can_attach,
555 .create = devcgroup_create, 567 .css_alloc = devcgroup_css_alloc,
556 .destroy = devcgroup_destroy, 568 .css_free = devcgroup_css_free,
557 .subsys_id = devices_subsys_id, 569 .subsys_id = devices_subsys_id,
558 .base_cftypes = dev_cgroup_files, 570 .base_cftypes = dev_cgroup_files,
559 571