author     Linus Torvalds <torvalds@linux-foundation.org>   2013-04-29 22:14:20 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-04-29 22:14:20 -0400
commit     191a712090bb8a10e6f129360eeed2d68f3d4c9a (patch)
tree       17e2d6c27fb8a7c3a61828fbcc7c343a4966a0a9
parent     46d9be3e5eb01f71fc02653755d970247174b400 (diff)
parent     2a0010af17b1739ef8ea8cf02647a127241ee674 (diff)
Merge branch 'for-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - Fixes and a lot of cleanups.  Locking cleanup is finally complete.
   cgroup_mutex is no longer exposed to individual controllers, which
   used to cause nasty deadlock issues.  Li fixed and cleaned up quite a
   bit, including long-standing ones like the racy cgroup_path().

 - device cgroup now supports proper hierarchy thanks to Aristeu.

 - perf_event cgroup now supports proper hierarchy.

 - A new mount option "__DEVEL__sane_behavior" is added.  As indicated
   by the name, this option is to be used for development only at this
   point and generates a warning message when used.  Unfortunately, the
   cgroup interface currently has too many breakages and inconsistencies
   to implement a consistent and unified hierarchy on top.

   The new flag is used to collect the behavior changes which are
   necessary to implement a consistent unified hierarchy.  It's likely
   that this flag won't be used verbatim when it becomes ready but will
   be enabled implicitly along with the unified hierarchy.

   The option currently disables some of the broken behaviors in cgroup
   core and also the .use_hierarchy switch in memcg (will be routed
   through -mm), which can be used to make a very unusual hierarchy
   where nesting is partially honored.  It will also be used to
   implement hierarchy support for blk-throttle, which would be
   impossible otherwise without introducing a full separate set of
   control knobs.

   This is essentially versioning of the interface, which isn't very
   nice, but at this point I can't see any other option which would
   allow keeping the interface the same while moving towards hierarchy
   behavior which is at least somewhat sane.  The planned unified
   hierarchy is likely to require some level of adaptation from
   userland anyway, so I think it'd be best to take the chance and
   update the interface such that it's supportable in the long term.

   Maintaining the existing interface does complicate cgroup core but
   shouldn't put too much strain on individual controllers and I think
   it'd be manageable for the foreseeable future.  Maybe we'll be able
   to drop it in a decade.

Fix up conflicts (including a semantic one adding a new #include to ppc
that was uncovered by the header file changes) as per Tejun.

* 'for-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (45 commits)
  cpuset: fix compile warning when CONFIG_SMP=n
  cpuset: fix cpu hotplug vs rebuild_sched_domains() race
  cpuset: use rebuild_sched_domains() in cpuset_hotplug_workfn()
  cgroup: restore the call to eventfd->poll()
  cgroup: fix use-after-free when umounting cgroupfs
  cgroup: fix broken file xattrs
  devcg: remove parent_cgroup.
  memcg: force use_hierarchy if sane_behavior
  cgroup: remove cgrp->top_cgroup
  cgroup: introduce sane_behavior mount option
  move cgroupfs_root to include/linux/cgroup.h
  cgroup: convert cgroupfs_root flag bits to masks and add CGRP_ prefix
  cgroup: make cgroup_path() not print double slashes
  Revert "cgroup: remove bind() method from cgroup_subsys."
  perf: make perf_event cgroup hierarchical
  cgroup: implement cgroup_is_descendant()
  cgroup: make sure parent won't be destroyed before its children
  cgroup: remove bind() method from cgroup_subsys.
  devcg: remove broken_hierarchy tag
  cgroup: remove cgroup_lock_is_held()
  ...
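For illustration only (not part of the pull request itself), a minimal sketch of how the development-only option described above would be passed, assuming a conventional /sys/fs/cgroup layout and the memory controller as a placeholder example:

	# hypothetical example; the mount point and controller choice are placeholders
	mount -t cgroup -o __DEVEL__sane_behavior,memory none /sys/fs/cgroup/memory

With this option set, parse_cgroupfs_options() prints the "proceed at your own risk" warning and rejects the "noprefix" and "clone_children" options, and cgroup_remount() refuses remounts (see the kernel/cgroup.c hunks below).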
-rw-r--r--  Documentation/cgroups/cgroups.txt |   3
-rw-r--r--  Documentation/cgroups/devices.txt |  70
-rw-r--r--  arch/powerpc/mm/numa.c            |   1
-rw-r--r--  block/blk-cgroup.h                |   2
-rw-r--r--  include/linux/cgroup.h            | 170
-rw-r--r--  include/linux/cpuset.h            |   1
-rw-r--r--  include/linux/res_counter.h       |   2
-rw-r--r--  kernel/cgroup.c                   | 724
-rw-r--r--  kernel/cpuset.c                   | 115
-rw-r--r--  kernel/events/core.c              |  24
-rw-r--r--  mm/memcontrol.c                   |  80
-rw-r--r--  security/device_cgroup.c          | 267
12 files changed, 826 insertions(+), 633 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index bcf1a00b06a1..638bf17ff869 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -442,7 +442,7 @@ You can attach the current shell task by echoing 0:
 You can use the cgroup.procs file instead of the tasks file to move all
 threads in a threadgroup at once. Echoing the PID of any task in a
 threadgroup to cgroup.procs causes all tasks in that threadgroup to be
-be attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
 in the writing task's threadgroup.
 
 Note: Since every task is always a member of exactly one cgroup in each
@@ -580,6 +580,7 @@ propagation along the hierarchy. See the comment on
 cgroup_for_each_descendant_pre() for details.
 
 void css_offline(struct cgroup *cgrp);
+(cgroup_mutex held by caller)
 
 This is the counterpart of css_online() and called iff css_online()
 has succeeded on @cgrp. This signifies the beginning of the end of
diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt
index 16624a7f8222..3c1095ca02ea 100644
--- a/Documentation/cgroups/devices.txt
+++ b/Documentation/cgroups/devices.txt
@@ -13,9 +13,7 @@ either an integer or * for all. Access is a composition of r
 The root device cgroup starts with rwm to 'all'. A child device
 cgroup gets a copy of the parent. Administrators can then remove
 devices from the whitelist or add new entries. A child cgroup can
-never receive a device access which is denied by its parent. However
-when a device access is removed from a parent it will not also be
-removed from the child(ren).
+never receive a device access which is denied by its parent.
 
 2. User Interface
 
@@ -50,3 +48,69 @@ task to a new cgroup. (Again we'll probably want to change that).
 
 A cgroup may not be granted more permissions than the cgroup's
 parent has.
+
+4. Hierarchy
+
+device cgroups maintain hierarchy by making sure a cgroup never has more
+access permissions than its parent. Every time an entry is written to
+a cgroup's devices.deny file, all its children will have that entry removed
+from their whitelist and all the locally set whitelist entries will be
+re-evaluated. In case one of the locally set whitelist entries would provide
+more access than the cgroup's parent, it'll be removed from the whitelist.
+
+Example:
+      A
+     / \
+        B
+
+    group        behavior        exceptions
+    A            allow           "b 8:* rwm", "c 116:1 rw"
+    B            deny            "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
+
+If a device is denied in group A:
+	# echo "c 116:* r" > A/devices.deny
+it'll propagate down and after revalidating B's entries, the whitelist entry
+"c 116:2 rwm" will be removed:
+
+    group        whitelist entries                  denied devices
+    A            all                                "b 8:* rwm", "c 116:* rw"
+    B            "c 1:3 rwm", "b 3:* rwm"           all the rest
+
+In case parent's exceptions change and local exceptions are not allowed
+anymore, they'll be deleted.
+
+Notice that new whitelist entries will not be propagated:
+      A
+     / \
+        B
+
+    group        whitelist entries                  denied devices
+    A            "c 1:3 rwm", "c 1:5 r"             all the rest
+    B            "c 1:3 rwm", "c 1:5 r"             all the rest
+
+when adding "c *:3 rwm":
+	# echo "c *:3 rwm" >A/devices.allow
+
+the result:
+    group        whitelist entries                  denied devices
+    A            "c *:3 rwm", "c 1:5 r"             all the rest
+    B            "c 1:3 rwm", "c 1:5 r"             all the rest
+
+but now it'll be possible to add new entries to B:
+	# echo "c 2:3 rwm" >B/devices.allow
+	# echo "c 50:3 r" >B/devices.allow
+or even
+	# echo "c *:3 rwm" >B/devices.allow
+
+Allowing or denying all by writing 'a' to devices.allow or devices.deny will
+not be possible once the device cgroups has children.
+
+4.1 Hierarchy (internal implementation)
+
+device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
+list of exceptions. The internal state is controlled using the same user
+interface to preserve compatibility with the previous whitelist-only
+implementation. Removal or addition of exceptions that will reduce the access
+to devices will be propagated down the hierarchy.
+For every propagated exception, the effective rules will be re-evaluated based
+on current parent's access rules.
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b8020dc7b71e..fa33c546e778 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -22,6 +22,7 @@
 #include <linux/pfn.h>
 #include <linux/cpuset.h>
 #include <linux/node.h>
+#include <linux/slab.h>
 #include <asm/sparsemem.h>
 #include <asm/prom.h>
 #include <asm/smp.h>
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index f2b292925ccd..4e595ee8c915 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -247,9 +247,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
 {
 	int ret;
 
-	rcu_read_lock();
 	ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-	rcu_read_unlock();
 	if (ret)
 		strncpy(buf, "<unavailable>", buflen);
 	return ret;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 470073bf93d0..d86e215ca2b8 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -19,6 +19,7 @@
 #include <linux/idr.h>
 #include <linux/workqueue.h>
 #include <linux/xattr.h>
+#include <linux/fs.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -30,10 +31,6 @@ struct css_id;
 
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
-extern void cgroup_lock(void);
-extern int cgroup_lock_is_held(void);
-extern bool cgroup_lock_live_group(struct cgroup *cgrp);
-extern void cgroup_unlock(void);
 extern void cgroup_fork(struct task_struct *p);
 extern void cgroup_post_fork(struct task_struct *p);
 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
@@ -44,14 +41,25 @@ extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
 
 extern const struct file_operations proc_cgroup_operations;
 
-/* Define the enumeration of all builtin cgroup subsystems */
+/*
+ * Define the enumeration of all cgroup subsystems.
+ *
+ * We define ids for builtin subsystems and then modular ones.
+ */
 #define SUBSYS(_x) _x ## _subsys_id,
-#define IS_SUBSYS_ENABLED(option) IS_ENABLED(option)
 enum cgroup_subsys_id {
+#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
+#include <linux/cgroup_subsys.h>
+#undef IS_SUBSYS_ENABLED
+	CGROUP_BUILTIN_SUBSYS_COUNT,
+
+	__CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1,
+
+#define IS_SUBSYS_ENABLED(option) IS_MODULE(option)
 #include <linux/cgroup_subsys.h>
+#undef IS_SUBSYS_ENABLED
 	CGROUP_SUBSYS_COUNT,
 };
-#undef IS_SUBSYS_ENABLED
 #undef SUBSYS
 
 /* Per-subsystem/per-cgroup state maintained by the system. */
@@ -148,6 +156,13 @@ enum {
 	 * specified at mount time and thus is implemented here.
 	 */
 	CGRP_CPUSET_CLONE_CHILDREN,
+	/* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */
+	CGRP_SANE_BEHAVIOR,
+};
+
+struct cgroup_name {
+	struct rcu_head rcu_head;
+	char name[];
 };
 
 struct cgroup {
@@ -172,11 +187,23 @@ struct cgroup {
 	struct cgroup *parent;	/* my parent */
 	struct dentry *dentry;	/* cgroup fs entry, RCU protected */
 
+	/*
+	 * This is a copy of dentry->d_name, and it's needed because
+	 * we can't use dentry->d_name in cgroup_path().
+	 *
+	 * You must acquire rcu_read_lock() to access cgrp->name, and
+	 * the only place that can change it is rename(), which is
+	 * protected by parent dir's i_mutex.
+	 *
+	 * Normally you should use cgroup_name() wrapper rather than
+	 * access it directly.
+	 */
+	struct cgroup_name __rcu *name;
+
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 
 	struct cgroupfs_root *root;
-	struct cgroup *top_cgroup;
 
 	/*
 	 * List of cg_cgroup_links pointing at css_sets with
@@ -213,6 +240,96 @@ struct cgroup {
 	struct simple_xattrs xattrs;
 };
 
+#define MAX_CGROUP_ROOT_NAMELEN 64
+
+/* cgroupfs_root->flags */
+enum {
+	/*
+	 * Unfortunately, cgroup core and various controllers are riddled
+	 * with idiosyncrasies and pointless options.  The following flag,
+	 * when set, will force sane behavior - some options are forced on,
+	 * others are disallowed, and some controllers will change their
+	 * hierarchical or other behaviors.
+	 *
+	 * The set of behaviors affected by this flag are still being
+	 * determined and developed and the mount option for this flag is
+	 * prefixed with __DEVEL__.  The prefix will be dropped once we
+	 * reach the point where all behaviors are compatible with the
+	 * planned unified hierarchy, which will automatically turn on this
+	 * flag.
+	 *
+	 * The followings are the behaviors currently affected this flag.
+	 *
+	 * - Mount options "noprefix" and "clone_children" are disallowed.
+	 *   Also, cgroupfs file cgroup.clone_children is not created.
+	 *
+	 * - When mounting an existing superblock, mount options should
+	 *   match.
+	 *
+	 * - Remount is disallowed.
+	 *
+	 * - memcg: use_hierarchy is on by default and the cgroup file for
+	 *   the flag is not created.
+	 *
+	 * The followings are planned changes.
+	 *
+	 * - release_agent will be disallowed once replacement notification
+	 *   mechanism is implemented.
+	 */
+	CGRP_ROOT_SANE_BEHAVIOR	= (1 << 0),
+
+	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
+	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+};
+
+/*
+ * A cgroupfs_root represents the root of a cgroup hierarchy, and may be
+ * associated with a superblock to form an active hierarchy.  This is
+ * internal to cgroup core.  Don't access directly from controllers.
+ */
+struct cgroupfs_root {
+	struct super_block *sb;
+
+	/*
+	 * The bitmask of subsystems intended to be attached to this
+	 * hierarchy
+	 */
+	unsigned long subsys_mask;
+
+	/* Unique id for this hierarchy. */
+	int hierarchy_id;
+
+	/* The bitmask of subsystems currently attached to this hierarchy */
+	unsigned long actual_subsys_mask;
+
+	/* A list running through the attached subsystems */
+	struct list_head subsys_list;
+
+	/* The root cgroup for this hierarchy */
+	struct cgroup top_cgroup;
+
+	/* Tracks how many cgroups are currently defined in hierarchy.*/
+	int number_of_cgroups;
+
+	/* A list running through the active hierarchies */
+	struct list_head root_list;
+
+	/* All cgroups on this root, cgroup_mutex protected */
+	struct list_head allcg_list;
+
+	/* Hierarchy-specific flags */
+	unsigned long flags;
+
+	/* IDs for cgroups in this hierarchy */
+	struct ida cgroup_ida;
+
+	/* The path to use for release notifications. */
+	char release_agent_path[PATH_MAX];
+
+	/* The name for this hierarchy - may be empty */
+	char name[MAX_CGROUP_ROOT_NAMELEN];
+};
+
 /*
  * A css_set is a structure holding pointers to a set of
  * cgroup_subsys_state objects. This saves space in the task struct
@@ -278,6 +395,7 @@ struct cgroup_map_cb {
 /* cftype->flags */
 #define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
 #define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create on root cg */
+#define CFTYPE_INSANE		(1U << 2)	/* don't create if sane_behavior */
 
 #define MAX_CFTYPE_NAME		64
 
@@ -304,9 +422,6 @@ struct cftype {
 	/* CFTYPE_* flags */
 	unsigned int flags;
 
-	/* file xattrs */
-	struct simple_xattrs xattrs;
-
 	int (*open)(struct inode *inode, struct file *file);
 	ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
 			struct file *file,
@@ -404,18 +519,31 @@ struct cgroup_scanner {
 	void *data;
 };
 
+/*
+ * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details.  This
+ * function can be called as long as @cgrp is accessible.
+ */
+static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
+{
+	return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
+}
+
+/* Caller should hold rcu_read_lock() */
+static inline const char *cgroup_name(const struct cgroup *cgrp)
+{
+	return rcu_dereference(cgrp->name)->name;
+}
+
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 
 int cgroup_is_removed(const struct cgroup *cgrp);
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
-/* Return true if cgrp is a descendant of the task's cgroup */
-int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
-
 /*
  * Control Group taskset, used to pass around set of tasks to cgroup_subsys
  * methods.
@@ -523,10 +651,16 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state(
  * rcu_dereference_check() conditions, such as locks used during the
  * cgroup_subsys::attach() methods.
  */
+#ifdef CONFIG_PROVE_RCU
+extern struct mutex cgroup_mutex;
+#define task_subsys_state_check(task, subsys_id, __c)			\
+	rcu_dereference_check((task)->cgroups->subsys[(subsys_id)],	\
+			      lockdep_is_held(&(task)->alloc_lock) ||	\
+			      lockdep_is_held(&cgroup_mutex) || (__c))
+#else
 #define task_subsys_state_check(task, subsys_id, __c)			\
-	rcu_dereference_check(task->cgroups->subsys[subsys_id],	\
-			      lockdep_is_held(&task->alloc_lock) ||	\
-			      cgroup_lock_is_held() || (__c))
+	rcu_dereference((task)->cgroups->subsys[(subsys_id)])
+#endif
 
 static inline struct cgroup_subsys_state *
 task_subsys_state(struct task_struct *task, int subsys_id)
@@ -661,8 +795,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 					  struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
-int cgroup_attach_task(struct cgroup *, struct task_struct *);
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
 
 /*
  * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 8c8a60d29407..ccd1de8ad822 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -11,7 +11,6 @@
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
-#include <linux/cgroup.h>
 #include <linux/mm.h>
 
 #ifdef CONFIG_CPUSETS
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index c23099413ad6..96a509b6be04 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -13,7 +13,7 @@
  * info about what this counter is.
  */
 
-#include <linux/cgroup.h>
+#include <linux/spinlock.h>
 #include <linux/errno.h>
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1f628bc039f4..eeb7e49946b2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -30,7 +30,6 @@
 #include <linux/cred.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
-#include <linux/fs.h>
 #include <linux/init_task.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
@@ -59,7 +58,7 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/eventfd.h>
 #include <linux/poll.h>
-#include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
 
 #include <linux/atomic.h>
@@ -83,7 +82,13 @@
  * B happens only through cgroup_show_options() and using cgroup_root_mutex
  * breaks it.
  */
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for task_subsys_state_check() */
+#else
 static DEFINE_MUTEX(cgroup_mutex);
+#endif
+
 static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
@@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
98#include <linux/cgroup_subsys.h> 103#include <linux/cgroup_subsys.h>
99}; 104};
100 105
101#define MAX_CGROUP_ROOT_NAMELEN 64
102
103/*
104 * A cgroupfs_root represents the root of a cgroup hierarchy,
105 * and may be associated with a superblock to form an active
106 * hierarchy
107 */
108struct cgroupfs_root {
109 struct super_block *sb;
110
111 /*
112 * The bitmask of subsystems intended to be attached to this
113 * hierarchy
114 */
115 unsigned long subsys_mask;
116
117 /* Unique id for this hierarchy. */
118 int hierarchy_id;
119
120 /* The bitmask of subsystems currently attached to this hierarchy */
121 unsigned long actual_subsys_mask;
122
123 /* A list running through the attached subsystems */
124 struct list_head subsys_list;
125
126 /* The root cgroup for this hierarchy */
127 struct cgroup top_cgroup;
128
129 /* Tracks how many cgroups are currently defined in hierarchy.*/
130 int number_of_cgroups;
131
132 /* A list running through the active hierarchies */
133 struct list_head root_list;
134
135 /* All cgroups on this root, cgroup_mutex protected */
136 struct list_head allcg_list;
137
138 /* Hierarchy-specific flags */
139 unsigned long flags;
140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
144 /* The path to use for release notifications. */
145 char release_agent_path[PATH_MAX];
146
147 /* The name for this hierarchy - may be empty */
148 char name[MAX_CGROUP_ROOT_NAMELEN];
149};
150
151/* 106/*
152 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 107 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
153 * subsystems that are otherwise unattached - it never has more than a 108 * subsystems that are otherwise unattached - it never has more than a
@@ -162,6 +117,9 @@ struct cfent {
 	struct list_head node;
 	struct dentry *dentry;
 	struct cftype *type;
+
+	/* file xattrs */
+	struct simple_xattrs xattrs;
 };
 
 /*
@@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
 #define dummytop (&rootnode.top_cgroup)
 
+static struct cgroup_name root_cgroup_name = { .name = "/" };
+
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call. This avoids us having to do
  * extra work in the fork/exit path if none of the subsystems need to
@@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 209static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add); 210 struct cftype cfts[], bool is_add);
251 211
252#ifdef CONFIG_PROVE_LOCKING
253int cgroup_lock_is_held(void)
254{
255 return lockdep_is_held(&cgroup_mutex);
256}
257#else /* #ifdef CONFIG_PROVE_LOCKING */
258int cgroup_lock_is_held(void)
259{
260 return mutex_is_locked(&cgroup_mutex);
261}
262#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
263
264EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
265
266static int css_unbias_refcnt(int refcnt) 212static int css_unbias_refcnt(int refcnt)
267{ 213{
268 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; 214 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
@@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
282 return test_bit(CGRP_REMOVED, &cgrp->flags); 228 return test_bit(CGRP_REMOVED, &cgrp->flags);
283} 229}
284 230
285/* bits in struct cgroupfs_root flags field */ 231/**
286enum { 232 * cgroup_is_descendant - test ancestry
287 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 233 * @cgrp: the cgroup to be tested
288 ROOT_XATTR, /* supports extended attributes */ 234 * @ancestor: possible ancestor of @cgrp
289}; 235 *
236 * Test whether @cgrp is a descendant of @ancestor. It also returns %true
237 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
238 * and @ancestor are accessible.
239 */
240bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
241{
242 while (cgrp) {
243 if (cgrp == ancestor)
244 return true;
245 cgrp = cgrp->parent;
246 }
247 return false;
248}
249EXPORT_SYMBOL_GPL(cgroup_is_descendant);
290 250
291static int cgroup_is_releasable(const struct cgroup *cgrp) 251static int cgroup_is_releasable(const struct cgroup *cgrp)
292{ 252{
@@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
327 return __d_cfe(dentry)->type; 287 return __d_cfe(dentry)->type;
328} 288}
329 289
290/**
291 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
292 * @cgrp: the cgroup to be checked for liveness
293 *
294 * On success, returns true; the mutex should be later unlocked. On
295 * failure returns false with no lock held.
296 */
297static bool cgroup_lock_live_group(struct cgroup *cgrp)
298{
299 mutex_lock(&cgroup_mutex);
300 if (cgroup_is_removed(cgrp)) {
301 mutex_unlock(&cgroup_mutex);
302 return false;
303 }
304 return true;
305}
306
330/* the list of cgroups eligible for automatic release. Protected by 307/* the list of cgroups eligible for automatic release. Protected by
331 * release_list_lock */ 308 * release_list_lock */
332static LIST_HEAD(release_list); 309static LIST_HEAD(release_list);
@@ -800,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
800 * update of a tasks cgroup pointer by cgroup_attach_task() 777 * update of a tasks cgroup pointer by cgroup_attach_task()
801 */ 778 */
802 779
803/**
804 * cgroup_lock - lock out any changes to cgroup structures
805 *
806 */
807void cgroup_lock(void)
808{
809 mutex_lock(&cgroup_mutex);
810}
811EXPORT_SYMBOL_GPL(cgroup_lock);
812
813/**
814 * cgroup_unlock - release lock on cgroup changes
815 *
816 * Undo the lock taken in a previous cgroup_lock() call.
817 */
818void cgroup_unlock(void)
819{
820 mutex_unlock(&cgroup_mutex);
821}
822EXPORT_SYMBOL_GPL(cgroup_unlock);
823
824/* 780/*
825 * A couple of forward declarations required, due to cyclic reference loop: 781 * A couple of forward declarations required, due to cyclic reference loop:
826 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> 782 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
@@ -859,6 +815,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
859 return inode; 815 return inode;
860} 816}
861 817
818static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
819{
820 struct cgroup_name *name;
821
822 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
823 if (!name)
824 return NULL;
825 strcpy(name->name, dentry->d_name.name);
826 return name;
827}
828
862static void cgroup_free_fn(struct work_struct *work) 829static void cgroup_free_fn(struct work_struct *work)
863{ 830{
864 struct cgroup *cgrp = container_of(work, struct cgroup, free_work); 831 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
@@ -875,8 +842,18 @@ static void cgroup_free_fn(struct work_struct *work)
875 mutex_unlock(&cgroup_mutex); 842 mutex_unlock(&cgroup_mutex);
876 843
877 /* 844 /*
845 * We get a ref to the parent's dentry, and put the ref when
846 * this cgroup is being freed, so it's guaranteed that the
847 * parent won't be destroyed before its children.
848 */
849 dput(cgrp->parent->dentry);
850
851 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
852
853 /*
878 * Drop the active superblock reference that we took when we 854 * Drop the active superblock reference that we took when we
879 * created the cgroup 855 * created the cgroup. This will free cgrp->root, if we are
856 * holding the last reference to @sb.
880 */ 857 */
881 deactivate_super(cgrp->root->sb); 858 deactivate_super(cgrp->root->sb);
882 859
@@ -888,7 +865,7 @@ static void cgroup_free_fn(struct work_struct *work)
888 865
889 simple_xattrs_free(&cgrp->xattrs); 866 simple_xattrs_free(&cgrp->xattrs);
890 867
891 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); 868 kfree(rcu_dereference_raw(cgrp->name));
892 kfree(cgrp); 869 kfree(cgrp);
893} 870}
894 871
@@ -910,13 +887,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
910 } else { 887 } else {
911 struct cfent *cfe = __d_cfe(dentry); 888 struct cfent *cfe = __d_cfe(dentry);
912 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 889 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
913 struct cftype *cft = cfe->type;
914 890
915 WARN_ONCE(!list_empty(&cfe->node) && 891 WARN_ONCE(!list_empty(&cfe->node) &&
916 cgrp != &cgrp->root->top_cgroup, 892 cgrp != &cgrp->root->top_cgroup,
917 "cfe still linked for %s\n", cfe->type->name); 893 "cfe still linked for %s\n", cfe->type->name);
894 simple_xattrs_free(&cfe->xattrs);
918 kfree(cfe); 895 kfree(cfe);
919 simple_xattrs_free(&cft->xattrs);
920 } 896 }
921 iput(inode); 897 iput(inode);
922} 898}
@@ -1108,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	mutex_lock(&cgroup_root_mutex);
 	for_each_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
-	if (test_bit(ROOT_NOPREFIX, &root->flags))
+	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
+		seq_puts(seq, ",sane_behavior");
+	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
-	if (test_bit(ROOT_XATTR, &root->flags))
+	if (root->flags & CGRP_ROOT_XATTR)
 		seq_puts(seq, ",xattr");
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
@@ -1172,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			all_ss = true;
 			continue;
 		}
+		if (!strcmp(token, "__DEVEL__sane_behavior")) {
+			opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+			continue;
+		}
 		if (!strcmp(token, "noprefix")) {
-			set_bit(ROOT_NOPREFIX, &opts->flags);
+			opts->flags |= CGRP_ROOT_NOPREFIX;
 			continue;
 		}
 		if (!strcmp(token, "clone_children")) {
@@ -1181,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 		if (!strcmp(token, "xattr")) {
-			set_bit(ROOT_XATTR, &opts->flags);
+			opts->flags |= CGRP_ROOT_XATTR;
 			continue;
 		}
 		if (!strncmp(token, "release_agent=", 14)) {
@@ -1259,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 	/* Consistency checks */
 
+	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+		pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+
+		if (opts->flags & CGRP_ROOT_NOPREFIX) {
+			pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
+			return -EINVAL;
+		}
+
+		if (opts->cpuset_clone_children) {
+			pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
+			return -EINVAL;
+		}
+	}
+
 	/*
 	 * Option noprefix was introduced just for backward compatibility
 	 * with the old cpuset, so we allow noprefix only if mounting just
 	 * the cpuset subsystem.
 	 */
-	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
-	    (opts->subsys_mask & mask))
+	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
 		return -EINVAL;
 
 
@@ -1336,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroup_sb_opts opts;
 	unsigned long added_mask, removed_mask;
 
+	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+		pr_err("cgroup: sane_behavior: remount is not allowed\n");
+		return -EINVAL;
+	}
+
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
@@ -1421,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
-	cgrp->top_cgroup = cgrp;
+	cgrp->name = &root_cgroup_name;
 	init_cgroup_housekeeping(cgrp);
 	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
@@ -1685,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 * any) is not needed
 		 */
 		cgroup_drop_root(opts.new_root);
+
+		if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
+		    root->flags != opts.flags) {
+			pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
+			ret = -EINVAL;
+			goto drop_new_super;
+		}
+
 		/* no subsys rebinding, so refcounts don't change */
 		drop_parsed_module_refcounts(opts.subsys_mask);
 	}
@@ -1769,49 +1777,48 @@ static struct kobject *cgroup_kobj;
  * @buf: the buffer to write the path into
  * @buflen: the length of the buffer
  *
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference.  Writes path of cgroup into buf.  Returns 0 on success,
- * -errno on error.
+ * Writes path of cgroup into buf.  Returns 0 on success, -errno on error.
+ *
+ * We can't generate cgroup path using dentry->d_name, as accessing
+ * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
+ * inode's i_mutex, while on the other hand cgroup_path() can be called
+ * with some irq-safe spinlocks held.
  */
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
-	struct dentry *dentry = cgrp->dentry;
+	int ret = -ENAMETOOLONG;
 	char *start;
 
-	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
-			   "cgroup_path() called without proper locking");
-
-	if (cgrp == dummytop) {
-		/*
-		 * Inactive subsystems have no dentry for their root
-		 * cgroup
-		 */
-		strcpy(buf, "/");
+	if (!cgrp->parent) {
+		if (strlcpy(buf, "/", buflen) >= buflen)
+			return -ENAMETOOLONG;
 		return 0;
 	}
 
 	start = buf + buflen - 1;
-
 	*start = '\0';
-	for (;;) {
-		int len = dentry->d_name.len;
 
+	rcu_read_lock();
+	do {
+		const char *name = cgroup_name(cgrp);
+		int len;
+
+		len = strlen(name);
 		if ((start -= len) < buf)
-			return -ENAMETOOLONG;
-		memcpy(start, dentry->d_name.name, len);
-		cgrp = cgrp->parent;
-		if (!cgrp)
-			break;
+			goto out;
+		memcpy(start, name, len);
 
-		dentry = cgrp->dentry;
-		if (!cgrp->parent)
-			continue;
 		if (--start < buf)
-			return -ENAMETOOLONG;
+			goto out;
 		*start = '/';
-	}
+
+		cgrp = cgrp->parent;
+	} while (cgrp->parent);
+	ret = 0;
 	memmove(buf, start, buf + buflen - start);
-	return 0;
+out:
+	rcu_read_unlock();
+	return ret;
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
@@ -1900,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1900 * 1907 *
1901 * Must be called with cgroup_mutex and threadgroup locked. 1908 * Must be called with cgroup_mutex and threadgroup locked.
1902 */ 1909 */
1903static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1910static void cgroup_task_migrate(struct cgroup *oldcgrp,
1904 struct task_struct *tsk, struct css_set *newcg) 1911 struct task_struct *tsk, struct css_set *newcg)
1905{ 1912{
1906 struct css_set *oldcg; 1913 struct css_set *oldcg;
@@ -1933,121 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1933} 1940}
1934 1941
1935/** 1942/**
1936 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1943 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
1937 * @cgrp: the cgroup the task is attaching to
1938 * @tsk: the task to be attached
1939 *
1940 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1941 * @tsk during call.
1942 */
1943int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1944{
1945 int retval = 0;
1946 struct cgroup_subsys *ss, *failed_ss = NULL;
1947 struct cgroup *oldcgrp;
1948 struct cgroupfs_root *root = cgrp->root;
1949 struct cgroup_taskset tset = { };
1950 struct css_set *newcg;
1951
1952 /* @tsk either already exited or can't exit until the end */
1953 if (tsk->flags & PF_EXITING)
1954 return -ESRCH;
1955
1956 /* Nothing to do if the task is already in that cgroup */
1957 oldcgrp = task_cgroup_from_root(tsk, root);
1958 if (cgrp == oldcgrp)
1959 return 0;
1960
1961 tset.single.task = tsk;
1962 tset.single.cgrp = oldcgrp;
1963
1964 for_each_subsys(root, ss) {
1965 if (ss->can_attach) {
1966 retval = ss->can_attach(cgrp, &tset);
1967 if (retval) {
1968 /*
1969 * Remember on which subsystem the can_attach()
1970 * failed, so that we only call cancel_attach()
1971 * against the subsystems whose can_attach()
1972 * succeeded. (See below)
1973 */
1974 failed_ss = ss;
1975 goto out;
1976 }
1977 }
1978 }
1979
1980 newcg = find_css_set(tsk->cgroups, cgrp);
1981 if (!newcg) {
1982 retval = -ENOMEM;
1983 goto out;
1984 }
1985
1986 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
1987
1988 for_each_subsys(root, ss) {
1989 if (ss->attach)
1990 ss->attach(cgrp, &tset);
1991 }
1992
1993out:
1994 if (retval) {
1995 for_each_subsys(root, ss) {
1996 if (ss == failed_ss)
1997 /*
1998 * This subsystem was the one that failed the
1999 * can_attach() check earlier, so we don't need
2000 * to call cancel_attach() against it or any
2001 * remaining subsystems.
2002 */
2003 break;
2004 if (ss->cancel_attach)
2005 ss->cancel_attach(cgrp, &tset);
2006 }
2007 }
2008 return retval;
2009}
2010
2011/**
2012 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2013 * @from: attach to all cgroups of a given task
2014 * @tsk: the task to be attached
2015 */
2016int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2017{
2018 struct cgroupfs_root *root;
2019 int retval = 0;
2020
2021 cgroup_lock();
2022 for_each_active_root(root) {
2023 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2024
2025 retval = cgroup_attach_task(from_cg, tsk);
2026 if (retval)
2027 break;
2028 }
2029 cgroup_unlock();
2030
2031 return retval;
2032}
2033EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2034
2035/**
2036 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2037 * @cgrp: the cgroup to attach to 1944 * @cgrp: the cgroup to attach to
2038 * @leader: the threadgroup leader task_struct of the group to be attached 1945 * @tsk: the task or the leader of the threadgroup to be attached
1946 * @threadgroup: attach the whole threadgroup?
2039 * 1947 *
2040 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1948 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
2041 * task_lock of each thread in leader's threadgroup individually in turn. 1949 * task_lock of @tsk or each thread in the threadgroup individually in turn.
2042 */ 1950 */
2043static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 1951static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1952 bool threadgroup)
2044{ 1953{
2045 int retval, i, group_size; 1954 int retval, i, group_size;
2046 struct cgroup_subsys *ss, *failed_ss = NULL; 1955 struct cgroup_subsys *ss, *failed_ss = NULL;
2047 /* guaranteed to be initialized later, but the compiler needs this */
2048 struct cgroupfs_root *root = cgrp->root; 1956 struct cgroupfs_root *root = cgrp->root;
2049 /* threadgroup list cursor and array */ 1957 /* threadgroup list cursor and array */
2050 struct task_struct *tsk; 1958 struct task_struct *leader = tsk;
2051 struct task_and_cgroup *tc; 1959 struct task_and_cgroup *tc;
2052 struct flex_array *group; 1960 struct flex_array *group;
2053 struct cgroup_taskset tset = { }; 1961 struct cgroup_taskset tset = { };
@@ -2059,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2059 * group - group_rwsem prevents new threads from appearing, and if 1967 * group - group_rwsem prevents new threads from appearing, and if
2060 * threads exit, this will just be an over-estimate. 1968 * threads exit, this will just be an over-estimate.
2061 */ 1969 */
2062 group_size = get_nr_threads(leader); 1970 if (threadgroup)
1971 group_size = get_nr_threads(tsk);
1972 else
1973 group_size = 1;
2063 /* flex_array supports very large thread-groups better than kmalloc. */ 1974 /* flex_array supports very large thread-groups better than kmalloc. */
2064 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1975 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2065 if (!group) 1976 if (!group)
2066 return -ENOMEM; 1977 return -ENOMEM;
2067 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1978 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2068 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); 1979 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
2069 if (retval) 1980 if (retval)
2070 goto out_free_group_list; 1981 goto out_free_group_list;
2071 1982
2072 tsk = leader;
2073 i = 0; 1983 i = 0;
2074 /* 1984 /*
2075 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1985 * Prevent freeing of tasks while we take a snapshot. Tasks that are
@@ -2098,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2098 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2008 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2099 BUG_ON(retval != 0); 2009 BUG_ON(retval != 0);
2100 i++; 2010 i++;
2011
2012 if (!threadgroup)
2013 break;
2101 } while_each_thread(leader, tsk); 2014 } while_each_thread(leader, tsk);
2102 rcu_read_unlock(); 2015 rcu_read_unlock();
2103 /* remember the number of threads in the array for later. */ 2016 /* remember the number of threads in the array for later. */
@@ -2143,7 +2056,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2143 */ 2056 */
2144 for (i = 0; i < group_size; i++) { 2057 for (i = 0; i < group_size; i++) {
2145 tc = flex_array_get(group, i); 2058 tc = flex_array_get(group, i);
2146 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); 2059 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
2147 } 2060 }
2148 /* nothing is sensitive to fork() after this point. */ 2061 /* nothing is sensitive to fork() after this point. */
2149 2062
@@ -2251,17 +2164,42 @@ retry_find_task:
2251 put_task_struct(tsk); 2164 put_task_struct(tsk);
2252 goto retry_find_task; 2165 goto retry_find_task;
2253 } 2166 }
2254 ret = cgroup_attach_proc(cgrp, tsk); 2167 }
2255 } else 2168
2256 ret = cgroup_attach_task(cgrp, tsk); 2169 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2170
2257 threadgroup_unlock(tsk); 2171 threadgroup_unlock(tsk);
2258 2172
2259 put_task_struct(tsk); 2173 put_task_struct(tsk);
2260out_unlock_cgroup: 2174out_unlock_cgroup:
2261 cgroup_unlock(); 2175 mutex_unlock(&cgroup_mutex);
2262 return ret; 2176 return ret;
2263} 2177}
2264 2178
2179/**
2180 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2181 * @from: attach to all cgroups of a given task
2182 * @tsk: the task to be attached
2183 */
2184int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2185{
2186 struct cgroupfs_root *root;
2187 int retval = 0;
2188
2189 mutex_lock(&cgroup_mutex);
2190 for_each_active_root(root) {
2191 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2192
2193 retval = cgroup_attach_task(from_cg, tsk, false);
2194 if (retval)
2195 break;
2196 }
2197 mutex_unlock(&cgroup_mutex);
2198
2199 return retval;
2200}
2201EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2202
2265static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2203static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2266{ 2204{
2267 return attach_task_by_pid(cgrp, pid, false); 2205 return attach_task_by_pid(cgrp, pid, false);
@@ -2272,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2272 return attach_task_by_pid(cgrp, tgid, true); 2210 return attach_task_by_pid(cgrp, tgid, true);
2273} 2211}
2274 2212
2275/**
2276 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
2277 * @cgrp: the cgroup to be checked for liveness
2278 *
2279 * On success, returns true; the lock should be later released with
2280 * cgroup_unlock(). On failure returns false with no lock held.
2281 */
2282bool cgroup_lock_live_group(struct cgroup *cgrp)
2283{
2284 mutex_lock(&cgroup_mutex);
2285 if (cgroup_is_removed(cgrp)) {
2286 mutex_unlock(&cgroup_mutex);
2287 return false;
2288 }
2289 return true;
2290}
2291EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
2292
2293static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2213static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2294 const char *buffer) 2214 const char *buffer)
2295{ 2215{
@@ -2301,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 	mutex_lock(&cgroup_root_mutex);
 	strcpy(cgrp->root->release_agent_path, buffer);
 	mutex_unlock(&cgroup_root_mutex);
-	cgroup_unlock();
+	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 
@@ -2312,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 		return -ENODEV;
 	seq_puts(seq, cgrp->root->release_agent_path);
 	seq_putc(seq, '\n');
-	cgroup_unlock();
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *seq)
+{
+	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
 	return 0;
 }
 
@@ -2537,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
2537static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2464static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2538 struct inode *new_dir, struct dentry *new_dentry) 2465 struct inode *new_dir, struct dentry *new_dentry)
2539{ 2466{
2467 int ret;
2468 struct cgroup_name *name, *old_name;
2469 struct cgroup *cgrp;
2470
2471 /*
2472	 * It's convenient to use parent dir's i_mutex to protect
2473 * cgrp->name.
2474 */
2475 lockdep_assert_held(&old_dir->i_mutex);
2476
2540 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2477 if (!S_ISDIR(old_dentry->d_inode->i_mode))
2541 return -ENOTDIR; 2478 return -ENOTDIR;
2542 if (new_dentry->d_inode) 2479 if (new_dentry->d_inode)
2543 return -EEXIST; 2480 return -EEXIST;
2544 if (old_dir != new_dir) 2481 if (old_dir != new_dir)
2545 return -EIO; 2482 return -EIO;
2546 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2483
2484 cgrp = __d_cgrp(old_dentry);
2485
2486 name = cgroup_alloc_name(new_dentry);
2487 if (!name)
2488 return -ENOMEM;
2489
2490 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2491 if (ret) {
2492 kfree(name);
2493 return ret;
2494 }
2495
2496 old_name = cgrp->name;
2497 rcu_assign_pointer(cgrp->name, name);
2498
2499 kfree_rcu(old_name, rcu_head);
2500 return 0;
2547} 2501}
2548 2502
2549static struct simple_xattrs *__d_xattrs(struct dentry *dentry) 2503static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
@@ -2551,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
 	if (S_ISDIR(dentry->d_inode->i_mode))
 		return &__d_cgrp(dentry)->xattrs;
 	else
-		return &__d_cft(dentry)->xattrs;
+		return &__d_cfe(dentry)->xattrs;
 }
 
 static inline int xattr_enabled(struct dentry *dentry)
 {
 	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
-	return test_bit(ROOT_XATTR, &root->flags);
+	return root->flags & CGRP_ROOT_XATTR;
 }
 
 static bool is_valid_xattr(const char *name)
@@ -2727,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	umode_t mode;
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 
-	simple_xattrs_init(&cft->xattrs);
-
-	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
+	if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
 	}
@@ -2753,6 +2705,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 		cfe->type = (void *)cft;
 		cfe->dentry = dentry;
 		dentry->d_fsdata = cfe;
+		simple_xattrs_init(&cfe->xattrs);
 		list_add_tail(&cfe->node, &parent->files);
 		cfe = NULL;
 	}
@@ -2770,6 +2723,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+			continue;
 		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
 			continue;
 		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
@@ -3300,6 +3255,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3300 return 0; 3255 return 0;
3301} 3256}
3302 3257
3258static void cgroup_transfer_one_task(struct task_struct *task,
3259 struct cgroup_scanner *scan)
3260{
3261 struct cgroup *new_cgroup = scan->data;
3262
3263 mutex_lock(&cgroup_mutex);
3264 cgroup_attach_task(new_cgroup, task, false);
3265 mutex_unlock(&cgroup_mutex);
3266}
3267
3268/**
3269 * cgroup_transfer_tasks - move tasks from one cgroup to another
3270 * @to: cgroup to which the tasks will be moved
3271 * @from: cgroup in which the tasks currently reside
3272 */
3273int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3274{
3275 struct cgroup_scanner scan;
3276
3277 scan.cg = from;
3278 scan.test_task = NULL; /* select all tasks in cgroup */
3279 scan.process_task = cgroup_transfer_one_task;
3280 scan.heap = NULL;
3281 scan.data = to;
3282
3283 return cgroup_scan_tasks(&scan);
3284}
3285
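cgroup_transfer_tasks() above is a thin wrapper around cgroup_scan_tasks(): a NULL test_task selects every task, and process_task is called on each one with the destination cgroup carried in scan.data. A rough userspace sketch of that filter-and-visit pattern, with struct scanner, walk_items() and the item type all invented for illustration:

#include <stdio.h>
#include <stdbool.h>

struct item { int id; };

struct scanner {
	bool (*test)(struct item *it, void *data);	/* NULL: select all */
	void (*process)(struct item *it, void *data);
	void *data;
};

static void walk_items(struct item *items, int n, struct scanner *scan)
{
	for (int i = 0; i < n; i++) {
		if (scan->test && !scan->test(&items[i], scan->data))
			continue;
		scan->process(&items[i], scan->data);
	}
}

static void move_item(struct item *it, void *data)
{
	printf("moving item %d to %s\n", it->id, (const char *)data);
}

int main(void)
{
	struct item items[] = { {1}, {2}, {3} };
	struct scanner scan = {
		.test = NULL,		/* select all items */
		.process = move_item,
		.data = "destination",
	};

	walk_items(items, 3, &scan);
	return 0;
}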
3303/* 3286/*
3304 * Stuff for reading the 'tasks'/'procs' files. 3287 * Stuff for reading the 'tasks'/'procs' files.
3305 * 3288 *
@@ -3362,35 +3345,14 @@ static void pidlist_free(void *p)
3362 else 3345 else
3363 kfree(p); 3346 kfree(p);
3364} 3347}
3365static void *pidlist_resize(void *p, int newcount)
3366{
3367 void *newlist;
3368 /* note: if new alloc fails, old p will still be valid either way */
3369 if (is_vmalloc_addr(p)) {
3370 newlist = vmalloc(newcount * sizeof(pid_t));
3371 if (!newlist)
3372 return NULL;
3373 memcpy(newlist, p, newcount * sizeof(pid_t));
3374 vfree(p);
3375 } else {
3376 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
3377 }
3378 return newlist;
3379}
3380 3348
3381/* 3349/*
3382 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3350 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3383 * If the new stripped list is sufficiently smaller and there's enough memory 3351 * Returns the number of unique elements.
3384 * to allocate a new buffer, will let go of the unneeded memory. Returns the
3385 * number of unique elements.
3386 */ 3352 */
3387/* is the size difference enough that we should re-allocate the array? */ 3353static int pidlist_uniq(pid_t *list, int length)
3388#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
3389static int pidlist_uniq(pid_t **p, int length)
3390{ 3354{
3391 int src, dest = 1; 3355 int src, dest = 1;
3392 pid_t *list = *p;
3393 pid_t *newlist;
3394 3356
3395 /* 3357 /*
3396 * we presume the 0th element is unique, so i starts at 1. trivial 3358 * we presume the 0th element is unique, so i starts at 1. trivial
@@ -3411,16 +3373,6 @@ static int pidlist_uniq(pid_t **p, int length)
3411 dest++; 3373 dest++;
3412 } 3374 }
3413after: 3375after:
3414 /*
3415 * if the length difference is large enough, we want to allocate a
3416 * smaller buffer to save memory. if this fails due to out of memory,
3417 * we'll just stay with what we've got.
3418 */
3419 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
3420 newlist = pidlist_resize(list, dest);
3421 if (newlist)
3422 *p = newlist;
3423 }
3424 return dest; 3376 return dest;
3425} 3377}
3426 3378
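With the resize heuristic gone, pidlist_uniq() is an in-place deduplication of an already sorted array: a write cursor trails the read cursor and an element is copied down only when it differs from the last element kept. A compact standalone formulation of the same idea (uniq_sorted() is an illustrative name, not the kernel helper):

#include <stdio.h>

/* drop duplicates from a sorted array in place, return the new length */
static int uniq_sorted(int *a, int n)
{
	int src, dest = 1;

	if (n <= 1)
		return n;
	for (src = 1; src < n; src++) {
		if (a[src] != a[dest - 1])
			a[dest++] = a[src];
	}
	return dest;
}

int main(void)
{
	int pids[] = { 3, 3, 7, 7, 7, 9, 12, 12 };
	int n = uniq_sorted(pids, 8);

	for (int i = 0; i < n; i++)
		printf("%d ", pids[i]);
	printf("\n");	/* prints: 3 7 9 12 */
	return 0;
}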
@@ -3516,7 +3468,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3516 /* now sort & (if procs) strip out duplicates */ 3468 /* now sort & (if procs) strip out duplicates */
3517 sort(array, length, sizeof(pid_t), cmppid, NULL); 3469 sort(array, length, sizeof(pid_t), cmppid, NULL);
3518 if (type == CGROUP_FILE_PROCS) 3470 if (type == CGROUP_FILE_PROCS)
3519 length = pidlist_uniq(&array, length); 3471 length = pidlist_uniq(array, length);
3520 l = cgroup_pidlist_find(cgrp, type); 3472 l = cgroup_pidlist_find(cgrp, type);
3521 if (!l) { 3473 if (!l) {
3522 pidlist_free(array); 3474 pidlist_free(array);
@@ -3930,11 +3882,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3930 if (ret) 3882 if (ret)
3931 goto fail; 3883 goto fail;
3932 3884
3933 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { 3885 efile->f_op->poll(efile, &event->pt);
3934 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3935 ret = 0;
3936 goto fail;
3937 }
3938 3886
3939 /* 3887 /*
3940 * Events should be removed after rmdir of cgroup directory, but before 3888 * Events should be removed after rmdir of cgroup directory, but before
@@ -4016,10 +3964,16 @@ static struct cftype files[] = {
4016 }, 3964 },
4017 { 3965 {
4018 .name = "cgroup.clone_children", 3966 .name = "cgroup.clone_children",
3967 .flags = CFTYPE_INSANE,
4019 .read_u64 = cgroup_clone_children_read, 3968 .read_u64 = cgroup_clone_children_read,
4020 .write_u64 = cgroup_clone_children_write, 3969 .write_u64 = cgroup_clone_children_write,
4021 }, 3970 },
4022 { 3971 {
3972 .name = "cgroup.sane_behavior",
3973 .flags = CFTYPE_ONLY_ON_ROOT,
3974 .read_seq_string = cgroup_sane_behavior_show,
3975 },
3976 {
4023 .name = "release_agent", 3977 .name = "release_agent",
4024 .flags = CFTYPE_ONLY_ON_ROOT, 3978 .flags = CFTYPE_ONLY_ON_ROOT,
4025 .read_seq_string = cgroup_release_agent_show, 3979 .read_seq_string = cgroup_release_agent_show,
@@ -4131,17 +4085,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4131 if (!(css->flags & CSS_ONLINE)) 4085 if (!(css->flags & CSS_ONLINE))
4132 return; 4086 return;
4133 4087
4134 /* 4088 if (ss->css_offline)
4135 * css_offline() should be called with cgroup_mutex unlocked. See
4136 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4137 * details. This temporary unlocking should go away once
4138 * cgroup_mutex is unexported from controllers.
4139 */
4140 if (ss->css_offline) {
4141 mutex_unlock(&cgroup_mutex);
4142 ss->css_offline(cgrp); 4089 ss->css_offline(cgrp);
4143 mutex_lock(&cgroup_mutex);
4144 }
4145 4090
4146 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4091 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4147} 4092}
@@ -4158,6 +4103,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4158 umode_t mode) 4103 umode_t mode)
4159{ 4104{
4160 struct cgroup *cgrp; 4105 struct cgroup *cgrp;
4106 struct cgroup_name *name;
4161 struct cgroupfs_root *root = parent->root; 4107 struct cgroupfs_root *root = parent->root;
4162 int err = 0; 4108 int err = 0;
4163 struct cgroup_subsys *ss; 4109 struct cgroup_subsys *ss;
@@ -4168,9 +4114,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4168 if (!cgrp) 4114 if (!cgrp)
4169 return -ENOMEM; 4115 return -ENOMEM;
4170 4116
4117 name = cgroup_alloc_name(dentry);
4118 if (!name)
4119 goto err_free_cgrp;
4120 rcu_assign_pointer(cgrp->name, name);
4121
4171 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4122 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4172 if (cgrp->id < 0) 4123 if (cgrp->id < 0)
4173 goto err_free_cgrp; 4124 goto err_free_name;
4174 4125
4175 /* 4126 /*
4176 * Only live parents can have children. Note that the liveliness 4127 * Only live parents can have children. Note that the liveliness
@@ -4198,7 +4149,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4198 4149
4199 cgrp->parent = parent; 4150 cgrp->parent = parent;
4200 cgrp->root = parent->root; 4151 cgrp->root = parent->root;
4201 cgrp->top_cgroup = parent->top_cgroup;
4202 4152
4203 if (notify_on_release(parent)) 4153 if (notify_on_release(parent))
4204 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4154 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4241,6 +4191,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4241 for_each_subsys(root, ss) 4191 for_each_subsys(root, ss)
4242 dget(dentry); 4192 dget(dentry);
4243 4193
4194 /* hold a ref to the parent's dentry */
4195 dget(parent->dentry);
4196
4244 /* creation succeeded, notify subsystems */ 4197 /* creation succeeded, notify subsystems */
4245 for_each_subsys(root, ss) { 4198 for_each_subsys(root, ss) {
4246 err = online_css(ss, cgrp); 4199 err = online_css(ss, cgrp);
@@ -4276,6 +4229,8 @@ err_free_all:
4276 deactivate_super(sb); 4229 deactivate_super(sb);
4277err_free_id: 4230err_free_id:
4278 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4231 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4232err_free_name:
4233 kfree(rcu_dereference_raw(cgrp->name));
4279err_free_cgrp: 4234err_free_cgrp:
4280 kfree(cgrp); 4235 kfree(cgrp);
4281 return err; 4236 return err;
@@ -4295,56 +4250,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4295 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4250 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4296} 4251}
4297 4252
4298/*
4299 * Check the reference count on each subsystem. Since we already
4300 * established that there are no tasks in the cgroup, if the css refcount
4301 * is also 1, then there should be no outstanding references, so the
4302 * subsystem is safe to destroy. We scan across all subsystems rather than
4303 * using the per-hierarchy linked list of mounted subsystems since we can
4304 * be called via check_for_release() with no synchronization other than
4305 * RCU, and the subsystem linked list isn't RCU-safe.
4306 */
4307static int cgroup_has_css_refs(struct cgroup *cgrp)
4308{
4309 int i;
4310
4311 /*
4312 * We won't need to lock the subsys array, because the subsystems
4313 * we're concerned about aren't going anywhere since our cgroup root
4314 * has a reference on them.
4315 */
4316 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4317 struct cgroup_subsys *ss = subsys[i];
4318 struct cgroup_subsys_state *css;
4319
4320 /* Skip subsystems not present or not in this hierarchy */
4321 if (ss == NULL || ss->root != cgrp->root)
4322 continue;
4323
4324 css = cgrp->subsys[ss->subsys_id];
4325 /*
4326 * When called from check_for_release() it's possible
4327 * that by this point the cgroup has been removed
4328 * and the css deleted. But a false-positive doesn't
4329 * matter, since it can only happen if the cgroup
4330 * has been deleted and hence no longer needs the
4331 * release agent to be called anyway.
4332 */
4333 if (css && css_refcnt(css) > 1)
4334 return 1;
4335 }
4336 return 0;
4337}
4338
4339static int cgroup_destroy_locked(struct cgroup *cgrp) 4253static int cgroup_destroy_locked(struct cgroup *cgrp)
4340 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4254 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4341{ 4255{
4342 struct dentry *d = cgrp->dentry; 4256 struct dentry *d = cgrp->dentry;
4343 struct cgroup *parent = cgrp->parent; 4257 struct cgroup *parent = cgrp->parent;
4344 DEFINE_WAIT(wait);
4345 struct cgroup_event *event, *tmp; 4258 struct cgroup_event *event, *tmp;
4346 struct cgroup_subsys *ss; 4259 struct cgroup_subsys *ss;
4347 LIST_HEAD(tmp_list);
4348 4260
4349 lockdep_assert_held(&d->d_inode->i_mutex); 4261 lockdep_assert_held(&d->d_inode->i_mutex);
4350 lockdep_assert_held(&cgroup_mutex); 4262 lockdep_assert_held(&cgroup_mutex);
@@ -4935,17 +4847,17 @@ void cgroup_post_fork(struct task_struct *child)
4935 * and addition to css_set. 4847 * and addition to css_set.
4936 */ 4848 */
4937 if (need_forkexit_callback) { 4849 if (need_forkexit_callback) {
4938 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4850 /*
4851 * fork/exit callbacks are supported only for builtin
4852 * subsystems, and the builtin section of the subsys
4853 * array is immutable, so we don't need to lock the
 4854 * subsys array here. On the other hand, the modular
 4855 * section of the array can be freed at module unload,
 4856 * so we can't touch it.
4857 */
4858 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4939 struct cgroup_subsys *ss = subsys[i]; 4859 struct cgroup_subsys *ss = subsys[i];
4940 4860
4941 /*
4942 * fork/exit callbacks are supported only for
4943 * builtin subsystems and we don't need further
4944 * synchronization as they never go away.
4945 */
4946 if (!ss || ss->module)
4947 continue;
4948
4949 if (ss->fork) 4861 if (ss->fork)
4950 ss->fork(child); 4862 ss->fork(child);
4951 } 4863 }
@@ -5010,13 +4922,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5010 tsk->cgroups = &init_css_set; 4922 tsk->cgroups = &init_css_set;
5011 4923
5012 if (run_callbacks && need_forkexit_callback) { 4924 if (run_callbacks && need_forkexit_callback) {
5013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4925 /*
4926 * fork/exit callbacks are supported only for builtin
4927 * subsystems, see cgroup_post_fork() for details.
4928 */
4929 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5014 struct cgroup_subsys *ss = subsys[i]; 4930 struct cgroup_subsys *ss = subsys[i];
5015 4931
5016 /* modular subsystems can't use callbacks */
5017 if (!ss || ss->module)
5018 continue;
5019
5020 if (ss->exit) { 4932 if (ss->exit) {
5021 struct cgroup *old_cgrp = 4933 struct cgroup *old_cgrp =
5022 rcu_dereference_raw(cg->subsys[i])->cgroup; 4934 rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5030,44 +4942,19 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5030 put_css_set_taskexit(cg); 4942 put_css_set_taskexit(cg);
5031} 4943}
5032 4944
5033/**
5034 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
5035 * @cgrp: the cgroup in question
5036 * @task: the task in question
5037 *
5038 * See if @cgrp is a descendant of @task's cgroup in the appropriate
5039 * hierarchy.
5040 *
5041 * If we are sending in dummytop, then presumably we are creating
5042 * the top cgroup in the subsystem.
5043 *
5044 * Called only by the ns (nsproxy) cgroup.
5045 */
5046int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
5047{
5048 int ret;
5049 struct cgroup *target;
5050
5051 if (cgrp == dummytop)
5052 return 1;
5053
5054 target = task_cgroup_from_root(task, cgrp->root);
5055 while (cgrp != target && cgrp!= cgrp->top_cgroup)
5056 cgrp = cgrp->parent;
5057 ret = (cgrp == target);
5058 return ret;
5059}
5060
5061static void check_for_release(struct cgroup *cgrp) 4945static void check_for_release(struct cgroup *cgrp)
5062{ 4946{
5063 /* All of these checks rely on RCU to keep the cgroup 4947 /* All of these checks rely on RCU to keep the cgroup
5064 * structure alive */ 4948 * structure alive */
5065 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) 4949 if (cgroup_is_releasable(cgrp) &&
5066 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { 4950 !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
5067 /* Control Group is currently removeable. If it's not 4951 /*
 4952 * Control Group is currently removable. If it's not
5068 * already queued for a userspace notification, queue 4953 * already queued for a userspace notification, queue
5069 * it now */ 4954 * it now
4955 */
5070 int need_schedule_work = 0; 4956 int need_schedule_work = 0;
4957
5071 raw_spin_lock(&release_list_lock); 4958 raw_spin_lock(&release_list_lock);
5072 if (!cgroup_is_removed(cgrp) && 4959 if (!cgroup_is_removed(cgrp) &&
5073 list_empty(&cgrp->release_list)) { 4960 list_empty(&cgrp->release_list)) {
@@ -5100,24 +4987,11 @@ EXPORT_SYMBOL_GPL(__css_tryget);
5100/* Caller must verify that the css is not for root cgroup */ 4987/* Caller must verify that the css is not for root cgroup */
5101void __css_put(struct cgroup_subsys_state *css) 4988void __css_put(struct cgroup_subsys_state *css)
5102{ 4989{
5103 struct cgroup *cgrp = css->cgroup;
5104 int v; 4990 int v;
5105 4991
5106 rcu_read_lock();
5107 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); 4992 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
5108 4993 if (v == 0)
5109 switch (v) {
5110 case 1:
5111 if (notify_on_release(cgrp)) {
5112 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5113 check_for_release(cgrp);
5114 }
5115 break;
5116 case 0:
5117 schedule_work(&css->dput_work); 4994 schedule_work(&css->dput_work);
5118 break;
5119 }
5120 rcu_read_unlock();
5121} 4995}
5122EXPORT_SYMBOL_GPL(__css_put); 4996EXPORT_SYMBOL_GPL(__css_put);
5123 4997
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 027a6f65f2ad..12331120767c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -265,17 +265,6 @@ static DEFINE_MUTEX(cpuset_mutex);
265static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
266 266
267/* 267/*
268 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
269 * buffers. They are statically allocated to prevent using excess stack
270 * when calling cpuset_print_task_mems_allowed().
271 */
272#define CPUSET_NAME_LEN (128)
273#define CPUSET_NODELIST_LEN (256)
274static char cpuset_name[CPUSET_NAME_LEN];
275static char cpuset_nodelist[CPUSET_NODELIST_LEN];
276static DEFINE_SPINLOCK(cpuset_buffer_lock);
277
278/*
279 * CPU / memory hotplug is handled asynchronously. 268 * CPU / memory hotplug is handled asynchronously.
280 */ 269 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq; 270static struct workqueue_struct *cpuset_propagate_hotplug_wq;
@@ -780,25 +769,26 @@ static void rebuild_sched_domains_locked(void)
780 lockdep_assert_held(&cpuset_mutex); 769 lockdep_assert_held(&cpuset_mutex);
781 get_online_cpus(); 770 get_online_cpus();
782 771
772 /*
 773 * We have raced with CPU hotplug. Bail out to avoid passing
 774 * doms with an offlined cpu to partition_sched_domains().
 775 * The hotplug work item will rebuild the sched domains anyway.
776 */
777 if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
778 goto out;
779
783 /* Generate domain masks and attrs */ 780 /* Generate domain masks and attrs */
784 ndoms = generate_sched_domains(&doms, &attr); 781 ndoms = generate_sched_domains(&doms, &attr);
785 782
786 /* Have scheduler rebuild the domains */ 783 /* Have scheduler rebuild the domains */
787 partition_sched_domains(ndoms, doms, attr); 784 partition_sched_domains(ndoms, doms, attr);
788 785out:
789 put_online_cpus(); 786 put_online_cpus();
790} 787}
791#else /* !CONFIG_SMP */ 788#else /* !CONFIG_SMP */
792static void rebuild_sched_domains_locked(void) 789static void rebuild_sched_domains_locked(void)
793{ 790{
794} 791}
795
796static int generate_sched_domains(cpumask_var_t **domains,
797 struct sched_domain_attr **attributes)
798{
799 *domains = NULL;
800 return 1;
801}
802#endif /* CONFIG_SMP */ 792#endif /* CONFIG_SMP */
803 793
804void rebuild_sched_domains(void) 794void rebuild_sched_domains(void)
@@ -2005,50 +1995,6 @@ int __init cpuset_init(void)
2005 return 0; 1995 return 0;
2006} 1996}
2007 1997
2008/**
2009 * cpuset_do_move_task - move a given task to another cpuset
2010 * @tsk: pointer to task_struct the task to move
2011 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
2012 *
2013 * Called by cgroup_scan_tasks() for each task in a cgroup.
2014 * Return nonzero to stop the walk through the tasks.
2015 */
2016static void cpuset_do_move_task(struct task_struct *tsk,
2017 struct cgroup_scanner *scan)
2018{
2019 struct cgroup *new_cgroup = scan->data;
2020
2021 cgroup_lock();
2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
2024}
2025
2026/**
2027 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
2028 * @from: cpuset in which the tasks currently reside
2029 * @to: cpuset to which the tasks will be moved
2030 *
2031 * Called with cpuset_mutex held
2032 * callback_mutex must not be held, as cpuset_attach() will take it.
2033 *
2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
2035 * calling callback functions for each.
2036 */
2037static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
2038{
2039 struct cgroup_scanner scan;
2040
2041 scan.cg = from->css.cgroup;
2042 scan.test_task = NULL; /* select all tasks in cgroup */
2043 scan.process_task = cpuset_do_move_task;
2044 scan.heap = NULL;
2045 scan.data = to->css.cgroup;
2046
2047 if (cgroup_scan_tasks(&scan))
2048 printk(KERN_ERR "move_member_tasks_to_cpuset: "
2049 "cgroup_scan_tasks failed\n");
2050}
2051
2052/* 1998/*
2053 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 1999 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
2054 * or memory nodes, we need to walk over the cpuset hierarchy, 2000 * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2069,7 +2015,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2069 nodes_empty(parent->mems_allowed)) 2015 nodes_empty(parent->mems_allowed))
2070 parent = parent_cs(parent); 2016 parent = parent_cs(parent);
2071 2017
2072 move_member_tasks_to_cpuset(cs, parent); 2018 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2019 rcu_read_lock();
2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
2021 cgroup_name(cs->css.cgroup));
2022 rcu_read_unlock();
2023 }
2073} 2024}
2074 2025
2075/** 2026/**
@@ -2222,17 +2173,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2222 flush_workqueue(cpuset_propagate_hotplug_wq); 2173 flush_workqueue(cpuset_propagate_hotplug_wq);
2223 2174
2224 /* rebuild sched domains if cpus_allowed has changed */ 2175 /* rebuild sched domains if cpus_allowed has changed */
2225 if (cpus_updated) { 2176 if (cpus_updated)
2226 struct sched_domain_attr *attr; 2177 rebuild_sched_domains();
2227 cpumask_var_t *doms;
2228 int ndoms;
2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2235 }
2236} 2178}
2237 2179
2238void cpuset_update_active_cpus(bool cpu_online) 2180void cpuset_update_active_cpus(bool cpu_online)
@@ -2594,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2594 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2536 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2595} 2537}
2596 2538
2539#define CPUSET_NODELIST_LEN (256)
2540
2597/** 2541/**
2598 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2542 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2599 * @task: pointer to task_struct of some task. 2543 * @task: pointer to task_struct of some task.
@@ -2604,25 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2604 */ 2548 */
2605void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2549void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2606{ 2550{
2607 struct dentry *dentry; 2551 /* Statically allocated to prevent using excess stack. */
2552 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2553 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2608 2554
2609 dentry = task_cs(tsk)->css.cgroup->dentry; 2555 struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
2610 spin_lock(&cpuset_buffer_lock);
2611 2556
2612 if (!dentry) { 2557 rcu_read_lock();
2613 strcpy(cpuset_name, "/"); 2558 spin_lock(&cpuset_buffer_lock);
2614 } else {
2615 spin_lock(&dentry->d_lock);
2616 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2617 CPUSET_NAME_LEN);
2618 spin_unlock(&dentry->d_lock);
2619 }
2620 2559
2621 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2560 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2622 tsk->mems_allowed); 2561 tsk->mems_allowed);
2623 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2562 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2624 tsk->comm, cpuset_name, cpuset_nodelist); 2563 tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
2564
2625 spin_unlock(&cpuset_buffer_lock); 2565 spin_unlock(&cpuset_buffer_lock);
2566 rcu_read_unlock();
2626} 2567}
2627 2568
2628/* 2569/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9fcb0944f071..dce6e13cf9d7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -251,7 +251,22 @@ perf_cgroup_match(struct perf_event *event)
251 struct perf_event_context *ctx = event->ctx; 251 struct perf_event_context *ctx = event->ctx;
252 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 252 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
253 253
254 return !event->cgrp || event->cgrp == cpuctx->cgrp; 254 /* @event doesn't care about cgroup */
255 if (!event->cgrp)
256 return true;
257
258 /* wants specific cgroup scope but @cpuctx isn't associated with any */
259 if (!cpuctx->cgrp)
260 return false;
261
262 /*
263 * Cgroup scoping is recursive. An event enabled for a cgroup is
264 * also enabled for all its descendant cgroups. If @cpuctx's
265 * cgroup is a descendant of @event's (the test covers identity
266 * case), it's a match.
267 */
268 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
269 event->cgrp->css.cgroup);
255} 270}
256 271
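perf_cgroup_match() now leans on cgroup_is_descendant(), which treats a cgroup as its own descendant, so an event attached to a parent cgroup also matches contexts running in any of its children. A small self-contained sketch of that ancestor walk on a generic parent-linked tree; struct node and is_descendant() here are illustrative stand-ins, not the kernel's types:

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
	struct node *parent;
	const char *name;
};

/* true if cand == ancestor or ancestor lies on cand's parent chain */
static bool is_descendant(struct node *cand, struct node *ancestor)
{
	while (cand) {
		if (cand == ancestor)
			return true;
		cand = cand->parent;
	}
	return false;
}

int main(void)
{
	struct node root = { NULL,  "root" };
	struct node mid  = { &root, "mid"  };
	struct node leaf = { &mid,  "leaf" };

	/* an event on "mid" matches contexts in "mid" and "leaf", not "root" */
	printf("%d %d %d\n",
	       is_descendant(&leaf, &mid),	/* 1 */
	       is_descendant(&mid,  &mid),	/* 1 */
	       is_descendant(&root, &mid));	/* 0 */
	return 0;
}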
257static inline bool perf_tryget_cgroup(struct perf_event *event) 272static inline bool perf_tryget_cgroup(struct perf_event *event)
@@ -7517,12 +7532,5 @@ struct cgroup_subsys perf_subsys = {
7517 .css_free = perf_cgroup_css_free, 7532 .css_free = perf_cgroup_css_free,
7518 .exit = perf_cgroup_exit, 7533 .exit = perf_cgroup_exit,
7519 .attach = perf_cgroup_attach, 7534 .attach = perf_cgroup_attach,
7520
7521 /*
7522 * perf_event cgroup doesn't handle nesting correctly.
7523 * ctx->nr_cgroups adjustments should be propagated through the
7524 * cgroup hierarchy. Fix it and remove the following.
7525 */
7526 .broken_hierarchy = true,
7527}; 7535};
7528#endif /* CONFIG_CGROUP_PERF */ 7536#endif /* CONFIG_CGROUP_PERF */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b8dc8e4cbf6a..0f1d92163f30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3321,52 +3321,53 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3321 schedule_work(&cachep->memcg_params->destroy); 3321 schedule_work(&cachep->memcg_params->destroy);
3322} 3322}
3323 3323
3324static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) 3324/*
3325{ 3325 * This lock protects updaters, not readers. We want readers to be as fast as
3326 char *name; 3326 * they can, and they will either see NULL or a valid cache value. Our model
3327 struct dentry *dentry; 3327 * allow them to see NULL, in which case the root memcg will be selected.
3328 3328 *
3329 rcu_read_lock(); 3329 * We need this lock because multiple allocations to the same cache from a non
3330 dentry = rcu_dereference(memcg->css.cgroup->dentry); 3330 * will span more than one worker. Only one of them can create the cache.
3331 rcu_read_unlock(); 3331 */
3332 3332static DEFINE_MUTEX(memcg_cache_mutex);
3333 BUG_ON(dentry == NULL);
3334
3335 name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
3336 memcg_cache_id(memcg), dentry->d_name.name);
3337
3338 return name;
3339}
3340 3333
3334/*
3335 * Called with memcg_cache_mutex held
3336 */
3341static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, 3337static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3342 struct kmem_cache *s) 3338 struct kmem_cache *s)
3343{ 3339{
3344 char *name;
3345 struct kmem_cache *new; 3340 struct kmem_cache *new;
3341 static char *tmp_name = NULL;
3346 3342
3347 name = memcg_cache_name(memcg, s); 3343 lockdep_assert_held(&memcg_cache_mutex);
3348 if (!name) 3344
3349 return NULL; 3345 /*
 3346 * kmem_cache_create_memcg duplicates the given name, and
 3347 * looking up cgroup_name for this name requires RCU context.
 3348 * This static temporary buffer is used to avoid a pointless
 3349 * short-lived allocation.
3350 */
3351 if (!tmp_name) {
3352 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3353 if (!tmp_name)
3354 return NULL;
3355 }
3356
3357 rcu_read_lock();
3358 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
3359 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
3360 rcu_read_unlock();
3350 3361
3351 new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, 3362 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3352 (s->flags & ~SLAB_PANIC), s->ctor, s); 3363 (s->flags & ~SLAB_PANIC), s->ctor, s);
3353 3364
3354 if (new) 3365 if (new)
3355 new->allocflags |= __GFP_KMEMCG; 3366 new->allocflags |= __GFP_KMEMCG;
3356 3367
3357 kfree(name);
3358 return new; 3368 return new;
3359} 3369}
3360 3370
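kmem_cache_dup() above formats the per-memcg cache name as "<base>(<memcg id>:<cgroup name>)" into a static buffer that is reused across calls, which is only safe because the function runs under memcg_cache_mutex. A trivial userspace sketch of just the naming scheme; build_cache_name() and the buffer size are made up for illustration:

#include <stdio.h>
#include <stddef.h>

#define NAME_MAX_LEN 256

/* format "<base>(<id>:<cgroup>)", e.g. "dentry(3:background)" */
static void build_cache_name(char *buf, size_t len, const char *base,
			     int memcg_id, const char *cgroup_name)
{
	snprintf(buf, len, "%s(%d:%s)", base, memcg_id, cgroup_name);
}

int main(void)
{
	char name[NAME_MAX_LEN];

	build_cache_name(name, sizeof(name), "dentry", 3, "background");
	printf("%s\n", name);	/* dentry(3:background) */
	return 0;
}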
3361/*
3362 * This lock protects updaters, not readers. We want readers to be as fast as
3363 * they can, and they will either see NULL or a valid cache value. Our model
3364 * allow them to see NULL, in which case the root memcg will be selected.
3365 *
3366 * We need this lock because multiple allocations to the same cache from a non
3367 * will span more than one worker. Only one of them can create the cache.
3368 */
3369static DEFINE_MUTEX(memcg_cache_mutex);
3370static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, 3371static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3371 struct kmem_cache *cachep) 3372 struct kmem_cache *cachep)
3372{ 3373{
@@ -5912,6 +5913,7 @@ static struct cftype mem_cgroup_files[] = {
5912 }, 5913 },
5913 { 5914 {
5914 .name = "use_hierarchy", 5915 .name = "use_hierarchy",
5916 .flags = CFTYPE_INSANE,
5915 .write_u64 = mem_cgroup_hierarchy_write, 5917 .write_u64 = mem_cgroup_hierarchy_write,
5916 .read_u64 = mem_cgroup_hierarchy_read, 5918 .read_u64 = mem_cgroup_hierarchy_read,
5917 }, 5919 },
@@ -6907,6 +6909,21 @@ static void mem_cgroup_move_task(struct cgroup *cont,
6907} 6909}
6908#endif 6910#endif
6909 6911
6912/*
6913 * Cgroup retains root cgroups across [un]mount cycles making it necessary
6914 * to verify sane_behavior flag on each mount attempt.
6915 */
6916static void mem_cgroup_bind(struct cgroup *root)
6917{
6918 /*
6919 * use_hierarchy is forced with sane_behavior. cgroup core
6920 * guarantees that @root doesn't have any children, so turning it
6921 * on for the root memcg is enough.
6922 */
6923 if (cgroup_sane_behavior(root))
6924 mem_cgroup_from_cont(root)->use_hierarchy = true;
6925}
6926
6910struct cgroup_subsys mem_cgroup_subsys = { 6927struct cgroup_subsys mem_cgroup_subsys = {
6911 .name = "memory", 6928 .name = "memory",
6912 .subsys_id = mem_cgroup_subsys_id, 6929 .subsys_id = mem_cgroup_subsys_id,
@@ -6917,6 +6934,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
6917 .can_attach = mem_cgroup_can_attach, 6934 .can_attach = mem_cgroup_can_attach,
6918 .cancel_attach = mem_cgroup_cancel_attach, 6935 .cancel_attach = mem_cgroup_cancel_attach,
6919 .attach = mem_cgroup_move_task, 6936 .attach = mem_cgroup_move_task,
6937 .bind = mem_cgroup_bind,
6920 .base_cftypes = mem_cgroup_files, 6938 .base_cftypes = mem_cgroup_files,
6921 .early_init = 0, 6939 .early_init = 0,
6922 .use_id = 1, 6940 .use_id = 1,
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 1c69e38e3a2c..dd0dc574d78d 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -25,6 +25,12 @@
25 25
26static DEFINE_MUTEX(devcgroup_mutex); 26static DEFINE_MUTEX(devcgroup_mutex);
27 27
28enum devcg_behavior {
29 DEVCG_DEFAULT_NONE,
30 DEVCG_DEFAULT_ALLOW,
31 DEVCG_DEFAULT_DENY,
32};
33
28/* 34/*
29 * exception list locking rules: 35 * exception list locking rules:
30 * hold devcgroup_mutex for update/read. 36 * hold devcgroup_mutex for update/read.
@@ -42,10 +48,9 @@ struct dev_exception_item {
42struct dev_cgroup { 48struct dev_cgroup {
43 struct cgroup_subsys_state css; 49 struct cgroup_subsys_state css;
44 struct list_head exceptions; 50 struct list_head exceptions;
45 enum { 51 enum devcg_behavior behavior;
46 DEVCG_DEFAULT_ALLOW, 52 /* temporary list for pending propagation operations */
47 DEVCG_DEFAULT_DENY, 53 struct list_head propagate_pending;
48 } behavior;
49}; 54};
50 55
51static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) 56static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
@@ -182,35 +187,62 @@ static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
182 __dev_exception_clean(dev_cgroup); 187 __dev_exception_clean(dev_cgroup);
183} 188}
184 189
190static inline bool is_devcg_online(const struct dev_cgroup *devcg)
191{
192 return (devcg->behavior != DEVCG_DEFAULT_NONE);
193}
194
195/**
196 * devcgroup_online - initializes devcgroup's behavior and exceptions based on
197 * parent's
198 * @cgroup: cgroup getting online
199 * returns 0 in case of success, error code otherwise
200 */
201static int devcgroup_online(struct cgroup *cgroup)
202{
203 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup = NULL;
204 int ret = 0;
205
206 mutex_lock(&devcgroup_mutex);
207 dev_cgroup = cgroup_to_devcgroup(cgroup);
208 if (cgroup->parent)
209 parent_dev_cgroup = cgroup_to_devcgroup(cgroup->parent);
210
211 if (parent_dev_cgroup == NULL)
212 dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
213 else {
214 ret = dev_exceptions_copy(&dev_cgroup->exceptions,
215 &parent_dev_cgroup->exceptions);
216 if (!ret)
217 dev_cgroup->behavior = parent_dev_cgroup->behavior;
218 }
219 mutex_unlock(&devcgroup_mutex);
220
221 return ret;
222}
223
224static void devcgroup_offline(struct cgroup *cgroup)
225{
226 struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup);
227
228 mutex_lock(&devcgroup_mutex);
229 dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
230 mutex_unlock(&devcgroup_mutex);
231}
232
185/* 233/*
186 * called from kernel/cgroup.c with cgroup_lock() held. 234 * called from kernel/cgroup.c with cgroup_lock() held.
187 */ 235 */
188static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) 236static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
189{ 237{
190 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup; 238 struct dev_cgroup *dev_cgroup;
191 struct cgroup *parent_cgroup;
192 int ret;
193 239
194 dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); 240 dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
195 if (!dev_cgroup) 241 if (!dev_cgroup)
196 return ERR_PTR(-ENOMEM); 242 return ERR_PTR(-ENOMEM);
197 INIT_LIST_HEAD(&dev_cgroup->exceptions); 243 INIT_LIST_HEAD(&dev_cgroup->exceptions);
198 parent_cgroup = cgroup->parent; 244 INIT_LIST_HEAD(&dev_cgroup->propagate_pending);
199 245 dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
200 if (parent_cgroup == NULL)
201 dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
202 else {
203 parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
204 mutex_lock(&devcgroup_mutex);
205 ret = dev_exceptions_copy(&dev_cgroup->exceptions,
206 &parent_dev_cgroup->exceptions);
207 dev_cgroup->behavior = parent_dev_cgroup->behavior;
208 mutex_unlock(&devcgroup_mutex);
209 if (ret) {
210 kfree(dev_cgroup);
211 return ERR_PTR(ret);
212 }
213 }
214 246
215 return &dev_cgroup->css; 247 return &dev_cgroup->css;
216} 248}
@@ -304,9 +336,11 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
304 * verify if a certain access is allowed. 336 * verify if a certain access is allowed.
305 * @dev_cgroup: dev cgroup to be tested against 337 * @dev_cgroup: dev cgroup to be tested against
306 * @refex: new exception 338 * @refex: new exception
339 * @behavior: behavior of the exception
307 */ 340 */
308static int may_access(struct dev_cgroup *dev_cgroup, 341static bool may_access(struct dev_cgroup *dev_cgroup,
309 struct dev_exception_item *refex) 342 struct dev_exception_item *refex,
343 enum devcg_behavior behavior)
310{ 344{
311 struct dev_exception_item *ex; 345 struct dev_exception_item *ex;
312 bool match = false; 346 bool match = false;
@@ -330,18 +364,29 @@ static int may_access(struct dev_cgroup *dev_cgroup,
330 break; 364 break;
331 } 365 }
332 366
333 /* 367 if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
334 * In two cases we'll consider this new exception valid: 368 if (behavior == DEVCG_DEFAULT_ALLOW) {
335 * - the dev cgroup has its default policy to allow + exception list: 369 /* the exception will deny access to certain devices */
336 * the new exception should *not* match any of the exceptions 370 return true;
337 * (behavior == DEVCG_DEFAULT_ALLOW, !match) 371 } else {
338 * - the dev cgroup has its default policy to deny + exception list: 372 /* the exception will allow access to certain devices */
339 * the new exception *should* match the exceptions 373 if (match)
340 * (behavior == DEVCG_DEFAULT_DENY, match) 374 /*
341 */ 375 * a new exception allowing access shouldn't
342 if ((dev_cgroup->behavior == DEVCG_DEFAULT_DENY) == match) 376 * match an parent's exception
343 return 1; 377 */
344 return 0; 378 return false;
379 return true;
380 }
381 } else {
382 /* only behavior == DEVCG_DEFAULT_DENY allowed here */
383 if (match)
384 /* parent has an exception that matches the proposed */
385 return true;
386 else
387 return false;
388 }
389 return false;
345} 390}
346 391
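The rewritten may_access() keys off both the queried cgroup's default behavior and the behavior the exception is being checked under: if the queried cgroup defaults to allow, an exception checked under allow only restricts further and always passes, while one checked under deny grants access and must not match an existing (denying) exception; if the queried cgroup defaults to deny, the exception is acceptable only when it matches one of the cgroup's own (granting) exceptions. A condensed sketch of that decision as a pure function, with the enum and exception_ok() invented for illustration:

#include <stdio.h>
#include <stdbool.h>

enum behavior { DEFAULT_ALLOW, DEFAULT_DENY };

/*
 * parent_behavior: default policy of the cgroup being tested against
 * child_behavior:  default policy the exception is being added under
 * match:           whether the exception matches one of the parent's exceptions
 */
static bool exception_ok(enum behavior parent_behavior,
			 enum behavior child_behavior, bool match)
{
	if (parent_behavior == DEFAULT_ALLOW) {
		if (child_behavior == DEFAULT_ALLOW)
			return true;	/* exception only restricts further */
		return !match;		/* must not re-allow a device the parent denies */
	}
	return match;			/* deny parent: only re-grant what it grants */
}

int main(void)
{
	printf("%d\n", exception_ok(DEFAULT_ALLOW, DEFAULT_DENY, true));	/* 0 */
	printf("%d\n", exception_ok(DEFAULT_DENY, DEFAULT_DENY, true));		/* 1 */
	return 0;
}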
347/* 392/*
@@ -358,7 +403,7 @@ static int parent_has_perm(struct dev_cgroup *childcg,
358 if (!pcg) 403 if (!pcg)
359 return 1; 404 return 1;
360 parent = cgroup_to_devcgroup(pcg); 405 parent = cgroup_to_devcgroup(pcg);
361 return may_access(parent, ex); 406 return may_access(parent, ex, childcg->behavior);
362} 407}
363 408
364/** 409/**
@@ -374,6 +419,111 @@ static inline int may_allow_all(struct dev_cgroup *parent)
374 return parent->behavior == DEVCG_DEFAULT_ALLOW; 419 return parent->behavior == DEVCG_DEFAULT_ALLOW;
375} 420}
376 421
422/**
423 * revalidate_active_exceptions - walks through the active exception list and
424 * revalidates the exceptions based on parent's
425 * behavior and exceptions. The exceptions that
426 * are no longer valid will be removed.
427 * Called with devcgroup_mutex held.
428 * @devcg: cgroup which exceptions will be checked
429 *
430 * This is one of the three key functions for hierarchy implementation.
431 * This function is responsible for re-evaluating all the cgroup's active
432 * exceptions due to a parent's exception change.
433 * Refer to Documentation/cgroups/devices.txt for more details.
434 */
435static void revalidate_active_exceptions(struct dev_cgroup *devcg)
436{
437 struct dev_exception_item *ex;
438 struct list_head *this, *tmp;
439
440 list_for_each_safe(this, tmp, &devcg->exceptions) {
441 ex = container_of(this, struct dev_exception_item, list);
442 if (!parent_has_perm(devcg, ex))
443 dev_exception_rm(devcg, ex);
444 }
445}
446
447/**
448 * get_online_devcg - walks the cgroup tree and fills a list with the online
449 * groups
450 * @root: cgroup used as starting point
451 * @online: list that will be filled with online groups
452 *
453 * Must be called with devcgroup_mutex held. Grabs RCU lock.
454 * Because devcgroup_mutex is held, no devcg will become online or offline
455 * during the tree walk (see devcgroup_online, devcgroup_offline)
456 * A separate list is needed because propagate_behavior() and
457 * propagate_exception() need to allocate memory and can block.
458 */
459static void get_online_devcg(struct cgroup *root, struct list_head *online)
460{
461 struct cgroup *pos;
462 struct dev_cgroup *devcg;
463
464 lockdep_assert_held(&devcgroup_mutex);
465
466 rcu_read_lock();
467 cgroup_for_each_descendant_pre(pos, root) {
468 devcg = cgroup_to_devcgroup(pos);
469 if (is_devcg_online(devcg))
470 list_add_tail(&devcg->propagate_pending, online);
471 }
472 rcu_read_unlock();
473}
474
475/**
476 * propagate_exception - propagates a new exception to the children
477 * @devcg_root: device cgroup that added a new exception
478 * @ex: new exception to be propagated
479 *
480 * returns: 0 in case of success, != 0 in case of error
481 */
482static int propagate_exception(struct dev_cgroup *devcg_root,
483 struct dev_exception_item *ex)
484{
485 struct cgroup *root = devcg_root->css.cgroup;
486 struct dev_cgroup *devcg, *parent, *tmp;
487 int rc = 0;
488 LIST_HEAD(pending);
489
490 get_online_devcg(root, &pending);
491
492 list_for_each_entry_safe(devcg, tmp, &pending, propagate_pending) {
493 parent = cgroup_to_devcgroup(devcg->css.cgroup->parent);
494
495 /*
496 * in case both root's and devcg's behavior are allow, a new
497 * restriction means adding to the exception list
498 */
499 if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW &&
500 devcg->behavior == DEVCG_DEFAULT_ALLOW) {
501 rc = dev_exception_add(devcg, ex);
502 if (rc)
503 break;
504 } else {
505 /*
506 * in the other possible cases:
507 * root's behavior: allow, devcg's: deny
508 * root's behavior: deny, devcg's: deny
509 * the exception will be removed
510 */
511 dev_exception_rm(devcg, ex);
512 }
513 revalidate_active_exceptions(devcg);
514
515 list_del_init(&devcg->propagate_pending);
516 }
517 return rc;
518}
519
520static inline bool has_children(struct dev_cgroup *devcgroup)
521{
522 struct cgroup *cgrp = devcgroup->css.cgroup;
523
524 return !list_empty(&cgrp->children);
525}
526
377/* 527/*
378 * Modify the exception list using allow/deny rules. 528 * Modify the exception list using allow/deny rules.
379 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD 529 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD
@@ -392,7 +542,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
392{ 542{
393 const char *b; 543 const char *b;
394 char temp[12]; /* 11 + 1 characters needed for a u32 */ 544 char temp[12]; /* 11 + 1 characters needed for a u32 */
395 int count, rc; 545 int count, rc = 0;
396 struct dev_exception_item ex; 546 struct dev_exception_item ex;
397 struct cgroup *p = devcgroup->css.cgroup; 547 struct cgroup *p = devcgroup->css.cgroup;
398 struct dev_cgroup *parent = NULL; 548 struct dev_cgroup *parent = NULL;
@@ -410,6 +560,9 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
410 case 'a': 560 case 'a':
411 switch (filetype) { 561 switch (filetype) {
412 case DEVCG_ALLOW: 562 case DEVCG_ALLOW:
563 if (has_children(devcgroup))
564 return -EINVAL;
565
413 if (!may_allow_all(parent)) 566 if (!may_allow_all(parent))
414 return -EPERM; 567 return -EPERM;
415 dev_exception_clean(devcgroup); 568 dev_exception_clean(devcgroup);
@@ -423,6 +576,9 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
423 return rc; 576 return rc;
424 break; 577 break;
425 case DEVCG_DENY: 578 case DEVCG_DENY:
579 if (has_children(devcgroup))
580 return -EINVAL;
581
426 dev_exception_clean(devcgroup); 582 dev_exception_clean(devcgroup);
427 devcgroup->behavior = DEVCG_DEFAULT_DENY; 583 devcgroup->behavior = DEVCG_DEFAULT_DENY;
428 break; 584 break;
@@ -517,22 +673,28 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
517 dev_exception_rm(devcgroup, &ex); 673 dev_exception_rm(devcgroup, &ex);
518 return 0; 674 return 0;
519 } 675 }
520 return dev_exception_add(devcgroup, &ex); 676 rc = dev_exception_add(devcgroup, &ex);
677 break;
521 case DEVCG_DENY: 678 case DEVCG_DENY:
522 /* 679 /*
523 * If the default policy is to deny by default, try to remove 680 * If the default policy is to deny by default, try to remove
524 * a matching exception instead. And be silent about it: we 681 * a matching exception instead. And be silent about it: we
525 * don't want to break compatibility 682 * don't want to break compatibility
526 */ 683 */
527 if (devcgroup->behavior == DEVCG_DEFAULT_DENY) { 684 if (devcgroup->behavior == DEVCG_DEFAULT_DENY)
528 dev_exception_rm(devcgroup, &ex); 685 dev_exception_rm(devcgroup, &ex);
529 return 0; 686 else
530 } 687 rc = dev_exception_add(devcgroup, &ex);
531 return dev_exception_add(devcgroup, &ex); 688
689 if (rc)
690 break;
691 /* we only propagate new restrictions */
692 rc = propagate_exception(devcgroup, &ex);
693 break;
532 default: 694 default:
533 return -EINVAL; 695 rc = -EINVAL;
534 } 696 }
535 return 0; 697 return rc;
536} 698}
537 699
538static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft, 700static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft,
@@ -571,17 +733,10 @@ struct cgroup_subsys devices_subsys = {
571 .can_attach = devcgroup_can_attach, 733 .can_attach = devcgroup_can_attach,
572 .css_alloc = devcgroup_css_alloc, 734 .css_alloc = devcgroup_css_alloc,
573 .css_free = devcgroup_css_free, 735 .css_free = devcgroup_css_free,
736 .css_online = devcgroup_online,
737 .css_offline = devcgroup_offline,
574 .subsys_id = devices_subsys_id, 738 .subsys_id = devices_subsys_id,
575 .base_cftypes = dev_cgroup_files, 739 .base_cftypes = dev_cgroup_files,
576
577 /*
578 * While devices cgroup has the rudimentary hierarchy support which
579 * checks the parent's restriction, it doesn't properly propagates
580 * config changes in ancestors to their descendents. A child
581 * should only be allowed to add more restrictions to the parent's
582 * configuration. Fix it and remove the following.
583 */
584 .broken_hierarchy = true,
585}; 740};
586 741
587/** 742/**
@@ -609,7 +764,7 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor,
609 764
610 rcu_read_lock(); 765 rcu_read_lock();
611 dev_cgroup = task_devcgroup(current); 766 dev_cgroup = task_devcgroup(current);
612 rc = may_access(dev_cgroup, &ex); 767 rc = may_access(dev_cgroup, &ex, dev_cgroup->behavior);
613 rcu_read_unlock(); 768 rcu_read_unlock();
614 769
615 if (!rc) 770 if (!rc)