author     Linus Torvalds <torvalds@linux-foundation.org>  2015-06-26 22:50:04 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-06-26 22:50:04 -0400
commit     bbe179f88d39274630823a0dc07d2714fd19a103 (patch)
tree       f70181a660e0f859f230233643faded7d44360e5
parent     4b703b1d4c46ca4a00109ca1a391943ec21991b3 (diff)
parent     8a0792ef8e01f03cb43806c6a87738bde34df713 (diff)

Merge branch 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - threadgroup_lock got reorganized so that its users can pick the
   actual locking mechanism to use.  Its only user - cgroups - is
   updated to use a percpu_rwsem instead of per-process rwsem.

   This makes things a bit lighter on hot paths and allows cgroups to
   perform and fail multi-task (a process) migrations atomically.
   Multi-task migrations are used in several places including the
   unified hierarchy.

 - Delegation rule and documentation added to unified hierarchy.  This
   will likely be the last interface update from the cgroup core side
   for unified hierarchy before lifting the devel mask.

 - Some groundwork for the pids controller which is scheduled to be
   merged in the coming devel cycle.

* 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: add delegation section to unified hierarchy documentation
  cgroup: require write perm on common ancestor when moving processes on the default hierarchy
  cgroup: separate out cgroup_procs_write_permission() from __cgroup_procs_write()
  kernfs: make kernfs_get_inode() public
  MAINTAINERS: add a cgroup core co-maintainer
  cgroup: fix uninitialised iterator in for_each_subsys_which
  cgroup: replace explicit ss_mask checking with for_each_subsys_which
  cgroup: use bitmask to filter for_each_subsys
  cgroup: add seq_file forward declaration for struct cftype
  cgroup: simplify threadgroup locking
  sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem
  sched, cgroup: reorganize threadgroup locking
  cgroup: switch to unsigned long for bitmasks
  cgroup: reorganize include/linux/cgroup.h
  cgroup: separate out include/linux/cgroup-defs.h
  cgroup: fix some comment typos
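The first bullet is the classic read-mostly split: every thread creation, exec and exit takes the cheap per-cpu read side, while the rare multi-task migration takes the write side and thereby excludes all threadgroup changes at once, so a whole process moves (or fails to move) atomically.  A minimal sketch of the pairing, using hypothetical example_* names (the real reader lives in cgroup-defs.h below; percpu_init_rwsem() setup is omitted):

#include <linux/percpu-rwsem.h>
#include <linux/sched.h>

static struct percpu_rw_semaphore example_threadgroup_rwsem;

/* hot path: wraps every threadgroup change (fork, exec, exit) */
static inline void example_threadgroup_change_begin(struct task_struct *tsk)
{
	percpu_down_read(&example_threadgroup_rwsem);	/* cheap, per-cpu */
}

static inline void example_threadgroup_change_end(struct task_struct *tsk)
{
	percpu_up_read(&example_threadgroup_rwsem);
}

/* cold path: migration sees a stable threadgroup for its whole duration */
static void example_migrate_process(void)
{
	percpu_down_write(&example_threadgroup_rwsem);
	/* ... move every task of the process, or back out and fail ... */
	percpu_up_write(&example_threadgroup_rwsem);
}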
-rw-r--r--  Documentation/cgroups/unified-hierarchy.txt |  102
-rw-r--r--  MAINTAINERS                                 |    1
-rw-r--r--  fs/kernfs/kernfs-internal.h                 |    1
-rw-r--r--  include/linux/cgroup-defs.h                 |  501
-rw-r--r--  include/linux/cgroup.h                      | 1007
-rw-r--r--  include/linux/init_task.h                   |    8
-rw-r--r--  include/linux/kernfs.h                      |    5
-rw-r--r--  include/linux/sched.h                       |   65
-rw-r--r--  init/Kconfig                                |    1
-rw-r--r--  kernel/cgroup.c                             |  273
-rw-r--r--  kernel/fork.c                               |    4
11 files changed, 1028 insertions, 940 deletions
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index eb102fb72213..86847a7647ab 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -17,15 +17,18 @@ CONTENTS
 3. Structural Constraints
   3-1. Top-down
   3-2. No internal tasks
-4. Other Changes
-  4-1. [Un]populated Notification
-  4-2. Other Core Changes
-  4-3. Per-Controller Changes
-    4-3-1. blkio
-    4-3-2. cpuset
-    4-3-3. memory
-5. Planned Changes
-  5-1. CAP for resource control
+4. Delegation
+  4-1. Model of delegation
+  4-2. Common ancestor rule
+5. Other Changes
+  5-1. [Un]populated Notification
+  5-2. Other Core Changes
+  5-3. Per-Controller Changes
+    5-3-1. blkio
+    5-3-2. cpuset
+    5-3-3. memory
+6. Planned Changes
+  6-1. CAP for resource control
 
 
 1. Background
@@ -245,9 +248,72 @@ cgroup must create children and transfer all its tasks to the children
 before enabling controllers in its "cgroup.subtree_control" file.
 
 
-4. Other Changes
+4. Delegation
 
-4-1. [Un]populated Notification
+4-1. Model of delegation
+
+A cgroup can be delegated to a less privileged user by granting write
+access of the directory and its "cgroup.procs" file to the user.  Note
+that the resource control knobs in a given directory concern the
+resources of the parent and thus must not be delegated along with the
+directory.
+
+Once delegated, the user can build sub-hierarchy under the directory,
+organize processes as it sees fit and further distribute the resources
+it got from the parent.  The limits and other settings of all resource
+controllers are hierarchical and regardless of what happens in the
+delegated sub-hierarchy, nothing can escape the resource restrictions
+imposed by the parent.
+
+Currently, cgroup doesn't impose any restrictions on the number of
+cgroups in or nesting depth of a delegated sub-hierarchy; however,
+this may in the future be limited explicitly.
+
+
+4-2. Common ancestor rule
+
+On the unified hierarchy, to write to a "cgroup.procs" file, in
+addition to the usual write permission to the file and uid match, the
+writer must also have write access to the "cgroup.procs" file of the
+common ancestor of the source and destination cgroups.  This prevents
+delegatees from smuggling processes across disjoint sub-hierarchies.
+
+Let's say cgroups C0 and C1 have been delegated to user U0 who created
+C00, C01 under C0 and C10 under C1 as follows.
+
+ ~~~~~~~~~~~~~ - C0 - C00
+ ~ cgroup    ~      \ C01
+ ~ hierarchy ~
+ ~~~~~~~~~~~~~ - C1 - C10
+
+C0 and C1 are separate entities in terms of resource distribution
+regardless of their relative positions in the hierarchy.  The
+resources the processes under C0 are entitled to are controlled by
+C0's ancestors and may be completely different from C1.  It's clear
+that the intention of delegating C0 to U0 is allowing U0 to organize
+the processes under C0 and further control the distribution of C0's
+resources.
+
+On traditional hierarchies, if a task has write access to "tasks" or
+"cgroup.procs" file of a cgroup and its uid agrees with the target, it
+can move the target to the cgroup.  In the above example, U0 will not
+only be able to move processes in each sub-hierarchy but also across
+the two sub-hierarchies, effectively allowing it to violate the
+organizational and resource restrictions implied by the hierarchical
+structure above C0 and C1.
+
+On the unified hierarchy, let's say U0 wants to write the pid of a
+process which has a matching uid and is currently in C10 into
+"C00/cgroup.procs".  U0 obviously has write access to the file and
+migration permission on the process; however, the common ancestor of
+the source cgroup C10 and the destination cgroup C00 is above the
+points of delegation and U0 would not have write access to its
+"cgroup.procs" and thus be denied with -EACCES.
+
+
+5. Other Changes
+
+5-1. [Un]populated Notification
 
 cgroup users often need a way to determine when a cgroup's
 subhierarchy becomes empty so that it can be cleaned up.  cgroup
@@ -289,7 +355,7 @@ supported and the interface files "release_agent" and
 "notify_on_release" do not exist.
 
 
-4-2. Other Core Changes
+5-2. Other Core Changes
 
 - None of the mount options is allowed.
 
@@ -306,14 +372,14 @@ supported and the interface files "release_agent" and
 - The "cgroup.clone_children" file is removed.
 
 
-4-3. Per-Controller Changes
+5-3. Per-Controller Changes
 
-4-3-1. blkio
+5-3-1. blkio
 
 - blk-throttle becomes properly hierarchical.
 
 
-4-3-2. cpuset
+5-3-2. cpuset
 
 - Tasks are kept in empty cpusets after hotplug and take on the masks
   of the nearest non-empty ancestor, instead of being moved to it.
@@ -322,7 +388,7 @@ supported and the interface files "release_agent" and
   masks of the nearest non-empty ancestor.
 
 
-4-3-3. memory
+5-3-3. memory
 
 - use_hierarchy is on by default and the cgroup file for the flag is
   not created.
@@ -407,9 +473,9 @@ supported and the interface files "release_agent" and
  memory.low, memory.high, and memory.max will use the string "max" to
  indicate and set the highest possible value.
 
-5. Planned Changes
+6. Planned Changes
 
-5-1. CAP for resource control
+6-1. CAP for resource control
 
 Unified hierarchy will require one of the capabilities(7), which is
 yet to be decided, for all resource control related knobs.  Process
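The delegation sections above translate directly into userspace operations: the admin chowns the delegated directory and its "cgroup.procs", after which the delegatee can reorganize freely inside its sub-hierarchy but cannot cross it.  A hedged sketch (the mount point, uid and cgroup names are made up for illustration):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define CGRP_ROOT "/sys/fs/cgroup"	/* assumed unified hierarchy mount */

/* admin side: delegate @dir (e.g. CGRP_ROOT "/C0") to @uid */
static int delegate(const char *dir, uid_t uid)
{
	char procs[256];

	snprintf(procs, sizeof(procs), "%s/cgroup.procs", dir);
	if (chown(dir, uid, (gid_t)-1) || chown(procs, uid, (gid_t)-1))
		return -1;
	return 0;
}

/* delegatee side: write @pid into @cgrp's "cgroup.procs" */
static int migrate(const char *cgrp, pid_t pid)
{
	char path[256], buf[32];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "%s/%s/cgroup.procs", CGRP_ROOT, cgrp);
	snprintf(buf, sizeof(buf), "%d", (int)pid);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, buf, strlen(buf)) < 0)
		ret = -1;
	close(fd);
	return ret;
}

With C0 and C1 delegated as in the example, migrate("C0/C00", pid) succeeds for a pid already under C0, while the same call for a pid sitting in C1/C10 fails and errno is EACCES: the common ancestor of C10 and C00 lies above the points of delegation, so its "cgroup.procs" is not writable by U0.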
diff --git a/MAINTAINERS b/MAINTAINERS
index 68457d869b61..c54a67434048 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2816,6 +2816,7 @@ F:	drivers/connector/
 CONTROL GROUP (CGROUP)
 M:	Tejun Heo <tj@kernel.org>
 M:	Li Zefan <lizefan@huawei.com>
+M:	Johannes Weiner <hannes@cmpxchg.org>
 L:	cgroups@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:	Maintained
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index af9fa7499919..6762bfbd8207 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -76,7 +76,6 @@ extern struct kmem_cache *kernfs_node_cache;
 /*
  * inode.c
  */
-struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
 void kernfs_evict_inode(struct inode *inode);
 int kernfs_iop_permission(struct inode *inode, int mask);
 int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
new file mode 100644
index 000000000000..93755a629299
--- /dev/null
+++ b/include/linux/cgroup-defs.h
@@ -0,0 +1,501 @@
+/*
+ * linux/cgroup-defs.h - basic definitions for cgroup
+ *
+ * This file provides basic types and interfaces.  Include this file directly
+ * only if necessary to avoid cyclic dependencies.
+ */
+#ifndef _LINUX_CGROUP_DEFS_H
+#define _LINUX_CGROUP_DEFS_H
+
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/idr.h>
+#include <linux/wait.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/percpu-refcount.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/workqueue.h>
+
+#ifdef CONFIG_CGROUPS
+
+struct cgroup;
+struct cgroup_root;
+struct cgroup_subsys;
+struct cgroup_taskset;
+struct kernfs_node;
+struct kernfs_ops;
+struct kernfs_open_file;
+struct seq_file;
+
+#define MAX_CGROUP_TYPE_NAMELEN 32
+#define MAX_CGROUP_ROOT_NAMELEN 64
+#define MAX_CFTYPE_NAME		64
+
+/* define the enumeration of all cgroup subsystems */
+#define SUBSYS(_x) _x ## _cgrp_id,
+enum cgroup_subsys_id {
+#include <linux/cgroup_subsys.h>
+	CGROUP_SUBSYS_COUNT,
+};
+#undef SUBSYS
+
+/* bits in struct cgroup_subsys_state flags field */
+enum {
+	CSS_NO_REF	= (1 << 0), /* no reference counting for this css */
+	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
+	CSS_RELEASED	= (1 << 2), /* refcnt reached zero, released */
+};
+
+/* bits in struct cgroup flags field */
+enum {
+	/* Control Group requires release notifications to userspace */
+	CGRP_NOTIFY_ON_RELEASE,
+	/*
+	 * Clone the parent's configuration when creating a new child
+	 * cpuset cgroup.  For historical reasons, this option can be
+	 * specified at mount time and thus is implemented here.
+	 */
+	CGRP_CPUSET_CLONE_CHILDREN,
+};
+
+/* cgroup_root->flags */
+enum {
+	CGRP_ROOT_SANE_BEHAVIOR	= (1 << 0), /* __DEVEL__sane_behavior specified */
+	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
+	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+};
+
+/* cftype->flags */
+enum {
+	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
+	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
+	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
+
+	/* internal flags, do not use outside cgroup core proper */
+	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */
+	__CFTYPE_NOT_ON_DFL	= (1 << 17),	/* not on default hierarchy */
+};
+
+/*
+ * Per-subsystem/per-cgroup state maintained by the system.  This is the
+ * fundamental structural building block that controllers deal with.
+ *
+ * Fields marked with "PI:" are public and immutable and may be accessed
+ * directly without synchronization.
+ */
+struct cgroup_subsys_state {
+	/* PI: the cgroup that this css is attached to */
+	struct cgroup *cgroup;
+
+	/* PI: the cgroup subsystem that this css is attached to */
+	struct cgroup_subsys *ss;
+
+	/* reference count - access via css_[try]get() and css_put() */
+	struct percpu_ref refcnt;
+
+	/* PI: the parent css */
+	struct cgroup_subsys_state *parent;
+
+	/* siblings list anchored at the parent's ->children */
+	struct list_head sibling;
+	struct list_head children;
+
+	/*
+	 * PI: Subsys-unique ID.  0 is unused and root is always 1.  The
+	 * matching css can be looked up using css_from_id().
+	 */
+	int id;
+
+	unsigned int flags;
+
+	/*
+	 * Monotonically increasing unique serial number which defines a
+	 * uniform order among all csses.  It's guaranteed that all
+	 * ->children lists are in the ascending order of ->serial_nr and
+	 * used to allow interrupting and resuming iterations.
+	 */
+	u64 serial_nr;
+
+	/* percpu_ref killing and RCU release */
+	struct rcu_head rcu_head;
+	struct work_struct destroy_work;
+};
+
+/*
+ * A css_set is a structure holding pointers to a set of
+ * cgroup_subsys_state objects.  This saves space in the task struct
+ * object and speeds up fork()/exit(), since a single inc/dec and a
+ * list_add()/del() can bump the reference count on the entire cgroup
+ * set for a task.
+ */
+struct css_set {
+	/* Reference count */
+	atomic_t refcount;
+
+	/*
+	 * List running through all cgroup groups in the same hash
+	 * slot.  Protected by css_set_lock
+	 */
+	struct hlist_node hlist;
+
+	/*
+	 * Lists running through all tasks using this cgroup group.
+	 * mg_tasks lists tasks which belong to this cset but are in the
+	 * process of being migrated out or in.  Protected by
+	 * css_set_rwsem, but, during migration, once tasks are moved to
+	 * mg_tasks, it can be read safely while holding cgroup_mutex.
+	 */
+	struct list_head tasks;
+	struct list_head mg_tasks;
+
+	/*
+	 * List of cgrp_cset_links pointing at cgroups referenced from this
+	 * css_set.  Protected by css_set_lock.
+	 */
+	struct list_head cgrp_links;
+
+	/* the default cgroup associated with this css_set */
+	struct cgroup *dfl_cgrp;
+
+	/*
+	 * Set of subsystem states, one for each subsystem.  This array is
+	 * immutable after creation apart from the init_css_set during
+	 * subsystem registration (at boot time).
+	 */
+	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+	/*
+	 * List of csets participating in the on-going migration either as
+	 * source or destination.  Protected by cgroup_mutex.
+	 */
+	struct list_head mg_preload_node;
+	struct list_head mg_node;
+
+	/*
+	 * If this cset is acting as the source of migration the following
+	 * two fields are set.  mg_src_cgrp is the source cgroup of the
+	 * on-going migration and mg_dst_cset is the destination cset the
+	 * target tasks on this cset should be migrated to.  Protected by
+	 * cgroup_mutex.
+	 */
+	struct cgroup *mg_src_cgrp;
+	struct css_set *mg_dst_cset;
+
+	/*
+	 * On the default hierarchy, ->subsys[ssid] may point to a css
+	 * attached to an ancestor instead of the cgroup this css_set is
+	 * associated with.  The following node is anchored at
+	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+	 * iterate through all css's attached to a given cgroup.
+	 */
+	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
+
+	/* For RCU-protected deletion */
+	struct rcu_head rcu_head;
+};
+
+struct cgroup {
+	/* self css with NULL ->ss, points back to this cgroup */
+	struct cgroup_subsys_state self;
+
+	unsigned long flags;		/* "unsigned long" so bitops work */
+
+	/*
+	 * idr allocated in-hierarchy ID.
+	 *
+	 * ID 0 is not used, the ID of the root cgroup is always 1, and a
+	 * new cgroup will be assigned with a smallest available ID.
+	 *
+	 * Allocating/Removing ID must be protected by cgroup_mutex.
+	 */
+	int id;
+
+	/*
+	 * If this cgroup contains any tasks, it contributes one to
+	 * populated_cnt.  All children with non-zero populated_cnt of
+	 * their own contribute one.  The count is zero iff there's no task
+	 * in this cgroup or its subtree.
+	 */
+	int populated_cnt;
+
+	struct kernfs_node *kn;		/* cgroup kernfs entry */
+	struct kernfs_node *procs_kn;	/* kn for "cgroup.procs" */
+	struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
+
+	/*
+	 * The bitmask of subsystems enabled on the child cgroups.
+	 * ->subtree_control is the one configured through
+	 * "cgroup.subtree_control" while ->child_subsys_mask is the
+	 * effective one which may have more subsystems enabled.
+	 * Controller knobs are made available iff it's enabled in
+	 * ->subtree_control.
+	 */
+	unsigned int subtree_control;
+	unsigned int child_subsys_mask;
+
+	/* Private pointers for each registered subsystem */
+	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
+
+	struct cgroup_root *root;
+
+	/*
+	 * List of cgrp_cset_links pointing at css_sets with tasks in this
+	 * cgroup.  Protected by css_set_lock.
+	 */
+	struct list_head cset_links;
+
+	/*
+	 * On the default hierarchy, a css_set for a cgroup with some
+	 * subsys disabled will point to css's which are associated with
+	 * the closest ancestor which has the subsys enabled.  The
+	 * following lists all css_sets which point to this cgroup's css
+	 * for the given subsystem.
+	 */
+	struct list_head e_csets[CGROUP_SUBSYS_COUNT];
+
+	/*
+	 * list of pidlists, up to two for each namespace (one for procs, one
+	 * for tasks); created on demand.
+	 */
+	struct list_head pidlists;
+	struct mutex pidlist_mutex;
+
+	/* used to wait for offlining of csses */
+	wait_queue_head_t offline_waitq;
+
+	/* used to schedule release agent */
+	struct work_struct release_agent_work;
+};
+
+/*
+ * A cgroup_root represents the root of a cgroup hierarchy, and may be
+ * associated with a kernfs_root to form an active hierarchy.  This is
+ * internal to cgroup core.  Don't access directly from controllers.
+ */
+struct cgroup_root {
+	struct kernfs_root *kf_root;
+
+	/* The bitmask of subsystems attached to this hierarchy */
+	unsigned int subsys_mask;
+
+	/* Unique id for this hierarchy. */
+	int hierarchy_id;
+
+	/* The root cgroup.  Root is destroyed on its release. */
+	struct cgroup cgrp;
+
+	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
+	atomic_t nr_cgrps;
+
+	/* A list running through the active hierarchies */
+	struct list_head root_list;
+
+	/* Hierarchy-specific flags */
+	unsigned int flags;
+
+	/* IDs for cgroups in this hierarchy */
+	struct idr cgroup_idr;
+
+	/* The path to use for release notifications. */
+	char release_agent_path[PATH_MAX];
+
+	/* The name for this hierarchy - may be empty */
+	char name[MAX_CGROUP_ROOT_NAMELEN];
+};
+
+/*
+ * struct cftype: handler definitions for cgroup control files
+ *
+ * When reading/writing to a file:
+ *	- the cgroup to use is file->f_path.dentry->d_parent->d_fsdata
+ *	- the 'cftype' of the file is file->f_path.dentry->d_fsdata
+ */
+struct cftype {
+	/*
+	 * By convention, the name should begin with the name of the
+	 * subsystem, followed by a period.  Zero length string indicates
+	 * end of cftype array.
+	 */
+	char name[MAX_CFTYPE_NAME];
+	int private;
+	/*
+	 * If not 0, file mode is set to this value, otherwise it will
+	 * be figured out automatically
+	 */
+	umode_t mode;
+
+	/*
+	 * The maximum length of string, excluding trailing nul, that can
+	 * be passed to write.  If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
+	 */
+	size_t max_write_len;
+
+	/* CFTYPE_* flags */
+	unsigned int flags;
+
+	/*
+	 * Fields used for internal bookkeeping.  Initialized automatically
+	 * during registration.
+	 */
+	struct cgroup_subsys *ss;	/* NULL for cgroup core files */
+	struct list_head node;		/* anchored at ss->cfts */
+	struct kernfs_ops *kf_ops;
+
+	/*
+	 * read_u64() is a shortcut for the common case of returning a
+	 * single integer.  Use it in place of read()
+	 */
+	u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
+	/*
+	 * read_s64() is a signed version of read_u64()
+	 */
+	s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
+
+	/* generic seq_file read interface */
+	int (*seq_show)(struct seq_file *sf, void *v);
+
+	/* optional ops, implement all or none */
+	void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
+	void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
+	void (*seq_stop)(struct seq_file *sf, void *v);
+
+	/*
+	 * write_u64() is a shortcut for the common case of accepting
+	 * a single integer (as parsed by simple_strtoull) from
+	 * userspace.  Use in place of write(); return 0 or error.
+	 */
+	int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
+			 u64 val);
+	/*
+	 * write_s64() is a signed version of write_u64()
+	 */
+	int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
+			 s64 val);
+
+	/*
+	 * write() is the generic write callback which maps directly to
+	 * kernfs write operation and overrides all other operations.
+	 * Maximum write size is determined by ->max_write_len.  Use
+	 * of_css/cft() to access the associated css and cft.
+	 */
+	ssize_t (*write)(struct kernfs_open_file *of,
+			 char *buf, size_t nbytes, loff_t off);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key lockdep_key;
+#endif
+};
+
+/*
+ * Control Group subsystem type.
+ * See Documentation/cgroups/cgroups.txt for details
+ */
+struct cgroup_subsys {
+	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
+	int (*css_online)(struct cgroup_subsys_state *css);
+	void (*css_offline)(struct cgroup_subsys_state *css);
+	void (*css_released)(struct cgroup_subsys_state *css);
+	void (*css_free)(struct cgroup_subsys_state *css);
+	void (*css_reset)(struct cgroup_subsys_state *css);
+	void (*css_e_css_changed)(struct cgroup_subsys_state *css);
+
+	int (*can_attach)(struct cgroup_subsys_state *css,
+			  struct cgroup_taskset *tset);
+	void (*cancel_attach)(struct cgroup_subsys_state *css,
+			      struct cgroup_taskset *tset);
+	void (*attach)(struct cgroup_subsys_state *css,
+		       struct cgroup_taskset *tset);
+	void (*fork)(struct task_struct *task);
+	void (*exit)(struct cgroup_subsys_state *css,
+		     struct cgroup_subsys_state *old_css,
+		     struct task_struct *task);
+	void (*bind)(struct cgroup_subsys_state *root_css);
+
+	int disabled;
+	int early_init;
+
+	/*
+	 * If %false, this subsystem is properly hierarchical -
+	 * configuration, resource accounting and restriction on a parent
+	 * cgroup cover those of its children.  If %true, hierarchy support
+	 * is broken in some ways - some subsystems ignore hierarchy
+	 * completely while others are only implemented half-way.
+	 *
+	 * It's now disallowed to create nested cgroups if the subsystem is
+	 * broken and cgroup core will emit a warning message on such
+	 * cases.  Eventually, all subsystems will be made properly
+	 * hierarchical and this will go away.
+	 */
+	bool broken_hierarchy;
+	bool warned_broken_hierarchy;
+
+	/* the following two fields are initialized automatically during boot */
+	int id;
+	const char *name;
+
+	/* link to parent, protected by cgroup_lock() */
+	struct cgroup_root *root;
+
+	/* idr for css->id */
+	struct idr css_idr;
+
+	/*
+	 * List of cftypes.  Each entry is the first entry of an array
+	 * terminated by zero length name.
+	 */
+	struct list_head cfts;
+
+	/*
+	 * Base cftypes which are automatically registered.  The two can
+	 * point to the same array.
+	 */
+	struct cftype *dfl_cftypes;	/* for the default hierarchy */
+	struct cftype *legacy_cftypes;	/* for the legacy hierarchies */
+
+	/*
+	 * A subsystem may depend on other subsystems.  When such subsystem
+	 * is enabled on a cgroup, the depended-upon subsystems are enabled
+	 * together if available.  Subsystems enabled due to dependency are
+	 * not visible to userland until explicitly enabled.  The following
+	 * specifies the mask of subsystems that this one depends on.
+	 */
+	unsigned int depends_on;
+};
+
+extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
+/**
+ * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_begin() and allows cgroup operations to
+ * synchronize against threadgroup changes using a percpu_rw_semaphore.
+ */
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+	percpu_down_read(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_end().  Counterpart of
+ * cgroup_threadgroup_change_begin().
+ */
+static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+	percpu_up_read(&cgroup_threadgroup_rwsem);
+}
+
+#else	/* CONFIG_CGROUPS */
+
+#define CGROUP_SUBSYS_COUNT 0
+
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
+static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
+
+#endif	/* CONFIG_CGROUPS */
+
+#endif	/* _LINUX_CGROUP_DEFS_H */
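The SUBSYS() x-macro near the top of this header is expanded twice against the same list: once here to build enum cgroup_subsys_id and once in cgroup.h below to emit the extern cgroup_subsys declarations.  A standalone miniature of the trick, with a hypothetical three-entry list macro standing in for <linux/cgroup_subsys.h>:

#include <stdio.h>

/* stand-in for linux/cgroup_subsys.h: one SUBSYS() entry per controller */
#define SUBSYS_LIST	\
	SUBSYS(cpu)	\
	SUBSYS(memory)	\
	SUBSYS(pids)

/* first expansion: the id enum, mirroring enum cgroup_subsys_id */
#define SUBSYS(_x) _x ## _cgrp_id,
enum cgroup_subsys_id { SUBSYS_LIST CGROUP_SUBSYS_COUNT };
#undef SUBSYS

/* second expansion of the same list: a parallel name table */
#define SUBSYS(_x) #_x,
static const char *subsys_name[] = { SUBSYS_LIST };
#undef SUBSYS

int main(void)
{
	for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++)
		printf("%d: %s\n", i, subsys_name[i]);
	return 0;
}

Because both expansions consume the identical list, the enum indices and the name table can never drift apart; the kernel gets the same guarantee by re-including cgroup_subsys.h under different SUBSYS() definitions.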
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e7da0aa65b2d..a593e299162e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -11,94 +11,200 @@
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
-#include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/cgroupstats.h>
 #include <linux/rwsem.h>
-#include <linux/idr.h>
-#include <linux/workqueue.h>
 #include <linux/fs.h>
-#include <linux/percpu-refcount.h>
 #include <linux/seq_file.h>
 #include <linux/kernfs.h>
-#include <linux/wait.h>
+
+#include <linux/cgroup-defs.h>
 
 #ifdef CONFIG_CGROUPS
 
-struct cgroup_root;
-struct cgroup_subsys;
-struct cgroup;
+/* a css_task_iter should be treated as an opaque object */
+struct css_task_iter {
+	struct cgroup_subsys		*ss;
 
-extern int cgroup_init_early(void);
-extern int cgroup_init(void);
-extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
-extern void cgroup_exit(struct task_struct *p);
-extern int cgroupstats_build(struct cgroupstats *stats,
-				struct dentry *dentry);
+	struct list_head		*cset_pos;
+	struct list_head		*cset_head;
+
+	struct list_head		*task_pos;
+	struct list_head		*tasks_head;
+	struct list_head		*mg_tasks_head;
+};
 
-extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
-			    struct pid *pid, struct task_struct *tsk);
+extern struct cgroup_root cgrp_dfl_root;
+extern struct css_set init_css_set;
 
-/* define the enumeration of all cgroup subsystems */
-#define SUBSYS(_x) _x ## _cgrp_id,
-enum cgroup_subsys_id {
+#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
 #include <linux/cgroup_subsys.h>
-	CGROUP_SUBSYS_COUNT,
-};
 #undef SUBSYS
 
+bool css_has_online_children(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
+					     struct cgroup_subsys *ss);
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+						       struct cgroup_subsys *ss);
+
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
+
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_rm_cftypes(struct cftype *cfts);
+
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+		     struct pid *pid, struct task_struct *tsk);
+
+void cgroup_fork(struct task_struct *p);
+void cgroup_post_fork(struct task_struct *p);
+void cgroup_exit(struct task_struct *p);
+
+int cgroup_init_early(void);
+int cgroup_init(void);
+
 /*
- * Per-subsystem/per-cgroup state maintained by the system. This is the
- * fundamental structural building block that controllers deal with.
+ * Iteration helpers and macros.
+ */
+
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+					   struct cgroup_subsys_state *parent);
+struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
+						    struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
+struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
+						     struct cgroup_subsys_state *css);
+
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
+
+void css_task_iter_start(struct cgroup_subsys_state *css,
+			 struct css_task_iter *it);
+struct task_struct *css_task_iter_next(struct css_task_iter *it);
+void css_task_iter_end(struct css_task_iter *it);
+
+/**
+ * css_for_each_child - iterate through children of a css
+ * @pos: the css * to use as the loop cursor
+ * @parent: css whose children to walk
  *
- * Fields marked with "PI:" are public and immutable and may be accessed
- * directly without synchronization.
+ * Walk @parent's children.  Must be called under rcu_read_lock().
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal.  It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
  */
-struct cgroup_subsys_state {
-	/* PI: the cgroup that this css is attached to */
-	struct cgroup *cgroup;
-
-	/* PI: the cgroup subsystem that this css is attached to */
-	struct cgroup_subsys *ss;
-
-	/* reference count - access via css_[try]get() and css_put() */
-	struct percpu_ref refcnt;
-
-	/* PI: the parent css */
-	struct cgroup_subsys_state *parent;
-
-	/* siblings list anchored at the parent's ->children */
-	struct list_head sibling;
-	struct list_head children;
-
-	/*
-	 * PI: Subsys-unique ID.  0 is unused and root is always 1.  The
-	 * matching css can be looked up using css_from_id().
-	 */
-	int id;
-
-	unsigned int flags;
-
-	/*
-	 * Monotonically increasing unique serial number which defines a
-	 * uniform order among all csses.  It's guaranteed that all
-	 * ->children lists are in the ascending order of ->serial_nr and
-	 * used to allow interrupting and resuming iterations.
-	 */
-	u64 serial_nr;
-
-	/* percpu_ref killing and RCU release */
-	struct rcu_head rcu_head;
-	struct work_struct destroy_work;
-};
+#define css_for_each_child(pos, parent)					\
+	for ((pos) = css_next_child(NULL, (parent)); (pos);		\
+	     (pos) = css_next_child((pos), (parent)))
 
-/* bits in struct cgroup_subsys_state flags field */
-enum {
-	CSS_NO_REF	= (1 << 0), /* no reference counting for this css */
-	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
-	CSS_RELEASED	= (1 << 2), /* refcnt reached zero, released */
-};
+/**
+ * css_for_each_descendant_pre - pre-order walk of a css's descendants
+ * @pos: the css * to use as the loop cursor
+ * @root: css whose descendants to walk
+ *
+ * Walk @root's descendants.  @root is included in the iteration and the
+ * first node to be visited.  Must be called under rcu_read_lock().
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal.  It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ *
+ * For example, the following guarantees that a descendant can't escape
+ * state updates of its ancestors.
+ *
+ * my_online(@css)
+ * {
+ *	Lock @css's parent and @css;
+ *	Inherit state from the parent;
+ *	Unlock both.
+ * }
+ *
+ * my_update_state(@css)
+ * {
+ *	css_for_each_descendant_pre(@pos, @css) {
+ *		Lock @pos;
+ *		if (@pos == @css)
+ *			Update @css's state;
+ *		else
+ *			Verify @pos is alive and inherit state from its parent;
+ *		Unlock @pos;
+ *	}
+ * }
+ *
+ * As long as the inheriting step, including checking the parent state, is
+ * enclosed inside @pos locking, double-locking the parent isn't necessary
+ * while inheriting.  The state update to the parent is guaranteed to be
+ * visible by walking order and, as long as inheriting operations to the
+ * same @pos are atomic to each other, multiple updates racing each other
+ * still result in the correct state.  It's guaranteed that at least one
+ * inheritance happens for any css after the latest update to its parent.
+ *
+ * If checking parent's state requires locking the parent, each inheriting
+ * iteration should lock and unlock both @pos->parent and @pos.
+ *
+ * Alternatively, a subsystem may choose to use a single global lock to
+ * synchronize ->css_online() and ->css_offline() against tree-walking
+ * operations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
+ */
+#define css_for_each_descendant_pre(pos, css)				\
+	for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);	\
+	     (pos) = css_next_descendant_pre((pos), (css)))
+
+/**
+ * css_for_each_descendant_post - post-order walk of a css's descendants
+ * @pos: the css * to use as the loop cursor
+ * @css: css whose descendants to walk
+ *
+ * Similar to css_for_each_descendant_pre() but performs post-order
+ * traversal instead.  @root is included in the iteration and the last
+ * node to be visited.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal.  It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ *
+ * Note that the walk visibility guarantee example described in pre-order
+ * walk doesn't apply the same to post-order walks.
+ */
+#define css_for_each_descendant_post(pos, css)				\
+	for ((pos) = css_next_descendant_post(NULL, (css)); (pos);	\
+	     (pos) = css_next_descendant_post((pos), (css)))
+
+/**
+ * cgroup_taskset_for_each - iterate cgroup_taskset
+ * @task: the loop cursor
+ * @tset: taskset to iterate
+ */
+#define cgroup_taskset_for_each(task, tset)				\
+	for ((task) = cgroup_taskset_first((tset)); (task);		\
+	     (task) = cgroup_taskset_next((tset)))
+
+/*
+ * Inline functions.
+ */
 
 /**
  * css_get - obtain a reference on the specified css
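A hedged sketch of a walk following the iteration contract documented in the hunk above: iterate under rcu_read_lock(), pin the cursor with css_tryget_online() before dropping the lock for sleepable work, and re-take the lock before advancing (example_* is hypothetical, not from this series):

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static void example_walk_children(struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_child(pos, parent) {
		/* children may be on/offlining; pin before use */
		if (!css_tryget_online(pos))
			continue;
		rcu_read_unlock();

		/* ... sleepable work against @pos ... */

		rcu_read_lock();
		css_put(pos);	/* the ref kept @pos accessible for the next step */
	}
	rcu_read_unlock();
}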
@@ -185,532 +291,6 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
 	percpu_ref_put_many(&css->refcnt, n);
 }
 
-/* bits in struct cgroup flags field */
-enum {
-	/* Control Group requires release notifications to userspace */
-	CGRP_NOTIFY_ON_RELEASE,
-	/*
-	 * Clone the parent's configuration when creating a new child
-	 * cpuset cgroup.  For historical reasons, this option can be
-	 * specified at mount time and thus is implemented here.
-	 */
-	CGRP_CPUSET_CLONE_CHILDREN,
-};
-
-struct cgroup {
-	/* self css with NULL ->ss, points back to this cgroup */
-	struct cgroup_subsys_state self;
-
-	unsigned long flags;		/* "unsigned long" so bitops work */
-
-	/*
-	 * idr allocated in-hierarchy ID.
-	 *
-	 * ID 0 is not used, the ID of the root cgroup is always 1, and a
-	 * new cgroup will be assigned with a smallest available ID.
-	 *
-	 * Allocating/Removing ID must be protected by cgroup_mutex.
-	 */
-	int id;
-
-	/*
-	 * If this cgroup contains any tasks, it contributes one to
-	 * populated_cnt.  All children with non-zero populated_cnt of
-	 * their own contribute one.  The count is zero iff there's no task
-	 * in this cgroup or its subtree.
-	 */
-	int populated_cnt;
-
-	struct kernfs_node *kn;		/* cgroup kernfs entry */
-	struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
-
-	/*
-	 * The bitmask of subsystems enabled on the child cgroups.
-	 * ->subtree_control is the one configured through
-	 * "cgroup.subtree_control" while ->child_subsys_mask is the
-	 * effective one which may have more subsystems enabled.
-	 * Controller knobs are made available iff it's enabled in
-	 * ->subtree_control.
-	 */
-	unsigned int subtree_control;
-	unsigned int child_subsys_mask;
-
-	/* Private pointers for each registered subsystem */
-	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
-
-	struct cgroup_root *root;
-
-	/*
-	 * List of cgrp_cset_links pointing at css_sets with tasks in this
-	 * cgroup.  Protected by css_set_lock.
-	 */
-	struct list_head cset_links;
-
-	/*
-	 * On the default hierarchy, a css_set for a cgroup with some
-	 * subsys disabled will point to css's which are associated with
-	 * the closest ancestor which has the subsys enabled.  The
-	 * following lists all css_sets which point to this cgroup's css
-	 * for the given subsystem.
-	 */
-	struct list_head e_csets[CGROUP_SUBSYS_COUNT];
-
-	/*
-	 * list of pidlists, up to two for each namespace (one for procs, one
-	 * for tasks); created on demand.
-	 */
-	struct list_head pidlists;
-	struct mutex pidlist_mutex;
-
-	/* used to wait for offlining of csses */
-	wait_queue_head_t offline_waitq;
-
-	/* used to schedule release agent */
-	struct work_struct release_agent_work;
-};
-
-#define MAX_CGROUP_ROOT_NAMELEN 64
-
-/* cgroup_root->flags */
-enum {
-	CGRP_ROOT_SANE_BEHAVIOR	= (1 << 0), /* __DEVEL__sane_behavior specified */
-	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
-	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
-};
-
-/*
- * A cgroup_root represents the root of a cgroup hierarchy, and may be
- * associated with a kernfs_root to form an active hierarchy.  This is
- * internal to cgroup core.  Don't access directly from controllers.
- */
-struct cgroup_root {
-	struct kernfs_root *kf_root;
-
-	/* The bitmask of subsystems attached to this hierarchy */
-	unsigned int subsys_mask;
-
-	/* Unique id for this hierarchy. */
-	int hierarchy_id;
-
-	/* The root cgroup.  Root is destroyed on its release. */
-	struct cgroup cgrp;
-
-	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
-	atomic_t nr_cgrps;
-
-	/* A list running through the active hierarchies */
-	struct list_head root_list;
-
-	/* Hierarchy-specific flags */
-	unsigned int flags;
-
-	/* IDs for cgroups in this hierarchy */
-	struct idr cgroup_idr;
-
-	/* The path to use for release notifications. */
-	char release_agent_path[PATH_MAX];
-
-	/* The name for this hierarchy - may be empty */
-	char name[MAX_CGROUP_ROOT_NAMELEN];
-};
-
-/*
- * A css_set is a structure holding pointers to a set of
- * cgroup_subsys_state objects.  This saves space in the task struct
- * object and speeds up fork()/exit(), since a single inc/dec and a
- * list_add()/del() can bump the reference count on the entire cgroup
- * set for a task.
- */
-
-struct css_set {
-
-	/* Reference count */
-	atomic_t refcount;
-
-	/*
-	 * List running through all cgroup groups in the same hash
-	 * slot.  Protected by css_set_lock
-	 */
-	struct hlist_node hlist;
-
-	/*
-	 * Lists running through all tasks using this cgroup group.
-	 * mg_tasks lists tasks which belong to this cset but are in the
-	 * process of being migrated out or in.  Protected by
-	 * css_set_rwsem, but, during migration, once tasks are moved to
-	 * mg_tasks, it can be read safely while holding cgroup_mutex.
-	 */
-	struct list_head tasks;
-	struct list_head mg_tasks;
-
-	/*
-	 * List of cgrp_cset_links pointing at cgroups referenced from this
-	 * css_set.  Protected by css_set_lock.
-	 */
-	struct list_head cgrp_links;
-
-	/* the default cgroup associated with this css_set */
-	struct cgroup *dfl_cgrp;
-
-	/*
-	 * Set of subsystem states, one for each subsystem.  This array is
-	 * immutable after creation apart from the init_css_set during
-	 * subsystem registration (at boot time).
-	 */
-	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
-
-	/*
-	 * List of csets participating in the on-going migration either as
-	 * source or destination.  Protected by cgroup_mutex.
-	 */
-	struct list_head mg_preload_node;
-	struct list_head mg_node;
-
-	/*
-	 * If this cset is acting as the source of migration the following
-	 * two fields are set.  mg_src_cgrp is the source cgroup of the
-	 * on-going migration and mg_dst_cset is the destination cset the
-	 * target tasks on this cset should be migrated to.  Protected by
-	 * cgroup_mutex.
-	 */
-	struct cgroup *mg_src_cgrp;
-	struct css_set *mg_dst_cset;
-
-	/*
-	 * On the default hierarchy, ->subsys[ssid] may point to a css
-	 * attached to an ancestor instead of the cgroup this css_set is
-	 * associated with.  The following node is anchored at
-	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
-	 * iterate through all css's attached to a given cgroup.
-	 */
-	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
-
-	/* For RCU-protected deletion */
-	struct rcu_head rcu_head;
-};
-
-/*
- * struct cftype: handler definitions for cgroup control files
- *
- * When reading/writing to a file:
- *	- the cgroup to use is file->f_path.dentry->d_parent->d_fsdata
- *	- the 'cftype' of the file is file->f_path.dentry->d_fsdata
- */
-
-/* cftype->flags */
-enum {
-	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
-	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
-	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
-
-	/* internal flags, do not use outside cgroup core proper */
-	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */
-	__CFTYPE_NOT_ON_DFL	= (1 << 17),	/* not on default hierarchy */
-};
-
-#define MAX_CFTYPE_NAME		64
-
-struct cftype {
-	/*
-	 * By convention, the name should begin with the name of the
-	 * subsystem, followed by a period.  Zero length string indicates
-	 * end of cftype array.
-	 */
-	char name[MAX_CFTYPE_NAME];
-	int private;
-	/*
-	 * If not 0, file mode is set to this value, otherwise it will
-	 * be figured out automatically
-	 */
-	umode_t mode;
-
-	/*
-	 * The maximum length of string, excluding trailing nul, that can
-	 * be passed to write.  If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
-	 */
-	size_t max_write_len;
-
-	/* CFTYPE_* flags */
-	unsigned int flags;
-
-	/*
-	 * Fields used for internal bookkeeping.  Initialized automatically
-	 * during registration.
-	 */
-	struct cgroup_subsys *ss;	/* NULL for cgroup core files */
-	struct list_head node;		/* anchored at ss->cfts */
-	struct kernfs_ops *kf_ops;
-
-	/*
-	 * read_u64() is a shortcut for the common case of returning a
-	 * single integer.  Use it in place of read()
-	 */
-	u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
-	/*
-	 * read_s64() is a signed version of read_u64()
-	 */
-	s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
-
-	/* generic seq_file read interface */
-	int (*seq_show)(struct seq_file *sf, void *v);
-
-	/* optional ops, implement all or none */
-	void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
-	void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
-	void (*seq_stop)(struct seq_file *sf, void *v);
-
-	/*
-	 * write_u64() is a shortcut for the common case of accepting
-	 * a single integer (as parsed by simple_strtoull) from
-	 * userspace.  Use in place of write(); return 0 or error.
-	 */
-	int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
-			 u64 val);
-	/*
-	 * write_s64() is a signed version of write_u64()
-	 */
-	int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
-			 s64 val);
-
-	/*
-	 * write() is the generic write callback which maps directly to
-	 * kernfs write operation and overrides all other operations.
-	 * Maximum write size is determined by ->max_write_len.  Use
-	 * of_css/cft() to access the associated css and cft.
-	 */
-	ssize_t (*write)(struct kernfs_open_file *of,
-			 char *buf, size_t nbytes, loff_t off);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lock_class_key lockdep_key;
-#endif
-};
-
-extern struct cgroup_root cgrp_dfl_root;
-extern struct css_set init_css_set;
-
-/**
- * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
- * @cgrp: the cgroup of interest
- *
- * The default hierarchy is the v2 interface of cgroup and this function
- * can be used to test whether a cgroup is on the default hierarchy for
- * cases where a subsystem should behave differently depending on the
- * interface version.
- *
- * The set of behaviors which change on the default hierarchy are still
- * being determined and the mount option is prefixed with __DEVEL__.
- *
- * List of changed behaviors:
- *
- * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
- *   and "name" are disallowed.
- *
- * - When mounting an existing superblock, mount options should match.
- *
- * - Remount is disallowed.
- *
- * - rename(2) is disallowed.
- *
- * - "tasks" is removed.  Everything should be at process granularity.  Use
- *   "cgroup.procs" instead.
- *
- * - "cgroup.procs" is not sorted.  pids will be unique unless they got
- *   recycled in between reads.
- *
- * - "release_agent" and "notify_on_release" are removed.  Replacement
- *   notification mechanism will be implemented.
- *
- * - "cgroup.clone_children" is removed.
- *
- * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
- *   and its descendants contain no task; otherwise, 1.  The file also
- *   generates kernfs notification which can be monitored through poll and
- *   [di]notify when the value of the file changes.
- *
- * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
- *   take masks of ancestors with non-empty cpus/mems, instead of being
- *   moved to an ancestor.
- *
- * - cpuset: a task can be moved into an empty cpuset, and again it takes
- *   masks of ancestors.
- *
- * - memcg: use_hierarchy is on by default and the cgroup file for the flag
- *   is not created.
- *
- * - blkcg: blk-throttle becomes properly hierarchical.
- *
- * - debug: disallowed on the default hierarchy.
- */
-static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
-{
-	return cgrp->root == &cgrp_dfl_root;
-}
-
-/* no synchronization, the result can only be used as a hint */
-static inline bool cgroup_has_tasks(struct cgroup *cgrp)
-{
-	return !list_empty(&cgrp->cset_links);
-}
-
-/* returns ino associated with a cgroup */
-static inline ino_t cgroup_ino(struct cgroup *cgrp)
-{
-	return cgrp->kn->ino;
-}
-
-/* cft/css accessors for cftype->write() operation */
-static inline struct cftype *of_cft(struct kernfs_open_file *of)
-{
-	return of->kn->priv;
-}
-
-struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);
-
-/* cft/css accessors for cftype->seq_*() operations */
-static inline struct cftype *seq_cft(struct seq_file *seq)
-{
-	return of_cft(seq->private);
-}
-
-static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
-{
-	return of_css(seq->private);
-}
-
-/*
- * Name / path handling functions.  All are thin wrappers around the kernfs
- * counterparts and can be called under any context.
- */
-
-static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
-{
-	return kernfs_name(cgrp->kn, buf, buflen);
-}
-
-static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
-					      size_t buflen)
-{
-	return kernfs_path(cgrp->kn, buf, buflen);
-}
-
-static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
-{
-	pr_cont_kernfs_name(cgrp->kn);
-}
-
-static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
-{
-	pr_cont_kernfs_path(cgrp->kn);
-}
-
-char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
-
-int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
-int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
-int cgroup_rm_cftypes(struct cftype *cfts);
-
-bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
-
-/*
- * Control Group taskset, used to pass around set of tasks to cgroup_subsys
- * methods.
- */
-struct cgroup_taskset;
-struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
-struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
-
-/**
- * cgroup_taskset_for_each - iterate cgroup_taskset
- * @task: the loop cursor
- * @tset: taskset to iterate
- */
-#define cgroup_taskset_for_each(task, tset)				\
-	for ((task) = cgroup_taskset_first((tset)); (task);		\
-	     (task) = cgroup_taskset_next((tset)))
-
-/*
- * Control Group subsystem type.
- * See Documentation/cgroups/cgroups.txt for details
- */
-
-struct cgroup_subsys {
-	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
-	int (*css_online)(struct cgroup_subsys_state *css);
-	void (*css_offline)(struct cgroup_subsys_state *css);
-	void (*css_released)(struct cgroup_subsys_state *css);
-	void (*css_free)(struct cgroup_subsys_state *css);
-	void (*css_reset)(struct cgroup_subsys_state *css);
-	void (*css_e_css_changed)(struct cgroup_subsys_state *css);
-
-	int (*can_attach)(struct cgroup_subsys_state *css,
-			  struct cgroup_taskset *tset);
-	void (*cancel_attach)(struct cgroup_subsys_state *css,
-			      struct cgroup_taskset *tset);
-	void (*attach)(struct cgroup_subsys_state *css,
-		       struct cgroup_taskset *tset);
-	void (*fork)(struct task_struct *task);
-	void (*exit)(struct cgroup_subsys_state *css,
-		     struct cgroup_subsys_state *old_css,
-		     struct task_struct *task);
-	void (*bind)(struct cgroup_subsys_state *root_css);
-
-	int disabled;
-	int early_init;
-
-	/*
-	 * If %false, this subsystem is properly hierarchical -
-	 * configuration, resource accounting and restriction on a parent
-	 * cgroup cover those of its children.  If %true, hierarchy support
-	 * is broken in some ways - some subsystems ignore hierarchy
-	 * completely while others are only implemented half-way.
-	 *
-	 * It's now disallowed to create nested cgroups if the subsystem is
-	 * broken and cgroup core will emit a warning message on such
-	 * cases.  Eventually, all subsystems will be made properly
-	 * hierarchical and this will go away.
-	 */
-	bool broken_hierarchy;
-	bool warned_broken_hierarchy;
-
-	/* the following two fields are initialized automatically during boot */
-	int id;
-#define MAX_CGROUP_TYPE_NAMELEN 32
-	const char *name;
-
-	/* link to parent, protected by cgroup_lock() */
-	struct cgroup_root *root;
-
-	/* idr for css->id */
-	struct idr css_idr;
-
-	/*
-	 * List of cftypes.  Each entry is the first entry of an array
-	 * terminated by zero length name.
-	 */
-	struct list_head cfts;
-
-	/*
-	 * Base cftypes which are automatically registered.  The two can
-	 * point to the same array.
-	 */
-	struct cftype *dfl_cftypes;	/* for the default hierarchy */
-	struct cftype *legacy_cftypes;	/* for the legacy hierarchies */
-
-	/*
-	 * A subsystem may depend on other subsystems.  When such subsystem
-	 * is enabled on a cgroup, the depended-upon subsystems are enabled
-	 * together if available.  Subsystems enabled due to dependency are
-	 * not visible to userland until explicitly enabled.  The following
-	 * specifies the mask of subsystems that this one depends on.
-	 */
-	unsigned int depends_on;
-};
-
-#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
-#include <linux/cgroup_subsys.h>
-#undef SUBSYS
-
 /**
  * task_css_set_check - obtain a task's css_set with extra access conditions
  * @task: the task to obtain css_set for
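Everything deleted in this hunk was moved, not dropped: the definitions now live in cgroup-defs.h (see the new file above).  In use, the cftype conventions documented there - the subsystem-name prefix, the read_u64()/write_u64() shortcuts and the zero-length-name terminator - come together as in this hypothetical controller's file table (a sketch, not code from this series):

#include <linux/cgroup.h>
#include <linux/kernel.h>

struct demo_css {
	struct cgroup_subsys_state css;
	u64 limit;
};

static inline struct demo_css *demo_from_css(struct cgroup_subsys_state *css)
{
	return container_of(css, struct demo_css, css);
}

static u64 demo_limit_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return demo_from_css(css)->limit;
}

static int demo_limit_write(struct cgroup_subsys_state *css,
			    struct cftype *cft, u64 val)
{
	demo_from_css(css)->limit = val;
	return 0;
}

static struct cftype demo_files[] = {
	{
		.name = "limit",	/* registered as "demo.limit" */
		.read_u64 = demo_limit_read,
		.write_u64 = demo_limit_write,
	},
	{ }	/* zero length name terminates the array */
};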
@@ -818,178 +398,137 @@ static inline struct cgroup *task_cgroup(struct task_struct *task,
818 return task_css(task, subsys_id)->cgroup; 398 return task_css(task, subsys_id)->cgroup;
819} 399}
820 400
821struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
822 struct cgroup_subsys_state *parent);
823
824struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
825
826/** 401/**
827 * css_for_each_child - iterate through children of a css 402 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
828 * @pos: the css * to use as the loop cursor 403 * @cgrp: the cgroup of interest
829 * @parent: css whose children to walk
830 * 404 *
831 * Walk @parent's children. Must be called under rcu_read_lock(). 405 * The default hierarchy is the v2 interface of cgroup and this function
406 * can be used to test whether a cgroup is on the default hierarchy for
407 * cases where a subsystem should behave differently depending on the
408 * interface version.
832 * 409 *
833 * If a subsystem synchronizes ->css_online() and the start of iteration, a 410 * The set of behaviors which change on the default hierarchy are still
834 * css which finished ->css_online() is guaranteed to be visible in the 411 * being determined and the mount option is prefixed with __DEVEL__.
835 * future iterations and will stay visible until the last reference is put.
836 * A css which hasn't finished ->css_online() or already finished
837 * ->css_offline() may show up during traversal. It's each subsystem's
838 * responsibility to synchronize against on/offlining.
839 * 412 *
840 * It is allowed to temporarily drop RCU read lock during iteration. The 413 * List of changed behaviors:
841 * caller is responsible for ensuring that @pos remains accessible until
842 * the start of the next iteration by, for example, bumping the css refcnt.
843 */
844#define css_for_each_child(pos, parent) \
845 for ((pos) = css_next_child(NULL, (parent)); (pos); \
846 (pos) = css_next_child((pos), (parent)))
847
848struct cgroup_subsys_state *
849css_next_descendant_pre(struct cgroup_subsys_state *pos,
850 struct cgroup_subsys_state *css);
851
852struct cgroup_subsys_state *
853css_rightmost_descendant(struct cgroup_subsys_state *pos);
854
855/**
856 * css_for_each_descendant_pre - pre-order walk of a css's descendants
857 * @pos: the css * to use as the loop cursor
858 * @root: css whose descendants to walk
859 * 414 *
860 * Walk @root's descendants. @root is included in the iteration and the 415 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
861 * first node to be visited. Must be called under rcu_read_lock(). 416 * and "name" are disallowed.
862 * 417 *
863 * If a subsystem synchronizes ->css_online() and the start of iteration, a 418 * - When mounting an existing superblock, mount options should match.
864 * css which finished ->css_online() is guaranteed to be visible in the
865 * future iterations and will stay visible until the last reference is put.
866 * A css which hasn't finished ->css_online() or already finished
867 * ->css_offline() may show up during traversal. It's each subsystem's
868 * responsibility to synchronize against on/offlining.
869 * 419 *
870 * For example, the following guarantees that a descendant can't escape 420 * - Remount is disallowed.
871 * state updates of its ancestors.
872 * 421 *
873 * my_online(@css) 422 * - rename(2) is disallowed.
874 * {
875 * Lock @css's parent and @css;
876 * Inherit state from the parent;
877 * Unlock both.
878 * }
879 * 423 *
880 * my_update_state(@css) 424 * - "tasks" is removed. Everything should be at process granularity. Use
881 * { 425 * "cgroup.procs" instead.
882 * css_for_each_descendant_pre(@pos, @css) {
883 * Lock @pos;
884 * if (@pos == @css)
885 * Update @css's state;
886 * else
887 * Verify @pos is alive and inherit state from its parent;
888 * Unlock @pos;
889 * }
890 * }
891 * 426 *
892 * As long as the inheriting step, including checking the parent state, is 427 * - "cgroup.procs" is not sorted. pids will be unique unless they got
893 * enclosed inside @pos locking, double-locking the parent isn't necessary 428 * recycled in between reads.
894 * while inheriting. The state update to the parent is guaranteed to be
895 * visible by walking order and, as long as inheriting operations to the
896 * same @pos are atomic to each other, multiple updates racing each other
897 * still result in the correct state. It's guaranteed that at least one 437 * generates kernfs notification which can be monitored through poll and
898 * inheritance happens for any css after the latest update to its parent.
899 * 429 *
900 * If checking parent's state requires locking the parent, each inheriting 430 * - "release_agent" and "notify_on_release" are removed. Replacement
901 * iteration should lock and unlock both @pos->parent and @pos. 431 * notification mechanism will be implemented.
902 * 432 *
903 * Alternatively, a subsystem may choose to use a single global lock to 433 * - "cgroup.clone_children" is removed.
904 * synchronize ->css_online() and ->css_offline() against tree-walking
905 * operations.
906 * 434 *
907 * It is allowed to temporarily drop RCU read lock during iteration. The 435 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
908 * caller is responsible for ensuring that @pos remains accessible until 436 * and its descendants contain no task; otherwise, 1. The file also
909 * the start of the next iteration by, for example, bumping the css refcnt. 437 * generates kernfs notification which can be monitored through poll and
910 */ 438 * [di]notify when the value of the file changes.
911#define css_for_each_descendant_pre(pos, css) \
912 for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \
913 (pos) = css_next_descendant_pre((pos), (css)))
914
915struct cgroup_subsys_state *
916css_next_descendant_post(struct cgroup_subsys_state *pos,
917 struct cgroup_subsys_state *css);
918
919/**
920 * css_for_each_descendant_post - post-order walk of a css's descendants
921 * @pos: the css * to use as the loop cursor
922 * @css: css whose descendants to walk
923 * 439 *
924 * Similar to css_for_each_descendant_pre() but performs post-order 440 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
925 * traversal instead. @root is included in the iteration and the last 441 * take masks of ancestors with non-empty cpus/mems, instead of being
926 * node to be visited. 442 * moved to an ancestor.
927 * 443 *
928 * If a subsystem synchronizes ->css_online() and the start of iteration, a 444 * - cpuset: a task can be moved into an empty cpuset, and again it takes
929 * css which finished ->css_online() is guaranteed to be visible in the 445 * masks of ancestors.
930 * future iterations and will stay visible until the last reference is put.
931 * A css which hasn't finished ->css_online() or already finished
932 * ->css_offline() may show up during traversal. It's each subsystem's
933 * responsibility to synchronize against on/offlining.
934 * 446 *
935 * Note that the walk visibility guarantee example described in pre-order 447 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
936 * walk doesn't apply the same to post-order walks. 448 * is not created.
449 *
450 * - blkcg: blk-throttle becomes properly hierarchical.
451 *
452 * - debug: disallowed on the default hierarchy.
937 */ 453 */
938#define css_for_each_descendant_post(pos, css) \ 454static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
939 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ 455{
940 (pos) = css_next_descendant_post((pos), (css))) 456 return cgrp->root == &cgrp_dfl_root;
457}
941 458
942bool css_has_online_children(struct cgroup_subsys_state *css); 459/* no synchronization, the result can only be used as a hint */
460static inline bool cgroup_has_tasks(struct cgroup *cgrp)
461{
462 return !list_empty(&cgrp->cset_links);
463}
943 464
944/* A css_task_iter should be treated as an opaque object */ 465/* returns ino associated with a cgroup */
945struct css_task_iter { 466static inline ino_t cgroup_ino(struct cgroup *cgrp)
946 struct cgroup_subsys *ss; 467{
468 return cgrp->kn->ino;
469}
947 470
948 struct list_head *cset_pos; 471/* cft/css accessors for cftype->write() operation */
949 struct list_head *cset_head; 472static inline struct cftype *of_cft(struct kernfs_open_file *of)
473{
474 return of->kn->priv;
475}
950 476
951 struct list_head *task_pos; 477struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);
952 struct list_head *tasks_head;
953 struct list_head *mg_tasks_head;
954};
955 478
956void css_task_iter_start(struct cgroup_subsys_state *css, 479/* cft/css accessors for cftype->seq_*() operations */
957 struct css_task_iter *it); 480static inline struct cftype *seq_cft(struct seq_file *seq)
958struct task_struct *css_task_iter_next(struct css_task_iter *it); 481{
959void css_task_iter_end(struct css_task_iter *it); 482 return of_cft(seq->private);
483}
960 484
961int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 485static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
962int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); 486{
487 return of_css(seq->private);
488}
963 489
964struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, 490/*
965 struct cgroup_subsys *ss); 491 * Name / path handling functions. All are thin wrappers around the kernfs
966struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, 492 * counterparts and can be called under any context.
967 struct cgroup_subsys *ss); 493 */
968 494
969#else /* !CONFIG_CGROUPS */ 495static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
496{
497 return kernfs_name(cgrp->kn, buf, buflen);
498}
970 499
971struct cgroup_subsys_state; 500static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
501 size_t buflen)
502{
503 return kernfs_path(cgrp->kn, buf, buflen);
504}
972 505
973static inline int cgroup_init_early(void) { return 0; } 506static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
974static inline int cgroup_init(void) { return 0; } 507{
975static inline void cgroup_fork(struct task_struct *p) {} 508 pr_cont_kernfs_name(cgrp->kn);
976static inline void cgroup_post_fork(struct task_struct *p) {} 509}
977static inline void cgroup_exit(struct task_struct *p) {}
978 510
979static inline int cgroupstats_build(struct cgroupstats *stats, 511static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
980 struct dentry *dentry)
981{ 512{
982 return -EINVAL; 513 pr_cont_kernfs_path(cgrp->kn);
983} 514}
984 515
985static inline void css_put(struct cgroup_subsys_state *css) {} 516#else /* !CONFIG_CGROUPS */
986 517
987/* No cgroups - nothing to do */ 518struct cgroup_subsys_state;
519
520static inline void css_put(struct cgroup_subsys_state *css) {}
988static inline int cgroup_attach_task_all(struct task_struct *from, 521static inline int cgroup_attach_task_all(struct task_struct *from,
989 struct task_struct *t) 522 struct task_struct *t) { return 0; }
990{ 523static inline int cgroupstats_build(struct cgroupstats *stats,
991 return 0; 524 struct dentry *dentry) { return -EINVAL; }
992} 525
526static inline void cgroup_fork(struct task_struct *p) {}
527static inline void cgroup_post_fork(struct task_struct *p) {}
528static inline void cgroup_exit(struct task_struct *p) {}
529
530static inline int cgroup_init_early(void) { return 0; }
531static inline int cgroup_init(void) { return 0; }
993 532
994#endif /* !CONFIG_CGROUPS */ 533#endif /* !CONFIG_CGROUPS */
995 534
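The #else branch above is the usual compile-out pattern: with CONFIG_CGROUPS off, every entry point collapses to a static inline no-op with the same signature, so call sites build without any #ifdef of their own. A standalone sketch of the pattern, with a hypothetical HAVE_FEATURE symbol standing in for CONFIG_CGROUPS:

#include <stdio.h>

#ifdef HAVE_FEATURE
int feature_init(void);                 /* real version lives elsewhere */
#else
static inline int feature_init(void) { return 0; }  /* free when disabled */
#endif

int main(void)
{
        printf("init: %d\n", feature_init());  /* no #ifdef at the call site */
        return 0;
}

The inline stubs cost nothing at runtime and keep the disabled configuration compiling against the exact same API.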
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index bb9b075f0eb0..e8493fee8160 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,13 +25,6 @@
25extern struct files_struct init_files; 25extern struct files_struct init_files;
26extern struct fs_struct init_fs; 26extern struct fs_struct init_fs;
27 27
28#ifdef CONFIG_CGROUPS
29#define INIT_GROUP_RWSEM(sig) \
30 .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
31#else
32#define INIT_GROUP_RWSEM(sig)
33#endif
34
35#ifdef CONFIG_CPUSETS 28#ifdef CONFIG_CPUSETS
36#define INIT_CPUSET_SEQ(tsk) \ 29#define INIT_CPUSET_SEQ(tsk) \
37 .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), 30 .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
@@ -55,7 +48,6 @@ extern struct fs_struct init_fs;
55 }, \ 48 }, \
56 .cred_guard_mutex = \ 49 .cred_guard_mutex = \
57 __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ 50 __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
58 INIT_GROUP_RWSEM(sig) \
59} 51}
60 52
61extern struct nsproxy init_nsproxy; 53extern struct nsproxy init_nsproxy;
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 71ecdab1671b..e6b2f7db9c0c 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -277,6 +277,7 @@ void kernfs_put(struct kernfs_node *kn);
277 277
278struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); 278struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
279struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); 279struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
280struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
280 281
281struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, 282struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
282 unsigned int flags, void *priv); 283 unsigned int flags, void *priv);
@@ -352,6 +353,10 @@ static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
352static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) 353static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
353{ return NULL; } 354{ return NULL; }
354 355
356static inline struct inode *
357kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
358{ return NULL; }
359
355static inline struct kernfs_root * 360static inline struct kernfs_root *
356kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, 361kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags,
357 void *priv) 362 void *priv)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 93ed0b682adb..a09ece354c64 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -58,6 +58,7 @@ struct sched_param {
58#include <linux/uidgid.h> 58#include <linux/uidgid.h>
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/magic.h> 60#include <linux/magic.h>
61#include <linux/cgroup-defs.h>
61 62
62#include <asm/processor.h> 63#include <asm/processor.h>
63 64
@@ -755,18 +756,6 @@ struct signal_struct {
755 unsigned audit_tty_log_passwd; 756 unsigned audit_tty_log_passwd;
756 struct tty_audit_buf *tty_audit_buf; 757 struct tty_audit_buf *tty_audit_buf;
757#endif 758#endif
758#ifdef CONFIG_CGROUPS
759 /*
760 * group_rwsem prevents new tasks from entering the threadgroup and
761 * member tasks from exiting, more specifically, setting of
762 * PF_EXITING. fork and exit paths are protected with this rwsem
763 * using threadgroup_change_begin/end(). Users which require
764 * threadgroup to remain stable should use threadgroup_[un]lock()
765 * which also takes care of exec path. Currently, cgroup is the
766 * only user.
767 */
768 struct rw_semaphore group_rwsem;
769#endif
770 759
771 oom_flags_t oom_flags; 760 oom_flags_t oom_flags;
772 short oom_score_adj; /* OOM kill score adjustment */ 761 short oom_score_adj; /* OOM kill score adjustment */
@@ -2725,53 +2714,33 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
2725 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); 2714 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
2726} 2715}
2727 2716
2728#ifdef CONFIG_CGROUPS
2729static inline void threadgroup_change_begin(struct task_struct *tsk)
2730{
2731 down_read(&tsk->signal->group_rwsem);
2732}
2733static inline void threadgroup_change_end(struct task_struct *tsk)
2734{
2735 up_read(&tsk->signal->group_rwsem);
2736}
2737
2738/** 2717/**
2739 * threadgroup_lock - lock threadgroup 2718 * threadgroup_change_begin - mark the beginning of changes to a threadgroup
2740 * @tsk: member task of the threadgroup to lock 2719 * @tsk: task causing the changes
2741 *
2742 * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
2743 * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
2744 * change ->group_leader/pid. This is useful for cases where the threadgroup
2745 * needs to stay stable across blockable operations.
2746 *
2747 * fork and exit paths explicitly call threadgroup_change_{begin|end}() for
2748 * synchronization. While held, no new task will be added to threadgroup
2749 * and no existing live task will have its PF_EXITING set.
2750 * 2720 *
2751 * de_thread() does threadgroup_change_{begin|end}() when a non-leader 2721 * All operations which modify a threadgroup - a new thread joining the
2752 * sub-thread becomes a new leader. 2722 * group, death of a member thread (the assertion of PF_EXITING) and
2723 * exec(2) dethreading the process and replacing the leader - are wrapped
2724 * by threadgroup_change_{begin|end}(). This is to provide a place which
2725 * subsystems needing threadgroup stability can hook into for
2726 * synchronization.
2753 */ 2727 */
2754static inline void threadgroup_lock(struct task_struct *tsk) 2728static inline void threadgroup_change_begin(struct task_struct *tsk)
2755{ 2729{
2756 down_write(&tsk->signal->group_rwsem); 2730 might_sleep();
2731 cgroup_threadgroup_change_begin(tsk);
2757} 2732}
2758 2733
2759/** 2734/**
2760 * threadgroup_unlock - unlock threadgroup 2735 * threadgroup_change_end - mark the end of changes to a threadgroup
2761 * @tsk: member task of the threadgroup to unlock 2736 * @tsk: task causing the changes
2762 * 2737 *
2763 * Reverse threadgroup_lock(). 2738 * See threadgroup_change_begin().
2764 */ 2739 */
2765static inline void threadgroup_unlock(struct task_struct *tsk) 2740static inline void threadgroup_change_end(struct task_struct *tsk)
2766{ 2741{
2767 up_write(&tsk->signal->group_rwsem); 2742 cgroup_threadgroup_change_end(tsk);
2768} 2743}
2769#else
2770static inline void threadgroup_change_begin(struct task_struct *tsk) {}
2771static inline void threadgroup_change_end(struct task_struct *tsk) {}
2772static inline void threadgroup_lock(struct task_struct *tsk) {}
2773static inline void threadgroup_unlock(struct task_struct *tsk) {}
2774#endif
2775 2744
2776#ifndef __HAVE_THREAD_FUNCTIONS 2745#ifndef __HAVE_THREAD_FUNCTIONS
2777 2746
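After this reorganization, threadgroup_change_begin()/end() are unconditional and simply forward to the cgroup hooks, so every writer to a threadgroup is bracketed the same way. An illustrative caller shape on the fork path (fragment; current is the usual kernel pointer to the running task):

        threadgroup_change_begin(current);
        /* ... link the new thread into the group, or mark PF_EXITING ... */
        threadgroup_change_end(current);

Because the begin hook now resolves to a read-side acquisition of the global cgroup_threadgroup_rwsem, migration can exclude all such modifications at once by taking the same semaphore for writing.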
diff --git a/init/Kconfig b/init/Kconfig
index f0c2e681b506..7d1ffd2ae536 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -924,6 +924,7 @@ config NUMA_BALANCING_DEFAULT_ENABLED
924menuconfig CGROUPS 924menuconfig CGROUPS
925 bool "Control Group support" 925 bool "Control Group support"
926 select KERNFS 926 select KERNFS
927 select PERCPU_RWSEM
927 help 928 help
928 This option adds support for grouping sets of processes together, for 929 This option adds support for grouping sets of processes together, for
929 use with process control subsystems such as Cpusets, CFS, memory 930 use with process control subsystems such as Cpusets, CFS, memory
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 469dd547770c..9ef9fc8a774b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/spinlock.h> 47#include <linux/spinlock.h>
48#include <linux/rwsem.h> 48#include <linux/rwsem.h>
49#include <linux/percpu-rwsem.h>
49#include <linux/string.h> 50#include <linux/string.h>
50#include <linux/sort.h> 51#include <linux/sort.h>
51#include <linux/kmod.h> 52#include <linux/kmod.h>
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
103 */ 104 */
104static DEFINE_SPINLOCK(release_agent_path_lock); 105static DEFINE_SPINLOCK(release_agent_path_lock);
105 106
107struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
108
106#define cgroup_assert_mutex_or_rcu_locked() \ 109#define cgroup_assert_mutex_or_rcu_locked() \
107 rcu_lockdep_assert(rcu_read_lock_held() || \ 110 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_mutex), \ 111 lockdep_is_held(&cgroup_mutex), \
@@ -156,7 +159,7 @@ static bool cgrp_dfl_root_visible;
156static bool cgroup_legacy_files_on_dfl; 159static bool cgroup_legacy_files_on_dfl;
157 160
158/* some controllers are not supported in the default hierarchy */ 161/* some controllers are not supported in the default hierarchy */
159static unsigned int cgrp_dfl_root_inhibit_ss_mask; 162static unsigned long cgrp_dfl_root_inhibit_ss_mask;
160 163
161/* The list of hierarchy roots */ 164/* The list of hierarchy roots */
162 165
@@ -175,18 +178,19 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
175 */ 178 */
176static u64 css_serial_nr_next = 1; 179static u64 css_serial_nr_next = 1;
177 180
178/* This flag indicates whether tasks in the fork and exit paths should 181/*
179 * check for fork/exit handlers to call. This avoids us having to do 182 * These bitmask flags indicate whether tasks in the fork and exit paths have
180 * extra work in the fork/exit path if none of the subsystems need to 183 * fork/exit handlers to call. This avoids us having to do extra work in the
181 * be called. 184 * fork/exit path to check which subsystems have fork/exit callbacks.
182 */ 185 */
183static int need_forkexit_callback __read_mostly; 186static unsigned long have_fork_callback __read_mostly;
187static unsigned long have_exit_callback __read_mostly;
184 188
185static struct cftype cgroup_dfl_base_files[]; 189static struct cftype cgroup_dfl_base_files[];
186static struct cftype cgroup_legacy_base_files[]; 190static struct cftype cgroup_legacy_base_files[];
187 191
188static int rebind_subsystems(struct cgroup_root *dst_root, 192static int rebind_subsystems(struct cgroup_root *dst_root,
189 unsigned int ss_mask); 193 unsigned long ss_mask);
190static int cgroup_destroy_locked(struct cgroup *cgrp); 194static int cgroup_destroy_locked(struct cgroup *cgrp);
191static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, 195static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
192 bool visible); 196 bool visible);
@@ -261,7 +265,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
261 * @cgrp: the cgroup of interest 265 * @cgrp: the cgroup of interest
262 * @ss: the subsystem of interest (%NULL returns @cgrp->self) 266 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
263 * 267 *
264 * Similar to cgroup_css() but returns the effctive css, which is defined 268 * Similar to cgroup_css() but returns the effective css, which is defined
265 * as the matching css of the nearest ancestor including self which has @ss 269 * as the matching css of the nearest ancestor including self which has @ss
266 * enabled. If @ss is associated with the hierarchy @cgrp is on, this 270 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
267 * function is guaranteed to return non-NULL css. 271 * function is guaranteed to return non-NULL css.
@@ -409,6 +413,24 @@ static int notify_on_release(const struct cgroup *cgrp)
409 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ 413 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
410 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) 414 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
411 415
416/**
417 * for_each_subsys_which - filter for_each_subsys with a bitmask
418 * @ss: the iteration cursor
419 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
420 * @ss_maskp: a pointer to the bitmask
421 *
422 * The block will only run for cases where the ssid-th bit (1 << ssid) of
423 * mask is set to 1.
424 */
425#define for_each_subsys_which(ss, ssid, ss_maskp) \
426 if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \
427 (ssid) = 0; \
428 else \
429 for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \
430 if (((ss) = cgroup_subsys[ssid]) && false) \
431 break; \
432 else
433
412/* iterate across the hierarchies */ 434/* iterate across the hierarchies */
413#define for_each_root(root) \ 435#define for_each_root(root) \
414 list_for_each_entry((root), &cgroup_roots, root_list) 436 list_for_each_entry((root), &cgroup_roots, root_list)
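for_each_subsys_which() is a for_each_set_bit() walk with the cursor assignment smuggled into an always-false if/else, so a dangling else in the caller still binds correctly. A standalone model of the same control flow, with a hypothetical four-entry name table in place of cgroup_subsys[]:

#include <stdio.h>

#define NSUBSYS 4
static const char *subsys_name[NSUBSYS] = { "cpu", "memory", "io", "pids" };

int main(void)
{
        unsigned long mask = (1UL << 0) | (1UL << 2);  /* bits 0 and 2 set */
        const char *ss;
        int ssid;

        /* open-coded equivalent of for_each_subsys_which(ss, ssid, &mask) */
        for (ssid = 0; ssid < NSUBSYS; ssid++) {
                if (!(mask & (1UL << ssid)))
                        continue;                      /* bit clear: skip */
                ss = subsys_name[ssid];
                printf("%d -> %s\n", ssid, ss);        /* the loop body */
        }
        return 0;
}

The !CGROUP_SUBSYS_COUNT arm only initialises ssid to silence a spurious gcc warning in configurations with no subsystems, as the macro's own comment notes.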
@@ -882,7 +904,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
882static void cgroup_free_root(struct cgroup_root *root) 904static void cgroup_free_root(struct cgroup_root *root)
883{ 905{
884 if (root) { 906 if (root) {
885 /* hierarhcy ID shoulid already have been released */ 907 /* hierarchy ID should already have been released */
886 WARN_ON_ONCE(root->hierarchy_id); 908 WARN_ON_ONCE(root->hierarchy_id);
887 909
888 idr_destroy(&root->cgroup_idr); 910 idr_destroy(&root->cgroup_idr);
@@ -998,7 +1020,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
998 * update of a tasks cgroup pointer by cgroup_attach_task() 1020 * update of a tasks cgroup pointer by cgroup_attach_task()
999 */ 1021 */
1000 1022
1001static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); 1023static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
1002static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 1024static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1003static const struct file_operations proc_cgroupstats_operations; 1025static const struct file_operations proc_cgroupstats_operations;
1004 1026
@@ -1068,11 +1090,11 @@ static void cgroup_put(struct cgroup *cgrp)
1068 * @subtree_control is to be applied to @cgrp. The returned mask is always 1090 * @subtree_control is to be applied to @cgrp. The returned mask is always
1069 * a superset of @subtree_control and follows the usual hierarchy rules. 1091 * a superset of @subtree_control and follows the usual hierarchy rules.
1070 */ 1092 */
1071static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, 1093static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
1072 unsigned int subtree_control) 1094 unsigned long subtree_control)
1073{ 1095{
1074 struct cgroup *parent = cgroup_parent(cgrp); 1096 struct cgroup *parent = cgroup_parent(cgrp);
1075 unsigned int cur_ss_mask = subtree_control; 1097 unsigned long cur_ss_mask = subtree_control;
1076 struct cgroup_subsys *ss; 1098 struct cgroup_subsys *ss;
1077 int ssid; 1099 int ssid;
1078 1100
@@ -1082,11 +1104,10 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
1082 return cur_ss_mask; 1104 return cur_ss_mask;
1083 1105
1084 while (true) { 1106 while (true) {
1085 unsigned int new_ss_mask = cur_ss_mask; 1107 unsigned long new_ss_mask = cur_ss_mask;
1086 1108
1087 for_each_subsys(ss, ssid) 1109 for_each_subsys_which(ss, ssid, &cur_ss_mask)
1088 if (cur_ss_mask & (1 << ssid)) 1110 new_ss_mask |= ss->depends_on;
1089 new_ss_mask |= ss->depends_on;
1090 1111
1091 /* 1112 /*
1092 * Mask out subsystems which aren't available. This can 1113 * Mask out subsystems which aren't available. This can
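The rewritten loop computes a dependency closure: starting from @subtree_control, it keeps OR-ing in ->depends_on for every set bit until the mask stops changing. A standalone model of the fixed-point iteration, with a hypothetical depends_on[] table in which subsystem 0 depends on 1, which depends on 2:

#include <stdio.h>

#define NSUBSYS 4
static const unsigned long depends_on[NSUBSYS] = { 1UL << 1, 1UL << 2, 0, 0 };

static unsigned long calc_closure(unsigned long mask)
{
        for (;;) {
                unsigned long new_mask = mask;
                int ssid;

                for (ssid = 0; ssid < NSUBSYS; ssid++)
                        if (mask & (1UL << ssid))
                                new_mask |= depends_on[ssid];

                if (new_mask == mask)         /* fixed point: closure done */
                        return mask;
                mask = new_mask;
        }
}

int main(void)
{
        /* enabling subsystem 0 transitively pulls in 1 and 2: prints 0x7 */
        printf("0x%lx\n", calc_closure(1UL << 0));
        return 0;
}

The loop terminates because the mask only ever grows and has at most NSUBSYS bits to gain.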
@@ -1200,7 +1221,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1200 * @cgrp: target cgroup 1221 * @cgrp: target cgroup
1201 * @subsys_mask: mask of the subsystem ids whose files should be removed 1222 * @subsys_mask: mask of the subsystem ids whose files should be removed
1202 */ 1223 */
1203static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) 1224static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
1204{ 1225{
1205 struct cgroup_subsys *ss; 1226 struct cgroup_subsys *ss;
1206 int i; 1227 int i;
@@ -1215,18 +1236,16 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
1215 } 1236 }
1216} 1237}
1217 1238
1218static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) 1239static int rebind_subsystems(struct cgroup_root *dst_root,
1240 unsigned long ss_mask)
1219{ 1241{
1220 struct cgroup_subsys *ss; 1242 struct cgroup_subsys *ss;
1221 unsigned int tmp_ss_mask; 1243 unsigned long tmp_ss_mask;
1222 int ssid, i, ret; 1244 int ssid, i, ret;
1223 1245
1224 lockdep_assert_held(&cgroup_mutex); 1246 lockdep_assert_held(&cgroup_mutex);
1225 1247
1226 for_each_subsys(ss, ssid) { 1248 for_each_subsys_which(ss, ssid, &ss_mask) {
1227 if (!(ss_mask & (1 << ssid)))
1228 continue;
1229
1230 /* if @ss has non-root csses attached to it, can't move */ 1249 /* if @ss has non-root csses attached to it, can't move */
1231 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) 1250 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
1232 return -EBUSY; 1251 return -EBUSY;
@@ -1253,7 +1272,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
1253 * Just warn about it and continue. 1272 * Just warn about it and continue.
1254 */ 1273 */
1255 if (cgrp_dfl_root_visible) { 1274 if (cgrp_dfl_root_visible) {
1256 pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", 1275 pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
1257 ret, ss_mask); 1276 ret, ss_mask);
1258 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); 1277 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1259 } 1278 }
@@ -1263,18 +1282,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
1263 * Nothing can fail from this point on. Remove files for the 1282 * Nothing can fail from this point on. Remove files for the
1264 * removed subsystems and rebind each subsystem. 1283 * removed subsystems and rebind each subsystem.
1265 */ 1284 */
1266 for_each_subsys(ss, ssid) 1285 for_each_subsys_which(ss, ssid, &ss_mask)
1267 if (ss_mask & (1 << ssid)) 1286 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1268 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1269 1287
1270 for_each_subsys(ss, ssid) { 1288 for_each_subsys_which(ss, ssid, &ss_mask) {
1271 struct cgroup_root *src_root; 1289 struct cgroup_root *src_root;
1272 struct cgroup_subsys_state *css; 1290 struct cgroup_subsys_state *css;
1273 struct css_set *cset; 1291 struct css_set *cset;
1274 1292
1275 if (!(ss_mask & (1 << ssid)))
1276 continue;
1277
1278 src_root = ss->root; 1293 src_root = ss->root;
1279 css = cgroup_css(&src_root->cgrp, ss); 1294 css = cgroup_css(&src_root->cgrp, ss);
1280 1295
@@ -1338,7 +1353,7 @@ static int cgroup_show_options(struct seq_file *seq,
1338} 1353}
1339 1354
1340struct cgroup_sb_opts { 1355struct cgroup_sb_opts {
1341 unsigned int subsys_mask; 1356 unsigned long subsys_mask;
1342 unsigned int flags; 1357 unsigned int flags;
1343 char *release_agent; 1358 char *release_agent;
1344 bool cpuset_clone_children; 1359 bool cpuset_clone_children;
@@ -1351,7 +1366,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1351{ 1366{
1352 char *token, *o = data; 1367 char *token, *o = data;
1353 bool all_ss = false, one_ss = false; 1368 bool all_ss = false, one_ss = false;
1354 unsigned int mask = -1U; 1369 unsigned long mask = -1UL;
1355 struct cgroup_subsys *ss; 1370 struct cgroup_subsys *ss;
1356 int nr_opts = 0; 1371 int nr_opts = 0;
1357 int i; 1372 int i;
@@ -1495,7 +1510,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1495 int ret = 0; 1510 int ret = 0;
1496 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1511 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1497 struct cgroup_sb_opts opts; 1512 struct cgroup_sb_opts opts;
1498 unsigned int added_mask, removed_mask; 1513 unsigned long added_mask, removed_mask;
1499 1514
1500 if (root == &cgrp_dfl_root) { 1515 if (root == &cgrp_dfl_root) {
1501 pr_err("remount is not allowed\n"); 1516 pr_err("remount is not allowed\n");
@@ -1641,7 +1656,7 @@ static void init_cgroup_root(struct cgroup_root *root,
1641 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1656 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1642} 1657}
1643 1658
1644static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) 1659static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1645{ 1660{
1646 LIST_HEAD(tmp_links); 1661 LIST_HEAD(tmp_links);
1647 struct cgroup *root_cgrp = &root->cgrp; 1662 struct cgroup *root_cgrp = &root->cgrp;
@@ -2052,9 +2067,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
2052 lockdep_assert_held(&css_set_rwsem); 2067 lockdep_assert_held(&css_set_rwsem);
2053 2068
2054 /* 2069 /*
2055 * We are synchronized through threadgroup_lock() against PF_EXITING 2070 * We are synchronized through cgroup_threadgroup_rwsem against
2056 * setting such that we can't race against cgroup_exit() changing the 2071 * PF_EXITING setting such that we can't race against cgroup_exit()
2057 * css_set to init_css_set and dropping the old one. 2072 * changing the css_set to init_css_set and dropping the old one.
2058 */ 2073 */
2059 WARN_ON_ONCE(tsk->flags & PF_EXITING); 2074 WARN_ON_ONCE(tsk->flags & PF_EXITING);
2060 old_cset = task_css_set(tsk); 2075 old_cset = task_css_set(tsk);
@@ -2111,10 +2126,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2111 * @src_cset and add it to @preloaded_csets, which should later be cleaned 2126 * @src_cset and add it to @preloaded_csets, which should later be cleaned
2112 * up by cgroup_migrate_finish(). 2127 * up by cgroup_migrate_finish().
2113 * 2128 *
2114 * This function may be called without holding threadgroup_lock even if the 2129 * This function may be called without holding cgroup_threadgroup_rwsem
2115 * target is a process. Threads may be created and destroyed but as long 2130 * even if the target is a process. Threads may be created and destroyed
2116 * as cgroup_mutex is not dropped, no new css_set can be put into play and 2131 * but as long as cgroup_mutex is not dropped, no new css_set can be put
2117 * the preloaded css_sets are guaranteed to cover all migrations. 2132 * into play and the preloaded css_sets are guaranteed to cover all
2133 * migrations.
2118 */ 2134 */
2119static void cgroup_migrate_add_src(struct css_set *src_cset, 2135static void cgroup_migrate_add_src(struct css_set *src_cset,
2120 struct cgroup *dst_cgrp, 2136 struct cgroup *dst_cgrp,
@@ -2217,7 +2233,7 @@ err:
2217 * @threadgroup: whether @leader points to the whole process or a single task 2233 * @threadgroup: whether @leader points to the whole process or a single task
2218 * 2234 *
2219 * Migrate a process or task denoted by @leader to @cgrp. If migrating a 2235 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
2220 * process, the caller must be holding threadgroup_lock of @leader. The 2236 * process, the caller must be holding cgroup_threadgroup_rwsem. The
2221 * caller is also responsible for invoking cgroup_migrate_add_src() and 2237 * caller is also responsible for invoking cgroup_migrate_add_src() and
2222 * cgroup_migrate_prepare_dst() on the targets before invoking this 2238 * cgroup_migrate_prepare_dst() on the targets before invoking this
2223 * function and following up with cgroup_migrate_finish(). 2239 * function and following up with cgroup_migrate_finish().
@@ -2345,7 +2361,7 @@ out_release_tset:
2345 * @leader: the task or the leader of the threadgroup to be attached 2361 * @leader: the task or the leader of the threadgroup to be attached
2346 * @threadgroup: attach the whole threadgroup? 2362 * @threadgroup: attach the whole threadgroup?
2347 * 2363 *
2348 * Call holding cgroup_mutex and threadgroup_lock of @leader. 2364 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
2349 */ 2365 */
2350static int cgroup_attach_task(struct cgroup *dst_cgrp, 2366static int cgroup_attach_task(struct cgroup *dst_cgrp,
2351 struct task_struct *leader, bool threadgroup) 2367 struct task_struct *leader, bool threadgroup)
@@ -2376,6 +2392,47 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2376 return ret; 2392 return ret;
2377} 2393}
2378 2394
2395static int cgroup_procs_write_permission(struct task_struct *task,
2396 struct cgroup *dst_cgrp,
2397 struct kernfs_open_file *of)
2398{
2399 const struct cred *cred = current_cred();
2400 const struct cred *tcred = get_task_cred(task);
2401 int ret = 0;
2402
2403 /*
2404 * even if we're attaching all tasks in the thread group, we only
2405 * need to check permissions on one of them.
2406 */
2407 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2408 !uid_eq(cred->euid, tcred->uid) &&
2409 !uid_eq(cred->euid, tcred->suid))
2410 ret = -EACCES;
2411
2412 if (!ret && cgroup_on_dfl(dst_cgrp)) {
2413 struct super_block *sb = of->file->f_path.dentry->d_sb;
2414 struct cgroup *cgrp;
2415 struct inode *inode;
2416
2417 down_read(&css_set_rwsem);
2418 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2419 up_read(&css_set_rwsem);
2420
2421 while (!cgroup_is_descendant(dst_cgrp, cgrp))
2422 cgrp = cgroup_parent(cgrp);
2423
2424 ret = -ENOMEM;
2425 inode = kernfs_get_inode(sb, cgrp->procs_kn);
2426 if (inode) {
2427 ret = inode_permission(inode, MAY_WRITE);
2428 iput(inode);
2429 }
2430 }
2431
2432 put_cred(tcred);
2433 return ret;
2434}
2435
2379/* 2436/*
2380 * Find the task_struct of the task to attach by vpid and pass it along to the 2437 * Find the task_struct of the task to attach by vpid and pass it along to the
2381 * function to attach either it or all tasks in its threadgroup. Will lock 2438 * function to attach either it or all tasks in its threadgroup. Will lock
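The factored-out permission check follows the usual credential rules: the writer must be root, or its effective uid must match the target's real or saved uid; and, as the comment notes, checking a single task suffices even when attaching the whole threadgroup. A standalone model of just the uid test (uid 0 stands in for GLOBAL_ROOT_UID):

#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

static bool may_move_task(uid_t writer_euid, uid_t tuid, uid_t tsuid)
{
        return writer_euid == 0 ||     /* root may move anything */
               writer_euid == tuid ||  /* writer owns the target */
               writer_euid == tsuid;   /* matches the saved set-uid */
}

int main(void)
{
        printf("%d\n", may_move_task(1000, 1000, 1000));  /* 1: allowed */
        printf("%d\n", may_move_task(1000, 0, 0));        /* 0: denied */
        return 0;
}

On the default hierarchy the uid test is followed by the new common-ancestor rule: walk up from the task's current cgroup until the destination is a descendant, then require write permission on that common ancestor's "cgroup.procs" file.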
@@ -2385,7 +2442,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2385 size_t nbytes, loff_t off, bool threadgroup) 2442 size_t nbytes, loff_t off, bool threadgroup)
2386{ 2443{
2387 struct task_struct *tsk; 2444 struct task_struct *tsk;
2388 const struct cred *cred = current_cred(), *tcred;
2389 struct cgroup *cgrp; 2445 struct cgroup *cgrp;
2390 pid_t pid; 2446 pid_t pid;
2391 int ret; 2447 int ret;
@@ -2397,29 +2453,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2397 if (!cgrp) 2453 if (!cgrp)
2398 return -ENODEV; 2454 return -ENODEV;
2399 2455
2400retry_find_task: 2456 percpu_down_write(&cgroup_threadgroup_rwsem);
2401 rcu_read_lock(); 2457 rcu_read_lock();
2402 if (pid) { 2458 if (pid) {
2403 tsk = find_task_by_vpid(pid); 2459 tsk = find_task_by_vpid(pid);
2404 if (!tsk) { 2460 if (!tsk) {
2405 rcu_read_unlock();
2406 ret = -ESRCH; 2461 ret = -ESRCH;
2407 goto out_unlock_cgroup; 2462 goto out_unlock_rcu;
2408 } 2463 }
2409 /* 2464 } else {
2410 * even if we're attaching all tasks in the thread group, we
2411 * only need to check permissions on one of them.
2412 */
2413 tcred = __task_cred(tsk);
2414 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2415 !uid_eq(cred->euid, tcred->uid) &&
2416 !uid_eq(cred->euid, tcred->suid)) {
2417 rcu_read_unlock();
2418 ret = -EACCES;
2419 goto out_unlock_cgroup;
2420 }
2421 } else
2422 tsk = current; 2465 tsk = current;
2466 }
2423 2467
2424 if (threadgroup) 2468 if (threadgroup)
2425 tsk = tsk->group_leader; 2469 tsk = tsk->group_leader;
@@ -2431,35 +2475,23 @@ retry_find_task:
2431 */ 2475 */
2432 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { 2476 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2433 ret = -EINVAL; 2477 ret = -EINVAL;
2434 rcu_read_unlock(); 2478 goto out_unlock_rcu;
2435 goto out_unlock_cgroup;
2436 } 2479 }
2437 2480
2438 get_task_struct(tsk); 2481 get_task_struct(tsk);
2439 rcu_read_unlock(); 2482 rcu_read_unlock();
2440 2483
2441 threadgroup_lock(tsk); 2484 ret = cgroup_procs_write_permission(tsk, cgrp, of);
2442 if (threadgroup) { 2485 if (!ret)
2443 if (!thread_group_leader(tsk)) { 2486 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2444 /*
2445 * a race with de_thread from another thread's exec()
2446 * may strip us of our leadership, if this happens,
2447 * there is no choice but to throw this task away and
2448 * try again; this is
2449 * "double-double-toil-and-trouble-check locking".
2450 */
2451 threadgroup_unlock(tsk);
2452 put_task_struct(tsk);
2453 goto retry_find_task;
2454 }
2455 }
2456
2457 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2458
2459 threadgroup_unlock(tsk);
2460 2487
2461 put_task_struct(tsk); 2488 put_task_struct(tsk);
2462out_unlock_cgroup: 2489 goto out_unlock_threadgroup;
2490
2491out_unlock_rcu:
2492 rcu_read_unlock();
2493out_unlock_threadgroup:
2494 percpu_up_write(&cgroup_threadgroup_rwsem);
2463 cgroup_kn_unlock(of->kn); 2495 cgroup_kn_unlock(of->kn);
2464 return ret ?: nbytes; 2496 return ret ?: nbytes;
2465} 2497}
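The locking shape after this hunk: attach writers take the global cgroup_threadgroup_rwsem for writing around the whole lookup/permission/attach sequence, while fork and exit take it for reading through threadgroup_change_begin(). An illustrative sketch of the two sides, assuming the standard percpu-rwsem reader calls sit behind the change_begin/end hooks (fragment):

        /* hot path, e.g. fork/exit: per-CPU counter in the common case */
        percpu_down_read(&cgroup_threadgroup_rwsem);
        /* ... modify the threadgroup ... */
        percpu_up_read(&cgroup_threadgroup_rwsem);

        /* slow path, e.g. __cgroup_procs_write(): excludes all readers */
        percpu_down_write(&cgroup_threadgroup_rwsem);
        /* ... find task, check permission, cgroup_attach_task() ... */
        percpu_up_write(&cgroup_threadgroup_rwsem);

Readers stay cheap on the hot paths, and one write-side acquisition covers an entire multi-task migration.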
@@ -2542,19 +2574,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2542 return 0; 2574 return 0;
2543} 2575}
2544 2576
2545static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) 2577static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
2546{ 2578{
2547 struct cgroup_subsys *ss; 2579 struct cgroup_subsys *ss;
2548 bool printed = false; 2580 bool printed = false;
2549 int ssid; 2581 int ssid;
2550 2582
2551 for_each_subsys(ss, ssid) { 2583 for_each_subsys_which(ss, ssid, &ss_mask) {
2552 if (ss_mask & (1 << ssid)) { 2584 if (printed)
2553 if (printed) 2585 seq_putc(seq, ' ');
2554 seq_putc(seq, ' '); 2586 seq_printf(seq, "%s", ss->name);
2555 seq_printf(seq, "%s", ss->name); 2587 printed = true;
2556 printed = true;
2557 }
2558 } 2588 }
2559 if (printed) 2589 if (printed)
2560 seq_putc(seq, '\n'); 2590 seq_putc(seq, '\n');
@@ -2606,6 +2636,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2606 2636
2607 lockdep_assert_held(&cgroup_mutex); 2637 lockdep_assert_held(&cgroup_mutex);
2608 2638
2639 percpu_down_write(&cgroup_threadgroup_rwsem);
2640
2609 /* look up all csses currently attached to @cgrp's subtree */ 2641 /* look up all csses currently attached to @cgrp's subtree */
2610 down_read(&css_set_rwsem); 2642 down_read(&css_set_rwsem);
2611 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { 2643 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
@@ -2661,17 +2693,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2661 goto out_finish; 2693 goto out_finish;
2662 last_task = task; 2694 last_task = task;
2663 2695
2664 threadgroup_lock(task);
2665 /* raced against de_thread() from another thread? */
2666 if (!thread_group_leader(task)) {
2667 threadgroup_unlock(task);
2668 put_task_struct(task);
2669 continue;
2670 }
2671
2672 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); 2696 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2673 2697
2674 threadgroup_unlock(task);
2675 put_task_struct(task); 2698 put_task_struct(task);
2676 2699
2677 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) 2700 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -2681,6 +2704,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2681 2704
2682out_finish: 2705out_finish:
2683 cgroup_migrate_finish(&preloaded_csets); 2706 cgroup_migrate_finish(&preloaded_csets);
2707 percpu_up_write(&cgroup_threadgroup_rwsem);
2684 return ret; 2708 return ret;
2685} 2709}
2686 2710
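With cgroup_threadgroup_rwsem held for writing across the whole update, the old per-task dance (threadgroup_lock(), re-check thread_group_leader() against a de_thread() race, bail out) collapses to a plain migrate call; no leader can change identity while the writer holds the semaphore. The before/after shape, condensed from the hunk above (fragment):

        /* before: per-task lock plus leadership re-check */
        threadgroup_lock(task);
        if (!thread_group_leader(task)) {  /* raced with de_thread() */
                threadgroup_unlock(task);
                put_task_struct(task);
                continue;                  /* skip this stale task */
        }
        ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
        threadgroup_unlock(task);

        /* after: the global write lock already excludes exec/fork/exit */
        ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);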
@@ -2689,8 +2713,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2689 char *buf, size_t nbytes, 2713 char *buf, size_t nbytes,
2690 loff_t off) 2714 loff_t off)
2691{ 2715{
2692 unsigned int enable = 0, disable = 0; 2716 unsigned long enable = 0, disable = 0;
2693 unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; 2717 unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
2694 struct cgroup *cgrp, *child; 2718 struct cgroup *cgrp, *child;
2695 struct cgroup_subsys *ss; 2719 struct cgroup_subsys *ss;
2696 char *tok; 2720 char *tok;
@@ -2702,11 +2726,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2702 */ 2726 */
2703 buf = strstrip(buf); 2727 buf = strstrip(buf);
2704 while ((tok = strsep(&buf, " "))) { 2728 while ((tok = strsep(&buf, " "))) {
2729 unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
2730
2705 if (tok[0] == '\0') 2731 if (tok[0] == '\0')
2706 continue; 2732 continue;
2707 for_each_subsys(ss, ssid) { 2733 for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
2708 if (ss->disabled || strcmp(tok + 1, ss->name) || 2734 if (ss->disabled || strcmp(tok + 1, ss->name))
2709 ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
2710 continue; 2735 continue;
2711 2736
2712 if (*tok == '+') { 2737 if (*tok == '+') {
@@ -2793,10 +2818,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2793 * still around. In such cases, wait till it's gone using 2818 * still around. In such cases, wait till it's gone using
2794 * offline_waitq. 2819 * offline_waitq.
2795 */ 2820 */
2796 for_each_subsys(ss, ssid) { 2821 for_each_subsys_which(ss, ssid, &css_enable) {
2797 if (!(css_enable & (1 << ssid)))
2798 continue;
2799
2800 cgroup_for_each_live_child(child, cgrp) { 2822 cgroup_for_each_live_child(child, cgrp) {
2801 DEFINE_WAIT(wait); 2823 DEFINE_WAIT(wait);
2802 2824
@@ -3087,7 +3109,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
3087 return ret; 3109 return ret;
3088 } 3110 }
3089 3111
3090 if (cft->seq_show == cgroup_populated_show) 3112 if (cft->write == cgroup_procs_write)
3113 cgrp->procs_kn = kn;
3114 else if (cft->seq_show == cgroup_populated_show)
3091 cgrp->populated_kn = kn; 3115 cgrp->populated_kn = kn;
3092 return 0; 3116 return 0;
3093} 3117}
@@ -4322,7 +4346,7 @@ static struct cftype cgroup_legacy_base_files[] = {
4322 * 4346 *
4323 * On failure, no file is added. 4347 * On failure, no file is added.
4324 */ 4348 */
4325static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) 4349static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4326{ 4350{
4327 struct cgroup_subsys *ss; 4351 struct cgroup_subsys *ss;
4328 int i, ret = 0; 4352 int i, ret = 0;
@@ -4931,7 +4955,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4931 * init_css_set is in the subsystem's root cgroup. */ 4955 * init_css_set is in the subsystem's root cgroup. */
4932 init_css_set.subsys[ss->id] = css; 4956 init_css_set.subsys[ss->id] = css;
4933 4957
4934 need_forkexit_callback |= ss->fork || ss->exit; 4958 have_fork_callback |= (bool)ss->fork << ss->id;
4959 have_exit_callback |= (bool)ss->exit << ss->id;
4935 4960
4936 /* At system boot, before all subsystems have been 4961 /* At system boot, before all subsystems have been
4937 * registered, no tasks have been forked, so we don't 4962 * registered, no tasks have been forked, so we don't
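need_forkexit_callback used to be a single flag that forced every fork/exit to scan all subsystems; the new per-callback bitmasks record, at registration time, exactly which subsystems want each hook. A standalone model of the (bool)ss->fork << ss->id trick and the resulting hot-path walk:

#include <stdio.h>

struct subsys { void (*fork)(void); };

static void cpu_fork(void) { puts("cpu->fork"); }

int main(void)
{
        struct subsys ss[3] = { { cpu_fork }, { 0 }, { 0 } };
        unsigned long have_fork_callback = 0;
        int id;

        /* registration: set bit 'id' iff the subsystem has a handler */
        for (id = 0; id < 3; id++)
                have_fork_callback |= (unsigned long)(ss[id].fork != 0) << id;

        /* fork hot path: visit only the flagged subsystems */
        for (id = 0; id < 3; id++)
                if (have_fork_callback & (1UL << id))
                        ss[id].fork();
        return 0;
}

In the kernel the second loop is spelled for_each_subsys_which(ss, i, &have_fork_callback), as the cgroup_post_fork() hunk below shows.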
@@ -4989,6 +5014,7 @@ int __init cgroup_init(void)
4989 unsigned long key; 5014 unsigned long key;
4990 int ssid, err; 5015 int ssid, err;
4991 5016
5017 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
4992 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); 5018 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
4993 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); 5019 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
4994 5020
@@ -5241,11 +5267,8 @@ void cgroup_post_fork(struct task_struct *child)
5241 * css_set; otherwise, @child might change state between ->fork() 5267 * css_set; otherwise, @child might change state between ->fork()
5242 * and addition to css_set. 5268 * and addition to css_set.
5243 */ 5269 */
5244 if (need_forkexit_callback) { 5270 for_each_subsys_which(ss, i, &have_fork_callback)
5245 for_each_subsys(ss, i) 5271 ss->fork(child);
5246 if (ss->fork)
5247 ss->fork(child);
5248 }
5249} 5272}
5250 5273
5251/** 5274/**
@@ -5289,16 +5312,12 @@ void cgroup_exit(struct task_struct *tsk)
5289 cset = task_css_set(tsk); 5312 cset = task_css_set(tsk);
5290 RCU_INIT_POINTER(tsk->cgroups, &init_css_set); 5313 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5291 5314
5292 if (need_forkexit_callback) { 5315 /* see cgroup_post_fork() for details */
5293 /* see cgroup_post_fork() for details */ 5316 for_each_subsys_which(ss, i, &have_exit_callback) {
5294 for_each_subsys(ss, i) { 5317 struct cgroup_subsys_state *old_css = cset->subsys[i];
5295 if (ss->exit) { 5318 struct cgroup_subsys_state *css = task_css(tsk, i);
5296 struct cgroup_subsys_state *old_css = cset->subsys[i];
5297 struct cgroup_subsys_state *css = task_css(tsk, i);
5298 5319
5299 ss->exit(css, old_css, tsk); 5320 ss->exit(css, old_css, tsk);
5300 }
5301 }
5302 } 5321 }
5303 5322
5304 if (put_cset) 5323 if (put_cset)
diff --git a/kernel/fork.c b/kernel/fork.c
index 4c95cb34243c..1bfefc6f96a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1141,10 +1141,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1141 tty_audit_fork(sig); 1141 tty_audit_fork(sig);
1142 sched_autogroup_fork(sig); 1142 sched_autogroup_fork(sig);
1143 1143
1144#ifdef CONFIG_CGROUPS
1145 init_rwsem(&sig->group_rwsem);
1146#endif
1147
1148 sig->oom_score_adj = current->signal->oom_score_adj; 1144 sig->oom_score_adj = current->signal->oom_score_adj;
1149 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1145 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1150 1146