author     Tejun Heo <tj@kernel.org>    2016-12-27 14:49:06 -0500
committer  Tejun Heo <tj@kernel.org>    2016-12-27 14:49:06 -0500
commit     0a268dbd7932c78896f5a45c8a492b31729db6c0 (patch)
tree       dc988c67e71fe5f43301042f23525bc8ff5cdb9b
parent     201af4c0fab02876ef0311e7f7b4083aa138930c (diff)
cgroup: move cgroup v1 specific code to kernel/cgroup/cgroup-v1.c
cgroup.c is getting too unwieldy. Let's move out cgroup v1 specific
code along with the debug controller into kernel/cgroup/cgroup-v1.c.
v2: cgroup_mutex and css_set_lock made available in cgroup-internal.h
regardless of CONFIG_PROVE_RCU.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
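
The v2 note above concerns how the two core locks are declared once the file is split. Roughly, and as a simplified sketch rather than the exact patch text, cgroup.c stops making cgroup_mutex and css_set_lock static when CONFIG_PROVE_RCU is off, so cgroup-internal.h can declare them extern for cgroup-v1.c; only the lockdep-oriented exports remain conditional:

/* Sketch of the before/after described in the v2 note (simplified). */

/* Before: the locks were static unless CONFIG_PROVE_RCU was set. */
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#else
static DEFINE_MUTEX(cgroup_mutex);
static DEFINE_SPINLOCK(css_set_lock);
#endif

/* After: always non-static so cgroup-internal.h can declare them extern
 * and cgroup-v1.c can use them; the exports stay conditional on
 * CONFIG_PROVE_RCU, where cgroup.h needs them for lockdep annotations.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
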
-rw-r--r--   kernel/cgroup/Makefile           |    2
-rw-r--r--   kernel/cgroup/cgroup-internal.h  |  103
-rw-r--r--   kernel/cgroup/cgroup-v1.c        | 1027
-rw-r--r--   kernel/cgroup/cgroup.c           | 1140
4 files changed, 1164 insertions(+), 1108 deletions(-)
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 4d561a50a5ac..719588cb18cd 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,4 +1,4 @@
-obj-y := cgroup.o
+obj-y := cgroup.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
new file mode 100644
index 000000000000..dca3193bd9d2
--- /dev/null
+++ b/kernel/cgroup/cgroup-internal.h
@@ -0,0 +1,103 @@
1 | #ifndef __CGROUP_INTERNAL_H | ||
2 | #define __CGROUP_INTERNAL_H | ||
3 | |||
4 | #include <linux/cgroup.h> | ||
5 | #include <linux/kernfs.h> | ||
6 | #include <linux/workqueue.h> | ||
7 | #include <linux/list.h> | ||
8 | |||
9 | /* | ||
10 | * A cgroup can be associated with multiple css_sets as different tasks may | ||
11 | * belong to different cgroups on different hierarchies. In the other | ||
12 | * direction, a css_set is naturally associated with multiple cgroups. | ||
13 | * This M:N relationship is represented by the following link structure | ||
14 | * which exists for each association and allows traversing the associations | ||
15 | * from both sides. | ||
16 | */ | ||
17 | struct cgrp_cset_link { | ||
18 | /* the cgroup and css_set this link associates */ | ||
19 | struct cgroup *cgrp; | ||
20 | struct css_set *cset; | ||
21 | |||
22 | /* list of cgrp_cset_links anchored at cgrp->cset_links */ | ||
23 | struct list_head cset_link; | ||
24 | |||
25 | /* list of cgrp_cset_links anchored at css_set->cgrp_links */ | ||
26 | struct list_head cgrp_link; | ||
27 | }; | ||
28 | |||
29 | extern struct mutex cgroup_mutex; | ||
30 | extern spinlock_t css_set_lock; | ||
31 | extern struct cgroup_subsys *cgroup_subsys[]; | ||
32 | extern struct list_head cgroup_roots; | ||
33 | extern struct file_system_type cgroup_fs_type; | ||
34 | |||
35 | /* iterate across the hierarchies */ | ||
36 | #define for_each_root(root) \ | ||
37 | list_for_each_entry((root), &cgroup_roots, root_list) | ||
38 | |||
39 | /** | ||
40 | * for_each_subsys - iterate all enabled cgroup subsystems | ||
41 | * @ss: the iteration cursor | ||
42 | * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end | ||
43 | */ | ||
44 | #define for_each_subsys(ss, ssid) \ | ||
45 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ | ||
46 | (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) | ||
47 | |||
48 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) | ||
49 | { | ||
50 | return !(cgrp->self.flags & CSS_ONLINE); | ||
51 | } | ||
52 | |||
53 | static inline bool notify_on_release(const struct cgroup *cgrp) | ||
54 | { | ||
55 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | ||
56 | } | ||
57 | |||
58 | bool cgroup_ssid_enabled(int ssid); | ||
59 | bool cgroup_on_dfl(const struct cgroup *cgrp); | ||
60 | |||
61 | struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); | ||
62 | struct cgroup *task_cgroup_from_root(struct task_struct *task, | ||
63 | struct cgroup_root *root); | ||
64 | struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline); | ||
65 | void cgroup_kn_unlock(struct kernfs_node *kn); | ||
66 | int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, | ||
67 | struct cgroup_namespace *ns); | ||
68 | |||
69 | int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); | ||
70 | |||
71 | bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); | ||
72 | void cgroup_migrate_finish(struct list_head *preloaded_csets); | ||
73 | void cgroup_migrate_add_src(struct css_set *src_cset, | ||
74 | struct cgroup *dst_cgrp, | ||
75 | struct list_head *preloaded_csets); | ||
76 | int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets); | ||
77 | int cgroup_migrate(struct task_struct *leader, bool threadgroup, | ||
78 | struct cgroup_root *root); | ||
79 | |||
80 | int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, | ||
81 | bool threadgroup); | ||
82 | ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, | ||
83 | size_t nbytes, loff_t off, bool threadgroup); | ||
84 | ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, | ||
85 | loff_t off); | ||
86 | |||
87 | void cgroup_lock_and_drain_offline(struct cgroup *cgrp); | ||
88 | |||
89 | /* | ||
90 | * cgroup-v1.c | ||
91 | */ | ||
92 | extern spinlock_t release_agent_path_lock; | ||
93 | extern struct cftype cgroup_legacy_base_files[]; | ||
94 | extern const struct file_operations proc_cgroupstats_operations; | ||
95 | |||
96 | bool cgroup_ssid_no_v1(int ssid); | ||
97 | void cgroup_pidlist_destroy_all(struct cgroup *cgrp); | ||
98 | int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, | ||
99 | const char *new_name_str); | ||
100 | void cgroup_release_agent(struct work_struct *work); | ||
101 | void check_for_release(struct cgroup *cgrp); | ||
102 | |||
103 | #endif /* __CGROUP_INTERNAL_H */ | ||
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
new file mode 100644
index 000000000000..7af745a46f91
--- /dev/null
+++ b/kernel/cgroup/cgroup-v1.c
@@ -0,0 +1,1027 @@
1 | #include "cgroup-internal.h" | ||
2 | |||
3 | #include <linux/kmod.h> | ||
4 | #include <linux/sort.h> | ||
5 | #include <linux/mm.h> | ||
6 | #include <linux/slab.h> | ||
7 | #include <linux/vmalloc.h> | ||
8 | #include <linux/delayacct.h> | ||
9 | #include <linux/pid_namespace.h> | ||
10 | #include <linux/cgroupstats.h> | ||
11 | |||
12 | #include <trace/events/cgroup.h> | ||
13 | |||
14 | /* | ||
15 | * pidlists linger the following amount before being destroyed. The goal | ||
16 | * is avoiding frequent destruction in the middle of consecutive read calls. | ||
17 | * Expiring in the middle is a performance problem not a correctness one. | ||
18 | * 1 sec should be enough. | ||
19 | */ | ||
20 | #define CGROUP_PIDLIST_DESTROY_DELAY HZ | ||
21 | |||
22 | /* Controllers blocked by the commandline in v1 */ | ||
23 | static u16 cgroup_no_v1_mask; | ||
24 | |||
25 | /* | ||
26 | * pidlist destructions need to be flushed on cgroup destruction. Use a | ||
27 | * separate workqueue as flush domain. | ||
28 | */ | ||
29 | static struct workqueue_struct *cgroup_pidlist_destroy_wq; | ||
30 | |||
31 | /* | ||
32 | * Protects cgroup_subsys->release_agent_path. Modifying it also requires | ||
33 | * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. | ||
34 | */ | ||
35 | DEFINE_SPINLOCK(release_agent_path_lock); | ||
36 | |||
37 | bool cgroup_ssid_no_v1(int ssid) | ||
38 | { | ||
39 | return cgroup_no_v1_mask & (1 << ssid); | ||
40 | } | ||
41 | |||
42 | /** | ||
43 | * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' | ||
44 | * @from: attach to all cgroups of a given task | ||
45 | * @tsk: the task to be attached | ||
46 | */ | ||
47 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | ||
48 | { | ||
49 | struct cgroup_root *root; | ||
50 | int retval = 0; | ||
51 | |||
52 | mutex_lock(&cgroup_mutex); | ||
53 | percpu_down_write(&cgroup_threadgroup_rwsem); | ||
54 | for_each_root(root) { | ||
55 | struct cgroup *from_cgrp; | ||
56 | |||
57 | if (root == &cgrp_dfl_root) | ||
58 | continue; | ||
59 | |||
60 | spin_lock_irq(&css_set_lock); | ||
61 | from_cgrp = task_cgroup_from_root(from, root); | ||
62 | spin_unlock_irq(&css_set_lock); | ||
63 | |||
64 | retval = cgroup_attach_task(from_cgrp, tsk, false); | ||
65 | if (retval) | ||
66 | break; | ||
67 | } | ||
68 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
69 | mutex_unlock(&cgroup_mutex); | ||
70 | |||
71 | return retval; | ||
72 | } | ||
73 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | ||
74 | |||
75 | /** | ||
76 | * cgroup_transfer_tasks - move tasks from one cgroup to another | ||
77 | * @to: cgroup to which the tasks will be moved | ||
78 | * @from: cgroup in which the tasks currently reside | ||
79 | * | ||
80 | * Locking rules between cgroup_post_fork() and the migration path | ||
81 | * guarantee that, if a task is forking while being migrated, the new child | ||
82 | * is guaranteed to be either visible in the source cgroup after the | ||
83 | * parent's migration is complete or put into the target cgroup. No task | ||
84 | * can slip out of migration through forking. | ||
85 | */ | ||
86 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | ||
87 | { | ||
88 | LIST_HEAD(preloaded_csets); | ||
89 | struct cgrp_cset_link *link; | ||
90 | struct css_task_iter it; | ||
91 | struct task_struct *task; | ||
92 | int ret; | ||
93 | |||
94 | if (cgroup_on_dfl(to)) | ||
95 | return -EINVAL; | ||
96 | |||
97 | if (!cgroup_may_migrate_to(to)) | ||
98 | return -EBUSY; | ||
99 | |||
100 | mutex_lock(&cgroup_mutex); | ||
101 | |||
102 | percpu_down_write(&cgroup_threadgroup_rwsem); | ||
103 | |||
104 | /* all tasks in @from are being moved, all csets are source */ | ||
105 | spin_lock_irq(&css_set_lock); | ||
106 | list_for_each_entry(link, &from->cset_links, cset_link) | ||
107 | cgroup_migrate_add_src(link->cset, to, &preloaded_csets); | ||
108 | spin_unlock_irq(&css_set_lock); | ||
109 | |||
110 | ret = cgroup_migrate_prepare_dst(&preloaded_csets); | ||
111 | if (ret) | ||
112 | goto out_err; | ||
113 | |||
114 | /* | ||
115 | * Migrate tasks one-by-one until @from is empty. This fails iff | ||
116 | * ->can_attach() fails. | ||
117 | */ | ||
118 | do { | ||
119 | css_task_iter_start(&from->self, &it); | ||
120 | task = css_task_iter_next(&it); | ||
121 | if (task) | ||
122 | get_task_struct(task); | ||
123 | css_task_iter_end(&it); | ||
124 | |||
125 | if (task) { | ||
126 | ret = cgroup_migrate(task, false, to->root); | ||
127 | if (!ret) | ||
128 | trace_cgroup_transfer_tasks(to, task, false); | ||
129 | put_task_struct(task); | ||
130 | } | ||
131 | } while (task && !ret); | ||
132 | out_err: | ||
133 | cgroup_migrate_finish(&preloaded_csets); | ||
134 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
135 | mutex_unlock(&cgroup_mutex); | ||
136 | return ret; | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * Stuff for reading the 'tasks'/'procs' files. | ||
141 | * | ||
142 | * Reading this file can return large amounts of data if a cgroup has | ||
143 | * *lots* of attached tasks. So it may need several calls to read(), | ||
144 | * but we cannot guarantee that the information we produce is correct | ||
145 | * unless we produce it entirely atomically. | ||
146 | * | ||
147 | */ | ||
148 | |||
149 | /* which pidlist file are we talking about? */ | ||
150 | enum cgroup_filetype { | ||
151 | CGROUP_FILE_PROCS, | ||
152 | CGROUP_FILE_TASKS, | ||
153 | }; | ||
154 | |||
155 | /* | ||
156 | * A pidlist is a list of pids that virtually represents the contents of one | ||
157 | * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, | ||
158 | * a pair (one each for procs, tasks) for each pid namespace that's relevant | ||
159 | * to the cgroup. | ||
160 | */ | ||
161 | struct cgroup_pidlist { | ||
162 | /* | ||
163 | * used to find which pidlist is wanted. doesn't change as long as | ||
164 | * this particular list stays in the list. | ||
165 | */ | ||
166 | struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; | ||
167 | /* array of xids */ | ||
168 | pid_t *list; | ||
169 | /* how many elements the above list has */ | ||
170 | int length; | ||
171 | /* each of these stored in a list by its cgroup */ | ||
172 | struct list_head links; | ||
173 | /* pointer to the cgroup we belong to, for list removal purposes */ | ||
174 | struct cgroup *owner; | ||
175 | /* for delayed destruction */ | ||
176 | struct delayed_work destroy_dwork; | ||
177 | }; | ||
178 | |||
179 | /* | ||
180 | * The following two functions "fix" the issue where there are more pids | ||
181 | * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. | ||
182 | * TODO: replace with a kernel-wide solution to this problem | ||
183 | */ | ||
184 | #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) | ||
185 | static void *pidlist_allocate(int count) | ||
186 | { | ||
187 | if (PIDLIST_TOO_LARGE(count)) | ||
188 | return vmalloc(count * sizeof(pid_t)); | ||
189 | else | ||
190 | return kmalloc(count * sizeof(pid_t), GFP_KERNEL); | ||
191 | } | ||
192 | |||
193 | static void pidlist_free(void *p) | ||
194 | { | ||
195 | kvfree(p); | ||
196 | } | ||
197 | |||
198 | /* | ||
199 | * Used to destroy all pidlists lingering waiting for destroy timer. None | ||
200 | * should be left afterwards. | ||
201 | */ | ||
202 | void cgroup_pidlist_destroy_all(struct cgroup *cgrp) | ||
203 | { | ||
204 | struct cgroup_pidlist *l, *tmp_l; | ||
205 | |||
206 | mutex_lock(&cgrp->pidlist_mutex); | ||
207 | list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) | ||
208 | mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); | ||
209 | mutex_unlock(&cgrp->pidlist_mutex); | ||
210 | |||
211 | flush_workqueue(cgroup_pidlist_destroy_wq); | ||
212 | BUG_ON(!list_empty(&cgrp->pidlists)); | ||
213 | } | ||
214 | |||
215 | static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) | ||
216 | { | ||
217 | struct delayed_work *dwork = to_delayed_work(work); | ||
218 | struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, | ||
219 | destroy_dwork); | ||
220 | struct cgroup_pidlist *tofree = NULL; | ||
221 | |||
222 | mutex_lock(&l->owner->pidlist_mutex); | ||
223 | |||
224 | /* | ||
225 | * Destroy iff we didn't get queued again. The state won't change | ||
226 | * as destroy_dwork can only be queued while locked. | ||
227 | */ | ||
228 | if (!delayed_work_pending(dwork)) { | ||
229 | list_del(&l->links); | ||
230 | pidlist_free(l->list); | ||
231 | put_pid_ns(l->key.ns); | ||
232 | tofree = l; | ||
233 | } | ||
234 | |||
235 | mutex_unlock(&l->owner->pidlist_mutex); | ||
236 | kfree(tofree); | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries | ||
241 | * Returns the number of unique elements. | ||
242 | */ | ||
243 | static int pidlist_uniq(pid_t *list, int length) | ||
244 | { | ||
245 | int src, dest = 1; | ||
246 | |||
247 | /* | ||
248 | * we presume the 0th element is unique, so i starts at 1. trivial | ||
249 | * edge cases first; no work needs to be done for either | ||
250 | */ | ||
251 | if (length == 0 || length == 1) | ||
252 | return length; | ||
253 | /* src and dest walk down the list; dest counts unique elements */ | ||
254 | for (src = 1; src < length; src++) { | ||
255 | /* find next unique element */ | ||
256 | while (list[src] == list[src-1]) { | ||
257 | src++; | ||
258 | if (src == length) | ||
259 | goto after; | ||
260 | } | ||
261 | /* dest always points to where the next unique element goes */ | ||
262 | list[dest] = list[src]; | ||
263 | dest++; | ||
264 | } | ||
265 | after: | ||
266 | return dest; | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * The two pid files - tasks and cgroup.procs - guaranteed that the result | ||
271 | * is sorted, which forced this whole pidlist fiasco. As pid order is | ||
272 | * different per namespace, each namespace needs differently sorted list, | ||
273 | * making it impossible to use, for example, single rbtree of member tasks | ||
274 | * sorted by task pointer. As pidlists can be fairly large, allocating one | ||
275 | * per open file is dangerous, so cgroup had to implement shared pool of | ||
276 | * pidlists keyed by cgroup and namespace. | ||
277 | */ | ||
278 | static int cmppid(const void *a, const void *b) | ||
279 | { | ||
280 | return *(pid_t *)a - *(pid_t *)b; | ||
281 | } | ||
282 | |||
283 | static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | ||
284 | enum cgroup_filetype type) | ||
285 | { | ||
286 | struct cgroup_pidlist *l; | ||
287 | /* don't need task_nsproxy() if we're looking at ourself */ | ||
288 | struct pid_namespace *ns = task_active_pid_ns(current); | ||
289 | |||
290 | lockdep_assert_held(&cgrp->pidlist_mutex); | ||
291 | |||
292 | list_for_each_entry(l, &cgrp->pidlists, links) | ||
293 | if (l->key.type == type && l->key.ns == ns) | ||
294 | return l; | ||
295 | return NULL; | ||
296 | } | ||
297 | |||
298 | /* | ||
299 | * find the appropriate pidlist for our purpose (given procs vs tasks) | ||
300 | * returns with the lock on that pidlist already held, and takes care | ||
301 | * of the use count, or returns NULL with no locks held if we're out of | ||
302 | * memory. | ||
303 | */ | ||
304 | static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, | ||
305 | enum cgroup_filetype type) | ||
306 | { | ||
307 | struct cgroup_pidlist *l; | ||
308 | |||
309 | lockdep_assert_held(&cgrp->pidlist_mutex); | ||
310 | |||
311 | l = cgroup_pidlist_find(cgrp, type); | ||
312 | if (l) | ||
313 | return l; | ||
314 | |||
315 | /* entry not found; create a new one */ | ||
316 | l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); | ||
317 | if (!l) | ||
318 | return l; | ||
319 | |||
320 | INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); | ||
321 | l->key.type = type; | ||
322 | /* don't need task_nsproxy() if we're looking at ourself */ | ||
323 | l->key.ns = get_pid_ns(task_active_pid_ns(current)); | ||
324 | l->owner = cgrp; | ||
325 | list_add(&l->links, &cgrp->pidlists); | ||
326 | return l; | ||
327 | } | ||
328 | |||
329 | /** | ||
330 | * cgroup_task_count - count the number of tasks in a cgroup. | ||
331 | * @cgrp: the cgroup in question | ||
332 | * | ||
333 | * Return the number of tasks in the cgroup. The returned number can be | ||
334 | * higher than the actual number of tasks due to css_set references from | ||
335 | * namespace roots and temporary usages. | ||
336 | */ | ||
337 | static int cgroup_task_count(const struct cgroup *cgrp) | ||
338 | { | ||
339 | int count = 0; | ||
340 | struct cgrp_cset_link *link; | ||
341 | |||
342 | spin_lock_irq(&css_set_lock); | ||
343 | list_for_each_entry(link, &cgrp->cset_links, cset_link) | ||
344 | count += atomic_read(&link->cset->refcount); | ||
345 | spin_unlock_irq(&css_set_lock); | ||
346 | return count; | ||
347 | } | ||
348 | |||
349 | /* | ||
350 | * Load a cgroup's pidarray with either procs' tgids or tasks' pids | ||
351 | */ | ||
352 | static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | ||
353 | struct cgroup_pidlist **lp) | ||
354 | { | ||
355 | pid_t *array; | ||
356 | int length; | ||
357 | int pid, n = 0; /* used for populating the array */ | ||
358 | struct css_task_iter it; | ||
359 | struct task_struct *tsk; | ||
360 | struct cgroup_pidlist *l; | ||
361 | |||
362 | lockdep_assert_held(&cgrp->pidlist_mutex); | ||
363 | |||
364 | /* | ||
365 | * If cgroup gets more users after we read count, we won't have | ||
366 | * enough space - tough. This race is indistinguishable to the | ||
367 | * caller from the case that the additional cgroup users didn't | ||
368 | * show up until sometime later on. | ||
369 | */ | ||
370 | length = cgroup_task_count(cgrp); | ||
371 | array = pidlist_allocate(length); | ||
372 | if (!array) | ||
373 | return -ENOMEM; | ||
374 | /* now, populate the array */ | ||
375 | css_task_iter_start(&cgrp->self, &it); | ||
376 | while ((tsk = css_task_iter_next(&it))) { | ||
377 | if (unlikely(n == length)) | ||
378 | break; | ||
379 | /* get tgid or pid for procs or tasks file respectively */ | ||
380 | if (type == CGROUP_FILE_PROCS) | ||
381 | pid = task_tgid_vnr(tsk); | ||
382 | else | ||
383 | pid = task_pid_vnr(tsk); | ||
384 | if (pid > 0) /* make sure to only use valid results */ | ||
385 | array[n++] = pid; | ||
386 | } | ||
387 | css_task_iter_end(&it); | ||
388 | length = n; | ||
389 | /* now sort & (if procs) strip out duplicates */ | ||
390 | sort(array, length, sizeof(pid_t), cmppid, NULL); | ||
391 | if (type == CGROUP_FILE_PROCS) | ||
392 | length = pidlist_uniq(array, length); | ||
393 | |||
394 | l = cgroup_pidlist_find_create(cgrp, type); | ||
395 | if (!l) { | ||
396 | pidlist_free(array); | ||
397 | return -ENOMEM; | ||
398 | } | ||
399 | |||
400 | /* store array, freeing old if necessary */ | ||
401 | pidlist_free(l->list); | ||
402 | l->list = array; | ||
403 | l->length = length; | ||
404 | *lp = l; | ||
405 | return 0; | ||
406 | } | ||
407 | |||
408 | /* | ||
409 | * seq_file methods for the tasks/procs files. The seq_file position is the | ||
410 | * next pid to display; the seq_file iterator is a pointer to the pid | ||
411 | * in the cgroup->l->list array. | ||
412 | */ | ||
413 | |||
414 | static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | ||
415 | { | ||
416 | /* | ||
417 | * Initially we receive a position value that corresponds to | ||
418 | * one more than the last pid shown (or 0 on the first call or | ||
419 | * after a seek to the start). Use a binary-search to find the | ||
420 | * next pid to display, if any | ||
421 | */ | ||
422 | struct kernfs_open_file *of = s->private; | ||
423 | struct cgroup *cgrp = seq_css(s)->cgroup; | ||
424 | struct cgroup_pidlist *l; | ||
425 | enum cgroup_filetype type = seq_cft(s)->private; | ||
426 | int index = 0, pid = *pos; | ||
427 | int *iter, ret; | ||
428 | |||
429 | mutex_lock(&cgrp->pidlist_mutex); | ||
430 | |||
431 | /* | ||
432 | * !NULL @of->priv indicates that this isn't the first start() | ||
433 | * after open. If the matching pidlist is around, we can use that. | ||
434 | * Look for it. Note that @of->priv can't be used directly. It | ||
435 | * could already have been destroyed. | ||
436 | */ | ||
437 | if (of->priv) | ||
438 | of->priv = cgroup_pidlist_find(cgrp, type); | ||
439 | |||
440 | /* | ||
441 | * Either this is the first start() after open or the matching | ||
442 | * pidlist has been destroyed inbetween. Create a new one. | ||
443 | */ | ||
444 | if (!of->priv) { | ||
445 | ret = pidlist_array_load(cgrp, type, | ||
446 | (struct cgroup_pidlist **)&of->priv); | ||
447 | if (ret) | ||
448 | return ERR_PTR(ret); | ||
449 | } | ||
450 | l = of->priv; | ||
451 | |||
452 | if (pid) { | ||
453 | int end = l->length; | ||
454 | |||
455 | while (index < end) { | ||
456 | int mid = (index + end) / 2; | ||
457 | if (l->list[mid] == pid) { | ||
458 | index = mid; | ||
459 | break; | ||
460 | } else if (l->list[mid] <= pid) | ||
461 | index = mid + 1; | ||
462 | else | ||
463 | end = mid; | ||
464 | } | ||
465 | } | ||
466 | /* If we're off the end of the array, we're done */ | ||
467 | if (index >= l->length) | ||
468 | return NULL; | ||
469 | /* Update the abstract position to be the actual pid that we found */ | ||
470 | iter = l->list + index; | ||
471 | *pos = *iter; | ||
472 | return iter; | ||
473 | } | ||
474 | |||
475 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) | ||
476 | { | ||
477 | struct kernfs_open_file *of = s->private; | ||
478 | struct cgroup_pidlist *l = of->priv; | ||
479 | |||
480 | if (l) | ||
481 | mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, | ||
482 | CGROUP_PIDLIST_DESTROY_DELAY); | ||
483 | mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); | ||
484 | } | ||
485 | |||
486 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | ||
487 | { | ||
488 | struct kernfs_open_file *of = s->private; | ||
489 | struct cgroup_pidlist *l = of->priv; | ||
490 | pid_t *p = v; | ||
491 | pid_t *end = l->list + l->length; | ||
492 | /* | ||
493 | * Advance to the next pid in the array. If this goes off the | ||
494 | * end, we're done | ||
495 | */ | ||
496 | p++; | ||
497 | if (p >= end) { | ||
498 | return NULL; | ||
499 | } else { | ||
500 | *pos = *p; | ||
501 | return p; | ||
502 | } | ||
503 | } | ||
504 | |||
505 | static int cgroup_pidlist_show(struct seq_file *s, void *v) | ||
506 | { | ||
507 | seq_printf(s, "%d\n", *(int *)v); | ||
508 | |||
509 | return 0; | ||
510 | } | ||
511 | |||
512 | static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, | ||
513 | char *buf, size_t nbytes, loff_t off) | ||
514 | { | ||
515 | return __cgroup_procs_write(of, buf, nbytes, off, false); | ||
516 | } | ||
517 | |||
518 | static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, | ||
519 | char *buf, size_t nbytes, loff_t off) | ||
520 | { | ||
521 | struct cgroup *cgrp; | ||
522 | |||
523 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); | ||
524 | |||
525 | cgrp = cgroup_kn_lock_live(of->kn, false); | ||
526 | if (!cgrp) | ||
527 | return -ENODEV; | ||
528 | spin_lock(&release_agent_path_lock); | ||
529 | strlcpy(cgrp->root->release_agent_path, strstrip(buf), | ||
530 | sizeof(cgrp->root->release_agent_path)); | ||
531 | spin_unlock(&release_agent_path_lock); | ||
532 | cgroup_kn_unlock(of->kn); | ||
533 | return nbytes; | ||
534 | } | ||
535 | |||
536 | static int cgroup_release_agent_show(struct seq_file *seq, void *v) | ||
537 | { | ||
538 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
539 | |||
540 | spin_lock(&release_agent_path_lock); | ||
541 | seq_puts(seq, cgrp->root->release_agent_path); | ||
542 | spin_unlock(&release_agent_path_lock); | ||
543 | seq_putc(seq, '\n'); | ||
544 | return 0; | ||
545 | } | ||
546 | |||
547 | static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) | ||
548 | { | ||
549 | seq_puts(seq, "0\n"); | ||
550 | return 0; | ||
551 | } | ||
552 | |||
553 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | ||
554 | struct cftype *cft) | ||
555 | { | ||
556 | return notify_on_release(css->cgroup); | ||
557 | } | ||
558 | |||
559 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, | ||
560 | struct cftype *cft, u64 val) | ||
561 | { | ||
562 | if (val) | ||
563 | set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); | ||
564 | else | ||
565 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); | ||
566 | return 0; | ||
567 | } | ||
568 | |||
569 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, | ||
570 | struct cftype *cft) | ||
571 | { | ||
572 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); | ||
573 | } | ||
574 | |||
575 | static int cgroup_clone_children_write(struct cgroup_subsys_state *css, | ||
576 | struct cftype *cft, u64 val) | ||
577 | { | ||
578 | if (val) | ||
579 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); | ||
580 | else | ||
581 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); | ||
582 | return 0; | ||
583 | } | ||
584 | |||
585 | /* cgroup core interface files for the legacy hierarchies */ | ||
586 | struct cftype cgroup_legacy_base_files[] = { | ||
587 | { | ||
588 | .name = "cgroup.procs", | ||
589 | .seq_start = cgroup_pidlist_start, | ||
590 | .seq_next = cgroup_pidlist_next, | ||
591 | .seq_stop = cgroup_pidlist_stop, | ||
592 | .seq_show = cgroup_pidlist_show, | ||
593 | .private = CGROUP_FILE_PROCS, | ||
594 | .write = cgroup_procs_write, | ||
595 | }, | ||
596 | { | ||
597 | .name = "cgroup.clone_children", | ||
598 | .read_u64 = cgroup_clone_children_read, | ||
599 | .write_u64 = cgroup_clone_children_write, | ||
600 | }, | ||
601 | { | ||
602 | .name = "cgroup.sane_behavior", | ||
603 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
604 | .seq_show = cgroup_sane_behavior_show, | ||
605 | }, | ||
606 | { | ||
607 | .name = "tasks", | ||
608 | .seq_start = cgroup_pidlist_start, | ||
609 | .seq_next = cgroup_pidlist_next, | ||
610 | .seq_stop = cgroup_pidlist_stop, | ||
611 | .seq_show = cgroup_pidlist_show, | ||
612 | .private = CGROUP_FILE_TASKS, | ||
613 | .write = cgroup_tasks_write, | ||
614 | }, | ||
615 | { | ||
616 | .name = "notify_on_release", | ||
617 | .read_u64 = cgroup_read_notify_on_release, | ||
618 | .write_u64 = cgroup_write_notify_on_release, | ||
619 | }, | ||
620 | { | ||
621 | .name = "release_agent", | ||
622 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
623 | .seq_show = cgroup_release_agent_show, | ||
624 | .write = cgroup_release_agent_write, | ||
625 | .max_write_len = PATH_MAX - 1, | ||
626 | }, | ||
627 | { } /* terminate */ | ||
628 | }; | ||
629 | |||
630 | /* Display information about each subsystem and each hierarchy */ | ||
631 | static int proc_cgroupstats_show(struct seq_file *m, void *v) | ||
632 | { | ||
633 | struct cgroup_subsys *ss; | ||
634 | int i; | ||
635 | |||
636 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); | ||
637 | /* | ||
638 | * ideally we don't want subsystems moving around while we do this. | ||
639 | * cgroup_mutex is also necessary to guarantee an atomic snapshot of | ||
640 | * subsys/hierarchy state. | ||
641 | */ | ||
642 | mutex_lock(&cgroup_mutex); | ||
643 | |||
644 | for_each_subsys(ss, i) | ||
645 | seq_printf(m, "%s\t%d\t%d\t%d\n", | ||
646 | ss->legacy_name, ss->root->hierarchy_id, | ||
647 | atomic_read(&ss->root->nr_cgrps), | ||
648 | cgroup_ssid_enabled(i)); | ||
649 | |||
650 | mutex_unlock(&cgroup_mutex); | ||
651 | return 0; | ||
652 | } | ||
653 | |||
654 | static int cgroupstats_open(struct inode *inode, struct file *file) | ||
655 | { | ||
656 | return single_open(file, proc_cgroupstats_show, NULL); | ||
657 | } | ||
658 | |||
659 | const struct file_operations proc_cgroupstats_operations = { | ||
660 | .open = cgroupstats_open, | ||
661 | .read = seq_read, | ||
662 | .llseek = seq_lseek, | ||
663 | .release = single_release, | ||
664 | }; | ||
665 | |||
666 | /** | ||
667 | * cgroupstats_build - build and fill cgroupstats | ||
668 | * @stats: cgroupstats to fill information into | ||
669 | * @dentry: A dentry entry belonging to the cgroup for which stats have | ||
670 | * been requested. | ||
671 | * | ||
672 | * Build and fill cgroupstats so that taskstats can export it to user | ||
673 | * space. | ||
674 | */ | ||
675 | int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | ||
676 | { | ||
677 | struct kernfs_node *kn = kernfs_node_from_dentry(dentry); | ||
678 | struct cgroup *cgrp; | ||
679 | struct css_task_iter it; | ||
680 | struct task_struct *tsk; | ||
681 | |||
682 | /* it should be kernfs_node belonging to cgroupfs and is a directory */ | ||
683 | if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || | ||
684 | kernfs_type(kn) != KERNFS_DIR) | ||
685 | return -EINVAL; | ||
686 | |||
687 | mutex_lock(&cgroup_mutex); | ||
688 | |||
689 | /* | ||
690 | * We aren't being called from kernfs and there's no guarantee on | ||
691 | * @kn->priv's validity. For this and css_tryget_online_from_dir(), | ||
692 | * @kn->priv is RCU safe. Let's do the RCU dancing. | ||
693 | */ | ||
694 | rcu_read_lock(); | ||
695 | cgrp = rcu_dereference(kn->priv); | ||
696 | if (!cgrp || cgroup_is_dead(cgrp)) { | ||
697 | rcu_read_unlock(); | ||
698 | mutex_unlock(&cgroup_mutex); | ||
699 | return -ENOENT; | ||
700 | } | ||
701 | rcu_read_unlock(); | ||
702 | |||
703 | css_task_iter_start(&cgrp->self, &it); | ||
704 | while ((tsk = css_task_iter_next(&it))) { | ||
705 | switch (tsk->state) { | ||
706 | case TASK_RUNNING: | ||
707 | stats->nr_running++; | ||
708 | break; | ||
709 | case TASK_INTERRUPTIBLE: | ||
710 | stats->nr_sleeping++; | ||
711 | break; | ||
712 | case TASK_UNINTERRUPTIBLE: | ||
713 | stats->nr_uninterruptible++; | ||
714 | break; | ||
715 | case TASK_STOPPED: | ||
716 | stats->nr_stopped++; | ||
717 | break; | ||
718 | default: | ||
719 | if (delayacct_is_task_waiting_on_io(tsk)) | ||
720 | stats->nr_io_wait++; | ||
721 | break; | ||
722 | } | ||
723 | } | ||
724 | css_task_iter_end(&it); | ||
725 | |||
726 | mutex_unlock(&cgroup_mutex); | ||
727 | return 0; | ||
728 | } | ||
729 | |||
730 | void check_for_release(struct cgroup *cgrp) | ||
731 | { | ||
732 | if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && | ||
733 | !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) | ||
734 | schedule_work(&cgrp->release_agent_work); | ||
735 | } | ||
736 | |||
737 | /* | ||
738 | * Notify userspace when a cgroup is released, by running the | ||
739 | * configured release agent with the name of the cgroup (path | ||
740 | * relative to the root of cgroup file system) as the argument. | ||
741 | * | ||
742 | * Most likely, this user command will try to rmdir this cgroup. | ||
743 | * | ||
744 | * This races with the possibility that some other task will be | ||
745 | * attached to this cgroup before it is removed, or that some other | ||
746 | * user task will 'mkdir' a child cgroup of this cgroup. That's ok. | ||
747 | * The presumed 'rmdir' will fail quietly if this cgroup is no longer | ||
748 | * unused, and this cgroup will be reprieved from its death sentence, | ||
749 | * to continue to serve a useful existence. Next time it's released, | ||
750 | * we will get notified again, if it still has 'notify_on_release' set. | ||
751 | * | ||
752 | * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which | ||
753 | * means only wait until the task is successfully execve()'d. The | ||
754 | * separate release agent task is forked by call_usermodehelper(), | ||
755 | * then control in this thread returns here, without waiting for the | ||
756 | * release agent task. We don't bother to wait because the caller of | ||
757 | * this routine has no use for the exit status of the release agent | ||
758 | * task, so no sense holding our caller up for that. | ||
759 | */ | ||
760 | void cgroup_release_agent(struct work_struct *work) | ||
761 | { | ||
762 | struct cgroup *cgrp = | ||
763 | container_of(work, struct cgroup, release_agent_work); | ||
764 | char *pathbuf = NULL, *agentbuf = NULL; | ||
765 | char *argv[3], *envp[3]; | ||
766 | int ret; | ||
767 | |||
768 | mutex_lock(&cgroup_mutex); | ||
769 | |||
770 | pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); | ||
771 | agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); | ||
772 | if (!pathbuf || !agentbuf) | ||
773 | goto out; | ||
774 | |||
775 | spin_lock_irq(&css_set_lock); | ||
776 | ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); | ||
777 | spin_unlock_irq(&css_set_lock); | ||
778 | if (ret < 0 || ret >= PATH_MAX) | ||
779 | goto out; | ||
780 | |||
781 | argv[0] = agentbuf; | ||
782 | argv[1] = pathbuf; | ||
783 | argv[2] = NULL; | ||
784 | |||
785 | /* minimal command environment */ | ||
786 | envp[0] = "HOME=/"; | ||
787 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
788 | envp[2] = NULL; | ||
789 | |||
790 | mutex_unlock(&cgroup_mutex); | ||
791 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | ||
792 | goto out_free; | ||
793 | out: | ||
794 | mutex_unlock(&cgroup_mutex); | ||
795 | out_free: | ||
796 | kfree(agentbuf); | ||
797 | kfree(pathbuf); | ||
798 | } | ||
799 | |||
800 | /* | ||
801 | * cgroup_rename - Only allow simple rename of directories in place. | ||
802 | */ | ||
803 | int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, | ||
804 | const char *new_name_str) | ||
805 | { | ||
806 | struct cgroup *cgrp = kn->priv; | ||
807 | int ret; | ||
808 | |||
809 | if (kernfs_type(kn) != KERNFS_DIR) | ||
810 | return -ENOTDIR; | ||
811 | if (kn->parent != new_parent) | ||
812 | return -EIO; | ||
813 | |||
814 | /* | ||
815 | * This isn't a proper migration and its usefulness is very | ||
816 | * limited. Disallow on the default hierarchy. | ||
817 | */ | ||
818 | if (cgroup_on_dfl(cgrp)) | ||
819 | return -EPERM; | ||
820 | |||
821 | /* | ||
822 | * We're gonna grab cgroup_mutex which nests outside kernfs | ||
823 | * active_ref. kernfs_rename() doesn't require active_ref | ||
824 | * protection. Break them before grabbing cgroup_mutex. | ||
825 | */ | ||
826 | kernfs_break_active_protection(new_parent); | ||
827 | kernfs_break_active_protection(kn); | ||
828 | |||
829 | mutex_lock(&cgroup_mutex); | ||
830 | |||
831 | ret = kernfs_rename(kn, new_parent, new_name_str); | ||
832 | if (!ret) | ||
833 | trace_cgroup_rename(cgrp); | ||
834 | |||
835 | mutex_unlock(&cgroup_mutex); | ||
836 | |||
837 | kernfs_unbreak_active_protection(kn); | ||
838 | kernfs_unbreak_active_protection(new_parent); | ||
839 | return ret; | ||
840 | } | ||
841 | |||
842 | static int __init cgroup1_wq_init(void) | ||
843 | { | ||
844 | /* | ||
845 | * Used to destroy pidlists and separate to serve as flush domain. | ||
846 | * Cap @max_active to 1 too. | ||
847 | */ | ||
848 | cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", | ||
849 | 0, 1); | ||
850 | BUG_ON(!cgroup_pidlist_destroy_wq); | ||
851 | return 0; | ||
852 | } | ||
853 | core_initcall(cgroup1_wq_init); | ||
854 | |||
855 | static int __init cgroup_no_v1(char *str) | ||
856 | { | ||
857 | struct cgroup_subsys *ss; | ||
858 | char *token; | ||
859 | int i; | ||
860 | |||
861 | while ((token = strsep(&str, ",")) != NULL) { | ||
862 | if (!*token) | ||
863 | continue; | ||
864 | |||
865 | if (!strcmp(token, "all")) { | ||
866 | cgroup_no_v1_mask = U16_MAX; | ||
867 | break; | ||
868 | } | ||
869 | |||
870 | for_each_subsys(ss, i) { | ||
871 | if (strcmp(token, ss->name) && | ||
872 | strcmp(token, ss->legacy_name)) | ||
873 | continue; | ||
874 | |||
875 | cgroup_no_v1_mask |= 1 << i; | ||
876 | } | ||
877 | } | ||
878 | return 1; | ||
879 | } | ||
880 | __setup("cgroup_no_v1=", cgroup_no_v1); | ||
881 | |||
882 | |||
883 | #ifdef CONFIG_CGROUP_DEBUG | ||
884 | static struct cgroup_subsys_state * | ||
885 | debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||
886 | { | ||
887 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | ||
888 | |||
889 | if (!css) | ||
890 | return ERR_PTR(-ENOMEM); | ||
891 | |||
892 | return css; | ||
893 | } | ||
894 | |||
895 | static void debug_css_free(struct cgroup_subsys_state *css) | ||
896 | { | ||
897 | kfree(css); | ||
898 | } | ||
899 | |||
900 | static u64 debug_taskcount_read(struct cgroup_subsys_state *css, | ||
901 | struct cftype *cft) | ||
902 | { | ||
903 | return cgroup_task_count(css->cgroup); | ||
904 | } | ||
905 | |||
906 | static u64 current_css_set_read(struct cgroup_subsys_state *css, | ||
907 | struct cftype *cft) | ||
908 | { | ||
909 | return (u64)(unsigned long)current->cgroups; | ||
910 | } | ||
911 | |||
912 | static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, | ||
913 | struct cftype *cft) | ||
914 | { | ||
915 | u64 count; | ||
916 | |||
917 | rcu_read_lock(); | ||
918 | count = atomic_read(&task_css_set(current)->refcount); | ||
919 | rcu_read_unlock(); | ||
920 | return count; | ||
921 | } | ||
922 | |||
923 | static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | ||
924 | { | ||
925 | struct cgrp_cset_link *link; | ||
926 | struct css_set *cset; | ||
927 | char *name_buf; | ||
928 | |||
929 | name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
930 | if (!name_buf) | ||
931 | return -ENOMEM; | ||
932 | |||
933 | spin_lock_irq(&css_set_lock); | ||
934 | rcu_read_lock(); | ||
935 | cset = rcu_dereference(current->cgroups); | ||
936 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { | ||
937 | struct cgroup *c = link->cgrp; | ||
938 | |||
939 | cgroup_name(c, name_buf, NAME_MAX + 1); | ||
940 | seq_printf(seq, "Root %d group %s\n", | ||
941 | c->root->hierarchy_id, name_buf); | ||
942 | } | ||
943 | rcu_read_unlock(); | ||
944 | spin_unlock_irq(&css_set_lock); | ||
945 | kfree(name_buf); | ||
946 | return 0; | ||
947 | } | ||
948 | |||
949 | #define MAX_TASKS_SHOWN_PER_CSS 25 | ||
950 | static int cgroup_css_links_read(struct seq_file *seq, void *v) | ||
951 | { | ||
952 | struct cgroup_subsys_state *css = seq_css(seq); | ||
953 | struct cgrp_cset_link *link; | ||
954 | |||
955 | spin_lock_irq(&css_set_lock); | ||
956 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { | ||
957 | struct css_set *cset = link->cset; | ||
958 | struct task_struct *task; | ||
959 | int count = 0; | ||
960 | |||
961 | seq_printf(seq, "css_set %p\n", cset); | ||
962 | |||
963 | list_for_each_entry(task, &cset->tasks, cg_list) { | ||
964 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) | ||
965 | goto overflow; | ||
966 | seq_printf(seq, " task %d\n", task_pid_vnr(task)); | ||
967 | } | ||
968 | |||
969 | list_for_each_entry(task, &cset->mg_tasks, cg_list) { | ||
970 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) | ||
971 | goto overflow; | ||
972 | seq_printf(seq, " task %d\n", task_pid_vnr(task)); | ||
973 | } | ||
974 | continue; | ||
975 | overflow: | ||
976 | seq_puts(seq, " ...\n"); | ||
977 | } | ||
978 | spin_unlock_irq(&css_set_lock); | ||
979 | return 0; | ||
980 | } | ||
981 | |||
982 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) | ||
983 | { | ||
984 | return (!cgroup_is_populated(css->cgroup) && | ||
985 | !css_has_online_children(&css->cgroup->self)); | ||
986 | } | ||
987 | |||
988 | static struct cftype debug_files[] = { | ||
989 | { | ||
990 | .name = "taskcount", | ||
991 | .read_u64 = debug_taskcount_read, | ||
992 | }, | ||
993 | |||
994 | { | ||
995 | .name = "current_css_set", | ||
996 | .read_u64 = current_css_set_read, | ||
997 | }, | ||
998 | |||
999 | { | ||
1000 | .name = "current_css_set_refcount", | ||
1001 | .read_u64 = current_css_set_refcount_read, | ||
1002 | }, | ||
1003 | |||
1004 | { | ||
1005 | .name = "current_css_set_cg_links", | ||
1006 | .seq_show = current_css_set_cg_links_read, | ||
1007 | }, | ||
1008 | |||
1009 | { | ||
1010 | .name = "cgroup_css_links", | ||
1011 | .seq_show = cgroup_css_links_read, | ||
1012 | }, | ||
1013 | |||
1014 | { | ||
1015 | .name = "releasable", | ||
1016 | .read_u64 = releasable_read, | ||
1017 | }, | ||
1018 | |||
1019 | { } /* terminate */ | ||
1020 | }; | ||
1021 | |||
1022 | struct cgroup_subsys debug_cgrp_subsys = { | ||
1023 | .css_alloc = debug_css_alloc, | ||
1024 | .css_free = debug_css_free, | ||
1025 | .legacy_cftypes = debug_files, | ||
1026 | }; | ||
1027 | #endif /* CONFIG_CGROUP_DEBUG */ | ||
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1a815f275849..d34c170f87ef 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,15 +28,14 @@
28 | 28 | ||
29 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 29 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
30 | 30 | ||
31 | #include <linux/cgroup.h> | 31 | #include "cgroup-internal.h" |
32 | |||
32 | #include <linux/cred.h> | 33 | #include <linux/cred.h> |
33 | #include <linux/ctype.h> | 34 | #include <linux/ctype.h> |
34 | #include <linux/errno.h> | 35 | #include <linux/errno.h> |
35 | #include <linux/init_task.h> | 36 | #include <linux/init_task.h> |
36 | #include <linux/kernel.h> | 37 | #include <linux/kernel.h> |
37 | #include <linux/list.h> | ||
38 | #include <linux/magic.h> | 38 | #include <linux/magic.h> |
39 | #include <linux/mm.h> | ||
40 | #include <linux/mutex.h> | 39 | #include <linux/mutex.h> |
41 | #include <linux/mount.h> | 40 | #include <linux/mount.h> |
42 | #include <linux/pagemap.h> | 41 | #include <linux/pagemap.h> |
@@ -47,14 +46,8 @@ | |||
47 | #include <linux/spinlock.h> | 46 | #include <linux/spinlock.h> |
48 | #include <linux/percpu-rwsem.h> | 47 | #include <linux/percpu-rwsem.h> |
49 | #include <linux/string.h> | 48 | #include <linux/string.h> |
50 | #include <linux/sort.h> | ||
51 | #include <linux/kmod.h> | ||
52 | #include <linux/delayacct.h> | ||
53 | #include <linux/cgroupstats.h> | ||
54 | #include <linux/hashtable.h> | 49 | #include <linux/hashtable.h> |
55 | #include <linux/pid_namespace.h> | ||
56 | #include <linux/idr.h> | 50 | #include <linux/idr.h> |
57 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | ||
58 | #include <linux/kthread.h> | 51 | #include <linux/kthread.h> |
59 | #include <linux/delay.h> | 52 | #include <linux/delay.h> |
60 | #include <linux/atomic.h> | 53 | #include <linux/atomic.h> |
@@ -67,14 +60,6 @@ | |||
67 | #define CREATE_TRACE_POINTS | 60 | #define CREATE_TRACE_POINTS |
68 | #include <trace/events/cgroup.h> | 61 | #include <trace/events/cgroup.h> |
69 | 62 | ||
70 | /* | ||
71 | * pidlists linger the following amount before being destroyed. The goal | ||
72 | * is avoiding frequent destruction in the middle of consecutive read calls | ||
73 | * Expiring in the middle is a performance problem not a correctness one. | ||
74 | * 1 sec should be enough. | ||
75 | */ | ||
76 | #define CGROUP_PIDLIST_DESTROY_DELAY HZ | ||
77 | |||
78 | #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ | 63 | #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ |
79 | MAX_CFTYPE_NAME + 2) | 64 | MAX_CFTYPE_NAME + 2) |
80 | 65 | ||
@@ -88,14 +73,12 @@ | |||
88 | * These locks are exported if CONFIG_PROVE_RCU so that accessors in | 73 | * These locks are exported if CONFIG_PROVE_RCU so that accessors in |
89 | * cgroup.h can use them for lockdep annotations. | 74 | * cgroup.h can use them for lockdep annotations. |
90 | */ | 75 | */ |
91 | #ifdef CONFIG_PROVE_RCU | ||
92 | DEFINE_MUTEX(cgroup_mutex); | 76 | DEFINE_MUTEX(cgroup_mutex); |
93 | DEFINE_SPINLOCK(css_set_lock); | 77 | DEFINE_SPINLOCK(css_set_lock); |
78 | |||
79 | #ifdef CONFIG_PROVE_RCU | ||
94 | EXPORT_SYMBOL_GPL(cgroup_mutex); | 80 | EXPORT_SYMBOL_GPL(cgroup_mutex); |
95 | EXPORT_SYMBOL_GPL(css_set_lock); | 81 | EXPORT_SYMBOL_GPL(css_set_lock); |
96 | #else | ||
97 | static DEFINE_MUTEX(cgroup_mutex); | ||
98 | static DEFINE_SPINLOCK(css_set_lock); | ||
99 | #endif | 82 | #endif |
100 | 83 | ||
101 | /* | 84 | /* |
@@ -110,12 +93,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); | |||
110 | */ | 93 | */ |
111 | static DEFINE_SPINLOCK(cgroup_file_kn_lock); | 94 | static DEFINE_SPINLOCK(cgroup_file_kn_lock); |
112 | 95 | ||
113 | /* | ||
114 | * Protects cgroup_subsys->release_agent_path. Modifying it also requires | ||
115 | * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. | ||
116 | */ | ||
117 | static DEFINE_SPINLOCK(release_agent_path_lock); | ||
118 | |||
119 | struct percpu_rw_semaphore cgroup_threadgroup_rwsem; | 96 | struct percpu_rw_semaphore cgroup_threadgroup_rwsem; |
120 | 97 | ||
121 | #define cgroup_assert_mutex_or_rcu_locked() \ | 98 | #define cgroup_assert_mutex_or_rcu_locked() \ |
@@ -131,15 +108,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem; | |||
131 | */ | 108 | */ |
132 | static struct workqueue_struct *cgroup_destroy_wq; | 109 | static struct workqueue_struct *cgroup_destroy_wq; |
133 | 110 | ||
134 | /* | ||
135 | * pidlist destructions need to be flushed on cgroup destruction. Use a | ||
136 | * separate workqueue as flush domain. | ||
137 | */ | ||
138 | static struct workqueue_struct *cgroup_pidlist_destroy_wq; | ||
139 | |||
140 | /* generate an array of cgroup subsystem pointers */ | 111 | /* generate an array of cgroup subsystem pointers */ |
141 | #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, | 112 | #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, |
142 | static struct cgroup_subsys *cgroup_subsys[] = { | 113 | struct cgroup_subsys *cgroup_subsys[] = { |
143 | #include <linux/cgroup_subsys.h> | 114 | #include <linux/cgroup_subsys.h> |
144 | }; | 115 | }; |
145 | #undef SUBSYS | 116 | #undef SUBSYS |
@@ -186,9 +157,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); | |||
186 | */ | 157 | */ |
187 | static bool cgrp_dfl_visible; | 158 | static bool cgrp_dfl_visible; |
188 | 159 | ||
189 | /* Controllers blocked by the commandline in v1 */ | ||
190 | static u16 cgroup_no_v1_mask; | ||
191 | |||
192 | /* some controllers are not supported in the default hierarchy */ | 160 | /* some controllers are not supported in the default hierarchy */ |
193 | static u16 cgrp_dfl_inhibit_ss_mask; | 161 | static u16 cgrp_dfl_inhibit_ss_mask; |
194 | 162 | ||
@@ -196,8 +164,7 @@ static u16 cgrp_dfl_inhibit_ss_mask; | |||
196 | static unsigned long cgrp_dfl_implicit_ss_mask; | 164 | static unsigned long cgrp_dfl_implicit_ss_mask; |
197 | 165 | ||
198 | /* The list of hierarchy roots */ | 166 | /* The list of hierarchy roots */ |
199 | 167 | LIST_HEAD(cgroup_roots); | |
200 | static LIST_HEAD(cgroup_roots); | ||
201 | static int cgroup_root_count; | 168 | static int cgroup_root_count; |
202 | 169 | ||
203 | /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ | 170 | /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ |
@@ -235,10 +202,7 @@ static u16 have_canfork_callback __read_mostly; | |||
235 | 202 | ||
236 | static struct file_system_type cgroup2_fs_type; | 203 | static struct file_system_type cgroup2_fs_type; |
237 | static struct cftype cgroup_dfl_base_files[]; | 204 | static struct cftype cgroup_dfl_base_files[]; |
238 | static struct cftype cgroup_legacy_base_files[]; | ||
239 | 205 | ||
240 | static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); | ||
241 | static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); | ||
242 | static int cgroup_apply_control(struct cgroup *cgrp); | 206 | static int cgroup_apply_control(struct cgroup *cgrp); |
243 | static void cgroup_finalize_control(struct cgroup *cgrp, int ret); | 207 | static void cgroup_finalize_control(struct cgroup *cgrp, int ret); |
244 | static void css_task_iter_advance(struct css_task_iter *it); | 208 | static void css_task_iter_advance(struct css_task_iter *it); |
@@ -259,7 +223,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, | |||
259 | * is fine for individual subsystems but unsuitable for cgroup core. This | 223 | * is fine for individual subsystems but unsuitable for cgroup core. This |
260 | * is slower static_key_enabled() based test indexed by @ssid. | 224 | * is slower static_key_enabled() based test indexed by @ssid. |
261 | */ | 225 | */ |
262 | static bool cgroup_ssid_enabled(int ssid) | 226 | bool cgroup_ssid_enabled(int ssid) |
263 | { | 227 | { |
264 | if (CGROUP_SUBSYS_COUNT == 0) | 228 | if (CGROUP_SUBSYS_COUNT == 0) |
265 | return false; | 229 | return false; |
@@ -267,11 +231,6 @@ static bool cgroup_ssid_enabled(int ssid) | |||
267 | return static_key_enabled(cgroup_subsys_enabled_key[ssid]); | 231 | return static_key_enabled(cgroup_subsys_enabled_key[ssid]); |
268 | } | 232 | } |
269 | 233 | ||
270 | static bool cgroup_ssid_no_v1(int ssid) | ||
271 | { | ||
272 | return cgroup_no_v1_mask & (1 << ssid); | ||
273 | } | ||
274 | |||
275 | /** | 234 | /** |
276 | * cgroup_on_dfl - test whether a cgroup is on the default hierarchy | 235 | * cgroup_on_dfl - test whether a cgroup is on the default hierarchy |
277 | * @cgrp: the cgroup of interest | 236 | * @cgrp: the cgroup of interest |
@@ -325,7 +284,7 @@ static bool cgroup_ssid_no_v1(int ssid) | |||
325 | * | 284 | * |
326 | * - debug: disallowed on the default hierarchy. | 285 | * - debug: disallowed on the default hierarchy. |
327 | */ | 286 | */ |
328 | static bool cgroup_on_dfl(const struct cgroup *cgrp) | 287 | bool cgroup_on_dfl(const struct cgroup *cgrp) |
329 | { | 288 | { |
330 | return cgrp->root == &cgrp_dfl_root; | 289 | return cgrp->root == &cgrp_dfl_root; |
331 | } | 290 | } |
@@ -481,12 +440,6 @@ out_unlock: | |||
481 | return css; | 440 | return css; |
482 | } | 441 | } |
483 | 442 | ||
484 | /* convenient tests for these bits */ | ||
485 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) | ||
486 | { | ||
487 | return !(cgrp->self.flags & CSS_ONLINE); | ||
488 | } | ||
489 | |||
490 | static void cgroup_get(struct cgroup *cgrp) | 443 | static void cgroup_get(struct cgroup *cgrp) |
491 | { | 444 | { |
492 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); | 445 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); |
@@ -518,11 +471,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) | |||
518 | } | 471 | } |
519 | EXPORT_SYMBOL_GPL(of_css); | 472 | EXPORT_SYMBOL_GPL(of_css); |
520 | 473 | ||
521 | static int notify_on_release(const struct cgroup *cgrp) | ||
522 | { | ||
523 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | ||
524 | } | ||
525 | |||
526 | /** | 474 | /** |
527 | * for_each_css - iterate all css's of a cgroup | 475 | * for_each_css - iterate all css's of a cgroup |
528 | * @css: the iteration cursor | 476 | * @css: the iteration cursor |
@@ -553,15 +501,6 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
553 | else | 501 | else |
554 | 502 | ||
555 | /** | 503 | /** |
556 | * for_each_subsys - iterate all enabled cgroup subsystems | ||
557 | * @ss: the iteration cursor | ||
558 | * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end | ||
559 | */ | ||
560 | #define for_each_subsys(ss, ssid) \ | ||
561 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ | ||
562 | (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) | ||
563 | |||
564 | /** | ||
565 | * do_each_subsys_mask - filter for_each_subsys with a bitmask | 504 | * do_each_subsys_mask - filter for_each_subsys with a bitmask |
566 | * @ss: the iteration cursor | 505 | * @ss: the iteration cursor |
567 | * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end | 506 | * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end |
@@ -585,10 +524,6 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
585 | } \ | 524 | } \ |
586 | } while (false) | 525 | } while (false) |
587 | 526 | ||
588 | /* iterate across the hierarchies */ | ||
589 | #define for_each_root(root) \ | ||
590 | list_for_each_entry((root), &cgroup_roots, root_list) | ||
591 | |||
592 | /* iterate over child cgrps, lock should be held throughout iteration */ | 527 | /* iterate over child cgrps, lock should be held throughout iteration */ |
593 | #define cgroup_for_each_live_child(child, cgrp) \ | 528 | #define cgroup_for_each_live_child(child, cgrp) \ |
594 | list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ | 529 | list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ |
@@ -615,29 +550,6 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
615 | ; \ | 550 | ; \ |
616 | else | 551 | else |
617 | 552 | ||
618 | static void cgroup_release_agent(struct work_struct *work); | ||
619 | static void check_for_release(struct cgroup *cgrp); | ||
620 | |||
621 | /* | ||
622 | * A cgroup can be associated with multiple css_sets as different tasks may | ||
623 | * belong to different cgroups on different hierarchies. In the other | ||
624 | * direction, a css_set is naturally associated with multiple cgroups. | ||
625 | * This M:N relationship is represented by the following link structure | ||
626 | * which exists for each association and allows traversing the associations | ||
627 | * from both sides. | ||
628 | */ | ||
629 | struct cgrp_cset_link { | ||
630 | /* the cgroup and css_set this link associates */ | ||
631 | struct cgroup *cgrp; | ||
632 | struct css_set *cset; | ||
633 | |||
634 | /* list of cgrp_cset_links anchored at cgrp->cset_links */ | ||
635 | struct list_head cset_link; | ||
636 | |||
637 | /* list of cgrp_cset_links anchored at css_set->cgrp_links */ | ||
638 | struct list_head cgrp_link; | ||
639 | }; | ||
640 | |||
641 | /* | 553 | /* |
642 | * The default css_set - used by init and its children prior to any | 554 | * The default css_set - used by init and its children prior to any |
643 | * hierarchies being mounted. It contains a pointer to the root state | 555 | * hierarchies being mounted. It contains a pointer to the root state |
@@ -1138,7 +1050,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
1138 | return cset; | 1050 | return cset; |
1139 | } | 1051 | } |
1140 | 1052 | ||
1141 | static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) | 1053 | struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) |
1142 | { | 1054 | { |
1143 | struct cgroup *root_cgrp = kf_root->kn->priv; | 1055 | struct cgroup *root_cgrp = kf_root->kn->priv; |
1144 | 1056 | ||
@@ -1283,8 +1195,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, | |||
1283 | * Return the cgroup for "task" from the given hierarchy. Must be | 1195 | * Return the cgroup for "task" from the given hierarchy. Must be |
1284 | * called with cgroup_mutex and css_set_lock held. | 1196 | * called with cgroup_mutex and css_set_lock held. |
1285 | */ | 1197 | */ |
1286 | static struct cgroup *task_cgroup_from_root(struct task_struct *task, | 1198 | struct cgroup *task_cgroup_from_root(struct task_struct *task, |
1287 | struct cgroup_root *root) | 1199 | struct cgroup_root *root) |
1288 | { | 1200 | { |
1289 | /* | 1201 | /* |
1290 | * No need to lock the task - since we hold cgroup_mutex the | 1202 | * No need to lock the task - since we hold cgroup_mutex the |
@@ -1321,7 +1233,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
1321 | */ | 1233 | */ |
1322 | 1234 | ||
1323 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; | 1235 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; |
1324 | static const struct file_operations proc_cgroupstats_operations; | ||
1325 | 1236 | ||
1326 | static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, | 1237 | static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, |
1327 | char *buf) | 1238 | char *buf) |
@@ -1415,7 +1326,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) | |||
1415 | * inaccessible any time. If the caller intends to continue to access the | 1326 | * inaccessible any time. If the caller intends to continue to access the |
1416 | * cgroup, it should pin it before invoking this function. | 1327 | * cgroup, it should pin it before invoking this function. |
1417 | */ | 1328 | */ |
1418 | static void cgroup_kn_unlock(struct kernfs_node *kn) | 1329 | void cgroup_kn_unlock(struct kernfs_node *kn) |
1419 | { | 1330 | { |
1420 | struct cgroup *cgrp; | 1331 | struct cgroup *cgrp; |
1421 | 1332 | ||
@@ -1447,8 +1358,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) | |||
1447 | * locking under kernfs active protection and allows all kernfs operations | 1358 | * locking under kernfs active protection and allows all kernfs operations |
1448 | * including self-removal. | 1359 | * including self-removal. |
1449 | */ | 1360 | */ |
1450 | static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, | 1361 | struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) |
1451 | bool drain_offline) | ||
1452 | { | 1362 | { |
1453 | struct cgroup *cgrp; | 1363 | struct cgroup *cgrp; |
1454 | 1364 | ||
@@ -1559,7 +1469,7 @@ err: | |||
1559 | return ret; | 1469 | return ret; |
1560 | } | 1470 | } |
1561 | 1471 | ||
1562 | static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) | 1472 | int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) |
1563 | { | 1473 | { |
1564 | struct cgroup *dcgrp = &dst_root->cgrp; | 1474 | struct cgroup *dcgrp = &dst_root->cgrp; |
1565 | struct cgroup_subsys *ss; | 1475 | struct cgroup_subsys *ss; |
@@ -1656,8 +1566,7 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, | |||
1656 | return len; | 1566 | return len; |
1657 | } | 1567 | } |
1658 | 1568 | ||
1659 | static int cgroup_show_options(struct seq_file *seq, | 1569 | static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) |
1660 | struct kernfs_root *kf_root) | ||
1661 | { | 1570 | { |
1662 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); | 1571 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); |
1663 | struct cgroup_subsys *ss; | 1572 | struct cgroup_subsys *ss; |
@@ -2311,7 +2220,7 @@ static void cgroup_kill_sb(struct super_block *sb) | |||
2311 | kernfs_kill_sb(sb); | 2220 | kernfs_kill_sb(sb); |
2312 | } | 2221 | } |
2313 | 2222 | ||
2314 | static struct file_system_type cgroup_fs_type = { | 2223 | struct file_system_type cgroup_fs_type = { |
2315 | .name = "cgroup", | 2224 | .name = "cgroup", |
2316 | .mount = cgroup_mount, | 2225 | .mount = cgroup_mount, |
2317 | .kill_sb = cgroup_kill_sb, | 2226 | .kill_sb = cgroup_kill_sb, |
@@ -2325,8 +2234,8 @@ static struct file_system_type cgroup2_fs_type = { | |||
2325 | .fs_flags = FS_USERNS_MOUNT, | 2234 | .fs_flags = FS_USERNS_MOUNT, |
2326 | }; | 2235 | }; |
2327 | 2236 | ||
2328 | static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, | 2237 | int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, |
2329 | struct cgroup_namespace *ns) | 2238 | struct cgroup_namespace *ns) |
2330 | { | 2239 | { |
2331 | struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); | 2240 | struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); |
2332 | 2241 | ||
@@ -2616,7 +2525,7 @@ out_release_tset: | |||
2616 | * zero for migration destination cgroups with tasks so that child cgroups | 2525 | * zero for migration destination cgroups with tasks so that child cgroups |
2617 | * don't compete against tasks. | 2526 | * don't compete against tasks. |
2618 | */ | 2527 | */ |
2619 | static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) | 2528 | bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) |
2620 | { | 2529 | { |
2621 | return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || | 2530 | return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || |
2622 | !dst_cgrp->subtree_control; | 2531 | !dst_cgrp->subtree_control; |
@@ -2629,7 +2538,7 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) | |||
2629 | * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See | 2538 | * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See |
2630 | * those functions for details. | 2539 | * those functions for details. |
2631 | */ | 2540 | */ |
2632 | static void cgroup_migrate_finish(struct list_head *preloaded_csets) | 2541 | void cgroup_migrate_finish(struct list_head *preloaded_csets) |
2633 | { | 2542 | { |
2634 | struct css_set *cset, *tmp_cset; | 2543 | struct css_set *cset, *tmp_cset; |
2635 | 2544 | ||
@@ -2662,9 +2571,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) | |||
2662 | * into play and the preloaded css_sets are guaranteed to cover all | 2571 | * into play and the preloaded css_sets are guaranteed to cover all |
2663 | * migrations. | 2572 | * migrations. |
2664 | */ | 2573 | */ |
2665 | static void cgroup_migrate_add_src(struct css_set *src_cset, | 2574 | void cgroup_migrate_add_src(struct css_set *src_cset, |
2666 | struct cgroup *dst_cgrp, | 2575 | struct cgroup *dst_cgrp, |
2667 | struct list_head *preloaded_csets) | 2576 | struct list_head *preloaded_csets) |
2668 | { | 2577 | { |
2669 | struct cgroup *src_cgrp; | 2578 | struct cgroup *src_cgrp; |
2670 | 2579 | ||
@@ -2709,7 +2618,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, | |||
2709 | * using cgroup_migrate(), cgroup_migrate_finish() must be called on | 2618 | * using cgroup_migrate(), cgroup_migrate_finish() must be called on |
2710 | * @preloaded_csets. | 2619 | * @preloaded_csets. |
2711 | */ | 2620 | */ |
2712 | static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) | 2621 | int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) |
2713 | { | 2622 | { |
2714 | LIST_HEAD(csets); | 2623 | LIST_HEAD(csets); |
2715 | struct css_set *src_cset, *tmp_cset; | 2624 | struct css_set *src_cset, *tmp_cset; |
@@ -2773,8 +2682,8 @@ err: | |||
2773 | * decided for all targets by invoking cgroup_migrate_prepare_dst() before | 2682 | * decided for all targets by invoking cgroup_migrate_prepare_dst() before |
2774 | * actually starting to migrate. | 2683 | * actually starting to migrate. |
2775 | */ | 2684 | */ |
2776 | static int cgroup_migrate(struct task_struct *leader, bool threadgroup, | 2685 | int cgroup_migrate(struct task_struct *leader, bool threadgroup, |
2777 | struct cgroup_root *root) | 2686 | struct cgroup_root *root) |
2778 | { | 2687 | { |
2779 | struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); | 2688 | struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); |
2780 | struct task_struct *task; | 2689 | struct task_struct *task; |
@@ -2806,8 +2715,8 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, | |||
2806 | * | 2715 | * |
2807 | * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. | 2716 | * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. |
2808 | */ | 2717 | */ |
2809 | static int cgroup_attach_task(struct cgroup *dst_cgrp, | 2718 | int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, |
2810 | struct task_struct *leader, bool threadgroup) | 2719 | bool threadgroup) |
2811 | { | 2720 | { |
2812 | LIST_HEAD(preloaded_csets); | 2721 | LIST_HEAD(preloaded_csets); |
2813 | struct task_struct *task; | 2722 | struct task_struct *task; |
@@ -2888,8 +2797,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, | |||
2888 | * function to attach either it or all tasks in its threadgroup. Will lock | 2797 | * function to attach either it or all tasks in its threadgroup. Will lock |
2889 | * cgroup_mutex and threadgroup. | 2798 | * cgroup_mutex and threadgroup. |
2890 | */ | 2799 | */ |
2891 | static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, | 2800 | ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, |
2892 | size_t nbytes, loff_t off, bool threadgroup) | 2801 | size_t nbytes, loff_t off, bool threadgroup) |
2893 | { | 2802 | { |
2894 | struct task_struct *tsk; | 2803 | struct task_struct *tsk; |
2895 | struct cgroup_subsys *ss; | 2804 | struct cgroup_subsys *ss; |
@@ -2950,86 +2859,12 @@ out_unlock_threadgroup: | |||
2950 | return ret ?: nbytes; | 2859 | return ret ?: nbytes; |
2951 | } | 2860 | } |
2952 | 2861 | ||
2953 | /** | 2862 | ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, |
2954 | * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' | 2863 | loff_t off) |
2955 | * @from: attach to all cgroups of a given task | ||
2956 | * @tsk: the task to be attached | ||
2957 | */ | ||
2958 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | ||
2959 | { | ||
2960 | struct cgroup_root *root; | ||
2961 | int retval = 0; | ||
2962 | |||
2963 | mutex_lock(&cgroup_mutex); | ||
2964 | percpu_down_write(&cgroup_threadgroup_rwsem); | ||
2965 | for_each_root(root) { | ||
2966 | struct cgroup *from_cgrp; | ||
2967 | |||
2968 | if (root == &cgrp_dfl_root) | ||
2969 | continue; | ||
2970 | |||
2971 | spin_lock_irq(&css_set_lock); | ||
2972 | from_cgrp = task_cgroup_from_root(from, root); | ||
2973 | spin_unlock_irq(&css_set_lock); | ||
2974 | |||
2975 | retval = cgroup_attach_task(from_cgrp, tsk, false); | ||
2976 | if (retval) | ||
2977 | break; | ||
2978 | } | ||
2979 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
2980 | mutex_unlock(&cgroup_mutex); | ||
2981 | |||
2982 | return retval; | ||
2983 | } | ||
2984 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | ||
2985 | |||
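A minimal sketch of how an in-kernel caller might use the helper above; example_adopt_worker() is hypothetical, but the pattern (mirroring one task's cgroup memberships onto a kernel-created worker) is essentially what users such as vhost rely on:

	/* Hypothetical helper: place a freshly created kthread into every
	 * cgroup, on every hierarchy, that the calling task belongs to. */
	static int example_adopt_worker(struct task_struct *worker)
	{
		return cgroup_attach_task_all(current, worker);
	}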
2986 | static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, | ||
2987 | char *buf, size_t nbytes, loff_t off) | ||
2988 | { | ||
2989 | return __cgroup_procs_write(of, buf, nbytes, off, false); | ||
2990 | } | ||
2991 | |||
2992 | static ssize_t cgroup_procs_write(struct kernfs_open_file *of, | ||
2993 | char *buf, size_t nbytes, loff_t off) | ||
2994 | { | 2864 | { |
2995 | return __cgroup_procs_write(of, buf, nbytes, off, true); | 2865 | return __cgroup_procs_write(of, buf, nbytes, off, true); |
2996 | } | 2866 | } |
2997 | 2867 | ||
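Note the threadgroup argument: cgroup_procs_write() passes true, so writing a PID to "cgroup.procs" migrates the entire thread group, while the legacy "tasks" handler (cgroup_tasks_write(), now moved to cgroup-v1.c) passes false and moves only the named thread. For example, echoing a process's PID into cgroup.procs drags all of its threads along; echoing a single TID into tasks does not.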
2998 | static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, | ||
2999 | char *buf, size_t nbytes, loff_t off) | ||
3000 | { | ||
3001 | struct cgroup *cgrp; | ||
3002 | |||
3003 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); | ||
3004 | |||
3005 | cgrp = cgroup_kn_lock_live(of->kn, false); | ||
3006 | if (!cgrp) | ||
3007 | return -ENODEV; | ||
3008 | spin_lock(&release_agent_path_lock); | ||
3009 | strlcpy(cgrp->root->release_agent_path, strstrip(buf), | ||
3010 | sizeof(cgrp->root->release_agent_path)); | ||
3011 | spin_unlock(&release_agent_path_lock); | ||
3012 | cgroup_kn_unlock(of->kn); | ||
3013 | return nbytes; | ||
3014 | } | ||
3015 | |||
3016 | static int cgroup_release_agent_show(struct seq_file *seq, void *v) | ||
3017 | { | ||
3018 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
3019 | |||
3020 | spin_lock(&release_agent_path_lock); | ||
3021 | seq_puts(seq, cgrp->root->release_agent_path); | ||
3022 | spin_unlock(&release_agent_path_lock); | ||
3023 | seq_putc(seq, '\n'); | ||
3024 | return 0; | ||
3025 | } | ||
3026 | |||
3027 | static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) | ||
3028 | { | ||
3029 | seq_puts(seq, "0\n"); | ||
3030 | return 0; | ||
3031 | } | ||
3032 | |||
3033 | static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) | 2868 | static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) |
3034 | { | 2869 | { |
3035 | struct cgroup_subsys *ss; | 2870 | struct cgroup_subsys *ss; |
@@ -3131,7 +2966,7 @@ out_finish: | |||
3131 | * controller while the previous css is still around. This function grabs | 2966 | * controller while the previous css is still around. This function grabs |
3132 | * cgroup_mutex and drains the previous css instances of @cgrp's subtree. | 2967 | * cgroup_mutex and drains the previous css instances of @cgrp's subtree. |
3133 | */ | 2968 | */ |
3134 | static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) | 2969 | void cgroup_lock_and_drain_offline(struct cgroup *cgrp) |
3135 | __acquires(&cgroup_mutex) | 2970 | __acquires(&cgroup_mutex) |
3136 | { | 2971 | { |
3137 | struct cgroup *dsct; | 2972 | struct cgroup *dsct; |
@@ -3610,48 +3445,6 @@ static struct kernfs_ops cgroup_kf_ops = { | |||
3610 | .seq_show = cgroup_seqfile_show, | 3445 | .seq_show = cgroup_seqfile_show, |
3611 | }; | 3446 | }; |
3612 | 3447 | ||
3613 | /* | ||
3614 | * cgroup_rename - Only allow simple rename of directories in place. | ||
3615 | */ | ||
3616 | static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, | ||
3617 | const char *new_name_str) | ||
3618 | { | ||
3619 | struct cgroup *cgrp = kn->priv; | ||
3620 | int ret; | ||
3621 | |||
3622 | if (kernfs_type(kn) != KERNFS_DIR) | ||
3623 | return -ENOTDIR; | ||
3624 | if (kn->parent != new_parent) | ||
3625 | return -EIO; | ||
3626 | |||
3627 | /* | ||
3628 | * This isn't a proper migration and its usefulness is very | ||
3629 | * limited. Disallow on the default hierarchy. | ||
3630 | */ | ||
3631 | if (cgroup_on_dfl(cgrp)) | ||
3632 | return -EPERM; | ||
3633 | |||
3634 | /* | ||
3635 | * We're gonna grab cgroup_mutex which nests outside kernfs | ||
3636 | * active_ref. kernfs_rename() doesn't require active_ref | ||
3637 | * protection. Break them before grabbing cgroup_mutex. | ||
3638 | */ | ||
3639 | kernfs_break_active_protection(new_parent); | ||
3640 | kernfs_break_active_protection(kn); | ||
3641 | |||
3642 | mutex_lock(&cgroup_mutex); | ||
3643 | |||
3644 | ret = kernfs_rename(kn, new_parent, new_name_str); | ||
3645 | if (!ret) | ||
3646 | trace_cgroup_rename(cgrp); | ||
3647 | |||
3648 | mutex_unlock(&cgroup_mutex); | ||
3649 | |||
3650 | kernfs_unbreak_active_protection(kn); | ||
3651 | kernfs_unbreak_active_protection(new_parent); | ||
3652 | return ret; | ||
3653 | } | ||
3654 | |||
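Concretely (paths are only illustrative): within a v1 hierarchy mounted at /sys/fs/cgroup/memory, renaming a child in place, e.g. mv grp_a grp_b, succeeds; moving it under a different parent fails with -EIO; renaming anything that is not a directory fails with -ENOTDIR; and any rename on the v2 (default) hierarchy fails with -EPERM.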
3655 | /* set uid and gid of cgroup dirs and files to that of the creator */ | 3448 | /* set uid and gid of cgroup dirs and files to that of the creator */ |
3656 | static int cgroup_kn_set_ugid(struct kernfs_node *kn) | 3449 | static int cgroup_kn_set_ugid(struct kernfs_node *kn) |
3657 | { | 3450 | { |
@@ -3948,26 +3741,6 @@ void cgroup_file_notify(struct cgroup_file *cfile) | |||
3948 | } | 3741 | } |
3949 | 3742 | ||
3950 | /** | 3743 | /** |
3951 | * cgroup_task_count - count the number of tasks in a cgroup. | ||
3952 | * @cgrp: the cgroup in question | ||
3953 | * | ||
3954 | * Return the number of tasks in the cgroup. The returned number can be | ||
3955 | * higher than the actual number of tasks due to css_set references from | ||
3956 | * namespace roots and temporary usages. | ||
3957 | */ | ||
3958 | static int cgroup_task_count(const struct cgroup *cgrp) | ||
3959 | { | ||
3960 | int count = 0; | ||
3961 | struct cgrp_cset_link *link; | ||
3962 | |||
3963 | spin_lock_irq(&css_set_lock); | ||
3964 | list_for_each_entry(link, &cgrp->cset_links, cset_link) | ||
3965 | count += atomic_read(&link->cset->refcount); | ||
3966 | spin_unlock_irq(&css_set_lock); | ||
3967 | return count; | ||
3968 | } | ||
3969 | |||
3970 | /** | ||
3971 | * css_next_child - find the next child of a given css | 3744 | * css_next_child - find the next child of a given css |
3972 | * @pos: the current position (%NULL to initiate traversal) | 3745 | * @pos: the current position (%NULL to initiate traversal) |
3973 | * @parent: css whose children to walk | 3746 | * @parent: css whose children to walk |
@@ -4365,70 +4138,6 @@ void css_task_iter_end(struct css_task_iter *it) | |||
4365 | put_task_struct(it->cur_task); | 4138 | put_task_struct(it->cur_task); |
4366 | } | 4139 | } |
4367 | 4140 | ||
4368 | /** | ||
4369 | * cgroup_transfer_tasks - move tasks from one cgroup to another | ||
4370 | * @to: cgroup to which the tasks will be moved | ||
4371 | * @from: cgroup in which the tasks currently reside | ||
4372 | * | ||
4373 | * Locking rules between cgroup_post_fork() and the migration path | ||
4374 | * guarantee that, if a task is forking while being migrated, the new child | ||
4375 | * is guaranteed to be either visible in the source cgroup after the | ||
4376 | * parent's migration is complete or put into the target cgroup. No task | ||
4377 | * can slip out of migration through forking. | ||
4378 | */ | ||
4379 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | ||
4380 | { | ||
4381 | LIST_HEAD(preloaded_csets); | ||
4382 | struct cgrp_cset_link *link; | ||
4383 | struct css_task_iter it; | ||
4384 | struct task_struct *task; | ||
4385 | int ret; | ||
4386 | |||
4387 | if (cgroup_on_dfl(to)) | ||
4388 | return -EINVAL; | ||
4389 | |||
4390 | if (!cgroup_may_migrate_to(to)) | ||
4391 | return -EBUSY; | ||
4392 | |||
4393 | mutex_lock(&cgroup_mutex); | ||
4394 | |||
4395 | percpu_down_write(&cgroup_threadgroup_rwsem); | ||
4396 | |||
4397 | /* all tasks in @from are being moved, all csets are source */ | ||
4398 | spin_lock_irq(&css_set_lock); | ||
4399 | list_for_each_entry(link, &from->cset_links, cset_link) | ||
4400 | cgroup_migrate_add_src(link->cset, to, &preloaded_csets); | ||
4401 | spin_unlock_irq(&css_set_lock); | ||
4402 | |||
4403 | ret = cgroup_migrate_prepare_dst(&preloaded_csets); | ||
4404 | if (ret) | ||
4405 | goto out_err; | ||
4406 | |||
4407 | /* | ||
4408 | * Migrate tasks one-by-one until @from is empty. This fails iff | ||
4409 | * ->can_attach() fails. | ||
4410 | */ | ||
4411 | do { | ||
4412 | css_task_iter_start(&from->self, &it); | ||
4413 | task = css_task_iter_next(&it); | ||
4414 | if (task) | ||
4415 | get_task_struct(task); | ||
4416 | css_task_iter_end(&it); | ||
4417 | |||
4418 | if (task) { | ||
4419 | ret = cgroup_migrate(task, false, to->root); | ||
4420 | if (!ret) | ||
4421 | trace_cgroup_transfer_tasks(to, task, false); | ||
4422 | put_task_struct(task); | ||
4423 | } | ||
4424 | } while (task && !ret); | ||
4425 | out_err: | ||
4426 | cgroup_migrate_finish(&preloaded_csets); | ||
4427 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
4428 | mutex_unlock(&cgroup_mutex); | ||
4429 | return ret; | ||
4430 | } | ||
4431 | |||
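For context, the main in-tree user of this helper is the cpuset controller, which drains a cpuset whose CPUs or memory nodes have all gone offline. A hedged sketch of that calling pattern (example_evacuate() is hypothetical; error handling trimmed):

	static void example_evacuate(struct cgroup *victim)
	{
		struct cgroup *parent = cgroup_parent(victim);

		/* move every task in @victim up to its parent cgroup */
		if (parent && cgroup_transfer_tasks(parent, victim))
			pr_warn("cgroup: failed to evacuate tasks\n");
	}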
4432 | static void cgroup_procs_release(struct kernfs_open_file *of) | 4141 | static void cgroup_procs_release(struct kernfs_open_file *of) |
4433 | { | 4142 | { |
4434 | if (of->priv) { | 4143 | if (of->priv) { |
@@ -4483,456 +4192,6 @@ static int cgroup_procs_show(struct seq_file *s, void *v) | |||
4483 | return 0; | 4192 | return 0; |
4484 | } | 4193 | } |
4485 | 4194 | ||
4486 | /* | ||
4487 | * Stuff for reading the 'tasks'/'procs' files. | ||
4488 | * | ||
4489 | * Reading this file can return large amounts of data if a cgroup has | ||
4490 | * *lots* of attached tasks. So it may need several calls to read(), | ||
4491 | * but we cannot guarantee that the information we produce is correct | ||
4492 | * unless we produce it entirely atomically. | ||
4493 | * | ||
4494 | */ | ||
4495 | |||
4496 | /* which pidlist file are we talking about? */ | ||
4497 | enum cgroup_filetype { | ||
4498 | CGROUP_FILE_PROCS, | ||
4499 | CGROUP_FILE_TASKS, | ||
4500 | }; | ||
4501 | |||
4502 | /* | ||
4503 | * A pidlist is a list of pids that virtually represents the contents of one | ||
4504 | * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, | ||
4505 | * a pair (one each for procs, tasks) for each pid namespace that's relevant | ||
4506 | * to the cgroup. | ||
4507 | */ | ||
4508 | struct cgroup_pidlist { | ||
4509 | /* | ||
4510 | * used to find which pidlist is wanted. doesn't change as long as | ||
4511 | * this particular list stays in the list. | ||
4512 | */ | ||
4513 | struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; | ||
4514 | /* array of xids */ | ||
4515 | pid_t *list; | ||
4516 | /* how many elements the above list has */ | ||
4517 | int length; | ||
4518 | /* each of these stored in a list by its cgroup */ | ||
4519 | struct list_head links; | ||
4520 | /* pointer to the cgroup we belong to, for list removal purposes */ | ||
4521 | struct cgroup *owner; | ||
4522 | /* for delayed destruction */ | ||
4523 | struct delayed_work destroy_dwork; | ||
4524 | }; | ||
4525 | |||
4526 | /* | ||
4527 | * The following two functions "fix" the issue where there are more pids | ||
4528 | * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. | ||
4529 | * TODO: replace with a kernel-wide solution to this problem | ||
4530 | */ | ||
4531 | #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) | ||
4532 | static void *pidlist_allocate(int count) | ||
4533 | { | ||
4534 | if (PIDLIST_TOO_LARGE(count)) | ||
4535 | return vmalloc(count * sizeof(pid_t)); | ||
4536 | else | ||
4537 | return kmalloc(count * sizeof(pid_t), GFP_KERNEL); | ||
4538 | } | ||
4539 | |||
4540 | static void pidlist_free(void *p) | ||
4541 | { | ||
4542 | kvfree(p); | ||
4543 | } | ||
4544 | |||
4545 | /* | ||
4546 | * Used to destroy all pidlists lingering waiting for destroy timer. None | ||
4547 | * should be left afterwards. | ||
4548 | */ | ||
4549 | static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) | ||
4550 | { | ||
4551 | struct cgroup_pidlist *l, *tmp_l; | ||
4552 | |||
4553 | mutex_lock(&cgrp->pidlist_mutex); | ||
4554 | list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) | ||
4555 | mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); | ||
4556 | mutex_unlock(&cgrp->pidlist_mutex); | ||
4557 | |||
4558 | flush_workqueue(cgroup_pidlist_destroy_wq); | ||
4559 | BUG_ON(!list_empty(&cgrp->pidlists)); | ||
4560 | } | ||
4561 | |||
4562 | static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) | ||
4563 | { | ||
4564 | struct delayed_work *dwork = to_delayed_work(work); | ||
4565 | struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, | ||
4566 | destroy_dwork); | ||
4567 | struct cgroup_pidlist *tofree = NULL; | ||
4568 | |||
4569 | mutex_lock(&l->owner->pidlist_mutex); | ||
4570 | |||
4571 | /* | ||
4572 | * Destroy iff we didn't get queued again. The state won't change | ||
4573 | * as destroy_dwork can only be queued while locked. | ||
4574 | */ | ||
4575 | if (!delayed_work_pending(dwork)) { | ||
4576 | list_del(&l->links); | ||
4577 | pidlist_free(l->list); | ||
4578 | put_pid_ns(l->key.ns); | ||
4579 | tofree = l; | ||
4580 | } | ||
4581 | |||
4582 | mutex_unlock(&l->owner->pidlist_mutex); | ||
4583 | kfree(tofree); | ||
4584 | } | ||
4585 | |||
4586 | /* | ||
4587 | * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries | ||
4588 | * Returns the number of unique elements. | ||
4589 | */ | ||
4590 | static int pidlist_uniq(pid_t *list, int length) | ||
4591 | { | ||
4592 | int src, dest = 1; | ||
4593 | |||
4594 | /* | ||
4595 | * we presume the 0th element is unique, so src starts at 1. trivial | ||
4596 | * edge cases first; no work needs to be done for either | ||
4597 | */ | ||
4598 | if (length == 0 || length == 1) | ||
4599 | return length; | ||
4600 | /* src and dest walk down the list; dest counts unique elements */ | ||
4601 | for (src = 1; src < length; src++) { | ||
4602 | /* find next unique element */ | ||
4603 | while (list[src] == list[src-1]) { | ||
4604 | src++; | ||
4605 | if (src == length) | ||
4606 | goto after; | ||
4607 | } | ||
4608 | /* dest always points to where the next unique element goes */ | ||
4609 | list[dest] = list[src]; | ||
4610 | dest++; | ||
4611 | } | ||
4612 | after: | ||
4613 | return dest; | ||
4614 | } | ||
4615 | |||
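A worked example of the in-place compaction above (values are arbitrary; the input must already be sorted):

	pid_t pids[] = { 1, 1, 3, 3, 3, 7 };
	int n = pidlist_uniq(pids, 6);	/* n == 3; pids[] now begins {1, 3, 7} */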
4616 | /* | ||
4617 | * The two pid files - tasks and cgroup.procs - guarantee that the result | ||
4618 | * is sorted, which forced this whole pidlist fiasco. As pid order is | ||
4619 | * different per namespace, each namespace needs a differently sorted list, | ||
4620 | * making it impossible to use, for example, a single rbtree of member tasks | ||
4621 | * sorted by task pointer. As pidlists can be fairly large, allocating one | ||
4622 | * per open file is dangerous, so cgroup had to implement shared pool of | ||
4623 | * pidlists keyed by cgroup and namespace. | ||
4624 | */ | ||
4625 | static int cmppid(const void *a, const void *b) | ||
4626 | { | ||
4627 | return *(pid_t *)a - *(pid_t *)b; | ||
4628 | } | ||
4629 | |||
4630 | static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | ||
4631 | enum cgroup_filetype type) | ||
4632 | { | ||
4633 | struct cgroup_pidlist *l; | ||
4634 | /* don't need task_nsproxy() if we're looking at ourself */ | ||
4635 | struct pid_namespace *ns = task_active_pid_ns(current); | ||
4636 | |||
4637 | lockdep_assert_held(&cgrp->pidlist_mutex); | ||
4638 | |||
4639 | list_for_each_entry(l, &cgrp->pidlists, links) | ||
4640 | if (l->key.type == type && l->key.ns == ns) | ||
4641 | return l; | ||
4642 | return NULL; | ||
4643 | } | ||
4644 | |||
4645 | /* | ||
4646 | * find the appropriate pidlist for our purpose (given procs vs tasks) | ||
4647 | * returns with the lock on that pidlist already held, and takes care | ||
4648 | * of the use count, or returns NULL with no locks held if we're out of | ||
4649 | * memory. | ||
4650 | */ | ||
4651 | static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, | ||
4652 | enum cgroup_filetype type) | ||
4653 | { | ||
4654 | struct cgroup_pidlist *l; | ||
4655 | |||
4656 | lockdep_assert_held(&cgrp->pidlist_mutex); | ||
4657 | |||
4658 | l = cgroup_pidlist_find(cgrp, type); | ||
4659 | if (l) | ||
4660 | return l; | ||
4661 | |||
4662 | /* entry not found; create a new one */ | ||
4663 | l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); | ||
4664 | if (!l) | ||
4665 | return l; | ||
4666 | |||
4667 | INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); | ||
4668 | l->key.type = type; | ||
4669 | /* don't need task_nsproxy() if we're looking at ourself */ | ||
4670 | l->key.ns = get_pid_ns(task_active_pid_ns(current)); | ||
4671 | l->owner = cgrp; | ||
4672 | list_add(&l->links, &cgrp->pidlists); | ||
4673 | return l; | ||
4674 | } | ||
4675 | |||
4676 | /* | ||
4677 | * Load a cgroup's pidarray with either procs' tgids or tasks' pids | ||
4678 | */ | ||
4679 | static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | ||
4680 | struct cgroup_pidlist **lp) | ||
4681 | { | ||
4682 | pid_t *array; | ||
4683 | int length; | ||
4684 | int pid, n = 0; /* used for populating the array */ | ||
4685 | struct css_task_iter it; | ||
4686 | struct task_struct *tsk; | ||
4687 | struct cgroup_pidlist *l; | ||
4688 | |||
4689 | lockdep_assert_held(&cgrp->pidlist_mutex); | ||
4690 | |||
4691 | /* | ||
4692 | * If cgroup gets more users after we read count, we won't have | ||
4693 | * enough space - tough. This race is indistinguishable to the | ||
4694 | * caller from the case that the additional cgroup users didn't | ||
4695 | * show up until sometime later on. | ||
4696 | */ | ||
4697 | length = cgroup_task_count(cgrp); | ||
4698 | array = pidlist_allocate(length); | ||
4699 | if (!array) | ||
4700 | return -ENOMEM; | ||
4701 | /* now, populate the array */ | ||
4702 | css_task_iter_start(&cgrp->self, &it); | ||
4703 | while ((tsk = css_task_iter_next(&it))) { | ||
4704 | if (unlikely(n == length)) | ||
4705 | break; | ||
4706 | /* get tgid or pid for procs or tasks file respectively */ | ||
4707 | if (type == CGROUP_FILE_PROCS) | ||
4708 | pid = task_tgid_vnr(tsk); | ||
4709 | else | ||
4710 | pid = task_pid_vnr(tsk); | ||
4711 | if (pid > 0) /* make sure to only use valid results */ | ||
4712 | array[n++] = pid; | ||
4713 | } | ||
4714 | css_task_iter_end(&it); | ||
4715 | length = n; | ||
4716 | /* now sort & (if procs) strip out duplicates */ | ||
4717 | sort(array, length, sizeof(pid_t), cmppid, NULL); | ||
4718 | if (type == CGROUP_FILE_PROCS) | ||
4719 | length = pidlist_uniq(array, length); | ||
4720 | |||
4721 | l = cgroup_pidlist_find_create(cgrp, type); | ||
4722 | if (!l) { | ||
4723 | pidlist_free(array); | ||
4724 | return -ENOMEM; | ||
4725 | } | ||
4726 | |||
4727 | /* store array, freeing old if necessary */ | ||
4728 | pidlist_free(l->list); | ||
4729 | l->list = array; | ||
4730 | l->length = length; | ||
4731 | *lp = l; | ||
4732 | return 0; | ||
4733 | } | ||
4734 | |||
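As a worked example, consider a process with tgid 100 whose three threads have pids 100, 101 and 102: for CGROUP_FILE_TASKS the loop above collects {100, 101, 102}, while for CGROUP_FILE_PROCS every thread contributes its tgid, giving {100, 100, 100}, which pidlist_uniq() then collapses to the single entry 100.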
4735 | /** | ||
4736 | * cgroupstats_build - build and fill cgroupstats | ||
4737 | * @stats: cgroupstats to fill information into | ||
4738 | * @dentry: A dentry entry belonging to the cgroup for which stats have | ||
4739 | * been requested. | ||
4740 | * | ||
4741 | * Build and fill cgroupstats so that taskstats can export it to user | ||
4742 | * space. | ||
4743 | */ | ||
4744 | int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | ||
4745 | { | ||
4746 | struct kernfs_node *kn = kernfs_node_from_dentry(dentry); | ||
4747 | struct cgroup *cgrp; | ||
4748 | struct css_task_iter it; | ||
4749 | struct task_struct *tsk; | ||
4750 | |||
4751 | /* it should be kernfs_node belonging to cgroupfs and is a directory */ | ||
4752 | if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || | ||
4753 | kernfs_type(kn) != KERNFS_DIR) | ||
4754 | return -EINVAL; | ||
4755 | |||
4756 | mutex_lock(&cgroup_mutex); | ||
4757 | |||
4758 | /* | ||
4759 | * We aren't being called from kernfs and there's no guarantee on | ||
4760 | * @kn->priv's validity. For this and css_tryget_online_from_dir(), | ||
4761 | * @kn->priv is RCU safe. Let's do the RCU dancing. | ||
4762 | */ | ||
4763 | rcu_read_lock(); | ||
4764 | cgrp = rcu_dereference(kn->priv); | ||
4765 | if (!cgrp || cgroup_is_dead(cgrp)) { | ||
4766 | rcu_read_unlock(); | ||
4767 | mutex_unlock(&cgroup_mutex); | ||
4768 | return -ENOENT; | ||
4769 | } | ||
4770 | rcu_read_unlock(); | ||
4771 | |||
4772 | css_task_iter_start(&cgrp->self, &it); | ||
4773 | while ((tsk = css_task_iter_next(&it))) { | ||
4774 | switch (tsk->state) { | ||
4775 | case TASK_RUNNING: | ||
4776 | stats->nr_running++; | ||
4777 | break; | ||
4778 | case TASK_INTERRUPTIBLE: | ||
4779 | stats->nr_sleeping++; | ||
4780 | break; | ||
4781 | case TASK_UNINTERRUPTIBLE: | ||
4782 | stats->nr_uninterruptible++; | ||
4783 | break; | ||
4784 | case TASK_STOPPED: | ||
4785 | stats->nr_stopped++; | ||
4786 | break; | ||
4787 | default: | ||
4788 | if (delayacct_is_task_waiting_on_io(tsk)) | ||
4789 | stats->nr_io_wait++; | ||
4790 | break; | ||
4791 | } | ||
4792 | } | ||
4793 | css_task_iter_end(&it); | ||
4794 | |||
4795 | mutex_unlock(&cgroup_mutex); | ||
4796 | return 0; | ||
4797 | } | ||
4798 | |||
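A hedged sketch of a consumer, loosely modelled on the taskstats interface: userspace hands the kernel an open file descriptor on a cgroup directory and gets the per-state task counts back. example_fill_stats() is hypothetical and error handling is trimmed; it assumes <linux/file.h> for fdget()/fdput() and <linux/cgroupstats.h> for struct cgroupstats.

	static int example_fill_stats(int fd, struct cgroupstats *stats)
	{
		struct fd f = fdget(fd);
		int ret;

		if (!f.file)
			return -EBADF;
		memset(stats, 0, sizeof(*stats));
		/* rejects anything that is not a cgroupfs directory */
		ret = cgroupstats_build(stats, f.file->f_path.dentry);
		fdput(f);
		return ret;
	}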
4799 | |||
4800 | /* | ||
4801 | * seq_file methods for the tasks/procs files. The seq_file position is the | ||
4802 | * next pid to display; the seq_file iterator is a pointer to the pid | ||
4803 | * in the cgroup->l->list array. | ||
4804 | */ | ||
4805 | |||
4806 | static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | ||
4807 | { | ||
4808 | /* | ||
4809 | * Initially we receive a position value that corresponds to | ||
4810 | * one more than the last pid shown (or 0 on the first call or | ||
4811 | * after a seek to the start). Use a binary-search to find the | ||
4812 | * next pid to display, if any | ||
4813 | */ | ||
4814 | struct kernfs_open_file *of = s->private; | ||
4815 | struct cgroup *cgrp = seq_css(s)->cgroup; | ||
4816 | struct cgroup_pidlist *l; | ||
4817 | enum cgroup_filetype type = seq_cft(s)->private; | ||
4818 | int index = 0, pid = *pos; | ||
4819 | int *iter, ret; | ||
4820 | |||
4821 | mutex_lock(&cgrp->pidlist_mutex); | ||
4822 | |||
4823 | /* | ||
4824 | * !NULL @of->priv indicates that this isn't the first start() | ||
4825 | * after open. If the matching pidlist is around, we can use that. | ||
4826 | * Look for it. Note that @of->priv can't be used directly. It | ||
4827 | * could already have been destroyed. | ||
4828 | */ | ||
4829 | if (of->priv) | ||
4830 | of->priv = cgroup_pidlist_find(cgrp, type); | ||
4831 | |||
4832 | /* | ||
4833 | * Either this is the first start() after open or the matching | ||
4834 | * pidlist has been destroyed in between. Create a new one. | ||
4835 | */ | ||
4836 | if (!of->priv) { | ||
4837 | ret = pidlist_array_load(cgrp, type, | ||
4838 | (struct cgroup_pidlist **)&of->priv); | ||
4839 | if (ret) | ||
4840 | return ERR_PTR(ret); | ||
4841 | } | ||
4842 | l = of->priv; | ||
4843 | |||
4844 | if (pid) { | ||
4845 | int end = l->length; | ||
4846 | |||
4847 | while (index < end) { | ||
4848 | int mid = (index + end) / 2; | ||
4849 | if (l->list[mid] == pid) { | ||
4850 | index = mid; | ||
4851 | break; | ||
4852 | } else if (l->list[mid] <= pid) | ||
4853 | index = mid + 1; | ||
4854 | else | ||
4855 | end = mid; | ||
4856 | } | ||
4857 | } | ||
4858 | /* If we're off the end of the array, we're done */ | ||
4859 | if (index >= l->length) | ||
4860 | return NULL; | ||
4861 | /* Update the abstract position to be the actual pid that we found */ | ||
4862 | iter = l->list + index; | ||
4863 | *pos = *iter; | ||
4864 | return iter; | ||
4865 | } | ||
4866 | |||
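To see the resume logic in action (hypothetical values): with l->list = {3, 5, 9} and a saved position of *pos == 6, the binary search advances index past 3 and 5 (both <= 6) and stops at index 2, so iteration resumes at pid 9 and *pos is rewritten to 9. A position past the last element, say *pos == 10, runs index off the end and start() returns NULL.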
4867 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) | ||
4868 | { | ||
4869 | struct kernfs_open_file *of = s->private; | ||
4870 | struct cgroup_pidlist *l = of->priv; | ||
4871 | |||
4872 | if (l) | ||
4873 | mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, | ||
4874 | CGROUP_PIDLIST_DESTROY_DELAY); | ||
4875 | mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); | ||
4876 | } | ||
4877 | |||
4878 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | ||
4879 | { | ||
4880 | struct kernfs_open_file *of = s->private; | ||
4881 | struct cgroup_pidlist *l = of->priv; | ||
4882 | pid_t *p = v; | ||
4883 | pid_t *end = l->list + l->length; | ||
4884 | /* | ||
4885 | * Advance to the next pid in the array. If this goes off the | ||
4886 | * end, we're done | ||
4887 | */ | ||
4888 | p++; | ||
4889 | if (p >= end) { | ||
4890 | return NULL; | ||
4891 | } else { | ||
4892 | *pos = *p; | ||
4893 | return p; | ||
4894 | } | ||
4895 | } | ||
4896 | |||
4897 | static int cgroup_pidlist_show(struct seq_file *s, void *v) | ||
4898 | { | ||
4899 | seq_printf(s, "%d\n", *(int *)v); | ||
4900 | |||
4901 | return 0; | ||
4902 | } | ||
4903 | |||
4904 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | ||
4905 | struct cftype *cft) | ||
4906 | { | ||
4907 | return notify_on_release(css->cgroup); | ||
4908 | } | ||
4909 | |||
4910 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, | ||
4911 | struct cftype *cft, u64 val) | ||
4912 | { | ||
4913 | if (val) | ||
4914 | set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); | ||
4915 | else | ||
4916 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); | ||
4917 | return 0; | ||
4918 | } | ||
4919 | |||
4920 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, | ||
4921 | struct cftype *cft) | ||
4922 | { | ||
4923 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); | ||
4924 | } | ||
4925 | |||
4926 | static int cgroup_clone_children_write(struct cgroup_subsys_state *css, | ||
4927 | struct cftype *cft, u64 val) | ||
4928 | { | ||
4929 | if (val) | ||
4930 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); | ||
4931 | else | ||
4932 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); | ||
4933 | return 0; | ||
4934 | } | ||
4935 | |||
4936 | /* cgroup core interface files for the default hierarchy */ | 4195 | /* cgroup core interface files for the default hierarchy */ |
4937 | static struct cftype cgroup_dfl_base_files[] = { | 4196 | static struct cftype cgroup_dfl_base_files[] = { |
4938 | { | 4197 | { |
@@ -4962,51 +4221,6 @@ static struct cftype cgroup_dfl_base_files[] = { | |||
4962 | { } /* terminate */ | 4221 | { } /* terminate */ |
4963 | }; | 4222 | }; |
4964 | 4223 | ||
4965 | /* cgroup core interface files for the legacy hierarchies */ | ||
4966 | static struct cftype cgroup_legacy_base_files[] = { | ||
4967 | { | ||
4968 | .name = "cgroup.procs", | ||
4969 | .seq_start = cgroup_pidlist_start, | ||
4970 | .seq_next = cgroup_pidlist_next, | ||
4971 | .seq_stop = cgroup_pidlist_stop, | ||
4972 | .seq_show = cgroup_pidlist_show, | ||
4973 | .private = CGROUP_FILE_PROCS, | ||
4974 | .write = cgroup_procs_write, | ||
4975 | }, | ||
4976 | { | ||
4977 | .name = "cgroup.clone_children", | ||
4978 | .read_u64 = cgroup_clone_children_read, | ||
4979 | .write_u64 = cgroup_clone_children_write, | ||
4980 | }, | ||
4981 | { | ||
4982 | .name = "cgroup.sane_behavior", | ||
4983 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
4984 | .seq_show = cgroup_sane_behavior_show, | ||
4985 | }, | ||
4986 | { | ||
4987 | .name = "tasks", | ||
4988 | .seq_start = cgroup_pidlist_start, | ||
4989 | .seq_next = cgroup_pidlist_next, | ||
4990 | .seq_stop = cgroup_pidlist_stop, | ||
4991 | .seq_show = cgroup_pidlist_show, | ||
4992 | .private = CGROUP_FILE_TASKS, | ||
4993 | .write = cgroup_tasks_write, | ||
4994 | }, | ||
4995 | { | ||
4996 | .name = "notify_on_release", | ||
4997 | .read_u64 = cgroup_read_notify_on_release, | ||
4998 | .write_u64 = cgroup_write_notify_on_release, | ||
4999 | }, | ||
5000 | { | ||
5001 | .name = "release_agent", | ||
5002 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
5003 | .seq_show = cgroup_release_agent_show, | ||
5004 | .write = cgroup_release_agent_write, | ||
5005 | .max_write_len = PATH_MAX - 1, | ||
5006 | }, | ||
5007 | { } /* terminate */ | ||
5008 | }; | ||
5009 | |||
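Mounted as a v1 hierarchy (for example, mount -t cgroup -o memory none /sys/fs/cgroup/memory, path illustrative), every cgroup directory therefore exposes cgroup.procs, tasks, cgroup.clone_children and notify_on_release, while release_agent and cgroup.sane_behavior carry CFTYPE_ONLY_ON_ROOT and appear only at the hierarchy root.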
5010 | /* | 4224 | /* |
5011 | * css destruction is four-stage process. | 4225 | * css destruction is four-stage process. |
5012 | * | 4226 | * |
@@ -5792,15 +5006,6 @@ static int __init cgroup_wq_init(void) | |||
5792 | */ | 5006 | */ |
5793 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); | 5007 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); |
5794 | BUG_ON(!cgroup_destroy_wq); | 5008 | BUG_ON(!cgroup_destroy_wq); |
5795 | |||
5796 | /* | ||
5797 | * Used to destroy pidlists and separate to serve as flush domain. | ||
5798 | * Cap @max_active to 1 too. | ||
5799 | */ | ||
5800 | cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", | ||
5801 | 0, 1); | ||
5802 | BUG_ON(!cgroup_pidlist_destroy_wq); | ||
5803 | |||
5804 | return 0; | 5009 | return 0; |
5805 | } | 5010 | } |
5806 | core_initcall(cgroup_wq_init); | 5011 | core_initcall(cgroup_wq_init); |
@@ -5883,42 +5088,6 @@ out: | |||
5883 | return retval; | 5088 | return retval; |
5884 | } | 5089 | } |
5885 | 5090 | ||
5886 | /* Display information about each subsystem and each hierarchy */ | ||
5887 | static int proc_cgroupstats_show(struct seq_file *m, void *v) | ||
5888 | { | ||
5889 | struct cgroup_subsys *ss; | ||
5890 | int i; | ||
5891 | |||
5892 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); | ||
5893 | /* | ||
5894 | * ideally we don't want subsystems moving around while we do this. | ||
5895 | * cgroup_mutex is also necessary to guarantee an atomic snapshot of | ||
5896 | * subsys/hierarchy state. | ||
5897 | */ | ||
5898 | mutex_lock(&cgroup_mutex); | ||
5899 | |||
5900 | for_each_subsys(ss, i) | ||
5901 | seq_printf(m, "%s\t%d\t%d\t%d\n", | ||
5902 | ss->legacy_name, ss->root->hierarchy_id, | ||
5903 | atomic_read(&ss->root->nr_cgrps), | ||
5904 | cgroup_ssid_enabled(i)); | ||
5905 | |||
5906 | mutex_unlock(&cgroup_mutex); | ||
5907 | return 0; | ||
5908 | } | ||
5909 | |||
5910 | static int cgroupstats_open(struct inode *inode, struct file *file) | ||
5911 | { | ||
5912 | return single_open(file, proc_cgroupstats_show, NULL); | ||
5913 | } | ||
5914 | |||
5915 | static const struct file_operations proc_cgroupstats_operations = { | ||
5916 | .open = cgroupstats_open, | ||
5917 | .read = seq_read, | ||
5918 | .llseek = seq_lseek, | ||
5919 | .release = single_release, | ||
5920 | }; | ||
5921 | |||
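The resulting /proc/cgroups is a simple tab-separated table, one row per registered subsystem. A purely illustrative reading (all values invented for the example) might look like:

	#subsys_name	hierarchy	num_cgroups	enabled
	cpuset	2	1	1
	memory	3	42	1
	pids	0	1	0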
5922 | /** | 5091 | /** |
5923 | * cgroup_fork - initialize cgroup related fields during copy_process() | 5092 | * cgroup_fork - initialize cgroup related fields during copy_process() |
5924 | * @child: pointer to task_struct of forking parent process. | 5093 | * @child: pointer to task_struct of forking parent process. |
@@ -6098,76 +5267,6 @@ void cgroup_free(struct task_struct *task) | |||
6098 | put_css_set(cset); | 5267 | put_css_set(cset); |
6099 | } | 5268 | } |
6100 | 5269 | ||
6101 | static void check_for_release(struct cgroup *cgrp) | ||
6102 | { | ||
6103 | if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && | ||
6104 | !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) | ||
6105 | schedule_work(&cgrp->release_agent_work); | ||
6106 | } | ||
6107 | |||
6108 | /* | ||
6109 | * Notify userspace when a cgroup is released, by running the | ||
6110 | * configured release agent with the name of the cgroup (path | ||
6111 | * relative to the root of cgroup file system) as the argument. | ||
6112 | * | ||
6113 | * Most likely, this user command will try to rmdir this cgroup. | ||
6114 | * | ||
6115 | * This races with the possibility that some other task will be | ||
6116 | * attached to this cgroup before it is removed, or that some other | ||
6117 | * user task will 'mkdir' a child cgroup of this cgroup. That's ok. | ||
6118 | * The presumed 'rmdir' will fail quietly if this cgroup is no longer | ||
6119 | * unused, and this cgroup will be reprieved from its death sentence, | ||
6120 | * to continue to serve a useful existence. Next time it's released, | ||
6121 | * we will get notified again, if it still has 'notify_on_release' set. | ||
6122 | * | ||
6123 | * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which | ||
6124 | * means only wait until the task is successfully execve()'d. The | ||
6125 | * separate release agent task is forked by call_usermodehelper(), | ||
6126 | * then control in this thread returns here, without waiting for the | ||
6127 | * release agent task. We don't bother to wait because the caller of | ||
6128 | * this routine has no use for the exit status of the release agent | ||
6129 | * task, so no sense holding our caller up for that. | ||
6130 | */ | ||
6131 | static void cgroup_release_agent(struct work_struct *work) | ||
6132 | { | ||
6133 | struct cgroup *cgrp = | ||
6134 | container_of(work, struct cgroup, release_agent_work); | ||
6135 | char *pathbuf = NULL, *agentbuf = NULL; | ||
6136 | char *argv[3], *envp[3]; | ||
6137 | int ret; | ||
6138 | |||
6139 | mutex_lock(&cgroup_mutex); | ||
6140 | |||
6141 | pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); | ||
6142 | agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); | ||
6143 | if (!pathbuf || !agentbuf) | ||
6144 | goto out; | ||
6145 | |||
6146 | spin_lock_irq(&css_set_lock); | ||
6147 | ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); | ||
6148 | spin_unlock_irq(&css_set_lock); | ||
6149 | if (ret < 0 || ret >= PATH_MAX) | ||
6150 | goto out; | ||
6151 | |||
6152 | argv[0] = agentbuf; | ||
6153 | argv[1] = pathbuf; | ||
6154 | argv[2] = NULL; | ||
6155 | |||
6156 | /* minimal command environment */ | ||
6157 | envp[0] = "HOME=/"; | ||
6158 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
6159 | envp[2] = NULL; | ||
6160 | |||
6161 | mutex_unlock(&cgroup_mutex); | ||
6162 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | ||
6163 | goto out_free; | ||
6164 | out: | ||
6165 | mutex_unlock(&cgroup_mutex); | ||
6166 | out_free: | ||
6167 | kfree(agentbuf); | ||
6168 | kfree(pathbuf); | ||
6169 | } | ||
6170 | |||
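To make the mechanism concrete, a hedged sketch of a userspace release agent written in C. The kernel only promises that argv[1] is the released cgroup's path relative to the hierarchy root; the mount point below is an assumption, and most agents simply rmdir the now-empty directory.

	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		char path[4096];

		if (argc < 2)
			return 1;
		/* "/sys/fs/cgroup/memory" is an assumed v1 mount point */
		snprintf(path, sizeof(path), "/sys/fs/cgroup/memory%s", argv[1]);
		return rmdir(path) ? 1 : 0;
	}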
6171 | static int __init cgroup_disable(char *str) | 5270 | static int __init cgroup_disable(char *str) |
6172 | { | 5271 | { |
6173 | struct cgroup_subsys *ss; | 5272 | struct cgroup_subsys *ss; |
@@ -6189,33 +5288,6 @@ static int __init cgroup_disable(char *str) | |||
6189 | } | 5288 | } |
6190 | __setup("cgroup_disable=", cgroup_disable); | 5289 | __setup("cgroup_disable=", cgroup_disable); |
6191 | 5290 | ||
6192 | static int __init cgroup_no_v1(char *str) | ||
6193 | { | ||
6194 | struct cgroup_subsys *ss; | ||
6195 | char *token; | ||
6196 | int i; | ||
6197 | |||
6198 | while ((token = strsep(&str, ",")) != NULL) { | ||
6199 | if (!*token) | ||
6200 | continue; | ||
6201 | |||
6202 | if (!strcmp(token, "all")) { | ||
6203 | cgroup_no_v1_mask = U16_MAX; | ||
6204 | break; | ||
6205 | } | ||
6206 | |||
6207 | for_each_subsys(ss, i) { | ||
6208 | if (strcmp(token, ss->name) && | ||
6209 | strcmp(token, ss->legacy_name)) | ||
6210 | continue; | ||
6211 | |||
6212 | cgroup_no_v1_mask |= 1 << i; | ||
6213 | } | ||
6214 | } | ||
6215 | return 1; | ||
6216 | } | ||
6217 | __setup("cgroup_no_v1=", cgroup_no_v1); | ||
6218 | |||
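For example (controller names are illustrative), booting with cgroup_no_v1=memory,blkio sets the corresponding bits in cgroup_no_v1_mask so those controllers cannot be bound to v1 hierarchies, while cgroup_no_v1=all masks every controller at once.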
6219 | /** | 5291 | /** |
6220 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry | 5292 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry |
6221 | * @dentry: directory dentry of interest | 5293 | * @dentry: directory dentry of interest |
@@ -6557,149 +5629,3 @@ void cgroup_bpf_update(struct cgroup *cgrp, | |||
6557 | mutex_unlock(&cgroup_mutex); | 5629 | mutex_unlock(&cgroup_mutex); |
6558 | } | 5630 | } |
6559 | #endif /* CONFIG_CGROUP_BPF */ | 5631 | #endif /* CONFIG_CGROUP_BPF */ |
6560 | |||
6561 | #ifdef CONFIG_CGROUP_DEBUG | ||
6562 | static struct cgroup_subsys_state * | ||
6563 | debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||
6564 | { | ||
6565 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | ||
6566 | |||
6567 | if (!css) | ||
6568 | return ERR_PTR(-ENOMEM); | ||
6569 | |||
6570 | return css; | ||
6571 | } | ||
6572 | |||
6573 | static void debug_css_free(struct cgroup_subsys_state *css) | ||
6574 | { | ||
6575 | kfree(css); | ||
6576 | } | ||
6577 | |||
6578 | static u64 debug_taskcount_read(struct cgroup_subsys_state *css, | ||
6579 | struct cftype *cft) | ||
6580 | { | ||
6581 | return cgroup_task_count(css->cgroup); | ||
6582 | } | ||
6583 | |||
6584 | static u64 current_css_set_read(struct cgroup_subsys_state *css, | ||
6585 | struct cftype *cft) | ||
6586 | { | ||
6587 | return (u64)(unsigned long)current->cgroups; | ||
6588 | } | ||
6589 | |||
6590 | static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, | ||
6591 | struct cftype *cft) | ||
6592 | { | ||
6593 | u64 count; | ||
6594 | |||
6595 | rcu_read_lock(); | ||
6596 | count = atomic_read(&task_css_set(current)->refcount); | ||
6597 | rcu_read_unlock(); | ||
6598 | return count; | ||
6599 | } | ||
6600 | |||
6601 | static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | ||
6602 | { | ||
6603 | struct cgrp_cset_link *link; | ||
6604 | struct css_set *cset; | ||
6605 | char *name_buf; | ||
6606 | |||
6607 | name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
6608 | if (!name_buf) | ||
6609 | return -ENOMEM; | ||
6610 | |||
6611 | spin_lock_irq(&css_set_lock); | ||
6612 | rcu_read_lock(); | ||
6613 | cset = rcu_dereference(current->cgroups); | ||
6614 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { | ||
6615 | struct cgroup *c = link->cgrp; | ||
6616 | |||
6617 | cgroup_name(c, name_buf, NAME_MAX + 1); | ||
6618 | seq_printf(seq, "Root %d group %s\n", | ||
6619 | c->root->hierarchy_id, name_buf); | ||
6620 | } | ||
6621 | rcu_read_unlock(); | ||
6622 | spin_unlock_irq(&css_set_lock); | ||
6623 | kfree(name_buf); | ||
6624 | return 0; | ||
6625 | } | ||
6626 | |||
6627 | #define MAX_TASKS_SHOWN_PER_CSS 25 | ||
6628 | static int cgroup_css_links_read(struct seq_file *seq, void *v) | ||
6629 | { | ||
6630 | struct cgroup_subsys_state *css = seq_css(seq); | ||
6631 | struct cgrp_cset_link *link; | ||
6632 | |||
6633 | spin_lock_irq(&css_set_lock); | ||
6634 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { | ||
6635 | struct css_set *cset = link->cset; | ||
6636 | struct task_struct *task; | ||
6637 | int count = 0; | ||
6638 | |||
6639 | seq_printf(seq, "css_set %p\n", cset); | ||
6640 | |||
6641 | list_for_each_entry(task, &cset->tasks, cg_list) { | ||
6642 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) | ||
6643 | goto overflow; | ||
6644 | seq_printf(seq, " task %d\n", task_pid_vnr(task)); | ||
6645 | } | ||
6646 | |||
6647 | list_for_each_entry(task, &cset->mg_tasks, cg_list) { | ||
6648 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) | ||
6649 | goto overflow; | ||
6650 | seq_printf(seq, " task %d\n", task_pid_vnr(task)); | ||
6651 | } | ||
6652 | continue; | ||
6653 | overflow: | ||
6654 | seq_puts(seq, " ...\n"); | ||
6655 | } | ||
6656 | spin_unlock_irq(&css_set_lock); | ||
6657 | return 0; | ||
6658 | } | ||
6659 | |||
6660 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) | ||
6661 | { | ||
6662 | return (!cgroup_is_populated(css->cgroup) && | ||
6663 | !css_has_online_children(&css->cgroup->self)); | ||
6664 | } | ||
6665 | |||
6666 | static struct cftype debug_files[] = { | ||
6667 | { | ||
6668 | .name = "taskcount", | ||
6669 | .read_u64 = debug_taskcount_read, | ||
6670 | }, | ||
6671 | |||
6672 | { | ||
6673 | .name = "current_css_set", | ||
6674 | .read_u64 = current_css_set_read, | ||
6675 | }, | ||
6676 | |||
6677 | { | ||
6678 | .name = "current_css_set_refcount", | ||
6679 | .read_u64 = current_css_set_refcount_read, | ||
6680 | }, | ||
6681 | |||
6682 | { | ||
6683 | .name = "current_css_set_cg_links", | ||
6684 | .seq_show = current_css_set_cg_links_read, | ||
6685 | }, | ||
6686 | |||
6687 | { | ||
6688 | .name = "cgroup_css_links", | ||
6689 | .seq_show = cgroup_css_links_read, | ||
6690 | }, | ||
6691 | |||
6692 | { | ||
6693 | .name = "releasable", | ||
6694 | .read_u64 = releasable_read, | ||
6695 | }, | ||
6696 | |||
6697 | { } /* terminate */ | ||
6698 | }; | ||
6699 | |||
6700 | struct cgroup_subsys debug_cgrp_subsys = { | ||
6701 | .css_alloc = debug_css_alloc, | ||
6702 | .css_free = debug_css_free, | ||
6703 | .legacy_cftypes = debug_files, | ||
6704 | }; | ||
6705 | #endif /* CONFIG_CGROUP_DEBUG */ | ||
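When CONFIG_CGROUP_DEBUG is set, this controller mounts like any other v1 controller (for instance mount -t cgroup -o debug none /mnt, path illustrative) and exposes read-only views such as taskcount, current_css_set_refcount and cgroup_css_links that surface the css_set bookkeeping inspected by the helpers above.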