aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorPaul Menage <menage@google.com>2007-10-19 02:39:30 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-19 14:53:36 -0400
commitddbcc7e8e50aefe467c01cac3dec71f118cd8ac2 (patch)
tree0881a031e669582f819d572339e955b04abfc3d2 /kernel
parent55a230aae650157720becc09cadb7d10efbf5013 (diff)
Task Control Groups: basic task cgroup framework
Generic Process Control Groups -------------------------- There have recently been various proposals floating around for resource management/accounting and other task grouping subsystems in the kernel, including ResGroups, User BeanCounters, NSProxy cgroups, and others. These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, or control other behaviour of the processes, and all implement this grouping in different ways. This patchset provides a framework for tracking and grouping processes into arbitrary "cgroups" and assigning arbitrary state to those groupings, in order to control the behaviour of the cgroup as an aggregate. The intention is that the various resource management and virtualization/cgroup efforts can also become task cgroup clients, with the result that: - the userspace APIs are (somewhat) normalised - it's easier to test e.g. the ResGroups CPU controller in conjunction with the BeanCounters memory controller, or use either of them as the resource-control portion of a virtual server system. - the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel This patch: Add the main task cgroups framework - the cgroup filesystem, and the basic structures for tracking membership and associating subsystem state objects to tasks. Signed-off-by: Paul Menage <menage@google.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: "Eric W. 
Biederman" <ebiederm@xmission.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Jackson <pj@sgi.com> Cc: Kirill Korotaev <dev@openvz.org> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cgroup.c1198
2 files changed, 1199 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 001bd3b65dd1..ea8c8a12e19a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_PM) += power/
36obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 36obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
37obj-$(CONFIG_KEXEC) += kexec.o 37obj-$(CONFIG_KEXEC) += kexec.o
38obj-$(CONFIG_COMPAT) += compat.o 38obj-$(CONFIG_COMPAT) += compat.o
39obj-$(CONFIG_CGROUPS) += cgroup.o
39obj-$(CONFIG_CPUSETS) += cpuset.o 40obj-$(CONFIG_CPUSETS) += cpuset.o
40obj-$(CONFIG_IKCONFIG) += configs.o 41obj-$(CONFIG_IKCONFIG) += configs.o
41obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 42obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
new file mode 100644
index 000000000000..6ba857bec71b
--- /dev/null
+++ b/kernel/cgroup.c
@@ -0,0 +1,1198 @@
1/*
2 * kernel/cgroup.c
3 *
4 * Generic process-grouping system.
5 *
6 * Based originally on the cpuset system, extracted by Paul Menage
7 * Copyright (C) 2006 Google, Inc
8 *
9 * Copyright notices from the original cpuset code:
10 * --------------------------------------------------
11 * Copyright (C) 2003 BULL SA.
12 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
13 *
14 * Portions derived from Patrick Mochel's sysfs code.
15 * sysfs is Copyright (c) 2001-3 Patrick Mochel
16 *
17 * 2003-10-10 Written by Simon Derr.
18 * 2003-10-22 Updates by Stephen Hemminger.
19 * 2004 May-July Rework by Paul Jackson.
20 * ---------------------------------------------------
21 *
22 * This file is subject to the terms and conditions of the GNU General Public
23 * License. See the file COPYING in the main directory of the Linux
24 * distribution for more details.
25 */
26
27#include <linux/cgroup.h>
28#include <linux/errno.h>
29#include <linux/fs.h>
30#include <linux/kernel.h>
31#include <linux/list.h>
32#include <linux/mm.h>
33#include <linux/mutex.h>
34#include <linux/mount.h>
35#include <linux/pagemap.h>
36#include <linux/rcupdate.h>
37#include <linux/sched.h>
38#include <linux/seq_file.h>
39#include <linux/slab.h>
40#include <linux/magic.h>
41#include <linux/spinlock.h>
42#include <linux/string.h>
43
44#include <asm/atomic.h>
45
46/* Generate an array of cgroup subsystem pointers */
47#define SUBSYS(_x) &_x ## _subsys,
48
49static struct cgroup_subsys *subsys[] = {
50#include <linux/cgroup_subsys.h>
51};
52
53/*
54 * A cgroupfs_root represents the root of a cgroup hierarchy,
55 * and may be associated with a superblock to form an active
56 * hierarchy
57 */
58struct cgroupfs_root {
59 struct super_block *sb;
60
61 /*
62 * The bitmask of subsystems intended to be attached to this
63 * hierarchy
64 */
65 unsigned long subsys_bits;
66
67 /* The bitmask of subsystems currently attached to this hierarchy */
68 unsigned long actual_subsys_bits;
69
70 /* A list running through the attached subsystems */
71 struct list_head subsys_list;
72
73 /* The root cgroup for this hierarchy */
74 struct cgroup top_cgroup;
75
76 /* Tracks how many cgroups are currently defined in hierarchy.*/
77 int number_of_cgroups;
78
79 /* A list running through the mounted hierarchies */
80 struct list_head root_list;
81
82 /* Hierarchy-specific flags */
83 unsigned long flags;
84};
85
86
87/*
88 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
89 * subsystems that are otherwise unattached - it never has more than a
90 * single cgroup, and all tasks are part of that cgroup.
91 */
92static struct cgroupfs_root rootnode;
93
94/* The list of hierarchy roots */
95
96static LIST_HEAD(roots);
97
98/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
99#define dummytop (&rootnode.top_cgroup)
100
101/* This flag indicates whether tasks in the fork and exit paths should
102 * take callback_mutex and check for fork/exit handlers to call. This
103 * avoids us having to do extra work in the fork/exit path if none of the
104 * subsystems need to be called.
105 */
106static int need_forkexit_callback;
107
108/* bits in struct cgroup flags field */
109enum {
110 CONT_REMOVED,
111};
112
113/* convenient tests for these bits */
114inline int cgroup_is_removed(const struct cgroup *cont)
115{
116 return test_bit(CONT_REMOVED, &cont->flags);
117}
118
/* Bit numbers used in the flags field of struct cgroupfs_root */
enum {
	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};

/*
 * for_each_subsys() iterates _ss over every subsystem attached to the
 * hierarchy _root (i.e. over _root->subsys_list, linked through the
 * subsystem's "sibling" member).
 *
 * _root is parenthesized so that any pointer expression can be passed
 * safely, not only a plain identifier or a postfix expression.
 */
#define for_each_subsys(_root, _ss) \
	list_for_each_entry(_ss, &(_root)->subsys_list, sibling)

/*
 * for_each_root() iterates _root over the active hierarchies, i.e. over
 * the global "roots" list linked through cgroupfs_root.root_list.
 */
#define for_each_root(_root) \
	list_for_each_entry(_root, &roots, root_list)
134
135/*
136 * There is one global cgroup mutex. We also require taking
137 * task_lock() when dereferencing a task's cgroup subsys pointers.
138 * See "The task_lock() exception", at the end of this comment.
139 *
140 * A task must hold cgroup_mutex to modify cgroups.
141 *
142 * Any task can increment and decrement the count field without lock.
143 * So in general, code holding cgroup_mutex can't rely on the count
144 * field not changing. However, if the count goes to zero, then only
145 * attach_task() can increment it again. Because a count of zero
146 * means that no tasks are currently attached, therefore there is no
147 * way a task attached to that cgroup can fork (the other way to
148 * increment the count). So code holding cgroup_mutex can safely
149 * assume that if the count is zero, it will stay zero. Similarly, if
150 * a task holds cgroup_mutex on a cgroup with zero count, it
151 * knows that the cgroup won't be removed, as cgroup_rmdir()
152 * needs that mutex.
153 *
154 * The cgroup_common_file_write handler for operations that modify
155 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
156 * single threading all such cgroup modifications across the system.
157 *
158 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
159 * (usually) take cgroup_mutex. These are the two most performance
160 * critical pieces of code here. The exception occurs on cgroup_exit(),
161 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
162 * is taken, and if the cgroup count is zero, a usermode call made
163 * to /sbin/cgroup_release_agent with the name of the cgroup (path
164 * relative to the root of cgroup file system) as the argument.
165 *
166 * A cgroup can only be deleted if both its 'count' of using tasks
167 * is zero, and its list of 'children' cgroups is empty. Since all
168 * tasks in the system use _some_ cgroup, and since there is always at
169 * least one task in the system (init, pid == 1), therefore, top_cgroup
170 * always has either children cgroups and/or using tasks. So we don't
171 * need a special hack to ensure that top_cgroup cannot be deleted.
172 *
173 * The task_lock() exception
174 *
175 * The need for this exception arises from the action of
176 * attach_task(), which overwrites one task's cgroup pointer with
177 * another. It does so using cgroup_mutex, however there are
178 * several performance critical places that need to reference
179 * task->cgroup without the expense of grabbing a system global
180 * mutex. Therefore except as noted below, when dereferencing or, as
181 * in attach_task(), modifying a task's cgroup pointer we use
182 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
183 * the task_struct routinely used for such matters.
184 *
185 * P.S. One more locking exception. RCU is used to guard the
186 * update of a task's cgroup pointer by attach_task()
187 */
188
189static DEFINE_MUTEX(cgroup_mutex);
190
191/**
192 * cgroup_lock - lock out any changes to cgroup structures
193 *
194 */
195
196void cgroup_lock(void)
197{
198 mutex_lock(&cgroup_mutex);
199}
200
201/**
202 * cgroup_unlock - release lock on cgroup changes
203 *
204 * Undo the lock taken in a previous cgroup_lock() call.
205 */
206
207void cgroup_unlock(void)
208{
209 mutex_unlock(&cgroup_mutex);
210}
211
212/*
213 * A couple of forward declarations required, due to cyclic reference loop:
214 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
215 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
216 * -> cgroup_mkdir.
217 */
218
219static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
220static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
221static int cgroup_populate_dir(struct cgroup *cont);
222static struct inode_operations cgroup_dir_inode_operations;
223
224static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
225{
226 struct inode *inode = new_inode(sb);
227 static struct backing_dev_info cgroup_backing_dev_info = {
228 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
229 };
230
231 if (inode) {
232 inode->i_mode = mode;
233 inode->i_uid = current->fsuid;
234 inode->i_gid = current->fsgid;
235 inode->i_blocks = 0;
236 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
237 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
238 }
239 return inode;
240}
241
242static void cgroup_diput(struct dentry *dentry, struct inode *inode)
243{
244 /* is dentry a directory ? if so, kfree() associated cgroup */
245 if (S_ISDIR(inode->i_mode)) {
246 struct cgroup *cont = dentry->d_fsdata;
247 BUG_ON(!(cgroup_is_removed(cont)));
248 kfree(cont);
249 }
250 iput(inode);
251}
252
253static void remove_dir(struct dentry *d)
254{
255 struct dentry *parent = dget(d->d_parent);
256
257 d_delete(d);
258 simple_rmdir(parent->d_inode, d);
259 dput(parent);
260}
261
262static void cgroup_clear_directory(struct dentry *dentry)
263{
264 struct list_head *node;
265
266 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
267 spin_lock(&dcache_lock);
268 node = dentry->d_subdirs.next;
269 while (node != &dentry->d_subdirs) {
270 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
271 list_del_init(node);
272 if (d->d_inode) {
273 /* This should never be called on a cgroup
274 * directory with child cgroups */
275 BUG_ON(d->d_inode->i_mode & S_IFDIR);
276 d = dget_locked(d);
277 spin_unlock(&dcache_lock);
278 d_delete(d);
279 simple_unlink(dentry->d_inode, d);
280 dput(d);
281 spin_lock(&dcache_lock);
282 }
283 node = dentry->d_subdirs.next;
284 }
285 spin_unlock(&dcache_lock);
286}
287
288/*
289 * NOTE : the dentry must have been dget()'ed
290 */
291static void cgroup_d_remove_dir(struct dentry *dentry)
292{
293 cgroup_clear_directory(dentry);
294
295 spin_lock(&dcache_lock);
296 list_del_init(&dentry->d_u.d_child);
297 spin_unlock(&dcache_lock);
298 remove_dir(dentry);
299}
300
301static int rebind_subsystems(struct cgroupfs_root *root,
302 unsigned long final_bits)
303{
304 unsigned long added_bits, removed_bits;
305 struct cgroup *cont = &root->top_cgroup;
306 int i;
307
308 removed_bits = root->actual_subsys_bits & ~final_bits;
309 added_bits = final_bits & ~root->actual_subsys_bits;
310 /* Check that any added subsystems are currently free */
311 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
312 unsigned long long bit = 1ull << i;
313 struct cgroup_subsys *ss = subsys[i];
314 if (!(bit & added_bits))
315 continue;
316 if (ss->root != &rootnode) {
317 /* Subsystem isn't free */
318 return -EBUSY;
319 }
320 }
321
322 /* Currently we don't handle adding/removing subsystems when
323 * any child cgroups exist. This is theoretically supportable
324 * but involves complex error handling, so it's being left until
325 * later */
326 if (!list_empty(&cont->children))
327 return -EBUSY;
328
329 /* Process each subsystem */
330 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
331 struct cgroup_subsys *ss = subsys[i];
332 unsigned long bit = 1UL << i;
333 if (bit & added_bits) {
334 /* We're binding this subsystem to this hierarchy */
335 BUG_ON(cont->subsys[i]);
336 BUG_ON(!dummytop->subsys[i]);
337 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
338 cont->subsys[i] = dummytop->subsys[i];
339 cont->subsys[i]->cgroup = cont;
340 list_add(&ss->sibling, &root->subsys_list);
341 rcu_assign_pointer(ss->root, root);
342 if (ss->bind)
343 ss->bind(ss, cont);
344
345 } else if (bit & removed_bits) {
346 /* We're removing this subsystem */
347 BUG_ON(cont->subsys[i] != dummytop->subsys[i]);
348 BUG_ON(cont->subsys[i]->cgroup != cont);
349 if (ss->bind)
350 ss->bind(ss, dummytop);
351 dummytop->subsys[i]->cgroup = dummytop;
352 cont->subsys[i] = NULL;
353 rcu_assign_pointer(subsys[i]->root, &rootnode);
354 list_del(&ss->sibling);
355 } else if (bit & final_bits) {
356 /* Subsystem state should already exist */
357 BUG_ON(!cont->subsys[i]);
358 } else {
359 /* Subsystem state shouldn't exist */
360 BUG_ON(cont->subsys[i]);
361 }
362 }
363 root->subsys_bits = root->actual_subsys_bits = final_bits;
364 synchronize_rcu();
365
366 return 0;
367}
368
369static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
370{
371 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
372 struct cgroup_subsys *ss;
373
374 mutex_lock(&cgroup_mutex);
375 for_each_subsys(root, ss)
376 seq_printf(seq, ",%s", ss->name);
377 if (test_bit(ROOT_NOPREFIX, &root->flags))
378 seq_puts(seq, ",noprefix");
379 mutex_unlock(&cgroup_mutex);
380 return 0;
381}
382
/* Result of parsing a cgroupfs mount option string */
struct cgroup_sb_opts {
	/* Bitmask of requested subsystems (bit i <=> subsys[i]) */
	unsigned long subsys_bits;
	/* Requested hierarchy flag bits (e.g. ROOT_NOPREFIX) */
	unsigned long flags;
};
387
388/* Convert a hierarchy specifier into a bitmask of subsystems and
389 * flags. */
390static int parse_cgroupfs_options(char *data,
391 struct cgroup_sb_opts *opts)
392{
393 char *token, *o = data ?: "all";
394
395 opts->subsys_bits = 0;
396 opts->flags = 0;
397
398 while ((token = strsep(&o, ",")) != NULL) {
399 if (!*token)
400 return -EINVAL;
401 if (!strcmp(token, "all")) {
402 opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
403 } else if (!strcmp(token, "noprefix")) {
404 set_bit(ROOT_NOPREFIX, &opts->flags);
405 } else {
406 struct cgroup_subsys *ss;
407 int i;
408 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
409 ss = subsys[i];
410 if (!strcmp(token, ss->name)) {
411 set_bit(i, &opts->subsys_bits);
412 break;
413 }
414 }
415 if (i == CGROUP_SUBSYS_COUNT)
416 return -ENOENT;
417 }
418 }
419
420 /* We can't have an empty hierarchy */
421 if (!opts->subsys_bits)
422 return -EINVAL;
423
424 return 0;
425}
426
427static int cgroup_remount(struct super_block *sb, int *flags, char *data)
428{
429 int ret = 0;
430 struct cgroupfs_root *root = sb->s_fs_info;
431 struct cgroup *cont = &root->top_cgroup;
432 struct cgroup_sb_opts opts;
433
434 mutex_lock(&cont->dentry->d_inode->i_mutex);
435 mutex_lock(&cgroup_mutex);
436
437 /* See what subsystems are wanted */
438 ret = parse_cgroupfs_options(data, &opts);
439 if (ret)
440 goto out_unlock;
441
442 /* Don't allow flags to change at remount */
443 if (opts.flags != root->flags) {
444 ret = -EINVAL;
445 goto out_unlock;
446 }
447
448 ret = rebind_subsystems(root, opts.subsys_bits);
449
450 /* (re)populate subsystem files */
451 if (!ret)
452 cgroup_populate_dir(cont);
453
454 out_unlock:
455 mutex_unlock(&cgroup_mutex);
456 mutex_unlock(&cont->dentry->d_inode->i_mutex);
457 return ret;
458}
459
460static struct super_operations cgroup_ops = {
461 .statfs = simple_statfs,
462 .drop_inode = generic_delete_inode,
463 .show_options = cgroup_show_options,
464 .remount_fs = cgroup_remount,
465};
466
467static void init_cgroup_root(struct cgroupfs_root *root)
468{
469 struct cgroup *cont = &root->top_cgroup;
470 INIT_LIST_HEAD(&root->subsys_list);
471 INIT_LIST_HEAD(&root->root_list);
472 root->number_of_cgroups = 1;
473 cont->root = root;
474 cont->top_cgroup = cont;
475 INIT_LIST_HEAD(&cont->sibling);
476 INIT_LIST_HEAD(&cont->children);
477}
478
479static int cgroup_test_super(struct super_block *sb, void *data)
480{
481 struct cgroupfs_root *new = data;
482 struct cgroupfs_root *root = sb->s_fs_info;
483
484 /* First check subsystems */
485 if (new->subsys_bits != root->subsys_bits)
486 return 0;
487
488 /* Next check flags */
489 if (new->flags != root->flags)
490 return 0;
491
492 return 1;
493}
494
495static int cgroup_set_super(struct super_block *sb, void *data)
496{
497 int ret;
498 struct cgroupfs_root *root = data;
499
500 ret = set_anon_super(sb, NULL);
501 if (ret)
502 return ret;
503
504 sb->s_fs_info = root;
505 root->sb = sb;
506
507 sb->s_blocksize = PAGE_CACHE_SIZE;
508 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
509 sb->s_magic = CGROUP_SUPER_MAGIC;
510 sb->s_op = &cgroup_ops;
511
512 return 0;
513}
514
515static int cgroup_get_rootdir(struct super_block *sb)
516{
517 struct inode *inode =
518 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
519 struct dentry *dentry;
520
521 if (!inode)
522 return -ENOMEM;
523
524 inode->i_op = &simple_dir_inode_operations;
525 inode->i_fop = &simple_dir_operations;
526 inode->i_op = &cgroup_dir_inode_operations;
527 /* directories start off with i_nlink == 2 (for "." entry) */
528 inc_nlink(inode);
529 dentry = d_alloc_root(inode);
530 if (!dentry) {
531 iput(inode);
532 return -ENOMEM;
533 }
534 sb->s_root = dentry;
535 return 0;
536}
537
538static int cgroup_get_sb(struct file_system_type *fs_type,
539 int flags, const char *unused_dev_name,
540 void *data, struct vfsmount *mnt)
541{
542 struct cgroup_sb_opts opts;
543 int ret = 0;
544 struct super_block *sb;
545 struct cgroupfs_root *root;
546
547 /* First find the desired set of subsystems */
548 ret = parse_cgroupfs_options(data, &opts);
549 if (ret)
550 return ret;
551
552 root = kzalloc(sizeof(*root), GFP_KERNEL);
553 if (!root)
554 return -ENOMEM;
555
556 init_cgroup_root(root);
557 root->subsys_bits = opts.subsys_bits;
558 root->flags = opts.flags;
559
560 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
561
562 if (IS_ERR(sb)) {
563 kfree(root);
564 return PTR_ERR(sb);
565 }
566
567 if (sb->s_fs_info != root) {
568 /* Reusing an existing superblock */
569 BUG_ON(sb->s_root == NULL);
570 kfree(root);
571 root = NULL;
572 } else {
573 /* New superblock */
574 struct cgroup *cont = &root->top_cgroup;
575
576 BUG_ON(sb->s_root != NULL);
577
578 ret = cgroup_get_rootdir(sb);
579 if (ret)
580 goto drop_new_super;
581
582 mutex_lock(&cgroup_mutex);
583
584 ret = rebind_subsystems(root, root->subsys_bits);
585 if (ret == -EBUSY) {
586 mutex_unlock(&cgroup_mutex);
587 goto drop_new_super;
588 }
589
590 /* EBUSY should be the only error here */
591 BUG_ON(ret);
592
593 list_add(&root->root_list, &roots);
594
595 sb->s_root->d_fsdata = &root->top_cgroup;
596 root->top_cgroup.dentry = sb->s_root;
597
598 BUG_ON(!list_empty(&cont->sibling));
599 BUG_ON(!list_empty(&cont->children));
600 BUG_ON(root->number_of_cgroups != 1);
601
602 /*
603 * I believe that it's safe to nest i_mutex inside
604 * cgroup_mutex in this case, since no-one else can
605 * be accessing this directory yet. But we still need
606 * to teach lockdep that this is the case - currently
607 * a cgroupfs remount triggers a lockdep warning
608 */
609 mutex_lock(&cont->dentry->d_inode->i_mutex);
610 cgroup_populate_dir(cont);
611 mutex_unlock(&cont->dentry->d_inode->i_mutex);
612 mutex_unlock(&cgroup_mutex);
613 }
614
615 return simple_set_mnt(mnt, sb);
616
617 drop_new_super:
618 up_write(&sb->s_umount);
619 deactivate_super(sb);
620 return ret;
621}
622
623static void cgroup_kill_sb(struct super_block *sb) {
624 struct cgroupfs_root *root = sb->s_fs_info;
625 struct cgroup *cont = &root->top_cgroup;
626 int ret;
627
628 BUG_ON(!root);
629
630 BUG_ON(root->number_of_cgroups != 1);
631 BUG_ON(!list_empty(&cont->children));
632 BUG_ON(!list_empty(&cont->sibling));
633
634 mutex_lock(&cgroup_mutex);
635
636 /* Rebind all subsystems back to the default hierarchy */
637 ret = rebind_subsystems(root, 0);
638 /* Shouldn't be able to fail ... */
639 BUG_ON(ret);
640
641 if (!list_empty(&root->root_list))
642 list_del(&root->root_list);
643 mutex_unlock(&cgroup_mutex);
644
645 kfree(root);
646 kill_litter_super(sb);
647}
648
649static struct file_system_type cgroup_fs_type = {
650 .name = "cgroup",
651 .get_sb = cgroup_get_sb,
652 .kill_sb = cgroup_kill_sb,
653};
654
655static inline struct cgroup *__d_cont(struct dentry *dentry)
656{
657 return dentry->d_fsdata;
658}
659
660static inline struct cftype *__d_cft(struct dentry *dentry)
661{
662 return dentry->d_fsdata;
663}
664
665/*
666 * Called with cgroup_mutex held. Writes path of cgroup into buf.
667 * Returns 0 on success, -errno on error.
668 */
669int cgroup_path(const struct cgroup *cont, char *buf, int buflen)
670{
671 char *start;
672
673 if (cont == dummytop) {
674 /*
675 * Inactive subsystems have no dentry for their root
676 * cgroup
677 */
678 strcpy(buf, "/");
679 return 0;
680 }
681
682 start = buf + buflen;
683
684 *--start = '\0';
685 for (;;) {
686 int len = cont->dentry->d_name.len;
687 if ((start -= len) < buf)
688 return -ENAMETOOLONG;
689 memcpy(start, cont->dentry->d_name.name, len);
690 cont = cont->parent;
691 if (!cont)
692 break;
693 if (!cont->parent)
694 continue;
695 if (--start < buf)
696 return -ENAMETOOLONG;
697 *start = '/';
698 }
699 memmove(buf, start, buf + buflen - start);
700 return 0;
701}
702
/*
 * The kinds of files and directories found in a cgroup file system.
 * NOTE(review): no user of these values is visible in this part of the
 * file; the per-value meaning is inferred from the names only.
 */
enum cgroup_filetype {
	FILE_ROOT,
	FILE_DIR,
	FILE_TASKLIST,
};
710
711static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
712 size_t nbytes, loff_t *ppos)
713{
714 struct cftype *cft = __d_cft(file->f_dentry);
715 struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
716
717 if (!cft)
718 return -ENODEV;
719 if (!cft->write)
720 return -EINVAL;
721
722 return cft->write(cont, cft, file, buf, nbytes, ppos);
723}
724
725static ssize_t cgroup_read_uint(struct cgroup *cont, struct cftype *cft,
726 struct file *file,
727 char __user *buf, size_t nbytes,
728 loff_t *ppos)
729{
730 char tmp[64];
731 u64 val = cft->read_uint(cont, cft);
732 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
733
734 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
735}
736
737static ssize_t cgroup_file_read(struct file *file, char __user *buf,
738 size_t nbytes, loff_t *ppos)
739{
740 struct cftype *cft = __d_cft(file->f_dentry);
741 struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
742
743 if (!cft)
744 return -ENODEV;
745
746 if (cft->read)
747 return cft->read(cont, cft, file, buf, nbytes, ppos);
748 if (cft->read_uint)
749 return cgroup_read_uint(cont, cft, file, buf, nbytes, ppos);
750 return -EINVAL;
751}
752
753static int cgroup_file_open(struct inode *inode, struct file *file)
754{
755 int err;
756 struct cftype *cft;
757
758 err = generic_file_open(inode, file);
759 if (err)
760 return err;
761
762 cft = __d_cft(file->f_dentry);
763 if (!cft)
764 return -ENODEV;
765 if (cft->open)
766 err = cft->open(inode, file);
767 else
768 err = 0;
769
770 return err;
771}
772
773static int cgroup_file_release(struct inode *inode, struct file *file)
774{
775 struct cftype *cft = __d_cft(file->f_dentry);
776 if (cft->release)
777 return cft->release(inode, file);
778 return 0;
779}
780
781/*
782 * cgroup_rename - Only allow simple rename of directories in place.
783 */
784static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
785 struct inode *new_dir, struct dentry *new_dentry)
786{
787 if (!S_ISDIR(old_dentry->d_inode->i_mode))
788 return -ENOTDIR;
789 if (new_dentry->d_inode)
790 return -EEXIST;
791 if (old_dir != new_dir)
792 return -EIO;
793 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
794}
795
796static struct file_operations cgroup_file_operations = {
797 .read = cgroup_file_read,
798 .write = cgroup_file_write,
799 .llseek = generic_file_llseek,
800 .open = cgroup_file_open,
801 .release = cgroup_file_release,
802};
803
804static struct inode_operations cgroup_dir_inode_operations = {
805 .lookup = simple_lookup,
806 .mkdir = cgroup_mkdir,
807 .rmdir = cgroup_rmdir,
808 .rename = cgroup_rename,
809};
810
811static int cgroup_create_file(struct dentry *dentry, int mode,
812 struct super_block *sb)
813{
814 static struct dentry_operations cgroup_dops = {
815 .d_iput = cgroup_diput,
816 };
817
818 struct inode *inode;
819
820 if (!dentry)
821 return -ENOENT;
822 if (dentry->d_inode)
823 return -EEXIST;
824
825 inode = cgroup_new_inode(mode, sb);
826 if (!inode)
827 return -ENOMEM;
828
829 if (S_ISDIR(mode)) {
830 inode->i_op = &cgroup_dir_inode_operations;
831 inode->i_fop = &simple_dir_operations;
832
833 /* start off with i_nlink == 2 (for "." entry) */
834 inc_nlink(inode);
835
836 /* start with the directory inode held, so that we can
837 * populate it without racing with another mkdir */
838 mutex_lock(&inode->i_mutex);
839 } else if (S_ISREG(mode)) {
840 inode->i_size = 0;
841 inode->i_fop = &cgroup_file_operations;
842 }
843 dentry->d_op = &cgroup_dops;
844 d_instantiate(dentry, inode);
845 dget(dentry); /* Extra count - pin the dentry in core */
846 return 0;
847}
848
849/*
850 * cgroup_create_dir - create a directory for an object.
851 * cont: the cgroup we create the directory for.
852 * It must have a valid ->parent field
853 * And we are going to fill its ->dentry field.
854 * dentry: dentry of the new container
855 * mode: mode to set on new directory.
856 */
857static int cgroup_create_dir(struct cgroup *cont, struct dentry *dentry,
858 int mode)
859{
860 struct dentry *parent;
861 int error = 0;
862
863 parent = cont->parent->dentry;
864 error = cgroup_create_file(dentry, S_IFDIR | mode, cont->root->sb);
865 if (!error) {
866 dentry->d_fsdata = cont;
867 inc_nlink(parent->d_inode);
868 cont->dentry = dentry;
869 dget(dentry);
870 }
871 dput(dentry);
872
873 return error;
874}
875
876int cgroup_add_file(struct cgroup *cont,
877 struct cgroup_subsys *subsys,
878 const struct cftype *cft)
879{
880 struct dentry *dir = cont->dentry;
881 struct dentry *dentry;
882 int error;
883
884 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
885 if (subsys && !test_bit(ROOT_NOPREFIX, &cont->root->flags)) {
886 strcpy(name, subsys->name);
887 strcat(name, ".");
888 }
889 strcat(name, cft->name);
890 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
891 dentry = lookup_one_len(name, dir, strlen(name));
892 if (!IS_ERR(dentry)) {
893 error = cgroup_create_file(dentry, 0644 | S_IFREG,
894 cont->root->sb);
895 if (!error)
896 dentry->d_fsdata = (void *)cft;
897 dput(dentry);
898 } else
899 error = PTR_ERR(dentry);
900 return error;
901}
902
903int cgroup_add_files(struct cgroup *cont,
904 struct cgroup_subsys *subsys,
905 const struct cftype cft[],
906 int count)
907{
908 int i, err;
909 for (i = 0; i < count; i++) {
910 err = cgroup_add_file(cont, subsys, &cft[i]);
911 if (err)
912 return err;
913 }
914 return 0;
915}
916
917static int cgroup_populate_dir(struct cgroup *cont)
918{
919 int err;
920 struct cgroup_subsys *ss;
921
922 /* First clear out any existing files */
923 cgroup_clear_directory(cont->dentry);
924
925 for_each_subsys(cont->root, ss) {
926 if (ss->populate && (err = ss->populate(ss, cont)) < 0)
927 return err;
928 }
929
930 return 0;
931}
932
933static void init_cgroup_css(struct cgroup_subsys_state *css,
934 struct cgroup_subsys *ss,
935 struct cgroup *cont)
936{
937 css->cgroup = cont;
938 atomic_set(&css->refcnt, 0);
939 css->flags = 0;
940 if (cont == dummytop)
941 set_bit(CSS_ROOT, &css->flags);
942 BUG_ON(cont->subsys[ss->subsys_id]);
943 cont->subsys[ss->subsys_id] = css;
944}
945
946/*
947 * cgroup_create - create a cgroup
948 * parent: cgroup that will be parent of the new cgroup.
949 * name: name of the new cgroup. Will be strcpy'ed.
950 * mode: mode to set on new inode
951 *
952 * Must be called with the mutex on the parent inode held
953 */
954
955static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
956 int mode)
957{
958 struct cgroup *cont;
959 struct cgroupfs_root *root = parent->root;
960 int err = 0;
961 struct cgroup_subsys *ss;
962 struct super_block *sb = root->sb;
963
964 cont = kzalloc(sizeof(*cont), GFP_KERNEL);
965 if (!cont)
966 return -ENOMEM;
967
968 /* Grab a reference on the superblock so the hierarchy doesn't
969 * get deleted on unmount if there are child cgroups. This
970 * can be done outside cgroup_mutex, since the sb can't
971 * disappear while someone has an open control file on the
972 * fs */
973 atomic_inc(&sb->s_active);
974
975 mutex_lock(&cgroup_mutex);
976
977 cont->flags = 0;
978 INIT_LIST_HEAD(&cont->sibling);
979 INIT_LIST_HEAD(&cont->children);
980
981 cont->parent = parent;
982 cont->root = parent->root;
983 cont->top_cgroup = parent->top_cgroup;
984
985 for_each_subsys(root, ss) {
986 struct cgroup_subsys_state *css = ss->create(ss, cont);
987 if (IS_ERR(css)) {
988 err = PTR_ERR(css);
989 goto err_destroy;
990 }
991 init_cgroup_css(css, ss, cont);
992 }
993
994 list_add(&cont->sibling, &cont->parent->children);
995 root->number_of_cgroups++;
996
997 err = cgroup_create_dir(cont, dentry, mode);
998 if (err < 0)
999 goto err_remove;
1000
1001 /* The cgroup directory was pre-locked for us */
1002 BUG_ON(!mutex_is_locked(&cont->dentry->d_inode->i_mutex));
1003
1004 err = cgroup_populate_dir(cont);
1005 /* If err < 0, we have a half-filled directory - oh well ;) */
1006
1007 mutex_unlock(&cgroup_mutex);
1008 mutex_unlock(&cont->dentry->d_inode->i_mutex);
1009
1010 return 0;
1011
1012 err_remove:
1013
1014 list_del(&cont->sibling);
1015 root->number_of_cgroups--;
1016
1017 err_destroy:
1018
1019 for_each_subsys(root, ss) {
1020 if (cont->subsys[ss->subsys_id])
1021 ss->destroy(ss, cont);
1022 }
1023
1024 mutex_unlock(&cgroup_mutex);
1025
1026 /* Release the reference count that we took on the superblock */
1027 deactivate_super(sb);
1028
1029 kfree(cont);
1030 return err;
1031}
1032
1033static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1034{
1035 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
1036
1037 /* the vfs holds inode->i_mutex already */
1038 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
1039}
1040
1041static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
1042{
1043 struct cgroup *cont = dentry->d_fsdata;
1044 struct dentry *d;
1045 struct cgroup *parent;
1046 struct cgroup_subsys *ss;
1047 struct super_block *sb;
1048 struct cgroupfs_root *root;
1049 int css_busy = 0;
1050
1051 /* the vfs holds both inode->i_mutex already */
1052
1053 mutex_lock(&cgroup_mutex);
1054 if (atomic_read(&cont->count) != 0) {
1055 mutex_unlock(&cgroup_mutex);
1056 return -EBUSY;
1057 }
1058 if (!list_empty(&cont->children)) {
1059 mutex_unlock(&cgroup_mutex);
1060 return -EBUSY;
1061 }
1062
1063 parent = cont->parent;
1064 root = cont->root;
1065 sb = root->sb;
1066
1067 /* Check the reference count on each subsystem. Since we
1068 * already established that there are no tasks in the
1069 * cgroup, if the css refcount is also 0, then there should
1070 * be no outstanding references, so the subsystem is safe to
1071 * destroy */
1072 for_each_subsys(root, ss) {
1073 struct cgroup_subsys_state *css;
1074 css = cont->subsys[ss->subsys_id];
1075 if (atomic_read(&css->refcnt)) {
1076 css_busy = 1;
1077 break;
1078 }
1079 }
1080 if (css_busy) {
1081 mutex_unlock(&cgroup_mutex);
1082 return -EBUSY;
1083 }
1084
1085 for_each_subsys(root, ss) {
1086 if (cont->subsys[ss->subsys_id])
1087 ss->destroy(ss, cont);
1088 }
1089
1090 set_bit(CONT_REMOVED, &cont->flags);
1091 /* delete my sibling from parent->children */
1092 list_del(&cont->sibling);
1093 spin_lock(&cont->dentry->d_lock);
1094 d = dget(cont->dentry);
1095 cont->dentry = NULL;
1096 spin_unlock(&d->d_lock);
1097
1098 cgroup_d_remove_dir(d);
1099 dput(d);
1100 root->number_of_cgroups--;
1101
1102 mutex_unlock(&cgroup_mutex);
1103 /* Drop the active superblock reference that we took when we
1104 * created the cgroup */
1105 deactivate_super(sb);
1106 return 0;
1107}
1108
1109static void cgroup_init_subsys(struct cgroup_subsys *ss)
1110{
1111 struct task_struct *g, *p;
1112 struct cgroup_subsys_state *css;
1113 printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name);
1114
1115 /* Create the top cgroup state for this subsystem */
1116 ss->root = &rootnode;
1117 css = ss->create(ss, dummytop);
1118 /* We don't handle early failures gracefully */
1119 BUG_ON(IS_ERR(css));
1120 init_cgroup_css(css, ss, dummytop);
1121
1122 /* Update all tasks to contain a subsys pointer to this state
1123 * - since the subsystem is newly registered, all tasks are in
1124 * the subsystem's top cgroup. */
1125
1126 /* If this subsystem requested that it be notified with fork
1127 * events, we should send it one now for every process in the
1128 * system */
1129
1130 read_lock(&tasklist_lock);
1131 init_task.cgroups.subsys[ss->subsys_id] = css;
1132 if (ss->fork)
1133 ss->fork(ss, &init_task);
1134
1135 do_each_thread(g, p) {
1136 printk(KERN_INFO "Setting task %p css to %p (%d)\n", css, p, p->pid);
1137 p->cgroups.subsys[ss->subsys_id] = css;
1138 if (ss->fork)
1139 ss->fork(ss, p);
1140 } while_each_thread(g, p);
1141 read_unlock(&tasklist_lock);
1142
1143 need_forkexit_callback |= ss->fork || ss->exit;
1144
1145 ss->active = 1;
1146}
1147
1148/**
1149 * cgroup_init_early - initialize cgroups at system boot, and
1150 * initialize any subsystems that request early init.
1151 */
1152int __init cgroup_init_early(void)
1153{
1154 int i;
1155 init_cgroup_root(&rootnode);
1156 list_add(&rootnode.root_list, &roots);
1157
1158 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1159 struct cgroup_subsys *ss = subsys[i];
1160
1161 BUG_ON(!ss->name);
1162 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
1163 BUG_ON(!ss->create);
1164 BUG_ON(!ss->destroy);
1165 if (ss->subsys_id != i) {
1166 printk(KERN_ERR "Subsys %s id == %d\n",
1167 ss->name, ss->subsys_id);
1168 BUG();
1169 }
1170
1171 if (ss->early_init)
1172 cgroup_init_subsys(ss);
1173 }
1174 return 0;
1175}
1176
1177/**
1178 * cgroup_init - register cgroup filesystem and /proc file, and
1179 * initialize any subsystems that didn't request early init.
1180 */
1181int __init cgroup_init(void)
1182{
1183 int err;
1184 int i;
1185
1186 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1187 struct cgroup_subsys *ss = subsys[i];
1188 if (!ss->early_init)
1189 cgroup_init_subsys(ss);
1190 }
1191
1192 err = register_filesystem(&cgroup_fs_type);
1193 if (err < 0)
1194 goto out;
1195
1196out:
1197 return err;
1198}