Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c | 437 ++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 402 insertions(+), 35 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e14db9c089b9..382109b5baeb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -94,7 +94,6 @@ struct cgroupfs_root {
         char release_agent_path[PATH_MAX];
 };
 
-
 /*
  * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
  * subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
  */
 static struct cgroupfs_root rootnode;
 
+/*
+ * CSS ID -- ID per subsys's cgroup subsys state (CSS). Used only when
+ * cgroup_subsys->use_id != 0.
+ */
+#define CSS_ID_MAX	(65535)
+struct css_id {
+	/*
+	 * The css to which this ID points. This pointer is set to a valid
+	 * value after a cgroup is populated. If a cgroup is removed, this
+	 * will be NULL. This pointer is expected to be RCU-safe because
+	 * destroy() is called after synchronize_rcu(). But for safe use,
+	 * css_is_removed() or css_tryget() should be used to avoid races.
+	 */
+	struct cgroup_subsys_state *css;
+	/*
+	 * ID of this css.
+	 */
+	unsigned short id;
+	/*
+	 * Depth in the hierarchy to which this ID belongs.
+	 */
+	unsigned short depth;
+	/*
+	 * The ID is freed by RCU (and the lookup routine is RCU-safe).
+	 */
+	struct rcu_head rcu_head;
+	/*
+	 * Hierarchy this CSS ID belongs to.
+	 */
+	unsigned short stack[0]; /* array of length (depth + 1) */
+};
+
+
 /* The list of hierarchy roots */
 
 static LIST_HEAD(roots);
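
The stack[] member above is what makes ancestry checks O(1): each css_id records the IDs of all of its ancestors, indexed by depth. A minimal userspace sketch of the same encoding (the toy_id type and its fixed-size array are hypothetical stand-ins for the kernel's variable-length stack[0]):

    #include <stdio.h>

    /* Hypothetical userspace model of struct css_id's ancestry stack. */
    struct toy_id {
        unsigned short id;
        unsigned short depth;
        unsigned short stack[8];    /* stack[0..depth]: root..self IDs */
    };

    /* Mirrors css_is_ancestor(): root is an ancestor of child iff the
     * child's stack records root's ID at root's depth. */
    static int is_ancestor(const struct toy_id *child, const struct toy_id *root)
    {
        if (child->depth < root->depth)
            return 0;
        return child->stack[root->depth] == root->id;
    }

    int main(void)
    {
        struct toy_id root  = { .id = 1, .depth = 0, .stack = { 1 } };
        struct toy_id child = { .id = 5, .depth = 2, .stack = { 1, 3, 5 } };
        struct toy_id other = { .id = 7, .depth = 1, .stack = { 1, 7 } };

        printf("%d %d\n", is_ancestor(&child, &root),   /* 1 */
                          is_ancestor(&child, &other)); /* 0 */
        return 0;
    }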
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
 
+static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+
 /* css_set_lock protects the list of css_set objects, and the
  * chain of tasks off each css_set. Nests outside task->alloc_lock
  * due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
 	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
+static int alloc_css_id(struct cgroup_subsys *ss,
+			struct cgroup *parent, struct cgroup *child);
+
 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
@@ -585,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  * Call subsys's pre_destroy handler.
  * This is called before css refcnt check.
  */
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
+	int ret = 0;
+
 	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy)
-			ss->pre_destroy(ss, cgrp);
-	return;
+		if (ss->pre_destroy) {
+			ret = ss->pre_destroy(ss, cgrp);
+			if (ret)
+				break;
+		}
+	return ret;
 }
 
 static void free_cgroup_rcu(struct rcu_head *obj)
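
cgroup_call_pre_destroy() now returns an int and stops at the first subsystem that objects, so rmdir() can propagate the error instead of ignoring it. A small userspace model of that contract (the handler table and function names are hypothetical; the kernel iterates the hierarchy's subsystems):

    #include <errno.h>
    #include <stdio.h>

    typedef int (*pre_destroy_fn)(void);

    static int subsys_ok(void)   { return 0; }
    static int subsys_busy(void) { return -EBUSY; }

    /* Call each handler in turn; the first non-zero return aborts. */
    static int call_pre_destroy(const pre_destroy_fn *fns, int n)
    {
        int ret = 0;

        for (int i = 0; i < n; i++) {
            ret = fns[i]();
            if (ret)
                break;
        }
        return ret;
    }

    int main(void)
    {
        const pre_destroy_fn fns[] = { subsys_ok, subsys_busy, subsys_ok };

        printf("%d\n", call_pre_destroy(fns, 3));   /* -16, i.e. -EBUSY */
        return 0;
    }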
@@ -685,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 	remove_dir(dentry);
 }
 
+/*
+ * A queue for waiters to do rmdir() on a cgroup. A task will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) but a subsys still
+ * holds a reference to css->refcnt. In general, this refcnt is expected
+ * to go down to zero, soon.
+ *
+ * The CGRP_WAIT_ON_RMDIR flag is modified under the cgroup's inode->i_mutex.
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+{
+	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+		wake_up_all(&cgroup_rmdir_waitq);
+}
+
 static int rebind_subsystems(struct cgroupfs_root *root,
 			     unsigned long final_bits)
 {
@@ -857,16 +915,16 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	ret = rebind_subsystems(root, opts.subsys_bits);
+	if (ret)
+		goto out_unlock;
 
 	/* (re)populate subsystem files */
-	if (!ret)
-		cgroup_populate_dir(cgrp);
+	cgroup_populate_dir(cgrp);
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
 out_unlock:
-	if (opts.release_agent)
-		kfree(opts.release_agent);
+	kfree(opts.release_agent);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 	return ret;
@@ -969,15 +1027,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	/* First find the desired set of subsystems */
 	ret = parse_cgroupfs_options(data, &opts);
 	if (ret) {
-		if (opts.release_agent)
-			kfree(opts.release_agent);
+		kfree(opts.release_agent);
 		return ret;
 	}
 
 	root = kzalloc(sizeof(*root), GFP_KERNEL);
 	if (!root) {
-		if (opts.release_agent)
-			kfree(opts.release_agent);
+		kfree(opts.release_agent);
 		return -ENOMEM;
 	}
 
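Both cleanups above rely on kfree(NULL) being a defined no-op, which makes the "if (opts.release_agent)" guard redundant. Userspace free() behaves the same way:

    #include <stdlib.h>

    int main(void)
    {
        char *release_agent = NULL; /* never allocated on this path */

        /* safe: free(NULL), like kfree(NULL), is a no-op */
        free(release_agent);
        return 0;
    }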
@@ -1071,7 +1127,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		mutex_unlock(&cgroup_mutex);
 	}
 
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 
  free_cg_links:
 	free_cg_links(&tmp_cg_links);
@@ -1122,8 +1179,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 	mutex_unlock(&cgroup_mutex);
 
-	kfree(root);
 	kill_litter_super(sb);
+	kfree(root);
 }
 
 static struct file_system_type cgroup_fs_type = {
@@ -1279,6 +1336,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
 	put_css_set(cg);
+
+	/*
+	 * Wake up rmdir() waiters; the rmdir should fail since the cgroup
+	 * is no longer empty.
+	 */
+	cgroup_wakeup_rmdir_waiters(cgrp);
 	return 0;
 }
1284 1347
@@ -1624,10 +1687,10 @@ static struct inode_operations cgroup_dir_inode_operations = {
 	.rename = cgroup_rename,
 };
 
-static int cgroup_create_file(struct dentry *dentry, int mode,
+static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 			      struct super_block *sb)
 {
-	static struct dentry_operations cgroup_dops = {
+	static const struct dentry_operations cgroup_dops = {
 		.d_iput = cgroup_diput,
 	};
 
@@ -1670,7 +1733,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
  * @mode: mode to set on new directory.
  */
 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-			     int mode)
+			     mode_t mode)
 {
 	struct dentry *parent;
 	int error = 0;
@@ -1688,6 +1751,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
 	return error;
 }
 
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write handler
+ */
+static mode_t cgroup_file_mode(const struct cftype *cft)
+{
+	mode_t mode = 0;
+
+	if (cft->mode)
+		return cft->mode;
+
+	if (cft->read || cft->read_u64 || cft->read_s64 ||
+	    cft->read_map || cft->read_seq_string)
+		mode |= S_IRUGO;
+
+	if (cft->write || cft->write_u64 || cft->write_s64 ||
+	    cft->write_string || cft->trigger)
+		mode |= S_IWUSR;
+
+	return mode;
+}
+
 int cgroup_add_file(struct cgroup *cgrp,
 		    struct cgroup_subsys *subsys,
 		    const struct cftype *cft)
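
cgroup_file_mode() derives a control file's permissions from the handlers it provides, unless ->mode is set explicitly (as done for the tasks file below). A userspace model of the deduction (toy_cftype is hypothetical; the real cftype has several read and write handler variants):

    #include <stdio.h>
    #include <sys/stat.h>

    #define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)   /* 0444, as in the kernel */

    struct toy_cftype {
        mode_t mode;    /* explicit override; 0 means deduce */
        int has_read;
        int has_write;
    };

    static mode_t file_mode(const struct toy_cftype *cft)
    {
        mode_t mode = 0;

        if (cft->mode)
            return cft->mode;
        if (cft->has_read)
            mode |= S_IRUGO;
        if (cft->has_write)
            mode |= S_IWUSR;    /* 0200 */
        return mode;
    }

    int main(void)
    {
        struct toy_cftype ro = { .has_read = 1 };
        struct toy_cftype rw = { .has_read = 1, .has_write = 1 };

        printf("%o %o\n", file_mode(&ro), file_mode(&rw));  /* 444 644 */
        return 0;
    }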
@@ -1695,6 +1785,7 @@ int cgroup_add_file(struct cgroup *cgrp,
 	struct dentry *dir = cgrp->dentry;
 	struct dentry *dentry;
 	int error;
+	mode_t mode;
 
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1705,7 +1796,8 @@ int cgroup_add_file(struct cgroup *cgrp,
 	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
 	dentry = lookup_one_len(name, dir, strlen(name));
 	if (!IS_ERR(dentry)) {
-		error = cgroup_create_file(dentry, 0644 | S_IFREG,
+		mode = cgroup_file_mode(cft);
+		error = cgroup_create_file(dentry, mode | S_IFREG,
 					   cgrp->root->sb);
 		if (!error)
 			dentry->d_fsdata = (void *)cft;
@@ -2287,6 +2379,7 @@ static struct cftype files[] = {
 		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_tasks_release,
 		.private = FILE_TASKLIST,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 
 	{
@@ -2326,6 +2419,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
 			return err;
 	}
+	/* This cgroup is ready now */
+	for_each_subsys(cgrp->root, ss) {
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+		/*
+		 * Update id->css pointer and make this css visible from
+		 * CSS ID functions. This pointer will be dereferenced
+		 * from RCU-read-side without locks.
+		 */
+		if (css->id)
+			rcu_assign_pointer(css->id->css, css);
+	}
 
 	return 0;
 }
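
rcu_assign_pointer() is the publish half of RCU's publish/subscribe pattern: the css is fully initialized before the pointer becomes visible, and readers use rcu_dereference() under rcu_read_lock(). A userspace sketch of the same ordering guarantee using C11 release/acquire atomics:

    #include <stdatomic.h>
    #include <stdio.h>

    struct obj {
        int ready;
    };

    static _Atomic(struct obj *) slot;

    static void publish(struct obj *o)
    {
        o->ready = 1;   /* initialize first... */
        atomic_store_explicit(&slot, o, memory_order_release); /* ...then publish */
    }

    static struct obj *lookup(void)
    {
        return atomic_load_explicit(&slot, memory_order_acquire);
    }

    int main(void)
    {
        static struct obj o;
        struct obj *p;

        publish(&o);
        p = lookup();
        if (p)
            printf("ready=%d\n", p->ready); /* 1 whenever p is non-NULL */
        return 0;
    }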
@@ -2337,6 +2441,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	css->cgroup = cgrp;
 	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
+	css->id = NULL;
 	if (cgrp == dummytop)
 		set_bit(CSS_ROOT, &css->flags);
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2375,7 +2480,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  * Must be called with the mutex on the parent inode held
  */
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
-			  int mode)
+			  mode_t mode)
 {
 	struct cgroup *cgrp;
 	struct cgroupfs_root *root = parent->root;
@@ -2412,6 +2517,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			goto err_destroy;
 		}
 		init_cgroup_css(css, ss, cgrp);
+		if (ss->use_id)
+			if (alloc_css_id(ss, parent, cgrp))
+				goto err_destroy;
+		/* On error, the ->destroy() callback has to free the assigned ID. */
 	}
 
 	cgroup_lock_hierarchy(root);
@@ -2554,9 +2663,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup *cgrp = dentry->d_fsdata;
 	struct dentry *d;
 	struct cgroup *parent;
+	DEFINE_WAIT(wait);
+	int ret;
 
 	/* the vfs holds both inode->i_mutex already */
-
+again:
 	mutex_lock(&cgroup_mutex);
 	if (atomic_read(&cgrp->count) != 0) {
 		mutex_unlock(&cgroup_mutex);
@@ -2572,17 +2683,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	 * Call pre_destroy handlers of subsys. Notify subsystems
 	 * that rmdir() request comes.
 	 */
-	cgroup_call_pre_destroy(cgrp);
+	ret = cgroup_call_pre_destroy(cgrp);
+	if (ret)
+		return ret;
 
 	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
-
-	if (atomic_read(&cgrp->count)
-	    || !list_empty(&cgrp->children)
-	    || !cgroup_clear_css_refs(cgrp)) {
+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
+	/*
+	 * css_put/get is provided for subsystems to grab a refcnt to css. In
+	 * the typical case, a subsystem has no reference after pre_destroy().
+	 * But, under hierarchy management, some *temporary* refcnts can be
+	 * held. To avoid returning -EBUSY to a user, a waitqueue is used. If
+	 * the subsystem is really busy, it should return -EBUSY at
+	 * pre_destroy(). wake_up is called when css_put() drops refcnt to 0.
+	 */
+	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
+
+	if (!cgroup_clear_css_refs(cgrp)) {
+		mutex_unlock(&cgroup_mutex);
+		schedule();
+		finish_wait(&cgroup_rmdir_waitq, &wait);
+		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+		if (signal_pending(current))
+			return -EINTR;
+		goto again;
+	}
+	/* No css_tryget() can succeed after this point. */
+	finish_wait(&cgroup_rmdir_waitq, &wait);
+	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
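
The rmdir path above is the classic "set a flag, queue on the waitqueue, re-check the condition, sleep, retry" idiom, with css_put() acting as the waker. A userspace model of the same shape, with a pthread condvar standing in for cgroup_rmdir_waitq (all names here are hypothetical):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t rmdir_waitq = PTHREAD_COND_INITIALIZER;
    static bool wait_on_rmdir;
    static int refcnt = 1;  /* a lingering temporary reference */

    static void try_rmdir(void)
    {
        pthread_mutex_lock(&lock);
    again:
        wait_on_rmdir = true;
        if (refcnt != 0) {
            /* busy: wait (atomically drops the lock), then re-check */
            pthread_cond_wait(&rmdir_waitq, &lock);
            goto again;
        }
        wait_on_rmdir = false;
        printf("removed\n");
        pthread_mutex_unlock(&lock);
    }

    /* Models css_put(): only signal if a remover is actually waiting. */
    static void *put_ref(void *arg)
    {
        pthread_mutex_lock(&lock);
        if (--refcnt == 0 && wait_on_rmdir)
            pthread_cond_signal(&rmdir_waitq);
        pthread_mutex_unlock(&lock);
        return arg;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, put_ref, NULL);
        try_rmdir();
        pthread_join(t, NULL);
        return 0;
    }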
@@ -2707,6 +2840,8 @@ int __init cgroup_init(void)
 		struct cgroup_subsys *ss = subsys[i];
 		if (!ss->early_init)
 			cgroup_init_subsys(ss);
+		if (ss->use_id)
+			cgroup_subsys_init_idr(ss);
 	}
 
 	/* Add init_css_set to the hash table */
@@ -3083,18 +3218,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 }
 
 /**
- * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
+ * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
  * @cgrp: the cgroup in question
+ * @task: the task in question
  *
- * See if @cgrp is a descendant of the current task's cgroup in
- * the appropriate hierarchy.
+ * See if @cgrp is a descendant of @task's cgroup in the appropriate
+ * hierarchy.
  *
  * If we are sending in dummytop, then presumably we are creating
  * the top cgroup in the subsystem.
  *
  * Called only by the ns (nsproxy) cgroup.
  */
-int cgroup_is_descendant(const struct cgroup *cgrp)
+int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
 {
 	int ret;
 	struct cgroup *target;
@@ -3104,7 +3240,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
 		return 1;
 
 	get_first_subsys(cgrp, NULL, &subsys_id);
-	target = task_cgroup(current, subsys_id);
+	target = task_cgroup(task, subsys_id);
 	while (cgrp != target && cgrp!= cgrp->top_cgroup)
 		cgrp = cgrp->parent;
 	ret = (cgrp == target);
@@ -3137,10 +3273,12 @@ void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
 	rcu_read_lock();
-	if ((atomic_dec_return(&css->refcnt) == 1) &&
-	    notify_on_release(cgrp)) {
-		set_bit(CGRP_RELEASABLE, &cgrp->flags);
-		check_for_release(cgrp);
+	if (atomic_dec_return(&css->refcnt) == 1) {
+		if (notify_on_release(cgrp)) {
+			set_bit(CGRP_RELEASABLE, &cgrp->flags);
+			check_for_release(cgrp);
+		}
+		cgroup_wakeup_rmdir_waiters(cgrp);
 	}
 	rcu_read_unlock();
 }
@@ -3240,3 +3378,232 @@ static int __init cgroup_disable(char *str)
 	return 1;
 }
 __setup("cgroup_disable=", cgroup_disable);
+
+/*
+ * Functions for CSS ID.
+ */
+
+/*
+ * To get an ID other than 0, this should be called when !cgroup_is_removed().
+ */
+unsigned short css_id(struct cgroup_subsys_state *css)
+{
+	struct css_id *cssid = rcu_dereference(css->id);
+
+	if (cssid)
+		return cssid->id;
+	return 0;
+}
+
+unsigned short css_depth(struct cgroup_subsys_state *css)
+{
+	struct css_id *cssid = rcu_dereference(css->id);
+
+	if (cssid)
+		return cssid->depth;
+	return 0;
+}
+
+bool css_is_ancestor(struct cgroup_subsys_state *child,
+		     const struct cgroup_subsys_state *root)
+{
+	struct css_id *child_id = rcu_dereference(child->id);
+	struct css_id *root_id = rcu_dereference(root->id);
+
+	if (!child_id || !root_id || (child_id->depth < root_id->depth))
+		return false;
+	return child_id->stack[root_id->depth] == root_id->id;
+}
+
+static void __free_css_id_cb(struct rcu_head *head)
+{
+	struct css_id *id;
+
+	id = container_of(head, struct css_id, rcu_head);
+	kfree(id);
+}
+
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
+{
+	struct css_id *id = css->id;
+	/* When this is called before css_id initialization, id can be NULL */
+	if (!id)
+		return;
+
+	BUG_ON(!ss->use_id);
+
+	rcu_assign_pointer(id->css, NULL);
+	rcu_assign_pointer(css->id, NULL);
+	spin_lock(&ss->id_lock);
+	idr_remove(&ss->idr, id->id);
+	spin_unlock(&ss->id_lock);
+	call_rcu(&id->rcu_head, __free_css_id_cb);
+}
+
+/*
+ * This is called by init or create(). Then, calls to this function are
+ * always serialized (by cgroup_mutex at create()).
+ */
+
+static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
+{
+	struct css_id *newid;
+	int myid, error, size;
+
+	BUG_ON(!ss->use_id);
+
+	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
+	newid = kzalloc(size, GFP_KERNEL);
+	if (!newid)
+		return ERR_PTR(-ENOMEM);
+	/* get id */
+	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
+		error = -ENOMEM;
+		goto err_out;
+	}
+	spin_lock(&ss->id_lock);
+	/* Don't use 0; allocate an ID in the range 1-65535 */
+	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+	spin_unlock(&ss->id_lock);
+
+	/* Returns an error when there is no free space for a new ID. */
+	if (error) {
+		error = -ENOSPC;
+		goto err_out;
+	}
+	if (myid > CSS_ID_MAX)
+		goto remove_idr;
+
+	newid->id = myid;
+	newid->depth = depth;
+	return newid;
+remove_idr:
+	error = -ENOSPC;
+	spin_lock(&ss->id_lock);
+	idr_remove(&ss->idr, myid);
+	spin_unlock(&ss->id_lock);
+err_out:
+	kfree(newid);
+	return ERR_PTR(error);
+
+}
+
+static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+{
+	struct css_id *newid;
+	struct cgroup_subsys_state *rootcss;
+
+	spin_lock_init(&ss->id_lock);
+	idr_init(&ss->idr);
+
+	rootcss = init_css_set.subsys[ss->subsys_id];
+	newid = get_new_cssid(ss, 0);
+	if (IS_ERR(newid))
+		return PTR_ERR(newid);
+
+	newid->stack[0] = newid->id;
+	newid->css = rootcss;
+	rootcss->id = newid;
+	return 0;
+}
+
+static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
+			struct cgroup *child)
+{
+	int subsys_id, i, depth = 0;
+	struct cgroup_subsys_state *parent_css, *child_css;
+	struct css_id *child_id, *parent_id = NULL;
+
+	subsys_id = ss->subsys_id;
+	parent_css = parent->subsys[subsys_id];
+	child_css = child->subsys[subsys_id];
+	depth = css_depth(parent_css) + 1;
+	parent_id = parent_css->id;
+
+	child_id = get_new_cssid(ss, depth);
+	if (IS_ERR(child_id))
+		return PTR_ERR(child_id);
+
+	for (i = 0; i < depth; i++)
+		child_id->stack[i] = parent_id->stack[i];
+	child_id->stack[depth] = child_id->id;
+	/*
+	 * child_id->css pointer will be set after this cgroup is available;
+	 * see cgroup_populate_dir().
+	 */
+	rcu_assign_pointer(child_css->id, child_id);
+
+	return 0;
+}
+
+/**
+ * css_lookup - lookup css by id
+ * @ss: cgroup subsys to be looked into.
+ * @id: the id
+ *
+ * Returns a pointer to the cgroup_subsys_state if there is a valid one with
+ * the given id, NULL if not. Should be called under rcu_read_lock().
+ */
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
+{
+	struct css_id *cssid = NULL;
+
+	BUG_ON(!ss->use_id);
+	cssid = idr_find(&ss->idr, id);
+
+	if (unlikely(!cssid))
+		return NULL;
+
+	return rcu_dereference(cssid->css);
+}
+
+/**
+ * css_get_next - lookup next cgroup under specified hierarchy.
+ * @ss: pointer to subsystem
+ * @id: current position of iteration.
+ * @root: pointer to css. search tree under this.
+ * @foundid: position of found object.
+ *
+ * Search for the next css under the specified hierarchy of rootid. Calling
+ * under rcu_read_lock() is necessary. Returns NULL if it reaches the end.
+ */
+struct cgroup_subsys_state *
+css_get_next(struct cgroup_subsys *ss, int id,
+	     struct cgroup_subsys_state *root, int *foundid)
+{
+	struct cgroup_subsys_state *ret = NULL;
+	struct css_id *tmp;
+	int tmpid;
+	int rootid = css_id(root);
+	int depth = css_depth(root);
+
+	if (!rootid)
+		return NULL;
+
+	BUG_ON(!ss->use_id);
+	/* fill start point for scan */
+	tmpid = id;
+	while (1) {
+		/*
+		 * Scan the next entry from the bitmap (tree); tmpid is
+		 * updated after idr_get_next().
+		 */
+		spin_lock(&ss->id_lock);
+		tmp = idr_get_next(&ss->idr, &tmpid);
+		spin_unlock(&ss->id_lock);
+
+		if (!tmp)
+			break;
+		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
+			ret = rcu_dereference(tmp->css);
+			if (ret) {
+				*foundid = tmpid;
+				break;
+			}
+		}
+		/* continue to scan from the next id */
+		tmpid = tmpid + 1;
+	}
+	return ret;
+}
+
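
The expected calling pattern for css_get_next() is an RCU-protected scan that resumes one past the last found ID. A kernel-style sketch, not a standalone program (walk_css_tree is a hypothetical caller; it assumes a subsystem ss with use_id set):

    static void walk_css_tree(struct cgroup_subsys *ss,
                              struct cgroup_subsys_state *root)
    {
        struct cgroup_subsys_state *css;
        int found, nextid = 1;  /* valid IDs start at 1; 0 is never used */

        rcu_read_lock();
        while ((css = css_get_next(ss, nextid, root, &found)) != NULL) {
            /*
             * css is only RCU-safe here; take a reference (e.g. via
             * css_tryget()) before using it outside the read section.
             */
            pr_info("visited css id %d\n", found);
            nextid = found + 1;
        }
        rcu_read_unlock();
    }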