aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c276
1 files changed, 178 insertions, 98 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f221446aa02d..c29831076e7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
84 /* Tracks how many cgroups are currently defined in hierarchy.*/ 84 /* Tracks how many cgroups are currently defined in hierarchy.*/
85 int number_of_cgroups; 85 int number_of_cgroups;
86 86
87 /* A list running through the mounted hierarchies */ 87 /* A list running through the active hierarchies */
88 struct list_head root_list; 88 struct list_head root_list;
89 89
90 /* Hierarchy-specific flags */ 90 /* Hierarchy-specific flags */
@@ -148,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
148#define for_each_subsys(_root, _ss) \ 148#define for_each_subsys(_root, _ss) \
149list_for_each_entry(_ss, &_root->subsys_list, sibling) 149list_for_each_entry(_ss, &_root->subsys_list, sibling)
150 150
151/* for_each_root() allows you to iterate across the active hierarchies */ 151/* for_each_active_root() allows you to iterate across the active hierarchies */
152#define for_each_root(_root) \ 152#define for_each_active_root(_root) \
153list_for_each_entry(_root, &roots, root_list) 153list_for_each_entry(_root, &roots, root_list)
154 154
155/* the list of cgroups eligible for automatic release. Protected by 155/* the list of cgroups eligible for automatic release. Protected by
@@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
271 271
272 rcu_read_lock(); 272 rcu_read_lock();
273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
274 struct cgroup *cgrp = cg->subsys[i]->cgroup; 274 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
275 if (atomic_dec_and_test(&cgrp->count) && 275 if (atomic_dec_and_test(&cgrp->count) &&
276 notify_on_release(cgrp)) { 276 notify_on_release(cgrp)) {
277 if (taskexit) 277 if (taskexit)
@@ -384,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
384 return 0; 384 return 0;
385} 385}
386 386
387/**
388 * link_css_set - a helper function to link a css_set to a cgroup
389 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
390 * @cg: the css_set to be linked
391 * @cgrp: the destination cgroup
392 */
393static void link_css_set(struct list_head *tmp_cg_links,
394 struct css_set *cg, struct cgroup *cgrp)
395{
396 struct cg_cgroup_link *link;
397
398 BUG_ON(list_empty(tmp_cg_links));
399 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
400 cgrp_link_list);
401 link->cg = cg;
402 list_move(&link->cgrp_link_list, &cgrp->css_sets);
403 list_add(&link->cg_link_list, &cg->cg_links);
404}
405
387/* 406/*
388 * find_css_set() takes an existing cgroup group and a 407 * find_css_set() takes an existing cgroup group and a
389 * cgroup object, and returns a css_set object that's 408 * cgroup object, and returns a css_set object that's
@@ -399,7 +418,6 @@ static struct css_set *find_css_set(
399 int i; 418 int i;
400 419
401 struct list_head tmp_cg_links; 420 struct list_head tmp_cg_links;
402 struct cg_cgroup_link *link;
403 421
404 struct hlist_head *hhead; 422 struct hlist_head *hhead;
405 423
@@ -444,26 +462,11 @@ static struct css_set *find_css_set(
444 * only do it for the first subsystem in each 462 * only do it for the first subsystem in each
445 * hierarchy 463 * hierarchy
446 */ 464 */
447 if (ss->root->subsys_list.next == &ss->sibling) { 465 if (ss->root->subsys_list.next == &ss->sibling)
448 BUG_ON(list_empty(&tmp_cg_links)); 466 link_css_set(&tmp_cg_links, res, cgrp);
449 link = list_entry(tmp_cg_links.next,
450 struct cg_cgroup_link,
451 cgrp_link_list);
452 list_del(&link->cgrp_link_list);
453 list_add(&link->cgrp_link_list, &cgrp->css_sets);
454 link->cg = res;
455 list_add(&link->cg_link_list, &res->cg_links);
456 }
457 }
458 if (list_empty(&rootnode.subsys_list)) {
459 link = list_entry(tmp_cg_links.next,
460 struct cg_cgroup_link,
461 cgrp_link_list);
462 list_del(&link->cgrp_link_list);
463 list_add(&link->cgrp_link_list, &dummytop->css_sets);
464 link->cg = res;
465 list_add(&link->cg_link_list, &res->cg_links);
466 } 467 }
468 if (list_empty(&rootnode.subsys_list))
469 link_css_set(&tmp_cg_links, res, dummytop);
467 470
468 BUG_ON(!list_empty(&tmp_cg_links)); 471 BUG_ON(!list_empty(&tmp_cg_links));
469 472
@@ -586,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
586{ 589{
587 struct cgroup_subsys *ss; 590 struct cgroup_subsys *ss;
588 for_each_subsys(cgrp->root, ss) 591 for_each_subsys(cgrp->root, ss)
589 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) 592 if (ss->pre_destroy)
590 ss->pre_destroy(ss, cgrp); 593 ss->pre_destroy(ss, cgrp);
591 return; 594 return;
592} 595}
593 596
597static void free_cgroup_rcu(struct rcu_head *obj)
598{
599 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
600
601 kfree(cgrp);
602}
603
594static void cgroup_diput(struct dentry *dentry, struct inode *inode) 604static void cgroup_diput(struct dentry *dentry, struct inode *inode)
595{ 605{
596 /* is dentry a directory ? if so, kfree() associated cgroup */ 606 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -610,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
610 /* 620 /*
611 * Release the subsystem state objects. 621 * Release the subsystem state objects.
612 */ 622 */
613 for_each_subsys(cgrp->root, ss) { 623 for_each_subsys(cgrp->root, ss)
614 if (cgrp->subsys[ss->subsys_id]) 624 ss->destroy(ss, cgrp);
615 ss->destroy(ss, cgrp);
616 }
617 625
618 cgrp->root->number_of_cgroups--; 626 cgrp->root->number_of_cgroups--;
619 mutex_unlock(&cgroup_mutex); 627 mutex_unlock(&cgroup_mutex);
620 628
621 /* Drop the active superblock reference that we took when we 629 /*
622 * created the cgroup */ 630 * Drop the active superblock reference that we took when we
631 * created the cgroup
632 */
623 deactivate_super(cgrp->root->sb); 633 deactivate_super(cgrp->root->sb);
624 634
625 kfree(cgrp); 635 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
626 } 636 }
627 iput(inode); 637 iput(inode);
628} 638}
@@ -712,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
712 BUG_ON(cgrp->subsys[i]); 722 BUG_ON(cgrp->subsys[i]);
713 BUG_ON(!dummytop->subsys[i]); 723 BUG_ON(!dummytop->subsys[i]);
714 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 724 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
725 mutex_lock(&ss->hierarchy_mutex);
715 cgrp->subsys[i] = dummytop->subsys[i]; 726 cgrp->subsys[i] = dummytop->subsys[i];
716 cgrp->subsys[i]->cgroup = cgrp; 727 cgrp->subsys[i]->cgroup = cgrp;
717 list_add(&ss->sibling, &root->subsys_list); 728 list_move(&ss->sibling, &root->subsys_list);
718 rcu_assign_pointer(ss->root, root); 729 ss->root = root;
719 if (ss->bind) 730 if (ss->bind)
720 ss->bind(ss, cgrp); 731 ss->bind(ss, cgrp);
721 732 mutex_unlock(&ss->hierarchy_mutex);
722 } else if (bit & removed_bits) { 733 } else if (bit & removed_bits) {
723 /* We're removing this subsystem */ 734 /* We're removing this subsystem */
724 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 735 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
725 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 736 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
737 mutex_lock(&ss->hierarchy_mutex);
726 if (ss->bind) 738 if (ss->bind)
727 ss->bind(ss, dummytop); 739 ss->bind(ss, dummytop);
728 dummytop->subsys[i]->cgroup = dummytop; 740 dummytop->subsys[i]->cgroup = dummytop;
729 cgrp->subsys[i] = NULL; 741 cgrp->subsys[i] = NULL;
730 rcu_assign_pointer(subsys[i]->root, &rootnode); 742 subsys[i]->root = &rootnode;
731 list_del(&ss->sibling); 743 list_move(&ss->sibling, &rootnode.subsys_list);
744 mutex_unlock(&ss->hierarchy_mutex);
732 } else if (bit & final_bits) { 745 } else if (bit & final_bits) {
733 /* Subsystem state should already exist */ 746 /* Subsystem state should already exist */
734 BUG_ON(!cgrp->subsys[i]); 747 BUG_ON(!cgrp->subsys[i]);
@@ -990,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
990 root = NULL; 1003 root = NULL;
991 } else { 1004 } else {
992 /* New superblock */ 1005 /* New superblock */
993 struct cgroup *cgrp = &root->top_cgroup; 1006 struct cgroup *root_cgrp = &root->top_cgroup;
994 struct inode *inode; 1007 struct inode *inode;
995 int i; 1008 int i;
996 1009
@@ -1031,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1031 list_add(&root->root_list, &roots); 1044 list_add(&root->root_list, &roots);
1032 root_count++; 1045 root_count++;
1033 1046
1034 sb->s_root->d_fsdata = &root->top_cgroup; 1047 sb->s_root->d_fsdata = root_cgrp;
1035 root->top_cgroup.dentry = sb->s_root; 1048 root->top_cgroup.dentry = sb->s_root;
1036 1049
1037 /* Link the top cgroup in this hierarchy into all 1050 /* Link the top cgroup in this hierarchy into all
@@ -1042,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1042 struct hlist_node *node; 1055 struct hlist_node *node;
1043 struct css_set *cg; 1056 struct css_set *cg;
1044 1057
1045 hlist_for_each_entry(cg, node, hhead, hlist) { 1058 hlist_for_each_entry(cg, node, hhead, hlist)
1046 struct cg_cgroup_link *link; 1059 link_css_set(&tmp_cg_links, cg, root_cgrp);
1047
1048 BUG_ON(list_empty(&tmp_cg_links));
1049 link = list_entry(tmp_cg_links.next,
1050 struct cg_cgroup_link,
1051 cgrp_link_list);
1052 list_del(&link->cgrp_link_list);
1053 link->cg = cg;
1054 list_add(&link->cgrp_link_list,
1055 &root->top_cgroup.css_sets);
1056 list_add(&link->cg_link_list, &cg->cg_links);
1057 }
1058 } 1060 }
1059 write_unlock(&css_set_lock); 1061 write_unlock(&css_set_lock);
1060 1062
1061 free_cg_links(&tmp_cg_links); 1063 free_cg_links(&tmp_cg_links);
1062 1064
1063 BUG_ON(!list_empty(&cgrp->sibling)); 1065 BUG_ON(!list_empty(&root_cgrp->sibling));
1064 BUG_ON(!list_empty(&cgrp->children)); 1066 BUG_ON(!list_empty(&root_cgrp->children));
1065 BUG_ON(root->number_of_cgroups != 1); 1067 BUG_ON(root->number_of_cgroups != 1);
1066 1068
1067 cgroup_populate_dir(cgrp); 1069 cgroup_populate_dir(root_cgrp);
1068 mutex_unlock(&inode->i_mutex); 1070 mutex_unlock(&inode->i_mutex);
1069 mutex_unlock(&cgroup_mutex); 1071 mutex_unlock(&cgroup_mutex);
1070 } 1072 }
@@ -1113,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1113 } 1115 }
1114 write_unlock(&css_set_lock); 1116 write_unlock(&css_set_lock);
1115 1117
1116 if (!list_empty(&root->root_list)) { 1118 list_del(&root->root_list);
1117 list_del(&root->root_list); 1119 root_count--;
1118 root_count--; 1120
1119 }
1120 mutex_unlock(&cgroup_mutex); 1121 mutex_unlock(&cgroup_mutex);
1121 1122
1122 kfree(root); 1123 kfree(root);
@@ -1145,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1145 * @buf: the buffer to write the path into 1146 * @buf: the buffer to write the path into
1146 * @buflen: the length of the buffer 1147 * @buflen: the length of the buffer
1147 * 1148 *
1148 * Called with cgroup_mutex held. Writes path of cgroup into buf. 1149 * Called with cgroup_mutex held or else with an RCU-protected cgroup
1149 * Returns 0 on success, -errno on error. 1150 * reference. Writes path of cgroup into buf. Returns 0 on success,
1151 * -errno on error.
1150 */ 1152 */
1151int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1153int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1152{ 1154{
1153 char *start; 1155 char *start;
1156 struct dentry *dentry = rcu_dereference(cgrp->dentry);
1154 1157
1155 if (cgrp == dummytop) { 1158 if (!dentry || cgrp == dummytop) {
1156 /* 1159 /*
1157 * Inactive subsystems have no dentry for their root 1160 * Inactive subsystems have no dentry for their root
1158 * cgroup 1161 * cgroup
@@ -1165,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1165 1168
1166 *--start = '\0'; 1169 *--start = '\0';
1167 for (;;) { 1170 for (;;) {
1168 int len = cgrp->dentry->d_name.len; 1171 int len = dentry->d_name.len;
1169 if ((start -= len) < buf) 1172 if ((start -= len) < buf)
1170 return -ENAMETOOLONG; 1173 return -ENAMETOOLONG;
1171 memcpy(start, cgrp->dentry->d_name.name, len); 1174 memcpy(start, cgrp->dentry->d_name.name, len);
1172 cgrp = cgrp->parent; 1175 cgrp = cgrp->parent;
1173 if (!cgrp) 1176 if (!cgrp)
1174 break; 1177 break;
1178 dentry = rcu_dereference(cgrp->dentry);
1175 if (!cgrp->parent) 1179 if (!cgrp->parent)
1176 continue; 1180 continue;
1177 if (--start < buf) 1181 if (--start < buf)
@@ -1216,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1216 int retval = 0; 1220 int retval = 0;
1217 struct cgroup_subsys *ss; 1221 struct cgroup_subsys *ss;
1218 struct cgroup *oldcgrp; 1222 struct cgroup *oldcgrp;
1219 struct css_set *cg = tsk->cgroups; 1223 struct css_set *cg;
1220 struct css_set *newcg; 1224 struct css_set *newcg;
1221 struct cgroupfs_root *root = cgrp->root; 1225 struct cgroupfs_root *root = cgrp->root;
1222 int subsys_id; 1226 int subsys_id;
@@ -1236,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1236 } 1240 }
1237 } 1241 }
1238 1242
1243 task_lock(tsk);
1244 cg = tsk->cgroups;
1245 get_css_set(cg);
1246 task_unlock(tsk);
1239 /* 1247 /*
1240 * Locate or allocate a new css_set for this task, 1248 * Locate or allocate a new css_set for this task,
1241 * based on its final set of cgroups 1249 * based on its final set of cgroups
1242 */ 1250 */
1243 newcg = find_css_set(cg, cgrp); 1251 newcg = find_css_set(cg, cgrp);
1252 put_css_set(cg);
1244 if (!newcg) 1253 if (!newcg)
1245 return -ENOMEM; 1254 return -ENOMEM;
1246 1255
@@ -1445,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1445 struct cftype *cft = __d_cft(file->f_dentry); 1454 struct cftype *cft = __d_cft(file->f_dentry);
1446 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1455 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1447 1456
1448 if (!cft || cgroup_is_removed(cgrp)) 1457 if (cgroup_is_removed(cgrp))
1449 return -ENODEV; 1458 return -ENODEV;
1450 if (cft->write) 1459 if (cft->write)
1451 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1460 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1490,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1490 struct cftype *cft = __d_cft(file->f_dentry); 1499 struct cftype *cft = __d_cft(file->f_dentry);
1491 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1500 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1492 1501
1493 if (!cft || cgroup_is_removed(cgrp)) 1502 if (cgroup_is_removed(cgrp))
1494 return -ENODEV; 1503 return -ENODEV;
1495 1504
1496 if (cft->read) 1505 if (cft->read)
@@ -1554,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1554 err = generic_file_open(inode, file); 1563 err = generic_file_open(inode, file);
1555 if (err) 1564 if (err)
1556 return err; 1565 return err;
1557
1558 cft = __d_cft(file->f_dentry); 1566 cft = __d_cft(file->f_dentry);
1559 if (!cft) 1567
1560 return -ENODEV;
1561 if (cft->read_map || cft->read_seq_string) { 1568 if (cft->read_map || cft->read_seq_string) {
1562 struct cgroup_seqfile_state *state = 1569 struct cgroup_seqfile_state *state =
1563 kzalloc(sizeof(*state), GFP_USER); 1570 kzalloc(sizeof(*state), GFP_USER);
@@ -1671,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1671 if (!error) { 1678 if (!error) {
1672 dentry->d_fsdata = cgrp; 1679 dentry->d_fsdata = cgrp;
1673 inc_nlink(parent->d_inode); 1680 inc_nlink(parent->d_inode);
1674 cgrp->dentry = dentry; 1681 rcu_assign_pointer(cgrp->dentry, dentry);
1675 dget(dentry); 1682 dget(dentry);
1676 } 1683 }
1677 dput(dentry); 1684 dput(dentry);
@@ -1812,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1812{ 1819{
1813 struct task_struct *res; 1820 struct task_struct *res;
1814 struct list_head *l = it->task; 1821 struct list_head *l = it->task;
1822 struct cg_cgroup_link *link;
1815 1823
1816 /* If the iterator cg is NULL, we have no tasks */ 1824 /* If the iterator cg is NULL, we have no tasks */
1817 if (!it->cg_link) 1825 if (!it->cg_link)
@@ -1819,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1819 res = list_entry(l, struct task_struct, cg_list); 1827 res = list_entry(l, struct task_struct, cg_list);
1820 /* Advance iterator to find next entry */ 1828 /* Advance iterator to find next entry */
1821 l = l->next; 1829 l = l->next;
1822 if (l == &res->cgroups->tasks) { 1830 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
1831 if (l == &link->cg->tasks) {
1823 /* We reached the end of this task list - move on to 1832 /* We reached the end of this task list - move on to
1824 * the next cg_cgroup_link */ 1833 * the next cg_cgroup_link */
1825 cgroup_advance_iter(cgrp, it); 1834 cgroup_advance_iter(cgrp, it);
@@ -2013,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2013 */ 2022 */
2014static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2023static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
2015{ 2024{
2016 int n = 0; 2025 int n = 0, pid;
2017 struct cgroup_iter it; 2026 struct cgroup_iter it;
2018 struct task_struct *tsk; 2027 struct task_struct *tsk;
2019 cgroup_iter_start(cgrp, &it); 2028 cgroup_iter_start(cgrp, &it);
2020 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2029 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2021 if (unlikely(n == npids)) 2030 if (unlikely(n == npids))
2022 break; 2031 break;
2023 pidarray[n++] = task_pid_vnr(tsk); 2032 pid = task_pid_vnr(tsk);
2033 if (pid > 0)
2034 pidarray[n++] = pid;
2024 } 2035 }
2025 cgroup_iter_end(cgrp, &it); 2036 cgroup_iter_end(cgrp, &it);
2026 return n; 2037 return n;
@@ -2052,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2052 2063
2053 ret = 0; 2064 ret = 0;
2054 cgrp = dentry->d_fsdata; 2065 cgrp = dentry->d_fsdata;
2055 rcu_read_lock();
2056 2066
2057 cgroup_iter_start(cgrp, &it); 2067 cgroup_iter_start(cgrp, &it);
2058 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2068 while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2077,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2077 } 2087 }
2078 cgroup_iter_end(cgrp, &it); 2088 cgroup_iter_end(cgrp, &it);
2079 2089
2080 rcu_read_unlock();
2081err: 2090err:
2082 return ret; 2091 return ret;
2083} 2092}
@@ -2324,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2324 struct cgroup *cgrp) 2333 struct cgroup *cgrp)
2325{ 2334{
2326 css->cgroup = cgrp; 2335 css->cgroup = cgrp;
2327 atomic_set(&css->refcnt, 0); 2336 atomic_set(&css->refcnt, 1);
2328 css->flags = 0; 2337 css->flags = 0;
2329 if (cgrp == dummytop) 2338 if (cgrp == dummytop)
2330 set_bit(CSS_ROOT, &css->flags); 2339 set_bit(CSS_ROOT, &css->flags);
@@ -2332,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2332 cgrp->subsys[ss->subsys_id] = css; 2341 cgrp->subsys[ss->subsys_id] = css;
2333} 2342}
2334 2343
2344static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2345{
2346 /* We need to take each hierarchy_mutex in a consistent order */
2347 int i;
2348
2349 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2350 struct cgroup_subsys *ss = subsys[i];
2351 if (ss->root == root)
2352 mutex_lock_nested(&ss->hierarchy_mutex, i);
2353 }
2354}
2355
2356static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2357{
2358 int i;
2359
2360 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2361 struct cgroup_subsys *ss = subsys[i];
2362 if (ss->root == root)
2363 mutex_unlock(&ss->hierarchy_mutex);
2364 }
2365}
2366
2335/* 2367/*
2336 * cgroup_create - create a cgroup 2368 * cgroup_create - create a cgroup
2337 * @parent: cgroup that will be parent of the new cgroup 2369 * @parent: cgroup that will be parent of the new cgroup
@@ -2380,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2380 init_cgroup_css(css, ss, cgrp); 2412 init_cgroup_css(css, ss, cgrp);
2381 } 2413 }
2382 2414
2415 cgroup_lock_hierarchy(root);
2383 list_add(&cgrp->sibling, &cgrp->parent->children); 2416 list_add(&cgrp->sibling, &cgrp->parent->children);
2417 cgroup_unlock_hierarchy(root);
2384 root->number_of_cgroups++; 2418 root->number_of_cgroups++;
2385 2419
2386 err = cgroup_create_dir(cgrp, dentry, mode); 2420 err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2431,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2431{ 2465{
2432 /* Check the reference count on each subsystem. Since we 2466 /* Check the reference count on each subsystem. Since we
2433 * already established that there are no tasks in the 2467 * already established that there are no tasks in the
2434 * cgroup, if the css refcount is also 0, then there should 2468 * cgroup, if the css refcount is also 1, then there should
2435 * be no outstanding references, so the subsystem is safe to 2469 * be no outstanding references, so the subsystem is safe to
2436 * destroy. We scan across all subsystems rather than using 2470 * destroy. We scan across all subsystems rather than using
2437 * the per-hierarchy linked list of mounted subsystems since 2471 * the per-hierarchy linked list of mounted subsystems since
@@ -2452,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2452 * matter, since it can only happen if the cgroup 2486 * matter, since it can only happen if the cgroup
2453 * has been deleted and hence no longer needs the 2487 * has been deleted and hence no longer needs the
2454 * release agent to be called anyway. */ 2488 * release agent to be called anyway. */
2455 if (css && atomic_read(&css->refcnt)) 2489 if (css && (atomic_read(&css->refcnt) > 1))
2456 return 1; 2490 return 1;
2457 } 2491 }
2458 return 0; 2492 return 0;
2459} 2493}
2460 2494
2495/*
2496 * Atomically mark all (or else none) of the cgroup's CSS objects as
2497 * CSS_REMOVED. Return true on success, or false if the cgroup has
2498 * busy subsystems. Call with cgroup_mutex held
2499 */
2500
2501static int cgroup_clear_css_refs(struct cgroup *cgrp)
2502{
2503 struct cgroup_subsys *ss;
2504 unsigned long flags;
2505 bool failed = false;
2506 local_irq_save(flags);
2507 for_each_subsys(cgrp->root, ss) {
2508 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2509 int refcnt;
2510 do {
2511 /* We can only remove a CSS with a refcnt==1 */
2512 refcnt = atomic_read(&css->refcnt);
2513 if (refcnt > 1) {
2514 failed = true;
2515 goto done;
2516 }
2517 BUG_ON(!refcnt);
2518 /*
2519 * Drop the refcnt to 0 while we check other
2520 * subsystems. This will cause any racing
2521 * css_tryget() to spin until we set the
2522 * CSS_REMOVED bits or abort
2523 */
2524 } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
2525 }
2526 done:
2527 for_each_subsys(cgrp->root, ss) {
2528 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2529 if (failed) {
2530 /*
2531 * Restore old refcnt if we previously managed
2532 * to clear it from 1 to 0
2533 */
2534 if (!atomic_read(&css->refcnt))
2535 atomic_set(&css->refcnt, 1);
2536 } else {
2537 /* Commit the fact that the CSS is removed */
2538 set_bit(CSS_REMOVED, &css->flags);
2539 }
2540 }
2541 local_irq_restore(flags);
2542 return !failed;
2543}
2544
2461static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 2545static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2462{ 2546{
2463 struct cgroup *cgrp = dentry->d_fsdata; 2547 struct cgroup *cgrp = dentry->d_fsdata;
2464 struct dentry *d; 2548 struct dentry *d;
2465 struct cgroup *parent; 2549 struct cgroup *parent;
2466 struct super_block *sb;
2467 struct cgroupfs_root *root;
2468 2550
2469 /* the vfs holds both inode->i_mutex already */ 2551 /* the vfs holds both inode->i_mutex already */
2470 2552
@@ -2487,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2487 2569
2488 mutex_lock(&cgroup_mutex); 2570 mutex_lock(&cgroup_mutex);
2489 parent = cgrp->parent; 2571 parent = cgrp->parent;
2490 root = cgrp->root;
2491 sb = root->sb;
2492 2572
2493 if (atomic_read(&cgrp->count) 2573 if (atomic_read(&cgrp->count)
2494 || !list_empty(&cgrp->children) 2574 || !list_empty(&cgrp->children)
2495 || cgroup_has_css_refs(cgrp)) { 2575 || !cgroup_clear_css_refs(cgrp)) {
2496 mutex_unlock(&cgroup_mutex); 2576 mutex_unlock(&cgroup_mutex);
2497 return -EBUSY; 2577 return -EBUSY;
2498 } 2578 }
@@ -2502,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2502 if (!list_empty(&cgrp->release_list)) 2582 if (!list_empty(&cgrp->release_list))
2503 list_del(&cgrp->release_list); 2583 list_del(&cgrp->release_list);
2504 spin_unlock(&release_list_lock); 2584 spin_unlock(&release_list_lock);
2505 /* delete my sibling from parent->children */ 2585
2586 cgroup_lock_hierarchy(cgrp->root);
2587 /* delete this cgroup from parent->children */
2506 list_del(&cgrp->sibling); 2588 list_del(&cgrp->sibling);
2589 cgroup_unlock_hierarchy(cgrp->root);
2590
2507 spin_lock(&cgrp->dentry->d_lock); 2591 spin_lock(&cgrp->dentry->d_lock);
2508 d = dget(cgrp->dentry); 2592 d = dget(cgrp->dentry);
2509 spin_unlock(&d->d_lock); 2593 spin_unlock(&d->d_lock);
@@ -2525,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2525 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2609 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2526 2610
2527 /* Create the top cgroup state for this subsystem */ 2611 /* Create the top cgroup state for this subsystem */
2612 list_add(&ss->sibling, &rootnode.subsys_list);
2528 ss->root = &rootnode; 2613 ss->root = &rootnode;
2529 css = ss->create(ss, dummytop); 2614 css = ss->create(ss, dummytop);
2530 /* We don't handle early failures gracefully */ 2615 /* We don't handle early failures gracefully */
@@ -2544,6 +2629,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2544 * need to invoke fork callbacks here. */ 2629 * need to invoke fork callbacks here. */
2545 BUG_ON(!list_empty(&init_task.tasks)); 2630 BUG_ON(!list_empty(&init_task.tasks));
2546 2631
2632 mutex_init(&ss->hierarchy_mutex);
2547 ss->active = 1; 2633 ss->active = 1;
2548} 2634}
2549 2635
@@ -2562,7 +2648,6 @@ int __init cgroup_init_early(void)
2562 INIT_HLIST_NODE(&init_css_set.hlist); 2648 INIT_HLIST_NODE(&init_css_set.hlist);
2563 css_set_count = 1; 2649 css_set_count = 1;
2564 init_cgroup_root(&rootnode); 2650 init_cgroup_root(&rootnode);
2565 list_add(&rootnode.root_list, &roots);
2566 root_count = 1; 2651 root_count = 1;
2567 init_task.cgroups = &init_css_set; 2652 init_task.cgroups = &init_css_set;
2568 2653
@@ -2669,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2669 2754
2670 mutex_lock(&cgroup_mutex); 2755 mutex_lock(&cgroup_mutex);
2671 2756
2672 for_each_root(root) { 2757 for_each_active_root(root) {
2673 struct cgroup_subsys *ss; 2758 struct cgroup_subsys *ss;
2674 struct cgroup *cgrp; 2759 struct cgroup *cgrp;
2675 int subsys_id; 2760 int subsys_id;
2676 int count = 0; 2761 int count = 0;
2677 2762
2678 /* Skip this hierarchy if it has no active subsystems */
2679 if (!root->actual_subsys_bits)
2680 continue;
2681 seq_printf(m, "%lu:", root->subsys_bits); 2763 seq_printf(m, "%lu:", root->subsys_bits);
2682 for_each_subsys(root, ss) 2764 for_each_subsys(root, ss)
2683 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2765 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2800,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child)
2800{ 2882{
2801 if (use_task_css_set_links) { 2883 if (use_task_css_set_links) {
2802 write_lock(&css_set_lock); 2884 write_lock(&css_set_lock);
2885 task_lock(child);
2803 if (list_empty(&child->cg_list)) 2886 if (list_empty(&child->cg_list))
2804 list_add(&child->cg_list, &child->cgroups->tasks); 2887 list_add(&child->cg_list, &child->cgroups->tasks);
2888 task_unlock(child);
2805 write_unlock(&css_set_lock); 2889 write_unlock(&css_set_lock);
2806 } 2890 }
2807} 2891}
@@ -2907,6 +2991,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2907 mutex_unlock(&cgroup_mutex); 2991 mutex_unlock(&cgroup_mutex);
2908 return 0; 2992 return 0;
2909 } 2993 }
2994 task_lock(tsk);
2910 cg = tsk->cgroups; 2995 cg = tsk->cgroups;
2911 parent = task_cgroup(tsk, subsys->subsys_id); 2996 parent = task_cgroup(tsk, subsys->subsys_id);
2912 2997
@@ -2919,6 +3004,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2919 3004
2920 /* Keep the cgroup alive */ 3005 /* Keep the cgroup alive */
2921 get_css_set(cg); 3006 get_css_set(cg);
3007 task_unlock(tsk);
2922 mutex_unlock(&cgroup_mutex); 3008 mutex_unlock(&cgroup_mutex);
2923 3009
2924 /* Now do the VFS work to create a cgroup */ 3010 /* Now do the VFS work to create a cgroup */
@@ -2937,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2937 } 3023 }
2938 3024
2939 /* Create the cgroup directory, which also creates the cgroup */ 3025 /* Create the cgroup directory, which also creates the cgroup */
2940 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); 3026 ret = vfs_mkdir(inode, dentry, 0755);
2941 child = __d_cgrp(dentry); 3027 child = __d_cgrp(dentry);
2942 dput(dentry); 3028 dput(dentry);
2943 if (ret) { 3029 if (ret) {
@@ -2947,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2947 goto out_release; 3033 goto out_release;
2948 } 3034 }
2949 3035
2950 if (!child) {
2951 printk(KERN_INFO
2952 "Couldn't find new cgroup %s\n", nodename);
2953 ret = -ENOMEM;
2954 goto out_release;
2955 }
2956
2957 /* The cgroup now exists. Retake cgroup_mutex and check 3036 /* The cgroup now exists. Retake cgroup_mutex and check
2958 * that we're still in the same state that we thought we 3037 * that we're still in the same state that we thought we
2959 * were. */ 3038 * were. */
@@ -3049,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
3049{ 3128{
3050 struct cgroup *cgrp = css->cgroup; 3129 struct cgroup *cgrp = css->cgroup;
3051 rcu_read_lock(); 3130 rcu_read_lock();
3052 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { 3131 if ((atomic_dec_return(&css->refcnt) == 1) &&
3132 notify_on_release(cgrp)) {
3053 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3133 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3054 check_for_release(cgrp); 3134 check_for_release(cgrp);
3055 } 3135 }