aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c316
1 files changed, 183 insertions, 133 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 48348dde6d81..c29831076e7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
84 /* Tracks how many cgroups are currently defined in hierarchy.*/ 84 /* Tracks how many cgroups are currently defined in hierarchy.*/
85 int number_of_cgroups; 85 int number_of_cgroups;
86 86
87 /* A list running through the mounted hierarchies */ 87 /* A list running through the active hierarchies */
88 struct list_head root_list; 88 struct list_head root_list;
89 89
90 /* Hierarchy-specific flags */ 90 /* Hierarchy-specific flags */
@@ -116,7 +116,6 @@ static int root_count;
116 * be called. 116 * be called.
117 */ 117 */
118static int need_forkexit_callback __read_mostly; 118static int need_forkexit_callback __read_mostly;
119static int need_mm_owner_callback __read_mostly;
120 119
121/* convenient tests for these bits */ 120/* convenient tests for these bits */
122inline int cgroup_is_removed(const struct cgroup *cgrp) 121inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -149,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
149#define for_each_subsys(_root, _ss) \ 148#define for_each_subsys(_root, _ss) \
150list_for_each_entry(_ss, &_root->subsys_list, sibling) 149list_for_each_entry(_ss, &_root->subsys_list, sibling)
151 150
152/* for_each_root() allows you to iterate across the active hierarchies */ 151/* for_each_active_root() allows you to iterate across the active hierarchies */
153#define for_each_root(_root) \ 152#define for_each_active_root(_root) \
154list_for_each_entry(_root, &roots, root_list) 153list_for_each_entry(_root, &roots, root_list)
155 154
156/* the list of cgroups eligible for automatic release. Protected by 155/* the list of cgroups eligible for automatic release. Protected by
@@ -272,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
272 271
273 rcu_read_lock(); 272 rcu_read_lock();
274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
275 struct cgroup *cgrp = cg->subsys[i]->cgroup; 274 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
276 if (atomic_dec_and_test(&cgrp->count) && 275 if (atomic_dec_and_test(&cgrp->count) &&
277 notify_on_release(cgrp)) { 276 notify_on_release(cgrp)) {
278 if (taskexit) 277 if (taskexit)
@@ -385,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
385 return 0; 384 return 0;
386} 385}
387 386
387/**
388 * link_css_set - a helper function to link a css_set to a cgroup
389 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
390 * @cg: the css_set to be linked
391 * @cgrp: the destination cgroup
392 */
393static void link_css_set(struct list_head *tmp_cg_links,
394 struct css_set *cg, struct cgroup *cgrp)
395{
396 struct cg_cgroup_link *link;
397
398 BUG_ON(list_empty(tmp_cg_links));
399 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
400 cgrp_link_list);
401 link->cg = cg;
402 list_move(&link->cgrp_link_list, &cgrp->css_sets);
403 list_add(&link->cg_link_list, &cg->cg_links);
404}
405
388/* 406/*
389 * find_css_set() takes an existing cgroup group and a 407 * find_css_set() takes an existing cgroup group and a
390 * cgroup object, and returns a css_set object that's 408 * cgroup object, and returns a css_set object that's
@@ -400,7 +418,6 @@ static struct css_set *find_css_set(
400 int i; 418 int i;
401 419
402 struct list_head tmp_cg_links; 420 struct list_head tmp_cg_links;
403 struct cg_cgroup_link *link;
404 421
405 struct hlist_head *hhead; 422 struct hlist_head *hhead;
406 423
@@ -445,26 +462,11 @@ static struct css_set *find_css_set(
445 * only do it for the first subsystem in each 462 * only do it for the first subsystem in each
446 * hierarchy 463 * hierarchy
447 */ 464 */
448 if (ss->root->subsys_list.next == &ss->sibling) { 465 if (ss->root->subsys_list.next == &ss->sibling)
449 BUG_ON(list_empty(&tmp_cg_links)); 466 link_css_set(&tmp_cg_links, res, cgrp);
450 link = list_entry(tmp_cg_links.next,
451 struct cg_cgroup_link,
452 cgrp_link_list);
453 list_del(&link->cgrp_link_list);
454 list_add(&link->cgrp_link_list, &cgrp->css_sets);
455 link->cg = res;
456 list_add(&link->cg_link_list, &res->cg_links);
457 }
458 }
459 if (list_empty(&rootnode.subsys_list)) {
460 link = list_entry(tmp_cg_links.next,
461 struct cg_cgroup_link,
462 cgrp_link_list);
463 list_del(&link->cgrp_link_list);
464 list_add(&link->cgrp_link_list, &dummytop->css_sets);
465 link->cg = res;
466 list_add(&link->cg_link_list, &res->cg_links);
467 } 467 }
468 if (list_empty(&rootnode.subsys_list))
469 link_css_set(&tmp_cg_links, res, dummytop);
468 470
469 BUG_ON(!list_empty(&tmp_cg_links)); 471 BUG_ON(!list_empty(&tmp_cg_links));
470 472
@@ -573,7 +575,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
573 inode->i_mode = mode; 575 inode->i_mode = mode;
574 inode->i_uid = current_fsuid(); 576 inode->i_uid = current_fsuid();
575 inode->i_gid = current_fsgid(); 577 inode->i_gid = current_fsgid();
576 inode->i_blocks = 0;
577 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 578 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
578 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 579 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
579 } 580 }
@@ -588,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
588{ 589{
589 struct cgroup_subsys *ss; 590 struct cgroup_subsys *ss;
590 for_each_subsys(cgrp->root, ss) 591 for_each_subsys(cgrp->root, ss)
591 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) 592 if (ss->pre_destroy)
592 ss->pre_destroy(ss, cgrp); 593 ss->pre_destroy(ss, cgrp);
593 return; 594 return;
594} 595}
595 596
597static void free_cgroup_rcu(struct rcu_head *obj)
598{
599 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
600
601 kfree(cgrp);
602}
603
596static void cgroup_diput(struct dentry *dentry, struct inode *inode) 604static void cgroup_diput(struct dentry *dentry, struct inode *inode)
597{ 605{
598 /* is dentry a directory ? if so, kfree() associated cgroup */ 606 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -612,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
612 /* 620 /*
613 * Release the subsystem state objects. 621 * Release the subsystem state objects.
614 */ 622 */
615 for_each_subsys(cgrp->root, ss) { 623 for_each_subsys(cgrp->root, ss)
616 if (cgrp->subsys[ss->subsys_id]) 624 ss->destroy(ss, cgrp);
617 ss->destroy(ss, cgrp);
618 }
619 625
620 cgrp->root->number_of_cgroups--; 626 cgrp->root->number_of_cgroups--;
621 mutex_unlock(&cgroup_mutex); 627 mutex_unlock(&cgroup_mutex);
622 628
623 /* Drop the active superblock reference that we took when we 629 /*
624 * created the cgroup */ 630 * Drop the active superblock reference that we took when we
631 * created the cgroup
632 */
625 deactivate_super(cgrp->root->sb); 633 deactivate_super(cgrp->root->sb);
626 634
627 kfree(cgrp); 635 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
628 } 636 }
629 iput(inode); 637 iput(inode);
630} 638}
@@ -714,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
714 BUG_ON(cgrp->subsys[i]); 722 BUG_ON(cgrp->subsys[i]);
715 BUG_ON(!dummytop->subsys[i]); 723 BUG_ON(!dummytop->subsys[i]);
716 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 724 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
725 mutex_lock(&ss->hierarchy_mutex);
717 cgrp->subsys[i] = dummytop->subsys[i]; 726 cgrp->subsys[i] = dummytop->subsys[i];
718 cgrp->subsys[i]->cgroup = cgrp; 727 cgrp->subsys[i]->cgroup = cgrp;
719 list_add(&ss->sibling, &root->subsys_list); 728 list_move(&ss->sibling, &root->subsys_list);
720 rcu_assign_pointer(ss->root, root); 729 ss->root = root;
721 if (ss->bind) 730 if (ss->bind)
722 ss->bind(ss, cgrp); 731 ss->bind(ss, cgrp);
723 732 mutex_unlock(&ss->hierarchy_mutex);
724 } else if (bit & removed_bits) { 733 } else if (bit & removed_bits) {
725 /* We're removing this subsystem */ 734 /* We're removing this subsystem */
726 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 735 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
727 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 736 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
737 mutex_lock(&ss->hierarchy_mutex);
728 if (ss->bind) 738 if (ss->bind)
729 ss->bind(ss, dummytop); 739 ss->bind(ss, dummytop);
730 dummytop->subsys[i]->cgroup = dummytop; 740 dummytop->subsys[i]->cgroup = dummytop;
731 cgrp->subsys[i] = NULL; 741 cgrp->subsys[i] = NULL;
732 rcu_assign_pointer(subsys[i]->root, &rootnode); 742 subsys[i]->root = &rootnode;
733 list_del(&ss->sibling); 743 list_move(&ss->sibling, &rootnode.subsys_list);
744 mutex_unlock(&ss->hierarchy_mutex);
734 } else if (bit & final_bits) { 745 } else if (bit & final_bits) {
735 /* Subsystem state should already exist */ 746 /* Subsystem state should already exist */
736 BUG_ON(!cgrp->subsys[i]); 747 BUG_ON(!cgrp->subsys[i]);
@@ -992,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
992 root = NULL; 1003 root = NULL;
993 } else { 1004 } else {
994 /* New superblock */ 1005 /* New superblock */
995 struct cgroup *cgrp = &root->top_cgroup; 1006 struct cgroup *root_cgrp = &root->top_cgroup;
996 struct inode *inode; 1007 struct inode *inode;
997 int i; 1008 int i;
998 1009
@@ -1033,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1033 list_add(&root->root_list, &roots); 1044 list_add(&root->root_list, &roots);
1034 root_count++; 1045 root_count++;
1035 1046
1036 sb->s_root->d_fsdata = &root->top_cgroup; 1047 sb->s_root->d_fsdata = root_cgrp;
1037 root->top_cgroup.dentry = sb->s_root; 1048 root->top_cgroup.dentry = sb->s_root;
1038 1049
1039 /* Link the top cgroup in this hierarchy into all 1050 /* Link the top cgroup in this hierarchy into all
@@ -1044,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1044 struct hlist_node *node; 1055 struct hlist_node *node;
1045 struct css_set *cg; 1056 struct css_set *cg;
1046 1057
1047 hlist_for_each_entry(cg, node, hhead, hlist) { 1058 hlist_for_each_entry(cg, node, hhead, hlist)
1048 struct cg_cgroup_link *link; 1059 link_css_set(&tmp_cg_links, cg, root_cgrp);
1049
1050 BUG_ON(list_empty(&tmp_cg_links));
1051 link = list_entry(tmp_cg_links.next,
1052 struct cg_cgroup_link,
1053 cgrp_link_list);
1054 list_del(&link->cgrp_link_list);
1055 link->cg = cg;
1056 list_add(&link->cgrp_link_list,
1057 &root->top_cgroup.css_sets);
1058 list_add(&link->cg_link_list, &cg->cg_links);
1059 }
1060 } 1060 }
1061 write_unlock(&css_set_lock); 1061 write_unlock(&css_set_lock);
1062 1062
1063 free_cg_links(&tmp_cg_links); 1063 free_cg_links(&tmp_cg_links);
1064 1064
1065 BUG_ON(!list_empty(&cgrp->sibling)); 1065 BUG_ON(!list_empty(&root_cgrp->sibling));
1066 BUG_ON(!list_empty(&cgrp->children)); 1066 BUG_ON(!list_empty(&root_cgrp->children));
1067 BUG_ON(root->number_of_cgroups != 1); 1067 BUG_ON(root->number_of_cgroups != 1);
1068 1068
1069 cgroup_populate_dir(cgrp); 1069 cgroup_populate_dir(root_cgrp);
1070 mutex_unlock(&inode->i_mutex); 1070 mutex_unlock(&inode->i_mutex);
1071 mutex_unlock(&cgroup_mutex); 1071 mutex_unlock(&cgroup_mutex);
1072 } 1072 }
@@ -1115,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1115 } 1115 }
1116 write_unlock(&css_set_lock); 1116 write_unlock(&css_set_lock);
1117 1117
1118 if (!list_empty(&root->root_list)) { 1118 list_del(&root->root_list);
1119 list_del(&root->root_list); 1119 root_count--;
1120 root_count--; 1120
1121 }
1122 mutex_unlock(&cgroup_mutex); 1121 mutex_unlock(&cgroup_mutex);
1123 1122
1124 kfree(root); 1123 kfree(root);
@@ -1147,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1147 * @buf: the buffer to write the path into 1146 * @buf: the buffer to write the path into
1148 * @buflen: the length of the buffer 1147 * @buflen: the length of the buffer
1149 * 1148 *
1150 * Called with cgroup_mutex held. Writes path of cgroup into buf. 1149 * Called with cgroup_mutex held or else with an RCU-protected cgroup
1151 * Returns 0 on success, -errno on error. 1150 * reference. Writes path of cgroup into buf. Returns 0 on success,
1151 * -errno on error.
1152 */ 1152 */
1153int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1153int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1154{ 1154{
1155 char *start; 1155 char *start;
1156 struct dentry *dentry = rcu_dereference(cgrp->dentry);
1156 1157
1157 if (cgrp == dummytop) { 1158 if (!dentry || cgrp == dummytop) {
1158 /* 1159 /*
1159 * Inactive subsystems have no dentry for their root 1160 * Inactive subsystems have no dentry for their root
1160 * cgroup 1161 * cgroup
@@ -1167,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1167 1168
1168 *--start = '\0'; 1169 *--start = '\0';
1169 for (;;) { 1170 for (;;) {
1170 int len = cgrp->dentry->d_name.len; 1171 int len = dentry->d_name.len;
1171 if ((start -= len) < buf) 1172 if ((start -= len) < buf)
1172 return -ENAMETOOLONG; 1173 return -ENAMETOOLONG;
1173 memcpy(start, cgrp->dentry->d_name.name, len); 1174 memcpy(start, cgrp->dentry->d_name.name, len);
1174 cgrp = cgrp->parent; 1175 cgrp = cgrp->parent;
1175 if (!cgrp) 1176 if (!cgrp)
1176 break; 1177 break;
1178 dentry = rcu_dereference(cgrp->dentry);
1177 if (!cgrp->parent) 1179 if (!cgrp->parent)
1178 continue; 1180 continue;
1179 if (--start < buf) 1181 if (--start < buf)
@@ -1218,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1218 int retval = 0; 1220 int retval = 0;
1219 struct cgroup_subsys *ss; 1221 struct cgroup_subsys *ss;
1220 struct cgroup *oldcgrp; 1222 struct cgroup *oldcgrp;
1221 struct css_set *cg = tsk->cgroups; 1223 struct css_set *cg;
1222 struct css_set *newcg; 1224 struct css_set *newcg;
1223 struct cgroupfs_root *root = cgrp->root; 1225 struct cgroupfs_root *root = cgrp->root;
1224 int subsys_id; 1226 int subsys_id;
@@ -1238,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1238 } 1240 }
1239 } 1241 }
1240 1242
1243 task_lock(tsk);
1244 cg = tsk->cgroups;
1245 get_css_set(cg);
1246 task_unlock(tsk);
1241 /* 1247 /*
1242 * Locate or allocate a new css_set for this task, 1248 * Locate or allocate a new css_set for this task,
1243 * based on its final set of cgroups 1249 * based on its final set of cgroups
1244 */ 1250 */
1245 newcg = find_css_set(cg, cgrp); 1251 newcg = find_css_set(cg, cgrp);
1252 put_css_set(cg);
1246 if (!newcg) 1253 if (!newcg)
1247 return -ENOMEM; 1254 return -ENOMEM;
1248 1255
@@ -1447,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1447 struct cftype *cft = __d_cft(file->f_dentry); 1454 struct cftype *cft = __d_cft(file->f_dentry);
1448 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1455 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1449 1456
1450 if (!cft || cgroup_is_removed(cgrp)) 1457 if (cgroup_is_removed(cgrp))
1451 return -ENODEV; 1458 return -ENODEV;
1452 if (cft->write) 1459 if (cft->write)
1453 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1460 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1492,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1492 struct cftype *cft = __d_cft(file->f_dentry); 1499 struct cftype *cft = __d_cft(file->f_dentry);
1493 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1500 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1494 1501
1495 if (!cft || cgroup_is_removed(cgrp)) 1502 if (cgroup_is_removed(cgrp))
1496 return -ENODEV; 1503 return -ENODEV;
1497 1504
1498 if (cft->read) 1505 if (cft->read)
@@ -1556,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1556 err = generic_file_open(inode, file); 1563 err = generic_file_open(inode, file);
1557 if (err) 1564 if (err)
1558 return err; 1565 return err;
1559
1560 cft = __d_cft(file->f_dentry); 1566 cft = __d_cft(file->f_dentry);
1561 if (!cft) 1567
1562 return -ENODEV;
1563 if (cft->read_map || cft->read_seq_string) { 1568 if (cft->read_map || cft->read_seq_string) {
1564 struct cgroup_seqfile_state *state = 1569 struct cgroup_seqfile_state *state =
1565 kzalloc(sizeof(*state), GFP_USER); 1570 kzalloc(sizeof(*state), GFP_USER);
@@ -1673,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1673 if (!error) { 1678 if (!error) {
1674 dentry->d_fsdata = cgrp; 1679 dentry->d_fsdata = cgrp;
1675 inc_nlink(parent->d_inode); 1680 inc_nlink(parent->d_inode);
1676 cgrp->dentry = dentry; 1681 rcu_assign_pointer(cgrp->dentry, dentry);
1677 dget(dentry); 1682 dget(dentry);
1678 } 1683 }
1679 dput(dentry); 1684 dput(dentry);
@@ -1814,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1814{ 1819{
1815 struct task_struct *res; 1820 struct task_struct *res;
1816 struct list_head *l = it->task; 1821 struct list_head *l = it->task;
1822 struct cg_cgroup_link *link;
1817 1823
1818 /* If the iterator cg is NULL, we have no tasks */ 1824 /* If the iterator cg is NULL, we have no tasks */
1819 if (!it->cg_link) 1825 if (!it->cg_link)
@@ -1821,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1821 res = list_entry(l, struct task_struct, cg_list); 1827 res = list_entry(l, struct task_struct, cg_list);
1822 /* Advance iterator to find next entry */ 1828 /* Advance iterator to find next entry */
1823 l = l->next; 1829 l = l->next;
1824 if (l == &res->cgroups->tasks) { 1830 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
1831 if (l == &link->cg->tasks) {
1825 /* We reached the end of this task list - move on to 1832 /* We reached the end of this task list - move on to
1826 * the next cg_cgroup_link */ 1833 * the next cg_cgroup_link */
1827 cgroup_advance_iter(cgrp, it); 1834 cgroup_advance_iter(cgrp, it);
@@ -2015,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2015 */ 2022 */
2016static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2023static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
2017{ 2024{
2018 int n = 0; 2025 int n = 0, pid;
2019 struct cgroup_iter it; 2026 struct cgroup_iter it;
2020 struct task_struct *tsk; 2027 struct task_struct *tsk;
2021 cgroup_iter_start(cgrp, &it); 2028 cgroup_iter_start(cgrp, &it);
2022 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2029 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2023 if (unlikely(n == npids)) 2030 if (unlikely(n == npids))
2024 break; 2031 break;
2025 pidarray[n++] = task_pid_vnr(tsk); 2032 pid = task_pid_vnr(tsk);
2033 if (pid > 0)
2034 pidarray[n++] = pid;
2026 } 2035 }
2027 cgroup_iter_end(cgrp, &it); 2036 cgroup_iter_end(cgrp, &it);
2028 return n; 2037 return n;
@@ -2054,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2054 2063
2055 ret = 0; 2064 ret = 0;
2056 cgrp = dentry->d_fsdata; 2065 cgrp = dentry->d_fsdata;
2057 rcu_read_lock();
2058 2066
2059 cgroup_iter_start(cgrp, &it); 2067 cgroup_iter_start(cgrp, &it);
2060 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2068 while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2079,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2079 } 2087 }
2080 cgroup_iter_end(cgrp, &it); 2088 cgroup_iter_end(cgrp, &it);
2081 2089
2082 rcu_read_unlock();
2083err: 2090err:
2084 return ret; 2091 return ret;
2085} 2092}
@@ -2326,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2326 struct cgroup *cgrp) 2333 struct cgroup *cgrp)
2327{ 2334{
2328 css->cgroup = cgrp; 2335 css->cgroup = cgrp;
2329 atomic_set(&css->refcnt, 0); 2336 atomic_set(&css->refcnt, 1);
2330 css->flags = 0; 2337 css->flags = 0;
2331 if (cgrp == dummytop) 2338 if (cgrp == dummytop)
2332 set_bit(CSS_ROOT, &css->flags); 2339 set_bit(CSS_ROOT, &css->flags);
@@ -2334,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2334 cgrp->subsys[ss->subsys_id] = css; 2341 cgrp->subsys[ss->subsys_id] = css;
2335} 2342}
2336 2343
2344static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2345{
2346 /* We need to take each hierarchy_mutex in a consistent order */
2347 int i;
2348
2349 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2350 struct cgroup_subsys *ss = subsys[i];
2351 if (ss->root == root)
2352 mutex_lock_nested(&ss->hierarchy_mutex, i);
2353 }
2354}
2355
2356static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2357{
2358 int i;
2359
2360 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2361 struct cgroup_subsys *ss = subsys[i];
2362 if (ss->root == root)
2363 mutex_unlock(&ss->hierarchy_mutex);
2364 }
2365}
2366
2337/* 2367/*
2338 * cgroup_create - create a cgroup 2368 * cgroup_create - create a cgroup
2339 * @parent: cgroup that will be parent of the new cgroup 2369 * @parent: cgroup that will be parent of the new cgroup
@@ -2382,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2382 init_cgroup_css(css, ss, cgrp); 2412 init_cgroup_css(css, ss, cgrp);
2383 } 2413 }
2384 2414
2415 cgroup_lock_hierarchy(root);
2385 list_add(&cgrp->sibling, &cgrp->parent->children); 2416 list_add(&cgrp->sibling, &cgrp->parent->children);
2417 cgroup_unlock_hierarchy(root);
2386 root->number_of_cgroups++; 2418 root->number_of_cgroups++;
2387 2419
2388 err = cgroup_create_dir(cgrp, dentry, mode); 2420 err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2433,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2433{ 2465{
2434 /* Check the reference count on each subsystem. Since we 2466 /* Check the reference count on each subsystem. Since we
2435 * already established that there are no tasks in the 2467 * already established that there are no tasks in the
2436 * cgroup, if the css refcount is also 0, then there should 2468 * cgroup, if the css refcount is also 1, then there should
2437 * be no outstanding references, so the subsystem is safe to 2469 * be no outstanding references, so the subsystem is safe to
2438 * destroy. We scan across all subsystems rather than using 2470 * destroy. We scan across all subsystems rather than using
2439 * the per-hierarchy linked list of mounted subsystems since 2471 * the per-hierarchy linked list of mounted subsystems since
@@ -2454,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2454 * matter, since it can only happen if the cgroup 2486 * matter, since it can only happen if the cgroup
2455 * has been deleted and hence no longer needs the 2487 * has been deleted and hence no longer needs the
2456 * release agent to be called anyway. */ 2488 * release agent to be called anyway. */
2457 if (css && atomic_read(&css->refcnt)) 2489 if (css && (atomic_read(&css->refcnt) > 1))
2458 return 1; 2490 return 1;
2459 } 2491 }
2460 return 0; 2492 return 0;
2461} 2493}
2462 2494
2495/*
2496 * Atomically mark all (or else none) of the cgroup's CSS objects as
2497 * CSS_REMOVED. Return true on success, or false if the cgroup has
2498 * busy subsystems. Call with cgroup_mutex held
2499 */
2500
2501static int cgroup_clear_css_refs(struct cgroup *cgrp)
2502{
2503 struct cgroup_subsys *ss;
2504 unsigned long flags;
2505 bool failed = false;
2506 local_irq_save(flags);
2507 for_each_subsys(cgrp->root, ss) {
2508 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2509 int refcnt;
2510 do {
2511 /* We can only remove a CSS with a refcnt==1 */
2512 refcnt = atomic_read(&css->refcnt);
2513 if (refcnt > 1) {
2514 failed = true;
2515 goto done;
2516 }
2517 BUG_ON(!refcnt);
2518 /*
2519 * Drop the refcnt to 0 while we check other
2520 * subsystems. This will cause any racing
2521 * css_tryget() to spin until we set the
2522 * CSS_REMOVED bits or abort
2523 */
2524 } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
2525 }
2526 done:
2527 for_each_subsys(cgrp->root, ss) {
2528 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2529 if (failed) {
2530 /*
2531 * Restore old refcnt if we previously managed
2532 * to clear it from 1 to 0
2533 */
2534 if (!atomic_read(&css->refcnt))
2535 atomic_set(&css->refcnt, 1);
2536 } else {
2537 /* Commit the fact that the CSS is removed */
2538 set_bit(CSS_REMOVED, &css->flags);
2539 }
2540 }
2541 local_irq_restore(flags);
2542 return !failed;
2543}
2544
2463static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 2545static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2464{ 2546{
2465 struct cgroup *cgrp = dentry->d_fsdata; 2547 struct cgroup *cgrp = dentry->d_fsdata;
2466 struct dentry *d; 2548 struct dentry *d;
2467 struct cgroup *parent; 2549 struct cgroup *parent;
2468 struct super_block *sb;
2469 struct cgroupfs_root *root;
2470 2550
2471 /* the vfs holds both inode->i_mutex already */ 2551 /* the vfs holds both inode->i_mutex already */
2472 2552
@@ -2489,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2489 2569
2490 mutex_lock(&cgroup_mutex); 2570 mutex_lock(&cgroup_mutex);
2491 parent = cgrp->parent; 2571 parent = cgrp->parent;
2492 root = cgrp->root;
2493 sb = root->sb;
2494 2572
2495 if (atomic_read(&cgrp->count) 2573 if (atomic_read(&cgrp->count)
2496 || !list_empty(&cgrp->children) 2574 || !list_empty(&cgrp->children)
2497 || cgroup_has_css_refs(cgrp)) { 2575 || !cgroup_clear_css_refs(cgrp)) {
2498 mutex_unlock(&cgroup_mutex); 2576 mutex_unlock(&cgroup_mutex);
2499 return -EBUSY; 2577 return -EBUSY;
2500 } 2578 }
@@ -2504,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2504 if (!list_empty(&cgrp->release_list)) 2582 if (!list_empty(&cgrp->release_list))
2505 list_del(&cgrp->release_list); 2583 list_del(&cgrp->release_list);
2506 spin_unlock(&release_list_lock); 2584 spin_unlock(&release_list_lock);
2507 /* delete my sibling from parent->children */ 2585
2586 cgroup_lock_hierarchy(cgrp->root);
2587 /* delete this cgroup from parent->children */
2508 list_del(&cgrp->sibling); 2588 list_del(&cgrp->sibling);
2589 cgroup_unlock_hierarchy(cgrp->root);
2590
2509 spin_lock(&cgrp->dentry->d_lock); 2591 spin_lock(&cgrp->dentry->d_lock);
2510 d = dget(cgrp->dentry); 2592 d = dget(cgrp->dentry);
2511 spin_unlock(&d->d_lock); 2593 spin_unlock(&d->d_lock);
@@ -2527,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2527 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2609 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2528 2610
2529 /* Create the top cgroup state for this subsystem */ 2611 /* Create the top cgroup state for this subsystem */
2612 list_add(&ss->sibling, &rootnode.subsys_list);
2530 ss->root = &rootnode; 2613 ss->root = &rootnode;
2531 css = ss->create(ss, dummytop); 2614 css = ss->create(ss, dummytop);
2532 /* We don't handle early failures gracefully */ 2615 /* We don't handle early failures gracefully */
@@ -2540,13 +2623,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2540 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 2623 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2541 2624
2542 need_forkexit_callback |= ss->fork || ss->exit; 2625 need_forkexit_callback |= ss->fork || ss->exit;
2543 need_mm_owner_callback |= !!ss->mm_owner_changed;
2544 2626
2545 /* At system boot, before all subsystems have been 2627 /* At system boot, before all subsystems have been
2546 * registered, no tasks have been forked, so we don't 2628 * registered, no tasks have been forked, so we don't
2547 * need to invoke fork callbacks here. */ 2629 * need to invoke fork callbacks here. */
2548 BUG_ON(!list_empty(&init_task.tasks)); 2630 BUG_ON(!list_empty(&init_task.tasks));
2549 2631
2632 mutex_init(&ss->hierarchy_mutex);
2550 ss->active = 1; 2633 ss->active = 1;
2551} 2634}
2552 2635
@@ -2565,7 +2648,6 @@ int __init cgroup_init_early(void)
2565 INIT_HLIST_NODE(&init_css_set.hlist); 2648 INIT_HLIST_NODE(&init_css_set.hlist);
2566 css_set_count = 1; 2649 css_set_count = 1;
2567 init_cgroup_root(&rootnode); 2650 init_cgroup_root(&rootnode);
2568 list_add(&rootnode.root_list, &roots);
2569 root_count = 1; 2651 root_count = 1;
2570 init_task.cgroups = &init_css_set; 2652 init_task.cgroups = &init_css_set;
2571 2653
@@ -2672,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2672 2754
2673 mutex_lock(&cgroup_mutex); 2755 mutex_lock(&cgroup_mutex);
2674 2756
2675 for_each_root(root) { 2757 for_each_active_root(root) {
2676 struct cgroup_subsys *ss; 2758 struct cgroup_subsys *ss;
2677 struct cgroup *cgrp; 2759 struct cgroup *cgrp;
2678 int subsys_id; 2760 int subsys_id;
2679 int count = 0; 2761 int count = 0;
2680 2762
2681 /* Skip this hierarchy if it has no active subsystems */
2682 if (!root->actual_subsys_bits)
2683 continue;
2684 seq_printf(m, "%lu:", root->subsys_bits); 2763 seq_printf(m, "%lu:", root->subsys_bits);
2685 for_each_subsys(root, ss) 2764 for_each_subsys(root, ss)
2686 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2765 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2790,37 +2869,6 @@ void cgroup_fork_callbacks(struct task_struct *child)
2790 } 2869 }
2791} 2870}
2792 2871
2793#ifdef CONFIG_MM_OWNER
2794/**
2795 * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
2796 * @p: the new owner
2797 *
2798 * Called on every change to mm->owner. mm_init_owner() does not
2799 * invoke this routine, since it assigns the mm->owner the first time
2800 * and does not change it.
2801 *
2802 * The callbacks are invoked with mmap_sem held in read mode.
2803 */
2804void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2805{
2806 struct cgroup *oldcgrp, *newcgrp = NULL;
2807
2808 if (need_mm_owner_callback) {
2809 int i;
2810 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2811 struct cgroup_subsys *ss = subsys[i];
2812 oldcgrp = task_cgroup(old, ss->subsys_id);
2813 if (new)
2814 newcgrp = task_cgroup(new, ss->subsys_id);
2815 if (oldcgrp == newcgrp)
2816 continue;
2817 if (ss->mm_owner_changed)
2818 ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
2819 }
2820 }
2821}
2822#endif /* CONFIG_MM_OWNER */
2823
2824/** 2872/**
2825 * cgroup_post_fork - called on a new task after adding it to the task list 2873 * cgroup_post_fork - called on a new task after adding it to the task list
2826 * @child: the task in question 2874 * @child: the task in question
@@ -2834,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child)
2834{ 2882{
2835 if (use_task_css_set_links) { 2883 if (use_task_css_set_links) {
2836 write_lock(&css_set_lock); 2884 write_lock(&css_set_lock);
2885 task_lock(child);
2837 if (list_empty(&child->cg_list)) 2886 if (list_empty(&child->cg_list))
2838 list_add(&child->cg_list, &child->cgroups->tasks); 2887 list_add(&child->cg_list, &child->cgroups->tasks);
2888 task_unlock(child);
2839 write_unlock(&css_set_lock); 2889 write_unlock(&css_set_lock);
2840 } 2890 }
2841} 2891}
@@ -2941,14 +2991,20 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2941 mutex_unlock(&cgroup_mutex); 2991 mutex_unlock(&cgroup_mutex);
2942 return 0; 2992 return 0;
2943 } 2993 }
2994 task_lock(tsk);
2944 cg = tsk->cgroups; 2995 cg = tsk->cgroups;
2945 parent = task_cgroup(tsk, subsys->subsys_id); 2996 parent = task_cgroup(tsk, subsys->subsys_id);
2946 2997
2947 /* Pin the hierarchy */ 2998 /* Pin the hierarchy */
2948 atomic_inc(&parent->root->sb->s_active); 2999 if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
3000 /* We race with the final deactivate_super() */
3001 mutex_unlock(&cgroup_mutex);
3002 return 0;
3003 }
2949 3004
2950 /* Keep the cgroup alive */ 3005 /* Keep the cgroup alive */
2951 get_css_set(cg); 3006 get_css_set(cg);
3007 task_unlock(tsk);
2952 mutex_unlock(&cgroup_mutex); 3008 mutex_unlock(&cgroup_mutex);
2953 3009
2954 /* Now do the VFS work to create a cgroup */ 3010 /* Now do the VFS work to create a cgroup */
@@ -2967,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2967 } 3023 }
2968 3024
2969 /* Create the cgroup directory, which also creates the cgroup */ 3025 /* Create the cgroup directory, which also creates the cgroup */
2970 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); 3026 ret = vfs_mkdir(inode, dentry, 0755);
2971 child = __d_cgrp(dentry); 3027 child = __d_cgrp(dentry);
2972 dput(dentry); 3028 dput(dentry);
2973 if (ret) { 3029 if (ret) {
@@ -2977,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2977 goto out_release; 3033 goto out_release;
2978 } 3034 }
2979 3035
2980 if (!child) {
2981 printk(KERN_INFO
2982 "Couldn't find new cgroup %s\n", nodename);
2983 ret = -ENOMEM;
2984 goto out_release;
2985 }
2986
2987 /* The cgroup now exists. Retake cgroup_mutex and check 3036 /* The cgroup now exists. Retake cgroup_mutex and check
2988 * that we're still in the same state that we thought we 3037 * that we're still in the same state that we thought we
2989 * were. */ 3038 * were. */
@@ -3079,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
3079{ 3128{
3080 struct cgroup *cgrp = css->cgroup; 3129 struct cgroup *cgrp = css->cgroup;
3081 rcu_read_lock(); 3130 rcu_read_lock();
3082 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { 3131 if ((atomic_dec_return(&css->refcnt) == 1) &&
3132 notify_on_release(cgrp)) {
3083 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3133 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3084 check_for_release(cgrp); 3134 check_for_release(cgrp);
3085 } 3135 }