aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-01-09 15:59:24 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-01-09 15:59:24 -0500
commitdb0c2bf69aa095d4a6de7b1145f29fe9a7c0f6a3 (patch)
tree8f38957c01b18edddd44d49ecc3beeac08a20b4e /kernel/cgroup.c
parentac69e0928054ff29a5049902fb477f9c7605c773 (diff)
parent0d19ea866562e46989412a0676412fa0983c9ce7 (diff)
Merge branch 'for-3.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
* 'for-3.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits) cgroup: fix to allow mounting a hierarchy by name cgroup: move assignement out of condition in cgroup_attach_proc() cgroup: Remove task_lock() from cgroup_post_fork() cgroup: add sparse annotation to cgroup_iter_start() and cgroup_iter_end() cgroup: mark cgroup_rmdir_waitq and cgroup_attach_proc() as static cgroup: only need to check oldcgrp==newgrp once cgroup: remove redundant get/put of task struct cgroup: remove redundant get/put of old css_set from migrate cgroup: Remove unnecessary task_lock before fetching css_set on migration cgroup: Drop task_lock(parent) on cgroup_fork() cgroups: remove redundant get/put of css_set from css_set_check_fetched() resource cgroups: remove bogus cast cgroup: kill subsys->can_attach_task(), pre_attach() and attach_task() cgroup, cpuset: don't use ss->pre_attach() cgroup: don't use subsys->can_attach_task() or ->attach_task() cgroup: introduce cgroup_taskset and use it in subsys->can_attach(), cancel_attach() and attach() cgroup: improve old cgroup handling in cgroup_attach_proc() cgroup: always lock threadgroup during migration threadgroup: extend threadgroup_lock() to cover exit and exec threadgroup: rename signal->threadgroup_fork_lock to ->group_rwsem ... Fix up conflict in kernel/cgroup.c due to commit e0197aae59e5: "cgroups: fix a css_set not found bug in cgroup_attach_proc" that already mentioned that the bug is fixed (differently) in Tejun's cgroup patchset. This one, in other words.
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c401
1 files changed, 231 insertions, 170 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7cab65f83f1..a5d3b5325f7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it.
69 *
70 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
71 * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
72 * release_agent_path and so on. Modifying requires both cgroup_mutex and
73 * cgroup_root_mutex. Readers can acquire either of the two. This is to
74 * break the following locking order cycle.
75 *
76 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
77 * B. namespace_sem -> cgroup_mutex
78 *
79 * B happens only through cgroup_show_options() and using cgroup_root_mutex
80 * breaks it.
81 */
66static DEFINE_MUTEX(cgroup_mutex); 82static DEFINE_MUTEX(cgroup_mutex);
83static DEFINE_MUTEX(cgroup_root_mutex);
67 84
68/* 85/*
69 * Generate an array of cgroup subsystem pointers. At boot time, this is 86 * Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
921 * 938 *
922 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; 939 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
923 */ 940 */
924DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 941static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
925 942
926static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) 943static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
927{ 944{
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
953 int i; 970 int i;
954 971
955 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 972 BUG_ON(!mutex_is_locked(&cgroup_mutex));
973 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
956 974
957 removed_bits = root->actual_subsys_bits & ~final_bits; 975 removed_bits = root->actual_subsys_bits & ~final_bits;
958 added_bits = final_bits & ~root->actual_subsys_bits; 976 added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1043,7 +1061,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1043 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1061 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1044 struct cgroup_subsys *ss; 1062 struct cgroup_subsys *ss;
1045 1063
1046 mutex_lock(&cgroup_mutex); 1064 mutex_lock(&cgroup_root_mutex);
1047 for_each_subsys(root, ss) 1065 for_each_subsys(root, ss)
1048 seq_printf(seq, ",%s", ss->name); 1066 seq_printf(seq, ",%s", ss->name);
1049 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1067 if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1054 seq_puts(seq, ",clone_children"); 1072 seq_puts(seq, ",clone_children");
1055 if (strlen(root->name)) 1073 if (strlen(root->name))
1056 seq_printf(seq, ",name=%s", root->name); 1074 seq_printf(seq, ",name=%s", root->name);
1057 mutex_unlock(&cgroup_mutex); 1075 mutex_unlock(&cgroup_root_mutex);
1058 return 0; 1076 return 0;
1059} 1077}
1060 1078
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1175 1193
1176 /* 1194 /*
1177 * If the 'all' option was specified select all the subsystems, 1195 * If the 'all' option was specified select all the subsystems,
1178 * otherwise 'all, 'none' and a subsystem name options were not 1196 * otherwise if 'none', 'name=' and a subsystem name options
1179 * specified, let's default to 'all' 1197 * were not specified, let's default to 'all'
1180 */ 1198 */
1181 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1199 if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1182 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1200 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1183 struct cgroup_subsys *ss = subsys[i]; 1201 struct cgroup_subsys *ss = subsys[i];
1184 if (ss == NULL) 1202 if (ss == NULL)
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1269 1287
1270 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1288 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1271 mutex_lock(&cgroup_mutex); 1289 mutex_lock(&cgroup_mutex);
1290 mutex_lock(&cgroup_root_mutex);
1272 1291
1273 /* See what subsystems are wanted */ 1292 /* See what subsystems are wanted */
1274 ret = parse_cgroupfs_options(data, &opts); 1293 ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1297 out_unlock: 1316 out_unlock:
1298 kfree(opts.release_agent); 1317 kfree(opts.release_agent);
1299 kfree(opts.name); 1318 kfree(opts.name);
1319 mutex_unlock(&cgroup_root_mutex);
1300 mutex_unlock(&cgroup_mutex); 1320 mutex_unlock(&cgroup_mutex);
1301 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1321 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1302 return ret; 1322 return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1481 int ret = 0; 1501 int ret = 0;
1482 struct super_block *sb; 1502 struct super_block *sb;
1483 struct cgroupfs_root *new_root; 1503 struct cgroupfs_root *new_root;
1504 struct inode *inode;
1484 1505
1485 /* First find the desired set of subsystems */ 1506 /* First find the desired set of subsystems */
1486 mutex_lock(&cgroup_mutex); 1507 mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1514 /* We used the new root structure, so this is a new hierarchy */ 1535 /* We used the new root structure, so this is a new hierarchy */
1515 struct list_head tmp_cg_links; 1536 struct list_head tmp_cg_links;
1516 struct cgroup *root_cgrp = &root->top_cgroup; 1537 struct cgroup *root_cgrp = &root->top_cgroup;
1517 struct inode *inode;
1518 struct cgroupfs_root *existing_root; 1538 struct cgroupfs_root *existing_root;
1519 const struct cred *cred; 1539 const struct cred *cred;
1520 int i; 1540 int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1528 1548
1529 mutex_lock(&inode->i_mutex); 1549 mutex_lock(&inode->i_mutex);
1530 mutex_lock(&cgroup_mutex); 1550 mutex_lock(&cgroup_mutex);
1551 mutex_lock(&cgroup_root_mutex);
1531 1552
1532 if (strlen(root->name)) { 1553 /* Check for name clashes with existing mounts */
1533 /* Check for name clashes with existing mounts */ 1554 ret = -EBUSY;
1534 for_each_active_root(existing_root) { 1555 if (strlen(root->name))
1535 if (!strcmp(existing_root->name, root->name)) { 1556 for_each_active_root(existing_root)
1536 ret = -EBUSY; 1557 if (!strcmp(existing_root->name, root->name))
1537 mutex_unlock(&cgroup_mutex); 1558 goto unlock_drop;
1538 mutex_unlock(&inode->i_mutex);
1539 goto drop_new_super;
1540 }
1541 }
1542 }
1543 1559
1544 /* 1560 /*
1545 * We're accessing css_set_count without locking 1561 * We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1549 * have some link structures left over 1565 * have some link structures left over
1550 */ 1566 */
1551 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1567 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1552 if (ret) { 1568 if (ret)
1553 mutex_unlock(&cgroup_mutex); 1569 goto unlock_drop;
1554 mutex_unlock(&inode->i_mutex);
1555 goto drop_new_super;
1556 }
1557 1570
1558 ret = rebind_subsystems(root, root->subsys_bits); 1571 ret = rebind_subsystems(root, root->subsys_bits);
1559 if (ret == -EBUSY) { 1572 if (ret == -EBUSY) {
1560 mutex_unlock(&cgroup_mutex);
1561 mutex_unlock(&inode->i_mutex);
1562 free_cg_links(&tmp_cg_links); 1573 free_cg_links(&tmp_cg_links);
1563 goto drop_new_super; 1574 goto unlock_drop;
1564 } 1575 }
1565 /* 1576 /*
1566 * There must be no failure case after here, since rebinding 1577 * There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1599 cred = override_creds(&init_cred); 1610 cred = override_creds(&init_cred);
1600 cgroup_populate_dir(root_cgrp); 1611 cgroup_populate_dir(root_cgrp);
1601 revert_creds(cred); 1612 revert_creds(cred);
1613 mutex_unlock(&cgroup_root_mutex);
1602 mutex_unlock(&cgroup_mutex); 1614 mutex_unlock(&cgroup_mutex);
1603 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1604 } else { 1616 } else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1615 kfree(opts.name); 1627 kfree(opts.name);
1616 return dget(sb->s_root); 1628 return dget(sb->s_root);
1617 1629
1630 unlock_drop:
1631 mutex_unlock(&cgroup_root_mutex);
1632 mutex_unlock(&cgroup_mutex);
1633 mutex_unlock(&inode->i_mutex);
1618 drop_new_super: 1634 drop_new_super:
1619 deactivate_locked_super(sb); 1635 deactivate_locked_super(sb);
1620 drop_modules: 1636 drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1639 BUG_ON(!list_empty(&cgrp->sibling)); 1655 BUG_ON(!list_empty(&cgrp->sibling));
1640 1656
1641 mutex_lock(&cgroup_mutex); 1657 mutex_lock(&cgroup_mutex);
1658 mutex_lock(&cgroup_root_mutex);
1642 1659
1643 /* Rebind all subsystems back to the default hierarchy */ 1660 /* Rebind all subsystems back to the default hierarchy */
1644 ret = rebind_subsystems(root, 0); 1661 ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1664 root_count--; 1681 root_count--;
1665 } 1682 }
1666 1683
1684 mutex_unlock(&cgroup_root_mutex);
1667 mutex_unlock(&cgroup_mutex); 1685 mutex_unlock(&cgroup_mutex);
1668 1686
1669 kill_litter_super(sb); 1687 kill_litter_super(sb);
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1740EXPORT_SYMBOL_GPL(cgroup_path); 1758EXPORT_SYMBOL_GPL(cgroup_path);
1741 1759
1742/* 1760/*
1761 * Control Group taskset
1762 */
1763struct task_and_cgroup {
1764 struct task_struct *task;
1765 struct cgroup *cgrp;
1766};
1767
1768struct cgroup_taskset {
1769 struct task_and_cgroup single;
1770 struct flex_array *tc_array;
1771 int tc_array_len;
1772 int idx;
1773 struct cgroup *cur_cgrp;
1774};
1775
1776/**
1777 * cgroup_taskset_first - reset taskset and return the first task
1778 * @tset: taskset of interest
1779 *
1780 * @tset iteration is initialized and the first task is returned.
1781 */
1782struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1783{
1784 if (tset->tc_array) {
1785 tset->idx = 0;
1786 return cgroup_taskset_next(tset);
1787 } else {
1788 tset->cur_cgrp = tset->single.cgrp;
1789 return tset->single.task;
1790 }
1791}
1792EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1793
1794/**
1795 * cgroup_taskset_next - iterate to the next task in taskset
1796 * @tset: taskset of interest
1797 *
1798 * Return the next task in @tset. Iteration must have been initialized
1799 * with cgroup_taskset_first().
1800 */
1801struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1802{
1803 struct task_and_cgroup *tc;
1804
1805 if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1806 return NULL;
1807
1808 tc = flex_array_get(tset->tc_array, tset->idx++);
1809 tset->cur_cgrp = tc->cgrp;
1810 return tc->task;
1811}
1812EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1813
1814/**
1815 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
1816 * @tset: taskset of interest
1817 *
1818 * Return the cgroup for the current (last returned) task of @tset. This
1819 * function must be preceded by either cgroup_taskset_first() or
1820 * cgroup_taskset_next().
1821 */
1822struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1823{
1824 return tset->cur_cgrp;
1825}
1826EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1827
1828/**
1829 * cgroup_taskset_size - return the number of tasks in taskset
1830 * @tset: taskset of interest
1831 */
1832int cgroup_taskset_size(struct cgroup_taskset *tset)
1833{
1834 return tset->tc_array ? tset->tc_array_len : 1;
1835}
1836EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1837
1838
1839/*
1743 * cgroup_task_migrate - move a task from one cgroup to another. 1840 * cgroup_task_migrate - move a task from one cgroup to another.
1744 * 1841 *
1745 * 'guarantee' is set if the caller promises that a new css_set for the task 1842 * 'guarantee' is set if the caller promises that a new css_set for the task
1746 * will already exist. If not set, this function might sleep, and can fail with 1843 * will already exist. If not set, this function might sleep, and can fail with
1747 * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1748 */ 1845 */
1749static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1750 struct task_struct *tsk, bool guarantee) 1847 struct task_struct *tsk, bool guarantee)
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1753 struct css_set *newcg; 1850 struct css_set *newcg;
1754 1851
1755 /* 1852 /*
1756 * get old css_set. we need to take task_lock and refcount it, because 1853 * We are synchronized through threadgroup_lock() against PF_EXITING
1757 * an exiting task can change its css_set to init_css_set and drop its 1854 * setting such that we can't race against cgroup_exit() changing the
1758 * old one without taking cgroup_mutex. 1855 * css_set to init_css_set and dropping the old one.
1759 */ 1856 */
1760 task_lock(tsk); 1857 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1761 oldcg = tsk->cgroups; 1858 oldcg = tsk->cgroups;
1762 get_css_set(oldcg);
1763 task_unlock(tsk);
1764 1859
1765 /* locate or allocate a new css_set for this task. */ 1860 /* locate or allocate a new css_set for this task. */
1766 if (guarantee) { 1861 if (guarantee) {
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1775 might_sleep(); 1870 might_sleep();
1776 /* find_css_set will give us newcg already referenced. */ 1871 /* find_css_set will give us newcg already referenced. */
1777 newcg = find_css_set(oldcg, cgrp); 1872 newcg = find_css_set(oldcg, cgrp);
1778 if (!newcg) { 1873 if (!newcg)
1779 put_css_set(oldcg);
1780 return -ENOMEM; 1874 return -ENOMEM;
1781 }
1782 } 1875 }
1783 put_css_set(oldcg);
1784 1876
1785 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1786 task_lock(tsk); 1877 task_lock(tsk);
1787 if (tsk->flags & PF_EXITING) {
1788 task_unlock(tsk);
1789 put_css_set(newcg);
1790 return -ESRCH;
1791 }
1792 rcu_assign_pointer(tsk->cgroups, newcg); 1878 rcu_assign_pointer(tsk->cgroups, newcg);
1793 task_unlock(tsk); 1879 task_unlock(tsk);
1794 1880
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1814 * @cgrp: the cgroup the task is attaching to 1900 * @cgrp: the cgroup the task is attaching to
1815 * @tsk: the task to be attached 1901 * @tsk: the task to be attached
1816 * 1902 *
1817 * Call holding cgroup_mutex. May take task_lock of 1903 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1818 * the task 'tsk' during call. 1904 * @tsk during call.
1819 */ 1905 */
1820int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1906int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1821{ 1907{
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1823 struct cgroup_subsys *ss, *failed_ss = NULL; 1909 struct cgroup_subsys *ss, *failed_ss = NULL;
1824 struct cgroup *oldcgrp; 1910 struct cgroup *oldcgrp;
1825 struct cgroupfs_root *root = cgrp->root; 1911 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { };
1913
1914 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING)
1916 return -ESRCH;
1826 1917
1827 /* Nothing to do if the task is already in that cgroup */ 1918 /* Nothing to do if the task is already in that cgroup */
1828 oldcgrp = task_cgroup_from_root(tsk, root); 1919 oldcgrp = task_cgroup_from_root(tsk, root);
1829 if (cgrp == oldcgrp) 1920 if (cgrp == oldcgrp)
1830 return 0; 1921 return 0;
1831 1922
1923 tset.single.task = tsk;
1924 tset.single.cgrp = oldcgrp;
1925
1832 for_each_subsys(root, ss) { 1926 for_each_subsys(root, ss) {
1833 if (ss->can_attach) { 1927 if (ss->can_attach) {
1834 retval = ss->can_attach(ss, cgrp, tsk); 1928 retval = ss->can_attach(ss, cgrp, &tset);
1835 if (retval) { 1929 if (retval) {
1836 /* 1930 /*
1837 * Remember on which subsystem the can_attach() 1931 * Remember on which subsystem the can_attach()
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1843 goto out; 1937 goto out;
1844 } 1938 }
1845 } 1939 }
1846 if (ss->can_attach_task) {
1847 retval = ss->can_attach_task(cgrp, tsk);
1848 if (retval) {
1849 failed_ss = ss;
1850 goto out;
1851 }
1852 }
1853 } 1940 }
1854 1941
1855 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1857 goto out; 1944 goto out;
1858 1945
1859 for_each_subsys(root, ss) { 1946 for_each_subsys(root, ss) {
1860 if (ss->pre_attach)
1861 ss->pre_attach(cgrp);
1862 if (ss->attach_task)
1863 ss->attach_task(cgrp, tsk);
1864 if (ss->attach) 1947 if (ss->attach)
1865 ss->attach(ss, cgrp, oldcgrp, tsk); 1948 ss->attach(ss, cgrp, &tset);
1866 } 1949 }
1867 1950
1868 synchronize_rcu(); 1951 synchronize_rcu();
@@ -1884,7 +1967,7 @@ out:
1884 */ 1967 */
1885 break; 1968 break;
1886 if (ss->cancel_attach) 1969 if (ss->cancel_attach)
1887 ss->cancel_attach(ss, cgrp, tsk); 1970 ss->cancel_attach(ss, cgrp, &tset);
1888 } 1971 }
1889 } 1972 }
1890 return retval; 1973 return retval;
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
1935 2018
1936 read_lock(&css_set_lock); 2019 read_lock(&css_set_lock);
1937 newcg = find_existing_css_set(cg, cgrp, template); 2020 newcg = find_existing_css_set(cg, cgrp, template);
1938 if (newcg)
1939 get_css_set(newcg);
1940 read_unlock(&css_set_lock); 2021 read_unlock(&css_set_lock);
1941 2022
1942 /* doesn't exist at all? */ 2023 /* doesn't exist at all? */
1943 if (!newcg) 2024 if (!newcg)
1944 return false; 2025 return false;
1945 /* see if it's already in the list */ 2026 /* see if it's already in the list */
1946 list_for_each_entry(cg_entry, newcg_list, links) { 2027 list_for_each_entry(cg_entry, newcg_list, links)
1947 if (cg_entry->cg == newcg) { 2028 if (cg_entry->cg == newcg)
1948 put_css_set(newcg);
1949 return true; 2029 return true;
1950 }
1951 }
1952 2030
1953 /* not found */ 2031 /* not found */
1954 put_css_set(newcg);
1955 return false; 2032 return false;
1956} 2033}
1957 2034
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1985 * @cgrp: the cgroup to attach to 2062 * @cgrp: the cgroup to attach to
1986 * @leader: the threadgroup leader task_struct of the group to be attached 2063 * @leader: the threadgroup leader task_struct of the group to be attached
1987 * 2064 *
1988 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 2065 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
1989 * take task_lock of each thread in leader's threadgroup individually in turn. 2066 * task_lock of each thread in leader's threadgroup individually in turn.
1990 */ 2067 */
1991int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2068static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1992{ 2069{
1993 int retval, i, group_size; 2070 int retval, i, group_size;
1994 struct cgroup_subsys *ss, *failed_ss = NULL; 2071 struct cgroup_subsys *ss, *failed_ss = NULL;
1995 bool cancel_failed_ss = false;
1996 /* guaranteed to be initialized later, but the compiler needs this */ 2072 /* guaranteed to be initialized later, but the compiler needs this */
1997 struct cgroup *oldcgrp = NULL;
1998 struct css_set *oldcg; 2073 struct css_set *oldcg;
1999 struct cgroupfs_root *root = cgrp->root; 2074 struct cgroupfs_root *root = cgrp->root;
2000 /* threadgroup list cursor and array */ 2075 /* threadgroup list cursor and array */
2001 struct task_struct *tsk; 2076 struct task_struct *tsk;
2077 struct task_and_cgroup *tc;
2002 struct flex_array *group; 2078 struct flex_array *group;
2079 struct cgroup_taskset tset = { };
2003 /* 2080 /*
2004 * we need to make sure we have css_sets for all the tasks we're 2081 * we need to make sure we have css_sets for all the tasks we're
2005 * going to move -before- we actually start moving them, so that in 2082 * going to move -before- we actually start moving them, so that in
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2012 * step 0: in order to do expensive, possibly blocking operations for 2089 * step 0: in order to do expensive, possibly blocking operations for
2013 * every thread, we cannot iterate the thread group list, since it needs 2090 * every thread, we cannot iterate the thread group list, since it needs
2014 * rcu or tasklist locked. instead, build an array of all threads in the 2091 * rcu or tasklist locked. instead, build an array of all threads in the
2015 * group - threadgroup_fork_lock prevents new threads from appearing, 2092 * group - group_rwsem prevents new threads from appearing, and if
2016 * and if threads exit, this will just be an over-estimate. 2093 * threads exit, this will just be an over-estimate.
2017 */ 2094 */
2018 group_size = get_nr_threads(leader); 2095 group_size = get_nr_threads(leader);
2019 /* flex_array supports very large thread-groups better than kmalloc. */ 2096 /* flex_array supports very large thread-groups better than kmalloc. */
2020 group = flex_array_alloc(sizeof(struct task_struct *), group_size, 2097 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2021 GFP_KERNEL);
2022 if (!group) 2098 if (!group)
2023 return -ENOMEM; 2099 return -ENOMEM;
2024 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2100 /* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2040 retval = -EAGAIN; 2116 retval = -EAGAIN;
2041 goto out_free_group_list; 2117 goto out_free_group_list;
2042 } 2118 }
2043 /* take a reference on each task in the group to go in the array. */ 2119
2044 tsk = leader; 2120 tsk = leader;
2045 i = 0; 2121 i = 0;
2046 do { 2122 do {
2123 struct task_and_cgroup ent;
2124
2125 /* @tsk either already exited or can't exit until the end */
2126 if (tsk->flags & PF_EXITING)
2127 continue;
2128
2047 /* as per above, nr_threads may decrease, but not increase. */ 2129 /* as per above, nr_threads may decrease, but not increase. */
2048 BUG_ON(i >= group_size); 2130 BUG_ON(i >= group_size);
2049 get_task_struct(tsk);
2050 /* 2131 /*
2051 * saying GFP_ATOMIC has no effect here because we did prealloc 2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2052 * earlier, but it's good form to communicate our expectations. 2133 * earlier, but it's good form to communicate our expectations.
2053 */ 2134 */
2054 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); 2135 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp)
2139 continue;
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2055 BUG_ON(retval != 0); 2141 BUG_ON(retval != 0);
2056 i++; 2142 i++;
2057 } while_each_thread(leader, tsk); 2143 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */ 2144 /* remember the number of threads in the array for later. */
2059 group_size = i; 2145 group_size = i;
2146 tset.tc_array = group;
2147 tset.tc_array_len = group_size;
2060 read_unlock(&tasklist_lock); 2148 read_unlock(&tasklist_lock);
2061 2149
2150 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0;
2152 if (!group_size)
2153 goto out_free_group_list;
2154
2062 /* 2155 /*
2063 * step 1: check that we can legitimately attach to the cgroup. 2156 * step 1: check that we can legitimately attach to the cgroup.
2064 */ 2157 */
2065 for_each_subsys(root, ss) { 2158 for_each_subsys(root, ss) {
2066 if (ss->can_attach) { 2159 if (ss->can_attach) {
2067 retval = ss->can_attach(ss, cgrp, leader); 2160 retval = ss->can_attach(ss, cgrp, &tset);
2068 if (retval) { 2161 if (retval) {
2069 failed_ss = ss; 2162 failed_ss = ss;
2070 goto out_cancel_attach; 2163 goto out_cancel_attach;
2071 } 2164 }
2072 } 2165 }
2073 /* a callback to be run on every thread in the threadgroup. */
2074 if (ss->can_attach_task) {
2075 /* run on each task in the threadgroup. */
2076 for (i = 0; i < group_size; i++) {
2077 tsk = flex_array_get_ptr(group, i);
2078 retval = ss->can_attach_task(cgrp, tsk);
2079 if (retval) {
2080 failed_ss = ss;
2081 cancel_failed_ss = true;
2082 goto out_cancel_attach;
2083 }
2084 }
2085 }
2086 } 2166 }
2087 2167
2088 /* 2168 /*
@@ -2091,67 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2091 */ 2171 */
2092 INIT_LIST_HEAD(&newcg_list); 2172 INIT_LIST_HEAD(&newcg_list);
2093 for (i = 0; i < group_size; i++) { 2173 for (i = 0; i < group_size; i++) {
2094 tsk = flex_array_get_ptr(group, i); 2174 tc = flex_array_get(group, i);
2095 /* nothing to do if this task is already in the cgroup */ 2175 oldcg = tc->task->cgroups;
2096 oldcgrp = task_cgroup_from_root(tsk, root); 2176
2097 if (cgrp == oldcgrp) 2177 /* if we don't already have it in the list get a new one */
2098 continue; 2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg,
2099 /* get old css_set pointer */ 2179 &newcg_list)) {
2100 task_lock(tsk);
2101 oldcg = tsk->cgroups;
2102 get_css_set(oldcg);
2103 task_unlock(tsk);
2104 /* see if the new one for us is already in the list? */
2105 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2106 /* was already there, nothing to do. */
2107 put_css_set(oldcg);
2108 } else {
2109 /* we don't already have it. get new one. */
2110 retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2111 put_css_set(oldcg);
2112 if (retval) 2181 if (retval)
2113 goto out_list_teardown; 2182 goto out_list_teardown;
2114 } 2183 }
2115 } 2184 }
2116 2185
2117 /* 2186 /*
2118 * step 3: now that we're guaranteed success wrt the css_sets, proceed 2187 * step 3: now that we're guaranteed success wrt the css_sets,
2119 * to move all tasks to the new cgroup, calling ss->attach_task for each 2188 * proceed to move all tasks to the new cgroup. There are no
2120 * one along the way. there are no failure cases after here, so this is 2189 * failure cases after here, so this is the commit point.
2121 * the commit point.
2122 */ 2190 */
2123 for_each_subsys(root, ss) {
2124 if (ss->pre_attach)
2125 ss->pre_attach(cgrp);
2126 }
2127 for (i = 0; i < group_size; i++) { 2191 for (i = 0; i < group_size; i++) {
2128 tsk = flex_array_get_ptr(group, i); 2192 tc = flex_array_get(group, i);
2129 /* leave current thread as it is if it's already there */ 2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
2130 oldcgrp = task_cgroup_from_root(tsk, root); 2194 BUG_ON(retval);
2131 if (cgrp == oldcgrp)
2132 continue;
2133 /* if the thread is PF_EXITING, it can just get skipped. */
2134 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2135 if (retval == 0) {
2136 /* attach each task to each subsystem */
2137 for_each_subsys(root, ss) {
2138 if (ss->attach_task)
2139 ss->attach_task(cgrp, tsk);
2140 }
2141 } else {
2142 BUG_ON(retval != -ESRCH);
2143 }
2144 } 2195 }
2145 /* nothing is sensitive to fork() after this point. */ 2196 /* nothing is sensitive to fork() after this point. */
2146 2197
2147 /* 2198 /*
2148 * step 4: do expensive, non-thread-specific subsystem callbacks. 2199 * step 4: do subsystem attach callbacks.
2149 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2150 * being moved, this call will need to be reworked to communicate that.
2151 */ 2200 */
2152 for_each_subsys(root, ss) { 2201 for_each_subsys(root, ss) {
2153 if (ss->attach) 2202 if (ss->attach)
2154 ss->attach(ss, cgrp, oldcgrp, leader); 2203 ss->attach(ss, cgrp, &tset);
2155 } 2204 }
2156 2205
2157 /* 2206 /*
@@ -2171,20 +2220,12 @@ out_cancel_attach:
2171 /* same deal as in cgroup_attach_task */ 2220 /* same deal as in cgroup_attach_task */
2172 if (retval) { 2221 if (retval) {
2173 for_each_subsys(root, ss) { 2222 for_each_subsys(root, ss) {
2174 if (ss == failed_ss) { 2223 if (ss == failed_ss)
2175 if (cancel_failed_ss && ss->cancel_attach)
2176 ss->cancel_attach(ss, cgrp, leader);
2177 break; 2224 break;
2178 }
2179 if (ss->cancel_attach) 2225 if (ss->cancel_attach)
2180 ss->cancel_attach(ss, cgrp, leader); 2226 ss->cancel_attach(ss, cgrp, &tset);
2181 } 2227 }
2182 } 2228 }
2183 /* clean up the array of referenced threads in the group. */
2184 for (i = 0; i < group_size; i++) {
2185 tsk = flex_array_get_ptr(group, i);
2186 put_task_struct(tsk);
2187 }
2188out_free_group_list: 2229out_free_group_list:
2189 flex_array_free(group); 2230 flex_array_free(group);
2190 return retval; 2231 return retval;
@@ -2192,8 +2233,8 @@ out_free_group_list:
2192 2233
2193/* 2234/*
2194 * Find the task_struct of the task to attach by vpid and pass it along to the 2235 * Find the task_struct of the task to attach by vpid and pass it along to the
2195 * function to attach either it or all tasks in its threadgroup. Will take 2236 * function to attach either it or all tasks in its threadgroup. Will lock
2196 * cgroup_mutex; may take task_lock of task. 2237 * cgroup_mutex and threadgroup; may take task_lock of task.
2197 */ 2238 */
2198static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2239static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2199{ 2240{
@@ -2220,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2220 * detect it later. 2261 * detect it later.
2221 */ 2262 */
2222 tsk = tsk->group_leader; 2263 tsk = tsk->group_leader;
2223 } else if (tsk->flags & PF_EXITING) {
2224 /* optimization for the single-task-only case */
2225 rcu_read_unlock();
2226 cgroup_unlock();
2227 return -ESRCH;
2228 } 2264 }
2229
2230 /* 2265 /*
2231 * even if we're attaching all tasks in the thread group, we 2266 * even if we're attaching all tasks in the thread group, we
2232 * only need to check permissions on one of them. 2267 * only need to check permissions on one of them.
@@ -2249,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2249 get_task_struct(tsk); 2284 get_task_struct(tsk);
2250 } 2285 }
2251 2286
2252 if (threadgroup) { 2287 threadgroup_lock(tsk);
2253 threadgroup_fork_write_lock(tsk); 2288
2289 if (threadgroup)
2254 ret = cgroup_attach_proc(cgrp, tsk); 2290 ret = cgroup_attach_proc(cgrp, tsk);
2255 threadgroup_fork_write_unlock(tsk); 2291 else
2256 } else {
2257 ret = cgroup_attach_task(cgrp, tsk); 2292 ret = cgroup_attach_task(cgrp, tsk);
2258 } 2293
2294 threadgroup_unlock(tsk);
2295
2259 put_task_struct(tsk); 2296 put_task_struct(tsk);
2260 cgroup_unlock(); 2297 cgroup_unlock();
2261 return ret; 2298 return ret;
@@ -2306,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2306 return -EINVAL; 2343 return -EINVAL;
2307 if (!cgroup_lock_live_group(cgrp)) 2344 if (!cgroup_lock_live_group(cgrp))
2308 return -ENODEV; 2345 return -ENODEV;
2346 mutex_lock(&cgroup_root_mutex);
2309 strcpy(cgrp->root->release_agent_path, buffer); 2347 strcpy(cgrp->root->release_agent_path, buffer);
2348 mutex_unlock(&cgroup_root_mutex);
2310 cgroup_unlock(); 2349 cgroup_unlock();
2311 return 0; 2350 return 0;
2312} 2351}
@@ -2789,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)
2789} 2828}
2790 2829
2791void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2830void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2831 __acquires(css_set_lock)
2792{ 2832{
2793 /* 2833 /*
2794 * The first time anyone tries to iterate across a cgroup, 2834 * The first time anyone tries to iterate across a cgroup,
@@ -2828,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2828} 2868}
2829 2869
2830void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2870void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2871 __releases(css_set_lock)
2831{ 2872{
2832 read_unlock(&css_set_lock); 2873 read_unlock(&css_set_lock);
2833} 2874}
@@ -4491,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {
4491 * 4532 *
4492 * A pointer to the shared css_set was automatically copied in 4533 * A pointer to the shared css_set was automatically copied in
4493 * fork.c by dup_task_struct(). However, we ignore that copy, since 4534 * fork.c by dup_task_struct(). However, we ignore that copy, since
4494 * it was not made under the protection of RCU or cgroup_mutex, so 4535 * it was not made under the protection of RCU, cgroup_mutex or
4495 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 4536 * threadgroup_change_begin(), so it might no longer be a valid
4496 * have already changed current->cgroups, allowing the previously 4537 * cgroup pointer. cgroup_attach_task() might have already changed
4497 * referenced cgroup group to be removed and freed. 4538 * current->cgroups, allowing the previously referenced cgroup
4539 * group to be removed and freed.
4540 *
4541 * Outside the pointer validity we also need to process the css_set
4542 * inheritance between threadgoup_change_begin() and
4543 * threadgoup_change_end(), this way there is no leak in any process
4544 * wide migration performed by cgroup_attach_proc() that could otherwise
4545 * miss a thread because it is too early or too late in the fork stage.
4498 * 4546 *
4499 * At the point that cgroup_fork() is called, 'current' is the parent 4547 * At the point that cgroup_fork() is called, 'current' is the parent
4500 * task, and the passed argument 'child' points to the child task. 4548 * task, and the passed argument 'child' points to the child task.
4501 */ 4549 */
4502void cgroup_fork(struct task_struct *child) 4550void cgroup_fork(struct task_struct *child)
4503{ 4551{
4504 task_lock(current); 4552 /*
4553 * We don't need to task_lock() current because current->cgroups
4554 * can't be changed concurrently here. The parent obviously hasn't
4555 * exited and called cgroup_exit(), and we are synchronized against
4556 * cgroup migration through threadgroup_change_begin().
4557 */
4505 child->cgroups = current->cgroups; 4558 child->cgroups = current->cgroups;
4506 get_css_set(child->cgroups); 4559 get_css_set(child->cgroups);
4507 task_unlock(current);
4508 INIT_LIST_HEAD(&child->cg_list); 4560 INIT_LIST_HEAD(&child->cg_list);
4509} 4561}
4510 4562
@@ -4546,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)
4546{ 4598{
4547 if (use_task_css_set_links) { 4599 if (use_task_css_set_links) {
4548 write_lock(&css_set_lock); 4600 write_lock(&css_set_lock);
4549 task_lock(child); 4601 if (list_empty(&child->cg_list)) {
4550 if (list_empty(&child->cg_list)) 4602 /*
4603 * It's safe to use child->cgroups without task_lock()
4604 * here because we are protected through
4605 * threadgroup_change_begin() against concurrent
4606 * css_set change in cgroup_task_migrate(). Also
4607 * the task can't exit at that point until
4608 * wake_up_new_task() is called, so we are protected
4609 * against cgroup_exit() setting child->cgroup to
4610 * init_css_set.
4611 */
4551 list_add(&child->cg_list, &child->cgroups->tasks); 4612 list_add(&child->cg_list, &child->cgroups->tasks);
4552 task_unlock(child); 4613 }
4553 write_unlock(&css_set_lock); 4614 write_unlock(&css_set_lock);
4554 } 4615 }
4555} 4616}