Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c  356
1 file changed, 148 insertions, 208 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b5325f77..ed64ccac67c9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 
 	for_each_subsys(cgrp->root, ss)
 		if (ss->pre_destroy) {
-			ret = ss->pre_destroy(ss, cgrp);
+			ret = ss->pre_destroy(cgrp);
 			if (ret)
 				break;
 		}
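Across this diff, every cgroup_subsys callback (create, destroy, pre_destroy, can_attach, attach, cancel_attach, fork, exit, bind, post_clone) loses its leading struct cgroup_subsys * argument: a callback belongs to exactly one subsystem, so the parameter carried no information. As a sketch of what the conversion looks like for a controller (foo_subsys and foo_pre_destroy are hypothetical names, not from this patch):

	/* before: the subsystem was passed back into its own callback */
	static int foo_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);

	/* after: the argument is dropped; a callback that still needs its
	 * subsystem can name it directly, e.g. &foo_subsys */
	static int foo_pre_destroy(struct cgroup *cgrp);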
@@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	 * Release the subsystem state objects.
 	 */
 	for_each_subsys(cgrp->root, ss)
-		ss->destroy(ss, cgrp);
+		ss->destroy(cgrp);
 
 	cgrp->root->number_of_cgroups--;
 	mutex_unlock(&cgroup_mutex);
@@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
-				ss->bind(ss, cgrp);
+				ss->bind(cgrp);
 			mutex_unlock(&ss->hierarchy_mutex);
 			/* refcount was already taken, and we're keeping it */
 		} else if (bit & removed_bits) {
@@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
 			mutex_lock(&ss->hierarchy_mutex);
 			if (ss->bind)
-				ss->bind(ss, dummytop);
+				ss->bind(dummytop);
 			dummytop->subsys[i]->cgroup = dummytop;
 			cgrp->subsys[i] = NULL;
 			subsys[i]->root = &rootnode;
@@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
 
 	struct inode *inode =
 		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
-	struct dentry *dentry;
 
 	if (!inode)
 		return -ENOMEM;
@@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
 	inode->i_op = &cgroup_dir_inode_operations;
 	/* directories start off with i_nlink == 2 (for "." entry) */
 	inc_nlink(inode);
-	dentry = d_alloc_root(inode);
-	if (!dentry) {
-		iput(inode);
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
 		return -ENOMEM;
-	}
-	sb->s_root = dentry;
 	/* for everything else we want ->d_op set */
 	sb->s_d_op = &cgroup_dops;
 	return 0;
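The two rootdir hunks are the d_alloc_root() to d_make_root() conversion. The point of d_make_root() is reference ownership: it consumes the inode in all cases, returning the root dentry on success and calling iput() itself on failure, which is why the caller-side iput() and the local dentry variable both disappear. The ownership rule, condensed:

	sb->s_root = d_make_root(inode);	/* inode ref consumed here */
	if (!sb->s_root)
		return -ENOMEM;			/* no iput(): d_make_root
						 * already dropped the inode */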
@@ -1763,6 +1759,7 @@ EXPORT_SYMBOL_GPL(cgroup_path);
 struct task_and_cgroup {
 	struct task_struct	*task;
 	struct cgroup		*cgrp;
+	struct css_set		*cg;
 };
 
 struct cgroup_taskset {
@@ -1843,11 +1840,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
  * will already exist. If not set, this function might sleep, and can fail with
  * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
  */
-static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
-			       struct task_struct *tsk, bool guarantee)
+static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+				struct task_struct *tsk, struct css_set *newcg)
 {
 	struct css_set *oldcg;
-	struct css_set *newcg;
 
 	/*
 	 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1857,23 +1853,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	oldcg = tsk->cgroups;
 
-	/* locate or allocate a new css_set for this task. */
-	if (guarantee) {
-		/* we know the css_set we want already exists. */
-		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-		read_lock(&css_set_lock);
-		newcg = find_existing_css_set(oldcg, cgrp, template);
-		BUG_ON(!newcg);
-		get_css_set(newcg);
-		read_unlock(&css_set_lock);
-	} else {
-		might_sleep();
-		/* find_css_set will give us newcg already referenced. */
-		newcg = find_css_set(oldcg, cgrp);
-		if (!newcg)
-			return -ENOMEM;
-	}
-
 	task_lock(tsk);
 	rcu_assign_pointer(tsk->cgroups, newcg);
 	task_unlock(tsk);
@@ -1892,7 +1871,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	put_css_set(oldcg);
 
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
-	return 0;
 }
 
 /**
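The three cgroup_task_migrate() hunks turn it from int into void: the one operation that could fail, locating or allocating the new css_set, is hoisted into the callers, so by the time migration runs it can only succeed and the guarantee flag becomes meaningless. The resulting caller contract, as the later hunks apply it (a condensed sketch, not literal patch text):

	/* allocation phase: may sleep, may fail, nothing modified yet */
	newcg = find_css_set(tsk->cgroups, cgrp);
	if (!newcg)
		return -ENOMEM;

	/* commit phase: cannot fail, so no rollback path is needed */
	cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);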
@@ -1905,11 +1883,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	int retval;
+	int retval = 0;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
 	struct cgroupfs_root *root = cgrp->root;
 	struct cgroup_taskset tset = { };
+	struct css_set *newcg;
 
 	/* @tsk either already exited or can't exit until the end */
 	if (tsk->flags & PF_EXITING)
@@ -1925,7 +1904,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, &tset);
+			retval = ss->can_attach(cgrp, &tset);
 			if (retval) {
 				/*
 				 * Remember on which subsystem the can_attach()
@@ -1939,13 +1918,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		}
 	}
 
-	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
-	if (retval)
+	newcg = find_css_set(tsk->cgroups, cgrp);
+	if (!newcg) {
+		retval = -ENOMEM;
 		goto out;
+	}
+
+	cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
 
 	for_each_subsys(root, ss) {
 		if (ss->attach)
-			ss->attach(ss, cgrp, &tset);
+			ss->attach(cgrp, &tset);
 	}
 
 	synchronize_rcu();
@@ -1967,7 +1950,7 @@ out:
 			 */
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, &tset);
+				ss->cancel_attach(cgrp, &tset);
 		}
 	}
 	return retval;
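Taken together, the cgroup_attach_task() hunks leave a clean two-phase shape: every subsystem votes in can_attach(), the css_set switch commits, and attach() notifies; on a veto, cancel_attach() unwinds only the subsystems that had already agreed (the walk stops at failed_ss, which never voted yes). In outline (condensed from the function, not literal code):

	for_each_subsys(root, ss)		/* phase 1: all must agree */
		if (ss->can_attach && ss->can_attach(cgrp, &tset))
			goto out;		/* ss remembered in failed_ss */
	cgroup_task_migrate(...);		/* commit: cannot fail */
	for_each_subsys(root, ss)		/* phase 2: notify */
		if (ss->attach)
			ss->attach(cgrp, &tset);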
@@ -1997,66 +1980,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
-/*
- * cgroup_attach_proc works in two stages, the first of which prefetches all
- * new css_sets needed (to make sure we have enough memory before committing
- * to the move) and stores them in a list of entries of the following type.
- * TODO: possible optimization: use css_set->rcu_head for chaining instead
- */
-struct cg_list_entry {
-	struct css_set *cg;
-	struct list_head links;
-};
-
-static bool css_set_check_fetched(struct cgroup *cgrp,
-				  struct task_struct *tsk, struct css_set *cg,
-				  struct list_head *newcg_list)
-{
-	struct css_set *newcg;
-	struct cg_list_entry *cg_entry;
-	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
-	read_lock(&css_set_lock);
-	newcg = find_existing_css_set(cg, cgrp, template);
-	read_unlock(&css_set_lock);
-
-	/* doesn't exist at all? */
-	if (!newcg)
-		return false;
-	/* see if it's already in the list */
-	list_for_each_entry(cg_entry, newcg_list, links)
-		if (cg_entry->cg == newcg)
-			return true;
-
-	/* not found */
-	return false;
-}
-
-/*
- * Find the new css_set and store it in the list in preparation for moving the
- * given task to the given cgroup. Returns 0 or -ENOMEM.
- */
-static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
-			    struct list_head *newcg_list)
-{
-	struct css_set *newcg;
-	struct cg_list_entry *cg_entry;
-
-	/* ensure a new css_set will exist for this thread */
-	newcg = find_css_set(cg, cgrp);
-	if (!newcg)
-		return -ENOMEM;
-	/* add it to the list */
-	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
-	if (!cg_entry) {
-		put_css_set(newcg);
-		return -ENOMEM;
-	}
-	cg_entry->cg = newcg;
-	list_add(&cg_entry->links, newcg_list);
-	return 0;
-}
-
 /**
  * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
  * @cgrp: the cgroup to attach to
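css_set_check_fetched() and css_set_prefetch() existed only to deduplicate prefetched css_sets on a separate kmalloc'd side list. With the new cg member in struct task_and_cgroup, each task's target css_set is prefetched straight into the flex_array entry that already describes the task, so the side list and its per-entry allocation go away. No explicit dedup is needed: find_css_set() returns an existing matching css_set with a fresh reference, so duplicates simply share refcounts. The replacement loop (shown in full in a later hunk) reduces to:

	for (i = 0; i < group_size; i++) {
		tc = flex_array_get(group, i);
		/* existing or newly allocated, returned referenced */
		tc->cg = find_css_set(tc->task->cgroups, cgrp);
		if (!tc->cg) {
			retval = -ENOMEM;
			goto out_put_css_set_refs;	/* drop refs taken so far */
		}
	}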
@@ -2070,20 +1993,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	int retval, i, group_size;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	/* guaranteed to be initialized later, but the compiler needs this */
-	struct css_set *oldcg;
 	struct cgroupfs_root *root = cgrp->root;
 	/* threadgroup list cursor and array */
 	struct task_struct *tsk;
 	struct task_and_cgroup *tc;
 	struct flex_array *group;
 	struct cgroup_taskset tset = { };
-	/*
-	 * we need to make sure we have css_sets for all the tasks we're
-	 * going to move -before- we actually start moving them, so that in
-	 * case we get an ENOMEM we can bail out before making any changes.
-	 */
-	struct list_head newcg_list;
-	struct cg_list_entry *cg_entry, *temp_nobe;
 
 	/*
 	 * step 0: in order to do expensive, possibly blocking operations for
@@ -2102,23 +2017,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	if (retval)
 		goto out_free_group_list;
 
-	/* prevent changes to the threadgroup list while we take a snapshot. */
-	read_lock(&tasklist_lock);
-	if (!thread_group_leader(leader)) {
-		/*
-		 * a race with de_thread from another thread's exec() may strip
-		 * us of our leadership, making while_each_thread unsafe to use
-		 * on this task. if this happens, there is no choice but to
-		 * throw this task away and try again (from cgroup_procs_write);
-		 * this is "double-double-toil-and-trouble-check locking".
-		 */
-		read_unlock(&tasklist_lock);
-		retval = -EAGAIN;
-		goto out_free_group_list;
-	}
-
 	tsk = leader;
 	i = 0;
+	/*
+	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
+	 * already PF_EXITING could be freed from underneath us unless we
+	 * take an rcu_read_lock.
+	 */
+	rcu_read_lock();
 	do {
 		struct task_and_cgroup ent;
 
@@ -2128,24 +2034,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 
 		/* as per above, nr_threads may decrease, but not increase. */
 		BUG_ON(i >= group_size);
-		/*
-		 * saying GFP_ATOMIC has no effect here because we did prealloc
-		 * earlier, but it's good form to communicate our expectations.
-		 */
 		ent.task = tsk;
 		ent.cgrp = task_cgroup_from_root(tsk, root);
 		/* nothing to do if this task is already in the cgroup */
 		if (ent.cgrp == cgrp)
 			continue;
+		/*
+		 * saying GFP_ATOMIC has no effect here because we did prealloc
+		 * earlier, but it's good form to communicate our expectations.
+		 */
 		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
 		BUG_ON(retval != 0);
 		i++;
 	} while_each_thread(leader, tsk);
+	rcu_read_unlock();
 	/* remember the number of threads in the array for later. */
 	group_size = i;
 	tset.tc_array = group;
 	tset.tc_array_len = group_size;
-	read_unlock(&tasklist_lock);
 
 	/* methods shouldn't be called if no task is actually migrating */
 	retval = 0;
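The snapshot loop now runs under rcu_read_lock() instead of tasklist_lock. Per the new comment, RCU is what keeps an already-PF_EXITING task_struct from being freed while it is copied into the flex_array; the leadership re-check that tasklist_lock used to cover moves into attach_task_by_pid() (the retry_find_task hunk further down). Schematically:

	rcu_read_lock();	/* pins each task_struct we look at */
	do {
		/* copy into the preallocated flex_array; no sleeping,
		 * no side effects, just taking a snapshot */
	} while_each_thread(leader, tsk);
	rcu_read_unlock();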
@@ -2157,7 +2063,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, &tset);
+			retval = ss->can_attach(cgrp, &tset);
 			if (retval) {
 				failed_ss = ss;
 				goto out_cancel_attach;
@@ -2169,17 +2075,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * step 2: make sure css_sets exist for all threads to be migrated.
 	 * we use find_css_set, which allocates a new one if necessary.
 	 */
-	INIT_LIST_HEAD(&newcg_list);
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		oldcg = tc->task->cgroups;
-
-		/* if we don't already have it in the list get a new one */
-		if (!css_set_check_fetched(cgrp, tc->task, oldcg,
-					   &newcg_list)) {
-			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
-			if (retval)
-				goto out_list_teardown;
+		tc->cg = find_css_set(tc->task->cgroups, cgrp);
+		if (!tc->cg) {
+			retval = -ENOMEM;
+			goto out_put_css_set_refs;
 		}
 	}
 
@@ -2190,8 +2091,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
-		BUG_ON(retval);
+		cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
 	}
 	/* nothing is sensitive to fork() after this point. */
 
@@ -2200,7 +2100,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	for_each_subsys(root, ss) {
 		if (ss->attach)
-			ss->attach(ss, cgrp, &tset);
+			ss->attach(cgrp, &tset);
 	}
 
 	/*
@@ -2209,21 +2109,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	synchronize_rcu();
 	cgroup_wakeup_rmdir_waiter(cgrp);
 	retval = 0;
-out_list_teardown:
-	/* clean up the list of prefetched css_sets. */
-	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
-		list_del(&cg_entry->links);
-		put_css_set(cg_entry->cg);
-		kfree(cg_entry);
+out_put_css_set_refs:
+	if (retval) {
+		for (i = 0; i < group_size; i++) {
+			tc = flex_array_get(group, i);
+			if (!tc->cg)
+				break;
+			put_css_set(tc->cg);
+		}
 	}
 out_cancel_attach:
-	/* same deal as in cgroup_attach_task */
 	if (retval) {
 		for_each_subsys(root, ss) {
 			if (ss == failed_ss)
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, &tset);
+				ss->cancel_attach(cgrp, &tset);
 		}
 	}
 out_free_group_list:
@@ -2245,22 +2146,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 
+retry_find_task:
+	rcu_read_lock();
 	if (pid) {
-		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
 			rcu_read_unlock();
-			cgroup_unlock();
-			return -ESRCH;
-		}
-		if (threadgroup) {
-			/*
-			 * RCU protects this access, since tsk was found in the
-			 * tid map. a race with de_thread may cause group_leader
-			 * to stop being the leader, but cgroup_attach_proc will
-			 * detect it later.
-			 */
-			tsk = tsk->group_leader;
+			ret = -ESRCH;
+			goto out_unlock_cgroup;
 		}
 		/*
 		 * even if we're attaching all tasks in the thread group, we
@@ -2271,29 +2164,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 		    cred->euid != tcred->uid &&
 		    cred->euid != tcred->suid) {
 			rcu_read_unlock();
-			cgroup_unlock();
-			return -EACCES;
+			ret = -EACCES;
+			goto out_unlock_cgroup;
 		}
-		get_task_struct(tsk);
-		rcu_read_unlock();
-	} else {
-		if (threadgroup)
-			tsk = current->group_leader;
-		else
-			tsk = current;
-		get_task_struct(tsk);
-	}
-
-	threadgroup_lock(tsk);
+	} else
+		tsk = current;
 
 	if (threadgroup)
+		tsk = tsk->group_leader;
+	get_task_struct(tsk);
+	rcu_read_unlock();
+
+	threadgroup_lock(tsk);
+	if (threadgroup) {
+		if (!thread_group_leader(tsk)) {
+			/*
+			 * a race with de_thread from another thread's exec()
+			 * may strip us of our leadership, if this happens,
+			 * there is no choice but to throw this task away and
+			 * try again; this is
+			 * "double-double-toil-and-trouble-check locking".
+			 */
+			threadgroup_unlock(tsk);
+			put_task_struct(tsk);
+			goto retry_find_task;
+		}
 		ret = cgroup_attach_proc(cgrp, tsk);
-	else
+	} else
 		ret = cgroup_attach_task(cgrp, tsk);
-
 	threadgroup_unlock(tsk);
 
 	put_task_struct(tsk);
+out_unlock_cgroup:
 	cgroup_unlock();
 	return ret;
 }
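attach_task_by_pid() no longer returns -EAGAIN for its caller to loop on; it retries internally. The shape is optimistic lookup / lock / re-validate: find the task under RCU, pin it, take threadgroup_lock(), and only then confirm it is still the group leader; losing the race with de_thread() from another thread's exec() means dropping everything and jumping back to retry_find_task. Condensed:

	retry_find_task:
		rcu_read_lock();
		tsk = find_task_by_vpid(pid);	/* optimistic lookup */
		...
		get_task_struct(tsk);
		rcu_read_unlock();

		threadgroup_lock(tsk);		/* stabilizes the group */
		if (threadgroup && !thread_group_leader(tsk)) {
			threadgroup_unlock(tsk);	/* lost the race */
			put_task_struct(tsk);
			goto retry_find_task;	/* re-validate from scratch */
		}

This is also why the -EAGAIN loop in cgroup_procs_write() (next hunk) collapses to a plain call.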
@@ -2305,16 +2207,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 
 static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
 {
-	int ret;
-	do {
-		/*
-		 * attach_proc fails with -EAGAIN if threadgroup leadership
-		 * changes in the middle of the operation, in which case we need
-		 * to find the task_struct for the new leader and start over.
-		 */
-		ret = attach_task_by_pid(cgrp, tgid, true);
-	} while (ret == -EAGAIN);
-	return ret;
+	return attach_task_by_pid(cgrp, tgid, true);
 }
 
 /**
@@ -2804,15 +2697,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
  * using their cgroups capability, we don't maintain the lists running
  * through each css_set to its tasks until we see the list actually
  * used - in other words after the first call to cgroup_iter_start().
- *
- * The tasklist_lock is not held here, as do_each_thread() and
- * while_each_thread() are protected by RCU.
  */
 static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 	write_lock(&css_set_lock);
 	use_task_css_set_links = 1;
+	/*
+	 * We need tasklist_lock because RCU is not safe against
+	 * while_each_thread(). Besides, a forking task that has passed
+	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
+	 * is not guaranteed to have its child immediately visible in the
+	 * tasklist if we walk through it with RCU.
+	 */
+	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		task_lock(p);
 		/*
@@ -2824,6 +2722,7 @@ static void cgroup_enable_task_cg_lists(void)
 		list_add(&p->cg_list, &p->cgroups->tasks);
 		task_unlock(p);
 	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
 	write_unlock(&css_set_lock);
 }
 
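cgroup_enable_task_cg_lists() now takes read_lock(&tasklist_lock) around the walk, for the two reasons the added comment gives: while_each_thread() is not safe under bare RCU, and an RCU walk could miss a freshly forked child whose parent also forked too early to see use_task_css_set_links = 1. The tasklist_lock closes that window because the child is added to the tasklist under the same lock; the pairing, schematically (condensed, not literal code):

	/* enabler side */
	use_task_css_set_links = 1;	/* store happens before... */
	read_lock(&tasklist_lock);	/* ...this LOCK; the walk sees
					 * every task already listed */
	...
	read_unlock(&tasklist_lock);	/* UNLOCK publishes the store */

	/* fork side: the child is linked into the tasklist under
	 * tasklist_lock, so cgroup_post_fork() either ran after the walk
	 * saw the child, or it observes use_task_css_set_links == 1
	 * through LOCK/UNLOCK ordering */

The matching explanation lands in cgroup_post_fork() in a hunk below.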
@@ -3043,6 +2942,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  *
  */
 
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+	CGROUP_FILE_PROCS,
+	CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+	/*
+	 * used to find which pidlist is wanted. doesn't change as long as
+	 * this particular list stays in the list.
+	 */
+	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+	/* array of xids */
+	pid_t *list;
+	/* how many elements the above list has */
+	int length;
+	/* how many files are using the current array */
+	int use_count;
+	/* each of these stored in a list by its cgroup */
+	struct list_head links;
+	/* pointer to the cgroup we belong to, for list removal purposes */
+	struct cgroup *owner;
+	/* protects the other fields */
+	struct rw_semaphore mutex;
+};
+
 /*
  * The following two functions "fix" the issue where there are more pids
  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
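struct cgroup_pidlist moves out of the header; nothing outside cgroup.c needs it. The key is the (type, ns) pair because one cgroup serves separately sorted pid arrays for "tasks" and "procs", per pid namespace of the reader. A lookup over the cgroup's pidlists would match on exactly that pair, roughly (a sketch of the consumer, not text from this patch):

	list_for_each_entry(l, &cgrp->pidlists, links)
		if (l->key.type == type && l->key.ns == ns)
			return l;	/* reuse; bump l->use_count */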
@@ -3827,7 +3758,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
 
 	for_each_subsys(root, ss) {
-		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
+		struct cgroup_subsys_state *css = ss->create(cgrp);
 
 		if (IS_ERR(css)) {
 			err = PTR_ERR(css);
@@ -3841,7 +3772,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		}
 		/* At error, ->destroy() callback has to free assigned ID. */
 		if (clone_children(parent) && ss->post_clone)
-			ss->post_clone(ss, cgrp);
+			ss->post_clone(cgrp);
 	}
 
 	cgroup_lock_hierarchy(root);
@@ -3875,7 +3806,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	for_each_subsys(root, ss) {
 		if (cgrp->subsys[ss->subsys_id])
-			ss->destroy(ss, cgrp);
+			ss->destroy(cgrp);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -4099,7 +4030,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	/* Create the top cgroup state for this subsystem */
 	list_add(&ss->sibling, &rootnode.subsys_list);
 	ss->root = &rootnode;
-	css = ss->create(ss, dummytop);
+	css = ss->create(dummytop);
 	/* We don't handle early failures gracefully */
 	BUG_ON(IS_ERR(css));
 	init_cgroup_css(css, ss, dummytop);
@@ -4188,7 +4119,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	 * no ss->create seems to need anything important in the ss struct, so
 	 * this can happen first (i.e. before the rootnode attachment).
 	 */
-	css = ss->create(ss, dummytop);
+	css = ss->create(dummytop);
 	if (IS_ERR(css)) {
 		/* failure case - need to deassign the subsys[] slot. */
 		subsys[i] = NULL;
@@ -4206,7 +4137,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 		int ret = cgroup_init_idr(ss, css);
 		if (ret) {
 			dummytop->subsys[ss->subsys_id] = NULL;
-			ss->destroy(ss, dummytop);
+			ss->destroy(dummytop);
 			subsys[i] = NULL;
 			mutex_unlock(&cgroup_mutex);
 			return ret;
@@ -4304,7 +4235,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	 * pointer to find their state. note that this also takes care of
 	 * freeing the css_id.
 	 */
-	ss->destroy(ss, dummytop);
+	ss->destroy(dummytop);
 	dummytop->subsys[ss->subsys_id] = NULL;
 
 	mutex_unlock(&cgroup_mutex);
@@ -4580,7 +4511,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
 		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			if (ss->fork)
-				ss->fork(ss, child);
+				ss->fork(child);
 		}
 	}
 }
@@ -4596,6 +4527,17 @@ void cgroup_fork_callbacks(struct task_struct *child)
  */
 void cgroup_post_fork(struct task_struct *child)
 {
+	/*
+	 * use_task_css_set_links is set to 1 before we walk the tasklist
+	 * under the tasklist_lock and we read it here after we added the child
+	 * to the tasklist under the tasklist_lock as well. If the child wasn't
+	 * yet in the tasklist when we walked through it from
+	 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
+	 * should be visible now due to the paired locking and barriers implied
+	 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
+	 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
+	 * lock on fork.
+	 */
 	if (use_task_css_set_links) {
 		write_lock(&css_set_lock);
 		if (list_empty(&child->cg_list)) {
@@ -4682,7 +4624,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 			struct cgroup *old_cgrp =
 				rcu_dereference_raw(cg->subsys[i])->cgroup;
 			struct cgroup *cgrp = task_cgroup(tsk, i);
-			ss->exit(ss, cgrp, old_cgrp, tsk);
+			ss->exit(cgrp, old_cgrp, tsk);
 		}
 	}
 }
@@ -4939,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 
 	rcu_assign_pointer(id->css, NULL);
 	rcu_assign_pointer(css->id, NULL);
-	write_lock(&ss->id_lock);
+	spin_lock(&ss->id_lock);
 	idr_remove(&ss->idr, id->id);
-	write_unlock(&ss->id_lock);
+	spin_unlock(&ss->id_lock);
 	kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);
@@ -4967,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 		error = -ENOMEM;
 		goto err_out;
 	}
-	write_lock(&ss->id_lock);
+	spin_lock(&ss->id_lock);
 	/* Don't use 0. allocates an ID of 1-65535 */
 	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
-	write_unlock(&ss->id_lock);
+	spin_unlock(&ss->id_lock);
 
 	/* Returns error when there are no free spaces for new ID.*/
 	if (error) {
@@ -4985,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 	return newid;
 remove_idr:
 	error = -ENOSPC;
-	write_lock(&ss->id_lock);
+	spin_lock(&ss->id_lock);
 	idr_remove(&ss->idr, myid);
-	write_unlock(&ss->id_lock);
+	spin_unlock(&ss->id_lock);
 err_out:
 	kfree(newid);
 	return ERR_PTR(error);
@@ -4999,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
 {
 	struct css_id *newid;
 
-	rwlock_init(&ss->id_lock);
+	spin_lock_init(&ss->id_lock);
 	idr_init(&ss->idr);
 
 	newid = get_new_cssid(ss, 0);
@@ -5087,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id,
 		return NULL;
 
 	BUG_ON(!ss->use_id);
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
 	/* fill start point for scan */
 	tmpid = id;
 	while (1) {
@@ -5094,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id,
 		 * scan next entry from bitmap(tree), tmpid is updated after
 		 * idr_get_next().
 		 */
-		read_lock(&ss->id_lock);
 		tmp = idr_get_next(&ss->idr, &tmpid);
-		read_unlock(&ss->id_lock);
-
 		if (!tmp)
 			break;
 		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
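The id_lock hunks are one change: ss->id_lock converts from rwlock to spinlock because its only reader is gone. css_get_next() now depends on rcu_read_lock() alone for the idr walk, and the new WARN_ON_ONCE(!rcu_read_lock_held()) enforces that callers hold it; only the writers that insert and remove IDs still serialize. The resulting discipline, condensed from the hunks above:

	/* writers: idr updates under the (now plain) spinlock */
	spin_lock(&ss->id_lock);
	idr_remove(&ss->idr, id->id);
	spin_unlock(&ss->id_lock);

	/* readers: RCU only, no id_lock taken at all */
	rcu_read_lock();
	tmp = idr_get_next(&ss->idr, &tmpid);
	rcu_read_unlock();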
@@ -5137,8 +5078,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
 }
 
 #ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
-						struct cgroup *cont)
+static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
 {
 	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
 
@@ -5148,7 +5088,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
 	return css;
 }
 
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+static void debug_destroy(struct cgroup *cont)
 {
 	kfree(cont->subsys[debug_subsys_id]);
 }