path: root/kernel
author	Ben Blum <bblum@andrew.cmu.edu>	2011-05-26 19:25:20 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-26 20:12:34 -0400
commit	74a1166dfe1135dcc168d35fa5261aa7e087011b (patch)
tree	a7add70f0344e2352b8d0d6beb10aef85c6585f7 /kernel
parent	f780bdb7c1c73009cb57adcf99ef50027d80bf3c (diff)
cgroups: make procs file writable
Make procs file writable to move all threads by tgid at once.

Add functionality that enables users to move all threads in a threadgroup at
once to a cgroup by writing the tgid to the 'cgroup.procs' file. This current
implementation makes use of a per-threadgroup rwsem that's taken for reading
in the fork() path to prevent newly forking threads within the threadgroup
from "escaping" while the move is in progress.

Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
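For illustration only (not part of the patch): a minimal user-space sketch of the interface this change enables. Writing a tgid into a cgroup's 'cgroup.procs' file attaches every thread of that threadgroup at once, whereas writing a single tid to the 'tasks' file still moves only that one thread. The mount point and cgroup path passed on the command line are assumptions, not something defined by this commit.

	/* Hypothetical usage sketch; the cgroup.procs path is supplied by the caller. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/types.h>
	#include <unistd.h>

	/* Move the whole threadgroup of `tgid` by writing it to cgroup.procs. */
	static int move_threadgroup(pid_t tgid, const char *procs_path)
	{
		char buf[32];
		int fd, len, ret = 0;

		fd = open(procs_path, O_WRONLY);
		if (fd < 0)
			return -1;
		len = snprintf(buf, sizeof(buf), "%d", (int)tgid);
		/* one write of the tgid moves every thread in the group */
		if (write(fd, buf, len) != len)
			ret = -1;
		close(fd);
		return ret;
	}

	int main(int argc, char **argv)
	{
		if (argc != 3) {
			fprintf(stderr, "usage: %s <tgid> <path-to-cgroup.procs>\n", argv[0]);
			return 1;
		}
		return move_threadgroup((pid_t)atoi(argv[1]), argv[2]) ? 1 : 0;
	}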
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/cgroup.c	439
1 file changed, 393 insertions(+), 46 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 38fb0ad1cb46..5e6a9745f0e7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1735,6 +1735,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+			       struct task_struct *tsk, bool guarantee)
+{
+	struct css_set *oldcg;
+	struct css_set *newcg;
+
+	/*
+	 * get old css_set. we need to take task_lock and refcount it, because
+	 * an exiting task can change its css_set to init_css_set and drop its
+	 * old one without taking cgroup_mutex.
+	 */
+	task_lock(tsk);
+	oldcg = tsk->cgroups;
+	get_css_set(oldcg);
+	task_unlock(tsk);
+
+	/* locate or allocate a new css_set for this task. */
+	if (guarantee) {
+		/* we know the css_set we want already exists. */
+		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+		read_lock(&css_set_lock);
+		newcg = find_existing_css_set(oldcg, cgrp, template);
+		BUG_ON(!newcg);
+		get_css_set(newcg);
+		read_unlock(&css_set_lock);
+	} else {
+		might_sleep();
+		/* find_css_set will give us newcg already referenced. */
+		newcg = find_css_set(oldcg, cgrp);
+		if (!newcg) {
+			put_css_set(oldcg);
+			return -ENOMEM;
+		}
+	}
+	put_css_set(oldcg);
+
+	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+	task_lock(tsk);
+	if (tsk->flags & PF_EXITING) {
+		task_unlock(tsk);
+		put_css_set(newcg);
+		return -ESRCH;
+	}
+	rcu_assign_pointer(tsk->cgroups, newcg);
+	task_unlock(tsk);
+
+	/* Update the css_set linked lists if we're using them */
+	write_lock(&css_set_lock);
+	if (!list_empty(&tsk->cg_list))
+		list_move(&tsk->cg_list, &newcg->tasks);
+	write_unlock(&css_set_lock);
+
+	/*
+	 * We just gained a reference on oldcg by taking it from the task. As
+	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+	 * it here; it will be freed under RCU.
+	 */
+	put_css_set(oldcg);
+
+	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+	return 0;
+}
+
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
@@ -1745,11 +1815,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	int retval = 0;
+	int retval;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
-	struct css_set *cg;
-	struct css_set *newcg;
 	struct cgroupfs_root *root = cgrp->root;
 
 	/* Nothing to do if the task is already in that cgroup */
@@ -1780,36 +1848,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		}
 	}
 
-	task_lock(tsk);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-	/*
-	 * Locate or allocate a new css_set for this task,
-	 * based on its final set of cgroups
-	 */
-	newcg = find_css_set(cg, cgrp);
-	put_css_set(cg);
-	if (!newcg) {
-		retval = -ENOMEM;
-		goto out;
-	}
-
-	task_lock(tsk);
-	if (tsk->flags & PF_EXITING) {
-		task_unlock(tsk);
-		put_css_set(newcg);
-		retval = -ESRCH;
+	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+	if (retval)
 		goto out;
-	}
-	rcu_assign_pointer(tsk->cgroups, newcg);
-	task_unlock(tsk);
-
-	/* Update the css_set linked lists if we're using them */
-	write_lock(&css_set_lock);
-	if (!list_empty(&tsk->cg_list))
-		list_move(&tsk->cg_list, &newcg->tasks);
-	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
 		if (ss->pre_attach)
@@ -1819,9 +1860,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		if (ss->attach)
 			ss->attach(ss, cgrp, oldcgrp, tsk);
 	}
-	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
 	synchronize_rcu();
-	put_css_set(cg);
 
 	/*
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1871,49 +1911,356 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+	struct css_set *cg;
+	struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+				  struct task_struct *tsk, struct css_set *cg,
+				  struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+	read_lock(&css_set_lock);
+	newcg = find_existing_css_set(cg, cgrp, template);
+	if (newcg)
+		get_css_set(newcg);
+	read_unlock(&css_set_lock);
+
+	/* doesn't exist at all? */
+	if (!newcg)
+		return false;
+	/* see if it's already in the list */
+	list_for_each_entry(cg_entry, newcg_list, links) {
+		if (cg_entry->cg == newcg) {
+			put_css_set(newcg);
+			return true;
+		}
+	}
+
+	/* not found */
+	put_css_set(newcg);
+	return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+			    struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+
+	/* ensure a new css_set will exist for this thread */
+	newcg = find_css_set(cg, cgrp);
+	if (!newcg)
+		return -ENOMEM;
+	/* add it to the list */
+	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+	if (!cg_entry) {
+		put_css_set(newcg);
+		return -ENOMEM;
+	}
+	cg_entry->cg = newcg;
+	list_add(&cg_entry->links, newcg_list);
+	return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+	int retval, i, group_size;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
+	bool cancel_failed_ss = false;
+	/* guaranteed to be initialized later, but the compiler needs this */
+	struct cgroup *oldcgrp = NULL;
+	struct css_set *oldcg;
+	struct cgroupfs_root *root = cgrp->root;
+	/* threadgroup list cursor and array */
+	struct task_struct *tsk;
+	struct task_struct **group;
+	/*
+	 * we need to make sure we have css_sets for all the tasks we're
+	 * going to move -before- we actually start moving them, so that in
+	 * case we get an ENOMEM we can bail out before making any changes.
+	 */
+	struct list_head newcg_list;
+	struct cg_list_entry *cg_entry, *temp_nobe;
+
+	/*
+	 * step 0: in order to do expensive, possibly blocking operations for
+	 * every thread, we cannot iterate the thread group list, since it needs
+	 * rcu or tasklist locked. instead, build an array of all threads in the
+	 * group - threadgroup_fork_lock prevents new threads from appearing,
+	 * and if threads exit, this will just be an over-estimate.
+	 */
+	group_size = get_nr_threads(leader);
+	group = kmalloc(group_size * sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+
+	/* prevent changes to the threadgroup list while we take a snapshot. */
+	rcu_read_lock();
+	if (!thread_group_leader(leader)) {
+		/*
+		 * a race with de_thread from another thread's exec() may strip
+		 * us of our leadership, making while_each_thread unsafe to use
+		 * on this task. if this happens, there is no choice but to
+		 * throw this task away and try again (from cgroup_procs_write);
+		 * this is "double-double-toil-and-trouble-check locking".
+		 */
+		rcu_read_unlock();
+		retval = -EAGAIN;
+		goto out_free_group_list;
+	}
+	/* take a reference on each task in the group to go in the array. */
+	tsk = leader;
+	i = 0;
+	do {
+		/* as per above, nr_threads may decrease, but not increase. */
+		BUG_ON(i >= group_size);
+		get_task_struct(tsk);
+		group[i] = tsk;
+		i++;
+	} while_each_thread(leader, tsk);
+	/* remember the number of threads in the array for later. */
+	group_size = i;
+	rcu_read_unlock();
+
+	/*
+	 * step 1: check that we can legitimately attach to the cgroup.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->can_attach) {
+			retval = ss->can_attach(ss, cgrp, leader);
+			if (retval) {
+				failed_ss = ss;
+				goto out_cancel_attach;
+			}
+		}
+		/* a callback to be run on every thread in the threadgroup. */
+		if (ss->can_attach_task) {
+			/* run on each task in the threadgroup. */
+			for (i = 0; i < group_size; i++) {
+				retval = ss->can_attach_task(cgrp, group[i]);
+				if (retval) {
+					failed_ss = ss;
+					cancel_failed_ss = true;
+					goto out_cancel_attach;
+				}
+			}
+		}
+	}
+
+	/*
+	 * step 2: make sure css_sets exist for all threads to be migrated.
+	 * we use find_css_set, which allocates a new one if necessary.
+	 */
+	INIT_LIST_HEAD(&newcg_list);
+	for (i = 0; i < group_size; i++) {
+		tsk = group[i];
+		/* nothing to do if this task is already in the cgroup */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* get old css_set pointer */
+		task_lock(tsk);
+		if (tsk->flags & PF_EXITING) {
+			/* ignore this task if it's going away */
+			task_unlock(tsk);
+			continue;
+		}
+		oldcg = tsk->cgroups;
+		get_css_set(oldcg);
+		task_unlock(tsk);
+		/* see if the new one for us is already in the list? */
+		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+			/* was already there, nothing to do. */
+			put_css_set(oldcg);
+		} else {
+			/* we don't already have it. get new one. */
+			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+			put_css_set(oldcg);
+			if (retval)
+				goto out_list_teardown;
+		}
+	}
+
+	/*
+	 * step 3: now that we're guaranteed success wrt the css_sets, proceed
+	 * to move all tasks to the new cgroup, calling ss->attach_task for each
+	 * one along the way. there are no failure cases after here, so this is
+	 * the commit point.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+	}
+	for (i = 0; i < group_size; i++) {
+		tsk = group[i];
+		/* leave current thread as it is if it's already there */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* attach each task to each subsystem */
+		for_each_subsys(root, ss) {
+			if (ss->attach_task)
+				ss->attach_task(cgrp, tsk);
+		}
+		/* if the thread is PF_EXITING, it can just get skipped. */
+		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+		BUG_ON(retval != 0 && retval != -ESRCH);
+	}
+	/* nothing is sensitive to fork() after this point. */
+
+	/*
+	 * step 4: do expensive, non-thread-specific subsystem callbacks.
+	 * TODO: if ever a subsystem needs to know the oldcgrp for each task
+	 * being moved, this call will need to be reworked to communicate that.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->attach)
+			ss->attach(ss, cgrp, oldcgrp, leader);
+	}
+
+	/*
+	 * step 5: success! and cleanup
+	 */
+	synchronize_rcu();
+	cgroup_wakeup_rmdir_waiter(cgrp);
+	retval = 0;
+out_list_teardown:
+	/* clean up the list of prefetched css_sets. */
+	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+		list_del(&cg_entry->links);
+		put_css_set(cg_entry->cg);
+		kfree(cg_entry);
+	}
+out_cancel_attach:
+	/* same deal as in cgroup_attach_task */
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss) {
+				if (cancel_failed_ss && ss->cancel_attach)
+					ss->cancel_attach(ss, cgrp, leader);
+				break;
+			}
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, leader);
+		}
+	}
+	/* clean up the array of referenced threads in the group. */
+	for (i = 0; i < group_size; i++)
+		put_task_struct(group[i]);
+out_free_group_list:
+	kfree(group);
+	return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
  */
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
 	struct task_struct *tsk;
 	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
 	if (pid) {
 		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
-		if (!tsk || tsk->flags & PF_EXITING) {
+		if (!tsk) {
 			rcu_read_unlock();
+			cgroup_unlock();
+			return -ESRCH;
+		}
+		if (threadgroup) {
+			/*
+			 * RCU protects this access, since tsk was found in the
+			 * tid map. a race with de_thread may cause group_leader
+			 * to stop being the leader, but cgroup_attach_proc will
+			 * detect it later.
+			 */
+			tsk = tsk->group_leader;
+		} else if (tsk->flags & PF_EXITING) {
+			/* optimization for the single-task-only case */
+			rcu_read_unlock();
+			cgroup_unlock();
 			return -ESRCH;
 		}
 
+		/*
+		 * even if we're attaching all tasks in the thread group, we
+		 * only need to check permissions on one of them.
+		 */
 		tcred = __task_cred(tsk);
 		if (cred->euid &&
 		    cred->euid != tcred->uid &&
 		    cred->euid != tcred->suid) {
 			rcu_read_unlock();
+			cgroup_unlock();
 			return -EACCES;
 		}
 		get_task_struct(tsk);
 		rcu_read_unlock();
 	} else {
-		tsk = current;
+		if (threadgroup)
+			tsk = current->group_leader;
+		else
+			tsk = current;
 		get_task_struct(tsk);
 	}
 
-	ret = cgroup_attach_task(cgrp, tsk);
+	if (threadgroup) {
+		threadgroup_fork_write_lock(tsk);
+		ret = cgroup_attach_proc(cgrp, tsk);
+		threadgroup_fork_write_unlock(tsk);
+	} else {
+		ret = cgroup_attach_task(cgrp, tsk);
+	}
 	put_task_struct(tsk);
+	cgroup_unlock();
 	return ret;
 }
 
 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 {
+	return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
 	int ret;
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
-	ret = attach_task_by_pid(cgrp, pid);
-	cgroup_unlock();
+	do {
+		/*
+		 * attach_proc fails with -EAGAIN if threadgroup leadership
+		 * changes in the middle of the operation, in which case we need
+		 * to find the task_struct for the new leader and start over.
+		 */
+		ret = attach_task_by_pid(cgrp, tgid, true);
+	} while (ret == -EAGAIN);
 	return ret;
 }
 
@@ -3270,9 +3617,9 @@ static struct cftype files[] = {
 	{
 		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
 		.open = cgroup_procs_open,
-		/* .write_u64 = cgroup_procs_write, TODO */
+		.write_u64 = cgroup_procs_write,
 		.release = cgroup_pidlist_release,
-		.mode = S_IRUGO,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "notify_on_release",