cgroups: mechanism to process each task in a cgroup

Provide cgroup_scan_tasks(), which iterates through every task in a cgroup, calling a test function and a process function for each. The process function is called without holding css_set_lock. The idea is David Rientjes', who predicted that such a function will make it much easier in the future to extend things that require access to each task in a cgroup without holding the lock. [akpm@linux-foundation.org: cleanup] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Cliff Wickman <cpw@sgi.com> Cc: Paul Menage <menage@google.com> Cc: Paul Jackson <pj@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Cliff Wickman <cpw@sgi.com> 2008-02-07 03:14:42 -0500
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2008-02-07 11:42:22 -0500
commit: 31a7df01fd0cd786f60873a921aecafac148c290 (patch)
tree: 221f00c864c50e7dc4719cb4de09292040567c55
parent: dfc05c259e424e4160c66eab728f55cc4b53fd75 (diff)
2 files changed, 200 insertions, 12 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d8e92223a79c..8675c691d3e2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -14,6 +14,7 @@
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
 #include <linux/cgroupstats.h>
+#include <linux/prio_heap.h>
 #ifdef CONFIG_CGROUPS
@@ -207,6 +208,14 @@ struct cftype {
        int (*release) (struct inode *inode, struct file *file);
 };
+struct cgroup_scanner {  /* arguments for cgroup_scan_tasks() */
+        struct cgroup *cg;  /* cgroup whose tasks are scanned */
+        int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);  /* optional filter; NULL selects every task */
+        void (*process_task)(struct task_struct *p,
+                        struct cgroup_scanner *scan);  /* called for each selected task, without css_set_lock held */
+        struct ptr_heap *heap;  /* optional pre-allocated heap; if NULL a temporary one is allocated */
+};
 /* Add a new file to the given cgroup directory. Should only be
 * called by subsystems from within a populate() method */
 int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
@@ -299,11 +308,16 @@ struct cgroup_iter {
 *    returns NULL or until you want to end the iteration
 *
 * 3) call cgroup_iter_end() to destroy the iterator.
+ *
+ * Or, call cgroup_scan_tasks() to iterate through every task in a cgroup.
+ *    - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
+ *      callback, but not while calling the process_task() callback.
 */
 void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
 struct task_struct *cgroup_iter_next(struct cgroup *cont,
                                        struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
+int cgroup_scan_tasks(struct cgroup_scanner *scan);
 #else /* !CONFIG_CGROUPS */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4e8b16a8266c..bcc7a6e8e3c0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1695,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
        it->task = cg->tasks.next;
 }
+/*
+ * Turn on the per-css_set task lists and link every existing task onto
+ * the list of its css_set.  To reduce the fork() overhead for systems
+ * that are not actually using their cgroups capability, these lists are
+ * not maintained until the first call to cgroup_iter_start().
+ *
+ * The tasklist_lock is not held here, as do_each_thread() and
+ * while_each_thread() are protected by RCU.
+ */
+void cgroup_enable_task_cg_lists(void)
+{
+        struct task_struct *p, *g;
+        write_lock(&css_set_lock);
+        use_task_css_set_links = 1;  /* set while css_set_lock is write-held */
+        do_each_thread(g, p) {
+                task_lock(p);
+                if (list_empty(&p->cg_list))  /* skip tasks that are already linked */
+                        list_add(&p->cg_list, &p->cgroups->tasks);
+                task_unlock(p);
+        } while_each_thread(g, p);
+        write_unlock(&css_set_lock);
+}
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 {
        /*
@@ -1702,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
         * we need to enable the list linking each css_set to its
         * tasks, and fix up all existing tasks.
         */
-        if (!use_task_css_set_links) {
+        if (!use_task_css_set_links)
-                struct task_struct *p, *g;
+                cgroup_enable_task_cg_lists();
-                write_lock(&css_set_lock);
-                use_task_css_set_links = 1;
-                do_each_thread(g, p) {
-                        task_lock(p);
-                        if (list_empty(&p->cg_list))
-                                list_add(&p->cg_list, &p->cgroups->tasks);
-                        task_unlock(p);
-                } while_each_thread(g, p);
-                write_unlock(&css_set_lock);
-        }
        read_lock(&css_set_lock);
        it->cg_link = &cgrp->css_sets;
        cgroup_advance_iter(cgrp, it);
@@ -1746,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
        read_unlock(&css_set_lock);
 }
+static inline int started_after_time(struct task_struct *t1,
+                                     struct timespec *time,
+                                     struct task_struct *t2)
+{
+        int start_diff = timespec_compare(&t1->start_time, time);  /* sign orders t1's start time against @time */
+        if (start_diff > 0) {
+                return 1;  /* t1 started after @time */
+        } else if (start_diff < 0) {
+                return 0;  /* t1 started before @time */
+        } else {
+                /*
+                 * Arbitrarily, if two processes started at the same
+                 * time, we'll say that the lower pointer value
+                 * started first. Note that t2 may have exited by now
+                 * so this may not be a valid pointer any longer, but
+                 * that's fine - it still serves to distinguish
+                 * between two tasks started (effectively) simultaneously.
+                 */
+                return t1 > t2;  /* t2 is only compared, never dereferenced */
+        }
+}
+/*
+ * Comparison callback used by heap_insert() to order the heap: returns
+ * nonzero if task p1 started after task p2 (ties broken by pointer value).
+ * In this case we order the heap in descending task start time.
+ */
+static inline int started_after(void *p1, void *p2)
+{
+        struct task_struct *t1 = p1;
+        struct task_struct *t2 = p2;
+        return started_after_time(t1, &t2->start_time, t2);
+}
+/**
+ * cgroup_scan_tasks - iterate through all the tasks in a cgroup
+ * @scan: struct cgroup_scanner containing arguments for the scan
+ *
+ * Arguments include pointers to callback functions test_task() and
+ * process_task().
+ * Iterate through all the tasks in a cgroup, calling test_task() for each,
+ * and if it returns true, call process_task() for it also.
+ * The test_task pointer may be NULL, meaning always true (select all tasks).
+ * Effectively duplicates cgroup_iter_{start,next,end}()
+ * but does not lock css_set_lock for the call to process_task().
+ * The struct cgroup_scanner may be embedded in any structure of the caller's
+ * creation.
+ * It is guaranteed that process_task() will act on every task that
+ * is a member of the cgroup for the duration of this call. This
+ * function may or may not call process_task() for tasks that exit
+ * or move to a different cgroup during the call, or are forked or
+ * move into the cgroup during the call.
+ *
+ * Note that test_task() may be called with locks held, and may in some
+ * situations be called multiple times for the same task, so it should
+ * be cheap.
+ * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
+ * pre-allocated and will be used for heap operations (and its "gt" member will
+ * be overwritten), else a temporary heap is allocated.  Returns 0 on success,
+ * or the heap_init() error if the temporary heap cannot be allocated.
+ */
+int cgroup_scan_tasks(struct cgroup_scanner *scan)
+{
+        int retval, i;
+        struct cgroup_iter it;
+        struct task_struct *p, *dropped;
+        /* Never dereference latest_task, since it's not refcounted */
+        struct task_struct *latest_task = NULL;
+        struct ptr_heap tmp_heap;
+        struct ptr_heap *heap;
+        struct timespec latest_time = { 0, 0 };
+        if (scan->heap) {
+                /* The caller supplied our heap and pre-allocated its memory */
+                heap = scan->heap;
+                heap->gt = &started_after;
+        } else {
+                /* We need to allocate our own heap memory */
+                heap = &tmp_heap;
+                retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+                if (retval)
+                        /* cannot allocate the heap */
+                        return retval;
+        }
+ again:
+        /*
+         * Scan tasks in the cgroup, using the scanner's "test_task" callback
+         * to determine which are of interest, and using the scanner's
+         * "process_task" callback to process any of them that need an update.
+         * Since we don't want to hold any locks during the task updates,
+         * gather tasks to be processed in a heap structure.
+         * The heap is sorted by descending task start time.
+         * If the statically-sized heap fills up, we overflow tasks that
+         * started later, and in future iterations only consider tasks that
+         * started after the latest task in the previous pass. This
+         * guarantees forward progress and that we don't miss any tasks.
+         */
+        heap->size = 0;  /* every pass starts from an empty heap */
+        cgroup_iter_start(scan->cg, &it);
+        while ((p = cgroup_iter_next(scan->cg, &it))) {
+                /*
+                 * Only affect tasks that qualify per the caller's callback,
+                 * if one was provided
+                 */
+                if (scan->test_task && !scan->test_task(p, scan))
+                        continue;
+                /*
+                 * Only process tasks that started after the last task
+                 * we processed
+                 */
+                if (!started_after_time(p, &latest_time, latest_task))
+                        continue;
+                dropped = heap_insert(heap, p);
+                if (dropped == NULL) {
+                        /*
+                         * The new task was inserted; the heap wasn't
+                         * previously full
+                         */
+                        get_task_struct(p);
+                } else if (dropped != p) {
+                        /*
+                         * The new task was inserted, and pushed out a
+                         * different task
+                         */
+                        get_task_struct(p);
+                        put_task_struct(dropped);
+                }
+                /*
+                 * Else the new task was newer than anything already in
+                 * the heap and wasn't inserted
+                 */
+        }
+        cgroup_iter_end(scan->cg, &it);
+        if (heap->size) {
+                for (i = 0; i < heap->size; i++) {
+                        struct task_struct *p = heap->ptrs[i];  /* NOTE(review): shadows the outer 'p' */
+                        if (i == 0) {  /* remember the first heap entry as the cut-off for the next pass */
+                                latest_time = p->start_time;
+                                latest_task = p;
+                        }
+                        /* Process the task per the caller's callback */
+                        scan->process_task(p, scan);
+                        put_task_struct(p);  /* drop the reference taken when the task was inserted */
+                }
+                /*
+                 * If we had to process any tasks at all, scan again
+                 * in case some of them were in the middle of forking
+                 * children that didn't get processed.
+                 * Not the most efficient way to do it, but it avoids
+                 * having to take callback_mutex in the fork path
+                 */
+                goto again;
+        }
+        if (heap == &tmp_heap)  /* only free a heap allocated by this function */
+                heap_free(&tmp_heap);
+        return 0;
+}
 /*
 * Stuff for reading the 'tasks' file.
 *
author	Cliff Wickman <cpw@sgi.com>	2008-02-07 03:14:42 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-07 11:42:22 -0500
commit	31a7df01fd0cd786f60873a921aecafac148c290 (patch)
tree	221f00c864c50e7dc4719cb4de09292040567c55
parent	dfc05c259e424e4160c66eab728f55cc4b53fd75 (diff)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d8e92223a79c..8675c691d3e2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h
@@ -14,6 +14,7 @@
14	#include <linux/nodemask.h>	14	#include <linux/nodemask.h>
15	#include <linux/rcupdate.h>	15	#include <linux/rcupdate.h>
16	#include <linux/cgroupstats.h>	16	#include <linux/cgroupstats.h>
		17	#include <linux/prio_heap.h>
17		18
18	#ifdef CONFIG_CGROUPS	19	#ifdef CONFIG_CGROUPS
19		20
@@ -207,6 +208,14 @@ struct cftype {
207	int (release) (struct inode inode, struct file *file);	208	int (release) (struct inode inode, struct file *file);
208	};	209	};
209		210
		211	struct cgroup_scanner {
		212	struct cgroup *cg;
		213	int (test_task)(struct task_struct p, struct cgroup_scanner *scan);
		214	void (process_task)(struct task_struct p,
		215	struct cgroup_scanner *scan);
		216	struct ptr_heap *heap;
		217	};
		218
210	/* Add a new file to the given cgroup directory. Should only be	219	/* Add a new file to the given cgroup directory. Should only be
211	* called by subsystems from within a populate() method */	220	* called by subsystems from within a populate() method */
212	int cgroup_add_file(struct cgroup cont, struct cgroup_subsys subsys,	221	int cgroup_add_file(struct cgroup cont, struct cgroup_subsys subsys,
@@ -299,11 +308,16 @@ struct cgroup_iter {
299	* returns NULL or until you want to end the iteration	308	* returns NULL or until you want to end the iteration
300	*	309	*
301	* 3) call cgroup_iter_end() to destroy the iterator.	310	* 3) call cgroup_iter_end() to destroy the iterator.
		311	*
		312	* Or, call cgroup_scan_tasks() to iterate through every task in a cgroup.
		313	* - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
		314	* callback, but not while calling the process_task() callback.
302	*/	315	*/
303	void cgroup_iter_start(struct cgroup cont, struct cgroup_iter it);	316	void cgroup_iter_start(struct cgroup cont, struct cgroup_iter it);
304	struct task_struct cgroup_iter_next(struct cgroup cont,	317	struct task_struct cgroup_iter_next(struct cgroup cont,
305	struct cgroup_iter *it);	318	struct cgroup_iter *it);
306	void cgroup_iter_end(struct cgroup cont, struct cgroup_iter it);	319	void cgroup_iter_end(struct cgroup cont, struct cgroup_iter it);
		320	int cgroup_scan_tasks(struct cgroup_scanner *scan);
307		321
308	#else /* !CONFIG_CGROUPS */	322	#else /* !CONFIG_CGROUPS */
309		323


diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4e8b16a8266c..bcc7a6e8e3c0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c
@@ -1695,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
1695	it->task = cg->tasks.next;	1695	it->task = cg->tasks.next;
1696	}	1696	}
1697		1697
		1698	/*
		1699	* To reduce the fork() overhead for systems that are not actually
		1700	* using their cgroups capability, we don't maintain the lists running
		1701	* through each css_set to its tasks until we see the list actually
		1702	* used - in other words after the first call to cgroup_iter_start().
		1703	*
		1704	* The tasklist_lock is not held here, as do_each_thread() and
		1705	* while_each_thread() are protected by RCU.
		1706	*/
		1707	void cgroup_enable_task_cg_lists(void)
		1708	{
		1709	struct task_struct p, g;
		1710	write_lock(&css_set_lock);
		1711	use_task_css_set_links = 1;
		1712	do_each_thread(g, p) {
		1713	task_lock(p);
		1714	if (list_empty(&p->cg_list))
		1715	list_add(&p->cg_list, &p->cgroups->tasks);
		1716	task_unlock(p);
		1717	} while_each_thread(g, p);
		1718	write_unlock(&css_set_lock);
		1719	}
		1720
1698	void cgroup_iter_start(struct cgroup cgrp, struct cgroup_iter it)	1721	void cgroup_iter_start(struct cgroup cgrp, struct cgroup_iter it)
1699	{	1722	{
1700	/*	1723	/*
@@ -1702,18 +1725,9 @@ void cgroup_iter_start(struct cgroup cgrp, struct cgroup_iter it)
1702	* we need to enable the list linking each css_set to its	1725	* we need to enable the list linking each css_set to its
1703	* tasks, and fix up all existing tasks.	1726	* tasks, and fix up all existing tasks.
1704	*/	1727	*/
1705	if (!use_task_css_set_links) {	1728	if (!use_task_css_set_links)
1706	struct task_struct p, g;	1729	cgroup_enable_task_cg_lists();
1707	write_lock(&css_set_lock);	1730
1708	use_task_css_set_links = 1;
1709	do_each_thread(g, p) {
1710	task_lock(p);
1711	if (list_empty(&p->cg_list))
1712	list_add(&p->cg_list, &p->cgroups->tasks);
1713	task_unlock(p);
1714	} while_each_thread(g, p);
1715	write_unlock(&css_set_lock);
1716	}
1717	read_lock(&css_set_lock);	1731	read_lock(&css_set_lock);
1718	it->cg_link = &cgrp->css_sets;	1732	it->cg_link = &cgrp->css_sets;
1719	cgroup_advance_iter(cgrp, it);	1733	cgroup_advance_iter(cgrp, it);
@@ -1746,6 +1760,166 @@ void cgroup_iter_end(struct cgroup cgrp, struct cgroup_iter it)
1746	read_unlock(&css_set_lock);	1760	read_unlock(&css_set_lock);
1747	}	1761	}
1748		1762
		1763	static inline int started_after_time(struct task_struct *t1,
		1764	struct timespec *time,
		1765	struct task_struct *t2)
		1766	{
		1767	int start_diff = timespec_compare(&t1->start_time, time);
		1768	if (start_diff > 0) {
		1769	return 1;
		1770	} else if (start_diff < 0) {
		1771	return 0;
		1772	} else {
		1773	/*
		1774	* Arbitrarily, if two processes started at the same
		1775	* time, we'll say that the lower pointer value
		1776	* started first. Note that t2 may have exited by now
		1777	* so this may not be a valid pointer any longer, but
		1778	* that's fine - it still serves to distinguish
		1779	* between two tasks started (effectively) simultaneously.
		1780	*/
		1781	return t1 > t2;
		1782	}
		1783	}
		1784
		1785	/*
		1786	* This function is a callback from heap_insert() and is used to order
		1787	* the heap.
		1788	* In this case we order the heap in descending task start time.
		1789	*/
		1790	static inline int started_after(void p1, void p2)
		1791	{
		1792	struct task_struct *t1 = p1;
		1793	struct task_struct *t2 = p2;
		1794	return started_after_time(t1, &t2->start_time, t2);
		1795	}
		1796
		1797	/**
		1798	* cgroup_scan_tasks - iterate through all the tasks in a cgroup
		1799	* @scan: struct cgroup_scanner containing arguments for the scan
		1800	*
		1801	* Arguments include pointers to callback functions test_task() and
		1802	* process_task().
		1803	* Iterate through all the tasks in a cgroup, calling test_task() for each,
		1804	* and if it returns true, call process_task() for it also.
		1805	* The test_task pointer may be NULL, meaning always true (select all tasks).
		1806	* Effectively duplicates cgroup_iter_{start,next,end}()
		1807	* but does not lock css_set_lock for the call to process_task().
		1808	* The struct cgroup_scanner may be embedded in any structure of the caller's
		1809	* creation.
		1810	* It is guaranteed that process_task() will act on every task that
		1811	* is a member of the cgroup for the duration of this call. This
		1812	* function may or may not call process_task() for tasks that exit
		1813	* or move to a different cgroup during the call, or are forked or
		1814	* move into the cgroup during the call.
		1815	*
		1816	* Note that test_task() may be called with locks held, and may in some
		1817	* situations be called multiple times for the same task, so it should
		1818	* be cheap.
		1819	* If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
		1820	* pre-allocated and will be used for heap operations (and its "gt" member will
		1821	* be overwritten), else a temporary heap will be used (allocation of which
		1822	* may cause this function to fail).
		1823	*/
		1824	int cgroup_scan_tasks(struct cgroup_scanner *scan)
		1825	{
		1826	int retval, i;
		1827	struct cgroup_iter it;
		1828	struct task_struct p, dropped;
		1829	/* Never dereference latest_task, since it's not refcounted */
		1830	struct task_struct *latest_task = NULL;
		1831	struct ptr_heap tmp_heap;
		1832	struct ptr_heap *heap;
		1833	struct timespec latest_time = { 0, 0 };
		1834
		1835	if (scan->heap) {
		1836	/* The caller supplied our heap and pre-allocated its memory */
		1837	heap = scan->heap;
		1838	heap->gt = &started_after;
		1839	} else {
		1840	/* We need to allocate our own heap memory */
		1841	heap = &tmp_heap;
		1842	retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
		1843	if (retval)
		1844	/* cannot allocate the heap */
		1845	return retval;
		1846	}
		1847
		1848	again:
		1849	/*
		1850	* Scan tasks in the cgroup, using the scanner's "test_task" callback
		1851	* to determine which are of interest, and using the scanner's
		1852	* "process_task" callback to process any of them that need an update.
		1853	* Since we don't want to hold any locks during the task updates,
		1854	* gather tasks to be processed in a heap structure.
		1855	* The heap is sorted by descending task start time.
		1856	* If the statically-sized heap fills up, we overflow tasks that
		1857	* started later, and in future iterations only consider tasks that
		1858	* started after the latest task in the previous pass. This
		1859	* guarantees forward progress and that we don't miss any tasks.
		1860	*/
		1861	heap->size = 0;
		1862	cgroup_iter_start(scan->cg, &it);
		1863	while ((p = cgroup_iter_next(scan->cg, &it))) {
		1864	/*
		1865	* Only affect tasks that qualify per the caller's callback,
		1866	* if he provided one
		1867	*/
		1868	if (scan->test_task && !scan->test_task(p, scan))
		1869	continue;
		1870	/*
		1871	* Only process tasks that started after the last task
		1872	* we processed
		1873	*/
		1874	if (!started_after_time(p, &latest_time, latest_task))
		1875	continue;
		1876	dropped = heap_insert(heap, p);
		1877	if (dropped == NULL) {
		1878	/*
		1879	* The new task was inserted; the heap wasn't
		1880	* previously full
		1881	*/
		1882	get_task_struct(p);
		1883	} else if (dropped != p) {
		1884	/*
		1885	* The new task was inserted, and pushed out a
		1886	* different task
		1887	*/
		1888	get_task_struct(p);
		1889	put_task_struct(dropped);
		1890	}
		1891	/*
		1892	* Else the new task was newer than anything already in
		1893	* the heap and wasn't inserted
		1894	*/
		1895	}
		1896	cgroup_iter_end(scan->cg, &it);
		1897
		1898	if (heap->size) {
		1899	for (i = 0; i < heap->size; i++) {
		1900	struct task_struct *p = heap->ptrs[i];
		1901	if (i == 0) {
		1902	latest_time = p->start_time;
		1903	latest_task = p;
		1904	}
		1905	/* Process the task per the caller's callback */
		1906	scan->process_task(p, scan);
		1907	put_task_struct(p);
		1908	}
		1909	/*
		1910	* If we had to process any tasks at all, scan again
		1911	* in case some of them were in the middle of forking
		1912	* children that didn't get processed.
		1913	* Not the most efficient way to do it, but it avoids
		1914	* having to take callback_mutex in the fork path
		1915	*/
		1916	goto again;
		1917	}
		1918	if (heap == &tmp_heap)
		1919	heap_free(&tmp_heap);
		1920	return 0;
		1921	}
		1922
1749	/*	1923	/*
1750	* Stuff for reading the 'tasks' file.	1924	* Stuff for reading the 'tasks' file.
1751	*	1925	*