aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCliff Wickman <cpw@sgi.com>2008-02-07 03:14:42 -0500
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2008-02-07 11:42:22 -0500
commit31a7df01fd0cd786f60873a921aecafac148c290 (patch)
tree221f00c864c50e7dc4719cb4de09292040567c55
parentdfc05c259e424e4160c66eab728f55cc4b53fd75 (diff)
cgroups: mechanism to process each task in a cgroup
Provide cgroup_scan_tasks(), which iterates through every task in a cgroup, calling a test function and a process function for each. And call the process function without holding the css_set_lock lock. The idea is David Rientjes', predicting that such a function will make it much easier in the future to extend things that require access to each task in a cgroup without holding the lock, [akpm@linux-foundation.org: cleanup] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Cliff Wickman <cpw@sgi.com> Cc: Paul Menage <menage@google.com> Cc: Paul Jackson <pj@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/cgroup.h14
-rw-r--r--kernel/cgroup.c198
2 files changed, 200 insertions, 12 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d8e92223a79c..8675c691d3e2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -14,6 +14,7 @@
14#include <linux/nodemask.h> 14#include <linux/nodemask.h>
15#include <linux/rcupdate.h> 15#include <linux/rcupdate.h>
16#include <linux/cgroupstats.h> 16#include <linux/cgroupstats.h>
17#include <linux/prio_heap.h>
17 18
18#ifdef CONFIG_CGROUPS 19#ifdef CONFIG_CGROUPS
19 20
@@ -207,6 +208,14 @@ struct cftype {
207 int (*release) (struct inode *inode, struct file *file); 208 int (*release) (struct inode *inode, struct file *file);
208}; 209};
209 210
211struct cgroup_scanner {
212 struct cgroup *cg;
213 int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
214 void (*process_task)(struct task_struct *p,
215 struct cgroup_scanner *scan);
216 struct ptr_heap *heap;
217};
218
210/* Add a new file to the given cgroup directory. Should only be 219/* Add a new file to the given cgroup directory. Should only be
211 * called by subsystems from within a populate() method */ 220 * called by subsystems from within a populate() method */
212int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys, 221int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
@@ -299,11 +308,16 @@ struct cgroup_iter {
299 * returns NULL or until you want to end the iteration 308 * returns NULL or until you want to end the iteration
300 * 309 *
301 * 3) call cgroup_iter_end() to destroy the iterator. 310 * 3) call cgroup_iter_end() to destroy the iterator.
311 *
312 * Or, call cgroup_scan_tasks() to iterate through every task in a cpuset.
313 * - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
314 * callback, but not while calling the process_task() callback.
302 */ 315 */
303void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it); 316void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
304struct task_struct *cgroup_iter_next(struct cgroup *cont, 317struct task_struct *cgroup_iter_next(struct cgroup *cont,
305 struct cgroup_iter *it); 318 struct cgroup_iter *it);
306void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it); 319void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
320int cgroup_scan_tasks(struct cgroup_scanner *scan);
307 321
308#else /* !CONFIG_CGROUPS */ 322#else /* !CONFIG_CGROUPS */
309 323
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4e8b16a8266c..bcc7a6e8e3c0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1695,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
1695 it->task = cg->tasks.next; 1695 it->task = cg->tasks.next;
1696} 1696}
1697 1697
1698/*
1699 * To reduce the fork() overhead for systems that are not actually
1700 * using their cgroups capability, we don't maintain the lists running
1701 * through each css_set to its tasks until we see the list actually
1702 * used - in other words after the first call to cgroup_iter_start().
1703 *
1704 * The tasklist_lock is not held here, as do_each_thread() and
1705 * while_each_thread() are protected by RCU.
1706 */
1707void cgroup_enable_task_cg_lists(void)
1708{
1709 struct task_struct *p, *g;
1710 write_lock(&css_set_lock);
1711 use_task_css_set_links = 1;
1712 do_each_thread(g, p) {
1713 task_lock(p);
1714 if (list_empty(&p->cg_list))
1715 list_add(&p->cg_list, &p->cgroups->tasks);
1716 task_unlock(p);
1717 } while_each_thread(g, p);
1718 write_unlock(&css_set_lock);
1719}
1720
1698void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 1721void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
1699{ 1722{
1700 /* 1723 /*
@@ -1702,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
1702 * we need to enable the list linking each css_set to its 1725 * we need to enable the list linking each css_set to its
1703 * tasks, and fix up all existing tasks. 1726 * tasks, and fix up all existing tasks.
1704 */ 1727 */
1705 if (!use_task_css_set_links) { 1728 if (!use_task_css_set_links)
1706 struct task_struct *p, *g; 1729 cgroup_enable_task_cg_lists();
1707 write_lock(&css_set_lock); 1730
1708 use_task_css_set_links = 1;
1709 do_each_thread(g, p) {
1710 task_lock(p);
1711 if (list_empty(&p->cg_list))
1712 list_add(&p->cg_list, &p->cgroups->tasks);
1713 task_unlock(p);
1714 } while_each_thread(g, p);
1715 write_unlock(&css_set_lock);
1716 }
1717 read_lock(&css_set_lock); 1731 read_lock(&css_set_lock);
1718 it->cg_link = &cgrp->css_sets; 1732 it->cg_link = &cgrp->css_sets;
1719 cgroup_advance_iter(cgrp, it); 1733 cgroup_advance_iter(cgrp, it);
@@ -1746,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
1746 read_unlock(&css_set_lock); 1760 read_unlock(&css_set_lock);
1747} 1761}
1748 1762
1763static inline int started_after_time(struct task_struct *t1,
1764 struct timespec *time,
1765 struct task_struct *t2)
1766{
1767 int start_diff = timespec_compare(&t1->start_time, time);
1768 if (start_diff > 0) {
1769 return 1;
1770 } else if (start_diff < 0) {
1771 return 0;
1772 } else {
1773 /*
1774 * Arbitrarily, if two processes started at the same
1775 * time, we'll say that the lower pointer value
1776 * started first. Note that t2 may have exited by now
1777 * so this may not be a valid pointer any longer, but
1778 * that's fine - it still serves to distinguish
1779 * between two tasks started (effectively) simultaneously.
1780 */
1781 return t1 > t2;
1782 }
1783}
1784
1785/*
1786 * This function is a callback from heap_insert() and is used to order
1787 * the heap.
1788 * In this case we order the heap in descending task start time.
1789 */
1790static inline int started_after(void *p1, void *p2)
1791{
1792 struct task_struct *t1 = p1;
1793 struct task_struct *t2 = p2;
1794 return started_after_time(t1, &t2->start_time, t2);
1795}
1796
1797/**
1798 * cgroup_scan_tasks - iterate though all the tasks in a cgroup
1799 * @scan: struct cgroup_scanner containing arguments for the scan
1800 *
1801 * Arguments include pointers to callback functions test_task() and
1802 * process_task().
1803 * Iterate through all the tasks in a cgroup, calling test_task() for each,
1804 * and if it returns true, call process_task() for it also.
1805 * The test_task pointer may be NULL, meaning always true (select all tasks).
1806 * Effectively duplicates cgroup_iter_{start,next,end}()
1807 * but does not lock css_set_lock for the call to process_task().
1808 * The struct cgroup_scanner may be embedded in any structure of the caller's
1809 * creation.
1810 * It is guaranteed that process_task() will act on every task that
1811 * is a member of the cgroup for the duration of this call. This
1812 * function may or may not call process_task() for tasks that exit
1813 * or move to a different cgroup during the call, or are forked or
1814 * move into the cgroup during the call.
1815 *
1816 * Note that test_task() may be called with locks held, and may in some
1817 * situations be called multiple times for the same task, so it should
1818 * be cheap.
1819 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
1820 * pre-allocated and will be used for heap operations (and its "gt" member will
1821 * be overwritten), else a temporary heap will be used (allocation of which
1822 * may cause this function to fail).
1823 */
1824int cgroup_scan_tasks(struct cgroup_scanner *scan)
1825{
1826 int retval, i;
1827 struct cgroup_iter it;
1828 struct task_struct *p, *dropped;
1829 /* Never dereference latest_task, since it's not refcounted */
1830 struct task_struct *latest_task = NULL;
1831 struct ptr_heap tmp_heap;
1832 struct ptr_heap *heap;
1833 struct timespec latest_time = { 0, 0 };
1834
1835 if (scan->heap) {
1836 /* The caller supplied our heap and pre-allocated its memory */
1837 heap = scan->heap;
1838 heap->gt = &started_after;
1839 } else {
1840 /* We need to allocate our own heap memory */
1841 heap = &tmp_heap;
1842 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
1843 if (retval)
1844 /* cannot allocate the heap */
1845 return retval;
1846 }
1847
1848 again:
1849 /*
1850 * Scan tasks in the cgroup, using the scanner's "test_task" callback
1851 * to determine which are of interest, and using the scanner's
1852 * "process_task" callback to process any of them that need an update.
1853 * Since we don't want to hold any locks during the task updates,
1854 * gather tasks to be processed in a heap structure.
1855 * The heap is sorted by descending task start time.
1856 * If the statically-sized heap fills up, we overflow tasks that
1857 * started later, and in future iterations only consider tasks that
1858 * started after the latest task in the previous pass. This
1859 * guarantees forward progress and that we don't miss any tasks.
1860 */
1861 heap->size = 0;
1862 cgroup_iter_start(scan->cg, &it);
1863 while ((p = cgroup_iter_next(scan->cg, &it))) {
1864 /*
1865 * Only affect tasks that qualify per the caller's callback,
1866 * if he provided one
1867 */
1868 if (scan->test_task && !scan->test_task(p, scan))
1869 continue;
1870 /*
1871 * Only process tasks that started after the last task
1872 * we processed
1873 */
1874 if (!started_after_time(p, &latest_time, latest_task))
1875 continue;
1876 dropped = heap_insert(heap, p);
1877 if (dropped == NULL) {
1878 /*
1879 * The new task was inserted; the heap wasn't
1880 * previously full
1881 */
1882 get_task_struct(p);
1883 } else if (dropped != p) {
1884 /*
1885 * The new task was inserted, and pushed out a
1886 * different task
1887 */
1888 get_task_struct(p);
1889 put_task_struct(dropped);
1890 }
1891 /*
1892 * Else the new task was newer than anything already in
1893 * the heap and wasn't inserted
1894 */
1895 }
1896 cgroup_iter_end(scan->cg, &it);
1897
1898 if (heap->size) {
1899 for (i = 0; i < heap->size; i++) {
1900 struct task_struct *p = heap->ptrs[i];
1901 if (i == 0) {
1902 latest_time = p->start_time;
1903 latest_task = p;
1904 }
1905 /* Process the task per the caller's callback */
1906 scan->process_task(p, scan);
1907 put_task_struct(p);
1908 }
1909 /*
1910 * If we had to process any tasks at all, scan again
1911 * in case some of them were in the middle of forking
1912 * children that didn't get processed.
1913 * Not the most efficient way to do it, but it avoids
1914 * having to take callback_mutex in the fork path
1915 */
1916 goto again;
1917 }
1918 if (heap == &tmp_heap)
1919 heap_free(&tmp_heap);
1920 return 0;
1921}
1922
1749/* 1923/*
1750 * Stuff for reading the 'tasks' file. 1924 * Stuff for reading the 'tasks' file.
1751 * 1925 *