author    Cliff Wickman <cpw@sgi.com>	2008-02-07 03:14:42 -0500
committer Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-07 11:42:22 -0500
commit    31a7df01fd0cd786f60873a921aecafac148c290
tree      221f00c864c50e7dc4719cb4de09292040567c55
parent    dfc05c259e424e4160c66eab728f55cc4b53fd75
cgroups: mechanism to process each task in a cgroup
Provide cgroup_scan_tasks(), which iterates through every task in a cgroup,
calling a test function and a process function for each. The process function
is called without holding css_set_lock.

The idea is David Rientjes's: he predicted that such a function will make it
much easier in the future to extend code that needs access to each task in a
cgroup without holding the lock.
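
For illustration only, a controller could drive the scanner roughly as in the
sketch below. The example_* names and the PF_EXITING filter are invented for
this sketch and are not part of the patch:

  #include <linux/cgroup.h>
  #include <linux/sched.h>
  #include <linux/kernel.h>

  /* Runs under css_set_lock, possibly more than once per task: keep it cheap. */
  static int example_test_task(struct task_struct *p, struct cgroup_scanner *scan)
  {
  	return !(p->flags & PF_EXITING);
  }

  /* Runs without css_set_lock held, so heavier per-task work is safe here. */
  static void example_process_task(struct task_struct *p,
  				 struct cgroup_scanner *scan)
  {
  	printk(KERN_INFO "scanned pid %d\n", p->pid);
  }

  static int example_scan_cgroup(struct cgroup *cgrp)
  {
  	struct cgroup_scanner scan = {
  		.cg = cgrp,
  		.test_task = example_test_task,
  		.process_task = example_process_task,
  		.heap = NULL,	/* let cgroup_scan_tasks() use a temporary heap */
  	};

  	/* Returns 0 on success, or an errno if the temporary heap cannot be allocated. */
  	return cgroup_scan_tasks(&scan);
  }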
[akpm@linux-foundation.org: cleanup]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 include/linux/cgroup.h |  14
 kernel/cgroup.c        | 198
 2 files changed, 200 insertions(+), 12 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d8e92223a79c..8675c691d3e2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -14,6 +14,7 @@
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
 #include <linux/cgroupstats.h>
+#include <linux/prio_heap.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -207,6 +208,14 @@ struct cftype {
 	int (*release) (struct inode *inode, struct file *file);
 };
 
+struct cgroup_scanner {
+	struct cgroup *cg;
+	int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
+	void (*process_task)(struct task_struct *p,
+			struct cgroup_scanner *scan);
+	struct ptr_heap *heap;
+};
+
 /* Add a new file to the given cgroup directory. Should only be
  * called by subsystems from within a populate() method */
 int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
@@ -299,11 +308,16 @@ struct cgroup_iter {
  * returns NULL or until you want to end the iteration
  *
  * 3) call cgroup_iter_end() to destroy the iterator.
+ *
+ * Or, call cgroup_scan_tasks() to iterate through every task in a cpuset.
+ *    - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
+ *      callback, but not while calling the process_task() callback.
  */
 void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
 struct task_struct *cgroup_iter_next(struct cgroup *cont,
 					struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
+int cgroup_scan_tasks(struct cgroup_scanner *scan);
 
 #else /* !CONFIG_CGROUPS */
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4e8b16a8266c..bcc7a6e8e3c0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1695,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
 	it->task = cg->tasks.next;
 }
 
+/*
+ * To reduce the fork() overhead for systems that are not actually
+ * using their cgroups capability, we don't maintain the lists running
+ * through each css_set to its tasks until we see the list actually
+ * used - in other words after the first call to cgroup_iter_start().
+ *
+ * The tasklist_lock is not held here, as do_each_thread() and
+ * while_each_thread() are protected by RCU.
+ */
+void cgroup_enable_task_cg_lists(void)
+{
+	struct task_struct *p, *g;
+	write_lock(&css_set_lock);
+	use_task_css_set_links = 1;
+	do_each_thread(g, p) {
+		task_lock(p);
+		if (list_empty(&p->cg_list))
+			list_add(&p->cg_list, &p->cgroups->tasks);
+		task_unlock(p);
+	} while_each_thread(g, p);
+	write_unlock(&css_set_lock);
+}
+
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 {
 	/*
@@ -1702,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 	 * we need to enable the list linking each css_set to its
 	 * tasks, and fix up all existing tasks.
 	 */
-	if (!use_task_css_set_links) {
-		struct task_struct *p, *g;
-		write_lock(&css_set_lock);
-		use_task_css_set_links = 1;
-		do_each_thread(g, p) {
-			task_lock(p);
-			if (list_empty(&p->cg_list))
-				list_add(&p->cg_list, &p->cgroups->tasks);
-			task_unlock(p);
-		} while_each_thread(g, p);
-		write_unlock(&css_set_lock);
-	}
+	if (!use_task_css_set_links)
+		cgroup_enable_task_cg_lists();
+
 	read_lock(&css_set_lock);
 	it->cg_link = &cgrp->css_sets;
 	cgroup_advance_iter(cgrp, it);
@@ -1746,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
 	read_unlock(&css_set_lock);
 }
 
+static inline int started_after_time(struct task_struct *t1,
+				     struct timespec *time,
+				     struct task_struct *t2)
+{
+	int start_diff = timespec_compare(&t1->start_time, time);
+	if (start_diff > 0) {
+		return 1;
+	} else if (start_diff < 0) {
+		return 0;
+	} else {
+		/*
+		 * Arbitrarily, if two processes started at the same
+		 * time, we'll say that the lower pointer value
+		 * started first. Note that t2 may have exited by now
+		 * so this may not be a valid pointer any longer, but
+		 * that's fine - it still serves to distinguish
+		 * between two tasks started (effectively) simultaneously.
+		 */
+		return t1 > t2;
+	}
+}
+
+/*
+ * This function is a callback from heap_insert() and is used to order
+ * the heap.
+ * In this case we order the heap in descending task start time.
+ */
+static inline int started_after(void *p1, void *p2)
+{
+	struct task_struct *t1 = p1;
+	struct task_struct *t2 = p2;
+	return started_after_time(t1, &t2->start_time, t2);
+}
+
+/**
+ * cgroup_scan_tasks - iterate though all the tasks in a cgroup
+ * @scan: struct cgroup_scanner containing arguments for the scan
+ *
+ * Arguments include pointers to callback functions test_task() and
+ * process_task().
+ * Iterate through all the tasks in a cgroup, calling test_task() for each,
+ * and if it returns true, call process_task() for it also.
+ * The test_task pointer may be NULL, meaning always true (select all tasks).
+ * Effectively duplicates cgroup_iter_{start,next,end}()
+ * but does not lock css_set_lock for the call to process_task().
+ * The struct cgroup_scanner may be embedded in any structure of the caller's
+ * creation.
+ * It is guaranteed that process_task() will act on every task that
+ * is a member of the cgroup for the duration of this call. This
+ * function may or may not call process_task() for tasks that exit
+ * or move to a different cgroup during the call, or are forked or
+ * move into the cgroup during the call.
+ *
+ * Note that test_task() may be called with locks held, and may in some
+ * situations be called multiple times for the same task, so it should
+ * be cheap.
+ * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
+ * pre-allocated and will be used for heap operations (and its "gt" member will
+ * be overwritten), else a temporary heap will be used (allocation of which
+ * may cause this function to fail).
+ */
+int cgroup_scan_tasks(struct cgroup_scanner *scan)
+{
+	int retval, i;
+	struct cgroup_iter it;
+	struct task_struct *p, *dropped;
+	/* Never dereference latest_task, since it's not refcounted */
+	struct task_struct *latest_task = NULL;
+	struct ptr_heap tmp_heap;
+	struct ptr_heap *heap;
+	struct timespec latest_time = { 0, 0 };
+
+	if (scan->heap) {
+		/* The caller supplied our heap and pre-allocated its memory */
+		heap = scan->heap;
+		heap->gt = &started_after;
+	} else {
+		/* We need to allocate our own heap memory */
+		heap = &tmp_heap;
+		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+		if (retval)
+			/* cannot allocate the heap */
+			return retval;
+	}
+
+ again:
+	/*
+	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
+	 * to determine which are of interest, and using the scanner's
+	 * "process_task" callback to process any of them that need an update.
+	 * Since we don't want to hold any locks during the task updates,
+	 * gather tasks to be processed in a heap structure.
+	 * The heap is sorted by descending task start time.
+	 * If the statically-sized heap fills up, we overflow tasks that
+	 * started later, and in future iterations only consider tasks that
+	 * started after the latest task in the previous pass. This
+	 * guarantees forward progress and that we don't miss any tasks.
+	 */
+	heap->size = 0;
+	cgroup_iter_start(scan->cg, &it);
+	while ((p = cgroup_iter_next(scan->cg, &it))) {
+		/*
+		 * Only affect tasks that qualify per the caller's callback,
+		 * if he provided one
+		 */
+		if (scan->test_task && !scan->test_task(p, scan))
+			continue;
+		/*
+		 * Only process tasks that started after the last task
+		 * we processed
+		 */
+		if (!started_after_time(p, &latest_time, latest_task))
+			continue;
+		dropped = heap_insert(heap, p);
+		if (dropped == NULL) {
+			/*
+			 * The new task was inserted; the heap wasn't
+			 * previously full
+			 */
+			get_task_struct(p);
+		} else if (dropped != p) {
+			/*
+			 * The new task was inserted, and pushed out a
+			 * different task
+			 */
+			get_task_struct(p);
+			put_task_struct(dropped);
+		}
+		/*
+		 * Else the new task was newer than anything already in
+		 * the heap and wasn't inserted
+		 */
+	}
+	cgroup_iter_end(scan->cg, &it);
+
+	if (heap->size) {
+		for (i = 0; i < heap->size; i++) {
+			struct task_struct *p = heap->ptrs[i];
+			if (i == 0) {
+				latest_time = p->start_time;
+				latest_task = p;
+			}
+			/* Process the task per the caller's callback */
+			scan->process_task(p, scan);
+			put_task_struct(p);
+		}
+		/*
+		 * If we had to process any tasks at all, scan again
+		 * in case some of them were in the middle of forking
+		 * children that didn't get processed.
+		 * Not the most efficient way to do it, but it avoids
+		 * having to take callback_mutex in the fork path
+		 */
+		goto again;
+	}
+	if (heap == &tmp_heap)
+		heap_free(&tmp_heap);
+	return 0;
+}
+
 /*
  * Stuff for reading the 'tasks' file.
  *