author		Tejun Heo <tj@kernel.org>	2014-02-25 10:04:01 -0500
committer	Tejun Heo <tj@kernel.org>	2014-02-25 10:04:01 -0500
commit		b3dc094e93905ae9c1bc0815402ad8e5b203d068 (patch)
tree		6d99ba4737ccbf7ce94f06937db571a7fcb902a4
parent		c75611282cf1bf717c1866e7a7eb4d0743815187 (diff)
cgroup: use css_set->mg_tasks to track target tasks during migration
Currently, while migrating tasks from one cgroup to another,
cgroup_attach_task() builds a flex array of all target tasks;
unfortunately, this has a couple of issues.

* Flex array has a size limit.  On 64bit, struct task_and_cgroup is
  24 bytes, making the flex element limit around 87k.  It is a high
  number but not impossible to hit.  This means that the current cgroup
  implementation can't migrate a process with more than 87k threads.

* Process migration involves memory allocation whose size is dependent
  on the number of threads the process has.  This means that cgroup core
  can't guarantee success or failure of multi-process migrations as
  memory allocation failure can happen in the middle.  This is in part
  because cgroup can't grab threadgroup locks of multiple processes at
  the same time, so when there are multiple processes to migrate, it is
  impossible to tell how many tasks are to be migrated beforehand.

  Note that this already affects cgroup_transfer_tasks().  cgroup
  currently cannot guarantee atomic success or failure of the operation.
  It may fail in the middle and after such failure cgroup doesn't have
  enough information to roll back properly.  It just aborts with some
  tasks migrated and others not.

To resolve the situation, this patch updates the migration path to use
task->cg_list to track target tasks.  The previous patch already added
css_set->mg_tasks and updated iterations in non-migration paths to
include them during task migration.  This patch updates the migration
path to actually make use of it.

Instead of putting onto a flex_array, each target task is moved from
its css_set->tasks list to css_set->mg_tasks, and the migration path
keeps track of all the source css_sets and the associated cgroups.
Once all source css_sets are determined, the destination css_set for
each is determined, linked to the matching source css_set and put on a
separate list.

To iterate the target tasks, the migration path just needs to iterate
through either the source or target css_sets, depending on whether
migration has been committed or not, and the tasks on their ->mg_tasks
lists.  cgroup_taskset is updated to contain the list_heads for source
and target css_sets and the iteration cursor.  cgroup_taskset_*() are
accordingly updated to walk through css_sets and their ->mg_tasks.

This resolves the above listed issues with moderate additional
complexity.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
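For reference, the ~87k figure can be derived roughly as follows (a
back-of-the-envelope sketch, assuming a 4K PAGE_SIZE and the flex_array
layout of the time; the base page also holds a small header, so the exact
limit is slightly lower):

	/*
	 * flex_array keeps elements in PAGE_SIZE parts and the part pointers
	 * in a single PAGE_SIZE base page, so approximately:
	 *
	 *   elements per part  = 4096 / sizeof(struct task_and_cgroup)
	 *                      = 4096 / 24             = 170
	 *   part pointers      ~ 4096 / sizeof(void *) = 512
	 *   max elements       ~ 512 * 170             = 87040   (~87k)
	 */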
-rw-r--r--  include/linux/cgroup.h  |  16
-rw-r--r--  kernel/cgroup.c         | 223
2 files changed, 131 insertions(+), 108 deletions(-)
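The controller-facing iteration interface is unchanged by this patch; only
the storage behind a cgroup_taskset moves from a flex_array to the css_set
lists.  As a rough, hypothetical sketch (example_attach() is not part of the
patch), a controller's ->attach() callback walks the taskset the same way
before and after:

	/*
	 * Hypothetical controller callback, shown only to illustrate the
	 * unchanged cgroup_taskset_first()/cgroup_taskset_next() interface.
	 */
	static void example_attach(struct cgroup_subsys_state *css,
				   struct cgroup_taskset *tset)
	{
		struct task_struct *task;

		for (task = cgroup_taskset_first(tset); task;
		     task = cgroup_taskset_next(tset)) {
			/* per-task attach work for this controller goes here */
		}
	}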
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 528e2aed36c3..3a1cb265afd6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -346,6 +346,22 @@ struct css_set {
 	 */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 
+	/*
+	 * List of csets participating in the on-going migration either as
+	 * source or destination.  Protected by cgroup_mutex.
+	 */
+	struct list_head mg_node;
+
+	/*
+	 * If this cset is acting as the source of migration the following
+	 * two fields are set.  mg_src_cgrp is the source cgroup of the
+	 * on-going migration and mg_dst_cset is the destination cset the
+	 * target tasks on this cset should be migrated to.  Protected by
+	 * cgroup_mutex.
+	 */
+	struct cgroup *mg_src_cgrp;
+	struct css_set *mg_dst_cset;
+
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
 };
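As an illustrative sketch (not text from the patch), the new fields tie each
source cset to its destination during a migration: a source cset is linked
onto the taskset's src_csets list through ->mg_node, has ->mg_src_cgrp set to
the cgroup it is leaving, and points at the cset its tasks will land in
through ->mg_dst_cset, while the tasks being moved sit on the csets'
->mg_tasks lists:

	/*
	 * Migration-time linkage (sketch):
	 *
	 *   tset.src_csets --mg_node--> src_cset_a --mg_dst_cset--> dst_cset_a
	 *                  --mg_node--> src_cset_b --mg_dst_cset--> dst_cset_b
	 *   tset.dst_csets --mg_node--> dst_cset_a, dst_cset_b
	 *
	 * Target tasks sit on each cset's ->mg_tasks list, threaded through
	 * their ->cg_list links.
	 */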
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b80c611ff836..5def4a800425 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
 #include <linux/delay.h>
 
@@ -645,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	INIT_LIST_HEAD(&cset->cgrp_links);
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
+	INIT_LIST_HEAD(&cset->mg_node);
 	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
@@ -1639,20 +1639,26 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
-/*
- * Control Group taskset
- */
-struct task_and_cgroup {
-	struct task_struct *task;
-	struct cgroup *cgrp;
-	struct css_set *cset;
-};
-
+/* used to track tasks and other necessary states during migration */
 struct cgroup_taskset {
-	struct task_and_cgroup single;
-	struct flex_array *tc_array;
-	int tc_array_len;
-	int idx;
+	/* the src and dst cset list running through cset->mg_node */
+	struct list_head src_csets;
+	struct list_head dst_csets;
+
+	/*
+	 * Fields for cgroup_taskset_*() iteration.
+	 *
+	 * Before migration is committed, the target migration tasks are on
+	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
+	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
+	 * or ->dst_csets depending on whether migration is committed.
+	 *
+	 * ->cur_csets and ->cur_task point to the current task position
+	 * during iteration.
+	 */
+	struct list_head *csets;
+	struct css_set *cur_cset;
+	struct task_struct *cur_task;
 };
 
 /**
@@ -1663,12 +1669,10 @@ struct cgroup_taskset {
  */
 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
 {
-	if (tset->tc_array) {
-		tset->idx = 0;
-		return cgroup_taskset_next(tset);
-	} else {
-		return tset->single.task;
-	}
+	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+	tset->cur_task = NULL;
+
+	return cgroup_taskset_next(tset);
 }
 
 /**
@@ -1680,13 +1684,27 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
  */
 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
 {
-	struct task_and_cgroup *tc;
+	struct css_set *cset = tset->cur_cset;
+	struct task_struct *task = tset->cur_task;
 
-	if (!tset->tc_array || tset->idx >= tset->tc_array_len)
-		return NULL;
+	while (&cset->mg_node != tset->csets) {
+		if (!task)
+			task = list_first_entry(&cset->mg_tasks,
+						struct task_struct, cg_list);
+		else
+			task = list_next_entry(task, cg_list);
 
-	tc = flex_array_get(tset->tc_array, tset->idx++);
-	return tc->task;
+		if (&task->cg_list != &cset->mg_tasks) {
+			tset->cur_cset = cset;
+			tset->cur_task = task;
+			return task;
+		}
+
+		cset = list_next_entry(cset, mg_node);
+		task = NULL;
+	}
+
+	return NULL;
 }
 
 /**
@@ -1714,11 +1732,13 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	old_cset = task_css_set(tsk);
 
+	get_css_set(new_cset);
+
 	task_lock(tsk);
 	rcu_assign_pointer(tsk->cgroups, new_cset);
 	task_unlock(tsk);
 
-	list_move(&tsk->cg_list, &new_cset->tasks);
+	list_move(&tsk->cg_list, &new_cset->mg_tasks);
 
 	/*
 	 * We just gained a reference on old_cset by taking it from the
@@ -1741,80 +1761,58 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
 			      bool threadgroup)
 {
-	int ret, i, group_size;
-	struct cgroupfs_root *root = cgrp->root;
+	struct cgroup_taskset tset = {
+		.src_csets = LIST_HEAD_INIT(tset.src_csets),
+		.dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+		.csets = &tset.src_csets,
+	};
 	struct cgroup_subsys_state *css, *failed_css = NULL;
-	/* threadgroup list cursor and array */
-	struct task_struct *task;
-	struct task_and_cgroup *tc;
-	struct flex_array *group;
-	struct cgroup_taskset tset = { };
-
-	/*
-	 * step 0: in order to do expensive, possibly blocking operations for
-	 * every thread, we cannot iterate the thread group list, since it needs
-	 * rcu or tasklist locked. instead, build an array of all threads in the
-	 * group - group_rwsem prevents new threads from appearing, and if
-	 * threads exit, this will just be an over-estimate.
-	 */
-	if (threadgroup)
-		group_size = get_nr_threads(leader);
-	else
-		group_size = 1;
-	/* flex_array supports very large thread-groups better than kmalloc. */
-	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
-	if (!group)
-		return -ENOMEM;
-	/* pre-allocate to guarantee space while iterating in rcu read-side. */
-	ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
-	if (ret)
-		goto out_free_group_list;
+	struct css_set *cset, *tmp_cset;
+	struct task_struct *task, *tmp_task;
+	int i, ret;
 
-	i = 0;
 	/*
 	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
 	 * already PF_EXITING could be freed from underneath us unless we
 	 * take an rcu_read_lock.
 	 */
-	down_read(&css_set_rwsem);
+	down_write(&css_set_rwsem);
 	rcu_read_lock();
 	task = leader;
 	do {
-		struct task_and_cgroup ent;
+		struct cgroup *src_cgrp;
 
 		/* @task either already exited or can't exit until the end */
 		if (task->flags & PF_EXITING)
 			goto next;
 
-		/* as per above, nr_threads may decrease, but not increase. */
-		BUG_ON(i >= group_size);
-		ent.task = task;
-		ent.cgrp = task_cgroup_from_root(task, root);
+		cset = task_css_set(task);
+		src_cgrp = task_cgroup_from_root(task, cgrp->root);
+
 		/* nothing to do if this task is already in the cgroup */
-		if (ent.cgrp == cgrp)
+		if (src_cgrp == cgrp)
 			goto next;
-		/*
-		 * saying GFP_ATOMIC has no effect here because we did prealloc
-		 * earlier, but it's good form to communicate our expectations.
-		 */
-		ret = flex_array_put(group, i, &ent, GFP_ATOMIC);
-		BUG_ON(ret != 0);
-		i++;
+
+		if (!cset->mg_src_cgrp) {
+			WARN_ON(!list_empty(&cset->mg_tasks));
+			WARN_ON(!list_empty(&cset->mg_node));
+
+			cset->mg_src_cgrp = src_cgrp;
+			list_add(&cset->mg_node, &tset.src_csets);
+			get_css_set(cset);
+		}
+
+		list_move(&task->cg_list, &cset->mg_tasks);
 	next:
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	up_read(&css_set_rwsem);
-	/* remember the number of threads in the array for later. */
-	group_size = i;
-	tset.tc_array = group;
-	tset.tc_array_len = group_size;
+	up_write(&css_set_rwsem);
 
 	/* methods shouldn't be called if no task is actually migrating */
-	ret = 0;
-	if (!group_size)
-		goto out_free_group_list;
+	if (list_empty(&tset.src_csets))
+		return 0;
 
 	/*
 	 * step 1: check that we can legitimately attach to the cgroup.
@@ -1833,16 +1831,21 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
 	 * step 2: make sure css_sets exist for all threads to be migrated.
 	 * we use find_css_set, which allocates a new one if necessary.
 	 */
-	for (i = 0; i < group_size; i++) {
-		struct css_set *old_cset;
+	list_for_each_entry(cset, &tset.src_csets, mg_node) {
+		struct css_set *dst_cset;
 
-		tc = flex_array_get(group, i);
-		old_cset = task_css_set(tc->task);
-		tc->cset = find_css_set(old_cset, cgrp);
-		if (!tc->cset) {
+		dst_cset = find_css_set(cset, cgrp);
+		if (!dst_cset) {
 			ret = -ENOMEM;
-			goto out_put_css_set_refs;
+			goto out_release_tset;
 		}
+
+		if (list_empty(&dst_cset->mg_node))
+			list_add(&dst_cset->mg_node, &tset.dst_csets);
+		else
+			put_css_set(dst_cset, false);
+
+		cset->mg_dst_cset = dst_cset;
 	}
 
 	/*
@@ -1851,12 +1854,17 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
 	 * failure cases after here, so this is the commit point.
 	 */
 	down_write(&css_set_rwsem);
-	for (i = 0; i < group_size; i++) {
-		tc = flex_array_get(group, i);
-		cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
+	list_for_each_entry(cset, &tset.src_csets, mg_node) {
+		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+			cgroup_task_migrate(cset->mg_src_cgrp, task,
+					    cset->mg_dst_cset);
 	}
 	up_write(&css_set_rwsem);
-	/* nothing is sensitive to fork() after this point. */
+
+	/* migration is committed, all target tasks are now on dst_csets */
+	tset.csets = &tset.dst_csets;
+
+	/* nothing is sensitive to fork() after this point */
 
 	/*
 	 * step 4: do subsystem attach callbacks.
@@ -1865,30 +1873,27 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
 		if (css->ss->attach)
 			css->ss->attach(css, &tset);
 
-	/*
-	 * step 5: success! and cleanup
-	 */
 	ret = 0;
-out_put_css_set_refs:
-	if (ret) {
-		for (i = 0; i < group_size; i++) {
-			tc = flex_array_get(group, i);
-			if (!tc->cset)
-				break;
-			put_css_set(tc->cset, false);
-		}
-	}
+	goto out_release_tset;
+
 out_cancel_attach:
-	if (ret) {
-		for_each_css(css, i, cgrp) {
-			if (css == failed_css)
-				break;
-			if (css->ss->cancel_attach)
-				css->ss->cancel_attach(css, &tset);
-		}
+	for_each_css(css, i, cgrp) {
+		if (css == failed_css)
+			break;
+		if (css->ss->cancel_attach)
+			css->ss->cancel_attach(css, &tset);
 	}
-out_free_group_list:
-	flex_array_free(group);
+out_release_tset:
+	down_write(&css_set_rwsem);
+	list_splice_init(&tset.dst_csets, &tset.src_csets);
+	list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
+		list_splice_init(&cset->mg_tasks, &cset->tasks);
+		cset->mg_dst_cset = NULL;
+		cset->mg_src_cgrp = NULL;
+		list_del_init(&cset->mg_node);
+		put_css_set_locked(cset, false);
+	}
+	up_write(&css_set_rwsem);
 	return ret;
 }
 
@@ -3895,6 +3900,8 @@ int __init cgroup_init_early(void)
 	atomic_set(&init_css_set.refcount, 1);
 	INIT_LIST_HEAD(&init_css_set.cgrp_links);
 	INIT_LIST_HEAD(&init_css_set.tasks);
+	INIT_LIST_HEAD(&init_css_set.mg_tasks);
+	INIT_LIST_HEAD(&init_css_set.mg_node);
 	INIT_HLIST_NODE(&init_css_set.hlist);
 	css_set_count = 1;
 	init_cgroup_root(&cgroup_dummy_root);