path: root/mm/memcontrol.c
author	Johannes Weiner <jweiner@redhat.com>	2012-01-12 20:17:48 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-01-12 23:13:04 -0500
commit	9f3a0d0933de079665ec1b498947ffbf805b0018 (patch)
tree	495b27908e328eff16a2269734109b8d6e5be95d /mm/memcontrol.c
parent	ab936cbcd02072a34b60d268f94440fd5cf1970b (diff)
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:

Memory control groups are currently bolted onto the side of traditional memory management in places where better integration would be preferable. To reclaim memory, for example, memory control groups maintain their own LRU list and reclaim strategy aside from the global per-zone LRU list reclaim. But an extra list head for each existing page frame is expensive and maintaining it requires additional code.

This patchset disables the global per-zone LRU lists on memory cgroup configurations and converts all their users to operate on the per-memory cgroup lists instead. As LRU pages are then exclusively on one list, this saves two list pointers for each page frame in the system:

page_cgroup array size with 4G physical memory

  vanilla: allocated 31457280 bytes of page_cgroup
  patched: allocated 15728640 bytes of page_cgroup

At the same time, system performance for various workloads is unaffected:

100G sparse file cat, 4G physical memory, 10 runs, to test for code bloat in the traditional LRU handling and kswapd & direct reclaim paths, without/with the memory controller configured in

  vanilla: 71.603(0.207) seconds
  patched: 71.640(0.156) seconds

  vanilla: 79.558(0.288) seconds
  patched: 77.233(0.147) seconds

100G sparse file cat in 1G memory cgroup, 10 runs, to test for code bloat in the traditional memory cgroup LRU handling and reclaim path

  vanilla: 96.844(0.281) seconds
  patched: 94.454(0.311) seconds

4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M swap on SSD, 10 runs, to test for regressions in kswapd & direct reclaim using per-memcg LRU lists with multiple memcgs and multiple allocators within each memcg

  vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
  patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]

16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M swap on SSD, 10 runs, to test for regressions in hierarchical memcg setups

  vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
  patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]

This patch:

There are currently two different implementations of iterating over a memory cgroup hierarchy tree. Consolidate them into one worker function and base the convenience looping-macros on top of it.

Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
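Usage sketch (schematic only, based on how the converted call sites in this patch use the consolidated primitives; identifiers such as memcg, root_memcg, gfp_mask and noswap are assumed to be in scope as in mm/memcontrol.c, so the snippet is not meant to compile on its own):

	struct mem_cgroup *iter;

	/* Full-tree walk: leaving the loop early must drop the reference
	 * held on the current position via mem_cgroup_iter_break(). */
	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
		iter->oom_lock = true;
	}

	/* Reclaim-style walk: passing reclaim=true makes the iterator resume
	 * from root->last_scanned_child, keeping successive passes fair. */
	struct mem_cgroup *victim = NULL;

	while ((victim = mem_cgroup_iter(root_memcg, victim, true))) {
		if (try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);

Both patterns funnel through the same worker, which is what allows mem_cgroup_select_victim() and the for_each_mem_cgroup_tree_cond() machinery to be removed below.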
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  199
1 file changed, 75 insertions(+), 124 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b2d4036f1cd..6edef95fecf4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -853,83 +853,76 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-/* The caller has to guarantee "mem" exists before calling this */
-static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg)
+static struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+					  struct mem_cgroup *prev,
+					  bool reclaim)
 {
-	struct cgroup_subsys_state *css;
-	int found;
+	struct mem_cgroup *memcg = NULL;
+	int id = 0;
 
-	if (!memcg) /* ROOT cgroup has the smallest ID */
-		return root_mem_cgroup; /*css_put/get against root is ignored*/
-	if (!memcg->use_hierarchy) {
-		if (css_tryget(&memcg->css))
-			return memcg;
-		return NULL;
-	}
-	rcu_read_lock();
-	/*
-	 * searching a memory cgroup which has the smallest ID under given
-	 * ROOT cgroup. (ID >= 1)
-	 */
-	css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
-	if (css && css_tryget(css))
-		memcg = container_of(css, struct mem_cgroup, css);
-	else
-		memcg = NULL;
-	rcu_read_unlock();
-	return memcg;
-}
+	if (!root)
+		root = root_mem_cgroup;
 
-static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
-		struct mem_cgroup *root,
-		bool cond)
-{
-	int nextid = css_id(&iter->css) + 1;
-	int found;
-	int hierarchy_used;
-	struct cgroup_subsys_state *css;
+	if (prev && !reclaim)
+		id = css_id(&prev->css);
 
-	hierarchy_used = iter->use_hierarchy;
+	if (prev && prev != root)
+		css_put(&prev->css);
 
-	css_put(&iter->css);
-	/* If no ROOT, walk all, ignore hierarchy */
-	if (!cond || (root && !hierarchy_used))
-		return NULL;
+	if (!root->use_hierarchy && root != root_mem_cgroup) {
+		if (prev)
+			return NULL;
+		return root;
+	}
 
-	if (!root)
-		root = root_mem_cgroup;
+	while (!memcg) {
+		struct cgroup_subsys_state *css;
 
-	do {
-		iter = NULL;
-		rcu_read_lock();
+		if (reclaim)
+			id = root->last_scanned_child;
 
-		css = css_get_next(&mem_cgroup_subsys, nextid,
-				&root->css, &found);
-		if (css && css_tryget(css))
-			iter = container_of(css, struct mem_cgroup, css);
+		rcu_read_lock();
+		css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
+		if (css) {
+			if (css == &root->css || css_tryget(css))
+				memcg = container_of(css,
+						     struct mem_cgroup, css);
+		} else
+			id = 0;
 		rcu_read_unlock();
-		/* If css is NULL, no more cgroups will be found */
-		nextid = found + 1;
-	} while (css && !iter);
 
-	return iter;
+		if (reclaim)
+			root->last_scanned_child = id;
+
+		if (prev && !css)
+			return NULL;
+	}
+	return memcg;
 }
-/*
- * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
- * be careful that "break" loop is not allowed. We have reference count.
- * Instead of that modify "cond" to be false and "continue" to exit the loop.
- */
-#define for_each_mem_cgroup_tree_cond(iter, root, cond)	\
-	for (iter = mem_cgroup_start_loop(root);\
-	     iter != NULL;\
-	     iter = mem_cgroup_get_next(iter, root, cond))
 
-#define for_each_mem_cgroup_tree(iter, root) \
-	for_each_mem_cgroup_tree_cond(iter, root, true)
+static void mem_cgroup_iter_break(struct mem_cgroup *root,
+				  struct mem_cgroup *prev)
+{
+	if (!root)
+		root = root_mem_cgroup;
+	if (prev && prev != root)
+		css_put(&prev->css);
+}
 
-#define for_each_mem_cgroup_all(iter) \
-	for_each_mem_cgroup_tree_cond(iter, NULL, true)
+/*
+ * Iteration constructs for visiting all cgroups (under a tree).  If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root)		\
+	for (iter = mem_cgroup_iter(root, NULL, false);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(root, iter, false))
 
+#define for_each_mem_cgroup(iter)			\
+	for (iter = mem_cgroup_iter(NULL, NULL, false);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(NULL, iter, false))
 
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
@@ -1536,43 +1529,6 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
 	return min(limit, memsw);
 }
 
-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
- */
-static struct mem_cgroup *
-mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
-{
-	struct mem_cgroup *ret = NULL;
-	struct cgroup_subsys_state *css;
-	int nextid, found;
-
-	if (!root_memcg->use_hierarchy) {
-		css_get(&root_memcg->css);
-		ret = root_memcg;
-	}
-
-	while (!ret) {
-		rcu_read_lock();
-		nextid = root_memcg->last_scanned_child + 1;
-		css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,
-				   &found);
-		if (css && css_tryget(css))
-			ret = container_of(css, struct mem_cgroup, css);
-
-		rcu_read_unlock();
-		/* Updates scanning parameter */
-		if (!css) {
-			/* this means start scan from ID:1 */
-			root_memcg->last_scanned_child = 0;
-		} else
-			root_memcg->last_scanned_child = found;
-	}
-
-	return ret;
-}
-
 /**
  * test_mem_cgroup_node_reclaimable
  * @mem: the target memcg
@@ -1728,7 +1684,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
 						unsigned long reclaim_options,
 						unsigned long *total_scanned)
 {
-	struct mem_cgroup *victim;
+	struct mem_cgroup *victim = NULL;
 	int ret, total = 0;
 	int loop = 0;
 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
@@ -1744,8 +1700,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
 		noswap = true;
 
 	while (1) {
-		victim = mem_cgroup_select_victim(root_memcg);
-		if (victim == root_memcg) {
+		victim = mem_cgroup_iter(root_memcg, victim, true);
+		if (!victim) {
 			loop++;
 			/*
 			 * We are not draining per cpu cached charges during
@@ -1761,10 +1717,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
 				 * anything, it might because there are
 				 * no reclaimable pages under this hierarchy
 				 */
-				if (!check_soft || !total) {
-					css_put(&victim->css);
+				if (!check_soft || !total)
 					break;
-				}
 				/*
 				 * We want to do more targeted reclaim.
 				 * excess >> 2 is not to excessive so as to
@@ -1772,15 +1726,13 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
 				 * coming back to reclaim from this cgroup
 				 */
 				if (total >= (excess >> 2) ||
-					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
-					css_put(&victim->css);
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
 					break;
-				}
 			}
+			continue;
 		}
 		if (!mem_cgroup_reclaimable(victim, noswap)) {
 			/* this cgroup's local usage == 0 */
-			css_put(&victim->css);
 			continue;
 		}
 		/* we use swappiness of local cgroup */
@@ -1791,21 +1743,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
 		} else
 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
 						noswap);
-		css_put(&victim->css);
+		total += ret;
 		/*
 		 * At shrinking usage, we can't check we should stop here or
 		 * reclaim more. It's depends on callers. last_scanned_child
 		 * will work enough for keeping fairness under tree.
 		 */
 		if (shrink)
-			return ret;
-		total += ret;
+			break;
 		if (check_soft) {
 			if (!res_counter_soft_limit_excess(&root_memcg->res))
-				return total;
+				break;
 		} else if (mem_cgroup_margin(root_memcg))
-			return total;
+			break;
 	}
+	mem_cgroup_iter_break(root_memcg, victim);
 	return total;
 }
 
@@ -1817,16 +1769,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter, *failed = NULL;
-	bool cond = true;
 
-	for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
+	for_each_mem_cgroup_tree(iter, memcg) {
 		if (iter->oom_lock) {
 			/*
 			 * this subtree of our hierarchy is already locked
 			 * so we cannot give a lock.
 			 */
 			failed = iter;
-			cond = false;
+			mem_cgroup_iter_break(memcg, iter);
+			break;
 		} else
 			iter->oom_lock = true;
 	}
@@ -1838,11 +1790,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
 	 * OK, we failed to lock the whole subtree so we have to clean up
 	 * what we set up to the failing subtree
 	 */
-	cond = true;
-	for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
+	for_each_mem_cgroup_tree(iter, memcg) {
 		if (iter == failed) {
-			cond = false;
-			continue;
+			mem_cgroup_iter_break(memcg, iter);
+			break;
 		}
 		iter->oom_lock = false;
 	}
@@ -2238,7 +2189,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
 	struct mem_cgroup *iter;
 
 	if ((action == CPU_ONLINE)) {
-		for_each_mem_cgroup_all(iter)
+		for_each_mem_cgroup(iter)
 			synchronize_mem_cgroup_on_move(iter, cpu);
 		return NOTIFY_OK;
 	}
@@ -2246,7 +2197,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
 	if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
 		return NOTIFY_OK;
 
-	for_each_mem_cgroup_all(iter)
+	for_each_mem_cgroup(iter)
 		mem_cgroup_drain_pcp_counter(iter, cpu);
 
 	stock = &per_cpu(memcg_stock, cpu);