Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	560
1 file changed, 407 insertions(+), 153 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..1c52ddbc839b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -160,6 +161,10 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	*/
 };
@@ -168,6 +173,26 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
@@ -303,22 +328,6 @@ struct mem_cgroup {
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
-	/*
-	 * Protects soft_contributed transitions.
-	 * See mem_cgroup_update_soft_limit
-	 */
-	spinlock_t soft_lock;
-
-	/*
-	 * If true then this group has increased parents' children_in_excess
-	 * when it got over the soft limit.
-	 * When a group falls bellow the soft limit, parents' children_in_excess
-	 * is decreased and soft_contributed changed to false.
-	 */
-	bool soft_contributed;
-
-	/* Number of children that are in soft limit excess */
-	atomic_t children_in_excess;
 
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
@@ -422,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+						tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+	unsigned long long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used.
+	 * because their event counter is not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+		excess = res_counter_soft_limit_excess(&memcg->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node(node) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
+		}
+	}
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will to add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+		!css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -822,48 +990,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 }
 
 /*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
-	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
-	struct mem_cgroup *parent = memcg;
-	int delta = 0;
-
-	spin_lock(&memcg->soft_lock);
-	if (excess) {
-		if (!memcg->soft_contributed) {
-			delta = 1;
-			memcg->soft_contributed = true;
-		}
-	} else {
-		if (memcg->soft_contributed) {
-			delta = -1;
-			memcg->soft_contributed = false;
-		}
-	}
-
-	/*
-	 * Necessary to update all ancestors when hierarchy is used
-	 * because their event counter is not touched.
-	 * We track children even outside the hierarchy for the root
-	 * cgroup because tree walk starting at root should visit
-	 * all cgroups and we want to prevent from pointless tree
-	 * walk if no children is below the limit.
-	 */
-	while (delta && (parent = parent_mem_cgroup(parent)))
-		atomic_add(delta, &parent->children_in_excess);
-	if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-		atomic_add(delta, &root_mem_cgroup->children_in_excess);
-	spin_unlock(&memcg->soft_lock);
-}
-
-/*
  * Check events in order.
  *
  */
@@ -886,7 +1012,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
 	mem_cgroup_threshold(memcg);
 	if (unlikely(do_softlimit))
-		mem_cgroup_update_soft_limit(memcg);
+		mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 	if (unlikely(do_numainfo))
 		atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1055,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-		mem_cgroup_iter_filter cond)
-{
-	if (!cond)
-		return VISIT;
-	return cond(memcg, root);
-}
-
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1062,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+		struct mem_cgroup *last_visited)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -963,31 +1080,11 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-		switch (mem_cgroup_filter(mem, root, cond)) {
-		case SKIP:
+		if (css_tryget(&mem->css))
+			return mem;
+		else {
 			prev_css = next_css;
 			goto skip_node;
-		case SKIP_TREE:
-			if (mem == root)
-				return NULL;
-			/*
-			 * css_rightmost_descendant is not an optimal way to
-			 * skip through a subtree (especially for imbalanced
-			 * trees leaning to right) but that's what we have right
-			 * now. More effective solution would be traversing
-			 * right-up for first non-NULL without calling
-			 * css_next_descendant_pre afterwards.
-			 */
-			prev_css = css_rightmost_descendant(next_css);
-			goto skip_node;
-		case VISIT:
-			if (css_tryget(&mem->css))
-				return mem;
-			else {
-				prev_css = next_css;
-				goto skip_node;
-			}
-			break;
 		}
 	}
 
@@ -1051,7 +1148,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1160,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond)
+				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled()) {
-		/* first call must return non-NULL, second return NULL */
-		return (struct mem_cgroup *)(unsigned long)!prev;
-	}
+	if (mem_cgroup_disabled())
+		return NULL;
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -1086,9 +1179,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		if (mem_cgroup_filter(root, root, cond) == VISIT)
-			return root;
-		return NULL;
+		return root;
 	}
 
 	rcu_read_lock();
@@ -1111,7 +1202,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 		last_visited = mem_cgroup_iter_load(iter, root, &seq);
 	}
 
-	memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+	memcg = __mem_cgroup_iter_next(root, last_visited);
 
 	if (reclaim) {
 		mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1213,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			reclaim->generation = iter->generation;
 		}
 
-		/*
-		 * We have finished the whole tree walk or no group has been
-		 * visited because filter told us to skip the root node.
-		 */
-		if (!memcg && (prev || (cond && !last_visited)))
+		if (prev && !memcg)
 			goto out_unlock;
 	}
 out_unlock:
@@ -1767,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1790,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,50 +1944,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, memcg->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_MEMORY) {
+		if (node_isset(nid, memcg->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- * a) it is over its soft limit
- * b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
-	struct mem_cgroup *parent;
-
-	if (!memcg)
-		memcg = root_mem_cgroup;
-	parent = memcg;
-
-	if (res_counter_soft_limit_excess(&memcg->res))
-		return VISIT;
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
 
-	/*
-	 * If any parent up to the root in the hierarchy is over its soft limit
-	 * then we have to obey and reclaim from this group as well.
-	 */
-	while ((parent = parent_mem_cgroup(parent))) {
-		if (res_counter_soft_limit_excess(&parent->res))
-			return VISIT;
-		if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   struct zone *zone,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.zone = zone,
+		.priority = 0,
+	};
+
+	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not to excessive so as to
+				 * reclaim too much, nor too less that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		if (!mem_cgroup_reclaimable(victim, false))
+			continue;
+		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+						     zone, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!res_counter_soft_limit_excess(&root_memcg->res))
 			break;
 	}
-
-	if (!atomic_read(&memcg->children_in_excess))
-		return SKIP_TREE;
-	return SKIP;
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
@@ -2812,7 +2953,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);
 
 	/*
-	 * "charge_statistics" updated event counter.
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -4647,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
+	unsigned long nr_scanned;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+	/*
+	 * This loop can run a while, specially if mem_cgroup's continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		nr_scanned = 0;
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+						    gfp_mask, &nr_scanned);
+		nr_reclaimed += reclaimed;
+		*total_scanned += nr_scanned;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have aded the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz)
+					css_put(&next_mz->memcg->css);
+				else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->memcg->res);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->memcg->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->memcg->css);
+	return nr_reclaimed;
+}
+
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
@@ -5911,6 +6146,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -5966,6 +6203,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();
 
+	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node(node)
@@ -6002,6 +6240,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node(node) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		BUG_ON(!rtpn);
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6031,7 +6292,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
-	spin_lock_init(&memcg->soft_lock);
 
 	return &memcg->css;
 
@@ -6109,13 +6369,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
-	if (memcg->soft_contributed) {
-		while ((memcg = parent_mem_cgroup(memcg)))
-			atomic_dec(&memcg->children_in_excess);
-
-		if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-			atomic_dec(&root_mem_cgroup->children_in_excess);
-	}
 	mem_cgroup_destroy_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6790,6 +7043,7 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
+	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }