author		Kevin Hilman <khilman@linaro.org>	2013-10-14 18:29:10 -0400
committer	Kevin Hilman <khilman@linaro.org>	2013-10-14 18:29:24 -0400
commit		7587b5965f57c1c4d6fd1377432a8473f5cd449a
tree		85b7ced77656ac142369c6436df02b51d6d13527	/mm/memcontrol.c
parent		6a9d10d529db69244baab335fb02caba3d6ebbc9
parent		8d71528343c69ce387bd5fdb4fd8dc2b9f69d97c
Merge tag 'omap-for-v3.13/quirk-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap into next/dt
From Tony Lindgren:

Changes needed to prepare for making omap3 device tree only:

- Always build in board-generic, and add pdata quirks and auxdata support
  for it so we have all the pdata related quirks in the same place.

- Merge of the drivers/pinctrl changes that are needed for PM to continue
  working on omap3 and also needed for other omaps eventually. The three
  pinctrl related patches have been acked by Linus Walleij and are pulled
  into both the pinctrl tree and this branch.

- Few defconfig related changes for drivers needed.

* tag 'omap-for-v3.13/quirk-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap: (523 commits)
  ARM: configs: omap2plus_defconfig: enable dwc3 and dependencies
  ARM: OMAP2+: Add WLAN modules and of_serial to omap2plus_defconfig
  ARM: OMAP2+: Run make savedefconfig on omap2plus_defconfig to shrink it
  ARM: OMAP2+: Add minimal 8250 support for GPMC
  ARM: OMAP2+: Use pdata quirks for wl12xx for omap3 evm and zoom3
  ARM: OMAP: Move DT wake-up event handling over to use pinctrl-single-omap
  ARM: OMAP2+: Add support for auxdata
  pinctrl: single: Add support for auxdata
  pinctrl: single: Add support for wake-up interrupts
  pinctrl: single: Prepare for supporting SoC specific features
  ARM: OMAP2+: igep0020: use display init from dss-common
  ARM: OMAP2+: pdata-quirks: add legacy display init for IGEPv2 board
  Linux 3.12-rc4

Signed-off-by: Kevin Hilman <khilman@linaro.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c | 560
1 file changed, 407 insertions, 153 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..1c52ddbc839b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -160,6 +161,10 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of */
 };
@@ -168,6 +173,26 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
@@ -303,22 +328,6 @@ struct mem_cgroup {
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
-	/*
-	 * Protects soft_contributed transitions.
-	 * See mem_cgroup_update_soft_limit
-	 */
-	spinlock_t soft_lock;
-
-	/*
-	 * If true then this group has increased parents' children_in_excess
-	 * when it got over the soft limit.
-	 * When a group falls bellow the soft limit, parents' children_in_excess
-	 * is decreased and soft_contributed changed to false.
-	 */
-	bool soft_contributed;
-
-	/* Number of children that are in soft limit excess */
-	atomic_t children_in_excess;
 
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
@@ -422,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+						tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+	unsigned long long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used.
+	 * because their event counter is not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+		excess = res_counter_soft_limit_excess(&memcg->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node(node) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
+		}
+	}
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will to add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+		!css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -822,48 +990,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 }
 
 /*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
-	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
-	struct mem_cgroup *parent = memcg;
-	int delta = 0;
-
-	spin_lock(&memcg->soft_lock);
-	if (excess) {
-		if (!memcg->soft_contributed) {
-			delta = 1;
-			memcg->soft_contributed = true;
-		}
-	} else {
-		if (memcg->soft_contributed) {
-			delta = -1;
-			memcg->soft_contributed = false;
-		}
-	}
-
-	/*
-	 * Necessary to update all ancestors when hierarchy is used
-	 * because their event counter is not touched.
-	 * We track children even outside the hierarchy for the root
-	 * cgroup because tree walk starting at root should visit
-	 * all cgroups and we want to prevent from pointless tree
-	 * walk if no children is below the limit.
-	 */
-	while (delta && (parent = parent_mem_cgroup(parent)))
-		atomic_add(delta, &parent->children_in_excess);
-	if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-		atomic_add(delta, &root_mem_cgroup->children_in_excess);
-	spin_unlock(&memcg->soft_lock);
-}
-
-/*
  * Check events in order.
  *
  */
@@ -886,7 +1012,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
 	mem_cgroup_threshold(memcg);
 	if (unlikely(do_softlimit))
-		mem_cgroup_update_soft_limit(memcg);
+		mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 	if (unlikely(do_numainfo))
 		atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1055,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-		mem_cgroup_iter_filter cond)
-{
-	if (!cond)
-		return VISIT;
-	return cond(memcg, root);
-}
-
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1062,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+		struct mem_cgroup *last_visited)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -963,31 +1080,11 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-		switch (mem_cgroup_filter(mem, root, cond)) {
-		case SKIP:
+		if (css_tryget(&mem->css))
+			return mem;
+		else {
 			prev_css = next_css;
 			goto skip_node;
-		case SKIP_TREE:
-			if (mem == root)
-				return NULL;
-			/*
-			 * css_rightmost_descendant is not an optimal way to
-			 * skip through a subtree (especially for imbalanced
-			 * trees leaning to right) but that's what we have right
-			 * now. More effective solution would be traversing
-			 * right-up for first non-NULL without calling
-			 * css_next_descendant_pre afterwards.
-			 */
-			prev_css = css_rightmost_descendant(next_css);
-			goto skip_node;
-		case VISIT:
-			if (css_tryget(&mem->css))
-				return mem;
-			else {
-				prev_css = next_css;
-				goto skip_node;
-			}
-			break;
 		}
 	}
 
@@ -1051,7 +1148,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1160,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond)
+				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled()) {
-		/* first call must return non-NULL, second return NULL */
-		return (struct mem_cgroup *)(unsigned long)!prev;
-	}
+	if (mem_cgroup_disabled())
+		return NULL;
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -1086,9 +1179,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		if (mem_cgroup_filter(root, root, cond) == VISIT)
-			return root;
-		return NULL;
+		return root;
 	}
 
 	rcu_read_lock();
@@ -1111,7 +1202,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
 			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1213,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 				reclaim->generation = iter->generation;
 		}
 
-		/*
-		 * We have finished the whole tree walk or no group has been
-		 * visited because filter told us to skip the root node.
-		 */
-		if (!memcg && (prev || (cond && !last_visited)))
+		if (prev && !memcg)
 			goto out_unlock;
 	}
 out_unlock:
@@ -1767,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1790,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,50 +1944,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, memcg->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_MEMORY) {
+		if (node_isset(nid, memcg->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *	a) it is over its soft limit
- *	b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
-	struct mem_cgroup *parent;
-
-	if (!memcg)
-		memcg = root_mem_cgroup;
-	parent = memcg;
-
-	if (res_counter_soft_limit_excess(&memcg->res))
-		return VISIT;
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
 
-	/*
-	 * If any parent up to the root in the hierarchy is over its soft limit
-	 * then we have to obey and reclaim from this group as well.
-	 */
-	while ((parent = parent_mem_cgroup(parent))) {
-		if (res_counter_soft_limit_excess(&parent->res))
-			return VISIT;
-		if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   struct zone *zone,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.zone = zone,
+		.priority = 0,
+	};
+
+	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not to excessive so as to
+				 * reclaim too much, nor too less that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		if (!mem_cgroup_reclaimable(victim, false))
+			continue;
+		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+						     zone, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!res_counter_soft_limit_excess(&root_memcg->res))
 			break;
 	}
-
-	if (!atomic_read(&memcg->children_in_excess))
-		return SKIP_TREE;
-	return SKIP;
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
@@ -2812,7 +2953,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);
 
 	/*
-	 * "charge_statistics" updated event counter.
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -4647,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
+	unsigned long nr_scanned;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+	/*
+	 * This loop can run a while, specially if mem_cgroup's continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		nr_scanned = 0;
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+						    gfp_mask, &nr_scanned);
+		nr_reclaimed += reclaimed;
+		*total_scanned += nr_scanned;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have aded the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz)
+					css_put(&next_mz->memcg->css);
+				else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->memcg->res);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->memcg->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->memcg->css);
+	return nr_reclaimed;
+}
+
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
@@ -5911,6 +6146,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -5966,6 +6203,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();
 
+	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node(node)
@@ -6002,6 +6240,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node(node) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		BUG_ON(!rtpn);
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6031,7 +6292,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
-	spin_lock_init(&memcg->soft_lock);
 
 	return &memcg->css;
 
@@ -6109,13 +6369,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
-	if (memcg->soft_contributed) {
-		while ((memcg = parent_mem_cgroup(memcg)))
-			atomic_dec(&memcg->children_in_excess);
-
-		if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-			atomic_dec(&root_mem_cgroup->children_in_excess);
-	}
 	mem_cgroup_destroy_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6790,6 +7043,7 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
+	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }