Diffstat (limited to 'mm/memcontrol.c')
 -rw-r--r--  mm/memcontrol.c | 703
 1 file changed, 462 insertions(+), 241 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..34d3ca9572d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -160,6 +161,10 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
 };
@@ -168,6 +173,26 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
@@ -303,22 +328,6 @@ struct mem_cgroup {
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
-	/*
-	 * Protects soft_contributed transitions.
-	 * See mem_cgroup_update_soft_limit
-	 */
-	spinlock_t soft_lock;
-
-	/*
-	 * If true then this group has increased parents' children_in_excess
-	 * when it got over the soft limit.
-	 * When a group falls bellow the soft limit, parents' children_in_excess
-	 * is decreased and soft_contributed changed to false.
-	 */
-	bool soft_contributed;
-
-	/* Number of children that are in soft limit excess */
-	atomic_t children_in_excess;
 
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
@@ -422,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+	unsigned long long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used.
+	 * because their event counter is not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+		excess = res_counter_soft_limit_excess(&memcg->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node(node) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
+		}
+	}
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will to add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+		!css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -698,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 	unsigned long val = 0;
 	int cpu;
 
+	get_online_cpus();
 	for_each_online_cpu(cpu)
 		val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -705,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 	val += memcg->nocpu_base.events[idx];
 	spin_unlock(&memcg->pcp_counter_lock);
 #endif
+	put_online_cpus();
 	return val;
 }
 
@@ -822,48 +992,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 }
 
 /*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
-	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
-	struct mem_cgroup *parent = memcg;
-	int delta = 0;
-
-	spin_lock(&memcg->soft_lock);
-	if (excess) {
-		if (!memcg->soft_contributed) {
-			delta = 1;
-			memcg->soft_contributed = true;
-		}
-	} else {
-		if (memcg->soft_contributed) {
-			delta = -1;
-			memcg->soft_contributed = false;
-		}
-	}
-
-	/*
-	 * Necessary to update all ancestors when hierarchy is used
-	 * because their event counter is not touched.
-	 * We track children even outside the hierarchy for the root
-	 * cgroup because tree walk starting at root should visit
-	 * all cgroups and we want to prevent from pointless tree
-	 * walk if no children is below the limit.
-	 */
-	while (delta && (parent = parent_mem_cgroup(parent)))
-		atomic_add(delta, &parent->children_in_excess);
-	if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-		atomic_add(delta, &root_mem_cgroup->children_in_excess);
-	spin_unlock(&memcg->soft_lock);
-}
-
-/*
  * Check events in order.
  *
  */
@@ -886,7 +1014,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
 		mem_cgroup_threshold(memcg);
 		if (unlikely(do_softlimit))
-			mem_cgroup_update_soft_limit(memcg);
+			mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 		if (unlikely(do_numainfo))
 			atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1057,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-		mem_cgroup_iter_filter cond)
-{
-	if (!cond)
-		return VISIT;
-	return cond(memcg, root);
-}
-
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1064,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+		struct mem_cgroup *last_visited)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -963,31 +1082,11 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-		switch (mem_cgroup_filter(mem, root, cond)) {
-		case SKIP:
+		if (css_tryget(&mem->css))
+			return mem;
+		else {
 			prev_css = next_css;
 			goto skip_node;
-		case SKIP_TREE:
-			if (mem == root)
-				return NULL;
-			/*
-			 * css_rightmost_descendant is not an optimal way to
-			 * skip through a subtree (especially for imbalanced
-			 * trees leaning to right) but that's what we have right
-			 * now. More effective solution would be traversing
-			 * right-up for first non-NULL without calling
-			 * css_next_descendant_pre afterwards.
-			 */
-			prev_css = css_rightmost_descendant(next_css);
-			goto skip_node;
-		case VISIT:
-			if (css_tryget(&mem->css))
-				return mem;
-			else {
-				prev_css = next_css;
-				goto skip_node;
-			}
-			break;
 		}
 	}
 
@@ -1051,7 +1150,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1162,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond)
+				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled()) {
-		/* first call must return non-NULL, second return NULL */
-		return (struct mem_cgroup *)(unsigned long)!prev;
-	}
+	if (mem_cgroup_disabled())
+		return NULL;
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -1086,9 +1181,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		if (mem_cgroup_filter(root, root, cond) == VISIT)
-			return root;
-		return NULL;
+		return root;
 	}
 
 	rcu_read_lock();
@@ -1111,7 +1204,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
 			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1215,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			reclaim->generation = iter->generation;
 		}
 
-		/*
-		 * We have finished the whole tree walk or no group has been
-		 * visited because filter told us to skip the root node.
-		 */
-		if (!memcg && (prev || (cond && !last_visited)))
+		if (prev && !memcg)
 			goto out_unlock;
 	}
 out_unlock:
@@ -1767,7 +1856,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1790,6 +1878,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,50 +1946,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, memcg->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_MEMORY) {
+		if (node_isset(nid, memcg->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *	a) it is over its soft limit
- *	b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
-	struct mem_cgroup *parent;
-
-	if (!memcg)
-		memcg = root_mem_cgroup;
-	parent = memcg;
-
-	if (res_counter_soft_limit_excess(&memcg->res))
-		return VISIT;
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
 
-	/*
-	 * If any parent up to the root in the hierarchy is over its soft limit
-	 * then we have to obey and reclaim from this group as well.
-	 */
-	while ((parent = parent_mem_cgroup(parent))) {
-		if (res_counter_soft_limit_excess(&parent->res))
-			return VISIT;
-		if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   struct zone *zone,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.zone = zone,
+		.priority = 0,
+	};
+
+	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not to excessive so as to
+				 * reclaim too much, nor too less that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		if (!mem_cgroup_reclaimable(victim, false))
+			continue;
+		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+						     zone, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!res_counter_soft_limit_excess(&root_memcg->res))
 			break;
 	}
-
-	if (!atomic_read(&memcg->children_in_excess))
-		return SKIP_TREE;
-	return SKIP;
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
@@ -2018,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 	memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	bool locked;
-	int wakeups;
-
 	if (!current->memcg_oom.may_oom)
 		return;
-
-	current->memcg_oom.in_memcg_oom = 1;
-
 	/*
-	 * As with any blocking lock, a contender needs to start
-	 * listening for wakeups before attempting the trylock,
-	 * otherwise it can miss the wakeup from the unlock and sleep
-	 * indefinitely. This is just open-coded because our locking
-	 * is so particular to memcg hierarchies.
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * Also, the caller may handle a failed allocation gracefully
+	 * (like optional page cache readahead) and so an OOM killer
+	 * invocation might not even be necessary.
+	 *
+	 * That's why we don't do anything here except remember the
+	 * OOM context and then deal with it at the end of the page
+	 * fault when the stack is unwound, the locks are released,
+	 * and when we know whether the fault was overall successful.
	 */
-	wakeups = atomic_read(&memcg->oom_wakeups);
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		mem_cgroup_out_of_memory(memcg, mask, order);
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges. Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	} else {
-		/*
-		 * A system call can just return -ENOMEM, but if this
-		 * is a page fault and somebody else is handling the
-		 * OOM already, we need to sleep on the OOM waitqueue
-		 * for this memcg until the situation is resolved.
-		 * Which can take some time because it might be
-		 * handled by a userspace task.
-		 *
-		 * However, this is the charge context, which means
-		 * that we may sit on a large call stack and hold
-		 * various filesystem locks, the mmap_sem etc. and we
-		 * don't want the OOM handler to deadlock on them
-		 * while we sit here and wait. Store the current OOM
-		 * context in the task_struct, then return -ENOMEM.
-		 * At the end of the page fault handler, with the
-		 * stack unwound, pagefault_out_of_memory() will check
-		 * back with us by calling
-		 * mem_cgroup_oom_synchronize(), possibly putting the
-		 * task to sleep.
-		 */
-		current->memcg_oom.oom_locked = locked;
-		current->memcg_oom.wakeups = wakeups;
-		css_get(&memcg->css);
-		current->memcg_oom.wait_on_memcg = memcg;
-	}
+	css_get(&memcg->css);
+	current->memcg_oom.memcg = memcg;
+	current->memcg_oom.gfp_mask = mask;
+	current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation. Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
 * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
 */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+	struct mem_cgroup *memcg = current->memcg_oom.memcg;
 	struct oom_wait_info owait;
-	struct mem_cgroup *memcg;
+	bool locked;
 
 	/* OOM is global, do not handle */
-	if (!current->memcg_oom.in_memcg_oom)
-		return false;
-
-	/*
-	 * We invoked the OOM killer but there is a chance that a kill
-	 * did not free up any charges. Everybody else might already
-	 * be sleeping, so restart the fault and keep the rampage
-	 * going until some charges are released.
-	 */
-	memcg = current->memcg_oom.wait_on_memcg;
 	if (!memcg)
-		goto out;
+		return false;
 
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-		goto out_memcg;
+	if (!handle)
+		goto cleanup;
 
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
@@ -2130,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
 	INIT_LIST_HEAD(&owait.wait.task_list);
 
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	/* Only sleep if we didn't miss any wakeups since OOM */
-	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+					 current->memcg_oom.order);
+	} else {
 		schedule();
-	finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-	mem_cgroup_unmark_under_oom(memcg);
-	if (current->memcg_oom.oom_locked) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+	}
+
+	if (locked) {
 		mem_cgroup_oom_unlock(memcg);
 		/*
 		 * There is no guarantee that an OOM-lock contender
@@ -2145,10 +2249,9 @@ out_memcg:
 		 */
 		memcg_oom_recover(memcg);
 	}
+cleanup:
+	current->memcg_oom.memcg = NULL;
 	css_put(&memcg->css);
-	current->memcg_oom.wait_on_memcg = NULL;
-out:
-	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }
 
@@ -2562,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		     || fatal_signal_pending(current)))
 		goto bypass;
 
+	if (unlikely(task_in_memcg_oom(current)))
+		goto bypass;
+
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
@@ -2660,6 +2766,8 @@ done:
 	return 0;
 nomem:
 	*ptr = NULL;
+	if (gfp_mask & __GFP_NOFAIL)
+		return 0;
 	return -ENOMEM;
 bypass:
 	*ptr = root_mem_cgroup;
@@ -2812,7 +2920,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);
 
 	/*
-	 * "charge_statistics" updated event counter.
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -4647,6 +4757,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
+	unsigned long nr_scanned;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+	/*
+	 * This loop can run a while, specially if mem_cgroup's continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		nr_scanned = 0;
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+						    gfp_mask, &nr_scanned);
+		nr_reclaimed += reclaimed;
+		*total_scanned += nr_scanned;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have aded the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz)
+					css_put(&next_mz->memcg->css);
+				else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->memcg->res);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->memcg->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->memcg->css);
+	return nr_reclaimed;
+}
+
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
@@ -5911,6 +6113,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -5966,6 +6170,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();
 
+	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node(node)
@@ -6002,6 +6207,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node(node) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		BUG_ON(!rtpn);
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6031,7 +6259,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
-	spin_lock_init(&memcg->soft_lock);
 
 	return &memcg->css;
 
@@ -6109,13 +6336,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
-	if (memcg->soft_contributed) {
-		while ((memcg = parent_mem_cgroup(memcg)))
-			atomic_dec(&memcg->children_in_excess);
-
-		if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-			atomic_dec(&root_mem_cgroup->children_in_excess);
-	}
 	mem_cgroup_destroy_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6790,6 +7010,7 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
+	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }