Diffstat (limited to 'mm/memcontrol.c')

 -rw-r--r--   mm/memcontrol.c   560
 1 file changed, 407 insertions, 153 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..1c52ddbc839b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -160,6 +161,10 @@ struct mem_cgroup_per_zone {
 
        struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
+       struct rb_node          tree_node;      /* RB tree node */
+       unsigned long long      usage_in_excess;/* Set to the value by which */
+                                               /* the soft limit is exceeded */
+       bool                    on_tree;
        struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
                                                /* use container_of        */
 };
@@ -168,6 +173,26 @@ struct mem_cgroup_per_node {
        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
+/*
+ * Cgroups above their limits are maintained in an RB-tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+       struct rb_root rb_root;
+       spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+       struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+       struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        u64 threshold;
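
Note: taken together, the three structs above give one independently locked rb_root per (node, zone) pair. A lookup walks soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid], so insertions into one zone's tree never contend with another zone's spinlock.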
@@ -303,22 +328,6 @@ struct mem_cgroup {
        atomic_t        numainfo_events;
        atomic_t        numainfo_updating;
 #endif
-       /*
-        * Protects soft_contributed transitions.
-        * See mem_cgroup_update_soft_limit
-        */
-       spinlock_t soft_lock;
-
-       /*
-        * If true then this group has increased parents' children_in_excess
-        * when it got over the soft limit.
-        * When a group falls bellow the soft limit, parents' children_in_excess
-        * is decreased and soft_contributed changed to false.
-        */
-       bool soft_contributed;
-
-       /* Number of children that are in soft limit excess */
-       atomic_t children_in_excess;
 
        struct mem_cgroup_per_node *nodeinfo[0];
        /* WARNING: nodeinfo must be the last member here */
@@ -422,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define MEM_CGROUP_MAX_RECLAIM_LOOPS           100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS        2
 
 enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
        return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+       int nid = page_to_nid(page);
+       int zid = page_zonenum(page);
+
+       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz,
+                               unsigned long long new_usage_in_excess)
+{
+       struct rb_node **p = &mctz->rb_root.rb_node;
+       struct rb_node *parent = NULL;
+       struct mem_cgroup_per_zone *mz_node;
+
+       if (mz->on_tree)
+               return;
+
+       mz->usage_in_excess = new_usage_in_excess;
+       if (!mz->usage_in_excess)
+               return;
+       while (*p) {
+               parent = *p;
+               mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+                                               tree_node);
+               if (mz->usage_in_excess < mz_node->usage_in_excess)
+                       p = &(*p)->rb_left;
+               /*
+                * We can't avoid mem cgroups that are over their soft
+                * limit by the same amount
+                */
+               else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+                       p = &(*p)->rb_right;
+       }
+       rb_link_node(&mz->tree_node, parent, p);
+       rb_insert_color(&mz->tree_node, &mctz->rb_root);
+       mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       if (!mz->on_tree)
+               return;
+       rb_erase(&mz->tree_node, &mctz->rb_root);
+       mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       spin_lock(&mctz->lock);
+       __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+       spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+       unsigned long long excess;
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_tree_per_zone *mctz;
+       int nid = page_to_nid(page);
+       int zid = page_zonenum(page);
+       mctz = soft_limit_tree_from_page(page);
+
+       /*
+        * Necessary to update all ancestors when hierarchy is used,
+        * because their event counter is not touched.
+        */
+       for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+               mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+               excess = res_counter_soft_limit_excess(&memcg->res);
+               /*
+                * We have to update the tree if mz is on the RB-tree or
+                * if memcg is over its soft limit.
+                */
+               if (excess || mz->on_tree) {
+                       spin_lock(&mctz->lock);
+                       /* if on-tree, remove it */
+                       if (mz->on_tree)
+                               __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+                       /*
+                        * Insert again. mz->usage_in_excess will be updated.
+                        * If excess is 0, no tree ops.
+                        */
+                       __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+                       spin_unlock(&mctz->lock);
+               }
+       }
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+       int node, zone;
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_tree_per_zone *mctz;
+
+       for_each_node(node) {
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       mz = mem_cgroup_zoneinfo(memcg, node, zone);
+                       mctz = soft_limit_tree_node_zone(node, zone);
+                       mem_cgroup_remove_exceeded(memcg, mz, mctz);
+               }
+       }
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+       struct rb_node *rightmost = NULL;
+       struct mem_cgroup_per_zone *mz;
+
+retry:
+       mz = NULL;
+       rightmost = rb_last(&mctz->rb_root);
+       if (!rightmost)
+               goto done;              /* Nothing to reclaim from */
+
+       mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+       /*
+        * Remove the node now but someone else can add it back;
+        * we will add it back at the end of reclaim to its correct
+        * position in the tree.
+        */
+       __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+       if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+               !css_tryget(&mz->memcg->css))
+               goto retry;
+done:
+       return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+       struct mem_cgroup_per_zone *mz;
+
+       spin_lock(&mctz->lock);
+       mz = __mem_cgroup_largest_soft_limit_node(mctz);
+       spin_unlock(&mctz->lock);
+       return mz;
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
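
The helpers above all hang off one invariant: entries are keyed by usage_in_excess, with equal keys pushed to the right, so the rightmost node (rb_last()) is always the group that exceeds its soft limit the most. A minimal userspace sketch of that invariant, using a plain unbalanced BST in place of the kernel's red-black tree (illustration only, not part of the patch):

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy node standing in for struct mem_cgroup_per_zone. */
    struct node {
            unsigned long long usage_in_excess;
            struct node *left, *right;
    };

    /*
     * Same ordering rule as __mem_cgroup_insert_exceeded(): smaller excess
     * goes left, equal-or-larger goes right, so duplicate keys are allowed
     * and the rightmost node is always the worst offender.
     */
    static void insert(struct node **root, struct node *nz)
    {
            struct node **p = root;

            while (*p) {
                    if (nz->usage_in_excess < (*p)->usage_in_excess)
                            p = &(*p)->left;
                    else
                            p = &(*p)->right;
            }
            *p = nz;
    }

    /*
     * Same idea as __mem_cgroup_largest_soft_limit_node(): rb_last() is
     * just "walk right until you cannot".
     */
    static struct node *largest(struct node *root)
    {
            if (!root)
                    return NULL;
            while (root->right)
                    root = root->right;
            return root;
    }

    int main(void)
    {
            unsigned long long excess[] = { 300, 100, 300, 700 };
            struct node *root = NULL;
            size_t i;

            for (i = 0; i < sizeof(excess) / sizeof(excess[0]); i++) {
                    struct node *nz = calloc(1, sizeof(*nz));
                    nz->usage_in_excess = excess[i];
                    insert(&root, nz);
            }
            printf("largest excess: %llu\n", largest(root)->usage_in_excess); /* 700 */
            return 0;
    }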
@@ -822,48 +990,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 }
 
 /*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
-       unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
-       struct mem_cgroup *parent = memcg;
-       int delta = 0;
-
-       spin_lock(&memcg->soft_lock);
-       if (excess) {
-               if (!memcg->soft_contributed) {
-                       delta = 1;
-                       memcg->soft_contributed = true;
-               }
-       } else {
-               if (memcg->soft_contributed) {
-                       delta = -1;
-                       memcg->soft_contributed = false;
-               }
-       }
-
-       /*
-        * Necessary to update all ancestors when hierarchy is used
-        * because their event counter is not touched.
-        * We track children even outside the hierarchy for the root
-        * cgroup because tree walk starting at root should visit
-        * all cgroups and we want to prevent from pointless tree
-        * walk if no children is below the limit.
-        */
-       while (delta && (parent = parent_mem_cgroup(parent)))
-               atomic_add(delta, &parent->children_in_excess);
-       if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-               atomic_add(delta, &root_mem_cgroup->children_in_excess);
-       spin_unlock(&memcg->soft_lock);
-}
-
-/*
  * Check events in order.
  *
  */
@@ -886,7 +1012,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
                mem_cgroup_threshold(memcg);
                if (unlikely(do_softlimit))
-                       mem_cgroup_update_soft_limit(memcg);
+                       mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
                if (unlikely(do_numainfo))
                        atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1055,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
        return memcg;
 }
 
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-               mem_cgroup_iter_filter cond)
-{
-       if (!cond)
-               return VISIT;
-       return cond(memcg, root);
-}
-
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1062,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-               struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+               struct mem_cgroup *last_visited)
 {
        struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -963,31 +1080,11 @@ skip_node:
        if (next_css) {
                struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-               switch (mem_cgroup_filter(mem, root, cond)) {
-               case SKIP:
+               if (css_tryget(&mem->css))
+                       return mem;
+               else {
                        prev_css = next_css;
                        goto skip_node;
-               case SKIP_TREE:
-                       if (mem == root)
-                               return NULL;
-                       /*
-                        * css_rightmost_descendant is not an optimal way to
-                        * skip through a subtree (especially for imbalanced
-                        * trees leaning to right) but that's what we have right
-                        * now. More effective solution would be traversing
-                        * right-up for first non-NULL without calling
-                        * css_next_descendant_pre afterwards.
-                        */
-                       prev_css = css_rightmost_descendant(next_css);
-                       goto skip_node;
-               case VISIT:
-                       if (css_tryget(&mem->css))
-                               return mem;
-                       else {
-                               prev_css = next_css;
-                               goto skip_node;
-                       }
-                       break;
                }
        }
 
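
With the filter and its SKIP/SKIP_TREE/VISIT verdicts gone, the walk is back to the plain contract: css_tryget() either pins the next group or the group is skipped. For reference, the usual calling idiom looks roughly like the fragment below (illustration only, not part of the patch; do_something() is a placeholder):

    struct mem_cgroup *iter;

    for (iter = mem_cgroup_iter(root, NULL, NULL);  /* NULL prev: start at root */
         iter != NULL;                              /* NULL again after a full round-trip */
         iter = mem_cgroup_iter(root, iter, NULL))  /* drops the ref on prev, pins the next */
            do_something(iter);

    /*
     * A caller that breaks out of the loop early must drop the last
     * reference itself with mem_cgroup_iter_break(root, iter);
     */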
@@ -1051,7 +1148,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1160,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                   struct mem_cgroup *prev,
-                                  struct mem_cgroup_reclaim_cookie *reclaim,
-                                  mem_cgroup_iter_filter cond)
+                                  struct mem_cgroup_reclaim_cookie *reclaim)
 {
        struct mem_cgroup *memcg = NULL;
        struct mem_cgroup *last_visited = NULL;
 
-       if (mem_cgroup_disabled()) {
-               /* first call must return non-NULL, second return NULL */
-               return (struct mem_cgroup *)(unsigned long)!prev;
-       }
+       if (mem_cgroup_disabled())
+               return NULL;
 
        if (!root)
                root = root_mem_cgroup;
@@ -1086,9 +1179,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
        if (!root->use_hierarchy && root != root_mem_cgroup) {
                if (prev)
                        goto out_css_put;
-               if (mem_cgroup_filter(root, root, cond) == VISIT)
-                       return root;
-               return NULL;
+               return root;
        }
 
        rcu_read_lock();
@@ -1111,7 +1202,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
                        last_visited = mem_cgroup_iter_load(iter, root, &seq);
                }
 
-               memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+               memcg = __mem_cgroup_iter_next(root, last_visited);
 
                if (reclaim) {
                        mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1213,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
                                reclaim->generation = iter->generation;
                }
 
-               /*
-                * We have finished the whole tree walk or no group has been
-                * visited because filter told us to skip the root node.
-                */
-               if (!memcg && (prev || (cond && !last_visited)))
+               if (prev && !memcg)
                        goto out_unlock;
        }
 out_unlock:
@@ -1767,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
        return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1790,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
        return false;
 
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,50 +1944,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
        return node;
 }
 
+/*
+ * Check all nodes whether they contain reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information. We need to double-check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+       int nid;
+
+       /*
+        * quick check...making use of scan_nodes.
+        * We can skip unused nodes.
+        */
+       if (!nodes_empty(memcg->scan_nodes)) {
+               for (nid = first_node(memcg->scan_nodes);
+                    nid < MAX_NUMNODES;
+                    nid = next_node(nid, memcg->scan_nodes)) {
+
+                       if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+                               return true;
+               }
+       }
+       /*
+        * Check rest of nodes.
+        */
+       for_each_node_state(nid, N_MEMORY) {
+               if (node_isset(nid, memcg->scan_nodes))
+                       continue;
+               if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+                       return true;
+       }
+       return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
        return 0;
 }
 
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *     a) it is over its soft limit
- *     b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-               struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
-       struct mem_cgroup *parent;
-
-       if (!memcg)
-               memcg = root_mem_cgroup;
-       parent = memcg;
-
-       if (res_counter_soft_limit_excess(&memcg->res))
-               return VISIT;
+       return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
 
-       /*
-        * If any parent up to the root in the hierarchy is over its soft limit
-        * then we have to obey and reclaim from this group as well.
-        */
-       while ((parent = parent_mem_cgroup(parent))) {
-               if (res_counter_soft_limit_excess(&parent->res))
-                       return VISIT;
-               if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+                                  struct zone *zone,
+                                  gfp_t gfp_mask,
+                                  unsigned long *total_scanned)
+{
+       struct mem_cgroup *victim = NULL;
+       int total = 0;
+       int loop = 0;
+       unsigned long excess;
+       unsigned long nr_scanned;
+       struct mem_cgroup_reclaim_cookie reclaim = {
+               .zone = zone,
+               .priority = 0,
+       };
+
+       excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+       while (1) {
+               victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+               if (!victim) {
+                       loop++;
+                       if (loop >= 2) {
+                               /*
+                                * If we have not been able to reclaim
+                                * anything, it might be because there are
+                                * no reclaimable pages under this hierarchy
+                                */
+                               if (!total)
+                                       break;
+                               /*
+                                * We want to do more targeted reclaim.
+                                * excess >> 2 is not too excessive, so we do
+                                * not reclaim too much, nor so little that
+                                * we keep coming back to reclaim from this
+                                * cgroup
+                                */
+                               if (total >= (excess >> 2) ||
+                                       (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+                                       break;
+                       }
+                       continue;
+               }
+               if (!mem_cgroup_reclaimable(victim, false))
+                       continue;
+               total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+                                                    zone, &nr_scanned);
+               *total_scanned += nr_scanned;
+               if (!res_counter_soft_limit_excess(&root_memcg->res))
                        break;
        }
-
-       if (!atomic_read(&memcg->children_in_excess))
-               return SKIP_TREE;
-       return SKIP;
+       mem_cgroup_iter_break(root_memcg, victim);
+       return total;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
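
A worked example for the excess >> 2 cut-off above, assuming 4 KiB pages: a hierarchy 64 MiB over its soft limit has excess = (64 << 20) >> PAGE_SHIFT = 16384 pages, so once the iterator has completed two full walks (loop >= 2) the loop stops as soon as total reaches 4096 reclaimed pages. Each soft-limit pass thus aims at roughly a quarter of the overage rather than all of it.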
@@ -2812,7 +2953,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
        unlock_page_cgroup(pc);
 
        /*
-        * "charge_statistics" updated event counter.
+        * "charge_statistics" updated event counter. Then, check it.
+        * Insert ancestors (and ancestors' ancestors) into the softlimit
+        * RB-tree if they exceed their soft limit.
         */
        memcg_check_events(memcg, page);
 }
@@ -4647,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
        return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+                                           gfp_t gfp_mask,
+                                           unsigned long *total_scanned)
+{
+       unsigned long nr_reclaimed = 0;
+       struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+       unsigned long reclaimed;
+       int loop = 0;
+       struct mem_cgroup_tree_per_zone *mctz;
+       unsigned long long excess;
+       unsigned long nr_scanned;
+
+       if (order > 0)
+               return 0;
+
+       mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+       /*
+        * This loop can run a while, especially if mem_cgroups continuously
+        * keep exceeding their soft limit and putting the system under
+        * pressure
+        */
+       do {
+               if (next_mz)
+                       mz = next_mz;
+               else
+                       mz = mem_cgroup_largest_soft_limit_node(mctz);
+               if (!mz)
+                       break;
+
+               nr_scanned = 0;
+               reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+                                                   gfp_mask, &nr_scanned);
+               nr_reclaimed += reclaimed;
+               *total_scanned += nr_scanned;
+               spin_lock(&mctz->lock);
+
+               /*
+                * If we failed to reclaim anything from this memory cgroup
+                * it is time to move on to the next cgroup
+                */
+               next_mz = NULL;
+               if (!reclaimed) {
+                       do {
+                               /*
+                                * Loop until we find yet another one.
+                                *
+                                * By the time we get the soft_limit lock
+                                * again, someone might have added the
+                                * group back on the RB tree. Iterate to
+                                * make sure we get a different mem.
+                                * mem_cgroup_largest_soft_limit_node returns
+                                * NULL if no other cgroup is present on
+                                * the tree
+                                */
+                               next_mz =
+                               __mem_cgroup_largest_soft_limit_node(mctz);
+                               if (next_mz == mz)
+                                       css_put(&next_mz->memcg->css);
+                               else /* next_mz == NULL or other memcg */
+                                       break;
+                       } while (1);
+               }
+               __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+               excess = res_counter_soft_limit_excess(&mz->memcg->res);
+               /*
+                * One school of thought says that we should not add
+                * back the node to the tree if reclaim returns 0.
+                * But our reclaim could return 0, simply because, due
+                * to priority, we are exposing a smaller subset of
+                * memory to reclaim from. Consider this as a longer
+                * term TODO.
+                */
+               /* If excess == 0, no tree ops */
+               __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+               spin_unlock(&mctz->lock);
+               css_put(&mz->memcg->css);
+               loop++;
+               /*
+                * Could not reclaim anything and there are no more
+                * mem cgroups to try or we seem to be looping without
+                * reclaiming anything.
+                */
+               if (!nr_reclaimed &&
+                       (next_mz == NULL ||
+                       loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+                       break;
+       } while (!nr_reclaimed);
+       if (next_mz)
+               css_put(&next_mz->memcg->css);
+       return nr_reclaimed;
+}
+
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
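
For context, mem_cgroup_soft_limit_reclaim() is invoked from the global reclaim paths in mm/vmscan.c, which this patch does not touch. The call site has roughly the following shape (a sketch, not part of this diff; exact variable names may differ):

    /* sketch of the mm/vmscan.c caller, not part of this diff */
    nr_soft_scanned = 0;
    nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, order,
                                                      gfp_mask,
                                                      &nr_soft_scanned);
    sc->nr_reclaimed += nr_soft_reclaimed;
    sc->nr_scanned += nr_soft_scanned;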
@@ -5911,6 +6146,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
                lruvec_init(&mz->lruvec);
+               mz->usage_in_excess = 0;
+               mz->on_tree = false;
                mz->memcg = memcg;
        }
        memcg->nodeinfo[node] = pn;
@@ -5966,6 +6203,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
        int node;
        size_t size = memcg_size();
 
+       mem_cgroup_remove_from_trees(memcg);
        free_css_id(&mem_cgroup_subsys, &memcg->css);
 
        for_each_node(node)
@@ -6002,6 +6240,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+       struct mem_cgroup_tree_per_node *rtpn;
+       struct mem_cgroup_tree_per_zone *rtpz;
+       int tmp, node, zone;
+
+       for_each_node(node) {
+               tmp = node;
+               if (!node_state(node, N_NORMAL_MEMORY))
+                       tmp = -1;
+               rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+               BUG_ON(!rtpn);
+
+               soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       rtpz = &rtpn->rb_tree_per_zone[zone];
+                       rtpz->rb_root = RB_ROOT;
+                       spin_lock_init(&rtpz->lock);
+               }
+       }
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
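
One detail in mem_cgroup_soft_limit_tree_init() worth noting: for nodes without normal memory, the allocation node is forced to -1, the value of NUMA_NO_NODE, so kzalloc_node() falls back to allocating from any node rather than failing on a memoryless node.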
@@ -6031,7 +6292,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        mutex_init(&memcg->thresholds_lock);
        spin_lock_init(&memcg->move_lock);
        vmpressure_init(&memcg->vmpressure);
-       spin_lock_init(&memcg->soft_lock);
 
        return &memcg->css;
 
@@ -6109,13 +6369,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
        mem_cgroup_invalidate_reclaim_iterators(memcg);
        mem_cgroup_reparent_charges(memcg);
-       if (memcg->soft_contributed) {
-               while ((memcg = parent_mem_cgroup(memcg)))
-                       atomic_dec(&memcg->children_in_excess);
-
-               if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-                       atomic_dec(&root_mem_cgroup->children_in_excess);
-       }
        mem_cgroup_destroy_all_caches(memcg);
        vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6790,6 +7043,7 @@ static int __init mem_cgroup_init(void)
 {
        hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
        enable_swap_cgroup();
+       mem_cgroup_soft_limit_tree_init();
        memcg_stock_init();
        return 0;
 }