author		Kevin Hilman <khilman@linaro.org>	2013-10-14 18:29:10 -0400
committer	Kevin Hilman <khilman@linaro.org>	2013-10-14 18:29:24 -0400
commit		7587b5965f57c1c4d6fd1377432a8473f5cd449a (patch)
tree		85b7ced77656ac142369c6436df02b51d6d13527 /mm/memcontrol.c
parent		6a9d10d529db69244baab335fb02caba3d6ebbc9 (diff)
parent		8d71528343c69ce387bd5fdb4fd8dc2b9f69d97c (diff)
Merge tag 'omap-for-v3.13/quirk-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap into next/dt
From Tony Lindgren:
Changes needed to prepare for making omap3 device-tree-only:

- Always build in board-generic, and add pdata quirks and auxdata
  support for it, so that all the pdata-related quirks live in the
  same place.
- Merge in the drivers/pinctrl changes that are needed for PM to
  keep working on omap3, and that other omaps will eventually need
  as well. The three pinctrl-related patches have been acked by
  Linus Walleij and are pulled into both the pinctrl tree and this
  branch.
- A few defconfig changes to enable the drivers that are needed.
* tag 'omap-for-v3.13/quirk-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap: (523 commits)
ARM: configs: omap2plus_defconfig: enable dwc3 and dependencies
ARM: OMAP2+: Add WLAN modules and of_serial to omap2plus_defconfig
ARM: OMAP2+: Run make savedefconfig on omap2plus_defconfig to shrink it
ARM: OMAP2+: Add minimal 8250 support for GPMC
ARM: OMAP2+: Use pdata quirks for wl12xx for omap3 evm and zoom3
ARM: OMAP: Move DT wake-up event handling over to use pinctrl-single-omap
ARM: OMAP2+: Add support for auxdata
pinctrl: single: Add support for auxdata
pinctrl: single: Add support for wake-up interrupts
pinctrl: single: Prepare for supporting SoC specific features
ARM: OMAP2+: igep0020: use display init from dss-common
ARM: OMAP2+: pdata-quirks: add legacy display init for IGEPv2 board
Linux 3.12-rc4
Signed-off-by: Kevin Hilman <khilman@linaro.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	560
1 file changed, 407 insertions(+), 153 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..1c52ddbc839b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -160,6 +161,10 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
+	struct rb_node tree_node;	/* RB tree node */
+	unsigned long long usage_in_excess;/* Set to the value by which */
+					/* the soft limit is exceeded*/
+	bool on_tree;
 	struct mem_cgroup *memcg;	/* Back pointer, we cannot */
 					/* use container_of */
 };
@@ -168,6 +173,26 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
@@ -303,22 +328,6 @@ struct mem_cgroup {
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
-	/*
-	 * Protects soft_contributed transitions.
-	 * See mem_cgroup_update_soft_limit
-	 */
-	spinlock_t soft_lock;
-
-	/*
-	 * If true then this group has increased parents' children_in_excess
-	 * when it got over the soft limit.
-	 * When a group falls bellow the soft limit, parents' children_in_excess
-	 * is decreased and soft_contributed changed to false.
-	 */
-	bool soft_contributed;
-
-	/* Number of children that are in soft limit excess */
-	atomic_t children_in_excess;
 
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
@@ -422,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+	unsigned long long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used.
+	 * because their event counter is not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+		excess = res_counter_soft_limit_excess(&memcg->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node(node) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
+		}
+	}
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will to add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+		!css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
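[The hunk above is the core of the restored design: every mem_cgroup_per_zone that exceeds its soft limit sits on a per-node, per-zone RB-tree ordered by usage_in_excess, with ties going right, so the rightmost node (rb_last()) is always the biggest offender. A minimal sketch of that invariant follows -- hypothetical userspace C, not kernel code; a plain unbalanced binary search tree stands in for <linux/rbtree.h>, and the names only mirror the patch.]

/*
 * Illustrative model only, NOT kernel code: same ordering rule as
 * __mem_cgroup_insert_exceeded() (zero excess stays off-tree, ties go
 * right), same selection rule as rb_last() (rightmost = largest excess).
 */
#include <stdio.h>

struct mz {				/* stand-in for mem_cgroup_per_zone */
	unsigned long long usage_in_excess;
	const char *name;
	struct mz *left, *right;
};

static struct mz *insert(struct mz *root, struct mz *node)
{
	if (!node->usage_in_excess)
		return root;		/* not over its soft limit: stay off-tree */
	if (!root)
		return node;
	if (node->usage_in_excess < root->usage_in_excess)
		root->left = insert(root->left, node);
	else				/* >=: equal excesses go right */
		root->right = insert(root->right, node);
	return root;
}

static struct mz *largest(struct mz *root)
{
	while (root && root->right)	/* mirrors rb_last() */
		root = root->right;
	return root;
}

int main(void)
{
	struct mz a = { 4096, "A", NULL, NULL };
	struct mz b = { 16384, "B", NULL, NULL };
	struct mz c = { 0, "C", NULL, NULL };	/* under its limit: never inserted */
	struct mz *root = NULL;

	root = insert(root, &a);
	root = insert(root, &b);
	root = insert(root, &c);
	printf("reclaim first from %s\n", largest(root)->name);	/* -> B */
	return 0;
}

[Sending ties right is what the "can't avoid mem cgroups that are over their soft limit by the same amount" comment is getting at: equal offenders all stay reachable along the rightmost path instead of shadowing one another.]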
@@ -822,48 +990,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 }
 
 /*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
-	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
-	struct mem_cgroup *parent = memcg;
-	int delta = 0;
-
-	spin_lock(&memcg->soft_lock);
-	if (excess) {
-		if (!memcg->soft_contributed) {
-			delta = 1;
-			memcg->soft_contributed = true;
-		}
-	} else {
-		if (memcg->soft_contributed) {
-			delta = -1;
-			memcg->soft_contributed = false;
-		}
-	}
-
-	/*
-	 * Necessary to update all ancestors when hierarchy is used
-	 * because their event counter is not touched.
-	 * We track children even outside the hierarchy for the root
-	 * cgroup because tree walk starting at root should visit
-	 * all cgroups and we want to prevent from pointless tree
-	 * walk if no children is below the limit.
-	 */
-	while (delta && (parent = parent_mem_cgroup(parent)))
-		atomic_add(delta, &parent->children_in_excess);
-	if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-		atomic_add(delta, &root_mem_cgroup->children_in_excess);
-	spin_unlock(&memcg->soft_lock);
-}
-
-/*
  * Check events in order.
  *
  */
@@ -886,7 +1012,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
 		mem_cgroup_threshold(memcg);
 		if (unlikely(do_softlimit))
-			mem_cgroup_update_soft_limit(memcg);
+			mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 		if (unlikely(do_numainfo))
 			atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1055,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-		mem_cgroup_iter_filter cond)
-{
-	if (!cond)
-		return VISIT;
-	return cond(memcg, root);
-}
-
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1062,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+		struct mem_cgroup *last_visited)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -963,31 +1080,11 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-		switch (mem_cgroup_filter(mem, root, cond)) {
-		case SKIP:
+		if (css_tryget(&mem->css))
+			return mem;
+		else {
 			prev_css = next_css;
 			goto skip_node;
-		case SKIP_TREE:
-			if (mem == root)
-				return NULL;
-			/*
-			 * css_rightmost_descendant is not an optimal way to
-			 * skip through a subtree (especially for imbalanced
-			 * trees leaning to right) but that's what we have right
-			 * now. More effective solution would be traversing
-			 * right-up for first non-NULL without calling
-			 * css_next_descendant_pre afterwards.
-			 */
-			prev_css = css_rightmost_descendant(next_css);
-			goto skip_node;
-		case VISIT:
-			if (css_tryget(&mem->css))
-				return mem;
-			else {
-				prev_css = next_css;
-				goto skip_node;
-			}
-			break;
 		}
 	}
 
@@ -1051,7 +1148,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1160,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond)
+				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled()) {
-		/* first call must return non-NULL, second return NULL */
-		return (struct mem_cgroup *)(unsigned long)!prev;
-	}
+	if (mem_cgroup_disabled())
+		return NULL;
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -1086,9 +1179,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		if (mem_cgroup_filter(root, root, cond) == VISIT)
-			return root;
-		return NULL;
+		return root;
 	}
 
 	rcu_read_lock();
@@ -1111,7 +1202,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
 			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1213,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 				reclaim->generation = iter->generation;
 		}
 
-		/*
-		 * We have finished the whole tree walk or no group has been
-		 * visited because filter told us to skip the root node.
-		 */
-		if (!memcg && (prev || (cond && !last_visited)))
+		if (prev && !memcg)
 			goto out_unlock;
 	}
 out_unlock:
@@ -1767,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1790,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,50 +1944,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, memcg->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_MEMORY) {
+		if (node_isset(nid, memcg->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *	a) it is over its soft limit
- *	b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
-	struct mem_cgroup *parent;
-
-	if (!memcg)
-		memcg = root_mem_cgroup;
-	parent = memcg;
-
-	if (res_counter_soft_limit_excess(&memcg->res))
-		return VISIT;
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
 
-	/*
-	 * If any parent up to the root in the hierarchy is over its soft limit
-	 * then we have to obey and reclaim from this group as well.
-	 */
-	while ((parent = parent_mem_cgroup(parent))) {
-		if (res_counter_soft_limit_excess(&parent->res))
-			return VISIT;
-		if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   struct zone *zone,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.zone = zone,
+		.priority = 0,
+	};
+
+	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not to excessive so as to
+				 * reclaim too much, nor too less that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		if (!mem_cgroup_reclaimable(victim, false))
+			continue;
+		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+						     zone, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!res_counter_soft_limit_excess(&root_memcg->res))
 			break;
 	}
-
-	if (!atomic_read(&memcg->children_in_excess))
-		return SKIP_TREE;
-	return SKIP;
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
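[mem_cgroup_soft_reclaim() above walks the hierarchy with mem_cgroup_iter() and stops once a quarter of the root's soft-limit excess (excess >> 2, in pages) has been reclaimed, or after too many empty round-trips. Reduced to just that arithmetic, and with a made-up 512-pages-per-pass reclaim rate, the cut-off behaves like the following hypothetical sketch; it is a simplification, not kernel code (the real loop applies these checks only after a full hierarchy round-trip).]

#include <stdio.h>

#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100	/* same cap as the hunk above */

int main(void)
{
	unsigned long excess = 16384;	/* soft-limit excess, in pages (64 MB at 4 KB) */
	unsigned long total = 0;	/* pages reclaimed so far */
	int loop = 0;

	while (1) {
		total += 512;		/* pretend one pass frees 512 pages */
		loop++;
		/* the two exit conditions from mem_cgroup_soft_reclaim() */
		if (total >= (excess >> 2) || loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)
			break;
	}
	printf("stopped after %d passes, %lu pages (target %lu)\n",
	       loop, total, excess >> 2);	/* -> 8 passes, 4096 pages */
	return 0;
}

[The quarter-of-excess target is a compromise the comment in the hunk spells out: reclaim enough that the walk does not keep returning to the same cgroup, but not so much that one soft-limit pass empties it.]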
@@ -2812,7 +2953,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);
 
 	/*
-	 * "charge_statistics" updated event counter.
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -4647,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
+	unsigned long nr_scanned;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+	/*
+	 * This loop can run a while, specially if mem_cgroup's continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		nr_scanned = 0;
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+						    gfp_mask, &nr_scanned);
+		nr_reclaimed += reclaimed;
+		*total_scanned += nr_scanned;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have aded the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz)
+					css_put(&next_mz->memcg->css);
+				else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->memcg->res);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->memcg->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->memcg->css);
+	return nr_reclaimed;
+}
+
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
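[mem_cgroup_soft_limit_reclaim() above is a rotation: take the biggest offender off the tree, reclaim from it, recompute its excess, and re-file it (no tree ops when the excess has dropped to zero), so groups that stay over their soft limit keep cycling to the front. A self-contained toy model of that policy, with a linear scan standing in for the per-zone rbtree and invented numbers -- an illustrative sketch, not the kernel's code:]

#include <stdio.h>

struct group {
	const char *name;
	unsigned long excess;	/* pages over the soft limit; 0 = off the tree */
};

/* linear stand-in for mem_cgroup_largest_soft_limit_node() */
static struct group *largest(struct group *g, int n)
{
	struct group *best = NULL;
	for (int i = 0; i < n; i++)
		if (g[i].excess && (!best || g[i].excess > best->excess))
			best = &g[i];
	return best;
}

int main(void)
{
	struct group groups[] = { {"A", 300}, {"B", 1000}, {"C", 0} };
	struct group *victim;

	/* three rounds of "reclaim up to 400 pages from the biggest offender" */
	for (int round = 1; round <= 3 && (victim = largest(groups, 3)); round++) {
		unsigned long freed = victim->excess < 400 ? victim->excess : 400;
		victim->excess -= freed;	/* re-file with updated excess */
		printf("round %d: reclaimed %lu from %s (excess now %lu)\n",
		       round, freed, victim->name, victim->excess);
	}
	return 0;	/* B is picked twice before A ever comes up */
}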
@@ -5911,6 +6146,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -5966,6 +6203,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();
 
+	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node(node)
@@ -6002,6 +6240,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node(node) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		BUG_ON(!rtpn);
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6031,7 +6292,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
-	spin_lock_init(&memcg->soft_lock);
 
 	return &memcg->css;
 
@@ -6109,13 +6369,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
-	if (memcg->soft_contributed) {
-		while ((memcg = parent_mem_cgroup(memcg)))
-			atomic_dec(&memcg->children_in_excess);
-
-		if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-			atomic_dec(&root_mem_cgroup->children_in_excess);
-	}
 	mem_cgroup_destroy_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6790,6 +7043,7 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
+	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }