 mm/memcontrol.c | 265 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 263 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 47cdc7eb1a6b..852dbec07ce6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -124,6 +125,7 @@ static const char * const mem_cgroup_lru_names[] = {
  */
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
+	MEM_CGROUP_TARGET_SOFTLIMIT,
 	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
@@ -159,6 +161,10 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
 };
@@ -167,6 +173,26 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
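
Note on the hunk above: reclaim in this kernel is driven per zone, so memcgs that exceed their soft limit are tracked in one RB-tree per (node, zone) rather than in a single global tree, and each tree carries its own spinlock. A minimal sketch of how the three new structs nest, assuming the global root has already been populated at init; the helper name here is made up, the patch adds equivalent helpers further down:

/* Illustrative only: from the global root down to one zone's tree + lock. */
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_lookup(int nid, int zid)
{
	struct mem_cgroup_tree_per_node *per_node;

	per_node = soft_limit_tree.rb_tree_per_node[nid];	/* allocated at init */
	return &per_node->rb_tree_per_zone[zid];		/* rb_root + lock */
}
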
@@ -405,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -631,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+	unsigned long long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used.
+	 * because their event counter is not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+		excess = res_counter_soft_limit_excess(&memcg->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node(node) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
+		}
+	}
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will to add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+		!css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
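
The RB-tree maintained by the hunk above is keyed by usage_in_excess, with ties pushed to the right, so rb_last() (the rightmost node) is always the memcg/zone pair that exceeds its soft limit by the most. This patch only adds the infrastructure; a reclaim-side consumer (not part of this diff) might look roughly like the sketch below, where the reclaim call itself is a placeholder and the loop bound reuses MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS from an earlier hunk:

/* Hypothetical caller sketch, not part of this patch. */
static unsigned long soft_limit_reclaim_sketch(struct zone *zone)
{
	struct mem_cgroup_tree_per_zone *mctz;
	struct mem_cgroup_per_zone *mz;
	unsigned long reclaimed = 0;
	int loop = 0;

	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
	do {
		/* worst offender first: largest usage_in_excess */
		mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		/* ... reclaim pages charged to mz->memcg here ... */

		/* re-insert with the recomputed excess so ordering stays valid */
		spin_lock(&mctz->lock);
		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz,
				res_counter_soft_limit_excess(&mz->memcg->res));
		spin_unlock(&mctz->lock);
		css_put(&mz->memcg->css);	/* pairs with css_tryget() */
	} while (++loop < MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS);

	return reclaimed;
}
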
@@ -789,6 +974,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 	case MEM_CGROUP_TARGET_THRESH:
 		next = val + THRESHOLDS_EVENTS_TARGET;
 		break;
+	case MEM_CGROUP_TARGET_SOFTLIMIT:
+		next = val + SOFTLIMIT_EVENTS_TARGET;
+		break;
 	case MEM_CGROUP_TARGET_NUMAINFO:
 		next = val + NUMAINFO_EVENTS_TARGET;
 		break;
@@ -811,8 +999,11 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 	/* threshold event is triggered in finer grain than soft limit */
 	if (unlikely(mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_THRESH))) {
+		bool do_softlimit;
 		bool do_numainfo __maybe_unused;
 
+		do_softlimit = mem_cgroup_event_ratelimit(memcg,
+						MEM_CGROUP_TARGET_SOFTLIMIT);
 #if MAX_NUMNODES > 1
 		do_numainfo = mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_NUMAINFO);
@@ -820,6 +1011,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 		preempt_enable();
 
 		mem_cgroup_threshold(memcg);
+		if (unlikely(do_softlimit))
+			mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 		if (unlikely(do_numainfo))
 			atomic_inc(&memcg->numainfo_events);
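
The two hunks above keep tree maintenance off the hot path: each charge/uncharge bumps a per-cpu event counter, and the soft-limit tree is only re-sorted once the counter has advanced past the next SOFTLIMIT_EVENTS_TARGET mark, a coarser interval than the threshold check. A simplified sketch of that ratelimit arithmetic, with made-up parameter names (the real check lives in mem_cgroup_event_ratelimit() and operates on per-cpu state):

/* Simplified ratelimit sketch; not the in-tree implementation. */
static bool event_target_hit(unsigned long nr_events,
			     unsigned long *next_target,
			     unsigned long interval)
{
	if ((long)(nr_events - *next_target) < 0)
		return false;			/* too soon, skip the work */
	*next_target = nr_events + interval;	/* e.g. SOFTLIMIT_EVENTS_TARGET */
	return true;				/* run mem_cgroup_update_tree() */
}
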
@@ -1661,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1684,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1751,12 +1944,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, memcg->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_MEMORY) {
+		if (node_isset(nid, memcg->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
 #endif
 
 /*
@@ -2692,7 +2924,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);
 
 	/*
-	 * "charge_statistics" updated event counter.
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -5791,6 +6025,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -5846,6 +6082,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();
 
+	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node(node)
@@ -5882,6 +6119,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node(node) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		BUG_ON(!rtpn);
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
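
One detail in the init hunk above: each per-node structure is allocated with kzalloc_node() against the node it describes, but memoryless nodes (those without N_NORMAL_MEMORY) fall back to node -1, meaning no placement preference. The same idiom in isolation, with a made-up helper name (NUMA_NO_NODE is the symbolic spelling of -1):

/* Illustrative helper, not in this patch. */
static int preferred_alloc_node(int node)
{
	return node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
}
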
@@ -6662,6 +6922,7 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
+	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }