 mm/memcontrol.c | 265 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 263 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 47cdc7eb1a6b..852dbec07ce6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -124,6 +125,7 @@ static const char * const mem_cgroup_lru_names[] = {
  */
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
+	MEM_CGROUP_TARGET_SOFTLIMIT,
 	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
@@ -159,6 +161,10 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
 };
@@ -167,6 +173,26 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
@@ -405,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -631,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+	unsigned long long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used,
+	 * because their event counter is not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+		excess = res_counter_soft_limit_excess(&memcg->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node(node) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
+		}
+	}
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+		!css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -789,6 +974,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 	case MEM_CGROUP_TARGET_THRESH:
 		next = val + THRESHOLDS_EVENTS_TARGET;
 		break;
+	case MEM_CGROUP_TARGET_SOFTLIMIT:
+		next = val + SOFTLIMIT_EVENTS_TARGET;
+		break;
 	case MEM_CGROUP_TARGET_NUMAINFO:
 		next = val + NUMAINFO_EVENTS_TARGET;
 		break;
@@ -811,8 +999,11 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 	/* threshold event is triggered in finer grain than soft limit */
 	if (unlikely(mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_THRESH))) {
+		bool do_softlimit;
 		bool do_numainfo __maybe_unused;
 
+		do_softlimit = mem_cgroup_event_ratelimit(memcg,
+						MEM_CGROUP_TARGET_SOFTLIMIT);
 #if MAX_NUMNODES > 1
 		do_numainfo = mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_NUMAINFO);
@@ -820,6 +1011,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 		preempt_enable();
 
 	mem_cgroup_threshold(memcg);
+	if (unlikely(do_softlimit))
+		mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 	if (unlikely(do_numainfo))
 		atomic_inc(&memcg->numainfo_events);
@@ -1661,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1684,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1751,12 +1944,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
+/*
+ * Check all nodes to see whether they contain reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information. We need to double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_nodes.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, memcg->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_MEMORY) {
+		if (node_isset(nid, memcg->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
 #endif
 
 /*
@@ -2692,7 +2924,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);
 
 	/*
2695 * "charge_statistics" updated event counter. 2927 * "charge_statistics" updated event counter. Then, check it.
2928 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2929 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -5791,6 +6025,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -5846,6 +6082,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();
 
+	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node(node)
@@ -5882,6 +6119,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node(node) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		BUG_ON(!rtpn);
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6662,6 +6922,7 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
+	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }