From 8f939a9f4c769500c58c31ec81ce17297388f4bf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:34 -0700 Subject: revert "memcg: track all children over limit in the root" Revert commit 1be171d60bdd ("memcg: track all children over limit in the root") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5ff3ce13029..5ee0af463f2e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -851,15 +851,9 @@ static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) /* * Necessary to update all ancestors when hierarchy is used * because their event counter is not touched. - * We track children even outside the hierarchy for the root - * cgroup because tree walk starting at root should visit - * all cgroups and we want to prevent from pointless tree - * walk if no children is below the limit. */ while (delta && (parent = parent_mem_cgroup(parent))) atomic_add(delta, &parent->children_in_excess); - if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) - atomic_add(delta, &root_mem_cgroup->children_in_excess); spin_unlock(&memcg->soft_lock); } @@ -6112,9 +6106,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) if (memcg->soft_contributed) { while ((memcg = parent_mem_cgroup(memcg))) atomic_dec(&memcg->children_in_excess); - - if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) - atomic_dec(&root_mem_cgroup->children_in_excess); } mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); -- cgit v1.2.2 From 3120055e869f2a208480f238680d097eec8f0e02 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:35 -0700 Subject: revert "memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything" Revert commit e839b6a1c8d0 ("memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5ee0af463f2e..5cf7726764cc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1872,11 +1872,7 @@ enum mem_cgroup_filter_t mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { - struct mem_cgroup *parent; - - if (!memcg) - memcg = root_mem_cgroup; - parent = memcg; + struct mem_cgroup *parent = memcg; if (res_counter_soft_limit_excess(&memcg->res)) return VISIT; -- cgit v1.2.2 From 30361e51cae7a4df3fec89f935a450a6fe6f16fa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:36 -0700 Subject: revert "memcg: track children in soft limit excess to improve soft limit" Revert commit 7d910c054be4 ("memcg: track children in soft limit excess to improve soft limit") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 71 --------------------------------------------------------- 1 file changed, 71 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5cf7726764cc..916892c2b8e0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -124,7 +124,6 @@ static const char * const mem_cgroup_lru_names[] = { */ enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, - MEM_CGROUP_TARGET_SOFTLIMIT, MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; @@ -303,22 +302,6 @@ struct mem_cgroup { atomic_t numainfo_events; atomic_t numainfo_updating; #endif - /* - * Protects soft_contributed transitions. - * See mem_cgroup_update_soft_limit - */ - spinlock_t soft_lock; - - /* - * If true then this group has increased parents' children_in_excess - * when it got over the soft limit. - * When a group falls bellow the soft limit, parents' children_in_excess - * is decreased and soft_contributed changed to false. - */ - bool soft_contributed; - - /* Number of children that are in soft limit excess */ - atomic_t children_in_excess; struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ @@ -806,9 +789,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, case MEM_CGROUP_TARGET_THRESH: next = val + THRESHOLDS_EVENTS_TARGET; break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; case MEM_CGROUP_TARGET_NUMAINFO: next = val + NUMAINFO_EVENTS_TARGET; break; @@ -821,42 +801,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, return false; } -/* - * Called from rate-limited memcg_check_events when enough - * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure - * that all the parents up the hierarchy will be notified that this group - * is in excess or that it is not in excess anymore. mmecg->soft_contributed - * makes the transition a single action whenever the state flips from one to - * the other. - */ -static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) -{ - unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); - struct mem_cgroup *parent = memcg; - int delta = 0; - - spin_lock(&memcg->soft_lock); - if (excess) { - if (!memcg->soft_contributed) { - delta = 1; - memcg->soft_contributed = true; - } - } else { - if (memcg->soft_contributed) { - delta = -1; - memcg->soft_contributed = false; - } - } - - /* - * Necessary to update all ancestors when hierarchy is used - * because their event counter is not touched. - */ - while (delta && (parent = parent_mem_cgroup(parent))) - atomic_add(delta, &parent->children_in_excess); - spin_unlock(&memcg->soft_lock); -} - /* * Check events in order. * @@ -867,11 +811,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { - bool do_softlimit; bool do_numainfo __maybe_unused; - do_softlimit = mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); #if MAX_NUMNODES > 1 do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); @@ -879,8 +820,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); mem_cgroup_threshold(memcg); - if (unlikely(do_softlimit)) - mem_cgroup_update_soft_limit(memcg); #if MAX_NUMNODES > 1 if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); @@ -1864,9 +1803,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * hierarchy if * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit - * - * If the given group doesn't have any children over the limit then it - * doesn't make any sense to iterate its subtree. */ enum mem_cgroup_filter_t mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, @@ -1888,8 +1824,6 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, break; } - if (!atomic_read(&memcg->children_in_excess)) - return SKIP_TREE; return SKIP; } @@ -6021,7 +5955,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); - spin_lock_init(&memcg->soft_lock); return &memcg->css; @@ -6099,10 +6032,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); - if (memcg->soft_contributed) { - while ((memcg = parent_mem_cgroup(memcg))) - atomic_dec(&memcg->children_in_excess); - } mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } -- cgit v1.2.2 From 694fbc0fe78518d06efa63910bf4ecee660e7852 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:37 -0700 Subject: revert "memcg: enhance memcg iterator to support predicates" Revert commit de57780dc659 ("memcg: enhance memcg iterator to support predicates") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 70 +++++++++++++-------------------------------------------- 1 file changed, 15 insertions(+), 55 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 916892c2b8e0..65e7bec4b0f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -862,15 +862,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -static enum mem_cgroup_filter_t -mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, - mem_cgroup_iter_filter cond) -{ - if (!cond) - return VISIT; - return cond(memcg, root); -} - /* * Returns a next (in a pre-order walk) alive memcg (with elevated css * ref. count) or NULL if the whole root's subtree has been visited. @@ -878,7 +869,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, * helper function to be used by mem_cgroup_iter */ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) + struct mem_cgroup *last_visited) { struct cgroup_subsys_state *prev_css, *next_css; @@ -896,31 +887,11 @@ skip_node: if (next_css) { struct mem_cgroup *mem = mem_cgroup_from_css(next_css); - switch (mem_cgroup_filter(mem, root, cond)) { - case SKIP: + if (css_tryget(&mem->css)) + return mem; + else { prev_css = next_css; goto skip_node; - case SKIP_TREE: - if (mem == root) - return NULL; - /* - * css_rightmost_descendant is not an optimal way to - * skip through a subtree (especially for imbalanced - * trees leaning to right) but that's what we have right - * now. More effective solution would be traversing - * right-up for first non-NULL without calling - * css_next_descendant_pre afterwards. - */ - prev_css = css_rightmost_descendant(next_css); - goto skip_node; - case VISIT: - if (css_tryget(&mem->css)) - return mem; - else { - prev_css = next_css; - goto skip_node; - } - break; } } @@ -984,7 +955,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks - * @cond: filter for visited nodes, NULL for no filter * * Returns references to children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. @@ -997,18 +967,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * divide up the memcgs in the hierarchy among all concurrent * reclaimers operating on the same zone and priority. */ -struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim, - mem_cgroup_iter_filter cond) + struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - if (mem_cgroup_disabled()) { - /* first call must return non-NULL, second return NULL */ - return (struct mem_cgroup *)(unsigned long)!prev; - } + if (mem_cgroup_disabled()) + return NULL; if (!root) root = root_mem_cgroup; @@ -1019,9 +986,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) goto out_css_put; - if (mem_cgroup_filter(root, root, cond) == VISIT) - return root; - return NULL; + return root; } rcu_read_lock(); @@ -1044,7 +1009,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, last_visited = mem_cgroup_iter_load(iter, root, &seq); } - memcg = __mem_cgroup_iter_next(root, last_visited, cond); + memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { mem_cgroup_iter_update(iter, last_visited, memcg, seq); @@ -1055,11 +1020,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, reclaim->generation = iter->generation; } - /* - * We have finished the whole tree walk or no group has been - * visited because filter told us to skip the root node. - */ - if (!memcg && (prev || (cond && !last_visited))) + if (prev && !memcg) goto out_unlock; } out_unlock: @@ -1804,14 +1765,13 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -enum mem_cgroup_filter_t -mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { struct mem_cgroup *parent = memcg; if (res_counter_soft_limit_excess(&memcg->res)) - return VISIT; + return true; /* * If any parent up to the root in the hierarchy is over its soft limit @@ -1819,12 +1779,12 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, */ while ((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) - return VISIT; + return true; if (parent == root) break; } - return SKIP; + return false; } static DEFINE_SPINLOCK(memcg_oom_lock); -- cgit v1.2.2 From b1aff7fcf86c88472b0a70f15d89d7a4adba07bb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:38 -0700 Subject: revert "vmscan, memcg: do softlimit reclaim also for targeted reclaim" Revert commit a5b7c87f9207 ("vmscan, memcg: do softlimit reclaim also for targeted reclaim") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 65e7bec4b0f0..47cdc7eb1a6b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1760,13 +1760,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) #endif /* - * A group is eligible for the soft limit reclaim under the given root - * hierarchy if - * a) it is over its soft limit + * A group is eligible for the soft limit reclaim if + * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, - struct mem_cgroup *root) +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) { struct mem_cgroup *parent = memcg; @@ -1774,14 +1772,12 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, return true; /* - * If any parent up to the root in the hierarchy is over its soft limit - * then we have to obey and reclaim from this group as well. + * If any parent up the hierarchy is over its soft limit then we + * have to obey and reclaim from this group as well. */ while ((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) return true; - if (parent == root) - break; } return false; -- cgit v1.2.2 From bb4cc1a8b5eaf3b9e5707d7c270400b05d11a2b7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:40 -0700 Subject: revert "memcg: get rid of soft-limit tree infrastructure" Revert commit e883110aad71 ("memcg: get rid of soft-limit tree infrastructure") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 265 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 263 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47cdc7eb1a6b..852dbec07ce6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -124,6 +125,7 @@ static const char * const mem_cgroup_lru_names[] = { */ enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; @@ -159,6 +161,10 @@ struct mem_cgroup_per_zone { struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct rb_node tree_node; /* RB tree node */ + unsigned long long usage_in_excess;/* Set to the value by which */ + /* the soft limit is exceeded*/ + bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -167,6 +173,26 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; +/* + * Cgroups above their limits are maintained in a RB-Tree, independent of + * their hierarchy representation + */ + +struct mem_cgroup_tree_per_zone { + struct rb_root rb_root; + spinlock_t lock; +}; + +struct mem_cgroup_tree_per_node { + struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; +}; + +struct mem_cgroup_tree { + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; +}; + +static struct mem_cgroup_tree soft_limit_tree __read_mostly; + struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; u64 threshold; @@ -405,6 +431,7 @@ static bool move_file(void) * limit reclaim to prevent infinite loops, if they ever occur. */ #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, @@ -631,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) return mem_cgroup_zoneinfo(memcg, nid, zid); } +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_node_zone(int nid, int zid) +{ + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_from_page(struct page *page) +{ + int nid = page_to_nid(page); + int zid = page_zonenum(page); + + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static void +__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) +{ + struct rb_node **p = &mctz->rb_root.rb_node; + struct rb_node *parent = NULL; + struct mem_cgroup_per_zone *mz_node; + + if (mz->on_tree) + return; + + mz->usage_in_excess = new_usage_in_excess; + if (!mz->usage_in_excess) + return; + while (*p) { + parent = *p; + mz_node = rb_entry(parent, struct mem_cgroup_per_zone, + tree_node); + if (mz->usage_in_excess < mz_node->usage_in_excess) + p = &(*p)->rb_left; + /* + * We can't avoid mem cgroups that are over their soft + * limit by the same amount + */ + else if (mz->usage_in_excess >= mz_node->usage_in_excess) + p = &(*p)->rb_right; + } + rb_link_node(&mz->tree_node, parent, p); + rb_insert_color(&mz->tree_node, &mctz->rb_root); + mz->on_tree = true; +} + +static void +__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + if (!mz->on_tree) + return; + rb_erase(&mz->tree_node, &mctz->rb_root); + mz->on_tree = false; +} + +static void +mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + spin_lock(&mctz->lock); + __mem_cgroup_remove_exceeded(memcg, mz, mctz); + spin_unlock(&mctz->lock); +} + + +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) +{ + unsigned long long excess; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + int nid = page_to_nid(page); + int zid = page_zonenum(page); + mctz = soft_limit_tree_from_page(page); + + /* + * Necessary to update all ancestors when hierarchy is used. + * because their event counter is not touched. + */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + mz = mem_cgroup_zoneinfo(memcg, nid, zid); + excess = res_counter_soft_limit_excess(&memcg->res); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + spin_lock(&mctz->lock); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(memcg, mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. + */ + __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); + spin_unlock(&mctz->lock); + } + } +} + +static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) +{ + int node, zone; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + + for_each_node(node) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + mz = mem_cgroup_zoneinfo(memcg, node, zone); + mctz = soft_limit_tree_node_zone(node, zone); + mem_cgroup_remove_exceeded(memcg, mz, mctz); + } + } +} + +static struct mem_cgroup_per_zone * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct rb_node *rightmost = NULL; + struct mem_cgroup_per_zone *mz; + +retry: + mz = NULL; + rightmost = rb_last(&mctz->rb_root); + if (!rightmost) + goto done; /* Nothing to reclaim from */ + + mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); + /* + * Remove the node now but someone else can add it back, + * we will to add it back at the end of reclaim to its correct + * position in the tree. + */ + __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + if (!res_counter_soft_limit_excess(&mz->memcg->res) || + !css_tryget(&mz->memcg->css)) + goto retry; +done: + return mz; +} + +static struct mem_cgroup_per_zone * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct mem_cgroup_per_zone *mz; + + spin_lock(&mctz->lock); + mz = __mem_cgroup_largest_soft_limit_node(mctz); + spin_unlock(&mctz->lock); + return mz; +} + /* * Implementation Note: reading percpu statistics for memcg. * @@ -789,6 +974,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, case MEM_CGROUP_TARGET_THRESH: next = val + THRESHOLDS_EVENTS_TARGET; break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; case MEM_CGROUP_TARGET_NUMAINFO: next = val + NUMAINFO_EVENTS_TARGET; break; @@ -811,8 +999,11 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit; bool do_numainfo __maybe_unused; + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); #if MAX_NUMNODES > 1 do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); @@ -820,6 +1011,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); mem_cgroup_threshold(memcg); + if (unlikely(do_softlimit)) + mem_cgroup_update_tree(memcg, page); #if MAX_NUMNODES > 1 if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); @@ -1661,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, return total; } -#if MAX_NUMNODES > 1 /** * test_mem_cgroup_node_reclaimable * @memcg: the target memcg @@ -1684,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, return false; } +#if MAX_NUMNODES > 1 /* * Always updating the nodemask is not very good - even if we have an empty @@ -1751,12 +1944,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) return node; } +/* + * Check all nodes whether it contains reclaimable pages or not. + * For quick scan, we make use of scan_nodes. This will allow us to skip + * unused nodes. But scan_nodes is lazily updated and may not cotain + * enough new information. We need to do double check. + */ +static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) +{ + int nid; + + /* + * quick check...making use of scan_node. + * We can skip unused nodes. + */ + if (!nodes_empty(memcg->scan_nodes)) { + for (nid = first_node(memcg->scan_nodes); + nid < MAX_NUMNODES; + nid = next_node(nid, memcg->scan_nodes)) { + + if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) + return true; + } + } + /* + * Check rest of nodes. + */ + for_each_node_state(nid, N_MEMORY) { + if (node_isset(nid, memcg->scan_nodes)) + continue; + if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) + return true; + } + return false; +} + #else int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { return 0; } +static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) +{ + return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); +} #endif /* @@ -2692,7 +2924,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, unlock_page_cgroup(pc); /* - * "charge_statistics" updated event counter. + * "charge_statistics" updated event counter. Then, check it. + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. */ memcg_check_events(memcg, page); } @@ -5791,6 +6025,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; lruvec_init(&mz->lruvec); + mz->usage_in_excess = 0; + mz->on_tree = false; mz->memcg = memcg; } memcg->nodeinfo[node] = pn; @@ -5846,6 +6082,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) int node; size_t size = memcg_size(); + mem_cgroup_remove_from_trees(memcg); free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node(node) @@ -5882,6 +6119,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(parent_mem_cgroup); +static void __init mem_cgroup_soft_limit_tree_init(void) +{ + struct mem_cgroup_tree_per_node *rtpn; + struct mem_cgroup_tree_per_zone *rtpz; + int tmp, node, zone; + + for_each_node(node) { + tmp = node; + if (!node_state(node, N_NORMAL_MEMORY)) + tmp = -1; + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); + BUG_ON(!rtpn); + + soft_limit_tree.rb_tree_per_node[node] = rtpn; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + rtpz = &rtpn->rb_tree_per_zone[zone]; + rtpz->rb_root = RB_ROOT; + spin_lock_init(&rtpz->lock); + } + } +} + static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -6662,6 +6922,7 @@ static int __init mem_cgroup_init(void) { hotcpu_notifier(memcg_cpu_hotplug_callback, 0); enable_swap_cgroup(); + mem_cgroup_soft_limit_tree_init(); memcg_stock_init(); return 0; } -- cgit v1.2.2 From 0608f43da64a1f1c42507304b5f25bc8b1227aa4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:41 -0700 Subject: revert "memcg, vmscan: integrate soft reclaim tighter with zone shrinking code" Revert commit 3b38722efd9f ("memcg, vmscan: integrate soft reclaim tighter with zone shrinking code") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 142 insertions(+), 21 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 852dbec07ce6..1c52ddbc839b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1991,28 +1991,57 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) } #endif -/* - * A group is eligible for the soft limit reclaim if - * a) it is over its soft limit - * b) any parent up the hierarchy is over its soft limit - */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) -{ - struct mem_cgroup *parent = memcg; - - if (res_counter_soft_limit_excess(&memcg->res)) - return true; - - /* - * If any parent up the hierarchy is over its soft limit then we - * have to obey and reclaim from this group as well. - */ - while ((parent = parent_mem_cgroup(parent))) { - if (res_counter_soft_limit_excess(&parent->res)) - return true; +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, + struct zone *zone, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + struct mem_cgroup *victim = NULL; + int total = 0; + int loop = 0; + unsigned long excess; + unsigned long nr_scanned; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = 0, + }; + + excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; + + while (1) { + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); + if (!victim) { + loop++; + if (loop >= 2) { + /* + * If we have not been able to reclaim + * anything, it might because there are + * no reclaimable pages under this hierarchy + */ + if (!total) + break; + /* + * We want to do more targeted reclaim. + * excess >> 2 is not to excessive so as to + * reclaim too much, nor too less that we keep + * coming back to reclaim from this cgroup + */ + if (total >= (excess >> 2) || + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) + break; + } + continue; + } + if (!mem_cgroup_reclaimable(victim, false)) + continue; + total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, + zone, &nr_scanned); + *total_scanned += nr_scanned; + if (!res_counter_soft_limit_excess(&root_memcg->res)) + break; } - - return false; + mem_cgroup_iter_break(root_memcg, victim); + return total; } static DEFINE_SPINLOCK(memcg_oom_lock); @@ -4761,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + unsigned long nr_reclaimed = 0; + struct mem_cgroup_per_zone *mz, *next_mz = NULL; + unsigned long reclaimed; + int loop = 0; + struct mem_cgroup_tree_per_zone *mctz; + unsigned long long excess; + unsigned long nr_scanned; + + if (order > 0) + return 0; + + mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); + /* + * This loop can run a while, specially if mem_cgroup's continuously + * keep exceeding their soft limit and putting the system under + * pressure + */ + do { + if (next_mz) + mz = next_mz; + else + mz = mem_cgroup_largest_soft_limit_node(mctz); + if (!mz) + break; + + nr_scanned = 0; + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, + gfp_mask, &nr_scanned); + nr_reclaimed += reclaimed; + *total_scanned += nr_scanned; + spin_lock(&mctz->lock); + + /* + * If we failed to reclaim anything from this memory cgroup + * it is time to move on to the next cgroup + */ + next_mz = NULL; + if (!reclaimed) { + do { + /* + * Loop until we find yet another one. + * + * By the time we get the soft_limit lock + * again, someone might have aded the + * group back on the RB tree. Iterate to + * make sure we get a different mem. + * mem_cgroup_largest_soft_limit_node returns + * NULL if no other cgroup is present on + * the tree + */ + next_mz = + __mem_cgroup_largest_soft_limit_node(mctz); + if (next_mz == mz) + css_put(&next_mz->memcg->css); + else /* next_mz == NULL or other memcg */ + break; + } while (1); + } + __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + excess = res_counter_soft_limit_excess(&mz->memcg->res); + /* + * One school of thought says that we should not add + * back the node to the tree if reclaim returns 0. + * But our reclaim could return 0, simply because due + * to priority we are exposing a smaller subset of + * memory to reclaim from. Consider this as a longer + * term TODO. + */ + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); + spin_unlock(&mctz->lock); + css_put(&mz->memcg->css); + loop++; + /* + * Could not reclaim anything and there are no more + * mem cgroups to try or we seem to be looping without + * reclaiming anything. + */ + if (!nr_reclaimed && + (next_mz == NULL || + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) + break; + } while (!nr_reclaimed); + if (next_mz) + css_put(&next_mz->memcg->css); + return nr_reclaimed; +} + /** * mem_cgroup_force_empty_list - clears LRU of a group * @memcg: group to clear -- cgit v1.2.2