author		Johannes Weiner <hannes@cmpxchg.org>	2014-12-10 18:42:48 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-10 20:41:05 -0500
commit		b2052564e66da2f0551d34a09488411919cfa14d (patch)
tree		7aed05ca9de80d1a9bcbdfb73b7cf0d093cc6c3b /mm/memcontrol.c
parent		64f2199389414341ed3a570663f23616c131ba25 (diff)
mm: memcontrol: continue cache reclaim from offlined groups
On cgroup deletion, outstanding page cache charges are moved to the parent group so that they're not lost and can be reclaimed during pressure on/inside said parent. But this reparenting is fairly tricky and its synchronous nature has led to several lock-ups in the past.

Since c2931b70a32c ("cgroup: iterate cgroup_subsys_states directly") css iterators now also include offlined css, so memcg iterators can be changed to include offlined children during reclaim of a group, and leftover cache can just stay put.

There is a slight change of behavior in that charges of deleted groups no longer show up as local charges in the parent. But they are still included in the parent's hierarchical statistics.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	218
1 file changed, 1 insertion(+), 217 deletions(-)
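For orientation before the diff itself, here is a minimal sketch (not code from this patch) of the reclaim-side walk the change relies on: once mem_cgroup_iter() takes a plain css_tryget() reference instead of css_tryget_online() (first hunk below), a hierarchy iteration also visits children that have already been offlined, so cache left behind by a deleted group is still found under pressure. The shrink_memcg() helper here is hypothetical, standing in for the real per-memcg shrinking done by reclaim.

/*
 * Sketch only: iterate a memcg hierarchy the way reclaim does after
 * this patch, including offlined descendants.  shrink_memcg() is a
 * hypothetical stand-in for the actual per-memcg reclaim work.
 */
static void reclaim_hierarchy(struct mem_cgroup *root)
{
	struct mem_cgroup *iter = NULL;

	/* first call passes prev == NULL; the iterator drops the
	 * reference on the previous memcg on each subsequent call */
	while ((iter = mem_cgroup_iter(root, iter, NULL))) {
		/* offlined children are now included in this walk */
		shrink_memcg(iter);
	}
}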
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0e6484ea268d..f90e43c1499f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1132,7 +1132,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		if (css == &root->css)
 			break;
 
-		if (css_tryget_online(css)) {
+		if (css_tryget(css)) {
 			/*
 			 * Make sure the memcg is initialized:
 			 * mem_cgroup_css_online() orders the the
@@ -3316,79 +3316,6 @@ out:
 	return ret;
 }
 
-/**
- * mem_cgroup_move_parent - moves page to the parent group
- * @page: the page to move
- * @pc: page_cgroup of the page
- * @child: page's cgroup
- *
- * move charges to its parent or the root cgroup if the group has no
- * parent (aka use_hierarchy==0).
- * Although this might fail (get_page_unless_zero, isolate_lru_page or
- * mem_cgroup_move_account fails) the failure is always temporary and
- * it signals a race with a page removal/uncharge or migration. In the
- * first case the page is on the way out and it will vanish from the LRU
- * on the next attempt and the call should be retried later.
- * Isolation from the LRU fails only if page has been isolated from
- * the LRU since we looked at it and that usually means either global
- * reclaim or migration going on. The page will either get back to the
- * LRU or vanish.
- * Finaly mem_cgroup_move_account fails only if the page got uncharged
- * (!PageCgroupUsed) or moved to a different group. The page will
- * disappear in the next attempt.
- */
-static int mem_cgroup_move_parent(struct page *page,
-				  struct page_cgroup *pc,
-				  struct mem_cgroup *child)
-{
-	struct mem_cgroup *parent;
-	unsigned int nr_pages;
-	unsigned long uninitialized_var(flags);
-	int ret;
-
-	VM_BUG_ON(mem_cgroup_is_root(child));
-
-	ret = -EBUSY;
-	if (!get_page_unless_zero(page))
-		goto out;
-	if (isolate_lru_page(page))
-		goto put;
-
-	nr_pages = hpage_nr_pages(page);
-
-	parent = parent_mem_cgroup(child);
-	/*
-	 * If no parent, move charges to root cgroup.
-	 */
-	if (!parent)
-		parent = root_mem_cgroup;
-
-	if (nr_pages > 1) {
-		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-		flags = compound_lock_irqsave(page);
-	}
-
-	ret = mem_cgroup_move_account(page, nr_pages,
-				pc, child, parent);
-	if (!ret) {
-		if (!mem_cgroup_is_root(parent))
-			css_get_many(&parent->css, nr_pages);
-		/* Take charge off the local counters */
-		page_counter_cancel(&child->memory, nr_pages);
-		if (do_swap_account)
-			page_counter_cancel(&child->memsw, nr_pages);
-		css_put_many(&child->css, nr_pages);
-	}
-
-	if (nr_pages > 1)
-		compound_unlock_irqrestore(page, flags);
-	putback_lru_page(page);
-put:
-	put_page(page);
-out:
-	return ret;
-}
-
 #ifdef CONFIG_MEMCG_SWAP
 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 					 bool charge)
@@ -3682,105 +3609,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	return nr_reclaimed;
 }
 
-/**
- * mem_cgroup_force_empty_list - clears LRU of a group
- * @memcg: group to clear
- * @node: NUMA node
- * @zid: zone id
- * @lru: lru to to clear
- *
- * Traverse a specified page_cgroup list and try to drop them all. This doesn't
- * reclaim the pages page themselves - pages are moved to the parent (or root)
- * group.
- */
-static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
-				int node, int zid, enum lru_list lru)
-{
-	struct lruvec *lruvec;
-	unsigned long flags;
-	struct list_head *list;
-	struct page *busy;
-	struct zone *zone;
-
-	zone = &NODE_DATA(node)->node_zones[zid];
-	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
-	list = &lruvec->lists[lru];
-
-	busy = NULL;
-	do {
-		struct page_cgroup *pc;
-		struct page *page;
-
-		spin_lock_irqsave(&zone->lru_lock, flags);
-		if (list_empty(list)) {
-			spin_unlock_irqrestore(&zone->lru_lock, flags);
-			break;
-		}
-		page = list_entry(list->prev, struct page, lru);
-		if (busy == page) {
-			list_move(&page->lru, list);
-			busy = NULL;
-			spin_unlock_irqrestore(&zone->lru_lock, flags);
-			continue;
-		}
-		spin_unlock_irqrestore(&zone->lru_lock, flags);
-
-		pc = lookup_page_cgroup(page);
-
-		if (mem_cgroup_move_parent(page, pc, memcg)) {
-			/* found lock contention or "pc" is obsolete. */
-			busy = page;
-		} else
-			busy = NULL;
-		cond_resched();
-	} while (!list_empty(list));
-}
-
-/*
- * make mem_cgroup's charge to be 0 if there is no task by moving
- * all the charges and pages to the parent.
- * This enables deleting this mem_cgroup.
- *
- * Caller is responsible for holding css reference on the memcg.
- */
-static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
-{
-	int node, zid;
-
-	do {
-		/* This is for making all *used* pages to be on LRU. */
-		lru_add_drain_all();
-		drain_all_stock_sync(memcg);
-		mem_cgroup_start_move(memcg);
-		for_each_node_state(node, N_MEMORY) {
-			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-				enum lru_list lru;
-				for_each_lru(lru) {
-					mem_cgroup_force_empty_list(memcg,
-							node, zid, lru);
-				}
-			}
-		}
-		mem_cgroup_end_move(memcg);
-		memcg_oom_recover(memcg);
-		cond_resched();
-
-		/*
-		 * Kernel memory may not necessarily be trackable to a specific
-		 * process. So they are not migrated, and therefore we can't
-		 * expect their value to drop to 0 here.
-		 * Having res filled up with kmem only is enough.
-		 *
-		 * This is a safety check because mem_cgroup_force_empty_list
-		 * could have raced with mem_cgroup_replace_page_cache callers
-		 * so the lru seemed empty but the page could have been added
-		 * right after the check. RES_USAGE should be safe as we always
-		 * charge before adding to the LRU.
-		 */
-	} while (page_counter_read(&memcg->memory) -
-		 page_counter_read(&memcg->kmem) > 0);
-}
-
 /*
  * Test whether @memcg has children, dead or alive. Note that this
  * function doesn't care whether @memcg has use_hierarchy enabled and
@@ -5323,7 +5151,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct mem_cgroup_event *event, *tmp;
-	struct cgroup_subsys_state *iter;
 
 	/*
 	 * Unregister events and notify userspace.
@@ -5337,13 +5164,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	/*
-	 * This requires that offlining is serialized. Right now that is
-	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
-	 */
-	css_for_each_descendant_post(iter, css)
-		mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
-
 	memcg_unregister_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -5351,42 +5171,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	/*
-	 * XXX: css_offline() would be where we should reparent all
-	 * memory to prepare the cgroup for destruction. However,
-	 * memcg does not do css_tryget_online() and page_counter charging
-	 * under the same RCU lock region, which means that charging
-	 * could race with offlining. Offlining only happens to
-	 * cgroups with no tasks in them but charges can show up
-	 * without any tasks from the swapin path when the target
-	 * memcg is looked up from the swapout record and not from the
-	 * current task as it usually is. A race like this can leak
-	 * charges and put pages with stale cgroup pointers into
-	 * circulation:
-	 *
-	 * #0                        #1
-	 *                           lookup_swap_cgroup_id()
-	 *                           rcu_read_lock()
-	 *                           mem_cgroup_lookup()
-	 *                           css_tryget_online()
-	 *                           rcu_read_unlock()
-	 * disable css_tryget_online()
-	 * call_rcu()
-	 *   offline_css()
-	 *     reparent_charges()
-	 *                           page_counter_try_charge()
-	 *                           css_put()
-	 *                             css_free()
-	 *                           pc->mem_cgroup = dead memcg
-	 *                           add page to lru
-	 *
-	 * The bulk of the charges are still moved in offline_css() to
-	 * avoid pinning a lot of pages in case a long-term reference
-	 * like a swapout record is deferring the css_free() to long
-	 * after offlining. But this makes sure we catch any charges
-	 * made after offlining:
-	 */
-	mem_cgroup_reparent_charges(memcg);
 
 	memcg_destroy_kmem(memcg);
 	__mem_cgroup_free(memcg);