author	Johannes Weiner <hannes@cmpxchg.org>	2014-12-10 18:42:48 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-10 20:41:05 -0500
commit	b2052564e66da2f0551d34a09488411919cfa14d (patch)
tree	7aed05ca9de80d1a9bcbdfb73b7cf0d093cc6c3b /mm/memcontrol.c
parent	64f2199389414341ed3a570663f23616c131ba25 (diff)
mm: memcontrol: continue cache reclaim from offlined groups
On cgroup deletion, outstanding page cache charges are moved to the parent
group so that they're not lost and can be reclaimed during pressure
on/inside said parent. But this reparenting is fairly tricky, and its
synchronous nature has led to several lock-ups in the past.
Since c2931b70a32c ("cgroup: iterate cgroup_subsys_states directly"), css
iterators now also include offlined css, so memcg iterators can be changed
to include offlined children during reclaim of a group, and leftover cache
can just stay put.
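
As a minimal sketch of the resulting behavior (not code from this patch;
struct group, group_tryget() and reclaim_hierarchy() are invented stand-ins
for the memcg/css machinery), a self-contained userspace C model of a
hierarchy walk that skips only freed groups, not offline ones, so cache
left behind by a deleted child is still found under its parent:

#include <stdbool.h>
#include <stdio.h>

struct group {
	const char *name;
	bool online;		/* cleared when the cgroup is deleted */
	int refcount;		/* still > 0 while charges are outstanding */
	long cached_pages;	/* leftover page cache charged to this group */
	struct group *children;
	struct group *sibling;
};

/* Stand-in for css_tryget(): succeeds as long as the group is not freed,
 * whether it is online or not.  A css_tryget_online()-style check would
 * refuse offline groups, forcing their cache to be reparented instead. */
static bool group_tryget(struct group *g)
{
	return g->refcount > 0;
}

/* Walk all descendants; offline children are included in reclaim. */
static void reclaim_hierarchy(struct group *parent)
{
	struct group *g;

	for (g = parent->children; g; g = g->sibling) {
		if (!group_tryget(g))	/* skip only groups that are gone */
			continue;
		printf("reclaiming %ld pages from %s%s\n", g->cached_pages,
		       g->name, g->online ? "" : " (offline)");
		g->cached_pages = 0;
		reclaim_hierarchy(g);
	}
}

int main(void)
{
	/* A deleted child that still holds page cache charges. */
	struct group dead_child = {
		.name = "parent/deleted", .online = false,
		.refcount = 1, .cached_pages = 128,
	};
	struct group parent = {
		.name = "parent", .online = true,
		.refcount = 1, .children = &dead_child,
	};

	reclaim_hierarchy(&parent);
	return 0;
}

Running the model prints the leftover pages of the offline child, mirroring
how reclaim against the parent can now continue into its offlined groups.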
There is a slight change of behavior in that charges of deleted groups no
longer show up as local charges in the parent. But they are still
included in the parent's hierarchical statistics.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	218
1 file changed, 1 insertion(+), 217 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0e6484ea268d..f90e43c1499f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1132,7 +1132,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		if (css == &root->css)
 			break;
 
-		if (css_tryget_online(css)) {
+		if (css_tryget(css)) {
 			/*
 			 * Make sure the memcg is initialized:
 			 * mem_cgroup_css_online() orders the the
@@ -3316,79 +3316,6 @@ out:
 	return ret;
 }
 
-/**
- * mem_cgroup_move_parent - moves page to the parent group
- * @page: the page to move
- * @pc: page_cgroup of the page
- * @child: page's cgroup
- *
- * move charges to its parent or the root cgroup if the group has no
- * parent (aka use_hierarchy==0).
- * Although this might fail (get_page_unless_zero, isolate_lru_page or
- * mem_cgroup_move_account fails) the failure is always temporary and
- * it signals a race with a page removal/uncharge or migration. In the
- * first case the page is on the way out and it will vanish from the LRU
- * on the next attempt and the call should be retried later.
- * Isolation from the LRU fails only if page has been isolated from
- * the LRU since we looked at it and that usually means either global
- * reclaim or migration going on. The page will either get back to the
- * LRU or vanish.
- * Finaly mem_cgroup_move_account fails only if the page got uncharged
- * (!PageCgroupUsed) or moved to a different group. The page will
- * disappear in the next attempt.
- */
-static int mem_cgroup_move_parent(struct page *page,
-				  struct page_cgroup *pc,
-				  struct mem_cgroup *child)
-{
-	struct mem_cgroup *parent;
-	unsigned int nr_pages;
-	unsigned long uninitialized_var(flags);
-	int ret;
-
-	VM_BUG_ON(mem_cgroup_is_root(child));
-
-	ret = -EBUSY;
-	if (!get_page_unless_zero(page))
-		goto out;
-	if (isolate_lru_page(page))
-		goto put;
-
-	nr_pages = hpage_nr_pages(page);
-
-	parent = parent_mem_cgroup(child);
-	/*
-	 * If no parent, move charges to root cgroup.
-	 */
-	if (!parent)
-		parent = root_mem_cgroup;
-
-	if (nr_pages > 1) {
-		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-		flags = compound_lock_irqsave(page);
-	}
-
-	ret = mem_cgroup_move_account(page, nr_pages,
-				pc, child, parent);
-	if (!ret) {
-		if (!mem_cgroup_is_root(parent))
-			css_get_many(&parent->css, nr_pages);
-		/* Take charge off the local counters */
-		page_counter_cancel(&child->memory, nr_pages);
-		if (do_swap_account)
-			page_counter_cancel(&child->memsw, nr_pages);
-		css_put_many(&child->css, nr_pages);
-	}
-
-	if (nr_pages > 1)
-		compound_unlock_irqrestore(page, flags);
-	putback_lru_page(page);
-put:
-	put_page(page);
-out:
-	return ret;
-}
-
 #ifdef CONFIG_MEMCG_SWAP
 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 					 bool charge)
@@ -3682,105 +3609,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	return nr_reclaimed;
 }
 
-/**
- * mem_cgroup_force_empty_list - clears LRU of a group
- * @memcg: group to clear
- * @node: NUMA node
- * @zid: zone id
- * @lru: lru to to clear
- *
- * Traverse a specified page_cgroup list and try to drop them all. This doesn't
- * reclaim the pages page themselves - pages are moved to the parent (or root)
- * group.
- */
-static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
-				int node, int zid, enum lru_list lru)
-{
-	struct lruvec *lruvec;
-	unsigned long flags;
-	struct list_head *list;
-	struct page *busy;
-	struct zone *zone;
-
-	zone = &NODE_DATA(node)->node_zones[zid];
-	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
-	list = &lruvec->lists[lru];
-
-	busy = NULL;
-	do {
-		struct page_cgroup *pc;
-		struct page *page;
-
-		spin_lock_irqsave(&zone->lru_lock, flags);
-		if (list_empty(list)) {
-			spin_unlock_irqrestore(&zone->lru_lock, flags);
-			break;
-		}
-		page = list_entry(list->prev, struct page, lru);
-		if (busy == page) {
-			list_move(&page->lru, list);
-			busy = NULL;
-			spin_unlock_irqrestore(&zone->lru_lock, flags);
-			continue;
-		}
-		spin_unlock_irqrestore(&zone->lru_lock, flags);
-
-		pc = lookup_page_cgroup(page);
-
-		if (mem_cgroup_move_parent(page, pc, memcg)) {
-			/* found lock contention or "pc" is obsolete. */
-			busy = page;
-		} else
-			busy = NULL;
-		cond_resched();
-	} while (!list_empty(list));
-}
-
-/*
- * make mem_cgroup's charge to be 0 if there is no task by moving
- * all the charges and pages to the parent.
- * This enables deleting this mem_cgroup.
- *
- * Caller is responsible for holding css reference on the memcg.
- */
-static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
-{
-	int node, zid;
-
-	do {
-		/* This is for making all *used* pages to be on LRU. */
-		lru_add_drain_all();
-		drain_all_stock_sync(memcg);
-		mem_cgroup_start_move(memcg);
-		for_each_node_state(node, N_MEMORY) {
-			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-				enum lru_list lru;
-				for_each_lru(lru) {
-					mem_cgroup_force_empty_list(memcg,
-							node, zid, lru);
-				}
-			}
-		}
-		mem_cgroup_end_move(memcg);
-		memcg_oom_recover(memcg);
-		cond_resched();
-
-		/*
-		 * Kernel memory may not necessarily be trackable to a specific
-		 * process. So they are not migrated, and therefore we can't
-		 * expect their value to drop to 0 here.
-		 * Having res filled up with kmem only is enough.
-		 *
-		 * This is a safety check because mem_cgroup_force_empty_list
-		 * could have raced with mem_cgroup_replace_page_cache callers
-		 * so the lru seemed empty but the page could have been added
-		 * right after the check. RES_USAGE should be safe as we always
-		 * charge before adding to the LRU.
-		 */
-	} while (page_counter_read(&memcg->memory) -
-		 page_counter_read(&memcg->kmem) > 0);
-}
-
 /*
  * Test whether @memcg has children, dead or alive. Note that this
  * function doesn't care whether @memcg has use_hierarchy enabled and
@@ -5323,7 +5151,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct mem_cgroup_event *event, *tmp;
-	struct cgroup_subsys_state *iter;
 
 	/*
 	 * Unregister events and notify userspace.
@@ -5337,13 +5164,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	/*
-	 * This requires that offlining is serialized. Right now that is
-	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
-	 */
-	css_for_each_descendant_post(iter, css)
-		mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
-
 	memcg_unregister_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -5351,42 +5171,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	/*
-	 * XXX: css_offline() would be where we should reparent all
-	 * memory to prepare the cgroup for destruction. However,
-	 * memcg does not do css_tryget_online() and page_counter charging
-	 * under the same RCU lock region, which means that charging
-	 * could race with offlining. Offlining only happens to
-	 * cgroups with no tasks in them but charges can show up
-	 * without any tasks from the swapin path when the target
-	 * memcg is looked up from the swapout record and not from the
-	 * current task as it usually is. A race like this can leak
-	 * charges and put pages with stale cgroup pointers into
-	 * circulation:
-	 *
-	 * #0                        #1
-	 *                           lookup_swap_cgroup_id()
-	 *                           rcu_read_lock()
-	 *                           mem_cgroup_lookup()
-	 *                           css_tryget_online()
-	 *                           rcu_read_unlock()
-	 * disable css_tryget_online()
-	 * call_rcu()
-	 *   offline_css()
-	 *     reparent_charges()
-	 *                           page_counter_try_charge()
-	 *                           css_put()
-	 *                             css_free()
-	 *                           pc->mem_cgroup = dead memcg
-	 *                           add page to lru
-	 *
-	 * The bulk of the charges are still moved in offline_css() to
-	 * avoid pinning a lot of pages in case a long-term reference
-	 * like a swapout record is deferring the css_free() to long
-	 * after offlining. But this makes sure we catch any charges
-	 * made after offlining:
-	 */
-	mem_cgroup_reparent_charges(memcg);
 
 	memcg_destroy_kmem(memcg);
 	__mem_cgroup_free(memcg);