author		Linus Torvalds <torvalds@linux-foundation.org>	2012-12-12 11:18:24 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-12 11:18:24 -0500
commit		d206e09036d6201f90b2719484c8a59526c46125
tree		84b9057919bcb8cfd1cff47baa5fc74457e77d6d	/mm/memcontrol.c
parent		fef3ff2eb777e76cfa5ae67591982d902c17139c
parent		15ef4ffaa797034d5ff82844daf8f595d7c6d53c
Merge branch 'for-3.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup changes from Tejun Heo:
 "A lot of activities on cgroup side. The big changes are focused on
  making cgroup hierarchy handling saner.

   - cgroup_rmdir() had peculiar semantics - it allowed cgroup destruction
     to be vetoed by individual controllers and tried to drain refcnt
     synchronously. The vetoing never worked properly and caused a good
     deal of contortions in cgroup. memcg was the last remaining user.
     Michal Hocko removed the usage and the cgroup_rmdir() path has been
     simplified significantly. This was done in a separate branch so that
     the memcg people can base further memcg changes on top.

   - The above allowed cleaning up cgroup lifecycle management and
     implementation of generic cgroup iterators which are used to improve
     hierarchy support.

   - cgroup_freezer updated to allow migration in and out of a frozen
     cgroup and handle hierarchy. If a cgroup is frozen, all descendant
     cgroups are frozen.

   - netcls_cgroup and netprio_cgroup updated to handle hierarchy properly.

   - Various fixes and cleanups.

   - Two merge commits. One to pull in memcg and rmdir cleanups (needed to
     build iterators). The other pulled in cgroup/for-3.7-fixes for
     device_cgroup fixes so that further device_cgroup patches can be
     stacked on top."

Fixed up a trivial conflict in mm/memcontrol.c as per Tejun (due to commit
bea8c150a7 ("memcg: fix hotplugged memory zone oops") in master touching
code close to commit 2ef37d3fe4 ("memcg: Simplify
mem_cgroup_force_empty_list error handling") in for-3.8)

* 'for-3.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (65 commits)
  cgroup: update Documentation/cgroups/00-INDEX
  cgroup_rm_file: don't delete the uncreated files
  cgroup: remove subsystem files when remounting cgroup
  cgroup: use cgroup_addrm_files() in cgroup_clear_directory()
  cgroup: warn about broken hierarchies only after css_online
  cgroup: list_del_init() on removed events
  cgroup: fix lockdep warning for event_control
  cgroup: move list add after list head initilization
  netprio_cgroup: allow nesting and inherit config on cgroup creation
  netprio_cgroup: implement netprio[_set]_prio() helpers
  netprio_cgroup: use cgroup->id instead of cgroup_netprio_state->prioidx
  netprio_cgroup: reimplement priomap expansion
  netprio_cgroup: shorten variable names in extend_netdev_table()
  netprio_cgroup: simplify write_priomap()
  netcls_cgroup: move config inheritance to ->css_online() and remove .broken_hierarchy marking
  cgroup: remove obsolete guarantee from cgroup_task_migrate.
  cgroup: add cgroup->id
  cgroup, cpuset: remove cgroup_subsys->post_clone()
  cgroup: s/CGRP_CLONE_CHILDREN/CGRP_CPUSET_CLONE_CHILDREN/
  cgroup: rename ->create/post_create/pre_destroy/destroy() to ->css_alloc/online/offline/free()
  ...
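The cgroup_rmdir() rework described in the first bullet is also visible from userspace: removing a task-less memory cgroup no longer depends on a per-controller veto, because memcg now reparents any leftover charges from its offline path instead of failing pre_destroy(). A minimal sketch, assuming a cgroup v1 memory controller mounted at /sys/fs/cgroup/memory, root privileges, and a hypothetical group name "demo" (none of these come from the patch itself):

/* Create and remove a memory cgroup; with the reworked destruction path the
 * rmdir() is not vetoed by the memory controller. Paths and the group name
 * are assumptions for illustration only. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/demo";	/* hypothetical group */

	if (mkdir(grp, 0755) && errno != EEXIST) {
		fprintf(stderr, "mkdir %s: %s\n", grp, strerror(errno));
		return 1;
	}

	/* ... tasks would normally be attached and charged here ... */

	/* Any charges left behind are reparented when the group goes offline,
	 * so removing the (task-less) group simply succeeds. */
	if (rmdir(grp)) {
		fprintf(stderr, "rmdir %s: %s\n", grp, strerror(errno));
		return 1;
	}
	puts("memory cgroup created and removed");
	return 0;
}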
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	191
1 file changed, 96 insertions(+), 95 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf6d0df4849c..12307b3838fb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2370,7 +2370,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 again:
 	if (*ptr) { /* css should be a valid one */
 		memcg = *ptr;
-		VM_BUG_ON(css_is_removed(&memcg->css));
 		if (mem_cgroup_is_root(memcg))
 			goto done;
 		if (nr_pages == 1 && consume_stock(memcg))
@@ -2510,9 +2509,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
 
 /*
  * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller must check css_is_removed() or some if
- * it's concern. (dropping refcnt from swap can be called against removed
- * memcg.)
+ * rcu_read_lock(). The caller is responsible for calling css_tryget if
+ * the mem_cgroup is used for charging. (dropping refcnt from swap can be
+ * called against removed memcg.)
  */
 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
 {
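The new comment makes the caller responsible for pinning the looked-up group: charging may only proceed after css_tryget() on the memcg's css succeeds under rcu_read_lock(). As a rough userspace analogue of that "take a reference only while the object is still alive" rule (this is an illustration of the pattern, not the kernel implementation), consider:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_int refcnt;		/* reaching 0 means the object is going away */
};

/* Succeed only while the count is still positive, like css_tryget(). */
static bool obj_tryget(struct obj *o)
{
	int old = atomic_load(&o->refcnt);

	while (old > 0) {
		if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
			return true;
	}
	return false;
}

static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1)
		printf("last reference dropped - free the object here\n");
}

int main(void)
{
	struct obj o = { .refcnt = 1 };

	if (obj_tryget(&o)) {		/* only use the object if the pin succeeded */
		printf("pinned, safe to use\n");
		obj_put(&o);
	}
	obj_put(&o);			/* drop the initial reference */
	return 0;
}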
@@ -2709,13 +2708,6 @@ static int mem_cgroup_move_account(struct page *page,
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
 	mem_cgroup_charge_statistics(to, anon, nr_pages);
-	/*
-	 * We charges against "to" which may not have any tasks. Then, "to"
-	 * can be under rmdir(). But in current implementation, caller of
-	 * this function is just force_empty() and move charge, so it's
-	 * guaranteed that "to" is never removed. So, we don't check rmdir
-	 * status here.
-	 */
 	move_unlock_mem_cgroup(from, &flags);
 	ret = 0;
 unlock:
@@ -2729,10 +2721,27 @@ out:
 	return ret;
 }
 
-/*
- * move charges to its parent.
+/**
+ * mem_cgroup_move_parent - moves page to the parent group
+ * @page: the page to move
+ * @pc: page_cgroup of the page
+ * @child: page's cgroup
+ *
+ * move charges to its parent or the root cgroup if the group has no
+ * parent (aka use_hierarchy==0).
+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
+ * mem_cgroup_move_account fails) the failure is always temporary and
+ * it signals a race with a page removal/uncharge or migration. In the
+ * first case the page is on the way out and it will vanish from the LRU
+ * on the next attempt and the call should be retried later.
+ * Isolation from the LRU fails only if page has been isolated from
+ * the LRU since we looked at it and that usually means either global
+ * reclaim or migration going on. The page will either get back to the
+ * LRU or vanish.
+ * Finaly mem_cgroup_move_account fails only if the page got uncharged
+ * (!PageCgroupUsed) or moved to a different group. The page will
+ * disappear in the next attempt.
  */
-
 static int mem_cgroup_move_parent(struct page *page,
 				  struct page_cgroup *pc,
 				  struct mem_cgroup *child)
@@ -2742,9 +2751,7 @@ static int mem_cgroup_move_parent(struct page *page,
 	unsigned long uninitialized_var(flags);
 	int ret;
 
-	/* Is ROOT ? */
-	if (mem_cgroup_is_root(child))
-		return -EINVAL;
+	VM_BUG_ON(mem_cgroup_is_root(child));
 
 	ret = -EBUSY;
 	if (!get_page_unless_zero(page))
@@ -2761,8 +2768,10 @@ static int mem_cgroup_move_parent(struct page *page,
 	if (!parent)
 		parent = root_mem_cgroup;
 
-	if (nr_pages > 1)
+	if (nr_pages > 1) {
+		VM_BUG_ON(!PageTransHuge(page));
 		flags = compound_lock_irqsave(page);
+	}
 
 	ret = mem_cgroup_move_account(page, nr_pages,
 				pc, child, parent);
@@ -2904,7 +2913,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 		return;
 	if (!memcg)
 		return;
-	cgroup_exclude_rmdir(&memcg->css);
 
 	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
 	/*
@@ -2918,12 +2926,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 		swp_entry_t ent = {.val = page_private(page)};
 		mem_cgroup_uncharge_swap(ent);
 	}
-	/*
-	 * At swapin, we may charge account against cgroup which has no tasks.
-	 * So, rmdir()->pre_destroy() can be called while we do this charge.
-	 * In that case, we need to call pre_destroy() again. check it here.
-	 */
-	cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 
 void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3371,8 +3373,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 
 	if (!memcg)
 		return;
-	/* blocks rmdir() */
-	cgroup_exclude_rmdir(&memcg->css);
+
 	if (!migration_ok) {
 		used = oldpage;
 		unused = newpage;
@@ -3406,13 +3407,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	 */
 	if (anon)
 		mem_cgroup_uncharge_page(used);
-	/*
-	 * At migration, we may charge account against cgroup which has no
-	 * tasks.
-	 * So, rmdir()->pre_destroy() can be called while we do this charge.
-	 * In that case, we need to call pre_destroy() again. check it here.
-	 */
-	cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 
 /*
@@ -3712,17 +3706,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	return nr_reclaimed;
 }
 
-/*
+/**
+ * mem_cgroup_force_empty_list - clears LRU of a group
+ * @memcg: group to clear
+ * @node: NUMA node
+ * @zid: zone id
+ * @lru: lru to to clear
+ *
  * Traverse a specified page_cgroup list and try to drop them all. This doesn't
- * reclaim the pages page themselves - it just removes the page_cgroups.
- * Returns true if some page_cgroups were not freed, indicating that the caller
- * must retry this operation.
+ * reclaim the pages page themselves - pages are moved to the parent (or root)
+ * group.
  */
-static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
+static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
 	struct lruvec *lruvec;
-	unsigned long flags, loop;
+	unsigned long flags;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
@@ -3731,11 +3730,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 	list = &lruvec->lists[lru];
 
-	loop = mem_cgroup_get_lru_size(lruvec, lru);
-	/* give some margin against EBUSY etc...*/
-	loop += 256;
 	busy = NULL;
-	while (loop--) {
+	do {
 		struct page_cgroup *pc;
 		struct page *page;
 
@@ -3761,76 +3757,72 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 			cond_resched();
 		} else
 			busy = NULL;
-	}
-	return !list_empty(list);
+	} while (!list_empty(list));
 }
 
 /*
- * make mem_cgroup's charge to be 0 if there is no task.
+ * make mem_cgroup's charge to be 0 if there is no task by moving
+ * all the charges and pages to the parent.
  * This enables deleting this mem_cgroup.
+ *
+ * Caller is responsible for holding css reference on the memcg.
  */
-static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
 {
-	int ret;
-	int node, zid, shrink;
-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct cgroup *cgrp = memcg->css.cgroup;
-
-	css_get(&memcg->css);
+	int node, zid;
 
-	shrink = 0;
-	/* should free all ? */
-	if (free_all)
-		goto try_to_free;
-move_account:
 	do {
-		ret = -EBUSY;
-		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
-			goto out;
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
 		drain_all_stock_sync(memcg);
-		ret = 0;
 		mem_cgroup_start_move(memcg);
 		for_each_node_state(node, N_HIGH_MEMORY) {
-			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
+			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				enum lru_list lru;
 				for_each_lru(lru) {
-					ret = mem_cgroup_force_empty_list(memcg,
+					mem_cgroup_force_empty_list(memcg,
 							node, zid, lru);
-					if (ret)
-						break;
 				}
 			}
-			if (ret)
-				break;
 		}
 		mem_cgroup_end_move(memcg);
 		memcg_oom_recover(memcg);
 		cond_resched();
-		/* "ret" should also be checked to ensure all lists are empty. */
-	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
-out:
-	css_put(&memcg->css);
-	return ret;
 
-try_to_free:
+	/*
+	 * This is a safety check because mem_cgroup_force_empty_list
+	 * could have raced with mem_cgroup_replace_page_cache callers
+	 * so the lru seemed empty but the page could have been added
+	 * right after the check. RES_USAGE should be safe as we always
+	 * charge before adding to the LRU.
+	 */
+	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
+}
+
+/*
+ * Reclaims as many pages from the given memcg as possible and moves
+ * the rest to the parent.
+ *
+ * Caller is responsible for holding css reference for memcg.
+ */
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
+{
+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	struct cgroup *cgrp = memcg->css.cgroup;
+
 	/* returns EBUSY if there is a task or if we come here twice. */
-	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
-		ret = -EBUSY;
-		goto out;
-	}
+	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+		return -EBUSY;
+
 	/* we call try-to-free pages for make this cgroup empty */
 	lru_add_drain_all();
 	/* try to free all pages in this cgroup */
-	shrink = 1;
 	while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
 		int progress;
 
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			goto out;
-		}
+		if (signal_pending(current))
+			return -EINTR;
+
 		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
 						false);
 		if (!progress) {
@@ -3841,13 +3833,23 @@ try_to_free:
 
 	}
 	lru_add_drain();
-	/* try move_account...there may be some *locked* pages. */
-	goto move_account;
+	mem_cgroup_reparent_charges(memcg);
+
+	return 0;
 }
 
 static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
 {
-	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	int ret;
+
+	if (mem_cgroup_is_root(memcg))
+		return -EINVAL;
+	css_get(&memcg->css);
+	ret = mem_cgroup_force_empty(memcg);
+	css_put(&memcg->css);
+
+	return ret;
 }
 
 
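For reference, the rewritten handler above backs the cgroup v1 memory.force_empty control file: writing to it reclaims what it can and leaves the rest to be reparented, returning -EINVAL for the root group, -EBUSY while tasks or children remain, and -EINTR on a pending signal. A minimal userspace sketch, assuming a cgroup v1 memory controller mounted at /sys/fs/cgroup/memory and a hypothetical, already-created group "demo":

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/demo/memory.force_empty";
	FILE *f = fopen(path, "w");

	if (!f) {
		fprintf(stderr, "open %s: %s\n", path, strerror(errno));
		return 1;
	}
	/* Any value triggers mem_cgroup_force_empty_write(); a failed write
	 * reports EBUSY (tasks/children remain) or EINTR (signal pending). */
	if (fputs("0", f) == EOF || fflush(f) == EOF)
		fprintf(stderr, "write %s: %s\n", path, strerror(errno));
	fclose(f);
	return 0;
}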
@@ -4953,7 +4955,7 @@ err_cleanup:
 }
 
 static struct cgroup_subsys_state * __ref
-mem_cgroup_create(struct cgroup *cont)
+mem_cgroup_css_alloc(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg, *parent;
 	long error = -ENOMEM;
@@ -5034,14 +5036,14 @@ free_out:
 	return ERR_PTR(error);
 }
 
-static int mem_cgroup_pre_destroy(struct cgroup *cont)
+static void mem_cgroup_css_offline(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	return mem_cgroup_force_empty(memcg, false);
+	mem_cgroup_reparent_charges(memcg);
 }
 
-static void mem_cgroup_destroy(struct cgroup *cont)
+static void mem_cgroup_css_free(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
@@ -5631,16 +5633,15 @@ static void mem_cgroup_move_task(struct cgroup *cont,
 struct cgroup_subsys mem_cgroup_subsys = {
 	.name = "memory",
 	.subsys_id = mem_cgroup_subsys_id,
-	.create = mem_cgroup_create,
-	.pre_destroy = mem_cgroup_pre_destroy,
-	.destroy = mem_cgroup_destroy,
+	.css_alloc = mem_cgroup_css_alloc,
+	.css_offline = mem_cgroup_css_offline,
+	.css_free = mem_cgroup_css_free,
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.attach = mem_cgroup_move_task,
 	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
 	.use_id = 1,
-	.__DEPRECATED_clear_css_refs = true,
 };
 
 #ifdef CONFIG_MEMCG_SWAP