path: root/mm/memcontrol.c
author     Johannes Weiner <hannes@cmpxchg.org>            2014-08-08 17:19:22 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-08-08 18:57:17 -0400
commit     0a31bc97c80c3fa87b32c091d9a930ac19cd0c40
tree       06dafd237309f9b8ded980eb420a5377989e2c0b /mm/memcontrol.c
parent     00501b531c4723972aa11d6d4ebcf8d6552007c8
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's lifetime - truncation, reclaim, swapout, migration - is impressively complicated and fragile.

Because anonymous and file pages were always charged before they had their page->mapping established, uncharges had to happen when the page type could still be known from the context; as in unmap for anonymous, page cache removal for file and shmem pages, and swap cache truncation for swap pages. However, these operations happen well before the page is actually freed, and so a lot of synchronization is necessary:

- Charging, uncharging, page migration, and charge migration all need
  to take a per-page bit spinlock as they could race with uncharging.

- Swap cache truncation happens during both swap-in and swap-out, and
  possibly repeatedly before the page is actually freed.  This means
  that the memcg swapout code is called from many contexts that make
  no sense and it has to figure out the direction from page state to
  make sure memory and memory+swap are always correctly charged.

- On page migration, the old page might be unmapped but then reused,
  so memcg code has to prevent untimely uncharging in that case.
  Because this code - which should be a simple charge transfer - is so
  special-cased, it is not reusable for replace_page_cache().

But now that charged pages always have a page->mapping, introduce mem_cgroup_uncharge(), which is called after the final put_page(), when we know for sure that nobody is looking at the page anymore.

For page migration, introduce mem_cgroup_migrate(), which is called after the migration is successful and the new page is fully rmapped. Because the old page is no longer uncharged after migration, prevent double charges by decoupling the page's memcg association (PCG_USED and pc->mem_cgroup) from the page holding an actual charge. The new bits PCG_MEM and PCG_MEMSW represent the respective charges and are transferred to the new page during migration.

mem_cgroup_migrate() is suitable for replace_page_cache() as well, which gets rid of mem_cgroup_replace_page_cache(). However, care needs to be taken because both the source and the target page can already be charged and on the LRU when fuse is splicing: grab the page lock on the charge moving side to prevent changing pc->mem_cgroup of a page under migration. Also, the lruvecs of both pages change as we uncharge the old and charge the new during migration, and putback may race with us, so grab the lru lock and isolate the pages iff on LRU to prevent races and ensure the pages are on the right lruvec afterward.

Swap accounting is massively simplified: because the page is no longer uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry before the final put_page() in page reclaim.

Finally, page_cgroup changes are now protected by whatever protection the page itself offers: anonymous pages are charged under the page table lock, whereas page cache insertions, swapin, and migration hold the page lock. Uncharging happens under full exclusion with no outstanding references. Charging and uncharging also ensure that the page is off-LRU, which serializes against charge migration. Remove the very costly page_cgroup lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
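
For readers of this log, here is a minimal sketch of the page lifetime the rewrite is aiming for. The charge side (mem_cgroup_try_charge/commit/cancel) comes from the parent commit 00501b531c47; the function names below and the simplified install/free steps are illustrative assumptions, not call sites taken from this diff.

/*
 * Hedged sketch, not real kernel code.  Assumes <linux/memcontrol.h>
 * and <linux/mm.h>.  example_install_page() is a hypothetical stand-in
 * for whatever makes the page visible (page table entry or page cache
 * slot) and sets page->mapping.
 */
static int example_charge_new_page(struct page *page, struct mm_struct *mm,
				   gfp_t gfp)
{
	struct mem_cgroup *memcg;
	int error;

	/* Reserve memory (and memsw) against the charging task's memcg. */
	error = mem_cgroup_try_charge(page, mm, gfp, &memcg);
	if (error)
		return error;

	error = example_install_page(page);	/* hypothetical insertion step */
	if (error) {
		/* Installation failed: hand the reservation back. */
		mem_cgroup_cancel_charge(page, memcg);
		return error;
	}

	/* page->mapping is now set up: bind the charge to the page. */
	mem_cgroup_commit_charge(page, memcg, false);
	return 0;
}

static void example_release_page(struct page *page)
{
	/*
	 * No type-specific uncharge at unmap or truncation time any
	 * more; the page is uncharged exactly once, after the last
	 * reference is gone (mem_cgroup_uncharge() asserts that
	 * page_count() is zero).
	 */
	if (put_page_testzero(page)) {
		mem_cgroup_uncharge(page);
		__free_page(page);	/* simplified stand-in for the real freeing path */
	}
}

With this shape, truncation, reclaim, swapout and migration no longer need to infer the page type at uncharge time; they only drop references.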
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  828
1 files changed, 321 insertions, 507 deletions
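
Before the diff itself, a second hedged sketch shows the migration side of the new API, in the style of a page cache replacement: both pages are locked, and the new page may already sit on an LRU list when fuse splices it in, hence lrucare=true. The function name is made up for illustration; the real caller lives in mm/filemap.c, outside this diff.

/*
 * Hedged sketch, not real kernel code: the radix-tree surgery that
 * repoints the slot from @old to @new is elided.
 */
static void example_replace_cache_page(struct page *old, struct page *new)
{
	VM_BUG_ON_PAGE(!PageLocked(old), old);
	VM_BUG_ON_PAGE(!PageLocked(new), new);

	/* ... repoint the page cache slot from @old to @new here ... */

	/*
	 * Transfer the memcg charge in one step; compared to the removed
	 * mem_cgroup_replace_page_cache()/end_migration() pair, this runs
	 * only after the new page is fully established.
	 */
	mem_cgroup_migrate(old, new, true);
}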
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1cbe1e54ff5f..9106f1b12f56 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -754,9 +754,11 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
754static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 754static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
755 struct mem_cgroup_tree_per_zone *mctz) 755 struct mem_cgroup_tree_per_zone *mctz)
756{ 756{
757 spin_lock(&mctz->lock); 757 unsigned long flags;
758
759 spin_lock_irqsave(&mctz->lock, flags);
758 __mem_cgroup_remove_exceeded(mz, mctz); 760 __mem_cgroup_remove_exceeded(mz, mctz);
759 spin_unlock(&mctz->lock); 761 spin_unlock_irqrestore(&mctz->lock, flags);
760} 762}
761 763
762 764
@@ -779,7 +781,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
779 * mem is over its softlimit. 781 * mem is over its softlimit.
780 */ 782 */
781 if (excess || mz->on_tree) { 783 if (excess || mz->on_tree) {
782 spin_lock(&mctz->lock); 784 unsigned long flags;
785
786 spin_lock_irqsave(&mctz->lock, flags);
783 /* if on-tree, remove it */ 787 /* if on-tree, remove it */
784 if (mz->on_tree) 788 if (mz->on_tree)
785 __mem_cgroup_remove_exceeded(mz, mctz); 789 __mem_cgroup_remove_exceeded(mz, mctz);
@@ -788,7 +792,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
788 * If excess is 0, no tree ops. 792 * If excess is 0, no tree ops.
789 */ 793 */
790 __mem_cgroup_insert_exceeded(mz, mctz, excess); 794 __mem_cgroup_insert_exceeded(mz, mctz, excess);
791 spin_unlock(&mctz->lock); 795 spin_unlock_irqrestore(&mctz->lock, flags);
792 } 796 }
793 } 797 }
794} 798}
@@ -839,9 +843,9 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
839{ 843{
840 struct mem_cgroup_per_zone *mz; 844 struct mem_cgroup_per_zone *mz;
841 845
842 spin_lock(&mctz->lock); 846 spin_lock_irq(&mctz->lock);
843 mz = __mem_cgroup_largest_soft_limit_node(mctz); 847 mz = __mem_cgroup_largest_soft_limit_node(mctz);
844 spin_unlock(&mctz->lock); 848 spin_unlock_irq(&mctz->lock);
845 return mz; 849 return mz;
846} 850}
847 851
@@ -882,13 +886,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
882 return val; 886 return val;
883} 887}
884 888
885static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
886 bool charge)
887{
888 int val = (charge) ? 1 : -1;
889 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
890}
891
892static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 889static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
893 enum mem_cgroup_events_index idx) 890 enum mem_cgroup_events_index idx)
894{ 891{
@@ -909,13 +906,13 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
909 906
910static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 907static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
911 struct page *page, 908 struct page *page,
912 bool anon, int nr_pages) 909 int nr_pages)
913{ 910{
914 /* 911 /*
915 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 912 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
916 * counted as CACHE even if it's on ANON LRU. 913 * counted as CACHE even if it's on ANON LRU.
917 */ 914 */
918 if (anon) 915 if (PageAnon(page))
919 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 916 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
920 nr_pages); 917 nr_pages);
921 else 918 else
@@ -1013,7 +1010,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
1013 */ 1010 */
1014static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 1011static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1015{ 1012{
1016 preempt_disable();
1017 /* threshold event is triggered in finer grain than soft limit */ 1013 /* threshold event is triggered in finer grain than soft limit */
1018 if (unlikely(mem_cgroup_event_ratelimit(memcg, 1014 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1019 MEM_CGROUP_TARGET_THRESH))) { 1015 MEM_CGROUP_TARGET_THRESH))) {
@@ -1026,8 +1022,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1026 do_numainfo = mem_cgroup_event_ratelimit(memcg, 1022 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1027 MEM_CGROUP_TARGET_NUMAINFO); 1023 MEM_CGROUP_TARGET_NUMAINFO);
1028#endif 1024#endif
1029 preempt_enable();
1030
1031 mem_cgroup_threshold(memcg); 1025 mem_cgroup_threshold(memcg);
1032 if (unlikely(do_softlimit)) 1026 if (unlikely(do_softlimit))
1033 mem_cgroup_update_tree(memcg, page); 1027 mem_cgroup_update_tree(memcg, page);
@@ -1035,8 +1029,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1035 if (unlikely(do_numainfo)) 1029 if (unlikely(do_numainfo))
1036 atomic_inc(&memcg->numainfo_events); 1030 atomic_inc(&memcg->numainfo_events);
1037#endif 1031#endif
1038 } else 1032 }
1039 preempt_enable();
1040} 1033}
1041 1034
1042struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1035struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -1347,20 +1340,6 @@ out:
1347 return lruvec; 1340 return lruvec;
1348} 1341}
1349 1342
1350/*
1351 * Following LRU functions are allowed to be used without PCG_LOCK.
1352 * Operations are called by routine of global LRU independently from memcg.
1353 * What we have to take care of here is validness of pc->mem_cgroup.
1354 *
1355 * Changes to pc->mem_cgroup happens when
1356 * 1. charge
1357 * 2. moving account
1358 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
1359 * It is added to LRU before charge.
1360 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
1361 * When moving account, the page is not on LRU. It's isolated.
1362 */
1363
1364/** 1343/**
1365 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1344 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1366 * @page: the page 1345 * @page: the page
@@ -2261,22 +2240,14 @@ cleanup:
2261 * 2240 *
2262 * Notes: Race condition 2241 * Notes: Race condition
2263 * 2242 *
2264 * We usually use lock_page_cgroup() for accessing page_cgroup member but 2243 * Charging occurs during page instantiation, while the page is
2265 * it tends to be costly. But considering some conditions, we doesn't need 2244 * unmapped and locked in page migration, or while the page table is
2266 * to do so _always_. 2245 * locked in THP migration. No race is possible.
2267 *
2268 * Considering "charge", lock_page_cgroup() is not required because all
2269 * file-stat operations happen after a page is attached to radix-tree. There
2270 * are no race with "charge".
2271 * 2246 *
2272 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2247 * Uncharge happens to pages with zero references, no race possible.
2273 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
2274 * if there are race with "uncharge". Statistics itself is properly handled
2275 * by flags.
2276 * 2248 *
2277 * Considering "move", this is an only case we see a race. To make the race 2249 * Charge moving between groups is protected by checking mm->moving
2278 * small, we check memcg->moving_account and detect there are possibility 2250 * account and taking the move_lock in the slowpath.
2279 * of race or not. If there is, we take a lock.
2280 */ 2251 */
2281 2252
2282void __mem_cgroup_begin_update_page_stat(struct page *page, 2253void __mem_cgroup_begin_update_page_stat(struct page *page,
@@ -2689,6 +2660,16 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2689 return mem_cgroup_from_id(id); 2660 return mem_cgroup_from_id(id);
2690} 2661}
2691 2662
2663/*
2664 * try_get_mem_cgroup_from_page - look up page's memcg association
2665 * @page: the page
2666 *
2667 * Look up, get a css reference, and return the memcg that owns @page.
2668 *
2669 * The page must be locked to prevent racing with swap-in and page
2670 * cache charges. If coming from an unlocked page table, the caller
2671 * must ensure the page is on the LRU or this can race with charging.
2672 */
2692struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2673struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2693{ 2674{
2694 struct mem_cgroup *memcg = NULL; 2675 struct mem_cgroup *memcg = NULL;
@@ -2699,7 +2680,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2699 VM_BUG_ON_PAGE(!PageLocked(page), page); 2680 VM_BUG_ON_PAGE(!PageLocked(page), page);
2700 2681
2701 pc = lookup_page_cgroup(page); 2682 pc = lookup_page_cgroup(page);
2702 lock_page_cgroup(pc);
2703 if (PageCgroupUsed(pc)) { 2683 if (PageCgroupUsed(pc)) {
2704 memcg = pc->mem_cgroup; 2684 memcg = pc->mem_cgroup;
2705 if (memcg && !css_tryget_online(&memcg->css)) 2685 if (memcg && !css_tryget_online(&memcg->css))
@@ -2713,19 +2693,46 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2713 memcg = NULL; 2693 memcg = NULL;
2714 rcu_read_unlock(); 2694 rcu_read_unlock();
2715 } 2695 }
2716 unlock_page_cgroup(pc);
2717 return memcg; 2696 return memcg;
2718} 2697}
2719 2698
2699static void lock_page_lru(struct page *page, int *isolated)
2700{
2701 struct zone *zone = page_zone(page);
2702
2703 spin_lock_irq(&zone->lru_lock);
2704 if (PageLRU(page)) {
2705 struct lruvec *lruvec;
2706
2707 lruvec = mem_cgroup_page_lruvec(page, zone);
2708 ClearPageLRU(page);
2709 del_page_from_lru_list(page, lruvec, page_lru(page));
2710 *isolated = 1;
2711 } else
2712 *isolated = 0;
2713}
2714
2715static void unlock_page_lru(struct page *page, int isolated)
2716{
2717 struct zone *zone = page_zone(page);
2718
2719 if (isolated) {
2720 struct lruvec *lruvec;
2721
2722 lruvec = mem_cgroup_page_lruvec(page, zone);
2723 VM_BUG_ON_PAGE(PageLRU(page), page);
2724 SetPageLRU(page);
2725 add_page_to_lru_list(page, lruvec, page_lru(page));
2726 }
2727 spin_unlock_irq(&zone->lru_lock);
2728}
2729
2720static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2730static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2721 unsigned int nr_pages, bool anon, bool lrucare) 2731 unsigned int nr_pages, bool lrucare)
2722{ 2732{
2723 struct page_cgroup *pc = lookup_page_cgroup(page); 2733 struct page_cgroup *pc = lookup_page_cgroup(page);
2724 struct zone *uninitialized_var(zone); 2734 int isolated;
2725 struct lruvec *lruvec;
2726 bool was_on_lru = false;
2727 2735
2728 lock_page_cgroup(pc);
2729 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2736 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2730 /* 2737 /*
2731 * we don't need page_cgroup_lock about tail pages, becase they are not 2738 * we don't need page_cgroup_lock about tail pages, becase they are not
@@ -2736,39 +2743,38 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2736 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2743 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2737 * may already be on some other mem_cgroup's LRU. Take care of it. 2744 * may already be on some other mem_cgroup's LRU. Take care of it.
2738 */ 2745 */
2739 if (lrucare) { 2746 if (lrucare)
2740 zone = page_zone(page); 2747 lock_page_lru(page, &isolated);
2741 spin_lock_irq(&zone->lru_lock);
2742 if (PageLRU(page)) {
2743 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2744 ClearPageLRU(page);
2745 del_page_from_lru_list(page, lruvec, page_lru(page));
2746 was_on_lru = true;
2747 }
2748 }
2749 2748
2749 /*
2750 * Nobody should be changing or seriously looking at
2751 * pc->mem_cgroup and pc->flags at this point:
2752 *
2753 * - the page is uncharged
2754 *
2755 * - the page is off-LRU
2756 *
2757 * - an anonymous fault has exclusive page access, except for
2758 * a locked page table
2759 *
2760 * - a page cache insertion, a swapin fault, or a migration
2761 * have the page locked
2762 */
2750 pc->mem_cgroup = memcg; 2763 pc->mem_cgroup = memcg;
2751 SetPageCgroupUsed(pc); 2764 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
2752
2753 if (lrucare) {
2754 if (was_on_lru) {
2755 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2756 VM_BUG_ON_PAGE(PageLRU(page), page);
2757 SetPageLRU(page);
2758 add_page_to_lru_list(page, lruvec, page_lru(page));
2759 }
2760 spin_unlock_irq(&zone->lru_lock);
2761 }
2762 2765
2763 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); 2766 if (lrucare)
2764 unlock_page_cgroup(pc); 2767 unlock_page_lru(page, isolated);
2765 2768
2769 local_irq_disable();
2770 mem_cgroup_charge_statistics(memcg, page, nr_pages);
2766 /* 2771 /*
2767 * "charge_statistics" updated event counter. Then, check it. 2772 * "charge_statistics" updated event counter. Then, check it.
2768 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2773 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2769 * if they exceeds softlimit. 2774 * if they exceeds softlimit.
2770 */ 2775 */
2771 memcg_check_events(memcg, page); 2776 memcg_check_events(memcg, page);
2777 local_irq_enable();
2772} 2778}
2773 2779
2774static DEFINE_MUTEX(set_limit_mutex); 2780static DEFINE_MUTEX(set_limit_mutex);
@@ -3395,7 +3401,6 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3395 3401
3396#ifdef CONFIG_TRANSPARENT_HUGEPAGE 3402#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3397 3403
3398#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3399/* 3404/*
3400 * Because tail pages are not marked as "used", set it. We're under 3405 * Because tail pages are not marked as "used", set it. We're under
3401 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3406 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -3416,7 +3421,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3416 for (i = 1; i < HPAGE_PMD_NR; i++) { 3421 for (i = 1; i < HPAGE_PMD_NR; i++) {
3417 pc = head_pc + i; 3422 pc = head_pc + i;
3418 pc->mem_cgroup = memcg; 3423 pc->mem_cgroup = memcg;
3419 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 3424 pc->flags = head_pc->flags;
3420 } 3425 }
3421 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3426 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3422 HPAGE_PMD_NR); 3427 HPAGE_PMD_NR);
@@ -3446,7 +3451,6 @@ static int mem_cgroup_move_account(struct page *page,
3446{ 3451{
3447 unsigned long flags; 3452 unsigned long flags;
3448 int ret; 3453 int ret;
3449 bool anon = PageAnon(page);
3450 3454
3451 VM_BUG_ON(from == to); 3455 VM_BUG_ON(from == to);
3452 VM_BUG_ON_PAGE(PageLRU(page), page); 3456 VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -3460,15 +3464,21 @@ static int mem_cgroup_move_account(struct page *page,
3460 if (nr_pages > 1 && !PageTransHuge(page)) 3464 if (nr_pages > 1 && !PageTransHuge(page))
3461 goto out; 3465 goto out;
3462 3466
3463 lock_page_cgroup(pc); 3467 /*
3468 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
3469 * of its source page while we change it: page migration takes
3470 * both pages off the LRU, but page cache replacement doesn't.
3471 */
3472 if (!trylock_page(page))
3473 goto out;
3464 3474
3465 ret = -EINVAL; 3475 ret = -EINVAL;
3466 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3476 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3467 goto unlock; 3477 goto out_unlock;
3468 3478
3469 move_lock_mem_cgroup(from, &flags); 3479 move_lock_mem_cgroup(from, &flags);
3470 3480
3471 if (!anon && page_mapped(page)) { 3481 if (!PageAnon(page) && page_mapped(page)) {
3472 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3482 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3473 nr_pages); 3483 nr_pages);
3474 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3484 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
@@ -3482,20 +3492,25 @@ static int mem_cgroup_move_account(struct page *page,
3482 nr_pages); 3492 nr_pages);
3483 } 3493 }
3484 3494
3485 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3495 /*
3496 * It is safe to change pc->mem_cgroup here because the page
3497 * is referenced, charged, and isolated - we can't race with
3498 * uncharging, charging, migration, or LRU putback.
3499 */
3486 3500
3487 /* caller should have done css_get */ 3501 /* caller should have done css_get */
3488 pc->mem_cgroup = to; 3502 pc->mem_cgroup = to;
3489 mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3490 move_unlock_mem_cgroup(from, &flags); 3503 move_unlock_mem_cgroup(from, &flags);
3491 ret = 0; 3504 ret = 0;
3492unlock: 3505
3493 unlock_page_cgroup(pc); 3506 local_irq_disable();
3494 /* 3507 mem_cgroup_charge_statistics(to, page, nr_pages);
3495 * check events
3496 */
3497 memcg_check_events(to, page); 3508 memcg_check_events(to, page);
3509 mem_cgroup_charge_statistics(from, page, -nr_pages);
3498 memcg_check_events(from, page); 3510 memcg_check_events(from, page);
3511 local_irq_enable();
3512out_unlock:
3513 unlock_page(page);
3499out: 3514out:
3500 return ret; 3515 return ret;
3501} 3516}
@@ -3566,193 +3581,6 @@ out:
3566 return ret; 3581 return ret;
3567} 3582}
3568 3583
3569static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
3570 unsigned int nr_pages,
3571 const enum charge_type ctype)
3572{
3573 struct memcg_batch_info *batch = NULL;
3574 bool uncharge_memsw = true;
3575
3576 /* If swapout, usage of swap doesn't decrease */
3577 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
3578 uncharge_memsw = false;
3579
3580 batch = &current->memcg_batch;
3581 /*
3582 * In usual, we do css_get() when we remember memcg pointer.
3583 * But in this case, we keep res->usage until end of a series of
3584 * uncharges. Then, it's ok to ignore memcg's refcnt.
3585 */
3586 if (!batch->memcg)
3587 batch->memcg = memcg;
3588 /*
3589 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
3590 * In those cases, all pages freed continuously can be expected to be in
3591 * the same cgroup and we have chance to coalesce uncharges.
3592 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
3593 * because we want to do uncharge as soon as possible.
3594 */
3595
3596 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
3597 goto direct_uncharge;
3598
3599 if (nr_pages > 1)
3600 goto direct_uncharge;
3601
3602 /*
3603 * In typical case, batch->memcg == mem. This means we can
3604 * merge a series of uncharges to an uncharge of res_counter.
3605 * If not, we uncharge res_counter ony by one.
3606 */
3607 if (batch->memcg != memcg)
3608 goto direct_uncharge;
3609 /* remember freed charge and uncharge it later */
3610 batch->nr_pages++;
3611 if (uncharge_memsw)
3612 batch->memsw_nr_pages++;
3613 return;
3614direct_uncharge:
3615 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
3616 if (uncharge_memsw)
3617 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
3618 if (unlikely(batch->memcg != memcg))
3619 memcg_oom_recover(memcg);
3620}
3621
3622/*
3623 * uncharge if !page_mapped(page)
3624 */
3625static struct mem_cgroup *
3626__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
3627 bool end_migration)
3628{
3629 struct mem_cgroup *memcg = NULL;
3630 unsigned int nr_pages = 1;
3631 struct page_cgroup *pc;
3632 bool anon;
3633
3634 if (mem_cgroup_disabled())
3635 return NULL;
3636
3637 if (PageTransHuge(page)) {
3638 nr_pages <<= compound_order(page);
3639 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3640 }
3641 /*
3642 * Check if our page_cgroup is valid
3643 */
3644 pc = lookup_page_cgroup(page);
3645 if (unlikely(!PageCgroupUsed(pc)))
3646 return NULL;
3647
3648 lock_page_cgroup(pc);
3649
3650 memcg = pc->mem_cgroup;
3651
3652 if (!PageCgroupUsed(pc))
3653 goto unlock_out;
3654
3655 anon = PageAnon(page);
3656
3657 switch (ctype) {
3658 case MEM_CGROUP_CHARGE_TYPE_ANON:
3659 /*
3660 * Generally PageAnon tells if it's the anon statistics to be
3661 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
3662 * used before page reached the stage of being marked PageAnon.
3663 */
3664 anon = true;
3665 /* fallthrough */
3666 case MEM_CGROUP_CHARGE_TYPE_DROP:
3667 /* See mem_cgroup_prepare_migration() */
3668 if (page_mapped(page))
3669 goto unlock_out;
3670 /*
3671 * Pages under migration may not be uncharged. But
3672 * end_migration() /must/ be the one uncharging the
3673 * unused post-migration page and so it has to call
3674 * here with the migration bit still set. See the
3675 * res_counter handling below.
3676 */
3677 if (!end_migration && PageCgroupMigration(pc))
3678 goto unlock_out;
3679 break;
3680 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3681 if (!PageAnon(page)) { /* Shared memory */
3682 if (page->mapping && !page_is_file_cache(page))
3683 goto unlock_out;
3684 } else if (page_mapped(page)) /* Anon */
3685 goto unlock_out;
3686 break;
3687 default:
3688 break;
3689 }
3690
3691 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
3692
3693 ClearPageCgroupUsed(pc);
3694 /*
3695 * pc->mem_cgroup is not cleared here. It will be accessed when it's
3696 * freed from LRU. This is safe because uncharged page is expected not
3697 * to be reused (freed soon). Exception is SwapCache, it's handled by
3698 * special functions.
3699 */
3700
3701 unlock_page_cgroup(pc);
3702 /*
3703 * even after unlock, we have memcg->res.usage here and this memcg
3704 * will never be freed, so it's safe to call css_get().
3705 */
3706 memcg_check_events(memcg, page);
3707 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3708 mem_cgroup_swap_statistics(memcg, true);
3709 css_get(&memcg->css);
3710 }
3711 /*
3712 * Migration does not charge the res_counter for the
3713 * replacement page, so leave it alone when phasing out the
3714 * page that is unused after the migration.
3715 */
3716 if (!end_migration)
3717 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3718
3719 return memcg;
3720
3721unlock_out:
3722 unlock_page_cgroup(pc);
3723 return NULL;
3724}
3725
3726void mem_cgroup_uncharge_page(struct page *page)
3727{
3728 /* early check. */
3729 if (page_mapped(page))
3730 return;
3731 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3732 /*
3733 * If the page is in swap cache, uncharge should be deferred
3734 * to the swap path, which also properly accounts swap usage
3735 * and handles memcg lifetime.
3736 *
3737 * Note that this check is not stable and reclaim may add the
3738 * page to swap cache at any time after this. However, if the
3739 * page is not in swap cache by the time page->mapcount hits
3740 * 0, there won't be any page table references to the swap
3741 * slot, and reclaim will free it and not actually write the
3742 * page to disk.
3743 */
3744 if (PageSwapCache(page))
3745 return;
3746 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3747}
3748
3749void mem_cgroup_uncharge_cache_page(struct page *page)
3750{
3751 VM_BUG_ON_PAGE(page_mapped(page), page);
3752 VM_BUG_ON_PAGE(page->mapping, page);
3753 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3754}
3755
3756/* 3584/*
3757 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. 3585 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
3758 * In that cases, pages are freed continuously and we can expect pages 3586 * In that cases, pages are freed continuously and we can expect pages
@@ -3763,6 +3591,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
3763 3591
3764void mem_cgroup_uncharge_start(void) 3592void mem_cgroup_uncharge_start(void)
3765{ 3593{
3594 unsigned long flags;
3595
3596 local_irq_save(flags);
3766 current->memcg_batch.do_batch++; 3597 current->memcg_batch.do_batch++;
3767 /* We can do nest. */ 3598 /* We can do nest. */
3768 if (current->memcg_batch.do_batch == 1) { 3599 if (current->memcg_batch.do_batch == 1) {
@@ -3770,21 +3601,18 @@ void mem_cgroup_uncharge_start(void)
3770 current->memcg_batch.nr_pages = 0; 3601 current->memcg_batch.nr_pages = 0;
3771 current->memcg_batch.memsw_nr_pages = 0; 3602 current->memcg_batch.memsw_nr_pages = 0;
3772 } 3603 }
3604 local_irq_restore(flags);
3773} 3605}
3774 3606
3775void mem_cgroup_uncharge_end(void) 3607void mem_cgroup_uncharge_end(void)
3776{ 3608{
3777 struct memcg_batch_info *batch = &current->memcg_batch; 3609 struct memcg_batch_info *batch = &current->memcg_batch;
3610 unsigned long flags;
3778 3611
3779 if (!batch->do_batch) 3612 local_irq_save(flags);
3780 return; 3613 VM_BUG_ON(!batch->do_batch);
3781 3614 if (--batch->do_batch) /* If stacked, do nothing */
3782 batch->do_batch--; 3615 goto out;
3783 if (batch->do_batch) /* If stacked, do nothing. */
3784 return;
3785
3786 if (!batch->memcg)
3787 return;
3788 /* 3616 /*
3789 * This "batch->memcg" is valid without any css_get/put etc... 3617 * This "batch->memcg" is valid without any css_get/put etc...
3790 * bacause we hide charges behind us. 3618 * bacause we hide charges behind us.
@@ -3796,61 +3624,16 @@ void mem_cgroup_uncharge_end(void)
3796 res_counter_uncharge(&batch->memcg->memsw, 3624 res_counter_uncharge(&batch->memcg->memsw,
3797 batch->memsw_nr_pages * PAGE_SIZE); 3625 batch->memsw_nr_pages * PAGE_SIZE);
3798 memcg_oom_recover(batch->memcg); 3626 memcg_oom_recover(batch->memcg);
3799 /* forget this pointer (for sanity check) */ 3627out:
3800 batch->memcg = NULL; 3628 local_irq_restore(flags);
3801}
3802
3803#ifdef CONFIG_SWAP
3804/*
3805 * called after __delete_from_swap_cache() and drop "page" account.
3806 * memcg information is recorded to swap_cgroup of "ent"
3807 */
3808void
3809mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3810{
3811 struct mem_cgroup *memcg;
3812 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3813
3814 if (!swapout) /* this was a swap cache but the swap is unused ! */
3815 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3816
3817 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3818
3819 /*
3820 * record memcg information, if swapout && memcg != NULL,
3821 * css_get() was called in uncharge().
3822 */
3823 if (do_swap_account && swapout && memcg)
3824 swap_cgroup_record(ent, mem_cgroup_id(memcg));
3825} 3629}
3826#endif
3827 3630
3828#ifdef CONFIG_MEMCG_SWAP 3631#ifdef CONFIG_MEMCG_SWAP
3829/* 3632static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3830 * called from swap_entry_free(). remove record in swap_cgroup and 3633 bool charge)
3831 * uncharge "memsw" account.
3832 */
3833void mem_cgroup_uncharge_swap(swp_entry_t ent)
3834{ 3634{
3835 struct mem_cgroup *memcg; 3635 int val = (charge) ? 1 : -1;
3836 unsigned short id; 3636 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
3837
3838 if (!do_swap_account)
3839 return;
3840
3841 id = swap_cgroup_record(ent, 0);
3842 rcu_read_lock();
3843 memcg = mem_cgroup_lookup(id);
3844 if (memcg) {
3845 /*
3846 * We uncharge this because swap is freed. This memcg can
3847 * be obsolete one. We avoid calling css_tryget_online().
3848 */
3849 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3850 mem_cgroup_swap_statistics(memcg, false);
3851 css_put(&memcg->css);
3852 }
3853 rcu_read_unlock();
3854} 3637}
3855 3638
3856/** 3639/**
@@ -3902,169 +3685,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3902} 3685}
3903#endif 3686#endif
3904 3687
3905/*
3906 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3907 * page belongs to.
3908 */
3909void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3910 struct mem_cgroup **memcgp)
3911{
3912 struct mem_cgroup *memcg = NULL;
3913 unsigned int nr_pages = 1;
3914 struct page_cgroup *pc;
3915
3916 *memcgp = NULL;
3917
3918 if (mem_cgroup_disabled())
3919 return;
3920
3921 if (PageTransHuge(page))
3922 nr_pages <<= compound_order(page);
3923
3924 pc = lookup_page_cgroup(page);
3925 lock_page_cgroup(pc);
3926 if (PageCgroupUsed(pc)) {
3927 memcg = pc->mem_cgroup;
3928 css_get(&memcg->css);
3929 /*
3930 * At migrating an anonymous page, its mapcount goes down
3931 * to 0 and uncharge() will be called. But, even if it's fully
3932 * unmapped, migration may fail and this page has to be
3933 * charged again. We set MIGRATION flag here and delay uncharge
3934 * until end_migration() is called
3935 *
3936 * Corner Case Thinking
3937 * A)
3938 * When the old page was mapped as Anon and it's unmap-and-freed
3939 * while migration was ongoing.
3940 * If unmap finds the old page, uncharge() of it will be delayed
3941 * until end_migration(). If unmap finds a new page, it's
3942 * uncharged when it make mapcount to be 1->0. If unmap code
3943 * finds swap_migration_entry, the new page will not be mapped
3944 * and end_migration() will find it(mapcount==0).
3945 *
3946 * B)
3947 * When the old page was mapped but migraion fails, the kernel
3948 * remaps it. A charge for it is kept by MIGRATION flag even
3949 * if mapcount goes down to 0. We can do remap successfully
3950 * without charging it again.
3951 *
3952 * C)
3953 * The "old" page is under lock_page() until the end of
3954 * migration, so, the old page itself will not be swapped-out.
3955 * If the new page is swapped out before end_migraton, our
3956 * hook to usual swap-out path will catch the event.
3957 */
3958 if (PageAnon(page))
3959 SetPageCgroupMigration(pc);
3960 }
3961 unlock_page_cgroup(pc);
3962 /*
3963 * If the page is not charged at this point,
3964 * we return here.
3965 */
3966 if (!memcg)
3967 return;
3968
3969 *memcgp = memcg;
3970 /*
3971 * We charge new page before it's used/mapped. So, even if unlock_page()
3972 * is called before end_migration, we can catch all events on this new
3973 * page. In the case new page is migrated but not remapped, new page's
3974 * mapcount will be finally 0 and we call uncharge in end_migration().
3975 */
3976 /*
3977 * The page is committed to the memcg, but it's not actually
3978 * charged to the res_counter since we plan on replacing the
3979 * old one and only one page is going to be left afterwards.
3980 */
3981 commit_charge(newpage, memcg, nr_pages, PageAnon(page), false);
3982}
3983
3984/* remove redundant charge if migration failed*/
3985void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3986 struct page *oldpage, struct page *newpage, bool migration_ok)
3987{
3988 struct page *used, *unused;
3989 struct page_cgroup *pc;
3990 bool anon;
3991
3992 if (!memcg)
3993 return;
3994
3995 if (!migration_ok) {
3996 used = oldpage;
3997 unused = newpage;
3998 } else {
3999 used = newpage;
4000 unused = oldpage;
4001 }
4002 anon = PageAnon(used);
4003 __mem_cgroup_uncharge_common(unused,
4004 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4005 : MEM_CGROUP_CHARGE_TYPE_CACHE,
4006 true);
4007 css_put(&memcg->css);
4008 /*
4009 * We disallowed uncharge of pages under migration because mapcount
4010 * of the page goes down to zero, temporarly.
4011 * Clear the flag and check the page should be charged.
4012 */
4013 pc = lookup_page_cgroup(oldpage);
4014 lock_page_cgroup(pc);
4015 ClearPageCgroupMigration(pc);
4016 unlock_page_cgroup(pc);
4017
4018 /*
4019 * If a page is a file cache, radix-tree replacement is very atomic
4020 * and we can skip this check. When it was an Anon page, its mapcount
4021 * goes down to 0. But because we added MIGRATION flage, it's not
4022 * uncharged yet. There are several case but page->mapcount check
4023 * and USED bit check in mem_cgroup_uncharge_page() will do enough
4024 * check. (see prepare_charge() also)
4025 */
4026 if (anon)
4027 mem_cgroup_uncharge_page(used);
4028}
4029
4030/*
4031 * At replace page cache, newpage is not under any memcg but it's on
4032 * LRU. So, this function doesn't touch res_counter but handles LRU
4033 * in correct way. Both pages are locked so we cannot race with uncharge.
4034 */
4035void mem_cgroup_replace_page_cache(struct page *oldpage,
4036 struct page *newpage)
4037{
4038 struct mem_cgroup *memcg = NULL;
4039 struct page_cgroup *pc;
4040
4041 if (mem_cgroup_disabled())
4042 return;
4043
4044 pc = lookup_page_cgroup(oldpage);
4045 /* fix accounting on old pages */
4046 lock_page_cgroup(pc);
4047 if (PageCgroupUsed(pc)) {
4048 memcg = pc->mem_cgroup;
4049 mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4050 ClearPageCgroupUsed(pc);
4051 }
4052 unlock_page_cgroup(pc);
4053
4054 /*
4055 * When called from shmem_replace_page(), in some cases the
4056 * oldpage has already been charged, and in some cases not.
4057 */
4058 if (!memcg)
4059 return;
4060 /*
4061 * Even if newpage->mapping was NULL before starting replacement,
4062 * the newpage may be on LRU(or pagevec for LRU) already. We lock
4063 * LRU while we overwrite pc->mem_cgroup.
4064 */
4065 commit_charge(newpage, memcg, 1, false, true);
4066}
4067
4068#ifdef CONFIG_DEBUG_VM 3688#ifdef CONFIG_DEBUG_VM
4069static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3689static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4070{ 3690{
@@ -4263,7 +3883,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4263 gfp_mask, &nr_scanned); 3883 gfp_mask, &nr_scanned);
4264 nr_reclaimed += reclaimed; 3884 nr_reclaimed += reclaimed;
4265 *total_scanned += nr_scanned; 3885 *total_scanned += nr_scanned;
4266 spin_lock(&mctz->lock); 3886 spin_lock_irq(&mctz->lock);
4267 3887
4268 /* 3888 /*
4269 * If we failed to reclaim anything from this memory cgroup 3889 * If we failed to reclaim anything from this memory cgroup
@@ -4303,7 +3923,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4303 */ 3923 */
4304 /* If excess == 0, no tree ops */ 3924 /* If excess == 0, no tree ops */
4305 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3925 __mem_cgroup_insert_exceeded(mz, mctz, excess);
4306 spin_unlock(&mctz->lock); 3926 spin_unlock_irq(&mctz->lock);
4307 css_put(&mz->memcg->css); 3927 css_put(&mz->memcg->css);
4308 loop++; 3928 loop++;
4309 /* 3929 /*
@@ -6265,9 +5885,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6265 if (page) { 5885 if (page) {
6266 pc = lookup_page_cgroup(page); 5886 pc = lookup_page_cgroup(page);
6267 /* 5887 /*
6268 * Do only loose check w/o page_cgroup lock. 5888 * Do only loose check w/o serialization.
6269 * mem_cgroup_move_account() checks the pc is valid or not under 5889 * mem_cgroup_move_account() checks the pc is valid or
6270 * the lock. 5890 * not under LRU exclusion.
6271 */ 5891 */
6272 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5892 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6273 ret = MC_TARGET_PAGE; 5893 ret = MC_TARGET_PAGE;
@@ -6729,6 +6349,67 @@ static void __init enable_swap_cgroup(void)
6729} 6349}
6730#endif 6350#endif
6731 6351
6352#ifdef CONFIG_MEMCG_SWAP
6353/**
6354 * mem_cgroup_swapout - transfer a memsw charge to swap
6355 * @page: page whose memsw charge to transfer
6356 * @entry: swap entry to move the charge to
6357 *
6358 * Transfer the memsw charge of @page to @entry.
6359 */
6360void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6361{
6362 struct page_cgroup *pc;
6363 unsigned short oldid;
6364
6365 VM_BUG_ON_PAGE(PageLRU(page), page);
6366 VM_BUG_ON_PAGE(page_count(page), page);
6367
6368 if (!do_swap_account)
6369 return;
6370
6371 pc = lookup_page_cgroup(page);
6372
6373 /* Readahead page, never charged */
6374 if (!PageCgroupUsed(pc))
6375 return;
6376
6377 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
6378
6379 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
6380 VM_BUG_ON_PAGE(oldid, page);
6381
6382 pc->flags &= ~PCG_MEMSW;
6383 css_get(&pc->mem_cgroup->css);
6384 mem_cgroup_swap_statistics(pc->mem_cgroup, true);
6385}
6386
6387/**
6388 * mem_cgroup_uncharge_swap - uncharge a swap entry
6389 * @entry: swap entry to uncharge
6390 *
6391 * Drop the memsw charge associated with @entry.
6392 */
6393void mem_cgroup_uncharge_swap(swp_entry_t entry)
6394{
6395 struct mem_cgroup *memcg;
6396 unsigned short id;
6397
6398 if (!do_swap_account)
6399 return;
6400
6401 id = swap_cgroup_record(entry, 0);
6402 rcu_read_lock();
6403 memcg = mem_cgroup_lookup(id);
6404 if (memcg) {
6405 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
6406 mem_cgroup_swap_statistics(memcg, false);
6407 css_put(&memcg->css);
6408 }
6409 rcu_read_unlock();
6410}
6411#endif
6412
6732/** 6413/**
6733 * mem_cgroup_try_charge - try charging a page 6414 * mem_cgroup_try_charge - try charging a page
6734 * @page: page to charge 6415 * @page: page to charge
@@ -6831,7 +6512,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6831 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6512 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6832 } 6513 }
6833 6514
6834 commit_charge(page, memcg, nr_pages, PageAnon(page), lrucare); 6515 commit_charge(page, memcg, nr_pages, lrucare);
6835 6516
6836 if (do_swap_account && PageSwapCache(page)) { 6517 if (do_swap_account && PageSwapCache(page)) {
6837 swp_entry_t entry = { .val = page_private(page) }; 6518 swp_entry_t entry = { .val = page_private(page) };
@@ -6873,6 +6554,139 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
6873 cancel_charge(memcg, nr_pages); 6554 cancel_charge(memcg, nr_pages);
6874} 6555}
6875 6556
6557/**
6558 * mem_cgroup_uncharge - uncharge a page
6559 * @page: page to uncharge
6560 *
6561 * Uncharge a page previously charged with mem_cgroup_try_charge() and
6562 * mem_cgroup_commit_charge().
6563 */
6564void mem_cgroup_uncharge(struct page *page)
6565{
6566 struct memcg_batch_info *batch;
6567 unsigned int nr_pages = 1;
6568 struct mem_cgroup *memcg;
6569 struct page_cgroup *pc;
6570 unsigned long pc_flags;
6571 unsigned long flags;
6572
6573 VM_BUG_ON_PAGE(PageLRU(page), page);
6574 VM_BUG_ON_PAGE(page_count(page), page);
6575
6576 if (mem_cgroup_disabled())
6577 return;
6578
6579 pc = lookup_page_cgroup(page);
6580
6581 /* Every final put_page() ends up here */
6582 if (!PageCgroupUsed(pc))
6583 return;
6584
6585 if (PageTransHuge(page)) {
6586 nr_pages <<= compound_order(page);
6587 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6588 }
6589 /*
6590 * Nobody should be changing or seriously looking at
6591 * pc->mem_cgroup and pc->flags at this point, we have fully
6592 * exclusive access to the page.
6593 */
6594 memcg = pc->mem_cgroup;
6595 pc_flags = pc->flags;
6596 pc->flags = 0;
6597
6598 local_irq_save(flags);
6599
6600 if (nr_pages > 1)
6601 goto direct;
6602 if (unlikely(test_thread_flag(TIF_MEMDIE)))
6603 goto direct;
6604 batch = &current->memcg_batch;
6605 if (!batch->do_batch)
6606 goto direct;
6607 if (batch->memcg && batch->memcg != memcg)
6608 goto direct;
6609 if (!batch->memcg)
6610 batch->memcg = memcg;
6611 if (pc_flags & PCG_MEM)
6612 batch->nr_pages++;
6613 if (pc_flags & PCG_MEMSW)
6614 batch->memsw_nr_pages++;
6615 goto out;
6616direct:
6617 if (pc_flags & PCG_MEM)
6618 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
6619 if (pc_flags & PCG_MEMSW)
6620 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
6621 memcg_oom_recover(memcg);
6622out:
6623 mem_cgroup_charge_statistics(memcg, page, -nr_pages);
6624 memcg_check_events(memcg, page);
6625
6626 local_irq_restore(flags);
6627}
6628
6629/**
6630 * mem_cgroup_migrate - migrate a charge to another page
6631 * @oldpage: currently charged page
6632 * @newpage: page to transfer the charge to
6633 * @lrucare: both pages might be on the LRU already
6634 *
6635 * Migrate the charge from @oldpage to @newpage.
6636 *
6637 * Both pages must be locked, @newpage->mapping must be set up.
6638 */
6639void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6640 bool lrucare)
6641{
6642 unsigned int nr_pages = 1;
6643 struct page_cgroup *pc;
6644 int isolated;
6645
6646 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6647 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6648 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
6649 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
6650 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6651
6652 if (mem_cgroup_disabled())
6653 return;
6654
6655 /* Page cache replacement: new page already charged? */
6656 pc = lookup_page_cgroup(newpage);
6657 if (PageCgroupUsed(pc))
6658 return;
6659
6660 /* Re-entrant migration: old page already uncharged? */
6661 pc = lookup_page_cgroup(oldpage);
6662 if (!PageCgroupUsed(pc))
6663 return;
6664
6665 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
6666 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
6667
6668 if (PageTransHuge(oldpage)) {
6669 nr_pages <<= compound_order(oldpage);
6670 VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage);
6671 VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage);
6672 }
6673
6674 if (lrucare)
6675 lock_page_lru(oldpage, &isolated);
6676
6677 pc->flags = 0;
6678
6679 if (lrucare)
6680 unlock_page_lru(oldpage, isolated);
6681
6682 local_irq_disable();
6683 mem_cgroup_charge_statistics(pc->mem_cgroup, oldpage, -nr_pages);
6684 memcg_check_events(pc->mem_cgroup, oldpage);
6685 local_irq_enable();
6686
6687 commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare);
6688}
6689
6876/* 6690/*
6877 * subsys_initcall() for memory controller. 6691 * subsys_initcall() for memory controller.
6878 * 6692 *