author     Linus Torvalds <torvalds@linux-foundation.org>  2014-12-10 21:34:42 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-12-10 21:34:42 -0500
commit     b6da0076bab5a12afb19312ffee41c95490af2a0 (patch)
tree       52a5675b9c2ff95d88b981d5b9a3822f6073c112 /mm
parent     cbfe0de303a55ed96d8831c2d5f56f8131cd6612 (diff)
parent     a53b831549141aa060a8b54b76e3a42870d74cc0 (diff)
Merge branch 'akpm' (patchbomb from Andrew)
Merge first patchbomb from Andrew Morton:

 - a few minor cifs fixes
 - dma-debug updates
 - ocfs2
 - slab
 - about half of MM
 - procfs
 - kernel/exit.c
 - panic.c tweaks
 - printk updates
 - lib/ updates
 - checkpatch updates
 - fs/binfmt updates
 - the drivers/rtc tree
 - nilfs
 - kmod fixes
 - more kernel/exit.c
 - various other misc tweaks and fixes

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (190 commits)
  exit: pidns: fix/update the comments in zap_pid_ns_processes()
  exit: pidns: alloc_pid() leaks pid_namespace if child_reaper is exiting
  exit: exit_notify: re-use "dead" list to autoreap current
  exit: reparent: call forget_original_parent() under tasklist_lock
  exit: reparent: avoid find_new_reaper() if no children
  exit: reparent: introduce find_alive_thread()
  exit: reparent: introduce find_child_reaper()
  exit: reparent: document the ->has_child_subreaper checks
  exit: reparent: s/while_each_thread/for_each_thread/ in find_new_reaper()
  exit: reparent: fix the cross-namespace PR_SET_CHILD_SUBREAPER reparenting
  exit: reparent: fix the dead-parent PR_SET_CHILD_SUBREAPER reparenting
  exit: proc: don't try to flush /proc/tgid/task/tgid
  exit: release_task: fix the comment about group leader accounting
  exit: wait: drop tasklist_lock before psig->c* accounting
  exit: wait: don't use zombie->real_parent
  exit: wait: cleanup the ptrace_reparented() checks
  usermodehelper: kill the kmod_thread_locker logic
  usermodehelper: don't use CLONE_VFORK for ____call_usermodehelper()
  fs/hfs/catalog.c: fix comparison bug in hfs_cat_keycmp
  nilfs2: fix the nilfs_iget() vs. nilfs_new_inode() races
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile          |    4
-rw-r--r--  mm/cma.c             |   14
-rw-r--r--  mm/compaction.c      |  139
-rw-r--r--  mm/debug.c           |    5
-rw-r--r--  mm/frontswap.c       |    2
-rw-r--r--  mm/huge_memory.c     |    1
-rw-r--r--  mm/hugetlb.c         |    4
-rw-r--r--  mm/hugetlb_cgroup.c  |  103
-rw-r--r--  mm/internal.h        |    7
-rw-r--r--  mm/memcontrol.c      | 1706
-rw-r--r--  mm/memory-failure.c  |    4
-rw-r--r--  mm/memory_hotplug.c  |    4
-rw-r--r--  mm/oom_kill.c        |    4
-rw-r--r--  mm/page-writeback.c  |    4
-rw-r--r--  mm/page_alloc.c      |  137
-rw-r--r--  mm/page_cgroup.c     |  530
-rw-r--r--  mm/page_counter.c    |  192
-rw-r--r--  mm/page_isolation.c  |    2
-rw-r--r--  mm/rmap.c            |    4
-rw-r--r--  mm/slab.c            |   23
-rw-r--r--  mm/slab.h            |    8
-rw-r--r--  mm/slab_common.c     |   40
-rw-r--r--  mm/slub.c            |   21
-rw-r--r--  mm/swap_cgroup.c     |  208
-rw-r--r--  mm/swap_state.c      |    1
-rw-r--r--  mm/swapfile.c        |    2
-rw-r--r--  mm/vmalloc.c         |    3
-rw-r--r--  mm/vmscan.c          |   18
28 files changed, 1236 insertions, 1954 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 8405eb0023a9..b3c6ce932c64 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,7 +55,9 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
55obj-$(CONFIG_MIGRATION) += migrate.o 55obj-$(CONFIG_MIGRATION) += migrate.o
56obj-$(CONFIG_QUICKLIST) += quicklist.o 56obj-$(CONFIG_QUICKLIST) += quicklist.o
57obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 57obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
58obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o 58obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
59obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
60obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
59obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o 61obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
60obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 62obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
61obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 63obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/cma.c b/mm/cma.c
index fde706e1284f..8e9ec13d31db 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -215,9 +215,21 @@ int __init cma_declare_contiguous(phys_addr_t base,
215 bool fixed, struct cma **res_cma) 215 bool fixed, struct cma **res_cma)
216{ 216{
217 phys_addr_t memblock_end = memblock_end_of_DRAM(); 217 phys_addr_t memblock_end = memblock_end_of_DRAM();
218 phys_addr_t highmem_start = __pa(high_memory); 218 phys_addr_t highmem_start;
219 int ret = 0; 219 int ret = 0;
220 220
221#ifdef CONFIG_X86
222 /*
223 * high_memory isn't direct mapped memory so retrieving its physical
224 * address isn't appropriate. But it would be useful to check the
225 * physical address of the highmem boundary so it's justfiable to get
226 * the physical address from it. On x86 there is a validation check for
227 * this case, so the following workaround is needed to avoid it.
228 */
229 highmem_start = __pa_nodebug(high_memory);
230#else
231 highmem_start = __pa(high_memory);
232#endif
221 pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", 233 pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
222 __func__, &size, &base, &limit, &alignment); 234 __func__, &size, &base, &limit, &alignment);
223 235
diff --git a/mm/compaction.c b/mm/compaction.c
index f9792ba3537c..546e571e9d60 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -41,15 +41,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
41static unsigned long release_freepages(struct list_head *freelist) 41static unsigned long release_freepages(struct list_head *freelist)
42{ 42{
43 struct page *page, *next; 43 struct page *page, *next;
44 unsigned long count = 0; 44 unsigned long high_pfn = 0;
45 45
46 list_for_each_entry_safe(page, next, freelist, lru) { 46 list_for_each_entry_safe(page, next, freelist, lru) {
47 unsigned long pfn = page_to_pfn(page);
47 list_del(&page->lru); 48 list_del(&page->lru);
48 __free_page(page); 49 __free_page(page);
49 count++; 50 if (pfn > high_pfn)
51 high_pfn = pfn;
50 } 52 }
51 53
52 return count; 54 return high_pfn;
53} 55}
54 56
55static void map_pages(struct list_head *list) 57static void map_pages(struct list_head *list)
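The hunk above changes release_freepages() to report the highest PFN it handed back to the buddy allocator instead of a page count. A minimal userspace sketch of that "track the maximum while freeing a list" contract; fake_page and free_page_stub are hypothetical stand-ins for struct page and __free_page(), not kernel code:

#include <stdio.h>
#include <stdlib.h>

/* Userspace model only: a fake "page" that remembers its PFN. */
struct fake_page {
	unsigned long pfn;
	struct fake_page *next;
};

/* Stand-in for __free_page(): here we only free the tracking struct. */
static void free_page_stub(struct fake_page *page)
{
	free(page);
}

/*
 * Mirrors the new release_freepages() contract: free every page on the
 * list and return the highest PFN seen (0 if the list was empty).
 */
static unsigned long release_freepages_model(struct fake_page *list)
{
	unsigned long high_pfn = 0;

	while (list) {
		struct fake_page *next = list->next;
		unsigned long pfn = list->pfn;

		free_page_stub(list);
		if (pfn > high_pfn)
			high_pfn = pfn;
		list = next;
	}
	return high_pfn;
}

int main(void)
{
	struct fake_page *list = NULL;
	unsigned long pfns[] = { 4096, 260096, 8192 };

	for (int i = 0; i < 3; i++) {
		struct fake_page *p = malloc(sizeof(*p));
		p->pfn = pfns[i];
		p->next = list;
		list = p;
	}
	printf("highest freed pfn: %lu\n", release_freepages_model(list));
	return 0;
}

compact_zone() later rounds this PFN down to a pageblock boundary before updating zone->compact_cached_free_pfn, as the out: hunk further down shows.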
@@ -195,16 +197,12 @@ static void update_pageblock_skip(struct compact_control *cc,
195 197
196 /* Update where async and sync compaction should restart */ 198 /* Update where async and sync compaction should restart */
197 if (migrate_scanner) { 199 if (migrate_scanner) {
198 if (cc->finished_update_migrate)
199 return;
200 if (pfn > zone->compact_cached_migrate_pfn[0]) 200 if (pfn > zone->compact_cached_migrate_pfn[0])
201 zone->compact_cached_migrate_pfn[0] = pfn; 201 zone->compact_cached_migrate_pfn[0] = pfn;
202 if (cc->mode != MIGRATE_ASYNC && 202 if (cc->mode != MIGRATE_ASYNC &&
203 pfn > zone->compact_cached_migrate_pfn[1]) 203 pfn > zone->compact_cached_migrate_pfn[1])
204 zone->compact_cached_migrate_pfn[1] = pfn; 204 zone->compact_cached_migrate_pfn[1] = pfn;
205 } else { 205 } else {
206 if (cc->finished_update_free)
207 return;
208 if (pfn < zone->compact_cached_free_pfn) 206 if (pfn < zone->compact_cached_free_pfn)
209 zone->compact_cached_free_pfn = pfn; 207 zone->compact_cached_free_pfn = pfn;
210 } 208 }
@@ -715,7 +713,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
715 del_page_from_lru_list(page, lruvec, page_lru(page)); 713 del_page_from_lru_list(page, lruvec, page_lru(page));
716 714
717isolate_success: 715isolate_success:
718 cc->finished_update_migrate = true;
719 list_add(&page->lru, migratelist); 716 list_add(&page->lru, migratelist);
720 cc->nr_migratepages++; 717 cc->nr_migratepages++;
721 nr_isolated++; 718 nr_isolated++;
@@ -889,15 +886,6 @@ static void isolate_freepages(struct compact_control *cc)
889 block_start_pfn - pageblock_nr_pages; 886 block_start_pfn - pageblock_nr_pages;
890 887
891 /* 888 /*
892 * Set a flag that we successfully isolated in this pageblock.
893 * In the next loop iteration, zone->compact_cached_free_pfn
894 * will not be updated and thus it will effectively contain the
895 * highest pageblock we isolated pages from.
896 */
897 if (isolated)
898 cc->finished_update_free = true;
899
900 /*
901 * isolate_freepages_block() might have aborted due to async 889 * isolate_freepages_block() might have aborted due to async
902 * compaction being contended 890 * compaction being contended
903 */ 891 */
@@ -1086,9 +1074,9 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
1086 1074
1087 /* Compaction run is not finished if the watermark is not met */ 1075 /* Compaction run is not finished if the watermark is not met */
1088 watermark = low_wmark_pages(zone); 1076 watermark = low_wmark_pages(zone);
1089 watermark += (1 << cc->order);
1090 1077
1091 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 1078 if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
1079 cc->alloc_flags))
1092 return COMPACT_CONTINUE; 1080 return COMPACT_CONTINUE;
1093 1081
1094 /* Direct compactor: Is a suitable page free? */ 1082 /* Direct compactor: Is a suitable page free? */
@@ -1114,7 +1102,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
1114 * COMPACT_PARTIAL - If the allocation would succeed without compaction 1102 * COMPACT_PARTIAL - If the allocation would succeed without compaction
1115 * COMPACT_CONTINUE - If compaction should run now 1103 * COMPACT_CONTINUE - If compaction should run now
1116 */ 1104 */
1117unsigned long compaction_suitable(struct zone *zone, int order) 1105unsigned long compaction_suitable(struct zone *zone, int order,
1106 int alloc_flags, int classzone_idx)
1118{ 1107{
1119 int fragindex; 1108 int fragindex;
1120 unsigned long watermark; 1109 unsigned long watermark;
@@ -1126,21 +1115,30 @@ unsigned long compaction_suitable(struct zone *zone, int order)
1126 if (order == -1) 1115 if (order == -1)
1127 return COMPACT_CONTINUE; 1116 return COMPACT_CONTINUE;
1128 1117
1118 watermark = low_wmark_pages(zone);
1119 /*
1120 * If watermarks for high-order allocation are already met, there
1121 * should be no need for compaction at all.
1122 */
1123 if (zone_watermark_ok(zone, order, watermark, classzone_idx,
1124 alloc_flags))
1125 return COMPACT_PARTIAL;
1126
1129 /* 1127 /*
1130 * Watermarks for order-0 must be met for compaction. Note the 2UL. 1128 * Watermarks for order-0 must be met for compaction. Note the 2UL.
1131 * This is because during migration, copies of pages need to be 1129 * This is because during migration, copies of pages need to be
1132 * allocated and for a short time, the footprint is higher 1130 * allocated and for a short time, the footprint is higher
1133 */ 1131 */
1134 watermark = low_wmark_pages(zone) + (2UL << order); 1132 watermark += (2UL << order);
1135 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1133 if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
1136 return COMPACT_SKIPPED; 1134 return COMPACT_SKIPPED;
1137 1135
1138 /* 1136 /*
1139 * fragmentation index determines if allocation failures are due to 1137 * fragmentation index determines if allocation failures are due to
1140 * low memory or external fragmentation 1138 * low memory or external fragmentation
1141 * 1139 *
1142 * index of -1000 implies allocations might succeed depending on 1140 * index of -1000 would imply allocations might succeed depending on
1143 * watermarks 1141 * watermarks, but we already failed the high-order watermark check
1144 * index towards 0 implies failure is due to lack of memory 1142 * index towards 0 implies failure is due to lack of memory
1145 * index towards 1000 implies failure is due to fragmentation 1143 * index towards 1000 implies failure is due to fragmentation
1146 * 1144 *
@@ -1150,10 +1148,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
1150 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) 1148 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1151 return COMPACT_SKIPPED; 1149 return COMPACT_SKIPPED;
1152 1150
1153 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
1154 0, 0))
1155 return COMPACT_PARTIAL;
1156
1157 return COMPACT_CONTINUE; 1151 return COMPACT_CONTINUE;
1158} 1152}
1159 1153
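With this change compaction_suitable() takes alloc_flags and classzone_idx and reorders its checks: return COMPACT_PARTIAL when the high-order watermark is already met, COMPACT_SKIPPED when not even order-0 plus a 2<<order migration buffer is available, and only then consult the fragmentation index. A deliberately simplified userspace sketch of that decision order; watermark_ok_model() is a hypothetical stand-in for zone_watermark_ok() (which also handles lowmem reserves, ALLOC_* flags and per-order free lists), and the fragindex step is omitted:

#include <stdbool.h>
#include <stdio.h>

/* Userspace model of the three compaction_suitable() outcomes. */
enum compact_result { COMPACT_SKIPPED, COMPACT_PARTIAL, COMPACT_CONTINUE };

/*
 * Reduced stand-in for zone_watermark_ok(): enough free pages overall,
 * and at least one free block of the requested order.
 */
static bool watermark_ok_model(unsigned long free_pages, unsigned long watermark,
			       int order, int largest_free_order)
{
	return free_pages >= watermark && largest_free_order >= order;
}

static enum compact_result suitable_model(unsigned long free_pages,
					  int largest_free_order,
					  unsigned long low_wmark, int order)
{
	/* 1. High-order watermark already met: the allocation should just succeed. */
	if (watermark_ok_model(free_pages, low_wmark, order, largest_free_order))
		return COMPACT_PARTIAL;

	/* 2. Not even order-0 plus a 2<<order migration buffer: nothing to compact with. */
	if (!watermark_ok_model(free_pages, low_wmark + (2UL << order), 0,
				largest_free_order))
		return COMPACT_SKIPPED;

	/* 3. Enough memory but fragmented: the fragindex (omitted here) decides. */
	return COMPACT_CONTINUE;
}

int main(void)
{
	/* order-4 request against a low watermark of 128 pages */
	printf("%d\n", suitable_model(1024, 6, 128, 4));  /* 1: COMPACT_PARTIAL  */
	printf("%d\n", suitable_model(100, 0, 128, 4));   /* 0: COMPACT_SKIPPED  */
	printf("%d\n", suitable_model(1024, 2, 128, 4));  /* 2: COMPACT_CONTINUE */
	return 0;
}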
@@ -1164,8 +1158,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1164 unsigned long end_pfn = zone_end_pfn(zone); 1158 unsigned long end_pfn = zone_end_pfn(zone);
1165 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); 1159 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1166 const bool sync = cc->mode != MIGRATE_ASYNC; 1160 const bool sync = cc->mode != MIGRATE_ASYNC;
1161 unsigned long last_migrated_pfn = 0;
1167 1162
1168 ret = compaction_suitable(zone, cc->order); 1163 ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
1164 cc->classzone_idx);
1169 switch (ret) { 1165 switch (ret) {
1170 case COMPACT_PARTIAL: 1166 case COMPACT_PARTIAL:
1171 case COMPACT_SKIPPED: 1167 case COMPACT_SKIPPED:
@@ -1208,6 +1204,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1208 while ((ret = compact_finished(zone, cc, migratetype)) == 1204 while ((ret = compact_finished(zone, cc, migratetype)) ==
1209 COMPACT_CONTINUE) { 1205 COMPACT_CONTINUE) {
1210 int err; 1206 int err;
1207 unsigned long isolate_start_pfn = cc->migrate_pfn;
1211 1208
1212 switch (isolate_migratepages(zone, cc)) { 1209 switch (isolate_migratepages(zone, cc)) {
1213 case ISOLATE_ABORT: 1210 case ISOLATE_ABORT:
@@ -1216,7 +1213,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1216 cc->nr_migratepages = 0; 1213 cc->nr_migratepages = 0;
1217 goto out; 1214 goto out;
1218 case ISOLATE_NONE: 1215 case ISOLATE_NONE:
1219 continue; 1216 /*
1217 * We haven't isolated and migrated anything, but
1218 * there might still be unflushed migrations from
1219 * previous cc->order aligned block.
1220 */
1221 goto check_drain;
1220 case ISOLATE_SUCCESS: 1222 case ISOLATE_SUCCESS:
1221 ; 1223 ;
1222 } 1224 }
@@ -1241,12 +1243,61 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1241 goto out; 1243 goto out;
1242 } 1244 }
1243 } 1245 }
1246
1247 /*
1248 * Record where we could have freed pages by migration and not
1249 * yet flushed them to buddy allocator. We use the pfn that
1250 * isolate_migratepages() started from in this loop iteration
1251 * - this is the lowest page that could have been isolated and
1252 * then freed by migration.
1253 */
1254 if (!last_migrated_pfn)
1255 last_migrated_pfn = isolate_start_pfn;
1256
1257check_drain:
1258 /*
1259 * Has the migration scanner moved away from the previous
1260 * cc->order aligned block where we migrated from? If yes,
1261 * flush the pages that were freed, so that they can merge and
1262 * compact_finished() can detect immediately if allocation
1263 * would succeed.
1264 */
1265 if (cc->order > 0 && last_migrated_pfn) {
1266 int cpu;
1267 unsigned long current_block_start =
1268 cc->migrate_pfn & ~((1UL << cc->order) - 1);
1269
1270 if (last_migrated_pfn < current_block_start) {
1271 cpu = get_cpu();
1272 lru_add_drain_cpu(cpu);
1273 drain_local_pages(zone);
1274 put_cpu();
1275 /* No more flushing until we migrate again */
1276 last_migrated_pfn = 0;
1277 }
1278 }
1279
1244 } 1280 }
1245 1281
1246out: 1282out:
1247 /* Release free pages and check accounting */ 1283 /*
1248 cc->nr_freepages -= release_freepages(&cc->freepages); 1284 * Release free pages and update where the free scanner should restart,
1249 VM_BUG_ON(cc->nr_freepages != 0); 1285 * so we don't leave any returned pages behind in the next attempt.
1286 */
1287 if (cc->nr_freepages > 0) {
1288 unsigned long free_pfn = release_freepages(&cc->freepages);
1289
1290 cc->nr_freepages = 0;
1291 VM_BUG_ON(free_pfn == 0);
1292 /* The cached pfn is always the first in a pageblock */
1293 free_pfn &= ~(pageblock_nr_pages-1);
1294 /*
1295 * Only go back, not forward. The cached pfn might have been
1296 * already reset to zone end in compact_finished()
1297 */
1298 if (free_pfn > zone->compact_cached_free_pfn)
1299 zone->compact_cached_free_pfn = free_pfn;
1300 }
1250 1301
1251 trace_mm_compaction_end(ret); 1302 trace_mm_compaction_end(ret);
1252 1303
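The check_drain block added above only flushes the per-cpu lists once the migration scanner has left the cc->order aligned block that the last migrated pages came from. A small userspace sketch of just that trigger condition; the actual drain (lru_add_drain_cpu() plus drain_local_pages() under get_cpu()) is not modelled:

#include <stdbool.h>
#include <stdio.h>

/*
 * Model of the condition guarding the drain: only flush when there are
 * unflushed migrations (last_migrated_pfn != 0) and the scanner has
 * crossed into a later cc->order aligned block.
 */
static bool should_drain(unsigned long migrate_pfn,
			 unsigned long last_migrated_pfn, int order)
{
	unsigned long block_start;

	if (order <= 0 || !last_migrated_pfn)
		return false;
	block_start = migrate_pfn & ~((1UL << order) - 1);
	return last_migrated_pfn < block_start;
}

int main(void)
{
	/* Scanner still inside the same order-9 block: no drain yet. */
	printf("%d\n", should_drain(0x1234, 0x1200, 9));
	/* Scanner moved to the next aligned block: drain now. */
	printf("%d\n", should_drain(0x1400, 0x1200, 9));
	return 0;
}

Draining only at block boundaries lets freed pages merge early enough for compact_finished() to notice a now-possible allocation, without paying for a flush on every migration.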
@@ -1254,7 +1305,8 @@ out:
1254} 1305}
1255 1306
1256static unsigned long compact_zone_order(struct zone *zone, int order, 1307static unsigned long compact_zone_order(struct zone *zone, int order,
1257 gfp_t gfp_mask, enum migrate_mode mode, int *contended) 1308 gfp_t gfp_mask, enum migrate_mode mode, int *contended,
1309 int alloc_flags, int classzone_idx)
1258{ 1310{
1259 unsigned long ret; 1311 unsigned long ret;
1260 struct compact_control cc = { 1312 struct compact_control cc = {
@@ -1264,6 +1316,8 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
1264 .gfp_mask = gfp_mask, 1316 .gfp_mask = gfp_mask,
1265 .zone = zone, 1317 .zone = zone,
1266 .mode = mode, 1318 .mode = mode,
1319 .alloc_flags = alloc_flags,
1320 .classzone_idx = classzone_idx,
1267 }; 1321 };
1268 INIT_LIST_HEAD(&cc.freepages); 1322 INIT_LIST_HEAD(&cc.freepages);
1269 INIT_LIST_HEAD(&cc.migratepages); 1323 INIT_LIST_HEAD(&cc.migratepages);
@@ -1288,14 +1342,13 @@ int sysctl_extfrag_threshold = 500;
1288 * @mode: The migration mode for async, sync light, or sync migration 1342 * @mode: The migration mode for async, sync light, or sync migration
1289 * @contended: Return value that determines if compaction was aborted due to 1343 * @contended: Return value that determines if compaction was aborted due to
1290 * need_resched() or lock contention 1344 * need_resched() or lock contention
1291 * @candidate_zone: Return the zone where we think allocation should succeed
1292 * 1345 *
1293 * This is the main entry point for direct page compaction. 1346 * This is the main entry point for direct page compaction.
1294 */ 1347 */
1295unsigned long try_to_compact_pages(struct zonelist *zonelist, 1348unsigned long try_to_compact_pages(struct zonelist *zonelist,
1296 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1349 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1297 enum migrate_mode mode, int *contended, 1350 enum migrate_mode mode, int *contended,
1298 struct zone **candidate_zone) 1351 int alloc_flags, int classzone_idx)
1299{ 1352{
1300 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1353 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1301 int may_enter_fs = gfp_mask & __GFP_FS; 1354 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1303,7 +1356,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1303 struct zoneref *z; 1356 struct zoneref *z;
1304 struct zone *zone; 1357 struct zone *zone;
1305 int rc = COMPACT_DEFERRED; 1358 int rc = COMPACT_DEFERRED;
1306 int alloc_flags = 0;
1307 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ 1359 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
1308 1360
1309 *contended = COMPACT_CONTENDED_NONE; 1361 *contended = COMPACT_CONTENDED_NONE;
@@ -1312,10 +1364,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1312 if (!order || !may_enter_fs || !may_perform_io) 1364 if (!order || !may_enter_fs || !may_perform_io)
1313 return COMPACT_SKIPPED; 1365 return COMPACT_SKIPPED;
1314 1366
1315#ifdef CONFIG_CMA
1316 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
1317 alloc_flags |= ALLOC_CMA;
1318#endif
1319 /* Compact each zone in the list */ 1367 /* Compact each zone in the list */
1320 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1368 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1321 nodemask) { 1369 nodemask) {
@@ -1326,7 +1374,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1326 continue; 1374 continue;
1327 1375
1328 status = compact_zone_order(zone, order, gfp_mask, mode, 1376 status = compact_zone_order(zone, order, gfp_mask, mode,
1329 &zone_contended); 1377 &zone_contended, alloc_flags, classzone_idx);
1330 rc = max(status, rc); 1378 rc = max(status, rc);
1331 /* 1379 /*
1332 * It takes at least one zone that wasn't lock contended 1380 * It takes at least one zone that wasn't lock contended
@@ -1335,9 +1383,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1335 all_zones_contended &= zone_contended; 1383 all_zones_contended &= zone_contended;
1336 1384
1337 /* If a normal allocation would succeed, stop compacting */ 1385 /* If a normal allocation would succeed, stop compacting */
1338 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 1386 if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
1339 alloc_flags)) { 1387 classzone_idx, alloc_flags)) {
1340 *candidate_zone = zone;
1341 /* 1388 /*
1342 * We think the allocation will succeed in this zone, 1389 * We think the allocation will succeed in this zone,
1343 * but it is not certain, hence the false. The caller 1390 * but it is not certain, hence the false. The caller
@@ -1359,7 +1406,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1359 goto break_loop; 1406 goto break_loop;
1360 } 1407 }
1361 1408
1362 if (mode != MIGRATE_ASYNC) { 1409 if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
1363 /* 1410 /*
1364 * We think that allocation won't succeed in this zone 1411 * We think that allocation won't succeed in this zone
1365 * so we defer compaction there. If it ends up 1412 * so we defer compaction there. If it ends up
diff --git a/mm/debug.c b/mm/debug.c
index 5ce45c9a29b5..0e58f3211f89 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason,
95 dump_flags(page->flags & badflags, 95 dump_flags(page->flags & badflags,
96 pageflag_names, ARRAY_SIZE(pageflag_names)); 96 pageflag_names, ARRAY_SIZE(pageflag_names));
97 } 97 }
98 mem_cgroup_print_bad_page(page); 98#ifdef CONFIG_MEMCG
99 if (page->mem_cgroup)
100 pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
101#endif
99} 102}
100 103
101void dump_page(struct page *page, const char *reason) 104void dump_page(struct page *page, const char *reason)
diff --git a/mm/frontswap.c b/mm/frontswap.c
index f2a3571c6e22..8d82809eb085 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -182,7 +182,7 @@ void __frontswap_init(unsigned type, unsigned long *map)
182 if (frontswap_ops) 182 if (frontswap_ops)
183 frontswap_ops->init(type); 183 frontswap_ops->init(type);
184 else { 184 else {
185 BUG_ON(type > MAX_SWAPFILES); 185 BUG_ON(type >= MAX_SWAPFILES);
186 set_bit(type, need_init); 186 set_bit(type, need_init);
187 } 187 }
188} 188}
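The frontswap fix above is a plain off-by-one: need_init is sized for MAX_SWAPFILES entries, so the last valid swap type is MAX_SWAPFILES - 1 and the sanity check must reject type == MAX_SWAPFILES. A tiny sketch of the bound, using an illustrative array size rather than the kernel's derived MAX_SWAPFILES value:

#include <assert.h>

#define MAX_SWAPFILES 32	/* illustrative value, not the kernel's */

static unsigned char need_init[MAX_SWAPFILES];

/* Valid indexes are 0 .. MAX_SWAPFILES - 1, hence the >= check. */
static void mark_need_init(unsigned int type)
{
	assert(type < MAX_SWAPFILES);	/* mirrors BUG_ON(type >= MAX_SWAPFILES) */
	need_init[type] = 1;
}

int main(void)
{
	mark_need_init(MAX_SWAPFILES - 1);	/* highest legal type */
	/* mark_need_init(MAX_SWAPFILES); would trip the assertion. */
	return 0;
}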
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de984159cf0b..5b2c6875fc38 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -784,7 +784,6 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
784 if (!pmd_none(*pmd)) 784 if (!pmd_none(*pmd))
785 return false; 785 return false;
786 entry = mk_pmd(zero_page, vma->vm_page_prot); 786 entry = mk_pmd(zero_page, vma->vm_page_prot);
787 entry = pmd_wrprotect(entry);
788 entry = pmd_mkhuge(entry); 787 entry = pmd_mkhuge(entry);
789 pgtable_trans_huge_deposit(mm, pmd, pgtable); 788 pgtable_trans_huge_deposit(mm, pmd, pgtable);
790 set_pmd_at(mm, haddr, pmd, entry); 789 set_pmd_at(mm, haddr, pmd, entry);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9fd722769927..30cd96879152 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2638,8 +2638,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2638 2638
2639 tlb_start_vma(tlb, vma); 2639 tlb_start_vma(tlb, vma);
2640 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2640 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2641 address = start;
2641again: 2642again:
2642 for (address = start; address < end; address += sz) { 2643 for (; address < end; address += sz) {
2643 ptep = huge_pte_offset(mm, address); 2644 ptep = huge_pte_offset(mm, address);
2644 if (!ptep) 2645 if (!ptep)
2645 continue; 2646 continue;
@@ -2686,6 +2687,7 @@ again:
2686 page_remove_rmap(page); 2687 page_remove_rmap(page);
2687 force_flush = !__tlb_remove_page(tlb, page); 2688 force_flush = !__tlb_remove_page(tlb, page);
2688 if (force_flush) { 2689 if (force_flush) {
2690 address += sz;
2689 spin_unlock(ptl); 2691 spin_unlock(ptl);
2690 break; 2692 break;
2691 } 2693 }
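The hugetlb hunks above hoist the address initialisation out of the scan loop and advance it past the entry that forced the flush, so the retry after "again:" resumes where the walk stopped instead of rescanning from start. A simplified userspace model of that resume-after-flush loop shape; unmap_one() is a hypothetical stand-in for the per-PTE work and the mmu_gather/TLB details are left out:

#include <stdbool.h>
#include <stdio.h>

#define SZ 1	/* step size; the huge page size in the real code */

/* Hypothetical per-address work; returns true when a flush is forced. */
static bool unmap_one(unsigned long addr)
{
	printf("unmap %lu\n", addr);
	return addr == 3;	/* pretend address 3 fills the gather buffer */
}

static void unmap_range_model(unsigned long start, unsigned long end)
{
	unsigned long address = start;	/* initialised once, as in the fix */
	bool force_flush;

again:
	force_flush = false;
	for (; address < end; address += SZ) {
		force_flush = unmap_one(address);
		if (force_flush) {
			address += SZ;	/* don't revisit this entry after the flush */
			break;
		}
	}
	if (force_flush) {
		/* tlb_flush_mmu() would run here in the kernel */
		if (address < end)
			goto again;	/* resume where we left off, not at 'start' */
	}
}

int main(void)
{
	unmap_range_model(0, 6);	/* unmaps 0..5, flushing once after 3 */
	return 0;
}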
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a67c26e0f360..037e1c00a5b7 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/page_counter.h>
17#include <linux/slab.h> 18#include <linux/slab.h>
18#include <linux/hugetlb.h> 19#include <linux/hugetlb.h>
19#include <linux/hugetlb_cgroup.h> 20#include <linux/hugetlb_cgroup.h>
@@ -23,7 +24,7 @@ struct hugetlb_cgroup {
23 /* 24 /*
24 * the counter to account for hugepages from hugetlb. 25 * the counter to account for hugepages from hugetlb.
25 */ 26 */
26 struct res_counter hugepage[HUGE_MAX_HSTATE]; 27 struct page_counter hugepage[HUGE_MAX_HSTATE];
27}; 28};
28 29
29#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 30#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
@@ -60,7 +61,7 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
60 int idx; 61 int idx;
61 62
62 for (idx = 0; idx < hugetlb_max_hstate; idx++) { 63 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
63 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) 64 if (page_counter_read(&h_cg->hugepage[idx]))
64 return true; 65 return true;
65 } 66 }
66 return false; 67 return false;
@@ -79,12 +80,12 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
79 80
80 if (parent_h_cgroup) { 81 if (parent_h_cgroup) {
81 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) 82 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
82 res_counter_init(&h_cgroup->hugepage[idx], 83 page_counter_init(&h_cgroup->hugepage[idx],
83 &parent_h_cgroup->hugepage[idx]); 84 &parent_h_cgroup->hugepage[idx]);
84 } else { 85 } else {
85 root_h_cgroup = h_cgroup; 86 root_h_cgroup = h_cgroup;
86 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) 87 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
87 res_counter_init(&h_cgroup->hugepage[idx], NULL); 88 page_counter_init(&h_cgroup->hugepage[idx], NULL);
88 } 89 }
89 return &h_cgroup->css; 90 return &h_cgroup->css;
90} 91}
@@ -108,9 +109,8 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
108static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, 109static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
109 struct page *page) 110 struct page *page)
110{ 111{
111 int csize; 112 unsigned int nr_pages;
112 struct res_counter *counter; 113 struct page_counter *counter;
113 struct res_counter *fail_res;
114 struct hugetlb_cgroup *page_hcg; 114 struct hugetlb_cgroup *page_hcg;
115 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); 115 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
116 116
@@ -123,15 +123,15 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
123 if (!page_hcg || page_hcg != h_cg) 123 if (!page_hcg || page_hcg != h_cg)
124 goto out; 124 goto out;
125 125
126 csize = PAGE_SIZE << compound_order(page); 126 nr_pages = 1 << compound_order(page);
127 if (!parent) { 127 if (!parent) {
128 parent = root_h_cgroup; 128 parent = root_h_cgroup;
129 /* root has no limit */ 129 /* root has no limit */
130 res_counter_charge_nofail(&parent->hugepage[idx], 130 page_counter_charge(&parent->hugepage[idx], nr_pages);
131 csize, &fail_res);
132 } 131 }
133 counter = &h_cg->hugepage[idx]; 132 counter = &h_cg->hugepage[idx];
134 res_counter_uncharge_until(counter, counter->parent, csize); 133 /* Take the pages off the local counter */
134 page_counter_cancel(counter, nr_pages);
135 135
136 set_hugetlb_cgroup(page, parent); 136 set_hugetlb_cgroup(page, parent);
137out: 137out:
@@ -166,9 +166,8 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
166 struct hugetlb_cgroup **ptr) 166 struct hugetlb_cgroup **ptr)
167{ 167{
168 int ret = 0; 168 int ret = 0;
169 struct res_counter *fail_res; 169 struct page_counter *counter;
170 struct hugetlb_cgroup *h_cg = NULL; 170 struct hugetlb_cgroup *h_cg = NULL;
171 unsigned long csize = nr_pages * PAGE_SIZE;
172 171
173 if (hugetlb_cgroup_disabled()) 172 if (hugetlb_cgroup_disabled())
174 goto done; 173 goto done;
@@ -187,7 +186,7 @@ again:
187 } 186 }
188 rcu_read_unlock(); 187 rcu_read_unlock();
189 188
190 ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); 189 ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter);
191 css_put(&h_cg->css); 190 css_put(&h_cg->css);
192done: 191done:
193 *ptr = h_cg; 192 *ptr = h_cg;
@@ -213,7 +212,6 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
213 struct page *page) 212 struct page *page)
214{ 213{
215 struct hugetlb_cgroup *h_cg; 214 struct hugetlb_cgroup *h_cg;
216 unsigned long csize = nr_pages * PAGE_SIZE;
217 215
218 if (hugetlb_cgroup_disabled()) 216 if (hugetlb_cgroup_disabled())
219 return; 217 return;
@@ -222,61 +220,76 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
222 if (unlikely(!h_cg)) 220 if (unlikely(!h_cg))
223 return; 221 return;
224 set_hugetlb_cgroup(page, NULL); 222 set_hugetlb_cgroup(page, NULL);
225 res_counter_uncharge(&h_cg->hugepage[idx], csize); 223 page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
226 return; 224 return;
227} 225}
228 226
229void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 227void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
230 struct hugetlb_cgroup *h_cg) 228 struct hugetlb_cgroup *h_cg)
231{ 229{
232 unsigned long csize = nr_pages * PAGE_SIZE;
233
234 if (hugetlb_cgroup_disabled() || !h_cg) 230 if (hugetlb_cgroup_disabled() || !h_cg)
235 return; 231 return;
236 232
237 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) 233 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
238 return; 234 return;
239 235
240 res_counter_uncharge(&h_cg->hugepage[idx], csize); 236 page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
241 return; 237 return;
242} 238}
243 239
240enum {
241 RES_USAGE,
242 RES_LIMIT,
243 RES_MAX_USAGE,
244 RES_FAILCNT,
245};
246
244static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, 247static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
245 struct cftype *cft) 248 struct cftype *cft)
246{ 249{
247 int idx, name; 250 struct page_counter *counter;
248 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 251 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
249 252
250 idx = MEMFILE_IDX(cft->private); 253 counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
251 name = MEMFILE_ATTR(cft->private);
252 254
253 return res_counter_read_u64(&h_cg->hugepage[idx], name); 255 switch (MEMFILE_ATTR(cft->private)) {
256 case RES_USAGE:
257 return (u64)page_counter_read(counter) * PAGE_SIZE;
258 case RES_LIMIT:
259 return (u64)counter->limit * PAGE_SIZE;
260 case RES_MAX_USAGE:
261 return (u64)counter->watermark * PAGE_SIZE;
262 case RES_FAILCNT:
263 return counter->failcnt;
264 default:
265 BUG();
266 }
254} 267}
255 268
269static DEFINE_MUTEX(hugetlb_limit_mutex);
270
256static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, 271static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
257 char *buf, size_t nbytes, loff_t off) 272 char *buf, size_t nbytes, loff_t off)
258{ 273{
259 int idx, name, ret; 274 int ret, idx;
260 unsigned long long val; 275 unsigned long nr_pages;
261 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 276 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
262 277
278 if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
279 return -EINVAL;
280
263 buf = strstrip(buf); 281 buf = strstrip(buf);
282 ret = page_counter_memparse(buf, &nr_pages);
283 if (ret)
284 return ret;
285
264 idx = MEMFILE_IDX(of_cft(of)->private); 286 idx = MEMFILE_IDX(of_cft(of)->private);
265 name = MEMFILE_ATTR(of_cft(of)->private);
266 287
267 switch (name) { 288 switch (MEMFILE_ATTR(of_cft(of)->private)) {
268 case RES_LIMIT: 289 case RES_LIMIT:
269 if (hugetlb_cgroup_is_root(h_cg)) { 290 mutex_lock(&hugetlb_limit_mutex);
270 /* Can't set limit on root */ 291 ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
271 ret = -EINVAL; 292 mutex_unlock(&hugetlb_limit_mutex);
272 break;
273 }
274 /* This function does all necessary parse...reuse it */
275 ret = res_counter_memparse_write_strategy(buf, &val);
276 if (ret)
277 break;
278 val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx]));
279 ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
280 break; 293 break;
281 default: 294 default:
282 ret = -EINVAL; 295 ret = -EINVAL;
@@ -288,18 +301,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
288static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, 301static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
289 char *buf, size_t nbytes, loff_t off) 302 char *buf, size_t nbytes, loff_t off)
290{ 303{
291 int idx, name, ret = 0; 304 int ret = 0;
305 struct page_counter *counter;
292 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 306 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
293 307
294 idx = MEMFILE_IDX(of_cft(of)->private); 308 counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
295 name = MEMFILE_ATTR(of_cft(of)->private);
296 309
297 switch (name) { 310 switch (MEMFILE_ATTR(of_cft(of)->private)) {
298 case RES_MAX_USAGE: 311 case RES_MAX_USAGE:
299 res_counter_reset_max(&h_cg->hugepage[idx]); 312 page_counter_reset_watermark(counter);
300 break; 313 break;
301 case RES_FAILCNT: 314 case RES_FAILCNT:
302 res_counter_reset_failcnt(&h_cg->hugepage[idx]); 315 counter->failcnt = 0;
303 break; 316 break;
304 default: 317 default:
305 ret = -EINVAL; 318 ret = -EINVAL;
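The hugetlb_cgroup conversion above swaps byte-based res_counter calls for the new page_counter, whose count, limit, watermark and failcnt fields appear throughout the hunks. A flat, single-threaded userspace model of the accounting semantics those calls rely on; the real mm/page_counter.c is atomic and hierarchical, and its try_charge returns -ENOMEM plus the failing counter rather than a bool:

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of a page_counter: all values are in pages. */
struct page_counter_model {
	unsigned long count;		/* pages currently charged */
	unsigned long limit;		/* hard limit */
	unsigned long watermark;	/* historical maximum of count */
	unsigned long failcnt;		/* number of failed charges */
};

/* Unconditional charge, used by the root cgroup ("root has no limit"). */
static void counter_charge(struct page_counter_model *c, unsigned long n)
{
	c->count += n;
	if (c->count > c->watermark)
		c->watermark = c->count;
}

/* Charge that respects the limit; failures bump failcnt. */
static bool counter_try_charge(struct page_counter_model *c, unsigned long n)
{
	if (c->count + n > c->limit) {
		c->failcnt++;
		return false;
	}
	counter_charge(c, n);
	return true;
}

static void counter_uncharge(struct page_counter_model *c, unsigned long n)
{
	c->count -= n;
}

int main(void)
{
	struct page_counter_model c = { .limit = 512 };

	counter_try_charge(&c, 512);		/* one 2MB hugepage on 4K pages */
	if (!counter_try_charge(&c, 512))	/* over the limit: fails */
		printf("charge failed, failcnt=%lu\n", c.failcnt);
	counter_uncharge(&c, 512);
	printf("usage=%lu watermark=%lu\n", c.count, c.watermark);
	return 0;
}

This also explains the RES_USAGE/RES_LIMIT/RES_MAX_USAGE reads above being multiplied by PAGE_SIZE: the counter is kept in pages, while the cgroup files keep reporting bytes.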
diff --git a/mm/internal.h b/mm/internal.h
index a4f90ba7068e..efad241f7014 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -161,13 +161,10 @@ struct compact_control {
161 unsigned long migrate_pfn; /* isolate_migratepages search base */ 161 unsigned long migrate_pfn; /* isolate_migratepages search base */
162 enum migrate_mode mode; /* Async or sync migration mode */ 162 enum migrate_mode mode; /* Async or sync migration mode */
163 bool ignore_skip_hint; /* Scan blocks even if marked skip */ 163 bool ignore_skip_hint; /* Scan blocks even if marked skip */
164 bool finished_update_free; /* True when the zone cached pfns are
165 * no longer being updated
166 */
167 bool finished_update_migrate;
168
169 int order; /* order a direct compactor needs */ 164 int order; /* order a direct compactor needs */
170 const gfp_t gfp_mask; /* gfp mask of a direct compactor */ 165 const gfp_t gfp_mask; /* gfp mask of a direct compactor */
166 const int alloc_flags; /* alloc flags of a direct compactor */
167 const int classzone_idx; /* zone index of a direct compactor */
171 struct zone *zone; 168 struct zone *zone;
172 int contended; /* Signal need_sched() or lock 169 int contended; /* Signal need_sched() or lock
173 * contention detected during 170 * contention detected during
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ee48428cf8e3..85df503ec023 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,7 +25,7 @@
25 * GNU General Public License for more details. 25 * GNU General Public License for more details.
26 */ 26 */
27 27
28#include <linux/res_counter.h> 28#include <linux/page_counter.h>
29#include <linux/memcontrol.h> 29#include <linux/memcontrol.h>
30#include <linux/cgroup.h> 30#include <linux/cgroup.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
@@ -51,7 +51,7 @@
51#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/vmpressure.h> 52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h> 53#include <linux/mm_inline.h>
54#include <linux/page_cgroup.h> 54#include <linux/swap_cgroup.h>
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/oom.h> 56#include <linux/oom.h>
57#include <linux/lockdep.h> 57#include <linux/lockdep.h>
@@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu {
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 143 unsigned long targets[MEM_CGROUP_NTARGETS];
144}; 144};
145 145
146struct mem_cgroup_reclaim_iter { 146struct reclaim_iter {
147 /* 147 struct mem_cgroup *position;
148 * last scanned hierarchy member. Valid only if last_dead_count
149 * matches memcg->dead_count of the hierarchy root group.
150 */
151 struct mem_cgroup *last_visited;
152 int last_dead_count;
153
154 /* scan generation, increased every round-trip */ 148 /* scan generation, increased every round-trip */
155 unsigned int generation; 149 unsigned int generation;
156}; 150};
@@ -162,10 +156,10 @@ struct mem_cgroup_per_zone {
162 struct lruvec lruvec; 156 struct lruvec lruvec;
163 unsigned long lru_size[NR_LRU_LISTS]; 157 unsigned long lru_size[NR_LRU_LISTS];
164 158
165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 159 struct reclaim_iter iter[DEF_PRIORITY + 1];
166 160
167 struct rb_node tree_node; /* RB tree node */ 161 struct rb_node tree_node; /* RB tree node */
168 unsigned long long usage_in_excess;/* Set to the value by which */ 162 unsigned long usage_in_excess;/* Set to the value by which */
169 /* the soft limit is exceeded*/ 163 /* the soft limit is exceeded*/
170 bool on_tree; 164 bool on_tree;
171 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 165 struct mem_cgroup *memcg; /* Back pointer, we cannot */
@@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
198 192
199struct mem_cgroup_threshold { 193struct mem_cgroup_threshold {
200 struct eventfd_ctx *eventfd; 194 struct eventfd_ctx *eventfd;
201 u64 threshold; 195 unsigned long threshold;
202}; 196};
203 197
204/* For threshold */ 198/* For threshold */
@@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
284 */ 278 */
285struct mem_cgroup { 279struct mem_cgroup {
286 struct cgroup_subsys_state css; 280 struct cgroup_subsys_state css;
287 /* 281
288 * the counter to account for memory usage 282 /* Accounted resources */
289 */ 283 struct page_counter memory;
290 struct res_counter res; 284 struct page_counter memsw;
285 struct page_counter kmem;
286
287 unsigned long soft_limit;
291 288
292 /* vmpressure notifications */ 289 /* vmpressure notifications */
293 struct vmpressure vmpressure; 290 struct vmpressure vmpressure;
@@ -296,15 +293,6 @@ struct mem_cgroup {
296 int initialized; 293 int initialized;
297 294
298 /* 295 /*
299 * the counter to account for mem+swap usage.
300 */
301 struct res_counter memsw;
302
303 /*
304 * the counter to account for kernel memory usage.
305 */
306 struct res_counter kmem;
307 /*
308 * Should the accounting and control be hierarchical, per subtree? 296 * Should the accounting and control be hierarchical, per subtree?
309 */ 297 */
310 bool use_hierarchy; 298 bool use_hierarchy;
@@ -352,7 +340,6 @@ struct mem_cgroup {
352 struct mem_cgroup_stat_cpu nocpu_base; 340 struct mem_cgroup_stat_cpu nocpu_base;
353 spinlock_t pcp_counter_lock; 341 spinlock_t pcp_counter_lock;
354 342
355 atomic_t dead_count;
356#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 343#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
357 struct cg_proto tcp_mem; 344 struct cg_proto tcp_mem;
358#endif 345#endif
@@ -382,7 +369,6 @@ struct mem_cgroup {
382/* internal only representation about the status of kmem accounting. */ 369/* internal only representation about the status of kmem accounting. */
383enum { 370enum {
384 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ 371 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
385 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
386}; 372};
387 373
388#ifdef CONFIG_MEMCG_KMEM 374#ifdef CONFIG_MEMCG_KMEM
@@ -396,22 +382,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
396 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 382 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
397} 383}
398 384
399static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
400{
401 /*
402 * Our caller must use css_get() first, because memcg_uncharge_kmem()
403 * will call css_put() if it sees the memcg is dead.
404 */
405 smp_wmb();
406 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
407 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
408}
409
410static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
411{
412 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
413 &memcg->kmem_account_flags);
414}
415#endif 385#endif
416 386
417/* Stuffs for move charges at task migration. */ 387/* Stuffs for move charges at task migration. */
@@ -650,7 +620,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg)
650 * This check can't live in kmem destruction function, 620 * This check can't live in kmem destruction function,
651 * since the charges will outlive the cgroup 621 * since the charges will outlive the cgroup
652 */ 622 */
653 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); 623 WARN_ON(page_counter_read(&memcg->kmem));
654} 624}
655#else 625#else
656static void disarm_kmem_keys(struct mem_cgroup *memcg) 626static void disarm_kmem_keys(struct mem_cgroup *memcg)
@@ -664,8 +634,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
664 disarm_kmem_keys(memcg); 634 disarm_kmem_keys(memcg);
665} 635}
666 636
667static void drain_all_stock_async(struct mem_cgroup *memcg);
668
669static struct mem_cgroup_per_zone * 637static struct mem_cgroup_per_zone *
670mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 638mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
671{ 639{
@@ -706,7 +674,7 @@ soft_limit_tree_from_page(struct page *page)
706 674
707static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 675static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
708 struct mem_cgroup_tree_per_zone *mctz, 676 struct mem_cgroup_tree_per_zone *mctz,
709 unsigned long long new_usage_in_excess) 677 unsigned long new_usage_in_excess)
710{ 678{
711 struct rb_node **p = &mctz->rb_root.rb_node; 679 struct rb_node **p = &mctz->rb_root.rb_node;
712 struct rb_node *parent = NULL; 680 struct rb_node *parent = NULL;
@@ -755,10 +723,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
755 spin_unlock_irqrestore(&mctz->lock, flags); 723 spin_unlock_irqrestore(&mctz->lock, flags);
756} 724}
757 725
726static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
727{
728 unsigned long nr_pages = page_counter_read(&memcg->memory);
729 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
730 unsigned long excess = 0;
731
732 if (nr_pages > soft_limit)
733 excess = nr_pages - soft_limit;
734
735 return excess;
736}
758 737
759static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 738static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
760{ 739{
761 unsigned long long excess; 740 unsigned long excess;
762 struct mem_cgroup_per_zone *mz; 741 struct mem_cgroup_per_zone *mz;
763 struct mem_cgroup_tree_per_zone *mctz; 742 struct mem_cgroup_tree_per_zone *mctz;
764 743
@@ -769,7 +748,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
769 */ 748 */
770 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 749 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
771 mz = mem_cgroup_page_zoneinfo(memcg, page); 750 mz = mem_cgroup_page_zoneinfo(memcg, page);
772 excess = res_counter_soft_limit_excess(&memcg->res); 751 excess = soft_limit_excess(memcg);
773 /* 752 /*
774 * We have to update the tree if mz is on RB-tree or 753 * We have to update the tree if mz is on RB-tree or
775 * mem is over its softlimit. 754 * mem is over its softlimit.
@@ -825,7 +804,7 @@ retry:
825 * position in the tree. 804 * position in the tree.
826 */ 805 */
827 __mem_cgroup_remove_exceeded(mz, mctz); 806 __mem_cgroup_remove_exceeded(mz, mctz);
828 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 807 if (!soft_limit_excess(mz->memcg) ||
829 !css_tryget_online(&mz->memcg->css)) 808 !css_tryget_online(&mz->memcg->css))
830 goto retry; 809 goto retry;
831done: 810done:
@@ -1062,122 +1041,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1062 return memcg; 1041 return memcg;
1063} 1042}
1064 1043
1065/*
1066 * Returns a next (in a pre-order walk) alive memcg (with elevated css
1067 * ref. count) or NULL if the whole root's subtree has been visited.
1068 *
1069 * helper function to be used by mem_cgroup_iter
1070 */
1071static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1072 struct mem_cgroup *last_visited)
1073{
1074 struct cgroup_subsys_state *prev_css, *next_css;
1075
1076 prev_css = last_visited ? &last_visited->css : NULL;
1077skip_node:
1078 next_css = css_next_descendant_pre(prev_css, &root->css);
1079
1080 /*
1081 * Even if we found a group we have to make sure it is
1082 * alive. css && !memcg means that the groups should be
1083 * skipped and we should continue the tree walk.
1084 * last_visited css is safe to use because it is
1085 * protected by css_get and the tree walk is rcu safe.
1086 *
1087 * We do not take a reference on the root of the tree walk
1088 * because we might race with the root removal when it would
1089 * be the only node in the iterated hierarchy and mem_cgroup_iter
1090 * would end up in an endless loop because it expects that at
1091 * least one valid node will be returned. Root cannot disappear
1092 * because caller of the iterator should hold it already so
1093 * skipping css reference should be safe.
1094 */
1095 if (next_css) {
1096 struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
1097
1098 if (next_css == &root->css)
1099 return memcg;
1100
1101 if (css_tryget_online(next_css)) {
1102 /*
1103 * Make sure the memcg is initialized:
1104 * mem_cgroup_css_online() orders the the
1105 * initialization against setting the flag.
1106 */
1107 if (smp_load_acquire(&memcg->initialized))
1108 return memcg;
1109 css_put(next_css);
1110 }
1111
1112 prev_css = next_css;
1113 goto skip_node;
1114 }
1115
1116 return NULL;
1117}
1118
1119static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1120{
1121 /*
1122 * When a group in the hierarchy below root is destroyed, the
1123 * hierarchy iterator can no longer be trusted since it might
1124 * have pointed to the destroyed group. Invalidate it.
1125 */
1126 atomic_inc(&root->dead_count);
1127}
1128
1129static struct mem_cgroup *
1130mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1131 struct mem_cgroup *root,
1132 int *sequence)
1133{
1134 struct mem_cgroup *position = NULL;
1135 /*
1136 * A cgroup destruction happens in two stages: offlining and
1137 * release. They are separated by a RCU grace period.
1138 *
1139 * If the iterator is valid, we may still race with an
1140 * offlining. The RCU lock ensures the object won't be
1141 * released, tryget will fail if we lost the race.
1142 */
1143 *sequence = atomic_read(&root->dead_count);
1144 if (iter->last_dead_count == *sequence) {
1145 smp_rmb();
1146 position = iter->last_visited;
1147
1148 /*
1149 * We cannot take a reference to root because we might race
1150 * with root removal and returning NULL would end up in
1151 * an endless loop on the iterator user level when root
1152 * would be returned all the time.
1153 */
1154 if (position && position != root &&
1155 !css_tryget_online(&position->css))
1156 position = NULL;
1157 }
1158 return position;
1159}
1160
1161static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1162 struct mem_cgroup *last_visited,
1163 struct mem_cgroup *new_position,
1164 struct mem_cgroup *root,
1165 int sequence)
1166{
1167 /* root reference counting symmetric to mem_cgroup_iter_load */
1168 if (last_visited && last_visited != root)
1169 css_put(&last_visited->css);
1170 /*
1171 * We store the sequence count from the time @last_visited was
1172 * loaded successfully instead of rereading it here so that we
1173 * don't lose destruction events in between. We could have
1174 * raced with the destruction of @new_position after all.
1175 */
1176 iter->last_visited = new_position;
1177 smp_wmb();
1178 iter->last_dead_count = sequence;
1179}
1180
1181/** 1044/**
1182 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1045 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1183 * @root: hierarchy root 1046 * @root: hierarchy root
@@ -1199,8 +1062,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1199 struct mem_cgroup *prev, 1062 struct mem_cgroup *prev,
1200 struct mem_cgroup_reclaim_cookie *reclaim) 1063 struct mem_cgroup_reclaim_cookie *reclaim)
1201{ 1064{
1065 struct reclaim_iter *uninitialized_var(iter);
1066 struct cgroup_subsys_state *css = NULL;
1202 struct mem_cgroup *memcg = NULL; 1067 struct mem_cgroup *memcg = NULL;
1203 struct mem_cgroup *last_visited = NULL; 1068 struct mem_cgroup *pos = NULL;
1204 1069
1205 if (mem_cgroup_disabled()) 1070 if (mem_cgroup_disabled())
1206 return NULL; 1071 return NULL;
@@ -1209,50 +1074,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1209 root = root_mem_cgroup; 1074 root = root_mem_cgroup;
1210 1075
1211 if (prev && !reclaim) 1076 if (prev && !reclaim)
1212 last_visited = prev; 1077 pos = prev;
1213 1078
1214 if (!root->use_hierarchy && root != root_mem_cgroup) { 1079 if (!root->use_hierarchy && root != root_mem_cgroup) {
1215 if (prev) 1080 if (prev)
1216 goto out_css_put; 1081 goto out;
1217 return root; 1082 return root;
1218 } 1083 }
1219 1084
1220 rcu_read_lock(); 1085 rcu_read_lock();
1221 while (!memcg) {
1222 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1223 int uninitialized_var(seq);
1224
1225 if (reclaim) {
1226 struct mem_cgroup_per_zone *mz;
1227
1228 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
1229 iter = &mz->reclaim_iter[reclaim->priority];
1230 if (prev && reclaim->generation != iter->generation) {
1231 iter->last_visited = NULL;
1232 goto out_unlock;
1233 }
1234 1086
1235 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1087 if (reclaim) {
1088 struct mem_cgroup_per_zone *mz;
1089
1090 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
1091 iter = &mz->iter[reclaim->priority];
1092
1093 if (prev && reclaim->generation != iter->generation)
1094 goto out_unlock;
1095
1096 do {
1097 pos = ACCESS_ONCE(iter->position);
1098 /*
1099 * A racing update may change the position and
1100 * put the last reference, hence css_tryget(),
1101 * or retry to see the updated position.
1102 */
1103 } while (pos && !css_tryget(&pos->css));
1104 }
1105
1106 if (pos)
1107 css = &pos->css;
1108
1109 for (;;) {
1110 css = css_next_descendant_pre(css, &root->css);
1111 if (!css) {
1112 /*
1113 * Reclaimers share the hierarchy walk, and a
1114 * new one might jump in right at the end of
1115 * the hierarchy - make sure they see at least
1116 * one group and restart from the beginning.
1117 */
1118 if (!prev)
1119 continue;
1120 break;
1236 } 1121 }
1237 1122
1238 memcg = __mem_cgroup_iter_next(root, last_visited); 1123 /*
1124 * Verify the css and acquire a reference. The root
1125 * is provided by the caller, so we know it's alive
1126 * and kicking, and don't take an extra reference.
1127 */
1128 memcg = mem_cgroup_from_css(css);
1129
1130 if (css == &root->css)
1131 break;
1239 1132
1240 if (reclaim) { 1133 if (css_tryget(css)) {
1241 mem_cgroup_iter_update(iter, last_visited, memcg, root, 1134 /*
1242 seq); 1135 * Make sure the memcg is initialized:
1136 * mem_cgroup_css_online() orders the the
1137 * initialization against setting the flag.
1138 */
1139 if (smp_load_acquire(&memcg->initialized))
1140 break;
1243 1141
1244 if (!memcg) 1142 css_put(css);
1245 iter->generation++;
1246 else if (!prev && memcg)
1247 reclaim->generation = iter->generation;
1248 } 1143 }
1249 1144
1250 if (prev && !memcg) 1145 memcg = NULL;
1251 goto out_unlock; 1146 }
1147
1148 if (reclaim) {
1149 if (cmpxchg(&iter->position, pos, memcg) == pos) {
1150 if (memcg)
1151 css_get(&memcg->css);
1152 if (pos)
1153 css_put(&pos->css);
1154 }
1155
1156 /*
1157 * pairs with css_tryget when dereferencing iter->position
1158 * above.
1159 */
1160 if (pos)
1161 css_put(&pos->css);
1162
1163 if (!memcg)
1164 iter->generation++;
1165 else if (!prev)
1166 reclaim->generation = iter->generation;
1252 } 1167 }
1168
1253out_unlock: 1169out_unlock:
1254 rcu_read_unlock(); 1170 rcu_read_unlock();
1255out_css_put: 1171out:
1256 if (prev && prev != root) 1172 if (prev && prev != root)
1257 css_put(&prev->css); 1173 css_put(&prev->css);
1258 1174
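The rewritten mem_cgroup_iter() above publishes the shared per-zone/per-priority position with cmpxchg so that concurrent reclaimers advancing the same iterator neither overwrite a newer position nor leak references. A userspace sketch of just that lockless publish step, using C11 atomics in place of the kernel's cmpxchg(); the css_get()/css_put() reference handling and generation bookkeeping are not modelled:

#include <stdatomic.h>
#include <stdio.h>

struct node { int id; };	/* stands in for struct mem_cgroup */

static _Atomic(struct node *) iter_position;

/*
 * Equivalent of: if (cmpxchg(&iter->position, pos, memcg) == pos)
 * The store only happens if the slot still holds the position this
 * walker started from; otherwise another walker already advanced it.
 */
static void publish_position(struct node *pos, struct node *next)
{
	struct node *expected = pos;

	if (atomic_compare_exchange_strong(&iter_position, &expected, next)) {
		/* In the kernel, css_get(next) and css_put(pos) pair up here. */
	}
	/* On failure the racing walker's position stands untouched. */
}

int main(void)
{
	static struct node a = { 1 }, b = { 2 };

	atomic_store(&iter_position, &a);
	publish_position(&a, &b);	/* succeeds: position is now &b */
	publish_position(&a, NULL);	/* fails: &a is no longer current */
	printf("position id: %d\n", atomic_load(&iter_position)->id);
	return 0;
}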
@@ -1346,15 +1262,18 @@ out:
1346} 1262}
1347 1263
1348/** 1264/**
1349 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1265 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
1350 * @page: the page 1266 * @page: the page
1351 * @zone: zone of the page 1267 * @zone: zone of the page
1268 *
1269 * This function is only safe when following the LRU page isolation
1270 * and putback protocol: the LRU lock must be held, and the page must
1271 * either be PageLRU() or the caller must have isolated/allocated it.
1352 */ 1272 */
1353struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1273struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1354{ 1274{
1355 struct mem_cgroup_per_zone *mz; 1275 struct mem_cgroup_per_zone *mz;
1356 struct mem_cgroup *memcg; 1276 struct mem_cgroup *memcg;
1357 struct page_cgroup *pc;
1358 struct lruvec *lruvec; 1277 struct lruvec *lruvec;
1359 1278
1360 if (mem_cgroup_disabled()) { 1279 if (mem_cgroup_disabled()) {
@@ -1362,20 +1281,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1362 goto out; 1281 goto out;
1363 } 1282 }
1364 1283
1365 pc = lookup_page_cgroup(page); 1284 memcg = page->mem_cgroup;
1366 memcg = pc->mem_cgroup;
1367
1368 /* 1285 /*
1369 * Surreptitiously switch any uncharged offlist page to root: 1286 * Swapcache readahead pages are added to the LRU - and
1370 * an uncharged page off lru does nothing to secure 1287 * possibly migrated - before they are charged.
1371 * its former mem_cgroup from sudden removal.
1372 *
1373 * Our caller holds lru_lock, and PageCgroupUsed is updated
1374 * under page_cgroup lock: between them, they make all uses
1375 * of pc->mem_cgroup safe.
1376 */ 1288 */
1377 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1289 if (!memcg)
1378 pc->mem_cgroup = memcg = root_mem_cgroup; 1290 memcg = root_mem_cgroup;
1379 1291
1380 mz = mem_cgroup_page_zoneinfo(memcg, page); 1292 mz = mem_cgroup_page_zoneinfo(memcg, page);
1381 lruvec = &mz->lruvec; 1293 lruvec = &mz->lruvec;
@@ -1414,41 +1326,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1414 VM_BUG_ON((long)(*lru_size) < 0); 1326 VM_BUG_ON((long)(*lru_size) < 0);
1415} 1327}
1416 1328
1417/* 1329bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
1418 * Checks whether given mem is same or in the root_mem_cgroup's
1419 * hierarchy subtree
1420 */
1421bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1422 struct mem_cgroup *memcg)
1423{ 1330{
1424 if (root_memcg == memcg) 1331 if (root == memcg)
1425 return true; 1332 return true;
1426 if (!root_memcg->use_hierarchy || !memcg) 1333 if (!root->use_hierarchy)
1427 return false; 1334 return false;
1428 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); 1335 return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
1429}
1430
1431static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1432 struct mem_cgroup *memcg)
1433{
1434 bool ret;
1435
1436 rcu_read_lock();
1437 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1438 rcu_read_unlock();
1439 return ret;
1440} 1336}
1441 1337
1442bool task_in_mem_cgroup(struct task_struct *task, 1338bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1443 const struct mem_cgroup *memcg)
1444{ 1339{
1445 struct mem_cgroup *curr = NULL; 1340 struct mem_cgroup *task_memcg;
1446 struct task_struct *p; 1341 struct task_struct *p;
1447 bool ret; 1342 bool ret;
1448 1343
1449 p = find_lock_task_mm(task); 1344 p = find_lock_task_mm(task);
1450 if (p) { 1345 if (p) {
1451 curr = get_mem_cgroup_from_mm(p->mm); 1346 task_memcg = get_mem_cgroup_from_mm(p->mm);
1452 task_unlock(p); 1347 task_unlock(p);
1453 } else { 1348 } else {
1454 /* 1349 /*
@@ -1457,19 +1352,12 @@ bool task_in_mem_cgroup(struct task_struct *task,
1457 * killed to prevent needlessly killing additional tasks. 1352 * killed to prevent needlessly killing additional tasks.
1458 */ 1353 */
1459 rcu_read_lock(); 1354 rcu_read_lock();
1460 curr = mem_cgroup_from_task(task); 1355 task_memcg = mem_cgroup_from_task(task);
1461 if (curr) 1356 css_get(&task_memcg->css);
1462 css_get(&curr->css);
1463 rcu_read_unlock(); 1357 rcu_read_unlock();
1464 } 1358 }
1465 /* 1359 ret = mem_cgroup_is_descendant(task_memcg, memcg);
1466 * We should check use_hierarchy of "memcg" not "curr". Because checking 1360 css_put(&task_memcg->css);
1467 * use_hierarchy of "curr" here makes this function true if hierarchy is
1468 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
1469 * hierarchy(even if use_hierarchy is disabled in "memcg").
1470 */
1471 ret = mem_cgroup_same_or_subtree(memcg, curr);
1472 css_put(&curr->css);
1473 return ret; 1361 return ret;
1474} 1362}
1475 1363
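The new mem_cgroup_is_descendant() delegates to cgroup_is_descendant() after the use_hierarchy check. The sketch below is only a model of the tree walk, using a hypothetical parent-pointer struct cg; it ignores use_hierarchy, RCU and reference counting entirely.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical cgroup node with a parent pointer; the kernel walks the
 * css tree via cgroup_is_descendant() instead. */
struct cg { struct cg *parent; const char *name; };

/* true if memcg == root or memcg lies anywhere below root */
static bool is_descendant(struct cg *memcg, struct cg *root)
{
        for (; memcg; memcg = memcg->parent)
                if (memcg == root)
                        return true;
        return false;
}

int main(void)
{
        struct cg root = { NULL, "root" };
        struct cg a = { &root, "a" };
        struct cg b = { &a, "b" };

        printf("%d %d %d\n",
               is_descendant(&b, &root),  /* 1 */
               is_descendant(&b, &b),     /* 1: the same group counts */
               is_descendant(&a, &b));    /* 0 */
        return 0;
}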
@@ -1492,7 +1380,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1492 return inactive * inactive_ratio < active; 1380 return inactive * inactive_ratio < active;
1493} 1381}
1494 1382
1495#define mem_cgroup_from_res_counter(counter, member) \ 1383#define mem_cgroup_from_counter(counter, member) \
1496 container_of(counter, struct mem_cgroup, member) 1384 container_of(counter, struct mem_cgroup, member)
1497 1385
1498/** 1386/**
@@ -1504,12 +1392,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1504 */ 1392 */
1505static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1393static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1506{ 1394{
1507 unsigned long long margin; 1395 unsigned long margin = 0;
1396 unsigned long count;
1397 unsigned long limit;
1508 1398
1509 margin = res_counter_margin(&memcg->res); 1399 count = page_counter_read(&memcg->memory);
1510 if (do_swap_account) 1400 limit = ACCESS_ONCE(memcg->memory.limit);
1511 margin = min(margin, res_counter_margin(&memcg->memsw)); 1401 if (count < limit)
1512 return margin >> PAGE_SHIFT; 1402 margin = limit - count;
1403
1404 if (do_swap_account) {
1405 count = page_counter_read(&memcg->memsw);
1406 limit = ACCESS_ONCE(memcg->memsw.limit);
1407 if (count <= limit)
1408 margin = min(margin, limit - count);
1409 }
1410
1411 return margin;
1513} 1412}
1514 1413
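mem_cgroup_margin() now works purely in pages: headroom under memory.limit, clamped by the memsw headroom when swap accounting is enabled. A small standalone sketch of that arithmetic (plain integers, no ACCESS_ONCE or concurrency):

#include <stdio.h>

/* Simplified: counts and limits are plain page counts, read once. */
static unsigned long margin(unsigned long mem_count, unsigned long mem_limit,
                            unsigned long memsw_count, unsigned long memsw_limit,
                            int do_swap_account)
{
        unsigned long m = 0;

        if (mem_count < mem_limit)
                m = mem_limit - mem_count;

        if (do_swap_account && memsw_count <= memsw_limit) {
                unsigned long msw = memsw_limit - memsw_count;
                if (msw < m)
                        m = msw;
        }
        return m;
}

int main(void)
{
        /* memory: 300 of 512 pages used; memory+swap: 500 of 520 used. */
        printf("%lu\n", margin(300, 512, 500, 520, 1)); /* 20: memsw is tighter */
        printf("%lu\n", margin(300, 512, 500, 520, 0)); /* 212 without swap accounting */
        return 0;
}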
1515int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1414int mem_cgroup_swappiness(struct mem_cgroup *memcg)
@@ -1522,37 +1421,6 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1522} 1421}
1523 1422
1524/* 1423/*
1525 * memcg->moving_account is used for checking possibility that some thread is
1526 * calling move_account(). When a thread on CPU-A starts moving pages under
1527 * a memcg, other threads should check memcg->moving_account under
1528 * rcu_read_lock(), like this:
1529 *
1530 * CPU-A CPU-B
1531 * rcu_read_lock()
1532 * memcg->moving_account+1 if (memcg->moving_account)
1533 * take heavy locks.
1534 * synchronize_rcu() update something.
1535 * rcu_read_unlock()
1536 * start move here.
1537 */
1538
1539static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1540{
1541 atomic_inc(&memcg->moving_account);
1542 synchronize_rcu();
1543}
1544
1545static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1546{
1547 /*
1548 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1549 * We check NULL in callee rather than caller.
1550 */
1551 if (memcg)
1552 atomic_dec(&memcg->moving_account);
1553}
1554
1555/*
1556 * A routine for checking "mem" is under move_account() or not. 1424 * A routine for checking "mem" is under move_account() or not.
1557 * 1425 *
1558 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1426 * Checking a cgroup is mc.from or mc.to or under hierarchy of
@@ -1574,8 +1442,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1574 if (!from) 1442 if (!from)
1575 goto unlock; 1443 goto unlock;
1576 1444
1577 ret = mem_cgroup_same_or_subtree(memcg, from) 1445 ret = mem_cgroup_is_descendant(from, memcg) ||
1578 || mem_cgroup_same_or_subtree(memcg, to); 1446 mem_cgroup_is_descendant(to, memcg);
1579unlock: 1447unlock:
1580 spin_unlock(&mc.lock); 1448 spin_unlock(&mc.lock);
1581 return ret; 1449 return ret;
@@ -1597,23 +1465,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1597 return false; 1465 return false;
1598} 1466}
1599 1467
1600/*
1601 * Take this lock when
1602 * - a code tries to modify page's memcg while it's USED.
1603 * - a code tries to modify page state accounting in a memcg.
1604 */
1605static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1606 unsigned long *flags)
1607{
1608 spin_lock_irqsave(&memcg->move_lock, *flags);
1609}
1610
1611static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1612 unsigned long *flags)
1613{
1614 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1615}
1616
1617#define K(x) ((x) << (PAGE_SHIFT-10)) 1468#define K(x) ((x) << (PAGE_SHIFT-10))
1618/** 1469/**
1619 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1470 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
@@ -1644,18 +1495,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1644 1495
1645 rcu_read_unlock(); 1496 rcu_read_unlock();
1646 1497
1647 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1498 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1648 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1499 K((u64)page_counter_read(&memcg->memory)),
1649 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1500 K((u64)memcg->memory.limit), memcg->memory.failcnt);
1650 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1501 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1651 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", 1502 K((u64)page_counter_read(&memcg->memsw)),
1652 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1503 K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1653 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1504 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1654 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1505 K((u64)page_counter_read(&memcg->kmem)),
1655 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1506 K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1656 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1657 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1658 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1659 1507
1660 for_each_mem_cgroup_tree(iter, memcg) { 1508 for_each_mem_cgroup_tree(iter, memcg) {
1661 pr_info("Memory cgroup stats for "); 1509 pr_info("Memory cgroup stats for ");
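The OOM report switches from byte-valued res_counter reads shifted by 10 to page counts converted with K(x) = x << (PAGE_SHIFT - 10). A tiny standalone check of that conversion, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12                       /* assume 4 KiB pages */
#define K(x) ((x) << (PAGE_SHIFT - 10))     /* pages -> kilobytes */

int main(void)
{
        unsigned long long usage_pages = 25600;     /* 100 MiB worth of pages */

        printf("memory: usage %llukB\n", K(usage_pages)); /* 102400 kB */
        return 0;
}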
@@ -1695,28 +1543,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1695/* 1543/*
1696 * Return the memory (and swap, if configured) limit for a memcg. 1544 * Return the memory (and swap, if configured) limit for a memcg.
1697 */ 1545 */
1698static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1546static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1699{ 1547{
1700 u64 limit; 1548 unsigned long limit;
1701
1702 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1703 1549
1704 /* 1550 limit = memcg->memory.limit;
1705 * Do not consider swap space if we cannot swap due to swappiness
1706 */
1707 if (mem_cgroup_swappiness(memcg)) { 1551 if (mem_cgroup_swappiness(memcg)) {
1708 u64 memsw; 1552 unsigned long memsw_limit;
1709 1553
1710 limit += total_swap_pages << PAGE_SHIFT; 1554 memsw_limit = memcg->memsw.limit;
1711 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1555 limit = min(limit + total_swap_pages, memsw_limit);
1712
1713 /*
1714 * If memsw is finite and limits the amount of swap space
1715 * available to this memcg, return that limit.
1716 */
1717 limit = min(limit, memsw);
1718 } 1556 }
1719
1720 return limit; 1557 return limit;
1721} 1558}
1722 1559
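mem_cgroup_get_limit() now returns pages: the memory limit plus all of swap, capped at the memsw limit, with swap ignored when swappiness is zero. A sketch of that formula with made-up page counts (oom_limit is an invented name for the example):

#include <stdio.h>

static unsigned long oom_limit(unsigned long memory_limit,
                               unsigned long memsw_limit,
                               unsigned long total_swap_pages,
                               int swappiness)
{
        unsigned long limit = memory_limit;

        if (swappiness) {
                unsigned long with_swap = limit + total_swap_pages;
                limit = with_swap < memsw_limit ? with_swap : memsw_limit;
        }
        return limit;
}

int main(void)
{
        /* 512 MiB memory limit, 768 MiB memory+swap limit, 1 GiB of swap,
         * all expressed in 4 KiB pages. */
        printf("%lu\n", oom_limit(131072, 196608, 262144, 60)); /* 196608: capped by memsw */
        printf("%lu\n", oom_limit(131072, 196608, 262144, 0));  /* 131072: swappiness 0 */
        return 0;
}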
@@ -1740,7 +1577,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1740 } 1577 }
1741 1578
1742 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1579 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1743 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1580 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1744 for_each_mem_cgroup_tree(iter, memcg) { 1581 for_each_mem_cgroup_tree(iter, memcg) {
1745 struct css_task_iter it; 1582 struct css_task_iter it;
1746 struct task_struct *task; 1583 struct task_struct *task;
@@ -1880,52 +1717,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1880 memcg->last_scanned_node = node; 1717 memcg->last_scanned_node = node;
1881 return node; 1718 return node;
1882} 1719}
1883
1884/*
1885 * Check all nodes for whether they contain reclaimable pages or not.
1886 * For quick scan, we make use of scan_nodes. This will allow us to skip
1887 * unused nodes. But scan_nodes is lazily updated and may not contain
1888 * enough new information. We need to do double check.
1889 */
1890static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1891{
1892 int nid;
1893
1894 /*
1895 * quick check...making use of scan_node.
1896 * We can skip unused nodes.
1897 */
1898 if (!nodes_empty(memcg->scan_nodes)) {
1899 for (nid = first_node(memcg->scan_nodes);
1900 nid < MAX_NUMNODES;
1901 nid = next_node(nid, memcg->scan_nodes)) {
1902
1903 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1904 return true;
1905 }
1906 }
1907 /*
1908 * Check rest of nodes.
1909 */
1910 for_each_node_state(nid, N_MEMORY) {
1911 if (node_isset(nid, memcg->scan_nodes))
1912 continue;
1913 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1914 return true;
1915 }
1916 return false;
1917}
1918
1919#else 1720#else
1920int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1721int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1921{ 1722{
1922 return 0; 1723 return 0;
1923} 1724}
1924
1925static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1926{
1927 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1928}
1929#endif 1725#endif
1930 1726
1931static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1727static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
@@ -1943,7 +1739,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1943 .priority = 0, 1739 .priority = 0,
1944 }; 1740 };
1945 1741
1946 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1742 excess = soft_limit_excess(root_memcg);
1947 1743
1948 while (1) { 1744 while (1) {
1949 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1745 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
@@ -1969,12 +1765,10 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1969 } 1765 }
1970 continue; 1766 continue;
1971 } 1767 }
1972 if (!mem_cgroup_reclaimable(victim, false))
1973 continue;
1974 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1768 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1975 zone, &nr_scanned); 1769 zone, &nr_scanned);
1976 *total_scanned += nr_scanned; 1770 *total_scanned += nr_scanned;
1977 if (!res_counter_soft_limit_excess(&root_memcg->res)) 1771 if (!soft_limit_excess(root_memcg))
1978 break; 1772 break;
1979 } 1773 }
1980 mem_cgroup_iter_break(root_memcg, victim); 1774 mem_cgroup_iter_break(root_memcg, victim);
@@ -2081,12 +1875,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
2081 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1875 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2082 oom_wait_memcg = oom_wait_info->memcg; 1876 oom_wait_memcg = oom_wait_info->memcg;
2083 1877
2084 /* 1878 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
2085 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 1879 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
2086 * Then we can use css_is_ancestor without taking care of RCU.
2087 */
2088 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2089 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2090 return 0; 1880 return 0;
2091 return autoremove_wake_function(wait, mode, sync, arg); 1881 return autoremove_wake_function(wait, mode, sync, arg);
2092} 1882}
@@ -2228,26 +2018,23 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
2228 unsigned long *flags) 2018 unsigned long *flags)
2229{ 2019{
2230 struct mem_cgroup *memcg; 2020 struct mem_cgroup *memcg;
2231 struct page_cgroup *pc;
2232 2021
2233 rcu_read_lock(); 2022 rcu_read_lock();
2234 2023
2235 if (mem_cgroup_disabled()) 2024 if (mem_cgroup_disabled())
2236 return NULL; 2025 return NULL;
2237
2238 pc = lookup_page_cgroup(page);
2239again: 2026again:
2240 memcg = pc->mem_cgroup; 2027 memcg = page->mem_cgroup;
2241 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2028 if (unlikely(!memcg))
2242 return NULL; 2029 return NULL;
2243 2030
2244 *locked = false; 2031 *locked = false;
2245 if (atomic_read(&memcg->moving_account) <= 0) 2032 if (atomic_read(&memcg->moving_account) <= 0)
2246 return memcg; 2033 return memcg;
2247 2034
2248 move_lock_mem_cgroup(memcg, flags); 2035 spin_lock_irqsave(&memcg->move_lock, *flags);
2249 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2036 if (memcg != page->mem_cgroup) {
2250 move_unlock_mem_cgroup(memcg, flags); 2037 spin_unlock_irqrestore(&memcg->move_lock, *flags);
2251 goto again; 2038 goto again;
2252 } 2039 }
2253 *locked = true; 2040 *locked = true;
@@ -2261,11 +2048,11 @@ again:
2261 * @locked: value received from mem_cgroup_begin_page_stat() 2048 * @locked: value received from mem_cgroup_begin_page_stat()
2262 * @flags: value received from mem_cgroup_begin_page_stat() 2049 * @flags: value received from mem_cgroup_begin_page_stat()
2263 */ 2050 */
2264void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, 2051void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
2265 unsigned long flags) 2052 unsigned long *flags)
2266{ 2053{
2267 if (memcg && locked) 2054 if (memcg && *locked)
2268 move_unlock_mem_cgroup(memcg, &flags); 2055 spin_unlock_irqrestore(&memcg->move_lock, *flags);
2269 2056
2270 rcu_read_unlock(); 2057 rcu_read_unlock();
2271} 2058}
@@ -2316,33 +2103,32 @@ static DEFINE_MUTEX(percpu_charge_mutex);
2316static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2103static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2317{ 2104{
2318 struct memcg_stock_pcp *stock; 2105 struct memcg_stock_pcp *stock;
2319 bool ret = true; 2106 bool ret = false;
2320 2107
2321 if (nr_pages > CHARGE_BATCH) 2108 if (nr_pages > CHARGE_BATCH)
2322 return false; 2109 return ret;
2323 2110
2324 stock = &get_cpu_var(memcg_stock); 2111 stock = &get_cpu_var(memcg_stock);
2325 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2112 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2326 stock->nr_pages -= nr_pages; 2113 stock->nr_pages -= nr_pages;
2327 else /* need to call res_counter_charge */ 2114 ret = true;
2328 ret = false; 2115 }
2329 put_cpu_var(memcg_stock); 2116 put_cpu_var(memcg_stock);
2330 return ret; 2117 return ret;
2331} 2118}
2332 2119
2333/* 2120/*
2334 * Returns stocks cached in percpu to res_counter and reset cached information. 2121 * Returns stocks cached in percpu and reset cached information.
2335 */ 2122 */
2336static void drain_stock(struct memcg_stock_pcp *stock) 2123static void drain_stock(struct memcg_stock_pcp *stock)
2337{ 2124{
2338 struct mem_cgroup *old = stock->cached; 2125 struct mem_cgroup *old = stock->cached;
2339 2126
2340 if (stock->nr_pages) { 2127 if (stock->nr_pages) {
2341 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2128 page_counter_uncharge(&old->memory, stock->nr_pages);
2342
2343 res_counter_uncharge(&old->res, bytes);
2344 if (do_swap_account) 2129 if (do_swap_account)
2345 res_counter_uncharge(&old->memsw, bytes); 2130 page_counter_uncharge(&old->memsw, stock->nr_pages);
2131 css_put_many(&old->css, stock->nr_pages);
2346 stock->nr_pages = 0; 2132 stock->nr_pages = 0;
2347 } 2133 }
2348 stock->cached = NULL; 2134 stock->cached = NULL;
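consume_stock() now only succeeds when the cached per-CPU stock actually covers the request, and drain_stock() returns the remainder to the page counters. The sketch below is single-threaded and uses hypothetical struct names; the real code runs per CPU under get_cpu_var() and also drops css references.

#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32UL

struct counter { unsigned long usage; };
struct stock { struct counter *cached; unsigned long nr_pages; };

static bool consume_stock(struct stock *st, struct counter *memcg, unsigned long nr)
{
        if (nr > CHARGE_BATCH)
                return false;
        if (st->cached == memcg && st->nr_pages >= nr) {
                st->nr_pages -= nr;
                return true;
        }
        return false;
}

static void drain_stock(struct stock *st)
{
        if (st->nr_pages && st->cached) {
                st->cached->usage -= st->nr_pages;  /* page_counter_uncharge() */
                st->nr_pages = 0;
        }
        st->cached = NULL;
}

int main(void)
{
        struct counter memcg = { .usage = 64 };        /* 64 pages charged */
        struct stock st = { .cached = &memcg, .nr_pages = 10 };

        printf("%d %lu\n", consume_stock(&st, &memcg, 4), st.nr_pages); /* 1 6 */
        printf("%d\n", consume_stock(&st, &memcg, 8));                  /* 0: not enough */
        drain_stock(&st);
        printf("%lu\n", memcg.usage);                                   /* 58 */
        return 0;
}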
@@ -2371,7 +2157,7 @@ static void __init memcg_stock_init(void)
2371} 2157}
2372 2158
2373/* 2159/*
2374 * Cache charges(val) which is from res_counter, to local per_cpu area. 2160 * Cache charges(val) to local per_cpu area.
2375 * This will be consumed by consume_stock() function, later. 2161 * This will be consumed by consume_stock() function, later.
2376 */ 2162 */
2377static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2163static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2388,13 +2174,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2388 2174
2389/* 2175/*
2390 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2176 * Drains all per-CPU charge caches for given root_memcg resp. subtree
2391 * of the hierarchy under it. sync flag says whether we should block 2177 * of the hierarchy under it.
2392 * until the work is done.
2393 */ 2178 */
2394static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2179static void drain_all_stock(struct mem_cgroup *root_memcg)
2395{ 2180{
2396 int cpu, curcpu; 2181 int cpu, curcpu;
2397 2182
2183 /* If someone's already draining, avoid adding more workers. */
2184 if (!mutex_trylock(&percpu_charge_mutex))
2185 return;
2398 /* Notify other cpus that system-wide "drain" is running */ 2186 /* Notify other cpus that system-wide "drain" is running */
2399 get_online_cpus(); 2187 get_online_cpus();
2400 curcpu = get_cpu(); 2188 curcpu = get_cpu();
@@ -2405,7 +2193,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2405 memcg = stock->cached; 2193 memcg = stock->cached;
2406 if (!memcg || !stock->nr_pages) 2194 if (!memcg || !stock->nr_pages)
2407 continue; 2195 continue;
2408 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2196 if (!mem_cgroup_is_descendant(memcg, root_memcg))
2409 continue; 2197 continue;
2410 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2198 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2411 if (cpu == curcpu) 2199 if (cpu == curcpu)
@@ -2415,42 +2203,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2415 } 2203 }
2416 } 2204 }
2417 put_cpu(); 2205 put_cpu();
2418
2419 if (!sync)
2420 goto out;
2421
2422 for_each_online_cpu(cpu) {
2423 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2424 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2425 flush_work(&stock->work);
2426 }
2427out:
2428 put_online_cpus(); 2206 put_online_cpus();
2429}
2430
2431/*
2432 * Tries to drain stocked charges in other cpus. This function is asynchronous
2433 * and just puts a work per cpu for draining locally on each cpu. Caller can
2434 * expect some charges will be back to res_counter later but cannot wait for
2435 * it.
2436 */
2437static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2438{
2439 /*
2440 * If someone calls draining, avoid adding more kworker runs.
2441 */
2442 if (!mutex_trylock(&percpu_charge_mutex))
2443 return;
2444 drain_all_stock(root_memcg, false);
2445 mutex_unlock(&percpu_charge_mutex);
2446}
2447
2448/* This is a synchronous drain interface. */
2449static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2450{
2451 /* called when force_empty is called */
2452 mutex_lock(&percpu_charge_mutex);
2453 drain_all_stock(root_memcg, true);
2454 mutex_unlock(&percpu_charge_mutex); 2207 mutex_unlock(&percpu_charge_mutex);
2455} 2208}
2456 2209
@@ -2506,9 +2259,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2506 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2259 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2507 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2260 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2508 struct mem_cgroup *mem_over_limit; 2261 struct mem_cgroup *mem_over_limit;
2509 struct res_counter *fail_res; 2262 struct page_counter *counter;
2510 unsigned long nr_reclaimed; 2263 unsigned long nr_reclaimed;
2511 unsigned long long size;
2512 bool may_swap = true; 2264 bool may_swap = true;
2513 bool drained = false; 2265 bool drained = false;
2514 int ret = 0; 2266 int ret = 0;
@@ -2519,16 +2271,15 @@ retry:
2519 if (consume_stock(memcg, nr_pages)) 2271 if (consume_stock(memcg, nr_pages))
2520 goto done; 2272 goto done;
2521 2273
2522 size = batch * PAGE_SIZE;
2523 if (!do_swap_account || 2274 if (!do_swap_account ||
2524 !res_counter_charge(&memcg->memsw, size, &fail_res)) { 2275 !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2525 if (!res_counter_charge(&memcg->res, size, &fail_res)) 2276 if (!page_counter_try_charge(&memcg->memory, batch, &counter))
2526 goto done_restock; 2277 goto done_restock;
2527 if (do_swap_account) 2278 if (do_swap_account)
2528 res_counter_uncharge(&memcg->memsw, size); 2279 page_counter_uncharge(&memcg->memsw, batch);
2529 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2280 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2530 } else { 2281 } else {
2531 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2282 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2532 may_swap = false; 2283 may_swap = false;
2533 } 2284 }
2534 2285
@@ -2561,7 +2312,7 @@ retry:
2561 goto retry; 2312 goto retry;
2562 2313
2563 if (!drained) { 2314 if (!drained) {
2564 drain_all_stock_async(mem_over_limit); 2315 drain_all_stock(mem_over_limit);
2565 drained = true; 2316 drained = true;
2566 goto retry; 2317 goto retry;
2567 } 2318 }
@@ -2603,6 +2354,7 @@ bypass:
2603 return -EINTR; 2354 return -EINTR;
2604 2355
2605done_restock: 2356done_restock:
2357 css_get_many(&memcg->css, batch);
2606 if (batch > nr_pages) 2358 if (batch > nr_pages)
2607 refill_stock(memcg, batch - nr_pages); 2359 refill_stock(memcg, batch - nr_pages);
2608done: 2360done:
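try_charge() now charges memsw first and memory second; if memory fails, the memsw charge is rolled back and reclaim targets the memory counter, while a memsw failure clears may_swap because swapping cannot free memsw. A compact sketch of that ordering, assuming do_swap_account is on and using simplified bounded counters (try_charge_counter and charge are invented names):

#include <stdbool.h>
#include <stdio.h>

struct counter { unsigned long usage, limit; };

static bool try_charge_counter(struct counter *c, unsigned long nr)
{
        if (c->usage + nr > c->limit)
                return false;           /* page_counter_try_charge() failure */
        c->usage += nr;
        return true;
}

/* Returns 0 on success; on failure reports whether reclaim may swap. */
static int charge(struct counter *memory, struct counter *memsw,
                  unsigned long nr, bool *may_swap)
{
        *may_swap = true;
        if (!try_charge_counter(memsw, nr)) {
                *may_swap = false;      /* memsw is the bottleneck */
                return -1;
        }
        if (!try_charge_counter(memory, nr)) {
                memsw->usage -= nr;     /* roll back the memsw charge */
                return -1;
        }
        return 0;
}

int main(void)
{
        struct counter memory = { 120, 128 }, memsw = { 120, 256 };
        bool may_swap;

        printf("%d\n", charge(&memory, &memsw, 16, &may_swap)); /* -1: memory full */
        printf("%lu %d\n", memsw.usage, may_swap);              /* 120 1: rolled back */
        return 0;
}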
@@ -2611,32 +2363,14 @@ done:
2611 2363
2612static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2364static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2613{ 2365{
2614 unsigned long bytes = nr_pages * PAGE_SIZE;
2615
2616 if (mem_cgroup_is_root(memcg)) 2366 if (mem_cgroup_is_root(memcg))
2617 return; 2367 return;
2618 2368
2619 res_counter_uncharge(&memcg->res, bytes); 2369 page_counter_uncharge(&memcg->memory, nr_pages);
2620 if (do_swap_account) 2370 if (do_swap_account)
2621 res_counter_uncharge(&memcg->memsw, bytes); 2371 page_counter_uncharge(&memcg->memsw, nr_pages);
2622}
2623
2624/*
2625 * Cancel charges in this cgroup...doesn't propagate to parent cgroup.
2626 * This is useful when moving usage to parent cgroup.
2627 */
2628static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2629 unsigned int nr_pages)
2630{
2631 unsigned long bytes = nr_pages * PAGE_SIZE;
2632
2633 if (mem_cgroup_is_root(memcg))
2634 return;
2635 2372
2636 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2373 css_put_many(&memcg->css, nr_pages);
2637 if (do_swap_account)
2638 res_counter_uncharge_until(&memcg->memsw,
2639 memcg->memsw.parent, bytes);
2640} 2374}
2641 2375
2642/* 2376/*
@@ -2665,17 +2399,15 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2665 */ 2399 */
2666struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2400struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2667{ 2401{
2668 struct mem_cgroup *memcg = NULL; 2402 struct mem_cgroup *memcg;
2669 struct page_cgroup *pc;
2670 unsigned short id; 2403 unsigned short id;
2671 swp_entry_t ent; 2404 swp_entry_t ent;
2672 2405
2673 VM_BUG_ON_PAGE(!PageLocked(page), page); 2406 VM_BUG_ON_PAGE(!PageLocked(page), page);
2674 2407
2675 pc = lookup_page_cgroup(page); 2408 memcg = page->mem_cgroup;
2676 if (PageCgroupUsed(pc)) { 2409 if (memcg) {
2677 memcg = pc->mem_cgroup; 2410 if (!css_tryget_online(&memcg->css))
2678 if (memcg && !css_tryget_online(&memcg->css))
2679 memcg = NULL; 2411 memcg = NULL;
2680 } else if (PageSwapCache(page)) { 2412 } else if (PageSwapCache(page)) {
2681 ent.val = page_private(page); 2413 ent.val = page_private(page);
@@ -2723,14 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated)
2723static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2455static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2724 bool lrucare) 2456 bool lrucare)
2725{ 2457{
2726 struct page_cgroup *pc = lookup_page_cgroup(page);
2727 int isolated; 2458 int isolated;
2728 2459
2729 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2460 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2730 /*
2731 * we don't need page_cgroup_lock for tail pages, because they are not
2732 * accessed by any other context at this point.
2733 */
2734 2461
2735 /* 2462 /*
2736 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2463 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
@@ -2741,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2741 2468
2742 /* 2469 /*
2743 * Nobody should be changing or seriously looking at 2470 * Nobody should be changing or seriously looking at
2744 * pc->mem_cgroup and pc->flags at this point: 2471 * page->mem_cgroup at this point:
2745 * 2472 *
2746 * - the page is uncharged 2473 * - the page is uncharged
2747 * 2474 *
@@ -2753,15 +2480,12 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2753 * - a page cache insertion, a swapin fault, or a migration 2480 * - a page cache insertion, a swapin fault, or a migration
2754 * have the page locked 2481 * have the page locked
2755 */ 2482 */
2756 pc->mem_cgroup = memcg; 2483 page->mem_cgroup = memcg;
2757 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
2758 2484
2759 if (lrucare) 2485 if (lrucare)
2760 unlock_page_lru(page, isolated); 2486 unlock_page_lru(page, isolated);
2761} 2487}
2762 2488
2763static DEFINE_MUTEX(set_limit_mutex);
2764
2765#ifdef CONFIG_MEMCG_KMEM 2489#ifdef CONFIG_MEMCG_KMEM
2766/* 2490/*
2767 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2491 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
@@ -2769,8 +2493,6 @@ static DEFINE_MUTEX(set_limit_mutex);
2769 */ 2493 */
2770static DEFINE_MUTEX(memcg_slab_mutex); 2494static DEFINE_MUTEX(memcg_slab_mutex);
2771 2495
2772static DEFINE_MUTEX(activate_kmem_mutex);
2773
2774/* 2496/*
2775 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2497 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2776 * in the memcg_cache_params struct. 2498 * in the memcg_cache_params struct.
@@ -2784,36 +2506,17 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2784 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2506 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2785} 2507}
2786 2508
2787#ifdef CONFIG_SLABINFO 2509static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2788static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) 2510 unsigned long nr_pages)
2789{
2790 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2791 struct memcg_cache_params *params;
2792
2793 if (!memcg_kmem_is_active(memcg))
2794 return -EIO;
2795
2796 print_slabinfo_header(m);
2797
2798 mutex_lock(&memcg_slab_mutex);
2799 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2800 cache_show(memcg_params_to_cache(params), m);
2801 mutex_unlock(&memcg_slab_mutex);
2802
2803 return 0;
2804}
2805#endif
2806
2807static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2808{ 2511{
2809 struct res_counter *fail_res; 2512 struct page_counter *counter;
2810 int ret = 0; 2513 int ret = 0;
2811 2514
2812 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2515 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
2813 if (ret) 2516 if (ret < 0)
2814 return ret; 2517 return ret;
2815 2518
2816 ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); 2519 ret = try_charge(memcg, gfp, nr_pages);
2817 if (ret == -EINTR) { 2520 if (ret == -EINTR) {
2818 /* 2521 /*
2819 * try_charge() chose to bypass to root due to OOM kill or 2522 * try_charge() chose to bypass to root due to OOM kill or
@@ -2830,37 +2533,27 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2830 * when the allocation triggers should have been already 2533 * when the allocation triggers should have been already
2831 * directed to the root cgroup in memcontrol.h 2534 * directed to the root cgroup in memcontrol.h
2832 */ 2535 */
2833 res_counter_charge_nofail(&memcg->res, size, &fail_res); 2536 page_counter_charge(&memcg->memory, nr_pages);
2834 if (do_swap_account) 2537 if (do_swap_account)
2835 res_counter_charge_nofail(&memcg->memsw, size, 2538 page_counter_charge(&memcg->memsw, nr_pages);
2836 &fail_res); 2539 css_get_many(&memcg->css, nr_pages);
2837 ret = 0; 2540 ret = 0;
2838 } else if (ret) 2541 } else if (ret)
2839 res_counter_uncharge(&memcg->kmem, size); 2542 page_counter_uncharge(&memcg->kmem, nr_pages);
2840 2543
2841 return ret; 2544 return ret;
2842} 2545}
2843 2546
2844static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 2547static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
2548 unsigned long nr_pages)
2845{ 2549{
2846 res_counter_uncharge(&memcg->res, size); 2550 page_counter_uncharge(&memcg->memory, nr_pages);
2847 if (do_swap_account) 2551 if (do_swap_account)
2848 res_counter_uncharge(&memcg->memsw, size); 2552 page_counter_uncharge(&memcg->memsw, nr_pages);
2849 2553
2850 /* Not down to 0 */ 2554 page_counter_uncharge(&memcg->kmem, nr_pages);
2851 if (res_counter_uncharge(&memcg->kmem, size))
2852 return;
2853 2555
2854 /* 2556 css_put_many(&memcg->css, nr_pages);
2855 * Releases a reference taken in kmem_cgroup_css_offline in case
2856 * this last uncharge is racing with the offlining code or it is
2857 * outliving the memcg existence.
2858 *
2859 * The memory barrier imposed by test&clear is paired with the
2860 * explicit one in memcg_kmem_mark_dead().
2861 */
2862 if (memcg_kmem_test_and_clear_dead(memcg))
2863 css_put(&memcg->css);
2864} 2557}
2865 2558
2866/* 2559/*
@@ -3124,19 +2817,21 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
3124 2817
3125int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 2818int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
3126{ 2819{
2820 unsigned int nr_pages = 1 << order;
3127 int res; 2821 int res;
3128 2822
3129 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, 2823 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
3130 PAGE_SIZE << order);
3131 if (!res) 2824 if (!res)
3132 atomic_add(1 << order, &cachep->memcg_params->nr_pages); 2825 atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
3133 return res; 2826 return res;
3134} 2827}
3135 2828
3136void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 2829void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
3137{ 2830{
3138 memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); 2831 unsigned int nr_pages = 1 << order;
3139 atomic_sub(1 << order, &cachep->memcg_params->nr_pages); 2832
2833 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
2834 atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
3140} 2835}
3141 2836
3142/* 2837/*
@@ -3257,7 +2952,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3257 return true; 2952 return true;
3258 } 2953 }
3259 2954
3260 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 2955 ret = memcg_charge_kmem(memcg, gfp, 1 << order);
3261 if (!ret) 2956 if (!ret)
3262 *_memcg = memcg; 2957 *_memcg = memcg;
3263 2958
@@ -3268,46 +2963,27 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3268void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2963void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3269 int order) 2964 int order)
3270{ 2965{
3271 struct page_cgroup *pc;
3272
3273 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2966 VM_BUG_ON(mem_cgroup_is_root(memcg));
3274 2967
3275 /* The page allocation failed. Revert */ 2968 /* The page allocation failed. Revert */
3276 if (!page) { 2969 if (!page) {
3277 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 2970 memcg_uncharge_kmem(memcg, 1 << order);
3278 return; 2971 return;
3279 } 2972 }
3280 /* 2973 page->mem_cgroup = memcg;
3281 * The page is freshly allocated and not visible to any
3282 * outside callers yet. Set up pc non-atomically.
3283 */
3284 pc = lookup_page_cgroup(page);
3285 pc->mem_cgroup = memcg;
3286 pc->flags = PCG_USED;
3287} 2974}
3288 2975
3289void __memcg_kmem_uncharge_pages(struct page *page, int order) 2976void __memcg_kmem_uncharge_pages(struct page *page, int order)
3290{ 2977{
3291 struct mem_cgroup *memcg = NULL; 2978 struct mem_cgroup *memcg = page->mem_cgroup;
3292 struct page_cgroup *pc;
3293
3294 2979
3295 pc = lookup_page_cgroup(page);
3296 if (!PageCgroupUsed(pc))
3297 return;
3298
3299 memcg = pc->mem_cgroup;
3300 pc->flags = 0;
3301
3302 /*
3303 * We trust that only if there is a memcg associated with the page, it
3304 * is a valid allocation
3305 */
3306 if (!memcg) 2980 if (!memcg)
3307 return; 2981 return;
3308 2982
3309 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2983 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3310 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 2984
2985 memcg_uncharge_kmem(memcg, 1 << order);
2986 page->mem_cgroup = NULL;
3311} 2987}
3312#else 2988#else
3313static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) 2989static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
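For kmem pages the lifecycle is now: charge the counters in pages, store the memcg pointer in the page on commit, and on uncharge read the pointer back, release 1 << order pages, and clear it. A toy sketch of that flow, with one usage field standing in for the kmem/memory/memsw counters and invented helper names:

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-ins: one usage field represents the page_counters
 * that memcg_charge_kmem()/memcg_uncharge_kmem() actually touch. */
struct memcg { unsigned long usage; };
struct page  { struct memcg *mem_cgroup; };

static void charge_kmem(struct memcg *memcg, int order)
{
        memcg->usage += 1UL << order;           /* charge in pages, not bytes */
}

static void commit_kmem(struct page *page, struct memcg *memcg)
{
        page->mem_cgroup = memcg;               /* no page_cgroup, no PCG_USED */
}

static void uncharge_kmem_page(struct page *page, int order)
{
        struct memcg *memcg = page->mem_cgroup;

        if (!memcg)                             /* never charged: nothing to undo */
                return;
        memcg->usage -= 1UL << order;
        page->mem_cgroup = NULL;
}

int main(void)
{
        struct memcg m = { 0 };
        struct page p = { NULL };

        charge_kmem(&m, 2);                     /* order-2 allocation = 4 pages */
        commit_kmem(&p, &m);
        printf("%lu\n", m.usage);               /* 4 */
        uncharge_kmem_page(&p, 2);
        printf("%lu %d\n", m.usage, p.mem_cgroup == NULL); /* 0 1 */
        return 0;
}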
@@ -3325,21 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3325 */ 3001 */
3326void mem_cgroup_split_huge_fixup(struct page *head) 3002void mem_cgroup_split_huge_fixup(struct page *head)
3327{ 3003{
3328 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3329 struct page_cgroup *pc;
3330 struct mem_cgroup *memcg;
3331 int i; 3004 int i;
3332 3005
3333 if (mem_cgroup_disabled()) 3006 if (mem_cgroup_disabled())
3334 return; 3007 return;
3335 3008
3336 memcg = head_pc->mem_cgroup; 3009 for (i = 1; i < HPAGE_PMD_NR; i++)
3337 for (i = 1; i < HPAGE_PMD_NR; i++) { 3010 head[i].mem_cgroup = head->mem_cgroup;
3338 pc = head_pc + i; 3011
3339 pc->mem_cgroup = memcg; 3012 __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3340 pc->flags = head_pc->flags;
3341 }
3342 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3343 HPAGE_PMD_NR); 3013 HPAGE_PMD_NR);
3344} 3014}
3345#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3015#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
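Splitting a huge page now reduces to copying the head's mem_cgroup pointer into each tail and adjusting the RSS_HUGE statistic; there are no per-tail page_cgroup flags left to replicate. A sketch of the pointer fan-out with a small stand-in HPAGE_PMD_NR:

#include <stdio.h>

#define HPAGE_PMD_NR 8          /* stand-in; 512 for 2 MiB pages with a 4 KiB base */

struct memcg { int id; };
struct page  { struct memcg *mem_cgroup; };

static void split_huge_fixup(struct page head[HPAGE_PMD_NR])
{
        int i;

        /* tails inherit the head's charge owner; nothing else to copy */
        for (i = 1; i < HPAGE_PMD_NR; i++)
                head[i].mem_cgroup = head[0].mem_cgroup;
}

int main(void)
{
        struct memcg m = { 42 };
        struct page huge[HPAGE_PMD_NR] = { { &m } };

        split_huge_fixup(huge);
        printf("%d %d\n", huge[1].mem_cgroup->id,
               huge[HPAGE_PMD_NR - 1].mem_cgroup->id);
        return 0;   /* prints "42 42" */
}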
@@ -3348,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3348 * mem_cgroup_move_account - move account of the page 3018 * mem_cgroup_move_account - move account of the page
3349 * @page: the page 3019 * @page: the page
3350 * @nr_pages: number of regular pages (>1 for huge pages) 3020 * @nr_pages: number of regular pages (>1 for huge pages)
3351 * @pc: page_cgroup of the page.
3352 * @from: mem_cgroup which the page is moved from. 3021 * @from: mem_cgroup which the page is moved from.
3353 * @to: mem_cgroup which the page is moved to. @from != @to. 3022 * @to: mem_cgroup which the page is moved to. @from != @to.
3354 * 3023 *
@@ -3361,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3361 */ 3030 */
3362static int mem_cgroup_move_account(struct page *page, 3031static int mem_cgroup_move_account(struct page *page,
3363 unsigned int nr_pages, 3032 unsigned int nr_pages,
3364 struct page_cgroup *pc,
3365 struct mem_cgroup *from, 3033 struct mem_cgroup *from,
3366 struct mem_cgroup *to) 3034 struct mem_cgroup *to)
3367{ 3035{
@@ -3381,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page,
3381 goto out; 3049 goto out;
3382 3050
3383 /* 3051 /*
3384 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup 3052 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
3385 * of its source page while we change it: page migration takes 3053 * of its source page while we change it: page migration takes
3386 * both pages off the LRU, but page cache replacement doesn't. 3054 * both pages off the LRU, but page cache replacement doesn't.
3387 */ 3055 */
@@ -3389,10 +3057,10 @@ static int mem_cgroup_move_account(struct page *page,
3389 goto out; 3057 goto out;
3390 3058
3391 ret = -EINVAL; 3059 ret = -EINVAL;
3392 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3060 if (page->mem_cgroup != from)
3393 goto out_unlock; 3061 goto out_unlock;
3394 3062
3395 move_lock_mem_cgroup(from, &flags); 3063 spin_lock_irqsave(&from->move_lock, flags);
3396 3064
3397 if (!PageAnon(page) && page_mapped(page)) { 3065 if (!PageAnon(page) && page_mapped(page)) {
3398 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3066 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
@@ -3409,14 +3077,15 @@ static int mem_cgroup_move_account(struct page *page,
3409 } 3077 }
3410 3078
3411 /* 3079 /*
3412 * It is safe to change pc->mem_cgroup here because the page 3080 * It is safe to change page->mem_cgroup here because the page
3413 * is referenced, charged, and isolated - we can't race with 3081 * is referenced, charged, and isolated - we can't race with
3414 * uncharging, charging, migration, or LRU putback. 3082 * uncharging, charging, migration, or LRU putback.
3415 */ 3083 */
3416 3084
3417 /* caller should have done css_get */ 3085 /* caller should have done css_get */
3418 pc->mem_cgroup = to; 3086 page->mem_cgroup = to;
3419 move_unlock_mem_cgroup(from, &flags); 3087 spin_unlock_irqrestore(&from->move_lock, flags);
3088
3420 ret = 0; 3089 ret = 0;
3421 3090
3422 local_irq_disable(); 3091 local_irq_disable();
@@ -3431,72 +3100,6 @@ out:
3431 return ret; 3100 return ret;
3432} 3101}
3433 3102
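mem_cgroup_move_account() checks page->mem_cgroup while holding the page lock, shifts the per-memcg statistics under from->move_lock, and only then rewrites the pointer. The single-threaded sketch below keeps that order but elides the locks (they appear only as comments):

#include <stdio.h>

struct memcg { long file_mapped; };
struct page  { struct memcg *mem_cgroup; int mapped; };

/* Returns 0 on success, -1 if the page no longer belongs to @from.
 * The kernel holds the page lock for the check and from->move_lock
 * while rewriting the pointer; both are elided here. */
static int move_account(struct page *page, struct memcg *from, struct memcg *to)
{
        if (page->mem_cgroup != from)
                return -1;

        if (page->mapped) {             /* shift per-memcg statistics */
                from->file_mapped--;
                to->file_mapped++;
        }
        page->mem_cgroup = to;          /* the actual move */
        return 0;
}

int main(void)
{
        struct memcg a = { 1 }, b = { 0 };
        struct page p = { &a, 1 };

        printf("%d %ld %ld\n", move_account(&p, &a, &b), a.file_mapped, b.file_mapped);
        printf("%d\n", move_account(&p, &a, &b));   /* -1: already moved to b */
        return 0;   /* prints "0 0 1" then "-1" */
}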
3434/**
3435 * mem_cgroup_move_parent - moves page to the parent group
3436 * @page: the page to move
3437 * @pc: page_cgroup of the page
3438 * @child: page's cgroup
3439 *
3440 * move charges to its parent or the root cgroup if the group has no
3441 * parent (aka use_hierarchy==0).
3442 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3443 * mem_cgroup_move_account fails) the failure is always temporary and
3444 * it signals a race with a page removal/uncharge or migration. In the
3445 * first case the page is on the way out and it will vanish from the LRU
3446 * on the next attempt and the call should be retried later.
3447 * Isolation from the LRU fails only if page has been isolated from
3448 * the LRU since we looked at it and that usually means either global
3449 * reclaim or migration going on. The page will either get back to the
3450 * LRU or vanish.
3451 * Finally, mem_cgroup_move_account fails only if the page got uncharged
3452 * (!PageCgroupUsed) or moved to a different group. The page will
3453 * disappear in the next attempt.
3454 */
3455static int mem_cgroup_move_parent(struct page *page,
3456 struct page_cgroup *pc,
3457 struct mem_cgroup *child)
3458{
3459 struct mem_cgroup *parent;
3460 unsigned int nr_pages;
3461 unsigned long uninitialized_var(flags);
3462 int ret;
3463
3464 VM_BUG_ON(mem_cgroup_is_root(child));
3465
3466 ret = -EBUSY;
3467 if (!get_page_unless_zero(page))
3468 goto out;
3469 if (isolate_lru_page(page))
3470 goto put;
3471
3472 nr_pages = hpage_nr_pages(page);
3473
3474 parent = parent_mem_cgroup(child);
3475 /*
3476 * If no parent, move charges to root cgroup.
3477 */
3478 if (!parent)
3479 parent = root_mem_cgroup;
3480
3481 if (nr_pages > 1) {
3482 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3483 flags = compound_lock_irqsave(page);
3484 }
3485
3486 ret = mem_cgroup_move_account(page, nr_pages,
3487 pc, child, parent);
3488 if (!ret)
3489 __mem_cgroup_cancel_local_charge(child, nr_pages);
3490
3491 if (nr_pages > 1)
3492 compound_unlock_irqrestore(page, flags);
3493 putback_lru_page(page);
3494put:
3495 put_page(page);
3496out:
3497 return ret;
3498}
3499
3500#ifdef CONFIG_MEMCG_SWAP 3103#ifdef CONFIG_MEMCG_SWAP
3501static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 3104static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3502 bool charge) 3105 bool charge)
@@ -3516,7 +3119,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3516 * 3119 *
3517 * Returns 0 on success, -EINVAL on failure. 3120 * Returns 0 on success, -EINVAL on failure.
3518 * 3121 *
3519 * The caller must have charged to @to, IOW, called res_counter_charge() about 3122 * The caller must have charged to @to, IOW, called page_counter_charge() about
3520 * both res and memsw, and called css_get(). 3123 * both res and memsw, and called css_get().
3521 */ 3124 */
3522static int mem_cgroup_move_swap_account(swp_entry_t entry, 3125static int mem_cgroup_move_swap_account(swp_entry_t entry,
@@ -3532,7 +3135,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3532 mem_cgroup_swap_statistics(to, true); 3135 mem_cgroup_swap_statistics(to, true);
3533 /* 3136 /*
3534 * This function is only called from task migration context now. 3137 * This function is only called from task migration context now.
3535 * It postpones res_counter and refcount handling till the end 3138 * It postpones page_counter and refcount handling till the end
3536 * of task migration(mem_cgroup_clear_mc()) for performance 3139 * of task migration(mem_cgroup_clear_mc()) for performance
3537 * improvement. But we cannot postpone css_get(to) because if 3140 * improvement. But we cannot postpone css_get(to) because if
3538 * the process that has been moved to @to does swap-in, the 3141 * the process that has been moved to @to does swap-in, the
@@ -3554,96 +3157,57 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3554} 3157}
3555#endif 3158#endif
3556 3159
3557#ifdef CONFIG_DEBUG_VM 3160static DEFINE_MUTEX(memcg_limit_mutex);
3558static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3559{
3560 struct page_cgroup *pc;
3561
3562 pc = lookup_page_cgroup(page);
3563 /*
3564 * Can be NULL while feeding pages into the page allocator for
3565 * the first time, i.e. during boot or memory hotplug;
3566 * or when mem_cgroup_disabled().
3567 */
3568 if (likely(pc) && PageCgroupUsed(pc))
3569 return pc;
3570 return NULL;
3571}
3572
3573bool mem_cgroup_bad_page_check(struct page *page)
3574{
3575 if (mem_cgroup_disabled())
3576 return false;
3577
3578 return lookup_page_cgroup_used(page) != NULL;
3579}
3580
3581void mem_cgroup_print_bad_page(struct page *page)
3582{
3583 struct page_cgroup *pc;
3584
3585 pc = lookup_page_cgroup_used(page);
3586 if (pc) {
3587 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3588 pc, pc->flags, pc->mem_cgroup);
3589 }
3590}
3591#endif
3592 3161
3593static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3162static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3594 unsigned long long val) 3163 unsigned long limit)
3595{ 3164{
3165 unsigned long curusage;
3166 unsigned long oldusage;
3167 bool enlarge = false;
3596 int retry_count; 3168 int retry_count;
3597 int ret = 0; 3169 int ret;
3598 int children = mem_cgroup_count_children(memcg);
3599 u64 curusage, oldusage;
3600 int enlarge;
3601 3170
3602 /* 3171 /*
3603 * For keeping hierarchical_reclaim simple, how long we should retry 3172 * For keeping hierarchical_reclaim simple, how long we should retry
3604 * depends on the callers. We set our retry-count to be a function 3173
3605 * of # of children which we should visit in this loop. 3174 * of # of children which we should visit in this loop.
3606 */ 3175 */
3607 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3176 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3177 mem_cgroup_count_children(memcg);
3608 3178
3609 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3179 oldusage = page_counter_read(&memcg->memory);
3610 3180
3611 enlarge = 0; 3181 do {
3612 while (retry_count) {
3613 if (signal_pending(current)) { 3182 if (signal_pending(current)) {
3614 ret = -EINTR; 3183 ret = -EINTR;
3615 break; 3184 break;
3616 } 3185 }
3617 /* 3186
3618 * Rather than hide all in some function, I do this in 3187 mutex_lock(&memcg_limit_mutex);
3619 * open coded manner. You see what this really does. 3188 if (limit > memcg->memsw.limit) {
3620 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3189 mutex_unlock(&memcg_limit_mutex);
3621 */
3622 mutex_lock(&set_limit_mutex);
3623 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
3624 ret = -EINVAL; 3190 ret = -EINVAL;
3625 mutex_unlock(&set_limit_mutex);
3626 break; 3191 break;
3627 } 3192 }
3628 3193 if (limit > memcg->memory.limit)
3629 if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) 3194 enlarge = true;
3630 enlarge = 1; 3195 ret = page_counter_limit(&memcg->memory, limit);
3631 3196 mutex_unlock(&memcg_limit_mutex);
3632 ret = res_counter_set_limit(&memcg->res, val);
3633 mutex_unlock(&set_limit_mutex);
3634 3197
3635 if (!ret) 3198 if (!ret)
3636 break; 3199 break;
3637 3200
3638 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 3201 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3639 3202
3640 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3203 curusage = page_counter_read(&memcg->memory);
3641 /* Usage is reduced ? */ 3204 /* Usage is reduced ? */
3642 if (curusage >= oldusage) 3205 if (curusage >= oldusage)
3643 retry_count--; 3206 retry_count--;
3644 else 3207 else
3645 oldusage = curusage; 3208 oldusage = curusage;
3646 } 3209 } while (retry_count);
3210
3647 if (!ret && enlarge) 3211 if (!ret && enlarge)
3648 memcg_oom_recover(memcg); 3212 memcg_oom_recover(memcg);
3649 3213
@@ -3651,52 +3215,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3651} 3215}
3652 3216
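The resize path is now a bounded do/while: validate the new limit against the other counter under memcg_limit_mutex, try page_counter_limit(), and on failure reclaim and retry only while usage keeps shrinking. A sketch of that control flow with a fake reclaim step (set_limit, resize_limit and reclaim_per_round are invented for the example):

#include <stdio.h>

struct counter { unsigned long usage, limit; };

static int set_limit(struct counter *c, unsigned long limit)
{
        if (c->usage > limit)
                return -1;              /* page_counter_limit() refuses */
        c->limit = limit;
        return 0;
}

/* Mimics the retry loop: shrink usage a bit per round, give up when
 * reclaim stops making progress. */
static int resize_limit(struct counter *memory, unsigned long limit,
                        unsigned long reclaim_per_round, int retries)
{
        unsigned long oldusage = memory->usage;

        do {
                if (!set_limit(memory, limit))
                        return 0;

                /* try_to_free_mem_cgroup_pages() stand-in */
                memory->usage -= memory->usage > reclaim_per_round ?
                                 reclaim_per_round : memory->usage;

                if (memory->usage >= oldusage)
                        retries--;      /* no progress: burn a retry */
                else
                        oldusage = memory->usage;
        } while (retries);

        return -1;
}

int main(void)
{
        struct counter memory = { .usage = 100, .limit = 256 };

        printf("%d %lu\n", resize_limit(&memory, 64, 20, 5), memory.limit);
        return 0;   /* prints "0 64": two reclaim rounds get usage under the new limit */
}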
3653static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3217static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3654 unsigned long long val) 3218 unsigned long limit)
3655{ 3219{
3220 unsigned long curusage;
3221 unsigned long oldusage;
3222 bool enlarge = false;
3656 int retry_count; 3223 int retry_count;
3657 u64 oldusage, curusage; 3224 int ret;
3658 int children = mem_cgroup_count_children(memcg);
3659 int ret = -EBUSY;
3660 int enlarge = 0;
3661 3225
3662 /* see mem_cgroup_resize_res_limit */ 3226 /* see mem_cgroup_resize_res_limit */
3663 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3227 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3664 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3228 mem_cgroup_count_children(memcg);
3665 while (retry_count) { 3229
3230 oldusage = page_counter_read(&memcg->memsw);
3231
3232 do {
3666 if (signal_pending(current)) { 3233 if (signal_pending(current)) {
3667 ret = -EINTR; 3234 ret = -EINTR;
3668 break; 3235 break;
3669 } 3236 }
3670 /* 3237
3671 * Rather than hide all in some function, I do this in 3238 mutex_lock(&memcg_limit_mutex);
3672 * open coded manner. You see what this really does. 3239 if (limit < memcg->memory.limit) {
3673 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3240 mutex_unlock(&memcg_limit_mutex);
3674 */
3675 mutex_lock(&set_limit_mutex);
3676 if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
3677 ret = -EINVAL; 3241 ret = -EINVAL;
3678 mutex_unlock(&set_limit_mutex);
3679 break; 3242 break;
3680 } 3243 }
3681 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) 3244 if (limit > memcg->memsw.limit)
3682 enlarge = 1; 3245 enlarge = true;
3683 ret = res_counter_set_limit(&memcg->memsw, val); 3246 ret = page_counter_limit(&memcg->memsw, limit);
3684 mutex_unlock(&set_limit_mutex); 3247 mutex_unlock(&memcg_limit_mutex);
3685 3248
3686 if (!ret) 3249 if (!ret)
3687 break; 3250 break;
3688 3251
3689 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 3252 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3690 3253
3691 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3254 curusage = page_counter_read(&memcg->memsw);
3692 /* Usage is reduced ? */ 3255 /* Usage is reduced ? */
3693 if (curusage >= oldusage) 3256 if (curusage >= oldusage)
3694 retry_count--; 3257 retry_count--;
3695 else 3258 else
3696 oldusage = curusage; 3259 oldusage = curusage;
3697 } 3260 } while (retry_count);
3261
3698 if (!ret && enlarge) 3262 if (!ret && enlarge)
3699 memcg_oom_recover(memcg); 3263 memcg_oom_recover(memcg);
3264
3700 return ret; 3265 return ret;
3701} 3266}
3702 3267
@@ -3709,7 +3274,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3709 unsigned long reclaimed; 3274 unsigned long reclaimed;
3710 int loop = 0; 3275 int loop = 0;
3711 struct mem_cgroup_tree_per_zone *mctz; 3276 struct mem_cgroup_tree_per_zone *mctz;
3712 unsigned long long excess; 3277 unsigned long excess;
3713 unsigned long nr_scanned; 3278 unsigned long nr_scanned;
3714 3279
3715 if (order > 0) 3280 if (order > 0)
@@ -3735,35 +3300,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3735 nr_reclaimed += reclaimed; 3300 nr_reclaimed += reclaimed;
3736 *total_scanned += nr_scanned; 3301 *total_scanned += nr_scanned;
3737 spin_lock_irq(&mctz->lock); 3302 spin_lock_irq(&mctz->lock);
3303 __mem_cgroup_remove_exceeded(mz, mctz);
3738 3304
3739 /* 3305 /*
3740 * If we failed to reclaim anything from this memory cgroup 3306 * If we failed to reclaim anything from this memory cgroup
3741 * it is time to move on to the next cgroup 3307 * it is time to move on to the next cgroup
3742 */ 3308 */
3743 next_mz = NULL; 3309 next_mz = NULL;
3744 if (!reclaimed) { 3310 if (!reclaimed)
3745 do { 3311 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3746 /* 3312
3747 * Loop until we find yet another one. 3313 excess = soft_limit_excess(mz->memcg);
3748 *
3749 * By the time we get the soft_limit lock
3750 * again, someone might have added the
3751 * group back on the RB tree. Iterate to
3752 * make sure we get a different mem.
3753 * mem_cgroup_largest_soft_limit_node returns
3754 * NULL if no other cgroup is present on
3755 * the tree
3756 */
3757 next_mz =
3758 __mem_cgroup_largest_soft_limit_node(mctz);
3759 if (next_mz == mz)
3760 css_put(&next_mz->memcg->css);
3761 else /* next_mz == NULL or other memcg */
3762 break;
3763 } while (1);
3764 }
3765 __mem_cgroup_remove_exceeded(mz, mctz);
3766 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3767 /* 3314 /*
3768 * One school of thought says that we should not add 3315 * One school of thought says that we should not add
3769 * back the node to the tree if reclaim returns 0. 3316 * back the node to the tree if reclaim returns 0.
@@ -3792,107 +3339,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3792 return nr_reclaimed; 3339 return nr_reclaimed;
3793} 3340}
3794 3341
3795/**
3796 * mem_cgroup_force_empty_list - clears LRU of a group
3797 * @memcg: group to clear
3798 * @node: NUMA node
3799 * @zid: zone id
3800 * @lru: lru to clear
3801 *
3802 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3803 * reclaim the pages themselves - pages are moved to the parent (or root)
3804 * group.
3805 */
3806static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3807 int node, int zid, enum lru_list lru)
3808{
3809 struct lruvec *lruvec;
3810 unsigned long flags;
3811 struct list_head *list;
3812 struct page *busy;
3813 struct zone *zone;
3814
3815 zone = &NODE_DATA(node)->node_zones[zid];
3816 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3817 list = &lruvec->lists[lru];
3818
3819 busy = NULL;
3820 do {
3821 struct page_cgroup *pc;
3822 struct page *page;
3823
3824 spin_lock_irqsave(&zone->lru_lock, flags);
3825 if (list_empty(list)) {
3826 spin_unlock_irqrestore(&zone->lru_lock, flags);
3827 break;
3828 }
3829 page = list_entry(list->prev, struct page, lru);
3830 if (busy == page) {
3831 list_move(&page->lru, list);
3832 busy = NULL;
3833 spin_unlock_irqrestore(&zone->lru_lock, flags);
3834 continue;
3835 }
3836 spin_unlock_irqrestore(&zone->lru_lock, flags);
3837
3838 pc = lookup_page_cgroup(page);
3839
3840 if (mem_cgroup_move_parent(page, pc, memcg)) {
3841 /* found lock contention or "pc" is obsolete. */
3842 busy = page;
3843 } else
3844 busy = NULL;
3845 cond_resched();
3846 } while (!list_empty(list));
3847}
3848
3849/*
3850 * make mem_cgroup's charge to be 0 if there is no task by moving
3851 * all the charges and pages to the parent.
3852 * This enables deleting this mem_cgroup.
3853 *
3854 * Caller is responsible for holding css reference on the memcg.
3855 */
3856static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3857{
3858 int node, zid;
3859 u64 usage;
3860
3861 do {
3862 /* This is for making all *used* pages to be on LRU. */
3863 lru_add_drain_all();
3864 drain_all_stock_sync(memcg);
3865 mem_cgroup_start_move(memcg);
3866 for_each_node_state(node, N_MEMORY) {
3867 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3868 enum lru_list lru;
3869 for_each_lru(lru) {
3870 mem_cgroup_force_empty_list(memcg,
3871 node, zid, lru);
3872 }
3873 }
3874 }
3875 mem_cgroup_end_move(memcg);
3876 memcg_oom_recover(memcg);
3877 cond_resched();
3878
3879 /*
3880 * Kernel memory may not necessarily be trackable to a specific
3881 * process. So they are not migrated, and therefore we can't
3882 * expect their value to drop to 0 here.
3883 * Having res filled up with kmem only is enough.
3884 *
3885 * This is a safety check because mem_cgroup_force_empty_list
3886 * could have raced with mem_cgroup_replace_page_cache callers
3887 * so the lru seemed empty but the page could have been added
3888 * right after the check. RES_USAGE should be safe as we always
3889 * charge before adding to the LRU.
3890 */
3891 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
3892 res_counter_read_u64(&memcg->kmem, RES_USAGE);
3893 } while (usage > 0);
3894}
3895
3896/* 3342/*
3897 * Test whether @memcg has children, dead or alive. Note that this 3343 * Test whether @memcg has children, dead or alive. Note that this
3898 * function doesn't care whether @memcg has use_hierarchy enabled and 3344 * function doesn't care whether @memcg has use_hierarchy enabled and
@@ -3930,7 +3376,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3930 /* we call try-to-free pages for make this cgroup empty */ 3376 /* we call try-to-free pages for make this cgroup empty */
3931 lru_add_drain_all(); 3377 lru_add_drain_all();
3932 /* try to free all pages in this cgroup */ 3378 /* try to free all pages in this cgroup */
3933 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 3379 while (nr_retries && page_counter_read(&memcg->memory)) {
3934 int progress; 3380 int progress;
3935 3381
3936 if (signal_pending(current)) 3382 if (signal_pending(current))
@@ -4001,8 +3447,8 @@ out:
4001 return retval; 3447 return retval;
4002} 3448}
4003 3449
4004static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 3450static unsigned long tree_stat(struct mem_cgroup *memcg,
4005 enum mem_cgroup_stat_index idx) 3451 enum mem_cgroup_stat_index idx)
4006{ 3452{
4007 struct mem_cgroup *iter; 3453 struct mem_cgroup *iter;
4008 long val = 0; 3454 long val = 0;
@@ -4020,55 +3466,71 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4020{ 3466{
4021 u64 val; 3467 u64 val;
4022 3468
4023 if (!mem_cgroup_is_root(memcg)) { 3469 if (mem_cgroup_is_root(memcg)) {
3470 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
3471 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
3472 if (swap)
3473 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
3474 } else {
4024 if (!swap) 3475 if (!swap)
4025 return res_counter_read_u64(&memcg->res, RES_USAGE); 3476 val = page_counter_read(&memcg->memory);
4026 else 3477 else
4027 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 3478 val = page_counter_read(&memcg->memsw);
4028 } 3479 }
4029
4030 /*
4031 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4032 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4033 */
4034 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4035 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4036
4037 if (swap)
4038 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4039
4040 return val << PAGE_SHIFT; 3480 return val << PAGE_SHIFT;
4041} 3481}
4042 3482
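Usage reporting now has two paths: the root group sums cache + rss (+ swap) over the whole tree via tree_stat(), every other group reads its page counter directly, and both results are shifted from pages to bytes. A sketch of that split with fabricated stat values:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages */

struct memcg {
        int is_root;
        unsigned long long cache, rss, swap;    /* tree-wide stats (pages) */
        unsigned long long memory, memsw;       /* page_counter reads (pages) */
};

static unsigned long long usage_bytes(const struct memcg *m, int swap)
{
        unsigned long long val;

        if (m->is_root) {
                val = m->cache + m->rss;        /* tree_stat() sums */
                if (swap)
                        val += m->swap;
        } else {
                val = swap ? m->memsw : m->memory;
        }
        return val << PAGE_SHIFT;               /* pages -> bytes */
}

int main(void)
{
        struct memcg root  = { 1, 1000, 3000, 200, 0, 0 };
        struct memcg child = { 0, 0, 0, 0, 512, 640 };

        printf("%llu\n", usage_bytes(&root, 1));  /* (1000+3000+200) << 12 */
        printf("%llu\n", usage_bytes(&child, 0)); /* 512 << 12 = 2097152   */
        return 0;
}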
3483enum {
3484 RES_USAGE,
3485 RES_LIMIT,
3486 RES_MAX_USAGE,
3487 RES_FAILCNT,
3488 RES_SOFT_LIMIT,
3489};
4043 3490
4044static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3491static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4045 struct cftype *cft) 3492 struct cftype *cft)
4046{ 3493{
4047 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3494 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4048 enum res_type type = MEMFILE_TYPE(cft->private); 3495 struct page_counter *counter;
4049 int name = MEMFILE_ATTR(cft->private);
4050 3496
4051 switch (type) { 3497 switch (MEMFILE_TYPE(cft->private)) {
4052 case _MEM: 3498 case _MEM:
4053 if (name == RES_USAGE) 3499 counter = &memcg->memory;
4054 return mem_cgroup_usage(memcg, false); 3500 break;
4055 return res_counter_read_u64(&memcg->res, name);
4056 case _MEMSWAP: 3501 case _MEMSWAP:
4057 if (name == RES_USAGE) 3502 counter = &memcg->memsw;
4058 return mem_cgroup_usage(memcg, true); 3503 break;
4059 return res_counter_read_u64(&memcg->memsw, name);
4060 case _KMEM: 3504 case _KMEM:
4061 return res_counter_read_u64(&memcg->kmem, name); 3505 counter = &memcg->kmem;
4062 break; 3506 break;
4063 default: 3507 default:
4064 BUG(); 3508 BUG();
4065 } 3509 }
3510
3511 switch (MEMFILE_ATTR(cft->private)) {
3512 case RES_USAGE:
3513 if (counter == &memcg->memory)
3514 return mem_cgroup_usage(memcg, false);
3515 if (counter == &memcg->memsw)
3516 return mem_cgroup_usage(memcg, true);
3517 return (u64)page_counter_read(counter) * PAGE_SIZE;
3518 case RES_LIMIT:
3519 return (u64)counter->limit * PAGE_SIZE;
3520 case RES_MAX_USAGE:
3521 return (u64)counter->watermark * PAGE_SIZE;
3522 case RES_FAILCNT:
3523 return counter->failcnt;
3524 case RES_SOFT_LIMIT:
3525 return (u64)memcg->soft_limit * PAGE_SIZE;
3526 default:
3527 BUG();
3528 }
4066} 3529}
4067 3530
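mem_cgroup_read_u64() now reads page_counter fields, which count pages, and scales them back to bytes so the cgroup files keep their byte-based interface (the old res_counter kept bytes internally). A toy, compilable illustration of that pages-to-bytes view, assuming 4 KiB pages and invented names (toy_page_counter, SKETCH_PAGE_SIZE):

#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL         /* assume 4 KiB pages for the example */

/* A toy stand-in for struct page_counter: everything is kept in pages. */
struct toy_page_counter {
        unsigned long count;            /* current usage, in pages */
        unsigned long limit;            /* hard limit, in pages */
        unsigned long watermark;        /* maximum usage seen, in pages */
        unsigned long failcnt;          /* number of failed charges */
};

/* The cgroup files keep reporting bytes, so readers scale by the page size. */
static unsigned long long pages_to_bytes(unsigned long pages)
{
        return (unsigned long long)pages * SKETCH_PAGE_SIZE;
}

int main(void)
{
        struct toy_page_counter memory = {
                .count = 300, .limit = 1024, .watermark = 512, .failcnt = 2,
        };

        printf("usage_in_bytes:     %llu\n", pages_to_bytes(memory.count));
        printf("limit_in_bytes:     %llu\n", pages_to_bytes(memory.limit));
        printf("max_usage_in_bytes: %llu\n", pages_to_bytes(memory.watermark));
        printf("failcnt:            %lu\n", memory.failcnt);
        return 0;
}

After this change the internal arithmetic is done in pages; only the user-visible files keep reporting bytes.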
4068#ifdef CONFIG_MEMCG_KMEM 3531#ifdef CONFIG_MEMCG_KMEM
4069/* should be called with activate_kmem_mutex held */ 3532static int memcg_activate_kmem(struct mem_cgroup *memcg,
4070static int __memcg_activate_kmem(struct mem_cgroup *memcg, 3533 unsigned long nr_pages)
4071 unsigned long long limit)
4072{ 3534{
4073 int err = 0; 3535 int err = 0;
4074 int memcg_id; 3536 int memcg_id;
@@ -4115,7 +3577,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4115 * We couldn't have accounted to this cgroup, because it hasn't got the 3577 * We couldn't have accounted to this cgroup, because it hasn't got the
4116 * active bit set yet, so this should succeed. 3578 * active bit set yet, so this should succeed.
4117 */ 3579 */
4118 err = res_counter_set_limit(&memcg->kmem, limit); 3580 err = page_counter_limit(&memcg->kmem, nr_pages);
4119 VM_BUG_ON(err); 3581 VM_BUG_ON(err);
4120 3582
4121 static_key_slow_inc(&memcg_kmem_enabled_key); 3583 static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -4130,26 +3592,17 @@ out:
4130 return err; 3592 return err;
4131} 3593}
4132 3594
4133static int memcg_activate_kmem(struct mem_cgroup *memcg,
4134 unsigned long long limit)
4135{
4136 int ret;
4137
4138 mutex_lock(&activate_kmem_mutex);
4139 ret = __memcg_activate_kmem(memcg, limit);
4140 mutex_unlock(&activate_kmem_mutex);
4141 return ret;
4142}
4143
4144static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 3595static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4145 unsigned long long val) 3596 unsigned long limit)
4146{ 3597{
4147 int ret; 3598 int ret;
4148 3599
3600 mutex_lock(&memcg_limit_mutex);
4149 if (!memcg_kmem_is_active(memcg)) 3601 if (!memcg_kmem_is_active(memcg))
4150 ret = memcg_activate_kmem(memcg, val); 3602 ret = memcg_activate_kmem(memcg, limit);
4151 else 3603 else
4152 ret = res_counter_set_limit(&memcg->kmem, val); 3604 ret = page_counter_limit(&memcg->kmem, limit);
3605 mutex_unlock(&memcg_limit_mutex);
4153 return ret; 3606 return ret;
4154} 3607}
4155 3608
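memcg_update_kmem_limit() folds the old activate_kmem_mutex into memcg_limit_mutex: the first limit write activates kmem accounting, later writes only resize the limit, all under one lock. A loose userspace sketch of that activate-once-then-resize pattern (toy names, a pthread mutex standing in for the kernel mutex):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy model of "activate on first limit write, then only adjust the limit". */
static pthread_mutex_t limit_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool kmem_active;
static unsigned long kmem_limit_pages;

static int toy_update_kmem_limit(unsigned long limit)
{
        pthread_mutex_lock(&limit_mutex);
        if (!kmem_active) {
                /* one-time setup would happen here (id allocation, caches, ...) */
                kmem_active = true;
        }
        kmem_limit_pages = limit;
        pthread_mutex_unlock(&limit_mutex);
        return 0;
}

int main(void)
{
        toy_update_kmem_limit(1 << 18);     /* first write activates */
        toy_update_kmem_limit(1 << 16);     /* later writes only resize */
        printf("active=%d limit=%lu pages\n", kmem_active, kmem_limit_pages);
        return 0;
}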
@@ -4161,19 +3614,19 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4161 if (!parent) 3614 if (!parent)
4162 return 0; 3615 return 0;
4163 3616
4164 mutex_lock(&activate_kmem_mutex); 3617 mutex_lock(&memcg_limit_mutex);
4165 /* 3618 /*
4166 * If the parent cgroup is not kmem-active now, it cannot be activated 3619 * If the parent cgroup is not kmem-active now, it cannot be activated
4167 * after this point, because it has at least one child already. 3620 * after this point, because it has at least one child already.
4168 */ 3621 */
4169 if (memcg_kmem_is_active(parent)) 3622 if (memcg_kmem_is_active(parent))
4170 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); 3623 ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
4171 mutex_unlock(&activate_kmem_mutex); 3624 mutex_unlock(&memcg_limit_mutex);
4172 return ret; 3625 return ret;
4173} 3626}
4174#else 3627#else
4175static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 3628static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4176 unsigned long long val) 3629 unsigned long limit)
4177{ 3630{
4178 return -EINVAL; 3631 return -EINVAL;
4179} 3632}
@@ -4187,110 +3640,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
4187 char *buf, size_t nbytes, loff_t off) 3640 char *buf, size_t nbytes, loff_t off)
4188{ 3641{
4189 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3642 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4190 enum res_type type; 3643 unsigned long nr_pages;
4191 int name;
4192 unsigned long long val;
4193 int ret; 3644 int ret;
4194 3645
4195 buf = strstrip(buf); 3646 buf = strstrip(buf);
4196 type = MEMFILE_TYPE(of_cft(of)->private); 3647 ret = page_counter_memparse(buf, &nr_pages);
4197 name = MEMFILE_ATTR(of_cft(of)->private); 3648 if (ret)
3649 return ret;
4198 3650
4199 switch (name) { 3651 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4200 case RES_LIMIT: 3652 case RES_LIMIT:
4201 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3653 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
4202 ret = -EINVAL; 3654 ret = -EINVAL;
4203 break; 3655 break;
4204 } 3656 }
4205 /* This function does all necessary parse...reuse it */ 3657 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4206 ret = res_counter_memparse_write_strategy(buf, &val); 3658 case _MEM:
4207 if (ret) 3659 ret = mem_cgroup_resize_limit(memcg, nr_pages);
4208 break; 3660 break;
4209 if (type == _MEM) 3661 case _MEMSWAP:
4210 ret = mem_cgroup_resize_limit(memcg, val); 3662 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
4211 else if (type == _MEMSWAP)
4212 ret = mem_cgroup_resize_memsw_limit(memcg, val);
4213 else if (type == _KMEM)
4214 ret = memcg_update_kmem_limit(memcg, val);
4215 else
4216 return -EINVAL;
4217 break;
4218 case RES_SOFT_LIMIT:
4219 ret = res_counter_memparse_write_strategy(buf, &val);
4220 if (ret)
4221 break; 3663 break;
4222 /* 3664 case _KMEM:
4223 * For memsw, soft limits are hard to implement in terms 3665 ret = memcg_update_kmem_limit(memcg, nr_pages);
4224 * of semantics, for now, we support soft limits for 3666 break;
4225 * control without swap 3667 }
4226 */
4227 if (type == _MEM)
4228 ret = res_counter_set_soft_limit(&memcg->res, val);
4229 else
4230 ret = -EINVAL;
4231 break; 3668 break;
4232 default: 3669 case RES_SOFT_LIMIT:
4233 ret = -EINVAL; /* should be BUG() ? */ 3670 memcg->soft_limit = nr_pages;
3671 ret = 0;
4234 break; 3672 break;
4235 } 3673 }
4236 return ret ?: nbytes; 3674 return ret ?: nbytes;
4237} 3675}
4238 3676
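mem_cgroup_write() now parses the user string once with page_counter_memparse(), which accepts "-1" for "no limit" and otherwise converts a byte string into a number of pages. A rough userspace imitation of that parsing step (the kernel uses memparse(); the helper below and its K/M/G handling are only an approximation):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096UL
#define SKETCH_COUNTER_MAX (ULONG_MAX / SKETCH_PAGE_SIZE)

/* Rough imitation of page_counter_memparse(): "-1" means no limit, anything
 * else is a byte count (optional K/M/G suffix) converted to whole pages. */
static int toy_memparse(const char *buf, unsigned long *nr_pages)
{
        unsigned long long bytes;
        char *end;

        if (!strcmp(buf, "-1")) {
                *nr_pages = SKETCH_COUNTER_MAX;
                return 0;
        }

        errno = 0;
        bytes = strtoull(buf, &end, 0);
        if (errno || end == buf)
                return -EINVAL;
        switch (*end) {
        case 'G': case 'g': bytes <<= 10;       /* fall through */
        case 'M': case 'm': bytes <<= 10;       /* fall through */
        case 'K': case 'k': bytes <<= 10; end++; break;
        }
        if (*end != '\0')
                return -EINVAL;

        *nr_pages = bytes / SKETCH_PAGE_SIZE;   /* round down to whole pages */
        return 0;
}

int main(void)
{
        unsigned long pages;

        if (!toy_memparse("512M", &pages))
                printf("512M -> %lu pages\n", pages);
        if (!toy_memparse("-1", &pages))
                printf("-1   -> %lu pages (unlimited)\n", pages);
        return 0;
}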
4239static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4240 unsigned long long *mem_limit, unsigned long long *memsw_limit)
4241{
4242 unsigned long long min_limit, min_memsw_limit, tmp;
4243
4244 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4245 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4246 if (!memcg->use_hierarchy)
4247 goto out;
4248
4249 while (memcg->css.parent) {
4250 memcg = mem_cgroup_from_css(memcg->css.parent);
4251 if (!memcg->use_hierarchy)
4252 break;
4253 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4254 min_limit = min(min_limit, tmp);
4255 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4256 min_memsw_limit = min(min_memsw_limit, tmp);
4257 }
4258out:
4259 *mem_limit = min_limit;
4260 *memsw_limit = min_memsw_limit;
4261}
4262
4263static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3677static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
4264 size_t nbytes, loff_t off) 3678 size_t nbytes, loff_t off)
4265{ 3679{
4266 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3680 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4267 int name; 3681 struct page_counter *counter;
4268 enum res_type type;
4269 3682
4270 type = MEMFILE_TYPE(of_cft(of)->private); 3683 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4271 name = MEMFILE_ATTR(of_cft(of)->private); 3684 case _MEM:
3685 counter = &memcg->memory;
3686 break;
3687 case _MEMSWAP:
3688 counter = &memcg->memsw;
3689 break;
3690 case _KMEM:
3691 counter = &memcg->kmem;
3692 break;
3693 default:
3694 BUG();
3695 }
4272 3696
4273 switch (name) { 3697 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4274 case RES_MAX_USAGE: 3698 case RES_MAX_USAGE:
4275 if (type == _MEM) 3699 page_counter_reset_watermark(counter);
4276 res_counter_reset_max(&memcg->res);
4277 else if (type == _MEMSWAP)
4278 res_counter_reset_max(&memcg->memsw);
4279 else if (type == _KMEM)
4280 res_counter_reset_max(&memcg->kmem);
4281 else
4282 return -EINVAL;
4283 break; 3700 break;
4284 case RES_FAILCNT: 3701 case RES_FAILCNT:
4285 if (type == _MEM) 3702 counter->failcnt = 0;
4286 res_counter_reset_failcnt(&memcg->res);
4287 else if (type == _MEMSWAP)
4288 res_counter_reset_failcnt(&memcg->memsw);
4289 else if (type == _KMEM)
4290 res_counter_reset_failcnt(&memcg->kmem);
4291 else
4292 return -EINVAL;
4293 break; 3703 break;
3704 default:
3705 BUG();
4294 } 3706 }
4295 3707
4296 return nbytes; 3708 return nbytes;
@@ -4387,6 +3799,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
4387static int memcg_stat_show(struct seq_file *m, void *v) 3799static int memcg_stat_show(struct seq_file *m, void *v)
4388{ 3800{
4389 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3801 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3802 unsigned long memory, memsw;
4390 struct mem_cgroup *mi; 3803 struct mem_cgroup *mi;
4391 unsigned int i; 3804 unsigned int i;
4392 3805
@@ -4406,14 +3819,16 @@ static int memcg_stat_show(struct seq_file *m, void *v)
4406 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 3819 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4407 3820
4408 /* Hierarchical information */ 3821 /* Hierarchical information */
4409 { 3822 memory = memsw = PAGE_COUNTER_MAX;
4410 unsigned long long limit, memsw_limit; 3823 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4411 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 3824 memory = min(memory, mi->memory.limit);
4412 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 3825 memsw = min(memsw, mi->memsw.limit);
4413 if (do_swap_account)
4414 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4415 memsw_limit);
4416 } 3826 }
3827 seq_printf(m, "hierarchical_memory_limit %llu\n",
3828 (u64)memory * PAGE_SIZE);
3829 if (do_swap_account)
3830 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3831 (u64)memsw * PAGE_SIZE);
4417 3832
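The hunk above drops memcg_get_hierarchical_limit() and instead walks up the parent chain inline, taking the minimum memory and memsw limits along the way. A small standalone sketch of that minimum-over-ancestors walk, with toy structs in place of struct mem_cgroup:

#include <stdio.h>

#define TOY_COUNTER_MAX (~0UL)

/* Toy memcg with a parent pointer and a page limit, enough to show the walk. */
struct toy_memcg {
        const char *name;
        unsigned long limit;            /* in pages */
        struct toy_memcg *parent;
};

/* The effective hierarchical limit is the smallest limit on the path to the root. */
static unsigned long hierarchical_limit(struct toy_memcg *memcg)
{
        unsigned long min_limit = TOY_COUNTER_MAX;

        for (; memcg; memcg = memcg->parent)
                if (memcg->limit < min_limit)
                        min_limit = memcg->limit;
        return min_limit;
}

int main(void)
{
        struct toy_memcg root  = { "root",  TOY_COUNTER_MAX, NULL };
        struct toy_memcg mid   = { "mid",   1 << 18,         &root };
        struct toy_memcg child = { "child", 1 << 20,         &mid };

        /* child's own limit is larger, but the ancestor's 2^18 pages win */
        printf("hierarchical limit: %lu pages\n", hierarchical_limit(&child));
        return 0;
}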
4418 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3833 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4419 long long val = 0; 3834 long long val = 0;
@@ -4497,7 +3912,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4497static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3912static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4498{ 3913{
4499 struct mem_cgroup_threshold_ary *t; 3914 struct mem_cgroup_threshold_ary *t;
4500 u64 usage; 3915 unsigned long usage;
4501 int i; 3916 int i;
4502 3917
4503 rcu_read_lock(); 3918 rcu_read_lock();
@@ -4596,10 +4011,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4596{ 4011{
4597 struct mem_cgroup_thresholds *thresholds; 4012 struct mem_cgroup_thresholds *thresholds;
4598 struct mem_cgroup_threshold_ary *new; 4013 struct mem_cgroup_threshold_ary *new;
4599 u64 threshold, usage; 4014 unsigned long threshold;
4015 unsigned long usage;
4600 int i, size, ret; 4016 int i, size, ret;
4601 4017
4602 ret = res_counter_memparse_write_strategy(args, &threshold); 4018 ret = page_counter_memparse(args, &threshold);
4603 if (ret) 4019 if (ret)
4604 return ret; 4020 return ret;
4605 4021
@@ -4689,7 +4105,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4689{ 4105{
4690 struct mem_cgroup_thresholds *thresholds; 4106 struct mem_cgroup_thresholds *thresholds;
4691 struct mem_cgroup_threshold_ary *new; 4107 struct mem_cgroup_threshold_ary *new;
4692 u64 usage; 4108 unsigned long usage;
4693 int i, j, size; 4109 int i, j, size;
4694 4110
4695 mutex_lock(&memcg->thresholds_lock); 4111 mutex_lock(&memcg->thresholds_lock);
@@ -4855,40 +4271,6 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4855{ 4271{
4856 mem_cgroup_sockets_destroy(memcg); 4272 mem_cgroup_sockets_destroy(memcg);
4857} 4273}
4858
4859static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4860{
4861 if (!memcg_kmem_is_active(memcg))
4862 return;
4863
4864 /*
4865 * kmem charges can outlive the cgroup. In the case of slab
4866 * pages, for instance, a page contain objects from various
4867 * processes. As we prevent from taking a reference for every
4868 * such allocation we have to be careful when doing uncharge
4869 * (see memcg_uncharge_kmem) and here during offlining.
4870 *
4871 * The idea is that only the _last_ uncharge which sees
4872 * the dead memcg will drop the last reference. An additional
4873 * reference is taken here before the group is marked dead
4874 * which is then paired with css_put during uncharge resp. here.
4875 *
4876 * Although this might sound strange as this path is called from
4877 * css_offline() when the reference might have dropped down to 0 and
4878 * shouldn't be incremented anymore (css_tryget_online() would
4879 * fail) we do not have other options because of the kmem
4880 * allocations lifetime.
4881 */
4882 css_get(&memcg->css);
4883
4884 memcg_kmem_mark_dead(memcg);
4885
4886 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
4887 return;
4888
4889 if (memcg_kmem_test_and_clear_dead(memcg))
4890 css_put(&memcg->css);
4891}
4892#else 4274#else
4893static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4275static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4894{ 4276{
@@ -4898,10 +4280,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4898static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4280static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4899{ 4281{
4900} 4282}
4901
4902static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4903{
4904}
4905#endif 4283#endif
4906 4284
4907/* 4285/*
@@ -5228,7 +4606,10 @@ static struct cftype mem_cgroup_files[] = {
5228#ifdef CONFIG_SLABINFO 4606#ifdef CONFIG_SLABINFO
5229 { 4607 {
5230 .name = "kmem.slabinfo", 4608 .name = "kmem.slabinfo",
5231 .seq_show = mem_cgroup_slabinfo_read, 4609 .seq_start = slab_start,
4610 .seq_next = slab_next,
4611 .seq_stop = slab_stop,
4612 .seq_show = memcg_slab_show,
5232 }, 4613 },
5233#endif 4614#endif
5234#endif 4615#endif
@@ -5363,9 +4744,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
5363 */ 4744 */
5364struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 4745struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
5365{ 4746{
5366 if (!memcg->res.parent) 4747 if (!memcg->memory.parent)
5367 return NULL; 4748 return NULL;
5368 return mem_cgroup_from_res_counter(memcg->res.parent, res); 4749 return mem_cgroup_from_counter(memcg->memory.parent, memory);
5369} 4750}
5370EXPORT_SYMBOL(parent_mem_cgroup); 4751EXPORT_SYMBOL(parent_mem_cgroup);
5371 4752
@@ -5410,9 +4791,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5410 /* root ? */ 4791 /* root ? */
5411 if (parent_css == NULL) { 4792 if (parent_css == NULL) {
5412 root_mem_cgroup = memcg; 4793 root_mem_cgroup = memcg;
5413 res_counter_init(&memcg->res, NULL); 4794 page_counter_init(&memcg->memory, NULL);
5414 res_counter_init(&memcg->memsw, NULL); 4795 page_counter_init(&memcg->memsw, NULL);
5415 res_counter_init(&memcg->kmem, NULL); 4796 page_counter_init(&memcg->kmem, NULL);
5416 } 4797 }
5417 4798
5418 memcg->last_scanned_node = MAX_NUMNODES; 4799 memcg->last_scanned_node = MAX_NUMNODES;
@@ -5451,18 +4832,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5451 memcg->swappiness = mem_cgroup_swappiness(parent); 4832 memcg->swappiness = mem_cgroup_swappiness(parent);
5452 4833
5453 if (parent->use_hierarchy) { 4834 if (parent->use_hierarchy) {
5454 res_counter_init(&memcg->res, &parent->res); 4835 page_counter_init(&memcg->memory, &parent->memory);
5455 res_counter_init(&memcg->memsw, &parent->memsw); 4836 page_counter_init(&memcg->memsw, &parent->memsw);
5456 res_counter_init(&memcg->kmem, &parent->kmem); 4837 page_counter_init(&memcg->kmem, &parent->kmem);
5457 4838
5458 /* 4839 /*
5459 * No need to take a reference to the parent because cgroup 4840 * No need to take a reference to the parent because cgroup
5460 * core guarantees its existence. 4841 * core guarantees its existence.
5461 */ 4842 */
5462 } else { 4843 } else {
5463 res_counter_init(&memcg->res, NULL); 4844 page_counter_init(&memcg->memory, NULL);
5464 res_counter_init(&memcg->memsw, NULL); 4845 page_counter_init(&memcg->memsw, NULL);
5465 res_counter_init(&memcg->kmem, NULL); 4846 page_counter_init(&memcg->kmem, NULL);
5466 /* 4847 /*
5467 * Deeper hierachy with use_hierarchy == false doesn't make 4848 * Deeper hierachy with use_hierarchy == false doesn't make
5468 * much sense so let cgroup subsystem know about this 4849 * much sense so let cgroup subsystem know about this
@@ -5487,29 +4868,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5487 return 0; 4868 return 0;
5488} 4869}
5489 4870
5490/*
5491 * Announce all parents that a group from their hierarchy is gone.
5492 */
5493static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
5494{
5495 struct mem_cgroup *parent = memcg;
5496
5497 while ((parent = parent_mem_cgroup(parent)))
5498 mem_cgroup_iter_invalidate(parent);
5499
5500 /*
5501 * if the root memcg is not hierarchical we have to check it
5502 * explicitely.
5503 */
5504 if (!root_mem_cgroup->use_hierarchy)
5505 mem_cgroup_iter_invalidate(root_mem_cgroup);
5506}
5507
5508static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 4871static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5509{ 4872{
5510 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4873 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5511 struct mem_cgroup_event *event, *tmp; 4874 struct mem_cgroup_event *event, *tmp;
5512 struct cgroup_subsys_state *iter;
5513 4875
5514 /* 4876 /*
5515 * Unregister events and notify userspace. 4877 * Unregister events and notify userspace.
@@ -5523,17 +4885,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5523 } 4885 }
5524 spin_unlock(&memcg->event_list_lock); 4886 spin_unlock(&memcg->event_list_lock);
5525 4887
5526 kmem_cgroup_css_offline(memcg);
5527
5528 mem_cgroup_invalidate_reclaim_iterators(memcg);
5529
5530 /*
5531 * This requires that offlining is serialized. Right now that is
5532 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
5533 */
5534 css_for_each_descendant_post(iter, css)
5535 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
5536
5537 memcg_unregister_all_caches(memcg); 4888 memcg_unregister_all_caches(memcg);
5538 vmpressure_cleanup(&memcg->vmpressure); 4889 vmpressure_cleanup(&memcg->vmpressure);
5539} 4890}
@@ -5541,42 +4892,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5541static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4892static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5542{ 4893{
5543 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4894 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5544 /*
5545 * XXX: css_offline() would be where we should reparent all
5546 * memory to prepare the cgroup for destruction. However,
5547 * memcg does not do css_tryget_online() and res_counter charging
5548 * under the same RCU lock region, which means that charging
5549 * could race with offlining. Offlining only happens to
5550 * cgroups with no tasks in them but charges can show up
5551 * without any tasks from the swapin path when the target
5552 * memcg is looked up from the swapout record and not from the
5553 * current task as it usually is. A race like this can leak
5554 * charges and put pages with stale cgroup pointers into
5555 * circulation:
5556 *
5557 * #0 #1
5558 * lookup_swap_cgroup_id()
5559 * rcu_read_lock()
5560 * mem_cgroup_lookup()
5561 * css_tryget_online()
5562 * rcu_read_unlock()
5563 * disable css_tryget_online()
5564 * call_rcu()
5565 * offline_css()
5566 * reparent_charges()
5567 * res_counter_charge()
5568 * css_put()
5569 * css_free()
5570 * pc->mem_cgroup = dead memcg
5571 * add page to lru
5572 *
5573 * The bulk of the charges are still moved in offline_css() to
5574 * avoid pinning a lot of pages in case a long-term reference
5575 * like a swapout record is deferring the css_free() to long
5576 * after offlining. But this makes sure we catch any charges
5577 * made after offlining:
5578 */
5579 mem_cgroup_reparent_charges(memcg);
5580 4895
5581 memcg_destroy_kmem(memcg); 4896 memcg_destroy_kmem(memcg);
5582 __mem_cgroup_free(memcg); 4897 __mem_cgroup_free(memcg);
@@ -5599,10 +4914,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5599{ 4914{
5600 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4915 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5601 4916
5602 mem_cgroup_resize_limit(memcg, ULLONG_MAX); 4917 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
5603 mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); 4918 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
5604 memcg_update_kmem_limit(memcg, ULLONG_MAX); 4919 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
5605 res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); 4920 memcg->soft_limit = 0;
5606} 4921}
5607 4922
5608#ifdef CONFIG_MMU 4923#ifdef CONFIG_MMU
@@ -5758,7 +5073,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5758 unsigned long addr, pte_t ptent, union mc_target *target) 5073 unsigned long addr, pte_t ptent, union mc_target *target)
5759{ 5074{
5760 struct page *page = NULL; 5075 struct page *page = NULL;
5761 struct page_cgroup *pc;
5762 enum mc_target_type ret = MC_TARGET_NONE; 5076 enum mc_target_type ret = MC_TARGET_NONE;
5763 swp_entry_t ent = { .val = 0 }; 5077 swp_entry_t ent = { .val = 0 };
5764 5078
@@ -5772,13 +5086,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5772 if (!page && !ent.val) 5086 if (!page && !ent.val)
5773 return ret; 5087 return ret;
5774 if (page) { 5088 if (page) {
5775 pc = lookup_page_cgroup(page);
5776 /* 5089 /*
5777 * Do only loose check w/o serialization. 5090 * Do only loose check w/o serialization.
5778 * mem_cgroup_move_account() checks the pc is valid or 5091 * mem_cgroup_move_account() checks the page is valid or
5779 * not under LRU exclusion. 5092 * not under LRU exclusion.
5780 */ 5093 */
5781 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5094 if (page->mem_cgroup == mc.from) {
5782 ret = MC_TARGET_PAGE; 5095 ret = MC_TARGET_PAGE;
5783 if (target) 5096 if (target)
5784 target->page = page; 5097 target->page = page;
@@ -5806,15 +5119,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5806 unsigned long addr, pmd_t pmd, union mc_target *target) 5119 unsigned long addr, pmd_t pmd, union mc_target *target)
5807{ 5120{
5808 struct page *page = NULL; 5121 struct page *page = NULL;
5809 struct page_cgroup *pc;
5810 enum mc_target_type ret = MC_TARGET_NONE; 5122 enum mc_target_type ret = MC_TARGET_NONE;
5811 5123
5812 page = pmd_page(pmd); 5124 page = pmd_page(pmd);
5813 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5125 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5814 if (!move_anon()) 5126 if (!move_anon())
5815 return ret; 5127 return ret;
5816 pc = lookup_page_cgroup(page); 5128 if (page->mem_cgroup == mc.from) {
5817 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5818 ret = MC_TARGET_PAGE; 5129 ret = MC_TARGET_PAGE;
5819 if (target) { 5130 if (target) {
5820 get_page(page); 5131 get_page(page);
@@ -5897,7 +5208,6 @@ static void __mem_cgroup_clear_mc(void)
5897{ 5208{
5898 struct mem_cgroup *from = mc.from; 5209 struct mem_cgroup *from = mc.from;
5899 struct mem_cgroup *to = mc.to; 5210 struct mem_cgroup *to = mc.to;
5900 int i;
5901 5211
5902 /* we must uncharge all the leftover precharges from mc.to */ 5212 /* we must uncharge all the leftover precharges from mc.to */
5903 if (mc.precharge) { 5213 if (mc.precharge) {
@@ -5916,19 +5226,17 @@ static void __mem_cgroup_clear_mc(void)
5916 if (mc.moved_swap) { 5226 if (mc.moved_swap) {
5917 /* uncharge swap account from the old cgroup */ 5227 /* uncharge swap account from the old cgroup */
5918 if (!mem_cgroup_is_root(mc.from)) 5228 if (!mem_cgroup_is_root(mc.from))
5919 res_counter_uncharge(&mc.from->memsw, 5229 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5920 PAGE_SIZE * mc.moved_swap);
5921
5922 for (i = 0; i < mc.moved_swap; i++)
5923 css_put(&mc.from->css);
5924 5230
5925 /* 5231 /*
5926 * we charged both to->res and to->memsw, so we should 5232 * we charged both to->memory and to->memsw, so we
5927 * uncharge to->res. 5233 * should uncharge to->memory.
5928 */ 5234 */
5929 if (!mem_cgroup_is_root(mc.to)) 5235 if (!mem_cgroup_is_root(mc.to))
5930 res_counter_uncharge(&mc.to->res, 5236 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5931 PAGE_SIZE * mc.moved_swap); 5237
5238 css_put_many(&mc.from->css, mc.moved_swap);
5239
5932 /* we've already done css_get(mc.to) */ 5240 /* we've already done css_get(mc.to) */
5933 mc.moved_swap = 0; 5241 mc.moved_swap = 0;
5934 } 5242 }
@@ -5939,8 +5247,6 @@ static void __mem_cgroup_clear_mc(void)
5939 5247
5940static void mem_cgroup_clear_mc(void) 5248static void mem_cgroup_clear_mc(void)
5941{ 5249{
5942 struct mem_cgroup *from = mc.from;
5943
5944 /* 5250 /*
5945 * we must clear moving_task before waking up waiters at the end of 5251 * we must clear moving_task before waking up waiters at the end of
5946 * task migration. 5252 * task migration.
@@ -5951,7 +5257,6 @@ static void mem_cgroup_clear_mc(void)
5951 mc.from = NULL; 5257 mc.from = NULL;
5952 mc.to = NULL; 5258 mc.to = NULL;
5953 spin_unlock(&mc.lock); 5259 spin_unlock(&mc.lock);
5954 mem_cgroup_end_move(from);
5955} 5260}
5956 5261
5957static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5262static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
@@ -5984,7 +5289,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5984 VM_BUG_ON(mc.precharge); 5289 VM_BUG_ON(mc.precharge);
5985 VM_BUG_ON(mc.moved_charge); 5290 VM_BUG_ON(mc.moved_charge);
5986 VM_BUG_ON(mc.moved_swap); 5291 VM_BUG_ON(mc.moved_swap);
5987 mem_cgroup_start_move(from); 5292
5988 spin_lock(&mc.lock); 5293 spin_lock(&mc.lock);
5989 mc.from = from; 5294 mc.from = from;
5990 mc.to = memcg; 5295 mc.to = memcg;
@@ -6004,7 +5309,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6004static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5309static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6005 struct cgroup_taskset *tset) 5310 struct cgroup_taskset *tset)
6006{ 5311{
6007 mem_cgroup_clear_mc(); 5312 if (mc.to)
5313 mem_cgroup_clear_mc();
6008} 5314}
6009 5315
6010static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5316static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
@@ -6018,7 +5324,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6018 enum mc_target_type target_type; 5324 enum mc_target_type target_type;
6019 union mc_target target; 5325 union mc_target target;
6020 struct page *page; 5326 struct page *page;
6021 struct page_cgroup *pc;
6022 5327
6023 /* 5328 /*
6024 * We don't take compound_lock() here but no race with splitting thp 5329 * We don't take compound_lock() here but no race with splitting thp
@@ -6039,9 +5344,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6039 if (target_type == MC_TARGET_PAGE) { 5344 if (target_type == MC_TARGET_PAGE) {
6040 page = target.page; 5345 page = target.page;
6041 if (!isolate_lru_page(page)) { 5346 if (!isolate_lru_page(page)) {
6042 pc = lookup_page_cgroup(page);
6043 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5347 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6044 pc, mc.from, mc.to)) { 5348 mc.from, mc.to)) {
6045 mc.precharge -= HPAGE_PMD_NR; 5349 mc.precharge -= HPAGE_PMD_NR;
6046 mc.moved_charge += HPAGE_PMD_NR; 5350 mc.moved_charge += HPAGE_PMD_NR;
6047 } 5351 }
@@ -6069,9 +5373,7 @@ retry:
6069 page = target.page; 5373 page = target.page;
6070 if (isolate_lru_page(page)) 5374 if (isolate_lru_page(page))
6071 goto put; 5375 goto put;
6072 pc = lookup_page_cgroup(page); 5376 if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
6073 if (!mem_cgroup_move_account(page, 1, pc,
6074 mc.from, mc.to)) {
6075 mc.precharge--; 5377 mc.precharge--;
6076 /* we uncharge from mc.from later. */ 5378 /* we uncharge from mc.from later. */
6077 mc.moved_charge++; 5379 mc.moved_charge++;
@@ -6115,6 +5417,13 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
6115 struct vm_area_struct *vma; 5417 struct vm_area_struct *vma;
6116 5418
6117 lru_add_drain_all(); 5419 lru_add_drain_all();
5420 /*
5421 * Signal mem_cgroup_begin_page_stat() to take the memcg's
5422 * move_lock while we're moving its pages to another memcg.
5423 * Then wait for already started RCU-only updates to finish.
5424 */
5425 atomic_inc(&mc.from->moving_account);
5426 synchronize_rcu();
6118retry: 5427retry:
6119 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5428 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
6120 /* 5429 /*
@@ -6147,6 +5456,7 @@ retry:
6147 break; 5456 break;
6148 } 5457 }
6149 up_read(&mm->mmap_sem); 5458 up_read(&mm->mmap_sem);
5459 atomic_dec(&mc.from->moving_account);
6150} 5460}
6151 5461
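The added moving_account/synchronize_rcu() pair signals statistics updaters that a charge move is in flight so they take the move_lock instead of updating locklessly; the atomic_dec() at the end lets them return to the fast path. A loose userspace analogue of that handshake (atomics plus a pthread mutex; the real code pairs the flag with RCU, which has no direct equivalent in this sketch):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Loose analogue of the moving_account handshake: stat updaters stay lockless
 * unless a charge move is in flight, in which case they take the move lock. */
static atomic_int moving_account;
static pthread_mutex_t move_lock = PTHREAD_MUTEX_INITIALIZER;
static long dirty_pages;

static void update_stat(long delta)
{
        int locked = 0;

        if (atomic_load(&moving_account)) {     /* slow path only during a move */
                pthread_mutex_lock(&move_lock);
                locked = 1;
        }
        dirty_pages += delta;
        if (locked)
                pthread_mutex_unlock(&move_lock);
}

static void move_charges(void)
{
        atomic_fetch_add(&moving_account, 1);
        /* the kernel: synchronize_rcu() waits out updaters that missed the flag */
        pthread_mutex_lock(&move_lock);
        /* ... walk the address space and move pages here ... */
        pthread_mutex_unlock(&move_lock);
        atomic_fetch_sub(&moving_account, 1);
}

int main(void)
{
        update_stat(+1);
        move_charges();
        update_stat(-1);
        printf("dirty_pages = %ld\n", dirty_pages);
        return 0;
}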
6152static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5462static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
@@ -6250,7 +5560,7 @@ static void __init enable_swap_cgroup(void)
6250 */ 5560 */
6251void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5561void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6252{ 5562{
6253 struct page_cgroup *pc; 5563 struct mem_cgroup *memcg;
6254 unsigned short oldid; 5564 unsigned short oldid;
6255 5565
6256 VM_BUG_ON_PAGE(PageLRU(page), page); 5566 VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -6259,20 +5569,26 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6259 if (!do_swap_account) 5569 if (!do_swap_account)
6260 return; 5570 return;
6261 5571
6262 pc = lookup_page_cgroup(page); 5572 memcg = page->mem_cgroup;
6263 5573
6264 /* Readahead page, never charged */ 5574 /* Readahead page, never charged */
6265 if (!PageCgroupUsed(pc)) 5575 if (!memcg)
6266 return; 5576 return;
6267 5577
6268 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); 5578 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
6269
6270 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
6271 VM_BUG_ON_PAGE(oldid, page); 5579 VM_BUG_ON_PAGE(oldid, page);
5580 mem_cgroup_swap_statistics(memcg, true);
5581
5582 page->mem_cgroup = NULL;
6272 5583
6273 pc->flags &= ~PCG_MEMSW; 5584 if (!mem_cgroup_is_root(memcg))
6274 css_get(&pc->mem_cgroup->css); 5585 page_counter_uncharge(&memcg->memory, 1);
6275 mem_cgroup_swap_statistics(pc->mem_cgroup, true); 5586
5587 /* XXX: caller holds IRQ-safe mapping->tree_lock */
5588 VM_BUG_ON(!irqs_disabled());
5589
5590 mem_cgroup_charge_statistics(memcg, page, -1);
5591 memcg_check_events(memcg, page);
6276} 5592}
6277 5593
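The reworked mem_cgroup_swapout() hands the charge over to the swap slot: it records the memcg id for the entry, clears page->mem_cgroup, and releases the plain memory charge while leaving memsw charged until the slot is freed. A toy model of that hand-over (invented types, fixed-size swap table):

#include <assert.h>
#include <stdio.h>

#define TOY_SWAP_SLOTS 8

/* Toy bookkeeping: which cgroup id owns each swap slot (0 = nobody). */
static unsigned short swap_owner[TOY_SWAP_SLOTS];

struct toy_memcg { unsigned short id; unsigned long memory; unsigned long memsw; };
struct toy_page  { struct toy_memcg *memcg; };

/* Sketch of the swap-out hand-over: the page drops its memcg pointer and its
 * "memory" charge, while "memsw" stays charged until the swap slot is freed. */
static void toy_swapout(struct toy_page *page, int slot)
{
        struct toy_memcg *memcg = page->memcg;

        if (!memcg)                     /* readahead page, never charged */
                return;
        assert(swap_owner[slot] == 0);
        swap_owner[slot] = memcg->id;   /* remember the owner for swap-in/uncharge */
        page->memcg = NULL;
        memcg->memory--;                /* memsw keeps covering the swapped page */
}

int main(void)
{
        struct toy_memcg m = { .id = 7, .memory = 4, .memsw = 4 };
        struct toy_page p = { .memcg = &m };

        toy_swapout(&p, 3);
        printf("slot 3 owner=%u memory=%lu memsw=%lu\n",
               swap_owner[3], m.memory, m.memsw);
        return 0;
}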
6278/** 5594/**
@@ -6294,7 +5610,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
6294 memcg = mem_cgroup_lookup(id); 5610 memcg = mem_cgroup_lookup(id);
6295 if (memcg) { 5611 if (memcg) {
6296 if (!mem_cgroup_is_root(memcg)) 5612 if (!mem_cgroup_is_root(memcg))
6297 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 5613 page_counter_uncharge(&memcg->memsw, 1);
6298 mem_cgroup_swap_statistics(memcg, false); 5614 mem_cgroup_swap_statistics(memcg, false);
6299 css_put(&memcg->css); 5615 css_put(&memcg->css);
6300 } 5616 }
@@ -6330,7 +5646,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6330 goto out; 5646 goto out;
6331 5647
6332 if (PageSwapCache(page)) { 5648 if (PageSwapCache(page)) {
6333 struct page_cgroup *pc = lookup_page_cgroup(page);
6334 /* 5649 /*
6335 * Every swap fault against a single page tries to charge the 5650 * Every swap fault against a single page tries to charge the
6336 * page, bail as early as possible. shmem_unuse() encounters 5651 * page, bail as early as possible. shmem_unuse() encounters
@@ -6338,7 +5653,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6338 * the page lock, which serializes swap cache removal, which 5653 * the page lock, which serializes swap cache removal, which
6339 * in turn serializes uncharging. 5654 * in turn serializes uncharging.
6340 */ 5655 */
6341 if (PageCgroupUsed(pc)) 5656 if (page->mem_cgroup)
6342 goto out; 5657 goto out;
6343 } 5658 }
6344 5659
@@ -6452,19 +5767,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
6452} 5767}
6453 5768
6454static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 5769static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6455 unsigned long nr_mem, unsigned long nr_memsw,
6456 unsigned long nr_anon, unsigned long nr_file, 5770 unsigned long nr_anon, unsigned long nr_file,
6457 unsigned long nr_huge, struct page *dummy_page) 5771 unsigned long nr_huge, struct page *dummy_page)
6458{ 5772{
5773 unsigned long nr_pages = nr_anon + nr_file;
6459 unsigned long flags; 5774 unsigned long flags;
6460 5775
6461 if (!mem_cgroup_is_root(memcg)) { 5776 if (!mem_cgroup_is_root(memcg)) {
6462 if (nr_mem) 5777 page_counter_uncharge(&memcg->memory, nr_pages);
6463 res_counter_uncharge(&memcg->res, 5778 if (do_swap_account)
6464 nr_mem * PAGE_SIZE); 5779 page_counter_uncharge(&memcg->memsw, nr_pages);
6465 if (nr_memsw)
6466 res_counter_uncharge(&memcg->memsw,
6467 nr_memsw * PAGE_SIZE);
6468 memcg_oom_recover(memcg); 5780 memcg_oom_recover(memcg);
6469 } 5781 }
6470 5782
@@ -6473,27 +5785,27 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6473 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 5785 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
6474 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 5786 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
6475 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 5787 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
6476 __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); 5788 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
6477 memcg_check_events(memcg, dummy_page); 5789 memcg_check_events(memcg, dummy_page);
6478 local_irq_restore(flags); 5790 local_irq_restore(flags);
5791
5792 if (!mem_cgroup_is_root(memcg))
5793 css_put_many(&memcg->css, nr_pages);
6479} 5794}
6480 5795
6481static void uncharge_list(struct list_head *page_list) 5796static void uncharge_list(struct list_head *page_list)
6482{ 5797{
6483 struct mem_cgroup *memcg = NULL; 5798 struct mem_cgroup *memcg = NULL;
6484 unsigned long nr_memsw = 0;
6485 unsigned long nr_anon = 0; 5799 unsigned long nr_anon = 0;
6486 unsigned long nr_file = 0; 5800 unsigned long nr_file = 0;
6487 unsigned long nr_huge = 0; 5801 unsigned long nr_huge = 0;
6488 unsigned long pgpgout = 0; 5802 unsigned long pgpgout = 0;
6489 unsigned long nr_mem = 0;
6490 struct list_head *next; 5803 struct list_head *next;
6491 struct page *page; 5804 struct page *page;
6492 5805
6493 next = page_list->next; 5806 next = page_list->next;
6494 do { 5807 do {
6495 unsigned int nr_pages = 1; 5808 unsigned int nr_pages = 1;
6496 struct page_cgroup *pc;
6497 5809
6498 page = list_entry(next, struct page, lru); 5810 page = list_entry(next, struct page, lru);
6499 next = page->lru.next; 5811 next = page->lru.next;
@@ -6501,24 +5813,22 @@ static void uncharge_list(struct list_head *page_list)
6501 VM_BUG_ON_PAGE(PageLRU(page), page); 5813 VM_BUG_ON_PAGE(PageLRU(page), page);
6502 VM_BUG_ON_PAGE(page_count(page), page); 5814 VM_BUG_ON_PAGE(page_count(page), page);
6503 5815
6504 pc = lookup_page_cgroup(page); 5816 if (!page->mem_cgroup)
6505 if (!PageCgroupUsed(pc))
6506 continue; 5817 continue;
6507 5818
6508 /* 5819 /*
6509 * Nobody should be changing or seriously looking at 5820 * Nobody should be changing or seriously looking at
6510 * pc->mem_cgroup and pc->flags at this point, we have 5821 * page->mem_cgroup at this point, we have fully
6511 * fully exclusive access to the page. 5822 * exclusive access to the page.
6512 */ 5823 */
6513 5824
6514 if (memcg != pc->mem_cgroup) { 5825 if (memcg != page->mem_cgroup) {
6515 if (memcg) { 5826 if (memcg) {
6516 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 5827 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
6517 nr_anon, nr_file, nr_huge, page); 5828 nr_huge, page);
6518 pgpgout = nr_mem = nr_memsw = 0; 5829 pgpgout = nr_anon = nr_file = nr_huge = 0;
6519 nr_anon = nr_file = nr_huge = 0;
6520 } 5830 }
6521 memcg = pc->mem_cgroup; 5831 memcg = page->mem_cgroup;
6522 } 5832 }
6523 5833
6524 if (PageTransHuge(page)) { 5834 if (PageTransHuge(page)) {
@@ -6532,18 +5842,14 @@ static void uncharge_list(struct list_head *page_list)
6532 else 5842 else
6533 nr_file += nr_pages; 5843 nr_file += nr_pages;
6534 5844
6535 if (pc->flags & PCG_MEM) 5845 page->mem_cgroup = NULL;
6536 nr_mem += nr_pages;
6537 if (pc->flags & PCG_MEMSW)
6538 nr_memsw += nr_pages;
6539 pc->flags = 0;
6540 5846
6541 pgpgout++; 5847 pgpgout++;
6542 } while (next != page_list); 5848 } while (next != page_list);
6543 5849
6544 if (memcg) 5850 if (memcg)
6545 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 5851 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
6546 nr_anon, nr_file, nr_huge, page); 5852 nr_huge, page);
6547} 5853}
6548 5854
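uncharge_list() batches pages by their owning memcg: counts accumulate while consecutive pages share an owner and are flushed through uncharge_batch() whenever the owner changes, plus once at the end. A self-contained sketch of that batching shape (toy types, printf standing in for the real counter updates):

#include <stdio.h>

struct toy_memcg { const char *name; };
struct toy_page  { struct toy_memcg *memcg; unsigned int nr_pages; };

/* Stand-in for uncharge_batch(): one counter update per run of same-owner pages. */
static void toy_flush(struct toy_memcg *memcg, unsigned long nr_pages)
{
        if (memcg && nr_pages)
                printf("uncharge %lu pages from %s\n", nr_pages, memcg->name);
}

/* Sketch of the batching loop: accumulate while the owner stays the same,
 * flush whenever it changes, and once more after the last page. */
static void toy_uncharge_list(struct toy_page *pages, int n)
{
        struct toy_memcg *memcg = NULL;
        unsigned long batch = 0;

        for (int i = 0; i < n; i++) {
                if (!pages[i].memcg)            /* never charged, skip */
                        continue;
                if (memcg != pages[i].memcg) {
                        toy_flush(memcg, batch);
                        batch = 0;
                        memcg = pages[i].memcg;
                }
                batch += pages[i].nr_pages;
                pages[i].memcg = NULL;          /* page no longer charged */
        }
        toy_flush(memcg, batch);
}

int main(void)
{
        struct toy_memcg a = { "A" }, b = { "B" };
        struct toy_page list[] = {
                { &a, 1 }, { &a, 512 }, { &b, 1 }, { NULL, 1 }, { &b, 1 },
        };

        toy_uncharge_list(list, 5);
        return 0;
}

Batching this way keeps the expensive counter and statistics updates to one per run of same-owner pages instead of one per page.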
6549/** 5855/**
@@ -6555,14 +5861,11 @@ static void uncharge_list(struct list_head *page_list)
6555 */ 5861 */
6556void mem_cgroup_uncharge(struct page *page) 5862void mem_cgroup_uncharge(struct page *page)
6557{ 5863{
6558 struct page_cgroup *pc;
6559
6560 if (mem_cgroup_disabled()) 5864 if (mem_cgroup_disabled())
6561 return; 5865 return;
6562 5866
6563 /* Don't touch page->lru of any random page, pre-check: */ 5867 /* Don't touch page->lru of any random page, pre-check: */
6564 pc = lookup_page_cgroup(page); 5868 if (!page->mem_cgroup)
6565 if (!PageCgroupUsed(pc))
6566 return; 5869 return;
6567 5870
6568 INIT_LIST_HEAD(&page->lru); 5871 INIT_LIST_HEAD(&page->lru);
@@ -6598,7 +5901,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
6598void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 5901void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6599 bool lrucare) 5902 bool lrucare)
6600{ 5903{
6601 struct page_cgroup *pc; 5904 struct mem_cgroup *memcg;
6602 int isolated; 5905 int isolated;
6603 5906
6604 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5907 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -6613,27 +5916,28 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6613 return; 5916 return;
6614 5917
6615 /* Page cache replacement: new page already charged? */ 5918 /* Page cache replacement: new page already charged? */
6616 pc = lookup_page_cgroup(newpage); 5919 if (newpage->mem_cgroup)
6617 if (PageCgroupUsed(pc))
6618 return; 5920 return;
6619 5921
6620 /* Re-entrant migration: old page already uncharged? */ 5922 /*
6621 pc = lookup_page_cgroup(oldpage); 5923 * Swapcache readahead pages can get migrated before being
6622 if (!PageCgroupUsed(pc)) 5924 * charged, and migration from compaction can happen to an
5925 * uncharged page when the PFN walker finds a page that
5926 * reclaim just put back on the LRU but has not released yet.
5927 */
5928 memcg = oldpage->mem_cgroup;
5929 if (!memcg)
6623 return; 5930 return;
6624 5931
6625 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
6626 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
6627
6628 if (lrucare) 5932 if (lrucare)
6629 lock_page_lru(oldpage, &isolated); 5933 lock_page_lru(oldpage, &isolated);
6630 5934
6631 pc->flags = 0; 5935 oldpage->mem_cgroup = NULL;
6632 5936
6633 if (lrucare) 5937 if (lrucare)
6634 unlock_page_lru(oldpage, isolated); 5938 unlock_page_lru(oldpage, isolated);
6635 5939
6636 commit_charge(newpage, pc->mem_cgroup, lrucare); 5940 commit_charge(newpage, memcg, lrucare);
6637} 5941}
6638 5942
6639/* 5943/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b852b10ec76d..e5ee0ca7ae85 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,7 +233,7 @@ void shake_page(struct page *p, int access)
233 lru_add_drain_all(); 233 lru_add_drain_all();
234 if (PageLRU(p)) 234 if (PageLRU(p))
235 return; 235 return;
236 drain_all_pages(); 236 drain_all_pages(page_zone(p));
237 if (PageLRU(p) || is_free_buddy_page(p)) 237 if (PageLRU(p) || is_free_buddy_page(p))
238 return; 238 return;
239 } 239 }
@@ -1661,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags)
1661 if (!is_free_buddy_page(page)) 1661 if (!is_free_buddy_page(page))
1662 lru_add_drain_all(); 1662 lru_add_drain_all();
1663 if (!is_free_buddy_page(page)) 1663 if (!is_free_buddy_page(page))
1664 drain_all_pages(); 1664 drain_all_pages(page_zone(page));
1665 SetPageHWPoison(page); 1665 SetPageHWPoison(page);
1666 if (!is_free_buddy_page(page)) 1666 if (!is_free_buddy_page(page))
1667 pr_info("soft offline: %#lx: page leaked\n", 1667 pr_info("soft offline: %#lx: page leaked\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1bf4807cb21e..9fab10795bea 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1725,7 +1725,7 @@ repeat:
1725 if (drain) { 1725 if (drain) {
1726 lru_add_drain_all(); 1726 lru_add_drain_all();
1727 cond_resched(); 1727 cond_resched();
1728 drain_all_pages(); 1728 drain_all_pages(zone);
1729 } 1729 }
1730 1730
1731 pfn = scan_movable_pages(start_pfn, end_pfn); 1731 pfn = scan_movable_pages(start_pfn, end_pfn);
@@ -1747,7 +1747,7 @@ repeat:
1747 lru_add_drain_all(); 1747 lru_add_drain_all();
1748 yield(); 1748 yield();
1749 /* drain pcp pages, this is synchronous. */ 1749 /* drain pcp pages, this is synchronous. */
1750 drain_all_pages(); 1750 drain_all_pages(zone);
1751 /* 1751 /*
1752 * dissolve free hugepages in the memory block before doing offlining 1752 * dissolve free hugepages in the memory block before doing offlining
1753 * actually in order to make hugetlbfs's object counting consistent. 1753 * actually in order to make hugetlbfs's object counting consistent.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5340f6b91312..3b014d326151 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -119,7 +119,7 @@ found:
119 119
120/* return true if the task is not adequate as candidate victim task. */ 120/* return true if the task is not adequate as candidate victim task. */
121static bool oom_unkillable_task(struct task_struct *p, 121static bool oom_unkillable_task(struct task_struct *p,
122 const struct mem_cgroup *memcg, const nodemask_t *nodemask) 122 struct mem_cgroup *memcg, const nodemask_t *nodemask)
123{ 123{
124 if (is_global_init(p)) 124 if (is_global_init(p))
125 return true; 125 return true;
@@ -353,7 +353,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
353 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, 353 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
354 * swapents, oom_score_adj value, and name. 354 * swapents, oom_score_adj value, and name.
355 */ 355 */
356static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 356static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
357{ 357{
358 struct task_struct *p; 358 struct task_struct *p;
359 struct task_struct *task; 359 struct task_struct *task;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 19ceae87522d..d5d81f5384d1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2357,7 +2357,7 @@ int test_clear_page_writeback(struct page *page)
2357 dec_zone_page_state(page, NR_WRITEBACK); 2357 dec_zone_page_state(page, NR_WRITEBACK);
2358 inc_zone_page_state(page, NR_WRITTEN); 2358 inc_zone_page_state(page, NR_WRITTEN);
2359 } 2359 }
2360 mem_cgroup_end_page_stat(memcg, locked, memcg_flags); 2360 mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
2361 return ret; 2361 return ret;
2362} 2362}
2363 2363
@@ -2399,7 +2399,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2399 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2399 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
2400 inc_zone_page_state(page, NR_WRITEBACK); 2400 inc_zone_page_state(page, NR_WRITEBACK);
2401 } 2401 }
2402 mem_cgroup_end_page_stat(memcg, locked, memcg_flags); 2402 mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
2403 return ret; 2403 return ret;
2404 2404
2405} 2405}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 616a2c956b4b..a7198c065999 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,7 +48,6 @@
48#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
49#include <linux/fault-inject.h> 49#include <linux/fault-inject.h>
50#include <linux/page-isolation.h> 50#include <linux/page-isolation.h>
51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 51#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 52#include <linux/kmemleak.h>
54#include <linux/compaction.h> 53#include <linux/compaction.h>
@@ -641,8 +640,10 @@ static inline int free_pages_check(struct page *page)
641 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 640 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
642 bad_flags = PAGE_FLAGS_CHECK_AT_FREE; 641 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
643 } 642 }
644 if (unlikely(mem_cgroup_bad_page_check(page))) 643#ifdef CONFIG_MEMCG
645 bad_reason = "cgroup check failed"; 644 if (unlikely(page->mem_cgroup))
645 bad_reason = "page still charged to cgroup";
646#endif
646 if (unlikely(bad_reason)) { 647 if (unlikely(bad_reason)) {
647 bad_page(page, bad_reason, bad_flags); 648 bad_page(page, bad_reason, bad_flags);
648 return 1; 649 return 1;
@@ -741,6 +742,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
741 int i; 742 int i;
742 int bad = 0; 743 int bad = 0;
743 744
745 VM_BUG_ON_PAGE(PageTail(page), page);
746 VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page);
747
744 trace_mm_page_free(page, order); 748 trace_mm_page_free(page, order);
745 kmemcheck_free_shadow(page, order); 749 kmemcheck_free_shadow(page, order);
746 750
@@ -898,8 +902,10 @@ static inline int check_new_page(struct page *page)
898 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; 902 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
899 bad_flags = PAGE_FLAGS_CHECK_AT_PREP; 903 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
900 } 904 }
901 if (unlikely(mem_cgroup_bad_page_check(page))) 905#ifdef CONFIG_MEMCG
902 bad_reason = "cgroup check failed"; 906 if (unlikely(page->mem_cgroup))
907 bad_reason = "page still charged to cgroup";
908#endif
903 if (unlikely(bad_reason)) { 909 if (unlikely(bad_reason)) {
904 bad_page(page, bad_reason, bad_flags); 910 bad_page(page, bad_reason, bad_flags);
905 return 1; 911 return 1;
@@ -1267,55 +1273,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1267#endif 1273#endif
1268 1274
1269/* 1275/*
1270 * Drain pages of the indicated processor. 1276 * Drain pcplists of the indicated processor and zone.
1271 * 1277 *
1272 * The processor must either be the current processor and the 1278 * The processor must either be the current processor and the
1273 * thread pinned to the current processor or a processor that 1279 * thread pinned to the current processor or a processor that
1274 * is not online. 1280 * is not online.
1275 */ 1281 */
1276static void drain_pages(unsigned int cpu) 1282static void drain_pages_zone(unsigned int cpu, struct zone *zone)
1277{ 1283{
1278 unsigned long flags; 1284 unsigned long flags;
1279 struct zone *zone; 1285 struct per_cpu_pageset *pset;
1286 struct per_cpu_pages *pcp;
1280 1287
1281 for_each_populated_zone(zone) { 1288 local_irq_save(flags);
1282 struct per_cpu_pageset *pset; 1289 pset = per_cpu_ptr(zone->pageset, cpu);
1283 struct per_cpu_pages *pcp;
1284 1290
1285 local_irq_save(flags); 1291 pcp = &pset->pcp;
1286 pset = per_cpu_ptr(zone->pageset, cpu); 1292 if (pcp->count) {
1293 free_pcppages_bulk(zone, pcp->count, pcp);
1294 pcp->count = 0;
1295 }
1296 local_irq_restore(flags);
1297}
1287 1298
1288 pcp = &pset->pcp; 1299/*
1289 if (pcp->count) { 1300 * Drain pcplists of all zones on the indicated processor.
1290 free_pcppages_bulk(zone, pcp->count, pcp); 1301 *
1291 pcp->count = 0; 1302 * The processor must either be the current processor and the
1292 } 1303 * thread pinned to the current processor or a processor that
1293 local_irq_restore(flags); 1304 * is not online.
1305 */
1306static void drain_pages(unsigned int cpu)
1307{
1308 struct zone *zone;
1309
1310 for_each_populated_zone(zone) {
1311 drain_pages_zone(cpu, zone);
1294 } 1312 }
1295} 1313}
1296 1314
1297/* 1315/*
1298 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1316 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1317 *
1318 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
1319 * the single zone's pages.
1299 */ 1320 */
1300void drain_local_pages(void *arg) 1321void drain_local_pages(struct zone *zone)
1301{ 1322{
1302 drain_pages(smp_processor_id()); 1323 int cpu = smp_processor_id();
1324
1325 if (zone)
1326 drain_pages_zone(cpu, zone);
1327 else
1328 drain_pages(cpu);
1303} 1329}
1304 1330
1305/* 1331/*
1306 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1332 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1307 * 1333 *
1334 * When zone parameter is non-NULL, spill just the single zone's pages.
1335 *
1308 * Note that this code is protected against sending an IPI to an offline 1336 * Note that this code is protected against sending an IPI to an offline
1309 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1337 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1310 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1338 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1311 * nothing keeps CPUs from showing up after we populated the cpumask and 1339 * nothing keeps CPUs from showing up after we populated the cpumask and
1312 * before the call to on_each_cpu_mask(). 1340 * before the call to on_each_cpu_mask().
1313 */ 1341 */
1314void drain_all_pages(void) 1342void drain_all_pages(struct zone *zone)
1315{ 1343{
1316 int cpu; 1344 int cpu;
1317 struct per_cpu_pageset *pcp;
1318 struct zone *zone;
1319 1345
1320 /* 1346 /*
1321 * Allocate in the BSS so we won't require allocation in 1347 * Allocate in the BSS so we won't require allocation in
@@ -1330,20 +1356,31 @@ void drain_all_pages(void)
1330 * disables preemption as part of its processing 1356 * disables preemption as part of its processing
1331 */ 1357 */
1332 for_each_online_cpu(cpu) { 1358 for_each_online_cpu(cpu) {
1359 struct per_cpu_pageset *pcp;
1360 struct zone *z;
1333 bool has_pcps = false; 1361 bool has_pcps = false;
1334 for_each_populated_zone(zone) { 1362
1363 if (zone) {
1335 pcp = per_cpu_ptr(zone->pageset, cpu); 1364 pcp = per_cpu_ptr(zone->pageset, cpu);
1336 if (pcp->pcp.count) { 1365 if (pcp->pcp.count)
1337 has_pcps = true; 1366 has_pcps = true;
1338 break; 1367 } else {
1368 for_each_populated_zone(z) {
1369 pcp = per_cpu_ptr(z->pageset, cpu);
1370 if (pcp->pcp.count) {
1371 has_pcps = true;
1372 break;
1373 }
1339 } 1374 }
1340 } 1375 }
1376
1341 if (has_pcps) 1377 if (has_pcps)
1342 cpumask_set_cpu(cpu, &cpus_with_pcps); 1378 cpumask_set_cpu(cpu, &cpus_with_pcps);
1343 else 1379 else
1344 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1380 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1345 } 1381 }
1346 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1382 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
1383 zone, 1);
1347} 1384}
1348 1385
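drain_local_pages() and drain_all_pages() now take an optional zone: a non-NULL zone drains only that zone's pcplists, NULL keeps the old drain-everything behaviour, and the IPI is still sent only to CPUs that actually have queued pages. A toy model of that calling convention (a negative index plays the role of the NULL zone):

#include <stdio.h>

#define TOY_CPUS  2
#define TOY_ZONES 3

/* Toy per-CPU, per-zone pcplist page counts. */
static unsigned int pcp_count[TOY_CPUS][TOY_ZONES] = {
        { 5, 0, 7 },
        { 0, 3, 1 },
};

/* Mirrors the new calling convention: a negative zone means "all zones",
 * like passing a NULL zone to drain_all_pages()/drain_local_pages(). */
static void toy_drain_cpu(int cpu, int zone)
{
        if (zone >= 0) {
                pcp_count[cpu][zone] = 0;
                return;
        }
        for (int z = 0; z < TOY_ZONES; z++)
                pcp_count[cpu][z] = 0;
}

static void toy_drain_all(int zone)
{
        for (int cpu = 0; cpu < TOY_CPUS; cpu++) {
                /* the kernel first checks whether this CPU has anything queued
                 * (for the requested zone, or any zone) before sending an IPI */
                toy_drain_cpu(cpu, zone);
        }
}

int main(void)
{
        toy_drain_all(2);       /* drain a single zone everywhere */
        printf("cpu0 zone0=%u zone2=%u\n", pcp_count[0][0], pcp_count[0][2]);
        toy_drain_all(-1);      /* NULL zone: drain everything */
        printf("cpu0 zone0=%u cpu1 zone1=%u\n", pcp_count[0][0], pcp_count[1][1]);
        return 0;
}

The per-zone variant matters to callers such as memory offlining and memory-failure handling, which care about a single zone and can skip draining the rest.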
1349#ifdef CONFIG_HIBERNATION 1386#ifdef CONFIG_HIBERNATION
@@ -1705,7 +1742,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1705 unsigned long mark, int classzone_idx, int alloc_flags, 1742 unsigned long mark, int classzone_idx, int alloc_flags,
1706 long free_pages) 1743 long free_pages)
1707{ 1744{
1708 /* free_pages my go negative - that's OK */ 1745 /* free_pages may go negative - that's OK */
1709 long min = mark; 1746 long min = mark;
1710 int o; 1747 int o;
1711 long free_cma = 0; 1748 long free_cma = 0;
@@ -2296,7 +2333,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2296 int classzone_idx, int migratetype, enum migrate_mode mode, 2333 int classzone_idx, int migratetype, enum migrate_mode mode,
2297 int *contended_compaction, bool *deferred_compaction) 2334 int *contended_compaction, bool *deferred_compaction)
2298{ 2335{
2299 struct zone *last_compact_zone = NULL;
2300 unsigned long compact_result; 2336 unsigned long compact_result;
2301 struct page *page; 2337 struct page *page;
2302 2338
@@ -2307,7 +2343,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2307 compact_result = try_to_compact_pages(zonelist, order, gfp_mask, 2343 compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
2308 nodemask, mode, 2344 nodemask, mode,
2309 contended_compaction, 2345 contended_compaction,
2310 &last_compact_zone); 2346 alloc_flags, classzone_idx);
2311 current->flags &= ~PF_MEMALLOC; 2347 current->flags &= ~PF_MEMALLOC;
2312 2348
2313 switch (compact_result) { 2349 switch (compact_result) {
@@ -2326,10 +2362,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2326 */ 2362 */
2327 count_vm_event(COMPACTSTALL); 2363 count_vm_event(COMPACTSTALL);
2328 2364
2329 /* Page migration frees to the PCP lists but we want merging */
2330 drain_pages(get_cpu());
2331 put_cpu();
2332
2333 page = get_page_from_freelist(gfp_mask, nodemask, 2365 page = get_page_from_freelist(gfp_mask, nodemask,
2334 order, zonelist, high_zoneidx, 2366 order, zonelist, high_zoneidx,
2335 alloc_flags & ~ALLOC_NO_WATERMARKS, 2367 alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2345,14 +2377,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2345 } 2377 }
2346 2378
2347 /* 2379 /*
2348 * last_compact_zone is where try_to_compact_pages thought allocation
2349 * should succeed, so it did not defer compaction. But here we know
2350 * that it didn't succeed, so we do the defer.
2351 */
2352 if (last_compact_zone && mode != MIGRATE_ASYNC)
2353 defer_compaction(last_compact_zone, order);
2354
2355 /*
2356 * It's bad if compaction run occurs and fails. The most likely reason 2380 * It's bad if compaction run occurs and fails. The most likely reason
2357 * is that pages exist, but not enough to satisfy watermarks. 2381 * is that pages exist, but not enough to satisfy watermarks.
2358 */ 2382 */
@@ -2433,7 +2457,7 @@ retry:
2433 * pages are pinned on the per-cpu lists. Drain them and try again 2457 * pages are pinned on the per-cpu lists. Drain them and try again
2434 */ 2458 */
2435 if (!page && !drained) { 2459 if (!page && !drained) {
2436 drain_all_pages(); 2460 drain_all_pages(NULL);
2437 drained = true; 2461 drained = true;
2438 goto retry; 2462 goto retry;
2439 } 2463 }
@@ -3893,14 +3917,14 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3893 else 3917 else
3894 page_group_by_mobility_disabled = 0; 3918 page_group_by_mobility_disabled = 0;
3895 3919
3896 printk("Built %i zonelists in %s order, mobility grouping %s. " 3920 pr_info("Built %i zonelists in %s order, mobility grouping %s. "
3897 "Total pages: %ld\n", 3921 "Total pages: %ld\n",
3898 nr_online_nodes, 3922 nr_online_nodes,
3899 zonelist_order_name[current_zonelist_order], 3923 zonelist_order_name[current_zonelist_order],
3900 page_group_by_mobility_disabled ? "off" : "on", 3924 page_group_by_mobility_disabled ? "off" : "on",
3901 vm_total_pages); 3925 vm_total_pages);
3902#ifdef CONFIG_NUMA 3926#ifdef CONFIG_NUMA
3903 printk("Policy zone: %s\n", zone_names[policy_zone]); 3927 pr_info("Policy zone: %s\n", zone_names[policy_zone]);
3904#endif 3928#endif
3905} 3929}
3906 3930
@@ -4832,7 +4856,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4832#endif 4856#endif
4833 init_waitqueue_head(&pgdat->kswapd_wait); 4857 init_waitqueue_head(&pgdat->kswapd_wait);
4834 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4858 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4835 pgdat_page_cgroup_init(pgdat);
4836 4859
4837 for (j = 0; j < MAX_NR_ZONES; j++) { 4860 for (j = 0; j < MAX_NR_ZONES; j++) {
4838 struct zone *zone = pgdat->node_zones + j; 4861 struct zone *zone = pgdat->node_zones + j;
@@ -5334,33 +5357,33 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5334 find_zone_movable_pfns_for_nodes(); 5357 find_zone_movable_pfns_for_nodes();
5335 5358
5336 /* Print out the zone ranges */ 5359 /* Print out the zone ranges */
5337 printk("Zone ranges:\n"); 5360 pr_info("Zone ranges:\n");
5338 for (i = 0; i < MAX_NR_ZONES; i++) { 5361 for (i = 0; i < MAX_NR_ZONES; i++) {
5339 if (i == ZONE_MOVABLE) 5362 if (i == ZONE_MOVABLE)
5340 continue; 5363 continue;
5341 printk(KERN_CONT " %-8s ", zone_names[i]); 5364 pr_info(" %-8s ", zone_names[i]);
5342 if (arch_zone_lowest_possible_pfn[i] == 5365 if (arch_zone_lowest_possible_pfn[i] ==
5343 arch_zone_highest_possible_pfn[i]) 5366 arch_zone_highest_possible_pfn[i])
5344 printk(KERN_CONT "empty\n"); 5367 pr_cont("empty\n");
5345 else 5368 else
5346 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5369 pr_cont("[mem %0#10lx-%0#10lx]\n",
5347 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5370 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5348 (arch_zone_highest_possible_pfn[i] 5371 (arch_zone_highest_possible_pfn[i]
5349 << PAGE_SHIFT) - 1); 5372 << PAGE_SHIFT) - 1);
5350 } 5373 }
5351 5374
5352 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5375 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5353 printk("Movable zone start for each node\n"); 5376 pr_info("Movable zone start for each node\n");
5354 for (i = 0; i < MAX_NUMNODES; i++) { 5377 for (i = 0; i < MAX_NUMNODES; i++) {
5355 if (zone_movable_pfn[i]) 5378 if (zone_movable_pfn[i])
5356 printk(" Node %d: %#010lx\n", i, 5379 pr_info(" Node %d: %#010lx\n", i,
5357 zone_movable_pfn[i] << PAGE_SHIFT); 5380 zone_movable_pfn[i] << PAGE_SHIFT);
5358 } 5381 }
5359 5382
5360 /* Print out the early node map */ 5383 /* Print out the early node map */
5361 printk("Early memory node ranges\n"); 5384 pr_info("Early memory node ranges\n");
5362 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5385 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5363 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5386 pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5364 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5387 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5365 5388
5366 /* Initialise every node */ 5389 /* Initialise every node */
@@ -5496,7 +5519,7 @@ void __init mem_init_print_info(const char *str)
5496 5519
5497#undef adj_init_size 5520#undef adj_init_size
5498 5521
5499 printk("Memory: %luK/%luK available " 5522 pr_info("Memory: %luK/%luK available "
5500 "(%luK kernel code, %luK rwdata, %luK rodata, " 5523 "(%luK kernel code, %luK rwdata, %luK rodata, "
5501 "%luK init, %luK bss, %luK reserved" 5524 "%luK init, %luK bss, %luK reserved"
5502#ifdef CONFIG_HIGHMEM 5525#ifdef CONFIG_HIGHMEM
@@ -6385,7 +6408,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6385 */ 6408 */
6386 6409
6387 lru_add_drain_all(); 6410 lru_add_drain_all();
6388 drain_all_pages(); 6411 drain_all_pages(cc.zone);
6389 6412
6390 order = 0; 6413 order = 0;
6391 outer_start = start; 6414 outer_start = start;
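
Note on the drain_all_pages() changes in this file: the helper now takes a zone argument, so callers that know which zone they care about (alloc_contig_range() above with cc.zone, the page isolation path further down) can restrict the drain, while passing NULL in the -2433/+2457 hunk keeps the old drain-everything behaviour for the allocation retry. As a loose illustration of that drain-and-retry shape, here is a minimal userspace sketch; the names and the single-threaded "pool"/"cache" pair are invented for the demo and are not kernel code:

#include <stdbool.h>
#include <stdio.h>

static int shared_pool;    /* free blocks visible to everyone      */
static int local_cache;    /* blocks parked in a pcp-style cache   */

static bool take_from_pool(void)
{
	if (shared_pool > 0) {
		shared_pool--;
		return true;
	}
	return false;
}

static void drain_local_cache(void)
{
	/* give cached blocks back so they can be handed out (or merged) */
	shared_pool += local_cache;
	local_cache = 0;
}

static bool alloc_block(void)
{
	bool drained = false;

retry:
	if (take_from_pool())
		return true;
	if (!drained) {            /* same shape as the !page && !drained check */
		drain_local_cache();
		drained = true;
		goto retry;
	}
	return false;
}

int main(void)
{
	shared_pool = 0;
	local_cache = 2;
	printf("allocation %s\n", alloc_block() ? "succeeded" : "failed");
	return 0;
}

The point of the toy is only the control flow: drain once, retry once, then give up and fall through to the slower paths.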
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
deleted file mode 100644
index 5331c2bd85a2..000000000000
--- a/mm/page_cgroup.c
+++ /dev/null
@@ -1,530 +0,0 @@
1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/bit_spinlock.h>
5#include <linux/page_cgroup.h>
6#include <linux/hash.h>
7#include <linux/slab.h>
8#include <linux/memory.h>
9#include <linux/vmalloc.h>
10#include <linux/cgroup.h>
11#include <linux/swapops.h>
12#include <linux/kmemleak.h>
13
14static unsigned long total_usage;
15
16#if !defined(CONFIG_SPARSEMEM)
17
18
19void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
20{
21 pgdat->node_page_cgroup = NULL;
22}
23
24struct page_cgroup *lookup_page_cgroup(struct page *page)
25{
26 unsigned long pfn = page_to_pfn(page);
27 unsigned long offset;
28 struct page_cgroup *base;
29
30 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
31#ifdef CONFIG_DEBUG_VM
32 /*
33 * The sanity checks the page allocator does upon freeing a
34 * page can reach here before the page_cgroup arrays are
35 * allocated when feeding a range of pages to the allocator
36 * for the first time during bootup or memory hotplug.
37 */
38 if (unlikely(!base))
39 return NULL;
40#endif
41 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
42 return base + offset;
43}
44
45static int __init alloc_node_page_cgroup(int nid)
46{
47 struct page_cgroup *base;
48 unsigned long table_size;
49 unsigned long nr_pages;
50
51 nr_pages = NODE_DATA(nid)->node_spanned_pages;
52 if (!nr_pages)
53 return 0;
54
55 table_size = sizeof(struct page_cgroup) * nr_pages;
56
57 base = memblock_virt_alloc_try_nid_nopanic(
58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
59 BOOTMEM_ALLOC_ACCESSIBLE, nid);
60 if (!base)
61 return -ENOMEM;
62 NODE_DATA(nid)->node_page_cgroup = base;
63 total_usage += table_size;
64 return 0;
65}
66
67void __init page_cgroup_init_flatmem(void)
68{
69
70 int nid, fail;
71
72 if (mem_cgroup_disabled())
73 return;
74
75 for_each_online_node(nid) {
76 fail = alloc_node_page_cgroup(nid);
77 if (fail)
78 goto fail;
79 }
80 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
81 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
82 " don't want memory cgroups\n");
83 return;
84fail:
85 printk(KERN_CRIT "allocation of page_cgroup failed.\n");
86 printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
87 panic("Out of memory");
88}
89
90#else /* CONFIG_FLAT_NODE_MEM_MAP */
91
92struct page_cgroup *lookup_page_cgroup(struct page *page)
93{
94 unsigned long pfn = page_to_pfn(page);
95 struct mem_section *section = __pfn_to_section(pfn);
96#ifdef CONFIG_DEBUG_VM
97 /*
98 * The sanity checks the page allocator does upon freeing a
99 * page can reach here before the page_cgroup arrays are
100 * allocated when feeding a range of pages to the allocator
101 * for the first time during bootup or memory hotplug.
102 */
103 if (!section->page_cgroup)
104 return NULL;
105#endif
106 return section->page_cgroup + pfn;
107}
108
109static void *__meminit alloc_page_cgroup(size_t size, int nid)
110{
111 gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
112 void *addr = NULL;
113
114 addr = alloc_pages_exact_nid(nid, size, flags);
115 if (addr) {
116 kmemleak_alloc(addr, size, 1, flags);
117 return addr;
118 }
119
120 if (node_state(nid, N_HIGH_MEMORY))
121 addr = vzalloc_node(size, nid);
122 else
123 addr = vzalloc(size);
124
125 return addr;
126}
127
128static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
129{
130 struct mem_section *section;
131 struct page_cgroup *base;
132 unsigned long table_size;
133
134 section = __pfn_to_section(pfn);
135
136 if (section->page_cgroup)
137 return 0;
138
139 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
140 base = alloc_page_cgroup(table_size, nid);
141
142 /*
143 * The value stored in section->page_cgroup is (base - pfn)
144 * and it does not point to the memory block allocated above,
145 * causing kmemleak false positives.
146 */
147 kmemleak_not_leak(base);
148
149 if (!base) {
150 printk(KERN_ERR "page cgroup allocation failure\n");
151 return -ENOMEM;
152 }
153
154 /*
155 * The passed "pfn" may not be aligned to SECTION. For the calculation
156 * we need to apply a mask.
157 */
158 pfn &= PAGE_SECTION_MASK;
159 section->page_cgroup = base - pfn;
160 total_usage += table_size;
161 return 0;
162}
163#ifdef CONFIG_MEMORY_HOTPLUG
164static void free_page_cgroup(void *addr)
165{
166 if (is_vmalloc_addr(addr)) {
167 vfree(addr);
168 } else {
169 struct page *page = virt_to_page(addr);
170 size_t table_size =
171 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
172
173 BUG_ON(PageReserved(page));
174 kmemleak_free(addr);
175 free_pages_exact(addr, table_size);
176 }
177}
178
179static void __free_page_cgroup(unsigned long pfn)
180{
181 struct mem_section *ms;
182 struct page_cgroup *base;
183
184 ms = __pfn_to_section(pfn);
185 if (!ms || !ms->page_cgroup)
186 return;
187 base = ms->page_cgroup + pfn;
188 free_page_cgroup(base);
189 ms->page_cgroup = NULL;
190}
191
192static int __meminit online_page_cgroup(unsigned long start_pfn,
193 unsigned long nr_pages,
194 int nid)
195{
196 unsigned long start, end, pfn;
197 int fail = 0;
198
199 start = SECTION_ALIGN_DOWN(start_pfn);
200 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
201
202 if (nid == -1) {
203 /*
204 * In this case, "nid" already exists and contains valid memory.
205 * "start_pfn" passed to us is a pfn which is an arg for
206 * online__pages(), and start_pfn should exist.
207 */
208 nid = pfn_to_nid(start_pfn);
209 VM_BUG_ON(!node_state(nid, N_ONLINE));
210 }
211
212 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
213 if (!pfn_present(pfn))
214 continue;
215 fail = init_section_page_cgroup(pfn, nid);
216 }
217 if (!fail)
218 return 0;
219
220 /* rollback */
221 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
222 __free_page_cgroup(pfn);
223
224 return -ENOMEM;
225}
226
227static int __meminit offline_page_cgroup(unsigned long start_pfn,
228 unsigned long nr_pages, int nid)
229{
230 unsigned long start, end, pfn;
231
232 start = SECTION_ALIGN_DOWN(start_pfn);
233 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
234
235 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
236 __free_page_cgroup(pfn);
237 return 0;
238
239}
240
241static int __meminit page_cgroup_callback(struct notifier_block *self,
242 unsigned long action, void *arg)
243{
244 struct memory_notify *mn = arg;
245 int ret = 0;
246 switch (action) {
247 case MEM_GOING_ONLINE:
248 ret = online_page_cgroup(mn->start_pfn,
249 mn->nr_pages, mn->status_change_nid);
250 break;
251 case MEM_OFFLINE:
252 offline_page_cgroup(mn->start_pfn,
253 mn->nr_pages, mn->status_change_nid);
254 break;
255 case MEM_CANCEL_ONLINE:
256 offline_page_cgroup(mn->start_pfn,
257 mn->nr_pages, mn->status_change_nid);
258 break;
259 case MEM_GOING_OFFLINE:
260 break;
261 case MEM_ONLINE:
262 case MEM_CANCEL_OFFLINE:
263 break;
264 }
265
266 return notifier_from_errno(ret);
267}
268
269#endif
270
271void __init page_cgroup_init(void)
272{
273 unsigned long pfn;
274 int nid;
275
276 if (mem_cgroup_disabled())
277 return;
278
279 for_each_node_state(nid, N_MEMORY) {
280 unsigned long start_pfn, end_pfn;
281
282 start_pfn = node_start_pfn(nid);
283 end_pfn = node_end_pfn(nid);
284 /*
285 * start_pfn and end_pfn may not be aligned to SECTION and the
286 * page->flags of out of node pages are not initialized. So we
287 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
288 */
289 for (pfn = start_pfn;
290 pfn < end_pfn;
291 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
292
293 if (!pfn_valid(pfn))
294 continue;
295 /*
296 * Nodes's pfns can be overlapping.
297 * We know some arch can have a nodes layout such as
298 * -------------pfn-------------->
299 * N0 | N1 | N2 | N0 | N1 | N2|....
300 */
301 if (pfn_to_nid(pfn) != nid)
302 continue;
303 if (init_section_page_cgroup(pfn, nid))
304 goto oom;
305 }
306 }
307 hotplug_memory_notifier(page_cgroup_callback, 0);
308 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
309 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
310 "don't want memory cgroups\n");
311 return;
312oom:
313 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
314 panic("Out of memory");
315}
316
317void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
318{
319 return;
320}
321
322#endif
323
324
325#ifdef CONFIG_MEMCG_SWAP
326
327static DEFINE_MUTEX(swap_cgroup_mutex);
328struct swap_cgroup_ctrl {
329 struct page **map;
330 unsigned long length;
331 spinlock_t lock;
332};
333
334static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
335
336struct swap_cgroup {
337 unsigned short id;
338};
339#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
340
341/*
342 * SwapCgroup implements "lookup" and "exchange" operations.
343 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
344 * against SwapCache. At swap_free(), this is accessed directly from swap.
345 *
346 * This means,
347 * - we have no race in "exchange" when we're accessed via SwapCache because
348 * SwapCache(and its swp_entry) is under lock.
349 * - When called via swap_free(), there is no user of this entry and no race.
350 * Then, we don't need lock around "exchange".
351 *
352 * TODO: we can push these buffers out to HIGHMEM.
353 */
354
355/*
356 * allocate buffer for swap_cgroup.
357 */
358static int swap_cgroup_prepare(int type)
359{
360 struct page *page;
361 struct swap_cgroup_ctrl *ctrl;
362 unsigned long idx, max;
363
364 ctrl = &swap_cgroup_ctrl[type];
365
366 for (idx = 0; idx < ctrl->length; idx++) {
367 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
368 if (!page)
369 goto not_enough_page;
370 ctrl->map[idx] = page;
371 }
372 return 0;
373not_enough_page:
374 max = idx;
375 for (idx = 0; idx < max; idx++)
376 __free_page(ctrl->map[idx]);
377
378 return -ENOMEM;
379}
380
381static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
382 struct swap_cgroup_ctrl **ctrlp)
383{
384 pgoff_t offset = swp_offset(ent);
385 struct swap_cgroup_ctrl *ctrl;
386 struct page *mappage;
387 struct swap_cgroup *sc;
388
389 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
390 if (ctrlp)
391 *ctrlp = ctrl;
392
393 mappage = ctrl->map[offset / SC_PER_PAGE];
394 sc = page_address(mappage);
395 return sc + offset % SC_PER_PAGE;
396}
397
398/**
399 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
400 * @ent: swap entry to be cmpxchged
401 * @old: old id
402 * @new: new id
403 *
404 * Returns old id at success, 0 at failure.
405 * (There is no mem_cgroup using 0 as its id)
406 */
407unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
408 unsigned short old, unsigned short new)
409{
410 struct swap_cgroup_ctrl *ctrl;
411 struct swap_cgroup *sc;
412 unsigned long flags;
413 unsigned short retval;
414
415 sc = lookup_swap_cgroup(ent, &ctrl);
416
417 spin_lock_irqsave(&ctrl->lock, flags);
418 retval = sc->id;
419 if (retval == old)
420 sc->id = new;
421 else
422 retval = 0;
423 spin_unlock_irqrestore(&ctrl->lock, flags);
424 return retval;
425}
426
427/**
428 * swap_cgroup_record - record mem_cgroup for this swp_entry.
429 * @ent: swap entry to be recorded into
430 * @id: mem_cgroup to be recorded
431 *
432 * Returns old value at success, 0 at failure.
433 * (Of course, old value can be 0.)
434 */
435unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
436{
437 struct swap_cgroup_ctrl *ctrl;
438 struct swap_cgroup *sc;
439 unsigned short old;
440 unsigned long flags;
441
442 sc = lookup_swap_cgroup(ent, &ctrl);
443
444 spin_lock_irqsave(&ctrl->lock, flags);
445 old = sc->id;
446 sc->id = id;
447 spin_unlock_irqrestore(&ctrl->lock, flags);
448
449 return old;
450}
451
452/**
453 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
454 * @ent: swap entry to be looked up.
455 *
456 * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
457 */
458unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
459{
460 return lookup_swap_cgroup(ent, NULL)->id;
461}
462
463int swap_cgroup_swapon(int type, unsigned long max_pages)
464{
465 void *array;
466 unsigned long array_size;
467 unsigned long length;
468 struct swap_cgroup_ctrl *ctrl;
469
470 if (!do_swap_account)
471 return 0;
472
473 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
474 array_size = length * sizeof(void *);
475
476 array = vzalloc(array_size);
477 if (!array)
478 goto nomem;
479
480 ctrl = &swap_cgroup_ctrl[type];
481 mutex_lock(&swap_cgroup_mutex);
482 ctrl->length = length;
483 ctrl->map = array;
484 spin_lock_init(&ctrl->lock);
485 if (swap_cgroup_prepare(type)) {
486 /* memory shortage */
487 ctrl->map = NULL;
488 ctrl->length = 0;
489 mutex_unlock(&swap_cgroup_mutex);
490 vfree(array);
491 goto nomem;
492 }
493 mutex_unlock(&swap_cgroup_mutex);
494
495 return 0;
496nomem:
497 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
498 printk(KERN_INFO
499 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
500 return -ENOMEM;
501}
502
503void swap_cgroup_swapoff(int type)
504{
505 struct page **map;
506 unsigned long i, length;
507 struct swap_cgroup_ctrl *ctrl;
508
509 if (!do_swap_account)
510 return;
511
512 mutex_lock(&swap_cgroup_mutex);
513 ctrl = &swap_cgroup_ctrl[type];
514 map = ctrl->map;
515 length = ctrl->length;
516 ctrl->map = NULL;
517 ctrl->length = 0;
518 mutex_unlock(&swap_cgroup_mutex);
519
520 if (map) {
521 for (i = 0; i < length; i++) {
522 struct page *page = map[i];
523 if (page)
524 __free_page(page);
525 }
526 vfree(map);
527 }
528}
529
530#endif
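
A note on the lookup scheme the deleted file used: each memory section kept a flat array of struct page_cgroup, and the stored pointer was pre-biased by the section's first pfn ("section->page_cgroup = base - pfn"), so lookup_page_cgroup() could index with the raw pfn; that bias is also why the kmemleak_not_leak() annotation was needed, since the stored value does not point into the allocation. A small userspace sketch of the same idea follows; the names, the section size, and the explicit (pfn - section_start) form are all demo choices, kept portable rather than reproducing the kernel's biased pointer:

#include <stdio.h>
#include <stdlib.h>

#define PAGES_PER_SECTION 1024UL        /* hypothetical section size */

struct meta { unsigned long flags; };   /* stand-in for struct page_cgroup */

static struct meta *section_base;       /* table covering one section */
static unsigned long section_start;     /* first pfn of that section  */

static int init_section(unsigned long pfn)
{
	section_start = pfn & ~(PAGES_PER_SECTION - 1);   /* section alignment */
	section_base = calloc(PAGES_PER_SECTION, sizeof(*section_base));
	return section_base ? 0 : -1;
}

static struct meta *lookup(unsigned long pfn)
{
	/*
	 * The deleted kernel code stored "base - start_pfn" directly, so
	 * the lookup was just "section->page_cgroup + pfn".  The demo
	 * keeps the bias explicit to stay within portable C.
	 */
	return section_base + (pfn - section_start);
}

int main(void)
{
	if (init_section(5000))
		return 1;
	lookup(5003)->flags = 0x1;
	printf("flags at pfn 5003: %#lx\n", lookup(5003)->flags);
	return 0;
}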
diff --git a/mm/page_counter.c b/mm/page_counter.c
new file mode 100644
index 000000000000..a009574fbba9
--- /dev/null
+++ b/mm/page_counter.c
@@ -0,0 +1,192 @@
1/*
2 * Lockless hierarchical page accounting & limiting
3 *
4 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
5 */
6
7#include <linux/page_counter.h>
8#include <linux/atomic.h>
9#include <linux/kernel.h>
10#include <linux/string.h>
11#include <linux/sched.h>
12#include <linux/bug.h>
13#include <asm/page.h>
14
15/**
16 * page_counter_cancel - take pages out of the local counter
17 * @counter: counter
18 * @nr_pages: number of pages to cancel
19 */
20void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
21{
22 long new;
23
24 new = atomic_long_sub_return(nr_pages, &counter->count);
25 /* More uncharges than charges? */
26 WARN_ON_ONCE(new < 0);
27}
28
29/**
30 * page_counter_charge - hierarchically charge pages
31 * @counter: counter
32 * @nr_pages: number of pages to charge
33 *
34 * NOTE: This does not consider any configured counter limits.
35 */
36void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
37{
38 struct page_counter *c;
39
40 for (c = counter; c; c = c->parent) {
41 long new;
42
43 new = atomic_long_add_return(nr_pages, &c->count);
44 /*
45 * This is indeed racy, but we can live with some
46 * inaccuracy in the watermark.
47 */
48 if (new > c->watermark)
49 c->watermark = new;
50 }
51}
52
53/**
54 * page_counter_try_charge - try to hierarchically charge pages
55 * @counter: counter
56 * @nr_pages: number of pages to charge
57 * @fail: points first counter to hit its limit, if any
58 *
59 * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
60 * its ancestors has hit its configured limit.
61 */
62int page_counter_try_charge(struct page_counter *counter,
63 unsigned long nr_pages,
64 struct page_counter **fail)
65{
66 struct page_counter *c;
67
68 for (c = counter; c; c = c->parent) {
69 long new;
70 /*
71 * Charge speculatively to avoid an expensive CAS. If
72 * a bigger charge fails, it might falsely lock out a
73 * racing smaller charge and send it into reclaim
74 * early, but the error is limited to the difference
75 * between the two sizes, which is less than 2M/4M in
76 * case of a THP locking out a regular page charge.
77 *
78 * The atomic_long_add_return() implies a full memory
79 * barrier between incrementing the count and reading
80 * the limit. When racing with page_counter_limit(),
81 * we either see the new limit or the setter sees the
82 * counter has changed and retries.
83 */
84 new = atomic_long_add_return(nr_pages, &c->count);
85 if (new > c->limit) {
86 atomic_long_sub(nr_pages, &c->count);
87 /*
88 * This is racy, but we can live with some
89 * inaccuracy in the failcnt.
90 */
91 c->failcnt++;
92 *fail = c;
93 goto failed;
94 }
95 /*
96 * Just like with failcnt, we can live with some
97 * inaccuracy in the watermark.
98 */
99 if (new > c->watermark)
100 c->watermark = new;
101 }
102 return 0;
103
104failed:
105 for (c = counter; c != *fail; c = c->parent)
106 page_counter_cancel(c, nr_pages);
107
108 return -ENOMEM;
109}
110
111/**
112 * page_counter_uncharge - hierarchically uncharge pages
113 * @counter: counter
114 * @nr_pages: number of pages to uncharge
115 */
116void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
117{
118 struct page_counter *c;
119
120 for (c = counter; c; c = c->parent)
121 page_counter_cancel(c, nr_pages);
122}
123
124/**
125 * page_counter_limit - limit the number of pages allowed
126 * @counter: counter
127 * @limit: limit to set
128 *
129 * Returns 0 on success, -EBUSY if the current number of pages on the
130 * counter already exceeds the specified limit.
131 *
132 * The caller must serialize invocations on the same counter.
133 */
134int page_counter_limit(struct page_counter *counter, unsigned long limit)
135{
136 for (;;) {
137 unsigned long old;
138 long count;
139
140 /*
141 * Update the limit while making sure that it's not
142 * below the concurrently-changing counter value.
143 *
144 * The xchg implies two full memory barriers before
145 * and after, so the read-swap-read is ordered and
146 * ensures coherency with page_counter_try_charge():
147 * that function modifies the count before checking
148 * the limit, so if it sees the old limit, we see the
149 * modified counter and retry.
150 */
151 count = atomic_long_read(&counter->count);
152
153 if (count > limit)
154 return -EBUSY;
155
156 old = xchg(&counter->limit, limit);
157
158 if (atomic_long_read(&counter->count) <= count)
159 return 0;
160
161 counter->limit = old;
162 cond_resched();
163 }
164}
165
166/**
167 * page_counter_memparse - memparse() for page counter limits
168 * @buf: string to parse
169 * @nr_pages: returns the result in number of pages
170 *
171 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
172 * limited to %PAGE_COUNTER_MAX.
173 */
174int page_counter_memparse(const char *buf, unsigned long *nr_pages)
175{
176 char unlimited[] = "-1";
177 char *end;
178 u64 bytes;
179
180 if (!strncmp(buf, unlimited, sizeof(unlimited))) {
181 *nr_pages = PAGE_COUNTER_MAX;
182 return 0;
183 }
184
185 bytes = memparse(buf, &end);
186 if (*end != '\0')
187 return -EINVAL;
188
189 *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
190
191 return 0;
192}
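
The comment block in page_counter_try_charge() above describes the central trick: charge speculatively with one atomic add, compare the post-add value against the limit, and unwind on failure. A compact userspace rendering of that flow, using C11 atomics; the struct and function names are invented for the sketch and it is not the kernel API:

#include <stdatomic.h>
#include <stdio.h>

struct counter {
	atomic_long count;
	atomic_long limit;
	struct counter *parent;
};

static int try_charge(struct counter *c, long nr, struct counter **fail)
{
	struct counter *p;

	for (p = c; p; p = p->parent) {
		long new = atomic_fetch_add(&p->count, nr) + nr;

		if (new > atomic_load(&p->limit)) {
			atomic_fetch_sub(&p->count, nr);   /* undo this level */
			*fail = p;
			goto unwind;
		}
	}
	return 0;

unwind:
	/* back out of the ancestors that were already charged */
	for (p = c; p != *fail; p = p->parent)
		atomic_fetch_sub(&p->count, nr);
	return -1;
}

int main(void)
{
	struct counter root  = { .limit = 4,   .count = 0, .parent = NULL };
	struct counter child = { .limit = 100, .count = 0, .parent = &root };
	struct counter *fail = NULL;

	printf("first charge:  %d\n", try_charge(&child, 3, &fail)); /* 0: fits      */
	printf("second charge: %d\n", try_charge(&child, 3, &fail)); /* -1: root full */
	printf("child count after rollback: %ld\n",
	       atomic_load(&child.count));                           /* back to 3    */
	return 0;
}

The same toy types can express the page_counter_limit() retry loop described in its comment: snapshot the count, publish the new limit with an exchange, then re-read the count and revert if a concurrent charge overtook the snapshot (the "read-swap-read" ordering). Again a sketch only, with -1 standing in for -EBUSY:

static int set_limit(struct counter *c, long limit)
{
	for (;;) {
		long old, count;

		count = atomic_load(&c->count);
		if (count > limit)
			return -1;                 /* -EBUSY in the kernel      */

		old = atomic_exchange(&c->limit, limit);

		if (atomic_load(&c->count) <= count)
			return 0;                  /* no charge raced past us   */

		atomic_store(&c->limit, old);      /* revert and retry          */
	}
}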
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c8778f7e208e..72f5ac381ab3 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -68,7 +68,7 @@ out:
68 68
69 spin_unlock_irqrestore(&zone->lock, flags); 69 spin_unlock_irqrestore(&zone->lock, flags);
70 if (!ret) 70 if (!ret)
71 drain_all_pages(); 71 drain_all_pages(zone);
72 return ret; 72 return ret;
73} 73}
74 74
diff --git a/mm/rmap.c b/mm/rmap.c
index 3e4c7213210c..45eba36fd673 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1053,7 +1053,7 @@ void page_add_file_rmap(struct page *page)
1053 __inc_zone_page_state(page, NR_FILE_MAPPED); 1053 __inc_zone_page_state(page, NR_FILE_MAPPED);
1054 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1054 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
1055 } 1055 }
1056 mem_cgroup_end_page_stat(memcg, locked, flags); 1056 mem_cgroup_end_page_stat(memcg, &locked, &flags);
1057} 1057}
1058 1058
1059static void page_remove_file_rmap(struct page *page) 1059static void page_remove_file_rmap(struct page *page)
@@ -1083,7 +1083,7 @@ static void page_remove_file_rmap(struct page *page)
1083 if (unlikely(PageMlocked(page))) 1083 if (unlikely(PageMlocked(page)))
1084 clear_page_mlock(page); 1084 clear_page_mlock(page);
1085out: 1085out:
1086 mem_cgroup_end_page_stat(memcg, locked, flags); 1086 mem_cgroup_end_page_stat(memcg, &locked, &flags);
1087} 1087}
1088 1088
1089/** 1089/**
diff --git a/mm/slab.c b/mm/slab.c
index f34e053ec46e..79e15f0a2a6e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2590,7 +2590,10 @@ static int cache_grow(struct kmem_cache *cachep,
2590 * Be lazy and only check for valid flags here, keeping it out of the 2590 * Be lazy and only check for valid flags here, keeping it out of the
2591 * critical path in kmem_cache_alloc(). 2591 * critical path in kmem_cache_alloc().
2592 */ 2592 */
2593 BUG_ON(flags & GFP_SLAB_BUG_MASK); 2593 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
2594 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
2595 BUG();
2596 }
2594 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2597 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2595 2598
2596 /* Take the node list lock to change the colour_next on this node */ 2599 /* Take the node list lock to change the colour_next on this node */
@@ -3580,11 +3583,11 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3580 3583
3581 for_each_online_node(node) { 3584 for_each_online_node(node) {
3582 3585
3583 if (use_alien_caches) { 3586 if (use_alien_caches) {
3584 new_alien = alloc_alien_cache(node, cachep->limit, gfp); 3587 new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3585 if (!new_alien) 3588 if (!new_alien)
3586 goto fail; 3589 goto fail;
3587 } 3590 }
3588 3591
3589 new_shared = NULL; 3592 new_shared = NULL;
3590 if (cachep->shared) { 3593 if (cachep->shared) {
@@ -4043,12 +4046,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4043 4046
4044#ifdef CONFIG_DEBUG_SLAB_LEAK 4047#ifdef CONFIG_DEBUG_SLAB_LEAK
4045 4048
4046static void *leaks_start(struct seq_file *m, loff_t *pos)
4047{
4048 mutex_lock(&slab_mutex);
4049 return seq_list_start(&slab_caches, *pos);
4050}
4051
4052static inline int add_caller(unsigned long *n, unsigned long v) 4049static inline int add_caller(unsigned long *n, unsigned long v)
4053{ 4050{
4054 unsigned long *p; 4051 unsigned long *p;
@@ -4170,7 +4167,7 @@ static int leaks_show(struct seq_file *m, void *p)
4170} 4167}
4171 4168
4172static const struct seq_operations slabstats_op = { 4169static const struct seq_operations slabstats_op = {
4173 .start = leaks_start, 4170 .start = slab_start,
4174 .next = slab_next, 4171 .next = slab_next,
4175 .stop = slab_stop, 4172 .stop = slab_stop,
4176 .show = leaks_show, 4173 .show = leaks_show,
diff --git a/mm/slab.h b/mm/slab.h
index ab019e63e3c2..1cf4005482dd 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -209,15 +209,15 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx)
209 209
210 rcu_read_lock(); 210 rcu_read_lock();
211 params = rcu_dereference(s->memcg_params); 211 params = rcu_dereference(s->memcg_params);
212 cachep = params->memcg_caches[idx];
213 rcu_read_unlock();
214 212
215 /* 213 /*
216 * Make sure we will access the up-to-date value. The code updating 214 * Make sure we will access the up-to-date value. The code updating
217 * memcg_caches issues a write barrier to match this (see 215 * memcg_caches issues a write barrier to match this (see
218 * memcg_register_cache()). 216 * memcg_register_cache()).
219 */ 217 */
220 smp_read_barrier_depends(); 218 cachep = lockless_dereference(params->memcg_caches[idx]);
219 rcu_read_unlock();
220
221 return cachep; 221 return cachep;
222} 222}
223 223
@@ -357,7 +357,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
357 357
358#endif 358#endif
359 359
360void *slab_start(struct seq_file *m, loff_t *pos);
360void *slab_next(struct seq_file *m, void *p, loff_t *pos); 361void *slab_next(struct seq_file *m, void *p, loff_t *pos);
361void slab_stop(struct seq_file *m, void *p); 362void slab_stop(struct seq_file *m, void *p);
363int memcg_slab_show(struct seq_file *m, void *p);
362 364
363#endif /* MM_SLAB_H */ 365#endif /* MM_SLAB_H */
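
The slab.h hunk above replaces an open-coded pointer load followed by smp_read_barrier_depends() with lockless_dereference(), which bundles the READ_ONCE()-style load and the dependency barrier into one step so the two cannot drift apart (and keeps both inside the RCU read-side section). A rough userspace analogue, offered only as an illustration and not as the kernel macro, is a dependency-ordered load of a published pointer:

#include <stdatomic.h>
#include <stdio.h>

struct payload { int value; };

static _Atomic(struct payload *) shared;

static void publish(struct payload *p)
{
	/* writer side: initialise the object, then publish the pointer */
	atomic_store_explicit(&shared, p, memory_order_release);
}

static int read_value(void)
{
	/*
	 * Reader side: one dependency-ordered load.  memory_order_consume
	 * (promoted to acquire by most compilers) plays the role of the
	 * load and the dependency barrier taken together.
	 */
	struct payload *p = atomic_load_explicit(&shared, memory_order_consume);

	return p ? p->value : -1;
}

int main(void)
{
	static struct payload pl = { .value = 42 };

	publish(&pl);                 /* single-threaded here, just so it runs */
	printf("%d\n", read_value());
	return 0;
}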
diff --git a/mm/slab_common.c b/mm/slab_common.c
index dcdab81bd240..e03dd6f2a272 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -240,7 +240,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
240 size = ALIGN(size, align); 240 size = ALIGN(size, align);
241 flags = kmem_cache_flags(size, flags, name, NULL); 241 flags = kmem_cache_flags(size, flags, name, NULL);
242 242
243 list_for_each_entry(s, &slab_caches, list) { 243 list_for_each_entry_reverse(s, &slab_caches, list) {
244 if (slab_unmergeable(s)) 244 if (slab_unmergeable(s))
245 continue; 245 continue;
246 246
@@ -811,7 +811,7 @@ EXPORT_SYMBOL(kmalloc_order_trace);
811#define SLABINFO_RIGHTS S_IRUSR 811#define SLABINFO_RIGHTS S_IRUSR
812#endif 812#endif
813 813
814void print_slabinfo_header(struct seq_file *m) 814static void print_slabinfo_header(struct seq_file *m)
815{ 815{
816 /* 816 /*
817 * Output format version, so at least we can change it 817 * Output format version, so at least we can change it
@@ -834,14 +834,9 @@ void print_slabinfo_header(struct seq_file *m)
834 seq_putc(m, '\n'); 834 seq_putc(m, '\n');
835} 835}
836 836
837static void *s_start(struct seq_file *m, loff_t *pos) 837void *slab_start(struct seq_file *m, loff_t *pos)
838{ 838{
839 loff_t n = *pos;
840
841 mutex_lock(&slab_mutex); 839 mutex_lock(&slab_mutex);
842 if (!n)
843 print_slabinfo_header(m);
844
845 return seq_list_start(&slab_caches, *pos); 840 return seq_list_start(&slab_caches, *pos);
846} 841}
847 842
@@ -881,7 +876,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
881 } 876 }
882} 877}
883 878
884int cache_show(struct kmem_cache *s, struct seq_file *m) 879static void cache_show(struct kmem_cache *s, struct seq_file *m)
885{ 880{
886 struct slabinfo sinfo; 881 struct slabinfo sinfo;
887 882
@@ -900,17 +895,32 @@ int cache_show(struct kmem_cache *s, struct seq_file *m)
900 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); 895 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
901 slabinfo_show_stats(m, s); 896 slabinfo_show_stats(m, s);
902 seq_putc(m, '\n'); 897 seq_putc(m, '\n');
898}
899
900static int slab_show(struct seq_file *m, void *p)
901{
902 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
903
904 if (p == slab_caches.next)
905 print_slabinfo_header(m);
906 if (is_root_cache(s))
907 cache_show(s, m);
903 return 0; 908 return 0;
904} 909}
905 910
906static int s_show(struct seq_file *m, void *p) 911#ifdef CONFIG_MEMCG_KMEM
912int memcg_slab_show(struct seq_file *m, void *p)
907{ 913{
908 struct kmem_cache *s = list_entry(p, struct kmem_cache, list); 914 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
915 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
909 916
910 if (!is_root_cache(s)) 917 if (p == slab_caches.next)
911 return 0; 918 print_slabinfo_header(m);
912 return cache_show(s, m); 919 if (!is_root_cache(s) && s->memcg_params->memcg == memcg)
920 cache_show(s, m);
921 return 0;
913} 922}
923#endif
914 924
915/* 925/*
916 * slabinfo_op - iterator that generates /proc/slabinfo 926 * slabinfo_op - iterator that generates /proc/slabinfo
@@ -926,10 +936,10 @@ static int s_show(struct seq_file *m, void *p)
926 * + further values on SMP and with statistics enabled 936 * + further values on SMP and with statistics enabled
927 */ 937 */
928static const struct seq_operations slabinfo_op = { 938static const struct seq_operations slabinfo_op = {
929 .start = s_start, 939 .start = slab_start,
930 .next = slab_next, 940 .next = slab_next,
931 .stop = slab_stop, 941 .stop = slab_stop,
932 .show = s_show, 942 .show = slab_show,
933}; 943};
934 944
935static int slabinfo_open(struct inode *inode, struct file *file) 945static int slabinfo_open(struct inode *inode, struct file *file)
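
With s_start() gone, /proc/slabinfo and the new per-memcg reader share slab_start(), and the header is now emitted from the ->show callback by checking whether the element being shown is the first entry of slab_caches. A minimal userspace sketch of that idiom, using a plain linked list with made-up entries instead of the kernel's seq_file machinery:

#include <stdio.h>

struct entry {
	const char *name;
	int objects;
	struct entry *next;
};

/* stand-in for the slab_caches list: head -> "dentry" -> "kmalloc-64" */
static struct entry b    = { "kmalloc-64", 12, NULL };
static struct entry a    = { "dentry",     40, &b };
static struct entry head = { NULL,          0, &a };

/* mirrors slab_show(): emit the header only for the first real entry */
static int show(struct entry *e)
{
	if (e == head.next)
		printf("# name            <active_objs>\n");
	printf("%-16s  %d\n", e->name, e->objects);
	return 0;
}

int main(void)
{
	struct entry *e;

	for (e = head.next; e; e = e->next)
		show(e);
	return 0;
}

Moving the header decision into ->show is what lets both seq_operations tables reuse the same start/next/stop trio while filtering or formatting differently per entry.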
diff --git a/mm/slub.c b/mm/slub.c
index ae7b9f1ad394..386bbed76e94 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -849,12 +849,12 @@ static int check_slab(struct kmem_cache *s, struct page *page)
849 maxobj = order_objects(compound_order(page), s->size, s->reserved); 849 maxobj = order_objects(compound_order(page), s->size, s->reserved);
850 if (page->objects > maxobj) { 850 if (page->objects > maxobj) {
851 slab_err(s, page, "objects %u > max %u", 851 slab_err(s, page, "objects %u > max %u",
852 s->name, page->objects, maxobj); 852 page->objects, maxobj);
853 return 0; 853 return 0;
854 } 854 }
855 if (page->inuse > page->objects) { 855 if (page->inuse > page->objects) {
856 slab_err(s, page, "inuse %u > max %u", 856 slab_err(s, page, "inuse %u > max %u",
857 s->name, page->inuse, page->objects); 857 page->inuse, page->objects);
858 return 0; 858 return 0;
859 } 859 }
860 /* Slab_pad_check fixes things up after itself */ 860 /* Slab_pad_check fixes things up after itself */
@@ -871,7 +871,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
871 int nr = 0; 871 int nr = 0;
872 void *fp; 872 void *fp;
873 void *object = NULL; 873 void *object = NULL;
874 unsigned long max_objects; 874 int max_objects;
875 875
876 fp = page->freelist; 876 fp = page->freelist;
877 while (fp && nr <= page->objects) { 877 while (fp && nr <= page->objects) {
@@ -1377,7 +1377,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1377 int order; 1377 int order;
1378 int idx; 1378 int idx;
1379 1379
1380 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1380 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1381 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
1382 BUG();
1383 }
1381 1384
1382 page = allocate_slab(s, 1385 page = allocate_slab(s,
1383 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 1386 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
@@ -2554,7 +2557,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2554 2557
2555 } else { /* Needs to be taken off a list */ 2558 } else { /* Needs to be taken off a list */
2556 2559
2557 n = get_node(s, page_to_nid(page)); 2560 n = get_node(s, page_to_nid(page));
2558 /* 2561 /*
2559 * Speculatively acquire the list_lock. 2562 * Speculatively acquire the list_lock.
2560 * If the cmpxchg does not succeed then we may 2563 * If the cmpxchg does not succeed then we may
@@ -2587,10 +2590,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2587 * The list lock was not taken therefore no list 2590 * The list lock was not taken therefore no list
2588 * activity can be necessary. 2591 * activity can be necessary.
2589 */ 2592 */
2590 if (was_frozen) 2593 if (was_frozen)
2591 stat(s, FREE_FROZEN); 2594 stat(s, FREE_FROZEN);
2592 return; 2595 return;
2593 } 2596 }
2594 2597
2595 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) 2598 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
2596 goto slab_empty; 2599 goto slab_empty;
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
new file mode 100644
index 000000000000..b5f7f24b8dd1
--- /dev/null
+++ b/mm/swap_cgroup.c
@@ -0,0 +1,208 @@
1#include <linux/swap_cgroup.h>
2#include <linux/vmalloc.h>
3#include <linux/mm.h>
4
5#include <linux/swapops.h> /* depends on mm.h include */
6
7static DEFINE_MUTEX(swap_cgroup_mutex);
8struct swap_cgroup_ctrl {
9 struct page **map;
10 unsigned long length;
11 spinlock_t lock;
12};
13
14static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
15
16struct swap_cgroup {
17 unsigned short id;
18};
19#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
20
21/*
22 * SwapCgroup implements "lookup" and "exchange" operations.
23 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
24 * against SwapCache. At swap_free(), this is accessed directly from swap.
25 *
26 * This means,
27 * - we have no race in "exchange" when we're accessed via SwapCache because
28 * SwapCache(and its swp_entry) is under lock.
29 * - When called via swap_free(), there is no user of this entry and no race.
30 * Then, we don't need lock around "exchange".
31 *
32 * TODO: we can push these buffers out to HIGHMEM.
33 */
34
35/*
36 * allocate buffer for swap_cgroup.
37 */
38static int swap_cgroup_prepare(int type)
39{
40 struct page *page;
41 struct swap_cgroup_ctrl *ctrl;
42 unsigned long idx, max;
43
44 ctrl = &swap_cgroup_ctrl[type];
45
46 for (idx = 0; idx < ctrl->length; idx++) {
47 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
48 if (!page)
49 goto not_enough_page;
50 ctrl->map[idx] = page;
51 }
52 return 0;
53not_enough_page:
54 max = idx;
55 for (idx = 0; idx < max; idx++)
56 __free_page(ctrl->map[idx]);
57
58 return -ENOMEM;
59}
60
61static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
62 struct swap_cgroup_ctrl **ctrlp)
63{
64 pgoff_t offset = swp_offset(ent);
65 struct swap_cgroup_ctrl *ctrl;
66 struct page *mappage;
67 struct swap_cgroup *sc;
68
69 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
70 if (ctrlp)
71 *ctrlp = ctrl;
72
73 mappage = ctrl->map[offset / SC_PER_PAGE];
74 sc = page_address(mappage);
75 return sc + offset % SC_PER_PAGE;
76}
77
78/**
79 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
80 * @ent: swap entry to be cmpxchged
81 * @old: old id
82 * @new: new id
83 *
84 * Returns old id at success, 0 at failure.
85 * (There is no mem_cgroup using 0 as its id)
86 */
87unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
88 unsigned short old, unsigned short new)
89{
90 struct swap_cgroup_ctrl *ctrl;
91 struct swap_cgroup *sc;
92 unsigned long flags;
93 unsigned short retval;
94
95 sc = lookup_swap_cgroup(ent, &ctrl);
96
97 spin_lock_irqsave(&ctrl->lock, flags);
98 retval = sc->id;
99 if (retval == old)
100 sc->id = new;
101 else
102 retval = 0;
103 spin_unlock_irqrestore(&ctrl->lock, flags);
104 return retval;
105}
106
107/**
108 * swap_cgroup_record - record mem_cgroup for this swp_entry.
109 * @ent: swap entry to be recorded into
110 * @id: mem_cgroup to be recorded
111 *
112 * Returns old value at success, 0 at failure.
113 * (Of course, old value can be 0.)
114 */
115unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
116{
117 struct swap_cgroup_ctrl *ctrl;
118 struct swap_cgroup *sc;
119 unsigned short old;
120 unsigned long flags;
121
122 sc = lookup_swap_cgroup(ent, &ctrl);
123
124 spin_lock_irqsave(&ctrl->lock, flags);
125 old = sc->id;
126 sc->id = id;
127 spin_unlock_irqrestore(&ctrl->lock, flags);
128
129 return old;
130}
131
132/**
133 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
134 * @ent: swap entry to be looked up.
135 *
136 * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
137 */
138unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
139{
140 return lookup_swap_cgroup(ent, NULL)->id;
141}
142
143int swap_cgroup_swapon(int type, unsigned long max_pages)
144{
145 void *array;
146 unsigned long array_size;
147 unsigned long length;
148 struct swap_cgroup_ctrl *ctrl;
149
150 if (!do_swap_account)
151 return 0;
152
153 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
154 array_size = length * sizeof(void *);
155
156 array = vzalloc(array_size);
157 if (!array)
158 goto nomem;
159
160 ctrl = &swap_cgroup_ctrl[type];
161 mutex_lock(&swap_cgroup_mutex);
162 ctrl->length = length;
163 ctrl->map = array;
164 spin_lock_init(&ctrl->lock);
165 if (swap_cgroup_prepare(type)) {
166 /* memory shortage */
167 ctrl->map = NULL;
168 ctrl->length = 0;
169 mutex_unlock(&swap_cgroup_mutex);
170 vfree(array);
171 goto nomem;
172 }
173 mutex_unlock(&swap_cgroup_mutex);
174
175 return 0;
176nomem:
177 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
178 printk(KERN_INFO
179 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
180 return -ENOMEM;
181}
182
183void swap_cgroup_swapoff(int type)
184{
185 struct page **map;
186 unsigned long i, length;
187 struct swap_cgroup_ctrl *ctrl;
188
189 if (!do_swap_account)
190 return;
191
192 mutex_lock(&swap_cgroup_mutex);
193 ctrl = &swap_cgroup_ctrl[type];
194 map = ctrl->map;
195 length = ctrl->length;
196 ctrl->map = NULL;
197 ctrl->length = 0;
198 mutex_unlock(&swap_cgroup_mutex);
199
200 if (map) {
201 for (i = 0; i < length; i++) {
202 struct page *page = map[i];
203 if (page)
204 __free_page(page);
205 }
206 vfree(map);
207 }
208}
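
The new file keeps the old data structure: per swap type, a vector of page-sized chunks of unsigned short ids, addressed as map[offset / SC_PER_PAGE] plus offset % SC_PER_PAGE, so the table grows with the swap device instead of being one large allocation. A userspace sketch of that two-level store; the names and sizes are invented for the demo, and the kernel additionally takes ctrl->lock around the update:

#include <stdio.h>
#include <stdlib.h>

#define CHUNK_SIZE    4096UL
#define IDS_PER_CHUNK (CHUNK_SIZE / sizeof(unsigned short))

struct id_store {
	unsigned short **chunks;    /* one page-sized chunk per slot */
	size_t nr_chunks;
};

static int id_store_init(struct id_store *s, size_t max_entries)
{
	size_t i;

	s->nr_chunks = (max_entries + IDS_PER_CHUNK - 1) / IDS_PER_CHUNK;
	s->chunks = calloc(s->nr_chunks, sizeof(*s->chunks));
	if (!s->chunks)
		return -1;
	for (i = 0; i < s->nr_chunks; i++) {
		s->chunks[i] = calloc(1, CHUNK_SIZE);
		if (!s->chunks[i])
			return -1;    /* a real version would unwind here */
	}
	return 0;
}

static unsigned short *id_slot(struct id_store *s, size_t offset)
{
	/* two-level addressing, as in lookup_swap_cgroup() above */
	return &s->chunks[offset / IDS_PER_CHUNK][offset % IDS_PER_CHUNK];
}

/* mirrors swap_cgroup_record(): store a new id, hand back the old one */
static unsigned short id_record(struct id_store *s, size_t offset,
				unsigned short id)
{
	unsigned short *slot = id_slot(s, offset);
	unsigned short old = *slot;

	*slot = id;
	return old;
}

int main(void)
{
	struct id_store s;

	if (id_store_init(&s, 100000))
		return 1;
	printf("old id: %u\n", (unsigned)id_record(&s, 54321, 7));  /* 0 */
	printf("old id: %u\n", (unsigned)id_record(&s, 54321, 9));  /* 7 */
	return 0;
}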
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 154444918685..9711342987a0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,7 +17,6 @@
17#include <linux/blkdev.h> 17#include <linux/blkdev.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/migrate.h> 19#include <linux/migrate.h>
20#include <linux/page_cgroup.h>
21 20
22#include <asm/pgtable.h> 21#include <asm/pgtable.h>
23 22
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8798b2e0ac59..63f55ccb9b26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,7 +38,7 @@
38#include <asm/pgtable.h> 38#include <asm/pgtable.h>
39#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
40#include <linux/swapops.h> 40#include <linux/swapops.h>
41#include <linux/page_cgroup.h> 41#include <linux/swap_cgroup.h>
42 42
43static bool swap_count_continued(struct swap_info_struct *, pgoff_t, 43static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
44 unsigned char); 44 unsigned char);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 90520af7f186..8a18196fcdff 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -463,8 +463,7 @@ overflow:
463 goto retry; 463 goto retry;
464 } 464 }
465 if (printk_ratelimit()) 465 if (printk_ratelimit())
466 printk(KERN_WARNING 466 pr_warn("vmap allocation for size %lu failed: "
467 "vmap allocation for size %lu failed: "
468 "use vmalloc=<size> to increase size.\n", size); 467 "use vmalloc=<size> to increase size.\n", size);
469 kfree(va); 468 kfree(va);
470 return ERR_PTR(-EBUSY); 469 return ERR_PTR(-EBUSY);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcb47074ae03..4636d9e822c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -260,8 +260,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
260 do_div(delta, lru_pages + 1); 260 do_div(delta, lru_pages + 1);
261 total_scan += delta; 261 total_scan += delta;
262 if (total_scan < 0) { 262 if (total_scan < 0) {
263 printk(KERN_ERR 263 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
264 "shrink_slab: %pF negative objects to delete nr=%ld\n",
265 shrinker->scan_objects, total_scan); 264 shrinker->scan_objects, total_scan);
266 total_scan = freeable; 265 total_scan = freeable;
267 } 266 }
@@ -875,7 +874,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
875 * end of the LRU a second time. 874 * end of the LRU a second time.
876 */ 875 */
877 mapping = page_mapping(page); 876 mapping = page_mapping(page);
878 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || 877 if (((dirty || writeback) && mapping &&
878 bdi_write_congested(mapping->backing_dev_info)) ||
879 (writeback && PageReclaim(page))) 879 (writeback && PageReclaim(page)))
880 nr_congested++; 880 nr_congested++;
881 881
@@ -2249,7 +2249,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
2249 return true; 2249 return true;
2250 2250
2251 /* If compaction would go ahead or the allocation would succeed, stop */ 2251 /* If compaction would go ahead or the allocation would succeed, stop */
2252 switch (compaction_suitable(zone, sc->order)) { 2252 switch (compaction_suitable(zone, sc->order, 0, 0)) {
2253 case COMPACT_PARTIAL: 2253 case COMPACT_PARTIAL:
2254 case COMPACT_CONTINUE: 2254 case COMPACT_CONTINUE:
2255 return false; 2255 return false;
@@ -2346,7 +2346,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
2346 * If compaction is not ready to start and allocation is not likely 2346 * If compaction is not ready to start and allocation is not likely
2347 * to succeed without it, then keep reclaiming. 2347 * to succeed without it, then keep reclaiming.
2348 */ 2348 */
2349 if (compaction_suitable(zone, order) == COMPACT_SKIPPED) 2349 if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED)
2350 return false; 2350 return false;
2351 2351
2352 return watermark_ok; 2352 return watermark_ok;
@@ -2824,8 +2824,8 @@ static bool zone_balanced(struct zone *zone, int order,
2824 balance_gap, classzone_idx, 0)) 2824 balance_gap, classzone_idx, 0))
2825 return false; 2825 return false;
2826 2826
2827 if (IS_ENABLED(CONFIG_COMPACTION) && order && 2827 if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
2828 compaction_suitable(zone, order) == COMPACT_SKIPPED) 2828 order, 0, classzone_idx) == COMPACT_SKIPPED)
2829 return false; 2829 return false;
2830 2830
2831 return true; 2831 return true;
@@ -2952,8 +2952,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
2952 * from memory. Do not reclaim more than needed for compaction. 2952 * from memory. Do not reclaim more than needed for compaction.
2953 */ 2953 */
2954 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 2954 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2955 compaction_suitable(zone, sc->order) != 2955 compaction_suitable(zone, sc->order, 0, classzone_idx)
2956 COMPACT_SKIPPED) 2956 != COMPACT_SKIPPED)
2957 testorder = 0; 2957 testorder = 0;
2958 2958
2959 /* 2959 /*