Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c       76
-rw-r--r--  mm/filemap.c           2
-rw-r--r--  mm/huge_memory.c       5
-rw-r--r--  mm/hugetlb.c          12
-rw-r--r--  mm/ksm.c               6
-rw-r--r--  mm/memcontrol.c       81
-rw-r--r--  mm/memory-failure.c    4
-rw-r--r--  mm/memory.c            8
-rw-r--r--  mm/memory_hotplug.c    6
-rw-r--r--  mm/migrate.c           2
-rw-r--r--  mm/mmap.c             12
-rw-r--r--  mm/page_cgroup.c      71
-rw-r--r--  mm/rmap.c            106
-rw-r--r--  mm/slab.c              9
-rw-r--r--  mm/slub.c             12
-rw-r--r--  mm/thrash.c          105
-rw-r--r--  mm/vmscan.c           20
17 files changed, 383 insertions, 154 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 021a2960ef9e..6cc604bd5649 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -144,9 +144,20 @@ static void isolate_freepages(struct zone *zone,
     int nr_freepages = cc->nr_freepages;
     struct list_head *freelist = &cc->freepages;
 
+    /*
+     * Initialise the free scanner. The starting point is where we last
+     * scanned from (or the end of the zone if starting). The low point
+     * is the end of the pageblock the migration scanner is using.
+     */
     pfn = cc->free_pfn;
     low_pfn = cc->migrate_pfn + pageblock_nr_pages;
-    high_pfn = low_pfn;
+
+    /*
+     * Take care that if the migration scanner is at the end of the zone
+     * that the free scanner does not accidentally move to the next zone
+     * in the next isolation cycle.
+     */
+    high_pfn = min(low_pfn, pfn);
 
     /*
      * Isolate free pages until enough are available to migrate the
@@ -240,11 +251,18 @@ static bool too_many_isolated(struct zone *zone)
     return isolated > (inactive + active) / 2;
 }
 
+/* possible outcome of isolate_migratepages */
+typedef enum {
+    ISOLATE_ABORT,      /* Abort compaction now */
+    ISOLATE_NONE,       /* No pages isolated, continue scanning */
+    ISOLATE_SUCCESS,    /* Pages isolated, migrate */
+} isolate_migrate_t;
+
 /*
  * Isolate all pages that can be migrated from the block pointed to by
  * the migrate scanner within compact_control.
  */
-static unsigned long isolate_migratepages(struct zone *zone,
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
                     struct compact_control *cc)
 {
     unsigned long low_pfn, end_pfn;
@@ -261,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
     /* Do not cross the free scanner or scan within a memory hole */
     if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
         cc->migrate_pfn = end_pfn;
-        return 0;
+        return ISOLATE_NONE;
     }
 
     /*
@@ -270,10 +288,14 @@ static unsigned long isolate_migratepages(struct zone *zone,
      * delay for some time until fewer pages are isolated
      */
     while (unlikely(too_many_isolated(zone))) {
+        /* async migration should just abort */
+        if (!cc->sync)
+            return ISOLATE_ABORT;
+
         congestion_wait(BLK_RW_ASYNC, HZ/10);
 
         if (fatal_signal_pending(current))
-            return 0;
+            return ISOLATE_ABORT;
     }
 
     /* Time to isolate some pages for migration */
@@ -358,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
 
     trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
-    return cc->nr_migratepages;
+    return ISOLATE_SUCCESS;
 }
 
 /*
@@ -420,13 +442,6 @@ static int compact_finished(struct zone *zone,
     if (cc->free_pfn <= cc->migrate_pfn)
         return COMPACT_COMPLETE;
 
-    /* Compaction run is not finished if the watermark is not met */
-    watermark = low_wmark_pages(zone);
-    watermark += (1 << cc->order);
-
-    if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
-        return COMPACT_CONTINUE;
-
     /*
      * order == -1 is expected when compacting via
      * /proc/sys/vm/compact_memory
@@ -434,6 +449,13 @@ static int compact_finished(struct zone *zone,
     if (cc->order == -1)
         return COMPACT_CONTINUE;
 
+    /* Compaction run is not finished if the watermark is not met */
+    watermark = low_wmark_pages(zone);
+    watermark += (1 << cc->order);
+
+    if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+        return COMPACT_CONTINUE;
+
     /* Direct compactor: Is a suitable page free? */
     for (order = cc->order; order < MAX_ORDER; order++) {
         /* Job done if page is free of the right migratetype */
@@ -461,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
     unsigned long watermark;
 
     /*
+     * order == -1 is expected when compacting via
+     * /proc/sys/vm/compact_memory
+     */
+    if (order == -1)
+        return COMPACT_CONTINUE;
+
+    /*
      * Watermarks for order-0 must be met for compaction. Note the 2UL.
      * This is because during migration, copies of pages need to be
      * allocated and for a short time, the footprint is higher
@@ -470,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
         return COMPACT_SKIPPED;
 
     /*
-     * order == -1 is expected when compacting via
-     * /proc/sys/vm/compact_memory
-     */
-    if (order == -1)
-        return COMPACT_CONTINUE;
-
-    /*
      * fragmentation index determines if allocation failures are due to
      * low memory or external fragmentation
      *
-     * index of -1 implies allocations might succeed dependingon watermarks
+     * index of -1000 implies allocations might succeed depending on
+     * watermarks
      * index towards 0 implies failure is due to lack of memory
      * index towards 1000 implies failure is due to fragmentation
      *
@@ -490,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
     if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
         return COMPACT_SKIPPED;
 
-    if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+    if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
+                0, 0))
         return COMPACT_PARTIAL;
 
     return COMPACT_CONTINUE;
@@ -522,8 +546,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
         unsigned long nr_migrate, nr_remaining;
         int err;
 
-        if (!isolate_migratepages(zone, cc))
+        switch (isolate_migratepages(zone, cc)) {
+        case ISOLATE_ABORT:
+            ret = COMPACT_PARTIAL;
+            goto out;
+        case ISOLATE_NONE:
             continue;
+        case ISOLATE_SUCCESS:
+            ;
+        }
 
         nr_migrate = cc->nr_migratepages;
         err = migrate_pages(&cc->migratepages, compaction_alloc,
@@ -547,6 +578,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
     }
 
+out:
     /* Release free pages and check accounting */
     cc->nr_freepages -= release_freepages(&cc->freepages);
     VM_BUG_ON(cc->nr_freepages != 0);
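Note: the compaction change above turns isolate_migratepages() from a raw page count into a three-way result, so compact_zone() can tell "abort the whole run" apart from "nothing isolated in this block". A minimal userspace sketch of that pattern follows (hypothetical names, not the kernel code):

#include <stdio.h>

/* Tri-state scan result, mirroring the isolate_migrate_t idea above. */
typedef enum {
	SCAN_ABORT,	/* stop the whole pass now */
	SCAN_NONE,	/* nothing found here, try the next block */
	SCAN_SUCCESS,	/* work found, go process it */
} scan_result_t;

/* Hypothetical scanner: pretend block 3 has work and block 5 forces an abort. */
static scan_result_t scan_block(int block)
{
	if (block == 5)
		return SCAN_ABORT;
	return (block == 3) ? SCAN_SUCCESS : SCAN_NONE;
}

int main(void)
{
	for (int block = 0; block < 8; block++) {
		switch (scan_block(block)) {
		case SCAN_ABORT:
			printf("block %d: abort pass\n", block);
			return 0;
		case SCAN_NONE:
			continue;
		case SCAN_SUCCESS:
			printf("block %d: processing work\n", block);
		}
	}
	return 0;
}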
diff --git a/mm/filemap.c b/mm/filemap.c
index d7b10578a64b..a8251a8d3457 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2000,7 +2000,7 @@ int file_remove_suid(struct file *file)
     error = security_inode_killpriv(dentry);
     if (!error && killsuid)
         error = __remove_suid(dentry, killsuid);
-    if (!error)
+    if (!error && (inode->i_sb->s_flags & MS_NOSEC))
         inode->i_flags |= S_NOSEC;
 
     return error;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 615d9743a3cb..81532f297fd2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2234,11 +2234,8 @@ static void khugepaged_loop(void)
     while (likely(khugepaged_enabled())) {
 #ifndef CONFIG_NUMA
         hpage = khugepaged_alloc_hugepage();
-        if (unlikely(!hpage)) {
-            count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+        if (unlikely(!hpage))
             break;
-        }
-        count_vm_event(THP_COLLAPSE_ALLOC);
 #else
         if (IS_ERR(hpage)) {
             khugepaged_alloc_sleep();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f33bb319b73f..bfcf153bc829 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1033,10 +1033,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
      */
     chg = vma_needs_reservation(h, vma, addr);
     if (chg < 0)
-        return ERR_PTR(chg);
+        return ERR_PTR(-VM_FAULT_OOM);
     if (chg)
         if (hugetlb_get_quota(inode->i_mapping, chg))
-            return ERR_PTR(-ENOSPC);
+            return ERR_PTR(-VM_FAULT_SIGBUS);
 
     spin_lock(&hugetlb_lock);
     page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
@@ -1111,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void)
         WARN_ON(page_count(page) != 1);
         prep_compound_huge_page(page, h->order);
         prep_new_huge_page(h, page, page_to_nid(page));
+        /*
+         * If we had gigantic hugepages allocated at boot time, we need
+         * to restore the 'stolen' pages to totalram_pages in order to
+         * fix confusing memory reports from free(1) and another
+         * side-effects, like CommitLimit going negative.
+         */
+        if (h->order > (MAX_ORDER - 1))
+            totalram_pages += 1 << h->order;
     }
 }
 
diff --git a/mm/ksm.c b/mm/ksm.c
index d708b3ef2260..9a68b0cf0a1c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1302,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
         slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
         ksm_scan.mm_slot = slot;
         spin_unlock(&ksm_mmlist_lock);
+        /*
+         * Although we tested list_empty() above, a racing __ksm_exit
+         * of the last mm on the list may have removed it since then.
+         */
+        if (slot == &ksm_mm_head)
+            return NULL;
 next_mm:
         ksm_scan.address = 0;
         ksm_scan.rmap_list = &slot->rmap_list;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3ad..cf7d027a8844 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -359,7 +359,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -735,7 +735,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
                 struct mem_cgroup, css);
 }
 
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
     struct mem_cgroup *mem = NULL;
 
@@ -1663,15 +1663,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
     excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
     /* If memsw_is_minimum==1, swap-out is of-no-use. */
-    if (root_mem->memsw_is_minimum)
+    if (!check_soft && root_mem->memsw_is_minimum)
         noswap = true;
 
     while (1) {
         victim = mem_cgroup_select_victim(root_mem);
         if (victim == root_mem) {
             loop++;
-            if (loop >= 1)
-                drain_all_stock_async();
+            /*
+             * We are not draining per cpu cached charges during
+             * soft limit reclaim because global reclaim doesn't
+             * care about charges. It tries to free some memory and
+             * charges will not give any.
+             */
+            if (!check_soft && loop >= 1)
+                drain_all_stock_async(root_mem);
             if (loop >= 2) {
                 /*
                  * If we have not been able to reclaim
@@ -1934,9 +1940,11 @@ struct memcg_stock_pcp {
     struct mem_cgroup *cached; /* this never be root cgroup */
     unsigned int nr_pages;
     struct work_struct work;
+    unsigned long flags;
+#define FLUSHING_CACHED_CHARGE	(0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +1992,7 @@ static void drain_local_stock(struct work_struct *dummy)
 {
     struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
     drain_stock(stock);
+    clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
 /*
@@ -2008,26 +2017,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-    int cpu;
-    /* This function is for scheduling "drain" in asynchronous way.
-     * The result of "drain" is not directly handled by callers. Then,
-     * if someone is calling drain, we don't have to call drain more.
-     * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-     * there is a race. We just do loose check here.
+    int cpu, curcpu;
+    /*
+     * If someone calls draining, avoid adding more kworker runs.
      */
-    if (atomic_read(&memcg_drain_count))
+    if (!mutex_trylock(&percpu_charge_mutex))
         return;
     /* Notify other cpus that system-wide "drain" is running */
-    atomic_inc(&memcg_drain_count);
     get_online_cpus();
+    /*
+     * Get a hint for avoiding draining charges on the current cpu,
+     * which must be exhausted by our charging. It is not required that
+     * this be a precise check, so we use raw_smp_processor_id() instead of
+     * getcpu()/putcpu().
+     */
+    curcpu = raw_smp_processor_id();
     for_each_online_cpu(cpu) {
         struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-        schedule_work_on(cpu, &stock->work);
+        struct mem_cgroup *mem;
+
+        if (cpu == curcpu)
+            continue;
+
+        mem = stock->cached;
+        if (!mem)
+            continue;
+        if (mem != root_mem) {
+            if (!root_mem->use_hierarchy)
+                continue;
+            /* check whether "mem" is under tree of "root_mem" */
+            if (!css_is_ancestor(&mem->css, &root_mem->css))
+                continue;
+        }
+        if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+            schedule_work_on(cpu, &stock->work);
     }
     put_online_cpus();
-    atomic_dec(&memcg_drain_count);
+    mutex_unlock(&percpu_charge_mutex);
     /* We don't wait for flush_work */
 }
 
@@ -2035,9 +2063,9 @@ static void drain_all_stock_async(void)
 static void drain_all_stock_sync(void)
 {
     /* called when force_empty is called */
-    atomic_inc(&memcg_drain_count);
+    mutex_lock(&percpu_charge_mutex);
     schedule_on_each_cpu(drain_local_stock);
-    atomic_dec(&memcg_drain_count);
+    mutex_unlock(&percpu_charge_mutex);
 }
 
 /*
@@ -4640,6 +4668,7 @@ static struct cftype mem_cgroup_files[] = {
     {
         .name = "numa_stat",
         .open = mem_control_numa_stat_open,
+        .mode = S_IRUGO,
     },
 #endif
 };
@@ -5414,18 +5443,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                 struct cgroup *old_cont,
                 struct task_struct *p)
 {
-    struct mm_struct *mm;
+    struct mm_struct *mm = get_task_mm(p);
 
-    if (!mc.to)
-        /* no need to move charge */
-        return;
-
-    mm = get_task_mm(p);
     if (mm) {
-        mem_cgroup_move_charge(mm);
+        if (mc.to)
+            mem_cgroup_move_charge(mm);
+        put_swap_token(mm);
         mmput(mm);
     }
-    mem_cgroup_clear_mc();
+    if (mc.to)
+        mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
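Note: the drain_all_stock_async() rework above relies on two guards: a trylocked mutex so that only one drain pass is scheduled at a time, and a per-cpu FLUSHING_CACHED_CHARGE bit so that a given stock is never queued twice. A rough userspace sketch of that double guard, using C11 atomics and invented names (the kernel uses a mutex plus workqueues, not these primitives):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NWORKERS 4

static atomic_flag drain_in_progress = ATOMIC_FLAG_INIT;	/* like the trylocked percpu_charge_mutex */
static atomic_bool flushing[NWORKERS];				/* like FLUSHING_CACHED_CHARGE per cpu */

/* Hypothetical stand-in for schedule_work_on(). */
static void queue_drain(int worker)
{
	printf("queued drain on worker %d\n", worker);
}

static void drain_all_async(void)
{
	/* If a drain pass is already being scheduled, do not pile up more work. */
	if (atomic_flag_test_and_set(&drain_in_progress))
		return;

	for (int w = 0; w < NWORKERS; w++) {
		/* Skip workers that already have a drain pending. */
		if (atomic_exchange(&flushing[w], true))
			continue;
		queue_drain(w);
	}

	atomic_flag_clear(&drain_in_progress);
}

int main(void)
{
	drain_all_async();
	/* Second call queues nothing until a worker clears its pending flag. */
	drain_all_async();
	return 0;
}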
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5c8f7e08928d..eac0ba561491 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,7 @@
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1468,7 +1469,8 @@ int soft_offline_page(struct page *page, int flags)
     put_page(page);
     if (!ret) {
         LIST_HEAD(pagelist);
-
+        inc_zone_page_state(page, NR_ISOLATED_ANON +
+                    page_is_file_cache(page));
         list_add(&page->lru, &pagelist);
         ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
                     0, true);
diff --git a/mm/memory.c b/mm/memory.c
index 6953d3926e01..87d935333f0d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1112,11 +1112,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
     int force_flush = 0;
     int rss[NR_MM_COUNTERS];
     spinlock_t *ptl;
+    pte_t *start_pte;
     pte_t *pte;
 
 again:
     init_rss_vec(rss);
-    pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+    start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+    pte = start_pte;
     arch_enter_lazy_mmu_mode();
     do {
         pte_t ptent = *pte;
@@ -1196,7 +1198,7 @@ again:
 
     add_mm_rss_vec(mm, rss);
     arch_leave_lazy_mmu_mode();
-    pte_unmap_unlock(pte - 1, ptl);
+    pte_unmap_unlock(start_pte, ptl);
 
     /*
      * mmu_gather ran out of room to batch pages, we break out of
@@ -1296,7 +1298,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp: address of the caller's struct mmu_gather
+ * @tlb: address of the caller's struct mmu_gather
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9f646374e32f..02159c755136 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -494,6 +494,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
     /* init node's zones as empty zones, we don't have any present pages.*/
     free_area_init_node(nid, zones_size, start_pfn, zholes_size);
 
+    /*
+     * The node we allocated has no zone fallback lists. For avoiding
+     * to access not-initialized zonelist, build here.
+     */
+    build_all_zonelists(NULL);
+
     return pgdat;
 }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index e4a5c912983d..666e4e677414 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
      */
     __dec_zone_page_state(page, NR_FILE_PAGES);
     __inc_zone_page_state(newpage, NR_FILE_PAGES);
-    if (PageSwapBacked(page)) {
+    if (!PageSwapCache(page) && PageSwapBacked(page)) {
         __dec_zone_page_state(page, NR_SHMEM);
         __inc_zone_page_state(newpage, NR_SHMEM);
     }
diff --git a/mm/mmap.c b/mm/mmap.c
index bbdc9af5e117..d49736ff8a8d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -906,14 +906,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
     if (anon_vma)
         return anon_vma;
 try_prev:
-    /*
-     * It is potentially slow to have to call find_vma_prev here.
-     * But it's only on the first write fault on the vma, not
-     * every time, and we could devise a way to avoid it later
-     * (e.g. stash info in next's anon_vma_node when assigning
-     * an anon_vma, or when trying vma_merge). Another time.
-     */
-    BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
+    near = vma->vm_prev;
     if (!near)
         goto none;
 
@@ -2044,9 +2037,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
         return -EINVAL;
 
     /* Find the first overlapping VMA */
-    vma = find_vma_prev(mm, start, &prev);
+    vma = find_vma(mm, start);
     if (!vma)
         return 0;
+    prev = vma->vm_prev;
     /* we have start < vma->vm_end */
 
     /* if it doesn't overlap, we have nothing.. */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff61d1be..53bffc6c293e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr)
 }
 #endif
 
-static int __meminit init_section_page_cgroup(unsigned long pfn)
+static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
 {
     struct page_cgroup *base, *pc;
     struct mem_section *section;
     unsigned long table_size;
     unsigned long nr;
-    int nid, index;
+    int index;
 
     nr = pfn_to_section_nr(pfn);
     section = __nr_to_section(nr);
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
     if (section->page_cgroup)
         return 0;
 
-    nid = page_to_nid(pfn_to_page(pfn));
     table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
     base = alloc_page_cgroup(table_size, nid);
 
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
         pc = base + index;
         init_page_cgroup(pc, nr);
     }
-
+    /*
+     * The passed "pfn" may not be aligned to SECTION. For the calculation
+     * we need to apply a mask.
+     */
+    pfn &= PAGE_SECTION_MASK;
     section->page_cgroup = base - pfn;
     total_usage += table_size;
     return 0;
@@ -225,10 +228,20 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
     start = start_pfn & ~(PAGES_PER_SECTION - 1);
     end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 
+    if (nid == -1) {
+        /*
+         * In this case, "nid" already exists and contains valid memory.
+         * "start_pfn" passed to us is a pfn which is an arg for
+         * online__pages(), and start_pfn should exist.
+         */
+        nid = pfn_to_nid(start_pfn);
+        VM_BUG_ON(!node_state(nid, N_ONLINE));
+    }
+
     for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
         if (!pfn_present(pfn))
             continue;
-        fail = init_section_page_cgroup(pfn);
+        fail = init_section_page_cgroup(pfn, nid);
     }
     if (!fail)
         return 0;
@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
 void __init page_cgroup_init(void)
 {
     unsigned long pfn;
-    int fail = 0;
+    int nid;
 
     if (mem_cgroup_disabled())
         return;
 
-    for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
-        if (!pfn_present(pfn))
-            continue;
-        fail = init_section_page_cgroup(pfn);
-    }
-    if (fail) {
-        printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
-        panic("Out of memory");
-    } else {
-        hotplug_memory_notifier(page_cgroup_callback, 0);
+    for_each_node_state(nid, N_HIGH_MEMORY) {
+        unsigned long start_pfn, end_pfn;
+
+        start_pfn = node_start_pfn(nid);
+        end_pfn = node_end_pfn(nid);
+        /*
+         * start_pfn and end_pfn may not be aligned to SECTION and the
+         * page->flags of out of node pages are not initialized. So we
+         * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+         */
+        for (pfn = start_pfn;
+             pfn < end_pfn;
+             pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+            if (!pfn_valid(pfn))
+                continue;
+            /*
+             * Nodes's pfns can be overlapping.
+             * We know some arch can have a nodes layout such as
+             * -------------pfn-------------->
+             * N0 | N1 | N2 | N0 | N1 | N2|....
+             */
+            if (pfn_to_nid(pfn) != nid)
+                continue;
+            if (init_section_page_cgroup(pfn, nid))
+                goto oom;
+        }
     }
+    hotplug_memory_notifier(page_cgroup_callback, 0);
     printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-    printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
-    " want memory cgroups\n");
+    printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
+    "don't want memory cgroups\n");
+    return;
+oom:
+    printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+    panic("Out of memory");
 }
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
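Note: the new page_cgroup_init() above walks each node's pfn range one section at a time and skips sections whose pfn actually belongs to a different node, because node ranges may interleave. A toy model of that skip logic (made-up layout and helpers, not kernel code):

#include <stdio.h>

#define SECTION_SIZE 4	/* toy stand-in for PAGES_PER_SECTION */
#define MAX_PFN      24

/* Toy interleaved layout: N0 | N1 | N0 | N1 ..., as in the comment above. */
static int pfn_to_node(unsigned long pfn)
{
	return (int)((pfn / 8) % 2);
}

int main(void)
{
	for (int nid = 0; nid < 2; nid++) {
		printf("node %d sections:", nid);
		for (unsigned long pfn = 0; pfn < MAX_PFN; pfn += SECTION_SIZE) {
			/* Skip sections that belong to some other node. */
			if (pfn_to_node(pfn) != nid)
				continue;
			printf(" [%lu..%lu]", pfn, pfn + SECTION_SIZE - 1);
		}
		printf("\n");
	}
	return 0;
}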
diff --git a/mm/rmap.c b/mm/rmap.c
index 0eb463ea88dd..27dfd3b82b0f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -112,9 +112,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
     kmem_cache_free(anon_vma_cachep, anon_vma);
 }
 
-static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
 {
-    return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+    return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
 }
 
 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@ -159,7 +159,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
         struct mm_struct *mm = vma->vm_mm;
         struct anon_vma *allocated;
 
-        avc = anon_vma_chain_alloc();
+        avc = anon_vma_chain_alloc(GFP_KERNEL);
         if (!avc)
             goto out_enomem;
 
@@ -200,6 +200,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
     return -ENOMEM;
 }
 
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single mutex_lock for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+    struct anon_vma *new_root = anon_vma->root;
+    if (new_root != root) {
+        if (WARN_ON_ONCE(root))
+            mutex_unlock(&root->mutex);
+        root = new_root;
+        mutex_lock(&root->mutex);
+    }
+    return root;
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+    if (root)
+        mutex_unlock(&root->mutex);
+}
+
 static void anon_vma_chain_link(struct vm_area_struct *vma,
                 struct anon_vma_chain *avc,
                 struct anon_vma *anon_vma)
@@ -208,13 +234,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
     avc->anon_vma = anon_vma;
     list_add(&avc->same_vma, &vma->anon_vma_chain);
 
-    anon_vma_lock(anon_vma);
     /*
      * It's critical to add new vmas to the tail of the anon_vma,
      * see comment in huge_memory.c:__split_huge_page().
      */
     list_add_tail(&avc->same_anon_vma, &anon_vma->head);
-    anon_vma_unlock(anon_vma);
 }
 
 /*
@@ -224,13 +248,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
     struct anon_vma_chain *avc, *pavc;
+    struct anon_vma *root = NULL;
 
     list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
-        avc = anon_vma_chain_alloc();
-        if (!avc)
-            goto enomem_failure;
-        anon_vma_chain_link(dst, avc, pavc->anon_vma);
+        struct anon_vma *anon_vma;
+
+        avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+        if (unlikely(!avc)) {
+            unlock_anon_vma_root(root);
+            root = NULL;
+            avc = anon_vma_chain_alloc(GFP_KERNEL);
+            if (!avc)
+                goto enomem_failure;
+        }
+        anon_vma = pavc->anon_vma;
+        root = lock_anon_vma_root(root, anon_vma);
+        anon_vma_chain_link(dst, avc, anon_vma);
     }
+    unlock_anon_vma_root(root);
     return 0;
 
  enomem_failure:
@@ -263,7 +298,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
     anon_vma = anon_vma_alloc();
     if (!anon_vma)
         goto out_error;
-    avc = anon_vma_chain_alloc();
+    avc = anon_vma_chain_alloc(GFP_KERNEL);
     if (!avc)
         goto out_error_free_anon_vma;
 
@@ -280,7 +315,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
     get_anon_vma(anon_vma->root);
     /* Mark this anon_vma as the one where our new (COWed) pages go. */
     vma->anon_vma = anon_vma;
+    anon_vma_lock(anon_vma);
     anon_vma_chain_link(vma, avc, anon_vma);
+    anon_vma_unlock(anon_vma);
 
     return 0;
 
@@ -291,36 +328,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
     return -ENOMEM;
 }
 
-static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
-{
-    struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
-    int empty;
-
-    /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
-    if (!anon_vma)
-        return;
-
-    anon_vma_lock(anon_vma);
-    list_del(&anon_vma_chain->same_anon_vma);
-
-    /* We must garbage collect the anon_vma if it's empty */
-    empty = list_empty(&anon_vma->head);
-    anon_vma_unlock(anon_vma);
-
-    if (empty)
-        put_anon_vma(anon_vma);
-}
-
 void unlink_anon_vmas(struct vm_area_struct *vma)
 {
     struct anon_vma_chain *avc, *next;
+    struct anon_vma *root = NULL;
 
     /*
      * Unlink each anon_vma chained to the VMA. This list is ordered
      * from newest to oldest, ensuring the root anon_vma gets freed last.
      */
     list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
-        anon_vma_unlink(avc);
+        struct anon_vma *anon_vma = avc->anon_vma;
+
+        root = lock_anon_vma_root(root, anon_vma);
+        list_del(&avc->same_anon_vma);
+
+        /*
+         * Leave empty anon_vmas on the list - we'll need
+         * to free them outside the lock.
+         */
+        if (list_empty(&anon_vma->head))
+            continue;
+
+        list_del(&avc->same_vma);
+        anon_vma_chain_free(avc);
+    }
+    unlock_anon_vma_root(root);
+
+    /*
+     * Iterate the list once more, it now only contains empty and unlinked
+     * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+     * needing to acquire the anon_vma->root->mutex.
+     */
+    list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+        struct anon_vma *anon_vma = avc->anon_vma;
+
+        put_anon_vma(anon_vma);
+
         list_del(&avc->same_vma);
         anon_vma_chain_free(avc);
     }
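Note: lock_anon_vma_root()/unlock_anon_vma_root() above batch the root mutex across loop iterations: the lock is kept while consecutive entries share the same root and is only dropped and retaken when the root changes. A small pthread sketch of the same idea (hypothetical types, not the kernel locking):

#include <pthread.h>
#include <stdio.h>

struct item {
	pthread_mutex_t *root;	/* lock protecting this item's list */
	int payload;
};

/* Keep the previously taken root lock unless this item needs a different one. */
static pthread_mutex_t *lock_root(pthread_mutex_t *held, pthread_mutex_t *root)
{
	if (root != held) {
		if (held)
			pthread_mutex_unlock(held);
		pthread_mutex_lock(root);
		held = root;
	}
	return held;
}

static void unlock_root(pthread_mutex_t *held)
{
	if (held)
		pthread_mutex_unlock(held);
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	struct item items[] = { { &a, 1 }, { &a, 2 }, { &a, 3 } };
	pthread_mutex_t *held = NULL;

	/* All three items share one root, so the mutex is taken only once. */
	for (size_t i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
		held = lock_root(held, items[i].root);
		printf("visiting item %d under its root lock\n", items[i].payload);
	}
	unlock_root(held);
	return 0;
}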
diff --git a/mm/slab.c b/mm/slab.c
index bcfa4987c8ae..d96e223de775 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3604,13 +3604,14 @@ free_done:
  * Release an obj back to its cache. If the obj has a constructed state, it must
  * be in this state _before_ it is released. Called with disabled ints.
  */
-static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+static inline void __cache_free(struct kmem_cache *cachep, void *objp,
+    void *caller)
 {
     struct array_cache *ac = cpu_cache_get(cachep);
 
     check_irq_off();
     kmemleak_free_recursive(objp, cachep->flags);
-    objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+    objp = cache_free_debugcheck(cachep, objp, caller);
 
     kmemcheck_slab_free(cachep, objp, obj_size(cachep));
 
@@ -3801,7 +3802,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
     debug_check_no_locks_freed(objp, obj_size(cachep));
     if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
         debug_check_no_obj_freed(objp, obj_size(cachep));
-    __cache_free(cachep, objp);
+    __cache_free(cachep, objp, __builtin_return_address(0));
     local_irq_restore(flags);
 
     trace_kmem_cache_free(_RET_IP_, objp);
@@ -3831,7 +3832,7 @@ void kfree(const void *objp)
     c = virt_to_cache(objp);
     debug_check_no_locks_freed(objp, obj_size(c));
     debug_check_no_obj_freed(objp, obj_size(c));
-    __cache_free(c, (void *)objp);
+    __cache_free(c, (void *)objp, __builtin_return_address(0));
     local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kfree);
diff --git a/mm/slub.c b/mm/slub.c
index 7be0223531b0..35f351f26193 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2320,16 +2320,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
     BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
             SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
-#ifdef CONFIG_CMPXCHG_LOCAL
     /*
-     * Must align to double word boundary for the double cmpxchg instructions
-     * to work.
+     * Must align to double word boundary for the double cmpxchg
+     * instructions to work; see __pcpu_double_call_return_bool().
      */
-    s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
-#else
-    /* Regular alignment is sufficient */
-    s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
-#endif
+    s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
+                     2 * sizeof(void *));
 
     if (!s->cpu_slab)
         return 0;
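Note: the SLUB hunk above drops the CONFIG_CMPXCHG_LOCAL special case and always allocates the per-cpu structure with 2 * sizeof(void *) alignment, which is what a double-width compare-and-exchange requires. A hedged C11 illustration of stating that alignment requirement (a stand-in struct, not kmem_cache_cpu):

#include <stdalign.h>
#include <stdio.h>

/*
 * A double-width cmpxchg operates on two adjacent pointer-sized words,
 * so the pair must be aligned to 2 * sizeof(void *). alignas() expresses
 * the same requirement the __alloc_percpu() call above passes explicitly.
 */
struct freelist_pair {
	alignas(2 * sizeof(void *)) void *freelist;
	unsigned long counters;
};

int main(void)
{
	printf("alignof(struct freelist_pair) = %zu\n",
	       alignof(struct freelist_pair));
	return 0;
}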
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd8..fabf2d0f5169 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,14 +21,40 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
+#include <linux/memcontrol.h>
+
+#include <trace/events/vmscan.h>
+
+#define TOKEN_AGING_INTERVAL	(0xFF)
 
 static DEFINE_SPINLOCK(swap_token_lock);
 struct mm_struct *swap_token_mm;
+struct mem_cgroup *swap_token_memcg;
 static unsigned int global_faults;
+static unsigned int last_aging;
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
+{
+    struct mem_cgroup *memcg;
+
+    memcg = try_get_mem_cgroup_from_mm(mm);
+    if (memcg)
+        css_put(mem_cgroup_css(memcg));
+
+    return memcg;
+}
+#else
+static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
+{
+    return NULL;
+}
+#endif
 
 void grab_swap_token(struct mm_struct *mm)
 {
     int current_interval;
+    unsigned int old_prio = mm->token_priority;
 
     global_faults++;
 
@@ -38,40 +64,81 @@ void grab_swap_token(struct mm_struct *mm)
         return;
 
     /* First come first served */
-    if (swap_token_mm == NULL) {
-        mm->token_priority = mm->token_priority + 2;
-        swap_token_mm = mm;
-        goto out;
+    if (!swap_token_mm)
+        goto replace_token;
+
+    if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
+        swap_token_mm->token_priority /= 2;
+        last_aging = global_faults;
     }
 
-    if (mm != swap_token_mm) {
-        if (current_interval < mm->last_interval)
-            mm->token_priority++;
-        else {
-            if (likely(mm->token_priority > 0))
-                mm->token_priority--;
-        }
-        /* Check if we deserve the token */
-        if (mm->token_priority > swap_token_mm->token_priority) {
-            mm->token_priority += 2;
-            swap_token_mm = mm;
-        }
-    } else {
-        /* Token holder came in again! */
+    if (mm == swap_token_mm) {
         mm->token_priority += 2;
+        goto update_priority;
+    }
+
+    if (current_interval < mm->last_interval)
+        mm->token_priority++;
+    else {
+        if (likely(mm->token_priority > 0))
+            mm->token_priority--;
     }
 
+    /* Check if we deserve the token */
+    if (mm->token_priority > swap_token_mm->token_priority)
+        goto replace_token;
+
+update_priority:
+    trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
+
 out:
     mm->faultstamp = global_faults;
     mm->last_interval = current_interval;
     spin_unlock(&swap_token_lock);
+    return;
+
+replace_token:
+    mm->token_priority += 2;
+    trace_replace_swap_token(swap_token_mm, mm);
+    swap_token_mm = mm;
+    swap_token_memcg = swap_token_memcg_from_mm(mm);
+    last_aging = global_faults;
+    goto out;
 }
 
 /* Called on process exit. */
 void __put_swap_token(struct mm_struct *mm)
 {
     spin_lock(&swap_token_lock);
-    if (likely(mm == swap_token_mm))
+    if (likely(mm == swap_token_mm)) {
+        trace_put_swap_token(swap_token_mm);
         swap_token_mm = NULL;
+        swap_token_memcg = NULL;
+    }
     spin_unlock(&swap_token_lock);
 }
+
+static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
+{
+    if (!a)
+        return true;
+    if (!b)
+        return true;
+    if (a == b)
+        return true;
+    return false;
+}
+
+void disable_swap_token(struct mem_cgroup *memcg)
+{
+    /* memcg reclaim don't disable unrelated mm token. */
+    if (match_memcg(memcg, swap_token_memcg)) {
+        spin_lock(&swap_token_lock);
+        if (match_memcg(memcg, swap_token_memcg)) {
+            trace_disable_swap_token(swap_token_mm);
+            swap_token_mm = NULL;
+            swap_token_memcg = NULL;
+        }
        spin_unlock(&swap_token_lock);
+    }
+}
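Note: with the thrash.c changes above, the swap-token holder's priority is halved once every TOKEN_AGING_INTERVAL (0xFF) faults, and a faulting mm that ends up with a higher priority takes the token with a +2 boost. A toy calculation of how long an idle holder keeps the token against a steady challenger (illustrative numbers only, not the kernel's accounting):

#include <stdio.h>

#define TOKEN_AGING_INTERVAL 0xFF

int main(void)
{
	unsigned int holder_prio = 12, challenger_prio = 4;
	unsigned int global_faults = 0, last_aging = 0;

	/* Each iteration models one global fault; only aging changes priorities here. */
	while (challenger_prio <= holder_prio) {
		global_faults++;
		if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
			holder_prio /= 2;	/* idle holder's priority decays */
			last_aging = global_faults;
		}
	}
	/* The "+ 2" mirrors the boost a new holder gets when it takes the token. */
	printf("challenger (prio %u) takes the token after %u faults\n",
	       challenger_prio + 2, global_faults);
	return 0;
}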
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a088f9cc..8ff834e19c24 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1124,8 +1124,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 nr_lumpy_dirty++;
             scan++;
         } else {
-            /* the page is freed already. */
-            if (!page_count(cursor_page))
+            /*
+             * Check if the page is freed already.
+             *
+             * We can't use page_count() as that
+             * requires compound_head and we don't
+             * have a pin on the page here. If a
+             * page is tail, we may or may not
+             * have isolated the head, so assume
+             * it's not free, it'd be tricky to
+             * track the head status without a
+             * page pin.
+             */
+            if (!PageTail(cursor_page) &&
+                !atomic_read(&cursor_page->_count))
                 continue;
             break;
         }
@@ -2081,7 +2093,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
     for (priority = DEF_PRIORITY; priority >= 0; priority--) {
         sc->nr_scanned = 0;
         if (!priority)
-            disable_swap_token();
+            disable_swap_token(sc->mem_cgroup);
         total_scanned += shrink_zones(priority, zonelist, sc);
         /*
          * Don't shrink slabs when reclaiming memory from
@@ -2407,7 +2419,7 @@ loop_again:
 
         /* The swap token gets in the way of swapout... */
         if (!priority)
-            disable_swap_token();
+            disable_swap_token(NULL);
 
         all_zones_ok = 1;
         balanced = 0;