Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   4
-rw-r--r--  mm/Makefile         |   3
-rw-r--r--  mm/compaction.c     |  84
-rw-r--r--  mm/early_ioremap.c  | 245
-rw-r--r--  mm/filemap.c        |  86
-rw-r--r--  mm/huge_memory.c    |  21
-rw-r--r--  mm/hugetlb.c        |  14
-rw-r--r--  mm/internal.h       |  16
-rw-r--r--  mm/memblock.c       |  28
-rw-r--r--  mm/memcontrol.c     | 453
-rw-r--r--  mm/memory.c         | 147
-rw-r--r--  mm/mempolicy.c      |  46
-rw-r--r--  mm/mempool.c        |   4
-rw-r--r--  mm/mlock.c          |   2
-rw-r--r--  mm/mmap.c           |  55
-rw-r--r--  mm/mprotect.c       |  56
-rw-r--r--  mm/nommu.c          |  49
-rw-r--r--  mm/page-writeback.c |   4
-rw-r--r--  mm/page_alloc.c     | 118
-rw-r--r--  mm/readahead.c      |  21
-rw-r--r--  mm/rmap.c           |  14
-rw-r--r--  mm/shmem.c          |   7
-rw-r--r--  mm/slab.c           |   8
-rw-r--r--  mm/slab.h           |  21
-rw-r--r--  mm/slab_common.c    | 250
-rw-r--r--  mm/slub.c           |  87
-rw-r--r--  mm/sparse.c         |   4
-rw-r--r--  mm/util.c           |   5
-rw-r--r--  mm/vmacache.c       | 112
-rw-r--r--  mm/vmalloc.c        |  10
-rw-r--r--  mm/vmscan.c         |  12
-rw-r--r--  mm/zswap.c          |  78
32 files changed, 1342 insertions, 722 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 2888024e0b0a..ebe5880c29d6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -216,6 +216,7 @@ config PAGEFLAGS_EXTENDED
216 | # | 216 | # |
217 | config SPLIT_PTLOCK_CPUS | 217 | config SPLIT_PTLOCK_CPUS |
218 | int | 218 | int |
219 | default "999999" if !MMU | ||
219 | default "999999" if ARM && !CPU_CACHE_VIPT | 220 | default "999999" if ARM && !CPU_CACHE_VIPT |
220 | default "999999" if PARISC && !PA20 | 221 | default "999999" if PARISC && !PA20 |
221 | default "4" | 222 | default "4" |
@@ -577,3 +578,6 @@ config PGTABLE_MAPPING
577 | 578 | ||
578 | You can check speed with zsmalloc benchmark: | 579 | You can check speed with zsmalloc benchmark: |
579 | https://github.com/spartacus06/zsmapbench | 580 | https://github.com/spartacus06/zsmapbench |
581 | |||
582 | config GENERIC_EARLY_IOREMAP | ||
583 | bool | ||
diff --git a/mm/Makefile b/mm/Makefile
index cdd741519ee0..9e5aaf92197d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | util.o mmzone.o vmstat.o backing-dev.o \ | 17 | util.o mmzone.o vmstat.o backing-dev.o \ |
18 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o balloon_compaction.o \ | 19 | compaction.o balloon_compaction.o vmacache.o \ |
20 | interval_tree.o list_lru.o workingset.o $(mmu-y) | 20 | interval_tree.o list_lru.o workingset.o $(mmu-y) |
21 | 21 | ||
22 | obj-y += init-mm.o | 22 | obj-y += init-mm.o |
@@ -61,3 +61,4 @@ obj-$(CONFIG_CLEANCACHE) += cleancache.o
61 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o | 61 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o |
62 | obj-$(CONFIG_ZBUD) += zbud.o | 62 | obj-$(CONFIG_ZBUD) += zbud.o |
63 | obj-$(CONFIG_ZSMALLOC) += zsmalloc.o | 63 | obj-$(CONFIG_ZSMALLOC) += zsmalloc.o |
64 | obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o | ||
diff --git a/mm/compaction.c b/mm/compaction.c
index b6ab77160068..37f976287068 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
217 | /* Returns true if the page is within a block suitable for migration to */ | 217 | /* Returns true if the page is within a block suitable for migration to */ |
218 | static bool suitable_migration_target(struct page *page) | 218 | static bool suitable_migration_target(struct page *page) |
219 | { | 219 | { |
220 | int migratetype = get_pageblock_migratetype(page); | 220 | /* If the page is a large free page, then disallow migration */ |
221 | |||
222 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
223 | if (migratetype == MIGRATE_RESERVE) | ||
224 | return false; | ||
225 | |||
226 | if (is_migrate_isolate(migratetype)) | ||
227 | return false; | ||
228 | |||
229 | /* If the page is a large free page, then allow migration */ | ||
230 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | 221 | if (PageBuddy(page) && page_order(page) >= pageblock_order) |
231 | return true; | 222 | return false; |
232 | 223 | ||
233 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | 224 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ |
234 | if (migrate_async_suitable(migratetype)) | 225 | if (migrate_async_suitable(get_pageblock_migratetype(page))) |
235 | return true; | 226 | return true; |
236 | 227 | ||
237 | /* Otherwise skip the block */ | 228 | /* Otherwise skip the block */ |
@@ -253,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
253 | struct page *cursor, *valid_page = NULL; | 244 | struct page *cursor, *valid_page = NULL; |
254 | unsigned long flags; | 245 | unsigned long flags; |
255 | bool locked = false; | 246 | bool locked = false; |
247 | bool checked_pageblock = false; | ||
256 | 248 | ||
257 | cursor = pfn_to_page(blockpfn); | 249 | cursor = pfn_to_page(blockpfn); |
258 | 250 | ||
@@ -284,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
284 | break; | 276 | break; |
285 | 277 | ||
286 | /* Recheck this is a suitable migration target under lock */ | 278 | /* Recheck this is a suitable migration target under lock */ |
287 | if (!strict && !suitable_migration_target(page)) | 279 | if (!strict && !checked_pageblock) { |
288 | break; | 280 | /* |
281 | * We need to check suitability of pageblock only once | ||
282 | * and this isolate_freepages_block() is called with | ||
283 | * pageblock range, so just check once is sufficient. | ||
284 | */ | ||
285 | checked_pageblock = true; | ||
286 | if (!suitable_migration_target(page)) | ||
287 | break; | ||
288 | } | ||
289 | 289 | ||
290 | /* Recheck this is a buddy page under lock */ | 290 | /* Recheck this is a buddy page under lock */ |
291 | if (!PageBuddy(page)) | 291 | if (!PageBuddy(page)) |
@@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
460 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 460 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
461 | unsigned long nr_scanned = 0, nr_isolated = 0; | 461 | unsigned long nr_scanned = 0, nr_isolated = 0; |
462 | struct list_head *migratelist = &cc->migratepages; | 462 | struct list_head *migratelist = &cc->migratepages; |
463 | isolate_mode_t mode = 0; | ||
464 | struct lruvec *lruvec; | 463 | struct lruvec *lruvec; |
465 | unsigned long flags; | 464 | unsigned long flags; |
466 | bool locked = false; | 465 | bool locked = false; |
467 | struct page *page = NULL, *valid_page = NULL; | 466 | struct page *page = NULL, *valid_page = NULL; |
468 | bool skipped_async_unsuitable = false; | 467 | bool skipped_async_unsuitable = false; |
468 | const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | | ||
469 | (unevictable ? ISOLATE_UNEVICTABLE : 0); | ||
469 | 470 | ||
470 | /* | 471 | /* |
471 | * Ensure that there are not too many pages isolated from the LRU | 472 | * Ensure that there are not too many pages isolated from the LRU |
@@ -487,7 +488,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
487 | cond_resched(); | 488 | cond_resched(); |
488 | for (; low_pfn < end_pfn; low_pfn++) { | 489 | for (; low_pfn < end_pfn; low_pfn++) { |
489 | /* give a chance to irqs before checking need_resched() */ | 490 | /* give a chance to irqs before checking need_resched() */ |
490 | if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { | 491 | if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { |
491 | if (should_release_lock(&zone->lru_lock)) { | 492 | if (should_release_lock(&zone->lru_lock)) { |
492 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 493 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
493 | locked = false; | 494 | locked = false; |
@@ -526,8 +527,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
526 | 527 | ||
527 | /* If isolation recently failed, do not retry */ | 528 | /* If isolation recently failed, do not retry */ |
528 | pageblock_nr = low_pfn >> pageblock_order; | 529 | pageblock_nr = low_pfn >> pageblock_order; |
529 | if (!isolation_suitable(cc, page)) | 530 | if (last_pageblock_nr != pageblock_nr) { |
530 | goto next_pageblock; | 531 | int mt; |
532 | |||
533 | last_pageblock_nr = pageblock_nr; | ||
534 | if (!isolation_suitable(cc, page)) | ||
535 | goto next_pageblock; | ||
536 | |||
537 | /* | ||
538 | * For async migration, also only scan in MOVABLE | ||
539 | * blocks. Async migration is optimistic to see if | ||
540 | * the minimum amount of work satisfies the allocation | ||
541 | */ | ||
542 | mt = get_pageblock_migratetype(page); | ||
543 | if (!cc->sync && !migrate_async_suitable(mt)) { | ||
544 | cc->finished_update_migrate = true; | ||
545 | skipped_async_unsuitable = true; | ||
546 | goto next_pageblock; | ||
547 | } | ||
548 | } | ||
531 | 549 | ||
532 | /* | 550 | /* |
533 | * Skip if free. page_order cannot be used without zone->lock | 551 | * Skip if free. page_order cannot be used without zone->lock |
@@ -537,18 +555,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
537 | continue; | 555 | continue; |
538 | 556 | ||
539 | /* | 557 | /* |
540 | * For async migration, also only scan in MOVABLE blocks. Async | ||
541 | * migration is optimistic to see if the minimum amount of work | ||
542 | * satisfies the allocation | ||
543 | */ | ||
544 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | ||
545 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | ||
546 | cc->finished_update_migrate = true; | ||
547 | skipped_async_unsuitable = true; | ||
548 | goto next_pageblock; | ||
549 | } | ||
550 | |||
551 | /* | ||
552 | * Check may be lockless but that's ok as we recheck later. | 558 | * Check may be lockless but that's ok as we recheck later. |
553 | * It's possible to migrate LRU pages and balloon pages | 559 | * It's possible to migrate LRU pages and balloon pages |
554 | * Skip any other type of page | 560 | * Skip any other type of page |
@@ -557,11 +563,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
557 | if (unlikely(balloon_page_movable(page))) { | 563 | if (unlikely(balloon_page_movable(page))) { |
558 | if (locked && balloon_page_isolate(page)) { | 564 | if (locked && balloon_page_isolate(page)) { |
559 | /* Successfully isolated */ | 565 | /* Successfully isolated */ |
560 | cc->finished_update_migrate = true; | 566 | goto isolate_success; |
561 | list_add(&page->lru, migratelist); | ||
562 | cc->nr_migratepages++; | ||
563 | nr_isolated++; | ||
564 | goto check_compact_cluster; | ||
565 | } | 567 | } |
566 | } | 568 | } |
567 | continue; | 569 | continue; |
@@ -607,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
607 | continue; | 609 | continue; |
608 | } | 610 | } |
609 | 611 | ||
610 | if (!cc->sync) | ||
611 | mode |= ISOLATE_ASYNC_MIGRATE; | ||
612 | |||
613 | if (unevictable) | ||
614 | mode |= ISOLATE_UNEVICTABLE; | ||
615 | |||
616 | lruvec = mem_cgroup_page_lruvec(page, zone); | 612 | lruvec = mem_cgroup_page_lruvec(page, zone); |
617 | 613 | ||
618 | /* Try isolate the page */ | 614 | /* Try isolate the page */ |
@@ -622,13 +618,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
622 | VM_BUG_ON_PAGE(PageTransCompound(page), page); | 618 | VM_BUG_ON_PAGE(PageTransCompound(page), page); |
623 | 619 | ||
624 | /* Successfully isolated */ | 620 | /* Successfully isolated */ |
625 | cc->finished_update_migrate = true; | ||
626 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 621 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
622 | |||
623 | isolate_success: | ||
624 | cc->finished_update_migrate = true; | ||
627 | list_add(&page->lru, migratelist); | 625 | list_add(&page->lru, migratelist); |
628 | cc->nr_migratepages++; | 626 | cc->nr_migratepages++; |
629 | nr_isolated++; | 627 | nr_isolated++; |
630 | 628 | ||
631 | check_compact_cluster: | ||
632 | /* Avoid isolating too much */ | 629 | /* Avoid isolating too much */ |
633 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { | 630 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { |
634 | ++low_pfn; | 631 | ++low_pfn; |
@@ -639,7 +636,6 @@ check_compact_cluster:
639 | 636 | ||
640 | next_pageblock: | 637 | next_pageblock: |
641 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; | 638 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; |
642 | last_pageblock_nr = pageblock_nr; | ||
643 | } | 639 | } |
644 | 640 | ||
645 | acct_isolated(zone, locked, cc); | 641 | acct_isolated(zone, locked, cc); |
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
new file mode 100644
index 000000000000..e10ccd299d66
--- /dev/null
+++ b/mm/early_ioremap.c
@@ -0,0 +1,245 @@
1 | /* | ||
2 | * Provide common bits of early_ioremap() support for architectures needing | ||
3 | * temporary mappings during boot before ioremap() is available. | ||
4 | * | ||
5 | * This is mostly a direct copy of the x86 early_ioremap implementation. | ||
6 | * | ||
7 | * (C) Copyright 1995 1996, 2014 Linus Torvalds | ||
8 | * | ||
9 | */ | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/io.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/mm.h> | ||
16 | #include <linux/vmalloc.h> | ||
17 | #include <asm/fixmap.h> | ||
18 | |||
19 | #ifdef CONFIG_MMU | ||
20 | static int early_ioremap_debug __initdata; | ||
21 | |||
22 | static int __init early_ioremap_debug_setup(char *str) | ||
23 | { | ||
24 | early_ioremap_debug = 1; | ||
25 | |||
26 | return 0; | ||
27 | } | ||
28 | early_param("early_ioremap_debug", early_ioremap_debug_setup); | ||
29 | |||
30 | static int after_paging_init __initdata; | ||
31 | |||
32 | void __init __weak early_ioremap_shutdown(void) | ||
33 | { | ||
34 | } | ||
35 | |||
36 | void __init early_ioremap_reset(void) | ||
37 | { | ||
38 | early_ioremap_shutdown(); | ||
39 | after_paging_init = 1; | ||
40 | } | ||
41 | |||
42 | /* | ||
43 | * Generally, ioremap() is available after paging_init() has been called. | ||
44 | * Architectures wanting to allow early_ioremap after paging_init() can | ||
45 | * define __late_set_fixmap and __late_clear_fixmap to do the right thing. | ||
46 | */ | ||
47 | #ifndef __late_set_fixmap | ||
48 | static inline void __init __late_set_fixmap(enum fixed_addresses idx, | ||
49 | phys_addr_t phys, pgprot_t prot) | ||
50 | { | ||
51 | BUG(); | ||
52 | } | ||
53 | #endif | ||
54 | |||
55 | #ifndef __late_clear_fixmap | ||
56 | static inline void __init __late_clear_fixmap(enum fixed_addresses idx) | ||
57 | { | ||
58 | BUG(); | ||
59 | } | ||
60 | #endif | ||
61 | |||
62 | static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; | ||
63 | static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; | ||
64 | static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; | ||
65 | |||
66 | void __init early_ioremap_setup(void) | ||
67 | { | ||
68 | int i; | ||
69 | |||
70 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) | ||
71 | if (WARN_ON(prev_map[i])) | ||
72 | break; | ||
73 | |||
74 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) | ||
75 | slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); | ||
76 | } | ||
77 | |||
78 | static int __init check_early_ioremap_leak(void) | ||
79 | { | ||
80 | int count = 0; | ||
81 | int i; | ||
82 | |||
83 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) | ||
84 | if (prev_map[i]) | ||
85 | count++; | ||
86 | |||
87 | if (WARN(count, KERN_WARNING | ||
88 | "Debug warning: early ioremap leak of %d areas detected.\n" | ||
89 | "please boot with early_ioremap_debug and report the dmesg.\n", | ||
90 | count)) | ||
91 | return 1; | ||
92 | return 0; | ||
93 | } | ||
94 | late_initcall(check_early_ioremap_leak); | ||
95 | |||
96 | static void __init __iomem * | ||
97 | __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) | ||
98 | { | ||
99 | unsigned long offset; | ||
100 | resource_size_t last_addr; | ||
101 | unsigned int nrpages; | ||
102 | enum fixed_addresses idx; | ||
103 | int i, slot; | ||
104 | |||
105 | WARN_ON(system_state != SYSTEM_BOOTING); | ||
106 | |||
107 | slot = -1; | ||
108 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { | ||
109 | if (!prev_map[i]) { | ||
110 | slot = i; | ||
111 | break; | ||
112 | } | ||
113 | } | ||
114 | |||
115 | if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n", | ||
116 | __func__, (u64)phys_addr, size)) | ||
117 | return NULL; | ||
118 | |||
119 | /* Don't allow wraparound or zero size */ | ||
120 | last_addr = phys_addr + size - 1; | ||
121 | if (WARN_ON(!size || last_addr < phys_addr)) | ||
122 | return NULL; | ||
123 | |||
124 | prev_size[slot] = size; | ||
125 | /* | ||
126 | * Mappings have to be page-aligned | ||
127 | */ | ||
128 | offset = phys_addr & ~PAGE_MASK; | ||
129 | phys_addr &= PAGE_MASK; | ||
130 | size = PAGE_ALIGN(last_addr + 1) - phys_addr; | ||
131 | |||
132 | /* | ||
133 | * Mappings have to fit in the FIX_BTMAP area. | ||
134 | */ | ||
135 | nrpages = size >> PAGE_SHIFT; | ||
136 | if (WARN_ON(nrpages > NR_FIX_BTMAPS)) | ||
137 | return NULL; | ||
138 | |||
139 | /* | ||
140 | * Ok, go for it.. | ||
141 | */ | ||
142 | idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; | ||
143 | while (nrpages > 0) { | ||
144 | if (after_paging_init) | ||
145 | __late_set_fixmap(idx, phys_addr, prot); | ||
146 | else | ||
147 | __early_set_fixmap(idx, phys_addr, prot); | ||
148 | phys_addr += PAGE_SIZE; | ||
149 | --idx; | ||
150 | --nrpages; | ||
151 | } | ||
152 | WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n", | ||
153 | __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]); | ||
154 | |||
155 | prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); | ||
156 | return prev_map[slot]; | ||
157 | } | ||
158 | |||
159 | void __init early_iounmap(void __iomem *addr, unsigned long size) | ||
160 | { | ||
161 | unsigned long virt_addr; | ||
162 | unsigned long offset; | ||
163 | unsigned int nrpages; | ||
164 | enum fixed_addresses idx; | ||
165 | int i, slot; | ||
166 | |||
167 | slot = -1; | ||
168 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { | ||
169 | if (prev_map[i] == addr) { | ||
170 | slot = i; | ||
171 | break; | ||
172 | } | ||
173 | } | ||
174 | |||
175 | if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n", | ||
176 | addr, size)) | ||
177 | return; | ||
178 | |||
179 | if (WARN(prev_size[slot] != size, | ||
180 | "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", | ||
181 | addr, size, slot, prev_size[slot])) | ||
182 | return; | ||
183 | |||
184 | WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n", | ||
185 | addr, size, slot); | ||
186 | |||
187 | virt_addr = (unsigned long)addr; | ||
188 | if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) | ||
189 | return; | ||
190 | |||
191 | offset = virt_addr & ~PAGE_MASK; | ||
192 | nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; | ||
193 | |||
194 | idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; | ||
195 | while (nrpages > 0) { | ||
196 | if (after_paging_init) | ||
197 | __late_clear_fixmap(idx); | ||
198 | else | ||
199 | __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR); | ||
200 | --idx; | ||
201 | --nrpages; | ||
202 | } | ||
203 | prev_map[slot] = NULL; | ||
204 | } | ||
205 | |||
206 | /* Remap an IO device */ | ||
207 | void __init __iomem * | ||
208 | early_ioremap(resource_size_t phys_addr, unsigned long size) | ||
209 | { | ||
210 | return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO); | ||
211 | } | ||
212 | |||
213 | /* Remap memory */ | ||
214 | void __init * | ||
215 | early_memremap(resource_size_t phys_addr, unsigned long size) | ||
216 | { | ||
217 | return (__force void *)__early_ioremap(phys_addr, size, | ||
218 | FIXMAP_PAGE_NORMAL); | ||
219 | } | ||
220 | #else /* CONFIG_MMU */ | ||
221 | |||
222 | void __init __iomem * | ||
223 | early_ioremap(resource_size_t phys_addr, unsigned long size) | ||
224 | { | ||
225 | return (__force void __iomem *)phys_addr; | ||
226 | } | ||
227 | |||
228 | /* Remap memory */ | ||
229 | void __init * | ||
230 | early_memremap(resource_size_t phys_addr, unsigned long size) | ||
231 | { | ||
232 | return (void *)phys_addr; | ||
233 | } | ||
234 | |||
235 | void __init early_iounmap(void __iomem *addr, unsigned long size) | ||
236 | { | ||
237 | } | ||
238 | |||
239 | #endif /* CONFIG_MMU */ | ||
240 | |||
241 | |||
242 | void __init early_memunmap(void *addr, unsigned long size) | ||
243 | { | ||
244 | early_iounmap((__force void __iomem *)addr, size); | ||
245 | } | ||
diff --git a/mm/filemap.c b/mm/filemap.c
index 21781f1fe52b..27ebc0c9571b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
33 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 33 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
34 | #include <linux/memcontrol.h> | 34 | #include <linux/memcontrol.h> |
35 | #include <linux/cleancache.h> | 35 | #include <linux/cleancache.h> |
36 | #include <linux/rmap.h> | ||
36 | #include "internal.h" | 37 | #include "internal.h" |
37 | 38 | ||
38 | #define CREATE_TRACE_POINTS | 39 | #define CREATE_TRACE_POINTS |
@@ -562,7 +563,7 @@ static int __add_to_page_cache_locked(struct page *page,
562 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 563 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
563 | VM_BUG_ON_PAGE(PageSwapBacked(page), page); | 564 | VM_BUG_ON_PAGE(PageSwapBacked(page), page); |
564 | 565 | ||
565 | error = mem_cgroup_cache_charge(page, current->mm, | 566 | error = mem_cgroup_charge_file(page, current->mm, |
566 | gfp_mask & GFP_RECLAIM_MASK); | 567 | gfp_mask & GFP_RECLAIM_MASK); |
567 | if (error) | 568 | if (error) |
568 | return error; | 569 | return error; |
@@ -1952,11 +1953,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1952 | struct inode *inode = mapping->host; | 1953 | struct inode *inode = mapping->host; |
1953 | pgoff_t offset = vmf->pgoff; | 1954 | pgoff_t offset = vmf->pgoff; |
1954 | struct page *page; | 1955 | struct page *page; |
1955 | pgoff_t size; | 1956 | loff_t size; |
1956 | int ret = 0; | 1957 | int ret = 0; |
1957 | 1958 | ||
1958 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1959 | size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); |
1959 | if (offset >= size) | 1960 | if (offset >= size >> PAGE_CACHE_SHIFT) |
1960 | return VM_FAULT_SIGBUS; | 1961 | return VM_FAULT_SIGBUS; |
1961 | 1962 | ||
1962 | /* | 1963 | /* |
@@ -2005,8 +2006,8 @@ retry_find:
2005 | * Found the page and have a reference on it. | 2006 | * Found the page and have a reference on it. |
2006 | * We must recheck i_size under page lock. | 2007 | * We must recheck i_size under page lock. |
2007 | */ | 2008 | */ |
2008 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 2009 | size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); |
2009 | if (unlikely(offset >= size)) { | 2010 | if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) { |
2010 | unlock_page(page); | 2011 | unlock_page(page); |
2011 | page_cache_release(page); | 2012 | page_cache_release(page); |
2012 | return VM_FAULT_SIGBUS; | 2013 | return VM_FAULT_SIGBUS; |
@@ -2064,6 +2065,78 @@ page_not_uptodate:
2064 | } | 2065 | } |
2065 | EXPORT_SYMBOL(filemap_fault); | 2066 | EXPORT_SYMBOL(filemap_fault); |
2066 | 2067 | ||
2068 | void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
2069 | { | ||
2070 | struct radix_tree_iter iter; | ||
2071 | void **slot; | ||
2072 | struct file *file = vma->vm_file; | ||
2073 | struct address_space *mapping = file->f_mapping; | ||
2074 | loff_t size; | ||
2075 | struct page *page; | ||
2076 | unsigned long address = (unsigned long) vmf->virtual_address; | ||
2077 | unsigned long addr; | ||
2078 | pte_t *pte; | ||
2079 | |||
2080 | rcu_read_lock(); | ||
2081 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { | ||
2082 | if (iter.index > vmf->max_pgoff) | ||
2083 | break; | ||
2084 | repeat: | ||
2085 | page = radix_tree_deref_slot(slot); | ||
2086 | if (unlikely(!page)) | ||
2087 | goto next; | ||
2088 | if (radix_tree_exception(page)) { | ||
2089 | if (radix_tree_deref_retry(page)) | ||
2090 | break; | ||
2091 | else | ||
2092 | goto next; | ||
2093 | } | ||
2094 | |||
2095 | if (!page_cache_get_speculative(page)) | ||
2096 | goto repeat; | ||
2097 | |||
2098 | /* Has the page moved? */ | ||
2099 | if (unlikely(page != *slot)) { | ||
2100 | page_cache_release(page); | ||
2101 | goto repeat; | ||
2102 | } | ||
2103 | |||
2104 | if (!PageUptodate(page) || | ||
2105 | PageReadahead(page) || | ||
2106 | PageHWPoison(page)) | ||
2107 | goto skip; | ||
2108 | if (!trylock_page(page)) | ||
2109 | goto skip; | ||
2110 | |||
2111 | if (page->mapping != mapping || !PageUptodate(page)) | ||
2112 | goto unlock; | ||
2113 | |||
2114 | size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE); | ||
2115 | if (page->index >= size >> PAGE_CACHE_SHIFT) | ||
2116 | goto unlock; | ||
2117 | |||
2118 | pte = vmf->pte + page->index - vmf->pgoff; | ||
2119 | if (!pte_none(*pte)) | ||
2120 | goto unlock; | ||
2121 | |||
2122 | if (file->f_ra.mmap_miss > 0) | ||
2123 | file->f_ra.mmap_miss--; | ||
2124 | addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; | ||
2125 | do_set_pte(vma, addr, page, pte, false, false); | ||
2126 | unlock_page(page); | ||
2127 | goto next; | ||
2128 | unlock: | ||
2129 | unlock_page(page); | ||
2130 | skip: | ||
2131 | page_cache_release(page); | ||
2132 | next: | ||
2133 | if (iter.index == vmf->max_pgoff) | ||
2134 | break; | ||
2135 | } | ||
2136 | rcu_read_unlock(); | ||
2137 | } | ||
2138 | EXPORT_SYMBOL(filemap_map_pages); | ||
2139 | |||
2067 | int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | 2140 | int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) |
2068 | { | 2141 | { |
2069 | struct page *page = vmf->page; | 2142 | struct page *page = vmf->page; |
@@ -2093,6 +2166,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
2093 | 2166 | ||
2094 | const struct vm_operations_struct generic_file_vm_ops = { | 2167 | const struct vm_operations_struct generic_file_vm_ops = { |
2095 | .fault = filemap_fault, | 2168 | .fault = filemap_fault, |
2169 | .map_pages = filemap_map_pages, | ||
2096 | .page_mkwrite = filemap_page_mkwrite, | 2170 | .page_mkwrite = filemap_page_mkwrite, |
2097 | .remap_pages = generic_file_remap_pages, | 2171 | .remap_pages = generic_file_remap_pages, |
2098 | }; | 2172 | }; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6ac89e9f82ef..64635f5278ff 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
827 | count_vm_event(THP_FAULT_FALLBACK); | 827 | count_vm_event(THP_FAULT_FALLBACK); |
828 | return VM_FAULT_FALLBACK; | 828 | return VM_FAULT_FALLBACK; |
829 | } | 829 | } |
830 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { | 830 | if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { |
831 | put_page(page); | 831 | put_page(page); |
832 | count_vm_event(THP_FAULT_FALLBACK); | 832 | count_vm_event(THP_FAULT_FALLBACK); |
833 | return VM_FAULT_FALLBACK; | 833 | return VM_FAULT_FALLBACK; |
@@ -968,7 +968,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
968 | __GFP_OTHER_NODE, | 968 | __GFP_OTHER_NODE, |
969 | vma, address, page_to_nid(page)); | 969 | vma, address, page_to_nid(page)); |
970 | if (unlikely(!pages[i] || | 970 | if (unlikely(!pages[i] || |
971 | mem_cgroup_newpage_charge(pages[i], mm, | 971 | mem_cgroup_charge_anon(pages[i], mm, |
972 | GFP_KERNEL))) { | 972 | GFP_KERNEL))) { |
973 | if (pages[i]) | 973 | if (pages[i]) |
974 | put_page(pages[i]); | 974 | put_page(pages[i]); |
@@ -1101,7 +1101,7 @@ alloc:
1101 | goto out; | 1101 | goto out; |
1102 | } | 1102 | } |
1103 | 1103 | ||
1104 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1104 | if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { |
1105 | put_page(new_page); | 1105 | put_page(new_page); |
1106 | if (page) { | 1106 | if (page) { |
1107 | split_huge_page(page); | 1107 | split_huge_page(page); |
@@ -1891,17 +1891,22 @@ out:
1891 | int hugepage_madvise(struct vm_area_struct *vma, | 1891 | int hugepage_madvise(struct vm_area_struct *vma, |
1892 | unsigned long *vm_flags, int advice) | 1892 | unsigned long *vm_flags, int advice) |
1893 | { | 1893 | { |
1894 | struct mm_struct *mm = vma->vm_mm; | ||
1895 | |||
1896 | switch (advice) { | 1894 | switch (advice) { |
1897 | case MADV_HUGEPAGE: | 1895 | case MADV_HUGEPAGE: |
1896 | #ifdef CONFIG_S390 | ||
1897 | /* | ||
1898 | * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 | ||
1899 | * can't handle this properly after s390_enable_sie, so we simply | ||
1900 | * ignore the madvise to prevent qemu from causing a SIGSEGV. | ||
1901 | */ | ||
1902 | if (mm_has_pgste(vma->vm_mm)) | ||
1903 | return 0; | ||
1904 | #endif | ||
1898 | /* | 1905 | /* |
1899 | * Be somewhat over-protective like KSM for now! | 1906 | * Be somewhat over-protective like KSM for now! |
1900 | */ | 1907 | */ |
1901 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) | 1908 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) |
1902 | return -EINVAL; | 1909 | return -EINVAL; |
1903 | if (mm->def_flags & VM_NOHUGEPAGE) | ||
1904 | return -EINVAL; | ||
1905 | *vm_flags &= ~VM_NOHUGEPAGE; | 1910 | *vm_flags &= ~VM_NOHUGEPAGE; |
1906 | *vm_flags |= VM_HUGEPAGE; | 1911 | *vm_flags |= VM_HUGEPAGE; |
1907 | /* | 1912 | /* |
@@ -2354,7 +2359,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2354 | if (!new_page) | 2359 | if (!new_page) |
2355 | return; | 2360 | return; |
2356 | 2361 | ||
2357 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) | 2362 | if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) |
2358 | return; | 2363 | return; |
2359 | 2364 | ||
2360 | /* | 2365 | /* |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c02b9dadfb0..dd30f22b35e0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,6 +13,7 @@
13 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/mempolicy.h> | 15 | #include <linux/mempolicy.h> |
16 | #include <linux/compiler.h> | ||
16 | #include <linux/cpuset.h> | 17 | #include <linux/cpuset.h> |
17 | #include <linux/mutex.h> | 18 | #include <linux/mutex.h> |
18 | #include <linux/bootmem.h> | 19 | #include <linux/bootmem.h> |
@@ -1535,6 +1536,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1535 | while (min_count < persistent_huge_pages(h)) { | 1536 | while (min_count < persistent_huge_pages(h)) { |
1536 | if (!free_pool_huge_page(h, nodes_allowed, 0)) | 1537 | if (!free_pool_huge_page(h, nodes_allowed, 0)) |
1537 | break; | 1538 | break; |
1539 | cond_resched_lock(&hugetlb_lock); | ||
1538 | } | 1540 | } |
1539 | while (count < persistent_huge_pages(h)) { | 1541 | while (count < persistent_huge_pages(h)) { |
1540 | if (!adjust_pool_surplus(h, nodes_allowed, 1)) | 1542 | if (!adjust_pool_surplus(h, nodes_allowed, 1)) |
@@ -2690,7 +2692,8 @@ retry_avoidcopy:
2690 | BUG_ON(huge_pte_none(pte)); | 2692 | BUG_ON(huge_pte_none(pte)); |
2691 | spin_lock(ptl); | 2693 | spin_lock(ptl); |
2692 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2694 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2693 | if (likely(pte_same(huge_ptep_get(ptep), pte))) | 2695 | if (likely(ptep && |
2696 | pte_same(huge_ptep_get(ptep), pte))) | ||
2694 | goto retry_avoidcopy; | 2697 | goto retry_avoidcopy; |
2695 | /* | 2698 | /* |
2696 | * race occurs while re-acquiring page table | 2699 | * race occurs while re-acquiring page table |
@@ -2734,7 +2737,7 @@ retry_avoidcopy:
2734 | */ | 2737 | */ |
2735 | spin_lock(ptl); | 2738 | spin_lock(ptl); |
2736 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2739 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2737 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2740 | if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { |
2738 | ClearPagePrivate(new_page); | 2741 | ClearPagePrivate(new_page); |
2739 | 2742 | ||
2740 | /* Break COW */ | 2743 | /* Break COW */ |
@@ -2896,8 +2899,7 @@ retry:
2896 | if (anon_rmap) { | 2899 | if (anon_rmap) { |
2897 | ClearPagePrivate(page); | 2900 | ClearPagePrivate(page); |
2898 | hugepage_add_new_anon_rmap(page, vma, address); | 2901 | hugepage_add_new_anon_rmap(page, vma, address); |
2899 | } | 2902 | } else |
2900 | else | ||
2901 | page_dup_rmap(page); | 2903 | page_dup_rmap(page); |
2902 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) | 2904 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) |
2903 | && (vma->vm_flags & VM_SHARED))); | 2905 | && (vma->vm_flags & VM_SHARED))); |
@@ -3185,6 +3187,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3185 | BUG_ON(address >= end); | 3187 | BUG_ON(address >= end); |
3186 | flush_cache_range(vma, address, end); | 3188 | flush_cache_range(vma, address, end); |
3187 | 3189 | ||
3190 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
3188 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3191 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
3189 | for (; address < end; address += huge_page_size(h)) { | 3192 | for (; address < end; address += huge_page_size(h)) { |
3190 | spinlock_t *ptl; | 3193 | spinlock_t *ptl; |
@@ -3214,6 +3217,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3214 | */ | 3217 | */ |
3215 | flush_tlb_range(vma, start, end); | 3218 | flush_tlb_range(vma, start, end); |
3216 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3219 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
3220 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
3217 | 3221 | ||
3218 | return pages << h->order; | 3222 | return pages << h->order; |
3219 | } | 3223 | } |
@@ -3518,7 +3522,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
3518 | #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | 3522 | #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ |
3519 | 3523 | ||
3520 | /* Can be overriden by architectures */ | 3524 | /* Can be overriden by architectures */ |
3521 | __attribute__((weak)) struct page * | 3525 | struct page * __weak |
3522 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | 3526 | follow_huge_pud(struct mm_struct *mm, unsigned long address, |
3523 | pud_t *pud, int write) | 3527 | pud_t *pud, int write) |
3524 | { | 3528 | { |
diff --git a/mm/internal.h b/mm/internal.h
index 29e1e761f9eb..07b67361a40a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
11 | #ifndef __MM_INTERNAL_H | 11 | #ifndef __MM_INTERNAL_H |
12 | #define __MM_INTERNAL_H | 12 | #define __MM_INTERNAL_H |
13 | 13 | ||
14 | #include <linux/fs.h> | ||
14 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
15 | 16 | ||
16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 17 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v)
21 | atomic_set(&page->_count, v); | 22 | atomic_set(&page->_count, v); |
22 | } | 23 | } |
23 | 24 | ||
25 | extern int __do_page_cache_readahead(struct address_space *mapping, | ||
26 | struct file *filp, pgoff_t offset, unsigned long nr_to_read, | ||
27 | unsigned long lookahead_size); | ||
28 | |||
29 | /* | ||
30 | * Submit IO for the read-ahead request in file_ra_state. | ||
31 | */ | ||
32 | static inline unsigned long ra_submit(struct file_ra_state *ra, | ||
33 | struct address_space *mapping, struct file *filp) | ||
34 | { | ||
35 | return __do_page_cache_readahead(mapping, filp, | ||
36 | ra->start, ra->size, ra->async_size); | ||
37 | } | ||
38 | |||
24 | /* | 39 | /* |
25 | * Turn a non-refcounted page (->_count == 0) into refcounted with | 40 | * Turn a non-refcounted page (->_count == 0) into refcounted with |
26 | * a count of one. | 41 | * a count of one. |
@@ -370,5 +385,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
370 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 385 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
371 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 386 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
372 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ | 387 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ |
388 | #define ALLOC_FAIR 0x100 /* fair zone allocation */ | ||
373 | 389 | ||
374 | #endif /* __MM_INTERNAL_H */ | 390 | #endif /* __MM_INTERNAL_H */ |
diff --git a/mm/memblock.c b/mm/memblock.c
index 7fe5354e7552..e9d6ca9a01a9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1253,7 +1253,7 @@ phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
1253 | pages += end_pfn - start_pfn; | 1253 | pages += end_pfn - start_pfn; |
1254 | } | 1254 | } |
1255 | 1255 | ||
1256 | return (phys_addr_t)pages << PAGE_SHIFT; | 1256 | return PFN_PHYS(pages); |
1257 | } | 1257 | } |
1258 | 1258 | ||
1259 | /* lowest address */ | 1259 | /* lowest address */ |
@@ -1271,16 +1271,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
1271 | 1271 | ||
1272 | void __init memblock_enforce_memory_limit(phys_addr_t limit) | 1272 | void __init memblock_enforce_memory_limit(phys_addr_t limit) |
1273 | { | 1273 | { |
1274 | unsigned long i; | ||
1275 | phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; | 1274 | phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; |
1275 | struct memblock_region *r; | ||
1276 | 1276 | ||
1277 | if (!limit) | 1277 | if (!limit) |
1278 | return; | 1278 | return; |
1279 | 1279 | ||
1280 | /* find out max address */ | 1280 | /* find out max address */ |
1281 | for (i = 0; i < memblock.memory.cnt; i++) { | 1281 | for_each_memblock(memory, r) { |
1282 | struct memblock_region *r = &memblock.memory.regions[i]; | ||
1283 | |||
1284 | if (limit <= r->size) { | 1282 | if (limit <= r->size) { |
1285 | max_addr = r->base + limit; | 1283 | max_addr = r->base + limit; |
1286 | break; | 1284 | break; |
@@ -1326,7 +1324,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
1326 | unsigned long *start_pfn, unsigned long *end_pfn) | 1324 | unsigned long *start_pfn, unsigned long *end_pfn) |
1327 | { | 1325 | { |
1328 | struct memblock_type *type = &memblock.memory; | 1326 | struct memblock_type *type = &memblock.memory; |
1329 | int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); | 1327 | int mid = memblock_search(type, PFN_PHYS(pfn)); |
1330 | 1328 | ||
1331 | if (mid == -1) | 1329 | if (mid == -1) |
1332 | return -1; | 1330 | return -1; |
@@ -1379,13 +1377,12 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
1379 | 1377 | ||
1380 | void __init_memblock memblock_trim_memory(phys_addr_t align) | 1378 | void __init_memblock memblock_trim_memory(phys_addr_t align) |
1381 | { | 1379 | { |
1382 | int i; | ||
1383 | phys_addr_t start, end, orig_start, orig_end; | 1380 | phys_addr_t start, end, orig_start, orig_end; |
1384 | struct memblock_type *mem = &memblock.memory; | 1381 | struct memblock_region *r; |
1385 | 1382 | ||
1386 | for (i = 0; i < mem->cnt; i++) { | 1383 | for_each_memblock(memory, r) { |
1387 | orig_start = mem->regions[i].base; | 1384 | orig_start = r->base; |
1388 | orig_end = mem->regions[i].base + mem->regions[i].size; | 1385 | orig_end = r->base + r->size; |
1389 | start = round_up(orig_start, align); | 1386 | start = round_up(orig_start, align); |
1390 | end = round_down(orig_end, align); | 1387 | end = round_down(orig_end, align); |
1391 | 1388 | ||
@@ -1393,11 +1390,12 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
1393 | continue; | 1390 | continue; |
1394 | 1391 | ||
1395 | if (start < end) { | 1392 | if (start < end) { |
1396 | mem->regions[i].base = start; | 1393 | r->base = start; |
1397 | mem->regions[i].size = end - start; | 1394 | r->size = end - start; |
1398 | } else { | 1395 | } else { |
1399 | memblock_remove_region(mem, i); | 1396 | memblock_remove_region(&memblock.memory, |
1400 | i--; | 1397 | r - memblock.memory.regions); |
1398 | r--; | ||
1401 | } | 1399 | } |
1402 | } | 1400 | } |
1403 | } | 1401 | } |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dcc8153a1681..29501f040568 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -921,8 +921,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
921 | struct page *page, | 921 | struct page *page, |
922 | bool anon, int nr_pages) | 922 | bool anon, int nr_pages) |
923 | { | 923 | { |
924 | preempt_disable(); | ||
925 | |||
926 | /* | 924 | /* |
927 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is | 925 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is |
928 | * counted as CACHE even if it's on ANON LRU. | 926 | * counted as CACHE even if it's on ANON LRU. |
@@ -947,8 +945,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
947 | } | 945 | } |
948 | 946 | ||
949 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); | 947 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); |
950 | |||
951 | preempt_enable(); | ||
952 | } | 948 | } |
953 | 949 | ||
954 | unsigned long | 950 | unsigned long |
@@ -1075,22 +1071,15 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1075 | return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); | 1071 | return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); |
1076 | } | 1072 | } |
1077 | 1073 | ||
1078 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 1074 | static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) |
1079 | { | 1075 | { |
1080 | struct mem_cgroup *memcg = NULL; | 1076 | struct mem_cgroup *memcg = NULL; |
1081 | 1077 | ||
1082 | if (!mm) | ||
1083 | return NULL; | ||
1084 | /* | ||
1085 | * Because we have no locks, mm->owner's may be being moved to other | ||
1086 | * cgroup. We use css_tryget() here even if this looks | ||
1087 | * pessimistic (rather than adding locks here). | ||
1088 | */ | ||
1089 | rcu_read_lock(); | 1078 | rcu_read_lock(); |
1090 | do { | 1079 | do { |
1091 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1080 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
1092 | if (unlikely(!memcg)) | 1081 | if (unlikely(!memcg)) |
1093 | break; | 1082 | memcg = root_mem_cgroup; |
1094 | } while (!css_tryget(&memcg->css)); | 1083 | } while (!css_tryget(&memcg->css)); |
1095 | rcu_read_unlock(); | 1084 | rcu_read_unlock(); |
1096 | return memcg; | 1085 | return memcg; |
@@ -1486,7 +1475,7 @@ bool task_in_mem_cgroup(struct task_struct *task,
1486 | 1475 | ||
1487 | p = find_lock_task_mm(task); | 1476 | p = find_lock_task_mm(task); |
1488 | if (p) { | 1477 | if (p) { |
1489 | curr = try_get_mem_cgroup_from_mm(p->mm); | 1478 | curr = get_mem_cgroup_from_mm(p->mm); |
1490 | task_unlock(p); | 1479 | task_unlock(p); |
1491 | } else { | 1480 | } else { |
1492 | /* | 1481 | /* |
@@ -1500,8 +1489,6 @@ bool task_in_mem_cgroup(struct task_struct *task,
1500 | css_get(&curr->css); | 1489 | css_get(&curr->css); |
1501 | rcu_read_unlock(); | 1490 | rcu_read_unlock(); |
1502 | } | 1491 | } |
1503 | if (!curr) | ||
1504 | return false; | ||
1505 | /* | 1492 | /* |
1506 | * We should check use_hierarchy of "memcg" not "curr". Because checking | 1493 | * We should check use_hierarchy of "memcg" not "curr". Because checking |
1507 | * use_hierarchy of "curr" here make this function true if hierarchy is | 1494 | * use_hierarchy of "curr" here make this function true if hierarchy is |
@@ -2588,7 +2575,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2588 | } | 2575 | } |
2589 | 2576 | ||
2590 | 2577 | ||
2591 | /* See __mem_cgroup_try_charge() for details */ | 2578 | /* See mem_cgroup_try_charge() for details */ |
2592 | enum { | 2579 | enum { |
2593 | CHARGE_OK, /* success */ | 2580 | CHARGE_OK, /* success */ |
2594 | CHARGE_RETRY, /* need to retry but retry is not bad */ | 2581 | CHARGE_RETRY, /* need to retry but retry is not bad */ |
@@ -2661,45 +2648,34 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2661 | return CHARGE_NOMEM; | 2648 | return CHARGE_NOMEM; |
2662 | } | 2649 | } |
2663 | 2650 | ||
2664 | /* | 2651 | /** |
2665 | * __mem_cgroup_try_charge() does | 2652 | * mem_cgroup_try_charge - try charging a memcg |
2666 | * 1. detect memcg to be charged against from passed *mm and *ptr, | 2653 | * @memcg: memcg to charge |
2667 | * 2. update res_counter | 2654 | * @nr_pages: number of pages to charge |
2668 | * 3. call memory reclaim if necessary. | 2655 | * @oom: trigger OOM if reclaim fails |
2669 | * | ||
2670 | * In some special case, if the task is fatal, fatal_signal_pending() or | ||
2671 | * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup | ||
2672 | * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon | ||
2673 | * as possible without any hazards. 2: all pages should have a valid | ||
2674 | * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg | ||
2675 | * pointer, that is treated as a charge to root_mem_cgroup. | ||
2676 | * | ||
2677 | * So __mem_cgroup_try_charge() will return | ||
2678 | * 0 ... on success, filling *ptr with a valid memcg pointer. | ||
2679 | * -ENOMEM ... charge failure because of resource limits. | ||
2680 | * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. | ||
2681 | * | 2656 | * |
2682 | * Unlike the exported interface, an "oom" parameter is added. if oom==true, | 2657 | * Returns 0 if @memcg was charged successfully, -EINTR if the charge |
2683 | * the oom-killer can be invoked. | 2658 | * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. |
2684 | */ | 2659 | */ |
2685 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 2660 | static int mem_cgroup_try_charge(struct mem_cgroup *memcg, |
2686 | gfp_t gfp_mask, | 2661 | gfp_t gfp_mask, |
2687 | unsigned int nr_pages, | 2662 | unsigned int nr_pages, |
2688 | struct mem_cgroup **ptr, | 2663 | bool oom) |
2689 | bool oom) | ||
2690 | { | 2664 | { |
2691 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | 2665 | unsigned int batch = max(CHARGE_BATCH, nr_pages); |
2692 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2666 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2693 | struct mem_cgroup *memcg = NULL; | ||
2694 | int ret; | 2667 | int ret; |
2695 | 2668 | ||
2669 | if (mem_cgroup_is_root(memcg)) | ||
2670 | goto done; | ||
2696 | /* | 2671 | /* |
2697 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage | 2672 | * Unlike in global OOM situations, memcg is not in a physical |
2698 | * in system level. So, allow to go ahead dying process in addition to | 2673 | * memory shortage. Allow dying and OOM-killed tasks to |
2699 | * MEMDIE process. | 2674 | * bypass the last charges so that they can exit quickly and |
2675 | * free their memory. | ||
2700 | */ | 2676 | */ |
2701 | if (unlikely(test_thread_flag(TIF_MEMDIE) | 2677 | if (unlikely(test_thread_flag(TIF_MEMDIE) || |
2702 | || fatal_signal_pending(current))) | 2678 | fatal_signal_pending(current))) |
2703 | goto bypass; | 2679 | goto bypass; |
2704 | 2680 | ||
2705 | if (unlikely(task_in_memcg_oom(current))) | 2681 | if (unlikely(task_in_memcg_oom(current))) |
@@ -2707,73 +2683,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2707 | 2683 | ||
2708 | if (gfp_mask & __GFP_NOFAIL) | 2684 | if (gfp_mask & __GFP_NOFAIL) |
2709 | oom = false; | 2685 | oom = false; |
2710 | |||
2711 | /* | ||
2712 | * We always charge the cgroup the mm_struct belongs to. | ||
2713 | * The mm_struct's mem_cgroup changes on task migration if the | ||
2714 | * thread group leader migrates. It's possible that mm is not | ||
2715 | * set, if so charge the root memcg (happens for pagecache usage). | ||
2716 | */ | ||
2717 | if (!*ptr && !mm) | ||
2718 | *ptr = root_mem_cgroup; | ||
2719 | again: | 2686 | again: |
2720 | if (*ptr) { /* css should be a valid one */ | 2687 | if (consume_stock(memcg, nr_pages)) |
2721 | memcg = *ptr; | 2688 | goto done; |
2722 | if (mem_cgroup_is_root(memcg)) | ||
2723 | goto done; | ||
2724 | if (consume_stock(memcg, nr_pages)) | ||
2725 | goto done; | ||
2726 | css_get(&memcg->css); | ||
2727 | } else { | ||
2728 | struct task_struct *p; | ||
2729 | |||
2730 | rcu_read_lock(); | ||
2731 | p = rcu_dereference(mm->owner); | ||
2732 | /* | ||
2733 | * Because we don't have task_lock(), "p" can exit. | ||
2734 | * In that case, "memcg" can point to root or p can be NULL with | ||
2735 | * race with swapoff. Then, we have small risk of mis-accouning. | ||
2736 | * But such kind of mis-account by race always happens because | ||
2737 | * we don't have cgroup_mutex(). It's overkill and we allo that | ||
2738 | * small race, here. | ||
2739 | * (*) swapoff at el will charge against mm-struct not against | ||
2740 | * task-struct. So, mm->owner can be NULL. | ||
2741 | */ | ||
2742 | memcg = mem_cgroup_from_task(p); | ||
2743 | if (!memcg) | ||
2744 | memcg = root_mem_cgroup; | ||
2745 | if (mem_cgroup_is_root(memcg)) { | ||
2746 | rcu_read_unlock(); | ||
2747 | goto done; | ||
2748 | } | ||
2749 | if (consume_stock(memcg, nr_pages)) { | ||
2750 | /* | ||
2751 | * It seems dagerous to access memcg without css_get(). | ||
2752 | * But considering how consume_stok works, it's not | ||
2753 | * necessary. If consume_stock success, some charges | ||
2754 | * from this memcg are cached on this cpu. So, we | ||
2755 | * don't need to call css_get()/css_tryget() before | ||
2756 | * calling consume_stock(). | ||
2757 | */ | ||
2758 | rcu_read_unlock(); | ||
2759 | goto done; | ||
2760 | } | ||
2761 | /* after here, we may be blocked. we need to get refcnt */ | ||
2762 | if (!css_tryget(&memcg->css)) { | ||
2763 | rcu_read_unlock(); | ||
2764 | goto again; | ||
2765 | } | ||
2766 | rcu_read_unlock(); | ||
2767 | } | ||
2768 | 2689 | ||
2769 | do { | 2690 | do { |
2770 | bool invoke_oom = oom && !nr_oom_retries; | 2691 | bool invoke_oom = oom && !nr_oom_retries; |
2771 | 2692 | ||
2772 | /* If killed, bypass charge */ | 2693 | /* If killed, bypass charge */ |
2773 | if (fatal_signal_pending(current)) { | 2694 | if (fatal_signal_pending(current)) |
2774 | css_put(&memcg->css); | ||
2775 | goto bypass; | 2695 | goto bypass; |
2776 | } | ||
2777 | 2696 | ||
2778 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, | 2697 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, |
2779 | nr_pages, invoke_oom); | 2698 | nr_pages, invoke_oom); |
@@ -2782,17 +2701,12 @@ again:
2782 | break; | 2701 | break; |
2783 | case CHARGE_RETRY: /* not in OOM situation but retry */ | 2702 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
2784 | batch = nr_pages; | 2703 | batch = nr_pages; |
2785 | css_put(&memcg->css); | ||
2786 | memcg = NULL; | ||
2787 | goto again; | 2704 | goto again; |
2788 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ | 2705 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ |
2789 | css_put(&memcg->css); | ||
2790 | goto nomem; | 2706 | goto nomem; |
2791 | case CHARGE_NOMEM: /* OOM routine works */ | 2707 | case CHARGE_NOMEM: /* OOM routine works */ |
2792 | if (!oom || invoke_oom) { | 2708 | if (!oom || invoke_oom) |
2793 | css_put(&memcg->css); | ||
2794 | goto nomem; | 2709 | goto nomem; |
2795 | } | ||
2796 | nr_oom_retries--; | 2710 | nr_oom_retries--; |
2797 | break; | 2711 | break; |
2798 | } | 2712 | } |
@@ -2800,20 +2714,44 @@ again:
2800 | 2714 | ||
2801 | if (batch > nr_pages) | 2715 | if (batch > nr_pages) |
2802 | refill_stock(memcg, batch - nr_pages); | 2716 | refill_stock(memcg, batch - nr_pages); |
2803 | css_put(&memcg->css); | ||
2804 | done: | 2717 | done: |
2805 | *ptr = memcg; | ||
2806 | return 0; | 2718 | return 0; |
2807 | nomem: | 2719 | nomem: |
2808 | if (!(gfp_mask & __GFP_NOFAIL)) { | 2720 | if (!(gfp_mask & __GFP_NOFAIL)) |
2809 | *ptr = NULL; | ||
2810 | return -ENOMEM; | 2721 | return -ENOMEM; |
2811 | } | ||
2812 | bypass: | 2722 | bypass: |
2813 | *ptr = root_mem_cgroup; | ||
2814 | return -EINTR; | 2723 | return -EINTR; |
2815 | } | 2724 | } |
2816 | 2725 | ||
2726 | /** | ||
2727 | * mem_cgroup_try_charge_mm - try charging a mm | ||
2728 | * @mm: mm_struct to charge | ||
2729 | * @nr_pages: number of pages to charge | ||
2730 | * @oom: trigger OOM if reclaim fails | ||
2731 | * | ||
2732 | * Returns the charged mem_cgroup associated with the given mm_struct or | ||
2733 | * NULL the charge failed. | ||
2734 | */ | ||
2735 | static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, | ||
2736 | gfp_t gfp_mask, | ||
2737 | unsigned int nr_pages, | ||
2738 | bool oom) | ||
2739 | |||
2740 | { | ||
2741 | struct mem_cgroup *memcg; | ||
2742 | int ret; | ||
2743 | |||
2744 | memcg = get_mem_cgroup_from_mm(mm); | ||
2745 | ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); | ||
2746 | css_put(&memcg->css); | ||
2747 | if (ret == -EINTR) | ||
2748 | memcg = root_mem_cgroup; | ||
2749 | else if (ret) | ||
2750 | memcg = NULL; | ||
2751 | |||
2752 | return memcg; | ||
2753 | } | ||
2754 | |||
2817 | /* | 2755 | /* |
2818 | * Somemtimes we have to undo a charge we got by try_charge(). | 2756 | * Somemtimes we have to undo a charge we got by try_charge(). |
2819 | * This function is for that and do uncharge, put css's refcnt. | 2757 | * This function is for that and do uncharge, put css's refcnt. |
@@ -3009,20 +2947,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
3009 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | 2947 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) |
3010 | { | 2948 | { |
3011 | struct res_counter *fail_res; | 2949 | struct res_counter *fail_res; |
3012 | struct mem_cgroup *_memcg; | ||
3013 | int ret = 0; | 2950 | int ret = 0; |
3014 | 2951 | ||
3015 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); | 2952 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); |
3016 | if (ret) | 2953 | if (ret) |
3017 | return ret; | 2954 | return ret; |
3018 | 2955 | ||
3019 | _memcg = memcg; | 2956 | ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, |
3020 | ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, | 2957 | oom_gfp_allowed(gfp)); |
3021 | &_memcg, oom_gfp_allowed(gfp)); | ||
3022 | |||
3023 | if (ret == -EINTR) { | 2958 | if (ret == -EINTR) { |
3024 | /* | 2959 | /* |
3025 | * __mem_cgroup_try_charge() chosed to bypass to root due to | 2960 | * mem_cgroup_try_charge() chosed to bypass to root due to |
3026 | * OOM kill or fatal signal. Since our only options are to | 2961 | * OOM kill or fatal signal. Since our only options are to |
3027 | * either fail the allocation or charge it to this cgroup, do | 2962 | * either fail the allocation or charge it to this cgroup, do |
3028 | * it as a temporary condition. But we can't fail. From a | 2963 | * it as a temporary condition. But we can't fail. From a |
@@ -3032,7 +2967,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
3032 | * | 2967 | * |
3033 | * This condition will only trigger if the task entered | 2968 | * This condition will only trigger if the task entered |
3034 | * memcg_charge_kmem in a sane state, but was OOM-killed during | 2969 | * memcg_charge_kmem in a sane state, but was OOM-killed during |
3035 | * __mem_cgroup_try_charge() above. Tasks that were already | 2970 | * mem_cgroup_try_charge() above. Tasks that were already |
3036 | * dying when the allocation triggers should have been already | 2971 | * dying when the allocation triggers should have been already |
3037 | * directed to the root cgroup in memcontrol.h | 2972 | * directed to the root cgroup in memcontrol.h |
3038 | */ | 2973 | */ |
@@ -3159,6 +3094,29 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3159 | return 0; | 3094 | return 0; |
3160 | } | 3095 | } |
3161 | 3096 | ||
3097 | char *memcg_create_cache_name(struct mem_cgroup *memcg, | ||
3098 | struct kmem_cache *root_cache) | ||
3099 | { | ||
3100 | static char *buf = NULL; | ||
3101 | |||
3102 | /* | ||
3103 | * We need a mutex here to protect the shared buffer. Since this is | ||
3104 | * expected to be called only on cache creation, we can employ the | ||
3105 | * slab_mutex for that purpose. | ||
3106 | */ | ||
3107 | lockdep_assert_held(&slab_mutex); | ||
3108 | |||
3109 | if (!buf) { | ||
3110 | buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
3111 | if (!buf) | ||
3112 | return NULL; | ||
3113 | } | ||
3114 | |||
3115 | cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); | ||
3116 | return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, | ||
3117 | memcg_cache_id(memcg), buf); | ||
3118 | } | ||
3119 | |||
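For reference, the name built above has the form "<root cache>(<memcg id>:<cgroup name>)". A self-contained user-space sketch of that formatting, with asprintf() standing in for kasprintf() and purely illustrative inputs:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

static char *make_cache_name(const char *root_name, int id, const char *cgroup)
{
        char *name;

        /* same "%s(%d:%s)" layout as memcg_create_cache_name() above */
        if (asprintf(&name, "%s(%d:%s)", root_name, id, cgroup) < 0)
                return NULL;
        return name;
}

int main(void)
{
        char *name = make_cache_name("dentry", 3, "background");

        if (name) {
                puts(name);     /* dentry(3:background) */
                free(name);
        }
        return 0;
}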
3162 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | 3120 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, |
3163 | struct kmem_cache *root_cache) | 3121 | struct kmem_cache *root_cache) |
3164 | { | 3122 | { |
@@ -3182,6 +3140,7 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | |||
3182 | s->memcg_params->root_cache = root_cache; | 3140 | s->memcg_params->root_cache = root_cache; |
3183 | INIT_WORK(&s->memcg_params->destroy, | 3141 | INIT_WORK(&s->memcg_params->destroy, |
3184 | kmem_cache_destroy_work_func); | 3142 | kmem_cache_destroy_work_func); |
3143 | css_get(&memcg->css); | ||
3185 | } else | 3144 | } else |
3186 | s->memcg_params->is_root_cache = true; | 3145 | s->memcg_params->is_root_cache = true; |
3187 | 3146 | ||
@@ -3190,6 +3149,10 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | |||
3190 | 3149 | ||
3191 | void memcg_free_cache_params(struct kmem_cache *s) | 3150 | void memcg_free_cache_params(struct kmem_cache *s) |
3192 | { | 3151 | { |
3152 | if (!s->memcg_params) | ||
3153 | return; | ||
3154 | if (!s->memcg_params->is_root_cache) | ||
3155 | css_put(&s->memcg_params->memcg->css); | ||
3193 | kfree(s->memcg_params); | 3156 | kfree(s->memcg_params); |
3194 | } | 3157 | } |
3195 | 3158 | ||
@@ -3212,9 +3175,6 @@ void memcg_register_cache(struct kmem_cache *s) | |||
3212 | memcg = s->memcg_params->memcg; | 3175 | memcg = s->memcg_params->memcg; |
3213 | id = memcg_cache_id(memcg); | 3176 | id = memcg_cache_id(memcg); |
3214 | 3177 | ||
3215 | css_get(&memcg->css); | ||
3216 | |||
3217 | |||
3218 | /* | 3178 | /* |
3219 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | 3179 | * Since readers won't lock (see cache_from_memcg_idx()), we need a |
3220 | * barrier here to ensure nobody will see the kmem_cache partially | 3180 | * barrier here to ensure nobody will see the kmem_cache partially |
@@ -3263,10 +3223,8 @@ void memcg_unregister_cache(struct kmem_cache *s) | |||
3263 | * after removing it from the memcg_slab_caches list, otherwise we can | 3223 | * after removing it from the memcg_slab_caches list, otherwise we can |
3264 | * fail to convert memcg_params_to_cache() while traversing the list. | 3224 | * fail to convert memcg_params_to_cache() while traversing the list. |
3265 | */ | 3225 | */ |
3266 | VM_BUG_ON(!root->memcg_params->memcg_caches[id]); | 3226 | VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); |
3267 | root->memcg_params->memcg_caches[id] = NULL; | 3227 | root->memcg_params->memcg_caches[id] = NULL; |
3268 | |||
3269 | css_put(&memcg->css); | ||
3270 | } | 3228 | } |
3271 | 3229 | ||
3272 | /* | 3230 | /* |
@@ -3363,55 +3321,10 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | |||
3363 | schedule_work(&cachep->memcg_params->destroy); | 3321 | schedule_work(&cachep->memcg_params->destroy); |
3364 | } | 3322 | } |
3365 | 3323 | ||
3366 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | 3324 | int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) |
3367 | struct kmem_cache *s) | ||
3368 | { | ||
3369 | struct kmem_cache *new = NULL; | ||
3370 | static char *tmp_path = NULL, *tmp_name = NULL; | ||
3371 | static DEFINE_MUTEX(mutex); /* protects tmp_name */ | ||
3372 | |||
3373 | BUG_ON(!memcg_can_account_kmem(memcg)); | ||
3374 | |||
3375 | mutex_lock(&mutex); | ||
3376 | /* | ||
3377 | * kmem_cache_create_memcg duplicates the given name and | ||
3378 | * cgroup_name for this name requires RCU context. | ||
3379 | * This static temporary buffer is used to prevent from | ||
3380 | * pointless shortliving allocation. | ||
3381 | */ | ||
3382 | if (!tmp_path || !tmp_name) { | ||
3383 | if (!tmp_path) | ||
3384 | tmp_path = kmalloc(PATH_MAX, GFP_KERNEL); | ||
3385 | if (!tmp_name) | ||
3386 | tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
3387 | if (!tmp_path || !tmp_name) | ||
3388 | goto out; | ||
3389 | } | ||
3390 | |||
3391 | cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1); | ||
3392 | snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name, | ||
3393 | memcg_cache_id(memcg), tmp_name); | ||
3394 | |||
3395 | new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align, | ||
3396 | (s->flags & ~SLAB_PANIC), s->ctor, s); | ||
3397 | if (new) | ||
3398 | new->allocflags |= __GFP_KMEMCG; | ||
3399 | else | ||
3400 | new = s; | ||
3401 | out: | ||
3402 | mutex_unlock(&mutex); | ||
3403 | return new; | ||
3404 | } | ||
3405 | |||
3406 | void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | ||
3407 | { | 3325 | { |
3408 | struct kmem_cache *c; | 3326 | struct kmem_cache *c; |
3409 | int i; | 3327 | int i, failed = 0; |
3410 | |||
3411 | if (!s->memcg_params) | ||
3412 | return; | ||
3413 | if (!s->memcg_params->is_root_cache) | ||
3414 | return; | ||
3415 | 3328 | ||
3416 | /* | 3329 | /* |
3417 | * If the cache is being destroyed, we trust that there is no one else | 3330 | * If the cache is being destroyed, we trust that there is no one else |
@@ -3445,16 +3358,14 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
3445 | c->memcg_params->dead = false; | 3358 | c->memcg_params->dead = false; |
3446 | cancel_work_sync(&c->memcg_params->destroy); | 3359 | cancel_work_sync(&c->memcg_params->destroy); |
3447 | kmem_cache_destroy(c); | 3360 | kmem_cache_destroy(c); |
3361 | |||
3362 | if (cache_from_memcg_idx(s, i)) | ||
3363 | failed++; | ||
3448 | } | 3364 | } |
3449 | mutex_unlock(&activate_kmem_mutex); | 3365 | mutex_unlock(&activate_kmem_mutex); |
3366 | return failed; | ||
3450 | } | 3367 | } |
3451 | 3368 | ||
3452 | struct create_work { | ||
3453 | struct mem_cgroup *memcg; | ||
3454 | struct kmem_cache *cachep; | ||
3455 | struct work_struct work; | ||
3456 | }; | ||
3457 | |||
3458 | static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | 3369 | static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) |
3459 | { | 3370 | { |
3460 | struct kmem_cache *cachep; | 3371 | struct kmem_cache *cachep; |
@@ -3472,13 +3383,20 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | |||
3472 | mutex_unlock(&memcg->slab_caches_mutex); | 3383 | mutex_unlock(&memcg->slab_caches_mutex); |
3473 | } | 3384 | } |
3474 | 3385 | ||
3386 | struct create_work { | ||
3387 | struct mem_cgroup *memcg; | ||
3388 | struct kmem_cache *cachep; | ||
3389 | struct work_struct work; | ||
3390 | }; | ||
3391 | |||
3475 | static void memcg_create_cache_work_func(struct work_struct *w) | 3392 | static void memcg_create_cache_work_func(struct work_struct *w) |
3476 | { | 3393 | { |
3477 | struct create_work *cw; | 3394 | struct create_work *cw = container_of(w, struct create_work, work); |
3395 | struct mem_cgroup *memcg = cw->memcg; | ||
3396 | struct kmem_cache *cachep = cw->cachep; | ||
3478 | 3397 | ||
3479 | cw = container_of(w, struct create_work, work); | 3398 | kmem_cache_create_memcg(memcg, cachep); |
3480 | memcg_create_kmem_cache(cw->memcg, cw->cachep); | 3399 | css_put(&memcg->css); |
3481 | css_put(&cw->memcg->css); | ||
3482 | kfree(cw); | 3400 | kfree(cw); |
3483 | } | 3401 | } |
3484 | 3402 | ||
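The reworked worker above recovers its request from the embedded work member via container_of(). A minimal user-space illustration of that pattern — the workqueue machinery is not reproduced, the callback is simply invoked directly:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct work { void (*fn)(struct work *); };

struct create_request {
        const char *what;
        struct work work;       /* embedded, like struct create_work above */
};

static void create_fn(struct work *w)
{
        struct create_request *req = container_of(w, struct create_request, work);

        printf("creating %s\n", req->what);
}

int main(void)
{
        struct create_request req = { .what = "cache", .work = { .fn = create_fn } };

        req.work.fn(&req.work); /* a real workqueue would call this later */
        return 0;
}

The embedding is what makes a single work pointer enough to reach both the memcg and the cache once the worker runs.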
@@ -3637,15 +3555,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3637 | if (!current->mm || current->memcg_kmem_skip_account) | 3555 | if (!current->mm || current->memcg_kmem_skip_account) |
3638 | return true; | 3556 | return true; |
3639 | 3557 | ||
3640 | memcg = try_get_mem_cgroup_from_mm(current->mm); | 3558 | memcg = get_mem_cgroup_from_mm(current->mm); |
3641 | |||
3642 | /* | ||
3643 | * very rare case described in mem_cgroup_from_task. Unfortunately there | ||
3644 | * isn't much we can do without complicating this too much, and it would | ||
3645 | * be gfp-dependent anyway. Just let it go | ||
3646 | */ | ||
3647 | if (unlikely(!memcg)) | ||
3648 | return true; | ||
3649 | 3559 | ||
3650 | if (!memcg_can_account_kmem(memcg)) { | 3560 | if (!memcg_can_account_kmem(memcg)) { |
3651 | css_put(&memcg->css); | 3561 | css_put(&memcg->css); |
@@ -3748,19 +3658,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
3748 | } | 3658 | } |
3749 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 3659 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
3750 | 3660 | ||
3751 | static inline | ||
3752 | void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, | ||
3753 | struct mem_cgroup *to, | ||
3754 | unsigned int nr_pages, | ||
3755 | enum mem_cgroup_stat_index idx) | ||
3756 | { | ||
3757 | /* Update stat data for mem_cgroup */ | ||
3758 | preempt_disable(); | ||
3759 | __this_cpu_sub(from->stat->count[idx], nr_pages); | ||
3760 | __this_cpu_add(to->stat->count[idx], nr_pages); | ||
3761 | preempt_enable(); | ||
3762 | } | ||
3763 | |||
3764 | /** | 3661 | /** |
3765 | * mem_cgroup_move_account - move account of the page | 3662 | * mem_cgroup_move_account - move account of the page |
3766 | * @page: the page | 3663 | * @page: the page |
@@ -3806,13 +3703,19 @@ static int mem_cgroup_move_account(struct page *page, | |||
3806 | 3703 | ||
3807 | move_lock_mem_cgroup(from, &flags); | 3704 | move_lock_mem_cgroup(from, &flags); |
3808 | 3705 | ||
3809 | if (!anon && page_mapped(page)) | 3706 | if (!anon && page_mapped(page)) { |
3810 | mem_cgroup_move_account_page_stat(from, to, nr_pages, | 3707 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], |
3811 | MEM_CGROUP_STAT_FILE_MAPPED); | 3708 | nr_pages); |
3709 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
3710 | nr_pages); | ||
3711 | } | ||
3812 | 3712 | ||
3813 | if (PageWriteback(page)) | 3713 | if (PageWriteback(page)) { |
3814 | mem_cgroup_move_account_page_stat(from, to, nr_pages, | 3714 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], |
3815 | MEM_CGROUP_STAT_WRITEBACK); | 3715 | nr_pages); |
3716 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
3717 | nr_pages); | ||
3718 | } | ||
3816 | 3719 | ||
3817 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); | 3720 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); |
3818 | 3721 | ||
@@ -3898,19 +3801,19 @@ out: | |||
3898 | return ret; | 3801 | return ret; |
3899 | } | 3802 | } |
3900 | 3803 | ||
3901 | /* | 3804 | int mem_cgroup_charge_anon(struct page *page, |
3902 | * Charge the memory controller for page usage. | 3805 | struct mm_struct *mm, gfp_t gfp_mask) |
3903 | * Return | ||
3904 | * 0 if the charge was successful | ||
3905 | * < 0 if the cgroup is over its limit | ||
3906 | */ | ||
3907 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | ||
3908 | gfp_t gfp_mask, enum charge_type ctype) | ||
3909 | { | 3806 | { |
3910 | struct mem_cgroup *memcg = NULL; | ||
3911 | unsigned int nr_pages = 1; | 3807 | unsigned int nr_pages = 1; |
3808 | struct mem_cgroup *memcg; | ||
3912 | bool oom = true; | 3809 | bool oom = true; |
3913 | int ret; | 3810 | |
3811 | if (mem_cgroup_disabled()) | ||
3812 | return 0; | ||
3813 | |||
3814 | VM_BUG_ON_PAGE(page_mapped(page), page); | ||
3815 | VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); | ||
3816 | VM_BUG_ON(!mm); | ||
3914 | 3817 | ||
3915 | if (PageTransHuge(page)) { | 3818 | if (PageTransHuge(page)) { |
3916 | nr_pages <<= compound_order(page); | 3819 | nr_pages <<= compound_order(page); |
@@ -3922,25 +3825,14 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
3922 | oom = false; | 3825 | oom = false; |
3923 | } | 3826 | } |
3924 | 3827 | ||
3925 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); | 3828 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); |
3926 | if (ret == -ENOMEM) | 3829 | if (!memcg) |
3927 | return ret; | 3830 | return -ENOMEM; |
3928 | __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); | 3831 | __mem_cgroup_commit_charge(memcg, page, nr_pages, |
3832 | MEM_CGROUP_CHARGE_TYPE_ANON, false); | ||
3929 | return 0; | 3833 | return 0; |
3930 | } | 3834 | } |
3931 | 3835 | ||
3932 | int mem_cgroup_newpage_charge(struct page *page, | ||
3933 | struct mm_struct *mm, gfp_t gfp_mask) | ||
3934 | { | ||
3935 | if (mem_cgroup_disabled()) | ||
3936 | return 0; | ||
3937 | VM_BUG_ON_PAGE(page_mapped(page), page); | ||
3938 | VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); | ||
3939 | VM_BUG_ON(!mm); | ||
3940 | return mem_cgroup_charge_common(page, mm, gfp_mask, | ||
3941 | MEM_CGROUP_CHARGE_TYPE_ANON); | ||
3942 | } | ||
3943 | |||
3944 | /* | 3836 | /* |
3945 | * While swap-in, try_charge -> commit or cancel, the page is locked. | 3837 | * While swap-in, try_charge -> commit or cancel, the page is locked. |
3946 | * And when try_charge() successfully returns, one refcnt to memcg without | 3838 | * And when try_charge() successfully returns, one refcnt to memcg without |
@@ -3952,7 +3844,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
3952 | gfp_t mask, | 3844 | gfp_t mask, |
3953 | struct mem_cgroup **memcgp) | 3845 | struct mem_cgroup **memcgp) |
3954 | { | 3846 | { |
3955 | struct mem_cgroup *memcg; | 3847 | struct mem_cgroup *memcg = NULL; |
3956 | struct page_cgroup *pc; | 3848 | struct page_cgroup *pc; |
3957 | int ret; | 3849 | int ret; |
3958 | 3850 | ||
@@ -3965,31 +3857,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
3965 | * in turn serializes uncharging. | 3857 | * in turn serializes uncharging. |
3966 | */ | 3858 | */ |
3967 | if (PageCgroupUsed(pc)) | 3859 | if (PageCgroupUsed(pc)) |
3968 | return 0; | 3860 | goto out; |
3969 | if (!do_swap_account) | 3861 | if (do_swap_account) |
3970 | goto charge_cur_mm; | 3862 | memcg = try_get_mem_cgroup_from_page(page); |
3971 | memcg = try_get_mem_cgroup_from_page(page); | ||
3972 | if (!memcg) | 3863 | if (!memcg) |
3973 | goto charge_cur_mm; | 3864 | memcg = get_mem_cgroup_from_mm(mm); |
3974 | *memcgp = memcg; | 3865 | ret = mem_cgroup_try_charge(memcg, mask, 1, true); |
3975 | ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); | ||
3976 | css_put(&memcg->css); | 3866 | css_put(&memcg->css); |
3977 | if (ret == -EINTR) | 3867 | if (ret == -EINTR) |
3978 | ret = 0; | 3868 | memcg = root_mem_cgroup; |
3979 | return ret; | 3869 | else if (ret) |
3980 | charge_cur_mm: | 3870 | return ret; |
3981 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); | 3871 | out: |
3982 | if (ret == -EINTR) | 3872 | *memcgp = memcg; |
3983 | ret = 0; | 3873 | return 0; |
3984 | return ret; | ||
3985 | } | 3874 | } |
3986 | 3875 | ||
3987 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, | 3876 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, |
3988 | gfp_t gfp_mask, struct mem_cgroup **memcgp) | 3877 | gfp_t gfp_mask, struct mem_cgroup **memcgp) |
3989 | { | 3878 | { |
3990 | *memcgp = NULL; | 3879 | if (mem_cgroup_disabled()) { |
3991 | if (mem_cgroup_disabled()) | 3880 | *memcgp = NULL; |
3992 | return 0; | 3881 | return 0; |
3882 | } | ||
3993 | /* | 3883 | /* |
3994 | * A racing thread's fault, or swapoff, may have already | 3884 | * A racing thread's fault, or swapoff, may have already |
3995 | * updated the pte, and even removed page from swap cache: in | 3885 | * updated the pte, and even removed page from swap cache: in |
@@ -3997,12 +3887,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, | |||
3997 | * there's also a KSM case which does need to charge the page. | 3887 | * there's also a KSM case which does need to charge the page. |
3998 | */ | 3888 | */ |
3999 | if (!PageSwapCache(page)) { | 3889 | if (!PageSwapCache(page)) { |
4000 | int ret; | 3890 | struct mem_cgroup *memcg; |
4001 | 3891 | ||
4002 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); | 3892 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); |
4003 | if (ret == -EINTR) | 3893 | if (!memcg) |
4004 | ret = 0; | 3894 | return -ENOMEM; |
4005 | return ret; | 3895 | *memcgp = memcg; |
3896 | return 0; | ||
4006 | } | 3897 | } |
4007 | return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); | 3898 | return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); |
4008 | } | 3899 | } |
@@ -4046,11 +3937,11 @@ void mem_cgroup_commit_charge_swapin(struct page *page, | |||
4046 | MEM_CGROUP_CHARGE_TYPE_ANON); | 3937 | MEM_CGROUP_CHARGE_TYPE_ANON); |
4047 | } | 3938 | } |
4048 | 3939 | ||
4049 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 3940 | int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, |
4050 | gfp_t gfp_mask) | 3941 | gfp_t gfp_mask) |
4051 | { | 3942 | { |
4052 | struct mem_cgroup *memcg = NULL; | ||
4053 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3943 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3944 | struct mem_cgroup *memcg; | ||
4054 | int ret; | 3945 | int ret; |
4055 | 3946 | ||
4056 | if (mem_cgroup_disabled()) | 3947 | if (mem_cgroup_disabled()) |
@@ -4058,15 +3949,28 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
4058 | if (PageCompound(page)) | 3949 | if (PageCompound(page)) |
4059 | return 0; | 3950 | return 0; |
4060 | 3951 | ||
4061 | if (!PageSwapCache(page)) | 3952 | if (PageSwapCache(page)) { /* shmem */ |
4062 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
4063 | else { /* page is swapcache/shmem */ | ||
4064 | ret = __mem_cgroup_try_charge_swapin(mm, page, | 3953 | ret = __mem_cgroup_try_charge_swapin(mm, page, |
4065 | gfp_mask, &memcg); | 3954 | gfp_mask, &memcg); |
4066 | if (!ret) | 3955 | if (ret) |
4067 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | 3956 | return ret; |
3957 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
3958 | return 0; | ||
4068 | } | 3959 | } |
4069 | return ret; | 3960 | |
3961 | /* | ||
3962 | * Page cache insertions can happen without an actual mm | ||
3963 | * context, e.g. during disk probing on boot. | ||
3964 | */ | ||
3965 | if (unlikely(!mm)) | ||
3966 | memcg = root_mem_cgroup; | ||
3967 | else { | ||
3968 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); | ||
3969 | if (!memcg) | ||
3970 | return -ENOMEM; | ||
3971 | } | ||
3972 | __mem_cgroup_commit_charge(memcg, page, 1, type, false); | ||
3973 | return 0; | ||
4070 | } | 3974 | } |
4071 | 3975 | ||
4072 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, | 3976 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, |
@@ -6678,8 +6582,7 @@ one_by_one: | |||
6678 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 6582 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
6679 | cond_resched(); | 6583 | cond_resched(); |
6680 | } | 6584 | } |
6681 | ret = __mem_cgroup_try_charge(NULL, | 6585 | ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); |
6682 | GFP_KERNEL, 1, &memcg, false); | ||
6683 | if (ret) | 6586 | if (ret) |
6684 | /* mem_cgroup_clear_mc() will do uncharge later */ | 6587 | /* mem_cgroup_clear_mc() will do uncharge later */ |
6685 | return ret; | 6588 | return ret; |
diff --git a/mm/memory.c b/mm/memory.c index 82c1e4cf00d1..d0f0bef3be48 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -60,6 +60,7 @@ | |||
60 | #include <linux/migrate.h> | 60 | #include <linux/migrate.h> |
61 | #include <linux/string.h> | 61 | #include <linux/string.h> |
62 | #include <linux/dma-debug.h> | 62 | #include <linux/dma-debug.h> |
63 | #include <linux/debugfs.h> | ||
63 | 64 | ||
64 | #include <asm/io.h> | 65 | #include <asm/io.h> |
65 | #include <asm/pgalloc.h> | 66 | #include <asm/pgalloc.h> |
@@ -1320,9 +1321,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1320 | * It is undesirable to test vma->vm_file as it | 1321 | * It is undesirable to test vma->vm_file as it |
1321 | * should be non-null for valid hugetlb area. | 1322 | * should be non-null for valid hugetlb area. |
1322 | * However, vm_file will be NULL in the error | 1323 | * However, vm_file will be NULL in the error |
1323 | * cleanup path of do_mmap_pgoff. When | 1324 | * cleanup path of mmap_region. When |
1324 | * hugetlbfs ->mmap method fails, | 1325 | * hugetlbfs ->mmap method fails, |
1325 | * do_mmap_pgoff() nullifies vma->vm_file | 1326 | * mmap_region() nullifies vma->vm_file |
1326 | * before calling this function to clean up. | 1327 | * before calling this function to clean up. |
1327 | * Since no pte has actually been setup, it is | 1328 | * Since no pte has actually been setup, it is |
1328 | * safe to do nothing in this case. | 1329 | * safe to do nothing in this case. |
@@ -2781,7 +2782,7 @@ reuse: | |||
2781 | */ | 2782 | */ |
2782 | if (!page_mkwrite) { | 2783 | if (!page_mkwrite) { |
2783 | wait_on_page_locked(dirty_page); | 2784 | wait_on_page_locked(dirty_page); |
2784 | set_page_dirty_balance(dirty_page, page_mkwrite); | 2785 | set_page_dirty_balance(dirty_page); |
2785 | /* file_update_time outside page_lock */ | 2786 | /* file_update_time outside page_lock */ |
2786 | if (vma->vm_file) | 2787 | if (vma->vm_file) |
2787 | file_update_time(vma->vm_file); | 2788 | file_update_time(vma->vm_file); |
@@ -2827,7 +2828,7 @@ gotten: | |||
2827 | } | 2828 | } |
2828 | __SetPageUptodate(new_page); | 2829 | __SetPageUptodate(new_page); |
2829 | 2830 | ||
2830 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) | 2831 | if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) |
2831 | goto oom_free_new; | 2832 | goto oom_free_new; |
2832 | 2833 | ||
2833 | mmun_start = address & PAGE_MASK; | 2834 | mmun_start = address & PAGE_MASK; |
@@ -3280,7 +3281,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3280 | */ | 3281 | */ |
3281 | __SetPageUptodate(page); | 3282 | __SetPageUptodate(page); |
3282 | 3283 | ||
3283 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) | 3284 | if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) |
3284 | goto oom_free_page; | 3285 | goto oom_free_page; |
3285 | 3286 | ||
3286 | entry = mk_pte(page, vma->vm_page_prot); | 3287 | entry = mk_pte(page, vma->vm_page_prot); |
@@ -3342,7 +3343,22 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, | |||
3342 | return ret; | 3343 | return ret; |
3343 | } | 3344 | } |
3344 | 3345 | ||
3345 | static void do_set_pte(struct vm_area_struct *vma, unsigned long address, | 3346 | /** |
3347 | * do_set_pte - setup new PTE entry for given page and add reverse page mapping. | ||
3348 | * | ||
3349 | * @vma: virtual memory area | ||
3350 | * @address: user virtual address | ||
3351 | * @page: page to map | ||
3352 | * @pte: pointer to target page table entry | ||
3353 | * @write: true if the new entry is writable | ||
3354 | * @anon: true if it's an anonymous page | ||
3355 | * | ||
3356 | * Caller must hold page table lock relevant for @pte. | ||
3357 | * | ||
3358 | * Target users are the page fault handler itself and implementations of | ||
3359 | * vm_ops->map_pages. | ||
3360 | */ | ||
3361 | void do_set_pte(struct vm_area_struct *vma, unsigned long address, | ||
3346 | struct page *page, pte_t *pte, bool write, bool anon) | 3362 | struct page *page, pte_t *pte, bool write, bool anon) |
3347 | { | 3363 | { |
3348 | pte_t entry; | 3364 | pte_t entry; |
@@ -3366,6 +3382,105 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
3366 | update_mmu_cache(vma, address, pte); | 3382 | update_mmu_cache(vma, address, pte); |
3367 | } | 3383 | } |
3368 | 3384 | ||
3385 | #define FAULT_AROUND_ORDER 4 | ||
3386 | |||
3387 | #ifdef CONFIG_DEBUG_FS | ||
3388 | static unsigned int fault_around_order = FAULT_AROUND_ORDER; | ||
3389 | |||
3390 | static int fault_around_order_get(void *data, u64 *val) | ||
3391 | { | ||
3392 | *val = fault_around_order; | ||
3393 | return 0; | ||
3394 | } | ||
3395 | |||
3396 | static int fault_around_order_set(void *data, u64 val) | ||
3397 | { | ||
3398 | BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); | ||
3399 | if (1UL << val > PTRS_PER_PTE) | ||
3400 | return -EINVAL; | ||
3401 | fault_around_order = val; | ||
3402 | return 0; | ||
3403 | } | ||
3404 | DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, | ||
3405 | fault_around_order_get, fault_around_order_set, "%llu\n"); | ||
3406 | |||
3407 | static int __init fault_around_debugfs(void) | ||
3408 | { | ||
3409 | void *ret; | ||
3410 | |||
3411 | ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, | ||
3412 | &fault_around_order_fops); | ||
3413 | if (!ret) | ||
3414 | pr_warn("Failed to create fault_around_order in debugfs\n"); | ||
3415 | return 0; | ||
3416 | } | ||
3417 | late_initcall(fault_around_debugfs); | ||
3418 | |||
3419 | static inline unsigned long fault_around_pages(void) | ||
3420 | { | ||
3421 | return 1UL << fault_around_order; | ||
3422 | } | ||
3423 | |||
3424 | static inline unsigned long fault_around_mask(void) | ||
3425 | { | ||
3426 | return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); | ||
3427 | } | ||
3428 | #else | ||
3429 | static inline unsigned long fault_around_pages(void) | ||
3430 | { | ||
3431 | unsigned long nr_pages; | ||
3432 | |||
3433 | nr_pages = 1UL << FAULT_AROUND_ORDER; | ||
3434 | BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); | ||
3435 | return nr_pages; | ||
3436 | } | ||
3437 | |||
3438 | static inline unsigned long fault_around_mask(void) | ||
3439 | { | ||
3440 | return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); | ||
3441 | } | ||
3442 | #endif | ||
3443 | |||
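Assuming 4K pages (PAGE_SHIFT == 12) and the default FAULT_AROUND_ORDER of 4, the helpers above describe a 16-page window aligned to a 64KB boundary. A stand-alone program to sanity-check that arithmetic:

#include <stdio.h>

#define PAGE_SHIFT 12
#define FAULT_AROUND_ORDER 4

int main(void)
{
        unsigned long pages = 1UL << FAULT_AROUND_ORDER;
        unsigned long mask  = ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
        unsigned long addr  = 0x7f1234567890UL;

        printf("fault_around_pages(): %lu\n", pages);           /* 16 */
        printf("fault_around_mask():  %#lx\n", mask);           /* ...ffff0000 */
        printf("aligned start:        %#lx\n", addr & mask);    /* 0x7f1234560000 */
        return 0;
}

The debugfs setter above enforces the matching bound at run time: an order whose page count would exceed PTRS_PER_PTE is rejected with -EINVAL.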
3444 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | ||
3445 | pte_t *pte, pgoff_t pgoff, unsigned int flags) | ||
3446 | { | ||
3447 | unsigned long start_addr; | ||
3448 | pgoff_t max_pgoff; | ||
3449 | struct vm_fault vmf; | ||
3450 | int off; | ||
3451 | |||
3452 | start_addr = max(address & fault_around_mask(), vma->vm_start); | ||
3453 | off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | ||
3454 | pte -= off; | ||
3455 | pgoff -= off; | ||
3456 | |||
3457 | /* | ||
3458 | * max_pgoff is either the end of the page table or the end of the vma, | ||
3459 | * or fault_around_pages() from pgoff, depending on which is nearest. | ||
3460 | */ | ||
3461 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | ||
3462 | PTRS_PER_PTE - 1; | ||
3463 | max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, | ||
3464 | pgoff + fault_around_pages() - 1); | ||
3465 | |||
3466 | /* Check if it makes any sense to call ->map_pages */ | ||
3467 | while (!pte_none(*pte)) { | ||
3468 | if (++pgoff > max_pgoff) | ||
3469 | return; | ||
3470 | start_addr += PAGE_SIZE; | ||
3471 | if (start_addr >= vma->vm_end) | ||
3472 | return; | ||
3473 | pte++; | ||
3474 | } | ||
3475 | |||
3476 | vmf.virtual_address = (void __user *) start_addr; | ||
3477 | vmf.pte = pte; | ||
3478 | vmf.pgoff = pgoff; | ||
3479 | vmf.max_pgoff = max_pgoff; | ||
3480 | vmf.flags = flags; | ||
3481 | vma->vm_ops->map_pages(vma, &vmf); | ||
3482 | } | ||
3483 | |||
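do_fault_around() clamps its window to whichever limit is nearest: the end of the page table, the end of the vma, or fault_around_pages() past the faulting offset. The sketch below re-computes that clamping in user space under assumed values (4K pages, 512 PTEs per page table, order-4 fault-around); it mirrors only the arithmetic, not the PTE scan or the vm_fault setup:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PTRS_PER_PTE    512UL
#define FAULT_AROUND_ORDER 4

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a < b ? a : b;
        return m < c ? m : c;
}

int main(void)
{
        unsigned long vm_start = 0x12340000UL, vm_pgoff = 0, vma_pages = 256;
        unsigned long address = 0x12346000UL;
        unsigned long pgoff = vm_pgoff + ((address - vm_start) >> PAGE_SHIFT);
        unsigned long mask = ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
        unsigned long start_addr = address & mask;
        unsigned long off, max_pgoff;

        if (start_addr < vm_start)
                start_addr = vm_start;
        off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
        pgoff -= off;

        /* unsigned wrap-around cancels out, as in the kernel expression */
        max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
                        + PTRS_PER_PTE - 1;
        max_pgoff = min3ul(max_pgoff,
                           vma_pages + vm_pgoff - 1,
                           pgoff + (1UL << FAULT_AROUND_ORDER) - 1);

        printf("start_addr=%#lx pgoff=%lu max_pgoff=%lu\n",
               start_addr, pgoff, max_pgoff);
        return 0;
}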
3369 | static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3484 | static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3370 | unsigned long address, pmd_t *pmd, | 3485 | unsigned long address, pmd_t *pmd, |
3371 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | 3486 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) |
@@ -3373,7 +3488,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3373 | struct page *fault_page; | 3488 | struct page *fault_page; |
3374 | spinlock_t *ptl; | 3489 | spinlock_t *ptl; |
3375 | pte_t *pte; | 3490 | pte_t *pte; |
3376 | int ret; | 3491 | int ret = 0; |
3492 | |||
3493 | /* | ||
3494 | * Let's call ->map_pages() first and use ->fault() as fallback | ||
3495 | * if the page at this offset is not ready to be mapped (cold page | ||
3496 | * cache or something like that). | ||
3497 | */ | ||
3498 | if (vma->vm_ops->map_pages) { | ||
3499 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
3500 | do_fault_around(vma, address, pte, pgoff, flags); | ||
3501 | if (!pte_same(*pte, orig_pte)) | ||
3502 | goto unlock_out; | ||
3503 | pte_unmap_unlock(pte, ptl); | ||
3504 | } | ||
3377 | 3505 | ||
3378 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 3506 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); |
3379 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3507 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
@@ -3387,8 +3515,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3387 | return ret; | 3515 | return ret; |
3388 | } | 3516 | } |
3389 | do_set_pte(vma, address, fault_page, pte, false, false); | 3517 | do_set_pte(vma, address, fault_page, pte, false, false); |
3390 | pte_unmap_unlock(pte, ptl); | ||
3391 | unlock_page(fault_page); | 3518 | unlock_page(fault_page); |
3519 | unlock_out: | ||
3520 | pte_unmap_unlock(pte, ptl); | ||
3392 | return ret; | 3521 | return ret; |
3393 | } | 3522 | } |
3394 | 3523 | ||
@@ -3408,7 +3537,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3408 | if (!new_page) | 3537 | if (!new_page) |
3409 | return VM_FAULT_OOM; | 3538 | return VM_FAULT_OOM; |
3410 | 3539 | ||
3411 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) { | 3540 | if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { |
3412 | page_cache_release(new_page); | 3541 | page_cache_release(new_page); |
3413 | return VM_FAULT_OOM; | 3542 | return VM_FAULT_OOM; |
3414 | } | 3543 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e3ab02822799..78e1472933ea 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -795,36 +795,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
795 | return err; | 795 | return err; |
796 | } | 796 | } |
797 | 797 | ||
798 | /* | ||
799 | * Update task->flags PF_MEMPOLICY bit: set iff non-default | ||
800 | * mempolicy. Allows more rapid checking of this (combined perhaps | ||
801 | * with other PF_* flag bits) on memory allocation hot code paths. | ||
802 | * | ||
803 | * If called from outside this file, the task 'p' should -only- be | ||
804 | * a newly forked child not yet visible on the task list, because | ||
805 | * manipulating the task flags of a visible task is not safe. | ||
806 | * | ||
807 | * The above limitation is why this routine has the funny name | ||
808 | * mpol_fix_fork_child_flag(). | ||
809 | * | ||
810 | * It is also safe to call this with a task pointer of current, | ||
811 | * which the static wrapper mpol_set_task_struct_flag() does, | ||
812 | * for use within this file. | ||
813 | */ | ||
814 | |||
815 | void mpol_fix_fork_child_flag(struct task_struct *p) | ||
816 | { | ||
817 | if (p->mempolicy) | ||
818 | p->flags |= PF_MEMPOLICY; | ||
819 | else | ||
820 | p->flags &= ~PF_MEMPOLICY; | ||
821 | } | ||
822 | |||
823 | static void mpol_set_task_struct_flag(void) | ||
824 | { | ||
825 | mpol_fix_fork_child_flag(current); | ||
826 | } | ||
827 | |||
828 | /* Set the process memory policy */ | 798 | /* Set the process memory policy */ |
829 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, | 799 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, |
830 | nodemask_t *nodes) | 800 | nodemask_t *nodes) |
@@ -861,7 +831,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
861 | } | 831 | } |
862 | old = current->mempolicy; | 832 | old = current->mempolicy; |
863 | current->mempolicy = new; | 833 | current->mempolicy = new; |
864 | mpol_set_task_struct_flag(); | ||
865 | if (new && new->mode == MPOL_INTERLEAVE && | 834 | if (new && new->mode == MPOL_INTERLEAVE && |
866 | nodes_weight(new->v.nodes)) | 835 | nodes_weight(new->v.nodes)) |
867 | current->il_next = first_node(new->v.nodes); | 836 | current->il_next = first_node(new->v.nodes); |
@@ -1782,21 +1751,18 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
1782 | /* | 1751 | /* |
1783 | * Depending on the memory policy provide a node from which to allocate the | 1752 | * Depending on the memory policy provide a node from which to allocate the |
1784 | * next slab entry. | 1753 | * next slab entry. |
1785 | * @policy must be protected by freeing by the caller. If @policy is | ||
1786 | * the current task's mempolicy, this protection is implicit, as only the | ||
1787 | * task can change it's policy. The system default policy requires no | ||
1788 | * such protection. | ||
1789 | */ | 1754 | */ |
1790 | unsigned slab_node(void) | 1755 | unsigned int mempolicy_slab_node(void) |
1791 | { | 1756 | { |
1792 | struct mempolicy *policy; | 1757 | struct mempolicy *policy; |
1758 | int node = numa_mem_id(); | ||
1793 | 1759 | ||
1794 | if (in_interrupt()) | 1760 | if (in_interrupt()) |
1795 | return numa_node_id(); | 1761 | return node; |
1796 | 1762 | ||
1797 | policy = current->mempolicy; | 1763 | policy = current->mempolicy; |
1798 | if (!policy || policy->flags & MPOL_F_LOCAL) | 1764 | if (!policy || policy->flags & MPOL_F_LOCAL) |
1799 | return numa_node_id(); | 1765 | return node; |
1800 | 1766 | ||
1801 | switch (policy->mode) { | 1767 | switch (policy->mode) { |
1802 | case MPOL_PREFERRED: | 1768 | case MPOL_PREFERRED: |
@@ -1816,11 +1782,11 @@ unsigned slab_node(void) | |||
1816 | struct zonelist *zonelist; | 1782 | struct zonelist *zonelist; |
1817 | struct zone *zone; | 1783 | struct zone *zone; |
1818 | enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); | 1784 | enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); |
1819 | zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; | 1785 | zonelist = &NODE_DATA(node)->node_zonelists[0]; |
1820 | (void)first_zones_zonelist(zonelist, highest_zoneidx, | 1786 | (void)first_zones_zonelist(zonelist, highest_zoneidx, |
1821 | &policy->v.nodes, | 1787 | &policy->v.nodes, |
1822 | &zone); | 1788 | &zone); |
1823 | return zone ? zone->node : numa_node_id(); | 1789 | return zone ? zone->node : node; |
1824 | } | 1790 | } |
1825 | 1791 | ||
1826 | default: | 1792 | default: |
diff --git a/mm/mempool.c b/mm/mempool.c index 659aa42bad16..905434f18c97 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -304,9 +304,9 @@ void mempool_free(void *element, mempool_t *pool) | |||
304 | * ensures that there will be frees which return elements to the | 304 | * ensures that there will be frees which return elements to the |
305 | * pool waking up the waiters. | 305 | * pool waking up the waiters. |
306 | */ | 306 | */ |
307 | if (pool->curr_nr < pool->min_nr) { | 307 | if (unlikely(pool->curr_nr < pool->min_nr)) { |
308 | spin_lock_irqsave(&pool->lock, flags); | 308 | spin_lock_irqsave(&pool->lock, flags); |
309 | if (pool->curr_nr < pool->min_nr) { | 309 | if (likely(pool->curr_nr < pool->min_nr)) { |
310 | add_element(pool, element); | 310 | add_element(pool, element); |
311 | spin_unlock_irqrestore(&pool->lock, flags); | 311 | spin_unlock_irqrestore(&pool->lock, flags); |
312 | wake_up(&pool->wait); | 312 | wake_up(&pool->wait); |
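The mempool_free() hunk above annotates its double check with unlikely()/likely(). A hedged user-space sketch of the same shape, using the customary __builtin_expect definitions and a pthread mutex in place of the pool's spinlock (build with -pthread):

#include <stdio.h>
#include <pthread.h>

#define likely(x)       __builtin_expect(!!(x), 1)
#define unlikely(x)     __builtin_expect(!!(x), 0)

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int curr_nr = 1, min_nr = 2;

static void refill_if_low(void)
{
        if (unlikely(curr_nr < min_nr)) {       /* racy fast check */
                pthread_mutex_lock(&lock);
                if (likely(curr_nr < min_nr))   /* recheck under the lock */
                        curr_nr++;
                pthread_mutex_unlock(&lock);
        }
}

int main(void)
{
        refill_if_low();
        printf("curr_nr=%d\n", curr_nr);        /* 2 */
        return 0;
}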
diff --git a/mm/mlock.c b/mm/mlock.c index 4e1a68162285..b1eb53634005 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page) | |||
79 | */ | 79 | */ |
80 | void mlock_vma_page(struct page *page) | 80 | void mlock_vma_page(struct page *page) |
81 | { | 81 | { |
82 | /* Serialize with page migration */ | ||
82 | BUG_ON(!PageLocked(page)); | 83 | BUG_ON(!PageLocked(page)); |
83 | 84 | ||
84 | if (!TestSetPageMlocked(page)) { | 85 | if (!TestSetPageMlocked(page)) { |
@@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct page *page) | |||
174 | unsigned int nr_pages; | 175 | unsigned int nr_pages; |
175 | struct zone *zone = page_zone(page); | 176 | struct zone *zone = page_zone(page); |
176 | 177 | ||
178 | /* For try_to_munlock() and to serialize with page migration */ | ||
177 | BUG_ON(!PageLocked(page)); | 179 | BUG_ON(!PageLocked(page)); |
178 | 180 | ||
179 | /* | 181 | /* |
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/backing-dev.h> | 11 | #include <linux/backing-dev.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/vmacache.h> | ||
13 | #include <linux/shm.h> | 14 | #include <linux/shm.h> |
14 | #include <linux/mman.h> | 15 | #include <linux/mman.h> |
15 | #include <linux/pagemap.h> | 16 | #include <linux/pagemap.h> |
@@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | |||
681 | prev->vm_next = next = vma->vm_next; | 682 | prev->vm_next = next = vma->vm_next; |
682 | if (next) | 683 | if (next) |
683 | next->vm_prev = prev; | 684 | next->vm_prev = prev; |
684 | if (mm->mmap_cache == vma) | 685 | |
685 | mm->mmap_cache = prev; | 686 | /* Kill the cache */ |
687 | vmacache_invalidate(mm); | ||
686 | } | 688 | } |
687 | 689 | ||
688 | /* | 690 | /* |
@@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area); | |||
1989 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 1991 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
1990 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | 1992 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1991 | { | 1993 | { |
1992 | struct vm_area_struct *vma = NULL; | 1994 | struct rb_node *rb_node; |
1995 | struct vm_area_struct *vma; | ||
1993 | 1996 | ||
1994 | /* Check the cache first. */ | 1997 | /* Check the cache first. */ |
1995 | /* (Cache hit rate is typically around 35%.) */ | 1998 | vma = vmacache_find(mm, addr); |
1996 | vma = ACCESS_ONCE(mm->mmap_cache); | 1999 | if (likely(vma)) |
1997 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | 2000 | return vma; |
1998 | struct rb_node *rb_node; | ||
1999 | 2001 | ||
2000 | rb_node = mm->mm_rb.rb_node; | 2002 | rb_node = mm->mm_rb.rb_node; |
2001 | vma = NULL; | 2003 | vma = NULL; |
2002 | 2004 | ||
2003 | while (rb_node) { | 2005 | while (rb_node) { |
2004 | struct vm_area_struct *vma_tmp; | 2006 | struct vm_area_struct *tmp; |
2005 | 2007 | ||
2006 | vma_tmp = rb_entry(rb_node, | 2008 | tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); |
2007 | struct vm_area_struct, vm_rb); | 2009 | |
2008 | 2010 | if (tmp->vm_end > addr) { | |
2009 | if (vma_tmp->vm_end > addr) { | 2011 | vma = tmp; |
2010 | vma = vma_tmp; | 2012 | if (tmp->vm_start <= addr) |
2011 | if (vma_tmp->vm_start <= addr) | 2013 | break; |
2012 | break; | 2014 | rb_node = rb_node->rb_left; |
2013 | rb_node = rb_node->rb_left; | 2015 | } else |
2014 | } else | 2016 | rb_node = rb_node->rb_right; |
2015 | rb_node = rb_node->rb_right; | ||
2016 | } | ||
2017 | if (vma) | ||
2018 | mm->mmap_cache = vma; | ||
2019 | } | 2017 | } |
2018 | |||
2019 | if (vma) | ||
2020 | vmacache_update(addr, vma); | ||
2020 | return vma; | 2021 | return vma; |
2021 | } | 2022 | } |
2022 | 2023 | ||
@@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2388 | } else | 2389 | } else |
2389 | mm->highest_vm_end = prev ? prev->vm_end : 0; | 2390 | mm->highest_vm_end = prev ? prev->vm_end : 0; |
2390 | tail_vma->vm_next = NULL; | 2391 | tail_vma->vm_next = NULL; |
2391 | mm->mmap_cache = NULL; /* Kill the cache. */ | 2392 | |
2393 | /* Kill the cache */ | ||
2394 | vmacache_invalidate(mm); | ||
2392 | } | 2395 | } |
2393 | 2396 | ||
2394 | /* | 2397 | /* |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 769a67a15803..c43d557941f8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -36,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
36 | } | 36 | } |
37 | #endif | 37 | #endif |
38 | 38 | ||
39 | /* | ||
40 | * For a prot_numa update we only hold mmap_sem for read so there is a | ||
41 | * potential race with faulting where a pmd was temporarily none. This | ||
42 | * function checks for a transhuge pmd under the appropriate lock. It | ||
43 | * returns a pte if it was successfully locked or NULL if it raced with | ||
44 | * a transhuge insertion. | ||
45 | */ | ||
46 | static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, | ||
47 | unsigned long addr, int prot_numa, spinlock_t **ptl) | ||
48 | { | ||
49 | pte_t *pte; | ||
50 | spinlock_t *pmdl; | ||
51 | |||
52 | /* !prot_numa is protected by mmap_sem held for write */ | ||
53 | if (!prot_numa) | ||
54 | return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); | ||
55 | |||
56 | pmdl = pmd_lock(vma->vm_mm, pmd); | ||
57 | if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { | ||
58 | spin_unlock(pmdl); | ||
59 | return NULL; | ||
60 | } | ||
61 | |||
62 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); | ||
63 | spin_unlock(pmdl); | ||
64 | return pte; | ||
65 | } | ||
66 | |||
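lock_pte_protection() takes the pmd lock only to confirm the pmd still points at a regular page table before taking the pte lock, and returns NULL if it raced with a huge-pmd insertion. A purely illustrative user-space rendering of that recheck-under-a-lock, bail-on-race shape (build with -pthread); the locks and states are stand-ins, not the kernel's:

#include <stdio.h>
#include <pthread.h>

enum state { NORMAL, HUGE, NONE };

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;
static enum state st = NORMAL;

static pthread_mutex_t *lock_for_update(void)
{
        pthread_mutex_lock(&outer);
        if (st == HUGE || st == NONE) { /* raced with a concurrent change */
                pthread_mutex_unlock(&outer);
                return NULL;
        }
        pthread_mutex_lock(&inner);     /* safe to take the fine-grained lock */
        pthread_mutex_unlock(&outer);
        return &inner;
}

int main(void)
{
        pthread_mutex_t *l = lock_for_update();

        if (l) {
                puts("locked, would update PTEs here");
                pthread_mutex_unlock(l);
        } else {
                puts("raced, skipping");
        }
        return 0;
}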
39 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 67 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
40 | unsigned long addr, unsigned long end, pgprot_t newprot, | 68 | unsigned long addr, unsigned long end, pgprot_t newprot, |
41 | int dirty_accountable, int prot_numa) | 69 | int dirty_accountable, int prot_numa) |
@@ -45,7 +73,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
45 | spinlock_t *ptl; | 73 | spinlock_t *ptl; |
46 | unsigned long pages = 0; | 74 | unsigned long pages = 0; |
47 | 75 | ||
48 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 76 | pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); |
77 | if (!pte) | ||
78 | return 0; | ||
79 | |||
49 | arch_enter_lazy_mmu_mode(); | 80 | arch_enter_lazy_mmu_mode(); |
50 | do { | 81 | do { |
51 | oldpte = *pte; | 82 | oldpte = *pte; |
@@ -109,15 +140,26 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
109 | pgprot_t newprot, int dirty_accountable, int prot_numa) | 140 | pgprot_t newprot, int dirty_accountable, int prot_numa) |
110 | { | 141 | { |
111 | pmd_t *pmd; | 142 | pmd_t *pmd; |
143 | struct mm_struct *mm = vma->vm_mm; | ||
112 | unsigned long next; | 144 | unsigned long next; |
113 | unsigned long pages = 0; | 145 | unsigned long pages = 0; |
114 | unsigned long nr_huge_updates = 0; | 146 | unsigned long nr_huge_updates = 0; |
147 | unsigned long mni_start = 0; | ||
115 | 148 | ||
116 | pmd = pmd_offset(pud, addr); | 149 | pmd = pmd_offset(pud, addr); |
117 | do { | 150 | do { |
118 | unsigned long this_pages; | 151 | unsigned long this_pages; |
119 | 152 | ||
120 | next = pmd_addr_end(addr, end); | 153 | next = pmd_addr_end(addr, end); |
154 | if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) | ||
155 | continue; | ||
156 | |||
157 | /* invoke the mmu notifier if the pmd is populated */ | ||
158 | if (!mni_start) { | ||
159 | mni_start = addr; | ||
160 | mmu_notifier_invalidate_range_start(mm, mni_start, end); | ||
161 | } | ||
162 | |||
121 | if (pmd_trans_huge(*pmd)) { | 163 | if (pmd_trans_huge(*pmd)) { |
122 | if (next - addr != HPAGE_PMD_SIZE) | 164 | if (next - addr != HPAGE_PMD_SIZE) |
123 | split_huge_page_pmd(vma, addr, pmd); | 165 | split_huge_page_pmd(vma, addr, pmd); |
@@ -130,18 +172,21 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
130 | pages += HPAGE_PMD_NR; | 172 | pages += HPAGE_PMD_NR; |
131 | nr_huge_updates++; | 173 | nr_huge_updates++; |
132 | } | 174 | } |
175 | |||
176 | /* huge pmd was handled */ | ||
133 | continue; | 177 | continue; |
134 | } | 178 | } |
135 | } | 179 | } |
136 | /* fall through */ | 180 | /* fall through, the trans huge pmd was just split */ |
137 | } | 181 | } |
138 | if (pmd_none_or_clear_bad(pmd)) | ||
139 | continue; | ||
140 | this_pages = change_pte_range(vma, pmd, addr, next, newprot, | 182 | this_pages = change_pte_range(vma, pmd, addr, next, newprot, |
141 | dirty_accountable, prot_numa); | 183 | dirty_accountable, prot_numa); |
142 | pages += this_pages; | 184 | pages += this_pages; |
143 | } while (pmd++, addr = next, addr != end); | 185 | } while (pmd++, addr = next, addr != end); |
144 | 186 | ||
187 | if (mni_start) | ||
188 | mmu_notifier_invalidate_range_end(mm, mni_start, end); | ||
189 | |||
145 | if (nr_huge_updates) | 190 | if (nr_huge_updates) |
146 | count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); | 191 | count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); |
147 | return pages; | 192 | return pages; |
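change_pmd_range() now opens the mmu notifier range lazily, only once the first populated pmd is seen, so ranges with nothing mapped skip the begin/end pair entirely. A small stand-alone sketch of that lazy bracketing, with made-up stand-ins for the notifier calls:

#include <stdio.h>

static void range_begin(unsigned long s, unsigned long e)
{
        printf("begin %#lx-%#lx\n", s, e);
}

static void range_end(unsigned long s, unsigned long e)
{
        printf("end   %#lx-%#lx\n", s, e);
}

static void walk(const int *populated, int n, unsigned long start)
{
        unsigned long addr, end = start + (unsigned long)n * 0x1000;
        unsigned long lazy_start = 0;
        int i;

        for (i = 0, addr = start; i < n; i++, addr += 0x1000) {
                if (!populated[i])      /* nothing mapped here, skip quietly */
                        continue;
                if (!lazy_start) {      /* first populated entry: start once */
                        lazy_start = addr;
                        range_begin(lazy_start, end);
                }
                /* ... would change protections here ... */
        }
        if (lazy_start)
                range_end(lazy_start, end);
}

int main(void)
{
        int a[] = { 0, 0, 1, 1, 0 };
        int b[] = { 0, 0, 0 };

        walk(a, 5, 0x10000);    /* begin/end emitted once */
        walk(b, 3, 0x20000);    /* fully empty: no notifier calls at all */
        return 0;
}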
@@ -201,15 +246,12 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, | |||
201 | unsigned long end, pgprot_t newprot, | 246 | unsigned long end, pgprot_t newprot, |
202 | int dirty_accountable, int prot_numa) | 247 | int dirty_accountable, int prot_numa) |
203 | { | 248 | { |
204 | struct mm_struct *mm = vma->vm_mm; | ||
205 | unsigned long pages; | 249 | unsigned long pages; |
206 | 250 | ||
207 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
208 | if (is_vm_hugetlb_page(vma)) | 251 | if (is_vm_hugetlb_page(vma)) |
209 | pages = hugetlb_change_protection(vma, start, end, newprot); | 252 | pages = hugetlb_change_protection(vma, start, end, newprot); |
210 | else | 253 | else |
211 | pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); | 254 | pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); |
212 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
213 | 255 | ||
214 | return pages; | 256 | return pages; |
215 | } | 257 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index a554e5a451cd..85f8d6698d48 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -15,6 +15,7 @@ | |||
15 | 15 | ||
16 | #include <linux/export.h> | 16 | #include <linux/export.h> |
17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
18 | #include <linux/vmacache.h> | ||
18 | #include <linux/mman.h> | 19 | #include <linux/mman.h> |
19 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
20 | #include <linux/file.h> | 21 | #include <linux/file.h> |
@@ -24,6 +25,7 @@ | |||
24 | #include <linux/vmalloc.h> | 25 | #include <linux/vmalloc.h> |
25 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
26 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/compiler.h> | ||
27 | #include <linux/mount.h> | 29 | #include <linux/mount.h> |
28 | #include <linux/personality.h> | 30 | #include <linux/personality.h> |
29 | #include <linux/security.h> | 31 | #include <linux/security.h> |
@@ -296,7 +298,7 @@ long vwrite(char *buf, char *addr, unsigned long count) | |||
296 | count = -(unsigned long) addr; | 298 | count = -(unsigned long) addr; |
297 | 299 | ||
298 | memcpy(addr, buf, count); | 300 | memcpy(addr, buf, count); |
299 | return(count); | 301 | return count; |
300 | } | 302 | } |
301 | 303 | ||
302 | /* | 304 | /* |
@@ -459,7 +461,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); | |||
459 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to | 461 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to |
460 | * have one. | 462 | * have one. |
461 | */ | 463 | */ |
462 | void __attribute__((weak)) vmalloc_sync_all(void) | 464 | void __weak vmalloc_sync_all(void) |
463 | { | 465 | { |
464 | } | 466 | } |
465 | 467 | ||
@@ -768,16 +770,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
768 | */ | 770 | */ |
769 | static void delete_vma_from_mm(struct vm_area_struct *vma) | 771 | static void delete_vma_from_mm(struct vm_area_struct *vma) |
770 | { | 772 | { |
773 | int i; | ||
771 | struct address_space *mapping; | 774 | struct address_space *mapping; |
772 | struct mm_struct *mm = vma->vm_mm; | 775 | struct mm_struct *mm = vma->vm_mm; |
776 | struct task_struct *curr = current; | ||
773 | 777 | ||
774 | kenter("%p", vma); | 778 | kenter("%p", vma); |
775 | 779 | ||
776 | protect_vma(vma, 0); | 780 | protect_vma(vma, 0); |
777 | 781 | ||
778 | mm->map_count--; | 782 | mm->map_count--; |
779 | if (mm->mmap_cache == vma) | 783 | for (i = 0; i < VMACACHE_SIZE; i++) { |
780 | mm->mmap_cache = NULL; | 784 | /* if the vma is cached, invalidate the entire cache */ |
785 | if (curr->vmacache[i] == vma) { | ||
786 | vmacache_invalidate(curr->mm); | ||
787 | break; | ||
788 | } | ||
789 | } | ||
781 | 790 | ||
782 | /* remove the VMA from the mapping */ | 791 | /* remove the VMA from the mapping */ |
783 | if (vma->vm_file) { | 792 | if (vma->vm_file) { |
@@ -825,8 +834,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
825 | struct vm_area_struct *vma; | 834 | struct vm_area_struct *vma; |
826 | 835 | ||
827 | /* check the cache first */ | 836 | /* check the cache first */ |
828 | vma = ACCESS_ONCE(mm->mmap_cache); | 837 | vma = vmacache_find(mm, addr); |
829 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) | 838 | if (likely(vma)) |
830 | return vma; | 839 | return vma; |
831 | 840 | ||
832 | /* trawl the list (there may be multiple mappings in which addr | 841 | /* trawl the list (there may be multiple mappings in which addr |
@@ -835,7 +844,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
835 | if (vma->vm_start > addr) | 844 | if (vma->vm_start > addr) |
836 | return NULL; | 845 | return NULL; |
837 | if (vma->vm_end > addr) { | 846 | if (vma->vm_end > addr) { |
838 | mm->mmap_cache = vma; | 847 | vmacache_update(addr, vma); |
839 | return vma; | 848 | return vma; |
840 | } | 849 | } |
841 | } | 850 | } |
@@ -874,8 +883,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | |||
874 | unsigned long end = addr + len; | 883 | unsigned long end = addr + len; |
875 | 884 | ||
876 | /* check the cache first */ | 885 | /* check the cache first */ |
877 | vma = mm->mmap_cache; | 886 | vma = vmacache_find_exact(mm, addr, end); |
878 | if (vma && vma->vm_start == addr && vma->vm_end == end) | 887 | if (vma) |
879 | return vma; | 888 | return vma; |
880 | 889 | ||
881 | /* trawl the list (there may be multiple mappings in which addr | 890 | /* trawl the list (there may be multiple mappings in which addr |
@@ -886,7 +895,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | |||
886 | if (vma->vm_start > addr) | 895 | if (vma->vm_start > addr) |
887 | return NULL; | 896 | return NULL; |
888 | if (vma->vm_end == end) { | 897 | if (vma->vm_end == end) { |
889 | mm->mmap_cache = vma; | 898 | vmacache_update(addr, vma); |
890 | return vma; | 899 | return vma; |
891 | } | 900 | } |
892 | } | 901 | } |
@@ -1003,8 +1012,7 @@ static int validate_mmap_request(struct file *file, | |||
1003 | 1012 | ||
1004 | /* we mustn't privatise shared mappings */ | 1013 | /* we mustn't privatise shared mappings */ |
1005 | capabilities &= ~BDI_CAP_MAP_COPY; | 1014 | capabilities &= ~BDI_CAP_MAP_COPY; |
1006 | } | 1015 | } else { |
1007 | else { | ||
1008 | /* we're going to read the file into private memory we | 1016 | /* we're going to read the file into private memory we |
1009 | * allocate */ | 1017 | * allocate */ |
1010 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1018 | if (!(capabilities & BDI_CAP_MAP_COPY)) |
@@ -1035,23 +1043,20 @@ static int validate_mmap_request(struct file *file, | |||
1035 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { | 1043 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
1036 | if (prot & PROT_EXEC) | 1044 | if (prot & PROT_EXEC) |
1037 | return -EPERM; | 1045 | return -EPERM; |
1038 | } | 1046 | } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { |
1039 | else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { | ||
1040 | /* handle implication of PROT_EXEC by PROT_READ */ | 1047 | /* handle implication of PROT_EXEC by PROT_READ */ |
1041 | if (current->personality & READ_IMPLIES_EXEC) { | 1048 | if (current->personality & READ_IMPLIES_EXEC) { |
1042 | if (capabilities & BDI_CAP_EXEC_MAP) | 1049 | if (capabilities & BDI_CAP_EXEC_MAP) |
1043 | prot |= PROT_EXEC; | 1050 | prot |= PROT_EXEC; |
1044 | } | 1051 | } |
1045 | } | 1052 | } else if ((prot & PROT_READ) && |
1046 | else if ((prot & PROT_READ) && | ||
1047 | (prot & PROT_EXEC) && | 1053 | (prot & PROT_EXEC) && |
1048 | !(capabilities & BDI_CAP_EXEC_MAP) | 1054 | !(capabilities & BDI_CAP_EXEC_MAP) |
1049 | ) { | 1055 | ) { |
1050 | /* backing file is not executable, try to copy */ | 1056 | /* backing file is not executable, try to copy */ |
1051 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1057 | capabilities &= ~BDI_CAP_MAP_DIRECT; |
1052 | } | 1058 | } |
1053 | } | 1059 | } else { |
1054 | else { | ||
1055 | /* anonymous mappings are always memory backed and can be | 1060 | /* anonymous mappings are always memory backed and can be |
1056 | * privately mapped | 1061 | * privately mapped |
1057 | */ | 1062 | */ |
@@ -1659,7 +1664,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1659 | /* find the first potentially overlapping VMA */ | 1664 | /* find the first potentially overlapping VMA */ |
1660 | vma = find_vma(mm, start); | 1665 | vma = find_vma(mm, start); |
1661 | if (!vma) { | 1666 | if (!vma) { |
1662 | static int limit = 0; | 1667 | static int limit; |
1663 | if (limit < 5) { | 1668 | if (limit < 5) { |
1664 | printk(KERN_WARNING | 1669 | printk(KERN_WARNING |
1665 | "munmap of memory not mmapped by process %d" | 1670 | "munmap of memory not mmapped by process %d" |
@@ -1985,6 +1990,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1985 | } | 1990 | } |
1986 | EXPORT_SYMBOL(filemap_fault); | 1991 | EXPORT_SYMBOL(filemap_fault); |
1987 | 1992 | ||
1993 | void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
1994 | { | ||
1995 | BUG(); | ||
1996 | } | ||
1997 | EXPORT_SYMBOL(filemap_map_pages); | ||
1998 | |||
1988 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | 1999 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, |
1989 | unsigned long size, pgoff_t pgoff) | 2000 | unsigned long size, pgoff_t pgoff) |
1990 | { | 2001 | { |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7106cb1aca8e..ef413492a149 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1562,9 +1562,9 @@ pause: | |||
1562 | bdi_start_background_writeback(bdi); | 1562 | bdi_start_background_writeback(bdi); |
1563 | } | 1563 | } |
1564 | 1564 | ||
1565 | void set_page_dirty_balance(struct page *page, int page_mkwrite) | 1565 | void set_page_dirty_balance(struct page *page) |
1566 | { | 1566 | { |
1567 | if (set_page_dirty(page) || page_mkwrite) { | 1567 | if (set_page_dirty(page)) { |
1568 | struct address_space *mapping = page_mapping(page); | 1568 | struct address_space *mapping = page_mapping(page); |
1569 | 1569 | ||
1570 | if (mapping) | 1570 | if (mapping) |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 979378deccbf..5dba2933c9c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -295,7 +295,8 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
295 | } | 295 | } |
296 | #endif | 296 | #endif |
297 | 297 | ||
298 | static void bad_page(struct page *page, char *reason, unsigned long bad_flags) | 298 | static void bad_page(struct page *page, const char *reason, |
299 | unsigned long bad_flags) | ||
299 | { | 300 | { |
300 | static unsigned long resume; | 301 | static unsigned long resume; |
301 | static unsigned long nr_shown; | 302 | static unsigned long nr_shown; |
@@ -623,7 +624,7 @@ out: | |||
623 | 624 | ||
624 | static inline int free_pages_check(struct page *page) | 625 | static inline int free_pages_check(struct page *page) |
625 | { | 626 | { |
626 | char *bad_reason = NULL; | 627 | const char *bad_reason = NULL; |
627 | unsigned long bad_flags = 0; | 628 | unsigned long bad_flags = 0; |
628 | 629 | ||
629 | if (unlikely(page_mapcount(page))) | 630 | if (unlikely(page_mapcount(page))) |
@@ -859,7 +860,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
859 | */ | 860 | */ |
860 | static inline int check_new_page(struct page *page) | 861 | static inline int check_new_page(struct page *page) |
861 | { | 862 | { |
862 | char *bad_reason = NULL; | 863 | const char *bad_reason = NULL; |
863 | unsigned long bad_flags = 0; | 864 | unsigned long bad_flags = 0; |
864 | 865 | ||
865 | if (unlikely(page_mapcount(page))) | 866 | if (unlikely(page_mapcount(page))) |
@@ -1238,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1238 | } | 1239 | } |
1239 | local_irq_restore(flags); | 1240 | local_irq_restore(flags); |
1240 | } | 1241 | } |
1241 | static bool gfp_thisnode_allocation(gfp_t gfp_mask) | ||
1242 | { | ||
1243 | return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; | ||
1244 | } | ||
1245 | #else | ||
1246 | static bool gfp_thisnode_allocation(gfp_t gfp_mask) | ||
1247 | { | ||
1248 | return false; | ||
1249 | } | ||
1250 | #endif | 1242 | #endif |
1251 | 1243 | ||
1252 | /* | 1244 | /* |
@@ -1583,12 +1575,7 @@ again: | |||
1583 | get_pageblock_migratetype(page)); | 1575 | get_pageblock_migratetype(page)); |
1584 | } | 1576 | } |
1585 | 1577 | ||
1586 | /* | 1578 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); |
1587 | * NOTE: GFP_THISNODE allocations do not partake in the kswapd | ||
1588 | * aging protocol, so they can't be fair. | ||
1589 | */ | ||
1590 | if (!gfp_thisnode_allocation(gfp_flags)) | ||
1591 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | ||
1592 | 1579 | ||
1593 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1580 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1594 | zone_statistics(preferred_zone, zone, gfp_flags); | 1581 | zone_statistics(preferred_zone, zone, gfp_flags); |
@@ -1870,7 +1857,7 @@ static void __paginginit init_zone_allows_reclaim(int nid) | |||
1870 | { | 1857 | { |
1871 | int i; | 1858 | int i; |
1872 | 1859 | ||
1873 | for_each_online_node(i) | 1860 | for_each_node_state(i, N_MEMORY) |
1874 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) | 1861 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) |
1875 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | 1862 | node_set(i, NODE_DATA(nid)->reclaim_nodes); |
1876 | else | 1863 | else |
@@ -1954,23 +1941,12 @@ zonelist_scan: | |||
1954 | * zone size to ensure fair page aging. The zone a | 1941 | * zone size to ensure fair page aging. The zone a |
1955 | * page was allocated in should have no effect on the | 1942 | * page was allocated in should have no effect on the |
1956 | * time the page has in memory before being reclaimed. | 1943 | * time the page has in memory before being reclaimed. |
1957 | * | ||
1958 | * Try to stay in local zones in the fastpath. If | ||
1959 | * that fails, the slowpath is entered, which will do | ||
1960 | * another pass starting with the local zones, but | ||
1961 | * ultimately fall back to remote zones that do not | ||
1962 | * partake in the fairness round-robin cycle of this | ||
1963 | * zonelist. | ||
1964 | * | ||
1965 | * NOTE: GFP_THISNODE allocations do not partake in | ||
1966 | * the kswapd aging protocol, so they can't be fair. | ||
1967 | */ | 1944 | */ |
1968 | if ((alloc_flags & ALLOC_WMARK_LOW) && | 1945 | if (alloc_flags & ALLOC_FAIR) { |
1969 | !gfp_thisnode_allocation(gfp_mask)) { | ||
1970 | if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) | ||
1971 | continue; | ||
1972 | if (!zone_local(preferred_zone, zone)) | 1946 | if (!zone_local(preferred_zone, zone)) |
1973 | continue; | 1947 | continue; |
1948 | if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) | ||
1949 | continue; | ||
1974 | } | 1950 | } |
1975 | /* | 1951 | /* |
1976 | * When allocating a page cache page for writing, we | 1952 | * When allocating a page cache page for writing, we |
@@ -2408,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
2408 | return page; | 2384 | return page; |
2409 | } | 2385 | } |
2410 | 2386 | ||
2411 | static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, | 2387 | static void reset_alloc_batches(struct zonelist *zonelist, |
2412 | struct zonelist *zonelist, | 2388 | enum zone_type high_zoneidx, |
2413 | enum zone_type high_zoneidx, | 2389 | struct zone *preferred_zone) |
2414 | struct zone *preferred_zone) | ||
2415 | { | 2390 | { |
2416 | struct zoneref *z; | 2391 | struct zoneref *z; |
2417 | struct zone *zone; | 2392 | struct zone *zone; |
2418 | 2393 | ||
2419 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 2394 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
2420 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | ||
2421 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | ||
2422 | /* | 2395 | /* |
2423 | * Only reset the batches of zones that were actually | 2396 | * Only reset the batches of zones that were actually |
2424 | * considered in the fast path, we don't want to | 2397 | * considered in the fairness pass, we don't want to |
2425 | * thrash fairness information for zones that are not | 2398 | * trash fairness information for zones that are not |
2426 | * actually part of this zonelist's round-robin cycle. | 2399 | * actually part of this zonelist's round-robin cycle. |
2427 | */ | 2400 | */ |
2428 | if (!zone_local(preferred_zone, zone)) | 2401 | if (!zone_local(preferred_zone, zone)) |
2429 | continue; | 2402 | continue; |
2430 | mod_zone_page_state(zone, NR_ALLOC_BATCH, | 2403 | mod_zone_page_state(zone, NR_ALLOC_BATCH, |
2431 | high_wmark_pages(zone) - | 2404 | high_wmark_pages(zone) - low_wmark_pages(zone) - |
2432 | low_wmark_pages(zone) - | 2405 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); |
2433 | zone_page_state(zone, NR_ALLOC_BATCH)); | ||
2434 | } | 2406 | } |
2435 | } | 2407 | } |
2436 | 2408 | ||
2409 | static void wake_all_kswapds(unsigned int order, | ||
2410 | struct zonelist *zonelist, | ||
2411 | enum zone_type high_zoneidx, | ||
2412 | struct zone *preferred_zone) | ||
2413 | { | ||
2414 | struct zoneref *z; | ||
2415 | struct zone *zone; | ||
2416 | |||
2417 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | ||
2418 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | ||
2419 | } | ||
2420 | |||
2437 | static inline int | 2421 | static inline int |
2438 | gfp_to_alloc_flags(gfp_t gfp_mask) | 2422 | gfp_to_alloc_flags(gfp_t gfp_mask) |
2439 | { | 2423 | { |
@@ -2522,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2522 | * allowed per node queues are empty and that nodes are | 2506 | * allowed per node queues are empty and that nodes are |
2523 | * over allocated. | 2507 | * over allocated. |
2524 | */ | 2508 | */ |
2525 | if (gfp_thisnode_allocation(gfp_mask)) | 2509 | if (IS_ENABLED(CONFIG_NUMA) && |
2510 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
2526 | goto nopage; | 2511 | goto nopage; |
2527 | 2512 | ||
2528 | restart: | 2513 | restart: |
2529 | prepare_slowpath(gfp_mask, order, zonelist, | 2514 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2530 | high_zoneidx, preferred_zone); | 2515 | wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); |
2531 | 2516 | ||
2532 | /* | 2517 | /* |
2533 | * OK, we're below the kswapd watermark and have kicked background | 2518 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2711,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2711 | struct page *page = NULL; | 2696 | struct page *page = NULL; |
2712 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2697 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2713 | unsigned int cpuset_mems_cookie; | 2698 | unsigned int cpuset_mems_cookie; |
2714 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; | 2699 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2715 | struct mem_cgroup *memcg = NULL; | 2700 | struct mem_cgroup *memcg = NULL; |
2716 | 2701 | ||
2717 | gfp_mask &= gfp_allowed_mask; | 2702 | gfp_mask &= gfp_allowed_mask; |
@@ -2752,12 +2737,29 @@ retry_cpuset: | |||
2752 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 2737 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
2753 | alloc_flags |= ALLOC_CMA; | 2738 | alloc_flags |= ALLOC_CMA; |
2754 | #endif | 2739 | #endif |
2740 | retry: | ||
2755 | /* First allocation attempt */ | 2741 | /* First allocation attempt */ |
2756 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2742 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2757 | zonelist, high_zoneidx, alloc_flags, | 2743 | zonelist, high_zoneidx, alloc_flags, |
2758 | preferred_zone, migratetype); | 2744 | preferred_zone, migratetype); |
2759 | if (unlikely(!page)) { | 2745 | if (unlikely(!page)) { |
2760 | /* | 2746 | /* |
2747 | * The first pass makes sure allocations are spread | ||
2748 | * fairly within the local node. However, the local | ||
2749 | * node might have free pages left after the fairness | ||
2750 | * batches are exhausted, and remote zones haven't | ||
2751 | * even been considered yet. Try once more without | ||
2752 | * fairness, and include remote zones now, before | ||
2753 | * entering the slowpath and waking kswapd: prefer | ||
2754 | * spilling to a remote zone over swapping locally. | ||
2755 | */ | ||
2756 | if (alloc_flags & ALLOC_FAIR) { | ||
2757 | reset_alloc_batches(zonelist, high_zoneidx, | ||
2758 | preferred_zone); | ||
2759 | alloc_flags &= ~ALLOC_FAIR; | ||
2760 | goto retry; | ||
2761 | } | ||
2762 | /* | ||
2761 | * Runtime PM, block IO and its error handling path | 2763 | * Runtime PM, block IO and its error handling path |
2762 | * can deadlock because I/O on the device might not | 2764 | * can deadlock because I/O on the device might not |
2763 | * complete. | 2765 | * complete. |
@@ -4919,7 +4921,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4919 | 4921 | ||
4920 | pgdat->node_id = nid; | 4922 | pgdat->node_id = nid; |
4921 | pgdat->node_start_pfn = node_start_pfn; | 4923 | pgdat->node_start_pfn = node_start_pfn; |
4922 | init_zone_allows_reclaim(nid); | 4924 | if (node_state(nid, N_MEMORY)) |
4925 | init_zone_allows_reclaim(nid); | ||
4923 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4926 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4924 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 4927 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
4925 | #endif | 4928 | #endif |
@@ -5070,7 +5073,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5070 | nodemask_t saved_node_state = node_states[N_MEMORY]; | 5073 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
5071 | unsigned long totalpages = early_calculate_totalpages(); | 5074 | unsigned long totalpages = early_calculate_totalpages(); |
5072 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); | 5075 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
5073 | struct memblock_type *type = &memblock.memory; | 5076 | struct memblock_region *r; |
5074 | 5077 | ||
5075 | /* Need to find movable_zone earlier when movable_node is specified. */ | 5078 | /* Need to find movable_zone earlier when movable_node is specified. */ |
5076 | find_usable_zone_for_movable(); | 5079 | find_usable_zone_for_movable(); |
@@ -5080,13 +5083,13 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5080 | * options. | 5083 | * options. |
5081 | */ | 5084 | */ |
5082 | if (movable_node_is_enabled()) { | 5085 | if (movable_node_is_enabled()) { |
5083 | for (i = 0; i < type->cnt; i++) { | 5086 | for_each_memblock(memory, r) { |
5084 | if (!memblock_is_hotpluggable(&type->regions[i])) | 5087 | if (!memblock_is_hotpluggable(r)) |
5085 | continue; | 5088 | continue; |
5086 | 5089 | ||
5087 | nid = type->regions[i].nid; | 5090 | nid = r->nid; |
5088 | 5091 | ||
5089 | usable_startpfn = PFN_DOWN(type->regions[i].base); | 5092 | usable_startpfn = PFN_DOWN(r->base); |
5090 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? | 5093 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? |
5091 | min(usable_startpfn, zone_movable_pfn[nid]) : | 5094 | min(usable_startpfn, zone_movable_pfn[nid]) : |
5092 | usable_startpfn; | 5095 | usable_startpfn; |
@@ -6544,7 +6547,8 @@ static void dump_page_flags(unsigned long flags) | |||
6544 | printk(")\n"); | 6547 | printk(")\n"); |
6545 | } | 6548 | } |
6546 | 6549 | ||
6547 | void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) | 6550 | void dump_page_badflags(struct page *page, const char *reason, |
6551 | unsigned long badflags) | ||
6548 | { | 6552 | { |
6549 | printk(KERN_ALERT | 6553 | printk(KERN_ALERT |
6550 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 6554 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", |
@@ -6560,8 +6564,8 @@ void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) | |||
6560 | mem_cgroup_print_bad_page(page); | 6564 | mem_cgroup_print_bad_page(page); |
6561 | } | 6565 | } |
6562 | 6566 | ||
6563 | void dump_page(struct page *page, char *reason) | 6567 | void dump_page(struct page *page, const char *reason) |
6564 | { | 6568 | { |
6565 | dump_page_badflags(page, reason, 0); | 6569 | dump_page_badflags(page, reason, 0); |
6566 | } | 6570 | } |
6567 | EXPORT_SYMBOL_GPL(dump_page); | 6571 | EXPORT_SYMBOL(dump_page); |
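The page_alloc.c hunks above replace prepare_slowpath() with an explicit fairness pass: the first attempt carries ALLOC_FAIR, stays in zones local to the preferred node and skips any zone whose NR_ALLOC_BATCH is exhausted, and only after reset_alloc_batches() is the attempt repeated without fairness before entering the slowpath. The standalone C sketch below models that round-robin batching; every structure and name in it is invented for illustration and it is not the kernel allocator.

    #include <stdbool.h>
    #include <stdio.h>

    #define NZONES 3

    struct zone_model {
        const char *name;
        long free_pages;    /* pages still free in this zone */
        long alloc_batch;   /* fairness budget, akin to NR_ALLOC_BATCH */
        bool local;         /* stand-in for zone_local(preferred, zone) */
    };

    /* One pass over the zonelist; 'fair' mimics ALLOC_FAIR. */
    static struct zone_model *try_zones(struct zone_model *z, int n, bool fair)
    {
        for (int i = 0; i < n; i++) {
            if (fair) {
                if (!z[i].local)
                    continue;           /* fair pass stays local */
                if (z[i].alloc_batch <= 0)
                    continue;           /* batch used up: be fair, move on */
            }
            if (z[i].free_pages > 0) {
                z[i].free_pages--;
                z[i].alloc_batch--;     /* consume fairness budget */
                return &z[i];
            }
        }
        return NULL;
    }

    /* Only zones in the local round-robin cycle get their batches refilled. */
    static void reset_batches(struct zone_model *z, int n, long budget)
    {
        for (int i = 0; i < n; i++)
            if (z[i].local)
                z[i].alloc_batch = budget;
    }

    int main(void)
    {
        struct zone_model zones[NZONES] = {
            { "Normal (local)", 0, 1, true },
            { "DMA32 (local)",  2, 0, true },
            { "Remote",         8, 4, false },
        };
        struct zone_model *got;

        got = try_zones(zones, NZONES, true);
        if (!got) {                     /* fair pass failed: reset and retry */
            reset_batches(zones, NZONES, 4);
            got = try_zones(zones, NZONES, false);
        }
        printf("allocated from %s\n", got ? got->name : "nowhere");
        return 0;
    }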
diff --git a/mm/readahead.c b/mm/readahead.c index 29c5e1af5a0c..0ca36a7770b1 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -8,9 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/fs.h> | ||
12 | #include <linux/gfp.h> | 11 | #include <linux/gfp.h> |
13 | #include <linux/mm.h> | ||
14 | #include <linux/export.h> | 12 | #include <linux/export.h> |
15 | #include <linux/blkdev.h> | 13 | #include <linux/blkdev.h> |
16 | #include <linux/backing-dev.h> | 14 | #include <linux/backing-dev.h> |
@@ -20,6 +18,8 @@ | |||
20 | #include <linux/syscalls.h> | 18 | #include <linux/syscalls.h> |
21 | #include <linux/file.h> | 19 | #include <linux/file.h> |
22 | 20 | ||
21 | #include "internal.h" | ||
22 | |||
23 | /* | 23 | /* |
24 | * Initialise a struct file's readahead state. Assumes that the caller has | 24 | * Initialise a struct file's readahead state. Assumes that the caller has |
25 | * memset *ra to zero. | 25 | * memset *ra to zero. |
@@ -149,8 +149,7 @@ out: | |||
149 | * | 149 | * |
150 | * Returns the number of pages requested, or the maximum amount of I/O allowed. | 150 | * Returns the number of pages requested, or the maximum amount of I/O allowed. |
151 | */ | 151 | */ |
152 | static int | 152 | int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
153 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
154 | pgoff_t offset, unsigned long nr_to_read, | 153 | pgoff_t offset, unsigned long nr_to_read, |
155 | unsigned long lookahead_size) | 154 | unsigned long lookahead_size) |
156 | { | 155 | { |
@@ -244,20 +243,6 @@ unsigned long max_sane_readahead(unsigned long nr) | |||
244 | } | 243 | } |
245 | 244 | ||
246 | /* | 245 | /* |
247 | * Submit IO for the read-ahead request in file_ra_state. | ||
248 | */ | ||
249 | unsigned long ra_submit(struct file_ra_state *ra, | ||
250 | struct address_space *mapping, struct file *filp) | ||
251 | { | ||
252 | int actual; | ||
253 | |||
254 | actual = __do_page_cache_readahead(mapping, filp, | ||
255 | ra->start, ra->size, ra->async_size); | ||
256 | |||
257 | return actual; | ||
258 | } | ||
259 | |||
260 | /* | ||
261 | * Set the initial window size, round to next power of 2 and square | 246 | * Set the initial window size, round to next power of 2 and square |
262 | * for small size, x 4 for medium, and x 2 for large | 247 | * for small size, x 4 for medium, and x 2 for large |
263 | * for 128k (32 page) max ra | 248 | * for 128k (32 page) max ra |
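The readahead.c hunks drop the local ra_submit() definition and un-static __do_page_cache_readahead(); ra_submit() presumably moves into mm/internal.h (changed elsewhere in this series) as a thin wrapper. A hedged sketch of such a wrapper, reconstructed from the removed lines:

    /* Hedged reconstruction only -- assumes ra_submit() becomes a static inline
     * next to the __do_page_cache_readahead() declaration in mm/internal.h. */
    static inline unsigned long ra_submit(struct file_ra_state *ra,
                                          struct address_space *mapping,
                                          struct file *filp)
    {
        return __do_page_cache_readahead(mapping, filp,
                                         ra->start, ra->size, ra->async_size);
    }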
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -1332,9 +1332,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1332 | BUG_ON(!page || PageAnon(page)); | 1332 | BUG_ON(!page || PageAnon(page)); |
1333 | 1333 | ||
1334 | if (locked_vma) { | 1334 | if (locked_vma) { |
1335 | mlock_vma_page(page); /* no-op if already mlocked */ | 1335 | if (page == check_page) { |
1336 | if (page == check_page) | 1336 | /* we know we have check_page locked */ |
1337 | mlock_vma_page(page); | ||
1337 | ret = SWAP_MLOCK; | 1338 | ret = SWAP_MLOCK; |
1339 | } else if (trylock_page(page)) { | ||
1340 | /* | ||
1341 | * If we can lock the page, perform mlock. | ||
1342 | * Otherwise leave the page alone, it will be | ||
1343 | * eventually encountered again later. | ||
1344 | */ | ||
1345 | mlock_vma_page(page); | ||
1346 | unlock_page(page); | ||
1347 | } | ||
1338 | continue; /* don't unmap */ | 1348 | continue; /* don't unmap */ |
1339 | } | 1349 | } |
1340 | 1350 | ||
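The try_to_unmap_cluster() hunk above fixes the mlock path so mlock_vma_page() only runs with the page lock held: the already-locked check_page is marked directly, any other page is marked only if trylock_page() succeeds, and otherwise it is left for a later pass. A standalone model of that lock-if-you-can-otherwise-defer policy, using an invented structure and a pthread mutex in place of the page lock:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct fake_page {
        pthread_mutex_t lock;
        bool mlocked;
    };

    /* Mark the page mlocked only if its lock is (or can be) held; otherwise
     * leave it alone so a later scan can retry -- mirroring the hunk above. */
    static bool try_mark_mlocked(struct fake_page *page, bool lock_already_held)
    {
        if (lock_already_held) {                /* the check_page case */
            page->mlocked = true;
            return true;
        }
        if (pthread_mutex_trylock(&page->lock) == 0) {
            page->mlocked = true;               /* opportunistic trylock_page() */
            pthread_mutex_unlock(&page->lock);
            return true;
        }
        return false;                           /* skip; encountered again later */
    }

    int main(void)
    {
        struct fake_page page = { PTHREAD_MUTEX_INITIALIZER, false };

        printf("mlocked: %d\n", try_mark_mlocked(&page, false));
        return 0;
    }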
diff --git a/mm/shmem.c b/mm/shmem.c index a3ba988ec946..70273f8df586 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -683,7 +683,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
683 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | 683 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). |
684 | * Charged back to the user (not to caller) when swap account is used. | 684 | * Charged back to the user (not to caller) when swap account is used. |
685 | */ | 685 | */ |
686 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 686 | error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); |
687 | if (error) | 687 | if (error) |
688 | goto out; | 688 | goto out; |
689 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ | 689 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ |
@@ -1080,7 +1080,7 @@ repeat: | |||
1080 | goto failed; | 1080 | goto failed; |
1081 | } | 1081 | } |
1082 | 1082 | ||
1083 | error = mem_cgroup_cache_charge(page, current->mm, | 1083 | error = mem_cgroup_charge_file(page, current->mm, |
1084 | gfp & GFP_RECLAIM_MASK); | 1084 | gfp & GFP_RECLAIM_MASK); |
1085 | if (!error) { | 1085 | if (!error) { |
1086 | error = shmem_add_to_page_cache(page, mapping, index, | 1086 | error = shmem_add_to_page_cache(page, mapping, index, |
@@ -1134,7 +1134,7 @@ repeat: | |||
1134 | 1134 | ||
1135 | SetPageSwapBacked(page); | 1135 | SetPageSwapBacked(page); |
1136 | __set_page_locked(page); | 1136 | __set_page_locked(page); |
1137 | error = mem_cgroup_cache_charge(page, current->mm, | 1137 | error = mem_cgroup_charge_file(page, current->mm, |
1138 | gfp & GFP_RECLAIM_MASK); | 1138 | gfp & GFP_RECLAIM_MASK); |
1139 | if (error) | 1139 | if (error) |
1140 | goto decused; | 1140 | goto decused; |
@@ -2723,6 +2723,7 @@ static const struct super_operations shmem_ops = { | |||
2723 | 2723 | ||
2724 | static const struct vm_operations_struct shmem_vm_ops = { | 2724 | static const struct vm_operations_struct shmem_vm_ops = { |
2725 | .fault = shmem_fault, | 2725 | .fault = shmem_fault, |
2726 | .map_pages = filemap_map_pages, | ||
2726 | #ifdef CONFIG_NUMA | 2727 | #ifdef CONFIG_NUMA |
2727 | .set_policy = shmem_set_policy, | 2728 | .set_policy = shmem_set_policy, |
2728 | .get_policy = shmem_get_policy, | 2729 | .get_policy = shmem_get_policy, |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -3027,7 +3027,7 @@ out: | |||
3027 | 3027 | ||
3028 | #ifdef CONFIG_NUMA | 3028 | #ifdef CONFIG_NUMA |
3029 | /* | 3029 | /* |
3030 | * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. | 3030 | * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set. |
3031 | * | 3031 | * |
3032 | * If we are in_interrupt, then process context, including cpusets and | 3032 | * If we are in_interrupt, then process context, including cpusets and |
3033 | * mempolicy, may not apply and should not be used for allocation policy. | 3033 | * mempolicy, may not apply and should not be used for allocation policy. |
@@ -3042,7 +3042,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3042 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3042 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3043 | nid_alloc = cpuset_slab_spread_node(); | 3043 | nid_alloc = cpuset_slab_spread_node(); |
3044 | else if (current->mempolicy) | 3044 | else if (current->mempolicy) |
3045 | nid_alloc = slab_node(); | 3045 | nid_alloc = mempolicy_slab_node(); |
3046 | if (nid_alloc != nid_here) | 3046 | if (nid_alloc != nid_here) |
3047 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3047 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3048 | return NULL; | 3048 | return NULL; |
@@ -3074,7 +3074,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3074 | 3074 | ||
3075 | retry_cpuset: | 3075 | retry_cpuset: |
3076 | cpuset_mems_cookie = read_mems_allowed_begin(); | 3076 | cpuset_mems_cookie = read_mems_allowed_begin(); |
3077 | zonelist = node_zonelist(slab_node(), flags); | 3077 | zonelist = node_zonelist(mempolicy_slab_node(), flags); |
3078 | 3078 | ||
3079 | retry: | 3079 | retry: |
3080 | /* | 3080 | /* |
@@ -3259,7 +3259,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3259 | { | 3259 | { |
3260 | void *objp; | 3260 | void *objp; |
3261 | 3261 | ||
3262 | if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { | 3262 | if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { |
3263 | objp = alternate_node_alloc(cache, flags); | 3263 | objp = alternate_node_alloc(cache, flags); |
3264 | if (objp) | 3264 | if (objp) |
3265 | goto out; | 3265 | goto out; |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -55,12 +55,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, | |||
55 | struct mem_cgroup; | 55 | struct mem_cgroup; |
56 | #ifdef CONFIG_SLUB | 56 | #ifdef CONFIG_SLUB |
57 | struct kmem_cache * | 57 | struct kmem_cache * |
58 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, | 58 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
59 | size_t align, unsigned long flags, void (*ctor)(void *)); | 59 | unsigned long flags, void (*ctor)(void *)); |
60 | #else | 60 | #else |
61 | static inline struct kmem_cache * | 61 | static inline struct kmem_cache * |
62 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, | 62 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
63 | size_t align, unsigned long flags, void (*ctor)(void *)) | 63 | unsigned long flags, void (*ctor)(void *)) |
64 | { return NULL; } | 64 | { return NULL; } |
65 | #endif | 65 | #endif |
66 | 66 | ||
@@ -119,13 +119,6 @@ static inline bool is_root_cache(struct kmem_cache *s) | |||
119 | return !s->memcg_params || s->memcg_params->is_root_cache; | 119 | return !s->memcg_params || s->memcg_params->is_root_cache; |
120 | } | 120 | } |
121 | 121 | ||
122 | static inline bool cache_match_memcg(struct kmem_cache *cachep, | ||
123 | struct mem_cgroup *memcg) | ||
124 | { | ||
125 | return (is_root_cache(cachep) && !memcg) || | ||
126 | (cachep->memcg_params->memcg == memcg); | ||
127 | } | ||
128 | |||
129 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | 122 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) |
130 | { | 123 | { |
131 | if (!is_root_cache(s)) | 124 | if (!is_root_cache(s)) |
@@ -204,12 +197,6 @@ static inline bool is_root_cache(struct kmem_cache *s) | |||
204 | return true; | 197 | return true; |
205 | } | 198 | } |
206 | 199 | ||
207 | static inline bool cache_match_memcg(struct kmem_cache *cachep, | ||
208 | struct mem_cgroup *memcg) | ||
209 | { | ||
210 | return true; | ||
211 | } | ||
212 | |||
213 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | 200 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) |
214 | { | 201 | { |
215 | } | 202 | } |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 1ec3c619ba04..f3cfccf76dda 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -29,8 +29,7 @@ DEFINE_MUTEX(slab_mutex); | |||
29 | struct kmem_cache *kmem_cache; | 29 | struct kmem_cache *kmem_cache; |
30 | 30 | ||
31 | #ifdef CONFIG_DEBUG_VM | 31 | #ifdef CONFIG_DEBUG_VM |
32 | static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, | 32 | static int kmem_cache_sanity_check(const char *name, size_t size) |
33 | size_t size) | ||
34 | { | 33 | { |
35 | struct kmem_cache *s = NULL; | 34 | struct kmem_cache *s = NULL; |
36 | 35 | ||
@@ -57,13 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, | |||
57 | } | 56 | } |
58 | 57 | ||
59 | #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) | 58 | #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) |
60 | /* | 59 | if (!strcmp(s->name, name)) { |
61 | * For simplicity, we won't check this in the list of memcg | ||
62 | * caches. We have control over memcg naming, and if there | ||
63 | * aren't duplicates in the global list, there won't be any | ||
64 | * duplicates in the memcg lists as well. | ||
65 | */ | ||
66 | if (!memcg && !strcmp(s->name, name)) { | ||
67 | pr_err("%s (%s): Cache name already exists.\n", | 60 | pr_err("%s (%s): Cache name already exists.\n", |
68 | __func__, name); | 61 | __func__, name); |
69 | dump_stack(); | 62 | dump_stack(); |
@@ -77,8 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, | |||
77 | return 0; | 70 | return 0; |
78 | } | 71 | } |
79 | #else | 72 | #else |
80 | static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, | 73 | static inline int kmem_cache_sanity_check(const char *name, size_t size) |
81 | const char *name, size_t size) | ||
82 | { | 74 | { |
83 | return 0; | 75 | return 0; |
84 | } | 76 | } |
@@ -139,6 +131,46 @@ unsigned long calculate_alignment(unsigned long flags, | |||
139 | return ALIGN(align, sizeof(void *)); | 131 | return ALIGN(align, sizeof(void *)); |
140 | } | 132 | } |
141 | 133 | ||
134 | static struct kmem_cache * | ||
135 | do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | ||
136 | unsigned long flags, void (*ctor)(void *), | ||
137 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) | ||
138 | { | ||
139 | struct kmem_cache *s; | ||
140 | int err; | ||
141 | |||
142 | err = -ENOMEM; | ||
143 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); | ||
144 | if (!s) | ||
145 | goto out; | ||
146 | |||
147 | s->name = name; | ||
148 | s->object_size = object_size; | ||
149 | s->size = size; | ||
150 | s->align = align; | ||
151 | s->ctor = ctor; | ||
152 | |||
153 | err = memcg_alloc_cache_params(memcg, s, root_cache); | ||
154 | if (err) | ||
155 | goto out_free_cache; | ||
156 | |||
157 | err = __kmem_cache_create(s, flags); | ||
158 | if (err) | ||
159 | goto out_free_cache; | ||
160 | |||
161 | s->refcount = 1; | ||
162 | list_add(&s->list, &slab_caches); | ||
163 | memcg_register_cache(s); | ||
164 | out: | ||
165 | if (err) | ||
166 | return ERR_PTR(err); | ||
167 | return s; | ||
168 | |||
169 | out_free_cache: | ||
170 | memcg_free_cache_params(s); | ||
171 | kfree(s); | ||
172 | goto out; | ||
173 | } | ||
142 | 174 | ||
143 | /* | 175 | /* |
144 | * kmem_cache_create - Create a cache. | 176 | * kmem_cache_create - Create a cache. |
@@ -164,34 +196,21 @@ unsigned long calculate_alignment(unsigned long flags, | |||
164 | * cacheline. This can be beneficial if you're counting cycles as closely | 196 | * cacheline. This can be beneficial if you're counting cycles as closely |
165 | * as davem. | 197 | * as davem. |
166 | */ | 198 | */ |
167 | |||
168 | struct kmem_cache * | 199 | struct kmem_cache * |
169 | kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, | 200 | kmem_cache_create(const char *name, size_t size, size_t align, |
170 | size_t align, unsigned long flags, void (*ctor)(void *), | 201 | unsigned long flags, void (*ctor)(void *)) |
171 | struct kmem_cache *parent_cache) | ||
172 | { | 202 | { |
173 | struct kmem_cache *s = NULL; | 203 | struct kmem_cache *s; |
204 | char *cache_name; | ||
174 | int err; | 205 | int err; |
175 | 206 | ||
176 | get_online_cpus(); | 207 | get_online_cpus(); |
177 | mutex_lock(&slab_mutex); | 208 | mutex_lock(&slab_mutex); |
178 | 209 | ||
179 | err = kmem_cache_sanity_check(memcg, name, size); | 210 | err = kmem_cache_sanity_check(name, size); |
180 | if (err) | 211 | if (err) |
181 | goto out_unlock; | 212 | goto out_unlock; |
182 | 213 | ||
183 | if (memcg) { | ||
184 | /* | ||
185 | * Since per-memcg caches are created asynchronously on first | ||
186 | * allocation (see memcg_kmem_get_cache()), several threads can | ||
187 | * try to create the same cache, but only one of them may | ||
188 | * succeed. Therefore if we get here and see the cache has | ||
189 | * already been created, we silently return NULL. | ||
190 | */ | ||
191 | if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg))) | ||
192 | goto out_unlock; | ||
193 | } | ||
194 | |||
195 | /* | 214 | /* |
196 | * Some allocators will constraint the set of valid flags to a subset | 215 | * Some allocators will constraint the set of valid flags to a subset |
197 | * of all flags. We expect them to define CACHE_CREATE_MASK in this | 216 | * of all flags. We expect them to define CACHE_CREATE_MASK in this |
@@ -200,50 +219,29 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, | |||
200 | */ | 219 | */ |
201 | flags &= CACHE_CREATE_MASK; | 220 | flags &= CACHE_CREATE_MASK; |
202 | 221 | ||
203 | s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); | 222 | s = __kmem_cache_alias(name, size, align, flags, ctor); |
204 | if (s) | 223 | if (s) |
205 | goto out_unlock; | 224 | goto out_unlock; |
206 | 225 | ||
207 | err = -ENOMEM; | 226 | cache_name = kstrdup(name, GFP_KERNEL); |
208 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); | 227 | if (!cache_name) { |
209 | if (!s) | 228 | err = -ENOMEM; |
210 | goto out_unlock; | 229 | goto out_unlock; |
230 | } | ||
211 | 231 | ||
212 | s->object_size = s->size = size; | 232 | s = do_kmem_cache_create(cache_name, size, size, |
213 | s->align = calculate_alignment(flags, align, size); | 233 | calculate_alignment(flags, align, size), |
214 | s->ctor = ctor; | 234 | flags, ctor, NULL, NULL); |
215 | 235 | if (IS_ERR(s)) { | |
216 | s->name = kstrdup(name, GFP_KERNEL); | 236 | err = PTR_ERR(s); |
217 | if (!s->name) | 237 | kfree(cache_name); |
218 | goto out_free_cache; | 238 | } |
219 | |||
220 | err = memcg_alloc_cache_params(memcg, s, parent_cache); | ||
221 | if (err) | ||
222 | goto out_free_cache; | ||
223 | |||
224 | err = __kmem_cache_create(s, flags); | ||
225 | if (err) | ||
226 | goto out_free_cache; | ||
227 | |||
228 | s->refcount = 1; | ||
229 | list_add(&s->list, &slab_caches); | ||
230 | memcg_register_cache(s); | ||
231 | 239 | ||
232 | out_unlock: | 240 | out_unlock: |
233 | mutex_unlock(&slab_mutex); | 241 | mutex_unlock(&slab_mutex); |
234 | put_online_cpus(); | 242 | put_online_cpus(); |
235 | 243 | ||
236 | if (err) { | 244 | if (err) { |
237 | /* | ||
238 | * There is no point in flooding logs with warnings or | ||
239 | * especially crashing the system if we fail to create a cache | ||
240 | * for a memcg. In this case we will be accounting the memcg | ||
241 | * allocation to the root cgroup until we succeed to create its | ||
242 | * own cache, but it isn't that critical. | ||
243 | */ | ||
244 | if (!memcg) | ||
245 | return NULL; | ||
246 | |||
247 | if (flags & SLAB_PANIC) | 245 | if (flags & SLAB_PANIC) |
248 | panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", | 246 | panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", |
249 | name, err); | 247 | name, err); |
@@ -255,52 +253,112 @@ out_unlock: | |||
255 | return NULL; | 253 | return NULL; |
256 | } | 254 | } |
257 | return s; | 255 | return s; |
256 | } | ||
257 | EXPORT_SYMBOL(kmem_cache_create); | ||
258 | 258 | ||
259 | out_free_cache: | 259 | #ifdef CONFIG_MEMCG_KMEM |
260 | memcg_free_cache_params(s); | 260 | /* |
261 | kfree(s->name); | 261 | * kmem_cache_create_memcg - Create a cache for a memory cgroup. |
262 | kmem_cache_free(kmem_cache, s); | 262 | * @memcg: The memory cgroup the new cache is for. |
263 | goto out_unlock; | 263 | * @root_cache: The parent of the new cache. |
264 | * | ||
265 | * This function attempts to create a kmem cache that will serve allocation | ||
266 | * requests going from @memcg to @root_cache. The new cache inherits properties | ||
267 | * from its parent. | ||
268 | */ | ||
269 | void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) | ||
270 | { | ||
271 | struct kmem_cache *s; | ||
272 | char *cache_name; | ||
273 | |||
274 | get_online_cpus(); | ||
275 | mutex_lock(&slab_mutex); | ||
276 | |||
277 | /* | ||
278 | * Since per-memcg caches are created asynchronously on first | ||
279 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
280 | * create the same cache, but only one of them may succeed. | ||
281 | */ | ||
282 | if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) | ||
283 | goto out_unlock; | ||
284 | |||
285 | cache_name = memcg_create_cache_name(memcg, root_cache); | ||
286 | if (!cache_name) | ||
287 | goto out_unlock; | ||
288 | |||
289 | s = do_kmem_cache_create(cache_name, root_cache->object_size, | ||
290 | root_cache->size, root_cache->align, | ||
291 | root_cache->flags, root_cache->ctor, | ||
292 | memcg, root_cache); | ||
293 | if (IS_ERR(s)) { | ||
294 | kfree(cache_name); | ||
295 | goto out_unlock; | ||
296 | } | ||
297 | |||
298 | s->allocflags |= __GFP_KMEMCG; | ||
299 | |||
300 | out_unlock: | ||
301 | mutex_unlock(&slab_mutex); | ||
302 | put_online_cpus(); | ||
264 | } | 303 | } |
265 | 304 | ||
266 | struct kmem_cache * | 305 | static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) |
267 | kmem_cache_create(const char *name, size_t size, size_t align, | ||
268 | unsigned long flags, void (*ctor)(void *)) | ||
269 | { | 306 | { |
270 | return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); | 307 | int rc; |
308 | |||
309 | if (!s->memcg_params || | ||
310 | !s->memcg_params->is_root_cache) | ||
311 | return 0; | ||
312 | |||
313 | mutex_unlock(&slab_mutex); | ||
314 | rc = __kmem_cache_destroy_memcg_children(s); | ||
315 | mutex_lock(&slab_mutex); | ||
316 | |||
317 | return rc; | ||
271 | } | 318 | } |
272 | EXPORT_SYMBOL(kmem_cache_create); | 319 | #else |
320 | static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | ||
321 | { | ||
322 | return 0; | ||
323 | } | ||
324 | #endif /* CONFIG_MEMCG_KMEM */ | ||
273 | 325 | ||
274 | void kmem_cache_destroy(struct kmem_cache *s) | 326 | void kmem_cache_destroy(struct kmem_cache *s) |
275 | { | 327 | { |
276 | /* Destroy all the children caches if we aren't a memcg cache */ | ||
277 | kmem_cache_destroy_memcg_children(s); | ||
278 | |||
279 | get_online_cpus(); | 328 | get_online_cpus(); |
280 | mutex_lock(&slab_mutex); | 329 | mutex_lock(&slab_mutex); |
330 | |||
281 | s->refcount--; | 331 | s->refcount--; |
282 | if (!s->refcount) { | 332 | if (s->refcount) |
283 | list_del(&s->list); | 333 | goto out_unlock; |
284 | 334 | ||
285 | if (!__kmem_cache_shutdown(s)) { | 335 | if (kmem_cache_destroy_memcg_children(s) != 0) |
286 | memcg_unregister_cache(s); | 336 | goto out_unlock; |
287 | mutex_unlock(&slab_mutex); | 337 | |
288 | if (s->flags & SLAB_DESTROY_BY_RCU) | 338 | list_del(&s->list); |
289 | rcu_barrier(); | 339 | memcg_unregister_cache(s); |
290 | 340 | ||
291 | memcg_free_cache_params(s); | 341 | if (__kmem_cache_shutdown(s) != 0) { |
292 | kfree(s->name); | 342 | list_add(&s->list, &slab_caches); |
293 | kmem_cache_free(kmem_cache, s); | 343 | memcg_register_cache(s); |
294 | } else { | 344 | printk(KERN_ERR "kmem_cache_destroy %s: " |
295 | list_add(&s->list, &slab_caches); | 345 | "Slab cache still has objects\n", s->name); |
296 | mutex_unlock(&slab_mutex); | 346 | dump_stack(); |
297 | printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", | 347 | goto out_unlock; |
298 | s->name); | ||
299 | dump_stack(); | ||
300 | } | ||
301 | } else { | ||
302 | mutex_unlock(&slab_mutex); | ||
303 | } | 348 | } |
349 | |||
350 | mutex_unlock(&slab_mutex); | ||
351 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
352 | rcu_barrier(); | ||
353 | |||
354 | memcg_free_cache_params(s); | ||
355 | kfree(s->name); | ||
356 | kmem_cache_free(kmem_cache, s); | ||
357 | goto out_put_cpus; | ||
358 | |||
359 | out_unlock: | ||
360 | mutex_unlock(&slab_mutex); | ||
361 | out_put_cpus: | ||
304 | put_online_cpus(); | 362 | put_online_cpus(); |
305 | } | 363 | } |
306 | EXPORT_SYMBOL(kmem_cache_destroy); | 364 | EXPORT_SYMBOL(kmem_cache_destroy); |
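The slab_common.c rework above funnels both kmem_cache_create() and the new kmem_cache_create_memcg() through do_kmem_cache_create(), with a subtle ownership rule: the caller builds the cache name string, the helper takes ownership of it on success, and the caller frees it only on failure. A standalone model of that convention, with invented names and plain malloc/strdup standing in for the kernel allocators:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct cache_model {
        char *name;
        size_t size;
    };

    /* Shared helper: on success the cache owns 'name'; on failure the caller
     * still owns it and must free it -- as in do_kmem_cache_create(). */
    static struct cache_model *do_create(char *name, size_t size)
    {
        struct cache_model *c = calloc(1, sizeof(*c));
        if (!c)
            return NULL;
        c->name = name;
        c->size = size;
        return c;
    }

    static struct cache_model *cache_create(const char *name, size_t size)
    {
        char *dup = strdup(name);
        if (!dup)
            return NULL;

        struct cache_model *c = do_create(dup, size);
        if (!c)
            free(dup);          /* mirror of kfree(cache_name) in the diff */
        return c;
    }

    int main(void)
    {
        struct cache_model *c = cache_create("demo-cache", 64);

        if (c)
            printf("created %s (%zu bytes)\n", c->name, c->size);
        free(c ? c->name : NULL);
        free(c);
        return 0;
    }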
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -224,7 +224,11 @@ static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } | |||
224 | static inline void stat(const struct kmem_cache *s, enum stat_item si) | 224 | static inline void stat(const struct kmem_cache *s, enum stat_item si) |
225 | { | 225 | { |
226 | #ifdef CONFIG_SLUB_STATS | 226 | #ifdef CONFIG_SLUB_STATS |
227 | __this_cpu_inc(s->cpu_slab->stat[si]); | 227 | /* |
228 | * The rmw is racy on a preemptible kernel but this is acceptable, so | ||
229 | * avoid this_cpu_add()'s irq-disable overhead. | ||
230 | */ | ||
231 | raw_cpu_inc(s->cpu_slab->stat[si]); | ||
228 | #endif | 232 | #endif |
229 | } | 233 | } |
230 | 234 | ||
@@ -1685,7 +1689,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1685 | 1689 | ||
1686 | do { | 1690 | do { |
1687 | cpuset_mems_cookie = read_mems_allowed_begin(); | 1691 | cpuset_mems_cookie = read_mems_allowed_begin(); |
1688 | zonelist = node_zonelist(slab_node(), flags); | 1692 | zonelist = node_zonelist(mempolicy_slab_node(), flags); |
1689 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1693 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1690 | struct kmem_cache_node *n; | 1694 | struct kmem_cache_node *n; |
1691 | 1695 | ||
@@ -3685,6 +3689,9 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
3685 | if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) | 3689 | if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) |
3686 | return 1; | 3690 | return 1; |
3687 | 3691 | ||
3692 | if (!is_root_cache(s)) | ||
3693 | return 1; | ||
3694 | |||
3688 | if (s->ctor) | 3695 | if (s->ctor) |
3689 | return 1; | 3696 | return 1; |
3690 | 3697 | ||
@@ -3697,9 +3704,8 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
3697 | return 0; | 3704 | return 0; |
3698 | } | 3705 | } |
3699 | 3706 | ||
3700 | static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, | 3707 | static struct kmem_cache *find_mergeable(size_t size, size_t align, |
3701 | size_t align, unsigned long flags, const char *name, | 3708 | unsigned long flags, const char *name, void (*ctor)(void *)) |
3702 | void (*ctor)(void *)) | ||
3703 | { | 3709 | { |
3704 | struct kmem_cache *s; | 3710 | struct kmem_cache *s; |
3705 | 3711 | ||
@@ -3722,7 +3728,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, | |||
3722 | continue; | 3728 | continue; |
3723 | 3729 | ||
3724 | if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) | 3730 | if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) |
3725 | continue; | 3731 | continue; |
3726 | /* | 3732 | /* |
3727 | * Check if alignment is compatible. | 3733 | * Check if alignment is compatible. |
3728 | * Courtesy of Adrian Drzewiecki | 3734 | * Courtesy of Adrian Drzewiecki |
@@ -3733,23 +3739,24 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, | |||
3733 | if (s->size - size >= sizeof(void *)) | 3739 | if (s->size - size >= sizeof(void *)) |
3734 | continue; | 3740 | continue; |
3735 | 3741 | ||
3736 | if (!cache_match_memcg(s, memcg)) | ||
3737 | continue; | ||
3738 | |||
3739 | return s; | 3742 | return s; |
3740 | } | 3743 | } |
3741 | return NULL; | 3744 | return NULL; |
3742 | } | 3745 | } |
3743 | 3746 | ||
3744 | struct kmem_cache * | 3747 | struct kmem_cache * |
3745 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, | 3748 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
3746 | size_t align, unsigned long flags, void (*ctor)(void *)) | 3749 | unsigned long flags, void (*ctor)(void *)) |
3747 | { | 3750 | { |
3748 | struct kmem_cache *s; | 3751 | struct kmem_cache *s; |
3749 | 3752 | ||
3750 | s = find_mergeable(memcg, size, align, flags, name, ctor); | 3753 | s = find_mergeable(size, align, flags, name, ctor); |
3751 | if (s) { | 3754 | if (s) { |
3755 | int i; | ||
3756 | struct kmem_cache *c; | ||
3757 | |||
3752 | s->refcount++; | 3758 | s->refcount++; |
3759 | |||
3753 | /* | 3760 | /* |
3754 | * Adjust the object sizes so that we clear | 3761 | * Adjust the object sizes so that we clear |
3755 | * the complete object on kzalloc. | 3762 | * the complete object on kzalloc. |
@@ -3757,6 +3764,15 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, | |||
3757 | s->object_size = max(s->object_size, (int)size); | 3764 | s->object_size = max(s->object_size, (int)size); |
3758 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3765 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
3759 | 3766 | ||
3767 | for_each_memcg_cache_index(i) { | ||
3768 | c = cache_from_memcg_idx(s, i); | ||
3769 | if (!c) | ||
3770 | continue; | ||
3771 | c->object_size = s->object_size; | ||
3772 | c->inuse = max_t(int, c->inuse, | ||
3773 | ALIGN(size, sizeof(void *))); | ||
3774 | } | ||
3775 | |||
3760 | if (sysfs_slab_alias(s, name)) { | 3776 | if (sysfs_slab_alias(s, name)) { |
3761 | s->refcount--; | 3777 | s->refcount--; |
3762 | s = NULL; | 3778 | s = NULL; |
@@ -5126,6 +5142,15 @@ static const struct kset_uevent_ops slab_uevent_ops = { | |||
5126 | 5142 | ||
5127 | static struct kset *slab_kset; | 5143 | static struct kset *slab_kset; |
5128 | 5144 | ||
5145 | static inline struct kset *cache_kset(struct kmem_cache *s) | ||
5146 | { | ||
5147 | #ifdef CONFIG_MEMCG_KMEM | ||
5148 | if (!is_root_cache(s)) | ||
5149 | return s->memcg_params->root_cache->memcg_kset; | ||
5150 | #endif | ||
5151 | return slab_kset; | ||
5152 | } | ||
5153 | |||
5129 | #define ID_STR_LENGTH 64 | 5154 | #define ID_STR_LENGTH 64 |
5130 | 5155 | ||
5131 | /* Create a unique string id for a slab cache: | 5156 | /* Create a unique string id for a slab cache: |
@@ -5191,26 +5216,39 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5191 | name = create_unique_id(s); | 5216 | name = create_unique_id(s); |
5192 | } | 5217 | } |
5193 | 5218 | ||
5194 | s->kobj.kset = slab_kset; | 5219 | s->kobj.kset = cache_kset(s); |
5195 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); | 5220 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); |
5196 | if (err) { | 5221 | if (err) |
5197 | kobject_put(&s->kobj); | 5222 | goto out_put_kobj; |
5198 | return err; | ||
5199 | } | ||
5200 | 5223 | ||
5201 | err = sysfs_create_group(&s->kobj, &slab_attr_group); | 5224 | err = sysfs_create_group(&s->kobj, &slab_attr_group); |
5202 | if (err) { | 5225 | if (err) |
5203 | kobject_del(&s->kobj); | 5226 | goto out_del_kobj; |
5204 | kobject_put(&s->kobj); | 5227 | |
5205 | return err; | 5228 | #ifdef CONFIG_MEMCG_KMEM |
5229 | if (is_root_cache(s)) { | ||
5230 | s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); | ||
5231 | if (!s->memcg_kset) { | ||
5232 | err = -ENOMEM; | ||
5233 | goto out_del_kobj; | ||
5234 | } | ||
5206 | } | 5235 | } |
5236 | #endif | ||
5237 | |||
5207 | kobject_uevent(&s->kobj, KOBJ_ADD); | 5238 | kobject_uevent(&s->kobj, KOBJ_ADD); |
5208 | if (!unmergeable) { | 5239 | if (!unmergeable) { |
5209 | /* Setup first alias */ | 5240 | /* Setup first alias */ |
5210 | sysfs_slab_alias(s, s->name); | 5241 | sysfs_slab_alias(s, s->name); |
5211 | kfree(name); | ||
5212 | } | 5242 | } |
5213 | return 0; | 5243 | out: |
5244 | if (!unmergeable) | ||
5245 | kfree(name); | ||
5246 | return err; | ||
5247 | out_del_kobj: | ||
5248 | kobject_del(&s->kobj); | ||
5249 | out_put_kobj: | ||
5250 | kobject_put(&s->kobj); | ||
5251 | goto out; | ||
5214 | } | 5252 | } |
5215 | 5253 | ||
5216 | static void sysfs_slab_remove(struct kmem_cache *s) | 5254 | static void sysfs_slab_remove(struct kmem_cache *s) |
@@ -5222,6 +5260,9 @@ static void sysfs_slab_remove(struct kmem_cache *s) | |||
5222 | */ | 5260 | */ |
5223 | return; | 5261 | return; |
5224 | 5262 | ||
5263 | #ifdef CONFIG_MEMCG_KMEM | ||
5264 | kset_unregister(s->memcg_kset); | ||
5265 | #endif | ||
5225 | kobject_uevent(&s->kobj, KOBJ_REMOVE); | 5266 | kobject_uevent(&s->kobj, KOBJ_REMOVE); |
5226 | kobject_del(&s->kobj); | 5267 | kobject_del(&s->kobj); |
5227 | kobject_put(&s->kobj); | 5268 | kobject_put(&s->kobj); |
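The sysfs_slab_add() hunk above also converts the early-return error handling into a goto ladder (out_del_kobj / out_put_kobj) and, for root caches, adds a per-cache "cgroup" kset. The standalone sketch below models only the goto-unwind shape, with mallocs standing in for the kobject and sysfs registrations; nothing in it is kernel code.

    #include <stdio.h>
    #include <stdlib.h>

    static char *kobj, *attrs;

    /* Each step that fails jumps to a label that undoes exactly the steps
     * already completed; success falls through and keeps everything. */
    static int register_cache_files(int fail_step)
    {
        int err = 0;

        kobj = malloc(16);                  /* like kobject_init_and_add() */
        if (!kobj || fail_step == 1) {
            err = -1;
            goto out_put_kobj;
        }

        attrs = malloc(16);                 /* like sysfs_create_group() */
        if (!attrs || fail_step == 2) {
            err = -1;
            goto out_del_kobj;
        }

        return 0;                           /* success: keep both registered */

    out_del_kobj:
        free(attrs);
        attrs = NULL;
    out_put_kobj:
        free(kobj);
        kobj = NULL;
        return err;
    }

    int main(void)
    {
        if (register_cache_files(0) == 0)
            printf("registered\n");
        free(attrs);
        free(kobj);
        return 0;
    }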
diff --git a/mm/sparse.c b/mm/sparse.c index 38cad8fd7397..d1b48b691ac8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -5,10 +5,12 @@ | |||
5 | #include <linux/slab.h> | 5 | #include <linux/slab.h> |
6 | #include <linux/mmzone.h> | 6 | #include <linux/mmzone.h> |
7 | #include <linux/bootmem.h> | 7 | #include <linux/bootmem.h> |
8 | #include <linux/compiler.h> | ||
8 | #include <linux/highmem.h> | 9 | #include <linux/highmem.h> |
9 | #include <linux/export.h> | 10 | #include <linux/export.h> |
10 | #include <linux/spinlock.h> | 11 | #include <linux/spinlock.h> |
11 | #include <linux/vmalloc.h> | 12 | #include <linux/vmalloc.h> |
13 | |||
12 | #include "internal.h" | 14 | #include "internal.h" |
13 | #include <asm/dma.h> | 15 | #include <asm/dma.h> |
14 | #include <asm/pgalloc.h> | 16 | #include <asm/pgalloc.h> |
@@ -461,7 +463,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
461 | } | 463 | } |
462 | #endif | 464 | #endif |
463 | 465 | ||
464 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) | 466 | void __weak __meminit vmemmap_populate_print_last(void) |
465 | { | 467 | { |
466 | } | 468 | } |
467 | 469 | ||
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -1,6 +1,7 @@ | |||
1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
2 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
3 | #include <linux/string.h> | 3 | #include <linux/string.h> |
4 | #include <linux/compiler.h> | ||
4 | #include <linux/export.h> | 5 | #include <linux/export.h> |
5 | #include <linux/err.h> | 6 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
@@ -307,7 +308,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) | |||
307 | * If the architecture not support this function, simply return with no | 308 | * If the architecture not support this function, simply return with no |
308 | * page pinned | 309 | * page pinned |
309 | */ | 310 | */ |
310 | int __attribute__((weak)) __get_user_pages_fast(unsigned long start, | 311 | int __weak __get_user_pages_fast(unsigned long start, |
311 | int nr_pages, int write, struct page **pages) | 312 | int nr_pages, int write, struct page **pages) |
312 | { | 313 | { |
313 | return 0; | 314 | return 0; |
@@ -338,7 +339,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); | |||
338 | * callers need to carefully consider what to use. On many architectures, | 339 | * callers need to carefully consider what to use. On many architectures, |
339 | * get_user_pages_fast simply falls back to get_user_pages. | 340 | * get_user_pages_fast simply falls back to get_user_pages. |
340 | */ | 341 | */ |
341 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | 342 | int __weak get_user_pages_fast(unsigned long start, |
342 | int nr_pages, int write, struct page **pages) | 343 | int nr_pages, int write, struct page **pages) |
343 | { | 344 | { |
344 | struct mm_struct *mm = current->mm; | 345 | struct mm_struct *mm = current->mm; |
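The mm/util.c (and sparse.c/vmalloc.c) hunks are a mechanical switch from __attribute__((weak)) to the __weak shorthand. The standalone example below shows the behaviour the attribute provides: the weak definition is the default, and a strong definition of the same symbol in another object file replaces it at link time. The function name here is invented.

    #include <stdio.h>

    /* Weak default: an architecture (another object file) could provide a
     * strong definition of the same symbol and win at link time. */
    int __attribute__((weak)) gup_fast_stub(int nr_pages)
    {
        (void)nr_pages;
        return 0;               /* "no pages pinned" default, as in mm/util.c */
    }

    int main(void)
    {
        /* With no strong override linked in, the weak default runs. */
        printf("pinned %d pages\n", gup_fast_stub(4));
        return 0;
    }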
diff --git a/mm/vmacache.c b/mm/vmacache.c new file mode 100644 index 000000000000..d4224b397c0e --- /dev/null +++ b/mm/vmacache.c | |||
@@ -0,0 +1,112 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2014 Davidlohr Bueso. | ||
3 | */ | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/mm.h> | ||
6 | #include <linux/vmacache.h> | ||
7 | |||
8 | /* | ||
9 | * Flush vma caches for threads that share a given mm. | ||
10 | * | ||
11 | * The operation is safe because the caller holds the mmap_sem | ||
12 | * exclusively and other threads accessing the vma cache will | ||
13 | * have mmap_sem held at least for read, so no extra locking | ||
14 | * is required to maintain the vma cache. | ||
15 | */ | ||
16 | void vmacache_flush_all(struct mm_struct *mm) | ||
17 | { | ||
18 | struct task_struct *g, *p; | ||
19 | |||
20 | rcu_read_lock(); | ||
21 | for_each_process_thread(g, p) { | ||
22 | /* | ||
23 | * Only flush the vmacache pointers as the | ||
24 | * mm seqnum is already set and curr's will | ||
25 | * be set upon invalidation when the next | ||
26 | * lookup is done. | ||
27 | */ | ||
28 | if (mm == p->mm) | ||
29 | vmacache_flush(p); | ||
30 | } | ||
31 | rcu_read_unlock(); | ||
32 | } | ||
33 | |||
34 | /* | ||
35 | * This task may be accessing a foreign mm via (for example) | ||
36 | * get_user_pages()->find_vma(). The vmacache is task-local and this | ||
37 | * task's vmacache pertains to a different mm (ie, its own). There is | ||
38 | * nothing we can do here. | ||
39 | * | ||
40 | * Also handle the case where a kernel thread has adopted this mm via use_mm(). | ||
41 | * That kernel thread's vmacache is not applicable to this mm. | ||
42 | */ | ||
43 | static bool vmacache_valid_mm(struct mm_struct *mm) | ||
44 | { | ||
45 | return current->mm == mm && !(current->flags & PF_KTHREAD); | ||
46 | } | ||
47 | |||
48 | void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) | ||
49 | { | ||
50 | if (vmacache_valid_mm(newvma->vm_mm)) | ||
51 | current->vmacache[VMACACHE_HASH(addr)] = newvma; | ||
52 | } | ||
53 | |||
54 | static bool vmacache_valid(struct mm_struct *mm) | ||
55 | { | ||
56 | struct task_struct *curr; | ||
57 | |||
58 | if (!vmacache_valid_mm(mm)) | ||
59 | return false; | ||
60 | |||
61 | curr = current; | ||
62 | if (mm->vmacache_seqnum != curr->vmacache_seqnum) { | ||
63 | /* | ||
64 | * First attempt will always be invalid, initialize | ||
65 | * the new cache for this task here. | ||
66 | */ | ||
67 | curr->vmacache_seqnum = mm->vmacache_seqnum; | ||
68 | vmacache_flush(curr); | ||
69 | return false; | ||
70 | } | ||
71 | return true; | ||
72 | } | ||
73 | |||
74 | struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | ||
75 | { | ||
76 | int i; | ||
77 | |||
78 | if (!vmacache_valid(mm)) | ||
79 | return NULL; | ||
80 | |||
81 | for (i = 0; i < VMACACHE_SIZE; i++) { | ||
82 | struct vm_area_struct *vma = current->vmacache[i]; | ||
83 | |||
84 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) { | ||
85 | BUG_ON(vma->vm_mm != mm); | ||
86 | return vma; | ||
87 | } | ||
88 | } | ||
89 | |||
90 | return NULL; | ||
91 | } | ||
92 | |||
93 | #ifndef CONFIG_MMU | ||
94 | struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, | ||
95 | unsigned long start, | ||
96 | unsigned long end) | ||
97 | { | ||
98 | int i; | ||
99 | |||
100 | if (!vmacache_valid(mm)) | ||
101 | return NULL; | ||
102 | |||
103 | for (i = 0; i < VMACACHE_SIZE; i++) { | ||
104 | struct vm_area_struct *vma = current->vmacache[i]; | ||
105 | |||
106 | if (vma && vma->vm_start == start && vma->vm_end == end) | ||
107 | return vma; | ||
108 | } | ||
109 | |||
110 | return NULL; | ||
111 | } | ||
112 | #endif | ||
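mm/vmacache.c above supplies only the per-task cache; its callers (find_vma() and friends, changed elsewhere in this series) are not part of this section. A hedged sketch of how such a caller would presumably use it -- vma_rbtree_lookup() is an invented placeholder for the existing mm->mm_rb walk, not a real kernel function:

    /* Hedged sketch only -- not the actual mm/mmap.c change. */
    struct vm_area_struct *find_vma_sketch(struct mm_struct *mm, unsigned long addr)
    {
        struct vm_area_struct *vma;

        /* Fast path: the per-task cache added by mm/vmacache.c. */
        vma = vmacache_find(mm, addr);
        if (vma)
            return vma;

        /* Slow path: placeholder for the existing rbtree walk. */
        vma = vma_rbtree_lookup(mm, addr);
        if (vma)
            vmacache_update(addr, vma);     /* remember for the next lookup */

        return vma;
    }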
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf96803c5b..bf233b283319 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -27,7 +27,9 @@ | |||
27 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
28 | #include <linux/kmemleak.h> | 28 | #include <linux/kmemleak.h> |
29 | #include <linux/atomic.h> | 29 | #include <linux/atomic.h> |
30 | #include <linux/compiler.h> | ||
30 | #include <linux/llist.h> | 31 | #include <linux/llist.h> |
32 | |||
31 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
32 | #include <asm/tlbflush.h> | 34 | #include <asm/tlbflush.h> |
33 | #include <asm/shmparam.h> | 35 | #include <asm/shmparam.h> |
@@ -1083,6 +1085,12 @@ EXPORT_SYMBOL(vm_unmap_ram); | |||
1083 | * @node: prefer to allocate data structures on this node | 1085 | * @node: prefer to allocate data structures on this node |
1084 | * @prot: memory protection to use. PAGE_KERNEL for regular RAM | 1086 | * @prot: memory protection to use. PAGE_KERNEL for regular RAM |
1085 | * | 1087 | * |
1088 | * If you use this function for less than VMAP_MAX_ALLOC pages, it could be | ||
1089 | * faster than vmap so it's good. But if you mix long-life and short-life | ||
1090 | * objects with vm_map_ram(), it could consume lots of address space through | ||
1091 | * fragmentation (especially on a 32bit machine). You could see failures in | ||
1092 | * the end. Please use this function for short-lived objects. | ||
1093 | * | ||
1086 | * Returns: a pointer to the address that has been mapped, or %NULL on failure | 1094 | * Returns: a pointer to the address that has been mapped, or %NULL on failure |
1087 | */ | 1095 | */ |
1088 | void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) | 1096 | void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) |
@@ -2181,7 +2189,7 @@ EXPORT_SYMBOL(remap_vmalloc_range); | |||
2181 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to | 2189 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to |
2182 | * have one. | 2190 | * have one. |
2183 | */ | 2191 | */ |
2184 | void __attribute__((weak)) vmalloc_sync_all(void) | 2192 | void __weak vmalloc_sync_all(void) |
2185 | { | 2193 | { |
2186 | } | 2194 | } |
2187 | 2195 | ||
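The new vm_map_ram() kerneldoc above is usage guidance rather than a behavioural change. A hedged sketch of the intended short-lived use, with error handling trimmed and the page array assumed to be allocated by the caller (this fragment would also need the usual vmalloc/string/errno headers):

    /* Sketch of short-lived vm_map_ram() usage per the guidance above;
     * 'pages' is assumed to hold 'count' already-allocated struct page pointers. */
    static int copy_through_temp_mapping(struct page **pages, unsigned int count,
                                         const void *src, size_t len)
    {
        void *addr;

        addr = vm_map_ram(pages, count, -1 /* any node */, PAGE_KERNEL);
        if (!addr)
            return -ENOMEM;

        memcpy(addr, src, len);         /* use the mapping briefly */

        vm_unmap_ram(addr, count);      /* unmap promptly: short-lived only */
        return 0;
    }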
diff --git a/mm/vmscan.c b/mm/vmscan.c index 1f56a80a7c41..06879ead7380 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -2314,15 +2314,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2314 | unsigned long lru_pages = 0; | 2314 | unsigned long lru_pages = 0; |
2315 | bool aborted_reclaim = false; | 2315 | bool aborted_reclaim = false; |
2316 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2316 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2317 | gfp_t orig_mask; | ||
2317 | struct shrink_control shrink = { | 2318 | struct shrink_control shrink = { |
2318 | .gfp_mask = sc->gfp_mask, | 2319 | .gfp_mask = sc->gfp_mask, |
2319 | }; | 2320 | }; |
2321 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); | ||
2320 | 2322 | ||
2321 | /* | 2323 | /* |
2322 | * If the number of buffer_heads in the machine exceeds the maximum | 2324 | * If the number of buffer_heads in the machine exceeds the maximum |
2323 | * allowed level, force direct reclaim to scan the highmem zone as | 2325 | * allowed level, force direct reclaim to scan the highmem zone as |
2324 | * highmem pages could be pinning lowmem pages storing buffer_heads | 2326 | * highmem pages could be pinning lowmem pages storing buffer_heads |
2325 | */ | 2327 | */ |
2328 | orig_mask = sc->gfp_mask; | ||
2326 | if (buffer_heads_over_limit) | 2329 | if (buffer_heads_over_limit) |
2327 | sc->gfp_mask |= __GFP_HIGHMEM; | 2330 | sc->gfp_mask |= __GFP_HIGHMEM; |
2328 | 2331 | ||
@@ -2356,7 +2359,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2356 | * noticeable problem, like transparent huge | 2359 | * noticeable problem, like transparent huge |
2357 | * page allocations. | 2360 | * page allocations. |
2358 | */ | 2361 | */ |
2359 | if (compaction_ready(zone, sc)) { | 2362 | if ((zonelist_zone_idx(z) <= requested_highidx) |
2363 | && compaction_ready(zone, sc)) { | ||
2360 | aborted_reclaim = true; | 2364 | aborted_reclaim = true; |
2361 | continue; | 2365 | continue; |
2362 | } | 2366 | } |
@@ -2393,6 +2397,12 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2393 | } | 2397 | } |
2394 | } | 2398 | } |
2395 | 2399 | ||
2400 | /* | ||
2401 | * Restore to original mask to avoid the impact on the caller if we | ||
2402 | * promoted it to __GFP_HIGHMEM. | ||
2403 | */ | ||
2404 | sc->gfp_mask = orig_mask; | ||
2405 | |||
2396 | return aborted_reclaim; | 2406 | return aborted_reclaim; |
2397 | } | 2407 | } |
2398 | 2408 | ||
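The shrink_zones() hunks above save the caller's gfp_mask before possibly promoting it with __GFP_HIGHMEM and restore it on the way out. The tiny standalone example below shows the same save/modify/restore idiom with an invented flag:

    #include <stdio.h>

    #define SCAN_HIGHMEM 0x8    /* invented flag, standing in for __GFP_HIGHMEM */

    static unsigned int do_scan(unsigned int mask)
    {
        return (mask & SCAN_HIGHMEM) ? 2 : 1;   /* pretend highmem widens the scan */
    }

    /* Widen the mask for this pass only, then restore it so the caller's view
     * of the mask is unchanged -- the role orig_mask plays above. */
    static unsigned int shrink_pass(unsigned int *mask)
    {
        unsigned int orig = *mask;
        unsigned int scanned;

        *mask |= SCAN_HIGHMEM;
        scanned = do_scan(*mask);
        *mask = orig;
        return scanned;
    }

    int main(void)
    {
        unsigned int mask = 0x1;
        unsigned int scanned = shrink_pass(&mask);

        printf("scanned %u zone(s), mask still %#x\n", scanned, mask);
        return 0;
    }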
diff --git a/mm/zswap.c b/mm/zswap.c index d7337fbf6605..aeaef0fb5624 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -89,6 +89,9 @@ static unsigned int zswap_max_pool_percent = 20; | |||
89 | module_param_named(max_pool_percent, | 89 | module_param_named(max_pool_percent, |
90 | zswap_max_pool_percent, uint, 0644); | 90 | zswap_max_pool_percent, uint, 0644); |
91 | 91 | ||
92 | /* zbud_pool is shared by all of zswap backend */ | ||
93 | static struct zbud_pool *zswap_pool; | ||
94 | |||
92 | /********************************* | 95 | /********************************* |
93 | * compression functions | 96 | * compression functions |
94 | **********************************/ | 97 | **********************************/ |
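The zswap hunks in this file replace the per-tree zbud pool with the single module-level zswap_pool declared above, so helpers such as zswap_free_entry (changed further down) no longer need a tree argument to find the pool. A compilable sketch of that ownership change; the demo_* names are stand-ins, not the real zbud API:

/* One module-level pool shared by every swap type, instead of a pool
 * pointer inside each tree; the free helper then needs only the entry. */
struct demo_pool { unsigned long freed; };
struct demo_entry { unsigned long handle; };

static struct demo_pool demo_pool_instance;
static struct demo_pool *demo_shared_pool = &demo_pool_instance;

static void demo_pool_free(struct demo_pool *pool, unsigned long handle)
{
	(void)handle;
	pool->freed++;			/* stand-in for zbud_free() */
}

static void demo_free_entry(struct demo_entry *entry)
{
	/* Previously: demo_pool_free(tree->pool, entry->handle); */
	demo_pool_free(demo_shared_pool, entry->handle);
}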
@@ -160,14 +163,14 @@ static void zswap_comp_exit(void) | |||
160 | * rbnode - links the entry into red-black tree for the appropriate swap type | 163 | * rbnode - links the entry into red-black tree for the appropriate swap type |
161 | * refcount - the number of outstanding reference to the entry. This is needed | 164 | * refcount - the number of outstanding reference to the entry. This is needed |
162 | * to protect against premature freeing of the entry by code | 165 | * to protect against premature freeing of the entry by code |
163 | * concurent calls to load, invalidate, and writeback. The lock | 166 | * concurrent calls to load, invalidate, and writeback. The lock |
164 | * for the zswap_tree structure that contains the entry must | 167 | * for the zswap_tree structure that contains the entry must |
165 | * be held while changing the refcount. Since the lock must | 168 | * be held while changing the refcount. Since the lock must |
166 | * be held, there is no reason to also make refcount atomic. | 169 | * be held, there is no reason to also make refcount atomic. |
167 | * offset - the swap offset for the entry. Index into the red-black tree. | 170 | * offset - the swap offset for the entry. Index into the red-black tree. |
168 | * handle - zsmalloc allocation handle that stores the compressed page data | 171 | * handle - zbud allocation handle that stores the compressed page data |
169 | * length - the length in bytes of the compressed page data. Needed during | 172 | * length - the length in bytes of the compressed page data. Needed during |
170 | * decompression | 173 | * decompression |
171 | */ | 174 | */ |
172 | struct zswap_entry { | 175 | struct zswap_entry { |
173 | struct rb_node rbnode; | 176 | struct rb_node rbnode; |
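The comment block above documents why refcount is a plain int: every change happens under the tree's spinlock. A small userspace sketch of that rule, with a pthread mutex standing in for the spinlock; in the real helpers the caller already holds the lock, which this sketch folds into the get/put functions for brevity (all demo_* names are hypothetical):

#include <pthread.h>
#include <stdlib.h>

struct demo_entry {
	int refcount;			/* plain int: only touched under the lock */
};

struct demo_tree {
	pthread_mutex_t lock;		/* guards the tree and every refcount */
};

static void demo_entry_get(struct demo_tree *tree, struct demo_entry *e)
{
	pthread_mutex_lock(&tree->lock);
	e->refcount++;
	pthread_mutex_unlock(&tree->lock);
}

static void demo_entry_put(struct demo_tree *tree, struct demo_entry *e)
{
	pthread_mutex_lock(&tree->lock);
	if (--e->refcount == 0)
		free(e);		/* last reference: entry can be freed */
	pthread_mutex_unlock(&tree->lock);
}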
@@ -189,7 +192,6 @@ struct zswap_header { | |||
189 | struct zswap_tree { | 192 | struct zswap_tree { |
190 | struct rb_root rbroot; | 193 | struct rb_root rbroot; |
191 | spinlock_t lock; | 194 | spinlock_t lock; |
192 | struct zbud_pool *pool; | ||
193 | }; | 195 | }; |
194 | 196 | ||
195 | static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; | 197 | static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; |
@@ -202,7 +204,7 @@ static struct kmem_cache *zswap_entry_cache; | |||
202 | static int zswap_entry_cache_create(void) | 204 | static int zswap_entry_cache_create(void) |
203 | { | 205 | { |
204 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); | 206 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); |
205 | return (zswap_entry_cache == NULL); | 207 | return zswap_entry_cache == NULL; |
206 | } | 208 | } |
207 | 209 | ||
208 | static void zswap_entry_cache_destory(void) | 210 | static void zswap_entry_cache_destory(void) |
@@ -282,16 +284,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) | |||
282 | } | 284 | } |
283 | 285 | ||
284 | /* | 286 | /* |
285 | * Carries out the common pattern of freeing and entry's zsmalloc allocation, | 287 | * Carries out the common pattern of freeing and entry's zbud allocation, |
286 | * freeing the entry itself, and decrementing the number of stored pages. | 288 | * freeing the entry itself, and decrementing the number of stored pages. |
287 | */ | 289 | */ |
288 | static void zswap_free_entry(struct zswap_tree *tree, | 290 | static void zswap_free_entry(struct zswap_entry *entry) |
289 | struct zswap_entry *entry) | ||
290 | { | 291 | { |
291 | zbud_free(tree->pool, entry->handle); | 292 | zbud_free(zswap_pool, entry->handle); |
292 | zswap_entry_cache_free(entry); | 293 | zswap_entry_cache_free(entry); |
293 | atomic_dec(&zswap_stored_pages); | 294 | atomic_dec(&zswap_stored_pages); |
294 | zswap_pool_pages = zbud_get_pool_size(tree->pool); | 295 | zswap_pool_pages = zbud_get_pool_size(zswap_pool); |
295 | } | 296 | } |
296 | 297 | ||
297 | /* caller must hold the tree lock */ | 298 | /* caller must hold the tree lock */ |
@@ -311,7 +312,7 @@ static void zswap_entry_put(struct zswap_tree *tree, | |||
311 | BUG_ON(refcount < 0); | 312 | BUG_ON(refcount < 0); |
312 | if (refcount == 0) { | 313 | if (refcount == 0) { |
313 | zswap_rb_erase(&tree->rbroot, entry); | 314 | zswap_rb_erase(&tree->rbroot, entry); |
314 | zswap_free_entry(tree, entry); | 315 | zswap_free_entry(entry); |
315 | } | 316 | } |
316 | } | 317 | } |
317 | 318 | ||
@@ -407,8 +408,8 @@ cleanup: | |||
407 | **********************************/ | 408 | **********************************/ |
408 | static bool zswap_is_full(void) | 409 | static bool zswap_is_full(void) |
409 | { | 410 | { |
410 | return (totalram_pages * zswap_max_pool_percent / 100 < | 411 | return totalram_pages * zswap_max_pool_percent / 100 < |
411 | zswap_pool_pages); | 412 | zswap_pool_pages; |
412 | } | 413 | } |
413 | 414 | ||
414 | /********************************* | 415 | /********************************* |
@@ -545,7 +546,6 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
545 | zbud_unmap(pool, handle); | 546 | zbud_unmap(pool, handle); |
546 | tree = zswap_trees[swp_type(swpentry)]; | 547 | tree = zswap_trees[swp_type(swpentry)]; |
547 | offset = swp_offset(swpentry); | 548 | offset = swp_offset(swpentry); |
548 | BUG_ON(pool != tree->pool); | ||
549 | 549 | ||
550 | /* find and ref zswap entry */ | 550 | /* find and ref zswap entry */ |
551 | spin_lock(&tree->lock); | 551 | spin_lock(&tree->lock); |
@@ -573,13 +573,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
573 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ | 573 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ |
574 | /* decompress */ | 574 | /* decompress */ |
575 | dlen = PAGE_SIZE; | 575 | dlen = PAGE_SIZE; |
576 | src = (u8 *)zbud_map(tree->pool, entry->handle) + | 576 | src = (u8 *)zbud_map(zswap_pool, entry->handle) + |
577 | sizeof(struct zswap_header); | 577 | sizeof(struct zswap_header); |
578 | dst = kmap_atomic(page); | 578 | dst = kmap_atomic(page); |
579 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, | 579 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, |
580 | entry->length, dst, &dlen); | 580 | entry->length, dst, &dlen); |
581 | kunmap_atomic(dst); | 581 | kunmap_atomic(dst); |
582 | zbud_unmap(tree->pool, entry->handle); | 582 | zbud_unmap(zswap_pool, entry->handle); |
583 | BUG_ON(ret); | 583 | BUG_ON(ret); |
584 | BUG_ON(dlen != PAGE_SIZE); | 584 | BUG_ON(dlen != PAGE_SIZE); |
585 | 585 | ||
@@ -652,7 +652,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
652 | /* reclaim space if needed */ | 652 | /* reclaim space if needed */ |
653 | if (zswap_is_full()) { | 653 | if (zswap_is_full()) { |
654 | zswap_pool_limit_hit++; | 654 | zswap_pool_limit_hit++; |
655 | if (zbud_reclaim_page(tree->pool, 8)) { | 655 | if (zbud_reclaim_page(zswap_pool, 8)) { |
656 | zswap_reject_reclaim_fail++; | 656 | zswap_reject_reclaim_fail++; |
657 | ret = -ENOMEM; | 657 | ret = -ENOMEM; |
658 | goto reject; | 658 | goto reject; |
@@ -679,7 +679,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
679 | 679 | ||
680 | /* store */ | 680 | /* store */ |
681 | len = dlen + sizeof(struct zswap_header); | 681 | len = dlen + sizeof(struct zswap_header); |
682 | ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN, | 682 | ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, |
683 | &handle); | 683 | &handle); |
684 | if (ret == -ENOSPC) { | 684 | if (ret == -ENOSPC) { |
685 | zswap_reject_compress_poor++; | 685 | zswap_reject_compress_poor++; |
@@ -689,11 +689,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
689 | zswap_reject_alloc_fail++; | 689 | zswap_reject_alloc_fail++; |
690 | goto freepage; | 690 | goto freepage; |
691 | } | 691 | } |
692 | zhdr = zbud_map(tree->pool, handle); | 692 | zhdr = zbud_map(zswap_pool, handle); |
693 | zhdr->swpentry = swp_entry(type, offset); | 693 | zhdr->swpentry = swp_entry(type, offset); |
694 | buf = (u8 *)(zhdr + 1); | 694 | buf = (u8 *)(zhdr + 1); |
695 | memcpy(buf, dst, dlen); | 695 | memcpy(buf, dst, dlen); |
696 | zbud_unmap(tree->pool, handle); | 696 | zbud_unmap(zswap_pool, handle); |
697 | put_cpu_var(zswap_dstmem); | 697 | put_cpu_var(zswap_dstmem); |
698 | 698 | ||
699 | /* populate entry */ | 699 | /* populate entry */ |
@@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
716 | 716 | ||
717 | /* update stats */ | 717 | /* update stats */ |
718 | atomic_inc(&zswap_stored_pages); | 718 | atomic_inc(&zswap_stored_pages); |
719 | zswap_pool_pages = zbud_get_pool_size(tree->pool); | 719 | zswap_pool_pages = zbud_get_pool_size(zswap_pool); |
720 | 720 | ||
721 | return 0; | 721 | return 0; |
722 | 722 | ||
@@ -752,13 +752,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, | |||
752 | 752 | ||
753 | /* decompress */ | 753 | /* decompress */ |
754 | dlen = PAGE_SIZE; | 754 | dlen = PAGE_SIZE; |
755 | src = (u8 *)zbud_map(tree->pool, entry->handle) + | 755 | src = (u8 *)zbud_map(zswap_pool, entry->handle) + |
756 | sizeof(struct zswap_header); | 756 | sizeof(struct zswap_header); |
757 | dst = kmap_atomic(page); | 757 | dst = kmap_atomic(page); |
758 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, | 758 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, |
759 | dst, &dlen); | 759 | dst, &dlen); |
760 | kunmap_atomic(dst); | 760 | kunmap_atomic(dst); |
761 | zbud_unmap(tree->pool, entry->handle); | 761 | zbud_unmap(zswap_pool, entry->handle); |
762 | BUG_ON(ret); | 762 | BUG_ON(ret); |
763 | 763 | ||
764 | spin_lock(&tree->lock); | 764 | spin_lock(&tree->lock); |
@@ -804,11 +804,9 @@ static void zswap_frontswap_invalidate_area(unsigned type) | |||
804 | /* walk the tree and free everything */ | 804 | /* walk the tree and free everything */ |
805 | spin_lock(&tree->lock); | 805 | spin_lock(&tree->lock); |
806 | rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) | 806 | rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) |
807 | zswap_free_entry(tree, entry); | 807 | zswap_free_entry(entry); |
808 | tree->rbroot = RB_ROOT; | 808 | tree->rbroot = RB_ROOT; |
809 | spin_unlock(&tree->lock); | 809 | spin_unlock(&tree->lock); |
810 | |||
811 | zbud_destroy_pool(tree->pool); | ||
812 | kfree(tree); | 810 | kfree(tree); |
813 | zswap_trees[type] = NULL; | 811 | zswap_trees[type] = NULL; |
814 | } | 812 | } |
@@ -822,20 +820,14 @@ static void zswap_frontswap_init(unsigned type) | |||
822 | struct zswap_tree *tree; | 820 | struct zswap_tree *tree; |
823 | 821 | ||
824 | tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); | 822 | tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); |
825 | if (!tree) | 823 | if (!tree) { |
826 | goto err; | 824 | pr_err("alloc failed, zswap disabled for swap type %d\n", type); |
827 | tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); | 825 | return; |
828 | if (!tree->pool) | 826 | } |
829 | goto freetree; | 827 | |
830 | tree->rbroot = RB_ROOT; | 828 | tree->rbroot = RB_ROOT; |
831 | spin_lock_init(&tree->lock); | 829 | spin_lock_init(&tree->lock); |
832 | zswap_trees[type] = tree; | 830 | zswap_trees[type] = tree; |
833 | return; | ||
834 | |||
835 | freetree: | ||
836 | kfree(tree); | ||
837 | err: | ||
838 | pr_err("alloc failed, zswap disabled for swap type %d\n", type); | ||
839 | } | 831 | } |
840 | 832 | ||
841 | static struct frontswap_ops zswap_frontswap_ops = { | 833 | static struct frontswap_ops zswap_frontswap_ops = { |
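With the pool created once at module init, zswap_frontswap_init() above shrinks to allocating the tree and initialising its root and lock, losing the freetree/err labels. A userspace sketch of the simplified per-swap-type setup; the demo_* names and pthread locking are stand-ins for the kernel structures:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_MAX_SWAPFILES 32

struct demo_tree {
	/* the rb-tree root would live here */
	pthread_mutex_t lock;
};

static struct demo_tree *demo_trees[DEMO_MAX_SWAPFILES];

static void demo_frontswap_init(unsigned type)
{
	struct demo_tree *tree = calloc(1, sizeof(*tree));

	if (!tree) {
		fprintf(stderr, "alloc failed, disabled for swap type %u\n", type);
		return;
	}
	pthread_mutex_init(&tree->lock, NULL);
	demo_trees[type] = tree;
}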
@@ -907,9 +899,16 @@ static int __init init_zswap(void) | |||
907 | return 0; | 899 | return 0; |
908 | 900 | ||
909 | pr_info("loading zswap\n"); | 901 | pr_info("loading zswap\n"); |
902 | |||
903 | zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); | ||
904 | if (!zswap_pool) { | ||
905 | pr_err("zbud pool creation failed\n"); | ||
906 | goto error; | ||
907 | } | ||
908 | |||
910 | if (zswap_entry_cache_create()) { | 909 | if (zswap_entry_cache_create()) { |
911 | pr_err("entry cache creation failed\n"); | 910 | pr_err("entry cache creation failed\n"); |
912 | goto error; | 911 | goto cachefail; |
913 | } | 912 | } |
914 | if (zswap_comp_init()) { | 913 | if (zswap_comp_init()) { |
915 | pr_err("compressor initialization failed\n"); | 914 | pr_err("compressor initialization failed\n"); |
@@ -919,6 +918,7 @@ static int __init init_zswap(void) | |||
919 | pr_err("per-cpu initialization failed\n"); | 918 | pr_err("per-cpu initialization failed\n"); |
920 | goto pcpufail; | 919 | goto pcpufail; |
921 | } | 920 | } |
921 | |||
922 | frontswap_register_ops(&zswap_frontswap_ops); | 922 | frontswap_register_ops(&zswap_frontswap_ops); |
923 | if (zswap_debugfs_init()) | 923 | if (zswap_debugfs_init()) |
924 | pr_warn("debugfs initialization failed\n"); | 924 | pr_warn("debugfs initialization failed\n"); |
@@ -927,6 +927,8 @@ pcpufail: | |||
927 | zswap_comp_exit(); | 927 | zswap_comp_exit(); |
928 | compfail: | 928 | compfail: |
929 | zswap_entry_cache_destory(); | 929 | zswap_entry_cache_destory(); |
930 | cachefail: | ||
931 | zbud_destroy_pool(zswap_pool); | ||
930 | error: | 932 | error: |
931 | return -ENOMEM; | 933 | return -ENOMEM; |
932 | } | 934 | } |
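init_zswap() now creates the shared pool first and unwinds in strict reverse order on failure, which is why entry-cache failure jumps to the new cachefail label that destroys the pool. A standalone sketch of that goto-unwind ordering; the demo_* step functions are placeholders, not kernel calls:

/* Each setup step gets a label that tears down everything created before
 * it, so a new first step (the shared pool) adds a new last label. */
static int demo_pool_create(void)    { return 0; }
static void demo_pool_destroy(void)  { }
static int demo_cache_create(void)   { return 0; }
static void demo_cache_destroy(void) { }
static int demo_comp_init(void)      { return 0; }
static void demo_comp_exit(void)     { }
static int demo_percpu_init(void)    { return 0; }

static int demo_init(void)
{
	if (demo_pool_create())
		goto error;
	if (demo_cache_create())
		goto cachefail;
	if (demo_comp_init())
		goto compfail;
	if (demo_percpu_init())
		goto pcpufail;
	return 0;

pcpufail:
	demo_comp_exit();
compfail:
	demo_cache_destroy();
cachefail:
	demo_pool_destroy();
error:
	return -1;
}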