author		Linus Torvalds <torvalds@linux-foundation.org>	2014-12-13 16:00:36 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-13 16:00:36 -0500
commit		78a45c6f067824cf5d0a9fedea7339ac2e28603c (patch)
tree		b4f78c8b6b9059ddace0a18c11629b8d2045f793 /mm
parent		f96fe225677b3efb74346ebd56fafe3997b02afa (diff)
parent		29d293b6007b91a4463f05bc8d0b26e0e65c5816 (diff)
Merge branch 'akpm' (second patch-bomb from Andrew)
Merge second patchbomb from Andrew Morton:
 - the rest of MM
 - misc fs fixes
 - add execveat() syscall
 - new ratelimit feature for fault-injection
 - decompressor updates
 - ipc/ updates
 - fallocate feature creep
 - fsnotify cleanups
 - a few other misc things

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (99 commits)
  cgroups: Documentation: fix trivial typos and wrong paragraph numberings
  parisc: percpu: update comments referring to __get_cpu_var
  percpu: update local_ops.txt to reflect this_cpu operations
  percpu: remove __get_cpu_var and __raw_get_cpu_var macros
  fsnotify: remove destroy_list from fsnotify_mark
  fsnotify: unify inode and mount marks handling
  fallocate: create FAN_MODIFY and IN_MODIFY events
  mm/cma: make kmemleak ignore CMA regions
  slub: fix cpuset check in get_any_partial
  slab: fix cpuset check in fallback_alloc
  shmdt: use i_size_read() instead of ->i_size
  ipc/shm.c: fix overly aggressive shmdt() when calls span multiple segments
  ipc/msg: increase MSGMNI, remove scaling
  ipc/sem.c: increase SEMMSL, SEMMNI, SEMOPM
  ipc/sem.c: change memory barrier in sem_lock() to smp_rmb()
  lib/decompress.c: consistency of compress formats for kernel image
  decompress_bunzip2: off by one in get_next_block()
  usr/Kconfig: make initrd compression algorithm selection not expert
  fault-inject: add ratelimit option
  ratelimit: add initialization macro
  ...
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig.debug	10
-rw-r--r--	mm/Makefile	2
-rw-r--r--	mm/cma.c	25
-rw-r--r--	mm/debug-pagealloc.c	45
-rw-r--r--	mm/fadvise.c	6
-rw-r--r--	mm/filemap.c	10
-rw-r--r--	mm/filemap_xip.c	23
-rw-r--r--	mm/fremap.c	4
-rw-r--r--	mm/hugetlb.c	26
-rw-r--r--	mm/memblock.c	43
-rw-r--r--	mm/memcontrol.c	180
-rw-r--r--	mm/memory-failure.c	15
-rw-r--r--	mm/memory.c	9
-rw-r--r--	mm/migrate.c	28
-rw-r--r--	mm/mincore.c	7
-rw-r--r--	mm/mmap.c	24
-rw-r--r--	mm/mremap.c	6
-rw-r--r--	mm/nommu.c	50
-rw-r--r--	mm/oom_kill.c	15
-rw-r--r--	mm/page_alloc.c	137
-rw-r--r--	mm/page_ext.c	403
-rw-r--r--	mm/page_owner.c	311
-rw-r--r--	mm/rmap.c	18
-rw-r--r--	mm/slab.c	4
-rw-r--r--	mm/slub.c	17
-rw-r--r--	mm/vmacache.c	2
-rw-r--r--	mm/vmalloc.c	4
-rw-r--r--	mm/vmscan.c	216
-rw-r--r--	mm/vmstat.c	102
-rw-r--r--	mm/zbud.c	2
-rw-r--r--	mm/zsmalloc.c	180
-rw-r--r--	mm/zswap.c	9
32 files changed, 1409 insertions, 524 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 4b2443254de2..56badfc4810a 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,8 +1,18 @@
+config PAGE_EXTENSION
+	bool "Extend memmap on extra space for more information on page"
+	---help---
+	  Extend memmap on extra space for more information on page. This
+	  could be used for debugging features that need to insert extra
+	  field for every page. This extension enables us to save memory
+	  by not allocating this extra memory according to boottime
+	  configuration.
+
 config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL
 	depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
 	depends on !KMEMCHECK
+	select PAGE_EXTENSION
 	select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	---help---
diff --git a/mm/Makefile b/mm/Makefile
index b3c6ce932c64..4bf586e66378 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZPOOL) += zpool.o
@@ -71,3 +72,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
diff --git a/mm/cma.c b/mm/cma.c
index 8e9ec13d31db..f8917629cbdd 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -33,6 +33,7 @@
 #include <linux/log2.h>
 #include <linux/cma.h>
 #include <linux/highmem.h>
+#include <linux/io.h>
 
 struct cma {
 	unsigned long	base_pfn;
@@ -63,6 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
 	return (1UL << (align_order - cma->order_per_bit)) - 1;
 }
 
+static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
+{
+	unsigned int alignment;
+
+	if (align_order <= cma->order_per_bit)
+		return 0;
+	alignment = 1UL << (align_order - cma->order_per_bit);
+	return ALIGN(cma->base_pfn, alignment) -
+		(cma->base_pfn >> cma->order_per_bit);
+}
+
 static unsigned long cma_bitmap_maxno(struct cma *cma)
 {
 	return cma->count >> cma->order_per_bit;
@@ -313,6 +325,11 @@ int __init cma_declare_contiguous(phys_addr_t base,
 			}
 		}
 
+		/*
+		 * kmemleak scans/reads tracked objects for pointers to other
+		 * objects but this address isn't mapped and accessible
+		 */
+		kmemleak_ignore(phys_to_virt(addr));
 		base = addr;
 	}
 
@@ -340,7 +357,7 @@ err:
  */
 struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
 {
-	unsigned long mask, pfn, start = 0;
+	unsigned long mask, offset, pfn, start = 0;
 	unsigned long bitmap_maxno, bitmap_no, bitmap_count;
 	struct page *page = NULL;
 	int ret;
@@ -355,13 +372,15 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
 		return NULL;
 
 	mask = cma_bitmap_aligned_mask(cma, align);
+	offset = cma_bitmap_aligned_offset(cma, align);
 	bitmap_maxno = cma_bitmap_maxno(cma);
 	bitmap_count = cma_bitmap_pages_to_bits(cma, count);
 
 	for (;;) {
 		mutex_lock(&cma->lock);
-		bitmap_no = bitmap_find_next_zero_area(cma->bitmap,
-				bitmap_maxno, start, bitmap_count, mask);
+		bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
+				bitmap_maxno, start, bitmap_count, mask,
+				offset);
 		if (bitmap_no >= bitmap_maxno) {
 			mutex_unlock(&cma->lock);
 			break;
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index 789ff70c8a4a..5bf5906ce13b 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -2,23 +2,55 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
-#include <linux/page-debug-flags.h>
+#include <linux/page_ext.h>
 #include <linux/poison.h>
 #include <linux/ratelimit.h>
 
+static bool page_poisoning_enabled __read_mostly;
+
+static bool need_page_poisoning(void)
+{
+	if (!debug_pagealloc_enabled())
+		return false;
+
+	return true;
+}
+
+static void init_page_poisoning(void)
+{
+	if (!debug_pagealloc_enabled())
+		return;
+
+	page_poisoning_enabled = true;
+}
+
+struct page_ext_operations page_poisoning_ops = {
+	.need = need_page_poisoning,
+	.init = init_page_poisoning,
+};
+
 static inline void set_page_poison(struct page *page)
 {
-	__set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+	struct page_ext *page_ext;
+
+	page_ext = lookup_page_ext(page);
+	__set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
 static inline void clear_page_poison(struct page *page)
 {
-	__clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+	struct page_ext *page_ext;
+
+	page_ext = lookup_page_ext(page);
+	__clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
 static inline bool page_poison(struct page *page)
 {
-	return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+	struct page_ext *page_ext;
+
+	page_ext = lookup_page_ext(page);
+	return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
 static void poison_page(struct page *page)
@@ -93,8 +125,11 @@ static void unpoison_pages(struct page *page, int n)
 		unpoison_page(page + i);
 }
 
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
+	if (!page_poisoning_enabled)
+		return;
+
 	if (enable)
 		unpoison_pages(page, numpages);
 	else
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81db45e..2ad7adf4f0a4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -117,7 +117,11 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		__filemap_fdatawrite_range(mapping, offset, endbyte,
 					   WB_SYNC_NONE);
 
-		/* First and last FULL page! */
+		/*
+		 * First and last FULL page! Partial pages are deliberately
+		 * preserved on the expectation that it is better to preserve
+		 * needed memory than to discard unneeded memory.
+		 */
 		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
 		end_index = (endbyte >> PAGE_CACHE_SHIFT);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 14b4642279f1..e8905bc3cbd7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -62,16 +62,16 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_mutex		(truncate_pagecache)
+ *  ->i_mmap_rwsem		(truncate_pagecache)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_mutex		(truncate->unmap_mapping_range)
+ *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_mutex
+ *    ->i_mmap_rwsem
  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
@@ -85,7 +85,7 @@
  *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
- *  ->i_mmap_mutex
+ *  ->i_mmap_rwsem
  *    ->anon_vma.lock		(vma_adjust)
  *
  *  ->anon_vma.lock
@@ -105,7 +105,7 @@
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3f685c..0d105aeff82f 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 EXPORT_SYMBOL_GPL(xip_file_read);
 
 /*
- * __xip_unmap is invoked from xip_unmap and
- * xip_write
+ * __xip_unmap is invoked from xip_unmap and xip_write
  *
  * This function walks all vmas of the address_space and unmaps the
  * __xip_sparse_page when found at pgoff.
  */
-static void
-__xip_unmap (struct address_space * mapping,
-		     unsigned long pgoff)
+static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
 {
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
-	unsigned long address;
-	pte_t *pte;
-	pte_t pteval;
-	spinlock_t *ptl;
 	struct page *page;
 	unsigned count;
 	int locked = 0;
@@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping,
 		return;
 
 retry:
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-		mm = vma->vm_mm;
-		address = vma->vm_start +
+		pte_t *pte, pteval;
+		spinlock_t *ptl;
+		struct mm_struct *mm = vma->vm_mm;
+		unsigned long address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 		pte = page_check_address(page, mm, address, &ptl, 1);
 		if (pte) {
@@ -202,7 +197,7 @@ retry:
 			page_cache_release(page);
 		}
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_read(mapping);
 
 	if (locked) {
 		mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index 72b8fa361433..11ef7ec40d13 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -238,13 +238,13 @@ get_write_lock:
 			}
 			goto out_freed;
 		}
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
 		vma_interval_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 	}
 
 	if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 919b86a2164d..47f6070d7c46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1457,7 +1457,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 		return 0;
 
 found:
-	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+	BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
 	/* Put them into a private list first because mem_map is not up yet */
 	list_add(&m->list, &huge_boot_pages);
 	m->hstate = h;
@@ -2083,7 +2083,7 @@ static void hugetlb_register_node(struct node *node)
  * devices of nodes that have memory.  All on-line nodes should have
  * registered their associated device by this time.
  */
-static void hugetlb_register_all_nodes(void)
+static void __init hugetlb_register_all_nodes(void)
 {
 	int nid;
 
@@ -2726,9 +2726,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 	 * on its way out.  We're lucky that the flag has such an appropriate
 	 * name, and can in fact be safely cleared here. We could clear it
 	 * before the __unmap_hugepage_range above, but all that's necessary
-	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * is to clear it before releasing the i_mmap_rwsem. This works
 	 * because in the context this is called, the VMA is about to be
-	 * destroyed and the i_mmap_mutex is held.
+	 * destroyed and the i_mmap_rwsem is held.
 	 */
 	vma->vm_flags &= ~VM_MAYSHARE;
 }
@@ -2774,7 +2774,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * this mapping should be shared between all the VMAs,
 	 * __unmap_hugepage_range() is called as the lock is already held
 	 */
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
 		/* Do not unmap the current VMA */
 		if (iter_vma == vma)
@@ -2791,7 +2791,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 			unmap_hugepage_range(iter_vma, address,
 					     address + huge_page_size(h), page);
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 }
 
 /*
@@ -3348,7 +3348,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, address, end);
 
 	mmu_notifier_invalidate_range_start(mm, start, end);
-	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (; address < end; address += huge_page_size(h)) {
 		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address);
@@ -3370,13 +3370,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	/*
-	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
 	 * may have cleared our pud entry and done put_page on the page table:
-	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * once we release i_mmap_rwsem, another task can do the final put_page
 	 * and that page table be reused and filled with junk.
 	 */
 	flush_tlb_range(vma, start, end);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 
 	return pages << h->order;
@@ -3525,7 +3525,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
  * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
  * bad pmd for sharing.
  */
@@ -3544,7 +3544,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -3572,7 +3572,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 	return pte;
 }
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 6ecb0d937fb5..252b77bdf65e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -715,16 +715,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 }
 
 /**
- * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
- * @base: the base phys addr of the region
- * @size: the size of the region
  *
- * This function isolates region [@base, @base + @size), and mark it with flag
- * MEMBLOCK_HOTPLUG.
+ * This function isolates region [@base, @base + @size), and sets/clears flag
  *
  * Return 0 on succees, -errno on failure.
  */
-int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+static int __init_memblock memblock_setclr_flag(phys_addr_t base,
+				phys_addr_t size, int set, int flag)
 {
 	struct memblock_type *type = &memblock.memory;
 	int i, ret, start_rgn, end_rgn;
@@ -734,37 +731,37 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
 		return ret;
 
 	for (i = start_rgn; i < end_rgn; i++)
-		memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
+		if (set)
+			memblock_set_region_flags(&type->regions[i], flag);
+		else
+			memblock_clear_region_flags(&type->regions[i], flag);
 
 	memblock_merge_regions(type);
 	return 0;
 }
 
 /**
- * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * This function isolates region [@base, @base + @size), and clear flag
- * MEMBLOCK_HOTPLUG for the isolated regions.
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+	return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
  *
  * Return 0 on succees, -errno on failure.
  */
 int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
 {
-	struct memblock_type *type = &memblock.memory;
-	int i, ret, start_rgn, end_rgn;
-
-	ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
-	if (ret)
-		return ret;
-
-	for (i = start_rgn; i < end_rgn; i++)
-		memblock_clear_region_flags(&type->regions[i],
-					    MEMBLOCK_HOTPLUG);
-
-	memblock_merge_regions(type);
-	return 0;
+	return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
 }
 
 /**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 85df503ec023..ef91e856c7e4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -296,7 +296,6 @@ struct mem_cgroup {
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
-	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
 
 	bool	oom_lock;
 	atomic_t	under_oom;
@@ -366,22 +365,11 @@ struct mem_cgroup {
 	/* WARNING: nodeinfo must be the last member here */
 };
 
-/* internal only representation about the status of kmem accounting. */
-enum {
-	KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
-};
-
 #ifdef CONFIG_MEMCG_KMEM
-static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
-{
-	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
-}
-
 static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
-	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+	return memcg->kmemcg_id >= 0;
 }
-
 #endif
 
 /* Stuffs for move charges at task migration. */
@@ -1571,7 +1559,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * select it.  The goal is to allow it to allocate so that it may
 	 * quickly exit and free its memory.
 	 */
-	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
+	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
 		set_thread_flag(TIF_MEMDIE);
 		return;
 	}
@@ -1628,6 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			 NULL, "Memory cgroup out of memory");
 }
 
+#if MAX_NUMNODES > 1
+
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1650,7 +1640,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
-#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -2646,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
 	if (!cachep)
 		return;
 
-	css_get(&memcg->css);
 	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
 
 	/*
@@ -2680,40 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 	list_del(&cachep->memcg_params->list);
 
 	kmem_cache_destroy(cachep);
-
-	/* drop the reference taken in memcg_register_cache */
-	css_put(&memcg->css);
-}
-
-/*
- * During the creation a new cache, we need to disable our accounting mechanism
- * altogether. This is true even if we are not creating, but rather just
- * enqueing new caches to be created.
- *
- * This is because that process will trigger allocations; some visible, like
- * explicit kmallocs to auxiliary data structures, name strings and internal
- * cache structures; some well concealed, like INIT_WORK() that can allocate
- * objects during debug.
- *
- * If any allocation happens during memcg_kmem_get_cache, we will recurse back
- * to it. This may not be a bounded recursion: since the first cache creation
- * failed to complete (waiting on the allocation), we'll just try to create the
- * cache again, failing at the same point.
- *
- * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
- * memcg_kmem_skip_account. So we enclose anything that might allocate memory
- * inside the following two functions.
- */
-static inline void memcg_stop_kmem_account(void)
-{
-	VM_BUG_ON(!current->mm);
-	current->memcg_kmem_skip_account++;
-}
-
-static inline void memcg_resume_kmem_account(void)
-{
-	VM_BUG_ON(!current->mm);
-	current->memcg_kmem_skip_account--;
 }
 
 int __memcg_cleanup_cache_params(struct kmem_cache *s)
@@ -2747,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
-		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
-			memcg_unregister_cache(cachep);
+		memcg_unregister_cache(cachep);
 	}
 	mutex_unlock(&memcg_slab_mutex);
 }
@@ -2784,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
 	struct memcg_register_cache_work *cw;
 
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
-	if (cw == NULL) {
-		css_put(&memcg->css);
+	if (!cw)
 		return;
-	}
+
+	css_get(&memcg->css);
 
 	cw->memcg = memcg;
 	cw->cachep = cachep;
@@ -2810,20 +2762,16 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
 	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
 	 * the safest choice is to do it like this, wrapping the whole function.
 	 */
-	memcg_stop_kmem_account();
+	current->memcg_kmem_skip_account = 1;
 	__memcg_schedule_register_cache(memcg, cachep);
-	memcg_resume_kmem_account();
+	current->memcg_kmem_skip_account = 0;
 }
 
 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
 {
 	unsigned int nr_pages = 1 << order;
-	int res;
 
-	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-	if (!res)
-		atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
-	return res;
+	return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
 }
 
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
@@ -2831,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
 	unsigned int nr_pages = 1 << order;
 
 	memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-	atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
 }
 
 /*
@@ -2847,8 +2794,7 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
  * Can't be called in interrupt context or from kernel threads.
  * This function needs to be called with rcu_read_lock() held.
  */
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
-					  gfp_t gfp)
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
@@ -2856,25 +2802,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	VM_BUG_ON(!cachep->memcg_params);
 	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
 
-	if (!current->mm || current->memcg_kmem_skip_account)
+	if (current->memcg_kmem_skip_account)
 		return cachep;
 
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
-
+	memcg = get_mem_cgroup_from_mm(current->mm);
 	if (!memcg_kmem_is_active(memcg))
 		goto out;
 
 	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
-	if (likely(memcg_cachep)) {
-		cachep = memcg_cachep;
-		goto out;
-	}
-
-	/* The corresponding put will be done in the workqueue. */
-	if (!css_tryget_online(&memcg->css))
-		goto out;
-	rcu_read_unlock();
+	if (likely(memcg_cachep))
+		return memcg_cachep;
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2889,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	 * defer everything.
 	 */
 	memcg_schedule_register_cache(memcg, cachep);
-	return cachep;
 out:
-	rcu_read_unlock();
+	css_put(&memcg->css);
 	return cachep;
 }
 
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+	if (!is_root_cache(cachep))
+		css_put(&cachep->memcg_params->memcg->css);
+}
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll
@@ -2917,34 +2859,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 
 	*_memcg = NULL;
 
-	/*
-	 * Disabling accounting is only relevant for some specific memcg
-	 * internal allocations. Therefore we would initially not have such
-	 * check here, since direct calls to the page allocator that are
-	 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
-	 * outside memcg core. We are mostly concerned with cache allocations,
-	 * and by having this test at memcg_kmem_get_cache, we are already able
-	 * to relay the allocation to the root cache and bypass the memcg cache
-	 * altogether.
-	 *
-	 * There is one exception, though: the SLUB allocator does not create
-	 * large order caches, but rather service large kmallocs directly from
-	 * the page allocator. Therefore, the following sequence when backed by
-	 * the SLUB allocator:
-	 *
-	 *	memcg_stop_kmem_account();
-	 *	kmalloc(<large_number>)
-	 *	memcg_resume_kmem_account();
-	 *
-	 * would effectively ignore the fact that we should skip accounting,
-	 * since it will drive us directly to this function without passing
-	 * through the cache selector memcg_kmem_get_cache. Such large
-	 * allocations are extremely rare but can happen, for instance, for the
-	 * cache arrays. We bring this test here.
-	 */
-	if (!current->mm || current->memcg_kmem_skip_account)
-		return true;
-
 	memcg = get_mem_cgroup_from_mm(current->mm);
 
 	if (!memcg_kmem_is_active(memcg)) {
@@ -2985,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 	memcg_uncharge_kmem(memcg, 1 << order);
 	page->mem_cgroup = NULL;
 }
-#else
-static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
-}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3539,12 +3449,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 		return 0;
 
 	/*
-	 * We are going to allocate memory for data shared by all memory
-	 * cgroups so let's stop accounting here.
-	 */
-	memcg_stop_kmem_account();
-
-	/*
 	 * For simplicity, we won't allow this to be disabled.  It also can't
 	 * be changed if the cgroup has children already, or if tasks had
 	 * already joined.
@@ -3570,25 +3474,22 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 		goto out;
 	}
 
-	memcg->kmemcg_id = memcg_id;
-	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-
 	/*
-	 * We couldn't have accounted to this cgroup, because it hasn't got the
-	 * active bit set yet, so this should succeed.
+	 * We couldn't have accounted to this cgroup, because it hasn't got
+	 * activated yet, so this should succeed.
 	 */
 	err = page_counter_limit(&memcg->kmem, nr_pages);
 	VM_BUG_ON(err);
 
 	static_key_slow_inc(&memcg_kmem_enabled_key);
 	/*
-	 * Setting the active bit after enabling static branching will
+	 * A memory cgroup is considered kmem-active as soon as it gets
+	 * kmemcg_id. Setting the id after enabling static branching will
 	 * guarantee no one starts accounting before all call sites are
 	 * patched.
 	 */
-	memcg_kmem_set_active(memcg);
+	memcg->kmemcg_id = memcg_id;
 out:
-	memcg_resume_kmem_account();
 	return err;
 }
 
@@ -3791,11 +3692,6 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 }
 #endif /* CONFIG_NUMA */
 
-static inline void mem_cgroup_lru_names_not_uptodate(void)
-{
-	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
-}
-
 static int memcg_stat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -3803,6 +3699,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	struct mem_cgroup *mi;
 	unsigned int i;
 
+	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
+
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
@@ -4259,7 +4157,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	int ret;
 
-	memcg->kmemcg_id = -1;
 	ret = memcg_propagate_kmem(memcg);
 	if (ret)
 		return ret;
@@ -4269,6 +4166,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
+	memcg_unregister_all_caches(memcg);
 	mem_cgroup_sockets_destroy(memcg);
 }
 #else
@@ -4724,17 +4622,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 	free_percpu(memcg->stat);
 
-	/*
-	 * We need to make sure that (at least for now), the jump label
-	 * destruction code runs outside of the cgroup lock. This is because
-	 * get_online_cpus(), which is called from the static_branch update,
-	 * can't be called inside the cgroup_lock. cpusets are the ones
-	 * enforcing this dependency, so if they ever change, we might as well.
-	 *
-	 * schedule_work() will guarantee this happens. Be careful if you need
-	 * to move this code around, and make sure it is outside
-	 * the cgroup_lock.
-	 */
 	disarm_static_keys(memcg);
 	kfree(memcg);
 }
@@ -4804,6 +4691,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	vmpressure_init(&memcg->vmpressure);
 	INIT_LIST_HEAD(&memcg->event_list);
 	spin_lock_init(&memcg->event_list_lock);
+#ifdef CONFIG_MEMCG_KMEM
+	memcg->kmemcg_id = -1;
+	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+#endif
 
 	return &memcg->css;
 
@@ -4885,7 +4776,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	memcg_unregister_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e5ee0ca7ae85..feb803bf3443 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -239,19 +239,14 @@ void shake_page(struct page *p, int access)
 	}
 
 	/*
-	 * Only call shrink_slab here (which would also shrink other caches) if
-	 * access is not potentially fatal.
+	 * Only call shrink_node_slabs here (which would also shrink
+	 * other caches) if access is not potentially fatal.
 	 */
 	if (access) {
 		int nr;
 		int nid = page_to_nid(p);
 		do {
-			struct shrink_control shrink = {
-				.gfp_mask = GFP_KERNEL,
-			};
-			node_set(nid, shrink.nodes_to_scan);
-
-			nr = shrink_slab(&shrink, 1000, 1000);
+			nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
 			if (page_count(p) == 1)
 				break;
 		} while (nr > 10);
@@ -466,7 +461,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct address_space *mapping = page->mapping;
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_read(mapping);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
 		pgoff_t pgoff = page_to_pgoff(page);
@@ -488,7 +483,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 		}
 	}
 	read_unlock(&tasklist_lock);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_read(mapping);
 }
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index 4b5a282e1107..fbf74112de5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1326,9 +1326,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 			 * safe to do nothing in this case.
 			 */
 			if (vma->vm_file) {
-				mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				i_mmap_lock_write(vma->vm_file->f_mapping);
 				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
-				mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				i_mmap_unlock_write(vma->vm_file->f_mapping);
 			}
 		} else
 			unmap_page_range(tlb, vma, start, end, details);
@@ -2377,12 +2377,12 @@ void unmap_mapping_range(struct address_space *mapping,
 		details.last_index = ULONG_MAX;
 
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_read(mapping);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_read(mapping);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
@@ -3365,6 +3365,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(handle_mm_fault);
 
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
diff --git a/mm/migrate.c b/mm/migrate.c
index 01439953abf5..253474c22239 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -746,7 +746,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  *   MIGRATEPAGE_SUCCESS - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-				int remap_swapcache, enum migrate_mode mode)
+				int page_was_mapped, enum migrate_mode mode)
 {
 	struct address_space *mapping;
 	int rc;
@@ -784,7 +784,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 		newpage->mapping = NULL;
 	} else {
 		mem_cgroup_migrate(page, newpage, false);
-		if (remap_swapcache)
+		if (page_was_mapped)
 			remove_migration_ptes(page, newpage);
 		page->mapping = NULL;
 	}
@@ -798,7 +798,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 				int force, enum migrate_mode mode)
 {
 	int rc = -EAGAIN;
-	int remap_swapcache = 1;
+	int page_was_mapped = 0;
 	struct anon_vma *anon_vma = NULL;
 
 	if (!trylock_page(page)) {
@@ -870,7 +870,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 			 * migrated but are not remapped when migration
 			 * completes
 			 */
-			remap_swapcache = 0;
 		} else {
 			goto out_unlock;
 		}
@@ -910,13 +909,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	}
 
 	/* Establish migration ptes or remove ptes */
-	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+	if (page_mapped(page)) {
+		try_to_unmap(page,
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+		page_was_mapped = 1;
+	}
 
 skip_unmap:
 	if (!page_mapped(page))
-		rc = move_to_new_page(newpage, page, remap_swapcache, mode);
+		rc = move_to_new_page(newpage, page, page_was_mapped, mode);
 
-	if (rc && remap_swapcache)
+	if (rc && page_was_mapped)
 		remove_migration_ptes(page, page);
 
 	/* Drop an anon_vma reference if we took one */
@@ -1017,6 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 {
 	int rc = 0;
 	int *result = NULL;
+	int page_was_mapped = 0;
 	struct page *new_hpage;
 	struct anon_vma *anon_vma = NULL;
 
@@ -1047,12 +1051,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (PageAnon(hpage))
 		anon_vma = page_get_anon_vma(hpage);
 
-	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+	if (page_mapped(hpage)) {
+		try_to_unmap(hpage,
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+		page_was_mapped = 1;
+	}
 
 	if (!page_mapped(hpage))
-		rc = move_to_new_page(new_hpage, hpage, 1, mode);
+		rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
 
-	if (rc != MIGRATEPAGE_SUCCESS)
+	if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
diff --git a/mm/mincore.c b/mm/mincore.c
index 725c80961048..c8c528b36641 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -137,8 +137,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		} else { /* pte is a swap entry */
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
-			if (is_migration_entry(entry)) {
-				/* migration entries are always uptodate */
+			if (non_swap_entry(entry)) {
+				/*
+				 * migration or hwpoison entries are always
+				 * uptodate
+				 */
 				*vec = 1;
 			} else {
 #ifdef CONFIG_SWAP
diff --git a/mm/mmap.c b/mm/mmap.c
index b6c0a77fc1c8..7b36aa7cc89a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -232,7 +232,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_mutex
+ * Requires inode->i_mapping->i_mmap_rwsem
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
@@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
 
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		__remove_shared_vm_struct(vma, file, mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 	}
 }
 
@@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 	}
 
 	__vma_link(mm, vma, prev, rb_link, rb_parent);
 	__vma_link_file(vma);
 
 	if (mapping)
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 
 	mm->map_count++;
 	validate_mm(mm);
@@ -796,7 +796,7 @@ again: remove_next = 1 + (end > next->vm_end);
 					next->vm_end);
 		}
 
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		if (insert) {
 			/*
 			 * Put into interval tree now, so instantiated pages
@@ -883,7 +883,7 @@ again: remove_next = 1 + (end > next->vm_end);
 		anon_vma_unlock_write(anon_vma);
 	}
 	if (mapping)
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 
 	if (root) {
 		uprobe_mmap(vma);
@@ -2362,6 +2362,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
 }
 #endif
 
+EXPORT_SYMBOL_GPL(find_extend_vma);
+
 /*
  * Ok - we have the memory areas we should free on the vma list,
  * so release them, and do the vma updates.
@@ -2791,7 +2793,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_mutex is taken here.
+ * then i_mmap_rwsem is taken here.
  */
 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
@@ -3086,7 +3088,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3086 */ 3088 */
3087 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 3089 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3088 BUG(); 3090 BUG();
3089 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); 3091 down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
3090 } 3092 }
3091} 3093}
3092 3094
@@ -3113,7 +3115,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3113 * vma in this mm is backed by the same anon_vma or address_space. 3115 * vma in this mm is backed by the same anon_vma or address_space.
3114 * 3116 *
3115 * We can take all the locks in random order because the VM code 3117 * We can take all the locks in random order because the VM code
3116 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never 3118 * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
3117 * takes more than one of them in a row. Secondly we're protected 3119 * takes more than one of them in a row. Secondly we're protected
3118 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 3120 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
3119 * 3121 *
@@ -3182,7 +3184,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
3182 * AS_MM_ALL_LOCKS can't change to 0 from under us 3184 * AS_MM_ALL_LOCKS can't change to 0 from under us
3183 * because we hold the mm_all_locks_mutex. 3185 * because we hold the mm_all_locks_mutex.
3184 */ 3186 */
3185 mutex_unlock(&mapping->i_mmap_mutex); 3187 i_mmap_unlock_write(mapping);
3186 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 3188 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3187 &mapping->flags)) 3189 &mapping->flags))
3188 BUG(); 3190 BUG();
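
Note on the recurring change in mm/mmap.c (and in the files that follow): every mutex_lock()/mutex_unlock() on mapping->i_mmap_mutex becomes a call to the i_mmap_lock_write()/i_mmap_unlock_write() wrappers around the new i_mmap_rwsem. A minimal sketch of the resulting pattern, assuming only those wrappers (example_unlink_vma() itself is illustrative and not part of this patch):

#include <linux/fs.h>
#include <linux/mm.h>

/* Illustration of the post-conversion i_mmap locking pattern. */
static void example_unlink_vma(struct vm_area_struct *vma)
{
	struct address_space *mapping = vma->vm_file->f_mapping;

	i_mmap_lock_write(mapping);	/* was: mutex_lock(&mapping->i_mmap_mutex) */
	vma_interval_tree_remove(vma, &mapping->i_mmap);
	i_mmap_unlock_write(mapping);	/* was: mutex_unlock(&mapping->i_mmap_mutex) */
}
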
diff --git a/mm/mremap.c b/mm/mremap.c
index b147f66f4c40..84aa36f9f308 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
99 spinlock_t *old_ptl, *new_ptl; 99 spinlock_t *old_ptl, *new_ptl;
100 100
101 /* 101 /*
102 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma 102 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
103 * locks to ensure that rmap will always observe either the old or the 103 * locks to ensure that rmap will always observe either the old or the
104 * new ptes. This is the easiest way to avoid races with 104 * new ptes. This is the easiest way to avoid races with
105 * truncate_pagecache(), page migration, etc... 105 * truncate_pagecache(), page migration, etc...
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
119 if (need_rmap_locks) { 119 if (need_rmap_locks) {
120 if (vma->vm_file) { 120 if (vma->vm_file) {
121 mapping = vma->vm_file->f_mapping; 121 mapping = vma->vm_file->f_mapping;
122 mutex_lock(&mapping->i_mmap_mutex); 122 i_mmap_lock_write(mapping);
123 } 123 }
124 if (vma->anon_vma) { 124 if (vma->anon_vma) {
125 anon_vma = vma->anon_vma; 125 anon_vma = vma->anon_vma;
@@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
156 if (anon_vma) 156 if (anon_vma)
157 anon_vma_unlock_write(anon_vma); 157 anon_vma_unlock_write(anon_vma);
158 if (mapping) 158 if (mapping)
159 mutex_unlock(&mapping->i_mmap_mutex); 159 i_mmap_unlock_write(mapping);
160} 160}
161 161
162#define LATENCY_LIMIT (64 * PAGE_SIZE) 162#define LATENCY_LIMIT (64 * PAGE_SIZE)
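
The move_ptes() hunks above also show the rmap locks being taken in a fixed order when need_rmap_locks is set: the file mapping's i_mmap_rwsem first, then the anon_vma lock, matching the hierarchy documented at the top of mm/rmap.c further down. A sketch of that ordering only (example_take_rmap_locks() is illustrative, not a kernel helper):

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>

/* Take rmap locks in the documented order; release in reverse. */
static void example_take_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
	if (vma->anon_vma)
		anon_vma_lock_write(vma->anon_vma);
}
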
diff --git a/mm/nommu.c b/mm/nommu.c
index bd1808e194a7..b51eadf6d952 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
722 if (vma->vm_file) { 722 if (vma->vm_file) {
723 mapping = vma->vm_file->f_mapping; 723 mapping = vma->vm_file->f_mapping;
724 724
725 mutex_lock(&mapping->i_mmap_mutex); 725 i_mmap_lock_write(mapping);
726 flush_dcache_mmap_lock(mapping); 726 flush_dcache_mmap_lock(mapping);
727 vma_interval_tree_insert(vma, &mapping->i_mmap); 727 vma_interval_tree_insert(vma, &mapping->i_mmap);
728 flush_dcache_mmap_unlock(mapping); 728 flush_dcache_mmap_unlock(mapping);
729 mutex_unlock(&mapping->i_mmap_mutex); 729 i_mmap_unlock_write(mapping);
730 } 730 }
731 731
732 /* add the VMA to the tree */ 732 /* add the VMA to the tree */
@@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
795 if (vma->vm_file) { 795 if (vma->vm_file) {
796 mapping = vma->vm_file->f_mapping; 796 mapping = vma->vm_file->f_mapping;
797 797
798 mutex_lock(&mapping->i_mmap_mutex); 798 i_mmap_lock_write(mapping);
799 flush_dcache_mmap_lock(mapping); 799 flush_dcache_mmap_lock(mapping);
800 vma_interval_tree_remove(vma, &mapping->i_mmap); 800 vma_interval_tree_remove(vma, &mapping->i_mmap);
801 flush_dcache_mmap_unlock(mapping); 801 flush_dcache_mmap_unlock(mapping);
802 mutex_unlock(&mapping->i_mmap_mutex); 802 i_mmap_unlock_write(mapping);
803 } 803 }
804 804
805 /* remove from the MM's tree and list */ 805 /* remove from the MM's tree and list */
@@ -1149,8 +1149,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1149 unsigned long len, 1149 unsigned long len,
1150 unsigned long capabilities) 1150 unsigned long capabilities)
1151{ 1151{
1152 struct page *pages; 1152 unsigned long total, point;
1153 unsigned long total, point, n;
1154 void *base; 1153 void *base;
1155 int ret, order; 1154 int ret, order;
1156 1155
@@ -1182,33 +1181,23 @@ static int do_mmap_private(struct vm_area_struct *vma,
1182 order = get_order(len); 1181 order = get_order(len);
1183 kdebug("alloc order %d for %lx", order, len); 1182 kdebug("alloc order %d for %lx", order, len);
1184 1183
1185 pages = alloc_pages(GFP_KERNEL, order);
1186 if (!pages)
1187 goto enomem;
1188
1189 total = 1 << order; 1184 total = 1 << order;
1190 atomic_long_add(total, &mmap_pages_allocated);
1191
1192 point = len >> PAGE_SHIFT; 1185 point = len >> PAGE_SHIFT;
1193 1186
1194 /* we allocated a power-of-2 sized page set, so we may want to trim off 1187 /* we don't want to allocate a power-of-2 sized page set */
1195 * the excess */
1196 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { 1188 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
1197 while (total > point) { 1189 total = point;
1198 order = ilog2(total - point); 1190 kdebug("try to alloc exact %lu pages", total);
1199 n = 1 << order; 1191 base = alloc_pages_exact(len, GFP_KERNEL);
1200 kdebug("shave %lu/%lu @%lu", n, total - point, total); 1192 } else {
1201 atomic_long_sub(n, &mmap_pages_allocated); 1193 base = (void *)__get_free_pages(GFP_KERNEL, order);
1202 total -= n;
1203 set_page_refcounted(pages + total);
1204 __free_pages(pages + total, order);
1205 }
1206 } 1194 }
1207 1195
1208 for (point = 1; point < total; point++) 1196 if (!base)
1209 set_page_refcounted(&pages[point]); 1197 goto enomem;
1198
1199 atomic_long_add(total, &mmap_pages_allocated);
1210 1200
1211 base = page_address(pages);
1212 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; 1201 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1213 region->vm_start = (unsigned long) base; 1202 region->vm_start = (unsigned long) base;
1214 region->vm_end = region->vm_start + len; 1203 region->vm_end = region->vm_start + len;
@@ -2094,14 +2083,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2094 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2083 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2095 2084
2096 down_write(&nommu_region_sem); 2085 down_write(&nommu_region_sem);
2097 mutex_lock(&inode->i_mapping->i_mmap_mutex); 2086 i_mmap_lock_read(inode->i_mapping);
2098 2087
2099 /* search for VMAs that fall within the dead zone */ 2088 /* search for VMAs that fall within the dead zone */
2100 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { 2089 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
2101 /* found one - only interested if it's shared out of the page 2090 /* found one - only interested if it's shared out of the page
2102 * cache */ 2091 * cache */
2103 if (vma->vm_flags & VM_SHARED) { 2092 if (vma->vm_flags & VM_SHARED) {
2104 mutex_unlock(&inode->i_mapping->i_mmap_mutex); 2093 i_mmap_unlock_read(inode->i_mapping);
2105 up_write(&nommu_region_sem); 2094 up_write(&nommu_region_sem);
2106 return -ETXTBSY; /* not quite true, but near enough */ 2095 return -ETXTBSY; /* not quite true, but near enough */
2107 } 2096 }
@@ -2113,8 +2102,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2113 * we don't check for any regions that start beyond the EOF as there 2102 * we don't check for any regions that start beyond the EOF as there
2114 * shouldn't be any 2103 * shouldn't be any
2115 */ 2104 */
2116 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 2105 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
2117 0, ULONG_MAX) {
2118 if (!(vma->vm_flags & VM_SHARED)) 2106 if (!(vma->vm_flags & VM_SHARED))
2119 continue; 2107 continue;
2120 2108
@@ -2129,7 +2117,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2129 } 2117 }
2130 } 2118 }
2131 2119
2132 mutex_unlock(&inode->i_mapping->i_mmap_mutex); 2120 i_mmap_unlock_read(inode->i_mapping);
2133 up_write(&nommu_region_sem); 2121 up_write(&nommu_region_sem);
2134 return 0; 2122 return 0;
2135} 2123}
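
The do_mmap_private() rework above drops the allocate-then-trim loop: when trimming would have applied, it now asks for exactly len bytes via alloc_pages_exact(), otherwise it takes a power-of-two region from __get_free_pages(). A sketch of the two paths and their matching free routines; want_exact stands in for the sysctl_nr_trim_pages condition and is an assumption for illustration:

#include <linux/types.h>
#include <linux/gfp.h>

static void *example_mmap_backing_alloc(unsigned long len, int order, bool want_exact)
{
	if (want_exact)
		/* sized exactly; release with free_pages_exact(base, len) */
		return alloc_pages_exact(len, GFP_KERNEL);

	/* power-of-two sized; release with free_pages((unsigned long)base, order) */
	return (void *)__get_free_pages(GFP_KERNEL, order);
}
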
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 864bba992735..d503e9ce1c7b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -281,14 +281,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
281 if (oom_task_origin(task)) 281 if (oom_task_origin(task))
282 return OOM_SCAN_SELECT; 282 return OOM_SCAN_SELECT;
283 283
284 if (task->flags & PF_EXITING && !force_kill) { 284 if (task_will_free_mem(task) && !force_kill)
285 /* 285 return OOM_SCAN_ABORT;
286 * If this task is not being ptraced on exit, then wait for it 286
287 * to finish before killing some other task unnecessarily.
288 */
289 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
290 return OOM_SCAN_ABORT;
291 }
292 return OOM_SCAN_OK; 287 return OOM_SCAN_OK;
293} 288}
294 289
@@ -443,7 +438,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
443 * If the task is already exiting, don't alarm the sysadmin or kill 438 * If the task is already exiting, don't alarm the sysadmin or kill
444 * its children or threads, just set TIF_MEMDIE so it can die quickly 439 * its children or threads, just set TIF_MEMDIE so it can die quickly
445 */ 440 */
446 if (p->flags & PF_EXITING) { 441 if (task_will_free_mem(p)) {
447 set_tsk_thread_flag(p, TIF_MEMDIE); 442 set_tsk_thread_flag(p, TIF_MEMDIE);
448 put_task_struct(p); 443 put_task_struct(p);
449 return; 444 return;
@@ -649,7 +644,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
649 * select it. The goal is to allow it to allocate so that it may 644 * select it. The goal is to allow it to allocate so that it may
650 * quickly exit and free its memory. 645 * quickly exit and free its memory.
651 */ 646 */
652 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 647 if (fatal_signal_pending(current) || task_will_free_mem(current)) {
653 set_thread_flag(TIF_MEMDIE); 648 set_thread_flag(TIF_MEMDIE);
654 return; 649 return;
655 } 650 }
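
The three hunks above replace open-coded PF_EXITING tests with task_will_free_mem(). The helper itself is defined outside this diff; the sketch below is only an inference from the check removed in oom_scan_process_thread(), not the actual header definition:

#include <linux/sched.h>
#include <linux/ptrace.h>

/* Hypothetical consolidation of the removed open-coded test. */
static inline bool example_task_will_free_mem(struct task_struct *task)
{
	return (task->flags & PF_EXITING) &&
	       !(task->group_leader->ptrace & PT_TRACE_EXIT);
}
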
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df542feaac3b..fa974d87f60d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
49#include <linux/fault-inject.h> 49#include <linux/fault-inject.h>
50#include <linux/page-isolation.h> 50#include <linux/page-isolation.h>
51#include <linux/page_ext.h>
51#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
52#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
53#include <linux/compaction.h> 54#include <linux/compaction.h>
@@ -55,9 +56,10 @@
55#include <linux/prefetch.h> 56#include <linux/prefetch.h>
56#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
57#include <linux/migrate.h> 58#include <linux/migrate.h>
58#include <linux/page-debug-flags.h> 59#include <linux/page_ext.h>
59#include <linux/hugetlb.h> 60#include <linux/hugetlb.h>
60#include <linux/sched/rt.h> 61#include <linux/sched/rt.h>
62#include <linux/page_owner.h>
61 63
62#include <asm/sections.h> 64#include <asm/sections.h>
63#include <asm/tlbflush.h> 65#include <asm/tlbflush.h>
@@ -424,6 +426,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order,
424 426
425#ifdef CONFIG_DEBUG_PAGEALLOC 427#ifdef CONFIG_DEBUG_PAGEALLOC
426unsigned int _debug_guardpage_minorder; 428unsigned int _debug_guardpage_minorder;
429bool _debug_pagealloc_enabled __read_mostly;
430bool _debug_guardpage_enabled __read_mostly;
431
432static int __init early_debug_pagealloc(char *buf)
433{
434 if (!buf)
435 return -EINVAL;
436
437 if (strcmp(buf, "on") == 0)
438 _debug_pagealloc_enabled = true;
439
440 return 0;
441}
442early_param("debug_pagealloc", early_debug_pagealloc);
443
444static bool need_debug_guardpage(void)
445{
446 /* If we don't use debug_pagealloc, we don't need guard page */
447 if (!debug_pagealloc_enabled())
448 return false;
449
450 return true;
451}
452
453static void init_debug_guardpage(void)
454{
455 if (!debug_pagealloc_enabled())
456 return;
457
458 _debug_guardpage_enabled = true;
459}
460
461struct page_ext_operations debug_guardpage_ops = {
462 .need = need_debug_guardpage,
463 .init = init_debug_guardpage,
464};
427 465
428static int __init debug_guardpage_minorder_setup(char *buf) 466static int __init debug_guardpage_minorder_setup(char *buf)
429{ 467{
@@ -439,18 +477,44 @@ static int __init debug_guardpage_minorder_setup(char *buf)
439} 477}
440__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 478__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
441 479
442static inline void set_page_guard_flag(struct page *page) 480static inline void set_page_guard(struct zone *zone, struct page *page,
481 unsigned int order, int migratetype)
443{ 482{
444 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 483 struct page_ext *page_ext;
484
485 if (!debug_guardpage_enabled())
486 return;
487
488 page_ext = lookup_page_ext(page);
489 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
490
491 INIT_LIST_HEAD(&page->lru);
492 set_page_private(page, order);
493 /* Guard pages are not available for any usage */
494 __mod_zone_freepage_state(zone, -(1 << order), migratetype);
445} 495}
446 496
447static inline void clear_page_guard_flag(struct page *page) 497static inline void clear_page_guard(struct zone *zone, struct page *page,
498 unsigned int order, int migratetype)
448{ 499{
449 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 500 struct page_ext *page_ext;
501
502 if (!debug_guardpage_enabled())
503 return;
504
505 page_ext = lookup_page_ext(page);
506 __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
507
508 set_page_private(page, 0);
509 if (!is_migrate_isolate(migratetype))
510 __mod_zone_freepage_state(zone, (1 << order), migratetype);
450} 511}
451#else 512#else
452static inline void set_page_guard_flag(struct page *page) { } 513struct page_ext_operations debug_guardpage_ops = { NULL, };
453static inline void clear_page_guard_flag(struct page *page) { } 514static inline void set_page_guard(struct zone *zone, struct page *page,
515 unsigned int order, int migratetype) {}
516static inline void clear_page_guard(struct zone *zone, struct page *page,
517 unsigned int order, int migratetype) {}
454#endif 518#endif
455 519
456static inline void set_page_order(struct page *page, unsigned int order) 520static inline void set_page_order(struct page *page, unsigned int order)
@@ -581,12 +645,7 @@ static inline void __free_one_page(struct page *page,
581 * merge with it and move up one order. 645 * merge with it and move up one order.
582 */ 646 */
583 if (page_is_guard(buddy)) { 647 if (page_is_guard(buddy)) {
584 clear_page_guard_flag(buddy); 648 clear_page_guard(zone, buddy, order, migratetype);
585 set_page_private(buddy, 0);
586 if (!is_migrate_isolate(migratetype)) {
587 __mod_zone_freepage_state(zone, 1 << order,
588 migratetype);
589 }
590 } else { 649 } else {
591 list_del(&buddy->lru); 650 list_del(&buddy->lru);
592 zone->free_area[order].nr_free--; 651 zone->free_area[order].nr_free--;
@@ -755,6 +814,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
755 if (bad) 814 if (bad)
756 return false; 815 return false;
757 816
817 reset_page_owner(page, order);
818
758 if (!PageHighMem(page)) { 819 if (!PageHighMem(page)) {
759 debug_check_no_locks_freed(page_address(page), 820 debug_check_no_locks_freed(page_address(page),
760 PAGE_SIZE << order); 821 PAGE_SIZE << order);
@@ -861,23 +922,18 @@ static inline void expand(struct zone *zone, struct page *page,
861 size >>= 1; 922 size >>= 1;
862 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 923 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
863 924
864#ifdef CONFIG_DEBUG_PAGEALLOC 925 if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
865 if (high < debug_guardpage_minorder()) { 926 debug_guardpage_enabled() &&
927 high < debug_guardpage_minorder()) {
866 /* 928 /*
867 * Mark as guard pages (or page), that will allow to 929 * Mark as guard pages (or page), that will allow to
868 * merge back to allocator when buddy will be freed. 930 * merge back to allocator when buddy will be freed.
869 * Corresponding page table entries will not be touched, 931 * Corresponding page table entries will not be touched,
870 * pages will stay not present in virtual address space 932 * pages will stay not present in virtual address space
871 */ 933 */
872 INIT_LIST_HEAD(&page[size].lru); 934 set_page_guard(zone, &page[size], high, migratetype);
873 set_page_guard_flag(&page[size]);
874 set_page_private(&page[size], high);
875 /* Guard pages are not available for any usage */
876 __mod_zone_freepage_state(zone, -(1 << high),
877 migratetype);
878 continue; 935 continue;
879 } 936 }
880#endif
881 list_add(&page[size].lru, &area->free_list[migratetype]); 937 list_add(&page[size].lru, &area->free_list[migratetype]);
882 area->nr_free++; 938 area->nr_free++;
883 set_page_order(&page[size], high); 939 set_page_order(&page[size], high);
@@ -935,6 +991,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
935 if (order && (gfp_flags & __GFP_COMP)) 991 if (order && (gfp_flags & __GFP_COMP))
936 prep_compound_page(page, order); 992 prep_compound_page(page, order);
937 993
994 set_page_owner(page, order, gfp_flags);
995
938 return 0; 996 return 0;
939} 997}
940 998
@@ -1507,8 +1565,11 @@ void split_page(struct page *page, unsigned int order)
1507 split_page(virt_to_page(page[0].shadow), order); 1565 split_page(virt_to_page(page[0].shadow), order);
1508#endif 1566#endif
1509 1567
1510 for (i = 1; i < (1 << order); i++) 1568 set_page_owner(page, 0, 0);
1569 for (i = 1; i < (1 << order); i++) {
1511 set_page_refcounted(page + i); 1570 set_page_refcounted(page + i);
1571 set_page_owner(page + i, 0, 0);
1572 }
1512} 1573}
1513EXPORT_SYMBOL_GPL(split_page); 1574EXPORT_SYMBOL_GPL(split_page);
1514 1575
@@ -1548,6 +1609,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
1548 } 1609 }
1549 } 1610 }
1550 1611
1612 set_page_owner(page, order, 0);
1551 return 1UL << order; 1613 return 1UL << order;
1552} 1614}
1553 1615
@@ -4856,6 +4918,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4856#endif 4918#endif
4857 init_waitqueue_head(&pgdat->kswapd_wait); 4919 init_waitqueue_head(&pgdat->kswapd_wait);
4858 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4920 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4921 pgdat_page_ext_init(pgdat);
4859 4922
4860 for (j = 0; j < MAX_NR_ZONES; j++) { 4923 for (j = 0; j < MAX_NR_ZONES; j++) {
4861 struct zone *zone = pgdat->node_zones + j; 4924 struct zone *zone = pgdat->node_zones + j;
@@ -4874,16 +4937,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4874 * and per-cpu initialisations 4937 * and per-cpu initialisations
4875 */ 4938 */
4876 memmap_pages = calc_memmap_size(size, realsize); 4939 memmap_pages = calc_memmap_size(size, realsize);
4877 if (freesize >= memmap_pages) { 4940 if (!is_highmem_idx(j)) {
4878 freesize -= memmap_pages; 4941 if (freesize >= memmap_pages) {
4879 if (memmap_pages) 4942 freesize -= memmap_pages;
4880 printk(KERN_DEBUG 4943 if (memmap_pages)
4881 " %s zone: %lu pages used for memmap\n", 4944 printk(KERN_DEBUG
4882 zone_names[j], memmap_pages); 4945 " %s zone: %lu pages used for memmap\n",
4883 } else 4946 zone_names[j], memmap_pages);
4884 printk(KERN_WARNING 4947 } else
4885 " %s zone: %lu pages exceeds freesize %lu\n", 4948 printk(KERN_WARNING
4886 zone_names[j], memmap_pages, freesize); 4949 " %s zone: %lu pages exceeds freesize %lu\n",
4950 zone_names[j], memmap_pages, freesize);
4951 }
4887 4952
4888 /* Account for reserved pages */ 4953 /* Account for reserved pages */
4889 if (j == 0 && freesize > dma_reserve) { 4954 if (j == 0 && freesize > dma_reserve) {
@@ -6221,9 +6286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6221 if (!PageLRU(page)) 6286 if (!PageLRU(page))
6222 found++; 6287 found++;
6223 /* 6288 /*
6224 * If there are RECLAIMABLE pages, we need to check it. 6289 * If there are RECLAIMABLE pages, we need to check
6225 * But now, memory offline itself doesn't call shrink_slab() 6290 * it. But now, memory offline itself doesn't call
6226 * and it still to be fixed. 6291 * shrink_node_slabs() and it still to be fixed.
6227 */ 6292 */
6228 /* 6293 /*
6229 * If the page is not RAM, page_count()should be 0. 6294 * If the page is not RAM, page_count()should be 0.
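
Both of the new debug facilities wired up above stay disabled by default; their early_param() handlers only act when the parameter value is the literal string "on". A kernel command line enabling both would therefore contain, for example:

	debug_pagealloc=on page_owner=on

(debug_pagealloc= is parsed in mm/page_alloc.c above, page_owner= in mm/page_owner.c below.)
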
diff --git a/mm/page_ext.c b/mm/page_ext.c
new file mode 100644
index 000000000000..d86fd2f5353f
--- /dev/null
+++ b/mm/page_ext.c
@@ -0,0 +1,403 @@
1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/page_ext.h>
5#include <linux/memory.h>
6#include <linux/vmalloc.h>
7#include <linux/kmemleak.h>
8#include <linux/page_owner.h>
9
10/*
11 * struct page extension
12 *
13 * This is the feature to manage memory for extended data per page.
14 *
15 * Until now, we must modify struct page itself to store extra data per page.
16 * This requires rebuilding the kernel, which is a really time-consuming process.
17 * And sometimes a rebuild is impossible due to third-party module dependencies.
18 * Lastly, enlarging struct page could cause unwanted changes in system behaviour.
19 *
20 * This feature is intended to overcome the problems mentioned above. It
21 * allocates memory for extended per-page data in a separate place rather than
22 * in struct page itself. This memory can be accessed through the accessor
23 * functions provided by this code. During the boot process, it checks whether
24 * the allocation of this large chunk of memory is needed at all. If not, it
25 * avoids allocating the memory entirely. Thanks to this, the feature can be
26 * included in the kernel by default while avoiding rebuilds and related problems.
27 *
28 * To help these things to work well, there are two callbacks for clients. One
29 * is the need callback which is mandatory if user wants to avoid useless
30 * memory allocation at boot-time. The other is optional, init callback, which
31 * is used to do proper initialization after memory is allocated.
32 *
33 * The need callback is used to decide whether extended memory allocation is
34 * needed or not. Sometimes users want to deactivate some features for a given
35 * boot, in which case the extra memory would be unnecessary. To avoid
36 * allocating a huge chunk of memory in that case, each client reports its need
37 * for extra memory through the need callback. If one of the need callbacks
38 * returns true, it means that someone needs extra memory, so the
39 * page extension core should allocate memory for page extension. If
40 * none of the need callbacks returns true, memory isn't needed at all for this
41 * boot and the page extension core can skip the allocation. As a result,
42 * no memory is wasted.
43 *
44 * The init callback is used to do proper initialization after page extension
45 * is completely initialized. On sparse memory systems, the extra memory is
46 * allocated some time after the memmap is allocated. In other words, the
47 * lifetime of the page extension memory is not the same as that of the memmap.
48 * Therefore, clients can't store extra data until page extension is
49 * initialized, even though pages may already be allocated and in use. This
50 * could leave the per-page extra data in an inadequate state, so, to prevent
51 * that, a client can use this callback to initialize the state correctly.
52 */
53
54static struct page_ext_operations *page_ext_ops[] = {
55 &debug_guardpage_ops,
56#ifdef CONFIG_PAGE_POISONING
57 &page_poisoning_ops,
58#endif
59#ifdef CONFIG_PAGE_OWNER
60 &page_owner_ops,
61#endif
62};
63
64static unsigned long total_usage;
65
66static bool __init invoke_need_callbacks(void)
67{
68 int i;
69 int entries = ARRAY_SIZE(page_ext_ops);
70
71 for (i = 0; i < entries; i++) {
72 if (page_ext_ops[i]->need && page_ext_ops[i]->need())
73 return true;
74 }
75
76 return false;
77}
78
79static void __init invoke_init_callbacks(void)
80{
81 int i;
82 int entries = ARRAY_SIZE(page_ext_ops);
83
84 for (i = 0; i < entries; i++) {
85 if (page_ext_ops[i]->init)
86 page_ext_ops[i]->init();
87 }
88}
89
90#if !defined(CONFIG_SPARSEMEM)
91
92
93void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
94{
95 pgdat->node_page_ext = NULL;
96}
97
98struct page_ext *lookup_page_ext(struct page *page)
99{
100 unsigned long pfn = page_to_pfn(page);
101 unsigned long offset;
102 struct page_ext *base;
103
104 base = NODE_DATA(page_to_nid(page))->node_page_ext;
105#ifdef CONFIG_DEBUG_VM
106 /*
107 * The sanity checks the page allocator does upon freeing a
108 * page can reach here before the page_ext arrays are
109 * allocated when feeding a range of pages to the allocator
110 * for the first time during bootup or memory hotplug.
111 */
112 if (unlikely(!base))
113 return NULL;
114#endif
115 offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
116 MAX_ORDER_NR_PAGES);
117 return base + offset;
118}
119
120static int __init alloc_node_page_ext(int nid)
121{
122 struct page_ext *base;
123 unsigned long table_size;
124 unsigned long nr_pages;
125
126 nr_pages = NODE_DATA(nid)->node_spanned_pages;
127 if (!nr_pages)
128 return 0;
129
130 /*
131 * Need extra space if node range is not aligned with
132 * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
133 * checks buddy's status, range could be out of exact node range.
134 */
135 if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
136 !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
137 nr_pages += MAX_ORDER_NR_PAGES;
138
139 table_size = sizeof(struct page_ext) * nr_pages;
140
141 base = memblock_virt_alloc_try_nid_nopanic(
142 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
143 BOOTMEM_ALLOC_ACCESSIBLE, nid);
144 if (!base)
145 return -ENOMEM;
146 NODE_DATA(nid)->node_page_ext = base;
147 total_usage += table_size;
148 return 0;
149}
150
151void __init page_ext_init_flatmem(void)
152{
153
154 int nid, fail;
155
156 if (!invoke_need_callbacks())
157 return;
158
159 for_each_online_node(nid) {
160 fail = alloc_node_page_ext(nid);
161 if (fail)
162 goto fail;
163 }
164 pr_info("allocated %ld bytes of page_ext\n", total_usage);
165 invoke_init_callbacks();
166 return;
167
168fail:
169 pr_crit("allocation of page_ext failed.\n");
170 panic("Out of memory");
171}
172
173#else /* CONFIG_FLAT_NODE_MEM_MAP */
174
175struct page_ext *lookup_page_ext(struct page *page)
176{
177 unsigned long pfn = page_to_pfn(page);
178 struct mem_section *section = __pfn_to_section(pfn);
179#ifdef CONFIG_DEBUG_VM
180 /*
181 * The sanity checks the page allocator does upon freeing a
182 * page can reach here before the page_ext arrays are
183 * allocated when feeding a range of pages to the allocator
184 * for the first time during bootup or memory hotplug.
185 */
186 if (!section->page_ext)
187 return NULL;
188#endif
189 return section->page_ext + pfn;
190}
191
192static void *__meminit alloc_page_ext(size_t size, int nid)
193{
194 gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
195 void *addr = NULL;
196
197 addr = alloc_pages_exact_nid(nid, size, flags);
198 if (addr) {
199 kmemleak_alloc(addr, size, 1, flags);
200 return addr;
201 }
202
203 if (node_state(nid, N_HIGH_MEMORY))
204 addr = vzalloc_node(size, nid);
205 else
206 addr = vzalloc(size);
207
208 return addr;
209}
210
211static int __meminit init_section_page_ext(unsigned long pfn, int nid)
212{
213 struct mem_section *section;
214 struct page_ext *base;
215 unsigned long table_size;
216
217 section = __pfn_to_section(pfn);
218
219 if (section->page_ext)
220 return 0;
221
222 table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
223 base = alloc_page_ext(table_size, nid);
224
225 /*
226 * The value stored in section->page_ext is (base - pfn)
227 * and it does not point to the memory block allocated above,
228 * causing kmemleak false positives.
229 */
230 kmemleak_not_leak(base);
231
232 if (!base) {
233 pr_err("page ext allocation failure\n");
234 return -ENOMEM;
235 }
236
237 /*
238 * The passed "pfn" may not be aligned to SECTION. For the calculation
239 * we need to apply a mask.
240 */
241 pfn &= PAGE_SECTION_MASK;
242 section->page_ext = base - pfn;
243 total_usage += table_size;
244 return 0;
245}
246#ifdef CONFIG_MEMORY_HOTPLUG
247static void free_page_ext(void *addr)
248{
249 if (is_vmalloc_addr(addr)) {
250 vfree(addr);
251 } else {
252 struct page *page = virt_to_page(addr);
253 size_t table_size;
254
255 table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
256
257 BUG_ON(PageReserved(page));
258 free_pages_exact(addr, table_size);
259 }
260}
261
262static void __free_page_ext(unsigned long pfn)
263{
264 struct mem_section *ms;
265 struct page_ext *base;
266
267 ms = __pfn_to_section(pfn);
268 if (!ms || !ms->page_ext)
269 return;
270 base = ms->page_ext + pfn;
271 free_page_ext(base);
272 ms->page_ext = NULL;
273}
274
275static int __meminit online_page_ext(unsigned long start_pfn,
276 unsigned long nr_pages,
277 int nid)
278{
279 unsigned long start, end, pfn;
280 int fail = 0;
281
282 start = SECTION_ALIGN_DOWN(start_pfn);
283 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
284
285 if (nid == -1) {
286 /*
287 * In this case, "nid" already exists and contains valid memory.
288 * "start_pfn" passed to us is a pfn which is an arg for
289 * online__pages(), and start_pfn should exist.
290 */
291 nid = pfn_to_nid(start_pfn);
292 VM_BUG_ON(!node_state(nid, N_ONLINE));
293 }
294
295 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
296 if (!pfn_present(pfn))
297 continue;
298 fail = init_section_page_ext(pfn, nid);
299 }
300 if (!fail)
301 return 0;
302
303 /* rollback */
304 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
305 __free_page_ext(pfn);
306
307 return -ENOMEM;
308}
309
310static int __meminit offline_page_ext(unsigned long start_pfn,
311 unsigned long nr_pages, int nid)
312{
313 unsigned long start, end, pfn;
314
315 start = SECTION_ALIGN_DOWN(start_pfn);
316 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
317
318 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
319 __free_page_ext(pfn);
320 return 0;
321
322}
323
324static int __meminit page_ext_callback(struct notifier_block *self,
325 unsigned long action, void *arg)
326{
327 struct memory_notify *mn = arg;
328 int ret = 0;
329
330 switch (action) {
331 case MEM_GOING_ONLINE:
332 ret = online_page_ext(mn->start_pfn,
333 mn->nr_pages, mn->status_change_nid);
334 break;
335 case MEM_OFFLINE:
336 offline_page_ext(mn->start_pfn,
337 mn->nr_pages, mn->status_change_nid);
338 break;
339 case MEM_CANCEL_ONLINE:
340 offline_page_ext(mn->start_pfn,
341 mn->nr_pages, mn->status_change_nid);
342 break;
343 case MEM_GOING_OFFLINE:
344 break;
345 case MEM_ONLINE:
346 case MEM_CANCEL_OFFLINE:
347 break;
348 }
349
350 return notifier_from_errno(ret);
351}
352
353#endif
354
355void __init page_ext_init(void)
356{
357 unsigned long pfn;
358 int nid;
359
360 if (!invoke_need_callbacks())
361 return;
362
363 for_each_node_state(nid, N_MEMORY) {
364 unsigned long start_pfn, end_pfn;
365
366 start_pfn = node_start_pfn(nid);
367 end_pfn = node_end_pfn(nid);
368 /*
369 * start_pfn and end_pfn may not be aligned to SECTION and the
370 * page->flags of out of node pages are not initialized. So we
371 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
372 */
373 for (pfn = start_pfn; pfn < end_pfn;
374 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
375
376 if (!pfn_valid(pfn))
377 continue;
378 /*
379 * Nodes' pfn ranges can overlap.
380 * We know some architectures can have a node layout such as
381 * -------------pfn-------------->
382 * N0 | N1 | N2 | N0 | N1 | N2|....
383 */
384 if (pfn_to_nid(pfn) != nid)
385 continue;
386 if (init_section_page_ext(pfn, nid))
387 goto oom;
388 }
389 }
390 hotplug_memory_notifier(page_ext_callback, 0);
391 pr_info("allocated %ld bytes of page_ext\n", total_usage);
392 invoke_init_callbacks();
393 return;
394
395oom:
396 panic("Out of memory");
397}
398
399void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
400{
401}
402
403#endif
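
The header comment of mm/page_ext.c describes a need/init contract for clients. A hypothetical client following that contract might look like the sketch below; my_feature_enabled, the helper names and the (required) hookup of my_feature_ops into the page_ext_ops[] array are assumptions for illustration, not part of this patch:

#include <linux/types.h>
#include <linux/page_ext.h>

static bool my_feature_enabled;	/* e.g. set from an early_param() handler */

static bool need_my_feature(void)
{
	/* Returning false lets the core skip the extension allocation. */
	return my_feature_enabled;
}

static void init_my_feature(void)
{
	/* Runs once the page_ext arrays for all nodes/sections exist. */
}

struct page_ext_operations my_feature_ops = {
	.need = need_my_feature,
	.init = init_my_feature,
};
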
diff --git a/mm/page_owner.c b/mm/page_owner.c
new file mode 100644
index 000000000000..9ab4a9b5bc09
--- /dev/null
+++ b/mm/page_owner.c
@@ -0,0 +1,311 @@
1#include <linux/debugfs.h>
2#include <linux/mm.h>
3#include <linux/slab.h>
4#include <linux/uaccess.h>
5#include <linux/bootmem.h>
6#include <linux/stacktrace.h>
7#include <linux/page_owner.h>
8#include "internal.h"
9
10static bool page_owner_disabled = true;
11bool page_owner_inited __read_mostly;
12
13static void init_early_allocated_pages(void);
14
15static int early_page_owner_param(char *buf)
16{
17 if (!buf)
18 return -EINVAL;
19
20 if (strcmp(buf, "on") == 0)
21 page_owner_disabled = false;
22
23 return 0;
24}
25early_param("page_owner", early_page_owner_param);
26
27static bool need_page_owner(void)
28{
29 if (page_owner_disabled)
30 return false;
31
32 return true;
33}
34
35static void init_page_owner(void)
36{
37 if (page_owner_disabled)
38 return;
39
40 page_owner_inited = true;
41 init_early_allocated_pages();
42}
43
44struct page_ext_operations page_owner_ops = {
45 .need = need_page_owner,
46 .init = init_page_owner,
47};
48
49void __reset_page_owner(struct page *page, unsigned int order)
50{
51 int i;
52 struct page_ext *page_ext;
53
54 for (i = 0; i < (1 << order); i++) {
55 page_ext = lookup_page_ext(page + i);
56 __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
57 }
58}
59
60void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
61{
62 struct page_ext *page_ext;
63 struct stack_trace *trace;
64
65 page_ext = lookup_page_ext(page);
66
67 trace = &page_ext->trace;
68 trace->nr_entries = 0;
69 trace->max_entries = ARRAY_SIZE(page_ext->trace_entries);
70 trace->entries = &page_ext->trace_entries[0];
71 trace->skip = 3;
72 save_stack_trace(&page_ext->trace);
73
74 page_ext->order = order;
75 page_ext->gfp_mask = gfp_mask;
76
77 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
78}
79
80static ssize_t
81print_page_owner(char __user *buf, size_t count, unsigned long pfn,
82 struct page *page, struct page_ext *page_ext)
83{
84 int ret;
85 int pageblock_mt, page_mt;
86 char *kbuf;
87
88 kbuf = kmalloc(count, GFP_KERNEL);
89 if (!kbuf)
90 return -ENOMEM;
91
92 ret = snprintf(kbuf, count,
93 "Page allocated via order %u, mask 0x%x\n",
94 page_ext->order, page_ext->gfp_mask);
95
96 if (ret >= count)
97 goto err;
98
99 /* Print information relevant to grouping pages by mobility */
100 pageblock_mt = get_pfnblock_migratetype(page, pfn);
101 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
102 ret += snprintf(kbuf + ret, count - ret,
103 "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
104 pfn,
105 pfn >> pageblock_order,
106 pageblock_mt,
107 pageblock_mt != page_mt ? "Fallback" : " ",
108 PageLocked(page) ? "K" : " ",
109 PageError(page) ? "E" : " ",
110 PageReferenced(page) ? "R" : " ",
111 PageUptodate(page) ? "U" : " ",
112 PageDirty(page) ? "D" : " ",
113 PageLRU(page) ? "L" : " ",
114 PageActive(page) ? "A" : " ",
115 PageSlab(page) ? "S" : " ",
116 PageWriteback(page) ? "W" : " ",
117 PageCompound(page) ? "C" : " ",
118 PageSwapCache(page) ? "B" : " ",
119 PageMappedToDisk(page) ? "M" : " ");
120
121 if (ret >= count)
122 goto err;
123
124 ret += snprint_stack_trace(kbuf + ret, count - ret,
125 &page_ext->trace, 0);
126 if (ret >= count)
127 goto err;
128
129 ret += snprintf(kbuf + ret, count - ret, "\n");
130 if (ret >= count)
131 goto err;
132
133 if (copy_to_user(buf, kbuf, ret))
134 ret = -EFAULT;
135
136 kfree(kbuf);
137 return ret;
138
139err:
140 kfree(kbuf);
141 return -ENOMEM;
142}
143
144static ssize_t
145read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
146{
147 unsigned long pfn;
148 struct page *page;
149 struct page_ext *page_ext;
150
151 if (!page_owner_inited)
152 return -EINVAL;
153
154 page = NULL;
155 pfn = min_low_pfn + *ppos;
156
157 /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
158 while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
159 pfn++;
160
161 drain_all_pages(NULL);
162
163 /* Find an allocated page */
164 for (; pfn < max_pfn; pfn++) {
165 /*
166 * If the new page is in a new MAX_ORDER_NR_PAGES area,
167 * validate the area as existing, skip it if not
168 */
169 if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
170 pfn += MAX_ORDER_NR_PAGES - 1;
171 continue;
172 }
173
174 /* Check for holes within a MAX_ORDER area */
175 if (!pfn_valid_within(pfn))
176 continue;
177
178 page = pfn_to_page(pfn);
179 if (PageBuddy(page)) {
180 unsigned long freepage_order = page_order_unsafe(page);
181
182 if (freepage_order < MAX_ORDER)
183 pfn += (1UL << freepage_order) - 1;
184 continue;
185 }
186
187 page_ext = lookup_page_ext(page);
188
189 /*
190 * Some pages could be missed by concurrent allocation or free,
191 * because we don't hold the zone lock.
192 */
193 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
194 continue;
195
196 /* Record the next PFN to read in the file offset */
197 *ppos = (pfn - min_low_pfn) + 1;
198
199 return print_page_owner(buf, count, pfn, page, page_ext);
200 }
201
202 return 0;
203}
204
205static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
206{
207 struct page *page;
208 struct page_ext *page_ext;
209 unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
210 unsigned long end_pfn = pfn + zone->spanned_pages;
211 unsigned long count = 0;
212
213 /* Scan block by block. First and last block may be incomplete */
214 pfn = zone->zone_start_pfn;
215
216 /*
217 * Walk the zone in pageblock_nr_pages steps. If a page block spans
218 * a zone boundary, it will be double counted between zones. This does
219 * not matter as the mixed block count will still be correct
220 */
221 for (; pfn < end_pfn; ) {
222 if (!pfn_valid(pfn)) {
223 pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
224 continue;
225 }
226
227 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
228 block_end_pfn = min(block_end_pfn, end_pfn);
229
230 page = pfn_to_page(pfn);
231
232 for (; pfn < block_end_pfn; pfn++) {
233 if (!pfn_valid_within(pfn))
234 continue;
235
236 page = pfn_to_page(pfn);
237
238 /*
239 * It is safe to check the buddy flag and order here, because
240 * this is the init stage and only a single thread runs.
241 */
242 if (PageBuddy(page)) {
243 pfn += (1UL << page_order(page)) - 1;
244 continue;
245 }
246
247 if (PageReserved(page))
248 continue;
249
250 page_ext = lookup_page_ext(page);
251
252 /* Maybe overlapping zone */
253 if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
254 continue;
255
256 /* Found early allocated page */
257 set_page_owner(page, 0, 0);
258 count++;
259 }
260 }
261
262 pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
263 pgdat->node_id, zone->name, count);
264}
265
266static void init_zones_in_node(pg_data_t *pgdat)
267{
268 struct zone *zone;
269 struct zone *node_zones = pgdat->node_zones;
270 unsigned long flags;
271
272 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
273 if (!populated_zone(zone))
274 continue;
275
276 spin_lock_irqsave(&zone->lock, flags);
277 init_pages_in_zone(pgdat, zone);
278 spin_unlock_irqrestore(&zone->lock, flags);
279 }
280}
281
282static void init_early_allocated_pages(void)
283{
284 pg_data_t *pgdat;
285
286 drain_all_pages(NULL);
287 for_each_online_pgdat(pgdat)
288 init_zones_in_node(pgdat);
289}
290
291static const struct file_operations proc_page_owner_operations = {
292 .read = read_page_owner,
293};
294
295static int __init pageowner_init(void)
296{
297 struct dentry *dentry;
298
299 if (!page_owner_inited) {
300 pr_info("page_owner is disabled\n");
301 return 0;
302 }
303
304 dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
305 NULL, &proc_page_owner_operations);
306 if (IS_ERR(dentry))
307 return PTR_ERR(dentry);
308
309 return 0;
310}
311module_init(pageowner_init)
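
Usage note: pageowner_init() registers a read-only "page_owner" file at the debugfs root, so with debugfs mounted at its conventional location the accumulated allocation records can be dumped with `cat /sys/kernel/debug/page_owner`, assuming the kernel was booted with page_owner=on so that page_owner_inited is set.
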
diff --git a/mm/rmap.c b/mm/rmap.c
index 45eba36fd673..c52f43a69eea 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,7 +23,7 @@
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_rwsem
27 * anon_vma->rwsem 27 * anon_vma->rwsem
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1260,7 +1260,7 @@ out_mlock:
1260 /* 1260 /*
1261 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1261 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1262 * unstable result and race. Plus, We can't wait here because 1262 * unstable result and race. Plus, We can't wait here because
1263 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. 1263 * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
1264 * if trylock failed, the page remain in evictable lru and later 1264 * if trylock failed, the page remain in evictable lru and later
1265 * vmscan could retry to move the page to unevictable lru if the 1265 * vmscan could retry to move the page to unevictable lru if the
1266 * page is actually mlocked. 1266 * page is actually mlocked.
@@ -1635,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
1635static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) 1635static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1636{ 1636{
1637 struct anon_vma *anon_vma; 1637 struct anon_vma *anon_vma;
1638 pgoff_t pgoff = page_to_pgoff(page); 1638 pgoff_t pgoff;
1639 struct anon_vma_chain *avc; 1639 struct anon_vma_chain *avc;
1640 int ret = SWAP_AGAIN; 1640 int ret = SWAP_AGAIN;
1641 1641
@@ -1643,6 +1643,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1643 if (!anon_vma) 1643 if (!anon_vma)
1644 return ret; 1644 return ret;
1645 1645
1646 pgoff = page_to_pgoff(page);
1646 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1647 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1647 struct vm_area_struct *vma = avc->vma; 1648 struct vm_area_struct *vma = avc->vma;
1648 unsigned long address = vma_address(page, vma); 1649 unsigned long address = vma_address(page, vma);
@@ -1676,7 +1677,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1676static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) 1677static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1677{ 1678{
1678 struct address_space *mapping = page->mapping; 1679 struct address_space *mapping = page->mapping;
1679 pgoff_t pgoff = page_to_pgoff(page); 1680 pgoff_t pgoff;
1680 struct vm_area_struct *vma; 1681 struct vm_area_struct *vma;
1681 int ret = SWAP_AGAIN; 1682 int ret = SWAP_AGAIN;
1682 1683
@@ -1684,13 +1685,15 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1684 * The page lock not only makes sure that page->mapping cannot 1685 * The page lock not only makes sure that page->mapping cannot
1685 * suddenly be NULLified by truncation, it makes sure that the 1686 * suddenly be NULLified by truncation, it makes sure that the
1686 * structure at mapping cannot be freed and reused yet, 1687 * structure at mapping cannot be freed and reused yet,
1687 * so we can safely take mapping->i_mmap_mutex. 1688 * so we can safely take mapping->i_mmap_rwsem.
1688 */ 1689 */
1689 VM_BUG_ON_PAGE(!PageLocked(page), page); 1690 VM_BUG_ON_PAGE(!PageLocked(page), page);
1690 1691
1691 if (!mapping) 1692 if (!mapping)
1692 return ret; 1693 return ret;
1693 mutex_lock(&mapping->i_mmap_mutex); 1694
1695 pgoff = page_to_pgoff(page);
1696 i_mmap_lock_read(mapping);
1694 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1697 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1695 unsigned long address = vma_address(page, vma); 1698 unsigned long address = vma_address(page, vma);
1696 1699
@@ -1711,9 +1714,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1711 goto done; 1714 goto done;
1712 1715
1713 ret = rwc->file_nonlinear(page, mapping, rwc->arg); 1716 ret = rwc->file_nonlinear(page, mapping, rwc->arg);
1714
1715done: 1717done:
1716 mutex_unlock(&mapping->i_mmap_mutex); 1718 i_mmap_unlock_read(mapping);
1717 return ret; 1719 return ret;
1718} 1720}
1719 1721
diff --git a/mm/slab.c b/mm/slab.c
index fee275b5b6b7..65b5dcb6f671 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3015,7 +3015,7 @@ retry:
3015 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 3015 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3016 nid = zone_to_nid(zone); 3016 nid = zone_to_nid(zone);
3017 3017
3018 if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) && 3018 if (cpuset_zone_allowed(zone, flags) &&
3019 get_node(cache, nid) && 3019 get_node(cache, nid) &&
3020 get_node(cache, nid)->free_objects) { 3020 get_node(cache, nid)->free_objects) {
3021 obj = ____cache_alloc_node(cache, 3021 obj = ____cache_alloc_node(cache,
@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3182 memset(ptr, 0, cachep->object_size); 3182 memset(ptr, 0, cachep->object_size);
3183 } 3183 }
3184 3184
3185 memcg_kmem_put_cache(cachep);
3185 return ptr; 3186 return ptr;
3186} 3187}
3187 3188
@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3247 memset(objp, 0, cachep->object_size); 3248 memset(objp, 0, cachep->object_size);
3248 } 3249 }
3249 3250
3251 memcg_kmem_put_cache(cachep);
3250 return objp; 3252 return objp;
3251} 3253}
3252 3254
diff --git a/mm/slub.c b/mm/slub.c
index 765c5884d03d..fe376fe1f4fe 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x)
1233 kmemleak_free(x); 1233 kmemleak_free(x);
1234} 1234}
1235 1235
1236static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1236static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
1237 gfp_t flags)
1237{ 1238{
1238 flags &= gfp_allowed_mask; 1239 flags &= gfp_allowed_mask;
1239 lockdep_trace_alloc(flags); 1240 lockdep_trace_alloc(flags);
1240 might_sleep_if(flags & __GFP_WAIT); 1241 might_sleep_if(flags & __GFP_WAIT);
1241 1242
1242 return should_failslab(s->object_size, flags, s->flags); 1243 if (should_failslab(s->object_size, flags, s->flags))
1244 return NULL;
1245
1246 return memcg_kmem_get_cache(s, flags);
1243} 1247}
1244 1248
1245static inline void slab_post_alloc_hook(struct kmem_cache *s, 1249static inline void slab_post_alloc_hook(struct kmem_cache *s,
@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
1248 flags &= gfp_allowed_mask; 1252 flags &= gfp_allowed_mask;
1249 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 1253 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1250 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); 1254 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
1255 memcg_kmem_put_cache(s);
1251} 1256}
1252 1257
1253static inline void slab_free_hook(struct kmem_cache *s, void *x) 1258static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -1665,8 +1670,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1665 1670
1666 n = get_node(s, zone_to_nid(zone)); 1671 n = get_node(s, zone_to_nid(zone));
1667 1672
1668 if (n && cpuset_zone_allowed(zone, 1673 if (n && cpuset_zone_allowed(zone, flags) &&
1669 flags | __GFP_HARDWALL) &&
1670 n->nr_partial > s->min_partial) { 1674 n->nr_partial > s->min_partial) {
1671 object = get_partial_node(s, n, c, flags); 1675 object = get_partial_node(s, n, c, flags);
1672 if (object) { 1676 if (object) {
@@ -2384,10 +2388,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2384 struct page *page; 2388 struct page *page;
2385 unsigned long tid; 2389 unsigned long tid;
2386 2390
2387 if (slab_pre_alloc_hook(s, gfpflags)) 2391 s = slab_pre_alloc_hook(s, gfpflags);
2392 if (!s)
2388 return NULL; 2393 return NULL;
2389
2390 s = memcg_kmem_get_cache(s, gfpflags);
2391redo: 2394redo:
2392 /* 2395 /*
2393 * Must read kmem_cache cpu data via this cpu ptr. Preemption is 2396 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
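
The slub.c change above makes slab_pre_alloc_hook() return the cache to allocate from: either the original cache, a per-memcg cache obtained via memcg_kmem_get_cache(), or NULL when failslab triggers; the matching memcg_kmem_put_cache() now lives in slab_post_alloc_hook(). A sketch of the caller pairing that both mm/slab.c and mm/slub.c now follow; this is an illustration, not either allocator's actual fast path, and the hooks themselves are static helpers inside the allocators:

#include <linux/slab.h>
#include <linux/gfp.h>

static void *example_slab_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
	void *object = NULL;

	s = slab_pre_alloc_hook(s, gfpflags);	/* should_failslab() + memcg_kmem_get_cache() */
	if (!s)
		return NULL;

	/* ... allocate 'object' from the (possibly memcg-specific) cache 's' ... */

	slab_post_alloc_hook(s, gfpflags, object);	/* drops the memcg cache reference */
	return object;
}
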
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 9f25af825dec..b6e3662fe339 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -17,6 +17,8 @@ void vmacache_flush_all(struct mm_struct *mm)
17{ 17{
18 struct task_struct *g, *p; 18 struct task_struct *g, *p;
19 19
20 count_vm_vmacache_event(VMACACHE_FULL_FLUSHES);
21
20 /* 22 /*
21 * Single threaded tasks need not iterate the entire 23 * Single threaded tasks need not iterate the entire
22 * list of process. We can avoid the flushing as well 24 * list of process. We can avoid the flushing as well
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8a18196fcdff..39c338896416 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2574,10 +2574,10 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2574 if (!counters) 2574 if (!counters)
2575 return; 2575 return;
2576 2576
2577 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2578 smp_rmb();
2579 if (v->flags & VM_UNINITIALIZED) 2577 if (v->flags & VM_UNINITIALIZED)
2580 return; 2578 return;
2579 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2580 smp_rmb();
2581 2581
2582 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2582 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2583 2583
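
The vmalloc.c hunk above moves smp_rmb() so that it sits between reading v->flags and reading the data that flag guards, pairing with the smp_wmb() in clear_vm_uninitialized_flag(). A minimal sketch of that publish/consume pattern; data and ready are illustrative stand-ins, not kernel symbols:

#include <asm/barrier.h>

static int data;
static int ready;

static void example_publish(void)
{
	data = 42;
	smp_wmb();	/* as in clear_vm_uninitialized_flag(): data before flag */
	ready = 1;
}

static int example_consume(void)
{
	if (!ready)
		return -1;	/* not published yet */
	smp_rmb();		/* pairs with the smp_wmb() above */
	return data;		/* guaranteed to see the published value */
}
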
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a384339bf718..bd9a72bc4a1b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker);
229 229
230#define SHRINK_BATCH 128 230#define SHRINK_BATCH 128
231 231
232static unsigned long 232static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
233shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, 233 struct shrinker *shrinker,
234 unsigned long nr_pages_scanned, unsigned long lru_pages) 234 unsigned long nr_scanned,
235 unsigned long nr_eligible)
235{ 236{
236 unsigned long freed = 0; 237 unsigned long freed = 0;
237 unsigned long long delta; 238 unsigned long long delta;
@@ -255,9 +256,9 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
255 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); 256 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
256 257
257 total_scan = nr; 258 total_scan = nr;
258 delta = (4 * nr_pages_scanned) / shrinker->seeks; 259 delta = (4 * nr_scanned) / shrinker->seeks;
259 delta *= freeable; 260 delta *= freeable;
260 do_div(delta, lru_pages + 1); 261 do_div(delta, nr_eligible + 1);
261 total_scan += delta; 262 total_scan += delta;
262 if (total_scan < 0) { 263 if (total_scan < 0) {
263 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", 264 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -289,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
289 total_scan = freeable * 2; 290 total_scan = freeable * 2;
290 291
291 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, 292 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
292 nr_pages_scanned, lru_pages, 293 nr_scanned, nr_eligible,
293 freeable, delta, total_scan); 294 freeable, delta, total_scan);
294 295
295 /* 296 /*
296 * Normally, we should not scan less than batch_size objects in one 297 * Normally, we should not scan less than batch_size objects in one
@@ -339,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
339 return freed; 340 return freed;
340} 341}
341 342
342/* 343/**
343 * Call the shrink functions to age shrinkable caches 344 * shrink_node_slabs - shrink slab caches of a given node
344 * 345 * @gfp_mask: allocation context
345 * Here we assume it costs one seek to replace a lru page and that it also 346 * @nid: node whose slab caches to target
346 * takes a seek to recreate a cache object. With this in mind we age equal 347 * @nr_scanned: pressure numerator
347 * percentages of the lru and ageable caches. This should balance the seeks 348 * @nr_eligible: pressure denominator
348 * generated by these structures.
349 * 349 *
350 * If the vm encountered mapped pages on the LRU it increase the pressure on 350 * Call the shrink functions to age shrinkable caches.
351 * slab to avoid swapping.
352 * 351 *
353 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 352 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
353 * unaware shrinkers will receive a node id of 0 instead.
354 * 354 *
355 * `lru_pages' represents the number of on-LRU pages in all the zones which 355 * @nr_scanned and @nr_eligible form a ratio that indicates how much of
356 * are eligible for the caller's allocation attempt. It is used for balancing 356 * the available objects should be scanned. Page reclaim for example
357 * slab reclaim versus page reclaim. 357 * passes the number of pages scanned and the number of pages on the
358 * LRU lists that it considered on @nid, plus a bias in @nr_scanned
359 * when it encountered mapped pages. The ratio is further biased by
360 * the ->seeks setting of the shrink function, which indicates the
361 * cost to recreate an object relative to that of an LRU page.
358 * 362 *
359 * Returns the number of slab objects which we shrunk. 363 * Returns the number of reclaimed slab objects.
360 */ 364 */
361unsigned long shrink_slab(struct shrink_control *shrinkctl, 365unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
362 unsigned long nr_pages_scanned, 366 unsigned long nr_scanned,
363 unsigned long lru_pages) 367 unsigned long nr_eligible)
364{ 368{
365 struct shrinker *shrinker; 369 struct shrinker *shrinker;
366 unsigned long freed = 0; 370 unsigned long freed = 0;
367 371
368 if (nr_pages_scanned == 0) 372 if (nr_scanned == 0)
369 nr_pages_scanned = SWAP_CLUSTER_MAX; 373 nr_scanned = SWAP_CLUSTER_MAX;
370 374
371 if (!down_read_trylock(&shrinker_rwsem)) { 375 if (!down_read_trylock(&shrinker_rwsem)) {
372 /* 376 /*
@@ -380,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
380 } 384 }
381 385
382 list_for_each_entry(shrinker, &shrinker_list, list) { 386 list_for_each_entry(shrinker, &shrinker_list, list) {
383 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { 387 struct shrink_control sc = {
384 shrinkctl->nid = 0; 388 .gfp_mask = gfp_mask,
385 freed += shrink_slab_node(shrinkctl, shrinker, 389 .nid = nid,
386 nr_pages_scanned, lru_pages); 390 };
387 continue;
388 }
389 391
390 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { 392 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
391 if (node_online(shrinkctl->nid)) 393 sc.nid = 0;
392 freed += shrink_slab_node(shrinkctl, shrinker,
393 nr_pages_scanned, lru_pages);
394 394
395 } 395 freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible);
396 } 396 }
397
397 up_read(&shrinker_rwsem); 398 up_read(&shrinker_rwsem);
398out: 399out:
399 cond_resched(); 400 cond_resched();
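
To make the nr_scanned/nr_eligible pressure ratio concrete: with the delta computation in shrink_slabs() above, scanning nr_scanned = 1,000 out of nr_eligible = 10,000 eligible LRU pages against a shrinker using DEFAULT_SEEKS (seeks = 2) and reporting freeable = 5,000 objects gives delta = (4 * 1000 / 2) * 5000 / 10001, roughly 1,000 objects. That is, about 20% of the cache is asked to be scanned for a 10% LRU scan; the factor of two comes from 4 / seeks.
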
@@ -1876,7 +1877,8 @@ enum scan_balance {
1876 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan 1877 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1877 */ 1878 */
1878static void get_scan_count(struct lruvec *lruvec, int swappiness, 1879static void get_scan_count(struct lruvec *lruvec, int swappiness,
1879 struct scan_control *sc, unsigned long *nr) 1880 struct scan_control *sc, unsigned long *nr,
1881 unsigned long *lru_pages)
1880{ 1882{
1881 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1883 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1882 u64 fraction[2]; 1884 u64 fraction[2];
@@ -2022,6 +2024,7 @@ out:
2022 some_scanned = false; 2024 some_scanned = false;
2023 /* Only use force_scan on second pass. */ 2025 /* Only use force_scan on second pass. */
2024 for (pass = 0; !some_scanned && pass < 2; pass++) { 2026 for (pass = 0; !some_scanned && pass < 2; pass++) {
2027 *lru_pages = 0;
2025 for_each_evictable_lru(lru) { 2028 for_each_evictable_lru(lru) {
2026 int file = is_file_lru(lru); 2029 int file = is_file_lru(lru);
2027 unsigned long size; 2030 unsigned long size;
@@ -2048,14 +2051,19 @@ out:
2048 case SCAN_FILE: 2051 case SCAN_FILE:
2049 case SCAN_ANON: 2052 case SCAN_ANON:
2050 /* Scan one type exclusively */ 2053 /* Scan one type exclusively */
2051 if ((scan_balance == SCAN_FILE) != file) 2054 if ((scan_balance == SCAN_FILE) != file) {
2055 size = 0;
2052 scan = 0; 2056 scan = 0;
2057 }
2053 break; 2058 break;
2054 default: 2059 default:
2055 /* Look ma, no brain */ 2060 /* Look ma, no brain */
2056 BUG(); 2061 BUG();
2057 } 2062 }
2063
2064 *lru_pages += size;
2058 nr[lru] = scan; 2065 nr[lru] = scan;
2066
2059 /* 2067 /*
2060 * Skip the second pass and don't force_scan, 2068 * Skip the second pass and don't force_scan,
2061 * if we found something to scan. 2069 * if we found something to scan.
@@ -2069,7 +2077,7 @@ out:
2069 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2077 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2070 */ 2078 */
2071static void shrink_lruvec(struct lruvec *lruvec, int swappiness, 2079static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
2072 struct scan_control *sc) 2080 struct scan_control *sc, unsigned long *lru_pages)
2073{ 2081{
2074 unsigned long nr[NR_LRU_LISTS]; 2082 unsigned long nr[NR_LRU_LISTS];
2075 unsigned long targets[NR_LRU_LISTS]; 2083 unsigned long targets[NR_LRU_LISTS];
@@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
2080 struct blk_plug plug; 2088 struct blk_plug plug;
2081 bool scan_adjusted; 2089 bool scan_adjusted;
2082 2090
2083 get_scan_count(lruvec, swappiness, sc, nr); 2091 get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
2084 2092
2085 /* Record the original scan target for proportional adjustments later */ 2093 /* Record the original scan target for proportional adjustments later */
2086 memcpy(targets, nr, sizeof(nr)); 2094 memcpy(targets, nr, sizeof(nr));
@@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone,
2258 } 2266 }
2259} 2267}
2260 2268
2261static bool shrink_zone(struct zone *zone, struct scan_control *sc) 2269static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2270 bool is_classzone)
2262{ 2271{
2263 unsigned long nr_reclaimed, nr_scanned; 2272 unsigned long nr_reclaimed, nr_scanned;
2264 bool reclaimable = false; 2273 bool reclaimable = false;
@@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2269 .zone = zone, 2278 .zone = zone,
2270 .priority = sc->priority, 2279 .priority = sc->priority,
2271 }; 2280 };
2281 unsigned long zone_lru_pages = 0;
2272 struct mem_cgroup *memcg; 2282 struct mem_cgroup *memcg;
2273 2283
2274 nr_reclaimed = sc->nr_reclaimed; 2284 nr_reclaimed = sc->nr_reclaimed;
@@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2276 2286
2277 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2287 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2278 do { 2288 do {
2289 unsigned long lru_pages;
2279 struct lruvec *lruvec; 2290 struct lruvec *lruvec;
2280 int swappiness; 2291 int swappiness;
2281 2292
2282 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2293 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2283 swappiness = mem_cgroup_swappiness(memcg); 2294 swappiness = mem_cgroup_swappiness(memcg);
2284 2295
2285 shrink_lruvec(lruvec, swappiness, sc); 2296 shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
2297 zone_lru_pages += lru_pages;
2286 2298
2287 /* 2299 /*
2288 * Direct reclaim and kswapd have to scan all memory 2300 * Direct reclaim and kswapd have to scan all memory
@@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2302 memcg = mem_cgroup_iter(root, memcg, &reclaim); 2314 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2303 } while (memcg); 2315 } while (memcg);
2304 2316
2317 /*
2318 * Shrink the slab caches in the same proportion that
2319 * the eligible LRU pages were scanned.
2320 */
2321 if (global_reclaim(sc) && is_classzone) {
2322 struct reclaim_state *reclaim_state;
2323
2324 shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone),
2325 sc->nr_scanned - nr_scanned,
2326 zone_lru_pages);
2327
2328 reclaim_state = current->reclaim_state;
2329 if (reclaim_state) {
2330 sc->nr_reclaimed +=
2331 reclaim_state->reclaimed_slab;
2332 reclaim_state->reclaimed_slab = 0;
2333 }
2334 }
2335
2305 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2336 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2306 sc->nr_scanned - nr_scanned, 2337 sc->nr_scanned - nr_scanned,
2307 sc->nr_reclaimed - nr_reclaimed); 2338 sc->nr_reclaimed - nr_reclaimed);
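
From a shrinker author's point of view this rework changes nothing: the per-node pressure computed by shrink_node_slabs() above still arrives through struct shrink_control. An uncompiled, kernel-style sketch of a NUMA-aware shrinker driven by that call (the cache bookkeeping is a placeholder, not part of this patch):

#include <linux/shrinker.h>
#include <linux/nodemask.h>

/* Placeholder per-node object counts for some hypothetical cache. */
static unsigned long demo_nr_objects[MAX_NUMNODES];

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
        /* sc->nid is the node under reclaim; 0 if !SHRINKER_NUMA_AWARE */
        return demo_nr_objects[sc->nid];
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
        unsigned long freed = 0;

        /* free up to sc->nr_to_scan objects from node sc->nid here */
        return freed;
}

static struct shrinker demo_shrinker = {
        .count_objects  = demo_count,
        .scan_objects   = demo_scan,
        .seeks          = DEFAULT_SEEKS,
        .flags          = SHRINKER_NUMA_AWARE,
};

/* register_shrinker(&demo_shrinker) at init, unregister_shrinker() at exit. */
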
@@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2376 struct zone *zone; 2407 struct zone *zone;
2377 unsigned long nr_soft_reclaimed; 2408 unsigned long nr_soft_reclaimed;
2378 unsigned long nr_soft_scanned; 2409 unsigned long nr_soft_scanned;
2379 unsigned long lru_pages = 0;
2380 struct reclaim_state *reclaim_state = current->reclaim_state;
2381 gfp_t orig_mask; 2410 gfp_t orig_mask;
2382 struct shrink_control shrink = {
2383 .gfp_mask = sc->gfp_mask,
2384 };
2385 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); 2411 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
2386 bool reclaimable = false; 2412 bool reclaimable = false;
2387 2413
@@ -2394,12 +2420,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2394 if (buffer_heads_over_limit) 2420 if (buffer_heads_over_limit)
2395 sc->gfp_mask |= __GFP_HIGHMEM; 2421 sc->gfp_mask |= __GFP_HIGHMEM;
2396 2422
2397 nodes_clear(shrink.nodes_to_scan);
2398
2399 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2423 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2400 gfp_zone(sc->gfp_mask), sc->nodemask) { 2424 requested_highidx, sc->nodemask) {
2425 enum zone_type classzone_idx;
2426
2401 if (!populated_zone(zone)) 2427 if (!populated_zone(zone))
2402 continue; 2428 continue;
2429
2430 classzone_idx = requested_highidx;
2431 while (!populated_zone(zone->zone_pgdat->node_zones +
2432 classzone_idx))
2433 classzone_idx--;
2434
2403 /* 2435 /*
2404 * Take care memory controller reclaiming has small influence 2436 * Take care memory controller reclaiming has small influence
2405 * to global LRU. 2437 * to global LRU.
@@ -2409,9 +2441,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2409 GFP_KERNEL | __GFP_HARDWALL)) 2441 GFP_KERNEL | __GFP_HARDWALL))
2410 continue; 2442 continue;
2411 2443
2412 lru_pages += zone_reclaimable_pages(zone);
2413 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2414
2415 if (sc->priority != DEF_PRIORITY && 2444 if (sc->priority != DEF_PRIORITY &&
2416 !zone_reclaimable(zone)) 2445 !zone_reclaimable(zone))
2417 continue; /* Let kswapd poll it */ 2446 continue; /* Let kswapd poll it */
@@ -2450,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2450 /* need some check for avoid more shrink_zone() */ 2479 /* need some check for avoid more shrink_zone() */
2451 } 2480 }
2452 2481
2453 if (shrink_zone(zone, sc)) 2482 if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
2454 reclaimable = true; 2483 reclaimable = true;
2455 2484
2456 if (global_reclaim(sc) && 2485 if (global_reclaim(sc) &&
@@ -2459,20 +2488,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2459 } 2488 }
2460 2489
2461 /* 2490 /*
2462 * Don't shrink slabs when reclaiming memory from over limit cgroups
2463 * but do shrink slab at least once when aborting reclaim for
2464 * compaction to avoid unevenly scanning file/anon LRU pages over slab
2465 * pages.
2466 */
2467 if (global_reclaim(sc)) {
2468 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2469 if (reclaim_state) {
2470 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2471 reclaim_state->reclaimed_slab = 0;
2472 }
2473 }
2474
2475 /*
2476 * Restore to original mask to avoid the impact on the caller if we 2491 * Restore to original mask to avoid the impact on the caller if we
2477 * promoted it to __GFP_HIGHMEM. 2492 * promoted it to __GFP_HIGHMEM.
2478 */ 2493 */
@@ -2736,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2736 }; 2751 };
2737 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2752 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2738 int swappiness = mem_cgroup_swappiness(memcg); 2753 int swappiness = mem_cgroup_swappiness(memcg);
2754 unsigned long lru_pages;
2739 2755
2740 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2756 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2741 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2757 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2751,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2751 * will pick up pages from other mem cgroup's as well. We hack 2767 * will pick up pages from other mem cgroup's as well. We hack
2752 * the priority and make it zero. 2768 * the priority and make it zero.
2753 */ 2769 */
2754 shrink_lruvec(lruvec, swappiness, &sc); 2770 shrink_lruvec(lruvec, swappiness, &sc, &lru_pages);
2755 2771
2756 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2772 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2757 2773
@@ -2932,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2932static bool kswapd_shrink_zone(struct zone *zone, 2948static bool kswapd_shrink_zone(struct zone *zone,
2933 int classzone_idx, 2949 int classzone_idx,
2934 struct scan_control *sc, 2950 struct scan_control *sc,
2935 unsigned long lru_pages,
2936 unsigned long *nr_attempted) 2951 unsigned long *nr_attempted)
2937{ 2952{
2938 int testorder = sc->order; 2953 int testorder = sc->order;
2939 unsigned long balance_gap; 2954 unsigned long balance_gap;
2940 struct reclaim_state *reclaim_state = current->reclaim_state;
2941 struct shrink_control shrink = {
2942 .gfp_mask = sc->gfp_mask,
2943 };
2944 bool lowmem_pressure; 2955 bool lowmem_pressure;
2945 2956
2946 /* Reclaim above the high watermark. */ 2957 /* Reclaim above the high watermark. */
@@ -2975,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
2975 balance_gap, classzone_idx)) 2986 balance_gap, classzone_idx))
2976 return true; 2987 return true;
2977 2988
2978 shrink_zone(zone, sc); 2989 shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
2979 nodes_clear(shrink.nodes_to_scan);
2980 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2981
2982 reclaim_state->reclaimed_slab = 0;
2983 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2984 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2985 2990
2986 /* Account for the number of pages attempted to reclaim */ 2991 /* Account for the number of pages attempted to reclaim */
2987 *nr_attempted += sc->nr_to_reclaim; 2992 *nr_attempted += sc->nr_to_reclaim;
@@ -3042,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3042 count_vm_event(PAGEOUTRUN); 3047 count_vm_event(PAGEOUTRUN);
3043 3048
3044 do { 3049 do {
3045 unsigned long lru_pages = 0;
3046 unsigned long nr_attempted = 0; 3050 unsigned long nr_attempted = 0;
3047 bool raise_priority = true; 3051 bool raise_priority = true;
3048 bool pgdat_needs_compaction = (order > 0); 3052 bool pgdat_needs_compaction = (order > 0);
@@ -3102,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3102 if (!populated_zone(zone)) 3106 if (!populated_zone(zone))
3103 continue; 3107 continue;
3104 3108
3105 lru_pages += zone_reclaimable_pages(zone);
3106
3107 /* 3109 /*
3108 * If any zone is currently balanced then kswapd will 3110 * If any zone is currently balanced then kswapd will
3109 * not call compaction as it is expected that the 3111 * not call compaction as it is expected that the
@@ -3159,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3159 * that that high watermark would be met at 100% 3161 * that that high watermark would be met at 100%
3160 * efficiency. 3162 * efficiency.
3161 */ 3163 */
3162 if (kswapd_shrink_zone(zone, end_zone, &sc, 3164 if (kswapd_shrink_zone(zone, end_zone,
3163 lru_pages, &nr_attempted)) 3165 &sc, &nr_attempted))
3164 raise_priority = false; 3166 raise_priority = false;
3165 } 3167 }
3166 3168
@@ -3612,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3612 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3614 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3613 .may_swap = 1, 3615 .may_swap = 1,
3614 }; 3616 };
3615 struct shrink_control shrink = {
3616 .gfp_mask = sc.gfp_mask,
3617 };
3618 unsigned long nr_slab_pages0, nr_slab_pages1;
3619 3617
3620 cond_resched(); 3618 cond_resched();
3621 /* 3619 /*
@@ -3634,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3634 * priorities until we have enough memory freed. 3632 * priorities until we have enough memory freed.
3635 */ 3633 */
3636 do { 3634 do {
3637 shrink_zone(zone, &sc); 3635 shrink_zone(zone, &sc, true);
3638 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); 3636 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3639 } 3637 }
3640 3638
3641 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3642 if (nr_slab_pages0 > zone->min_slab_pages) {
3643 /*
3644 * shrink_slab() does not currently allow us to determine how
3645 * many pages were freed in this zone. So we take the current
3646 * number of slab pages and shake the slab until it is reduced
3647 * by the same nr_pages that we used for reclaiming unmapped
3648 * pages.
3649 */
3650 nodes_clear(shrink.nodes_to_scan);
3651 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
3652 for (;;) {
3653 unsigned long lru_pages = zone_reclaimable_pages(zone);
3654
3655 /* No reclaimable slab or very low memory pressure */
3656 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3657 break;
3658
3659 /* Freed enough memory */
3660 nr_slab_pages1 = zone_page_state(zone,
3661 NR_SLAB_RECLAIMABLE);
3662 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3663 break;
3664 }
3665
3666 /*
3667 * Update nr_reclaimed by the number of slab pages we
3668 * reclaimed from this zone.
3669 */
3670 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3671 if (nr_slab_pages1 < nr_slab_pages0)
3672 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3673 }
3674
3675 p->reclaim_state = NULL; 3639 p->reclaim_state = NULL;
3676 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 3640 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3677 lockdep_clear_current_reclaim_state(); 3641 lockdep_clear_current_reclaim_state();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1b12d390dc68..1284f89fca08 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -22,6 +22,8 @@
22#include <linux/writeback.h> 22#include <linux/writeback.h>
23#include <linux/compaction.h> 23#include <linux/compaction.h>
24#include <linux/mm_inline.h> 24#include <linux/mm_inline.h>
25#include <linux/page_ext.h>
26#include <linux/page_owner.h>
25 27
26#include "internal.h" 28#include "internal.h"
27 29
@@ -898,6 +900,7 @@ const char * const vmstat_text[] = {
898#ifdef CONFIG_DEBUG_VM_VMACACHE 900#ifdef CONFIG_DEBUG_VM_VMACACHE
899 "vmacache_find_calls", 901 "vmacache_find_calls",
900 "vmacache_find_hits", 902 "vmacache_find_hits",
903 "vmacache_full_flushes",
901#endif 904#endif
902#endif /* CONFIG_VM_EVENTS_COUNTERS */ 905#endif /* CONFIG_VM_EVENTS_COUNTERS */
903}; 906};
@@ -1017,6 +1020,104 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1017 return 0; 1020 return 0;
1018} 1021}
1019 1022
1023#ifdef CONFIG_PAGE_OWNER
1024static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
1025 pg_data_t *pgdat,
1026 struct zone *zone)
1027{
1028 struct page *page;
1029 struct page_ext *page_ext;
1030 unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
1031 unsigned long end_pfn = pfn + zone->spanned_pages;
1032 unsigned long count[MIGRATE_TYPES] = { 0, };
1033 int pageblock_mt, page_mt;
1034 int i;
1035
1036 /* Scan block by block. First and last block may be incomplete */
1037 pfn = zone->zone_start_pfn;
1038
1039 /*
1040 * Walk the zone in pageblock_nr_pages steps. If a page block spans
1041 * a zone boundary, it will be double counted between zones. This does
1042 * not matter as the mixed block count will still be correct
1043 */
1044 for (; pfn < end_pfn; ) {
1045 if (!pfn_valid(pfn)) {
1046 pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
1047 continue;
1048 }
1049
1050 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
1051 block_end_pfn = min(block_end_pfn, end_pfn);
1052
1053 page = pfn_to_page(pfn);
1054 pageblock_mt = get_pfnblock_migratetype(page, pfn);
1055
1056 for (; pfn < block_end_pfn; pfn++) {
1057 if (!pfn_valid_within(pfn))
1058 continue;
1059
1060 page = pfn_to_page(pfn);
1061 if (PageBuddy(page)) {
1062 pfn += (1UL << page_order(page)) - 1;
1063 continue;
1064 }
1065
1066 if (PageReserved(page))
1067 continue;
1068
1069 page_ext = lookup_page_ext(page);
1070
1071 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
1072 continue;
1073
1074 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
1075 if (pageblock_mt != page_mt) {
1076 if (is_migrate_cma(pageblock_mt))
1077 count[MIGRATE_MOVABLE]++;
1078 else
1079 count[pageblock_mt]++;
1080
1081 pfn = block_end_pfn;
1082 break;
1083 }
1084 pfn += (1UL << page_ext->order) - 1;
1085 }
1086 }
1087
1088 /* Print counts */
1089 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1090 for (i = 0; i < MIGRATE_TYPES; i++)
1091 seq_printf(m, "%12lu ", count[i]);
1092 seq_putc(m, '\n');
1093}
1094#endif /* CONFIG_PAGE_OWNER */
1095
1096/*
1097 * Print out the number of pageblocks for each migratetype that contain pages
1098 * of other types. This gives an indication of how well fallbacks are being
1099 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
 1100 * to determine what is going on.
1101 */
1102static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1103{
1104#ifdef CONFIG_PAGE_OWNER
1105 int mtype;
1106
1107 if (!page_owner_inited)
1108 return;
1109
1110 drain_all_pages(NULL);
1111
1112 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1113 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1114 seq_printf(m, "%12s ", migratetype_names[mtype]);
1115 seq_putc(m, '\n');
1116
1117 walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
1118#endif /* CONFIG_PAGE_OWNER */
1119}
1120
1020/* 1121/*
1021 * This prints out statistics in relation to grouping pages by mobility. 1122 * This prints out statistics in relation to grouping pages by mobility.
1022 * It is expensive to collect so do not constantly read the file. 1123 * It is expensive to collect so do not constantly read the file.
@@ -1034,6 +1135,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
1034 seq_putc(m, '\n'); 1135 seq_putc(m, '\n');
1035 pagetypeinfo_showfree(m, pgdat); 1136 pagetypeinfo_showfree(m, pgdat);
1036 pagetypeinfo_showblockcount(m, pgdat); 1137 pagetypeinfo_showblockcount(m, pgdat);
1138 pagetypeinfo_showmixedcount(m, pgdat);
1037 1139
1038 return 0; 1140 return 0;
1039} 1141}
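
The new row is only emitted when CONFIG_PAGE_OWNER is built in and page_owner was initialized at boot (page_owner_inited); otherwise /proc/pagetypeinfo looks exactly as before. A trivial userspace check, nothing below comes from the patch itself:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[512];
        int in_mixed = 0;
        FILE *f = fopen("/proc/pagetypeinfo", "r");

        if (!f) {
                perror("/proc/pagetypeinfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (strstr(line, "Number of mixed blocks"))
                        in_mixed = 1;           /* mixed counts are the last section */
                if (in_mixed)
                        fputs(line, stdout);    /* header plus one row per zone */
        }
        fclose(f);
        return 0;
}
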
diff --git a/mm/zbud.c b/mm/zbud.c
index ec71b37fb06c..4e387bea702e 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -132,7 +132,7 @@ static struct zbud_ops zbud_zpool_ops = {
132 132
133static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) 133static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
134{ 134{
135 return zbud_create_pool(gfp, &zbud_zpool_ops); 135 return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
136} 136}
137 137
138static void zbud_zpool_destroy(void *pool) 138static void zbud_zpool_destroy(void *pool)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 839a48c3ca27..4d0a063145ec 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -155,8 +155,6 @@
155 * (reason above) 155 * (reason above)
156 */ 156 */
157#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) 157#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
158#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
159 ZS_SIZE_CLASS_DELTA + 1)
160 158
161/* 159/*
162 * We do not maintain any list for completely empty or full pages 160 * We do not maintain any list for completely empty or full pages
@@ -171,6 +169,11 @@ enum fullness_group {
171}; 169};
172 170
173/* 171/*
172 * number of size_classes
173 */
174static int zs_size_classes;
175
176/*
174 * We assign a page to ZS_ALMOST_EMPTY fullness group when: 177 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
175 * n <= N / f, where 178 * n <= N / f, where
176 * n = number of allocated objects 179 * n = number of allocated objects
@@ -214,7 +217,7 @@ struct link_free {
214}; 217};
215 218
216struct zs_pool { 219struct zs_pool {
217 struct size_class size_class[ZS_SIZE_CLASSES]; 220 struct size_class **size_class;
218 221
219 gfp_t flags; /* allocation flags used when growing pool */ 222 gfp_t flags; /* allocation flags used when growing pool */
220 atomic_long_t pages_allocated; 223 atomic_long_t pages_allocated;
@@ -468,7 +471,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
468 if (newfg == currfg) 471 if (newfg == currfg)
469 goto out; 472 goto out;
470 473
471 class = &pool->size_class[class_idx]; 474 class = pool->size_class[class_idx];
472 remove_zspage(page, class, currfg); 475 remove_zspage(page, class, currfg);
473 insert_zspage(page, class, newfg); 476 insert_zspage(page, class, newfg);
474 set_zspage_mapping(page, class_idx, newfg); 477 set_zspage_mapping(page, class_idx, newfg);
@@ -629,6 +632,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
629 struct page *next_page; 632 struct page *next_page;
630 struct link_free *link; 633 struct link_free *link;
631 unsigned int i = 1; 634 unsigned int i = 1;
635 void *vaddr;
632 636
633 /* 637 /*
634 * page->index stores offset of first object starting 638 * page->index stores offset of first object starting
@@ -639,8 +643,8 @@ static void init_zspage(struct page *first_page, struct size_class *class)
639 if (page != first_page) 643 if (page != first_page)
640 page->index = off; 644 page->index = off;
641 645
642 link = (struct link_free *)kmap_atomic(page) + 646 vaddr = kmap_atomic(page);
643 off / sizeof(*link); 647 link = (struct link_free *)vaddr + off / sizeof(*link);
644 648
645 while ((off += class->size) < PAGE_SIZE) { 649 while ((off += class->size) < PAGE_SIZE) {
646 link->next = obj_location_to_handle(page, i++); 650 link->next = obj_location_to_handle(page, i++);
@@ -654,7 +658,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
654 */ 658 */
655 next_page = get_next_page(page); 659 next_page = get_next_page(page);
656 link->next = obj_location_to_handle(next_page, 0); 660 link->next = obj_location_to_handle(next_page, 0);
657 kunmap_atomic(link); 661 kunmap_atomic(vaddr);
658 page = next_page; 662 page = next_page;
659 off %= PAGE_SIZE; 663 off %= PAGE_SIZE;
660 } 664 }
@@ -784,7 +788,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
784 */ 788 */
785 if (area->vm_buf) 789 if (area->vm_buf)
786 return 0; 790 return 0;
787 area->vm_buf = (char *)__get_free_page(GFP_KERNEL); 791 area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
788 if (!area->vm_buf) 792 if (!area->vm_buf)
789 return -ENOMEM; 793 return -ENOMEM;
790 return 0; 794 return 0;
@@ -792,8 +796,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
792 796
793static inline void __zs_cpu_down(struct mapping_area *area) 797static inline void __zs_cpu_down(struct mapping_area *area)
794{ 798{
795 if (area->vm_buf) 799 kfree(area->vm_buf);
796 free_page((unsigned long)area->vm_buf);
797 area->vm_buf = NULL; 800 area->vm_buf = NULL;
798} 801}
799 802
@@ -881,14 +884,10 @@ static struct notifier_block zs_cpu_nb = {
881 .notifier_call = zs_cpu_notifier 884 .notifier_call = zs_cpu_notifier
882}; 885};
883 886
884static void zs_exit(void) 887static void zs_unregister_cpu_notifier(void)
885{ 888{
886 int cpu; 889 int cpu;
887 890
888#ifdef CONFIG_ZPOOL
889 zpool_unregister_driver(&zs_zpool_driver);
890#endif
891
892 cpu_notifier_register_begin(); 891 cpu_notifier_register_begin();
893 892
894 for_each_online_cpu(cpu) 893 for_each_online_cpu(cpu)
@@ -898,31 +897,74 @@ static void zs_exit(void)
898 cpu_notifier_register_done(); 897 cpu_notifier_register_done();
899} 898}
900 899
901static int zs_init(void) 900static int zs_register_cpu_notifier(void)
902{ 901{
903 int cpu, ret; 902 int cpu, uninitialized_var(ret);
904 903
905 cpu_notifier_register_begin(); 904 cpu_notifier_register_begin();
906 905
907 __register_cpu_notifier(&zs_cpu_nb); 906 __register_cpu_notifier(&zs_cpu_nb);
908 for_each_online_cpu(cpu) { 907 for_each_online_cpu(cpu) {
909 ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 908 ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
910 if (notifier_to_errno(ret)) { 909 if (notifier_to_errno(ret))
911 cpu_notifier_register_done(); 910 break;
912 goto fail;
913 }
914 } 911 }
915 912
916 cpu_notifier_register_done(); 913 cpu_notifier_register_done();
914 return notifier_to_errno(ret);
915}
916
917static void init_zs_size_classes(void)
918{
919 int nr;
917 920
921 nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
922 if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
923 nr += 1;
924
925 zs_size_classes = nr;
926}
927
928static void __exit zs_exit(void)
929{
918#ifdef CONFIG_ZPOOL 930#ifdef CONFIG_ZPOOL
919 zpool_register_driver(&zs_zpool_driver); 931 zpool_unregister_driver(&zs_zpool_driver);
920#endif 932#endif
933 zs_unregister_cpu_notifier();
934}
921 935
936static int __init zs_init(void)
937{
938 int ret = zs_register_cpu_notifier();
939
940 if (ret) {
941 zs_unregister_cpu_notifier();
942 return ret;
943 }
944
945 init_zs_size_classes();
946
947#ifdef CONFIG_ZPOOL
948 zpool_register_driver(&zs_zpool_driver);
949#endif
922 return 0; 950 return 0;
923fail: 951}
924 zs_exit(); 952
925 return notifier_to_errno(ret); 953static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
954{
955 return pages_per_zspage * PAGE_SIZE / size;
956}
957
958static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
959{
960 if (prev->pages_per_zspage != pages_per_zspage)
961 return false;
962
963 if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
964 != get_maxobj_per_zspage(size, pages_per_zspage))
965 return false;
966
967 return true;
926} 968}
927 969
928/** 970/**
@@ -937,33 +979,71 @@ fail:
937 */ 979 */
938struct zs_pool *zs_create_pool(gfp_t flags) 980struct zs_pool *zs_create_pool(gfp_t flags)
939{ 981{
940 int i, ovhd_size; 982 int i;
941 struct zs_pool *pool; 983 struct zs_pool *pool;
984 struct size_class *prev_class = NULL;
942 985
943 ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); 986 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
944 pool = kzalloc(ovhd_size, GFP_KERNEL);
945 if (!pool) 987 if (!pool)
946 return NULL; 988 return NULL;
947 989
948 for (i = 0; i < ZS_SIZE_CLASSES; i++) { 990 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
991 GFP_KERNEL);
992 if (!pool->size_class) {
993 kfree(pool);
994 return NULL;
995 }
996
997 /*
 998 * Iterate in reverse, because the size of the size_class that we want to use
 999 * for merging should be larger than or equal to the current size.
1000 */
1001 for (i = zs_size_classes - 1; i >= 0; i--) {
949 int size; 1002 int size;
1003 int pages_per_zspage;
950 struct size_class *class; 1004 struct size_class *class;
951 1005
952 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; 1006 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
953 if (size > ZS_MAX_ALLOC_SIZE) 1007 if (size > ZS_MAX_ALLOC_SIZE)
954 size = ZS_MAX_ALLOC_SIZE; 1008 size = ZS_MAX_ALLOC_SIZE;
1009 pages_per_zspage = get_pages_per_zspage(size);
1010
1011 /*
 1012 * size_class is used for normal zsmalloc operations such
 1013 * as alloc/free for that size. Although it is natural that we
 1014 * have one size_class for each size, there is a chance that we
 1015 * can get better memory utilization if we use one size_class
 1016 * for many different sizes whose size_classes have the same
 1017 * characteristics. So, we make size_class point to the
 1018 * previous size_class if possible.
1019 */
1020 if (prev_class) {
1021 if (can_merge(prev_class, size, pages_per_zspage)) {
1022 pool->size_class[i] = prev_class;
1023 continue;
1024 }
1025 }
1026
1027 class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
1028 if (!class)
1029 goto err;
955 1030
956 class = &pool->size_class[i];
957 class->size = size; 1031 class->size = size;
958 class->index = i; 1032 class->index = i;
1033 class->pages_per_zspage = pages_per_zspage;
959 spin_lock_init(&class->lock); 1034 spin_lock_init(&class->lock);
960 class->pages_per_zspage = get_pages_per_zspage(size); 1035 pool->size_class[i] = class;
961 1036
1037 prev_class = class;
962 } 1038 }
963 1039
964 pool->flags = flags; 1040 pool->flags = flags;
965 1041
966 return pool; 1042 return pool;
1043
1044err:
1045 zs_destroy_pool(pool);
1046 return NULL;
967} 1047}
968EXPORT_SYMBOL_GPL(zs_create_pool); 1048EXPORT_SYMBOL_GPL(zs_create_pool);
969 1049
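
The merge decision made by can_merge() above is pure arithmetic: two classes share a size_class when they need the same number of pages per zspage and pack the same maximum number of objects into it. A standalone model of that check, assuming a 4 KB page and a simplified stand-in for get_pages_per_zspage(); both are assumptions for illustration, not copied from zsmalloc:

#include <stdio.h>

#define MODEL_PAGE_SIZE 4096
#define MODEL_MIN_ALLOC 32
#define MODEL_DELTA     (MODEL_PAGE_SIZE >> 8)  /* 16 bytes between classes */
#define MODEL_CLASSES   ((MODEL_PAGE_SIZE - MODEL_MIN_ALLOC) / MODEL_DELTA + 1)

/* Simplified stand-in for get_pages_per_zspage(): 1..4 pages, least waste. */
static int pages_per_zspage(int size)
{
        int i, best = 1, best_waste = MODEL_PAGE_SIZE;

        for (i = 1; i <= 4; i++) {
                int waste = (i * MODEL_PAGE_SIZE) % size;

                /* compare waste fractions: waste/i < best_waste/best */
                if (waste * best < best_waste * i) {
                        best = i;
                        best_waste = waste;
                }
        }
        return best;
}

static int maxobj(int size, int pages)
{
        return pages * MODEL_PAGE_SIZE / size;
}

int main(void)
{
        int i, mergeable = 0;

        for (i = 1; i < MODEL_CLASSES; i++) {
                int prev = MODEL_MIN_ALLOC + (i - 1) * MODEL_DELTA;
                int size = MODEL_MIN_ALLOC + i * MODEL_DELTA;
                int pp = pages_per_zspage(prev), ps = pages_per_zspage(size);

                /* same criteria as can_merge() */
                if (pp == ps && maxobj(prev, pp) == maxobj(size, ps))
                        mergeable++;
        }
        printf("%d of %d adjacent class pairs satisfy the merge criteria\n",
               mergeable, MODEL_CLASSES - 1);
        return 0;
}

Running the model shows that many adjacent classes collapse into one, which is why the pool now allocates size_class structures lazily through kcalloc() instead of embedding a fixed ZS_SIZE_CLASSES array.
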
@@ -971,9 +1051,15 @@ void zs_destroy_pool(struct zs_pool *pool)
971{ 1051{
972 int i; 1052 int i;
973 1053
974 for (i = 0; i < ZS_SIZE_CLASSES; i++) { 1054 for (i = 0; i < zs_size_classes; i++) {
975 int fg; 1055 int fg;
976 struct size_class *class = &pool->size_class[i]; 1056 struct size_class *class = pool->size_class[i];
1057
1058 if (!class)
1059 continue;
1060
1061 if (class->index != i)
1062 continue;
977 1063
978 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { 1064 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
979 if (class->fullness_list[fg]) { 1065 if (class->fullness_list[fg]) {
@@ -981,7 +1067,10 @@ void zs_destroy_pool(struct zs_pool *pool)
981 class->size, fg); 1067 class->size, fg);
982 } 1068 }
983 } 1069 }
1070 kfree(class);
984 } 1071 }
1072
1073 kfree(pool->size_class);
985 kfree(pool); 1074 kfree(pool);
986} 1075}
987EXPORT_SYMBOL_GPL(zs_destroy_pool); 1076EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -999,8 +1088,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
999{ 1088{
1000 unsigned long obj; 1089 unsigned long obj;
1001 struct link_free *link; 1090 struct link_free *link;
1002 int class_idx;
1003 struct size_class *class; 1091 struct size_class *class;
1092 void *vaddr;
1004 1093
1005 struct page *first_page, *m_page; 1094 struct page *first_page, *m_page;
1006 unsigned long m_objidx, m_offset; 1095 unsigned long m_objidx, m_offset;
@@ -1008,9 +1097,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1008 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) 1097 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
1009 return 0; 1098 return 0;
1010 1099
1011 class_idx = get_size_class_index(size); 1100 class = pool->size_class[get_size_class_index(size)];
1012 class = &pool->size_class[class_idx];
1013 BUG_ON(class_idx != class->index);
1014 1101
1015 spin_lock(&class->lock); 1102 spin_lock(&class->lock);
1016 first_page = find_get_zspage(class); 1103 first_page = find_get_zspage(class);
@@ -1031,11 +1118,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1031 obj_handle_to_location(obj, &m_page, &m_objidx); 1118 obj_handle_to_location(obj, &m_page, &m_objidx);
1032 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); 1119 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1033 1120
1034 link = (struct link_free *)kmap_atomic(m_page) + 1121 vaddr = kmap_atomic(m_page);
1035 m_offset / sizeof(*link); 1122 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1036 first_page->freelist = link->next; 1123 first_page->freelist = link->next;
1037 memset(link, POISON_INUSE, sizeof(*link)); 1124 memset(link, POISON_INUSE, sizeof(*link));
1038 kunmap_atomic(link); 1125 kunmap_atomic(vaddr);
1039 1126
1040 first_page->inuse++; 1127 first_page->inuse++;
1041 /* Now move the zspage to another fullness group, if required */ 1128 /* Now move the zspage to another fullness group, if required */
@@ -1051,6 +1138,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1051 struct link_free *link; 1138 struct link_free *link;
1052 struct page *first_page, *f_page; 1139 struct page *first_page, *f_page;
1053 unsigned long f_objidx, f_offset; 1140 unsigned long f_objidx, f_offset;
1141 void *vaddr;
1054 1142
1055 int class_idx; 1143 int class_idx;
1056 struct size_class *class; 1144 struct size_class *class;
@@ -1063,16 +1151,16 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1063 first_page = get_first_page(f_page); 1151 first_page = get_first_page(f_page);
1064 1152
1065 get_zspage_mapping(first_page, &class_idx, &fullness); 1153 get_zspage_mapping(first_page, &class_idx, &fullness);
1066 class = &pool->size_class[class_idx]; 1154 class = pool->size_class[class_idx];
1067 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); 1155 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
1068 1156
1069 spin_lock(&class->lock); 1157 spin_lock(&class->lock);
1070 1158
1071 /* Insert this object in containing zspage's freelist */ 1159 /* Insert this object in containing zspage's freelist */
1072 link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) 1160 vaddr = kmap_atomic(f_page);
1073 + f_offset); 1161 link = (struct link_free *)(vaddr + f_offset);
1074 link->next = first_page->freelist; 1162 link->next = first_page->freelist;
1075 kunmap_atomic(link); 1163 kunmap_atomic(vaddr);
1076 first_page->freelist = (void *)obj; 1164 first_page->freelist = (void *)obj;
1077 1165
1078 first_page->inuse--; 1166 first_page->inuse--;
@@ -1124,7 +1212,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1124 1212
1125 obj_handle_to_location(handle, &page, &obj_idx); 1213 obj_handle_to_location(handle, &page, &obj_idx);
1126 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1214 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1127 class = &pool->size_class[class_idx]; 1215 class = pool->size_class[class_idx];
1128 off = obj_idx_to_offset(page, obj_idx, class->size); 1216 off = obj_idx_to_offset(page, obj_idx, class->size);
1129 1217
1130 area = &get_cpu_var(zs_map_area); 1218 area = &get_cpu_var(zs_map_area);
@@ -1158,7 +1246,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1158 1246
1159 obj_handle_to_location(handle, &page, &obj_idx); 1247 obj_handle_to_location(handle, &page, &obj_idx);
1160 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1248 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1161 class = &pool->size_class[class_idx]; 1249 class = pool->size_class[class_idx];
1162 off = obj_idx_to_offset(page, obj_idx, class->size); 1250 off = obj_idx_to_offset(page, obj_idx, class->size);
1163 1251
1164 area = this_cpu_ptr(&zs_map_area); 1252 area = this_cpu_ptr(&zs_map_area);
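
None of the zsmalloc changes above alter its exported interface; the dynamic, possibly merged size classes stay hidden behind the handle API. An uncompiled kernel-style usage sketch (the error values and the 128-byte payload are arbitrary):

#include <linux/zsmalloc.h>
#include <linux/gfp.h>
#include <linux/errno.h>
#include <linux/string.h>

static int zs_demo(void)
{
        struct zs_pool *pool;
        unsigned long handle;
        void *dst;

        pool = zs_create_pool(GFP_KERNEL);      /* size classes built (and merged) here */
        if (!pool)
                return -ENOMEM;

        handle = zs_malloc(pool, 128);          /* served from the matching size class */
        if (!handle) {
                zs_destroy_pool(pool);
                return -ENOMEM;
        }

        dst = zs_map_object(pool, handle, ZS_MM_WO);
        memset(dst, 0xaa, 128);
        zs_unmap_object(pool, handle);

        zs_free(pool, handle);
        zs_destroy_pool(pool);                  /* frees the kcalloc'd size_class array too */
        return 0;
}
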
diff --git a/mm/zswap.c b/mm/zswap.c
index c1543061a192..0cfce9bc51e4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -149,11 +149,10 @@ static int __init zswap_comp_init(void)
149 return 0; 149 return 0;
150} 150}
151 151
152static void zswap_comp_exit(void) 152static void __init zswap_comp_exit(void)
153{ 153{
154 /* free percpu transforms */ 154 /* free percpu transforms */
155 if (zswap_comp_pcpu_tfms) 155 free_percpu(zswap_comp_pcpu_tfms);
156 free_percpu(zswap_comp_pcpu_tfms);
157} 156}
158 157
159/********************************* 158/*********************************
@@ -206,7 +205,7 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
206**********************************/ 205**********************************/
207static struct kmem_cache *zswap_entry_cache; 206static struct kmem_cache *zswap_entry_cache;
208 207
209static int zswap_entry_cache_create(void) 208static int __init zswap_entry_cache_create(void)
210{ 209{
211 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 210 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
212 return zswap_entry_cache == NULL; 211 return zswap_entry_cache == NULL;
@@ -389,7 +388,7 @@ static struct notifier_block zswap_cpu_notifier_block = {
389 .notifier_call = zswap_cpu_notifier 388 .notifier_call = zswap_cpu_notifier
390}; 389};
391 390
392static int zswap_cpu_init(void) 391static int __init zswap_cpu_init(void)
393{ 392{
394 unsigned long cpu; 393 unsigned long cpu;
395 394