author     Linus Torvalds <torvalds@linux-foundation.org>    2014-08-07 00:14:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2014-08-07 00:14:42 -0400
commit     33caee39925b887a99a2400dc5c980097c3573f9 (patch)
tree       8e68ad97e1fee88c4a3f31453041f8d139f2027e /mm
parent     6456a0438b984186a0c9c8ecc9fe3d97b7ac3613 (diff)
parent     f84223087402c45179be5e7060c5736c17a7b271 (diff)
Merge branch 'akpm' (patchbomb from Andrew Morton)
Merge incoming from Andrew Morton:
 - Various misc things.
 - arch/sh updates.
 - Part of ocfs2. Review is slow.
 - Slab updates.
 - Most of -mm.
 - printk updates.
 - lib/ updates.
 - checkpatch updates.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (226 commits)
  checkpatch: update $declaration_macros, add uninitialized_var
  checkpatch: warn on missing spaces in broken up quoted
  checkpatch: fix false positives for --strict "space after cast" test
  checkpatch: fix false positive MISSING_BREAK warnings with --file
  checkpatch: add test for native c90 types in unusual order
  checkpatch: add signed generic types
  checkpatch: add short int to c variable types
  checkpatch: add for_each tests to indentation and brace tests
  checkpatch: fix brace style misuses of else and while
  checkpatch: add --fix option for a couple OPEN_BRACE misuses
  checkpatch: use the correct indentation for which()
  checkpatch: add fix_insert_line and fix_delete_line helpers
  checkpatch: add ability to insert and delete lines to patch/file
  checkpatch: add an index variable for fixed lines
  checkpatch: warn on break after goto or return with same tab indentation
  checkpatch: emit a warning on file add/move/delete
  checkpatch: add test for commit id formatting style in commit log
  checkpatch: emit fewer kmalloc_array/kcalloc conversion warnings
  checkpatch: improve "no space after cast" test
  checkpatch: allow multiple const * types
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  54
-rw-r--r--  mm/Makefile           |   2
-rw-r--r--  mm/cma.c              | 335
-rw-r--r--  mm/filemap.c          |  27
-rw-r--r--  mm/gup.c              |  18
-rw-r--r--  mm/highmem.c          |  86
-rw-r--r--  mm/huge_memory.c      |  38
-rw-r--r--  mm/hugetlb.c          | 129
-rw-r--r--  mm/hwpoison-inject.c  |   3
-rw-r--r--  mm/internal.h         |   2
-rw-r--r--  mm/madvise.c          |   3
-rw-r--r--  mm/memcontrol.c       | 416
-rw-r--r--  mm/memory-failure.c   |  10
-rw-r--r--  mm/memory.c           |  70
-rw-r--r--  mm/memory_hotplug.c   |  45
-rw-r--r--  mm/mlock.c            |   9
-rw-r--r--  mm/mmap.c             |   5
-rw-r--r--  mm/mmu_notifier.c     |  40
-rw-r--r--  mm/oom_kill.c         |  34
-rw-r--r--  mm/page-writeback.c   |   5
-rw-r--r--  mm/page_alloc.c       | 159
-rw-r--r--  mm/readahead.c        |   3
-rw-r--r--  mm/shmem.c            |  39
-rw-r--r--  mm/slab.c             | 514
-rw-r--r--  mm/slab.h             |  24
-rw-r--r--  mm/slab_common.c      | 101
-rw-r--r--  mm/slub.c             | 221
-rw-r--r--  mm/swap.c             |  18
-rw-r--r--  mm/util.c             | 102
-rw-r--r--  mm/vmalloc.c          |  30
-rw-r--r--  mm/vmscan.c           | 274
-rw-r--r--  mm/vmstat.c           |   9
-rw-r--r--  mm/zbud.c             |  98
-rw-r--r--  mm/zpool.c            | 364
-rw-r--r--  mm/zsmalloc.c         |  86
-rw-r--r--  mm/zswap.c            |  75
36 files changed, 2141 insertions, 1307 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3e9977a9d657..886db2158538 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -508,21 +508,34 @@ config CMA_DEBUG
508 processing calls such as dma_alloc_from_contiguous(). 508 processing calls such as dma_alloc_from_contiguous().
509 This option does not affect warning and error messages. 509 This option does not affect warning and error messages.
510 510
511config ZBUD 511config CMA_AREAS
512 tristate 512 int "Maximum count of the CMA areas"
513 default n 513 depends on CMA
514 default 7
514 help 515 help
515 A special purpose allocator for storing compressed pages. 516 CMA allows to create CMA areas for particular purpose, mainly,
516 It is designed to store up to two compressed pages per physical 517 used as device private area. This parameter sets the maximum
517 page. While this design limits storage density, it has simple and 518 number of CMA area in the system.
518 deterministic reclaim properties that make it preferable to a higher 519
519 density approach when reclaim will be used. 520 If unsure, leave the default value "7".
521
522config MEM_SOFT_DIRTY
523 bool "Track memory changes"
524 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
525 select PROC_PAGE_MONITOR
526 help
527 This option enables memory changes tracking by introducing a
528 soft-dirty bit on pte-s. This bit it set when someone writes
529 into a page just as regular dirty bit, but unlike the latter
530 it can be cleared by hands.
531
532 See Documentation/vm/soft-dirty.txt for more details.
520 533
521config ZSWAP 534config ZSWAP
522 bool "Compressed cache for swap pages (EXPERIMENTAL)" 535 bool "Compressed cache for swap pages (EXPERIMENTAL)"
523 depends on FRONTSWAP && CRYPTO=y 536 depends on FRONTSWAP && CRYPTO=y
524 select CRYPTO_LZO 537 select CRYPTO_LZO
525 select ZBUD 538 select ZPOOL
526 default n 539 default n
527 help 540 help
528 A lightweight compressed cache for swap pages. It takes 541 A lightweight compressed cache for swap pages. It takes
@@ -538,17 +551,22 @@ config ZSWAP
538 they have not be fully explored on the large set of potential 551 they have not be fully explored on the large set of potential
539 configurations and workloads that exist. 552 configurations and workloads that exist.
540 553
541config MEM_SOFT_DIRTY 554config ZPOOL
542 bool "Track memory changes" 555 tristate "Common API for compressed memory storage"
543 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS 556 default n
544 select PROC_PAGE_MONITOR
545 help 557 help
546 This option enables memory changes tracking by introducing a 558 Compressed memory storage API. This allows using either zbud or
547 soft-dirty bit on pte-s. This bit it set when someone writes 559 zsmalloc.
548 into a page just as regular dirty bit, but unlike the latter
549 it can be cleared by hands.
550 560
551 See Documentation/vm/soft-dirty.txt for more details. 561config ZBUD
562 tristate "Low density storage for compressed pages"
563 default n
564 help
565 A special purpose allocator for storing compressed pages.
566 It is designed to store up to two compressed pages per physical
567 page. While this design limits storage density, it has simple and
568 deterministic reclaim properties that make it preferable to a higher
569 density approach when reclaim will be used.
552 570
553config ZSMALLOC 571config ZSMALLOC
554 tristate "Memory allocator for compressed pages" 572 tristate "Memory allocator for compressed pages"
diff --git a/mm/Makefile b/mm/Makefile
index 4064f3ec145e..632ae77e6070 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -59,6 +59,8 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
59obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 59obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
60obj-$(CONFIG_CLEANCACHE) += cleancache.o 60obj-$(CONFIG_CLEANCACHE) += cleancache.o
61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o 61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
62obj-$(CONFIG_ZPOOL) += zpool.o
62obj-$(CONFIG_ZBUD) += zbud.o 63obj-$(CONFIG_ZBUD) += zbud.o
63obj-$(CONFIG_ZSMALLOC) += zsmalloc.o 64obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
64obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o 65obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
66obj-$(CONFIG_CMA) += cma.o
diff --git a/mm/cma.c b/mm/cma.c
new file mode 100644
index 000000000000..c17751c0dcaf
--- /dev/null
+++ b/mm/cma.c
@@ -0,0 +1,335 @@
1/*
2 * Contiguous Memory Allocator
3 *
4 * Copyright (c) 2010-2011 by Samsung Electronics.
5 * Copyright IBM Corporation, 2013
6 * Copyright LG Electronics Inc., 2014
7 * Written by:
8 * Marek Szyprowski <m.szyprowski@samsung.com>
9 * Michal Nazarewicz <mina86@mina86.com>
10 * Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
11 * Joonsoo Kim <iamjoonsoo.kim@lge.com>
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License as
15 * published by the Free Software Foundation; either version 2 of the
16 * License or (at your optional) any later version of the license.
17 */
18
19#define pr_fmt(fmt) "cma: " fmt
20
21#ifdef CONFIG_CMA_DEBUG
22#ifndef DEBUG
23# define DEBUG
24#endif
25#endif
26
27#include <linux/memblock.h>
28#include <linux/err.h>
29#include <linux/mm.h>
30#include <linux/mutex.h>
31#include <linux/sizes.h>
32#include <linux/slab.h>
33#include <linux/log2.h>
34#include <linux/cma.h>
35
36struct cma {
37 unsigned long base_pfn;
38 unsigned long count;
39 unsigned long *bitmap;
40 unsigned int order_per_bit; /* Order of pages represented by one bit */
41 struct mutex lock;
42};
43
44static struct cma cma_areas[MAX_CMA_AREAS];
45static unsigned cma_area_count;
46static DEFINE_MUTEX(cma_mutex);
47
48phys_addr_t cma_get_base(struct cma *cma)
49{
50 return PFN_PHYS(cma->base_pfn);
51}
52
53unsigned long cma_get_size(struct cma *cma)
54{
55 return cma->count << PAGE_SHIFT;
56}
57
58static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
59{
60 return (1UL << (align_order >> cma->order_per_bit)) - 1;
61}
62
63static unsigned long cma_bitmap_maxno(struct cma *cma)
64{
65 return cma->count >> cma->order_per_bit;
66}
67
68static unsigned long cma_bitmap_pages_to_bits(struct cma *cma,
69 unsigned long pages)
70{
71 return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
72}
73
74static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count)
75{
76 unsigned long bitmap_no, bitmap_count;
77
78 bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
79 bitmap_count = cma_bitmap_pages_to_bits(cma, count);
80
81 mutex_lock(&cma->lock);
82 bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
83 mutex_unlock(&cma->lock);
84}
85
86static int __init cma_activate_area(struct cma *cma)
87{
88 int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long);
89 unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
90 unsigned i = cma->count >> pageblock_order;
91 struct zone *zone;
92
93 cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
94
95 if (!cma->bitmap)
96 return -ENOMEM;
97
98 WARN_ON_ONCE(!pfn_valid(pfn));
99 zone = page_zone(pfn_to_page(pfn));
100
101 do {
102 unsigned j;
103
104 base_pfn = pfn;
105 for (j = pageblock_nr_pages; j; --j, pfn++) {
106 WARN_ON_ONCE(!pfn_valid(pfn));
107 /*
108 * alloc_contig_range requires the pfn range
109 * specified to be in the same zone. Make this
110 * simple by forcing the entire CMA resv range
111 * to be in the same zone.
112 */
113 if (page_zone(pfn_to_page(pfn)) != zone)
114 goto err;
115 }
116 init_cma_reserved_pageblock(pfn_to_page(base_pfn));
117 } while (--i);
118
119 mutex_init(&cma->lock);
120 return 0;
121
122err:
123 kfree(cma->bitmap);
124 return -EINVAL;
125}
126
127static int __init cma_init_reserved_areas(void)
128{
129 int i;
130
131 for (i = 0; i < cma_area_count; i++) {
132 int ret = cma_activate_area(&cma_areas[i]);
133
134 if (ret)
135 return ret;
136 }
137
138 return 0;
139}
140core_initcall(cma_init_reserved_areas);
141
142/**
143 * cma_declare_contiguous() - reserve custom contiguous area
144 * @base: Base address of the reserved area optional, use 0 for any
145 * @size: Size of the reserved area (in bytes),
146 * @limit: End address of the reserved memory (optional, 0 for any).
147 * @alignment: Alignment for the CMA area, should be power of 2 or zero
148 * @order_per_bit: Order of pages represented by one bit on bitmap.
149 * @fixed: hint about where to place the reserved area
150 * @res_cma: Pointer to store the created cma region.
151 *
152 * This function reserves memory from early allocator. It should be
153 * called by arch specific code once the early allocator (memblock or bootmem)
154 * has been activated and all other subsystems have already allocated/reserved
155 * memory. This function allows to create custom reserved areas.
156 *
157 * If @fixed is true, reserve contiguous area at exactly @base. If false,
158 * reserve in range from @base to @limit.
159 */
160int __init cma_declare_contiguous(phys_addr_t base,
161 phys_addr_t size, phys_addr_t limit,
162 phys_addr_t alignment, unsigned int order_per_bit,
163 bool fixed, struct cma **res_cma)
164{
165 struct cma *cma;
166 int ret = 0;
167
168 pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
169 __func__, (unsigned long)size, (unsigned long)base,
170 (unsigned long)limit, (unsigned long)alignment);
171
172 if (cma_area_count == ARRAY_SIZE(cma_areas)) {
173 pr_err("Not enough slots for CMA reserved regions!\n");
174 return -ENOSPC;
175 }
176
177 if (!size)
178 return -EINVAL;
179
180 if (alignment && !is_power_of_2(alignment))
181 return -EINVAL;
182
183 /*
184 * Sanitise input arguments.
185 * Pages both ends in CMA area could be merged into adjacent unmovable
186 * migratetype page by page allocator's buddy algorithm. In the case,
187 * you couldn't get a contiguous memory, which is not what we want.
188 */
189 alignment = max(alignment,
190 (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
191 base = ALIGN(base, alignment);
192 size = ALIGN(size, alignment);
193 limit &= ~(alignment - 1);
194
195 /* size should be aligned with order_per_bit */
196 if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
197 return -EINVAL;
198
199 /* Reserve memory */
200 if (base && fixed) {
201 if (memblock_is_region_reserved(base, size) ||
202 memblock_reserve(base, size) < 0) {
203 ret = -EBUSY;
204 goto err;
205 }
206 } else {
207 phys_addr_t addr = memblock_alloc_range(size, alignment, base,
208 limit);
209 if (!addr) {
210 ret = -ENOMEM;
211 goto err;
212 } else {
213 base = addr;
214 }
215 }
216
217 /*
218 * Each reserved area must be initialised later, when more kernel
219 * subsystems (like slab allocator) are available.
220 */
221 cma = &cma_areas[cma_area_count];
222 cma->base_pfn = PFN_DOWN(base);
223 cma->count = size >> PAGE_SHIFT;
224 cma->order_per_bit = order_per_bit;
225 *res_cma = cma;
226 cma_area_count++;
227
228 pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M,
229 (unsigned long)base);
230 return 0;
231
232err:
233 pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
234 return ret;
235}
236
237/**
238 * cma_alloc() - allocate pages from contiguous area
239 * @cma: Contiguous memory region for which the allocation is performed.
240 * @count: Requested number of pages.
241 * @align: Requested alignment of pages (in PAGE_SIZE order).
242 *
243 * This function allocates part of contiguous memory on specific
244 * contiguous memory area.
245 */
246struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
247{
248 unsigned long mask, pfn, start = 0;
249 unsigned long bitmap_maxno, bitmap_no, bitmap_count;
250 struct page *page = NULL;
251 int ret;
252
253 if (!cma || !cma->count)
254 return NULL;
255
256 pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
257 count, align);
258
259 if (!count)
260 return NULL;
261
262 mask = cma_bitmap_aligned_mask(cma, align);
263 bitmap_maxno = cma_bitmap_maxno(cma);
264 bitmap_count = cma_bitmap_pages_to_bits(cma, count);
265
266 for (;;) {
267 mutex_lock(&cma->lock);
268 bitmap_no = bitmap_find_next_zero_area(cma->bitmap,
269 bitmap_maxno, start, bitmap_count, mask);
270 if (bitmap_no >= bitmap_maxno) {
271 mutex_unlock(&cma->lock);
272 break;
273 }
274 bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
275 /*
276 * It's safe to drop the lock here. We've marked this region for
277 * our exclusive use. If the migration fails we will take the
278 * lock again and unmark it.
279 */
280 mutex_unlock(&cma->lock);
281
282 pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
283 mutex_lock(&cma_mutex);
284 ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
285 mutex_unlock(&cma_mutex);
286 if (ret == 0) {
287 page = pfn_to_page(pfn);
288 break;
289 }
290
291 cma_clear_bitmap(cma, pfn, count);
292 if (ret != -EBUSY)
293 break;
294
295 pr_debug("%s(): memory range at %p is busy, retrying\n",
296 __func__, pfn_to_page(pfn));
297 /* try again with a bit different memory target */
298 start = bitmap_no + mask + 1;
299 }
300
301 pr_debug("%s(): returned %p\n", __func__, page);
302 return page;
303}
304
305/**
306 * cma_release() - release allocated pages
307 * @cma: Contiguous memory region for which the allocation is performed.
308 * @pages: Allocated pages.
309 * @count: Number of allocated pages.
310 *
311 * This function releases memory allocated by alloc_cma().
312 * It returns false when provided pages do not belong to contiguous area and
313 * true otherwise.
314 */
315bool cma_release(struct cma *cma, struct page *pages, int count)
316{
317 unsigned long pfn;
318
319 if (!cma || !pages)
320 return false;
321
322 pr_debug("%s(page %p)\n", __func__, (void *)pages);
323
324 pfn = page_to_pfn(pages);
325
326 if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
327 return false;
328
329 VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
330
331 free_contig_range(pfn, count);
332 cma_clear_bitmap(cma, pfn, count);
333
334 return true;
335}
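
The new file boils down to a small three-call API: cma_declare_contiguous() reserves a region from the early allocator at boot, cma_alloc() hands out pages from it, and cma_release() returns them. As a rough illustration (not part of this patch), platform code could drive it as sketched below; my_platform_*, my_cma and the 16 MiB size are invented for the example:

/*
 * Illustrative only: a minimal sketch of how arch/platform code might use
 * the cma.c interface added above.  Names and sizes are made up.
 */
#include <linux/init.h>
#include <linux/cma.h>
#include <linux/sizes.h>
#include <linux/printk.h>

static struct cma *my_cma;

/* Called early, while memblock is still the active allocator. */
void __init my_platform_reserve(void)
{
	/* 16 MiB anywhere (base 0, limit 0), 1 page per bitmap bit, not fixed. */
	if (cma_declare_contiguous(0, SZ_16M, 0, 0, 0, false, &my_cma))
		pr_warn("my_platform: CMA reservation failed\n");
}

/* Usable after cma_init_reserved_areas() has run via core_initcall(). */
static struct page *my_platform_get_buffer(void)
{
	/* 64 pages, aligned to an order-6 (64-page) boundary. */
	return cma_alloc(my_cma, 64, 6);
}

static void my_platform_put_buffer(struct page *pages)
{
	cma_release(my_cma, pages, 64);
}
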
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d44fd88c78..af19a6b079f5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -808,6 +808,17 @@ int __lock_page_killable(struct page *page)
808} 808}
809EXPORT_SYMBOL_GPL(__lock_page_killable); 809EXPORT_SYMBOL_GPL(__lock_page_killable);
810 810
811/*
812 * Return values:
813 * 1 - page is locked; mmap_sem is still held.
814 * 0 - page is not locked.
815 * mmap_sem has been released (up_read()), unless flags had both
816 * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
817 * which case mmap_sem is still held.
818 *
819 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
820 * with the page locked and the mmap_sem unperturbed.
821 */
811int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 822int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
812 unsigned int flags) 823 unsigned int flags)
813{ 824{
@@ -1091,9 +1102,9 @@ no_page:
1091 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) 1102 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
1092 fgp_flags |= FGP_LOCK; 1103 fgp_flags |= FGP_LOCK;
1093 1104
1094 /* Init accessed so avoit atomic mark_page_accessed later */ 1105 /* Init accessed so avoid atomic mark_page_accessed later */
1095 if (fgp_flags & FGP_ACCESSED) 1106 if (fgp_flags & FGP_ACCESSED)
1096 init_page_accessed(page); 1107 __SetPageReferenced(page);
1097 1108
1098 err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); 1109 err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
1099 if (unlikely(err)) { 1110 if (unlikely(err)) {
@@ -1827,6 +1838,18 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
1827 * The goto's are kind of ugly, but this streamlines the normal case of having 1838 * The goto's are kind of ugly, but this streamlines the normal case of having
1828 * it in the page cache, and handles the special cases reasonably without 1839 * it in the page cache, and handles the special cases reasonably without
1829 * having a lot of duplicated code. 1840 * having a lot of duplicated code.
1841 *
1842 * vma->vm_mm->mmap_sem must be held on entry.
1843 *
1844 * If our return value has VM_FAULT_RETRY set, it's because
1845 * lock_page_or_retry() returned 0.
1846 * The mmap_sem has usually been released in this case.
1847 * See __lock_page_or_retry() for the exception.
1848 *
1849 * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
1850 * has not been released.
1851 *
1852 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
1830 */ 1853 */
1831int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1854int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1832{ 1855{
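
The comments added above pin down a subtle calling convention: a zero return from __lock_page_or_retry(), and hence a VM_FAULT_RETRY result from filemap_fault()/handle_mm_fault(), usually means mmap_sem was already dropped on the caller's behalf. A simplified, hypothetical fault path written against that contract might look like this (everything except the documented lock behaviour is invented for the example):

/*
 * Illustrative only: the mmap_sem contract described above, seen from a
 * stripped-down architecture page-fault handler.
 */
static void my_arch_do_page_fault(struct mm_struct *mm, unsigned long address,
				  struct vm_area_struct *vma)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY;
	int fault;

retry:
	down_read(&mm->mmap_sem);
	fault = handle_mm_fault(mm, vma, address, flags);

	if (fault & VM_FAULT_RETRY) {
		/*
		 * mmap_sem was released for us (FAULT_FLAG_RETRY_NOWAIT was
		 * not set), so no up_read() here; retry once without
		 * ALLOW_RETRY.
		 */
		flags &= ~FAULT_FLAG_ALLOW_RETRY;
		goto retry;
	}

	/* No RETRY bit: mmap_sem is still held and must be dropped here. */
	up_read(&mm->mmap_sem);
}
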
diff --git a/mm/gup.c b/mm/gup.c
index cc5a9e7adea7..91d044b1600d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -258,6 +258,11 @@ unmap:
258 return ret; 258 return ret;
259} 259}
260 260
261/*
262 * mmap_sem must be held on entry. If @nonblocking != NULL and
263 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
264 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
265 */
261static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, 266static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
262 unsigned long address, unsigned int *flags, int *nonblocking) 267 unsigned long address, unsigned int *flags, int *nonblocking)
263{ 268{
@@ -373,7 +378,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
373 * with a put_page() call when it is finished with. vmas will only 378 * with a put_page() call when it is finished with. vmas will only
374 * remain valid while mmap_sem is held. 379 * remain valid while mmap_sem is held.
375 * 380 *
376 * Must be called with mmap_sem held for read or write. 381 * Must be called with mmap_sem held. It may be released. See below.
377 * 382 *
378 * __get_user_pages walks a process's page tables and takes a reference to 383 * __get_user_pages walks a process's page tables and takes a reference to
379 * each struct page that each user address corresponds to at a given 384 * each struct page that each user address corresponds to at a given
@@ -396,7 +401,14 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
396 * 401 *
397 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO 402 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
398 * or mmap_sem contention, and if waiting is needed to pin all pages, 403 * or mmap_sem contention, and if waiting is needed to pin all pages,
399 * *@nonblocking will be set to 0. 404 * *@nonblocking will be set to 0. Further, if @gup_flags does not
405 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
406 * this case.
407 *
408 * A caller using such a combination of @nonblocking and @gup_flags
409 * must therefore hold the mmap_sem for reading only, and recognize
410 * when it's been released. Otherwise, it must be held for either
411 * reading or writing and will not be released.
400 * 412 *
401 * In most cases, get_user_pages or get_user_pages_fast should be used 413 * In most cases, get_user_pages or get_user_pages_fast should be used
402 * instead of __get_user_pages. __get_user_pages should be used only if 414 * instead of __get_user_pages. __get_user_pages should be used only if
@@ -528,7 +540,7 @@ EXPORT_SYMBOL(__get_user_pages);
528 * such architectures, gup() will not be enough to make a subsequent access 540 * such architectures, gup() will not be enough to make a subsequent access
529 * succeed. 541 * succeed.
530 * 542 *
531 * This should be called with the mm_sem held for read. 543 * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
532 */ 544 */
533int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 545int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
534 unsigned long address, unsigned int fault_flags) 546 unsigned long address, unsigned int fault_flags)
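
The reworked comments make the @nonblocking rule explicit: if the caller passes a non-NULL @nonblocking without FOLL_NOWAIT, __get_user_pages() may drop mmap_sem itself and report that by clearing the flag. A hedged sketch of the required caller pattern (my_pin_pages() and its arguments are invented):

/*
 * Illustrative only: caller pattern for @nonblocking without FOLL_NOWAIT.
 */
#include <linux/mm.h>
#include <linux/sched.h>

static long my_pin_pages(unsigned long start, unsigned long nr_pages,
			 struct page **pages)
{
	struct mm_struct *mm = current->mm;
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = __get_user_pages(current, mm, start, nr_pages,
			       FOLL_TOUCH, pages, NULL, &locked);
	/*
	 * __get_user_pages() may have done the up_read() for us and cleared
	 * "locked"; only unlock if we still own the semaphore.
	 */
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;
}
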
diff --git a/mm/highmem.c b/mm/highmem.c
index b32b70cdaed6..123bcd3ed4f2 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -44,6 +44,66 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx);
44 */ 44 */
45#ifdef CONFIG_HIGHMEM 45#ifdef CONFIG_HIGHMEM
46 46
47/*
48 * Architecture with aliasing data cache may define the following family of
49 * helper functions in its asm/highmem.h to control cache color of virtual
50 * addresses where physical memory pages are mapped by kmap.
51 */
52#ifndef get_pkmap_color
53
54/*
55 * Determine color of virtual address where the page should be mapped.
56 */
57static inline unsigned int get_pkmap_color(struct page *page)
58{
59 return 0;
60}
61#define get_pkmap_color get_pkmap_color
62
63/*
64 * Get next index for mapping inside PKMAP region for page with given color.
65 */
66static inline unsigned int get_next_pkmap_nr(unsigned int color)
67{
68 static unsigned int last_pkmap_nr;
69
70 last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
71 return last_pkmap_nr;
72}
73
74/*
75 * Determine if page index inside PKMAP region (pkmap_nr) of given color
76 * has wrapped around PKMAP region end. When this happens an attempt to
77 * flush all unused PKMAP slots is made.
78 */
79static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color)
80{
81 return pkmap_nr == 0;
82}
83
84/*
85 * Get the number of PKMAP entries of the given color. If no free slot is
86 * found after checking that many entries, kmap will sleep waiting for
87 * someone to call kunmap and free PKMAP slot.
88 */
89static inline int get_pkmap_entries_count(unsigned int color)
90{
91 return LAST_PKMAP;
92}
93
94/*
95 * Get head of a wait queue for PKMAP entries of the given color.
96 * Wait queues for different mapping colors should be independent to avoid
97 * unnecessary wakeups caused by freeing of slots of other colors.
98 */
99static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
100{
101 static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
102
103 return &pkmap_map_wait;
104}
105#endif
106
47unsigned long totalhigh_pages __read_mostly; 107unsigned long totalhigh_pages __read_mostly;
48EXPORT_SYMBOL(totalhigh_pages); 108EXPORT_SYMBOL(totalhigh_pages);
49 109
@@ -68,13 +128,10 @@ unsigned int nr_free_highpages (void)
68} 128}
69 129
70static int pkmap_count[LAST_PKMAP]; 130static int pkmap_count[LAST_PKMAP];
71static unsigned int last_pkmap_nr;
72static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); 131static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
73 132
74pte_t * pkmap_page_table; 133pte_t * pkmap_page_table;
75 134
76static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
77
78/* 135/*
79 * Most architectures have no use for kmap_high_get(), so let's abstract 136 * Most architectures have no use for kmap_high_get(), so let's abstract
80 * the disabling of IRQ out of the locking in that case to save on a 137 * the disabling of IRQ out of the locking in that case to save on a
@@ -161,15 +218,17 @@ static inline unsigned long map_new_virtual(struct page *page)
161{ 218{
162 unsigned long vaddr; 219 unsigned long vaddr;
163 int count; 220 int count;
221 unsigned int last_pkmap_nr;
222 unsigned int color = get_pkmap_color(page);
164 223
165start: 224start:
166 count = LAST_PKMAP; 225 count = get_pkmap_entries_count(color);
167 /* Find an empty entry */ 226 /* Find an empty entry */
168 for (;;) { 227 for (;;) {
169 last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; 228 last_pkmap_nr = get_next_pkmap_nr(color);
170 if (!last_pkmap_nr) { 229 if (no_more_pkmaps(last_pkmap_nr, color)) {
171 flush_all_zero_pkmaps(); 230 flush_all_zero_pkmaps();
172 count = LAST_PKMAP; 231 count = get_pkmap_entries_count(color);
173 } 232 }
174 if (!pkmap_count[last_pkmap_nr]) 233 if (!pkmap_count[last_pkmap_nr])
175 break; /* Found a usable entry */ 234 break; /* Found a usable entry */
@@ -181,12 +240,14 @@ start:
181 */ 240 */
182 { 241 {
183 DECLARE_WAITQUEUE(wait, current); 242 DECLARE_WAITQUEUE(wait, current);
243 wait_queue_head_t *pkmap_map_wait =
244 get_pkmap_wait_queue_head(color);
184 245
185 __set_current_state(TASK_UNINTERRUPTIBLE); 246 __set_current_state(TASK_UNINTERRUPTIBLE);
186 add_wait_queue(&pkmap_map_wait, &wait); 247 add_wait_queue(pkmap_map_wait, &wait);
187 unlock_kmap(); 248 unlock_kmap();
188 schedule(); 249 schedule();
189 remove_wait_queue(&pkmap_map_wait, &wait); 250 remove_wait_queue(pkmap_map_wait, &wait);
190 lock_kmap(); 251 lock_kmap();
191 252
192 /* Somebody else might have mapped it while we slept */ 253 /* Somebody else might have mapped it while we slept */
@@ -274,6 +335,8 @@ void kunmap_high(struct page *page)
274 unsigned long nr; 335 unsigned long nr;
275 unsigned long flags; 336 unsigned long flags;
276 int need_wakeup; 337 int need_wakeup;
338 unsigned int color = get_pkmap_color(page);
339 wait_queue_head_t *pkmap_map_wait;
277 340
278 lock_kmap_any(flags); 341 lock_kmap_any(flags);
279 vaddr = (unsigned long)page_address(page); 342 vaddr = (unsigned long)page_address(page);
@@ -299,13 +362,14 @@ void kunmap_high(struct page *page)
299 * no need for the wait-queue-head's lock. Simply 362 * no need for the wait-queue-head's lock. Simply
300 * test if the queue is empty. 363 * test if the queue is empty.
301 */ 364 */
302 need_wakeup = waitqueue_active(&pkmap_map_wait); 365 pkmap_map_wait = get_pkmap_wait_queue_head(color);
366 need_wakeup = waitqueue_active(pkmap_map_wait);
303 } 367 }
304 unlock_kmap_any(flags); 368 unlock_kmap_any(flags);
305 369
306 /* do wake-up, if needed, race-free outside of the spin lock */ 370 /* do wake-up, if needed, race-free outside of the spin lock */
307 if (need_wakeup) 371 if (need_wakeup)
308 wake_up(&pkmap_map_wait); 372 wake_up(pkmap_map_wait);
309} 373}
310 374
311EXPORT_SYMBOL(kunmap_high); 375EXPORT_SYMBOL(kunmap_high);
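
The new hooks let an architecture with an aliasing data cache colour its PKMAP slots. Roughly, such a port would define the helper family in its asm/highmem.h; the fragment below is a speculative sketch, not taken from any real port, and the 4-way colouring is invented:

/*
 * Illustrative only: a possible asm/highmem.h override.  Because the generic
 * fallbacks above sit under a single "#ifndef get_pkmap_color", a port that
 * defines this must also supply colour-aware get_next_pkmap_nr(),
 * no_more_pkmaps(), get_pkmap_entries_count() and
 * get_pkmap_wait_queue_head().
 */
#define PKMAP_COLORS		4
#define LAST_PKMAP_PER_COLOR	(LAST_PKMAP / PKMAP_COLORS)

static inline unsigned int get_pkmap_color(struct page *page)
{
	/* Colour follows the physical page so kmap aliases line up. */
	return page_to_pfn(page) % PKMAP_COLORS;
}
#define get_pkmap_color get_pkmap_color
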
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 33514d88fef9..3630d577e987 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
827 count_vm_event(THP_FAULT_FALLBACK); 827 count_vm_event(THP_FAULT_FALLBACK);
828 return VM_FAULT_FALLBACK; 828 return VM_FAULT_FALLBACK;
829 } 829 }
830 if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { 830 if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_TRANSHUGE))) {
831 put_page(page); 831 put_page(page);
832 count_vm_event(THP_FAULT_FALLBACK); 832 count_vm_event(THP_FAULT_FALLBACK);
833 return VM_FAULT_FALLBACK; 833 return VM_FAULT_FALLBACK;
@@ -1132,7 +1132,7 @@ alloc:
1132 goto out; 1132 goto out;
1133 } 1133 }
1134 1134
1135 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { 1135 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) {
1136 put_page(new_page); 1136 put_page(new_page);
1137 if (page) { 1137 if (page) {
1138 split_huge_page(page); 1138 split_huge_page(page);
@@ -1681,7 +1681,7 @@ static void __split_huge_page_refcount(struct page *page,
1681 &page_tail->_count); 1681 &page_tail->_count);
1682 1682
1683 /* after clearing PageTail the gup refcount can be released */ 1683 /* after clearing PageTail the gup refcount can be released */
1684 smp_mb(); 1684 smp_mb__after_atomic();
1685 1685
1686 /* 1686 /*
1687 * retain hwpoison flag of the poisoned tail page: 1687 * retain hwpoison flag of the poisoned tail page:
@@ -1775,6 +1775,8 @@ static int __split_huge_page_map(struct page *page,
1775 if (pmd) { 1775 if (pmd) {
1776 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1776 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1777 pmd_populate(mm, &_pmd, pgtable); 1777 pmd_populate(mm, &_pmd, pgtable);
1778 if (pmd_write(*pmd))
1779 BUG_ON(page_mapcount(page) != 1);
1778 1780
1779 haddr = address; 1781 haddr = address;
1780 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1782 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1784,8 +1786,6 @@ static int __split_huge_page_map(struct page *page,
1784 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1786 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1785 if (!pmd_write(*pmd)) 1787 if (!pmd_write(*pmd))
1786 entry = pte_wrprotect(entry); 1788 entry = pte_wrprotect(entry);
1787 else
1788 BUG_ON(page_mapcount(page) != 1);
1789 if (!pmd_young(*pmd)) 1789 if (!pmd_young(*pmd))
1790 entry = pte_mkold(entry); 1790 entry = pte_mkold(entry);
1791 if (pmd_numa(*pmd)) 1791 if (pmd_numa(*pmd))
@@ -2233,6 +2233,30 @@ static void khugepaged_alloc_sleep(void)
2233 2233
2234static int khugepaged_node_load[MAX_NUMNODES]; 2234static int khugepaged_node_load[MAX_NUMNODES];
2235 2235
2236static bool khugepaged_scan_abort(int nid)
2237{
2238 int i;
2239
2240 /*
2241 * If zone_reclaim_mode is disabled, then no extra effort is made to
2242 * allocate memory locally.
2243 */
2244 if (!zone_reclaim_mode)
2245 return false;
2246
2247 /* If there is a count for this node already, it must be acceptable */
2248 if (khugepaged_node_load[nid])
2249 return false;
2250
2251 for (i = 0; i < MAX_NUMNODES; i++) {
2252 if (!khugepaged_node_load[i])
2253 continue;
2254 if (node_distance(nid, i) > RECLAIM_DISTANCE)
2255 return true;
2256 }
2257 return false;
2258}
2259
2236#ifdef CONFIG_NUMA 2260#ifdef CONFIG_NUMA
2237static int khugepaged_find_target_node(void) 2261static int khugepaged_find_target_node(void)
2238{ 2262{
@@ -2399,7 +2423,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2399 if (!new_page) 2423 if (!new_page)
2400 return; 2424 return;
2401 2425
2402 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) 2426 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE)))
2403 return; 2427 return;
2404 2428
2405 /* 2429 /*
@@ -2545,6 +2569,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2545 * hit record. 2569 * hit record.
2546 */ 2570 */
2547 node = page_to_nid(page); 2571 node = page_to_nid(page);
2572 if (khugepaged_scan_abort(node))
2573 goto out_unmap;
2548 khugepaged_node_load[node]++; 2574 khugepaged_node_load[node]++;
2549 VM_BUG_ON_PAGE(PageCompound(page), page); 2575 VM_BUG_ON_PAGE(PageCompound(page), page);
2550 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2576 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7a0a73d2fcff..eeceeeb09019 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,7 +35,6 @@
35#include <linux/node.h> 35#include <linux/node.h>
36#include "internal.h" 36#include "internal.h"
37 37
38const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
39unsigned long hugepages_treat_as_movable; 38unsigned long hugepages_treat_as_movable;
40 39
41int hugetlb_max_hstate __read_mostly; 40int hugetlb_max_hstate __read_mostly;
@@ -1089,6 +1088,9 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1089 unsigned long pfn; 1088 unsigned long pfn;
1090 struct hstate *h; 1089 struct hstate *h;
1091 1090
1091 if (!hugepages_supported())
1092 return;
1093
1092 /* Set scan step to minimum hugepage size */ 1094 /* Set scan step to minimum hugepage size */
1093 for_each_hstate(h) 1095 for_each_hstate(h)
1094 if (order > huge_page_order(h)) 1096 if (order > huge_page_order(h))
@@ -1734,21 +1736,13 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1734 return sprintf(buf, "%lu\n", nr_huge_pages); 1736 return sprintf(buf, "%lu\n", nr_huge_pages);
1735} 1737}
1736 1738
1737static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1739static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
1738 struct kobject *kobj, struct kobj_attribute *attr, 1740 struct hstate *h, int nid,
1739 const char *buf, size_t len) 1741 unsigned long count, size_t len)
1740{ 1742{
1741 int err; 1743 int err;
1742 int nid;
1743 unsigned long count;
1744 struct hstate *h;
1745 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 1744 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1746 1745
1747 err = kstrtoul(buf, 10, &count);
1748 if (err)
1749 goto out;
1750
1751 h = kobj_to_hstate(kobj, &nid);
1752 if (hstate_is_gigantic(h) && !gigantic_page_supported()) { 1746 if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
1753 err = -EINVAL; 1747 err = -EINVAL;
1754 goto out; 1748 goto out;
@@ -1784,6 +1778,23 @@ out:
1784 return err; 1778 return err;
1785} 1779}
1786 1780
1781static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1782 struct kobject *kobj, const char *buf,
1783 size_t len)
1784{
1785 struct hstate *h;
1786 unsigned long count;
1787 int nid;
1788 int err;
1789
1790 err = kstrtoul(buf, 10, &count);
1791 if (err)
1792 return err;
1793
1794 h = kobj_to_hstate(kobj, &nid);
1795 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
1796}
1797
1787static ssize_t nr_hugepages_show(struct kobject *kobj, 1798static ssize_t nr_hugepages_show(struct kobject *kobj,
1788 struct kobj_attribute *attr, char *buf) 1799 struct kobj_attribute *attr, char *buf)
1789{ 1800{
@@ -1793,7 +1804,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj,
1793static ssize_t nr_hugepages_store(struct kobject *kobj, 1804static ssize_t nr_hugepages_store(struct kobject *kobj,
1794 struct kobj_attribute *attr, const char *buf, size_t len) 1805 struct kobj_attribute *attr, const char *buf, size_t len)
1795{ 1806{
1796 return nr_hugepages_store_common(false, kobj, attr, buf, len); 1807 return nr_hugepages_store_common(false, kobj, buf, len);
1797} 1808}
1798HSTATE_ATTR(nr_hugepages); 1809HSTATE_ATTR(nr_hugepages);
1799 1810
@@ -1812,7 +1823,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
1812static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 1823static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
1813 struct kobj_attribute *attr, const char *buf, size_t len) 1824 struct kobj_attribute *attr, const char *buf, size_t len)
1814{ 1825{
1815 return nr_hugepages_store_common(true, kobj, attr, buf, len); 1826 return nr_hugepages_store_common(true, kobj, buf, len);
1816} 1827}
1817HSTATE_ATTR(nr_hugepages_mempolicy); 1828HSTATE_ATTR(nr_hugepages_mempolicy);
1818#endif 1829#endif
@@ -2248,36 +2259,21 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2248 void __user *buffer, size_t *length, loff_t *ppos) 2259 void __user *buffer, size_t *length, loff_t *ppos)
2249{ 2260{
2250 struct hstate *h = &default_hstate; 2261 struct hstate *h = &default_hstate;
2251 unsigned long tmp; 2262 unsigned long tmp = h->max_huge_pages;
2252 int ret; 2263 int ret;
2253 2264
2254 if (!hugepages_supported()) 2265 if (!hugepages_supported())
2255 return -ENOTSUPP; 2266 return -ENOTSUPP;
2256 2267
2257 tmp = h->max_huge_pages;
2258
2259 if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
2260 return -EINVAL;
2261
2262 table->data = &tmp; 2268 table->data = &tmp;
2263 table->maxlen = sizeof(unsigned long); 2269 table->maxlen = sizeof(unsigned long);
2264 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 2270 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
2265 if (ret) 2271 if (ret)
2266 goto out; 2272 goto out;
2267 2273
2268 if (write) { 2274 if (write)
2269 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 2275 ret = __nr_hugepages_store_common(obey_mempolicy, h,
2270 GFP_KERNEL | __GFP_NORETRY); 2276 NUMA_NO_NODE, tmp, *length);
2271 if (!(obey_mempolicy &&
2272 init_nodemask_of_mempolicy(nodes_allowed))) {
2273 NODEMASK_FREE(nodes_allowed);
2274 nodes_allowed = &node_states[N_MEMORY];
2275 }
2276 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
2277
2278 if (nodes_allowed != &node_states[N_MEMORY])
2279 NODEMASK_FREE(nodes_allowed);
2280 }
2281out: 2277out:
2282 return ret; 2278 return ret;
2283} 2279}
@@ -2754,8 +2750,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2754 * from other VMAs and let the children be SIGKILLed if they are faulting the 2750 * from other VMAs and let the children be SIGKILLed if they are faulting the
2755 * same region. 2751 * same region.
2756 */ 2752 */
2757static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 2753static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2758 struct page *page, unsigned long address) 2754 struct page *page, unsigned long address)
2759{ 2755{
2760 struct hstate *h = hstate_vma(vma); 2756 struct hstate *h = hstate_vma(vma);
2761 struct vm_area_struct *iter_vma; 2757 struct vm_area_struct *iter_vma;
@@ -2794,8 +2790,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2794 address + huge_page_size(h), page); 2790 address + huge_page_size(h), page);
2795 } 2791 }
2796 mutex_unlock(&mapping->i_mmap_mutex); 2792 mutex_unlock(&mapping->i_mmap_mutex);
2797
2798 return 1;
2799} 2793}
2800 2794
2801/* 2795/*
@@ -2810,7 +2804,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2810{ 2804{
2811 struct hstate *h = hstate_vma(vma); 2805 struct hstate *h = hstate_vma(vma);
2812 struct page *old_page, *new_page; 2806 struct page *old_page, *new_page;
2813 int outside_reserve = 0; 2807 int ret = 0, outside_reserve = 0;
2814 unsigned long mmun_start; /* For mmu_notifiers */ 2808 unsigned long mmun_start; /* For mmu_notifiers */
2815 unsigned long mmun_end; /* For mmu_notifiers */ 2809 unsigned long mmun_end; /* For mmu_notifiers */
2816 2810
@@ -2840,14 +2834,14 @@ retry_avoidcopy:
2840 2834
2841 page_cache_get(old_page); 2835 page_cache_get(old_page);
2842 2836
2843 /* Drop page table lock as buddy allocator may be called */ 2837 /*
2838 * Drop page table lock as buddy allocator may be called. It will
2839 * be acquired again before returning to the caller, as expected.
2840 */
2844 spin_unlock(ptl); 2841 spin_unlock(ptl);
2845 new_page = alloc_huge_page(vma, address, outside_reserve); 2842 new_page = alloc_huge_page(vma, address, outside_reserve);
2846 2843
2847 if (IS_ERR(new_page)) { 2844 if (IS_ERR(new_page)) {
2848 long err = PTR_ERR(new_page);
2849 page_cache_release(old_page);
2850
2851 /* 2845 /*
2852 * If a process owning a MAP_PRIVATE mapping fails to COW, 2846 * If a process owning a MAP_PRIVATE mapping fails to COW,
2853 * it is due to references held by a child and an insufficient 2847 * it is due to references held by a child and an insufficient
@@ -2856,29 +2850,25 @@ retry_avoidcopy:
2856 * may get SIGKILLed if it later faults. 2850 * may get SIGKILLed if it later faults.
2857 */ 2851 */
2858 if (outside_reserve) { 2852 if (outside_reserve) {
2853 page_cache_release(old_page);
2859 BUG_ON(huge_pte_none(pte)); 2854 BUG_ON(huge_pte_none(pte));
2860 if (unmap_ref_private(mm, vma, old_page, address)) { 2855 unmap_ref_private(mm, vma, old_page, address);
2861 BUG_ON(huge_pte_none(pte)); 2856 BUG_ON(huge_pte_none(pte));
2862 spin_lock(ptl); 2857 spin_lock(ptl);
2863 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2858 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2864 if (likely(ptep && 2859 if (likely(ptep &&
2865 pte_same(huge_ptep_get(ptep), pte))) 2860 pte_same(huge_ptep_get(ptep), pte)))
2866 goto retry_avoidcopy; 2861 goto retry_avoidcopy;
2867 /* 2862 /*
2868 * race occurs while re-acquiring page table 2863 * race occurs while re-acquiring page table
2869 * lock, and our job is done. 2864 * lock, and our job is done.
2870 */ 2865 */
2871 return 0; 2866 return 0;
2872 }
2873 WARN_ON_ONCE(1);
2874 } 2867 }
2875 2868
2876 /* Caller expects lock to be held */ 2869 ret = (PTR_ERR(new_page) == -ENOMEM) ?
2877 spin_lock(ptl); 2870 VM_FAULT_OOM : VM_FAULT_SIGBUS;
2878 if (err == -ENOMEM) 2871 goto out_release_old;
2879 return VM_FAULT_OOM;
2880 else
2881 return VM_FAULT_SIGBUS;
2882 } 2872 }
2883 2873
2884 /* 2874 /*
@@ -2886,11 +2876,8 @@ retry_avoidcopy:
2886 * anon_vma prepared. 2876 * anon_vma prepared.
2887 */ 2877 */
2888 if (unlikely(anon_vma_prepare(vma))) { 2878 if (unlikely(anon_vma_prepare(vma))) {
2889 page_cache_release(new_page); 2879 ret = VM_FAULT_OOM;
2890 page_cache_release(old_page); 2880 goto out_release_all;
2891 /* Caller expects lock to be held */
2892 spin_lock(ptl);
2893 return VM_FAULT_OOM;
2894 } 2881 }
2895 2882
2896 copy_user_huge_page(new_page, old_page, address, vma, 2883 copy_user_huge_page(new_page, old_page, address, vma,
@@ -2900,6 +2887,7 @@ retry_avoidcopy:
2900 mmun_start = address & huge_page_mask(h); 2887 mmun_start = address & huge_page_mask(h);
2901 mmun_end = mmun_start + huge_page_size(h); 2888 mmun_end = mmun_start + huge_page_size(h);
2902 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2889 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2890
2903 /* 2891 /*
2904 * Retake the page table lock to check for racing updates 2892 * Retake the page table lock to check for racing updates
2905 * before the page tables are altered 2893 * before the page tables are altered
@@ -2920,12 +2908,13 @@ retry_avoidcopy:
2920 } 2908 }
2921 spin_unlock(ptl); 2909 spin_unlock(ptl);
2922 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2910 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2911out_release_all:
2923 page_cache_release(new_page); 2912 page_cache_release(new_page);
2913out_release_old:
2924 page_cache_release(old_page); 2914 page_cache_release(old_page);
2925 2915
2926 /* Caller expects lock to be held */ 2916 spin_lock(ptl); /* Caller expects lock to be held */
2927 spin_lock(ptl); 2917 return ret;
2928 return 0;
2929} 2918}
2930 2919
2931/* Return the pagecache page at a given address within a VMA */ 2920/* Return the pagecache page at a given address within a VMA */
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 95487c71cad5..329caf56df22 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -72,8 +72,7 @@ DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
72 72
73static void pfn_inject_exit(void) 73static void pfn_inject_exit(void)
74{ 74{
75 if (hwpoison_dir) 75 debugfs_remove_recursive(hwpoison_dir);
76 debugfs_remove_recursive(hwpoison_dir);
77} 76}
78 77
79static int pfn_inject_init(void) 78static int pfn_inject_init(void)
diff --git a/mm/internal.h b/mm/internal.h
index 7f22a11fcc66..a1b651b11c5f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -247,7 +247,7 @@ static inline void mlock_migrate_page(struct page *new, struct page *old) { }
247static inline struct page *mem_map_offset(struct page *base, int offset) 247static inline struct page *mem_map_offset(struct page *base, int offset)
248{ 248{
249 if (unlikely(offset >= MAX_ORDER_NR_PAGES)) 249 if (unlikely(offset >= MAX_ORDER_NR_PAGES))
250 return pfn_to_page(page_to_pfn(base) + offset); 250 return nth_page(base, offset);
251 return base + offset; 251 return base + offset;
252} 252}
253 253
diff --git a/mm/madvise.c b/mm/madvise.c
index a402f8fdc68e..0938b30da4ab 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -292,9 +292,6 @@ static long madvise_dontneed(struct vm_area_struct *vma,
292/* 292/*
293 * Application wants to free up the pages and associated backing store. 293 * Application wants to free up the pages and associated backing store.
294 * This is effectively punching a hole into the middle of a file. 294 * This is effectively punching a hole into the middle of a file.
295 *
296 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
297 * Other filesystems return -ENOSYS.
298 */ 295 */
299static long madvise_remove(struct vm_area_struct *vma, 296static long madvise_remove(struct vm_area_struct *vma,
300 struct vm_area_struct **prev, 297 struct vm_area_struct **prev,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f009a14918d2..90dc501eaf3f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2551,55 +2551,72 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2551 return NOTIFY_OK; 2551 return NOTIFY_OK;
2552} 2552}
2553 2553
2554 2554/**
2555/* See mem_cgroup_try_charge() for details */ 2555 * mem_cgroup_try_charge - try charging a memcg
2556enum { 2556 * @memcg: memcg to charge
2557 CHARGE_OK, /* success */ 2557 * @nr_pages: number of pages to charge
2558 CHARGE_RETRY, /* need to retry but retry is not bad */ 2558 *
2559 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2559 * Returns 0 if @memcg was charged successfully, -EINTR if the charge
2560 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ 2560 * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
2561}; 2561 */
2562 2562static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2563static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2563 gfp_t gfp_mask,
2564 unsigned int nr_pages, unsigned int min_pages, 2564 unsigned int nr_pages)
2565 bool invoke_oom)
2566{ 2565{
2567 unsigned long csize = nr_pages * PAGE_SIZE; 2566 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2567 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2568 struct mem_cgroup *mem_over_limit; 2568 struct mem_cgroup *mem_over_limit;
2569 struct res_counter *fail_res; 2569 struct res_counter *fail_res;
2570 unsigned long nr_reclaimed;
2570 unsigned long flags = 0; 2571 unsigned long flags = 0;
2571 int ret; 2572 unsigned long long size;
2573 int ret = 0;
2572 2574
2573 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2575retry:
2576 if (consume_stock(memcg, nr_pages))
2577 goto done;
2574 2578
2575 if (likely(!ret)) { 2579 size = batch * PAGE_SIZE;
2580 if (!res_counter_charge(&memcg->res, size, &fail_res)) {
2576 if (!do_swap_account) 2581 if (!do_swap_account)
2577 return CHARGE_OK; 2582 goto done_restock;
2578 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2583 if (!res_counter_charge(&memcg->memsw, size, &fail_res))
2579 if (likely(!ret)) 2584 goto done_restock;
2580 return CHARGE_OK; 2585 res_counter_uncharge(&memcg->res, size);
2581
2582 res_counter_uncharge(&memcg->res, csize);
2583 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2586 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2584 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2587 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2585 } else 2588 } else
2586 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2589 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2590
2591 if (batch > nr_pages) {
2592 batch = nr_pages;
2593 goto retry;
2594 }
2595
2587 /* 2596 /*
2588 * Never reclaim on behalf of optional batching, retry with a 2597 * Unlike in global OOM situations, memcg is not in a physical
2589 * single page instead. 2598 * memory shortage. Allow dying and OOM-killed tasks to
2599 * bypass the last charges so that they can exit quickly and
2600 * free their memory.
2590 */ 2601 */
2591 if (nr_pages > min_pages) 2602 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2592 return CHARGE_RETRY; 2603 fatal_signal_pending(current) ||
2604 current->flags & PF_EXITING))
2605 goto bypass;
2606
2607 if (unlikely(task_in_memcg_oom(current)))
2608 goto nomem;
2593 2609
2594 if (!(gfp_mask & __GFP_WAIT)) 2610 if (!(gfp_mask & __GFP_WAIT))
2595 return CHARGE_WOULDBLOCK; 2611 goto nomem;
2596 2612
2597 if (gfp_mask & __GFP_NORETRY) 2613 nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2598 return CHARGE_NOMEM;
2599 2614
2600 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2601 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2615 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2602 return CHARGE_RETRY; 2616 goto retry;
2617
2618 if (gfp_mask & __GFP_NORETRY)
2619 goto nomem;
2603 /* 2620 /*
2604 * Even though the limit is exceeded at this point, reclaim 2621 * Even though the limit is exceeded at this point, reclaim
2605 * may have been able to free some pages. Retry the charge 2622 * may have been able to free some pages. Retry the charge
@@ -2609,96 +2626,38 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2609 * unlikely to succeed so close to the limit, and we fall back 2626 * unlikely to succeed so close to the limit, and we fall back
2610 * to regular pages anyway in case of failure. 2627 * to regular pages anyway in case of failure.
2611 */ 2628 */
2612 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) 2629 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2613 return CHARGE_RETRY; 2630 goto retry;
2614
2615 /* 2631 /*
2616 * At task move, charge accounts can be doubly counted. So, it's 2632 * At task move, charge accounts can be doubly counted. So, it's
2617 * better to wait until the end of task_move if something is going on. 2633 * better to wait until the end of task_move if something is going on.
2618 */ 2634 */
2619 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2635 if (mem_cgroup_wait_acct_move(mem_over_limit))
2620 return CHARGE_RETRY; 2636 goto retry;
2621
2622 if (invoke_oom)
2623 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
2624
2625 return CHARGE_NOMEM;
2626}
2627
2628/**
2629 * mem_cgroup_try_charge - try charging a memcg
2630 * @memcg: memcg to charge
2631 * @nr_pages: number of pages to charge
2632 * @oom: trigger OOM if reclaim fails
2633 *
2634 * Returns 0 if @memcg was charged successfully, -EINTR if the charge
2635 * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
2636 */
2637static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2638 gfp_t gfp_mask,
2639 unsigned int nr_pages,
2640 bool oom)
2641{
2642 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2643 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2644 int ret;
2645
2646 if (mem_cgroup_is_root(memcg))
2647 goto done;
2648 /*
2649 * Unlike in global OOM situations, memcg is not in a physical
2650 * memory shortage. Allow dying and OOM-killed tasks to
2651 * bypass the last charges so that they can exit quickly and
2652 * free their memory.
2653 */
2654 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2655 fatal_signal_pending(current) ||
2656 current->flags & PF_EXITING))
2657 goto bypass;
2658 2637
2659 if (unlikely(task_in_memcg_oom(current))) 2638 if (nr_retries--)
2660 goto nomem; 2639 goto retry;
2661 2640
2662 if (gfp_mask & __GFP_NOFAIL) 2641 if (gfp_mask & __GFP_NOFAIL)
2663 oom = false; 2642 goto bypass;
2664again:
2665 if (consume_stock(memcg, nr_pages))
2666 goto done;
2667
2668 do {
2669 bool invoke_oom = oom && !nr_oom_retries;
2670
2671 /* If killed, bypass charge */
2672 if (fatal_signal_pending(current))
2673 goto bypass;
2674 2643
2675 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, 2644 if (fatal_signal_pending(current))
2676 nr_pages, invoke_oom); 2645 goto bypass;
2677 switch (ret) {
2678 case CHARGE_OK:
2679 break;
2680 case CHARGE_RETRY: /* not in OOM situation but retry */
2681 batch = nr_pages;
2682 goto again;
2683 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2684 goto nomem;
2685 case CHARGE_NOMEM: /* OOM routine works */
2686 if (!oom || invoke_oom)
2687 goto nomem;
2688 nr_oom_retries--;
2689 break;
2690 }
2691 } while (ret != CHARGE_OK);
2692 2646
2693 if (batch > nr_pages) 2647 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
2694 refill_stock(memcg, batch - nr_pages);
2695done:
2696 return 0;
2697nomem: 2648nomem:
2698 if (!(gfp_mask & __GFP_NOFAIL)) 2649 if (!(gfp_mask & __GFP_NOFAIL))
2699 return -ENOMEM; 2650 return -ENOMEM;
2700bypass: 2651bypass:
2701 return -EINTR; 2652 memcg = root_mem_cgroup;
2653 ret = -EINTR;
2654 goto retry;
2655
2656done_restock:
2657 if (batch > nr_pages)
2658 refill_stock(memcg, batch - nr_pages);
2659done:
2660 return ret;
2702} 2661}
2703 2662
2704/** 2663/**
@@ -2712,15 +2671,14 @@ bypass:
2712 */ 2671 */
2713static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, 2672static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
2714 gfp_t gfp_mask, 2673 gfp_t gfp_mask,
2715 unsigned int nr_pages, 2674 unsigned int nr_pages)
2716 bool oom)
2717 2675
2718{ 2676{
2719 struct mem_cgroup *memcg; 2677 struct mem_cgroup *memcg;
2720 int ret; 2678 int ret;
2721 2679
2722 memcg = get_mem_cgroup_from_mm(mm); 2680 memcg = get_mem_cgroup_from_mm(mm);
2723 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); 2681 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages);
2724 css_put(&memcg->css); 2682 css_put(&memcg->css);
2725 if (ret == -EINTR) 2683 if (ret == -EINTR)
2726 memcg = root_mem_cgroup; 2684 memcg = root_mem_cgroup;
@@ -2738,13 +2696,11 @@ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
2738static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, 2696static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2739 unsigned int nr_pages) 2697 unsigned int nr_pages)
2740{ 2698{
2741 if (!mem_cgroup_is_root(memcg)) { 2699 unsigned long bytes = nr_pages * PAGE_SIZE;
2742 unsigned long bytes = nr_pages * PAGE_SIZE;
2743 2700
2744 res_counter_uncharge(&memcg->res, bytes); 2701 res_counter_uncharge(&memcg->res, bytes);
2745 if (do_swap_account) 2702 if (do_swap_account)
2746 res_counter_uncharge(&memcg->memsw, bytes); 2703 res_counter_uncharge(&memcg->memsw, bytes);
2747 }
2748} 2704}
2749 2705
2750/* 2706/*
@@ -2756,9 +2712,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2756{ 2712{
2757 unsigned long bytes = nr_pages * PAGE_SIZE; 2713 unsigned long bytes = nr_pages * PAGE_SIZE;
2758 2714
2759 if (mem_cgroup_is_root(memcg))
2760 return;
2761
2762 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2715 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2763 if (do_swap_account) 2716 if (do_swap_account)
2764 res_counter_uncharge_until(&memcg->memsw, 2717 res_counter_uncharge_until(&memcg->memsw,
@@ -2842,14 +2795,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2842 } 2795 }
2843 2796
2844 pc->mem_cgroup = memcg; 2797 pc->mem_cgroup = memcg;
2845 /*
2846 * We access a page_cgroup asynchronously without lock_page_cgroup().
2847 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2848 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2849 * before USED bit, we need memory barrier here.
2850 * See mem_cgroup_add_lru_list(), etc.
2851 */
2852 smp_wmb();
2853 SetPageCgroupUsed(pc); 2798 SetPageCgroupUsed(pc);
2854 2799
2855 if (lrucare) { 2800 if (lrucare) {
@@ -2937,8 +2882,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2937 if (ret) 2882 if (ret)
2938 return ret; 2883 return ret;
2939 2884
2940 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, 2885 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT);
2941 oom_gfp_allowed(gfp));
2942 if (ret == -EINTR) { 2886 if (ret == -EINTR) {
2943 /* 2887 /*
2944 * mem_cgroup_try_charge() chosed to bypass to root due to 2888 * mem_cgroup_try_charge() chosed to bypass to root due to
@@ -3463,12 +3407,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3463 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3407 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3464 return; 3408 return;
3465 } 3409 }
3466 3410 /*
3411 * The page is freshly allocated and not visible to any
3412 * outside callers yet. Set up pc non-atomically.
3413 */
3467 pc = lookup_page_cgroup(page); 3414 pc = lookup_page_cgroup(page);
3468 lock_page_cgroup(pc);
3469 pc->mem_cgroup = memcg; 3415 pc->mem_cgroup = memcg;
3470 SetPageCgroupUsed(pc); 3416 pc->flags = PCG_USED;
3471 unlock_page_cgroup(pc);
3472} 3417}
3473 3418
3474void __memcg_kmem_uncharge_pages(struct page *page, int order) 3419void __memcg_kmem_uncharge_pages(struct page *page, int order)
@@ -3478,19 +3423,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
3478 3423
3479 3424
3480 pc = lookup_page_cgroup(page); 3425 pc = lookup_page_cgroup(page);
3481 /*
3482 * Fast unlocked return. Theoretically might have changed, have to
3483 * check again after locking.
3484 */
3485 if (!PageCgroupUsed(pc)) 3426 if (!PageCgroupUsed(pc))
3486 return; 3427 return;
3487 3428
3488 lock_page_cgroup(pc); 3429 memcg = pc->mem_cgroup;
3489 if (PageCgroupUsed(pc)) { 3430 pc->flags = 0;
3490 memcg = pc->mem_cgroup;
3491 ClearPageCgroupUsed(pc);
3492 }
3493 unlock_page_cgroup(pc);
3494 3431
3495 /* 3432 /*
3496 * We trust that only if there is a memcg associated with the page, it 3433 * We trust that only if there is a memcg associated with the page, it
@@ -3531,7 +3468,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3531 for (i = 1; i < HPAGE_PMD_NR; i++) { 3468 for (i = 1; i < HPAGE_PMD_NR; i++) {
3532 pc = head_pc + i; 3469 pc = head_pc + i;
3533 pc->mem_cgroup = memcg; 3470 pc->mem_cgroup = memcg;
3534 smp_wmb();/* see __commit_charge() */
3535 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 3471 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3536 } 3472 }
3537 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3473 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
@@ -3687,7 +3623,6 @@ int mem_cgroup_charge_anon(struct page *page,
3687{ 3623{
3688 unsigned int nr_pages = 1; 3624 unsigned int nr_pages = 1;
3689 struct mem_cgroup *memcg; 3625 struct mem_cgroup *memcg;
3690 bool oom = true;
3691 3626
3692 if (mem_cgroup_disabled()) 3627 if (mem_cgroup_disabled())
3693 return 0; 3628 return 0;
@@ -3699,14 +3634,9 @@ int mem_cgroup_charge_anon(struct page *page,
3699 if (PageTransHuge(page)) { 3634 if (PageTransHuge(page)) {
3700 nr_pages <<= compound_order(page); 3635 nr_pages <<= compound_order(page);
3701 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3636 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3702 /*
3703 * Never OOM-kill a process for a huge page. The
3704 * fault handler will fall back to regular pages.
3705 */
3706 oom = false;
3707 } 3637 }
3708 3638
3709 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); 3639 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages);
3710 if (!memcg) 3640 if (!memcg)
3711 return -ENOMEM; 3641 return -ENOMEM;
3712 __mem_cgroup_commit_charge(memcg, page, nr_pages, 3642 __mem_cgroup_commit_charge(memcg, page, nr_pages,
@@ -3743,7 +3673,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3743 memcg = try_get_mem_cgroup_from_page(page); 3673 memcg = try_get_mem_cgroup_from_page(page);
3744 if (!memcg) 3674 if (!memcg)
3745 memcg = get_mem_cgroup_from_mm(mm); 3675 memcg = get_mem_cgroup_from_mm(mm);
3746 ret = mem_cgroup_try_charge(memcg, mask, 1, true); 3676 ret = mem_cgroup_try_charge(memcg, mask, 1);
3747 css_put(&memcg->css); 3677 css_put(&memcg->css);
3748 if (ret == -EINTR) 3678 if (ret == -EINTR)
3749 memcg = root_mem_cgroup; 3679 memcg = root_mem_cgroup;
@@ -3770,7 +3700,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3770 if (!PageSwapCache(page)) { 3700 if (!PageSwapCache(page)) {
3771 struct mem_cgroup *memcg; 3701 struct mem_cgroup *memcg;
3772 3702
3773 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); 3703 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1);
3774 if (!memcg) 3704 if (!memcg)
3775 return -ENOMEM; 3705 return -ENOMEM;
3776 *memcgp = memcg; 3706 *memcgp = memcg;
@@ -3839,7 +3769,7 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
3839 return 0; 3769 return 0;
3840 } 3770 }
3841 3771
3842 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); 3772 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1);
3843 if (!memcg) 3773 if (!memcg)
3844 return -ENOMEM; 3774 return -ENOMEM;
3845 __mem_cgroup_commit_charge(memcg, page, 1, type, false); 3775 __mem_cgroup_commit_charge(memcg, page, 1, type, false);
@@ -3993,7 +3923,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
3993 * replacement page, so leave it alone when phasing out the 3923 * replacement page, so leave it alone when phasing out the
3994 * page that is unused after the migration. 3924 * page that is unused after the migration.
3995 */ 3925 */
3996 if (!end_migration && !mem_cgroup_is_root(memcg)) 3926 if (!end_migration)
3997 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3927 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3998 3928
3999 return memcg; 3929 return memcg;
@@ -4126,8 +4056,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
4126 * We uncharge this because swap is freed. This memcg can 4056 * We uncharge this because swap is freed. This memcg can
4127 * be obsolete one. We avoid calling css_tryget_online(). 4057 * be obsolete one. We avoid calling css_tryget_online().
4128 */ 4058 */
4129 if (!mem_cgroup_is_root(memcg)) 4059 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4130 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4131 mem_cgroup_swap_statistics(memcg, false); 4060 mem_cgroup_swap_statistics(memcg, false);
4132 css_put(&memcg->css); 4061 css_put(&memcg->css);
4133 } 4062 }
@@ -4817,78 +4746,24 @@ out:
4817 return retval; 4746 return retval;
4818} 4747}
4819 4748
4820
4821static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
4822 enum mem_cgroup_stat_index idx)
4823{
4824 struct mem_cgroup *iter;
4825 long val = 0;
4826
4827 /* Per-cpu values can be negative, use a signed accumulator */
4828 for_each_mem_cgroup_tree(iter, memcg)
4829 val += mem_cgroup_read_stat(iter, idx);
4830
4831 if (val < 0) /* race ? */
4832 val = 0;
4833 return val;
4834}
4835
4836static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4837{
4838 u64 val;
4839
4840 if (!mem_cgroup_is_root(memcg)) {
4841 if (!swap)
4842 return res_counter_read_u64(&memcg->res, RES_USAGE);
4843 else
4844 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4845 }
4846
4847 /*
4848 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4849 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4850 */
4851 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4852 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4853
4854 if (swap)
4855 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4856
4857 return val << PAGE_SHIFT;
4858}
4859
4860static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4749static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4861 struct cftype *cft) 4750 struct cftype *cft)
4862{ 4751{
4863 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4752 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4864 u64 val; 4753 enum res_type type = MEMFILE_TYPE(cft->private);
4865 int name; 4754 int name = MEMFILE_ATTR(cft->private);
4866 enum res_type type;
4867
4868 type = MEMFILE_TYPE(cft->private);
4869 name = MEMFILE_ATTR(cft->private);
4870 4755
4871 switch (type) { 4756 switch (type) {
4872 case _MEM: 4757 case _MEM:
4873 if (name == RES_USAGE) 4758 return res_counter_read_u64(&memcg->res, name);
4874 val = mem_cgroup_usage(memcg, false);
4875 else
4876 val = res_counter_read_u64(&memcg->res, name);
4877 break;
4878 case _MEMSWAP: 4759 case _MEMSWAP:
4879 if (name == RES_USAGE) 4760 return res_counter_read_u64(&memcg->memsw, name);
4880 val = mem_cgroup_usage(memcg, true);
4881 else
4882 val = res_counter_read_u64(&memcg->memsw, name);
4883 break;
4884 case _KMEM: 4761 case _KMEM:
4885 val = res_counter_read_u64(&memcg->kmem, name); 4762 return res_counter_read_u64(&memcg->kmem, name);
4886 break; 4763 break;
4887 default: 4764 default:
4888 BUG(); 4765 BUG();
4889 } 4766 }
4890
4891 return val;
4892} 4767}
4893 4768
4894#ifdef CONFIG_MEMCG_KMEM 4769#ifdef CONFIG_MEMCG_KMEM
@@ -5350,7 +5225,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5350 if (!t) 5225 if (!t)
5351 goto unlock; 5226 goto unlock;
5352 5227
5353 usage = mem_cgroup_usage(memcg, swap); 5228 if (!swap)
5229 usage = res_counter_read_u64(&memcg->res, RES_USAGE);
5230 else
5231 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
5354 5232
5355 /* 5233 /*
5356 * current_threshold points to threshold just below or equal to usage. 5234 * current_threshold points to threshold just below or equal to usage.
@@ -5446,15 +5324,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5446 5324
5447 mutex_lock(&memcg->thresholds_lock); 5325 mutex_lock(&memcg->thresholds_lock);
5448 5326
5449 if (type == _MEM) 5327 if (type == _MEM) {
5450 thresholds = &memcg->thresholds; 5328 thresholds = &memcg->thresholds;
5451 else if (type == _MEMSWAP) 5329 usage = res_counter_read_u64(&memcg->res, RES_USAGE);
5330 } else if (type == _MEMSWAP) {
5452 thresholds = &memcg->memsw_thresholds; 5331 thresholds = &memcg->memsw_thresholds;
5453 else 5332 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
5333 } else
5454 BUG(); 5334 BUG();
5455 5335
5456 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5457
5458 /* Check if a threshold crossed before adding a new one */ 5336 /* Check if a threshold crossed before adding a new one */
5459 if (thresholds->primary) 5337 if (thresholds->primary)
5460 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5338 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@ -5534,18 +5412,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5534 int i, j, size; 5412 int i, j, size;
5535 5413
5536 mutex_lock(&memcg->thresholds_lock); 5414 mutex_lock(&memcg->thresholds_lock);
5537 if (type == _MEM) 5415
5416 if (type == _MEM) {
5538 thresholds = &memcg->thresholds; 5417 thresholds = &memcg->thresholds;
5539 else if (type == _MEMSWAP) 5418 usage = res_counter_read_u64(&memcg->res, RES_USAGE);
5419 } else if (type == _MEMSWAP) {
5540 thresholds = &memcg->memsw_thresholds; 5420 thresholds = &memcg->memsw_thresholds;
5541 else 5421 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
5422 } else
5542 BUG(); 5423 BUG();
5543 5424
5544 if (!thresholds->primary) 5425 if (!thresholds->primary)
5545 goto unlock; 5426 goto unlock;
5546 5427
5547 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5548
5549 /* Check if a threshold crossed before removing */ 5428 /* Check if a threshold crossed before removing */
5550 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5429 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5551 5430
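
With mem_cgroup_usage() gone, the threshold code above reads usage straight from the matching res_counter and then walks the sorted threshold array; as the retained comment says, current_threshold points at the threshold just below or equal to usage. A compact stand-alone model of that walk follows; the struct and fire() are illustrative only, not the eventfd-based kernel machinery:

#include <stdio.h>

struct thresholds {
    const unsigned long *entries;   /* sorted ascending */
    int count;
    int current;                    /* -1 if usage is below every entry */
};

static void fire(unsigned long threshold)
{
    printf("crossed threshold %lu\n", threshold);
}

static void check_thresholds(struct thresholds *t, unsigned long usage)
{
    int i;

    /* usage fell below some thresholds: notify while walking left */
    for (i = t->current; i >= 0 && t->entries[i] > usage; i--)
        fire(t->entries[i]);
    i++;
    /* usage rose above some thresholds: notify while walking right */
    for (; i < t->count && t->entries[i] <= usage; i++)
        fire(t->entries[i]);

    t->current = i - 1;             /* largest entry <= usage */
}

int main(void)
{
    const unsigned long entries[] = { 100, 200, 400 };
    struct thresholds t = { entries, 3, -1 };

    check_thresholds(&t, 250);      /* fires 100 and 200 on the way up */
    check_thresholds(&t, 150);      /* fires 200 on the way down */
    return 0;
}
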
@@ -6299,9 +6178,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6299 * core guarantees its existence. 6178 * core guarantees its existence.
6300 */ 6179 */
6301 } else { 6180 } else {
6302 res_counter_init(&memcg->res, NULL); 6181 res_counter_init(&memcg->res, &root_mem_cgroup->res);
6303 res_counter_init(&memcg->memsw, NULL); 6182 res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
6304 res_counter_init(&memcg->kmem, NULL); 6183 res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
6305 /* 6184 /*
6306 * Deeper hierachy with use_hierarchy == false doesn't make 6185 * Deeper hierachy with use_hierarchy == false doesn't make
6307 * much sense so let cgroup subsystem know about this 6186 * much sense so let cgroup subsystem know about this
@@ -6435,55 +6314,39 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
6435 6314
6436#ifdef CONFIG_MMU 6315#ifdef CONFIG_MMU
6437/* Handlers for move charge at task migration. */ 6316/* Handlers for move charge at task migration. */
6438#define PRECHARGE_COUNT_AT_ONCE 256
6439static int mem_cgroup_do_precharge(unsigned long count) 6317static int mem_cgroup_do_precharge(unsigned long count)
6440{ 6318{
6441 int ret = 0; 6319 int ret;
6442 int batch_count = PRECHARGE_COUNT_AT_ONCE;
6443 struct mem_cgroup *memcg = mc.to;
6444 6320
6445 if (mem_cgroup_is_root(memcg)) { 6321 /* Try a single bulk charge without reclaim first */
6322 ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
6323 if (!ret) {
6446 mc.precharge += count; 6324 mc.precharge += count;
6447 /* we don't need css_get for root */
6448 return ret; 6325 return ret;
6449 } 6326 }
6450 /* try to charge at once */ 6327 if (ret == -EINTR) {
6451 if (count > 1) { 6328 __mem_cgroup_cancel_charge(root_mem_cgroup, count);
6452 struct res_counter *dummy;
6453 /*
6454 * "memcg" cannot be under rmdir() because we've already checked
6455 * by cgroup_lock_live_cgroup() that it is not removed and we
6456 * are still under the same cgroup_mutex. So we can postpone
6457 * css_get().
6458 */
6459 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
6460 goto one_by_one;
6461 if (do_swap_account && res_counter_charge(&memcg->memsw,
6462 PAGE_SIZE * count, &dummy)) {
6463 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
6464 goto one_by_one;
6465 }
6466 mc.precharge += count;
6467 return ret; 6329 return ret;
6468 } 6330 }
6469one_by_one: 6331
6470 /* fall back to one by one charge */ 6332 /* Try charges one by one with reclaim */
6471 while (count--) { 6333 while (count--) {
6472 if (signal_pending(current)) { 6334 ret = mem_cgroup_try_charge(mc.to,
6473 ret = -EINTR; 6335 GFP_KERNEL & ~__GFP_NORETRY, 1);
6474 break; 6336 /*
6475 } 6337 * In case of failure, any residual charges against
6476 if (!batch_count--) { 6338 * mc.to will be dropped by mem_cgroup_clear_mc()
6477 batch_count = PRECHARGE_COUNT_AT_ONCE; 6339 * later on. However, cancel any charges that are
6478 cond_resched(); 6340 * bypassed to root right away or they'll be lost.
6479 } 6341 */
6480 ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); 6342 if (ret == -EINTR)
6343 __mem_cgroup_cancel_charge(root_mem_cgroup, 1);
6481 if (ret) 6344 if (ret)
6482 /* mem_cgroup_clear_mc() will do uncharge later */
6483 return ret; 6345 return ret;
6484 mc.precharge++; 6346 mc.precharge++;
6347 cond_resched();
6485 } 6348 }
6486 return ret; 6349 return 0;
6487} 6350}
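
The reworked mem_cgroup_do_precharge() above first attempts one bulk charge with reclaim disabled (GFP_KERNEL & ~__GFP_WAIT) and only falls back to charging page by page, with reclaim allowed, when the bulk attempt fails; charges that were bypassed to root (-EINTR) are cancelled right away. The user-space sketch below captures the bulk-then-incremental shape of that strategy and leaves out the root-bypass handling; reserve_fast(), reserve_slow() and precharge() are hypothetical callbacks, not kernel functions:

#include <stdbool.h>

typedef bool (*reserve_fn)(void *pool, long nr);

static long precharge(void *pool, long count,
                      reserve_fn reserve_fast, reserve_fn reserve_slow)
{
    long done = 0;

    /* Try a single cheap bulk reservation first. */
    if (reserve_fast(pool, count))
        return count;

    /* Fall back to one-by-one reservations via the expensive path. */
    while (done < count) {
        if (!reserve_slow(pool, 1))
            return -1;          /* caller unwinds the partial reservation */
        done++;
    }
    return done;
}
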
6488 6351
6489/** 6352/**
@@ -6760,21 +6623,18 @@ static void __mem_cgroup_clear_mc(void)
6760 /* we must fixup refcnts and charges */ 6623 /* we must fixup refcnts and charges */
6761 if (mc.moved_swap) { 6624 if (mc.moved_swap) {
6762 /* uncharge swap account from the old cgroup */ 6625 /* uncharge swap account from the old cgroup */
6763 if (!mem_cgroup_is_root(mc.from)) 6626 res_counter_uncharge(&mc.from->memsw,
6764 res_counter_uncharge(&mc.from->memsw, 6627 PAGE_SIZE * mc.moved_swap);
6765 PAGE_SIZE * mc.moved_swap);
6766 6628
6767 for (i = 0; i < mc.moved_swap; i++) 6629 for (i = 0; i < mc.moved_swap; i++)
6768 css_put(&mc.from->css); 6630 css_put(&mc.from->css);
6769 6631
6770 if (!mem_cgroup_is_root(mc.to)) { 6632 /*
6771 /* 6633 * we charged both to->res and to->memsw, so we should
6772 * we charged both to->res and to->memsw, so we should 6634 * uncharge to->res.
6773 * uncharge to->res. 6635 */
6774 */ 6636 res_counter_uncharge(&mc.to->res,
6775 res_counter_uncharge(&mc.to->res, 6637 PAGE_SIZE * mc.moved_swap);
6776 PAGE_SIZE * mc.moved_swap);
6777 }
6778 /* we've already done css_get(mc.to) */ 6638 /* we've already done css_get(mc.to) */
6779 mc.moved_swap = 0; 6639 mc.moved_swap = 0;
6780 } 6640 }
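
A recurring theme in this memcontrol.c diff is that root_mem_cgroup now has working res_counters: the css_online hunk makes every memcg's counters children of root's, which is what lets the earlier hunks drop mem_cgroup_usage()/mem_cgroup_recursive_stat() and the scattered !mem_cgroup_is_root() special cases. A toy hierarchical counter with charges propagating to every ancestor, as a plain C sketch without locking; struct res_counter here is only an illustrative stand-in:

#include <stdbool.h>

struct res_counter {
    unsigned long usage;
    unsigned long limit;
    struct res_counter *parent;
};

static bool res_charge(struct res_counter *rc, unsigned long val)
{
    struct res_counter *c;

    for (c = rc; c; c = c->parent) {
        if (c->usage + val > c->limit)
            goto undo;
        c->usage += val;
    }
    return true;

undo:
    /* roll back the levels that were already charged */
    for (struct res_counter *u = rc; u != c; u = u->parent)
        u->usage -= val;
    return false;
}

static void res_uncharge(struct res_counter *rc, unsigned long val)
{
    for (struct res_counter *c = rc; c; c = c->parent)
        c->usage -= val;
}

Because every charge also lands in the root counter, reading usage at the root needs no tree walk, which mirrors why mem_cgroup_read_u64() above can simply return res_counter_read_u64().
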
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a013bc94ebbe..44c6bd201d3a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1173,6 +1173,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1173 lock_page(hpage); 1173 lock_page(hpage);
1174 1174
1175 /* 1175 /*
1176 * The page could have changed compound pages during the locking.
1177 * If this happens just bail out.
1178 */
1179 if (compound_head(p) != hpage) {
1180 action_result(pfn, "different compound page after locking", IGNORED);
1181 res = -EBUSY;
1182 goto out;
1183 }
1184
1185 /*
1176 * We use page flags to determine what action should be taken, but 1186 * We use page flags to determine what action should be taken, but
1177 * the flags can be modified by the error containment action. One 1187 * the flags can be modified by the error containment action. One
1178 * example is an mlocked page, where PG_mlocked is cleared by 1188 * example is an mlocked page, where PG_mlocked is cleared by
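
The memory-failure hunk above re-checks compound_head(p) after taking the page lock, because the compound page can change while the caller sleeps in lock_page(); if it did, the handler bails out with -EBUSY rather than acting on stale state. The same check-again-under-the-lock pattern in a generic, user-space form; struct object and its owner field are made up for the sketch:

#include <pthread.h>
#include <errno.h>

struct object {
    pthread_mutex_t lock;
    void *owner;            /* may change until we hold ->lock */
};

static int operate_on(struct object *obj, void *expected_owner)
{
    int ret = 0;

    pthread_mutex_lock(&obj->lock);

    /* The snapshot taken before locking may be stale: bail out
     * instead of acting on an object that changed underneath us. */
    if (obj->owner != expected_owner) {
        ret = -EBUSY;
        goto out;
    }

    /* ... safe to act on obj here ... */
out:
    pthread_mutex_unlock(&obj->lock);
    return ret;
}
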
diff --git a/mm/memory.c b/mm/memory.c
index 8b44f765b645..5c55270729f7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -884,7 +884,7 @@ out_set_pte:
884 return 0; 884 return 0;
885} 885}
886 886
887int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 887static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
888 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 888 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
889 unsigned long addr, unsigned long end) 889 unsigned long addr, unsigned long end)
890{ 890{
@@ -2399,7 +2399,10 @@ EXPORT_SYMBOL(unmap_mapping_range);
2399/* 2399/*
2400 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2400 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2401 * but allow concurrent faults), and pte mapped but not yet locked. 2401 * but allow concurrent faults), and pte mapped but not yet locked.
2402 * We return with mmap_sem still held, but pte unmapped and unlocked. 2402 * We return with pte unmapped and unlocked.
2403 *
2404 * We return with the mmap_sem locked or unlocked in the same cases
2405 * as does filemap_fault().
2403 */ 2406 */
2404static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2407static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2405 unsigned long address, pte_t *page_table, pmd_t *pmd, 2408 unsigned long address, pte_t *page_table, pmd_t *pmd,
@@ -2688,6 +2691,11 @@ oom:
2688 return VM_FAULT_OOM; 2691 return VM_FAULT_OOM;
2689} 2692}
2690 2693
2694/*
2695 * The mmap_sem must have been held on entry, and may have been
2696 * released depending on flags and vma->vm_ops->fault() return value.
2697 * See filemap_fault() and __lock_page_retry().
2698 */
2691static int __do_fault(struct vm_area_struct *vma, unsigned long address, 2699static int __do_fault(struct vm_area_struct *vma, unsigned long address,
2692 pgoff_t pgoff, unsigned int flags, struct page **page) 2700 pgoff_t pgoff, unsigned int flags, struct page **page)
2693{ 2701{
@@ -2744,7 +2752,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2744 if (write) 2752 if (write)
2745 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2753 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2746 else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) 2754 else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
2747 pte_mksoft_dirty(entry); 2755 entry = pte_mksoft_dirty(entry);
2748 if (anon) { 2756 if (anon) {
2749 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2757 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2750 page_add_new_anon_rmap(page, vma, address); 2758 page_add_new_anon_rmap(page, vma, address);
@@ -2758,17 +2766,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2758 update_mmu_cache(vma, address, pte); 2766 update_mmu_cache(vma, address, pte);
2759} 2767}
2760 2768
2761static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); 2769static unsigned long fault_around_bytes __read_mostly =
2762 2770 rounddown_pow_of_two(65536);
2763static inline unsigned long fault_around_pages(void)
2764{
2765 return fault_around_bytes >> PAGE_SHIFT;
2766}
2767
2768static inline unsigned long fault_around_mask(void)
2769{
2770 return ~(fault_around_bytes - 1) & PAGE_MASK;
2771}
2772 2771
2773#ifdef CONFIG_DEBUG_FS 2772#ifdef CONFIG_DEBUG_FS
2774static int fault_around_bytes_get(void *data, u64 *val) 2773static int fault_around_bytes_get(void *data, u64 *val)
@@ -2834,12 +2833,15 @@ late_initcall(fault_around_debugfs);
2834static void do_fault_around(struct vm_area_struct *vma, unsigned long address, 2833static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2835 pte_t *pte, pgoff_t pgoff, unsigned int flags) 2834 pte_t *pte, pgoff_t pgoff, unsigned int flags)
2836{ 2835{
2837 unsigned long start_addr; 2836 unsigned long start_addr, nr_pages, mask;
2838 pgoff_t max_pgoff; 2837 pgoff_t max_pgoff;
2839 struct vm_fault vmf; 2838 struct vm_fault vmf;
2840 int off; 2839 int off;
2841 2840
2842 start_addr = max(address & fault_around_mask(), vma->vm_start); 2841 nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2842 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2843
2844 start_addr = max(address & mask, vma->vm_start);
2843 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 2845 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
2844 pte -= off; 2846 pte -= off;
2845 pgoff -= off; 2847 pgoff -= off;
@@ -2851,7 +2853,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2851 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 2853 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
2852 PTRS_PER_PTE - 1; 2854 PTRS_PER_PTE - 1;
2853 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, 2855 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
2854 pgoff + fault_around_pages() - 1); 2856 pgoff + nr_pages - 1);
2855 2857
2856 /* Check if it makes any sense to call ->map_pages */ 2858 /* Check if it makes any sense to call ->map_pages */
2857 while (!pte_none(*pte)) { 2859 while (!pte_none(*pte)) {
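
After the change above, do_fault_around() derives everything from one snapshot of fault_around_bytes: the number of pages to map ahead and the alignment mask that picks the start of the window, clamped to the VMA. The arithmetic in isolation, as a runnable example; the address and vm_start values are arbitrary, and only the 65536-byte figure comes from the hunk:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long fault_around_bytes = 65536;   /* rounddown_pow_of_two() result */
    unsigned long address = 0x7f0012345678UL;   /* example faulting address */
    unsigned long vm_start = 0x7f0012300000UL;  /* example vma->vm_start */

    unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
    unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
    unsigned long start_addr = address & mask;

    if (start_addr < vm_start)                  /* max(address & mask, vm_start) */
        start_addr = vm_start;

    printf("fault around %lu pages starting at %#lx\n", nr_pages, start_addr);
    return 0;
}
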
@@ -2886,7 +2888,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2886 * something). 2888 * something).
2887 */ 2889 */
2888 if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && 2890 if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
2889 fault_around_pages() > 1) { 2891 fault_around_bytes >> PAGE_SHIFT > 1) {
2890 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2892 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2891 do_fault_around(vma, address, pte, pgoff, flags); 2893 do_fault_around(vma, address, pte, pgoff, flags);
2892 if (!pte_same(*pte, orig_pte)) 2894 if (!pte_same(*pte, orig_pte))
@@ -3016,6 +3018,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3016 return ret; 3018 return ret;
3017} 3019}
3018 3020
3021/*
3022 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3023 * but allow concurrent faults).
3024 * The mmap_sem may have been released depending on flags and our
3025 * return value. See filemap_fault() and __lock_page_or_retry().
3026 */
3019static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3027static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3020 unsigned long address, pte_t *page_table, pmd_t *pmd, 3028 unsigned long address, pte_t *page_table, pmd_t *pmd,
3021 unsigned int flags, pte_t orig_pte) 3029 unsigned int flags, pte_t orig_pte)
@@ -3040,7 +3048,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3040 * 3048 *
3041 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3049 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3042 * but allow concurrent faults), and pte mapped but not yet locked. 3050 * but allow concurrent faults), and pte mapped but not yet locked.
3043 * We return with mmap_sem still held, but pte unmapped and unlocked. 3051 * We return with pte unmapped and unlocked.
3052 * The mmap_sem may have been released depending on flags and our
3053 * return value. See filemap_fault() and __lock_page_or_retry().
3044 */ 3054 */
3045static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3055static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3046 unsigned long address, pte_t *page_table, pmd_t *pmd, 3056 unsigned long address, pte_t *page_table, pmd_t *pmd,
@@ -3172,7 +3182,10 @@ out:
3172 * 3182 *
3173 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3183 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3174 * but allow concurrent faults), and pte mapped but not yet locked. 3184 * but allow concurrent faults), and pte mapped but not yet locked.
3175 * We return with mmap_sem still held, but pte unmapped and unlocked. 3185 * We return with pte unmapped and unlocked.
3186 *
3187 * The mmap_sem may have been released depending on flags and our
3188 * return value. See filemap_fault() and __lock_page_or_retry().
3176 */ 3189 */
3177static int handle_pte_fault(struct mm_struct *mm, 3190static int handle_pte_fault(struct mm_struct *mm,
3178 struct vm_area_struct *vma, unsigned long address, 3191 struct vm_area_struct *vma, unsigned long address,
@@ -3181,7 +3194,7 @@ static int handle_pte_fault(struct mm_struct *mm,
3181 pte_t entry; 3194 pte_t entry;
3182 spinlock_t *ptl; 3195 spinlock_t *ptl;
3183 3196
3184 entry = *pte; 3197 entry = ACCESS_ONCE(*pte);
3185 if (!pte_present(entry)) { 3198 if (!pte_present(entry)) {
3186 if (pte_none(entry)) { 3199 if (pte_none(entry)) {
3187 if (vma->vm_ops) { 3200 if (vma->vm_ops) {
@@ -3232,6 +3245,9 @@ unlock:
3232 3245
3233/* 3246/*
3234 * By the time we get here, we already hold the mm semaphore 3247 * By the time we get here, we already hold the mm semaphore
3248 *
3249 * The mmap_sem may have been released depending on flags and our
3250 * return value. See filemap_fault() and __lock_page_or_retry().
3235 */ 3251 */
3236static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3252static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3237 unsigned long address, unsigned int flags) 3253 unsigned long address, unsigned int flags)
@@ -3313,6 +3329,12 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3313 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3329 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3314} 3330}
3315 3331
3332/*
3333 * By the time we get here, we already hold the mm semaphore
3334 *
3335 * The mmap_sem may have been released depending on flags and our
3336 * return value. See filemap_fault() and __lock_page_or_retry().
3337 */
3316int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3338int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3317 unsigned long address, unsigned int flags) 3339 unsigned long address, unsigned int flags)
3318{ 3340{
@@ -3591,11 +3613,13 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3591 ret = get_user_pages(tsk, mm, addr, 1, 3613 ret = get_user_pages(tsk, mm, addr, 1,
3592 write, 1, &page, &vma); 3614 write, 1, &page, &vma);
3593 if (ret <= 0) { 3615 if (ret <= 0) {
3616#ifndef CONFIG_HAVE_IOREMAP_PROT
3617 break;
3618#else
3594 /* 3619 /*
3595 * Check if this is a VM_IO | VM_PFNMAP VMA, which 3620 * Check if this is a VM_IO | VM_PFNMAP VMA, which
3596 * we can access using slightly different code. 3621 * we can access using slightly different code.
3597 */ 3622 */
3598#ifdef CONFIG_HAVE_IOREMAP_PROT
3599 vma = find_vma(mm, addr); 3623 vma = find_vma(mm, addr);
3600 if (!vma || vma->vm_start > addr) 3624 if (!vma || vma->vm_start > addr)
3601 break; 3625 break;
@@ -3603,9 +3627,9 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3603 ret = vma->vm_ops->access(vma, addr, buf, 3627 ret = vma->vm_ops->access(vma, addr, buf,
3604 len, write); 3628 len, write);
3605 if (ret <= 0) 3629 if (ret <= 0)
3606#endif
3607 break; 3630 break;
3608 bytes = ret; 3631 bytes = ret;
3632#endif
3609 } else { 3633 } else {
3610 bytes = len; 3634 bytes = len;
3611 offset = addr & (PAGE_SIZE-1); 3635 offset = addr & (PAGE_SIZE-1);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 469bbf505f85..2ff8c2325e96 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -284,8 +284,8 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
284} 284}
285#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ 285#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
286 286
287static void grow_zone_span(struct zone *zone, unsigned long start_pfn, 287static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
288 unsigned long end_pfn) 288 unsigned long end_pfn)
289{ 289{
290 unsigned long old_zone_end_pfn; 290 unsigned long old_zone_end_pfn;
291 291
@@ -427,8 +427,8 @@ out_fail:
427 return -1; 427 return -1;
428} 428}
429 429
430static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 430static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
431 unsigned long end_pfn) 431 unsigned long end_pfn)
432{ 432{
433 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); 433 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
434 434
@@ -977,15 +977,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
977 zone = page_zone(pfn_to_page(pfn)); 977 zone = page_zone(pfn_to_page(pfn));
978 978
979 ret = -EINVAL; 979 ret = -EINVAL;
980 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 980 if ((zone_idx(zone) > ZONE_NORMAL ||
981 online_type == MMOP_ONLINE_MOVABLE) &&
981 !can_online_high_movable(zone)) 982 !can_online_high_movable(zone))
982 goto out; 983 goto out;
983 984
984 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 985 if (online_type == MMOP_ONLINE_KERNEL &&
986 zone_idx(zone) == ZONE_MOVABLE) {
985 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) 987 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
986 goto out; 988 goto out;
987 } 989 }
988 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 990 if (online_type == MMOP_ONLINE_MOVABLE &&
991 zone_idx(zone) == ZONE_MOVABLE - 1) {
989 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) 992 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
990 goto out; 993 goto out;
991 } 994 }
@@ -1156,6 +1159,34 @@ static int check_hotplug_memory_range(u64 start, u64 size)
1156 return 0; 1159 return 0;
1157} 1160}
1158 1161
1162/*
1163 * If movable zone has already been setup, newly added memory should be checked.
1164 * If its address is higher than movable zone, it should be added as movable.
1165 * Without this check, movable zone may overlap with other zone.
1166 */
1167static int should_add_memory_movable(int nid, u64 start, u64 size)
1168{
1169 unsigned long start_pfn = start >> PAGE_SHIFT;
1170 pg_data_t *pgdat = NODE_DATA(nid);
1171 struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;
1172
1173 if (zone_is_empty(movable_zone))
1174 return 0;
1175
1176 if (movable_zone->zone_start_pfn <= start_pfn)
1177 return 1;
1178
1179 return 0;
1180}
1181
1182int zone_for_memory(int nid, u64 start, u64 size, int zone_default)
1183{
1184 if (should_add_memory_movable(nid, start, size))
1185 return ZONE_MOVABLE;
1186
1187 return zone_default;
1188}
1189
1159/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1190/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1160int __ref add_memory(int nid, u64 start, u64 size) 1191int __ref add_memory(int nid, u64 start, u64 size)
1161{ 1192{
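
zone_for_memory() above encodes a simple rule: once a movable zone exists, hot-added memory whose start pfn is at or above the movable zone's start must itself go to ZONE_MOVABLE, or the zones would overlap. A stripped-down rendition of that decision; struct zone_span and the ZONE_*_ID constants are stand-ins for the kernel's zone bookkeeping:

struct zone_span {
    unsigned long start_pfn;
    unsigned long nr_pages;     /* zero means the zone is empty */
};

enum { ZONE_NORMAL_ID, ZONE_MOVABLE_ID };

static int zone_for_hotadd(const struct zone_span *movable,
                           unsigned long start_pfn, int zone_default)
{
    if (movable->nr_pages == 0)             /* no movable zone configured yet */
        return zone_default;
    if (movable->start_pfn <= start_pfn)    /* new range lies inside or above it */
        return ZONE_MOVABLE_ID;
    return zone_default;
}

For any start_pfn at or past movable->start_pfn, zone_for_hotadd() returns ZONE_MOVABLE_ID; otherwise the caller's default zone is kept, matching the fall-through in the hunk.
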
diff --git a/mm/mlock.c b/mm/mlock.c
index b1eb53634005..ce84cb0b83ef 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -210,12 +210,19 @@ out:
210 * @vma: target vma 210 * @vma: target vma
211 * @start: start address 211 * @start: start address
212 * @end: end address 212 * @end: end address
213 * @nonblocking:
213 * 214 *
214 * This takes care of making the pages present too. 215 * This takes care of making the pages present too.
215 * 216 *
216 * return 0 on success, negative error code on error. 217 * return 0 on success, negative error code on error.
217 * 218 *
218 * vma->vm_mm->mmap_sem must be held for at least read. 219 * vma->vm_mm->mmap_sem must be held.
220 *
221 * If @nonblocking is NULL, it may be held for read or write and will
222 * be unperturbed.
223 *
224 * If @nonblocking is non-NULL, it must be held for read only and may be
225 * released. If it's released, *@nonblocking will be set to 0.
219 */ 226 */
220long __mlock_vma_pages_range(struct vm_area_struct *vma, 227long __mlock_vma_pages_range(struct vm_area_struct *vma,
221 unsigned long start, unsigned long end, int *nonblocking) 228 unsigned long start, unsigned long end, int *nonblocking)
diff --git a/mm/mmap.c b/mm/mmap.c
index 129b847d30cc..64c9d736155c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/rmap.h> 32#include <linux/rmap.h>
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/mmdebug.h>
34#include <linux/perf_event.h> 35#include <linux/perf_event.h>
35#include <linux/audit.h> 36#include <linux/audit.h>
36#include <linux/khugepaged.h> 37#include <linux/khugepaged.h>
@@ -134,6 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
134{ 135{
135 unsigned long free, allowed, reserve; 136 unsigned long free, allowed, reserve;
136 137
138 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
139 -(s64)vm_committed_as_batch * num_online_cpus(),
140 "memory commitment underflow");
141
137 vm_acct_memory(pages); 142 vm_acct_memory(pages);
138 143
139 /* 144 /*
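
The VM_WARN_ONCE added to __vm_enough_memory() leans on the error bound of a per-cpu counter: an approximate percpu_counter_read() can trail the true value by at most the batch size per online CPU, so a reading below -batch * nr_cpus signals real underflow of the committed-memory accounting rather than sampling noise. A quick numeric model of that bound, with all values chosen only for illustration:

#include <stdio.h>

int main(void)
{
    long batch = 32;            /* per-cpu batch (illustrative value) */
    long nr_cpus = 8;
    long approx_read = -300;    /* value an approximate read might return */

    long worst_case_skew = -batch * nr_cpus;    /* -256 here */

    if (approx_read < worst_case_skew)
        printf("memory commitment underflow (read %ld < bound %ld)\n",
               approx_read, worst_case_skew);
    return 0;
}
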
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 41cefdf0aadd..950813b1eb36 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -23,6 +23,25 @@
23static struct srcu_struct srcu; 23static struct srcu_struct srcu;
24 24
25/* 25/*
26 * This function allows mmu_notifier::release callback to delay a call to
27 * a function that will free appropriate resources. The function must be
28 * quick and must not block.
29 */
30void mmu_notifier_call_srcu(struct rcu_head *rcu,
31 void (*func)(struct rcu_head *rcu))
32{
33 call_srcu(&srcu, rcu, func);
34}
35EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu);
36
37void mmu_notifier_synchronize(void)
38{
39 /* Wait for any running method to finish. */
40 srcu_barrier(&srcu);
41}
42EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
43
44/*
26 * This function can't run concurrently against mmu_notifier_register 45 * This function can't run concurrently against mmu_notifier_register
27 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap 46 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
28 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers 47 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
@@ -53,7 +72,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
53 */ 72 */
54 if (mn->ops->release) 73 if (mn->ops->release)
55 mn->ops->release(mn, mm); 74 mn->ops->release(mn, mm);
56 srcu_read_unlock(&srcu, id);
57 75
58 spin_lock(&mm->mmu_notifier_mm->lock); 76 spin_lock(&mm->mmu_notifier_mm->lock);
59 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 77 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -69,6 +87,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
69 hlist_del_init_rcu(&mn->hlist); 87 hlist_del_init_rcu(&mn->hlist);
70 } 88 }
71 spin_unlock(&mm->mmu_notifier_mm->lock); 89 spin_unlock(&mm->mmu_notifier_mm->lock);
90 srcu_read_unlock(&srcu, id);
72 91
73 /* 92 /*
74 * synchronize_srcu here prevents mmu_notifier_release from returning to 93 * synchronize_srcu here prevents mmu_notifier_release from returning to
@@ -325,6 +344,25 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
325} 344}
326EXPORT_SYMBOL_GPL(mmu_notifier_unregister); 345EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
327 346
347/*
348 * Same as mmu_notifier_unregister but no callback and no srcu synchronization.
349 */
350void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
351 struct mm_struct *mm)
352{
353 spin_lock(&mm->mmu_notifier_mm->lock);
354 /*
355 * Can not use list_del_rcu() since __mmu_notifier_release
356 * can delete it before we hold the lock.
357 */
358 hlist_del_init_rcu(&mn->hlist);
359 spin_unlock(&mm->mmu_notifier_mm->lock);
360
361 BUG_ON(atomic_read(&mm->mm_count) <= 0);
362 mmdrop(mm);
363}
364EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
365
328static int __init mmu_notifier_init(void) 366static int __init mmu_notifier_init(void)
329{ 367{
330 return init_srcu_struct(&srcu); 368 return init_srcu_struct(&srcu);
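
mmu_notifier_unregister_no_release() above is the quiet teardown: unlink the notifier under the registration lock, skip the ->release() callback and the synchronization of the full unregister path, and drop the mm reference pinned at registration time. A user-space approximation of that shape, with a mutex-protected list and a reference count standing in for mm->mm_count; all types below are invented and no grace period is modelled:

#include <pthread.h>
#include <stdlib.h>

struct notifier {
    struct notifier *next;
};

struct registry {
    pthread_mutex_t lock;
    struct notifier *head;
    long refs;                  /* models mm->mm_count */
};

/* Toy: assumes nobody else can look the registry up once the last
 * reference is being dropped. */
static void registry_put(struct registry *reg)
{
    long left;

    pthread_mutex_lock(&reg->lock);
    left = --reg->refs;
    pthread_mutex_unlock(&reg->lock);
    if (left == 0)
        free(reg);
}

static void notifier_unregister_no_release(struct registry *reg,
                                           struct notifier *n)
{
    struct notifier **pp;

    pthread_mutex_lock(&reg->lock);
    for (pp = &reg->head; *pp; pp = &(*pp)->next) {
        if (*pp == n) {
            *pp = n->next;      /* unlink only; no callback is invoked */
            break;
        }
    }
    pthread_mutex_unlock(&reg->lock);

    registry_put(reg);          /* drop the reference taken at register time */
}
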
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3291e82d4352..1e11df8fa7ec 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -258,8 +258,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
258 unsigned long totalpages, const nodemask_t *nodemask, 258 unsigned long totalpages, const nodemask_t *nodemask,
259 bool force_kill) 259 bool force_kill)
260{ 260{
261 if (task->exit_state)
262 return OOM_SCAN_CONTINUE;
263 if (oom_unkillable_task(task, NULL, nodemask)) 261 if (oom_unkillable_task(task, NULL, nodemask))
264 return OOM_SCAN_CONTINUE; 262 return OOM_SCAN_CONTINUE;
265 263
@@ -559,28 +557,25 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
559 * if a parallel OOM killing is already taking place that includes a zone in 557 * if a parallel OOM killing is already taking place that includes a zone in
560 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. 558 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
561 */ 559 */
562int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) 560bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
563{ 561{
564 struct zoneref *z; 562 struct zoneref *z;
565 struct zone *zone; 563 struct zone *zone;
566 int ret = 1; 564 bool ret = true;
567 565
568 spin_lock(&zone_scan_lock); 566 spin_lock(&zone_scan_lock);
569 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 567 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
570 if (zone_is_oom_locked(zone)) { 568 if (zone_is_oom_locked(zone)) {
571 ret = 0; 569 ret = false;
572 goto out; 570 goto out;
573 } 571 }
574 }
575 572
576 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 573 /*
577 /* 574 * Lock each zone in the zonelist under zone_scan_lock so a parallel
578 * Lock each zone in the zonelist under zone_scan_lock so a 575 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
579 * parallel invocation of try_set_zonelist_oom() doesn't succeed 576 */
580 * when it shouldn't. 577 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
581 */
582 zone_set_flag(zone, ZONE_OOM_LOCKED); 578 zone_set_flag(zone, ZONE_OOM_LOCKED);
583 }
584 579
585out: 580out:
586 spin_unlock(&zone_scan_lock); 581 spin_unlock(&zone_scan_lock);
@@ -592,15 +587,14 @@ out:
592 * allocation attempts with zonelists containing them may now recall the OOM 587 * allocation attempts with zonelists containing them may now recall the OOM
593 * killer, if necessary. 588 * killer, if necessary.
594 */ 589 */
595void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) 590void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
596{ 591{
597 struct zoneref *z; 592 struct zoneref *z;
598 struct zone *zone; 593 struct zone *zone;
599 594
600 spin_lock(&zone_scan_lock); 595 spin_lock(&zone_scan_lock);
601 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 596 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
602 zone_clear_flag(zone, ZONE_OOM_LOCKED); 597 zone_clear_flag(zone, ZONE_OOM_LOCKED);
603 }
604 spin_unlock(&zone_scan_lock); 598 spin_unlock(&zone_scan_lock);
605} 599}
606 600
@@ -694,9 +688,9 @@ void pagefault_out_of_memory(void)
694 if (mem_cgroup_oom_synchronize(true)) 688 if (mem_cgroup_oom_synchronize(true))
695 return; 689 return;
696 690
697 zonelist = node_zonelist(first_online_node, GFP_KERNEL); 691 zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
698 if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { 692 if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
699 out_of_memory(NULL, 0, 0, NULL, false); 693 out_of_memory(NULL, 0, 0, NULL, false);
700 clear_zonelist_oom(zonelist, GFP_KERNEL); 694 oom_zonelist_unlock(zonelist, GFP_KERNEL);
701 } 695 }
702} 696}
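
oom_zonelist_trylock() above is an all-or-nothing trylock: under one spinlock it first verifies that no zone in the zonelist is already OOM-locked and only then marks them all, so two concurrent OOM kills can never claim overlapping zones. The same shape with a pthread mutex and a boolean per zone; struct zone_set is a stand-in for iterating a zonelist:

#include <pthread.h>
#include <stdbool.h>

struct zone_set {
    pthread_mutex_t scan_lock;  /* models zone_scan_lock */
    bool oom_locked[4];         /* models ZONE_OOM_LOCKED per zone */
    int nr;
};

static bool zone_set_oom_trylock(struct zone_set *zs)
{
    bool ret = true;
    int i;

    pthread_mutex_lock(&zs->scan_lock);
    for (i = 0; i < zs->nr; i++) {
        if (zs->oom_locked[i]) {
            ret = false;        /* someone else is already handling OOM here */
            goto out;
        }
    }
    for (i = 0; i < zs->nr; i++)
        zs->oom_locked[i] = true;
out:
    pthread_mutex_unlock(&zs->scan_lock);
    return ret;
}

static void zone_set_oom_unlock(struct zone_set *zs)
{
    int i;

    pthread_mutex_lock(&zs->scan_lock);
    for (i = 0; i < zs->nr; i++)
        zs->oom_locked[i] = false;
    pthread_mutex_unlock(&zs->scan_lock);
}
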
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e0c943014eb7..91d73ef1744d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -261,14 +261,11 @@ static unsigned long global_dirtyable_memory(void)
261 */ 261 */
262void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) 262void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
263{ 263{
264 const unsigned long available_memory = global_dirtyable_memory();
264 unsigned long background; 265 unsigned long background;
265 unsigned long dirty; 266 unsigned long dirty;
266 unsigned long uninitialized_var(available_memory);
267 struct task_struct *tsk; 267 struct task_struct *tsk;
268 268
269 if (!vm_dirty_bytes || !dirty_background_bytes)
270 available_memory = global_dirtyable_memory();
271
272 if (vm_dirty_bytes) 269 if (vm_dirty_bytes)
273 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 270 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
274 else 271 else
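
With the uninitialized_var() dance gone, global_dirty_limits() above computes the dirtyable-memory figure unconditionally and then derives each limit either from an absolute byte count or from a percentage of that figure. A sketch of the arithmetic; the ratio branch shown is the conventional fallback but is not visible in the hunk, so treat it as an assumption:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned long available_memory = 2UL << 20;  /* dirtyable pages, example */
    unsigned long vm_dirty_bytes = 0;            /* 0 means "use the ratio" */
    unsigned long vm_dirty_ratio = 20;           /* percent, example */
    unsigned long dirty;

    if (vm_dirty_bytes)
        dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
    else
        dirty = vm_dirty_ratio * available_memory / 100;

    printf("dirty limit: %lu pages\n", dirty);
    return 0;
}
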
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef44ad736ca1..18cee0d4c8a2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
680 int migratetype = 0; 680 int migratetype = 0;
681 int batch_free = 0; 681 int batch_free = 0;
682 int to_free = count; 682 int to_free = count;
683 unsigned long nr_scanned;
683 684
684 spin_lock(&zone->lock); 685 spin_lock(&zone->lock);
685 zone->pages_scanned = 0; 686 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
687 if (nr_scanned)
688 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
686 689
687 while (to_free) { 690 while (to_free) {
688 struct page *page; 691 struct page *page;
@@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone,
731 unsigned int order, 734 unsigned int order,
732 int migratetype) 735 int migratetype)
733{ 736{
737 unsigned long nr_scanned;
734 spin_lock(&zone->lock); 738 spin_lock(&zone->lock);
735 zone->pages_scanned = 0; 739 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
740 if (nr_scanned)
741 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
736 742
737 __free_one_page(page, pfn, zone, order, migratetype); 743 __free_one_page(page, pfn, zone, order, migratetype);
738 if (unlikely(!is_migrate_isolate(migratetype))) 744 if (unlikely(!is_migrate_isolate(migratetype)))
@@ -1257,15 +1263,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1257void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1263void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1258{ 1264{
1259 unsigned long flags; 1265 unsigned long flags;
1260 int to_drain; 1266 int to_drain, batch;
1261 unsigned long batch;
1262 1267
1263 local_irq_save(flags); 1268 local_irq_save(flags);
1264 batch = ACCESS_ONCE(pcp->batch); 1269 batch = ACCESS_ONCE(pcp->batch);
1265 if (pcp->count >= batch) 1270 to_drain = min(pcp->count, batch);
1266 to_drain = batch;
1267 else
1268 to_drain = pcp->count;
1269 if (to_drain > 0) { 1271 if (to_drain > 0) {
1270 free_pcppages_bulk(zone, to_drain, pcp); 1272 free_pcppages_bulk(zone, to_drain, pcp);
1271 pcp->count -= to_drain; 1273 pcp->count -= to_drain;
@@ -1610,6 +1612,9 @@ again:
1610 } 1612 }
1611 1613
1612 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1614 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1615 if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
1616 !zone_is_fair_depleted(zone))
1617 zone_set_flag(zone, ZONE_FAIR_DEPLETED);
1613 1618
1614 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1619 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1615 zone_statistics(preferred_zone, zone, gfp_flags); 1620 zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1712,7 +1717,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1712{ 1717{
1713 /* free_pages my go negative - that's OK */ 1718 /* free_pages my go negative - that's OK */
1714 long min = mark; 1719 long min = mark;
1715 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1716 int o; 1720 int o;
1717 long free_cma = 0; 1721 long free_cma = 0;
1718 1722
@@ -1727,7 +1731,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1727 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1731 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1728#endif 1732#endif
1729 1733
1730 if (free_pages - free_cma <= min + lowmem_reserve) 1734 if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
1731 return false; 1735 return false;
1732 for (o = 0; o < order; o++) { 1736 for (o = 0; o < order; o++) {
1733 /* At the next order, this order's pages become unavailable */ 1737 /* At the next order, this order's pages become unavailable */
@@ -1922,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1922 1926
1923#endif /* CONFIG_NUMA */ 1927#endif /* CONFIG_NUMA */
1924 1928
1929static void reset_alloc_batches(struct zone *preferred_zone)
1930{
1931 struct zone *zone = preferred_zone->zone_pgdat->node_zones;
1932
1933 do {
1934 mod_zone_page_state(zone, NR_ALLOC_BATCH,
1935 high_wmark_pages(zone) - low_wmark_pages(zone) -
1936 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
1937 zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
1938 } while (zone++ != preferred_zone);
1939}
1940
1925/* 1941/*
1926 * get_page_from_freelist goes through the zonelist trying to allocate 1942 * get_page_from_freelist goes through the zonelist trying to allocate
1927 * a page. 1943 * a page.
@@ -1939,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1939 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1955 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1940 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && 1956 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1941 (gfp_mask & __GFP_WRITE); 1957 (gfp_mask & __GFP_WRITE);
1958 int nr_fair_skipped = 0;
1959 bool zonelist_rescan;
1942 1960
1943zonelist_scan: 1961zonelist_scan:
1962 zonelist_rescan = false;
1963
1944 /* 1964 /*
1945 * Scan zonelist, looking for a zone with enough free. 1965 * Scan zonelist, looking for a zone with enough free.
1946 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1966 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1964,9 +1984,11 @@ zonelist_scan:
1964 */ 1984 */
1965 if (alloc_flags & ALLOC_FAIR) { 1985 if (alloc_flags & ALLOC_FAIR) {
1966 if (!zone_local(preferred_zone, zone)) 1986 if (!zone_local(preferred_zone, zone))
1987 break;
1988 if (zone_is_fair_depleted(zone)) {
1989 nr_fair_skipped++;
1967 continue; 1990 continue;
1968 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1991 }
1969 continue;
1970 } 1992 }
1971 /* 1993 /*
1972 * When allocating a page cache page for writing, we 1994 * When allocating a page cache page for writing, we
@@ -2072,13 +2094,7 @@ this_zone_full:
2072 zlc_mark_zone_full(zonelist, z); 2094 zlc_mark_zone_full(zonelist, z);
2073 } 2095 }
2074 2096
2075 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2097 if (page) {
2076 /* Disable zlc cache for second zonelist scan */
2077 zlc_active = 0;
2078 goto zonelist_scan;
2079 }
2080
2081 if (page)
2082 /* 2098 /*
2083 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2099 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2084 * necessary to allocate the page. The expectation is 2100 * necessary to allocate the page. The expectation is
@@ -2087,8 +2103,37 @@ this_zone_full:
2087 * for !PFMEMALLOC purposes. 2103 * for !PFMEMALLOC purposes.
2088 */ 2104 */
2089 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2105 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2106 return page;
2107 }
2090 2108
2091 return page; 2109 /*
2110 * The first pass makes sure allocations are spread fairly within the
2111 * local node. However, the local node might have free pages left
2112 * after the fairness batches are exhausted, and remote zones haven't
2113 * even been considered yet. Try once more without fairness, and
2114 * include remote zones now, before entering the slowpath and waking
2115 * kswapd: prefer spilling to a remote zone over swapping locally.
2116 */
2117 if (alloc_flags & ALLOC_FAIR) {
2118 alloc_flags &= ~ALLOC_FAIR;
2119 if (nr_fair_skipped) {
2120 zonelist_rescan = true;
2121 reset_alloc_batches(preferred_zone);
2122 }
2123 if (nr_online_nodes > 1)
2124 zonelist_rescan = true;
2125 }
2126
2127 if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
2128 /* Disable zlc cache for second zonelist scan */
2129 zlc_active = 0;
2130 zonelist_rescan = true;
2131 }
2132
2133 if (zonelist_rescan)
2134 goto zonelist_scan;
2135
2136 return NULL;
2092} 2137}
2093 2138
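
The restructured get_page_from_freelist() above folds the fairness retry into the function itself: the first pass stays on the local node and skips zones whose allocation batch is depleted; if that pass comes up empty, the batches are reset and a second pass runs with ALLOC_FAIR cleared, now considering remote zones, before the slow path wakes kswapd. A compact model of that two-pass loop; struct fz, alloc_from() and alloc_fair() are invented for the sketch:

#include <stdbool.h>
#include <stddef.h>

struct fz {
    bool local;             /* zone on the preferred (local) node? */
    long alloc_batch;       /* models NR_ALLOC_BATCH */
    long free_pages;
};

static void *alloc_from(struct fz *z)
{
    if (z->free_pages > 0) {
        z->free_pages--;
        return z;           /* any non-NULL token will do for the sketch */
    }
    return NULL;
}

static void *alloc_fair(struct fz *zones, size_t nr, long batch_reset)
{
    bool fair = true;
    size_t i;
    void *page;

rescan:
    for (i = 0; i < nr; i++) {
        struct fz *z = &zones[i];

        if (fair) {
            if (!z->local)
                break;              /* pass 1: local node only */
            if (z->alloc_batch <= 0)
                continue;           /* fair share used up, skip */
        }
        page = alloc_from(z);
        if (page) {
            if (fair)
                z->alloc_batch--;
            return page;
        }
    }

    if (fair) {
        /* Pass 2: drop the fairness constraint, replenish the batches
         * and include remote zones before giving up. */
        fair = false;
        for (i = 0; i < nr; i++)
            zones[i].alloc_batch = batch_reset;
        goto rescan;
    }
    return NULL;
}
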
2094/* 2139/*
@@ -2201,8 +2246,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2201{ 2246{
2202 struct page *page; 2247 struct page *page;
2203 2248
2204 /* Acquire the OOM killer lock for the zones in zonelist */ 2249 /* Acquire the per-zone oom lock for each zone */
2205 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2250 if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
2206 schedule_timeout_uninterruptible(1); 2251 schedule_timeout_uninterruptible(1);
2207 return NULL; 2252 return NULL;
2208 } 2253 }
@@ -2240,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2240 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2285 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2241 2286
2242out: 2287out:
2243 clear_zonelist_oom(zonelist, gfp_mask); 2288 oom_zonelist_unlock(zonelist, gfp_mask);
2244 return page; 2289 return page;
2245} 2290}
2246 2291
@@ -2409,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2409 return page; 2454 return page;
2410} 2455}
2411 2456
2412static void reset_alloc_batches(struct zonelist *zonelist,
2413 enum zone_type high_zoneidx,
2414 struct zone *preferred_zone)
2415{
2416 struct zoneref *z;
2417 struct zone *zone;
2418
2419 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2420 /*
2421 * Only reset the batches of zones that were actually
2422 * considered in the fairness pass, we don't want to
2423 * trash fairness information for zones that are not
2424 * actually part of this zonelist's round-robin cycle.
2425 */
2426 if (!zone_local(preferred_zone, zone))
2427 continue;
2428 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2429 high_wmark_pages(zone) - low_wmark_pages(zone) -
2430 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2431 }
2432}
2433
2434static void wake_all_kswapds(unsigned int order, 2457static void wake_all_kswapds(unsigned int order,
2435 struct zonelist *zonelist, 2458 struct zonelist *zonelist,
2436 enum zone_type high_zoneidx, 2459 enum zone_type high_zoneidx,
@@ -2616,14 +2639,6 @@ rebalance:
2616 goto got_pg; 2639 goto got_pg;
2617 2640
2618 /* 2641 /*
2619 * It can become very expensive to allocate transparent hugepages at
2620 * fault, so use asynchronous memory compaction for THP unless it is
2621 * khugepaged trying to collapse.
2622 */
2623 if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
2624 migration_mode = MIGRATE_SYNC_LIGHT;
2625
2626 /*
2627 * If compaction is deferred for high-order allocations, it is because 2642 * If compaction is deferred for high-order allocations, it is because
2628 * sync compaction recently failed. In this is the case and the caller 2643 * sync compaction recently failed. In this is the case and the caller
2629 * requested a movable allocation that does not heavily disrupt the 2644 * requested a movable allocation that does not heavily disrupt the
@@ -2633,6 +2648,15 @@ rebalance:
2633 (gfp_mask & __GFP_NO_KSWAPD)) 2648 (gfp_mask & __GFP_NO_KSWAPD))
2634 goto nopage; 2649 goto nopage;
2635 2650
2651 /*
2652 * It can become very expensive to allocate transparent hugepages at
2653 * fault, so use asynchronous memory compaction for THP unless it is
2654 * khugepaged trying to collapse.
2655 */
2656 if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
2657 (current->flags & PF_KTHREAD))
2658 migration_mode = MIGRATE_SYNC_LIGHT;
2659
2636 /* Try direct reclaim and then allocating */ 2660 /* Try direct reclaim and then allocating */
2637 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2661 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2638 zonelist, high_zoneidx, 2662 zonelist, high_zoneidx,
@@ -2766,29 +2790,12 @@ retry_cpuset:
2766 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2790 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2767 alloc_flags |= ALLOC_CMA; 2791 alloc_flags |= ALLOC_CMA;
2768#endif 2792#endif
2769retry:
2770 /* First allocation attempt */ 2793 /* First allocation attempt */
2771 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2794 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2772 zonelist, high_zoneidx, alloc_flags, 2795 zonelist, high_zoneidx, alloc_flags,
2773 preferred_zone, classzone_idx, migratetype); 2796 preferred_zone, classzone_idx, migratetype);
2774 if (unlikely(!page)) { 2797 if (unlikely(!page)) {
2775 /* 2798 /*
2776 * The first pass makes sure allocations are spread
2777 * fairly within the local node. However, the local
2778 * node might have free pages left after the fairness
2779 * batches are exhausted, and remote zones haven't
2780 * even been considered yet. Try once more without
2781 * fairness, and include remote zones now, before
2782 * entering the slowpath and waking kswapd: prefer
2783 * spilling to a remote zone over swapping locally.
2784 */
2785 if (alloc_flags & ALLOC_FAIR) {
2786 reset_alloc_batches(zonelist, high_zoneidx,
2787 preferred_zone);
2788 alloc_flags &= ~ALLOC_FAIR;
2789 goto retry;
2790 }
2791 /*
2792 * Runtime PM, block IO and its error handling path 2799 * Runtime PM, block IO and its error handling path
2793 * can deadlock because I/O on the device might not 2800 * can deadlock because I/O on the device might not
2794 * complete. 2801 * complete.
@@ -2962,7 +2969,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
2962 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2969 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2963 * but is not exact. 2970 * but is not exact.
2964 */ 2971 */
2965void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2972void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2966{ 2973{
2967 unsigned order = get_order(size); 2974 unsigned order = get_order(size);
2968 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2975 struct page *p = alloc_pages_node(nid, gfp_mask, order);
@@ -2970,7 +2977,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2970 return NULL; 2977 return NULL;
2971 return make_alloc_exact((unsigned long)page_address(p), order, size); 2978 return make_alloc_exact((unsigned long)page_address(p), order, size);
2972} 2979}
2973EXPORT_SYMBOL(alloc_pages_exact_nid);
2974 2980
2975/** 2981/**
2976 * free_pages_exact - release memory allocated via alloc_pages_exact() 2982 * free_pages_exact - release memory allocated via alloc_pages_exact()
@@ -3052,7 +3058,7 @@ static inline void show_node(struct zone *zone)
3052void si_meminfo(struct sysinfo *val) 3058void si_meminfo(struct sysinfo *val)
3053{ 3059{
3054 val->totalram = totalram_pages; 3060 val->totalram = totalram_pages;
3055 val->sharedram = 0; 3061 val->sharedram = global_page_state(NR_SHMEM);
3056 val->freeram = global_page_state(NR_FREE_PAGES); 3062 val->freeram = global_page_state(NR_FREE_PAGES);
3057 val->bufferram = nr_blockdev_pages(); 3063 val->bufferram = nr_blockdev_pages();
3058 val->totalhigh = totalhigh_pages; 3064 val->totalhigh = totalhigh_pages;
@@ -3072,6 +3078,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
3072 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3078 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3073 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3079 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3074 val->totalram = managed_pages; 3080 val->totalram = managed_pages;
3081 val->sharedram = node_page_state(nid, NR_SHMEM);
3075 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3082 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3076#ifdef CONFIG_HIGHMEM 3083#ifdef CONFIG_HIGHMEM
3077 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3084 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3253,12 +3260,12 @@ void show_free_areas(unsigned int filter)
3253 K(zone_page_state(zone, NR_BOUNCE)), 3260 K(zone_page_state(zone, NR_BOUNCE)),
3254 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3261 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3255 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3262 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3256 zone->pages_scanned, 3263 K(zone_page_state(zone, NR_PAGES_SCANNED)),
3257 (!zone_reclaimable(zone) ? "yes" : "no") 3264 (!zone_reclaimable(zone) ? "yes" : "no")
3258 ); 3265 );
3259 printk("lowmem_reserve[]:"); 3266 printk("lowmem_reserve[]:");
3260 for (i = 0; i < MAX_NR_ZONES; i++) 3267 for (i = 0; i < MAX_NR_ZONES; i++)
3261 printk(" %lu", zone->lowmem_reserve[i]); 3268 printk(" %ld", zone->lowmem_reserve[i]);
3262 printk("\n"); 3269 printk("\n");
3263 } 3270 }
3264 3271
@@ -5579,7 +5586,7 @@ static void calculate_totalreserve_pages(void)
5579 for_each_online_pgdat(pgdat) { 5586 for_each_online_pgdat(pgdat) {
5580 for (i = 0; i < MAX_NR_ZONES; i++) { 5587 for (i = 0; i < MAX_NR_ZONES; i++) {
5581 struct zone *zone = pgdat->node_zones + i; 5588 struct zone *zone = pgdat->node_zones + i;
5582 unsigned long max = 0; 5589 long max = 0;
5583 5590
5584 /* Find valid and maximum lowmem_reserve in the zone */ 5591 /* Find valid and maximum lowmem_reserve in the zone */
5585 for (j = i; j < MAX_NR_ZONES; j++) { 5592 for (j = i; j < MAX_NR_ZONES; j++) {
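Editor's note on the page_alloc.c hunks: the __alloc_pages_slowpath change keys the compaction mode off the gfp mask and the caller, so THP page faults keep asynchronous compaction while khugepaged (a kernel thread) and non-THP allocations may fall back to MIGRATE_SYNC_LIGHT. The sketch below merely restates that test; the helper name is made up and is not part of the patch.

#include <linux/gfp.h>
#include <linux/sched.h>

/*
 * Hypothetical restatement of the check added in __alloc_pages_slowpath:
 * stay with asynchronous compaction only for a THP allocation coming from
 * a user page fault; khugepaged runs with PF_KTHREAD set and is allowed
 * to use MIGRATE_SYNC_LIGHT.
 */
static bool thp_fault_wants_async_compaction(gfp_t gfp_mask)
{
	if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE)
		return false;		/* not a THP allocation at all */
	if (current->flags & PF_KTHREAD)
		return false;		/* khugepaged collapse path */
	return true;			/* THP fault: keep compaction async */
}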
diff --git a/mm/readahead.c b/mm/readahead.c
index 0ca36a7770b1..17b9172ec37f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -326,7 +326,6 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
326 * - thrashing threshold in memory tight systems 326 * - thrashing threshold in memory tight systems
327 */ 327 */
328static pgoff_t count_history_pages(struct address_space *mapping, 328static pgoff_t count_history_pages(struct address_space *mapping,
329 struct file_ra_state *ra,
330 pgoff_t offset, unsigned long max) 329 pgoff_t offset, unsigned long max)
331{ 330{
332 pgoff_t head; 331 pgoff_t head;
@@ -349,7 +348,7 @@ static int try_context_readahead(struct address_space *mapping,
349{ 348{
350 pgoff_t size; 349 pgoff_t size;
351 350
352 size = count_history_pages(mapping, ra, offset, max); 351 size = count_history_pages(mapping, offset, max);
353 352
354 /* 353 /*
355 * not enough history pages: 354 * not enough history pages:
diff --git a/mm/shmem.c b/mm/shmem.c
index af68b15a8fc1..302d1cf7ad07 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -149,6 +149,19 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
149 vm_unacct_memory(VM_ACCT(size)); 149 vm_unacct_memory(VM_ACCT(size));
150} 150}
151 151
152static inline int shmem_reacct_size(unsigned long flags,
153 loff_t oldsize, loff_t newsize)
154{
155 if (!(flags & VM_NORESERVE)) {
156 if (VM_ACCT(newsize) > VM_ACCT(oldsize))
157 return security_vm_enough_memory_mm(current->mm,
158 VM_ACCT(newsize) - VM_ACCT(oldsize));
159 else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
160 vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
161 }
162 return 0;
163}
164
152/* 165/*
153 * ... whereas tmpfs objects are accounted incrementally as 166 * ... whereas tmpfs objects are accounted incrementally as
154 * pages are allocated, in order to allow huge sparse files. 167 * pages are allocated, in order to allow huge sparse files.
@@ -280,7 +293,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
280 */ 293 */
281static int shmem_add_to_page_cache(struct page *page, 294static int shmem_add_to_page_cache(struct page *page,
282 struct address_space *mapping, 295 struct address_space *mapping,
283 pgoff_t index, gfp_t gfp, void *expected) 296 pgoff_t index, void *expected)
284{ 297{
285 int error; 298 int error;
286 299
@@ -549,6 +562,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
549 loff_t newsize = attr->ia_size; 562 loff_t newsize = attr->ia_size;
550 563
551 if (newsize != oldsize) { 564 if (newsize != oldsize) {
565 error = shmem_reacct_size(SHMEM_I(inode)->flags,
566 oldsize, newsize);
567 if (error)
568 return error;
552 i_size_write(inode, newsize); 569 i_size_write(inode, newsize);
553 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 570 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
554 } 571 }
@@ -649,7 +666,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
649 */ 666 */
650 if (!error) 667 if (!error)
651 error = shmem_add_to_page_cache(*pagep, mapping, index, 668 error = shmem_add_to_page_cache(*pagep, mapping, index,
652 GFP_NOWAIT, radswap); 669 radswap);
653 if (error != -ENOMEM) { 670 if (error != -ENOMEM) {
654 /* 671 /*
655 * Truncation and eviction use free_swap_and_cache(), which 672 * Truncation and eviction use free_swap_and_cache(), which
@@ -1095,7 +1112,7 @@ repeat:
1095 gfp & GFP_RECLAIM_MASK); 1112 gfp & GFP_RECLAIM_MASK);
1096 if (!error) { 1113 if (!error) {
1097 error = shmem_add_to_page_cache(page, mapping, index, 1114 error = shmem_add_to_page_cache(page, mapping, index,
1098 gfp, swp_to_radix_entry(swap)); 1115 swp_to_radix_entry(swap));
1099 /* 1116 /*
1100 * We already confirmed swap under page lock, and make 1117 * We already confirmed swap under page lock, and make
1101 * no memory allocation here, so usually no possibility 1118 * no memory allocation here, so usually no possibility
@@ -1149,7 +1166,7 @@ repeat:
1149 __SetPageSwapBacked(page); 1166 __SetPageSwapBacked(page);
1150 __set_page_locked(page); 1167 __set_page_locked(page);
1151 if (sgp == SGP_WRITE) 1168 if (sgp == SGP_WRITE)
1152 init_page_accessed(page); 1169 __SetPageReferenced(page);
1153 1170
1154 error = mem_cgroup_charge_file(page, current->mm, 1171 error = mem_cgroup_charge_file(page, current->mm,
1155 gfp & GFP_RECLAIM_MASK); 1172 gfp & GFP_RECLAIM_MASK);
@@ -1158,7 +1175,7 @@ repeat:
1158 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); 1175 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
1159 if (!error) { 1176 if (!error) {
1160 error = shmem_add_to_page_cache(page, mapping, index, 1177 error = shmem_add_to_page_cache(page, mapping, index,
1161 gfp, NULL); 1178 NULL);
1162 radix_tree_preload_end(); 1179 radix_tree_preload_end();
1163 } 1180 }
1164 if (error) { 1181 if (error) {
@@ -2932,16 +2949,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
2932 this.len = strlen(name); 2949 this.len = strlen(name);
2933 this.hash = 0; /* will go */ 2950 this.hash = 0; /* will go */
2934 sb = shm_mnt->mnt_sb; 2951 sb = shm_mnt->mnt_sb;
2952 path.mnt = mntget(shm_mnt);
2935 path.dentry = d_alloc_pseudo(sb, &this); 2953 path.dentry = d_alloc_pseudo(sb, &this);
2936 if (!path.dentry) 2954 if (!path.dentry)
2937 goto put_memory; 2955 goto put_memory;
2938 d_set_d_op(path.dentry, &anon_ops); 2956 d_set_d_op(path.dentry, &anon_ops);
2939 path.mnt = mntget(shm_mnt);
2940 2957
2941 res = ERR_PTR(-ENOSPC); 2958 res = ERR_PTR(-ENOSPC);
2942 inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); 2959 inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2943 if (!inode) 2960 if (!inode)
2944 goto put_dentry; 2961 goto put_memory;
2945 2962
2946 inode->i_flags |= i_flags; 2963 inode->i_flags |= i_flags;
2947 d_instantiate(path.dentry, inode); 2964 d_instantiate(path.dentry, inode);
@@ -2949,19 +2966,19 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
2949 clear_nlink(inode); /* It is unlinked */ 2966 clear_nlink(inode); /* It is unlinked */
2950 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); 2967 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
2951 if (IS_ERR(res)) 2968 if (IS_ERR(res))
2952 goto put_dentry; 2969 goto put_path;
2953 2970
2954 res = alloc_file(&path, FMODE_WRITE | FMODE_READ, 2971 res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2955 &shmem_file_operations); 2972 &shmem_file_operations);
2956 if (IS_ERR(res)) 2973 if (IS_ERR(res))
2957 goto put_dentry; 2974 goto put_path;
2958 2975
2959 return res; 2976 return res;
2960 2977
2961put_dentry:
2962 path_put(&path);
2963put_memory: 2978put_memory:
2964 shmem_unacct_size(flags, size); 2979 shmem_unacct_size(flags, size);
2980put_path:
2981 path_put(&path);
2965 return res; 2982 return res;
2966} 2983}
2967 2984
diff --git a/mm/slab.c b/mm/slab.c
index 3070b929a1bf..2e60bf3dedbb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -191,7 +191,6 @@ struct array_cache {
191 unsigned int limit; 191 unsigned int limit;
192 unsigned int batchcount; 192 unsigned int batchcount;
193 unsigned int touched; 193 unsigned int touched;
194 spinlock_t lock;
195 void *entry[]; /* 194 void *entry[]; /*
196 * Must have this definition in here for the proper 195 * Must have this definition in here for the proper
197 * alignment of array_cache. Also simplifies accessing 196 * alignment of array_cache. Also simplifies accessing
@@ -203,6 +202,11 @@ struct array_cache {
203 */ 202 */
204}; 203};
205 204
205struct alien_cache {
206 spinlock_t lock;
207 struct array_cache ac;
208};
209
206#define SLAB_OBJ_PFMEMALLOC 1 210#define SLAB_OBJ_PFMEMALLOC 1
207static inline bool is_obj_pfmemalloc(void *objp) 211static inline bool is_obj_pfmemalloc(void *objp)
208{ 212{
@@ -242,7 +246,8 @@ static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
242static int drain_freelist(struct kmem_cache *cache, 246static int drain_freelist(struct kmem_cache *cache,
243 struct kmem_cache_node *n, int tofree); 247 struct kmem_cache_node *n, int tofree);
244static void free_block(struct kmem_cache *cachep, void **objpp, int len, 248static void free_block(struct kmem_cache *cachep, void **objpp, int len,
245 int node); 249 int node, struct list_head *list);
250static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
246static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); 251static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
247static void cache_reap(struct work_struct *unused); 252static void cache_reap(struct work_struct *unused);
248 253
@@ -267,7 +272,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
267#define MAKE_LIST(cachep, listp, slab, nodeid) \ 272#define MAKE_LIST(cachep, listp, slab, nodeid) \
268 do { \ 273 do { \
269 INIT_LIST_HEAD(listp); \ 274 INIT_LIST_HEAD(listp); \
270 list_splice(&(cachep->node[nodeid]->slab), listp); \ 275 list_splice(&get_node(cachep, nodeid)->slab, listp); \
271 } while (0) 276 } while (0)
272 277
273#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 278#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
@@ -465,143 +470,6 @@ static struct kmem_cache kmem_cache_boot = {
465 .name = "kmem_cache", 470 .name = "kmem_cache",
466}; 471};
467 472
468#define BAD_ALIEN_MAGIC 0x01020304ul
469
470#ifdef CONFIG_LOCKDEP
471
472/*
473 * Slab sometimes uses the kmalloc slabs to store the slab headers
474 * for other slabs "off slab".
475 * The locking for this is tricky in that it nests within the locks
476 * of all other slabs in a few places; to deal with this special
477 * locking we put on-slab caches into a separate lock-class.
478 *
479 * We set lock class for alien array caches which are up during init.
480 * The lock annotation will be lost if all cpus of a node goes down and
481 * then comes back up during hotplug
482 */
483static struct lock_class_key on_slab_l3_key;
484static struct lock_class_key on_slab_alc_key;
485
486static struct lock_class_key debugobj_l3_key;
487static struct lock_class_key debugobj_alc_key;
488
489static void slab_set_lock_classes(struct kmem_cache *cachep,
490 struct lock_class_key *l3_key, struct lock_class_key *alc_key,
491 int q)
492{
493 struct array_cache **alc;
494 struct kmem_cache_node *n;
495 int r;
496
497 n = cachep->node[q];
498 if (!n)
499 return;
500
501 lockdep_set_class(&n->list_lock, l3_key);
502 alc = n->alien;
503 /*
504 * FIXME: This check for BAD_ALIEN_MAGIC
505 * should go away when common slab code is taught to
506 * work even without alien caches.
507 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
508 * for alloc_alien_cache,
509 */
510 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
511 return;
512 for_each_node(r) {
513 if (alc[r])
514 lockdep_set_class(&alc[r]->lock, alc_key);
515 }
516}
517
518static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
519{
520 slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
521}
522
523static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
524{
525 int node;
526
527 for_each_online_node(node)
528 slab_set_debugobj_lock_classes_node(cachep, node);
529}
530
531static void init_node_lock_keys(int q)
532{
533 int i;
534
535 if (slab_state < UP)
536 return;
537
538 for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
539 struct kmem_cache_node *n;
540 struct kmem_cache *cache = kmalloc_caches[i];
541
542 if (!cache)
543 continue;
544
545 n = cache->node[q];
546 if (!n || OFF_SLAB(cache))
547 continue;
548
549 slab_set_lock_classes(cache, &on_slab_l3_key,
550 &on_slab_alc_key, q);
551 }
552}
553
554static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
555{
556 if (!cachep->node[q])
557 return;
558
559 slab_set_lock_classes(cachep, &on_slab_l3_key,
560 &on_slab_alc_key, q);
561}
562
563static inline void on_slab_lock_classes(struct kmem_cache *cachep)
564{
565 int node;
566
567 VM_BUG_ON(OFF_SLAB(cachep));
568 for_each_node(node)
569 on_slab_lock_classes_node(cachep, node);
570}
571
572static inline void init_lock_keys(void)
573{
574 int node;
575
576 for_each_node(node)
577 init_node_lock_keys(node);
578}
579#else
580static void init_node_lock_keys(int q)
581{
582}
583
584static inline void init_lock_keys(void)
585{
586}
587
588static inline void on_slab_lock_classes(struct kmem_cache *cachep)
589{
590}
591
592static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
593{
594}
595
596static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
597{
598}
599
600static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
601{
602}
603#endif
604
605static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 473static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
606 474
607static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 475static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -792,13 +660,8 @@ static void start_cpu_timer(int cpu)
792 } 660 }
793} 661}
794 662
795static struct array_cache *alloc_arraycache(int node, int entries, 663static void init_arraycache(struct array_cache *ac, int limit, int batch)
796 int batchcount, gfp_t gfp)
797{ 664{
798 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
799 struct array_cache *nc = NULL;
800
801 nc = kmalloc_node(memsize, gfp, node);
802 /* 665 /*
803 * The array_cache structures contain pointers to free object. 666 * The array_cache structures contain pointers to free object.
804 * However, when such objects are allocated or transferred to another 667 * However, when such objects are allocated or transferred to another
@@ -806,15 +669,24 @@ static struct array_cache *alloc_arraycache(int node, int entries,
806 * valid references during a kmemleak scan. Therefore, kmemleak must 669 * valid references during a kmemleak scan. Therefore, kmemleak must
807 * not scan such objects. 670 * not scan such objects.
808 */ 671 */
809 kmemleak_no_scan(nc); 672 kmemleak_no_scan(ac);
810 if (nc) { 673 if (ac) {
811 nc->avail = 0; 674 ac->avail = 0;
812 nc->limit = entries; 675 ac->limit = limit;
813 nc->batchcount = batchcount; 676 ac->batchcount = batch;
814 nc->touched = 0; 677 ac->touched = 0;
815 spin_lock_init(&nc->lock);
816 } 678 }
817 return nc; 679}
680
681static struct array_cache *alloc_arraycache(int node, int entries,
682 int batchcount, gfp_t gfp)
683{
684 size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
685 struct array_cache *ac = NULL;
686
687 ac = kmalloc_node(memsize, gfp, node);
688 init_arraycache(ac, entries, batchcount);
689 return ac;
818} 690}
819 691
820static inline bool is_slab_pfmemalloc(struct page *page) 692static inline bool is_slab_pfmemalloc(struct page *page)
@@ -826,7 +698,7 @@ static inline bool is_slab_pfmemalloc(struct page *page)
826static void recheck_pfmemalloc_active(struct kmem_cache *cachep, 698static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
827 struct array_cache *ac) 699 struct array_cache *ac)
828{ 700{
829 struct kmem_cache_node *n = cachep->node[numa_mem_id()]; 701 struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
830 struct page *page; 702 struct page *page;
831 unsigned long flags; 703 unsigned long flags;
832 704
@@ -881,7 +753,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
881 * If there are empty slabs on the slabs_free list and we are 753 * If there are empty slabs on the slabs_free list and we are
882 * being forced to refill the cache, mark this one !pfmemalloc. 754 * being forced to refill the cache, mark this one !pfmemalloc.
883 */ 755 */
884 n = cachep->node[numa_mem_id()]; 756 n = get_node(cachep, numa_mem_id());
885 if (!list_empty(&n->slabs_free) && force_refill) { 757 if (!list_empty(&n->slabs_free) && force_refill) {
886 struct page *page = virt_to_head_page(objp); 758 struct page *page = virt_to_head_page(objp);
887 ClearPageSlabPfmemalloc(page); 759 ClearPageSlabPfmemalloc(page);
@@ -961,12 +833,13 @@ static int transfer_objects(struct array_cache *to,
961#define drain_alien_cache(cachep, alien) do { } while (0) 833#define drain_alien_cache(cachep, alien) do { } while (0)
962#define reap_alien(cachep, n) do { } while (0) 834#define reap_alien(cachep, n) do { } while (0)
963 835
964static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 836static inline struct alien_cache **alloc_alien_cache(int node,
837 int limit, gfp_t gfp)
965{ 838{
966 return (struct array_cache **)BAD_ALIEN_MAGIC; 839 return NULL;
967} 840}
968 841
969static inline void free_alien_cache(struct array_cache **ac_ptr) 842static inline void free_alien_cache(struct alien_cache **ac_ptr)
970{ 843{
971} 844}
972 845
@@ -992,46 +865,60 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
992static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 865static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
993static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 866static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
994 867
995static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 868static struct alien_cache *__alloc_alien_cache(int node, int entries,
869 int batch, gfp_t gfp)
870{
871 size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache);
872 struct alien_cache *alc = NULL;
873
874 alc = kmalloc_node(memsize, gfp, node);
875 init_arraycache(&alc->ac, entries, batch);
876 spin_lock_init(&alc->lock);
877 return alc;
878}
879
880static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
996{ 881{
997 struct array_cache **ac_ptr; 882 struct alien_cache **alc_ptr;
998 int memsize = sizeof(void *) * nr_node_ids; 883 size_t memsize = sizeof(void *) * nr_node_ids;
999 int i; 884 int i;
1000 885
1001 if (limit > 1) 886 if (limit > 1)
1002 limit = 12; 887 limit = 12;
1003 ac_ptr = kzalloc_node(memsize, gfp, node); 888 alc_ptr = kzalloc_node(memsize, gfp, node);
1004 if (ac_ptr) { 889 if (!alc_ptr)
1005 for_each_node(i) { 890 return NULL;
1006 if (i == node || !node_online(i)) 891
1007 continue; 892 for_each_node(i) {
1008 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); 893 if (i == node || !node_online(i))
1009 if (!ac_ptr[i]) { 894 continue;
1010 for (i--; i >= 0; i--) 895 alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
1011 kfree(ac_ptr[i]); 896 if (!alc_ptr[i]) {
1012 kfree(ac_ptr); 897 for (i--; i >= 0; i--)
1013 return NULL; 898 kfree(alc_ptr[i]);
1014 } 899 kfree(alc_ptr);
900 return NULL;
1015 } 901 }
1016 } 902 }
1017 return ac_ptr; 903 return alc_ptr;
1018} 904}
1019 905
1020static void free_alien_cache(struct array_cache **ac_ptr) 906static void free_alien_cache(struct alien_cache **alc_ptr)
1021{ 907{
1022 int i; 908 int i;
1023 909
1024 if (!ac_ptr) 910 if (!alc_ptr)
1025 return; 911 return;
1026 for_each_node(i) 912 for_each_node(i)
1027 kfree(ac_ptr[i]); 913 kfree(alc_ptr[i]);
1028 kfree(ac_ptr); 914 kfree(alc_ptr);
1029} 915}
1030 916
1031static void __drain_alien_cache(struct kmem_cache *cachep, 917static void __drain_alien_cache(struct kmem_cache *cachep,
1032 struct array_cache *ac, int node) 918 struct array_cache *ac, int node,
919 struct list_head *list)
1033{ 920{
1034 struct kmem_cache_node *n = cachep->node[node]; 921 struct kmem_cache_node *n = get_node(cachep, node);
1035 922
1036 if (ac->avail) { 923 if (ac->avail) {
1037 spin_lock(&n->list_lock); 924 spin_lock(&n->list_lock);
@@ -1043,7 +930,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1043 if (n->shared) 930 if (n->shared)
1044 transfer_objects(n->shared, ac, ac->limit); 931 transfer_objects(n->shared, ac, ac->limit);
1045 932
1046 free_block(cachep, ac->entry, ac->avail, node); 933 free_block(cachep, ac->entry, ac->avail, node, list);
1047 ac->avail = 0; 934 ac->avail = 0;
1048 spin_unlock(&n->list_lock); 935 spin_unlock(&n->list_lock);
1049 } 936 }
@@ -1057,28 +944,40 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
1057 int node = __this_cpu_read(slab_reap_node); 944 int node = __this_cpu_read(slab_reap_node);
1058 945
1059 if (n->alien) { 946 if (n->alien) {
1060 struct array_cache *ac = n->alien[node]; 947 struct alien_cache *alc = n->alien[node];
948 struct array_cache *ac;
949
950 if (alc) {
951 ac = &alc->ac;
952 if (ac->avail && spin_trylock_irq(&alc->lock)) {
953 LIST_HEAD(list);
1061 954
1062 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { 955 __drain_alien_cache(cachep, ac, node, &list);
1063 __drain_alien_cache(cachep, ac, node); 956 spin_unlock_irq(&alc->lock);
1064 spin_unlock_irq(&ac->lock); 957 slabs_destroy(cachep, &list);
958 }
1065 } 959 }
1066 } 960 }
1067} 961}
1068 962
1069static void drain_alien_cache(struct kmem_cache *cachep, 963static void drain_alien_cache(struct kmem_cache *cachep,
1070 struct array_cache **alien) 964 struct alien_cache **alien)
1071{ 965{
1072 int i = 0; 966 int i = 0;
967 struct alien_cache *alc;
1073 struct array_cache *ac; 968 struct array_cache *ac;
1074 unsigned long flags; 969 unsigned long flags;
1075 970
1076 for_each_online_node(i) { 971 for_each_online_node(i) {
1077 ac = alien[i]; 972 alc = alien[i];
1078 if (ac) { 973 if (alc) {
1079 spin_lock_irqsave(&ac->lock, flags); 974 LIST_HEAD(list);
1080 __drain_alien_cache(cachep, ac, i); 975
1081 spin_unlock_irqrestore(&ac->lock, flags); 976 ac = &alc->ac;
977 spin_lock_irqsave(&alc->lock, flags);
978 __drain_alien_cache(cachep, ac, i, &list);
979 spin_unlock_irqrestore(&alc->lock, flags);
980 slabs_destroy(cachep, &list);
1082 } 981 }
1083 } 982 }
1084} 983}
@@ -1087,8 +986,10 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1087{ 986{
1088 int nodeid = page_to_nid(virt_to_page(objp)); 987 int nodeid = page_to_nid(virt_to_page(objp));
1089 struct kmem_cache_node *n; 988 struct kmem_cache_node *n;
1090 struct array_cache *alien = NULL; 989 struct alien_cache *alien = NULL;
990 struct array_cache *ac;
1091 int node; 991 int node;
992 LIST_HEAD(list);
1092 993
1093 node = numa_mem_id(); 994 node = numa_mem_id();
1094 995
@@ -1099,21 +1000,25 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1099 if (likely(nodeid == node)) 1000 if (likely(nodeid == node))
1100 return 0; 1001 return 0;
1101 1002
1102 n = cachep->node[node]; 1003 n = get_node(cachep, node);
1103 STATS_INC_NODEFREES(cachep); 1004 STATS_INC_NODEFREES(cachep);
1104 if (n->alien && n->alien[nodeid]) { 1005 if (n->alien && n->alien[nodeid]) {
1105 alien = n->alien[nodeid]; 1006 alien = n->alien[nodeid];
1007 ac = &alien->ac;
1106 spin_lock(&alien->lock); 1008 spin_lock(&alien->lock);
1107 if (unlikely(alien->avail == alien->limit)) { 1009 if (unlikely(ac->avail == ac->limit)) {
1108 STATS_INC_ACOVERFLOW(cachep); 1010 STATS_INC_ACOVERFLOW(cachep);
1109 __drain_alien_cache(cachep, alien, nodeid); 1011 __drain_alien_cache(cachep, ac, nodeid, &list);
1110 } 1012 }
1111 ac_put_obj(cachep, alien, objp); 1013 ac_put_obj(cachep, ac, objp);
1112 spin_unlock(&alien->lock); 1014 spin_unlock(&alien->lock);
1015 slabs_destroy(cachep, &list);
1113 } else { 1016 } else {
1114 spin_lock(&(cachep->node[nodeid])->list_lock); 1017 n = get_node(cachep, nodeid);
1115 free_block(cachep, &objp, 1, nodeid); 1018 spin_lock(&n->list_lock);
1116 spin_unlock(&(cachep->node[nodeid])->list_lock); 1019 free_block(cachep, &objp, 1, nodeid, &list);
1020 spin_unlock(&n->list_lock);
1021 slabs_destroy(cachep, &list);
1117 } 1022 }
1118 return 1; 1023 return 1;
1119} 1024}
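Editor's note on the alien-cache hunks above: the spinlock has moved from struct array_cache into the new struct alien_cache, so remote-node frees take alc->lock around the embedded array_cache and defer the actual page frees until after the lock is dropped. The sketch below condenses the steps cache_free_alien() performs; demo_free_to_alien() is a made-up wrapper, and the helpers it calls are file-local to mm/slab.c.

/*
 * Sketch of the new remote-free path, under the assumptions stated above:
 * alc->lock protects the embedded array_cache, and any slabs detached by
 * __drain_alien_cache() are destroyed only after the lock is released.
 */
static void demo_free_to_alien(struct kmem_cache *cachep,
			       struct alien_cache *alc, void *objp, int nodeid)
{
	struct array_cache *ac = &alc->ac;
	LIST_HEAD(list);

	spin_lock(&alc->lock);			/* lock now lives in alien_cache */
	if (unlikely(ac->avail == ac->limit))
		__drain_alien_cache(cachep, ac, nodeid, &list);
	ac_put_obj(cachep, ac, objp);
	spin_unlock(&alc->lock);

	slabs_destroy(cachep, &list);		/* no spinlock held here */
}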
@@ -1132,7 +1037,7 @@ static int init_cache_node_node(int node)
1132{ 1037{
1133 struct kmem_cache *cachep; 1038 struct kmem_cache *cachep;
1134 struct kmem_cache_node *n; 1039 struct kmem_cache_node *n;
1135 const int memsize = sizeof(struct kmem_cache_node); 1040 const size_t memsize = sizeof(struct kmem_cache_node);
1136 1041
1137 list_for_each_entry(cachep, &slab_caches, list) { 1042 list_for_each_entry(cachep, &slab_caches, list) {
1138 /* 1043 /*
@@ -1140,7 +1045,8 @@ static int init_cache_node_node(int node)
1140 * begin anything. Make sure some other cpu on this 1045 * begin anything. Make sure some other cpu on this
1141 * node has not already allocated this 1046 * node has not already allocated this
1142 */ 1047 */
1143 if (!cachep->node[node]) { 1048 n = get_node(cachep, node);
1049 if (!n) {
1144 n = kmalloc_node(memsize, GFP_KERNEL, node); 1050 n = kmalloc_node(memsize, GFP_KERNEL, node);
1145 if (!n) 1051 if (!n)
1146 return -ENOMEM; 1052 return -ENOMEM;
@@ -1156,11 +1062,11 @@ static int init_cache_node_node(int node)
1156 cachep->node[node] = n; 1062 cachep->node[node] = n;
1157 } 1063 }
1158 1064
1159 spin_lock_irq(&cachep->node[node]->list_lock); 1065 spin_lock_irq(&n->list_lock);
1160 cachep->node[node]->free_limit = 1066 n->free_limit =
1161 (1 + nr_cpus_node(node)) * 1067 (1 + nr_cpus_node(node)) *
1162 cachep->batchcount + cachep->num; 1068 cachep->batchcount + cachep->num;
1163 spin_unlock_irq(&cachep->node[node]->list_lock); 1069 spin_unlock_irq(&n->list_lock);
1164 } 1070 }
1165 return 0; 1071 return 0;
1166} 1072}
@@ -1181,12 +1087,13 @@ static void cpuup_canceled(long cpu)
1181 list_for_each_entry(cachep, &slab_caches, list) { 1087 list_for_each_entry(cachep, &slab_caches, list) {
1182 struct array_cache *nc; 1088 struct array_cache *nc;
1183 struct array_cache *shared; 1089 struct array_cache *shared;
1184 struct array_cache **alien; 1090 struct alien_cache **alien;
1091 LIST_HEAD(list);
1185 1092
1186 /* cpu is dead; no one can alloc from it. */ 1093 /* cpu is dead; no one can alloc from it. */
1187 nc = cachep->array[cpu]; 1094 nc = cachep->array[cpu];
1188 cachep->array[cpu] = NULL; 1095 cachep->array[cpu] = NULL;
1189 n = cachep->node[node]; 1096 n = get_node(cachep, node);
1190 1097
1191 if (!n) 1098 if (!n)
1192 goto free_array_cache; 1099 goto free_array_cache;
@@ -1196,7 +1103,7 @@ static void cpuup_canceled(long cpu)
1196 /* Free limit for this kmem_cache_node */ 1103 /* Free limit for this kmem_cache_node */
1197 n->free_limit -= cachep->batchcount; 1104 n->free_limit -= cachep->batchcount;
1198 if (nc) 1105 if (nc)
1199 free_block(cachep, nc->entry, nc->avail, node); 1106 free_block(cachep, nc->entry, nc->avail, node, &list);
1200 1107
1201 if (!cpumask_empty(mask)) { 1108 if (!cpumask_empty(mask)) {
1202 spin_unlock_irq(&n->list_lock); 1109 spin_unlock_irq(&n->list_lock);
@@ -1206,7 +1113,7 @@ static void cpuup_canceled(long cpu)
1206 shared = n->shared; 1113 shared = n->shared;
1207 if (shared) { 1114 if (shared) {
1208 free_block(cachep, shared->entry, 1115 free_block(cachep, shared->entry,
1209 shared->avail, node); 1116 shared->avail, node, &list);
1210 n->shared = NULL; 1117 n->shared = NULL;
1211 } 1118 }
1212 1119
@@ -1221,6 +1128,7 @@ static void cpuup_canceled(long cpu)
1221 free_alien_cache(alien); 1128 free_alien_cache(alien);
1222 } 1129 }
1223free_array_cache: 1130free_array_cache:
1131 slabs_destroy(cachep, &list);
1224 kfree(nc); 1132 kfree(nc);
1225 } 1133 }
1226 /* 1134 /*
@@ -1229,7 +1137,7 @@ free_array_cache:
1229 * shrink each nodelist to its limit. 1137 * shrink each nodelist to its limit.
1230 */ 1138 */
1231 list_for_each_entry(cachep, &slab_caches, list) { 1139 list_for_each_entry(cachep, &slab_caches, list) {
1232 n = cachep->node[node]; 1140 n = get_node(cachep, node);
1233 if (!n) 1141 if (!n)
1234 continue; 1142 continue;
1235 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 1143 drain_freelist(cachep, n, slabs_tofree(cachep, n));
@@ -1260,7 +1168,7 @@ static int cpuup_prepare(long cpu)
1260 list_for_each_entry(cachep, &slab_caches, list) { 1168 list_for_each_entry(cachep, &slab_caches, list) {
1261 struct array_cache *nc; 1169 struct array_cache *nc;
1262 struct array_cache *shared = NULL; 1170 struct array_cache *shared = NULL;
1263 struct array_cache **alien = NULL; 1171 struct alien_cache **alien = NULL;
1264 1172
1265 nc = alloc_arraycache(node, cachep->limit, 1173 nc = alloc_arraycache(node, cachep->limit,
1266 cachep->batchcount, GFP_KERNEL); 1174 cachep->batchcount, GFP_KERNEL);
@@ -1284,7 +1192,7 @@ static int cpuup_prepare(long cpu)
1284 } 1192 }
1285 } 1193 }
1286 cachep->array[cpu] = nc; 1194 cachep->array[cpu] = nc;
1287 n = cachep->node[node]; 1195 n = get_node(cachep, node);
1288 BUG_ON(!n); 1196 BUG_ON(!n);
1289 1197
1290 spin_lock_irq(&n->list_lock); 1198 spin_lock_irq(&n->list_lock);
@@ -1305,13 +1213,7 @@ static int cpuup_prepare(long cpu)
1305 spin_unlock_irq(&n->list_lock); 1213 spin_unlock_irq(&n->list_lock);
1306 kfree(shared); 1214 kfree(shared);
1307 free_alien_cache(alien); 1215 free_alien_cache(alien);
1308 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1309 slab_set_debugobj_lock_classes_node(cachep, node);
1310 else if (!OFF_SLAB(cachep) &&
1311 !(cachep->flags & SLAB_DESTROY_BY_RCU))
1312 on_slab_lock_classes_node(cachep, node);
1313 } 1216 }
1314 init_node_lock_keys(node);
1315 1217
1316 return 0; 1218 return 0;
1317bad: 1219bad:
@@ -1395,7 +1297,7 @@ static int __meminit drain_cache_node_node(int node)
1395 list_for_each_entry(cachep, &slab_caches, list) { 1297 list_for_each_entry(cachep, &slab_caches, list) {
1396 struct kmem_cache_node *n; 1298 struct kmem_cache_node *n;
1397 1299
1398 n = cachep->node[node]; 1300 n = get_node(cachep, node);
1399 if (!n) 1301 if (!n)
1400 continue; 1302 continue;
1401 1303
@@ -1575,10 +1477,6 @@ void __init kmem_cache_init(void)
1575 1477
1576 memcpy(ptr, cpu_cache_get(kmem_cache), 1478 memcpy(ptr, cpu_cache_get(kmem_cache),
1577 sizeof(struct arraycache_init)); 1479 sizeof(struct arraycache_init));
1578 /*
1579 * Do not assume that spinlocks can be initialized via memcpy:
1580 */
1581 spin_lock_init(&ptr->lock);
1582 1480
1583 kmem_cache->array[smp_processor_id()] = ptr; 1481 kmem_cache->array[smp_processor_id()] = ptr;
1584 1482
@@ -1588,10 +1486,6 @@ void __init kmem_cache_init(void)
1588 != &initarray_generic.cache); 1486 != &initarray_generic.cache);
1589 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), 1487 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
1590 sizeof(struct arraycache_init)); 1488 sizeof(struct arraycache_init));
1591 /*
1592 * Do not assume that spinlocks can be initialized via memcpy:
1593 */
1594 spin_lock_init(&ptr->lock);
1595 1489
1596 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; 1490 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
1597 } 1491 }
@@ -1628,9 +1522,6 @@ void __init kmem_cache_init_late(void)
1628 BUG(); 1522 BUG();
1629 mutex_unlock(&slab_mutex); 1523 mutex_unlock(&slab_mutex);
1630 1524
1631 /* Annotate slab for lockdep -- annotate the malloc caches */
1632 init_lock_keys();
1633
1634 /* Done! */ 1525 /* Done! */
1635 slab_state = FULL; 1526 slab_state = FULL;
1636 1527
@@ -1690,14 +1581,10 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1690 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1581 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1691 cachep->name, cachep->size, cachep->gfporder); 1582 cachep->name, cachep->size, cachep->gfporder);
1692 1583
1693 for_each_online_node(node) { 1584 for_each_kmem_cache_node(cachep, node, n) {
1694 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1585 unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
1695 unsigned long active_slabs = 0, num_slabs = 0; 1586 unsigned long active_slabs = 0, num_slabs = 0;
1696 1587
1697 n = cachep->node[node];
1698 if (!n)
1699 continue;
1700
1701 spin_lock_irqsave(&n->list_lock, flags); 1588 spin_lock_irqsave(&n->list_lock, flags);
1702 list_for_each_entry(page, &n->slabs_full, lru) { 1589 list_for_each_entry(page, &n->slabs_full, lru) {
1703 active_objs += cachep->num; 1590 active_objs += cachep->num;
@@ -1724,7 +1611,8 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1724} 1611}
1725 1612
1726/* 1613/*
1727 * Interface to system's page allocator. No need to hold the cache-lock. 1614 * Interface to system's page allocator. No need to hold the
1615 * kmem_cache_node ->list_lock.
1728 * 1616 *
1729 * If we requested dmaable memory, we will get it. Even if we 1617 * If we requested dmaable memory, we will get it. Even if we
1730 * did not request dmaable memory, we might get it, but that 1618 * did not request dmaable memory, we might get it, but that
@@ -2026,9 +1914,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
2026 * @cachep: cache pointer being destroyed 1914 * @cachep: cache pointer being destroyed
2027 * @page: page pointer being destroyed 1915 * @page: page pointer being destroyed
2028 * 1916 *
2029 * Destroy all the objs in a slab, and release the mem back to the system. 1917 * Destroy all the objs in a slab page, and release the mem back to the system.
2030 * Before calling the slab must have been unlinked from the cache. The 1918 * Before calling the slab page must have been unlinked from the cache. The
2031 * cache-lock is not held/needed. 1919 * kmem_cache_node ->list_lock is not held/needed.
2032 */ 1920 */
2033static void slab_destroy(struct kmem_cache *cachep, struct page *page) 1921static void slab_destroy(struct kmem_cache *cachep, struct page *page)
2034{ 1922{
@@ -2060,6 +1948,16 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
2060 kmem_cache_free(cachep->freelist_cache, freelist); 1948 kmem_cache_free(cachep->freelist_cache, freelist);
2061} 1949}
2062 1950
1951static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1952{
1953 struct page *page, *n;
1954
1955 list_for_each_entry_safe(page, n, list, lru) {
1956 list_del(&page->lru);
1957 slab_destroy(cachep, page);
1958 }
1959}
1960
2063/** 1961/**
2064 * calculate_slab_order - calculate size (page order) of slabs 1962 * calculate_slab_order - calculate size (page order) of slabs
2065 * @cachep: pointer to the cache that is being created 1963 * @cachep: pointer to the cache that is being created
@@ -2405,17 +2303,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2405 return err; 2303 return err;
2406 } 2304 }
2407 2305
2408 if (flags & SLAB_DEBUG_OBJECTS) {
2409 /*
2410 * Would deadlock through slab_destroy()->call_rcu()->
2411 * debug_object_activate()->kmem_cache_alloc().
2412 */
2413 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2414
2415 slab_set_debugobj_lock_classes(cachep);
2416 } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
2417 on_slab_lock_classes(cachep);
2418
2419 return 0; 2306 return 0;
2420} 2307}
2421 2308
@@ -2434,7 +2321,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
2434{ 2321{
2435#ifdef CONFIG_SMP 2322#ifdef CONFIG_SMP
2436 check_irq_off(); 2323 check_irq_off();
2437 assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); 2324 assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
2438#endif 2325#endif
2439} 2326}
2440 2327
@@ -2442,7 +2329,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2442{ 2329{
2443#ifdef CONFIG_SMP 2330#ifdef CONFIG_SMP
2444 check_irq_off(); 2331 check_irq_off();
2445 assert_spin_locked(&cachep->node[node]->list_lock); 2332 assert_spin_locked(&get_node(cachep, node)->list_lock);
2446#endif 2333#endif
2447} 2334}
2448 2335
@@ -2462,12 +2349,16 @@ static void do_drain(void *arg)
2462 struct kmem_cache *cachep = arg; 2349 struct kmem_cache *cachep = arg;
2463 struct array_cache *ac; 2350 struct array_cache *ac;
2464 int node = numa_mem_id(); 2351 int node = numa_mem_id();
2352 struct kmem_cache_node *n;
2353 LIST_HEAD(list);
2465 2354
2466 check_irq_off(); 2355 check_irq_off();
2467 ac = cpu_cache_get(cachep); 2356 ac = cpu_cache_get(cachep);
2468 spin_lock(&cachep->node[node]->list_lock); 2357 n = get_node(cachep, node);
2469 free_block(cachep, ac->entry, ac->avail, node); 2358 spin_lock(&n->list_lock);
2470 spin_unlock(&cachep->node[node]->list_lock); 2359 free_block(cachep, ac->entry, ac->avail, node, &list);
2360 spin_unlock(&n->list_lock);
2361 slabs_destroy(cachep, &list);
2471 ac->avail = 0; 2362 ac->avail = 0;
2472} 2363}
2473 2364
@@ -2478,17 +2369,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2478 2369
2479 on_each_cpu(do_drain, cachep, 1); 2370 on_each_cpu(do_drain, cachep, 1);
2480 check_irq_on(); 2371 check_irq_on();
2481 for_each_online_node(node) { 2372 for_each_kmem_cache_node(cachep, node, n)
2482 n = cachep->node[node]; 2373 if (n->alien)
2483 if (n && n->alien)
2484 drain_alien_cache(cachep, n->alien); 2374 drain_alien_cache(cachep, n->alien);
2485 }
2486 2375
2487 for_each_online_node(node) { 2376 for_each_kmem_cache_node(cachep, node, n)
2488 n = cachep->node[node]; 2377 drain_array(cachep, n, n->shared, 1, node);
2489 if (n)
2490 drain_array(cachep, n, n->shared, 1, node);
2491 }
2492} 2378}
2493 2379
2494/* 2380/*
@@ -2534,17 +2420,14 @@ out:
2534 2420
2535int __kmem_cache_shrink(struct kmem_cache *cachep) 2421int __kmem_cache_shrink(struct kmem_cache *cachep)
2536{ 2422{
2537 int ret = 0, i = 0; 2423 int ret = 0;
2424 int node;
2538 struct kmem_cache_node *n; 2425 struct kmem_cache_node *n;
2539 2426
2540 drain_cpu_caches(cachep); 2427 drain_cpu_caches(cachep);
2541 2428
2542 check_irq_on(); 2429 check_irq_on();
2543 for_each_online_node(i) { 2430 for_each_kmem_cache_node(cachep, node, n) {
2544 n = cachep->node[i];
2545 if (!n)
2546 continue;
2547
2548 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 2431 drain_freelist(cachep, n, slabs_tofree(cachep, n));
2549 2432
2550 ret += !list_empty(&n->slabs_full) || 2433 ret += !list_empty(&n->slabs_full) ||
@@ -2566,13 +2449,11 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
2566 kfree(cachep->array[i]); 2449 kfree(cachep->array[i]);
2567 2450
2568 /* NUMA: free the node structures */ 2451 /* NUMA: free the node structures */
2569 for_each_online_node(i) { 2452 for_each_kmem_cache_node(cachep, i, n) {
2570 n = cachep->node[i]; 2453 kfree(n->shared);
2571 if (n) { 2454 free_alien_cache(n->alien);
2572 kfree(n->shared); 2455 kfree(n);
2573 free_alien_cache(n->alien); 2456 cachep->node[i] = NULL;
2574 kfree(n);
2575 }
2576 } 2457 }
2577 return 0; 2458 return 0;
2578} 2459}
@@ -2751,7 +2632,7 @@ static int cache_grow(struct kmem_cache *cachep,
2751 2632
2752 /* Take the node list lock to change the colour_next on this node */ 2633 /* Take the node list lock to change the colour_next on this node */
2753 check_irq_off(); 2634 check_irq_off();
2754 n = cachep->node[nodeid]; 2635 n = get_node(cachep, nodeid);
2755 spin_lock(&n->list_lock); 2636 spin_lock(&n->list_lock);
2756 2637
2757 /* Get colour for the slab, and cal the next value. */ 2638 /* Get colour for the slab, and cal the next value. */
@@ -2920,7 +2801,7 @@ retry:
2920 */ 2801 */
2921 batchcount = BATCHREFILL_LIMIT; 2802 batchcount = BATCHREFILL_LIMIT;
2922 } 2803 }
2923 n = cachep->node[node]; 2804 n = get_node(cachep, node);
2924 2805
2925 BUG_ON(ac->avail > 0 || !n); 2806 BUG_ON(ac->avail > 0 || !n);
2926 spin_lock(&n->list_lock); 2807 spin_lock(&n->list_lock);
@@ -3060,7 +2941,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3060 2941
3061static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 2942static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3062{ 2943{
3063 if (cachep == kmem_cache) 2944 if (unlikely(cachep == kmem_cache))
3064 return false; 2945 return false;
3065 2946
3066 return should_failslab(cachep->object_size, flags, cachep->flags); 2947 return should_failslab(cachep->object_size, flags, cachep->flags);
@@ -3169,8 +3050,8 @@ retry:
3169 nid = zone_to_nid(zone); 3050 nid = zone_to_nid(zone);
3170 3051
3171 if (cpuset_zone_allowed_hardwall(zone, flags) && 3052 if (cpuset_zone_allowed_hardwall(zone, flags) &&
3172 cache->node[nid] && 3053 get_node(cache, nid) &&
3173 cache->node[nid]->free_objects) { 3054 get_node(cache, nid)->free_objects) {
3174 obj = ____cache_alloc_node(cache, 3055 obj = ____cache_alloc_node(cache,
3175 flags | GFP_THISNODE, nid); 3056 flags | GFP_THISNODE, nid);
3176 if (obj) 3057 if (obj)
@@ -3233,7 +3114,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3233 int x; 3114 int x;
3234 3115
3235 VM_BUG_ON(nodeid > num_online_nodes()); 3116 VM_BUG_ON(nodeid > num_online_nodes());
3236 n = cachep->node[nodeid]; 3117 n = get_node(cachep, nodeid);
3237 BUG_ON(!n); 3118 BUG_ON(!n);
3238 3119
3239retry: 3120retry:
@@ -3304,7 +3185,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3304 if (nodeid == NUMA_NO_NODE) 3185 if (nodeid == NUMA_NO_NODE)
3305 nodeid = slab_node; 3186 nodeid = slab_node;
3306 3187
3307 if (unlikely(!cachep->node[nodeid])) { 3188 if (unlikely(!get_node(cachep, nodeid))) {
3308 /* Node not bootstrapped yet */ 3189 /* Node not bootstrapped yet */
3309 ptr = fallback_alloc(cachep, flags); 3190 ptr = fallback_alloc(cachep, flags);
3310 goto out; 3191 goto out;
@@ -3405,12 +3286,13 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3405 3286
3406/* 3287/*
3407 * Caller needs to acquire correct kmem_cache_node's list_lock 3288 * Caller needs to acquire correct kmem_cache_node's list_lock
3289 * @list: List of detached free slabs should be freed by caller
3408 */ 3290 */
3409static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3291static void free_block(struct kmem_cache *cachep, void **objpp,
3410 int node) 3292 int nr_objects, int node, struct list_head *list)
3411{ 3293{
3412 int i; 3294 int i;
3413 struct kmem_cache_node *n; 3295 struct kmem_cache_node *n = get_node(cachep, node);
3414 3296
3415 for (i = 0; i < nr_objects; i++) { 3297 for (i = 0; i < nr_objects; i++) {
3416 void *objp; 3298 void *objp;
@@ -3420,7 +3302,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3420 objp = objpp[i]; 3302 objp = objpp[i];
3421 3303
3422 page = virt_to_head_page(objp); 3304 page = virt_to_head_page(objp);
3423 n = cachep->node[node];
3424 list_del(&page->lru); 3305 list_del(&page->lru);
3425 check_spinlock_acquired_node(cachep, node); 3306 check_spinlock_acquired_node(cachep, node);
3426 slab_put_obj(cachep, page, objp, node); 3307 slab_put_obj(cachep, page, objp, node);
@@ -3431,13 +3312,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3431 if (page->active == 0) { 3312 if (page->active == 0) {
3432 if (n->free_objects > n->free_limit) { 3313 if (n->free_objects > n->free_limit) {
3433 n->free_objects -= cachep->num; 3314 n->free_objects -= cachep->num;
3434 /* No need to drop any previously held 3315 list_add_tail(&page->lru, list);
3435 * lock here, even if we have a off-slab slab
3436 * descriptor it is guaranteed to come from
3437 * a different cache, refer to comments before
3438 * alloc_slabmgmt.
3439 */
3440 slab_destroy(cachep, page);
3441 } else { 3316 } else {
3442 list_add(&page->lru, &n->slabs_free); 3317 list_add(&page->lru, &n->slabs_free);
3443 } 3318 }
@@ -3456,13 +3331,14 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3456 int batchcount; 3331 int batchcount;
3457 struct kmem_cache_node *n; 3332 struct kmem_cache_node *n;
3458 int node = numa_mem_id(); 3333 int node = numa_mem_id();
3334 LIST_HEAD(list);
3459 3335
3460 batchcount = ac->batchcount; 3336 batchcount = ac->batchcount;
3461#if DEBUG 3337#if DEBUG
3462 BUG_ON(!batchcount || batchcount > ac->avail); 3338 BUG_ON(!batchcount || batchcount > ac->avail);
3463#endif 3339#endif
3464 check_irq_off(); 3340 check_irq_off();
3465 n = cachep->node[node]; 3341 n = get_node(cachep, node);
3466 spin_lock(&n->list_lock); 3342 spin_lock(&n->list_lock);
3467 if (n->shared) { 3343 if (n->shared) {
3468 struct array_cache *shared_array = n->shared; 3344 struct array_cache *shared_array = n->shared;
@@ -3477,7 +3353,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3477 } 3353 }
3478 } 3354 }
3479 3355
3480 free_block(cachep, ac->entry, batchcount, node); 3356 free_block(cachep, ac->entry, batchcount, node, &list);
3481free_done: 3357free_done:
3482#if STATS 3358#if STATS
3483 { 3359 {
@@ -3498,6 +3374,7 @@ free_done:
3498 } 3374 }
3499#endif 3375#endif
3500 spin_unlock(&n->list_lock); 3376 spin_unlock(&n->list_lock);
3377 slabs_destroy(cachep, &list);
3501 ac->avail -= batchcount; 3378 ac->avail -= batchcount;
3502 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3379 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3503} 3380}
@@ -3754,7 +3631,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3754 int node; 3631 int node;
3755 struct kmem_cache_node *n; 3632 struct kmem_cache_node *n;
3756 struct array_cache *new_shared; 3633 struct array_cache *new_shared;
3757 struct array_cache **new_alien = NULL; 3634 struct alien_cache **new_alien = NULL;
3758 3635
3759 for_each_online_node(node) { 3636 for_each_online_node(node) {
3760 3637
@@ -3775,15 +3652,16 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3775 } 3652 }
3776 } 3653 }
3777 3654
3778 n = cachep->node[node]; 3655 n = get_node(cachep, node);
3779 if (n) { 3656 if (n) {
3780 struct array_cache *shared = n->shared; 3657 struct array_cache *shared = n->shared;
3658 LIST_HEAD(list);
3781 3659
3782 spin_lock_irq(&n->list_lock); 3660 spin_lock_irq(&n->list_lock);
3783 3661
3784 if (shared) 3662 if (shared)
3785 free_block(cachep, shared->entry, 3663 free_block(cachep, shared->entry,
3786 shared->avail, node); 3664 shared->avail, node, &list);
3787 3665
3788 n->shared = new_shared; 3666 n->shared = new_shared;
3789 if (!n->alien) { 3667 if (!n->alien) {
@@ -3793,6 +3671,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3793 n->free_limit = (1 + nr_cpus_node(node)) * 3671 n->free_limit = (1 + nr_cpus_node(node)) *
3794 cachep->batchcount + cachep->num; 3672 cachep->batchcount + cachep->num;
3795 spin_unlock_irq(&n->list_lock); 3673 spin_unlock_irq(&n->list_lock);
3674 slabs_destroy(cachep, &list);
3796 kfree(shared); 3675 kfree(shared);
3797 free_alien_cache(new_alien); 3676 free_alien_cache(new_alien);
3798 continue; 3677 continue;
@@ -3820,9 +3699,8 @@ fail:
3820 /* Cache is not active yet. Roll back what we did */ 3699 /* Cache is not active yet. Roll back what we did */
3821 node--; 3700 node--;
3822 while (node >= 0) { 3701 while (node >= 0) {
3823 if (cachep->node[node]) { 3702 n = get_node(cachep, node);
3824 n = cachep->node[node]; 3703 if (n) {
3825
3826 kfree(n->shared); 3704 kfree(n->shared);
3827 free_alien_cache(n->alien); 3705 free_alien_cache(n->alien);
3828 kfree(n); 3706 kfree(n);
@@ -3883,12 +3761,20 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
3883 cachep->shared = shared; 3761 cachep->shared = shared;
3884 3762
3885 for_each_online_cpu(i) { 3763 for_each_online_cpu(i) {
3764 LIST_HEAD(list);
3886 struct array_cache *ccold = new->new[i]; 3765 struct array_cache *ccold = new->new[i];
3766 int node;
3767 struct kmem_cache_node *n;
3768
3887 if (!ccold) 3769 if (!ccold)
3888 continue; 3770 continue;
3889 spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); 3771
3890 free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); 3772 node = cpu_to_mem(i);
3891 spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); 3773 n = get_node(cachep, node);
3774 spin_lock_irq(&n->list_lock);
3775 free_block(cachep, ccold->entry, ccold->avail, node, &list);
3776 spin_unlock_irq(&n->list_lock);
3777 slabs_destroy(cachep, &list);
3892 kfree(ccold); 3778 kfree(ccold);
3893 } 3779 }
3894 kfree(new); 3780 kfree(new);
@@ -3996,6 +3882,7 @@ skip_setup:
3996static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, 3882static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
3997 struct array_cache *ac, int force, int node) 3883 struct array_cache *ac, int force, int node)
3998{ 3884{
3885 LIST_HEAD(list);
3999 int tofree; 3886 int tofree;
4000 3887
4001 if (!ac || !ac->avail) 3888 if (!ac || !ac->avail)
@@ -4008,12 +3895,13 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
4008 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3895 tofree = force ? ac->avail : (ac->limit + 4) / 5;
4009 if (tofree > ac->avail) 3896 if (tofree > ac->avail)
4010 tofree = (ac->avail + 1) / 2; 3897 tofree = (ac->avail + 1) / 2;
4011 free_block(cachep, ac->entry, tofree, node); 3898 free_block(cachep, ac->entry, tofree, node, &list);
4012 ac->avail -= tofree; 3899 ac->avail -= tofree;
4013 memmove(ac->entry, &(ac->entry[tofree]), 3900 memmove(ac->entry, &(ac->entry[tofree]),
4014 sizeof(void *) * ac->avail); 3901 sizeof(void *) * ac->avail);
4015 } 3902 }
4016 spin_unlock_irq(&n->list_lock); 3903 spin_unlock_irq(&n->list_lock);
3904 slabs_destroy(cachep, &list);
4017 } 3905 }
4018} 3906}
4019 3907
@@ -4048,7 +3936,7 @@ static void cache_reap(struct work_struct *w)
4048 * have established with reasonable certainty that 3936 * have established with reasonable certainty that
4049 * we can do some work if the lock was obtained. 3937 * we can do some work if the lock was obtained.
4050 */ 3938 */
4051 n = searchp->node[node]; 3939 n = get_node(searchp, node);
4052 3940
4053 reap_alien(searchp, n); 3941 reap_alien(searchp, n);
4054 3942
@@ -4100,10 +3988,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4100 3988
4101 active_objs = 0; 3989 active_objs = 0;
4102 num_slabs = 0; 3990 num_slabs = 0;
4103 for_each_online_node(node) { 3991 for_each_kmem_cache_node(cachep, node, n) {
4104 n = cachep->node[node];
4105 if (!n)
4106 continue;
4107 3992
4108 check_irq_on(); 3993 check_irq_on();
4109 spin_lock_irq(&n->list_lock); 3994 spin_lock_irq(&n->list_lock);
@@ -4328,10 +4213,7 @@ static int leaks_show(struct seq_file *m, void *p)
4328 4213
4329 x[1] = 0; 4214 x[1] = 0;
4330 4215
4331 for_each_online_node(node) { 4216 for_each_kmem_cache_node(cachep, node, n) {
4332 n = cachep->node[node];
4333 if (!n)
4334 continue;
4335 4217
4336 check_irq_on(); 4218 check_irq_on();
4337 spin_lock_irq(&n->list_lock); 4219 spin_lock_irq(&n->list_lock);
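Editor's note on the slab.c diff as a whole: the recurring shape of these hunks is to stop calling slab_destroy() while a node's list_lock is held. free_block() now detaches fully free slab pages onto a caller-supplied list, and the new slabs_destroy() releases them once the lock is dropped. The sketch below condenses that pattern, modelled on do_drain() above; demo_flush() is an illustrative name, and free_block()/slabs_destroy()/get_node() are internals of mm/slab.c and mm/slab.h.

#include <linux/list.h>
#include <linux/spinlock.h>

/*
 * Deferred-destroy pattern introduced by this series: return objects
 * under n->list_lock, collect empty slab pages on a local list, and
 * free the pages only after the lock is released.
 */
static void demo_flush(struct kmem_cache *cachep, struct array_cache *ac,
		       int node)
{
	struct kmem_cache_node *n = get_node(cachep, node);
	LIST_HEAD(list);

	spin_lock(&n->list_lock);
	free_block(cachep, ac->entry, ac->avail, node, &list);
	ac->avail = 0;
	spin_unlock(&n->list_lock);

	slabs_destroy(cachep, &list);	/* page frees happen lock-free */
}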
diff --git a/mm/slab.h b/mm/slab.h
index 961a3fb1f5a2..0e0fdd365840 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -256,13 +256,12 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
256 return cachep; 256 return cachep;
257 257
258 pr_err("%s: Wrong slab cache. %s but object is from %s\n", 258 pr_err("%s: Wrong slab cache. %s but object is from %s\n",
259 __FUNCTION__, cachep->name, s->name); 259 __func__, cachep->name, s->name);
260 WARN_ON_ONCE(1); 260 WARN_ON_ONCE(1);
261 return s; 261 return s;
262} 262}
263#endif
264
265 263
264#ifndef CONFIG_SLOB
266/* 265/*
267 * The slab lists for all objects. 266 * The slab lists for all objects.
268 */ 267 */
@@ -277,7 +276,7 @@ struct kmem_cache_node {
277 unsigned int free_limit; 276 unsigned int free_limit;
278 unsigned int colour_next; /* Per-node cache coloring */ 277 unsigned int colour_next; /* Per-node cache coloring */
279 struct array_cache *shared; /* shared per node */ 278 struct array_cache *shared; /* shared per node */
280 struct array_cache **alien; /* on other nodes */ 279 struct alien_cache **alien; /* on other nodes */
281 unsigned long next_reap; /* updated without locking */ 280 unsigned long next_reap; /* updated without locking */
282 int free_touched; /* updated without locking */ 281 int free_touched; /* updated without locking */
283#endif 282#endif
@@ -294,5 +293,22 @@ struct kmem_cache_node {
294 293
295}; 294};
296 295
296static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
297{
298 return s->node[node];
299}
300
301/*
302 * Iterator over all nodes. The body will be executed for each node that has
303 * a kmem_cache_node structure allocated (which is true for all online nodes)
304 */
305#define for_each_kmem_cache_node(__s, __node, __n) \
306 for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \
307 if (__n)
308
309#endif
310
297void *slab_next(struct seq_file *m, void *p, loff_t *pos); 311void *slab_next(struct seq_file *m, void *p, loff_t *pos);
298void slab_stop(struct seq_file *m, void *p); 312void slab_stop(struct seq_file *m, void *p);
313
314#endif /* MM_SLAB_H */
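Editor's note on the slab.h hunks: get_node() and for_each_kmem_cache_node() replace the open-coded cachep->node[node] lookups plus NULL checks seen throughout the slab.c diff; the macro already skips nodes without a kmem_cache_node. The walker below is illustrative only (count_free_objects() is a made-up name) and assumes the SLAB flavour of struct kmem_cache_node, which carries free_objects and list_lock.

#include <linux/spinlock.h>

/*
 * Illustrative per-node walk using the new helpers; no NULL check is
 * needed because the iterator only visits allocated nodes.
 */
static unsigned long count_free_objects(struct kmem_cache *cachep)
{
	struct kmem_cache_node *n;
	unsigned long free_objects = 0;
	int node;

	for_each_kmem_cache_node(cachep, node, n) {
		spin_lock_irq(&n->list_lock);
		free_objects += n->free_objects;
		spin_unlock_irq(&n->list_lock);
	}
	return free_objects;
}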
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d31c4bacc6a2..d319502b2403 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -19,6 +19,8 @@
19#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
20#include <asm/page.h> 20#include <asm/page.h>
21#include <linux/memcontrol.h> 21#include <linux/memcontrol.h>
22
23#define CREATE_TRACE_POINTS
22#include <trace/events/kmem.h> 24#include <trace/events/kmem.h>
23 25
24#include "slab.h" 26#include "slab.h"
@@ -787,3 +789,102 @@ static int __init slab_proc_init(void)
787} 789}
788module_init(slab_proc_init); 790module_init(slab_proc_init);
789#endif /* CONFIG_SLABINFO */ 791#endif /* CONFIG_SLABINFO */
792
793static __always_inline void *__do_krealloc(const void *p, size_t new_size,
794 gfp_t flags)
795{
796 void *ret;
797 size_t ks = 0;
798
799 if (p)
800 ks = ksize(p);
801
802 if (ks >= new_size)
803 return (void *)p;
804
805 ret = kmalloc_track_caller(new_size, flags);
806 if (ret && p)
807 memcpy(ret, p, ks);
808
809 return ret;
810}
811
812/**
813 * __krealloc - like krealloc() but don't free @p.
814 * @p: object to reallocate memory for.
815 * @new_size: how many bytes of memory are required.
816 * @flags: the type of memory to allocate.
817 *
818 * This function is like krealloc() except it never frees the originally
819 * allocated buffer. Use this if you don't want to free the buffer immediately
820 * like, for example, with RCU.
821 */
822void *__krealloc(const void *p, size_t new_size, gfp_t flags)
823{
824 if (unlikely(!new_size))
825 return ZERO_SIZE_PTR;
826
827 return __do_krealloc(p, new_size, flags);
828
829}
830EXPORT_SYMBOL(__krealloc);
831
832/**
833 * krealloc - reallocate memory. The contents will remain unchanged.
834 * @p: object to reallocate memory for.
835 * @new_size: how many bytes of memory are required.
836 * @flags: the type of memory to allocate.
837 *
838 * The contents of the object pointed to are preserved up to the
839 * lesser of the new and old sizes. If @p is %NULL, krealloc()
840 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
841 * %NULL pointer, the object pointed to is freed.
842 */
843void *krealloc(const void *p, size_t new_size, gfp_t flags)
844{
845 void *ret;
846
847 if (unlikely(!new_size)) {
848 kfree(p);
849 return ZERO_SIZE_PTR;
850 }
851
852 ret = __do_krealloc(p, new_size, flags);
853 if (ret && p != ret)
854 kfree(p);
855
856 return ret;
857}
858EXPORT_SYMBOL(krealloc);
859
860/**
861 * kzfree - like kfree but zero memory
862 * @p: object to free memory of
863 *
864 * The memory of the object @p points to is zeroed before freed.
865 * If @p is %NULL, kzfree() does nothing.
866 *
867 * Note: this function zeroes the whole allocated buffer which can be a good
868 * deal bigger than the requested buffer size passed to kmalloc(). So be
869 * careful when using this function in performance sensitive code.
870 */
871void kzfree(const void *p)
872{
873 size_t ks;
874 void *mem = (void *)p;
875
876 if (unlikely(ZERO_OR_NULL_PTR(mem)))
877 return;
878 ks = ksize(mem);
879 memset(mem, 0, ks);
880 kfree(mem);
881}
882EXPORT_SYMBOL(kzfree);
883
884/* Tracepoints definitions. */
885EXPORT_TRACEPOINT_SYMBOL(kmalloc);
886EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
887EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
888EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
889EXPORT_TRACEPOINT_SYMBOL(kfree);
890EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
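
The krealloc()/__krealloc()/kzfree() implementations above are carried over verbatim from mm/util.c (their removal appears in the mm/util.c hunks below), and the kmem tracepoint definitions move with them, which is why CREATE_TRACE_POINTS is now defined here before including trace/events/kmem.h. As a reminder of the documented semantics, a small caller sketch (the helper name and error handling are illustrative, not part of the patch):

	/* Grow a kmalloc()ed buffer; on failure the old buffer stays valid. */
	static int grow_buffer(char **bufp, size_t new_len)
	{
		char *tmp = krealloc(*bufp, new_len, GFP_KERNEL);

		if (!tmp)
			return -ENOMEM;	/* *bufp was not freed */
		*bufp = tmp;		/* old buffer freed by krealloc() if it moved */
		return 0;
	}

Per the kernel-doc above, a NULL *bufp makes krealloc() behave like kmalloc(), and new_len == 0 frees the buffer and returns ZERO_SIZE_PTR rather than NULL, so the -ENOMEM branch is only taken on genuine allocation failure.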
diff --git a/mm/slub.c b/mm/slub.c
index 73004808537e..3e8afcc07a76 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -233,11 +233,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
233 * Core slab cache functions 233 * Core slab cache functions
234 *******************************************************************/ 234 *******************************************************************/
235 235
236static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
237{
238 return s->node[node];
239}
240
241/* Verify that a pointer has an address that is valid within a slab page */ 236/* Verify that a pointer has an address that is valid within a slab page */
242static inline int check_valid_pointer(struct kmem_cache *s, 237static inline int check_valid_pointer(struct kmem_cache *s,
243 struct page *page, const void *object) 238 struct page *page, const void *object)
@@ -288,6 +283,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
288 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 283 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
289 __p += (__s)->size) 284 __p += (__s)->size)
290 285
286#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
287 for (__p = (__addr), __idx = 1; __idx <= __objects;\
288 __p += (__s)->size, __idx++)
289
291/* Determine object index from a given position */ 290/* Determine object index from a given position */
292static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 291static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
293{ 292{
@@ -382,9 +381,9 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
382 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 381 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
383 if (s->flags & __CMPXCHG_DOUBLE) { 382 if (s->flags & __CMPXCHG_DOUBLE) {
384 if (cmpxchg_double(&page->freelist, &page->counters, 383 if (cmpxchg_double(&page->freelist, &page->counters,
385 freelist_old, counters_old, 384 freelist_old, counters_old,
386 freelist_new, counters_new)) 385 freelist_new, counters_new))
387 return 1; 386 return 1;
388 } else 387 } else
389#endif 388#endif
390 { 389 {
@@ -418,9 +417,9 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
418 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 417 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
419 if (s->flags & __CMPXCHG_DOUBLE) { 418 if (s->flags & __CMPXCHG_DOUBLE) {
420 if (cmpxchg_double(&page->freelist, &page->counters, 419 if (cmpxchg_double(&page->freelist, &page->counters,
421 freelist_old, counters_old, 420 freelist_old, counters_old,
422 freelist_new, counters_new)) 421 freelist_new, counters_new))
423 return 1; 422 return 1;
424 } else 423 } else
425#endif 424#endif
426 { 425 {
@@ -945,60 +944,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
945} 944}
946 945
947/* 946/*
948 * Hooks for other subsystems that check memory allocations. In a typical
949 * production configuration these hooks all should produce no code at all.
950 */
951static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
952{
953 kmemleak_alloc(ptr, size, 1, flags);
954}
955
956static inline void kfree_hook(const void *x)
957{
958 kmemleak_free(x);
959}
960
961static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
962{
963 flags &= gfp_allowed_mask;
964 lockdep_trace_alloc(flags);
965 might_sleep_if(flags & __GFP_WAIT);
966
967 return should_failslab(s->object_size, flags, s->flags);
968}
969
970static inline void slab_post_alloc_hook(struct kmem_cache *s,
971 gfp_t flags, void *object)
972{
973 flags &= gfp_allowed_mask;
974 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
975 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
976}
977
978static inline void slab_free_hook(struct kmem_cache *s, void *x)
979{
980 kmemleak_free_recursive(x, s->flags);
981
982 /*
983 * Trouble is that we may no longer disable interrupts in the fast path
984 * So in order to make the debug calls that expect irqs to be
985 * disabled we need to disable interrupts temporarily.
986 */
987#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
988 {
989 unsigned long flags;
990
991 local_irq_save(flags);
992 kmemcheck_slab_free(s, x, s->object_size);
993 debug_check_no_locks_freed(x, s->object_size);
994 local_irq_restore(flags);
995 }
996#endif
997 if (!(s->flags & SLAB_DEBUG_OBJECTS))
998 debug_check_no_obj_freed(x, s->object_size);
999}
1000
1001/*
1002 * Tracking of fully allocated slabs for debugging purposes. 947 * Tracking of fully allocated slabs for debugging purposes.
1003 */ 948 */
1004static void add_full(struct kmem_cache *s, 949static void add_full(struct kmem_cache *s,
@@ -1282,6 +1227,12 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
1282static inline void dec_slabs_node(struct kmem_cache *s, int node, 1227static inline void dec_slabs_node(struct kmem_cache *s, int node,
1283 int objects) {} 1228 int objects) {}
1284 1229
1230#endif /* CONFIG_SLUB_DEBUG */
1231
1232/*
1233 * Hooks for other subsystems that check memory allocations. In a typical
1234 * production configuration these hooks all should produce no code at all.
1235 */
1285static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) 1236static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1286{ 1237{
1287 kmemleak_alloc(ptr, size, 1, flags); 1238 kmemleak_alloc(ptr, size, 1, flags);
@@ -1293,21 +1244,44 @@ static inline void kfree_hook(const void *x)
1293} 1244}
1294 1245
1295static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1246static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1296 { return 0; } 1247{
1248 flags &= gfp_allowed_mask;
1249 lockdep_trace_alloc(flags);
1250 might_sleep_if(flags & __GFP_WAIT);
1251
1252 return should_failslab(s->object_size, flags, s->flags);
1253}
1297 1254
1298static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, 1255static inline void slab_post_alloc_hook(struct kmem_cache *s,
1299 void *object) 1256 gfp_t flags, void *object)
1300{ 1257{
1301 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, 1258 flags &= gfp_allowed_mask;
1302 flags & gfp_allowed_mask); 1259 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1260 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
1303} 1261}
1304 1262
1305static inline void slab_free_hook(struct kmem_cache *s, void *x) 1263static inline void slab_free_hook(struct kmem_cache *s, void *x)
1306{ 1264{
1307 kmemleak_free_recursive(x, s->flags); 1265 kmemleak_free_recursive(x, s->flags);
1308}
1309 1266
1310#endif /* CONFIG_SLUB_DEBUG */ 1267 /*
1268 * Trouble is that we may no longer disable interrupts in the fast path
1269 * So in order to make the debug calls that expect irqs to be
1270 * disabled we need to disable interrupts temporarily.
1271 */
1272#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
1273 {
1274 unsigned long flags;
1275
1276 local_irq_save(flags);
1277 kmemcheck_slab_free(s, x, s->object_size);
1278 debug_check_no_locks_freed(x, s->object_size);
1279 local_irq_restore(flags);
1280 }
1281#endif
1282 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1283 debug_check_no_obj_freed(x, s->object_size);
1284}
1311 1285
1312/* 1286/*
1313 * Slab allocation and freeing 1287 * Slab allocation and freeing
@@ -1409,9 +1383,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1409{ 1383{
1410 struct page *page; 1384 struct page *page;
1411 void *start; 1385 void *start;
1412 void *last;
1413 void *p; 1386 void *p;
1414 int order; 1387 int order;
1388 int idx;
1415 1389
1416 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1390 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1417 1391
@@ -1432,14 +1406,13 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1432 if (unlikely(s->flags & SLAB_POISON)) 1406 if (unlikely(s->flags & SLAB_POISON))
1433 memset(start, POISON_INUSE, PAGE_SIZE << order); 1407 memset(start, POISON_INUSE, PAGE_SIZE << order);
1434 1408
1435 last = start; 1409 for_each_object_idx(p, idx, s, start, page->objects) {
1436 for_each_object(p, s, start, page->objects) { 1410 setup_object(s, page, p);
1437 setup_object(s, page, last); 1411 if (likely(idx < page->objects))
1438 set_freepointer(s, last, p); 1412 set_freepointer(s, p, p + s->size);
1439 last = p; 1413 else
1414 set_freepointer(s, p, NULL);
1440 } 1415 }
1441 setup_object(s, page, last);
1442 set_freepointer(s, last, NULL);
1443 1416
1444 page->freelist = start; 1417 page->freelist = start;
1445 page->inuse = page->objects; 1418 page->inuse = page->objects;
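
The rewritten loop in new_slab() above builds the initial freelist in one pass: for_each_object_idx() supplies both the object pointer and a 1-based index, so every object except the last gets a free pointer to p + s->size and the final one gets NULL, removing the old "last" bookkeeping. A stand-alone user-space analogue of the resulting layout (object size and count are made up; in the kernel the free pointer lives at s->offset inside the object, not at offset 0):

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		const size_t size = 64, objects = 4;
		char *start = calloc(objects, size);
		size_t idx;
		char *p;

		if (!start)
			return 1;

		/* Chain each object to the next; terminate the list with NULL. */
		for (p = start, idx = 1; idx <= objects; p += size, idx++)
			*(void **)p = (idx < objects) ? (void *)(p + size) : NULL;

		for (p = start; p; p = *(void **)p)
			printf("free object at offset %td\n", p - start);

		free(start);
		return 0;
	}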
@@ -2162,6 +2135,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2162 static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, 2135 static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
2163 DEFAULT_RATELIMIT_BURST); 2136 DEFAULT_RATELIMIT_BURST);
2164 int node; 2137 int node;
2138 struct kmem_cache_node *n;
2165 2139
2166 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) 2140 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2167 return; 2141 return;
@@ -2176,15 +2150,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2176 pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", 2150 pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
2177 s->name); 2151 s->name);
2178 2152
2179 for_each_online_node(node) { 2153 for_each_kmem_cache_node(s, node, n) {
2180 struct kmem_cache_node *n = get_node(s, node);
2181 unsigned long nr_slabs; 2154 unsigned long nr_slabs;
2182 unsigned long nr_objs; 2155 unsigned long nr_objs;
2183 unsigned long nr_free; 2156 unsigned long nr_free;
2184 2157
2185 if (!n)
2186 continue;
2187
2188 nr_free = count_partial(n, count_free); 2158 nr_free = count_partial(n, count_free);
2189 nr_slabs = node_nr_slabs(n); 2159 nr_slabs = node_nr_slabs(n);
2190 nr_objs = node_nr_objs(n); 2160 nr_objs = node_nr_objs(n);
@@ -2928,13 +2898,10 @@ static void early_kmem_cache_node_alloc(int node)
2928static void free_kmem_cache_nodes(struct kmem_cache *s) 2898static void free_kmem_cache_nodes(struct kmem_cache *s)
2929{ 2899{
2930 int node; 2900 int node;
2901 struct kmem_cache_node *n;
2931 2902
2932 for_each_node_state(node, N_NORMAL_MEMORY) { 2903 for_each_kmem_cache_node(s, node, n) {
2933 struct kmem_cache_node *n = s->node[node]; 2904 kmem_cache_free(kmem_cache_node, n);
2934
2935 if (n)
2936 kmem_cache_free(kmem_cache_node, n);
2937
2938 s->node[node] = NULL; 2905 s->node[node] = NULL;
2939 } 2906 }
2940} 2907}
@@ -3222,12 +3189,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3222static inline int kmem_cache_close(struct kmem_cache *s) 3189static inline int kmem_cache_close(struct kmem_cache *s)
3223{ 3190{
3224 int node; 3191 int node;
3192 struct kmem_cache_node *n;
3225 3193
3226 flush_all(s); 3194 flush_all(s);
3227 /* Attempt to free all objects */ 3195 /* Attempt to free all objects */
3228 for_each_node_state(node, N_NORMAL_MEMORY) { 3196 for_each_kmem_cache_node(s, node, n) {
3229 struct kmem_cache_node *n = get_node(s, node);
3230
3231 free_partial(s, n); 3197 free_partial(s, n);
3232 if (n->nr_partial || slabs_node(s, node)) 3198 if (n->nr_partial || slabs_node(s, node))
3233 return 1; 3199 return 1;
@@ -3412,9 +3378,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
3412 return -ENOMEM; 3378 return -ENOMEM;
3413 3379
3414 flush_all(s); 3380 flush_all(s);
3415 for_each_node_state(node, N_NORMAL_MEMORY) { 3381 for_each_kmem_cache_node(s, node, n) {
3416 n = get_node(s, node);
3417
3418 if (!n->nr_partial) 3382 if (!n->nr_partial)
3419 continue; 3383 continue;
3420 3384
@@ -3586,6 +3550,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3586{ 3550{
3587 int node; 3551 int node;
3588 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 3552 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3553 struct kmem_cache_node *n;
3589 3554
3590 memcpy(s, static_cache, kmem_cache->object_size); 3555 memcpy(s, static_cache, kmem_cache->object_size);
3591 3556
@@ -3595,19 +3560,16 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3595 * IPIs around. 3560 * IPIs around.
3596 */ 3561 */
3597 __flush_cpu_slab(s, smp_processor_id()); 3562 __flush_cpu_slab(s, smp_processor_id());
3598 for_each_node_state(node, N_NORMAL_MEMORY) { 3563 for_each_kmem_cache_node(s, node, n) {
3599 struct kmem_cache_node *n = get_node(s, node);
3600 struct page *p; 3564 struct page *p;
3601 3565
3602 if (n) { 3566 list_for_each_entry(p, &n->partial, lru)
3603 list_for_each_entry(p, &n->partial, lru) 3567 p->slab_cache = s;
3604 p->slab_cache = s;
3605 3568
3606#ifdef CONFIG_SLUB_DEBUG 3569#ifdef CONFIG_SLUB_DEBUG
3607 list_for_each_entry(p, &n->full, lru) 3570 list_for_each_entry(p, &n->full, lru)
3608 p->slab_cache = s; 3571 p->slab_cache = s;
3609#endif 3572#endif
3610 }
3611 } 3573 }
3612 list_add(&s->list, &slab_caches); 3574 list_add(&s->list, &slab_caches);
3613 return s; 3575 return s;
@@ -3960,16 +3922,14 @@ static long validate_slab_cache(struct kmem_cache *s)
3960 unsigned long count = 0; 3922 unsigned long count = 0;
3961 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 3923 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3962 sizeof(unsigned long), GFP_KERNEL); 3924 sizeof(unsigned long), GFP_KERNEL);
3925 struct kmem_cache_node *n;
3963 3926
3964 if (!map) 3927 if (!map)
3965 return -ENOMEM; 3928 return -ENOMEM;
3966 3929
3967 flush_all(s); 3930 flush_all(s);
3968 for_each_node_state(node, N_NORMAL_MEMORY) { 3931 for_each_kmem_cache_node(s, node, n)
3969 struct kmem_cache_node *n = get_node(s, node);
3970
3971 count += validate_slab_node(s, n, map); 3932 count += validate_slab_node(s, n, map);
3972 }
3973 kfree(map); 3933 kfree(map);
3974 return count; 3934 return count;
3975} 3935}
@@ -4123,6 +4083,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
4123 int node; 4083 int node;
4124 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4084 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
4125 sizeof(unsigned long), GFP_KERNEL); 4085 sizeof(unsigned long), GFP_KERNEL);
4086 struct kmem_cache_node *n;
4126 4087
4127 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 4088 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
4128 GFP_TEMPORARY)) { 4089 GFP_TEMPORARY)) {
@@ -4132,8 +4093,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
4132 /* Push back cpu slabs */ 4093 /* Push back cpu slabs */
4133 flush_all(s); 4094 flush_all(s);
4134 4095
4135 for_each_node_state(node, N_NORMAL_MEMORY) { 4096 for_each_kmem_cache_node(s, node, n) {
4136 struct kmem_cache_node *n = get_node(s, node);
4137 unsigned long flags; 4097 unsigned long flags;
4138 struct page *page; 4098 struct page *page;
4139 4099
@@ -4205,7 +4165,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
4205#endif 4165#endif
4206 4166
4207#ifdef SLUB_RESILIENCY_TEST 4167#ifdef SLUB_RESILIENCY_TEST
4208static void resiliency_test(void) 4168static void __init resiliency_test(void)
4209{ 4169{
4210 u8 *p; 4170 u8 *p;
4211 4171
@@ -4332,8 +4292,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4332 get_online_mems(); 4292 get_online_mems();
4333#ifdef CONFIG_SLUB_DEBUG 4293#ifdef CONFIG_SLUB_DEBUG
4334 if (flags & SO_ALL) { 4294 if (flags & SO_ALL) {
4335 for_each_node_state(node, N_NORMAL_MEMORY) { 4295 struct kmem_cache_node *n;
4336 struct kmem_cache_node *n = get_node(s, node); 4296
4297 for_each_kmem_cache_node(s, node, n) {
4337 4298
4338 if (flags & SO_TOTAL) 4299 if (flags & SO_TOTAL)
4339 x = atomic_long_read(&n->total_objects); 4300 x = atomic_long_read(&n->total_objects);
@@ -4349,9 +4310,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4349 } else 4310 } else
4350#endif 4311#endif
4351 if (flags & SO_PARTIAL) { 4312 if (flags & SO_PARTIAL) {
4352 for_each_node_state(node, N_NORMAL_MEMORY) { 4313 struct kmem_cache_node *n;
4353 struct kmem_cache_node *n = get_node(s, node);
4354 4314
4315 for_each_kmem_cache_node(s, node, n) {
4355 if (flags & SO_TOTAL) 4316 if (flags & SO_TOTAL)
4356 x = count_partial(n, count_total); 4317 x = count_partial(n, count_total);
4357 else if (flags & SO_OBJECTS) 4318 else if (flags & SO_OBJECTS)
@@ -4364,7 +4325,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4364 } 4325 }
4365 x = sprintf(buf, "%lu", total); 4326 x = sprintf(buf, "%lu", total);
4366#ifdef CONFIG_NUMA 4327#ifdef CONFIG_NUMA
4367 for_each_node_state(node, N_NORMAL_MEMORY) 4328 for (node = 0; node < nr_node_ids; node++)
4368 if (nodes[node]) 4329 if (nodes[node])
4369 x += sprintf(buf + x, " N%d=%lu", 4330 x += sprintf(buf + x, " N%d=%lu",
4370 node, nodes[node]); 4331 node, nodes[node]);
@@ -4378,16 +4339,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4378static int any_slab_objects(struct kmem_cache *s) 4339static int any_slab_objects(struct kmem_cache *s)
4379{ 4340{
4380 int node; 4341 int node;
4342 struct kmem_cache_node *n;
4381 4343
4382 for_each_online_node(node) { 4344 for_each_kmem_cache_node(s, node, n)
4383 struct kmem_cache_node *n = get_node(s, node);
4384
4385 if (!n)
4386 continue;
4387
4388 if (atomic_long_read(&n->total_objects)) 4345 if (atomic_long_read(&n->total_objects))
4389 return 1; 4346 return 1;
4390 } 4347
4391 return 0; 4348 return 0;
4392} 4349}
4393#endif 4350#endif
@@ -4509,7 +4466,7 @@ SLAB_ATTR_RO(ctor);
4509 4466
4510static ssize_t aliases_show(struct kmem_cache *s, char *buf) 4467static ssize_t aliases_show(struct kmem_cache *s, char *buf)
4511{ 4468{
4512 return sprintf(buf, "%d\n", s->refcount - 1); 4469 return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
4513} 4470}
4514SLAB_ATTR_RO(aliases); 4471SLAB_ATTR_RO(aliases);
4515 4472
@@ -5171,12 +5128,6 @@ static char *create_unique_id(struct kmem_cache *s)
5171 *p++ = '-'; 5128 *p++ = '-';
5172 p += sprintf(p, "%07d", s->size); 5129 p += sprintf(p, "%07d", s->size);
5173 5130
5174#ifdef CONFIG_MEMCG_KMEM
5175 if (!is_root_cache(s))
5176 p += sprintf(p, "-%08d",
5177 memcg_cache_id(s->memcg_params->memcg));
5178#endif
5179
5180 BUG_ON(p > name + ID_STR_LENGTH - 1); 5131 BUG_ON(p > name + ID_STR_LENGTH - 1);
5181 return name; 5132 return name;
5182} 5133}
@@ -5342,13 +5293,9 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5342 unsigned long nr_objs = 0; 5293 unsigned long nr_objs = 0;
5343 unsigned long nr_free = 0; 5294 unsigned long nr_free = 0;
5344 int node; 5295 int node;
5296 struct kmem_cache_node *n;
5345 5297
5346 for_each_online_node(node) { 5298 for_each_kmem_cache_node(s, node, n) {
5347 struct kmem_cache_node *n = get_node(s, node);
5348
5349 if (!n)
5350 continue;
5351
5352 nr_slabs += node_nr_slabs(n); 5299 nr_slabs += node_nr_slabs(n);
5353 nr_objs += node_nr_objs(n); 5300 nr_objs += node_nr_objs(n);
5354 nr_free += count_partial(n, count_free); 5301 nr_free += count_partial(n, count_free);
diff --git a/mm/swap.c b/mm/swap.c
index 9e8e3472248b..c789d01c9ec3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -501,7 +501,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
501 SetPageActive(page); 501 SetPageActive(page);
502 lru += LRU_ACTIVE; 502 lru += LRU_ACTIVE;
503 add_page_to_lru_list(page, lruvec, lru); 503 add_page_to_lru_list(page, lruvec, lru);
504 trace_mm_lru_activate(page, page_to_pfn(page)); 504 trace_mm_lru_activate(page);
505 505
506 __count_vm_event(PGACTIVATE); 506 __count_vm_event(PGACTIVATE);
507 update_page_reclaim_stat(lruvec, file, 1); 507 update_page_reclaim_stat(lruvec, file, 1);
@@ -589,6 +589,9 @@ static void __lru_cache_activate_page(struct page *page)
589 * inactive,unreferenced -> inactive,referenced 589 * inactive,unreferenced -> inactive,referenced
590 * inactive,referenced -> active,unreferenced 590 * inactive,referenced -> active,unreferenced
591 * active,unreferenced -> active,referenced 591 * active,unreferenced -> active,referenced
592 *
593 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
594 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
592 */ 595 */
593void mark_page_accessed(struct page *page) 596void mark_page_accessed(struct page *page)
594{ 597{
@@ -614,17 +617,6 @@ void mark_page_accessed(struct page *page)
614} 617}
615EXPORT_SYMBOL(mark_page_accessed); 618EXPORT_SYMBOL(mark_page_accessed);
616 619
617/*
618 * Used to mark_page_accessed(page) that is not visible yet and when it is
619 * still safe to use non-atomic ops
620 */
621void init_page_accessed(struct page *page)
622{
623 if (!PageReferenced(page))
624 __SetPageReferenced(page);
625}
626EXPORT_SYMBOL(init_page_accessed);
627
628static void __lru_cache_add(struct page *page) 620static void __lru_cache_add(struct page *page)
629{ 621{
630 struct pagevec *pvec = &get_cpu_var(lru_add_pvec); 622 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
@@ -996,7 +988,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
996 SetPageLRU(page); 988 SetPageLRU(page);
997 add_page_to_lru_list(page, lruvec, lru); 989 add_page_to_lru_list(page, lruvec, lru);
998 update_page_reclaim_stat(lruvec, file, active); 990 update_page_reclaim_stat(lruvec, file, active);
999 trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); 991 trace_mm_lru_insertion(page, lru);
1000} 992}
1001 993
1002/* 994/*
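
The mm/swap.c changes above drop the pfn and flags arguments from the mm_lru_activate and mm_lru_insertion tracepoints (the trace events can derive them from the page, presumably so the work is only done when the tracepoint actually fires) and retire init_page_accessed(): the comment added to mark_page_accessed() tells callers that a page not yet visible to any other context may simply use the non-atomic setter. A sketch of what such a caller does now (the helper name is invented for illustration):

	/* For a freshly allocated page that nothing else can see yet,
	 * non-atomic bit ops are safe, so the removed init_page_accessed()
	 * reduces to: */
	static void mark_new_page_referenced(struct page *page)
	{
		if (!PageReferenced(page))
			__SetPageReferenced(page);
	}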
diff --git a/mm/util.c b/mm/util.c
index d5ea733c5082..7b6608df2ee8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -16,9 +16,6 @@
16 16
17#include "internal.h" 17#include "internal.h"
18 18
19#define CREATE_TRACE_POINTS
20#include <trace/events/kmem.h>
21
22/** 19/**
23 * kstrdup - allocate space for and copy an existing string 20 * kstrdup - allocate space for and copy an existing string
24 * @s: the string to duplicate 21 * @s: the string to duplicate
@@ -112,97 +109,6 @@ void *memdup_user(const void __user *src, size_t len)
112} 109}
113EXPORT_SYMBOL(memdup_user); 110EXPORT_SYMBOL(memdup_user);
114 111
115static __always_inline void *__do_krealloc(const void *p, size_t new_size,
116 gfp_t flags)
117{
118 void *ret;
119 size_t ks = 0;
120
121 if (p)
122 ks = ksize(p);
123
124 if (ks >= new_size)
125 return (void *)p;
126
127 ret = kmalloc_track_caller(new_size, flags);
128 if (ret && p)
129 memcpy(ret, p, ks);
130
131 return ret;
132}
133
134/**
135 * __krealloc - like krealloc() but don't free @p.
136 * @p: object to reallocate memory for.
137 * @new_size: how many bytes of memory are required.
138 * @flags: the type of memory to allocate.
139 *
140 * This function is like krealloc() except it never frees the originally
141 * allocated buffer. Use this if you don't want to free the buffer immediately
142 * like, for example, with RCU.
143 */
144void *__krealloc(const void *p, size_t new_size, gfp_t flags)
145{
146 if (unlikely(!new_size))
147 return ZERO_SIZE_PTR;
148
149 return __do_krealloc(p, new_size, flags);
150
151}
152EXPORT_SYMBOL(__krealloc);
153
154/**
155 * krealloc - reallocate memory. The contents will remain unchanged.
156 * @p: object to reallocate memory for.
157 * @new_size: how many bytes of memory are required.
158 * @flags: the type of memory to allocate.
159 *
160 * The contents of the object pointed to are preserved up to the
161 * lesser of the new and old sizes. If @p is %NULL, krealloc()
162 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
163 * %NULL pointer, the object pointed to is freed.
164 */
165void *krealloc(const void *p, size_t new_size, gfp_t flags)
166{
167 void *ret;
168
169 if (unlikely(!new_size)) {
170 kfree(p);
171 return ZERO_SIZE_PTR;
172 }
173
174 ret = __do_krealloc(p, new_size, flags);
175 if (ret && p != ret)
176 kfree(p);
177
178 return ret;
179}
180EXPORT_SYMBOL(krealloc);
181
182/**
183 * kzfree - like kfree but zero memory
184 * @p: object to free memory of
185 *
186 * The memory of the object @p points to is zeroed before freed.
187 * If @p is %NULL, kzfree() does nothing.
188 *
189 * Note: this function zeroes the whole allocated buffer which can be a good
190 * deal bigger than the requested buffer size passed to kmalloc(). So be
191 * careful when using this function in performance sensitive code.
192 */
193void kzfree(const void *p)
194{
195 size_t ks;
196 void *mem = (void *)p;
197
198 if (unlikely(ZERO_OR_NULL_PTR(mem)))
199 return;
200 ks = ksize(mem);
201 memset(mem, 0, ks);
202 kfree(mem);
203}
204EXPORT_SYMBOL(kzfree);
205
206/* 112/*
207 * strndup_user - duplicate an existing string from user space 113 * strndup_user - duplicate an existing string from user space
208 * @s: The string to duplicate 114 * @s: The string to duplicate
@@ -504,11 +410,3 @@ out_mm:
504out: 410out:
505 return res; 411 return res;
506} 412}
507
508/* Tracepoints definitions. */
509EXPORT_TRACEPOINT_SYMBOL(kmalloc);
510EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
511EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
512EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
513EXPORT_TRACEPOINT_SYMBOL(kfree);
514EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f64632b67196..2b0aa5486092 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1270,19 +1270,15 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
1270} 1270}
1271EXPORT_SYMBOL_GPL(unmap_kernel_range); 1271EXPORT_SYMBOL_GPL(unmap_kernel_range);
1272 1272
1273int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 1273int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
1274{ 1274{
1275 unsigned long addr = (unsigned long)area->addr; 1275 unsigned long addr = (unsigned long)area->addr;
1276 unsigned long end = addr + get_vm_area_size(area); 1276 unsigned long end = addr + get_vm_area_size(area);
1277 int err; 1277 int err;
1278 1278
1279 err = vmap_page_range(addr, end, prot, *pages); 1279 err = vmap_page_range(addr, end, prot, pages);
1280 if (err > 0) {
1281 *pages += err;
1282 err = 0;
1283 }
1284 1280
1285 return err; 1281 return err > 0 ? 0 : err;
1286} 1282}
1287EXPORT_SYMBOL_GPL(map_vm_area); 1283EXPORT_SYMBOL_GPL(map_vm_area);
1288 1284
@@ -1548,7 +1544,7 @@ void *vmap(struct page **pages, unsigned int count,
1548 if (!area) 1544 if (!area)
1549 return NULL; 1545 return NULL;
1550 1546
1551 if (map_vm_area(area, prot, &pages)) { 1547 if (map_vm_area(area, prot, pages)) {
1552 vunmap(area->addr); 1548 vunmap(area->addr);
1553 return NULL; 1549 return NULL;
1554 } 1550 }
@@ -1566,7 +1562,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1566 const int order = 0; 1562 const int order = 0;
1567 struct page **pages; 1563 struct page **pages;
1568 unsigned int nr_pages, array_size, i; 1564 unsigned int nr_pages, array_size, i;
1569 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1565 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1566 const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
1570 1567
1571 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 1568 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
1572 array_size = (nr_pages * sizeof(struct page *)); 1569 array_size = (nr_pages * sizeof(struct page *));
@@ -1589,12 +1586,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1589 1586
1590 for (i = 0; i < area->nr_pages; i++) { 1587 for (i = 0; i < area->nr_pages; i++) {
1591 struct page *page; 1588 struct page *page;
1592 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
1593 1589
1594 if (node == NUMA_NO_NODE) 1590 if (node == NUMA_NO_NODE)
1595 page = alloc_page(tmp_mask); 1591 page = alloc_page(alloc_mask);
1596 else 1592 else
1597 page = alloc_pages_node(node, tmp_mask, order); 1593 page = alloc_pages_node(node, alloc_mask, order);
1598 1594
1599 if (unlikely(!page)) { 1595 if (unlikely(!page)) {
1600 /* Successfully allocated i pages, free them in __vunmap() */ 1596 /* Successfully allocated i pages, free them in __vunmap() */
@@ -1602,9 +1598,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1602 goto fail; 1598 goto fail;
1603 } 1599 }
1604 area->pages[i] = page; 1600 area->pages[i] = page;
1601 if (gfp_mask & __GFP_WAIT)
1602 cond_resched();
1605 } 1603 }
1606 1604
1607 if (map_vm_area(area, prot, &pages)) 1605 if (map_vm_area(area, prot, pages))
1608 goto fail; 1606 goto fail;
1609 return area->addr; 1607 return area->addr;
1610 1608
@@ -2690,14 +2688,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
2690 2688
2691 prev_end = VMALLOC_START; 2689 prev_end = VMALLOC_START;
2692 2690
2693 spin_lock(&vmap_area_lock); 2691 rcu_read_lock();
2694 2692
2695 if (list_empty(&vmap_area_list)) { 2693 if (list_empty(&vmap_area_list)) {
2696 vmi->largest_chunk = VMALLOC_TOTAL; 2694 vmi->largest_chunk = VMALLOC_TOTAL;
2697 goto out; 2695 goto out;
2698 } 2696 }
2699 2697
2700 list_for_each_entry(va, &vmap_area_list, list) { 2698 list_for_each_entry_rcu(va, &vmap_area_list, list) {
2701 unsigned long addr = va->va_start; 2699 unsigned long addr = va->va_start;
2702 2700
2703 /* 2701 /*
@@ -2724,7 +2722,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
2724 vmi->largest_chunk = VMALLOC_END - prev_end; 2722 vmi->largest_chunk = VMALLOC_END - prev_end;
2725 2723
2726out: 2724out:
2727 spin_unlock(&vmap_area_lock); 2725 rcu_read_unlock();
2728} 2726}
2729#endif 2727#endif
2730 2728
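
Three independent cleanups land in mm/vmalloc.c above: map_vm_area() now takes the page array directly (callers no longer rely on the old behaviour of advancing the struct page *** argument), __vmalloc_area_node() hoists the __GFP_NOWARN mask out of the allocation loop and adds a cond_resched() for __GFP_WAIT allocations, and get_vmalloc_info() walks vmap_area_list under RCU instead of taking vmap_area_lock. A sketch of the new map_vm_area() calling convention, modelled on the vmap() hunk above (the wrapper name is illustrative):

	static void *map_pages_example(struct page **pages, unsigned int count,
				       pgprot_t prot)
	{
		struct vm_struct *area = get_vm_area(count * PAGE_SIZE, VM_MAP);

		if (!area)
			return NULL;

		if (map_vm_area(area, prot, pages)) {	/* previously: &pages */
			vunmap(area->addr);
			return NULL;
		}
		return area->addr;
	}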
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0f16ffe8eb67..d2f65c856350 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -59,35 +59,20 @@
59#include <trace/events/vmscan.h> 59#include <trace/events/vmscan.h>
60 60
61struct scan_control { 61struct scan_control {
62 /* Incremented by the number of inactive pages that were scanned */
63 unsigned long nr_scanned;
64
65 /* Number of pages freed so far during a call to shrink_zones() */
66 unsigned long nr_reclaimed;
67
68 /* How many pages shrink_list() should reclaim */ 62 /* How many pages shrink_list() should reclaim */
69 unsigned long nr_to_reclaim; 63 unsigned long nr_to_reclaim;
70 64
71 unsigned long hibernation_mode;
72
73 /* This context's GFP mask */ 65 /* This context's GFP mask */
74 gfp_t gfp_mask; 66 gfp_t gfp_mask;
75 67
76 int may_writepage; 68 /* Allocation order */
77
78 /* Can mapped pages be reclaimed? */
79 int may_unmap;
80
81 /* Can pages be swapped as part of reclaim? */
82 int may_swap;
83
84 int order; 69 int order;
85 70
86 /* Scan (total_size >> priority) pages at once */ 71 /*
87 int priority; 72 * Nodemask of nodes allowed by the caller. If NULL, all nodes
88 73 * are scanned.
89 /* anon vs. file LRUs scanning "ratio" */ 74 */
90 int swappiness; 75 nodemask_t *nodemask;
91 76
92 /* 77 /*
93 * The memory cgroup that hit its limit and as a result is the 78 * The memory cgroup that hit its limit and as a result is the
@@ -95,11 +80,27 @@ struct scan_control {
95 */ 80 */
96 struct mem_cgroup *target_mem_cgroup; 81 struct mem_cgroup *target_mem_cgroup;
97 82
98 /* 83 /* Scan (total_size >> priority) pages at once */
99 * Nodemask of nodes allowed by the caller. If NULL, all nodes 84 int priority;
100 * are scanned. 85
101 */ 86 unsigned int may_writepage:1;
102 nodemask_t *nodemask; 87
88 /* Can mapped pages be reclaimed? */
89 unsigned int may_unmap:1;
90
91 /* Can pages be swapped as part of reclaim? */
92 unsigned int may_swap:1;
93
94 unsigned int hibernation_mode:1;
95
96 /* One of the zones is ready for compaction */
97 unsigned int compaction_ready:1;
98
99 /* Incremented by the number of inactive pages that were scanned */
100 unsigned long nr_scanned;
101
102 /* Number of pages freed so far during a call to shrink_zones() */
103 unsigned long nr_reclaimed;
103}; 104};
104 105
 105#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 106#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
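
The scan_control reshuffle above groups the caller-set tunables at the top, turns the boolean knobs into one-bit bitfields, adds a compaction_ready flag, and drops swappiness from the struct entirely (it becomes a parameter of shrink_lruvec()/get_scan_count(), see below). Since every caller uses designated initializers, any field left unnamed is zero-initialized, which is why the explicit .nr_scanned = 0, .order = 0, .target_mem_cgroup = NULL and .nodemask = NULL lines vanish from the initializers later in this file. A representative initializer under the new layout (the values are illustrative):

	struct scan_control sc = {
		.nr_to_reclaim	= SWAP_CLUSTER_MAX,
		.gfp_mask	= GFP_KERNEL,
		.order		= 0,
		.priority	= DEF_PRIORITY,
		.may_writepage	= 1,
		.may_unmap	= 1,
		.may_swap	= 1,
		/* nr_scanned, nr_reclaimed, compaction_ready, target_mem_cgroup
		 * and nodemask are implicitly zero/NULL */
	};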
@@ -136,7 +137,11 @@ struct scan_control {
136 * From 0 .. 100. Higher means more swappy. 137 * From 0 .. 100. Higher means more swappy.
137 */ 138 */
138int vm_swappiness = 60; 139int vm_swappiness = 60;
139unsigned long vm_total_pages; /* The total number of pages which the VM controls */ 140/*
141 * The total number of pages which are beyond the high watermark within all
142 * zones.
143 */
144unsigned long vm_total_pages;
140 145
141static LIST_HEAD(shrinker_list); 146static LIST_HEAD(shrinker_list);
142static DECLARE_RWSEM(shrinker_rwsem); 147static DECLARE_RWSEM(shrinker_rwsem);
@@ -169,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
169 174
170bool zone_reclaimable(struct zone *zone) 175bool zone_reclaimable(struct zone *zone)
171{ 176{
172 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 177 return zone_page_state(zone, NR_PAGES_SCANNED) <
178 zone_reclaimable_pages(zone) * 6;
173} 179}
174 180
175static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 181static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -1503,7 +1509,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1503 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1509 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1504 1510
1505 if (global_reclaim(sc)) { 1511 if (global_reclaim(sc)) {
1506 zone->pages_scanned += nr_scanned; 1512 __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
1507 if (current_is_kswapd()) 1513 if (current_is_kswapd())
1508 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); 1514 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1509 else 1515 else
@@ -1693,7 +1699,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1693 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, 1699 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1694 &nr_scanned, sc, isolate_mode, lru); 1700 &nr_scanned, sc, isolate_mode, lru);
1695 if (global_reclaim(sc)) 1701 if (global_reclaim(sc))
1696 zone->pages_scanned += nr_scanned; 1702 __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
1697 1703
1698 reclaim_stat->recent_scanned[file] += nr_taken; 1704 reclaim_stat->recent_scanned[file] += nr_taken;
1699 1705
@@ -1750,7 +1756,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1750 * Count referenced pages from currently used mappings as rotated, 1756 * Count referenced pages from currently used mappings as rotated,
1751 * even though only some of them are actually re-activated. This 1757 * even though only some of them are actually re-activated. This
1752 * helps balance scan pressure between file and anonymous pages in 1758 * helps balance scan pressure between file and anonymous pages in
1753 * get_scan_ratio. 1759 * get_scan_count.
1754 */ 1760 */
1755 reclaim_stat->recent_rotated[file] += nr_rotated; 1761 reclaim_stat->recent_rotated[file] += nr_rotated;
1756 1762
@@ -1865,8 +1871,8 @@ enum scan_balance {
1865 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan 1871 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1866 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan 1872 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1867 */ 1873 */
1868static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1874static void get_scan_count(struct lruvec *lruvec, int swappiness,
1869 unsigned long *nr) 1875 struct scan_control *sc, unsigned long *nr)
1870{ 1876{
1871 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1877 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1872 u64 fraction[2]; 1878 u64 fraction[2];
@@ -1909,7 +1915,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1909 * using the memory controller's swap limit feature would be 1915 * using the memory controller's swap limit feature would be
1910 * too expensive. 1916 * too expensive.
1911 */ 1917 */
1912 if (!global_reclaim(sc) && !sc->swappiness) { 1918 if (!global_reclaim(sc) && !swappiness) {
1913 scan_balance = SCAN_FILE; 1919 scan_balance = SCAN_FILE;
1914 goto out; 1920 goto out;
1915 } 1921 }
@@ -1919,16 +1925,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1919 * system is close to OOM, scan both anon and file equally 1925 * system is close to OOM, scan both anon and file equally
1920 * (unless the swappiness setting disagrees with swapping). 1926 * (unless the swappiness setting disagrees with swapping).
1921 */ 1927 */
1922 if (!sc->priority && sc->swappiness) { 1928 if (!sc->priority && swappiness) {
1923 scan_balance = SCAN_EQUAL; 1929 scan_balance = SCAN_EQUAL;
1924 goto out; 1930 goto out;
1925 } 1931 }
1926 1932
1927 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1928 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1929 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1930 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1931
1932 /* 1933 /*
1933 * Prevent the reclaimer from falling into the cache trap: as 1934 * Prevent the reclaimer from falling into the cache trap: as
1934 * cache pages start out inactive, every cache fault will tip 1935 * cache pages start out inactive, every cache fault will tip
@@ -1939,9 +1940,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1939 * anon pages. Try to detect this based on file LRU size. 1940 * anon pages. Try to detect this based on file LRU size.
1940 */ 1941 */
1941 if (global_reclaim(sc)) { 1942 if (global_reclaim(sc)) {
1942 unsigned long free = zone_page_state(zone, NR_FREE_PAGES); 1943 unsigned long zonefile;
1944 unsigned long zonefree;
1943 1945
1944 if (unlikely(file + free <= high_wmark_pages(zone))) { 1946 zonefree = zone_page_state(zone, NR_FREE_PAGES);
1947 zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
1948 zone_page_state(zone, NR_INACTIVE_FILE);
1949
1950 if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
1945 scan_balance = SCAN_ANON; 1951 scan_balance = SCAN_ANON;
1946 goto out; 1952 goto out;
1947 } 1953 }
@@ -1962,7 +1968,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1962 * With swappiness at 100, anonymous and file have the same priority. 1968 * With swappiness at 100, anonymous and file have the same priority.
1963 * This scanning priority is essentially the inverse of IO cost. 1969 * This scanning priority is essentially the inverse of IO cost.
1964 */ 1970 */
1965 anon_prio = sc->swappiness; 1971 anon_prio = swappiness;
1966 file_prio = 200 - anon_prio; 1972 file_prio = 200 - anon_prio;
1967 1973
1968 /* 1974 /*
@@ -1976,6 +1982,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1976 * 1982 *
1977 * anon in [0], file in [1] 1983 * anon in [0], file in [1]
1978 */ 1984 */
1985
1986 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1987 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1988 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1989 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1990
1979 spin_lock_irq(&zone->lru_lock); 1991 spin_lock_irq(&zone->lru_lock);
1980 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1992 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1981 reclaim_stat->recent_scanned[0] /= 2; 1993 reclaim_stat->recent_scanned[0] /= 2;
@@ -2052,7 +2064,8 @@ out:
2052/* 2064/*
2053 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2065 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2054 */ 2066 */
2055static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 2067static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
2068 struct scan_control *sc)
2056{ 2069{
2057 unsigned long nr[NR_LRU_LISTS]; 2070 unsigned long nr[NR_LRU_LISTS];
2058 unsigned long targets[NR_LRU_LISTS]; 2071 unsigned long targets[NR_LRU_LISTS];
@@ -2063,7 +2076,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2063 struct blk_plug plug; 2076 struct blk_plug plug;
2064 bool scan_adjusted; 2077 bool scan_adjusted;
2065 2078
2066 get_scan_count(lruvec, sc, nr); 2079 get_scan_count(lruvec, swappiness, sc, nr);
2067 2080
2068 /* Record the original scan target for proportional adjustments later */ 2081 /* Record the original scan target for proportional adjustments later */
2069 memcpy(targets, nr, sizeof(nr)); 2082 memcpy(targets, nr, sizeof(nr));
@@ -2241,9 +2254,10 @@ static inline bool should_continue_reclaim(struct zone *zone,
2241 } 2254 }
2242} 2255}
2243 2256
2244static void shrink_zone(struct zone *zone, struct scan_control *sc) 2257static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2245{ 2258{
2246 unsigned long nr_reclaimed, nr_scanned; 2259 unsigned long nr_reclaimed, nr_scanned;
2260 bool reclaimable = false;
2247 2261
2248 do { 2262 do {
2249 struct mem_cgroup *root = sc->target_mem_cgroup; 2263 struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2259,11 +2273,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
2259 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2273 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2260 do { 2274 do {
2261 struct lruvec *lruvec; 2275 struct lruvec *lruvec;
2276 int swappiness;
2262 2277
2263 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2278 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2279 swappiness = mem_cgroup_swappiness(memcg);
2264 2280
2265 sc->swappiness = mem_cgroup_swappiness(memcg); 2281 shrink_lruvec(lruvec, swappiness, sc);
2266 shrink_lruvec(lruvec, sc);
2267 2282
2268 /* 2283 /*
2269 * Direct reclaim and kswapd have to scan all memory 2284 * Direct reclaim and kswapd have to scan all memory
@@ -2287,20 +2302,21 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
2287 sc->nr_scanned - nr_scanned, 2302 sc->nr_scanned - nr_scanned,
2288 sc->nr_reclaimed - nr_reclaimed); 2303 sc->nr_reclaimed - nr_reclaimed);
2289 2304
2305 if (sc->nr_reclaimed - nr_reclaimed)
2306 reclaimable = true;
2307
2290 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 2308 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2291 sc->nr_scanned - nr_scanned, sc)); 2309 sc->nr_scanned - nr_scanned, sc));
2310
2311 return reclaimable;
2292} 2312}
2293 2313
2294/* Returns true if compaction should go ahead for a high-order request */ 2314/* Returns true if compaction should go ahead for a high-order request */
2295static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) 2315static inline bool compaction_ready(struct zone *zone, int order)
2296{ 2316{
2297 unsigned long balance_gap, watermark; 2317 unsigned long balance_gap, watermark;
2298 bool watermark_ok; 2318 bool watermark_ok;
2299 2319
2300 /* Do not consider compaction for orders reclaim is meant to satisfy */
2301 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2302 return false;
2303
2304 /* 2320 /*
2305 * Compaction takes time to run and there are potentially other 2321 * Compaction takes time to run and there are potentially other
2306 * callers using the pages just freed. Continue reclaiming until 2322 * callers using the pages just freed. Continue reclaiming until
@@ -2309,18 +2325,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2309 */ 2325 */
2310 balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( 2326 balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
2311 zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); 2327 zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
2312 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); 2328 watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
2313 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); 2329 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2314 2330
2315 /* 2331 /*
2316 * If compaction is deferred, reclaim up to a point where 2332 * If compaction is deferred, reclaim up to a point where
2317 * compaction will have a chance of success when re-enabled 2333 * compaction will have a chance of success when re-enabled
2318 */ 2334 */
2319 if (compaction_deferred(zone, sc->order)) 2335 if (compaction_deferred(zone, order))
2320 return watermark_ok; 2336 return watermark_ok;
2321 2337
2322 /* If compaction is not ready to start, keep reclaiming */ 2338 /* If compaction is not ready to start, keep reclaiming */
2323 if (!compaction_suitable(zone, sc->order)) 2339 if (!compaction_suitable(zone, order))
2324 return false; 2340 return false;
2325 2341
2326 return watermark_ok; 2342 return watermark_ok;
@@ -2342,10 +2358,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2342 * If a zone is deemed to be full of pinned pages then just give it a light 2358 * If a zone is deemed to be full of pinned pages then just give it a light
2343 * scan then give up on it. 2359 * scan then give up on it.
2344 * 2360 *
2345 * This function returns true if a zone is being reclaimed for a costly 2361 * Returns true if a zone was reclaimable.
2346 * high-order allocation and compaction is ready to begin. This indicates to
2347 * the caller that it should consider retrying the allocation instead of
2348 * further reclaim.
2349 */ 2362 */
2350static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) 2363static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2351{ 2364{
@@ -2354,13 +2367,13 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2354 unsigned long nr_soft_reclaimed; 2367 unsigned long nr_soft_reclaimed;
2355 unsigned long nr_soft_scanned; 2368 unsigned long nr_soft_scanned;
2356 unsigned long lru_pages = 0; 2369 unsigned long lru_pages = 0;
2357 bool aborted_reclaim = false;
2358 struct reclaim_state *reclaim_state = current->reclaim_state; 2370 struct reclaim_state *reclaim_state = current->reclaim_state;
2359 gfp_t orig_mask; 2371 gfp_t orig_mask;
2360 struct shrink_control shrink = { 2372 struct shrink_control shrink = {
2361 .gfp_mask = sc->gfp_mask, 2373 .gfp_mask = sc->gfp_mask,
2362 }; 2374 };
2363 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); 2375 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
2376 bool reclaimable = false;
2364 2377
2365 /* 2378 /*
2366 * If the number of buffer_heads in the machine exceeds the maximum 2379 * If the number of buffer_heads in the machine exceeds the maximum
@@ -2391,22 +2404,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2391 if (sc->priority != DEF_PRIORITY && 2404 if (sc->priority != DEF_PRIORITY &&
2392 !zone_reclaimable(zone)) 2405 !zone_reclaimable(zone))
2393 continue; /* Let kswapd poll it */ 2406 continue; /* Let kswapd poll it */
2394 if (IS_ENABLED(CONFIG_COMPACTION)) { 2407
2395 /* 2408 /*
2396 * If we already have plenty of memory free for 2409 * If we already have plenty of memory free for
2397 * compaction in this zone, don't free any more. 2410 * compaction in this zone, don't free any more.
2398 * Even though compaction is invoked for any 2411 * Even though compaction is invoked for any
2399 * non-zero order, only frequent costly order 2412 * non-zero order, only frequent costly order
2400 * reclamation is disruptive enough to become a 2413 * reclamation is disruptive enough to become a
2401 * noticeable problem, like transparent huge 2414 * noticeable problem, like transparent huge
2402 * page allocations. 2415 * page allocations.
2403 */ 2416 */
2404 if ((zonelist_zone_idx(z) <= requested_highidx) 2417 if (IS_ENABLED(CONFIG_COMPACTION) &&
2405 && compaction_ready(zone, sc)) { 2418 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2406 aborted_reclaim = true; 2419 zonelist_zone_idx(z) <= requested_highidx &&
2407 continue; 2420 compaction_ready(zone, sc->order)) {
2408 } 2421 sc->compaction_ready = true;
2422 continue;
2409 } 2423 }
2424
2410 /* 2425 /*
2411 * This steals pages from memory cgroups over softlimit 2426 * This steals pages from memory cgroups over softlimit
2412 * and returns the number of reclaimed pages and 2427 * and returns the number of reclaimed pages and
@@ -2419,10 +2434,17 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2419 &nr_soft_scanned); 2434 &nr_soft_scanned);
2420 sc->nr_reclaimed += nr_soft_reclaimed; 2435 sc->nr_reclaimed += nr_soft_reclaimed;
2421 sc->nr_scanned += nr_soft_scanned; 2436 sc->nr_scanned += nr_soft_scanned;
2437 if (nr_soft_reclaimed)
2438 reclaimable = true;
2422 /* need some check for avoid more shrink_zone() */ 2439 /* need some check for avoid more shrink_zone() */
2423 } 2440 }
2424 2441
2425 shrink_zone(zone, sc); 2442 if (shrink_zone(zone, sc))
2443 reclaimable = true;
2444
2445 if (global_reclaim(sc) &&
2446 !reclaimable && zone_reclaimable(zone))
2447 reclaimable = true;
2426 } 2448 }
2427 2449
2428 /* 2450 /*
@@ -2445,27 +2467,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2445 */ 2467 */
2446 sc->gfp_mask = orig_mask; 2468 sc->gfp_mask = orig_mask;
2447 2469
2448 return aborted_reclaim; 2470 return reclaimable;
2449}
2450
2451/* All zones in zonelist are unreclaimable? */
2452static bool all_unreclaimable(struct zonelist *zonelist,
2453 struct scan_control *sc)
2454{
2455 struct zoneref *z;
2456 struct zone *zone;
2457
2458 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2459 gfp_zone(sc->gfp_mask), sc->nodemask) {
2460 if (!populated_zone(zone))
2461 continue;
2462 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2463 continue;
2464 if (zone_reclaimable(zone))
2465 return false;
2466 }
2467
2468 return true;
2469} 2471}
2470 2472
2471/* 2473/*
@@ -2489,7 +2491,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2489{ 2491{
2490 unsigned long total_scanned = 0; 2492 unsigned long total_scanned = 0;
2491 unsigned long writeback_threshold; 2493 unsigned long writeback_threshold;
2492 bool aborted_reclaim; 2494 bool zones_reclaimable;
2493 2495
2494 delayacct_freepages_start(); 2496 delayacct_freepages_start();
2495 2497
@@ -2500,11 +2502,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2500 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, 2502 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2501 sc->priority); 2503 sc->priority);
2502 sc->nr_scanned = 0; 2504 sc->nr_scanned = 0;
2503 aborted_reclaim = shrink_zones(zonelist, sc); 2505 zones_reclaimable = shrink_zones(zonelist, sc);
2504 2506
2505 total_scanned += sc->nr_scanned; 2507 total_scanned += sc->nr_scanned;
2506 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 2508 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2507 goto out; 2509 break;
2510
2511 if (sc->compaction_ready)
2512 break;
2508 2513
2509 /* 2514 /*
2510 * If we're getting trouble reclaiming, start doing 2515 * If we're getting trouble reclaiming, start doing
@@ -2526,28 +2531,19 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2526 WB_REASON_TRY_TO_FREE_PAGES); 2531 WB_REASON_TRY_TO_FREE_PAGES);
2527 sc->may_writepage = 1; 2532 sc->may_writepage = 1;
2528 } 2533 }
2529 } while (--sc->priority >= 0 && !aborted_reclaim); 2534 } while (--sc->priority >= 0);
2530 2535
2531out:
2532 delayacct_freepages_end(); 2536 delayacct_freepages_end();
2533 2537
2534 if (sc->nr_reclaimed) 2538 if (sc->nr_reclaimed)
2535 return sc->nr_reclaimed; 2539 return sc->nr_reclaimed;
2536 2540
2537 /*
2538 * As hibernation is going on, kswapd is freezed so that it can't mark
2539 * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
2540 * check.
2541 */
2542 if (oom_killer_disabled)
2543 return 0;
2544
2545 /* Aborted reclaim to try compaction? don't OOM, then */ 2541 /* Aborted reclaim to try compaction? don't OOM, then */
2546 if (aborted_reclaim) 2542 if (sc->compaction_ready)
2547 return 1; 2543 return 1;
2548 2544
2549 /* top priority shrink_zones still had more to do? don't OOM, then */ 2545 /* Any of the zones still reclaimable? Don't OOM. */
2550 if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) 2546 if (zones_reclaimable)
2551 return 1; 2547 return 1;
2552 2548
2553 return 0; 2549 return 0;
@@ -2684,15 +2680,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2684{ 2680{
2685 unsigned long nr_reclaimed; 2681 unsigned long nr_reclaimed;
2686 struct scan_control sc = { 2682 struct scan_control sc = {
2683 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2687 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), 2684 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2685 .order = order,
2686 .nodemask = nodemask,
2687 .priority = DEF_PRIORITY,
2688 .may_writepage = !laptop_mode, 2688 .may_writepage = !laptop_mode,
2689 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2690 .may_unmap = 1, 2689 .may_unmap = 1,
2691 .may_swap = 1, 2690 .may_swap = 1,
2692 .order = order,
2693 .priority = DEF_PRIORITY,
2694 .target_mem_cgroup = NULL,
2695 .nodemask = nodemask,
2696 }; 2691 };
2697 2692
2698 /* 2693 /*
@@ -2722,17 +2717,14 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2722 unsigned long *nr_scanned) 2717 unsigned long *nr_scanned)
2723{ 2718{
2724 struct scan_control sc = { 2719 struct scan_control sc = {
2725 .nr_scanned = 0,
2726 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2720 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2721 .target_mem_cgroup = memcg,
2727 .may_writepage = !laptop_mode, 2722 .may_writepage = !laptop_mode,
2728 .may_unmap = 1, 2723 .may_unmap = 1,
2729 .may_swap = !noswap, 2724 .may_swap = !noswap,
2730 .order = 0,
2731 .priority = 0,
2732 .swappiness = mem_cgroup_swappiness(memcg),
2733 .target_mem_cgroup = memcg,
2734 }; 2725 };
2735 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2726 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2727 int swappiness = mem_cgroup_swappiness(memcg);
2736 2728
2737 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2729 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2738 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2730 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2748,7 +2740,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2748 * will pick up pages from other mem cgroup's as well. We hack 2740 * will pick up pages from other mem cgroup's as well. We hack
2749 * the priority and make it zero. 2741 * the priority and make it zero.
2750 */ 2742 */
2751 shrink_lruvec(lruvec, &sc); 2743 shrink_lruvec(lruvec, swappiness, &sc);
2752 2744
2753 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2745 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2754 2746
@@ -2764,16 +2756,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2764 unsigned long nr_reclaimed; 2756 unsigned long nr_reclaimed;
2765 int nid; 2757 int nid;
2766 struct scan_control sc = { 2758 struct scan_control sc = {
2767 .may_writepage = !laptop_mode,
2768 .may_unmap = 1,
2769 .may_swap = !noswap,
2770 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2759 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2771 .order = 0,
2772 .priority = DEF_PRIORITY,
2773 .target_mem_cgroup = memcg,
2774 .nodemask = NULL, /* we don't care the placement */
2775 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2760 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2776 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2761 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2762 .target_mem_cgroup = memcg,
2763 .priority = DEF_PRIORITY,
2764 .may_writepage = !laptop_mode,
2765 .may_unmap = 1,
2766 .may_swap = !noswap,
2777 }; 2767 };
2778 2768
2779 /* 2769 /*
@@ -3031,12 +3021,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3031 unsigned long nr_soft_scanned; 3021 unsigned long nr_soft_scanned;
3032 struct scan_control sc = { 3022 struct scan_control sc = {
3033 .gfp_mask = GFP_KERNEL, 3023 .gfp_mask = GFP_KERNEL,
3024 .order = order,
3034 .priority = DEF_PRIORITY, 3025 .priority = DEF_PRIORITY,
3026 .may_writepage = !laptop_mode,
3035 .may_unmap = 1, 3027 .may_unmap = 1,
3036 .may_swap = 1, 3028 .may_swap = 1,
3037 .may_writepage = !laptop_mode,
3038 .order = order,
3039 .target_mem_cgroup = NULL,
3040 }; 3029 };
3041 count_vm_event(PAGEOUTRUN); 3030 count_vm_event(PAGEOUTRUN);
3042 3031
@@ -3417,14 +3406,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3417{ 3406{
3418 struct reclaim_state reclaim_state; 3407 struct reclaim_state reclaim_state;
3419 struct scan_control sc = { 3408 struct scan_control sc = {
3409 .nr_to_reclaim = nr_to_reclaim,
3420 .gfp_mask = GFP_HIGHUSER_MOVABLE, 3410 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3421 .may_swap = 1, 3411 .priority = DEF_PRIORITY,
3422 .may_unmap = 1,
3423 .may_writepage = 1, 3412 .may_writepage = 1,
3424 .nr_to_reclaim = nr_to_reclaim, 3413 .may_unmap = 1,
3414 .may_swap = 1,
3425 .hibernation_mode = 1, 3415 .hibernation_mode = 1,
3426 .order = 0,
3427 .priority = DEF_PRIORITY,
3428 }; 3416 };
3429 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 3417 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3430 struct task_struct *p = current; 3418 struct task_struct *p = current;
@@ -3604,13 +3592,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3604 struct task_struct *p = current; 3592 struct task_struct *p = current;
3605 struct reclaim_state reclaim_state; 3593 struct reclaim_state reclaim_state;
3606 struct scan_control sc = { 3594 struct scan_control sc = {
3607 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3608 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3609 .may_swap = 1,
3610 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 3595 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3611 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), 3596 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3612 .order = order, 3597 .order = order,
3613 .priority = ZONE_RECLAIM_PRIORITY, 3598 .priority = ZONE_RECLAIM_PRIORITY,
3599 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3600 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3601 .may_swap = 1,
3614 }; 3602 };
3615 struct shrink_control shrink = { 3603 struct shrink_control shrink = {
3616 .gfp_mask = sc.gfp_mask, 3604 .gfp_mask = sc.gfp_mask,
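The vmscan.c hunks above are largely a consolidation of scan_control initializers: fields that were explicitly set to their default value (.nr_scanned = 0, .order = 0, .target_mem_cgroup = NULL and so on) are simply dropped, because C designated initializers zero every member that is not named. A minimal, hypothetical illustration of that rule (the struct and values below are made up for illustration, not kernel code):

#include <stdio.h>

/* Stand-in for a control structure such as scan_control. */
struct ctl {
	unsigned long nr_to_reclaim;
	int order;
	int priority;
	void *target;
	unsigned int may_swap:1;
};

int main(void)
{
	/*
	 * Members not named here (order, target) are guaranteed to be
	 * zero-initialized, so ".order = 0" or ".target = NULL" entries
	 * would be redundant, which is why the patch can delete them.
	 */
	struct ctl c = {
		.nr_to_reclaim = 32,
		.priority = 12,
		.may_swap = 1,
	};

	printf("order=%d target=%p\n", c.order, c.target);
	return 0;
}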
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b37bd49bfd55..e9ab104b956f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
200 continue; 200 continue;
201 201
202 threshold = (*calculate_pressure)(zone); 202 threshold = (*calculate_pressure)(zone);
203 for_each_possible_cpu(cpu) 203 for_each_online_cpu(cpu)
204 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 204 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
205 = threshold; 205 = threshold;
206 } 206 }
@@ -763,6 +763,7 @@ const char * const vmstat_text[] = {
763 "nr_shmem", 763 "nr_shmem",
764 "nr_dirtied", 764 "nr_dirtied",
765 "nr_written", 765 "nr_written",
766 "nr_pages_scanned",
766 767
767#ifdef CONFIG_NUMA 768#ifdef CONFIG_NUMA
768 "numa_hit", 769 "numa_hit",
@@ -1067,7 +1068,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1067 min_wmark_pages(zone), 1068 min_wmark_pages(zone),
1068 low_wmark_pages(zone), 1069 low_wmark_pages(zone),
1069 high_wmark_pages(zone), 1070 high_wmark_pages(zone),
1070 zone->pages_scanned, 1071 zone_page_state(zone, NR_PAGES_SCANNED),
1071 zone->spanned_pages, 1072 zone->spanned_pages,
1072 zone->present_pages, 1073 zone->present_pages,
1073 zone->managed_pages); 1074 zone->managed_pages);
@@ -1077,10 +1078,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1077 zone_page_state(zone, i)); 1078 zone_page_state(zone, i));
1078 1079
1079 seq_printf(m, 1080 seq_printf(m,
1080 "\n protection: (%lu", 1081 "\n protection: (%ld",
1081 zone->lowmem_reserve[0]); 1082 zone->lowmem_reserve[0]);
1082 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 1083 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1083 seq_printf(m, ", %lu", zone->lowmem_reserve[i]); 1084 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1084 seq_printf(m, 1085 seq_printf(m,
1085 ")" 1086 ")"
1086 "\n pagesets"); 1087 "\n pagesets");
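One detail in the zoneinfo hunk above: the "protection:" line switches its format from %lu to %ld, which suggests zone->lowmem_reserve[] holds signed long values after this series; printed as unsigned, a negative entry would show up as an enormous bogus number. A throwaway user-space illustration, not kernel code:

#include <stdio.h>

int main(void)
{
	long reserve = -1;	/* stand-in for a signed lowmem_reserve[] entry */

	/* What the old %lu format would effectively display on 64-bit. */
	printf("as unsigned: %lu\n", (unsigned long)reserve);	/* 18446744073709551615 */
	/* What the new %ld format displays. */
	printf("as signed:   %ld\n", reserve);			/* -1 */
	return 0;
}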
diff --git a/mm/zbud.c b/mm/zbud.c
index 01df13a7e2e1..a05790b1915e 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -51,6 +51,7 @@
51#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/spinlock.h> 52#include <linux/spinlock.h>
53#include <linux/zbud.h> 53#include <linux/zbud.h>
54#include <linux/zpool.h>
54 55
55/***************** 56/*****************
56 * Structures 57 * Structures
@@ -113,6 +114,90 @@ struct zbud_header {
113}; 114};
114 115
115/***************** 116/*****************
117 * zpool
118 ****************/
119
120#ifdef CONFIG_ZPOOL
121
122static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
123{
124 return zpool_evict(pool, handle);
125}
126
127static struct zbud_ops zbud_zpool_ops = {
128 .evict = zbud_zpool_evict
129};
130
131static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
132{
133 return zbud_create_pool(gfp, &zbud_zpool_ops);
134}
135
136static void zbud_zpool_destroy(void *pool)
137{
138 zbud_destroy_pool(pool);
139}
140
141static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
142 unsigned long *handle)
143{
144 return zbud_alloc(pool, size, gfp, handle);
145}
146static void zbud_zpool_free(void *pool, unsigned long handle)
147{
148 zbud_free(pool, handle);
149}
150
151static int zbud_zpool_shrink(void *pool, unsigned int pages,
152 unsigned int *reclaimed)
153{
154 unsigned int total = 0;
155 int ret = -EINVAL;
156
157 while (total < pages) {
158 ret = zbud_reclaim_page(pool, 8);
159 if (ret < 0)
160 break;
161 total++;
162 }
163
164 if (reclaimed)
165 *reclaimed = total;
166
167 return ret;
168}
169
170static void *zbud_zpool_map(void *pool, unsigned long handle,
171 enum zpool_mapmode mm)
172{
173 return zbud_map(pool, handle);
174}
175static void zbud_zpool_unmap(void *pool, unsigned long handle)
176{
177 zbud_unmap(pool, handle);
178}
179
180static u64 zbud_zpool_total_size(void *pool)
181{
182 return zbud_get_pool_size(pool) * PAGE_SIZE;
183}
184
185static struct zpool_driver zbud_zpool_driver = {
186 .type = "zbud",
187 .owner = THIS_MODULE,
188 .create = zbud_zpool_create,
189 .destroy = zbud_zpool_destroy,
190 .malloc = zbud_zpool_malloc,
191 .free = zbud_zpool_free,
192 .shrink = zbud_zpool_shrink,
193 .map = zbud_zpool_map,
194 .unmap = zbud_zpool_unmap,
195 .total_size = zbud_zpool_total_size,
196};
197
198#endif /* CONFIG_ZPOOL */
199
200/*****************
116 * Helpers 201 * Helpers
117*****************/ 202*****************/
118/* Just to make the code easier to read */ 203/* Just to make the code easier to read */
@@ -122,7 +207,7 @@ enum buddy {
122}; 207};
123 208
124/* Converts an allocation size in bytes to size in zbud chunks */ 209/* Converts an allocation size in bytes to size in zbud chunks */
125static int size_to_chunks(int size) 210static int size_to_chunks(size_t size)
126{ 211{
127 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; 212 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
128} 213}
@@ -247,7 +332,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate 332 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
248 * a new page. 333 * a new page.
249 */ 334 */
250int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, 335int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
251 unsigned long *handle) 336 unsigned long *handle)
252{ 337{
253 int chunks, i, freechunks; 338 int chunks, i, freechunks;
@@ -511,11 +596,20 @@ static int __init init_zbud(void)
511 /* Make sure the zbud header will fit in one chunk */ 596 /* Make sure the zbud header will fit in one chunk */
512 BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); 597 BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
513 pr_info("loaded\n"); 598 pr_info("loaded\n");
599
600#ifdef CONFIG_ZPOOL
601 zpool_register_driver(&zbud_zpool_driver);
602#endif
603
514 return 0; 604 return 0;
515} 605}
516 606
517static void __exit exit_zbud(void) 607static void __exit exit_zbud(void)
518{ 608{
609#ifdef CONFIG_ZPOOL
610 zpool_unregister_driver(&zbud_zpool_driver);
611#endif
612
519 pr_info("unloaded\n"); 613 pr_info("unloaded\n");
520} 614}
521 615
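The zbud glue above is mostly a thin adapter, but the eviction path is worth tracing: zpool_shrink() lands in zbud_zpool_shrink(), each zbud_reclaim_page() call (retried up to 8 times per page) fires zbud's evict op, and zbud_zpool_evict() forwards that through zpool_evict() to whatever evict callback the zpool user registered. A hedged sketch of the user-facing half of that contract; my_evict, my_ops and the callback body are hypothetical, the real callback in this series being zswap_writeback_entry further below:

#include <linux/zpool.h>

/*
 * Hypothetical evict callback supplied by a zpool user: write the object
 * behind @handle back to its real backing store, release it with
 * zpool_free(), and return 0 so the backend may keep reclaiming.
 */
static int my_evict(struct zpool *pool, unsigned long handle)
{
	/* ... write back the data stored at @handle ... */
	zpool_free(pool, handle);
	return 0;
}

/* Passed as the ops argument to zpool_create_pool(). */
static struct zpool_ops my_ops = {
	.evict = my_evict,
};

/* The user later triggers reclaim with, e.g., zpool_shrink(pool, 1, NULL). */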
diff --git a/mm/zpool.c b/mm/zpool.c
new file mode 100644
index 000000000000..e40612a1df00
--- /dev/null
+++ b/mm/zpool.c
@@ -0,0 +1,364 @@
1/*
2 * zpool memory storage api
3 *
4 * Copyright (C) 2014 Dan Streetman
5 *
6 * This is a common frontend for memory storage pool implementations.
7 * Typically, this is used to store compressed memory.
8 */
9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
12#include <linux/list.h>
13#include <linux/types.h>
14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/spinlock.h>
17#include <linux/module.h>
18#include <linux/zpool.h>
19
20struct zpool {
21 char *type;
22
23 struct zpool_driver *driver;
24 void *pool;
25 struct zpool_ops *ops;
26
27 struct list_head list;
28};
29
30static LIST_HEAD(drivers_head);
31static DEFINE_SPINLOCK(drivers_lock);
32
33static LIST_HEAD(pools_head);
34static DEFINE_SPINLOCK(pools_lock);
35
36/**
37 * zpool_register_driver() - register a zpool implementation.
38 * @driver: driver to register
39 */
40void zpool_register_driver(struct zpool_driver *driver)
41{
42 spin_lock(&drivers_lock);
43 atomic_set(&driver->refcount, 0);
44 list_add(&driver->list, &drivers_head);
45 spin_unlock(&drivers_lock);
46}
47EXPORT_SYMBOL(zpool_register_driver);
48
49/**
50 * zpool_unregister_driver() - unregister a zpool implementation.
51 * @driver: driver to unregister.
52 *
 53 * Module usage counting is used to prevent a driver from being
 54 * used while/after it is unloaded, so if this is called from the
 55 * module exit function it should never fail; if it is called from
 56 * anywhere other than the module exit function and returns
 57 * failure, the driver is still in use and must remain available.
58 */
59int zpool_unregister_driver(struct zpool_driver *driver)
60{
61 int ret = 0, refcount;
62
63 spin_lock(&drivers_lock);
64 refcount = atomic_read(&driver->refcount);
65 WARN_ON(refcount < 0);
66 if (refcount > 0)
67 ret = -EBUSY;
68 else
69 list_del(&driver->list);
70 spin_unlock(&drivers_lock);
71
72 return ret;
73}
74EXPORT_SYMBOL(zpool_unregister_driver);
75
76/**
77 * zpool_evict() - evict callback from a zpool implementation.
78 * @pool: pool to evict from.
79 * @handle: handle to evict.
80 *
81 * This can be used by zpool implementations to call the
 82 * evict callback from the user's zpool_ops struct.
83 */
84int zpool_evict(void *pool, unsigned long handle)
85{
86 struct zpool *zpool;
87
88 spin_lock(&pools_lock);
89 list_for_each_entry(zpool, &pools_head, list) {
90 if (zpool->pool == pool) {
91 spin_unlock(&pools_lock);
92 if (!zpool->ops || !zpool->ops->evict)
93 return -EINVAL;
94 return zpool->ops->evict(zpool, handle);
95 }
96 }
97 spin_unlock(&pools_lock);
98
99 return -ENOENT;
100}
101EXPORT_SYMBOL(zpool_evict);
102
103static struct zpool_driver *zpool_get_driver(char *type)
104{
105 struct zpool_driver *driver;
106
107 spin_lock(&drivers_lock);
108 list_for_each_entry(driver, &drivers_head, list) {
109 if (!strcmp(driver->type, type)) {
110 bool got = try_module_get(driver->owner);
111
112 if (got)
113 atomic_inc(&driver->refcount);
114 spin_unlock(&drivers_lock);
115 return got ? driver : NULL;
116 }
117 }
118
119 spin_unlock(&drivers_lock);
120 return NULL;
121}
122
123static void zpool_put_driver(struct zpool_driver *driver)
124{
125 atomic_dec(&driver->refcount);
126 module_put(driver->owner);
127}
128
129/**
130 * zpool_create_pool() - Create a new zpool
131 * @type The type of the zpool to create (e.g. zbud, zsmalloc)
132 * @gfp The GFP flags to use when allocating the pool.
133 * @ops The optional ops callback.
134 *
135 * This creates a new zpool of the specified type. The gfp flags will be
136 * used when allocating memory, if the implementation supports it. If the
137 * ops param is NULL, then the created zpool will not be shrinkable.
138 *
139 * Implementations must guarantee this to be thread-safe.
140 *
141 * Returns: New zpool on success, NULL on failure.
142 */
143struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
144{
145 struct zpool_driver *driver;
146 struct zpool *zpool;
147
148 pr_info("creating pool type %s\n", type);
149
150 driver = zpool_get_driver(type);
151
152 if (!driver) {
153 request_module(type);
154 driver = zpool_get_driver(type);
155 }
156
157 if (!driver) {
158 pr_err("no driver for type %s\n", type);
159 return NULL;
160 }
161
162 zpool = kmalloc(sizeof(*zpool), gfp);
163 if (!zpool) {
164 pr_err("couldn't create zpool - out of memory\n");
165 zpool_put_driver(driver);
166 return NULL;
167 }
168
169 zpool->type = driver->type;
170 zpool->driver = driver;
171 zpool->pool = driver->create(gfp, ops);
172 zpool->ops = ops;
173
174 if (!zpool->pool) {
175 pr_err("couldn't create %s pool\n", type);
176 zpool_put_driver(driver);
177 kfree(zpool);
178 return NULL;
179 }
180
181 pr_info("created %s pool\n", type);
182
183 spin_lock(&pools_lock);
184 list_add(&zpool->list, &pools_head);
185 spin_unlock(&pools_lock);
186
187 return zpool;
188}
189
190/**
191 * zpool_destroy_pool() - Destroy a zpool
192 * @pool The zpool to destroy.
193 *
194 * Implementations must guarantee this to be thread-safe,
195 * however only when destroying different pools. The same
196 * pool should only be destroyed once, and should not be used
197 * after it is destroyed.
198 *
199 * This destroys an existing zpool. The zpool should not be in use.
200 */
201void zpool_destroy_pool(struct zpool *zpool)
202{
203 pr_info("destroying pool type %s\n", zpool->type);
204
205 spin_lock(&pools_lock);
206 list_del(&zpool->list);
207 spin_unlock(&pools_lock);
208 zpool->driver->destroy(zpool->pool);
209 zpool_put_driver(zpool->driver);
210 kfree(zpool);
211}
212
213/**
214 * zpool_get_type() - Get the type of the zpool
215 * @pool The zpool to check
216 *
217 * This returns the type of the pool.
218 *
219 * Implementations must guarantee this to be thread-safe.
220 *
221 * Returns: The type of zpool.
222 */
223char *zpool_get_type(struct zpool *zpool)
224{
225 return zpool->type;
226}
227
228/**
229 * zpool_malloc() - Allocate memory
230 * @pool The zpool to allocate from.
231 * @size The amount of memory to allocate.
232 * @gfp The GFP flags to use when allocating memory.
233 * @handle Pointer to the handle to set
234 *
235 * This allocates the requested amount of memory from the pool.
236 * The gfp flags will be used when allocating memory, if the
237 * implementation supports it. The provided @handle will be
238 * set to the allocated object handle.
239 *
240 * Implementations must guarantee this to be thread-safe.
241 *
242 * Returns: 0 on success, negative value on error.
243 */
244int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
245 unsigned long *handle)
246{
247 return zpool->driver->malloc(zpool->pool, size, gfp, handle);
248}
249
250/**
251 * zpool_free() - Free previously allocated memory
252 * @pool The zpool that allocated the memory.
253 * @handle The handle to the memory to free.
254 *
255 * This frees previously allocated memory. This does not guarantee
 256 * that the pool will actually release memory back to the system,
 257 * only that this memory becomes available for reuse within the pool.
258 *
259 * Implementations must guarantee this to be thread-safe,
260 * however only when freeing different handles. The same
261 * handle should only be freed once, and should not be used
262 * after freeing.
263 */
264void zpool_free(struct zpool *zpool, unsigned long handle)
265{
266 zpool->driver->free(zpool->pool, handle);
267}
268
269/**
270 * zpool_shrink() - Shrink the pool size
271 * @pool The zpool to shrink.
272 * @pages The number of pages to shrink the pool.
273 * @reclaimed The number of pages successfully evicted.
274 *
275 * This attempts to shrink the actual memory size of the pool
276 * by evicting currently used handle(s). If the pool was
277 * created with no zpool_ops, or the evict call fails for any
278 * of the handles, this will fail. If non-NULL, the @reclaimed
279 * parameter will be set to the number of pages reclaimed,
280 * which may be more than the number of pages requested.
281 *
282 * Implementations must guarantee this to be thread-safe.
283 *
284 * Returns: 0 on success, negative value on error/failure.
285 */
286int zpool_shrink(struct zpool *zpool, unsigned int pages,
287 unsigned int *reclaimed)
288{
289 return zpool->driver->shrink(zpool->pool, pages, reclaimed);
290}
291
292/**
293 * zpool_map_handle() - Map a previously allocated handle into memory
294 * @pool The zpool that the handle was allocated from
295 * @handle The handle to map
296 * @mm How the memory should be mapped
297 *
298 * This maps a previously allocated handle into memory. The @mm
299 * param indicates to the implementation how the memory will be
300 * used, i.e. read-only, write-only, read-write. If the
301 * implementation does not support it, the memory will be treated
302 * as read-write.
303 *
304 * This may hold locks, disable interrupts, and/or preemption,
305 * and the zpool_unmap_handle() must be called to undo those
306 * actions. The code that uses the mapped handle should complete
 307 * its operations on the mapped handle memory quickly and unmap
308 * as soon as possible. As the implementation may use per-cpu
309 * data, multiple handles should not be mapped concurrently on
310 * any cpu.
311 *
312 * Returns: A pointer to the handle's mapped memory area.
313 */
314void *zpool_map_handle(struct zpool *zpool, unsigned long handle,
315 enum zpool_mapmode mapmode)
316{
317 return zpool->driver->map(zpool->pool, handle, mapmode);
318}
319
320/**
321 * zpool_unmap_handle() - Unmap a previously mapped handle
322 * @pool The zpool that the handle was allocated from
323 * @handle The handle to unmap
324 *
325 * This unmaps a previously mapped handle. Any locks or other
326 * actions that the implementation took in zpool_map_handle()
327 * will be undone here. The memory area returned from
328 * zpool_map_handle() should no longer be used after this.
329 */
330void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
331{
332 zpool->driver->unmap(zpool->pool, handle);
333}
334
335/**
336 * zpool_get_total_size() - The total size of the pool
337 * @pool The zpool to check
338 *
339 * This returns the total size in bytes of the pool.
340 *
341 * Returns: Total size of the zpool in bytes.
342 */
343u64 zpool_get_total_size(struct zpool *zpool)
344{
345 return zpool->driver->total_size(zpool->pool);
346}
347
348static int __init init_zpool(void)
349{
350 pr_info("loaded\n");
351 return 0;
352}
353
354static void __exit exit_zpool(void)
355{
356 pr_info("unloaded\n");
357}
358
359module_init(init_zpool);
360module_exit(exit_zpool);
361
362MODULE_LICENSE("GPL");
363MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
364MODULE_DESCRIPTION("Common API for compressed memory storage");
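Taken together, the kernel-doc above describes a short create/malloc/map/unmap/free/destroy lifecycle for a zpool user. A minimal sketch, assuming the "zbud" backend is available and with error handling trimmed (illustrative only, not code from this patch):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/printk.h>
#include <linux/string.h>
#include <linux/zpool.h>

static int zpool_demo(void)
{
	struct zpool *pool;
	unsigned long handle;
	char *buf;

	/* A NULL ops pointer means the pool cannot be shrunk via zpool_shrink(). */
	pool = zpool_create_pool("zbud", GFP_KERNEL, NULL);
	if (!pool)
		return -ENOMEM;

	if (zpool_malloc(pool, 64, GFP_KERNEL, &handle)) {
		zpool_destroy_pool(pool);
		return -ENOMEM;
	}

	/* Map write-only, fill it, and unmap promptly; the mapping may
	 * disable preemption or use per-cpu state in some backends. */
	buf = zpool_map_handle(pool, handle, ZPOOL_MM_WO);
	memset(buf, 0xaa, 64);
	zpool_unmap_handle(pool, handle);

	pr_info("%s pool now holds %llu bytes\n", zpool_get_type(pool),
		(unsigned long long)zpool_get_total_size(pool));

	zpool_free(pool, handle);
	zpool_destroy_pool(pool);
	return 0;
}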
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index fe78189624cf..4e2fc83cb394 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -92,6 +92,7 @@
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/types.h> 93#include <linux/types.h>
94#include <linux/zsmalloc.h> 94#include <linux/zsmalloc.h>
95#include <linux/zpool.h>
95 96
96/* 97/*
97 * This must be power of 2 and greater than of equal to sizeof(link_free). 98 * This must be power of 2 and greater than of equal to sizeof(link_free).
@@ -240,6 +241,81 @@ struct mapping_area {
240 enum zs_mapmode vm_mm; /* mapping mode */ 241 enum zs_mapmode vm_mm; /* mapping mode */
241}; 242};
242 243
244/* zpool driver */
245
246#ifdef CONFIG_ZPOOL
247
248static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
249{
250 return zs_create_pool(gfp);
251}
252
253static void zs_zpool_destroy(void *pool)
254{
255 zs_destroy_pool(pool);
256}
257
258static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
259 unsigned long *handle)
260{
261 *handle = zs_malloc(pool, size);
262 return *handle ? 0 : -1;
263}
264static void zs_zpool_free(void *pool, unsigned long handle)
265{
266 zs_free(pool, handle);
267}
268
269static int zs_zpool_shrink(void *pool, unsigned int pages,
270 unsigned int *reclaimed)
271{
272 return -EINVAL;
273}
274
275static void *zs_zpool_map(void *pool, unsigned long handle,
276 enum zpool_mapmode mm)
277{
278 enum zs_mapmode zs_mm;
279
280 switch (mm) {
281 case ZPOOL_MM_RO:
282 zs_mm = ZS_MM_RO;
283 break;
284 case ZPOOL_MM_WO:
285 zs_mm = ZS_MM_WO;
286 break;
287 case ZPOOL_MM_RW: /* fallthru */
288 default:
289 zs_mm = ZS_MM_RW;
290 break;
291 }
292
293 return zs_map_object(pool, handle, zs_mm);
294}
295static void zs_zpool_unmap(void *pool, unsigned long handle)
296{
297 zs_unmap_object(pool, handle);
298}
299
300static u64 zs_zpool_total_size(void *pool)
301{
302 return zs_get_total_size_bytes(pool);
303}
304
305static struct zpool_driver zs_zpool_driver = {
306 .type = "zsmalloc",
307 .owner = THIS_MODULE,
308 .create = zs_zpool_create,
309 .destroy = zs_zpool_destroy,
310 .malloc = zs_zpool_malloc,
311 .free = zs_zpool_free,
312 .shrink = zs_zpool_shrink,
313 .map = zs_zpool_map,
314 .unmap = zs_zpool_unmap,
315 .total_size = zs_zpool_total_size,
316};
317
318#endif /* CONFIG_ZPOOL */
243 319
244/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 320/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
245static DEFINE_PER_CPU(struct mapping_area, zs_map_area); 321static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
@@ -690,7 +766,7 @@ static inline void __zs_cpu_down(struct mapping_area *area)
690static inline void *__zs_map_object(struct mapping_area *area, 766static inline void *__zs_map_object(struct mapping_area *area,
691 struct page *pages[2], int off, int size) 767 struct page *pages[2], int off, int size)
692{ 768{
693 BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); 769 BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
694 area->vm_addr = area->vm->addr; 770 area->vm_addr = area->vm->addr;
695 return area->vm_addr + off; 771 return area->vm_addr + off;
696} 772}
@@ -814,6 +890,10 @@ static void zs_exit(void)
814{ 890{
815 int cpu; 891 int cpu;
816 892
893#ifdef CONFIG_ZPOOL
894 zpool_unregister_driver(&zs_zpool_driver);
895#endif
896
817 cpu_notifier_register_begin(); 897 cpu_notifier_register_begin();
818 898
819 for_each_online_cpu(cpu) 899 for_each_online_cpu(cpu)
@@ -840,6 +920,10 @@ static int zs_init(void)
840 920
841 cpu_notifier_register_done(); 921 cpu_notifier_register_done();
842 922
923#ifdef CONFIG_ZPOOL
924 zpool_register_driver(&zs_zpool_driver);
925#endif
926
843 return 0; 927 return 0;
844fail: 928fail:
845 zs_exit(); 929 zs_exit();
diff --git a/mm/zswap.c b/mm/zswap.c
index 008388fe7b0f..032c21eeab2b 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -34,7 +34,7 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/crypto.h> 35#include <linux/crypto.h>
36#include <linux/mempool.h> 36#include <linux/mempool.h>
37#include <linux/zbud.h> 37#include <linux/zpool.h>
38 38
39#include <linux/mm_types.h> 39#include <linux/mm_types.h>
40#include <linux/page-flags.h> 40#include <linux/page-flags.h>
@@ -45,8 +45,8 @@
45/********************************* 45/*********************************
46* statistics 46* statistics
47**********************************/ 47**********************************/
48/* Number of memory pages used by the compressed pool */ 48/* Total bytes used by the compressed storage */
49static u64 zswap_pool_pages; 49static u64 zswap_pool_total_size;
50/* The number of compressed pages currently stored in zswap */ 50/* The number of compressed pages currently stored in zswap */
51static atomic_t zswap_stored_pages = ATOMIC_INIT(0); 51static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
52 52
@@ -89,8 +89,13 @@ static unsigned int zswap_max_pool_percent = 20;
89module_param_named(max_pool_percent, 89module_param_named(max_pool_percent,
90 zswap_max_pool_percent, uint, 0644); 90 zswap_max_pool_percent, uint, 0644);
91 91
92/* zbud_pool is shared by all of zswap backend */ 92/* Compressed storage to use */
93static struct zbud_pool *zswap_pool; 93#define ZSWAP_ZPOOL_DEFAULT "zbud"
94static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
95module_param_named(zpool, zswap_zpool_type, charp, 0444);
96
97/* zpool is shared by all of zswap backend */
98static struct zpool *zswap_pool;
94 99
95/********************************* 100/*********************************
96* compression functions 101* compression functions
@@ -168,7 +173,7 @@ static void zswap_comp_exit(void)
168 * be held while changing the refcount. Since the lock must 173 * be held while changing the refcount. Since the lock must
169 * be held, there is no reason to also make refcount atomic. 174 * be held, there is no reason to also make refcount atomic.
170 * offset - the swap offset for the entry. Index into the red-black tree. 175 * offset - the swap offset for the entry. Index into the red-black tree.
171 * handle - zbud allocation handle that stores the compressed page data 176 * handle - zpool allocation handle that stores the compressed page data
172 * length - the length in bytes of the compressed page data. Needed during 177 * length - the length in bytes of the compressed page data. Needed during
173 * decompression 178 * decompression
174 */ 179 */
@@ -284,15 +289,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
284} 289}
285 290
286/* 291/*
 287 * Carries out the common pattern of freeing an entry's zbud allocation, 292 * Carries out the common pattern of freeing an entry's zpool allocation,
288 * freeing the entry itself, and decrementing the number of stored pages. 293 * freeing the entry itself, and decrementing the number of stored pages.
289 */ 294 */
290static void zswap_free_entry(struct zswap_entry *entry) 295static void zswap_free_entry(struct zswap_entry *entry)
291{ 296{
292 zbud_free(zswap_pool, entry->handle); 297 zpool_free(zswap_pool, entry->handle);
293 zswap_entry_cache_free(entry); 298 zswap_entry_cache_free(entry);
294 atomic_dec(&zswap_stored_pages); 299 atomic_dec(&zswap_stored_pages);
295 zswap_pool_pages = zbud_get_pool_size(zswap_pool); 300 zswap_pool_total_size = zpool_get_total_size(zswap_pool);
296} 301}
297 302
298/* caller must hold the tree lock */ 303/* caller must hold the tree lock */
@@ -409,7 +414,7 @@ cleanup:
409static bool zswap_is_full(void) 414static bool zswap_is_full(void)
410{ 415{
411 return totalram_pages * zswap_max_pool_percent / 100 < 416 return totalram_pages * zswap_max_pool_percent / 100 <
412 zswap_pool_pages; 417 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
413} 418}
414 419
415/********************************* 420/*********************************
@@ -525,7 +530,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
525 * the swap cache, the compressed version stored by zswap can be 530 * the swap cache, the compressed version stored by zswap can be
526 * freed. 531 * freed.
527 */ 532 */
528static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) 533static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
529{ 534{
530 struct zswap_header *zhdr; 535 struct zswap_header *zhdr;
531 swp_entry_t swpentry; 536 swp_entry_t swpentry;
@@ -541,9 +546,9 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
541 }; 546 };
542 547
543 /* extract swpentry from data */ 548 /* extract swpentry from data */
544 zhdr = zbud_map(pool, handle); 549 zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
545 swpentry = zhdr->swpentry; /* here */ 550 swpentry = zhdr->swpentry; /* here */
546 zbud_unmap(pool, handle); 551 zpool_unmap_handle(pool, handle);
547 tree = zswap_trees[swp_type(swpentry)]; 552 tree = zswap_trees[swp_type(swpentry)];
548 offset = swp_offset(swpentry); 553 offset = swp_offset(swpentry);
549 554
@@ -573,13 +578,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
573 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 578 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
574 /* decompress */ 579 /* decompress */
575 dlen = PAGE_SIZE; 580 dlen = PAGE_SIZE;
576 src = (u8 *)zbud_map(zswap_pool, entry->handle) + 581 src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
577 sizeof(struct zswap_header); 582 ZPOOL_MM_RO) + sizeof(struct zswap_header);
578 dst = kmap_atomic(page); 583 dst = kmap_atomic(page);
579 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, 584 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
580 entry->length, dst, &dlen); 585 entry->length, dst, &dlen);
581 kunmap_atomic(dst); 586 kunmap_atomic(dst);
582 zbud_unmap(zswap_pool, entry->handle); 587 zpool_unmap_handle(zswap_pool, entry->handle);
583 BUG_ON(ret); 588 BUG_ON(ret);
584 BUG_ON(dlen != PAGE_SIZE); 589 BUG_ON(dlen != PAGE_SIZE);
585 590
@@ -652,7 +657,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
652 /* reclaim space if needed */ 657 /* reclaim space if needed */
653 if (zswap_is_full()) { 658 if (zswap_is_full()) {
654 zswap_pool_limit_hit++; 659 zswap_pool_limit_hit++;
655 if (zbud_reclaim_page(zswap_pool, 8)) { 660 if (zpool_shrink(zswap_pool, 1, NULL)) {
656 zswap_reject_reclaim_fail++; 661 zswap_reject_reclaim_fail++;
657 ret = -ENOMEM; 662 ret = -ENOMEM;
658 goto reject; 663 goto reject;
@@ -679,7 +684,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
679 684
680 /* store */ 685 /* store */
681 len = dlen + sizeof(struct zswap_header); 686 len = dlen + sizeof(struct zswap_header);
682 ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, 687 ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
683 &handle); 688 &handle);
684 if (ret == -ENOSPC) { 689 if (ret == -ENOSPC) {
685 zswap_reject_compress_poor++; 690 zswap_reject_compress_poor++;
@@ -689,11 +694,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
689 zswap_reject_alloc_fail++; 694 zswap_reject_alloc_fail++;
690 goto freepage; 695 goto freepage;
691 } 696 }
692 zhdr = zbud_map(zswap_pool, handle); 697 zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW);
693 zhdr->swpentry = swp_entry(type, offset); 698 zhdr->swpentry = swp_entry(type, offset);
694 buf = (u8 *)(zhdr + 1); 699 buf = (u8 *)(zhdr + 1);
695 memcpy(buf, dst, dlen); 700 memcpy(buf, dst, dlen);
696 zbud_unmap(zswap_pool, handle); 701 zpool_unmap_handle(zswap_pool, handle);
697 put_cpu_var(zswap_dstmem); 702 put_cpu_var(zswap_dstmem);
698 703
699 /* populate entry */ 704 /* populate entry */
@@ -716,7 +721,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
716 721
717 /* update stats */ 722 /* update stats */
718 atomic_inc(&zswap_stored_pages); 723 atomic_inc(&zswap_stored_pages);
719 zswap_pool_pages = zbud_get_pool_size(zswap_pool); 724 zswap_pool_total_size = zpool_get_total_size(zswap_pool);
720 725
721 return 0; 726 return 0;
722 727
@@ -752,13 +757,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
752 757
753 /* decompress */ 758 /* decompress */
754 dlen = PAGE_SIZE; 759 dlen = PAGE_SIZE;
755 src = (u8 *)zbud_map(zswap_pool, entry->handle) + 760 src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
756 sizeof(struct zswap_header); 761 ZPOOL_MM_RO) + sizeof(struct zswap_header);
757 dst = kmap_atomic(page); 762 dst = kmap_atomic(page);
758 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, 763 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
759 dst, &dlen); 764 dst, &dlen);
760 kunmap_atomic(dst); 765 kunmap_atomic(dst);
761 zbud_unmap(zswap_pool, entry->handle); 766 zpool_unmap_handle(zswap_pool, entry->handle);
762 BUG_ON(ret); 767 BUG_ON(ret);
763 768
764 spin_lock(&tree->lock); 769 spin_lock(&tree->lock);
@@ -811,7 +816,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
811 zswap_trees[type] = NULL; 816 zswap_trees[type] = NULL;
812} 817}
813 818
814static struct zbud_ops zswap_zbud_ops = { 819static struct zpool_ops zswap_zpool_ops = {
815 .evict = zswap_writeback_entry 820 .evict = zswap_writeback_entry
816}; 821};
817 822
@@ -869,8 +874,8 @@ static int __init zswap_debugfs_init(void)
869 zswap_debugfs_root, &zswap_written_back_pages); 874 zswap_debugfs_root, &zswap_written_back_pages);
870 debugfs_create_u64("duplicate_entry", S_IRUGO, 875 debugfs_create_u64("duplicate_entry", S_IRUGO,
871 zswap_debugfs_root, &zswap_duplicate_entry); 876 zswap_debugfs_root, &zswap_duplicate_entry);
872 debugfs_create_u64("pool_pages", S_IRUGO, 877 debugfs_create_u64("pool_total_size", S_IRUGO,
873 zswap_debugfs_root, &zswap_pool_pages); 878 zswap_debugfs_root, &zswap_pool_total_size);
874 debugfs_create_atomic_t("stored_pages", S_IRUGO, 879 debugfs_create_atomic_t("stored_pages", S_IRUGO,
875 zswap_debugfs_root, &zswap_stored_pages); 880 zswap_debugfs_root, &zswap_stored_pages);
876 881
@@ -895,16 +900,26 @@ static void __exit zswap_debugfs_exit(void) { }
895**********************************/ 900**********************************/
896static int __init init_zswap(void) 901static int __init init_zswap(void)
897{ 902{
903 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
904
898 if (!zswap_enabled) 905 if (!zswap_enabled)
899 return 0; 906 return 0;
900 907
901 pr_info("loading zswap\n"); 908 pr_info("loading zswap\n");
902 909
903 zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); 910 zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
911 if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
912 pr_info("%s zpool not available\n", zswap_zpool_type);
913 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
914 zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
915 &zswap_zpool_ops);
916 }
904 if (!zswap_pool) { 917 if (!zswap_pool) {
905 pr_err("zbud pool creation failed\n"); 918 pr_err("%s zpool not available\n", zswap_zpool_type);
919 pr_err("zpool creation failed\n");
906 goto error; 920 goto error;
907 } 921 }
922 pr_info("using %s pool\n", zswap_zpool_type);
908 923
909 if (zswap_entry_cache_create()) { 924 if (zswap_entry_cache_create()) {
910 pr_err("entry cache creation failed\n"); 925 pr_err("entry cache creation failed\n");
@@ -928,7 +943,7 @@ pcpufail:
928compfail: 943compfail:
929 zswap_entry_cache_destory(); 944 zswap_entry_cache_destory();
930cachefail: 945cachefail:
931 zbud_destroy_pool(zswap_pool); 946 zpool_destroy_pool(zswap_pool);
932error: 947error:
933 return -ENOMEM; 948 return -ENOMEM;
934} 949}
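Note that the new zswap "zpool" parameter is registered with mode 0444, so the backend cannot be switched at runtime; it has to be chosen before zswap initializes, typically on the kernel command line, e.g. zswap.enabled=1 zswap.zpool=zsmalloc (assuming zsmalloc support is built in; zswap.enabled is zswap's pre-existing enable switch). The selected type can be read back from /sys/module/zswap/parameters/zpool, and if the requested pool cannot be created, init_zswap() above falls back to the "zbud" default.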